plaintextextractor.cpp source code [kfilemetadata/src/extractors/plaintextextractor.cpp]

1	/*
2	SPDX-FileCopyrightText: 2012 Vishesh Handa <me@vhanda.in>
3
4	SPDX-License-Identifier: LGPL-2.1-or-later
5	*/
6
7
8	#include "kfilemetadata_debug.h"
9	#include "plaintextextractor.h"
10
11	#include <QFile>
12	#include <QStringDecoder>
13	#include <QtMinMax>
14
15	#include <KEncodingProber>
16
17	#if defined(Q_OS_LINUX) \|\| defined(__GLIBC__)
18	#include <fcntl.h>
19	#endif
20
21	using namespace KFileMetaData;
22
23	namespace {
24
25	bool looksLikeText(QStringView text)
26	{
27	std::array<uint16_t, `7`> groups{`0`};
28	for (const QChar c : text) {
29	if (auto category = c.category(); category > `29`) {
30	continue;
31	} else if (category <= `2`) { // Mark
32	groups [`0`]++;
33	} else if (category <= `5`) { // Number
34	groups [`1`]++;
35	} else if (category <= `8`) { // Separator
36	groups [`2`]++;
37	} else if (category <= `13`) { // Other
38	groups [`3`]++;
39	} else if (category <= `18`) { // Letter
40	groups [`4`]++;
41	} else if (category <= `25`) { // Punctuation
42	groups [`5`]++;
43	} else if (category <= `29`) { // Symbol
44	groups [`6`]++;
45	}
46	}
47	return (groups [`0`] + groups [`3`] + groups [`6`]) < (text.size() / `2`);
48	}
49	} // namespace <anonymous>
50
51	PlainTextExtractor::PlainTextExtractor(QObject* parent)
52	: ExtractorPlugin (parent)
53	{
54
55	}
56
57	const QStringList supportedMimeTypes = {
58	QStringLiteral("text/plain"),
59	};
60
61	QStringList PlainTextExtractor::mimetypes() const
62	{
63	return supportedMimeTypes;
64	}
65
66	void PlainTextExtractor::extract(ExtractionResult* result)
67	{
68	QFile file(result->inputUrl());
69	bool isOpen = false;
70
71	#ifdef O_NOATIME
72	const QByteArray filePath = QFile::encodeName(fileName: result->inputUrl());
73	int fd = open(file: filePath.constData(), O_RDONLY \| O_NOATIME);
74	if (fd >= `0`) {
75	isOpen = file.open(fd, ioFlags: QIODevice::ReadOnly, handleFlags: QFileDevice::AutoCloseHandle);
76	} else
77	#endif
78	{
79	isOpen = file.open(flags: QIODevice::ReadOnly);
80	}
81
82	if (!isOpen) {
83	return;
84	}
85
86	if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
87	result->addType(type: Type::Text);
88	return;
89	}
90
91	auto autodetectCodec = [](const QByteArrayView buffer) -> QStringDecoder {
92
93	// First 16 bytes for detecting by BOM.
94	const QByteArrayView bufferForBom(buffer.begin(), qMin(a: `16`, b: buffer.size()));
95
96	// first: try to get encoding by BOM handling
97	// If BOM has been found, trust it
98	if (auto encoding = QStringConverter::encodingForData(data: bufferForBom)) {
99	QStringDecoder decoder(encoding.value());
100	return decoder;
101	}
102
103	// second: try to get encoding by KEncodingProber
104	KEncodingProber prober(KEncodingProber::Universal);
105	prober.feed(data: buffer);
106
107	// we found codec with some confidence?
108	if (auto confidence = prober.confidence(); (confidence > `0.5`) //
109	\|\| ((confidence > `0.1`) && (prober.encoding().toLower() == "utf-8"))) {
110	auto proberDecoder = QStringDecoder(prober.encoding().constData());
111	// rare case, but if not valid, do not return proberDecoder
112	if (proberDecoder.isValid()) {
113	return proberDecoder;
114	}
115	}
116
117	return QStringDecoder{QStringConverter::System};
118	};
119
120	// Read the first chunk, detect the encoding and decode it
121	constexpr int32_t chunkSize{`256` * `1024`};
122	QByteArray chunk(chunkSize, Qt::Uninitialized);
123
124	QByteArrayView chunkData = [&file, &chunk]() {
125	// read() returns [0...chunkSize] or -1 (error), so no narrowing when casting from qint64
126	const auto size = static_cast<int32_t>(file.read(data: chunk.data(), maxlen: chunkSize));
127	return QByteArrayView{chunk.data(), size};
128	}();
129
130	QStringDecoder codec{autodetectCodec (chunkData)};
131
132	QString text = codec.decode(ba: chunkData);
133	if (codec.hasError()) {
134	qCDebug(KFILEMETADATA_LOG) << "Invalid" << codec.name() << "encoding. Ignoring" << result->inputUrl();
135	return;
136	}
137
138	// Sanity check
139	if (!looksLikeText(text: QStringView{text}.mid(pos: `0`, n: `512`))) {
140	return;
141	}
142
143	// Detect the end-of-line variant
144	const auto eol = [](const QString &text) {
145	auto nl = text.indexOf(ch: QLatin1Char(`'\n'`));
146	if ((nl >= `1`) && (text[nl - `1`] == QLatin1Char(`'\r'`))) {
147	return QStringLiteral("\r\n");
148	} else if (nl >= `0`) {
149	return QStringLiteral("\n");
150	} else if (text.indexOf(ch: QLatin1Char(`'\r'`)) >= `0`) {
151	return QStringLiteral("\r");
152	}
153	return QStringLiteral("\n");
154	}(text);
155
156	qCDebug(KFILEMETADATA_LOG) << "Extracting" << codec.name() << eol << "plain text from" << result->inputUrl();
157
158	// Read and decode the remainder
159	while (!file.atEnd()) {
160	const auto size = static_cast<int32_t>(file.read(data: chunk.data(), maxlen: chunkSize));
161	if (size < `0`) {
162	// may happen when the file is truncated during read
163	qCWarning(KFILEMETADATA_LOG) << "Error reading" << result->inputUrl();
164	break;
165	} else if (size == `0`) {
166	break;
167	}
168
169	text += codec.decode(ba: {chunk.data(), size});
170	if (codec.hasError()) {
171	qCDebug(KFILEMETADATA_LOG) << "Invalid encoding. Ignoring" << result->inputUrl();
172	return;
173	}
174	}
175
176	// Split lines and count
177	int lines = `0`;
178	qsizetype start = `0`;
179	while (start < text.size()) {
180	auto end = text.indexOf(s: eol, from: start);
181	lines += `1`;
182	if (end == -`1`) {
183	result->append(text: text.mid(position: start));
184	break;
185	}
186	result->append(text: text.mid(position: start, n: end - start));
187	start = end + eol.size();
188	}
189
190	result->addType(type: Type::Text);
191	if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
192	result->add(property: Property::LineCount, value: lines);
193	}
194	}
195
196	#include "moc_plaintextextractor.cpp"
197

source code of kfilemetadata/src/extractors/plaintextextractor.cpp