1/*
2 SPDX-FileCopyrightText: 2012 Vishesh Handa <me@vhanda.in>
3
4 SPDX-License-Identifier: LGPL-2.1-or-later
5*/
6
7
8#include "kfilemetadata_debug.h"
9#include "plaintextextractor.h"
10
11#include <QFile>
12#include <QStringDecoder>
13#include <QtMinMax>
14
15#include <KEncodingProber>
16
17#if defined(Q_OS_LINUX) || defined(__GLIBC__)
18 #include <fcntl.h>
19#endif
20
21using namespace KFileMetaData;
22
23namespace {
24
25bool looksLikeText(QStringView text)
26{
27 std::array<uint16_t, 7> groups{0};
28 for (const QChar c : text) {
29 if (auto category = c.category(); category > 29) {
30 continue;
31 } else if (category <= 2) { // Mark
32 groups[0]++;
33 } else if (category <= 5) { // Number
34 groups[1]++;
35 } else if (category <= 8) { // Separator
36 groups[2]++;
37 } else if (category <= 13) { // Other
38 groups[3]++;
39 } else if (category <= 18) { // Letter
40 groups[4]++;
41 } else if (category <= 25) { // Punctuation
42 groups[5]++;
43 } else if (category <= 29) { // Symbol
44 groups[6]++;
45 }
46 }
47 return (groups[0] + groups[3] + groups[6]) < (text.size() / 2);
48}
49} // namespace <anonymous>
50
51PlainTextExtractor::PlainTextExtractor(QObject* parent)
52 : ExtractorPlugin(parent)
53{
54
55}
56
57const QStringList supportedMimeTypes = {
58 QStringLiteral("text/plain"),
59};
60
61QStringList PlainTextExtractor::mimetypes() const
62{
63 return supportedMimeTypes;
64}
65
66void PlainTextExtractor::extract(ExtractionResult* result)
67{
68 QFile file(result->inputUrl());
69 bool isOpen = false;
70
71#ifdef O_NOATIME
72 const QByteArray filePath = QFile::encodeName(fileName: result->inputUrl());
73 int fd = open(file: filePath.constData(), O_RDONLY | O_NOATIME);
74 if (fd >= 0) {
75 isOpen = file.open(fd, ioFlags: QIODevice::ReadOnly, handleFlags: QFileDevice::AutoCloseHandle);
76 } else
77#endif
78 {
79 isOpen = file.open(flags: QIODevice::ReadOnly);
80 }
81
82 if (!isOpen) {
83 return;
84 }
85
86 if (!(result->inputFlags() & ExtractionResult::ExtractPlainText)) {
87 result->addType(type: Type::Text);
88 return;
89 }
90
91 auto autodetectCodec = [](const QByteArrayView buffer) -> QStringDecoder {
92
93 // First 16 bytes for detecting by BOM.
94 const QByteArrayView bufferForBom(buffer.begin(), qMin(a: 16, b: buffer.size()));
95
96 // first: try to get encoding by BOM handling
97 // If BOM has been found, trust it
98 if (auto encoding = QStringConverter::encodingForData(data: bufferForBom)) {
99 QStringDecoder decoder(encoding.value());
100 return decoder;
101 }
102
103 // second: try to get encoding by KEncodingProber
104 KEncodingProber prober(KEncodingProber::Universal);
105 prober.feed(data: buffer);
106
107 // we found codec with some confidence?
108 if (auto confidence = prober.confidence(); (confidence > 0.5) //
109 || ((confidence > 0.1) && (prober.encoding().toLower() == "utf-8"))) {
110 auto proberDecoder = QStringDecoder(prober.encoding().constData());
111 // rare case, but if not valid, do not return proberDecoder
112 if (proberDecoder.isValid()) {
113 return proberDecoder;
114 }
115 }
116
117 return QStringDecoder{QStringConverter::System};
118 };
119
120 // Read the first chunk, detect the encoding and decode it
121 constexpr int32_t chunkSize{256 * 1024};
122 QByteArray chunk(chunkSize, Qt::Uninitialized);
123
124 QByteArrayView chunkData = [&file, &chunk]() {
125 // read() returns [0...chunkSize] or -1 (error), so no narrowing when casting from qint64
126 const auto size = static_cast<int32_t>(file.read(data: chunk.data(), maxlen: chunkSize));
127 return QByteArrayView{chunk.data(), size};
128 }();
129
130 QStringDecoder codec{autodetectCodec(chunkData)};
131
132 QString text = codec.decode(ba: chunkData);
133 if (codec.hasError()) {
134 qCDebug(KFILEMETADATA_LOG) << "Invalid" << codec.name() << "encoding. Ignoring" << result->inputUrl();
135 return;
136 }
137
138 // Sanity check
139 if (!looksLikeText(text: QStringView{text}.mid(pos: 0, n: 512))) {
140 return;
141 }
142
143 // Detect the end-of-line variant
144 const auto eol = [](const QString &text) {
145 auto nl = text.indexOf(ch: QLatin1Char('\n'));
146 if ((nl >= 1) && (text[nl - 1] == QLatin1Char('\r'))) {
147 return QStringLiteral("\r\n");
148 } else if (nl >= 0) {
149 return QStringLiteral("\n");
150 } else if (text.indexOf(ch: QLatin1Char('\r')) >= 0) {
151 return QStringLiteral("\r");
152 }
153 return QStringLiteral("\n");
154 }(text);
155
156 qCDebug(KFILEMETADATA_LOG) << "Extracting" << codec.name() << eol << "plain text from" << result->inputUrl();
157
158 // Read and decode the remainder
159 while (!file.atEnd()) {
160 const auto size = static_cast<int32_t>(file.read(data: chunk.data(), maxlen: chunkSize));
161 if (size < 0) {
162 // may happen when the file is truncated during read
163 qCWarning(KFILEMETADATA_LOG) << "Error reading" << result->inputUrl();
164 break;
165 } else if (size == 0) {
166 break;
167 }
168
169 text += codec.decode(ba: {chunk.data(), size});
170 if (codec.hasError()) {
171 qCDebug(KFILEMETADATA_LOG) << "Invalid encoding. Ignoring" << result->inputUrl();
172 return;
173 }
174 }
175
176 // Split lines and count
177 int lines = 0;
178 qsizetype start = 0;
179 while (start < text.size()) {
180 auto end = text.indexOf(s: eol, from: start);
181 lines += 1;
182 if (end == -1) {
183 result->append(text: text.mid(position: start));
184 break;
185 }
186 result->append(text: text.mid(position: start, n: end - start));
187 start = end + eol.size();
188 }
189
190 result->addType(type: Type::Text);
191 if (result->inputFlags() & ExtractionResult::ExtractMetaData) {
192 result->add(property: Property::LineCount, value: lines);
193 }
194}
195
196#include "moc_plaintextextractor.cpp"
197

source code of kfilemetadata/src/extractors/plaintextextractor.cpp