1 | /* |
2 | SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org> |
3 | |
4 | SPDX-License-Identifier: LGPL-2.0-or-later |
5 | */ |
6 | |
7 | #ifndef KATE_TEXTLOADER_H |
8 | #define KATE_TEXTLOADER_H |
9 | |
10 | #include <QCryptographicHash> |
11 | #include <QFile> |
12 | #include <QMimeDatabase> |
13 | #include <QString> |
14 | #include <QStringDecoder> |
15 | |
16 | #include <KCompressionDevice> |
17 | #include <KEncodingProber> |
18 | |
19 | #include "katetextbuffer.h" |
20 | |
21 | namespace Kate |
22 | { |
23 | /** |
24 | * loader block size, load 256 kb at once per default |
25 | * if file size is smaller, fall back to file size |
26 | * must be a multiple of 2 |
27 | */ |
28 | static const qint64 KATE_FILE_LOADER_BS = 256 * 1024; |
29 | |
30 | /** |
31 | * File Loader, will handle reading of files + detecting encoding |
32 | */ |
33 | class TextLoader |
34 | { |
35 | public: |
36 | /** |
37 | * Construct file loader for given file. |
38 | * @param filename file to open |
39 | * @param proberType prober type |
40 | * @param lineLengthLimit limit for lines to load, else we break them up in smaller ones |
41 | */ |
42 | TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit) |
43 | : m_eof(false) // default to not eof |
44 | , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline |
45 | , m_lastWasR(false) // we have not found a \r as last char |
46 | , m_position(0) |
47 | , m_lastLineStart(0) |
48 | , m_eol(TextBuffer::eolUnknown) // no eol type detected atm |
49 | , m_buffer(KATE_FILE_LOADER_BS, 0) |
50 | , m_digest(QCryptographicHash::Sha1) |
51 | , m_bomFound(false) |
52 | , m_firstRead(true) |
53 | , m_proberType(proberType) |
54 | , m_fileSize(0) |
55 | , m_lineLengthLimit(lineLengthLimit) |
56 | { |
57 | // try to get mimetype for on the fly decompression, don't rely on filename! |
58 | QFile testMime(filename); |
59 | if (testMime.open(flags: QIODevice::ReadOnly)) { |
60 | m_fileSize = testMime.size(); |
61 | } |
62 | m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(fileName: filename, device: &testMime).name(); |
63 | |
64 | // construct filter device |
65 | KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(mimetype: m_mimeType); |
66 | m_file = new KCompressionDevice(filename, compressionType); |
67 | } |
68 | |
69 | /** |
70 | * Destructor |
71 | */ |
72 | ~TextLoader() |
73 | { |
74 | delete m_file; |
75 | } |
76 | |
77 | /** |
78 | * open file with given codec |
79 | * @param codec codec to use, if 0, will do some auto-detect or fallback |
80 | * @return success |
81 | */ |
82 | bool open(const QString &codec) |
83 | { |
84 | m_codec = codec; |
85 | m_eof = false; |
86 | m_lastWasEndOfLine = true; |
87 | m_lastWasR = false; |
88 | m_position = 0; |
89 | m_lastLineStart = 0; |
90 | m_alreadyScanned = -1; |
91 | m_eol = TextBuffer::eolUnknown; |
92 | m_text.clear(); |
93 | m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData()); |
94 | m_bomFound = false; |
95 | m_firstRead = true; |
96 | |
97 | // init the hash with the git header |
98 | const QString = QStringLiteral("blob %1" ).arg(a: m_fileSize); |
99 | m_digest.reset(); |
100 | m_digest.addData(data: QByteArray(header.toLatin1() + '\0')); |
101 | |
102 | // if already opened, close the file... |
103 | if (m_file->isOpen()) { |
104 | m_file->close(); |
105 | } |
106 | |
107 | return m_file->open(mode: QIODevice::ReadOnly); |
108 | } |
109 | |
110 | /** |
111 | * end of file reached? |
112 | * @return end of file reached |
113 | */ |
114 | bool eof() const |
115 | { |
116 | return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); |
117 | } |
118 | |
119 | /** |
120 | * Detected end of line mode for this file. |
121 | * Detected during reading, is valid after complete file is read. |
122 | * @return eol mode of this file |
123 | */ |
124 | TextBuffer::EndOfLineMode eol() const |
125 | { |
126 | return m_eol; |
127 | } |
128 | |
129 | /** |
130 | * BOM found? |
131 | * @return byte order mark found? |
132 | */ |
133 | bool byteOrderMarkFound() const |
134 | { |
135 | return m_bomFound; |
136 | } |
137 | |
138 | /** |
139 | * mime type used to create filter dev |
140 | * @return mime-type of filter device |
141 | */ |
142 | const QString &mimeTypeForFilterDev() const |
143 | { |
144 | return m_mimeType; |
145 | } |
146 | |
147 | /** |
148 | * internal Unicode data array |
149 | * @return internal Unicode data |
150 | */ |
151 | const QChar *unicode() const |
152 | { |
153 | return m_text.unicode(); |
154 | } |
155 | |
156 | /** |
157 | * Get codec for this loader |
158 | * @return currently in use codec of this loader |
159 | */ |
160 | QString textCodec() const |
161 | { |
162 | return m_codec; |
163 | } |
164 | |
165 | /** |
166 | * read a line, return length + offset in Unicode data |
167 | * @param offset offset into internal Unicode data for read line |
168 | * @param length length of read line |
169 | * @param tooLongLinesWrapped was a too long line seen? |
170 | * @param longestLineLoaded length of the longest line that hit the limit |
171 | * @return true if no encoding errors occurred |
172 | */ |
173 | bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded) |
174 | { |
175 | length = 0; |
176 | offset = 0; |
177 | bool encodingError = false; |
178 | |
179 | static const QLatin1Char cr(QLatin1Char('\r')); |
180 | static const QLatin1Char lf(QLatin1Char('\n')); |
181 | |
182 | /** |
183 | * did we read two time but got no stuff? encoding error |
184 | * fixes problem with one character latin-1 files, which lead to crash otherwise! |
185 | * bug 272579 |
186 | */ |
187 | bool failedToConvertOnce = false; |
188 | |
189 | /** |
190 | * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true |
191 | * BUG: 440359 |
192 | */ |
193 | bool bomPreviouslyFound = m_bomFound; |
194 | |
195 | // honor the line length limit early |
196 | const auto lineLimitHandler = [this, &offset, &length, &tooLongLinesWrapped, &longestLineLoaded](int lineStart, int textLength) { |
197 | if ((m_lineLengthLimit <= 0) || (textLength <= m_lineLengthLimit)) { |
198 | return false; |
199 | } |
200 | |
201 | // remember stick error |
202 | tooLongLinesWrapped = true; |
203 | longestLineLoaded = std::max(a: longestLineLoaded, b: textLength); |
204 | |
205 | // search for place to wrap |
206 | int spacePosition = m_lineLengthLimit - 1; |
207 | for (int testPosition = m_lineLengthLimit - 1; (testPosition >= 0) && (testPosition >= (m_lineLengthLimit - (m_lineLengthLimit / 10))); |
208 | --testPosition) { |
209 | // wrap place found? |
210 | if (m_text[lineStart + testPosition].isSpace() || m_text[lineStart + testPosition].isPunct()) { |
211 | spacePosition = testPosition; |
212 | break; |
213 | } |
214 | } |
215 | |
216 | m_lastWasEndOfLine = false; |
217 | m_lastWasR = false; |
218 | |
219 | // line data |
220 | offset = lineStart; |
221 | length = spacePosition + 1; |
222 | |
223 | m_lastLineStart = m_position = (lineStart + length); |
224 | return true; |
225 | }; |
226 | |
227 | /** |
228 | * reading loop |
229 | */ |
230 | while (m_position <= m_text.length()) { |
231 | // handle too long lines early even if we not yet have seen the end |
232 | if (m_alreadyScanned > m_lastLineStart && lineLimitHandler(m_lastLineStart, m_alreadyScanned - m_lastLineStart)) { |
233 | return !encodingError; |
234 | } |
235 | |
236 | if (m_position == m_text.length()) { |
237 | // try to load more text if something is around |
238 | if (!m_eof) { |
239 | // kill the old lines... |
240 | m_text.remove(i: 0, len: m_lastLineStart); |
241 | |
242 | // try to read new data |
243 | const int c = m_file->read(data: m_buffer.data(), maxlen: m_buffer.size()); |
244 | |
245 | // if any text is there, append it.... |
246 | if (c > 0) { |
247 | // update hash sum |
248 | m_digest.addData(data: QByteArrayView(m_buffer.data(), c)); |
249 | |
250 | // detect byte order marks & codec for byte order marks on first read |
251 | if (m_firstRead) { |
252 | // if no codec given, do autodetection |
253 | if (!m_converterState.isValid()) { |
254 | // use KEncodingProber first, QStringDecoder::decoderForHtml does fallback to UTF-8 |
255 | KEncodingProber prober(m_proberType); |
256 | prober.feed(data: QByteArrayView(m_buffer.data(), c)); |
257 | |
258 | // we found a codec with some confidence? |
259 | if (const QStringDecoder decoder(prober.encoding().constData()); decoder.isValid() && (prober.confidence() > 0.5)) { |
260 | m_converterState = QStringDecoder(prober.encoding().constData()); |
261 | } else { |
262 | // try to get HTML encoding, will default to UTF-8 |
263 | // see https://doc.qt.io/qt-6/qstringdecoder.html#decoderForHtml |
264 | m_converterState = QStringDecoder::decoderForHtml(data: m_buffer); |
265 | } |
266 | |
267 | // no codec, no chance, encoding error, else remember the codec name |
268 | if (!m_converterState.isValid()) { |
269 | return false; |
270 | } |
271 | } |
272 | |
273 | // we want to convert the bom for later detection |
274 | m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom); |
275 | |
276 | // remember name, might have changed |
277 | m_codec = QString::fromUtf8(utf8: m_converterState.name()); |
278 | } |
279 | |
280 | // detect broken encoding |
281 | Q_ASSERT(m_converterState.isValid()); |
282 | const QString unicode = m_converterState.decode(ba: QByteArrayView(m_buffer.data(), c)); |
283 | encodingError = encodingError || m_converterState.hasError(); |
284 | |
285 | // check and remove bom |
286 | if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark || unicode.front() == QChar::ByteOrderSwapped)) { |
287 | m_bomFound = true; |
288 | m_text.append(v: QStringView(unicode).last(n: unicode.size() - 1)); |
289 | |
290 | // swapped BOM is encoding error |
291 | encodingError = encodingError || unicode.front() == QChar::ByteOrderSwapped; |
292 | } else { |
293 | m_text.append(s: unicode); |
294 | } |
295 | m_firstRead = false; |
296 | } |
297 | |
298 | // is file completely read ? |
299 | m_eof = (c == -1) || (c == 0); |
300 | |
301 | // recalc current pos and last pos |
302 | m_position -= m_lastLineStart; |
303 | m_alreadyScanned = m_position - 1; |
304 | m_lastLineStart = 0; |
305 | } |
306 | |
307 | // oh oh, end of file, escape ! |
308 | if (m_eof && (m_position == m_text.length())) { |
309 | m_lastWasEndOfLine = false; |
310 | |
311 | // line data |
312 | offset = m_lastLineStart; |
313 | length = m_position - m_lastLineStart; |
314 | |
315 | m_lastLineStart = m_position; |
316 | |
317 | lineLimitHandler(offset, length); |
318 | return !encodingError && !failedToConvertOnce; |
319 | } |
320 | |
321 | // empty? try again |
322 | if (m_position == m_text.length()) { |
323 | if (!bomPreviouslyFound && m_bomFound) { |
324 | // BOM was processed above, so we didn't fail to convert |
325 | bomPreviouslyFound = true; |
326 | } else { |
327 | failedToConvertOnce = true; |
328 | } |
329 | continue; |
330 | } |
331 | } |
332 | |
333 | for (; m_position < m_text.length(); m_position++) { |
334 | m_alreadyScanned = m_position; |
335 | QChar current_char = m_text.at(i: m_position); |
336 | if (current_char == lf) { |
337 | m_lastWasEndOfLine = true; |
338 | |
339 | if (m_lastWasR) { |
340 | m_lastLineStart++; |
341 | m_lastWasR = false; |
342 | m_eol = TextBuffer::eolDos; |
343 | } else { |
344 | // line data |
345 | offset = m_lastLineStart; |
346 | length = m_position - m_lastLineStart; |
347 | |
348 | m_lastLineStart = m_position + 1; |
349 | m_position++; |
350 | |
351 | // only win, if not dos! |
352 | if (m_eol != TextBuffer::eolDos) { |
353 | m_eol = TextBuffer::eolUnix; |
354 | } |
355 | |
356 | lineLimitHandler(offset, length); |
357 | return !encodingError; |
358 | } |
359 | } else if (current_char == cr) { |
360 | m_lastWasEndOfLine = true; |
361 | m_lastWasR = true; |
362 | |
363 | // line data |
364 | offset = m_lastLineStart; |
365 | length = m_position - m_lastLineStart; |
366 | |
367 | m_lastLineStart = m_position + 1; |
368 | m_position++; |
369 | |
370 | // should only win of first time! |
371 | if (m_eol == TextBuffer::eolUnknown) { |
372 | m_eol = TextBuffer::eolMac; |
373 | } |
374 | |
375 | lineLimitHandler(offset, length); |
376 | return !encodingError; |
377 | } else if (current_char == QChar::LineSeparator) { |
378 | m_lastWasEndOfLine = true; |
379 | |
380 | // line data |
381 | offset = m_lastLineStart; |
382 | length = m_position - m_lastLineStart; |
383 | |
384 | m_lastLineStart = m_position + 1; |
385 | m_position++; |
386 | |
387 | lineLimitHandler(offset, length); |
388 | return !encodingError; |
389 | } else { |
390 | m_lastWasEndOfLine = false; |
391 | m_lastWasR = false; |
392 | } |
393 | } |
394 | } |
395 | |
396 | return !encodingError; |
397 | } |
398 | |
399 | QByteArray digest() |
400 | { |
401 | return m_digest.result(); |
402 | } |
403 | |
404 | private: |
405 | QString m_codec; |
406 | bool m_eof; |
407 | bool m_lastWasEndOfLine; |
408 | bool m_lastWasR; |
409 | int m_position; |
410 | int m_lastLineStart; |
411 | int m_alreadyScanned = -1; |
412 | TextBuffer::EndOfLineMode m_eol; |
413 | QString m_mimeType; |
414 | QIODevice *m_file; |
415 | QByteArray m_buffer; |
416 | QCryptographicHash m_digest; |
417 | QString m_text; |
418 | QStringDecoder m_converterState; |
419 | bool m_bomFound; |
420 | bool m_firstRead; |
421 | KEncodingProber::ProberType m_proberType; |
422 | quint64 m_fileSize; |
423 | const int m_lineLengthLimit; |
424 | }; |
425 | |
426 | } |
427 | |
428 | #endif |
429 | |