1/*
2 SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org>
3
4 SPDX-License-Identifier: LGPL-2.0-or-later
5*/
6
7#ifndef KATE_TEXTLOADER_H
8#define KATE_TEXTLOADER_H
9
#include <algorithm>
#include <memory>

#include <QCryptographicHash>
#include <QFile>
#include <QMimeDatabase>
#include <QString>
#include <QStringDecoder>

#include <KCompressionDevice>
#include <KEncodingProber>

#include "katetextbuffer.h"
20
21namespace Kate
22{
23/**
24 * loader block size, load 256 kb at once per default
25 * if file size is smaller, fall back to file size
26 * must be a multiple of 2
27 */
28static const qint64 KATE_FILE_LOADER_BS = 256 * 1024;
29
30/**
31 * File Loader, will handle reading of files + detecting encoding
32 */
33class TextLoader
34{
35public:
36 /**
37 * Construct file loader for given file.
38 * @param filename file to open
39 * @param proberType prober type
40 * @param lineLengthLimit limit for lines to load, else we break them up in smaller ones
41 */
42 TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit)
43 : m_eof(false) // default to not eof
44 , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline
45 , m_lastWasR(false) // we have not found a \r as last char
46 , m_position(0)
47 , m_lastLineStart(0)
48 , m_eol(TextBuffer::eolUnknown) // no eol type detected atm
49 , m_buffer(KATE_FILE_LOADER_BS, 0)
50 , m_digest(QCryptographicHash::Sha1)
51 , m_bomFound(false)
52 , m_firstRead(true)
53 , m_proberType(proberType)
54 , m_fileSize(0)
55 , m_lineLengthLimit(lineLengthLimit)
56 {
57 // try to get mimetype for on the fly decompression, don't rely on filename!
58 QFile testMime(filename);
59 if (testMime.open(flags: QIODevice::ReadOnly)) {
60 m_fileSize = testMime.size();
61 }
62 m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(fileName: filename, device: &testMime).name();
63
64 // construct filter device
65 KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(mimetype: m_mimeType);
66 m_file = new KCompressionDevice(filename, compressionType);
67 }
68
69 /**
70 * Destructor
71 */
72 ~TextLoader()
73 {
74 delete m_file;
75 }
76
77 /**
78 * open file with given codec
79 * @param codec codec to use, if 0, will do some auto-detect or fallback
80 * @return success
81 */
82 bool open(const QString &codec)
83 {
84 m_codec = codec;
85 m_eof = false;
86 m_lastWasEndOfLine = true;
87 m_lastWasR = false;
88 m_position = 0;
89 m_lastLineStart = 0;
90 m_eol = TextBuffer::eolUnknown;
91 m_text.clear();
92 m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData());
93 m_bomFound = false;
94 m_firstRead = true;
95
96 // init the hash with the git header
97 const QString header = QStringLiteral("blob %1").arg(a: m_fileSize);
98 m_digest.reset();
99 m_digest.addData(data: QByteArray(header.toLatin1() + '\0'));
100
101 // if already opened, close the file...
102 if (m_file->isOpen()) {
103 m_file->close();
104 }
105
106 return m_file->open(mode: QIODevice::ReadOnly);
107 }
108
109 /**
110 * end of file reached?
111 * @return end of file reached
112 */
113 bool eof() const
114 {
115 return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length());
116 }
117
118 /**
119 * Detected end of line mode for this file.
120 * Detected during reading, is valid after complete file is read.
121 * @return eol mode of this file
122 */
123 TextBuffer::EndOfLineMode eol() const
124 {
125 return m_eol;
126 }
127
128 /**
129 * BOM found?
130 * @return byte order mark found?
131 */
132 bool byteOrderMarkFound() const
133 {
134 return m_bomFound;
135 }
136
137 /**
138 * mime type used to create filter dev
139 * @return mime-type of filter device
140 */
141 const QString &mimeTypeForFilterDev() const
142 {
143 return m_mimeType;
144 }
145
146 /**
147 * internal Unicode data array
148 * @return internal Unicode data
149 */
150 const QChar *unicode() const
151 {
152 return m_text.unicode();
153 }
154
155 /**
156 * Get codec for this loader
157 * @return currently in use codec of this loader
158 */
159 QString textCodec() const
160 {
161 return m_codec;
162 }
163
164 /**
165 * read a line, return length + offset in Unicode data
166 * @param offset offset into internal Unicode data for read line
167 * @param length length of read line
168 * @param tooLongLinesWrapped was a too long line seen?
169 * @param longestLineLoaded length of the longest line that hit the limit
170 * @return true if no encoding errors occurred
171 */
172 bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded)
173 {
174 length = 0;
175 offset = 0;
176 bool encodingError = false;
177
178 static const QLatin1Char cr(QLatin1Char('\r'));
179 static const QLatin1Char lf(QLatin1Char('\n'));
180
181 /**
182 * did we read two time but got no stuff? encoding error
183 * fixes problem with one character latin-1 files, which lead to crash otherwise!
184 * bug 272579
185 */
186 bool failedToConvertOnce = false;
187
188 /**
189 * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true
190 * BUG: 440359
191 */
192 bool bomPreviouslyFound = m_bomFound;
193
194 // honor the line length limit early
195 const auto lineLimitHandler = [this, &offset, &length, &tooLongLinesWrapped, &longestLineLoaded](int lineStart, int textLength) {
196 if ((m_lineLengthLimit <= 0) || (textLength <= m_lineLengthLimit)) {
197 return false;
198 }
199
200 // remember stick error
201 tooLongLinesWrapped = true;
202 longestLineLoaded = std::max(a: longestLineLoaded, b: textLength);
203
204 // search for place to wrap
205 int spacePosition = m_lineLengthLimit - 1;
206 for (int testPosition = m_lineLengthLimit - 1; (testPosition >= 0) && (testPosition >= (m_lineLengthLimit - (m_lineLengthLimit / 10)));
207 --testPosition) {
208 // wrap place found?
209 if (m_text[lineStart + testPosition].isSpace() || m_text[lineStart + testPosition].isPunct()) {
210 spacePosition = testPosition;
211 break;
212 }
213 }
214
215 m_lastWasEndOfLine = false;
216 m_lastWasR = false;
217
218 // line data
219 offset = lineStart;
220 length = spacePosition + 1;
221
222 m_lastLineStart = m_position = (lineStart + length);
223 return true;
224 };
225
226 /**
227 * reading loop
228 */
229 while (m_position <= m_text.length()) {
230 // handle too long lines early even if we not yet have seen the end
231 if (m_alreadyScanned > m_lastLineStart && lineLimitHandler(m_lastLineStart, m_alreadyScanned - m_lastLineStart)) {
232 return !encodingError;
233 }
234
235 if (m_position == m_text.length()) {
236 // try to load more text if something is around
237 if (!m_eof) {
238 // kill the old lines...
239 m_text.remove(i: 0, len: m_lastLineStart);
240
241 // try to read new data
242 const int c = m_file->read(data: m_buffer.data(), maxlen: m_buffer.size());
243
244 // if any text is there, append it....
245 if (c > 0) {
246 // update hash sum
247 m_digest.addData(data: QByteArrayView(m_buffer.data(), c));
248
249 // detect byte order marks & codec for byte order marks on first read
250 if (m_firstRead) {
251 /**
252 * if no codec given, do autodetection
253 */
254 if (!m_converterState.isValid()) {
255 /**
256 * first: try to get HTML header encoding, includes BOM handling
257 */
258 m_converterState = QStringDecoder::decoderForHtml(data: m_buffer);
259
260 /**
261 * else: use KEncodingProber
262 */
263 if (!m_converterState.isValid()) {
264 KEncodingProber prober(m_proberType);
265 prober.feed(data: m_buffer.constData(), len: c);
266
267 // we found codec with some confidence?
268 if (prober.confidence() > 0.5) {
269 m_converterState = QStringDecoder(prober.encoding().constData());
270 }
271 }
272
273 // no codec, no chance, encoding error, else remember the codec name
274 if (!m_converterState.isValid()) {
275 return false;
276 }
277 }
278
279 // we want to convert the bom for later detection
280 m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom);
281
282 // remember name, might have changed
283 m_codec = QString::fromUtf8(utf8: m_converterState.name());
284 }
285
286 // detect broken encoding
287 Q_ASSERT(m_converterState.isValid());
288 const QString unicode = m_converterState.decode(ba: QByteArrayView(m_buffer.data(), c));
289 encodingError = encodingError || m_converterState.hasError();
290
291 // check and remove bom
292 if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark || unicode.front() == QChar::ByteOrderSwapped)) {
293 m_bomFound = true;
294 m_text.append(v: QStringView(unicode).last(n: unicode.size() - 1));
295
296 // swapped BOM is encoding error
297 encodingError = encodingError || unicode.front() == QChar::ByteOrderSwapped;
298 } else {
299 m_text.append(s: unicode);
300 }
301 m_firstRead = false;
302 }
303
304 // is file completely read ?
305 m_eof = (c == -1) || (c == 0);
306
307 // recalc current pos and last pos
308 m_position -= m_lastLineStart;
309 m_alreadyScanned = m_position - 1;
310 m_lastLineStart = 0;
311 }
312
313 // oh oh, end of file, escape !
314 if (m_eof && (m_position == m_text.length())) {
315 m_lastWasEndOfLine = false;
316
317 // line data
318 offset = m_lastLineStart;
319 length = m_position - m_lastLineStart;
320
321 m_lastLineStart = m_position;
322
323 lineLimitHandler(offset, length);
324 return !encodingError && !failedToConvertOnce;
325 }
326
327 // empty? try again
328 if (m_position == m_text.length()) {
329 if (!bomPreviouslyFound && m_bomFound) {
330 // BOM was processed above, so we didn't fail to convert
331 bomPreviouslyFound = true;
332 } else {
333 failedToConvertOnce = true;
334 }
335 continue;
336 }
337 }
338
339 for (; m_position < m_text.length(); m_position++) {
340 m_alreadyScanned = m_position;
341 QChar current_char = m_text.at(i: m_position);
342 if (current_char == lf) {
343 m_lastWasEndOfLine = true;
344
345 if (m_lastWasR) {
346 m_lastLineStart++;
347 m_lastWasR = false;
348 m_eol = TextBuffer::eolDos;
349 } else {
350 // line data
351 offset = m_lastLineStart;
352 length = m_position - m_lastLineStart;
353
354 m_lastLineStart = m_position + 1;
355 m_position++;
356
357 // only win, if not dos!
358 if (m_eol != TextBuffer::eolDos) {
359 m_eol = TextBuffer::eolUnix;
360 }
361
362 lineLimitHandler(offset, length);
363 return !encodingError;
364 }
365 } else if (current_char == cr) {
366 m_lastWasEndOfLine = true;
367 m_lastWasR = true;
368
369 // line data
370 offset = m_lastLineStart;
371 length = m_position - m_lastLineStart;
372
373 m_lastLineStart = m_position + 1;
374 m_position++;
375
376 // should only win of first time!
377 if (m_eol == TextBuffer::eolUnknown) {
378 m_eol = TextBuffer::eolMac;
379 }
380
381 lineLimitHandler(offset, length);
382 return !encodingError;
383 } else if (current_char == QChar::LineSeparator) {
384 m_lastWasEndOfLine = true;
385
386 // line data
387 offset = m_lastLineStart;
388 length = m_position - m_lastLineStart;
389
390 m_lastLineStart = m_position + 1;
391 m_position++;
392
393 lineLimitHandler(offset, length);
394 return !encodingError;
395 } else {
396 m_lastWasEndOfLine = false;
397 m_lastWasR = false;
398 }
399 }
400 }
401
402 return !encodingError;
403 }
404
405 QByteArray digest()
406 {
407 return m_digest.result();
408 }
409
410private:
411 QString m_codec;
412 bool m_eof;
413 bool m_lastWasEndOfLine;
414 bool m_lastWasR;
415 int m_position;
416 int m_lastLineStart;
417 int m_alreadyScanned = -1;
418 TextBuffer::EndOfLineMode m_eol;
419 QString m_mimeType;
420 QIODevice *m_file;
421 QByteArray m_buffer;
422 QCryptographicHash m_digest;
423 QString m_text;
424 QStringDecoder m_converterState;
425 bool m_bomFound;
426 bool m_firstRead;
427 KEncodingProber::ProberType m_proberType;
428 quint64 m_fileSize;
429 const int m_lineLengthLimit;
430};
431
432}
433
434#endif
435

source code of ktexteditor/src/buffer/katetextloader.h