| 1 | /* |
| 2 | SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org> |
| 3 | |
| 4 | SPDX-License-Identifier: LGPL-2.0-or-later |
| 5 | */ |
| 6 | |
| 7 | #ifndef KATE_TEXTLOADER_H |
| 8 | #define KATE_TEXTLOADER_H |
| 9 | |
| 10 | #include <QCryptographicHash> |
| 11 | #include <QFile> |
| 12 | #include <QMimeDatabase> |
| 13 | #include <QString> |
| 14 | #include <QStringDecoder> |
| 15 | |
| 16 | #include <KCompressionDevice> |
| 17 | #include <KEncodingProber> |
| 18 | |
| 19 | #include "katetextbuffer.h" |
| 20 | |
| 21 | namespace Kate |
| 22 | { |
| 23 | /** |
| 24 | * loader block size, load 256 kb at once per default |
| 25 | * if file size is smaller, fall back to file size |
| 26 | * must be a multiple of 2 |
| 27 | */ |
| 28 | static const qint64 KATE_FILE_LOADER_BS = 256 * 1024; |
| 29 | |
| 30 | /** |
| 31 | * File Loader, will handle reading of files + detecting encoding |
| 32 | */ |
| 33 | class TextLoader |
| 34 | { |
| 35 | public: |
| 36 | /** |
| 37 | * Construct file loader for given file. |
| 38 | * @param filename file to open |
| 39 | * @param proberType prober type |
| 40 | * @param lineLengthLimit limit for lines to load, else we break them up in smaller ones |
| 41 | */ |
| 42 | TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit) |
| 43 | : m_eof(false) // default to not eof |
| 44 | , m_lastWasEndOfLine(true) // at start of file, we had a virtual newline |
| 45 | , m_lastWasR(false) // we have not found a \r as last char |
| 46 | , m_position(0) |
| 47 | , m_lastLineStart(0) |
| 48 | , m_eol(TextBuffer::eolUnknown) // no eol type detected atm |
| 49 | , m_buffer(KATE_FILE_LOADER_BS, 0) |
| 50 | , m_digest(QCryptographicHash::Sha1) |
| 51 | , m_bomFound(false) |
| 52 | , m_firstRead(true) |
| 53 | , m_proberType(proberType) |
| 54 | , m_fileSize(0) |
| 55 | , m_lineLengthLimit(lineLengthLimit) |
| 56 | { |
| 57 | // try to get mimetype for on the fly decompression, don't rely on filename! |
| 58 | QFile testMime(filename); |
| 59 | if (testMime.open(flags: QIODevice::ReadOnly)) { |
| 60 | m_fileSize = testMime.size(); |
| 61 | } |
| 62 | m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(fileName: filename, device: &testMime).name(); |
| 63 | |
| 64 | // construct filter device |
| 65 | KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(mimetype: m_mimeType); |
| 66 | m_file = new KCompressionDevice(filename, compressionType); |
| 67 | } |
| 68 | |
| 69 | /** |
| 70 | * Destructor |
| 71 | */ |
| 72 | ~TextLoader() |
| 73 | { |
| 74 | delete m_file; |
| 75 | } |
| 76 | |
| 77 | /** |
| 78 | * open file with given codec |
| 79 | * @param codec codec to use, if 0, will do some auto-detect or fallback |
| 80 | * @return success |
| 81 | */ |
| 82 | bool open(const QString &codec) |
| 83 | { |
| 84 | m_codec = codec; |
| 85 | m_eof = false; |
| 86 | m_lastWasEndOfLine = true; |
| 87 | m_lastWasR = false; |
| 88 | m_position = 0; |
| 89 | m_lastLineStart = 0; |
| 90 | m_alreadyScanned = -1; |
| 91 | m_eol = TextBuffer::eolUnknown; |
| 92 | m_text.clear(); |
| 93 | m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData()); |
| 94 | m_bomFound = false; |
| 95 | m_firstRead = true; |
| 96 | |
| 97 | // init the hash with the git header |
| 98 | const QString = QStringLiteral("blob %1" ).arg(a: m_fileSize); |
| 99 | m_digest.reset(); |
| 100 | m_digest.addData(data: QByteArray(header.toLatin1() + '\0')); |
| 101 | |
| 102 | // if already opened, close the file... |
| 103 | if (m_file->isOpen()) { |
| 104 | m_file->close(); |
| 105 | } |
| 106 | |
| 107 | return m_file->open(mode: QIODevice::ReadOnly); |
| 108 | } |
| 109 | |
| 110 | /** |
| 111 | * end of file reached? |
| 112 | * @return end of file reached |
| 113 | */ |
| 114 | bool eof() const |
| 115 | { |
| 116 | return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length()); |
| 117 | } |
| 118 | |
| 119 | /** |
| 120 | * Detected end of line mode for this file. |
| 121 | * Detected during reading, is valid after complete file is read. |
| 122 | * @return eol mode of this file |
| 123 | */ |
| 124 | TextBuffer::EndOfLineMode eol() const |
| 125 | { |
| 126 | return m_eol; |
| 127 | } |
| 128 | |
| 129 | /** |
| 130 | * BOM found? |
| 131 | * @return byte order mark found? |
| 132 | */ |
| 133 | bool byteOrderMarkFound() const |
| 134 | { |
| 135 | return m_bomFound; |
| 136 | } |
| 137 | |
| 138 | /** |
| 139 | * mime type used to create filter dev |
| 140 | * @return mime-type of filter device |
| 141 | */ |
| 142 | const QString &mimeTypeForFilterDev() const |
| 143 | { |
| 144 | return m_mimeType; |
| 145 | } |
| 146 | |
| 147 | /** |
| 148 | * internal Unicode data array |
| 149 | * @return internal Unicode data |
| 150 | */ |
| 151 | const QChar *unicode() const |
| 152 | { |
| 153 | return m_text.unicode(); |
| 154 | } |
| 155 | |
| 156 | /** |
| 157 | * Get codec for this loader |
| 158 | * @return currently in use codec of this loader |
| 159 | */ |
| 160 | QString textCodec() const |
| 161 | { |
| 162 | return m_codec; |
| 163 | } |
| 164 | |
| 165 | /** |
| 166 | * read a line, return length + offset in Unicode data |
| 167 | * @param offset offset into internal Unicode data for read line |
| 168 | * @param length length of read line |
| 169 | * @param tooLongLinesWrapped was a too long line seen? |
| 170 | * @param longestLineLoaded length of the longest line that hit the limit |
| 171 | * @return true if no encoding errors occurred |
| 172 | */ |
| 173 | bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded) |
| 174 | { |
| 175 | length = 0; |
| 176 | offset = 0; |
| 177 | bool encodingError = false; |
| 178 | |
| 179 | static const QLatin1Char cr(QLatin1Char('\r')); |
| 180 | static const QLatin1Char lf(QLatin1Char('\n')); |
| 181 | |
| 182 | /** |
| 183 | * did we read two time but got no stuff? encoding error |
| 184 | * fixes problem with one character latin-1 files, which lead to crash otherwise! |
| 185 | * bug 272579 |
| 186 | */ |
| 187 | bool failedToConvertOnce = false; |
| 188 | |
| 189 | /** |
| 190 | * keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true |
| 191 | * BUG: 440359 |
| 192 | */ |
| 193 | bool bomPreviouslyFound = m_bomFound; |
| 194 | |
| 195 | // honor the line length limit early |
| 196 | const auto lineLimitHandler = [this, &offset, &length, &tooLongLinesWrapped, &longestLineLoaded](int lineStart, int textLength) { |
| 197 | if ((m_lineLengthLimit <= 0) || (textLength <= m_lineLengthLimit)) { |
| 198 | return false; |
| 199 | } |
| 200 | |
| 201 | // remember stick error |
| 202 | tooLongLinesWrapped = true; |
| 203 | longestLineLoaded = std::max(a: longestLineLoaded, b: textLength); |
| 204 | |
| 205 | // search for place to wrap |
| 206 | int spacePosition = m_lineLengthLimit - 1; |
| 207 | for (int testPosition = m_lineLengthLimit - 1; (testPosition >= 0) && (testPosition >= (m_lineLengthLimit - (m_lineLengthLimit / 10))); |
| 208 | --testPosition) { |
| 209 | // wrap place found? |
| 210 | if (m_text[lineStart + testPosition].isSpace() || m_text[lineStart + testPosition].isPunct()) { |
| 211 | spacePosition = testPosition; |
| 212 | break; |
| 213 | } |
| 214 | } |
| 215 | |
| 216 | m_lastWasEndOfLine = false; |
| 217 | m_lastWasR = false; |
| 218 | |
| 219 | // line data |
| 220 | offset = lineStart; |
| 221 | length = spacePosition + 1; |
| 222 | |
| 223 | m_lastLineStart = m_position = (lineStart + length); |
| 224 | return true; |
| 225 | }; |
| 226 | |
| 227 | /** |
| 228 | * reading loop |
| 229 | */ |
| 230 | while (m_position <= m_text.length()) { |
| 231 | // handle too long lines early even if we not yet have seen the end |
| 232 | if (m_alreadyScanned > m_lastLineStart && lineLimitHandler(m_lastLineStart, m_alreadyScanned - m_lastLineStart)) { |
| 233 | return !encodingError; |
| 234 | } |
| 235 | |
| 236 | if (m_position == m_text.length()) { |
| 237 | // try to load more text if something is around |
| 238 | if (!m_eof) { |
| 239 | // kill the old lines... |
| 240 | m_text.remove(i: 0, len: m_lastLineStart); |
| 241 | |
| 242 | // try to read new data |
| 243 | const int c = m_file->read(data: m_buffer.data(), maxlen: m_buffer.size()); |
| 244 | |
| 245 | // if any text is there, append it.... |
| 246 | if (c > 0) { |
| 247 | // update hash sum |
| 248 | m_digest.addData(data: QByteArrayView(m_buffer.data(), c)); |
| 249 | |
| 250 | // detect byte order marks & codec for byte order marks on first read |
| 251 | if (m_firstRead) { |
| 252 | // if no codec given, do autodetection |
| 253 | if (!m_converterState.isValid()) { |
| 254 | // use KEncodingProber first, QStringDecoder::decoderForHtml does fallback to UTF-8 |
| 255 | KEncodingProber prober(m_proberType); |
| 256 | prober.feed(data: QByteArrayView(m_buffer.data(), c)); |
| 257 | |
| 258 | // we found a codec with some confidence? |
| 259 | if (const QStringDecoder decoder(prober.encoding().constData()); decoder.isValid() && (prober.confidence() > 0.5)) { |
| 260 | m_converterState = QStringDecoder(prober.encoding().constData()); |
| 261 | } else { |
| 262 | // try to get HTML encoding, will default to UTF-8 |
| 263 | // see https://doc.qt.io/qt-6/qstringdecoder.html#decoderForHtml |
| 264 | m_converterState = QStringDecoder::decoderForHtml(data: m_buffer); |
| 265 | } |
| 266 | |
| 267 | // no codec, no chance, encoding error, else remember the codec name |
| 268 | if (!m_converterState.isValid()) { |
| 269 | return false; |
| 270 | } |
| 271 | } |
| 272 | |
| 273 | // we want to convert the bom for later detection |
| 274 | m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom); |
| 275 | |
| 276 | // remember name, might have changed |
| 277 | m_codec = QString::fromUtf8(utf8: m_converterState.name()); |
| 278 | } |
| 279 | |
| 280 | // detect broken encoding |
| 281 | Q_ASSERT(m_converterState.isValid()); |
| 282 | const QString unicode = m_converterState.decode(ba: QByteArrayView(m_buffer.data(), c)); |
| 283 | encodingError = encodingError || m_converterState.hasError(); |
| 284 | |
| 285 | // check and remove bom |
| 286 | if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark || unicode.front() == QChar::ByteOrderSwapped)) { |
| 287 | m_bomFound = true; |
| 288 | m_text.append(v: QStringView(unicode).last(n: unicode.size() - 1)); |
| 289 | |
| 290 | // swapped BOM is encoding error |
| 291 | encodingError = encodingError || unicode.front() == QChar::ByteOrderSwapped; |
| 292 | } else { |
| 293 | m_text.append(s: unicode); |
| 294 | } |
| 295 | m_firstRead = false; |
| 296 | } |
| 297 | |
| 298 | // is file completely read ? |
| 299 | m_eof = (c == -1) || (c == 0); |
| 300 | |
| 301 | // recalc current pos and last pos |
| 302 | m_position -= m_lastLineStart; |
| 303 | m_alreadyScanned = m_position - 1; |
| 304 | m_lastLineStart = 0; |
| 305 | } |
| 306 | |
| 307 | // oh oh, end of file, escape ! |
| 308 | if (m_eof && (m_position == m_text.length())) { |
| 309 | m_lastWasEndOfLine = false; |
| 310 | |
| 311 | // line data |
| 312 | offset = m_lastLineStart; |
| 313 | length = m_position - m_lastLineStart; |
| 314 | |
| 315 | m_lastLineStart = m_position; |
| 316 | |
| 317 | lineLimitHandler(offset, length); |
| 318 | return !encodingError && !failedToConvertOnce; |
| 319 | } |
| 320 | |
| 321 | // empty? try again |
| 322 | if (m_position == m_text.length()) { |
| 323 | if (!bomPreviouslyFound && m_bomFound) { |
| 324 | // BOM was processed above, so we didn't fail to convert |
| 325 | bomPreviouslyFound = true; |
| 326 | } else { |
| 327 | failedToConvertOnce = true; |
| 328 | } |
| 329 | continue; |
| 330 | } |
| 331 | } |
| 332 | |
| 333 | for (; m_position < m_text.length(); m_position++) { |
| 334 | m_alreadyScanned = m_position; |
| 335 | QChar current_char = m_text.at(i: m_position); |
| 336 | if (current_char == lf) { |
| 337 | m_lastWasEndOfLine = true; |
| 338 | |
| 339 | if (m_lastWasR) { |
| 340 | m_lastLineStart++; |
| 341 | m_lastWasR = false; |
| 342 | m_eol = TextBuffer::eolDos; |
| 343 | } else { |
| 344 | // line data |
| 345 | offset = m_lastLineStart; |
| 346 | length = m_position - m_lastLineStart; |
| 347 | |
| 348 | m_lastLineStart = m_position + 1; |
| 349 | m_position++; |
| 350 | |
| 351 | // only win, if not dos! |
| 352 | if (m_eol != TextBuffer::eolDos) { |
| 353 | m_eol = TextBuffer::eolUnix; |
| 354 | } |
| 355 | |
| 356 | lineLimitHandler(offset, length); |
| 357 | return !encodingError; |
| 358 | } |
| 359 | } else if (current_char == cr) { |
| 360 | m_lastWasEndOfLine = true; |
| 361 | m_lastWasR = true; |
| 362 | |
| 363 | // line data |
| 364 | offset = m_lastLineStart; |
| 365 | length = m_position - m_lastLineStart; |
| 366 | |
| 367 | m_lastLineStart = m_position + 1; |
| 368 | m_position++; |
| 369 | |
| 370 | // should only win of first time! |
| 371 | if (m_eol == TextBuffer::eolUnknown) { |
| 372 | m_eol = TextBuffer::eolMac; |
| 373 | } |
| 374 | |
| 375 | lineLimitHandler(offset, length); |
| 376 | return !encodingError; |
| 377 | } else if (current_char == QChar::LineSeparator) { |
| 378 | m_lastWasEndOfLine = true; |
| 379 | |
| 380 | // line data |
| 381 | offset = m_lastLineStart; |
| 382 | length = m_position - m_lastLineStart; |
| 383 | |
| 384 | m_lastLineStart = m_position + 1; |
| 385 | m_position++; |
| 386 | |
| 387 | lineLimitHandler(offset, length); |
| 388 | return !encodingError; |
| 389 | } else { |
| 390 | m_lastWasEndOfLine = false; |
| 391 | m_lastWasR = false; |
| 392 | } |
| 393 | } |
| 394 | } |
| 395 | |
| 396 | return !encodingError; |
| 397 | } |
| 398 | |
| 399 | QByteArray digest() |
| 400 | { |
| 401 | return m_digest.result(); |
| 402 | } |
| 403 | |
| 404 | private: |
| 405 | QString m_codec; |
| 406 | bool m_eof; |
| 407 | bool m_lastWasEndOfLine; |
| 408 | bool m_lastWasR; |
| 409 | int m_position; |
| 410 | int m_lastLineStart; |
| 411 | int m_alreadyScanned = -1; |
| 412 | TextBuffer::EndOfLineMode m_eol; |
| 413 | QString m_mimeType; |
| 414 | QIODevice *m_file; |
| 415 | QByteArray m_buffer; |
| 416 | QCryptographicHash m_digest; |
| 417 | QString m_text; |
| 418 | QStringDecoder m_converterState; |
| 419 | bool m_bomFound; |
| 420 | bool m_firstRead; |
| 421 | KEncodingProber::ProberType m_proberType; |
| 422 | quint64 m_fileSize; |
| 423 | const int m_lineLengthLimit; |
| 424 | }; |
| 425 | |
| 426 | } |
| 427 | |
| 428 | #endif |
| 429 | |