katetextloader.h source code [ktexteditor/src/buffer/katetextloader.h]

1	/*
2	SPDX-FileCopyrightText: 2010 Christoph Cullmann <cullmann@kde.org>
3
4	SPDX-License-Identifier: LGPL-2.0-or-later
5	*/
6
7	#ifndef KATE_TEXTLOADER_H
8	#define KATE_TEXTLOADER_H
9
#include <QCryptographicHash>
#include <QFile>
#include <QMimeDatabase>
#include <QString>
#include <QStringDecoder>

#include <KCompressionDevice>
#include <KEncodingProber>

#include <algorithm>
#include <memory>

#include "katetextbuffer.h"
20
21	namespace Kate
22	{
23	/**
24	* loader block size, load 256 kb at once per default
25	* if file size is smaller, fall back to file size
26	* must be a multiple of 2
27	*/
28	static const qint64 KATE_FILE_LOADER_BS = `256` * `1024`;
29
30	/**
31	* File Loader, will handle reading of files + detecting encoding
32	*/
33	class TextLoader
34	{
35	public:
36	/**
37	* Construct file loader for given file.
38	* @param filename file to open
39	* @param proberType prober type
40	* @param lineLengthLimit limit for lines to load, else we break them up in smaller ones
41	*/
42	TextLoader(const QString &filename, KEncodingProber::ProberType proberType, int lineLengthLimit)
43	: m_eof(false) // default to not eof
44	, m_lastWasEndOfLine(true) // at start of file, we had a virtual newline
45	, m_lastWasR(false) // we have not found a \r as last char
46	, m_position(`0`)
47	, m_lastLineStart(`0`)
48	, m_eol(TextBuffer::eolUnknown) // no eol type detected atm
49	, m_buffer(KATE_FILE_LOADER_BS, `0`)
50	, m_digest(QCryptographicHash::Sha1)
51	, m_bomFound(false)
52	, m_firstRead(true)
53	, m_proberType(proberType)
54	, m_fileSize(`0`)
55	, m_lineLengthLimit(lineLengthLimit)
56	{
57	// try to get mimetype for on the fly decompression, don't rely on filename!
58	QFile testMime(filename);
59	if (testMime.open(flags: QIODevice::ReadOnly)) {
60	m_fileSize = testMime.size();
61	}
62	m_mimeType = QMimeDatabase().mimeTypeForFileNameAndData(fileName: filename, device: &testMime).name();
63
64	// construct filter device
65	KCompressionDevice::CompressionType compressionType = KCompressionDevice::compressionTypeForMimeType(mimetype: m_mimeType);
66	m_file = new KCompressionDevice(filename, compressionType);
67	}
68
69	/**
70	* Destructor
71	*/
72	~TextLoader()
73	{
74	delete m_file;
75	}
76
77	/**
78	* open file with given codec
79	* @param codec codec to use, if 0, will do some auto-detect or fallback
80	* @return success
81	*/
82	bool open(const QString &codec)
83	{
84	m_codec = codec;
85	m_eof = false;
86	m_lastWasEndOfLine = true;
87	m_lastWasR = false;
88	m_position = `0`;
89	m_lastLineStart = `0`;
90	m_eol = TextBuffer::eolUnknown;
91	m_text.clear();
92	m_converterState = m_codec.isEmpty() ? QStringDecoder() : QStringDecoder(m_codec.toUtf8().constData());
93	m_bomFound = false;
94	m_firstRead = true;
95
96	// init the hash with the git header
97	const QString header = QStringLiteral("blob %1").arg(a: m_fileSize);
98	m_digest.reset();
99	m_digest.addData(data: QByteArray(header.toLatin1() + `'\0'`));
100
101	// if already opened, close the file...
102	if (m_file->isOpen()) {
103	m_file->close();
104	}
105
106	return m_file->open(mode: QIODevice::ReadOnly);
107	}
108
109	/**
110	* end of file reached?
111	* @return end of file reached
112	*/
113	bool eof() const
114	{
115	return m_eof && !m_lastWasEndOfLine && (m_lastLineStart == m_text.length());
116	}
117
118	/**
119	* Detected end of line mode for this file.
120	* Detected during reading, is valid after complete file is read.
121	* @return eol mode of this file
122	*/
123	TextBuffer::EndOfLineMode eol() const
124	{
125	return m_eol;
126	}
127
128	/**
129	* BOM found?
130	* @return byte order mark found?
131	*/
132	bool byteOrderMarkFound() const
133	{
134	return m_bomFound;
135	}
136
137	/**
138	* mime type used to create filter dev
139	* @return mime-type of filter device
140	*/
141	const QString &mimeTypeForFilterDev() const
142	{
143	return m_mimeType;
144	}
145
146	/**
147	* internal Unicode data array
148	* @return internal Unicode data
149	*/
150	const QChar unicode() const*
151	{
152	return m_text.unicode();
153	}
154
155	/**
156	* Get codec for this loader
157	* @return currently in use codec of this loader
158	*/
159	QString textCodec() const
160	{
161	return m_codec;
162	}
163
164	/**
165	* read a line, return length + offset in Unicode data
166	* @param offset offset into internal Unicode data for read line
167	* @param length length of read line
168	* @param tooLongLinesWrapped was a too long line seen?
169	* @param longestLineLoaded length of the longest line that hit the limit
170	* @return true if no encoding errors occurred
171	*/
172	bool readLine(int &offset, int &length, bool &tooLongLinesWrapped, int &longestLineLoaded)
173	{
174	length = `0`;
175	offset = `0`;
176	bool encodingError = false;
177
178	static const QLatin1Char cr(QLatin1Char(`'\r'`));
179	static const QLatin1Char lf(QLatin1Char(`'\n'`));
180
181	/**
182	* did we read two time but got no stuff? encoding error
183	* fixes problem with one character latin-1 files, which lead to crash otherwise!
184	* bug 272579
185	*/
186	bool failedToConvertOnce = false;
187
188	/**
189	* keep track if we have found BOM so that failedToConvertOnce is not erroneously set to true
190	* BUG: 440359
191	*/
192	bool bomPreviouslyFound = m_bomFound;
193
194	// honor the line length limit early
195	const auto lineLimitHandler = [this, &offset, &length, &tooLongLinesWrapped, &longestLineLoaded](int lineStart, int textLength) {
196	if ((m_lineLengthLimit <= `0`) \|\| (textLength <= m_lineLengthLimit)) {
197	return false;
198	}
199
200	// remember stick error
201	tooLongLinesWrapped = true;
202	longestLineLoaded = std::max(a: longestLineLoaded, b: textLength);
203
204	// search for place to wrap
205	int spacePosition = m_lineLengthLimit - `1`;
206	for (int testPosition = m_lineLengthLimit - `1`; (testPosition >= `0`) && (testPosition >= (m_lineLengthLimit - (m_lineLengthLimit / `10`)));
207	--testPosition) {
208	// wrap place found?
209	if (m_text[lineStart + testPosition].isSpace() \|\| m_text[lineStart + testPosition].isPunct()) {
210	spacePosition = testPosition;
211	break;
212	}
213	}
214
215	m_lastWasEndOfLine = false;
216	m_lastWasR = false;
217
218	// line data
219	offset = lineStart;
220	length = spacePosition + `1`;
221
222	m_lastLineStart = m_position = (lineStart + length);
223	return true;
224	};
225
226	/**
227	* reading loop
228	*/
229	while (m_position <= m_text.length()) {
230	// handle too long lines early even if we not yet have seen the end
231	if (m_alreadyScanned > m_lastLineStart && lineLimitHandler(m_lastLineStart, m_alreadyScanned - m_lastLineStart)) {
232	return !encodingError;
233	}
234
235	if (m_position == m_text.length()) {
236	// try to load more text if something is around
237	if (!m_eof) {
238	// kill the old lines...
239	m_text.remove(i: `0`, len: m_lastLineStart);
240
241	// try to read new data
242	const int c = m_file->read(data: m_buffer.data(), maxlen: m_buffer.size());
243
244	// if any text is there, append it....
245	if (c > `0`) {
246	// update hash sum
247	m_digest.addData(data: QByteArrayView(m_buffer.data(), c));
248
249	// detect byte order marks & codec for byte order marks on first read
250	if (m_firstRead) {
251	/**
252	* if no codec given, do autodetection
253	*/
254	if (!m_converterState.isValid()) {
255	/**
256	* first: try to get HTML header encoding, includes BOM handling
257	*/
258	m_converterState = QStringDecoder::decoderForHtml(data: m_buffer);
259
260	/**
261	* else: use KEncodingProber
262	*/
263	if (!m_converterState.isValid()) {
264	KEncodingProber prober(m_proberType);
265	prober.feed(data: m_buffer.constData(), len: c);
266
267	// we found codec with some confidence?
268	if (prober.confidence() > `0.5`) {
269	m_converterState = QStringDecoder(prober.encoding().constData());
270	}
271	}
272
273	// no codec, no chance, encoding error, else remember the codec name
274	if (!m_converterState.isValid()) {
275	return false;
276	}
277	}
278
279	// we want to convert the bom for later detection
280	m_converterState = QStringDecoder(m_converterState.name(), QStringConverter::Flag::ConvertInitialBom);
281
282	// remember name, might have changed
283	m_codec = QString::fromUtf8(utf8: m_converterState.name());
284	}
285
286	// detect broken encoding
287	Q_ASSERT(m_converterState.isValid());
288	const QString unicode = m_converterState.decode(ba: QByteArrayView(m_buffer.data(), c));
289	encodingError = encodingError \|\| m_converterState.hasError();
290
291	// check and remove bom
292	if (m_firstRead && !unicode.isEmpty() && (unicode.front() == QChar::ByteOrderMark \|\| unicode.front() == QChar::ByteOrderSwapped)) {
293	m_bomFound = true;
294	m_text.append(v: QStringView(unicode).last(n: unicode.size() - `1`));
295
296	// swapped BOM is encoding error
297	encodingError = encodingError \|\| unicode.front() == QChar::ByteOrderSwapped;
298	} else {
299	m_text.append(s: unicode);
300	}
301	m_firstRead = false;
302	}
303
304	// is file completely read ?
305	m_eof = (c == -`1`) \|\| (c == `0`);
306
307	// recalc current pos and last pos
308	m_position -= m_lastLineStart;
309	m_alreadyScanned = m_position - `1`;
310	m_lastLineStart = `0`;
311	}
312
313	// oh oh, end of file, escape !
314	if (m_eof && (m_position == m_text.length())) {
315	m_lastWasEndOfLine = false;
316
317	// line data
318	offset = m_lastLineStart;
319	length = m_position - m_lastLineStart;
320
321	m_lastLineStart = m_position;
322
323	lineLimitHandler(offset, length);
324	return !encodingError && !failedToConvertOnce;
325	}
326
327	// empty? try again
328	if (m_position == m_text.length()) {
329	if (!bomPreviouslyFound && m_bomFound) {
330	// BOM was processed above, so we didn't fail to convert
331	bomPreviouslyFound = true;
332	} else {
333	failedToConvertOnce = true;
334	}
335	continue;
336	}
337	}
338
339	for (; m_position < m_text.length(); m_position++) {
340	m_alreadyScanned = m_position;
341	QChar current_char = m_text.at(i: m_position);
342	if (current_char == lf) {
343	m_lastWasEndOfLine = true;
344
345	if (m_lastWasR) {
346	m_lastLineStart++;
347	m_lastWasR = false;
348	m_eol = TextBuffer::eolDos;
349	} else {
350	// line data
351	offset = m_lastLineStart;
352	length = m_position - m_lastLineStart;
353
354	m_lastLineStart = m_position + `1`;
355	m_position++;
356
357	// only win, if not dos!
358	if (m_eol != TextBuffer::eolDos) {
359	m_eol = TextBuffer::eolUnix;
360	}
361
362	lineLimitHandler(offset, length);
363	return !encodingError;
364	}
365	} else if (current_char == cr) {
366	m_lastWasEndOfLine = true;
367	m_lastWasR = true;
368
369	// line data
370	offset = m_lastLineStart;
371	length = m_position - m_lastLineStart;
372
373	m_lastLineStart = m_position + `1`;
374	m_position++;
375
376	// should only win of first time!
377	if (m_eol == TextBuffer::eolUnknown) {
378	m_eol = TextBuffer::eolMac;
379	}
380
381	lineLimitHandler(offset, length);
382	return !encodingError;
383	} else if (current_char == QChar::LineSeparator) {
384	m_lastWasEndOfLine = true;
385
386	// line data
387	offset = m_lastLineStart;
388	length = m_position - m_lastLineStart;
389
390	m_lastLineStart = m_position + `1`;
391	m_position++;
392
393	lineLimitHandler(offset, length);
394	return !encodingError;
395	} else {
396	m_lastWasEndOfLine = false;
397	m_lastWasR = false;
398	}
399	}
400	}
401
402	return !encodingError;
403	}
404
405	QByteArray digest()
406	{
407	return m_digest.result();
408	}
409
410	private:
411	QString m_codec;
412	bool m_eof;
413	bool m_lastWasEndOfLine;
414	bool m_lastWasR;
415	int m_position;
416	int m_lastLineStart;
417	int m_alreadyScanned = -`1`;
418	TextBuffer::EndOfLineMode m_eol;
419	QString m_mimeType;
420	QIODevice *m_file;
421	QByteArray m_buffer;
422	QCryptographicHash m_digest;
423	QString m_text;
424	QStringDecoder m_converterState;
425	bool m_bomFound;
426	bool m_firstRead;
427	KEncodingProber::ProberType m_proberType;
428	quint64 m_fileSize;
429	const int m_lineLengthLimit;
430	};
431
432	}
433
434	#endif
435

source code of ktexteditor/src/buffer/katetextloader.h