1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3// Qt-Security score:critical reason:data-parser
4
5#include <QtCore/qtextboundaryfinder.h>
6#include <QtCore/qvarlengtharray.h>
7
8#include <private/qunicodetools_p.h>
9
10QT_BEGIN_NAMESPACE
11
12static void init(QTextBoundaryFinder::BoundaryType type, QStringView str, QCharAttributes *attributes)
13{
14 QUnicodeTools::ScriptItemArray scriptItems;
15 QUnicodeTools::initScripts(str, scripts: &scriptItems);
16
17 QUnicodeTools::CharAttributeOptions options;
18 switch (type) {
19 case QTextBoundaryFinder::Grapheme: options |= QUnicodeTools::GraphemeBreaks; break;
20 case QTextBoundaryFinder::Word: options |= QUnicodeTools::WordBreaks; break;
21 case QTextBoundaryFinder::Sentence: options |= QUnicodeTools::SentenceBreaks; break;
22 case QTextBoundaryFinder::Line: options |= QUnicodeTools::LineBreaks; break;
23 default: break;
24 }
25 QUnicodeTools::initCharAttributes(str, items: scriptItems.data(), numItems: scriptItems.size(), attributes, options);
26}
27
28/*!
29 \class QTextBoundaryFinder
30 \inmodule QtCore
31
32 \brief The QTextBoundaryFinder class provides a way of finding Unicode text boundaries in a string.
33
34 \since 4.4
35 \ingroup tools
36 \ingroup shared
37 \ingroup string-processing
38 \reentrant
39
    QTextBoundaryFinder finds Unicode text boundaries in a string,
    according to the Unicode text boundary specifications (see
    \l{https://www.unicode.org/reports/tr14/}{Unicode Standard Annex #14} and
    \l{https://www.unicode.org/reports/tr29/}{Unicode Standard Annex #29}).
44
45 QTextBoundaryFinder can operate on a QString in four possible
46 modes depending on the value of \a BoundaryType.
47
    Units of Unicode characters that make up what the user thinks of
    as a character or basic unit of the language are here called
    grapheme clusters. For example, the two Unicode code points 'A' +
    diaeresis form a single grapheme cluster: the user thinks of them
    as one character, even though it is represented by two Unicode
    code points
    (see \l{https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}).
55
56 Word boundaries are there to locate the start and end of what a
57 language considers to be a word
58 (see \l{https://www.unicode.org/reports/tr29/#Word_Boundaries}).
59
60 Line break boundaries give possible places where a line break
61 might happen and sentence boundaries will show the beginning and
62 end of whole sentences
63 (see \l{https://www.unicode.org/reports/tr29/#Sentence_Boundaries} and
64 \l{https://www.unicode.org/reports/tr14/}).
65
66 The first position in a string is always a valid boundary and
67 refers to the position before the first character. The last
68 position at the length of the string is also valid and refers
69 to the position after the last character.
70*/
71
72/*!
73 \enum QTextBoundaryFinder::BoundaryType
74
    \value Grapheme Finds a grapheme, which is the smallest boundary. It
           includes letters, punctuation marks, numerals and more.
77 \value Word Finds a word.
78 \value Line Finds possible positions for breaking the text into multiple
79 lines.
80 \value Sentence Finds sentence boundaries. These include periods, question
81 marks etc.
82*/
83
84/*!
85 \enum QTextBoundaryFinder::BoundaryReason
86
87 \value NotAtBoundary The boundary finder is not at a boundary position.
88 \value BreakOpportunity The boundary finder is at a break opportunity position.
89 Such a break opportunity might also be an item boundary
90 (either StartOfItem, EndOfItem, or combination of both),
91 a mandatory line break, or a soft hyphen.
92 \value [since 5.0] StartOfItem The boundary finder is at the start of
93 a grapheme, a word, a sentence, or a line.
94 \value [since 5.0] EndOfItem The boundary finder is at the end of
95 a grapheme, a word, a sentence, or a line.
96 \value [since 5.0] MandatoryBreak The boundary finder is at the end of line
97 (can occur for a Line boundary type only).
98 \value SoftHyphen The boundary finder is at the soft hyphen
99 (can occur for a Line boundary type only).
100*/
101
102/*!
103 Constructs an invalid QTextBoundaryFinder object.
104*/
105QTextBoundaryFinder::QTextBoundaryFinder()
106 : freeBuffer(true)
107 , unused{0}
108{
109}
110
111/*!
112 Copies the QTextBoundaryFinder object, \a other.
113*/
114QTextBoundaryFinder::QTextBoundaryFinder(const QTextBoundaryFinder &other)
115 : t(other.t)
116 , s(other.s)
117 , sv(other.sv)
118 , pos(other.pos)
119 , freeBuffer(true)
120 , unused{0}
121{
122 if (other.attributes) {
123 Q_ASSERT(sv.size() > 0);
124 attributes = (QCharAttributes *) malloc(size: (sv.size() + 1) * sizeof(QCharAttributes));
125 Q_CHECK_PTR(attributes);
126 memcpy(dest: attributes, src: other.attributes, n: (sv.size() + 1) * sizeof(QCharAttributes));
127 }
128}
129
130/*!
131 Assigns the object, \a other, to another QTextBoundaryFinder object.
132*/
133QTextBoundaryFinder &QTextBoundaryFinder::operator=(const QTextBoundaryFinder &other)
134{
135 if (&other == this)
136 return *this;
137
138 if (other.attributes) {
139 Q_ASSERT(other.sv.size() > 0);
140 size_t newCapacity = (size_t(other.sv.size()) + 1) * sizeof(QCharAttributes);
141 QCharAttributes *newD = (QCharAttributes *) realloc(ptr: freeBuffer ? attributes : nullptr, size: newCapacity);
142 Q_CHECK_PTR(newD);
143 freeBuffer = true;
144 attributes = newD;
145 }
146
147 t = other.t;
148 s = other.s;
149 sv = other.sv;
150 pos = other.pos;
151
152 if (other.attributes) {
153 memcpy(dest: attributes, src: other.attributes, n: (sv.size() + 1) * sizeof(QCharAttributes));
154 } else {
155 if (freeBuffer)
156 free(ptr: attributes);
157 attributes = nullptr;
158 }
159
160 return *this;
161}
162
163/*!
164 Destructs the QTextBoundaryFinder object.
165*/
166QTextBoundaryFinder::~QTextBoundaryFinder()
167{
168 Q_UNUSED(unused);
169 if (freeBuffer)
170 free(ptr: attributes);
171}
172
173/*!
174 Creates a QTextBoundaryFinder object of \a type operating on \a string.
175*/
176QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &string)
177 : t(type)
178 , s(string)
179 , sv(s)
180 , freeBuffer(true)
181 , unused{0}
182{
183 if (sv.size() > 0) {
184 attributes = (QCharAttributes *) malloc(size: (sv.size() + 1) * sizeof(QCharAttributes));
185 Q_CHECK_PTR(attributes);
186 init(type: t, str: sv, attributes);
187 }
188}
189
190/*!
191 \fn QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QChar *chars, qsizetype length, unsigned char *buffer, qsizetype bufferSize)
192 \overload
193
194 The same as QTextBoundaryFinder(type, QStringView(chars, length), buffer, bufferSize).
195*/
196
197/*!
198 Creates a QTextBoundaryFinder object of \a type operating on \a string.
199 \since 6.0
200
201 \a buffer is an optional working buffer of size \a bufferSize you can pass to
202 the QTextBoundaryFinder. If the buffer is large enough to hold the working
203 data required (bufferSize >= length + 1), it will use this
204 instead of allocating its own buffer.
205
206 \warning QTextBoundaryFinder does not create a copy of \a string. It is the
207 application programmer's responsibility to ensure the array is allocated for
208 as long as the QTextBoundaryFinder object stays alive. The same applies to
209 \a buffer.
210*/
211QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, QStringView string, unsigned char *buffer, qsizetype bufferSize)
212 : t(type)
213 , sv(string)
214 , freeBuffer(true)
215 , unused{0}
216{
217 if (!sv.isEmpty()) {
218 if (buffer && bufferSize / int(sizeof(QCharAttributes)) >= sv.size() + 1) {
219 attributes = reinterpret_cast<QCharAttributes *>(buffer);
220 freeBuffer = false;
221 } else {
222 attributes = (QCharAttributes *) malloc(size: (sv.size() + 1) * sizeof(QCharAttributes));
223 Q_CHECK_PTR(attributes);
224 }
225 init(type: t, str: sv, attributes);
226 }
227}
228
229/*!
230 Moves the finder to the start of the string. This is equivalent to setPosition(0).
231
232 \sa setPosition(), position()
233*/
234void QTextBoundaryFinder::toStart()
235{
236 pos = 0;
237}
238
239/*!
240 Moves the finder to the end of the string. This is equivalent to setPosition(string.length()).
241
242 \sa setPosition(), position()
243*/
244void QTextBoundaryFinder::toEnd()
245{
246 pos = sv.size();
247}
248
249/*!
250 Returns the current position of the QTextBoundaryFinder.
251
252 The range is from 0 (the beginning of the string) to the length of
253 the string inclusive.
254
255 \sa setPosition()
256*/
257qsizetype QTextBoundaryFinder::position() const
258{
259 return pos;
260}
261
262/*!
263 Sets the current position of the QTextBoundaryFinder to \a position.
264
265 If \a position is out of bounds, it will be bound to only valid
266 positions. In this case, valid positions are from 0 to the length of
267 the string inclusive.
268
269 \sa position()
270*/
271void QTextBoundaryFinder::setPosition(qsizetype position)
272{
273 pos = qBound(min: 0, val: position, max: sv.size());
274}
275
276/*! \fn QTextBoundaryFinder::BoundaryType QTextBoundaryFinder::type() const
277
278 Returns the type of the QTextBoundaryFinder.
279*/
280
281/*! \fn bool QTextBoundaryFinder::isValid() const
282
283 Returns \c true if the text boundary finder is valid; otherwise returns \c false.
284 A default QTextBoundaryFinder is invalid.
285*/
286
287/*!
288 Returns the string the QTextBoundaryFinder object operates on.
289*/
290QString QTextBoundaryFinder::string() const
291{
292 if (sv.data() == s.unicode() && sv.size() == s.size())
293 return s;
294 return sv.toString();
295}
296
297
298/*!
299 Moves the QTextBoundaryFinder to the next boundary position and returns that position.
300
301 Returns -1 if there is no next boundary.
302*/
303qsizetype QTextBoundaryFinder::toNextBoundary()
304{
305 if (!attributes || pos < 0 || pos >= sv.size()) {
306 pos = -1;
307 return pos;
308 }
309
310 ++pos;
311 switch(t) {
312 case Grapheme:
313 while (pos < sv.size() && !attributes[pos].graphemeBoundary)
314 ++pos;
315 break;
316 case Word:
317 while (pos < sv.size() && !attributes[pos].wordBreak)
318 ++pos;
319 break;
320 case Sentence:
321 while (pos < sv.size() && !attributes[pos].sentenceBoundary)
322 ++pos;
323 break;
324 case Line:
325 while (pos < sv.size() && !attributes[pos].lineBreak)
326 ++pos;
327 break;
328 }
329
330 return pos;
331}
332
333/*!
334 Moves the QTextBoundaryFinder to the previous boundary position and returns that position.
335
336 Returns -1 if there is no previous boundary.
337*/
338qsizetype QTextBoundaryFinder::toPreviousBoundary()
339{
340 if (!attributes || pos <= 0 || pos > sv.size()) {
341 pos = -1;
342 return pos;
343 }
344
345 --pos;
346 switch(t) {
347 case Grapheme:
348 while (pos > 0 && !attributes[pos].graphemeBoundary)
349 --pos;
350 break;
351 case Word:
352 while (pos > 0 && !attributes[pos].wordBreak)
353 --pos;
354 break;
355 case Sentence:
356 while (pos > 0 && !attributes[pos].sentenceBoundary)
357 --pos;
358 break;
359 case Line:
360 while (pos > 0 && !attributes[pos].lineBreak)
361 --pos;
362 break;
363 }
364
365 return pos;
366}
367
368/*!
369 Returns \c true if the object's position() is currently at a valid text boundary.
370*/
371bool QTextBoundaryFinder::isAtBoundary() const
372{
373 if (!attributes || pos < 0 || pos > sv.size())
374 return false;
375
376 switch(t) {
377 case Grapheme:
378 return attributes[pos].graphemeBoundary;
379 case Word:
380 return attributes[pos].wordBreak;
381 case Sentence:
382 return attributes[pos].sentenceBoundary;
383 case Line:
384 // ### TR#14 LB2 prohibits break at sot
385 return attributes[pos].lineBreak || pos == 0;
386 }
387 return false;
388}
389
390/*!
391 Returns the reasons for the boundary finder to have chosen the current position as a boundary.
392*/
393QTextBoundaryFinder::BoundaryReasons QTextBoundaryFinder::boundaryReasons() const
394{
395 BoundaryReasons reasons = NotAtBoundary;
396 if (!attributes || pos < 0 || pos > sv.size())
397 return reasons;
398
399 const QCharAttributes attr = attributes[pos];
400 switch (t) {
401 case Grapheme:
402 if (attr.graphemeBoundary) {
403 reasons |= BreakOpportunity | StartOfItem | EndOfItem;
404 if (pos == 0)
405 reasons &= (~EndOfItem);
406 else if (pos == sv.size())
407 reasons &= (~StartOfItem);
408 }
409 break;
410 case Word:
411 if (attr.wordBreak) {
412 reasons |= BreakOpportunity;
413 if (attr.wordStart)
414 reasons |= StartOfItem;
415 if (attr.wordEnd)
416 reasons |= EndOfItem;
417 }
418 break;
419 case Sentence:
420 if (attr.sentenceBoundary) {
421 reasons |= BreakOpportunity | StartOfItem | EndOfItem;
422 if (pos == 0)
423 reasons &= (~EndOfItem);
424 else if (pos == sv.size())
425 reasons &= (~StartOfItem);
426 }
427 break;
428 case Line:
429 // ### TR#14 LB2 prohibits break at sot
430 if (attr.lineBreak || pos == 0) {
431 reasons |= BreakOpportunity;
432 if (attr.mandatoryBreak || pos == 0) {
433 reasons |= MandatoryBreak | StartOfItem | EndOfItem;
434 if (pos == 0)
435 reasons &= (~EndOfItem);
436 else if (pos == sv.size())
437 reasons &= (~StartOfItem);
438 } else if (pos > 0 && sv[pos - 1].unicode() == QChar::SoftHyphen) {
439 reasons |= SoftHyphen;
440 }
441 }
442 break;
443 default:
444 break;
445 }
446
447 return reasons;
448}
449
450QT_END_NAMESPACE
451

source code of qtbase/src/corelib/text/qtextboundaryfinder.cpp