1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qunicodetools_p.h"
5
6#include "qunicodetables_p.h"
7#include "qvarlengtharray.h"
8#if QT_CONFIG(library)
9#include "qlibrary.h"
10#endif
11
12#include <limits.h>
13
// Expands to an int bit mask in which only bit number x is set.
// Used below to build the per-class masks of the grapheme break table.
#define FLAG(x) (1 << (x))
15
16QT_BEGIN_NAMESPACE
17
18using namespace Qt::StringLiterals;
19
// When non-zero, getWordBreaks() overrides the table-provided word break class
// of FULL STOP (-> MidNumLet) and COLON (-> MidLetter).
// With QT_BUILD_INTERNAL the flag is a writable variable marked
// Q_AUTOTEST_EXPORT; in all other builds it is a constexpr 0.
#ifdef QT_BUILD_INTERNAL
Q_CONSTINIT Q_AUTOTEST_EXPORT
#else
constexpr
#endif
int qt_initcharattributes_default_algorithm_only = 0;
26
27namespace QUnicodeTools {
28
29// -----------------------------------------------------------------------------------------------------
30//
31// The text boundaries determination algorithm.
32// See https://www.unicode.org/reports/tr29/tr29-37.html
33//
34// -----------------------------------------------------------------------------------------------------
35
36namespace GB {
37
38// This table is indexed by the grapheme break classes of two
39// (adjacent) code points.
40// The class of the first code point selects an entry.
41// If the entry's bit at position second_cp_class is set
42// (in other words: if entry & (1u << second_cp_class) is non-zero)
43// then there is NO grapheme break between the two code points.
44
// One row of the break table: a bit set holding one bit per grapheme break
// class (bit n corresponds to the class with numeric value n).
using GBTableEntryType = quint16;

// Check that we have enough bits in the table (in case
// NumGraphemeBreakClasses grows too much).
static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
              "Internal error: increase the size in bits of GBTableEntryType");

// GB9, GB9a
// "Do not break before Extend, SpacingMark or ZWJ". This mask is shared by
// most rows of the table below.
static const GBTableEntryType Extend_SpacingMark_ZWJ =
        FLAG(QUnicodeTables::GraphemeBreak_Extend)
        | FLAG(QUnicodeTables::GraphemeBreak_SpacingMark)
        | FLAG(QUnicodeTables::GraphemeBreak_ZWJ);

// Empty mask: as far as the table is concerned, a break follows a code point
// of such a class no matter what comes next.
static const GBTableEntryType HardBreak = 0u;
59
// The pair table itself. Each row is selected by the class of the first code
// point (see the trailing comment on every row); a set bit means "no break
// before a second code point of that class". Rules that need more context
// than one pair are handled by the state machine in getGraphemeBreaks().
static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses] = {
    Extend_SpacingMark_ZWJ, // Any
    FLAG(QUnicodeTables::GraphemeBreak_LF), // CR
    HardBreak, // LF
    HardBreak, // Control
    Extend_SpacingMark_ZWJ, // Extend
    Extend_SpacingMark_ZWJ, // ZWJ
    Extend_SpacingMark_ZWJ, // RegionalIndicator
    (Extend_SpacingMark_ZWJ
     | FLAG(QUnicodeTables::GraphemeBreak_Any)
     | FLAG(QUnicodeTables::GraphemeBreak_Prepend)
     | FLAG(QUnicodeTables::GraphemeBreak_L)
     | FLAG(QUnicodeTables::GraphemeBreak_V)
     | FLAG(QUnicodeTables::GraphemeBreak_T)
     | FLAG(QUnicodeTables::GraphemeBreak_LV)
     | FLAG(QUnicodeTables::GraphemeBreak_LVT)
     | FLAG(QUnicodeTables::GraphemeBreak_RegionalIndicator)
     | FLAG(QUnicodeTables::GraphemeBreak_Extended_Pictographic)
    ), // Prepend
    Extend_SpacingMark_ZWJ, // SpacingMark
    (Extend_SpacingMark_ZWJ
     | FLAG(QUnicodeTables::GraphemeBreak_L)
     | FLAG(QUnicodeTables::GraphemeBreak_V)
     | FLAG(QUnicodeTables::GraphemeBreak_LV)
     | FLAG(QUnicodeTables::GraphemeBreak_LVT)
    ), // L
    (Extend_SpacingMark_ZWJ
     | FLAG(QUnicodeTables::GraphemeBreak_V)
     | FLAG(QUnicodeTables::GraphemeBreak_T)
    ), // V
    (Extend_SpacingMark_ZWJ
     | FLAG(QUnicodeTables::GraphemeBreak_T)
    ), // T
    (Extend_SpacingMark_ZWJ
     | FLAG(QUnicodeTables::GraphemeBreak_V)
     | FLAG(QUnicodeTables::GraphemeBreak_T)
    ), // LV
    (Extend_SpacingMark_ZWJ
     | FLAG(QUnicodeTables::GraphemeBreak_T)
    ), // LVT
    Extend_SpacingMark_ZWJ // Extended_Pictographic
};
102
103static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
104 QUnicodeTables::GraphemeBreakClass second)
105{
106 return (breakTable[first] & FLAG(second)) == 0;
107}
108
// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
// so we need to store some local state.
enum class State : uchar {
    Normal,            // no multi-code-point rule in progress
    GB11_ExtPicExt,    // saw an Extend after an Extended_Pictographic
    GB11_ExtPicExtZWJ, // saw a ZWJ after an Extended_Pictographic and zero or more Extend
    GB12_13_RI,        // saw a RegionalIndicator following a non-RegionalIndicator
};
117
118} // namespace GB
119
120static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
121{
122 QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
123 GB::State state = GB::State::Normal;
124 for (qsizetype i = 0; i != len; ++i) {
125 qsizetype pos = i;
126 char32_t ucs4 = string[i];
127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
128 ushort low = string[i + 1];
129 if (QChar::isLowSurrogate(ucs4: low)) {
130 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
131 ++i;
132 }
133 }
134
135 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
136 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
137
138 bool shouldBreak = GB::shouldBreakBetweenClasses(first: lcls, second: cls);
139 bool handled = false;
140
141 switch (state) {
142 case GB::State::Normal:
143 break; // will deal with it below
144
145 case GB::State::GB11_ExtPicExt:
146 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
147 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
148 // keep going in the current state
149 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150 handled = true;
151 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
152 state = GB::State::GB11_ExtPicExtZWJ;
153 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154 handled = true;
155 } else {
156 state = GB::State::Normal;
157 }
158 break;
159
160 case GB::State::GB11_ExtPicExtZWJ:
161 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
162 if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
163 shouldBreak = false;
164 handled = true;
165 }
166
167 state = GB::State::Normal;
168 break;
169
170 case GB::State::GB12_13_RI:
171 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
172 if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
173 shouldBreak = false;
174 handled = true;
175 }
176
177 state = GB::State::Normal;
178 break;
179 }
180
181 if (!handled) {
182 Q_ASSERT(state == GB::State::Normal);
183 if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
184 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
185 state = GB::State::GB11_ExtPicExt;
186 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
188 state = GB::State::GB11_ExtPicExtZWJ;
189 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190 }
191 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
192 state = GB::State::GB12_13_RI;
193 }
194 }
195
196 if (shouldBreak)
197 attributes[pos].graphemeBoundary = true;
198
199 lcls = cls;
200 }
201
202 attributes[len].graphemeBoundary = true; // GB2
203}
204
205
namespace WB {

// Entries of the word break pair table, as interpreted by getWordBreaks().
enum Action {
    NoBreak, // no break between the two classes
    Break,   // break between the two classes
    Lookup,  // decided by look-ahead: no break if the next class that is not
             // Extend/ZWJ/Format equals the class of the previous character
    LookupW  // like Lookup, but a following HebrewLetter or ALetter is
             // accepted as well
};

// Indexed as breakTable[class of previous character][class of current character].
static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
//    Any      CR       LF       Newline  Extend   ZWJ      Format    RI    Katakana HLetter  ALetter  SQuote   DQuote  MidNumLet MidLetter MidNum  Numeric  ExtNumLet WSeg
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Any
    { Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // CR
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // LF
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Newline
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Extend
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // ZWJ
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Format
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // RegionalIndicator
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak, Break   }, // Katakana
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break  , NoBreak, NoBreak, Break   }, // HebrewLetter
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , NoBreak, NoBreak, LookupW, Break  , LookupW, LookupW, Break  , NoBreak, NoBreak, Break   }, // ALetter
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // SingleQuote
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // DoubleQuote
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNumLet
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidLetter
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNum
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , NoBreak, NoBreak, Lookup , Break  , Lookup , Break  , Lookup , NoBreak, NoBreak, Break   }, // Numeric
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , NoBreak, NoBreak, Break   }, // ExtendNumLet
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , NoBreak }, // WSegSpace
};

} // namespace WB
239
240static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
241{
242 enum WordType {
243 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244 } currentWordType = WordTypeNone;
245
246 QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
247 auto real_cls = cls; // Unaffected by WB4
248
249 for (qsizetype i = 0; i != len; ++i) {
250 qsizetype pos = i;
251 char32_t ucs4 = string[i];
252 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
253 ushort low = string[i + 1];
254 if (QChar::isLowSurrogate(ucs4: low)) {
255 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
256 ++i;
257 }
258 }
259
260 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
261 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
262 if (qt_initcharattributes_default_algorithm_only) {
263 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264 // which caused "hi.there" to be treated like if it were just a single word;
265 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
267 if (ucs4 == 0x002E) // FULL STOP
268 ncls = QUnicodeTables::WordBreak_MidNumLet;
269 else if (ucs4 == 0x003A) // COLON
270 ncls = QUnicodeTables::WordBreak_MidLetter;
271 }
272
273 uchar action = WB::breakTable[cls][ncls];
274 switch (action) {
275 case WB::Break:
276 if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
277 && prop->graphemeBreakClass
278 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
279 // WB3c: ZWJ × \p{Extended_Pictographic}
280 action = WB::NoBreak;
281 }
282 break;
283 case WB::NoBreak:
284 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
285 // WB4: X(Extend|Format)* -> X
286 real_cls = ncls;
287 continue;
288 }
289 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
290 // WB15/WB16: break between pairs of Regional indicator
291 ncls = QUnicodeTables::WordBreak_Any;
292 }
293 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
294 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295 // WB3d should not be affected by WB4
296 action = WB::Break;
297 }
298 break;
299 case WB::Lookup:
300 case WB::LookupW:
301 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
302 ucs4 = string[lookahead];
303 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
304 ushort low = string[lookahead + 1];
305 if (QChar::isLowSurrogate(ucs4: low)) {
306 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
307 ++lookahead;
308 }
309 }
310
311 prop = QUnicodeTables::properties(ucs4);
312 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
313
314 if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) {
315 // WB4: X(Extend|Format)* -> X
316 continue;
317 }
318
319 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
321 i = lookahead;
322 ncls = tcls;
323 action = WB::NoBreak;
324 }
325 break;
326 }
327 if (action != WB::NoBreak) {
328 action = WB::Break;
329 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
330 action = WB::NoBreak; // WB7a
331 }
332 break;
333 }
334
335 cls = ncls;
336 real_cls = ncls;
337
338 if (action == WB::Break) {
339 attributes[pos].wordBreak = true;
340 if (currentWordType != WordTypeNone)
341 attributes[pos].wordEnd = true;
342 switch (cls) {
343 case QUnicodeTables::WordBreak_Katakana:
344 currentWordType = WordTypeHiraganaKatakana;
345 attributes[pos].wordStart = true;
346 break;
347 case QUnicodeTables::WordBreak_HebrewLetter:
348 case QUnicodeTables::WordBreak_ALetter:
349 case QUnicodeTables::WordBreak_Numeric:
350 currentWordType = WordTypeAlphaNumeric;
351 attributes[pos].wordStart = true;
352 break;
353 default:
354 currentWordType = WordTypeNone;
355 break;
356 }
357 }
358 }
359
360 if (currentWordType != WordTypeNone)
361 attributes[len].wordEnd = true;
362 attributes[len].wordBreak = true; // WB2
363}
364
365
namespace SB {

// States of the sentence break state machine driven by getSentenceBreaks().
// Initial .. BAfter are persistent states and select a row of breakTable.
// Break and Lookup only appear as table results and are resolved by
// getSentenceBreaks() before the next character is processed:
//  - Break:  a sentence boundary is recorded in front of the current character
//  - Lookup: the decision requires looking ahead (SB8)
enum State {
    Initial,
    Lower,
    Upper,
    LUATerm,
    ATerm,
    ATermC,
    ACS,
    STerm,
    STermC,
    SCS,
    BAfterC,
    BAfter,
    Break,
    Lookup
};

// Transition table, indexed as breakTable[current state][class of the
// current character]; the entry is the next state.
static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
//    Any      CR       LF       Sep      Extend   Sp       Lower    Upper    OLetter  Numeric  ATerm   SContinue STerm    Close
    { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower  , Upper  , Initial, Initial, ATerm  , Initial, STerm  , Initial }, // Initial
    { Initial, BAfterC, BAfter , BAfter , Lower  , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm  , Initial }, // Lower
    { Initial, BAfterC, BAfter , BAfter , Upper  , Initial, Initial, Upper  , Initial, Initial, LUATerm, Initial, STerm  , Initial }, // Upper

    { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS    , Initial, Upper  , Break  , Initial, ATerm  , STerm  , STerm  , ATermC  }, // LUATerm
    { Lookup , BAfterC, BAfter , BAfter , ATerm  , ACS    , Initial, Break  , Break  , Initial, ATerm  , STerm  , STerm  , ATermC  }, // ATerm
    { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS    , Initial, Break  , Break  , Lookup , ATerm  , STerm  , STerm  , ATermC  }, // ATermC
    { Lookup , BAfterC, BAfter , BAfter , ACS    , ACS    , Initial, Break  , Break  , Lookup , ATerm  , STerm  , STerm  , Lookup  }, // ACS

    { Break  , BAfterC, BAfter , BAfter , STerm  , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , STermC  }, // STerm
    { Break  , BAfterC, BAfter , BAfter , STermC , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , STermC  }, // STermC
    { Break  , BAfterC, BAfter , BAfter , SCS    , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , Break   }, // SCS
    { Break  , Break  , BAfter , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // BAfterC
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // BAfter
};

} // namespace SB
404
405static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
406{
407 uchar state = SB::BAfter; // to meet SB1
408 for (qsizetype i = 0; i != len; ++i) {
409 qsizetype pos = i;
410 char32_t ucs4 = string[i];
411 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
412 ushort low = string[i + 1];
413 if (QChar::isLowSurrogate(ucs4: low)) {
414 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
415 ++i;
416 }
417 }
418
419 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
420 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
421
422 Q_ASSERT(state <= SB::BAfter);
423 state = SB::breakTable[state][ncls];
424 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
425 state = SB::Break;
426 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
427 ucs4 = string[lookahead];
428 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
429 ushort low = string[lookahead + 1];
430 if (QChar::isLowSurrogate(ucs4: low)) {
431 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
432 ++lookahead;
433 }
434 }
435
436 prop = QUnicodeTables::properties(ucs4);
437 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
438 switch (tcls) {
439 case QUnicodeTables::SentenceBreak_Any:
440 case QUnicodeTables::SentenceBreak_Extend:
441 case QUnicodeTables::SentenceBreak_Sp:
442 case QUnicodeTables::SentenceBreak_Numeric:
443 case QUnicodeTables::SentenceBreak_SContinue:
444 case QUnicodeTables::SentenceBreak_Close:
445 continue;
446 case QUnicodeTables::SentenceBreak_Lower:
447 i = lookahead;
448 state = SB::Initial;
449 break;
450 default:
451 break;
452 }
453 break;
454 }
455 }
456 if (Q_UNLIKELY(state == SB::Break)) {
457 attributes[pos].sentenceBoundary = true;
458 state = SB::breakTable[SB::Initial][ncls];
459 }
460 }
461
462 attributes[len].sentenceBoundary = true; // SB2
463}
464
465
466// -----------------------------------------------------------------------------------------------------
467//
468// The line breaking algorithm.
469// See http://www.unicode.org/reports/tr14/tr14-39.html
470//
471// -----------------------------------------------------------------------------------------------------
472
473namespace LB {
474
475namespace NS { // Number Sequence
476
477// This namespace is used to implement LB25 which, as of Unicode 16, has this
478// definition:
479// NU ( SY | IS )* CL × PO
480// NU ( SY | IS )* CP × PO
481// NU ( SY | IS )* CL × PR
482// NU ( SY | IS )* CP × PR
483// NU ( SY | IS )* × PO
484// NU ( SY | IS )* × PR
485// PO × OP NU
486// PO × OP IS NU
487// PO × NU
488// PR × OP NU
489// PR × OP IS NU
490// PR × NU
491// HY × NU
492// IS × NU
493// NU ( SY | IS )* × NU
494
// Actions of the LB25 number sequence tracker. None .. NeedOPNU are the
// values stored in actionTable; the remaining ones are synthetic.
// NOTE(review): the code consuming these values is outside this excerpt;
// confirm the exact semantics against getLineBreaks().
enum Action {
    None,
    Start,
    Continue,
    Break,
    NeedOPNU, // Like Start, but must be followed by sequence `(OP (IS)?)? NU`
    // These are 'synthetic' actions and are not used in the table but are
    // tracked otherwise in the code for LB25, to track the state of specific
    // sequences:
    CNeedNU, // Like Continue, but must be followed by NU
    CNeedISNU, // Like Continue, but must be followed by IS? NU
};
507
// Reduced set of line break classes used to index actionTable.
// toClass() maps a QUnicodeTables::LineBreakClass onto it.
enum Class {
    XX,   // any class not listed below
    PRPO, // PR or PO
    OP,
    HY,
    NU,
    SY,
    IS,
    CLCP  // CL or CP
};
518
// Action (see enum Action) for a pair of NS::Class values. Rows are labelled
// by the trailing comment, columns by the header line.
// NOTE(review): presumably row = class of the previous character and
// column = class of the current one; confirm against getLineBreaks().
static const uchar actionTable[CLCP + 1][CLCP + 1] = {
//     XX       PRPO      OP        HY        NU        SY        IS        CLCP
    { None   , NeedOPNU, Start   , None    , Start   , None    , None    , None     }, // XX
    { None   , NeedOPNU, Continue, Break   , Start   , None    , None    , None     }, // PRPO
    { None   , Start   , Start   , Break   , Continue, None    , Continue, None     }, // OP
    { None   , None    , None    , Start   , Continue, None    , None    , None     }, // HY
    { Break  , Break   , Break   , Break   , Continue, Continue, Continue, Continue }, // NU
    { Break  , Break   , Break   , Break   , Continue, Continue, Continue, Continue }, // SY
    { Break  , Break   , Break   , Break   , Continue, Continue, Continue, Continue }, // IS
    { Break  , Continue, Break   , Break   , Break   , Break   , Break   , Break    }, // CLCP
};
530
531inline Class toClass(QUnicodeTables::LineBreakClass lbc)
532{
533 switch (lbc) {
534 case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
535 return PRPO;
536 case QUnicodeTables::LineBreak_OP:
537 return OP;
538 case QUnicodeTables::LineBreak_HY:
539 return HY;
540 case QUnicodeTables::LineBreak_NU:
541 return NU;
542 case QUnicodeTables::LineBreak_SY:
543 return SY;
544 case QUnicodeTables::LineBreak_IS:
545 return IS;
546 case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
547 return CLCP;
548 default:
549 break;
550 }
551 return XX;
552}
553
554} // namespace NS
555
556namespace BRS { // Brahmic Sequence, used to implement LB28a
    // U+25CC DOTTED CIRCLE, written `[◌]` in the patterns below.
    constexpr char32_t DottedCircle = U'\u25CC';

    // The LB28a_{n} value maps to the 'regex' on the nth line in LB28a
    // The only special case is LB28a_2VI which is a direct match to the 2nd
    // line, but it also leads to LB28a_3VIAK, the 3rd line.
    enum State {
        None,        // => Not inside a sequence
        Start,       // => Have: `(AK | [◌] | AS)`
        LB28a_2VF,   // => Have: `(AK | [◌] | AS) VF`
        LB28a_2VI,   // => Have: `(AK | [◌] | AS) VI` May find: `(AK | [◌])`
        LB28a_3VIAK, // => Have: `(AK | [◌] | AS) VI (AK | [◌])`
        LB28a_4,     // => Have: `(AK | [◌] | AS) (AK | [◌] | AS)` May find: `VF`
        LB28a_4VF,   // => Have: `(AK | [◌] | AS) (AK | [◌] | AS) VF`
        Restart,     // => In LB28a_4 and the next unit was not VF; the 2nd
                     //    capture may be the start of a new sequence
    };
    // Input of updateState(): a code point together with its line break class.
    struct LinebreakUnit {
        QUnicodeTables::LineBreakClass lbc;
        char32_t ucs4;
    };
    // Parser state kept between characters.
    // NOTE(review): `start` is presumably the index at which the current
    // sequence began; its use is outside this excerpt, confirm in getLineBreaks().
    struct ParseState {
        State state = None;
        qsizetype start = 0;
    };
580 State updateState(State state, LinebreakUnit lb)
581 {
582 using LBC = QUnicodeTables::LineBreakClass;
583 if (lb.lbc == LBC::LineBreak_CM)
584 return state;
585
586 switch (state) {
587 case Start:
588 if (lb.lbc == LBC::LineBreak_VF)
589 return LB28a_2VF;
590 if (lb.lbc == LBC::LineBreak_VI)
591 return LB28a_2VI;
592 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
593 || lb.lbc == LBC::LineBreak_AS)
594 return LB28a_4;
595 break;
596 case LB28a_2VI:
597 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK)
598 return LB28a_3VIAK;
599 break;
600 case LB28a_4:
601 if (lb.lbc == LBC::LineBreak_VF)
602 return LB28a_4VF;
603 // Had (AK | [◌] | AS) (AK | [◌] | AS), which could mean the 2nd capture is the start
604 // of a new sequence, so we need to check if it makes sense.
605 return Restart;
606 case None:
607 if (Q_UNLIKELY(lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
608 || lb.lbc == LBC::LineBreak_AS)) {
609 return Start;
610 }
611 break;
612 case LB28a_2VF:
613 case LB28a_4VF:
614 case LB28a_3VIAK:
615 case Restart:
616 // These are all terminal states, so no need to update
617 Q_UNREACHABLE();
618 }
619 return None;
620 }
621}
622
// Entries of the line break pair table below. Every action also has a
// two-letter alias so that the table stays readable.
// NOTE(review): the code interpreting these actions is outside this excerpt;
// confirm the exact semantics against getLineBreaks().
enum Action {
    ProhibitedBreak, PB = ProhibitedBreak,
    DirectBreak, DB = DirectBreak,
    IndirectBreak, IB = IndirectBreak,
    CombiningIndirectBreak, CI = CombiningIndirectBreak,
    CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
    ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen,
    IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30
    DirectBreakOutsideNumericSequence, DN = DirectBreakOutsideNumericSequence, // For LB25
};
633
// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
// about the table. It was removed in the later versions of the standard.
//
// Pair table of LB::Action values. As the header line indicates, the row is
// selected by the first class of a pair (1↓) and the column by the second
// one (2→). The table only covers the classes below LineBreak_ZWJ.
static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = {
/* 1↓ 2→ OP CL CP QU +Pi +Pf +19 GL NS EX SY IS PR PO NU AL HL ID IN HY +WS BA +WS HYBA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM AK AP AS VI VF*/
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
/* CL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* CP */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
/* +19*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
/* NS */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DN, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, DB, DB, DB },
/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, CI, CI, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* ID */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* HY */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
/* BA */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
/*HYBA*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, DB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB },
/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB },
/* CB */ { DB, PB, PB, IB, IB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB },
/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* AK */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
/* AP */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, IB, DB, DB },
/* AS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
/* VI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
/* VF */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
};
682
683// The following line break classes are not treated by the pair table
684// and must be resolved outside:
685// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX, ZWJ
686
687} // namespace LB
688
// Computes line-break opportunities for the UTF-16 text [string, string + len)
// and records them in the lineBreak / mandatoryBreak flags of attributes.
//
// The loop walks the text one code point at a time (a high surrogate followed
// by a low surrogate is combined into one code point). Rules that need more
// context than a pair of line-break classes are handled explicitly, in a fixed
// order, before the final LB::breakTable lookup. The "LBnn" labels in the
// comments name the line-breaking rule each piece of code implements.
//
// Bookkeeping across iterations:
//   lcls           class recorded for the immediately preceding code point
//                  (updated at next_no_cls_update, i.e. on every iteration).
//   cls, lastProp  class and properties of the most recent code point that
//                  reached the `next` label and was neither CM nor ZWJ.
//
// string      UTF-16 code units to analyse.
// len         number of code units in string.
// attributes  written at indices 0..len inclusive, so it must provide at
//             least len + 1 entries.
// options     only HangulLineBreakTailoring is consulted in this function.
689static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
690{
    // LB25 numeric-sequence tracker: index at which the current candidate
    // sequence started, last tracker class seen, last tracker action taken.
691 qsizetype nestart = 0;
692 LB::NS::Class nelast = LB::NS::XX;
693 LB::NS::Action neactlast = LB::NS::None;
694
    // LB28a (Brahmic orthographic syllable) matcher state.
695 LB::BRS::ParseState brsState;
696
    // Start-of-text is modelled as if a line feed preceded the first code point.
697 QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
698 QUnicodeTables::LineBreakClass cls = lcls;
699 const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(ucs4: U'\n');
700
    // True for the East Asian Width values W, F and H.
701 constexpr static auto isEastAsian = [](QUnicodeTables::EastAsianWidth eaw) {
702 using EAW = QUnicodeTables::EastAsianWidth;
703 return eaw == EAW::W || eaw == EAW::F || eaw == EAW::H;
704 };
705
706 for (qsizetype i = 0; i != len; ++i) {
    // pos keeps the index of the first code unit of the current code point;
    // i may be advanced to the low surrogate below. Break flags for this
    // code point are written at attributes[pos].
707 qsizetype pos = i;
708 char32_t ucs4 = string[i];
709 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
710 ushort low = string[i + 1];
711 if (QChar::isLowSurrogate(ucs4: low)) {
712 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
713 ++i;
714 }
715 }
716
    // ncls is the (possibly remapped) class of the current code point; the
    // blocks below rewrite it before it is used for the table lookup.
717 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
718 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
719 QUnicodeTables::LineBreakClass tcls;
720
721 if (options & QUnicodeTools::HangulLineBreakTailoring) {
722 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
723 && ncls <= QUnicodeTables::LineBreak_JT)
724 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
725 ) {
726 // LB27: use SPACE for line breaking
727 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
728 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
729 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
730 ncls = QUnicodeTables::LineBreak_AL;
731 } else {
732 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
733 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
734 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
735 if (FLAG(prop->category) & test)
736 ncls = QUnicodeTables::LineBreak_CM;
737 }
738 }
739 }
740
    // This check runs regardless of the tailoring option; when the option is
    // set, the else-branch above has already performed the same resolution.
    // An SA code point that is not Mn/Mc keeps class SA here and is looked
    // up as AL at the breakTable access further down.
741 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
742 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
743 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
744 if (FLAG(prop->category) & test)
745 ncls = QUnicodeTables::LineBreak_CM;
746 }
747
    // Quotation marks: refine QU into QU_Pi / QU_Pf depending on the general
    // category of the quote and on the neighbouring class.
748 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
749 if (prop->category == QChar::Punctuation_InitialQuote) {
750 // LB15a: Do not break after an unresolved initial punctuation
751 // that lies at the start of the line, after a space, after
752 // opening punctuation, or after an unresolved quotation mark,
753 // even after spaces.
754 // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
755 // [\p{Pi}&QU] SP* ×
756 // Note: sot is treated as LF here due to initial loop setup.
757 constexpr QUnicodeTables::LineBreakClass lb15a[] = {
758 QUnicodeTables::LineBreak_BK, QUnicodeTables::LineBreak_CR,
759 QUnicodeTables::LineBreak_LF, QUnicodeTables::LineBreak_OP,
760 QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
761 QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_GL,
762 QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_ZW};
763 if (std::any_of(first: std::begin(arr: lb15a), last: std::end(arr: lb15a),
764 pred: [lcls](auto x) { return x == lcls; })) {
765 ncls = QUnicodeTables::LineBreak_QU_Pi;
766 }
767 } else if (prop->category == QChar::Punctuation_FinalQuote) {
768 // LB15b: Do not break before an unresolved final punctuation
769 // that lies at the end of the line, before a space, before
770 // a prohibited break, or before an unresolved quotation mark,
771 // even after spaces.
772 // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
773 // | SY | BK | CR | LF | NL | ZW | eot)
      // Look ahead one code point; end of text is modelled as LF.
774 auto nncls = QUnicodeTables::LineBreak_LF;
775
776 if (i + 1 < len) {
777 char32_t c = string[i + 1];
778 if (QChar::isHighSurrogate(ucs4: c) && i + 2 < len) {
779 ushort low = string[i + 2];
780 if (QChar::isLowSurrogate(ucs4: low))
781 c = QChar::surrogateToUcs4(high: c, low);
782 }
783 nncls = QUnicodeTables::LineBreakClass(
784 QUnicodeTables::properties(ucs4: c)->lineBreakClass);
785 }
786
787 constexpr QUnicodeTables::LineBreakClass lb15b[] = {
788 QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_GL,
789 QUnicodeTables::LineBreak_WJ, QUnicodeTables::LineBreak_CL,
790 QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
791 QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_CP,
792 QUnicodeTables::LineBreak_EX, QUnicodeTables::LineBreak_IS,
793 QUnicodeTables::LineBreak_SY, QUnicodeTables::LineBreak_BK,
794 QUnicodeTables::LineBreak_CR, QUnicodeTables::LineBreak_LF,
795 QUnicodeTables::LineBreak_ZW};
796 if (std::any_of(first: std::begin(arr: lb15b), last: std::end(arr: lb15b),
797 pred: [nncls](auto x) { return x == nncls; })) {
798 ncls = QUnicodeTables::LineBreak_QU_Pf;
799 }
800 }
801 }
802
    // NOTE(review): `lcls >= LineBreak_SP` is presumably meant to cover SP and
    // the hard-break classes listed in the rule below (with sot seeded as LF);
    // that depends on the enum order, which is not visible here - confirm.
803 if (Q_UNLIKELY((lcls >= QUnicodeTables::LineBreak_SP || lcls == QUnicodeTables::LineBreak_ZW
804 || lcls == QUnicodeTables::LineBreak_GL
805 || lcls == QUnicodeTables::LineBreak_CB)
806 && (ncls == QUnicodeTables::LineBreak_HY || ucs4 == u'\u2010'))) {
807 // LB20a: Do not break after a word-initial hyphen.
808 // ( sot | BK | CR | LF | NL | SP | ZW | CB | GL ) ( HY | [\u2010] ) × AL
809
810 // Remap to the synthetic class WS_* (whitespace+*), which is just
811 // like the current respective linebreak class but with an IB action
812 // if the next class is AL.
813 if (ucs4 == u'\u2010')
814 ncls = QUnicodeTables::LineBreak_WS_BA;
815 else
816 ncls = QUnicodeTables::LineBreak_WS_HY;
817 }
818
    // Here cls (the previous non-CM/ZWJ class) is tested, not ncls: the
    // dotted circle is the *current* code point following an AP.
819 if (Q_UNLIKELY(cls == QUnicodeTables::LineBreak_AP && ucs4 == LB::BRS::DottedCircle)) {
820 // LB28a: Do not break inside the orthographic syllables of Brahmic scripts
821 // AP × (AK | [◌] | AS)
822 // @note: AP × (AK | AS) is checked by the breakTable
823 goto next;
824 }
    // Feed the current code point to the LB28a state machine. Only state
    // *changes* are acted upon. Every `goto next` inside the switch skips
    // all remaining rules for this code point.
825 while (true) { // May need to recheck once.
826 // LB28a cont'd
827 LB::BRS::State oldState = brsState.state;
828 brsState.state = LB::BRS::updateState(state: brsState.state, lb: {.lbc: ncls, .ucs4: ucs4});
829 if (Q_LIKELY(brsState.state == oldState))
830 break;
831 switch (brsState.state) {
832 case LB::BRS::Start:
833 brsState.start = i;
834 break;
835 case LB::BRS::LB28a_2VI: // Wait for more characters, but also valid sequence
836 // We may get another character, but this is already a complete
837 // sequence that should not have any breaks:
838 for (qsizetype j = brsState.start + 1; j < i; ++j)
839 attributes[j].lineBreak = false;
840 // No need to mark this sequence again later, so move 'start'
841 // up to the current position:
842 brsState.start = i;
843 goto next;
844 case LB::BRS::Restart:
845 // The previous character was possibly the start of a new sequence
      // NOTE(review): pos - 1 is the previous code *unit*; if the previous
      // code point was a surrogate pair this is its low surrogate - confirm
      // that this is intended.
846 brsState.state = LB::BRS::Start;
847 brsState.start = pos - 1;
848 continue; // Doing the loop again!
849 case LB::BRS::LB28a_2VF:
850 case LB::BRS::LB28a_4VF:
851 case LB::BRS::LB28a_3VIAK:
      // A complete sequence: clear the break opportunities strictly
      // between its start and the current index.
852 for (qsizetype j = brsState.start + 1; j < i; ++j)
853 attributes[j].lineBreak = false;
854 if (brsState.state == LB::BRS::LB28a_3VIAK) {
855 // This might be the start of a new sequence
856 brsState.state = LB::BRS::Start;
857 brsState.start = i;
858 } else {
859 brsState.state = LB::BRS::None;
860 }
861 goto next;
862 case LB::BRS::LB28a_4: // Wait for more characters
863 case LB::BRS::None: // Nothing to do
864 break;
865 }
866 break;
867 }
868
869 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_IS)) {
870 // LB15c Break before a decimal mark that follows a space, for instance, in
871 // ‘subtract .5’.
872 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_SP)) {
      // Look ahead one code point: the break is only set when a
      // number (NU) follows the IS.
873 if (i + 1 < len) {
874 char32_t ch = string[i + 1];
875 if (QChar::isHighSurrogate(ucs4: ch) && i + 2 < len) {
876 ushort low = string[i + 2];
877 if (QChar::isLowSurrogate(ucs4: low))
878 ch = QChar::surrogateToUcs4(high: ch, low);
879 }
880 if (QUnicodeTables::properties(ucs4: ch)->lineBreakClass
881 == QUnicodeTables::LineBreak_NU) {
882 attributes[pos].lineBreak = true;
883 goto next;
884 }
885 }
886 }
887 }
888
889 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_HL)) {
890 // LB21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
891 // HL (HY | [ BA - $EastAsian ]) × [^HL]
892 auto eaw = QUnicodeTables::EastAsianWidth(prop->eastAsianWidth);
893 const bool isNonEaBA = ncls == QUnicodeTables::LineBreak_BA && !isEastAsian(eaw);
894 if (isNonEaBA || ncls == QUnicodeTables::LineBreak_HY) {
895 // Remap to synthetic HYBA class which handles the next
896 // character. Generally (LB21) there are no breaks before
897 // HY or BA, so we can skip ahead to the next character.
898 ncls = QUnicodeTables::LineBreak_HYBA;
899 goto next;
900 }
901 }
902
903 // LB25: do not break lines inside numbers
904 {
    // The action comes from the (previous class, current class) table and
    // is then overridden when the previous action left a pending
    // requirement (CNeedNU, NeedOPNU or CNeedISNU).
905 LB::NS::Class necur = LB::NS::toClass(lbc: ncls);
906 LB::NS::Action neact = LB::NS::Action(LB::NS::actionTable[nelast][necur]);
907 if (Q_UNLIKELY(neactlast == LB::NS::CNeedNU && necur != LB::NS::NU)) {
908 neact = LB::NS::None;
909 } else if (Q_UNLIKELY(neactlast == LB::NS::NeedOPNU)) {
910 if (necur == LB::NS::OP)
911 neact = LB::NS::CNeedISNU;
912 else if (necur == LB::NS::NU)
913 neact = LB::NS::Continue;
914 else // Anything else and we ignore the sequence
915 neact = LB::NS::None;
916 } else if (Q_UNLIKELY(neactlast == LB::NS::CNeedISNU)) {
917 if (necur == LB::NS::IS)
918 neact = LB::NS::CNeedNU;
919 else if (necur == LB::NS::NU)
920 neact = LB::NS::Continue;
921 else // Anything else and we ignore the sequence
922 neact = LB::NS::None;
923 }
924 switch (neact) {
925 case LB::NS::Break:
926 // do not change breaks before and after the expression
927 for (qsizetype j = nestart + 1; j < pos; ++j)
928 attributes[j].lineBreak = false;
929 Q_FALLTHROUGH();
930 case LB::NS::None:
931 nelast = LB::NS::XX; // reset state
932 break;
933 case LB::NS::NeedOPNU:
934 case LB::NS::Start:
935 if (neactlast == LB::NS::Start || neactlast == LB::NS::Continue) {
936 // Apply the linebreaks for the previous stretch; we need to start a new one
937 for (qsizetype j = nestart + 1; j < pos; ++j)
938 attributes[j].lineBreak = false;
939 }
940 nestart = i;
941 Q_FALLTHROUGH();
942 case LB::NS::CNeedNU:
943 case LB::NS::CNeedISNU:
944 case LB::NS::Continue:
945 nelast = necur;
946 break;
947 }
948 neactlast = neact;
949 }
950
951 // LB19a Unless surrounded by East Asian characters, do not break either side of any
952 // unresolved quotation marks
953 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU
954 && lcls != QUnicodeTables::LineBreak_SP
955 && lcls != QUnicodeTables::LineBreak_ZW)) {
956 using EAW = QUnicodeTables::EastAsianWidth;
    // Decodes the code point at the start of the remaining text (string,
    // len are the remainder after the current code point) and reports
    // whether it counts as non-East-Asian for this rule.
957 constexpr static auto nextCharNonEastAsian = [](const char16_t *string, qsizetype len) {
958 if (len > 0) {
959 char32_t nch = string[0];
960 if (QChar::isHighSurrogate(ucs4: nch) && len > 1) {
961 char16_t low = string[1];
962 if (QChar::isLowSurrogate(ucs4: low))
963 nch = QChar::surrogateToUcs4(high: char16_t(nch), low);
964 }
965 const auto *nextProp = QUnicodeTables::properties(ucs4: nch);
966 QUnicodeTables::LineBreakClass nncls = QUnicodeTables::LineBreakClass(
967 nextProp->lineBreakClass);
968 QUnicodeTables::EastAsianWidth neaw = EAW(nextProp->eastAsianWidth);
969 return nncls != QUnicodeTables::LineBreak_CM
970 && nncls <= QUnicodeTables::LineBreak_SP
971 && !isEastAsian(neaw);
972 }
973 return true; // end-of-text counts as non-East-Asian
974 };
975 if (Q_UNLIKELY(!isEastAsian(EAW(lastProp->eastAsianWidth))
976 || nextCharNonEastAsian(string + i + 1, len - i - 1))) {
977 // Remap to the synthetic QU_19 class which has indirect breaks
978 // for most following classes.
979 ncls = QUnicodeTables::LineBreak_QU_19;
980 }
981 }
982
    // A mandatory break is recorded before the current code point unless
    // the pair is exactly CR followed by LF.
983 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
984 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
985 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
986 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
987 goto next;
988 }
989
    // For SP only lcls is updated (cls / lastProp keep describing the code
    // point before the spaces), which is what the "SP*" parts of the
    // rules rely on.
990 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
991 if (ncls > QUnicodeTables::LineBreak_SP)
992 goto next; // LB6: x(BK|CR|LF|NL)
993 goto next_no_cls_update; // LB7: xSP
994 }
995
996 // LB19 - do not break before non-initial unresolved quotation marks, or after non-final
997 // unresolved quotation marks
998 if (Q_UNLIKELY(((ncls == QUnicodeTables::LineBreak_QU
999 || ncls == QUnicodeTables::LineBreak_QU_19)
1000 && prop->category != QChar::Punctuation_InitialQuote)
1001 || (cls == QUnicodeTables::LineBreak_QU
1002 && lastProp->category != QChar::Punctuation_FinalQuote))) {
1003 // Make sure the previous character is not one that we have to break after.
1004 // Also skip if ncls is CM so it can be treated as lcls (LB9)
1005 if (lcls != QUnicodeTables::LineBreak_SP && lcls != QUnicodeTables::LineBreak_ZW
1006 && ncls != QUnicodeTables::LineBreak_CM) {
1007 goto next;
1008 }
1009 }
1010
1011 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
1012 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
1013 if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
1014 // don't update anything
1015 goto next_no_cls_update;
1016 }
1017
1018 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
1019 // LB8a: ZWJ x
1020 goto next;
1021 }
1022
1023 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
1024 // LB30a
    // NOTE(review): recording the second RI as SP means a following RI
    // no longer sees lcls == RI - presumably to keep regional indicators
    // together in pairs only; confirm.
1025 ncls = QUnicodeTables::LineBreak_SP;
1026 goto next;
1027 }
1028
1029 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
1030 && lastProp->category == QChar::Other_NotAssigned
1031 && lastProp->graphemeBreakClass
1032 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
1033 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
1034 goto next;
1035 }
1036
1037 // for South East Asian chars that require a complex analysis, the Unicode
1038 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
1039 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
1040 cls = QUnicodeTables::LineBreak_AL;
1041
    // tcls is the row used for the table lookup; it starts as cls and may be
    // remapped below without changing cls itself.
1042 tcls = cls;
1043
    // Note that this also repoints the passed properties pointer (lastProp
    // or prop) to the properties of U+0041 when the class is remapped.
1044 constexpr static auto remapToAL = [](QUnicodeTables::LineBreakClass &c, auto &property) {
1045 if (Q_UNLIKELY(c == QUnicodeTables::LineBreak_CM
1046 || c == QUnicodeTables::LineBreak_ZWJ)) {
1047 c = QUnicodeTables::LineBreak_AL;
1048 property = QUnicodeTables::properties(ucs4: U'\u0041');
1049 }
1050 };
1051 // LB10 Treat any remaining combining mark or ZWJ as AL,
1052 // as if it had the properties of U+0041 A LATIN CAPITAL LETTER
1053 remapToAL(tcls, lastProp);
1054 remapToAL(ncls, prop);
1055
    // Pair-table lookup: row = previous class, column = current class.
    // Current classes at or beyond ZWJ in the enum are looked up as AL
    // (presumably because the table has no column for them).
1056 switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) {
1057 case LB::DirectBreak:
1058 attributes[pos].lineBreak = true;
1059 break;
1060 case LB::IndirectBreak:
    // Break only if at least one space separates the pair.
1061 if (lcls == QUnicodeTables::LineBreak_SP)
1062 attributes[pos].lineBreak = true;
1063 break;
1064 case LB::CombiningIndirectBreak:
1065 if (lcls != QUnicodeTables::LineBreak_SP)
1066 goto next_no_cls_update;
1067 attributes[pos].lineBreak = true;
1068 break;
1069 case LB::CombiningProhibitedBreak:
1070 if (lcls != QUnicodeTables::LineBreak_SP)
1071 goto next_no_cls_update;
1072 break;
1073 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
1074 if (lcls != QUnicodeTables::LineBreak_HL)
1075 attributes[pos].lineBreak = true;
1076 break;
1077 case LB::IndirectBreakIfNarrow:
    // Wide / fullwidth / halfwidth current code point: always break.
    // Anything else: break only after a space.
1078 using EAW = QUnicodeTables::EastAsianWidth;
1079 switch (EAW(prop->eastAsianWidth)) {
1080 default:
1081 if (lcls != QUnicodeTables::LineBreak_SP)
1082 break;
1083 Q_FALLTHROUGH();
1084 case QUnicodeTables::EastAsianWidth::F:
1085 case QUnicodeTables::EastAsianWidth::W:
1086 case QUnicodeTables::EastAsianWidth::H:
1087 attributes[pos].lineBreak = true;
1088 break;
1089 }
1090 break;
1091 case LB::DirectBreakOutsideNumericSequence:
1092 if (neactlast == LB::NS::None || neactlast > LB::NS::Break)
1093 attributes[pos].lineBreak = true;
1094 break;
1095 case LB::ProhibitedBreak:
1096 // nothing to do
1097 default:
1098 break;
1099 }
1100
    // next: remember the current code point as the "previous" one for the
    // pair table, unless it is CM or ZWJ.
1101 next:
1102 if (ncls != QUnicodeTables::LineBreak_CM && ncls != QUnicodeTables::LineBreak_ZWJ) {
1103 cls = ncls;
1104 lastProp = prop;
1105 }
    // next_no_cls_update: only the immediately-preceding class is updated.
1106 next_no_cls_update:
1107 lcls = ncls;
1108 }
1109
    // Close a numeric sequence that is still open at end of text, as if a
    // code point of tracker class XX followed it.
1110 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
1111 // LB25: do not break lines inside numbers
1112 for (qsizetype j = nestart + 1; j < len; ++j)
1113 attributes[j].lineBreak = false;
1114 }
1115
    // Never break at start of text; always (mandatorily) break at end of
    // text. For len == 0 both statements hit the same entry and the second
    // one wins.
1116 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
1117 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
1118}
1119
1120
1121static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1122{
1123 for (qsizetype i = 0; i != len; ++i) {
1124 uint ucs4 = string[i];
1125 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
1126 ushort low = string[i + 1];
1127 if (QChar::isLowSurrogate(ucs4: low)) {
1128 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
1129 ++i;
1130 }
1131 }
1132
1133 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
1134 attributes[i].whiteSpace = true;
1135 }
1136}
1137
1138namespace Tailored {
1139
// Pointer-to-function type for a tailoring routine: it receives a script, a
// UTF-16 text pointer, a start offset, a length and a QCharAttributes array,
// and returns nothing.
// NOTE(review): presumably from/len delimit the run inside text that the
// routine should process and attributes is where it stores its result -
// confirm against the call sites.
1140using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
1141
1142
// Classification of a code point within an Indic script block, as stored in
// the indicForms table. The enumerators are deliberately unscoped: the table
// refers to them by their bare names and stores them as unsigned char.
// Invalid and UnknownForm share the value 0; the remaining enumerators count
// up from 1 in declaration order.
enum Form {
    Invalid = 0x0,
    UnknownForm = Invalid,
    Consonant,
    Nukta,
    Halant,
    Matra,
    VowelMark,
    StressMark,
    IndependentVowel,
    LengthMark,
    Control,
    Other
};
1157
// Form classification for the 0x500 code points U+0900..U+0DFF, one entry per
// code point. Each script section below covers one 128-code-point block and
// is laid out as 8 groups of 16 entries (4 entries per line, one group per
// 0x10 code points).
// NOTE(review): presumably indexed by (code point - 0x900); confirm against
// the lookup in form().
1158static const unsigned char indicForms[0xe00-0x900] = {
1159 // Devanagari (U+0900..U+097F)
1160 Invalid, VowelMark, VowelMark, VowelMark,
1161 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1162 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1163 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1164
1165 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1166 IndependentVowel, Consonant, Consonant, Consonant,
1167 Consonant, Consonant, Consonant, Consonant,
1168 Consonant, Consonant, Consonant, Consonant,
1169
1170 Consonant, Consonant, Consonant, Consonant,
1171 Consonant, Consonant, Consonant, Consonant,
1172 Consonant, Consonant, Consonant, Consonant,
1173 Consonant, Consonant, Consonant, Consonant,
1174
1175 Consonant, Consonant, Consonant, Consonant,
1176 Consonant, Consonant, Consonant, Consonant,
1177 Consonant, Consonant, UnknownForm, UnknownForm,
1178 Nukta, Other, Matra, Matra,
1179
1180 Matra, Matra, Matra, Matra,
1181 Matra, Matra, Matra, Matra,
1182 Matra, Matra, Matra, Matra,
1183 Matra, Halant, UnknownForm, UnknownForm,
1184
1185 Other, StressMark, StressMark, StressMark,
1186 StressMark, UnknownForm, UnknownForm, UnknownForm,
1187 Consonant, Consonant, Consonant, Consonant,
1188 Consonant, Consonant, Consonant, Consonant,
1189
1190 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1191 Other, Other, Other, Other,
1192 Other, Other, Other, Other,
1193 Other, Other, Other, Other,
1194
1195 Other, Other, Other, Other,
1196 Other, Other, Other, Other,
1197 Other, Other, Other, Consonant,
1198 Consonant, Consonant /* ??? */, Consonant, Consonant,
1199
1200 // Bengali (U+0980..U+09FF)
1201 Invalid, VowelMark, VowelMark, VowelMark,
1202 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1203 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1204 IndependentVowel, Invalid, Invalid, IndependentVowel,
1205
1206 IndependentVowel, Invalid, Invalid, IndependentVowel,
1207 IndependentVowel, Consonant, Consonant, Consonant,
1208 Consonant, Consonant, Consonant, Consonant,
1209 Consonant, Consonant, Consonant, Consonant,
1210
1211 Consonant, Consonant, Consonant, Consonant,
1212 Consonant, Consonant, Consonant, Consonant,
1213 Consonant, Invalid, Consonant, Consonant,
1214 Consonant, Consonant, Consonant, Consonant,
1215
1216 Consonant, Invalid, Consonant, Invalid,
1217 Invalid, Invalid, Consonant, Consonant,
1218 Consonant, Consonant, UnknownForm, UnknownForm,
1219 Nukta, Other, Matra, Matra,
1220
1221 Matra, Matra, Matra, Matra,
1222 Matra, Invalid, Invalid, Matra,
1223 Matra, Invalid, Invalid, Matra,
1224 Matra, Halant, Consonant, UnknownForm,
1225
1226 Invalid, Invalid, Invalid, Invalid,
1227 Invalid, Invalid, Invalid, VowelMark,
1228 Invalid, Invalid, Invalid, Invalid,
1229 Consonant, Consonant, Invalid, Consonant,
1230
1231 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1232 Other, Other, Other, Other,
1233 Other, Other, Other, Other,
1234 Other, Other, Other, Other,
1235
1236 Consonant, Consonant, Other, Other,
1237 Other, Other, Other, Other,
1238 Other, Other, Other, Other,
1239 Other, Other, Other, Other,
1240
1241 // Gurmukhi (U+0A00..U+0A7F)
1242 Invalid, VowelMark, VowelMark, VowelMark,
1243 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1244 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1245 Invalid, Invalid, Invalid, IndependentVowel,
1246
1247 IndependentVowel, Invalid, Invalid, IndependentVowel,
1248 IndependentVowel, Consonant, Consonant, Consonant,
1249 Consonant, Consonant, Consonant, Consonant,
1250 Consonant, Consonant, Consonant, Consonant,
1251
1252 Consonant, Consonant, Consonant, Consonant,
1253 Consonant, Consonant, Consonant, Consonant,
1254 Consonant, Invalid, Consonant, Consonant,
1255 Consonant, Consonant, Consonant, Consonant,
1256
1257 Consonant, Invalid, Consonant, Consonant,
1258 Invalid, Consonant, Consonant, Invalid,
1259 Consonant, Consonant, UnknownForm, UnknownForm,
1260 Nukta, Other, Matra, Matra,
1261
1262 Matra, Matra, Matra, Invalid,
1263 Invalid, Invalid, Invalid, Matra,
1264 Matra, Invalid, Invalid, Matra,
1265 Matra, Halant, UnknownForm, UnknownForm,
1266
1267 Invalid, Invalid, Invalid, Invalid,
1268 Invalid, UnknownForm, UnknownForm, UnknownForm,
1269 Invalid, Consonant, Consonant, Consonant,
1270 Consonant, Invalid, Consonant, Invalid,
1271
1272 Other, Other, Invalid, Invalid,
1273 Other, Other, Other, Other,
1274 Other, Other, Other, Other,
1275 Other, Other, Other, Other,
1276
1277 StressMark, StressMark, Consonant, Consonant,
1278 Other, Other, Other, Other,
1279 Other, Other, Other, Other,
1280 Other, Other, Other, Other,
1281
1282 // Gujarati (U+0A80..U+0AFF)
1283 Invalid, VowelMark, VowelMark, VowelMark,
1284 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1285 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1286 IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
1287
1288 IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
1289 IndependentVowel, Consonant, Consonant, Consonant,
1290 Consonant, Consonant, Consonant, Consonant,
1291 Consonant, Consonant, Consonant, Consonant,
1292
1293 Consonant, Consonant, Consonant, Consonant,
1294 Consonant, Consonant, Consonant, Consonant,
1295 Consonant, Invalid, Consonant, Consonant,
1296 Consonant, Consonant, Consonant, Consonant,
1297
1298 Consonant, Invalid, Consonant, Consonant,
1299 Invalid, Consonant, Consonant, Consonant,
1300 Consonant, Consonant, UnknownForm, UnknownForm,
1301 Nukta, Other, Matra, Matra,
1302
1303 Matra, Matra, Matra, Matra,
1304 Matra, Matra, Invalid, Matra,
1305 Matra, Matra, Invalid, Matra,
1306 Matra, Halant, UnknownForm, UnknownForm,
1307
1308 Other, UnknownForm, UnknownForm, UnknownForm,
1309 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1310 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1311 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1312
1313 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1314 Other, Other, Other, Other,
1315 Other, Other, Other, Other,
1316 Other, Other, Other, Other,
1317
1318 Other, Other, Other, Other,
1319 Other, Other, Other, Other,
1320 Other, Other, Other, Other,
1321 Other, Other, Other, Other,
1322
1323 // Oriya (U+0B00..U+0B7F)
1324 Invalid, VowelMark, VowelMark, VowelMark,
1325 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1326 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1327 IndependentVowel, Invalid, Invalid, IndependentVowel,
1328
1329 IndependentVowel, Invalid, Invalid, IndependentVowel,
1330 IndependentVowel, Consonant, Consonant, Consonant,
1331 Consonant, Consonant, Consonant, Consonant,
1332 Consonant, Consonant, Consonant, Consonant,
1333
1334 Consonant, Consonant, Consonant, Consonant,
1335 Consonant, Consonant, Consonant, Consonant,
1336 Consonant, Invalid, Consonant, Consonant,
1337 Consonant, Consonant, Consonant, Consonant,
1338
1339 Consonant, Invalid, Consonant, Consonant,
1340 Invalid, Consonant, Consonant, Consonant,
1341 Consonant, Consonant, UnknownForm, UnknownForm,
1342 Nukta, Other, Matra, Matra,
1343
1344 Matra, Matra, Matra, Matra,
1345 Invalid, Invalid, Invalid, Matra,
1346 Matra, Invalid, Invalid, Matra,
1347 Matra, Halant, UnknownForm, UnknownForm,
1348
1349 Other, Invalid, Invalid, Invalid,
1350 Invalid, UnknownForm, LengthMark, LengthMark,
1351 Invalid, Invalid, Invalid, Invalid,
1352 Consonant, Consonant, Invalid, Consonant,
1353
1354 IndependentVowel, IndependentVowel, Invalid, Invalid,
1355 Invalid, Invalid, Other, Other,
1356 Other, Other, Other, Other,
1357 Other, Other, Other, Other,
1358
1359 Other, Consonant, Other, Other,
1360 Other, Other, Other, Other,
1361 Other, Other, Other, Other,
1362 Other, Other, Other, Other,
1363
1364 // Tamil (U+0B80..U+0BFF)
1365 Invalid, Invalid, VowelMark, Other,
1366 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1367 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1368 Invalid, Invalid, IndependentVowel, IndependentVowel,
1369
1370 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1371 IndependentVowel, Consonant, Invalid, Invalid,
1372 Invalid, Consonant, Consonant, Invalid,
1373 Consonant, Invalid, Consonant, Consonant,
1374
1375 Invalid, Invalid, Invalid, Consonant,
1376 Consonant, Invalid, Invalid, Invalid,
1377 Consonant, Consonant, Consonant, Invalid,
1378 Invalid, Invalid, Consonant, Consonant,
1379
1380 Consonant, Consonant, Consonant, Consonant,
1381 Consonant, Consonant, Consonant, Consonant,
1382 Consonant, Consonant, UnknownForm, UnknownForm,
1383 Invalid, Invalid, Matra, Matra,
1384
1385 Matra, Matra, Matra, Invalid,
1386 Invalid, Invalid, Matra, Matra,
1387 Matra, Invalid, Matra, Matra,
1388 Matra, Halant, Invalid, Invalid,
1389
1390 Invalid, Invalid, Invalid, Invalid,
1391 Invalid, Invalid, Invalid, LengthMark,
1392 Invalid, Invalid, Invalid, Invalid,
1393 Invalid, Invalid, Invalid, Invalid,
1394
1395 Invalid, Invalid, Invalid, Invalid,
1396 Invalid, Invalid, Other, Other,
1397 Other, Other, Other, Other,
1398 Other, Other, Other, Other,
1399
1400 Other, Other, Other, Other,
1401 Other, Other, Other, Other,
1402 Other, Other, Other, Other,
1403 Other, Other, Other, Other,
1404
1405 // Telugu (U+0C00..U+0C7F)
1406 Invalid, VowelMark, VowelMark, VowelMark,
1407 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1408 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1409 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1410
1411 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1412 IndependentVowel, Consonant, Consonant, Consonant,
1413 Consonant, Consonant, Consonant, Consonant,
1414 Consonant, Consonant, Consonant, Consonant,
1415
1416 Consonant, Consonant, Consonant, Consonant,
1417 Consonant, Consonant, Consonant, Consonant,
1418 Consonant, Invalid, Consonant, Consonant,
1419 Consonant, Consonant, Consonant, Consonant,
1420
1421 Consonant, Consonant, Consonant, Consonant,
1422 Invalid, Consonant, Consonant, Consonant,
1423 Consonant, Consonant, UnknownForm, UnknownForm,
1424 Invalid, Invalid, Matra, Matra,
1425
1426 Matra, Matra, Matra, Matra,
1427 Matra, Invalid, Matra, Matra,
1428 Matra, Invalid, Matra, Matra,
1429 Matra, Halant, Invalid, Invalid,
1430
1431 Invalid, Invalid, Invalid, Invalid,
1432 Invalid, LengthMark, Matra, Invalid,
1433 Invalid, Invalid, Invalid, Invalid,
1434 Invalid, Invalid, Invalid, Invalid,
1435
1436 IndependentVowel, IndependentVowel, Invalid, Invalid,
1437 Invalid, Invalid, Other, Other,
1438 Other, Other, Other, Other,
1439 Other, Other, Other, Other,
1440
1441 Other, Other, Other, Other,
1442 Other, Other, Other, Other,
1443 Other, Other, Other, Other,
1444 Other, Other, Other, Other,
1445
1446 // Kannada (U+0C80..U+0CFF)
1447 Invalid, Invalid, VowelMark, VowelMark,
1448 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1449 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1450 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1451
1452 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1453 IndependentVowel, Consonant, Consonant, Consonant,
1454 Consonant, Consonant, Consonant, Consonant,
1455 Consonant, Consonant, Consonant, Consonant,
1456
1457 Consonant, Consonant, Consonant, Consonant,
1458 Consonant, Consonant, Consonant, Consonant,
1459 Consonant, Invalid, Consonant, Consonant,
1460 Consonant, Consonant, Consonant, Consonant,
1461
1462 Consonant, Consonant, Consonant, Consonant,
1463 Invalid, Consonant, Consonant, Consonant,
1464 Consonant, Consonant, UnknownForm, UnknownForm,
1465 Nukta, Other, Matra, Matra,
1466
1467 Matra, Matra, Matra, Matra,
1468 Matra, Invalid, Matra, Matra,
1469 Matra, Invalid, Matra, Matra,
1470 Matra, Halant, Invalid, Invalid,
1471
1472 Invalid, Invalid, Invalid, Invalid,
1473 Invalid, LengthMark, LengthMark, Invalid,
1474 Invalid, Invalid, Invalid, Invalid,
1475 Invalid, Invalid, Consonant, Invalid,
1476
1477 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1478 Invalid, Invalid, Other, Other,
1479 Other, Other, Other, Other,
1480 Other, Other, Other, Other,
1481
1482 Other, Other, Other, Other,
1483 Other, Other, Other, Other,
1484 Other, Other, Other, Other,
1485 Other, Other, Other, Other,
1486
1487 // Malayalam (U+0D00..U+0D7F)
1488 Invalid, Invalid, VowelMark, VowelMark,
1489 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1490 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1491 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1492
1493 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1494 IndependentVowel, Consonant, Consonant, Consonant,
1495 Consonant, Consonant, Consonant, Consonant,
1496 Consonant, Consonant, Consonant, Consonant,
1497
1498 Consonant, Consonant, Consonant, Consonant,
1499 Consonant, Consonant, Consonant, Consonant,
1500 Consonant, Invalid, Consonant, Consonant,
1501 Consonant, Consonant, Consonant, Consonant,
1502
1503 Consonant, Consonant, Consonant, Consonant,
1504 Consonant, Consonant, Consonant, Consonant,
1505 Consonant, Consonant, UnknownForm, UnknownForm,
1506 Invalid, Invalid, Matra, Matra,
1507
1508 Matra, Matra, Matra, Matra,
1509 Invalid, Invalid, Matra, Matra,
1510 Matra, Invalid, Matra, Matra,
1511 Matra, Halant, Invalid, Invalid,
1512
1513 Invalid, Invalid, Invalid, Invalid,
1514 Invalid, Invalid, Invalid, Matra,
1515 Invalid, Invalid, Invalid, Invalid,
1516 Invalid, Invalid, Invalid, Invalid,
1517
1518 IndependentVowel, IndependentVowel, Invalid, Invalid,
1519 Invalid, Invalid, Other, Other,
1520 Other, Other, Other, Other,
1521 Other, Other, Other, Other,
1522
1523 Other, Other, Other, Other,
1524 Other, Other, Other, Other,
1525 Other, Other, Other, Other,
1526 Other, Other, Other, Other,
1527
1528 // Sinhala (U+0D80..U+0DFF)
1529 Invalid, Invalid, VowelMark, VowelMark,
1530 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1531 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1532 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1533
1534 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1535 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1536 Invalid, Invalid, Consonant, Consonant,
1537 Consonant, Consonant, Consonant, Consonant,
1538
1539 Consonant, Consonant, Consonant, Consonant,
1540 Consonant, Consonant, Consonant, Consonant,
1541 Consonant, Consonant, Consonant, Consonant,
1542 Consonant, Consonant, Consonant, Consonant,
1543
1544 Consonant, Consonant, Invalid, Consonant,
1545 Consonant, Consonant, Consonant, Consonant,
1546 Consonant, Consonant, Consonant, Consonant,
1547 Invalid, Consonant, Invalid, Invalid,
1548
1549 Consonant, Consonant, Consonant, Consonant,
1550 Consonant, Consonant, Consonant, Invalid,
1551 Invalid, Invalid, Halant, Invalid,
1552 Invalid, Invalid, Invalid, Matra,
1553
1554 Matra, Matra, Matra, Matra,
1555 Matra, Invalid, Matra, Invalid,
1556 Matra, Matra, Matra, Matra,
1557 Matra, Matra, Matra, Matra,
1558
1559 Invalid, Invalid, Invalid, Invalid,
1560 Invalid, Invalid, Invalid, Invalid,
1561 Invalid, Invalid, Invalid, Invalid,
1562 Invalid, Invalid, Invalid, Invalid,
1563
1564 Invalid, Invalid, Matra, Matra,
1565 Other, Other, Other, Other,
1566 Other, Other, Other, Other,
1567 Other, Other, Other, Other,
1568};
1569
1570static inline Form form(unsigned short uc) {
1571 if (uc < 0x900 || uc > 0xdff) {
1572 if (uc == 0x25cc)
1573 return Consonant;
1574 if (uc == 0x200c || uc == 0x200d)
1575 return Control;
1576 return Other;
1577 }
1578 return (Form)indicForms[uc-0x900];
1579}
1580
// Debug tracing for the Indic syllable scanner.
// Uncomment the next line to route IDEBUG(...) to qDebug(...). When it stays
// disabled, IDEBUG(...) expands to the else-branch of an always-true
// `if constexpr`, so the qDebug call is still parsed but never executed.
// #define INDIC_DEBUG
#ifdef INDIC_DEBUG
#define IDEBUG qDebug
#else
#define IDEBUG if constexpr (1) ; else qDebug
#endif
1587
1588/* syllables are of the form:
1589
1590 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1591 (Consonant Nukta? Halant)* Consonant Halant
1592 IndependentVowel VowelMark? StressMark?
1593
1594 We return syllable boundaries on invalid combinations as well
1595*/
1596static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1597{
1598 *invalid = false;
1599 IDEBUG(msg: "indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1600 const char16_t *uc = s+start;
1601
1602 qsizetype pos = 0;
1603 Form state = form(uc: uc[pos]);
1604 IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1605 pos++;
1606
1607 if (state != Consonant && state != IndependentVowel) {
1608 if (state != Other)
1609 *invalid = true;
1610 goto finish;
1611 }
1612
1613 while (pos < end - start) {
1614 Form newState = form(uc: uc[pos]);
1615 IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1616 switch (newState) {
1617 case Control:
1618 newState = state;
1619 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1620 break;
1621 // the control character should be the last char in the item
1622 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1623 break;
1624 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1625 break;
1626 // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1627 ++pos;
1628 goto finish;
1629 case Consonant:
1630 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1631 break;
1632 goto finish;
1633 case Halant:
1634 if (state == Nukta || state == Consonant)
1635 break;
1636 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1637 if (script == QChar::Script_Bengali && pos == 1 &&
1638 (uc[0] == 0x0985 || uc[0] == 0x098f))
1639 break;
1640 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1641 if (script == QChar::Script_Sinhala && state == Matra) {
1642 ++pos;
1643 continue;
1644 }
1645 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1646 ++pos;
1647 continue;
1648 }
1649 goto finish;
1650 case Nukta:
1651 if (state == Consonant)
1652 break;
1653 goto finish;
1654 case StressMark:
1655 if (state == VowelMark)
1656 break;
1657 Q_FALLTHROUGH();
1658 case VowelMark:
1659 if (state == Matra || state == LengthMark || state == IndependentVowel)
1660 break;
1661 Q_FALLTHROUGH();
1662 case Matra:
1663 if (state == Consonant || state == Nukta)
1664 break;
1665 if (state == Matra) {
1666 // ### needs proper testing for correct two/three part matras
1667 break;
1668 }
1669 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1670 // it work for all Indic languages?
1671 // the combination Independent_A + Vowel Sign AA is allowed.
1672 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1673 break;
1674 if (script == QChar::Script_Tamil && state == Matra) {
1675 if (uc[pos-1] == 0x0bc6 &&
1676 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1677 break;
1678 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1679 break;
1680 }
1681 goto finish;
1682
1683 case LengthMark:
1684 if (state == Matra) {
1685 // ### needs proper testing for correct two/three part matras
1686 break;
1687 }
1688 Q_FALLTHROUGH();
1689 case IndependentVowel:
1690 case Invalid:
1691 case Other:
1692 goto finish;
1693 }
1694 state = newState;
1695 pos++;
1696 }
1697 finish:
1698 return pos+start;
1699}
1700
1701static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1702{
1703 qsizetype end = from + len;
1704 attributes += from;
1705 qsizetype i = 0;
1706 while (i < len) {
1707 bool invalid;
1708 qsizetype boundary = indic_nextSyllableBoundary(script, s: text, start: from+i, end, invalid: &invalid) - from;
1709 attributes[i].graphemeBoundary = true;
1710
1711 if (boundary > len-1) boundary = len;
1712 i++;
1713 while (i < boundary) {
1714 attributes[i].graphemeBoundary = false;
1715 ++i;
1716 }
1717 assert(i == boundary);
1718 }
1719
1720
1721}
1722
1723#if QT_CONFIG(library)
1724
// Major version passed to QLibrary when loading libthai (see the LibThai constructor).
#define LIBTHAI_MAJOR 0

/*
 * Local copy of libthai's cell structure, filled in by th_next_cell().
 * libthai's headers are not included here, so this declaration must be kept
 * in sync by hand: if libthai changes its definition, update this one too.
 */
struct thcell_t {
    unsigned char base; /**< base character */
    unsigned char hilo; /**< upper/lower vowel/diacritic */
    unsigned char top; /**< top-level mark */
};

// libthai's word-break state type; only ever handled through pointers in this file.
using ThBrk = struct _ThBrk;
1737
1738namespace {
1739
1740class LibThai final
1741{
1742 Q_DISABLE_COPY_MOVE(LibThai)
1743
1744 using th_brk_new_def = ThBrk *(*)(const char *);
1745 using th_brk_delete_def = void (*)(ThBrk *);
1746 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1747 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1748
1749public:
1750 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1751 {
1752 m_th_brk_find_breaks =
1753 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve(symbol: "th_brk_find_breaks"));
1754 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve(symbol: "th_next_cell"));
1755
1756 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve(symbol: "th_brk_new"));
1757 if (th_brk_new) {
1758 m_state = th_brk_new(nullptr);
1759 m_th_brk_delete =
1760 reinterpret_cast<th_brk_delete_def>(m_library.resolve(symbol: "th_brk_delete"));
1761 }
1762 }
1763
1764 ~LibThai()
1765 {
1766 if (m_state && m_th_brk_delete)
1767 m_th_brk_delete(m_state);
1768 m_library.unload();
1769 }
1770
1771 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1772
1773 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1774 {
1775 Q_ASSERT(m_state);
1776 Q_ASSERT(m_th_brk_find_breaks);
1777 return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
1778 }
1779
1780 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1781 {
1782 Q_ASSERT(m_th_next_cell);
1783 return m_th_next_cell(s, len, cell, is_decomp_am);
1784 }
1785
1786private:
1787 QLibrary m_library;
1788
1789 // Global state for th_brk_find_breaks().
1790 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1791 // state is read-only, and so it is safe to use it from multiple threads after
1792 // initialization. This is also stated in the libthai documentation.
1793 ThBrk *m_state = nullptr;
1794
1795 th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
1796 th_next_cell_def m_th_next_cell = nullptr;
1797 th_brk_delete_def m_th_brk_delete = nullptr;
1798};
1799
1800} // unnamed namespace
1801
1802Q_GLOBAL_STATIC(LibThai, g_libThai)
1803
// Converts len UTF-16 code units to a NUL-terminated single-byte TIS-620 string.
// cstr must have room for len + 1 bytes.
//   - code units up to 0xa0 are copied unchanged
//   - 0xe01..0xe5b are mapped to 0xa1..0xfb
//   - everything else becomes 0xff
static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
{
    // Same encoding as libthai uses for invalid chars
    constexpr unsigned char InvalidChar = 0xff;

    auto *tis = reinterpret_cast<unsigned char *>(cstr);

    qsizetype idx = 0;
    while (idx < len) {
        const char16_t ch = string[idx];
        unsigned char mapped = InvalidChar;
        if (ch <= 0xa0)
            mapped = static_cast<unsigned char>(ch);
        else if (ch >= 0xe01 && ch <= 0xe5b)
            mapped = static_cast<unsigned char>(ch - 0xe00 + 0xa0);
        tis[idx] = mapped;
        ++idx;
    }

    tis[len] = 0;
}
1820
1821/*
1822 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1823 */
1824static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1825{
1826 constexpr qsizetype Prealloc = 128;
1827 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1828 QVarLengthArray<int, Prealloc> break_positions(len);
1829 qsizetype numbreaks, i;
1830 struct thcell_t tis_cell;
1831
1832 LibThai *libThai = g_libThai;
1833 if (!libThai || !libThai->isInitialized())
1834 return;
1835
1836 to_tis620(string, len, cstr: s.data());
1837
1838 for (i = 0; i < len; ++i) {
1839 attributes[i].wordBreak = false;
1840 attributes[i].wordStart = false;
1841 attributes[i].wordEnd = false;
1842 attributes[i].lineBreak = false;
1843 }
1844
1845 attributes[0].wordBreak = true;
1846 attributes[0].wordStart = true;
1847 attributes[0].wordEnd = false;
1848 numbreaks = libThai->brk_find_breaks(s: reinterpret_cast<const unsigned char *>(s.data()),
1849 pos: break_positions.data(),
1850 pos_sz: static_cast<size_t>(break_positions.size()));
1851 for (i = 0; i < numbreaks; ++i) {
1852 attributes[break_positions[i]].wordBreak = true;
1853 attributes[break_positions[i]].wordStart = true;
1854 attributes[break_positions[i]].wordEnd = true;
1855 attributes[break_positions[i]].lineBreak = true;
1856 }
1857 if (numbreaks > 0)
1858 attributes[break_positions[numbreaks - 1]].wordStart = false;
1859
1860 /* manage grapheme boundaries */
1861 i = 0;
1862 while (i < len) {
1863 size_t cell_length =
1864 libThai->next_cell(s: reinterpret_cast<const unsigned char *>(s.data()) + i,
1865 len: size_t(len - i), cell: &tis_cell, is_decomp_am: true);
1866
1867 attributes[i].graphemeBoundary = true;
1868 for (size_t j = 1; j < cell_length; ++j)
1869 attributes[i + j].graphemeBoundary = false;
1870
1871 i += cell_length;
1872 }
1873}
1874
1875#endif // QT_CONFIG(library)
1876
1877static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1878{
1879 assert(script == QChar::Script_Thai);
1880#if QT_CONFIG(library)
1881 const char16_t *uc = text + from;
1882 attributes += from;
1883 Q_UNUSED(script);
1884 thaiAssignAttributes(string: uc, len, attributes);
1885#else
1886 Q_UNUSED(script);
1887 Q_UNUSED(text);
1888 Q_UNUSED(from);
1889 Q_UNUSED(len);
1890 Q_UNUSED(attributes);
1891#endif
1892}
1893
1894/*
1895 tibetan syllables are of the form:
1896 head position consonant
1897 first sub-joined consonant
1898 ....intermediate sub-joined consonants (if any)
1899 last sub-joined consonant
1900 sub-joined vowel (a-chung U+0F71)
1901 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1902*/
1903
// Character classes used by the Tibetan syllable scanner.
enum TibetanForm {
    TibetanOther = 0,
    TibetanHeadConsonant = 1,
    TibetanSubjoinedConsonant = 2,
    TibetanSubjoinedVowel = 3,
    TibetanVowel = 4
};
1911
/* this table starts at U+0f40 */
/* One TibetanForm value per code point, 0x80 entries (U+0F40..U+0FBF), four
   code points per source line. It is read through the tibetan_form() macro. */
static const unsigned char tibetanForm[0x80] = {
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,

    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,

    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,

    TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,

    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,

    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,

    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,

    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther
};
1954
// Maps a code unit to its TibetanForm: a lookup in tibetanForm for
// 0x0f40 <= c < 0x0fc0, TibetanOther for everything else.
// Note that the argument is evaluated more than once.
#define tibetan_form(c) \
    ((c) >= 0x0f40 && (c) < 0x0fc0 ? (TibetanForm)tibetanForm[(c) - 0x0f40] : TibetanOther)
1957
1958static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1959{
1960 const char16_t *uc = s + start;
1961
1962 qsizetype pos = 0;
1963 TibetanForm state = tibetan_form(*uc);
1964
1965/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1966 pos++;
1967
1968 if (state != TibetanHeadConsonant) {
1969 if (state != TibetanOther)
1970 *invalid = true;
1971 goto finish;
1972 }
1973
1974 while (pos < end - start) {
1975 TibetanForm newState = tibetan_form(uc[pos]);
1976 switch (newState) {
1977 case TibetanSubjoinedConsonant:
1978 case TibetanSubjoinedVowel:
1979 if (state != TibetanHeadConsonant &&
1980 state != TibetanSubjoinedConsonant)
1981 goto finish;
1982 state = newState;
1983 break;
1984 case TibetanVowel:
1985 if (state != TibetanHeadConsonant &&
1986 state != TibetanSubjoinedConsonant &&
1987 state != TibetanSubjoinedVowel)
1988 goto finish;
1989 break;
1990 case TibetanOther:
1991 case TibetanHeadConsonant:
1992 goto finish;
1993 }
1994 pos++;
1995 }
1996
1997finish:
1998 *invalid = false;
1999 return start+pos;
2000}
2001
2002static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2003{
2004 qsizetype end = from + len;
2005 qsizetype i = 0;
2006 Q_UNUSED(script);
2007 attributes += from;
2008 while (i < len) {
2009 bool invalid;
2010 qsizetype boundary = tibetan_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
2011
2012 attributes[i].graphemeBoundary = true;
2013
2014 if (boundary > len-1) boundary = len;
2015 i++;
2016 while (i < boundary) {
2017 attributes[i].graphemeBoundary = false;
2018 ++i;
2019 }
2020 assert(i == boundary);
2021 }
2022}
2023
// Myanmar character classes. The numeric value of a class is the column index
// into mymrStateTable, so the values must stay dense and Mymr_CC_COUNT must
// equal the number of classes.
enum MymrCharClassValues {
    Mymr_CC_RESERVED = 0,
    Mymr_CC_CONSONANT = 1, /* Consonant of type 1, that has subscript form */
    Mymr_CC_CONSONANT2 = 2, /* Consonant of type 2, that has no subscript form */
    Mymr_CC_NGA = 3, /* Consonant NGA */
    Mymr_CC_YA = 4, /* Consonant YA */
    Mymr_CC_RA = 5, /* Consonant RA */
    Mymr_CC_WA = 6, /* Consonant WA */
    Mymr_CC_HA = 7, /* Consonant HA */
    Mymr_CC_IND_VOWEL = 8, /* Independent vowel */
    Mymr_CC_ZERO_WIDTH_NJ_MARK = 9, /* Zero Width non joiner character (0x200C) */
    Mymr_CC_VIRAMA = 10, /* Subscript consonant combining character */
    Mymr_CC_PRE_VOWEL = 11, /* Dependent vowel, prebase (Vowel e) */
    Mymr_CC_BELOW_VOWEL = 12, /* Dependent vowel, below base (Vowel u, uu) */
    Mymr_CC_ABOVE_VOWEL = 13, /* Dependent vowel, above base (Vowel i, ii, ai) */
    Mymr_CC_POST_VOWEL = 14, /* Dependent vowel, postbase (Vowel aa) */
    Mymr_CC_SIGN_ABOVE = 15,
    Mymr_CC_SIGN_BELOW = 16,
    Mymr_CC_SIGN_AFTER = 17,
    Mymr_CC_ZERO_WIDTH_J_MARK = 18, /* Zero width joiner character */
    Mymr_CC_COUNT = 19 /* This is the number of character classes */
};
2046
// Flag bits that are OR-ed onto a MymrCharClassValues class number.
enum MymrCharClassFlags {
    /* the low 16 bits hold the character class itself */
    Mymr_CF_CLASS_MASK = 0xFFFF,

    /* position flags (bits 16..19) */
    Mymr_CF_POS_AFTER = 1 << 16,
    Mymr_CF_POS_ABOVE = 1 << 17,
    Mymr_CF_POS_BELOW = 1 << 18,
    Mymr_CF_POS_BEFORE = 1 << 19,
    Mymr_CF_POS_MASK = Mymr_CF_POS_AFTER | Mymr_CF_POS_ABOVE | Mymr_CF_POS_BELOW | Mymr_CF_POS_BEFORE,

    Mymr_CF_AFTER_KINZI = 1 << 20,

    /* category flags, to speed up comparing (bits 24..29) */
    Mymr_CF_CONSONANT = 1 << 24,
    Mymr_CF_MEDIAL = 1 << 25,
    Mymr_CF_IND_VOWEL = 1 << 26,
    Mymr_CF_DEP_VOWEL = 1 << 27,
    Mymr_CF_DOTTED_CIRCLE = 1 << 28, /* add a dotted circle if a character with this flag is the
                                        first in a syllable */
    Mymr_CF_VIRAMA = 1 << 29
};
2067
2068Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
2069
/* Characters that get referred to by name (listed in code point order) */
enum MymrChar
{
    Mymr_C_NGA = 0x1004,
    Mymr_C_YA = 0x101A,
    Mymr_C_RA = 0x101B,
    Mymr_C_VOWEL_E = 0x1031,
    Mymr_C_VIRAMA = 0x1039,
    Mymr_C_SIGN_ZWNJ = 0x200C,
    Mymr_C_SIGN_ZWJ = 0x200D,
    Mymr_C_DOTTED_CIRCLE = 0x25CC
};
2082
// Complete per-character class words used in mymrCharClasses: one
// MymrCharClassValues class number (low 16 bits) combined with the
// MymrCharClassFlags bits that apply to that kind of character.
enum
{
    Mymr_xx = Mymr_CC_RESERVED,
    Mymr_c1 = Mymr_CC_CONSONANT | Mymr_CF_CONSONANT | Mymr_CF_POS_BELOW,
    Mymr_c2 = Mymr_CC_CONSONANT2 | Mymr_CF_CONSONANT,
    Mymr_ng = Mymr_CC_NGA | Mymr_CF_CONSONANT | Mymr_CF_POS_ABOVE,
    Mymr_ya = Mymr_CC_YA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_AFTER | Mymr_CF_AFTER_KINZI,
    Mymr_ra = Mymr_CC_RA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BEFORE,
    Mymr_wa = Mymr_CC_WA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
    Mymr_ha = Mymr_CC_HA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
    Mymr_id = Mymr_CC_IND_VOWEL | Mymr_CF_IND_VOWEL,
    Mymr_vi = Mymr_CC_VIRAMA | Mymr_CF_VIRAMA | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE,
    Mymr_dl = Mymr_CC_PRE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BEFORE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_db = Mymr_CC_BELOW_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_da = Mymr_CC_ABOVE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_dr = Mymr_CC_POST_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
    Mymr_sa = Mymr_CC_SIGN_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_ABOVE | Mymr_CF_AFTER_KINZI,
    Mymr_sb = Mymr_CC_SIGN_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_BELOW | Mymr_CF_AFTER_KINZI,
    Mymr_sp = Mymr_CC_SIGN_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI
};
2103
2104
// A Myanmar class word: class number in the low 16 bits plus flag bits.
using MymrCharClass = int;
2106
2107
// Class word for every code point in U+1000..U+105F; getMyanmarCharClass()
// indexes this table with (ch - 0x1000). Two source lines cover 16 code points.
static const MymrCharClass mymrCharClasses[] =
{
    Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1,
    Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, /* 1000 - 100F */
    Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1,
    Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, /* 1010 - 101F */
    Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id,
    Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, /* 1020 - 102F */
    Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb,
    Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1030 - 103F */
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1040 - 104F */
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
    Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1050 - 105F */
};
2123
2124static MymrCharClass
2125getMyanmarCharClass (ushort ch)
2126{
2127 if (ch == Mymr_C_SIGN_ZWJ)
2128 return Mymr_CC_ZERO_WIDTH_J_MARK;
2129
2130 if (ch == Mymr_C_SIGN_ZWNJ)
2131 return Mymr_CC_ZERO_WIDTH_NJ_MARK;
2132
2133 if (ch < 0x1000 || ch > 0x105f)
2134 return Mymr_CC_RESERVED;
2135
2136 return mymrCharClasses[ch - 0x1000];
2137}
2138
// Transition table of the Myanmar syllable scanner.
// Rows are the current state, columns are the character class of the next code
// unit (a MymrCharClassValues value). A non-negative entry is the next state;
// a negative entry ends the syllable (see myanmar_nextSyllableBoundary() for
// how -1 and -2 differ).
static const signed char mymrStateTable[][Mymr_CC_COUNT] =
{
/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
    { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
    {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
    {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
    {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
    {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
    {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
    {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
    {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
/* exit state -2 is for invalid order of medials and combination of invalids
   with virama where virama should treat as start of next syllable
 */
};
2174
/* Debug tracing for the Myanmar syllable scanner: define MYANMAR_DEBUG to route
   MMDEBUG(...) to qDebug(...). Otherwise MMDEBUG(...) expands to a printf call
   guarded by `if (0)`, so it is never executed. */
/*#define MYANMAR_DEBUG */
#ifdef MYANMAR_DEBUG
#define MMDEBUG qDebug
#else
#  define MMDEBUG \
      if (0) \
      printf
#endif
2183
2184/*
2185// Given an input string of characters and a location in which to start looking
2186// calculate, using the state table, which one is the last character of the syllable
2187// that starts in the starting position.
2188*/
2189static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2190{
2191 const char16_t *uc = s + start;
2192 int state = 0;
2193 qsizetype pos = start;
2194 *invalid = false;
2195
2196 while (pos < end) {
2197 MymrCharClass charClass = getMyanmarCharClass(ch: *uc);
2198 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
2199 if (pos == start)
2200 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
2201
2202 MMDEBUG(format: "state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
2203
2204 if (state < 0) {
2205 if (state < -1)
2206 --pos;
2207 break;
2208 }
2209 ++uc;
2210 ++pos;
2211 }
2212 return pos;
2213}
2214
2215static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2216{
2217 qsizetype end = from + len;
2218 qsizetype i = 0;
2219 Q_UNUSED(script);
2220 attributes += from;
2221 while (i < len) {
2222 bool invalid;
2223 qsizetype boundary = myanmar_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
2224
2225 attributes[i].graphemeBoundary = true;
2226 attributes[i].lineBreak = true;
2227
2228 if (boundary > len-1)
2229 boundary = len;
2230 i++;
2231 while (i < boundary) {
2232 attributes[i].graphemeBoundary = false;
2233 ++i;
2234 }
2235 assert(i == boundary);
2236 }
2237}
2238
2239/*
2240// Vocabulary
2241// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
2242// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
2243// split vowels, signs... but there is only one base in a syllable, it has to be coded as
2244// the first character of the syllable.
2245// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
2246// Khmer language has five of them. Khmer split vowels either have one part before the
2247// base and one after the base or they have a part before the base and a part above the base.
2248// The first part of all Khmer split vowels is the same character, identical to
2249// the glyph of Khmer dependent vowel SRA EI
// coeng --> modifier used in Khmer to construct coeng (subscript) consonants.
//     Unlike in Indian languages, the coeng modifies the consonant that follows it,
//     not the one preceding it. Each consonant has two forms, the base form and the subscript form:
//     the base form is the normal one (using the consonant's code point), the subscript form is
//     displayed when the combination coeng + consonant is encountered.
// Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
2256// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
2257// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
//     depending on whether it is attached to a consonant of the first series or of the second series.
//     Most consonants have an equivalent in the other series, but some of them exist only in
//     one series (for example SA). If we want to use the consonant SA with a vowel sound that
//     can only be produced by a vowel accompanying a consonant
//     of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
//     (U+17C9 and U+17CA). TRIISAP changes a first series consonant to a second series sound and
//     MUSIKATOAN changes a second series consonant to have a first series vowel sound.
//     Consonant shifters are both normally superscript marks, but, when they are followed by a
//     superscript, they change shape and take the form of the subscript dependent vowel SRA U.
//     If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
//     should be typed before the coeng. Unicode 4.0 breaks the standard and says that they should
//     be placed after the coeng consonant.
2271// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2272// Each vowel has its own position. Only one vowel per syllable is allowed.
2273// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2274// Allowed in a syllable.
2275//
2276//
2277// order is important here! This order must be the same that is found in each horizontal
2278// line in the statetable for Khmer (see khmerStateTable) .
2279*/
// Khmer character classes. The values are dense, starting at 0, and CC_COUNT
// is the number of classes.
enum KhmerCharClassValues {
    CC_RESERVED = 0,
    CC_CONSONANT = 1, /* Consonant of type 1 or independent vowel */
    CC_CONSONANT2 = 2, /* Consonant of type 2 */
    CC_CONSONANT3 = 3, /* Consonant of type 3 */
    CC_ZERO_WIDTH_NJ_MARK = 4, /* Zero Width non joiner character (0x200C) */
    CC_CONSONANT_SHIFTER = 5,
    CC_ROBAT = 6, /* Khmer special diacritic accent -treated differently in state table */
    CC_COENG = 7, /* Subscript consonant combining character */
    CC_DEPENDENT_VOWEL = 8,
    CC_SIGN_ABOVE = 9,
    CC_SIGN_AFTER = 10,
    CC_ZERO_WIDTH_J_MARK = 11, /* Zero width joiner character */
    CC_COUNT = 12 /* This is the number of character classes */
};
2295
2296
/*
// Flags that are ORed on top of a KhmerCharClassValues value to form a
// KhmerCharClass. The class value lives in the bits selected by CF_CLASS_MASK,
// the position flags in the bits selected by CF_POS_MASK, and the remaining
// flags in the top byte.
*/
enum KhmerCharClassFlags {
    CF_CLASS_MASK = 0x0000FFFF,    /* extracts the KhmerCharClassValues part of a KhmerCharClass */

    CF_CONSONANT = 0x01000000,     /* flag to speed up comparing */
    CF_SPLIT_VOWEL = 0x02000000,   /* flag for a split vowel -> the first part is added in front of the syllable */
    CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
    CF_COENG = 0x08000000,         /* flag to speed up comparing */
    CF_SHIFTER = 0x10000000,       /* flag to speed up comparing */
    CF_ABOVE_VOWEL = 0x20000000,   /* flag to speed up comparing */

    /* position flags */
    CF_POS_BEFORE = 0x00080000,
    CF_POS_BELOW = 0x00040000,
    CF_POS_ABOVE = 0x00020000,
    CF_POS_AFTER = 0x00010000,
    CF_POS_MASK = 0x000f0000       /* union of the four position flags above */
};

/*
// Class values and class flags are combined with '|' in the enum of simple
// classes that follows (e.g. CC_SIGN_ABOVE | CF_DOTTED_CIRCLE).
// NOTE(review): this macro presumably declares the bitwise operators for that
// mixed-enum combination with an int result - confirm against its definition.
*/
Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
2316
/* Characters that get referred to by name (values are Unicode code points) */
enum KhmerChar {
    C_SIGN_ZWNJ = 0x200C,    /* ZERO WIDTH NON-JOINER */
    C_SIGN_ZWJ = 0x200D,     /* ZERO WIDTH JOINER */
    C_RO = 0x179A,           /* KHMER LETTER RO */
    C_VOWEL_AA = 0x17B6,     /* KHMER VOWEL SIGN AA */
    C_SIGN_NIKAHIT = 0x17C6, /* KHMER SIGN NIKAHIT */
    C_VOWEL_E = 0x17C1,      /* KHMER VOWEL SIGN E */
    C_COENG = 0x17D2         /* KHMER SIGN COENG */
};
2327
2328
/*
// Simple classes: one class value ORed with the flags that apply to it.
// They are the entries of khmerCharClasses and serve three purposes:
//  - the class value part is fed to the state table (in this file) to control the length of a syllable;
//  - the CF_POS_* flags tell where a character should be placed in reference to the base character;
//  - CF_DOTTED_CIRCLE tells that a character, when independently displayed, should be displayed
//    with a dotted-circle to indicate an error in syllable construction.
*/
enum {
    _xx = CC_RESERVED,
    _sa = CC_SIGN_ABOVE | CF_DOTTED_CIRCLE | CF_POS_ABOVE,
    _sp = CC_SIGN_AFTER | CF_DOTTED_CIRCLE| CF_POS_AFTER,
    _c1 = CC_CONSONANT | CF_CONSONANT,
    _c2 = CC_CONSONANT2 | CF_CONSONANT,
    _c3 = CC_CONSONANT3 | CF_CONSONANT,
    _rb = CC_ROBAT | CF_POS_ABOVE | CF_DOTTED_CIRCLE,
    _cs = CC_CONSONANT_SHIFTER | CF_DOTTED_CIRCLE | CF_SHIFTER,
    _dl = CC_DEPENDENT_VOWEL | CF_POS_BEFORE | CF_DOTTED_CIRCLE,
    _db = CC_DEPENDENT_VOWEL | CF_POS_BELOW | CF_DOTTED_CIRCLE,
    _da = CC_DEPENDENT_VOWEL | CF_POS_ABOVE | CF_DOTTED_CIRCLE | CF_ABOVE_VOWEL,
    _dr = CC_DEPENDENT_VOWEL | CF_POS_AFTER | CF_DOTTED_CIRCLE,
    _co = CC_COENG | CF_COENG | CF_DOTTED_CIRCLE,

    /* split vowel: the corresponding dependent-vowel class plus CF_SPLIT_VOWEL */
    _va = _da | CF_SPLIT_VOWEL,
    _vr = _dr | CF_SPLIT_VOWEL
};
2354
2355
/*
// A Khmer character class: a character class value (the bits selected by
// CF_CLASS_MASK) ORed with any number of character class flags.
*/
using KhmerCharClass = unsigned long;
2361
2362
/*
// Character class table: one entry per code point from 0x1780 to 0x17DF, in code point order
// (16 entries per row, the range of each row is given at its end).
//
// _xx Character does not combine into a syllable, such as numbers, punctuation marks, non-Khmer signs...
// _sa Sign placed above the base
// _sp Sign placed after the base
// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
// _c2 Consonant of type 2 (only RO)
// _c3 Consonant of type 3
// _rb Khmer sign robat u17CC. Combining mark for subscript consonants
// _cs Consonant-shifter
// _dl Dependent vowel placed before the base (left of the base)
// _db Dependent vowel placed below the base
// _da Dependent vowel placed above the base
// _dr Dependent vowel placed behind the base (right of the base)
// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
//     it to create a subscript consonant or independent vowel
// _va Khmer split vowel in which the first part is before the base and the second one above the base
// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
*/
static const KhmerCharClass khmerCharClasses[] = {
    _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
    _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
    _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
    _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
    _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
    _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
};
2390
/*
// This enum must reflect the range of khmerCharClasses: getKhmerCharClass only
// indexes the table for code units in [KhmerFirstChar, KhmerLastChar], using
// (code unit - KhmerFirstChar) as the index.
*/
enum KhmerCharClassesRange {
    KhmerFirstChar = 0x1780, /* code point of the first table entry */
    KhmerLastChar = 0x17df   /* code point of the last table entry */
};
2396
2397/*
2398// Below we define how a character in the input string is either in the khmerCharClasses table
2399// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2400// within the syllable, but are not in the table) we also get their type back, or an unknown object
2401// in which case we get _xx (CC_RESERVED) back
2402*/
2403static KhmerCharClass getKhmerCharClass(ushort uc)
2404{
2405 if (uc == C_SIGN_ZWJ) {
2406 return CC_ZERO_WIDTH_J_MARK;
2407 }
2408
2409 if (uc == C_SIGN_ZWNJ) {
2410 return CC_ZERO_WIDTH_NJ_MARK;
2411 }
2412
2413 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2414 return CC_RESERVED;
2415 }
2416
2417 return khmerCharClasses[uc - KhmerFirstChar];
2418}
2419
2420
/*
// The state table is used to calculate the end (the length) of a well
// formed Khmer syllable.
//
// Each row is a state and each column a character class; the columns are ordered
// exactly the same way as the values in KhmerCharClassValues, so a class value can
// be used directly as the column index. An entry is the next state; a negative
// entry means that the character does not belong to the current syllable.
//
// A state does not necessarily correspond to a type of component... for example,
// state 2 is a base, which is always the first character in the syllable, but the
// state can be produced by a consonant of any type when it is the first character
// that is analysed (in ground state).
//
// Differentiating 3 types of consonants is necessary in order to
// forbid the use of certain combinations, such as having a second
// coeng after a coeng RO.
// The nonexistent possibility of having a type 3 after another type 3 is permitted;
// eliminating it would very much complicate the table, and it does not create typing
// problems, unlike the case above.
//
// The table is quite complex, in order to limit the number of coeng consonants
// to 2 (by means of the table).
//
// There is a peculiarity, as far as Unicode is concerned:
// - The consonant-shifter is considered in two possible different
//   locations, the one considered in Unicode 3.0 and the one considered in
//   Unicode 4.0 (there is a backwards compatibility problem in this standard).
//
//
// xx    independent character, such as a number, punctuation sign or non-Khmer char
//
// c1    Khmer consonant of type 1 or an independent vowel
//       that is, a letter in which the subscript form is only under the
//       base, not taking any space to the right or to the left
//
// c2    Khmer consonant of type 2, the coeng form takes space under
//       and to the left of the base (only RO is of this type)
//
// c3    Khmer consonant of type 3. Its subscript form takes space under
//       and to the right of the base.
//
// cs    Khmer consonant shifter
//
// rb    Khmer robat
//
// co    coeng character (u17D2)
//
// dv    dependent vowel (including split vowels, they are treated in the same way).
//       even if dv is not defined above, the component that is really tested for is
//       CC_DEPENDENT_VOWEL, which is common to all dependent vowels
//
// zwj   Zero Width joiner
//
// zwnj  Zero width non joiner
//
// sa    above sign
//
// sp    post sign
//
// There are rows with equal content, but for an easier understanding
// (and maybe change in the future) we did not join them.
*/
static const signed char khmerStateTable[][CC_COUNT] =
{
    /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
    { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
    {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
    {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter. It can only be followed by a shifter or a vowel */
    {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
    {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
    {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
    {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
    {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
    {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant of type 3 after coeng */
    {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
    {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
    {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
    {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
    {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
    {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
    {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
    {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
};
2507
2508
/*
// Tracing for the Khmer syllable scanner.
// With KHMER_DEBUG defined (uncomment the line below), KHDEBUG is qDebug.
// Otherwise KHDEBUG expands to "if (0) printf", so the call is still compiled
// but never executed. Because of that expansion, do not use KHDEBUG as the
// body of an if that has an else branch.
*/
/* #define KHMER_DEBUG */
#ifdef KHMER_DEBUG
#define KHDEBUG qDebug
#else
# define KHDEBUG \
    if (0) \
        printf
#endif
2517
2518/*
2519// Given an input string of characters and a location in which to start looking
2520// calculate, using the state table, which one is the last character of the syllable
2521// that starts in the starting position.
2522*/
2523static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2524{
2525 const char16_t *uc = s + start;
2526 int state = 0;
2527 qsizetype pos = start;
2528 *invalid = false;
2529
2530 while (pos < end) {
2531 KhmerCharClass charClass = getKhmerCharClass(uc: *uc);
2532 if (pos == start) {
2533 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2534 }
2535 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2536
2537 KHDEBUG(format: "state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2538 charClass, *uc );
2539
2540 if (state < 0) {
2541 break;
2542 }
2543 ++uc;
2544 ++pos;
2545 }
2546 return pos;
2547}
2548
2549static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2550{
2551 qsizetype end = from + len;
2552 qsizetype i = 0;
2553 Q_UNUSED(script);
2554 attributes += from;
2555 while ( i < len ) {
2556 bool invalid;
2557 qsizetype boundary = khmer_nextSyllableBoundary( s: text, start: from+i, end, invalid: &invalid ) - from;
2558
2559 attributes[i].graphemeBoundary = true;
2560
2561 if ( boundary > len-1 ) boundary = len;
2562 i++;
2563 while ( i < boundary ) {
2564 attributes[i].graphemeBoundary = false;
2565 ++i;
2566 }
2567 assert( i == boundary );
2568 }
2569}
2570
2571
// Tailoring function per script, indexed by the QChar::Script value (see
// getCharAttributes). The comment above each entry names the script the entry
// belongs to; nullptr means that there is no tailoring for that script.
// Script_Khmer is the last entry: getCharAttributes maps every script greater
// than Script_Khmer to Script_Common before indexing this table.
const CharAttributeFunction charAttributeFunction[] = {
//        Script_Unknown,
    nullptr,
//        Script_Inherited,
    nullptr,
//        Script_Common,
    nullptr,
//        Script_Latin,
    nullptr,
//        Script_Greek,
    nullptr,
//        Script_Cyrillic,
    nullptr,
//        Script_Armenian,
    nullptr,
//        Script_Hebrew,
    nullptr,
//        Script_Arabic,
    nullptr,
//        Script_Syriac,
    nullptr,
//        Script_Thaana,
    nullptr,
//        Script_Devanagari,
    indicAttributes,
//        Script_Bengali,
    indicAttributes,
//        Script_Gurmukhi,
    indicAttributes,
//        Script_Gujarati,
    indicAttributes,
//        Script_Oriya,
    indicAttributes,
//        Script_Tamil,
    indicAttributes,
//        Script_Telugu,
    indicAttributes,
//        Script_Kannada,
    indicAttributes,
//        Script_Malayalam,
    indicAttributes,
//        Script_Sinhala,
    indicAttributes,
//        Script_Thai,
    thaiAttributes,
//        Script_Lao,
    nullptr,
//        Script_Tibetan,
    tibetanAttributes,
//        Script_Myanmar,
    myanmarAttributes,
//        Script_Georgian,
    nullptr,
//        Script_Hangul,
    nullptr,
//        Script_Ethiopic,
    nullptr,
//        Script_Cherokee,
    nullptr,
//        Script_CanadianAboriginal,
    nullptr,
//        Script_Ogham,
    nullptr,
//        Script_Runic,
    nullptr,
//        Script_Khmer,
    khmerAttributes
};
2640
2641static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2642 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2643 QCharAttributes *attributes)
2644{
2645 if (stringLength == 0)
2646 return;
2647 for (qsizetype i = 0; i < numItems; ++i) {
2648 QChar::Script script = items[i].script;
2649 if (script > QChar::Script_Khmer)
2650 script = QChar::Script_Common;
2651 CharAttributeFunction attributeFunction = charAttributeFunction[script];
2652 if (!attributeFunction)
2653 continue;
2654 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2655 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2656 }
2657}
2658
2659}
2660
2661Q_CORE_EXPORT void initCharAttributes(QStringView string,
2662 const ScriptItem *items, qsizetype numItems,
2663 QCharAttributes *attributes, CharAttributeOptions options)
2664{
2665 if (string.size() <= 0)
2666 return;
2667
2668 if (!(options & DontClearAttributes))
2669 ::memset(s: attributes, c: 0, n: (string.size() + 1) * sizeof(QCharAttributes));
2670
2671 if (options & GraphemeBreaks)
2672 getGraphemeBreaks(string: string.utf16(), len: string.size(), attributes);
2673 if (options & WordBreaks)
2674 getWordBreaks(string: string.utf16(), len: string.size(), attributes);
2675 if (options & SentenceBreaks)
2676 getSentenceBreaks(string: string.utf16(), len: string.size(), attributes);
2677 if (options & LineBreaks)
2678 getLineBreaks(string: string.utf16(), len: string.size(), attributes, options);
2679 if (options & WhiteSpaces)
2680 getWhiteSpaces(string: string.utf16(), len: string.size(), attributes);
2681
2682 if (!qt_initcharattributes_default_algorithm_only) {
2683 if (!items || numItems <= 0)
2684 return;
2685
2686 Tailored::getCharAttributes(string: string.utf16(), stringLength: string.size(), items, numItems, attributes);
2687 }
2688}
2689
2690
2691// ----------------------------------------------------------------------------
2692//
2693// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2694//
2695// ----------------------------------------------------------------------------
2696
2697Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2698{
2699 qsizetype sor = 0;
2700 qsizetype eor = 0;
2701 QChar::Script script = QChar::Script_Common;
2702
2703 for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
2704 char32_t ucs4 = string[i].unicode();
2705 if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2706 ushort low = string[i + 1].unicode();
2707 if (QChar::isLowSurrogate(ucs4: low)) {
2708 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
2709 ++i;
2710 }
2711 }
2712
2713 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2714
2715 QChar::Script nscript = QChar::Script(prop->script);
2716
2717 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2718 continue;
2719
2720 // inherit preceding Common-s
2721 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2722 // also covers a case where the base character of Common script followed
2723 // by one or more combining marks of non-Inherited, non-Common script
2724 script = nscript;
2725 continue;
2726 }
2727
2728 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2729 // Thus, a combining mark - whatever its script property value is - should inherit
2730 // the script property value of its base character.
2731 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2732 if (Q_UNLIKELY(FLAG(prop->category) & test))
2733 continue;
2734
2735 Q_ASSERT(script > QChar::Script_Common);
2736 Q_ASSERT(sor < eor);
2737 scripts->append(t: ScriptItem{.position: sor, .script: script});
2738 sor = eor;
2739
2740 script = nscript;
2741 }
2742
2743 Q_ASSERT(script >= QChar::Script_Common);
2744 Q_ASSERT(eor == string.size());
2745 scripts->append(t: ScriptItem{.position: sor, .script: script});
2746}
2747
2748} // namespace QUnicodeTools
2749
2750QT_END_NAMESPACE
2751

Provided by KDAB

Privacy Policy
Start learning QML with our Intro Training
Find out more

source code of qtbase/src/corelib/text/qunicodetools.cpp