1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qunicodetools_p.h"
5
6#include "qunicodetables_p.h"
7#include "qvarlengtharray.h"
8#if QT_CONFIG(library)
9#include "qlibrary.h"
10#endif
11
12#include <limits.h>
13
// FLAG(x) expands to an int bit mask in which only bit number x is set.
#define FLAG(x) (1 << (x))
15
16QT_BEGIN_NAMESPACE
17
18using namespace Qt::StringLiterals;
19
// Flag read by getWordBreaks(): when it is non-zero, FULL STOP and COLON are
// given the MidNumLet / MidLetter word break classes instead of the class
// stored in the Unicode tables.
// With QT_BUILD_INTERNAL defined it is a Q_AUTOTEST_EXPORT variable; in every
// other build it is the compile-time constant 0.
#ifdef QT_BUILD_INTERNAL
Q_CONSTINIT Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
#else
constexpr int qt_initcharattributes_default_algorithm_only = 0;
#endif
26
27namespace QUnicodeTools {
28
29// -----------------------------------------------------------------------------------------------------
30//
31// The text boundaries determination algorithm.
32// See https://www.unicode.org/reports/tr29/tr29-37.html
33//
34// -----------------------------------------------------------------------------------------------------
35
36namespace GB {
37
38// This table is indexed by the grapheme break classes of two
39// (adjacent) code points.
40// The class of the first code point selects an entry.
41// If the entry's bit at position second_cp_class is set
42// (in other words: if entry & (1u << second_cp_class) is non-zero)
43// then there is NO grapheme break between the two code points.
44
45using GBTableEntryType = quint16;
46
47// Check that we have enough bits in the table (in case
48// NumGraphemeBreakClasses grows too much).
49static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
50 "Internal error: increase the size in bits of GBTableEntryType");
51
52// GB9, GB9a
53static const GBTableEntryType Extend_SpacingMark_ZWJ =
54 FLAG(QUnicodeTables::GraphemeBreak_Extend)
55 | FLAG(QUnicodeTables::GraphemeBreak_SpacingMark)
56 | FLAG(QUnicodeTables::GraphemeBreak_ZWJ);
57
58static const GBTableEntryType HardBreak = 0u;
59
60static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses] = {
61 Extend_SpacingMark_ZWJ, // Any
62 FLAG(QUnicodeTables::GraphemeBreak_LF), // CR
63 HardBreak, // LF
64 HardBreak, // Control
65 Extend_SpacingMark_ZWJ, // Extend
66 Extend_SpacingMark_ZWJ, // ZWJ
67 Extend_SpacingMark_ZWJ, // RegionalIndicator
68 (Extend_SpacingMark_ZWJ
69 | FLAG(QUnicodeTables::GraphemeBreak_Any)
70 | FLAG(QUnicodeTables::GraphemeBreak_Prepend)
71 | FLAG(QUnicodeTables::GraphemeBreak_L)
72 | FLAG(QUnicodeTables::GraphemeBreak_V)
73 | FLAG(QUnicodeTables::GraphemeBreak_T)
74 | FLAG(QUnicodeTables::GraphemeBreak_LV)
75 | FLAG(QUnicodeTables::GraphemeBreak_LVT)
76 | FLAG(QUnicodeTables::GraphemeBreak_RegionalIndicator)
77 | FLAG(QUnicodeTables::GraphemeBreak_Extended_Pictographic)
78 ), // Prepend
79 Extend_SpacingMark_ZWJ, // SpacingMark
80 (Extend_SpacingMark_ZWJ
81 | FLAG(QUnicodeTables::GraphemeBreak_L)
82 | FLAG(QUnicodeTables::GraphemeBreak_V)
83 | FLAG(QUnicodeTables::GraphemeBreak_LV)
84 | FLAG(QUnicodeTables::GraphemeBreak_LVT)
85 ), // L
86 (Extend_SpacingMark_ZWJ
87 | FLAG(QUnicodeTables::GraphemeBreak_V)
88 | FLAG(QUnicodeTables::GraphemeBreak_T)
89 ), // V
90 (Extend_SpacingMark_ZWJ
91 | FLAG(QUnicodeTables::GraphemeBreak_T)
92 ), // T
93 (Extend_SpacingMark_ZWJ
94 | FLAG(QUnicodeTables::GraphemeBreak_V)
95 | FLAG(QUnicodeTables::GraphemeBreak_T)
96 ), // LV
97 (Extend_SpacingMark_ZWJ
98 | FLAG(QUnicodeTables::GraphemeBreak_T)
99 ), // LVT
100 Extend_SpacingMark_ZWJ // Extended_Pictographic
101};
102
103static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
104 QUnicodeTables::GraphemeBreakClass second)
105{
106 return (breakTable[first] & FLAG(second)) == 0;
107}
108
109// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
110// so we need to store some local state.
111enum class State : uchar {
112 Normal,
113 GB11_ExtPicExt, // saw a Extend after a Extended_Pictographic
114 GB11_ExtPicExtZWJ, // saw a ZWG after a Extended_Pictographic and zero or more Extend
115 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
116};
117
118} // namespace GB
119
120static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
121{
122 QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
123 GB::State state = GB::State::Normal;
124 for (qsizetype i = 0; i != len; ++i) {
125 qsizetype pos = i;
126 char32_t ucs4 = string[i];
127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
128 ushort low = string[i + 1];
129 if (QChar::isLowSurrogate(ucs4: low)) {
130 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
131 ++i;
132 }
133 }
134
135 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
136 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
137
138 bool shouldBreak = GB::shouldBreakBetweenClasses(first: lcls, second: cls);
139 bool handled = false;
140
141 switch (state) {
142 case GB::State::Normal:
143 break; // will deal with it below
144
145 case GB::State::GB11_ExtPicExt:
146 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
147 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
148 // keep going in the current state
149 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150 handled = true;
151 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
152 state = GB::State::GB11_ExtPicExtZWJ;
153 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154 handled = true;
155 } else {
156 state = GB::State::Normal;
157 }
158 break;
159
160 case GB::State::GB11_ExtPicExtZWJ:
161 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
162 if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
163 shouldBreak = false;
164 handled = true;
165 }
166
167 state = GB::State::Normal;
168 break;
169
170 case GB::State::GB12_13_RI:
171 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
172 if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
173 shouldBreak = false;
174 handled = true;
175 }
176
177 state = GB::State::Normal;
178 break;
179 }
180
181 if (!handled) {
182 Q_ASSERT(state == GB::State::Normal);
183 if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
184 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
185 state = GB::State::GB11_ExtPicExt;
186 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
188 state = GB::State::GB11_ExtPicExtZWJ;
189 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190 }
191 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
192 state = GB::State::GB12_13_RI;
193 }
194 }
195
196 if (shouldBreak)
197 attributes[pos].graphemeBoundary = true;
198
199 lcls = cls;
200 }
201
202 attributes[len].graphemeBoundary = true; // GB2
203}
204
205
206namespace WB {
207
208enum Action {
209 NoBreak,
210 Break,
211 Lookup,
212 LookupW
213};
214
215static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
216// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
217 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
218 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
219 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
220 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
221 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
222 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
223 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
224 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
225 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
226 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // HebrewLetter
227 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // ALetter
228 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
229 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
230 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
231 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
232 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
233 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break }, // Numeric
234 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ExtendNumLet
235 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
236};
237
238} // namespace WB
239
240static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
241{
242 enum WordType {
243 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244 } currentWordType = WordTypeNone;
245
246 QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
247 auto real_cls = cls; // Unaffected by WB4
248
249 for (qsizetype i = 0; i != len; ++i) {
250 qsizetype pos = i;
251 char32_t ucs4 = string[i];
252 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
253 ushort low = string[i + 1];
254 if (QChar::isLowSurrogate(ucs4: low)) {
255 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
256 ++i;
257 }
258 }
259
260 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
261 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
262 if (qt_initcharattributes_default_algorithm_only) {
263 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264 // which caused "hi.there" to be treated like if it were just a single word;
265 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
267 if (ucs4 == 0x002E) // FULL STOP
268 ncls = QUnicodeTables::WordBreak_MidNumLet;
269 else if (ucs4 == 0x003A) // COLON
270 ncls = QUnicodeTables::WordBreak_MidLetter;
271 }
272
273 uchar action = WB::breakTable[cls][ncls];
274 switch (action) {
275 case WB::Break:
276 if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
277 && prop->graphemeBreakClass
278 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
279 // WB3c: ZWJ × \p{Extended_Pictographic}
280 action = WB::NoBreak;
281 }
282 break;
283 case WB::NoBreak:
284 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
285 // WB4: X(Extend|Format)* -> X
286 real_cls = ncls;
287 continue;
288 }
289 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
290 // WB15/WB16: break between pairs of Regional indicator
291 ncls = QUnicodeTables::WordBreak_Any;
292 }
293 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
294 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295 // WB3d should not be affected by WB4
296 action = WB::Break;
297 }
298 break;
299 case WB::Lookup:
300 case WB::LookupW:
301 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
302 ucs4 = string[lookahead];
303 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
304 ushort low = string[lookahead + 1];
305 if (QChar::isLowSurrogate(ucs4: low)) {
306 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
307 ++lookahead;
308 }
309 }
310
311 prop = QUnicodeTables::properties(ucs4);
312 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
313
314 if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) {
315 // WB4: X(Extend|Format)* -> X
316 continue;
317 }
318
319 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
321 i = lookahead;
322 ncls = tcls;
323 action = WB::NoBreak;
324 }
325 break;
326 }
327 if (action != WB::NoBreak) {
328 action = WB::Break;
329 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
330 action = WB::NoBreak; // WB7a
331 }
332 break;
333 }
334
335 cls = ncls;
336 real_cls = ncls;
337
338 if (action == WB::Break) {
339 attributes[pos].wordBreak = true;
340 if (currentWordType != WordTypeNone)
341 attributes[pos].wordEnd = true;
342 switch (cls) {
343 case QUnicodeTables::WordBreak_Katakana:
344 currentWordType = WordTypeHiraganaKatakana;
345 attributes[pos].wordStart = true;
346 break;
347 case QUnicodeTables::WordBreak_HebrewLetter:
348 case QUnicodeTables::WordBreak_ALetter:
349 case QUnicodeTables::WordBreak_Numeric:
350 currentWordType = WordTypeAlphaNumeric;
351 attributes[pos].wordStart = true;
352 break;
353 default:
354 currentWordType = WordTypeNone;
355 break;
356 }
357 }
358 }
359
360 if (currentWordType != WordTypeNone)
361 attributes[len].wordEnd = true;
362 attributes[len].wordBreak = true; // WB2
363}
364
365
366namespace SB {
367
368enum State {
369 Initial,
370 Lower,
371 Upper,
372 LUATerm,
373 ATerm,
374 ATermC,
375 ACS,
376 STerm,
377 STermC,
378 SCS,
379 BAfterC,
380 BAfter,
381 Break,
382 Lookup
383};
384
385static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
386// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
387 { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
388 { Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
389 { Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, Initial, STerm , Initial }, // Upper
390
391 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
392 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
393 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
394 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
395
396 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
397 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
398 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
399 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
400 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
401};
402
403} // namespace SB
404
405static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
406{
407 uchar state = SB::BAfter; // to meet SB1
408 for (qsizetype i = 0; i != len; ++i) {
409 qsizetype pos = i;
410 char32_t ucs4 = string[i];
411 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
412 ushort low = string[i + 1];
413 if (QChar::isLowSurrogate(ucs4: low)) {
414 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
415 ++i;
416 }
417 }
418
419 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
420 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
421
422 Q_ASSERT(state <= SB::BAfter);
423 state = SB::breakTable[state][ncls];
424 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
425 state = SB::Break;
426 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
427 ucs4 = string[lookahead];
428 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
429 ushort low = string[lookahead + 1];
430 if (QChar::isLowSurrogate(ucs4: low)) {
431 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
432 ++lookahead;
433 }
434 }
435
436 prop = QUnicodeTables::properties(ucs4);
437 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
438 switch (tcls) {
439 case QUnicodeTables::SentenceBreak_Any:
440 case QUnicodeTables::SentenceBreak_Extend:
441 case QUnicodeTables::SentenceBreak_Sp:
442 case QUnicodeTables::SentenceBreak_Numeric:
443 case QUnicodeTables::SentenceBreak_SContinue:
444 case QUnicodeTables::SentenceBreak_Close:
445 continue;
446 case QUnicodeTables::SentenceBreak_Lower:
447 i = lookahead;
448 state = SB::Initial;
449 break;
450 default:
451 break;
452 }
453 break;
454 }
455 }
456 if (Q_UNLIKELY(state == SB::Break)) {
457 attributes[pos].sentenceBoundary = true;
458 state = SB::breakTable[SB::Initial][ncls];
459 }
460 }
461
462 attributes[len].sentenceBoundary = true; // SB2
463}
464
465
466// -----------------------------------------------------------------------------------------------------
467//
468// The line breaking algorithm.
469// See http://www.unicode.org/reports/tr14/tr14-39.html
470//
471// -----------------------------------------------------------------------------------------------------
472
473namespace LB {
474
475namespace NS { // Number Sequence
476
477// This namespace is used to implement LB25 which, as of Unicode 16, has this
478// definition:
479// NU ( SY | IS )* CL × PO
480// NU ( SY | IS )* CP × PO
481// NU ( SY | IS )* CL × PR
482// NU ( SY | IS )* CP × PR
483// NU ( SY | IS )* × PO
484// NU ( SY | IS )* × PR
485// PO × OP NU
486// PO × OP IS NU
487// PO × NU
488// PR × OP NU
489// PR × OP IS NU
490// PR × NU
491// HY × NU
492// IS × NU
493// NU ( SY | IS )* × NU
494
495enum Action {
496 None,
497 Start,
498 Continue,
499 Break,
500 NeedOPNU, // Like Start, but must be followed by sequence `(OP (IS)?)? NU`
501 // These are 'synthetic' actions and are not used in the table but are
502 // tracked otherwise in the code for LB25, to track the state of specific
503 // sequences:
504 CNeedNU, // Like Continue, but must be followed by NU
505 CNeedISNU, // Like Continue, but must be followed by IS? NU
506};
507
508enum Class {
509 XX,
510 PRPO,
511 OP,
512 HY,
513 NU,
514 SY,
515 IS,
516 CLCP
517};
518
519static const uchar actionTable[CLCP + 1][CLCP + 1] = {
520// XX PRPO OP HY NU SY IS CLCP
521 { None , NeedOPNU, Start , None , Start , None , None , None }, // XX
522 { None , NeedOPNU, Continue, Break , Start , None , None , None }, // PRPO
523 { None , Start , Start , Break , Continue, None , Continue, None }, // OP
524 { None , None , None , Start , Continue, None , None , None }, // HY
525 { Break , Break , Break , Break , Continue, Continue, Continue, Continue }, // NU
526 { Break , Break , Break , Break , Continue, Continue, Continue, Continue }, // SY
527 { Break , Break , Break , Break , Continue, Continue, Continue, Continue }, // IS
528 { Break , Continue, Break , Break , Break , Break , Break , Break }, // CLCP
529};
530
531inline Class toClass(QUnicodeTables::LineBreakClass lbc)
532{
533 switch (lbc) {
534 case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
535 return PRPO;
536 case QUnicodeTables::LineBreak_OP:
537 return OP;
538 case QUnicodeTables::LineBreak_HY:
539 return HY;
540 case QUnicodeTables::LineBreak_NU:
541 return NU;
542 case QUnicodeTables::LineBreak_SY:
543 return SY;
544 case QUnicodeTables::LineBreak_IS:
545 return IS;
546 case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
547 return CLCP;
548 default:
549 break;
550 }
551 return XX;
552}
553
554} // namespace NS
555
556namespace BRS { // Brahmic Sequence, used to implement LB28a
557 constexpr char32_t DottedCircle = U'\u25CC';
558
559 // The LB28a_{n} value maps to the 'regex' on the nth line in LB28a
560 // The only special case is LB28a_2VI which is a direct match to the 2nd
561 // line, but it also leads to LB28a_3VIAK, the 3rd line.
562 enum State {
563 None,
564 Start, // => Have: `(AK | [◌] | AS)`
565 LB28a_2VF, // => Have: `(AK | [◌] | AS) VF`
566 LB28a_2VI, // => Have: `(AK | [◌] | AS) VI` May find: `(AK | [◌])`
567 LB28a_3VIAK, // => Have: `(AK | [◌] | AS) VI (AK | [◌])`
568 LB28a_4, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS)` May find: `VF`
569 LB28a_4VF, // => Have: `(AK | [◌] | AS) (AK | [◌] | AS) VF`
570 Restart,
571 };
572 struct LinebreakUnit {
573 QUnicodeTables::LineBreakClass lbc;
574 char32_t ucs4;
575 };
576 struct ParseState {
577 State state = None;
578 qsizetype start = 0;
579 };
580 State updateState(State state, LinebreakUnit lb)
581 {
582 using LBC = QUnicodeTables::LineBreakClass;
583 if (lb.lbc == LBC::LineBreak_CM)
584 return state;
585
586 switch (state) {
587 case Start:
588 if (lb.lbc == LBC::LineBreak_VF)
589 return LB28a_2VF;
590 if (lb.lbc == LBC::LineBreak_VI)
591 return LB28a_2VI;
592 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
593 || lb.lbc == LBC::LineBreak_AS)
594 return LB28a_4;
595 break;
596 case LB28a_2VI:
597 if (lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK)
598 return LB28a_3VIAK;
599 break;
600 case LB28a_4:
601 if (lb.lbc == LBC::LineBreak_VF)
602 return LB28a_4VF;
603 // Had (AK | [◌] | AS) (AK | [◌] | AS), which could mean the 2nd capture is the start
604 // of a new sequence, so we need to check if it makes sense.
605 return Restart;
606 case None:
607 if (Q_UNLIKELY(lb.ucs4 == DottedCircle || lb.lbc == LBC::LineBreak_AK
608 || lb.lbc == LBC::LineBreak_AS)) {
609 return Start;
610 }
611 break;
612 case LB28a_2VF:
613 case LB28a_4VF:
614 case LB28a_3VIAK:
615 case Restart:
616 // These are all terminal states, so no need to update
617 Q_UNREACHABLE();
618 }
619 return None;
620 }
621}
622
623enum Action {
624 ProhibitedBreak, PB = ProhibitedBreak,
625 DirectBreak, DB = DirectBreak,
626 IndirectBreak, IB = IndirectBreak,
627 CombiningIndirectBreak, CI = CombiningIndirectBreak,
628 CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
629 ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen,
630 IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30
631 DirectBreakOutsideNumericSequence, DN = DirectBreakOutsideNumericSequence, // For LB25
632};
633
634// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
635// about the table. It was removed in the later versions of the standard.
636static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = {
637/* 1↓ 2→ OP CL CP QU +Pi +Pf +19 GL NS EX SY IS PR PO NU AL HL ID IN HY +WS BA +WS HYBA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM AK AP AS VI VF*/
638/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
639/* CL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
640/* CP */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
641/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
642/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
643/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
644/* +19*/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
645/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
646/* NS */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
647/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
648/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
649/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DN, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
650/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, DB, DB, DB },
651/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
652/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
653/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
654/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, CI, CI, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
655/* ID */ { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
656/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
657/* HY */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
658/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
659/* BA */ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
660/* +WS*/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
661/*HYBA*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, DB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
662/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB },
663/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
664/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
665/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
666/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
667/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
668/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
669/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
670/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
671/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
672/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB },
673/* CB */ { DB, PB, PB, IB, IB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
674/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB },
675/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
676/* AK */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
677/* AP */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, IB, DB, DB },
678/* AS */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
679/* VI */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
680/* VF */ { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
681};
682
683// The following line break classes are not treated by the pair table
684// and must be resolved outside:
685// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX, ZWJ
686
687} // namespace LB
688
689static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
690{
691 qsizetype nestart = 0;
692 LB::NS::Class nelast = LB::NS::XX;
693 LB::NS::Action neactlast = LB::NS::None;
694
695 LB::BRS::ParseState brsState;
696
697 QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
698 QUnicodeTables::LineBreakClass cls = lcls;
699 const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(ucs4: U'\n');
700
701 constexpr static auto isEastAsian = [](QUnicodeTables::EastAsianWidth eaw) {
702 using EAW = QUnicodeTables::EastAsianWidth;
703 return eaw == EAW::W || eaw == EAW::F || eaw == EAW::H;
704 };
705
706 for (qsizetype i = 0; i != len; ++i) {
707 qsizetype pos = i;
708 char32_t ucs4 = string[i];
709 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
710 ushort low = string[i + 1];
711 if (QChar::isLowSurrogate(ucs4: low)) {
712 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
713 ++i;
714 }
715 }
716
717 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
718 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
719 QUnicodeTables::LineBreakClass tcls;
720
721 if (options & QUnicodeTools::HangulLineBreakTailoring) {
722 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
723 && ncls <= QUnicodeTables::LineBreak_JT)
724 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
725 ) {
726 // LB27: use SPACE for line breaking
727 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
728 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
729 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
730 ncls = QUnicodeTables::LineBreak_AL;
731 } else {
732 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
733 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
734 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
735 if (FLAG(prop->category) & test)
736 ncls = QUnicodeTables::LineBreak_CM;
737 }
738 }
739 }
740
741 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
742 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
743 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
744 if (FLAG(prop->category) & test)
745 ncls = QUnicodeTables::LineBreak_CM;
746 }
747
748 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
749 if (prop->category == QChar::Punctuation_InitialQuote) {
750 // LB15a: Do not break after an unresolved initial punctuation
751 // that lies at the start of the line, after a space, after
752 // opening punctuation, or after an unresolved quotation mark,
753 // even after spaces.
754 // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
755 // [\p{Pi}&QU] SP* ×
756 // Note: sot is treated as LF here due to initial loop setup.
757 constexpr QUnicodeTables::LineBreakClass lb15a[] = {
758 QUnicodeTables::LineBreak_BK, QUnicodeTables::LineBreak_CR,
759 QUnicodeTables::LineBreak_LF, QUnicodeTables::LineBreak_OP,
760 QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
761 QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_GL,
762 QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_ZW};
763 if (std::any_of(first: std::begin(arr: lb15a), last: std::end(arr: lb15a),
764 pred: [lcls](auto x) { return x == lcls; })) {
765 ncls = QUnicodeTables::LineBreak_QU_Pi;
766 }
767 } else if (prop->category == QChar::Punctuation_FinalQuote) {
768 // LB15b: Do not break before an unresolved final punctuation
769 // that lies at the end of the line, before a space, before
770 // a prohibited break, or before an unresolved quotation mark,
771 // even after spaces.
772 // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
773 // | SY | BK | CR | LF | NL | ZW | eot)
774 auto nncls = QUnicodeTables::LineBreak_LF;
775
776 if (i + 1 < len) {
777 char32_t c = string[i + 1];
778 if (QChar::isHighSurrogate(ucs4: c) && i + 2 < len) {
779 ushort low = string[i + 2];
780 if (QChar::isLowSurrogate(ucs4: low))
781 c = QChar::surrogateToUcs4(high: c, low);
782 }
783 nncls = QUnicodeTables::LineBreakClass(
784 QUnicodeTables::properties(ucs4: c)->lineBreakClass);
785 }
786
787 constexpr QUnicodeTables::LineBreakClass lb15b[] = {
788 QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_GL,
789 QUnicodeTables::LineBreak_WJ, QUnicodeTables::LineBreak_CL,
790 QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
791 QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_CP,
792 QUnicodeTables::LineBreak_EX, QUnicodeTables::LineBreak_IS,
793 QUnicodeTables::LineBreak_SY, QUnicodeTables::LineBreak_BK,
794 QUnicodeTables::LineBreak_CR, QUnicodeTables::LineBreak_LF,
795 QUnicodeTables::LineBreak_ZW};
796 if (std::any_of(first: std::begin(arr: lb15b), last: std::end(arr: lb15b),
797 pred: [nncls](auto x) { return x == nncls; })) {
798 ncls = QUnicodeTables::LineBreak_QU_Pf;
799 }
800 }
801 }
802
803 if (Q_UNLIKELY((lcls >= QUnicodeTables::LineBreak_SP || lcls == QUnicodeTables::LineBreak_ZW
804 || lcls == QUnicodeTables::LineBreak_GL
805 || lcls == QUnicodeTables::LineBreak_CB)
806 && (ncls == QUnicodeTables::LineBreak_HY || ucs4 == u'\u2010'))) {
807 // LB20a: Do not break after a word-initial hyphen.
808 // ( sot | BK | CR | LF | NL | SP | ZW | CB | GL ) ( HY | [\u2010] ) × AL
809
810 // Remap to the synthetic class WS_* (whitespace+*), which is just
811 // like the current respective linebreak class but with an IB action
812 // if the next class is AL.
813 if (ucs4 == u'\u2010')
814 ncls = QUnicodeTables::LineBreak_WS_BA;
815 else
816 ncls = QUnicodeTables::LineBreak_WS_HY;
817 }
818
819 if (Q_UNLIKELY(cls == QUnicodeTables::LineBreak_AP && ucs4 == LB::BRS::DottedCircle)) {
820 // LB28a: Do not break inside the orthographic syllables of Brahmic scripts
821 // AP × (AK | [◌] | AS)
822 // @note: AP × (AK | AS) is checked by the breakTable
823 goto next;
824 }
825 while (true) { // May need to recheck once.
826 // LB28a cont'd
827 LB::BRS::State oldState = brsState.state;
828 brsState.state = LB::BRS::updateState(state: brsState.state, lb: {.lbc: ncls, .ucs4: ucs4});
829 if (Q_LIKELY(brsState.state == oldState))
830 break;
831 switch (brsState.state) {
832 case LB::BRS::Start:
833 brsState.start = i;
834 break;
835 case LB::BRS::LB28a_2VI: // Wait for more characters, but also valid sequence
836 // We may get another character, but this is already a complete
837 // sequence that should not have any breaks:
838 for (qsizetype j = brsState.start + 1; j < i; ++j)
839 attributes[j].lineBreak = false;
840 // No need to mark this sequence again later, so move 'start'
841 // up to the current position:
842 brsState.start = i;
843 goto next;
844 case LB::BRS::Restart:
845 // The previous character was possibly the start of a new sequence
846 brsState.state = LB::BRS::Start;
847 brsState.start = pos - 1;
848 continue; // Doing the loop again!
849 case LB::BRS::LB28a_2VF:
850 case LB::BRS::LB28a_4VF:
851 case LB::BRS::LB28a_3VIAK:
852 for (qsizetype j = brsState.start + 1; j < i; ++j)
853 attributes[j].lineBreak = false;
854 if (brsState.state == LB::BRS::LB28a_3VIAK) {
855 // This might be the start of a new sequence
856 brsState.state = LB::BRS::Start;
857 brsState.start = i;
858 } else {
859 brsState.state = LB::BRS::None;
860 }
861 goto next;
862 case LB::BRS::LB28a_4: // Wait for more characters
863 Q_LIKELY_BRANCH
864 case LB::BRS::None: // Nothing to do
865 break;
866 }
867 break;
868 }
869
870 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_IS)) {
871 // LB15c Break before a decimal mark that follows a space, for instance, in
872 // ‘subtract .5’.
873 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_SP)) {
874 if (i + 1 < len) {
875 char32_t ch = string[i + 1];
876 if (QChar::isHighSurrogate(ucs4: ch) && i + 2 < len) {
877 ushort low = string[i + 2];
878 if (QChar::isLowSurrogate(ucs4: low))
879 ch = QChar::surrogateToUcs4(high: ch, low);
880 }
881 if (QUnicodeTables::properties(ucs4: ch)->lineBreakClass
882 == QUnicodeTables::LineBreak_NU) {
883 attributes[pos].lineBreak = true;
884 goto next;
885 }
886 }
887 }
888 }
889
890 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_HL)) {
891 // LB21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
892 // HL (HY | [ BA - $EastAsian ]) × [^HL]
893 auto eaw = QUnicodeTables::EastAsianWidth(prop->eastAsianWidth);
894 const bool isNonEaBA = ncls == QUnicodeTables::LineBreak_BA && !isEastAsian(eaw);
895 if (isNonEaBA || ncls == QUnicodeTables::LineBreak_HY) {
896 // Remap to synthetic HYBA class which handles the next
897 // character. Generally (LB21) there are no breaks before
898 // HY or BA, so we can skip ahead to the next character.
899 ncls = QUnicodeTables::LineBreak_HYBA;
900 goto next;
901 }
902 }
903
904 // LB25: do not break lines inside numbers
905 {
906 LB::NS::Class necur = LB::NS::toClass(lbc: ncls);
907 LB::NS::Action neact = LB::NS::Action(LB::NS::actionTable[nelast][necur]);
908 if (Q_UNLIKELY(neactlast == LB::NS::CNeedNU && necur != LB::NS::NU)) {
909 neact = LB::NS::None;
910 } else if (Q_UNLIKELY(neactlast == LB::NS::NeedOPNU)) {
911 if (necur == LB::NS::OP)
912 neact = LB::NS::CNeedISNU;
913 else if (necur == LB::NS::NU)
914 neact = LB::NS::Continue;
915 else // Anything else and we ignore the sequence
916 neact = LB::NS::None;
917 } else if (Q_UNLIKELY(neactlast == LB::NS::CNeedISNU)) {
918 if (necur == LB::NS::IS)
919 neact = LB::NS::CNeedNU;
920 else if (necur == LB::NS::NU)
921 neact = LB::NS::Continue;
922 else // Anything else and we ignore the sequence
923 neact = LB::NS::None;
924 }
925 switch (neact) {
926 case LB::NS::Break:
927 // do not change breaks before and after the expression
928 for (qsizetype j = nestart + 1; j < pos; ++j)
929 attributes[j].lineBreak = false;
930 Q_FALLTHROUGH();
931 Q_LIKELY_BRANCH
932 case LB::NS::None:
933 nelast = LB::NS::XX; // reset state
934 break;
935 case LB::NS::NeedOPNU:
936 case LB::NS::Start:
937 if (neactlast == LB::NS::Start || neactlast == LB::NS::Continue) {
938 // Apply the linebreaks for the previous stretch; we need to start a new one
939 for (qsizetype j = nestart + 1; j < pos; ++j)
940 attributes[j].lineBreak = false;
941 }
942 nestart = i;
943 Q_FALLTHROUGH();
944 case LB::NS::CNeedNU:
945 case LB::NS::CNeedISNU:
946 case LB::NS::Continue:
947 nelast = necur;
948 break;
949 }
950 neactlast = neact;
951 }
952
953 // LB19a Unless surrounded by East Asian characters, do not break either side of any
954 // unresolved quotation marks
955 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU
956 && lcls != QUnicodeTables::LineBreak_SP
957 && lcls != QUnicodeTables::LineBreak_ZW)) {
958 using EAW = QUnicodeTables::EastAsianWidth;
959 constexpr static auto nextCharNonEastAsian = [](const char16_t *string, qsizetype len) {
960 if (len > 0) {
961 char32_t nch = string[0];
962 if (QChar::isHighSurrogate(ucs4: nch) && len > 1) {
963 char16_t low = string[1];
964 if (QChar::isLowSurrogate(ucs4: low))
965 nch = QChar::surrogateToUcs4(high: char16_t(nch), low);
966 }
967 const auto *nextProp = QUnicodeTables::properties(ucs4: nch);
968 QUnicodeTables::LineBreakClass nncls = QUnicodeTables::LineBreakClass(
969 nextProp->lineBreakClass);
970 QUnicodeTables::EastAsianWidth neaw = EAW(nextProp->eastAsianWidth);
971 return nncls != QUnicodeTables::LineBreak_CM
972 && nncls <= QUnicodeTables::LineBreak_SP
973 && !isEastAsian(neaw);
974 }
975 return true; // end-of-text counts as non-East-Asian
976 };
977 if (Q_UNLIKELY(!isEastAsian(EAW(lastProp->eastAsianWidth))
978 || nextCharNonEastAsian(string + i + 1, len - i - 1))) {
979 // Remap to the synthetic QU_19 class which has indirect breaks
980 // for most following classes.
981 ncls = QUnicodeTables::LineBreak_QU_19;
982 }
983 }
984
985 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
986 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
987 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
988 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
989 goto next;
990 }
991
992 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
993 if (ncls > QUnicodeTables::LineBreak_SP)
994 goto next; // LB6: x(BK|CR|LF|NL)
995 goto next_no_cls_update; // LB7: xSP
996 }
997
998 // LB19 - do not break before non-initial unresolved quotation marks, or after non-final
999 // unresolved quotation marks
1000 if (Q_UNLIKELY(((ncls == QUnicodeTables::LineBreak_QU
1001 || ncls == QUnicodeTables::LineBreak_QU_19)
1002 && prop->category != QChar::Punctuation_InitialQuote)
1003 || (cls == QUnicodeTables::LineBreak_QU
1004 && lastProp->category != QChar::Punctuation_FinalQuote))) {
1005 // Make sure the previous character is not one that we have to break after.
1006 // Also skip if ncls is CM so it can be treated as lcls (LB9)
1007 if (lcls != QUnicodeTables::LineBreak_SP && lcls != QUnicodeTables::LineBreak_ZW
1008 && ncls != QUnicodeTables::LineBreak_CM) {
1009 goto next;
1010 }
1011 }
1012
1013 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
1014 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
1015 if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
1016 // don't update anything
1017 goto next_no_cls_update;
1018 }
1019
1020 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
1021 // LB8a: ZWJ x
1022 goto next;
1023 }
1024
1025 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
1026 // LB30a
1027 ncls = QUnicodeTables::LineBreak_SP;
1028 goto next;
1029 }
1030
1031 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
1032 && lastProp->category == QChar::Other_NotAssigned
1033 && lastProp->graphemeBreakClass
1034 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
1035 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
1036 goto next;
1037 }
1038
1039 // for South East Asian chars that require a complex analysis, the Unicode
1040 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
1041 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
1042 cls = QUnicodeTables::LineBreak_AL;
1043
1044 tcls = cls;
1045
1046 constexpr static auto remapToAL = [](QUnicodeTables::LineBreakClass &c, auto &property) {
1047 if (Q_UNLIKELY(c == QUnicodeTables::LineBreak_CM
1048 || c == QUnicodeTables::LineBreak_ZWJ)) {
1049 c = QUnicodeTables::LineBreak_AL;
1050 property = QUnicodeTables::properties(ucs4: U'\u0041');
1051 }
1052 };
1053 // LB10 Treat any remaining combining mark or ZWJ as AL,
1054 // as if it had the properties of U+0041 A LATIN CAPITAL LETTER
1055 remapToAL(tcls, lastProp);
1056 remapToAL(ncls, prop);
1057
1058 switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) {
1059 case LB::DirectBreak:
1060 attributes[pos].lineBreak = true;
1061 break;
1062 case LB::IndirectBreak:
1063 if (lcls == QUnicodeTables::LineBreak_SP)
1064 attributes[pos].lineBreak = true;
1065 break;
1066 case LB::CombiningIndirectBreak:
1067 if (lcls != QUnicodeTables::LineBreak_SP)
1068 goto next_no_cls_update;
1069 attributes[pos].lineBreak = true;
1070 break;
1071 case LB::CombiningProhibitedBreak:
1072 if (lcls != QUnicodeTables::LineBreak_SP)
1073 goto next_no_cls_update;
1074 break;
1075 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
1076 if (lcls != QUnicodeTables::LineBreak_HL)
1077 attributes[pos].lineBreak = true;
1078 break;
1079 case LB::IndirectBreakIfNarrow:
1080 using EAW = QUnicodeTables::EastAsianWidth;
1081 switch (EAW(prop->eastAsianWidth)) {
1082 default:
1083 if (lcls != QUnicodeTables::LineBreak_SP)
1084 break;
1085 Q_FALLTHROUGH();
1086 case QUnicodeTables::EastAsianWidth::F:
1087 case QUnicodeTables::EastAsianWidth::W:
1088 case QUnicodeTables::EastAsianWidth::H:
1089 attributes[pos].lineBreak = true;
1090 break;
1091 }
1092 break;
1093 case LB::DirectBreakOutsideNumericSequence:
1094 if (neactlast == LB::NS::None || neactlast > LB::NS::Break)
1095 attributes[pos].lineBreak = true;
1096 break;
1097 case LB::ProhibitedBreak:
1098 // nothing to do
1099 default:
1100 break;
1101 }
1102
1103 next:
1104 if (ncls != QUnicodeTables::LineBreak_CM && ncls != QUnicodeTables::LineBreak_ZWJ) {
1105 cls = ncls;
1106 lastProp = prop;
1107 }
1108 next_no_cls_update:
1109 lcls = ncls;
1110 }
1111
1112 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
1113 // LB25: do not break lines inside numbers
1114 for (qsizetype j = nestart + 1; j < len; ++j)
1115 attributes[j].lineBreak = false;
1116 }
1117
1118 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
1119 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
1120}
1121
1122
1123static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1124{
1125 for (qsizetype i = 0; i != len; ++i) {
1126 uint ucs4 = string[i];
1127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
1128 ushort low = string[i + 1];
1129 if (QChar::isLowSurrogate(ucs4: low)) {
1130 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
1131 ++i;
1132 }
1133 }
1134
1135 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
1136 attributes[i].whiteSpace = true;
1137 }
1138}
1139
1140namespace Tailored {
1141
// Pointer to a function that takes a script, a UTF-16 text buffer, a start
// offset and a length into that buffer, and a QCharAttributes array.
// NOTE(review): presumably the callee updates `attributes` for the range
// [from, from + len) of `text` - confirm against the functions stored in
// this type.
using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
1143
1144
// Character categories stored in the indicForms table.
// The numeric values are consecutive; UnknownForm shares the value of
// Invalid, so the two cannot be told apart once stored.
enum Form {
    Invalid          = 0,
    UnknownForm      = Invalid,
    Consonant        = 1,
    Nukta            = 2,
    Halant           = 3,
    Matra            = 4,
    VowelMark        = 5,
    StressMark       = 6,
    IndependentVowel = 7,
    LengthMark       = 8,
    Control          = 9,
    Other            = 10
};
1159
// Form of each code point in the Indic script blocks.
//
// The table holds 0xe00 - 0x900 = 1280 entries, laid out as ten sections of
// 128 entries (8 groups of 16), one section per script in the order listed
// below. NOTE(review): presumably indexed by (code point - 0x900), which
// makes the sections line up with the Unicode blocks named in the section
// comments - confirm against form().
static const unsigned char indicForms[0xe00-0x900] = {
    // Devanagari (U+0900..U+097F)
    Invalid, VowelMark, VowelMark, VowelMark,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,

    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, StressMark, StressMark, StressMark,
    StressMark, UnknownForm, UnknownForm, UnknownForm,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Consonant,
    Consonant, Consonant /* ??? */, Consonant, Consonant,

    // Bengali (U+0980..U+09FF)
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Invalid,
    Invalid, Invalid, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, Consonant, UnknownForm,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, VowelMark,
    Invalid, Invalid, Invalid, Invalid,
    Consonant, Consonant, Invalid, Consonant,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Consonant, Consonant, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Gurmukhi (U+0A00..U+0A7F)
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Invalid,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Invalid,
    Invalid, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, UnknownForm, UnknownForm, UnknownForm,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Invalid,

    Other, Other, Invalid, Invalid,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    StressMark, StressMark, Consonant, Consonant,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Gujarati (U+0A80..U+0AFF)
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, Invalid, IndependentVowel,

    IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Matra, Invalid, Matra,
    Matra, Matra, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,
    UnknownForm, UnknownForm, UnknownForm, UnknownForm,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Oriya (U+0B00..U+0B7F)
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, Invalid, IndependentVowel,

    IndependentVowel, Invalid, Invalid, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Invalid, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Invalid, Invalid, Invalid, Matra,
    Matra, Invalid, Invalid, Matra,
    Matra, Halant, UnknownForm, UnknownForm,

    Other, Invalid, Invalid, Invalid,
    Invalid, UnknownForm, LengthMark, LengthMark,
    Invalid, Invalid, Invalid, Invalid,
    Consonant, Consonant, Invalid, Consonant,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Consonant, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Tamil (U+0B80..U+0BFF)
    Invalid, Invalid, VowelMark, Other,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Invalid, Invalid,
    Invalid, Consonant, Consonant, Invalid,
    Consonant, Invalid, Consonant, Consonant,

    Invalid, Invalid, Invalid, Consonant,
    Consonant, Invalid, Invalid, Invalid,
    Consonant, Consonant, Consonant, Invalid,
    Invalid, Invalid, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Invalid,
    Invalid, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, LengthMark,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Telugu (U+0C00..U+0C7F)
    Invalid, VowelMark, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, LengthMark, Matra, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Kannada (U+0C80..U+0CFF)
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Nukta, Other, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, LengthMark, LengthMark, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Consonant, Invalid,

    IndependentVowel, IndependentVowel, VowelMark, VowelMark,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Malayalam (U+0D00..U+0D7F)
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,

    IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
    IndependentVowel, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, UnknownForm, UnknownForm,
    Invalid, Invalid, Matra, Matra,

    Matra, Matra, Matra, Matra,
    Invalid, Invalid, Matra, Matra,
    Matra, Invalid, Matra, Matra,
    Matra, Halant, Invalid, Invalid,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Matra,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    IndependentVowel, IndependentVowel, Invalid, Invalid,
    Invalid, Invalid, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,

    // Sinhala (U+0D80..U+0DFF)
    Invalid, Invalid, VowelMark, VowelMark,
    Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,

    IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
    IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
    Invalid, Invalid, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,

    Consonant, Consonant, Invalid, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Consonant,
    Invalid, Consonant, Invalid, Invalid,

    Consonant, Consonant, Consonant, Consonant,
    Consonant, Consonant, Consonant, Invalid,
    Invalid, Invalid, Halant, Invalid,
    Invalid, Invalid, Invalid, Matra,

    Matra, Matra, Matra, Matra,
    Matra, Invalid, Matra, Invalid,
    Matra, Matra, Matra, Matra,
    Matra, Matra, Matra, Matra,

    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,
    Invalid, Invalid, Invalid, Invalid,

    Invalid, Invalid, Matra, Matra,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
    Other, Other, Other, Other,
};
1571
1572static inline Form form(unsigned short uc) {
1573 if (uc < 0x900 || uc > 0xdff) {
1574 if (uc == 0x25cc)
1575 return Consonant;
1576 if (uc == 0x200c || uc == 0x200d)
1577 return Control;
1578 return Other;
1579 }
1580 return (Form)indicForms[uc-0x900];
1581}
1582
// IDEBUG(fmt, ...) is the trace hook of the Indic syllable scanner.
// With INDIC_DEBUG defined it is plain qDebug. Otherwise the call is placed in
// the else-branch of an always-true `if constexpr`, so it is never executed.
// #define INDIC_DEBUG
#ifndef INDIC_DEBUG
#define IDEBUG if constexpr (1) ; else qDebug
#else
#define IDEBUG qDebug
#endif
1589
1590/* syllables are of the form:
1591
1592 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1593 (Consonant Nukta? Halant)* Consonant Halant
1594 IndependentVowel VowelMark? StressMark?
1595
1596 We return syllable boundaries on invalid combinations as well
1597*/
1598static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1599{
1600 *invalid = false;
1601 IDEBUG(msg: "indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1602 const char16_t *uc = s+start;
1603
1604 qsizetype pos = 0;
1605 Form state = form(uc: uc[pos]);
1606 IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1607 pos++;
1608
1609 if (state != Consonant && state != IndependentVowel) {
1610 if (state != Other)
1611 *invalid = true;
1612 goto finish;
1613 }
1614
1615 while (pos < end - start) {
1616 Form newState = form(uc: uc[pos]);
1617 IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1618 switch (newState) {
1619 case Control:
1620 newState = state;
1621 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1622 break;
1623 // the control character should be the last char in the item
1624 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1625 break;
1626 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1627 break;
1628 // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1629 ++pos;
1630 goto finish;
1631 case Consonant:
1632 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1633 break;
1634 goto finish;
1635 case Halant:
1636 if (state == Nukta || state == Consonant)
1637 break;
1638 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1639 if (script == QChar::Script_Bengali && pos == 1 &&
1640 (uc[0] == 0x0985 || uc[0] == 0x098f))
1641 break;
1642 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1643 if (script == QChar::Script_Sinhala && state == Matra) {
1644 ++pos;
1645 continue;
1646 }
1647 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1648 ++pos;
1649 continue;
1650 }
1651 goto finish;
1652 case Nukta:
1653 if (state == Consonant)
1654 break;
1655 goto finish;
1656 case StressMark:
1657 if (state == VowelMark)
1658 break;
1659 Q_FALLTHROUGH();
1660 case VowelMark:
1661 if (state == Matra || state == LengthMark || state == IndependentVowel)
1662 break;
1663 Q_FALLTHROUGH();
1664 case Matra:
1665 if (state == Consonant || state == Nukta)
1666 break;
1667 if (state == Matra) {
1668 // ### needs proper testing for correct two/three part matras
1669 break;
1670 }
1671 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1672 // it work for all Indic languages?
1673 // the combination Independent_A + Vowel Sign AA is allowed.
1674 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1675 break;
1676 if (script == QChar::Script_Tamil && state == Matra) {
1677 if (uc[pos-1] == 0x0bc6 &&
1678 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1679 break;
1680 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1681 break;
1682 }
1683 goto finish;
1684
1685 case LengthMark:
1686 if (state == Matra) {
1687 // ### needs proper testing for correct two/three part matras
1688 break;
1689 }
1690 Q_FALLTHROUGH();
1691 case IndependentVowel:
1692 case Invalid:
1693 case Other:
1694 goto finish;
1695 }
1696 state = newState;
1697 pos++;
1698 }
1699 finish:
1700 return pos+start;
1701}
1702
1703static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1704{
1705 qsizetype end = from + len;
1706 attributes += from;
1707 qsizetype i = 0;
1708 while (i < len) {
1709 bool invalid;
1710 qsizetype boundary = indic_nextSyllableBoundary(script, s: text, start: from+i, end, invalid: &invalid) - from;
1711 attributes[i].graphemeBoundary = true;
1712
1713 if (boundary > len-1) boundary = len;
1714 i++;
1715 while (i < boundary) {
1716 attributes[i].graphemeBoundary = false;
1717 ++i;
1718 }
1719 assert(i == boundary);
1720 }
1721
1722
1723}
1724
1725#if QT_CONFIG(library)
1726
// Version number passed to QLibrary when the "thai" library is loaded.
#define LIBTHAI_MAJOR 0

// Opaque word-break state; only ever handled through pointers here.
struct _ThBrk;
using ThBrk = _ThBrk;

/*
 * Local copy of the cell structure filled in by th_next_cell().
 * if libthai changed please update these codes too.
 */
struct thcell_t {
    // base character
    unsigned char base;
    // upper/lower vowel/diacritic
    unsigned char hilo;
    // top-level mark
    unsigned char top;
};
1739
1740namespace {
1741
1742class LibThai final
1743{
1744 Q_DISABLE_COPY_MOVE(LibThai)
1745
1746 using th_brk_new_def = ThBrk *(*)(const char *);
1747 using th_brk_delete_def = void (*)(ThBrk *);
1748 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1749 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1750
1751public:
1752 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1753 {
1754 m_th_brk_find_breaks =
1755 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve(symbol: "th_brk_find_breaks"));
1756 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve(symbol: "th_next_cell"));
1757
1758 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve(symbol: "th_brk_new"));
1759 if (th_brk_new) {
1760 m_state = th_brk_new(nullptr);
1761 m_th_brk_delete =
1762 reinterpret_cast<th_brk_delete_def>(m_library.resolve(symbol: "th_brk_delete"));
1763 }
1764 }
1765
1766 ~LibThai()
1767 {
1768 if (m_state && m_th_brk_delete)
1769 m_th_brk_delete(m_state);
1770 m_library.unload();
1771 }
1772
1773 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1774
1775 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1776 {
1777 Q_ASSERT(m_state);
1778 Q_ASSERT(m_th_brk_find_breaks);
1779 return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
1780 }
1781
1782 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1783 {
1784 Q_ASSERT(m_th_next_cell);
1785 return m_th_next_cell(s, len, cell, is_decomp_am);
1786 }
1787
1788private:
1789 QLibrary m_library;
1790
1791 // Global state for th_brk_find_breaks().
1792 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1793 // state is read-only, and so it is safe to use it from multiple threads after
1794 // initialization. This is also stated in the libthai documentation.
1795 ThBrk *m_state = nullptr;
1796
1797 th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
1798 th_next_cell_def m_th_next_cell = nullptr;
1799 th_brk_delete_def m_th_brk_delete = nullptr;
1800};
1801
1802} // unnamed namespace
1803
1804Q_GLOBAL_STATIC(LibThai, g_libThai)
1805
1806static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
1807{
1808 qsizetype i;
1809 unsigned char *result = reinterpret_cast<unsigned char *>(cstr);
1810
1811 for (i = 0; i < len; ++i) {
1812 if (string[i] <= 0xa0)
1813 result[i] = static_cast<unsigned char>(string[i]);
1814 else if (string[i] >= 0xe01 && string[i] <= 0xe5b)
1815 result[i] = static_cast<unsigned char>(string[i] - 0xe00 + 0xa0);
1816 else
1817 result[i] = static_cast<unsigned char>(~0); // Same encoding as libthai uses for invalid chars
1818 }
1819
1820 result[len] = 0;
1821}
1822
1823/*
1824 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1825 */
1826static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1827{
1828 constexpr qsizetype Prealloc = 128;
1829 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1830 QVarLengthArray<int, Prealloc> break_positions(len);
1831 qsizetype numbreaks, i;
1832 struct thcell_t tis_cell;
1833
1834 LibThai *libThai = g_libThai;
1835 if (!libThai || !libThai->isInitialized())
1836 return;
1837
1838 to_tis620(string, len, cstr: s.data());
1839
1840 for (i = 0; i < len; ++i) {
1841 attributes[i].wordBreak = false;
1842 attributes[i].wordStart = false;
1843 attributes[i].wordEnd = false;
1844 attributes[i].lineBreak = false;
1845 }
1846
1847 attributes[0].wordBreak = true;
1848 attributes[0].wordStart = true;
1849 attributes[0].wordEnd = false;
1850 numbreaks = libThai->brk_find_breaks(s: reinterpret_cast<const unsigned char *>(s.data()),
1851 pos: break_positions.data(),
1852 pos_sz: static_cast<size_t>(break_positions.size()));
1853 for (i = 0; i < numbreaks; ++i) {
1854 attributes[break_positions[i]].wordBreak = true;
1855 attributes[break_positions[i]].wordStart = true;
1856 attributes[break_positions[i]].wordEnd = true;
1857 attributes[break_positions[i]].lineBreak = true;
1858 }
1859 if (numbreaks > 0)
1860 attributes[break_positions[numbreaks - 1]].wordStart = false;
1861
1862 /* manage grapheme boundaries */
1863 i = 0;
1864 while (i < len) {
1865 size_t cell_length =
1866 libThai->next_cell(s: reinterpret_cast<const unsigned char *>(s.data()) + i,
1867 len: size_t(len - i), cell: &tis_cell, is_decomp_am: true);
1868
1869 attributes[i].graphemeBoundary = true;
1870 for (size_t j = 1; j < cell_length; ++j)
1871 attributes[i + j].graphemeBoundary = false;
1872
1873 i += cell_length;
1874 }
1875}
1876
1877#endif // QT_CONFIG(library)
1878
1879static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1880{
1881 assert(script == QChar::Script_Thai);
1882#if QT_CONFIG(library)
1883 const char16_t *uc = text + from;
1884 attributes += from;
1885 Q_UNUSED(script);
1886 thaiAssignAttributes(string: uc, len, attributes);
1887#else
1888 Q_UNUSED(script);
1889 Q_UNUSED(text);
1890 Q_UNUSED(from);
1891 Q_UNUSED(len);
1892 Q_UNUSED(attributes);
1893#endif
1894}
1895
1896/*
1897 tibetan syllables are of the form:
1898 head position consonant
1899 first sub-joined consonant
1900 ....intermediate sub-joined consonants (if any)
1901 last sub-joined consonant
1902 sub-joined vowel (a-chung U+0F71)
1903 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1904*/
1905
// Character classes used by the Tibetan syllable scanner.
// Note: no entry of tibetanForm below yields TibetanSubjoinedVowel.
enum TibetanForm {
    TibetanOther,
    TibetanHeadConsonant,
    TibetanSubjoinedConsonant,
    TibetanSubjoinedVowel,
    TibetanVowel
};

/* Class of each code point in U+0F40..U+0FBF; index 0 is U+0F40 */
static const unsigned char tibetanForm[0x80] = {
    /* U+0F40 */
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,

    /* U+0F50 */
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,

    /* U+0F60 */
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,

    /* U+0F70 */
    TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,

    /* U+0F80 */
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,

    /* U+0F90 */
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,

    /* U+0FA0 */
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,

    /* U+0FB0 */
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther
};

/* Class of code point c; anything outside U+0F40..U+0FBF is TibetanOther.
   The argument is evaluated more than once. */
#define tibetan_form(c) \
    ((c) >= 0x0f40 && (c) < 0x0fc0 \
         ? (TibetanForm)tibetanForm[(c) - 0x0f40] \
         : TibetanOther)
1959
1960static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1961{
1962 const char16_t *uc = s + start;
1963
1964 qsizetype pos = 0;
1965 TibetanForm state = tibetan_form(*uc);
1966
1967/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1968 pos++;
1969
1970 if (state != TibetanHeadConsonant) {
1971 if (state != TibetanOther)
1972 *invalid = true;
1973 goto finish;
1974 }
1975
1976 while (pos < end - start) {
1977 TibetanForm newState = tibetan_form(uc[pos]);
1978 switch (newState) {
1979 case TibetanSubjoinedConsonant:
1980 case TibetanSubjoinedVowel:
1981 if (state != TibetanHeadConsonant &&
1982 state != TibetanSubjoinedConsonant)
1983 goto finish;
1984 state = newState;
1985 break;
1986 case TibetanVowel:
1987 if (state != TibetanHeadConsonant &&
1988 state != TibetanSubjoinedConsonant &&
1989 state != TibetanSubjoinedVowel)
1990 goto finish;
1991 break;
1992 case TibetanOther:
1993 case TibetanHeadConsonant:
1994 goto finish;
1995 }
1996 pos++;
1997 }
1998
1999finish:
2000 *invalid = false;
2001 return start+pos;
2002}
2003
2004static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2005{
2006 qsizetype end = from + len;
2007 qsizetype i = 0;
2008 Q_UNUSED(script);
2009 attributes += from;
2010 while (i < len) {
2011 bool invalid;
2012 qsizetype boundary = tibetan_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
2013
2014 attributes[i].graphemeBoundary = true;
2015
2016 if (boundary > len-1) boundary = len;
2017 i++;
2018 while (i < boundary) {
2019 attributes[i].graphemeBoundary = false;
2020 ++i;
2021 }
2022 assert(i == boundary);
2023 }
2024}
2025
// Myanmar character classes. The numeric value of a class is the column it
// selects in mymrStateTable, so the values must stay dense and in this order.
enum MymrCharClassValues {
    Mymr_CC_RESERVED           =  0,
    Mymr_CC_CONSONANT          =  1, // Consonant of type 1, that has subscript form
    Mymr_CC_CONSONANT2         =  2, // Consonant of type 2, that has no subscript form
    Mymr_CC_NGA                =  3, // Consonant NGA
    Mymr_CC_YA                 =  4, // Consonant YA
    Mymr_CC_RA                 =  5, // Consonant RA
    Mymr_CC_WA                 =  6, // Consonant WA
    Mymr_CC_HA                 =  7, // Consonant HA
    Mymr_CC_IND_VOWEL          =  8, // Independent vowel
    Mymr_CC_ZERO_WIDTH_NJ_MARK =  9, // Zero Width non joiner character (0x200C)
    Mymr_CC_VIRAMA             = 10, // Subscript consonant combining character
    Mymr_CC_PRE_VOWEL          = 11, // Dependent vowel placed before the base (Vowel e)
    Mymr_CC_BELOW_VOWEL        = 12, // Dependent vowel placed below the base (Vowel u, uu)
    Mymr_CC_ABOVE_VOWEL        = 13, // Dependent vowel placed above the base (Vowel i, ii, ai)
    Mymr_CC_POST_VOWEL         = 14, // Dependent vowel placed after the base (Vowel aa)
    Mymr_CC_SIGN_ABOVE         = 15,
    Mymr_CC_SIGN_BELOW         = 16,
    Mymr_CC_SIGN_AFTER         = 17,
    Mymr_CC_ZERO_WIDTH_J_MARK  = 18, // Zero width joiner character
    Mymr_CC_COUNT              = 19  // This is the number of character classes
};
2048
2049enum MymrCharClassFlags {
2050 Mymr_CF_CLASS_MASK = 0x0000FFFF,
2051
2052 Mymr_CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2053 Mymr_CF_MEDIAL = 0x02000000, /* flag to speed up comparing */
2054 Mymr_CF_IND_VOWEL = 0x04000000, /* flag to speed up comparing */
2055 Mymr_CF_DEP_VOWEL = 0x08000000, /* flag to speed up comparing */
2056 Mymr_CF_DOTTED_CIRCLE = 0x10000000, /* add a dotted circle if a character with this flag is the
2057 first in a syllable */
2058 Mymr_CF_VIRAMA = 0x20000000, /* flag to speed up comparing */
2059
2060 /* position flags */
2061 Mymr_CF_POS_BEFORE = 0x00080000,
2062 Mymr_CF_POS_BELOW = 0x00040000,
2063 Mymr_CF_POS_ABOVE = 0x00020000,
2064 Mymr_CF_POS_AFTER = 0x00010000,
2065 Mymr_CF_POS_MASK = 0x000f0000,
2066
2067 Mymr_CF_AFTER_KINZI = 0x00100000
2068};
2069
2070Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
2071
/* Characters that get referred to by name */
enum MymrChar
{
    Mymr_C_NGA           = 0x1004,
    Mymr_C_YA            = 0x101A,
    Mymr_C_RA            = 0x101B,
    Mymr_C_VOWEL_E       = 0x1031,
    Mymr_C_VIRAMA        = 0x1039,
    Mymr_C_SIGN_ZWNJ     = 0x200C,
    Mymr_C_SIGN_ZWJ      = 0x200D,
    Mymr_C_DOTTED_CIRCLE = 0x25CC
};
2084
2085enum
2086{
2087 Mymr_xx = Mymr_CC_RESERVED,
2088 Mymr_c1 = Mymr_CC_CONSONANT | Mymr_CF_CONSONANT | Mymr_CF_POS_BELOW,
2089 Mymr_c2 = Mymr_CC_CONSONANT2 | Mymr_CF_CONSONANT,
2090 Mymr_ng = Mymr_CC_NGA | Mymr_CF_CONSONANT | Mymr_CF_POS_ABOVE,
2091 Mymr_ya = Mymr_CC_YA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_AFTER | Mymr_CF_AFTER_KINZI,
2092 Mymr_ra = Mymr_CC_RA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BEFORE,
2093 Mymr_wa = Mymr_CC_WA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
2094 Mymr_ha = Mymr_CC_HA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
2095 Mymr_id = Mymr_CC_IND_VOWEL | Mymr_CF_IND_VOWEL,
2096 Mymr_vi = Mymr_CC_VIRAMA | Mymr_CF_VIRAMA | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE,
2097 Mymr_dl = Mymr_CC_PRE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BEFORE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
2098 Mymr_db = Mymr_CC_BELOW_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
2099 Mymr_da = Mymr_CC_ABOVE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
2100 Mymr_dr = Mymr_CC_POST_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
2101 Mymr_sa = Mymr_CC_SIGN_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_ABOVE | Mymr_CF_AFTER_KINZI,
2102 Mymr_sb = Mymr_CC_SIGN_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_BELOW | Mymr_CF_AFTER_KINZI,
2103 Mymr_sp = Mymr_CC_SIGN_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI
2104};
2105
2106
2107typedef int MymrCharClass;
2108
2109
2110static const MymrCharClass mymrCharClasses[] =
2111{
2112 Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1,
2113 Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, /* 1000 - 100F */
2114 Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1,
2115 Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, /* 1010 - 101F */
2116 Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id,
2117 Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, /* 1020 - 102F */
2118 Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb,
2119 Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1030 - 103F */
2120 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
2121 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1040 - 104F */
2122 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
2123 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1050 - 105F */
2124};
2125
2126static MymrCharClass
2127getMyanmarCharClass (ushort ch)
2128{
2129 if (ch == Mymr_C_SIGN_ZWJ)
2130 return Mymr_CC_ZERO_WIDTH_J_MARK;
2131
2132 if (ch == Mymr_C_SIGN_ZWNJ)
2133 return Mymr_CC_ZERO_WIDTH_NJ_MARK;
2134
2135 if (ch < 0x1000 || ch > 0x105f)
2136 return Mymr_CC_RESERVED;
2137
2138 return mymrCharClasses[ch - 0x1000];
2139}
2140
2141static const signed char mymrStateTable[][Mymr_CC_COUNT] =
2142{
2143/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
2144 { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
2145 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
2146 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
2147 {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
2148 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
2149 {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
2150 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
2151 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
2152 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
2153 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
2154 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
2155 {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
2156 {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
2157 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
2158 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
2159 {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
2160 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
2161 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
2162 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
2163 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
2164 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
2165 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
2166 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
2167 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
2168 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
2169 {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
2170 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
2171 {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
2172/* exit state -2 is for invalid order of medials and combination of invalids
2173 with virama where virama should treat as start of next syllable
2174 */
2175};
2176
// MMDEBUG(fmt, ...) is the trace hook of the Myanmar syllable scanner.
// With MYANMAR_DEBUG defined it is plain qDebug. Otherwise the call is placed
// in the else-branch of an always-true `if constexpr`, the same construction
// IDEBUG uses: the call is never executed, no printf declaration is needed,
// and a following `else` in the caller cannot attach to the macro's own `if`
// (which the former `if (0) printf` form allowed).
/*#define MYANMAR_DEBUG */
#ifdef MYANMAR_DEBUG
#define MMDEBUG qDebug
#else
#define MMDEBUG if constexpr (1) ; else qDebug
#endif
2185
2186/*
2187// Given an input string of characters and a location in which to start looking
2188// calculate, using the state table, which one is the last character of the syllable
2189// that starts in the starting position.
2190*/
2191static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2192{
2193 const char16_t *uc = s + start;
2194 int state = 0;
2195 qsizetype pos = start;
2196 *invalid = false;
2197
2198 while (pos < end) {
2199 MymrCharClass charClass = getMyanmarCharClass(ch: *uc);
2200 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
2201 if (pos == start)
2202 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
2203
2204 MMDEBUG(format: "state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
2205
2206 if (state < 0) {
2207 if (state < -1)
2208 --pos;
2209 break;
2210 }
2211 ++uc;
2212 ++pos;
2213 }
2214 return pos;
2215}
2216
2217static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2218{
2219 qsizetype end = from + len;
2220 qsizetype i = 0;
2221 Q_UNUSED(script);
2222 attributes += from;
2223 while (i < len) {
2224 bool invalid;
2225 qsizetype boundary = myanmar_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
2226
2227 attributes[i].graphemeBoundary = true;
2228 attributes[i].lineBreak = true;
2229
2230 if (boundary > len-1)
2231 boundary = len;
2232 i++;
2233 while (i < boundary) {
2234 attributes[i].graphemeBoundary = false;
2235 ++i;
2236 }
2237 assert(i == boundary);
2238 }
2239}
2240
2241/*
2242// Vocabulary
2243// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
2244// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
2245// split vowels, signs... but there is only one base in a syllable, it has to be coded as
2246// the first character of the syllable.
2247// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
2248// Khmer language has five of them. Khmer split vowels either have one part before the
2249// base and one after the base or they have a part before the base and a part above the base.
2250// The first part of all Khmer split vowels is the same character, identical to
2251// the glyph of Khmer dependent vowel SRA EI
2252// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
2253// Differently than indian languages, the coeng modifies the consonant that follows it,
2254// not the one preceding it Each consonant has two forms, the base form and the subscript form
2255// the base form is the normal one (using the consonants code-point), the subscript form is
2256// displayed when the combination coeng + consonant is encountered.
// Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
2258// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
2259// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
// if it is attached to a consonant of the first series or a consonant of the second series.
// Most consonants have an equivalent in the other series, but some of them exist only in
// one series (for example SA). If we want to use the consonant SA with a vowel sound that
// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN,
// x17C9 and x17CA. TRIISAP changes a first series consonant to second series sound and
// MUSIKATOAN a second series consonant to have a first series vowel sound.
// Consonant shifters are both normally superscript marks, but, when they are followed by a
// superscript, they change shape and take the form of subscript dependent vowel SRA U.
2270// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
2271// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
2272// be placed after the coeng consonant.
2273// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2274// Each vowel has its own position. Only one vowel per syllable is allowed.
2275// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2276// Allowed in a syllable.
2277//
2278//
2279// order is important here! This order must be the same that is found in each horizontal
2280// line in the statetable for Khmer (see khmerStateTable) .
2281*/
// Khmer character classes. The accompanying comment block requires this
// order to match the columns of khmerStateTable.
enum KhmerCharClassValues {
    CC_RESERVED           =  0,
    CC_CONSONANT          =  1, // Consonant of type 1 or independent vowel
    CC_CONSONANT2         =  2, // Consonant of type 2
    CC_CONSONANT3         =  3, // Consonant of type 3
    CC_ZERO_WIDTH_NJ_MARK =  4, // Zero Width non joiner character (0x200C)
    CC_CONSONANT_SHIFTER  =  5,
    CC_ROBAT              =  6, // Khmer special diacritic accent -treated differently in state table
    CC_COENG              =  7, // Subscript consonant combining character
    CC_DEPENDENT_VOWEL    =  8,
    CC_SIGN_ABOVE         =  9,
    CC_SIGN_AFTER         = 10,
    CC_ZERO_WIDTH_J_MARK  = 11, // Zero width joiner character
    CC_COUNT              = 12  // This is the number of character classes
};
2297
2298
2299enum KhmerCharClassFlags {
2300 CF_CLASS_MASK = 0x0000FFFF,
2301
2302 CF_CONSONANT = 0x01000000, /* flag to speed up comparing */
2303 CF_SPLIT_VOWEL = 0x02000000, /* flag for a split vowel -> the first part is added in front of the syllable */
2304 CF_DOTTED_CIRCLE = 0x04000000, /* add a dotted circle if a character with this flag is the first in a syllable */
2305 CF_COENG = 0x08000000, /* flag to speed up comparing */
2306 CF_SHIFTER = 0x10000000, /* flag to speed up comparing */
2307 CF_ABOVE_VOWEL = 0x20000000, /* flag to speed up comparing */
2308
2309 /* position flags */
2310 CF_POS_BEFORE = 0x00080000,
2311 CF_POS_BELOW = 0x00040000,
2312 CF_POS_ABOVE = 0x00020000,
2313 CF_POS_AFTER = 0x00010000,
2314 CF_POS_MASK = 0x000f0000
2315};
2316
2317Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
2318
2319/* Characters that get referred to by name */
/* Code points that are referred to by name, listed in ascending order. */
enum KhmerChar {
    C_RO           = 0x179A,
    C_VOWEL_AA     = 0x17B6,
    C_VOWEL_E      = 0x17C1,
    C_SIGN_NIKAHIT = 0x17C6,
    C_COENG        = 0x17D2,
    C_SIGN_ZWNJ    = 0x200C,
    C_SIGN_ZWJ     = 0x200D
};
2329
2330
2331/*
2332// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2333// they are also used to know where a character should be placed (location in reference to the base character)
2334// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2335// indicate error in syllable construction
2336*/
2337enum {
2338 _xx = CC_RESERVED,
2339 _sa = CC_SIGN_ABOVE | CF_DOTTED_CIRCLE | CF_POS_ABOVE,
2340 _sp = CC_SIGN_AFTER | CF_DOTTED_CIRCLE| CF_POS_AFTER,
2341 _c1 = CC_CONSONANT | CF_CONSONANT,
2342 _c2 = CC_CONSONANT2 | CF_CONSONANT,
2343 _c3 = CC_CONSONANT3 | CF_CONSONANT,
2344 _rb = CC_ROBAT | CF_POS_ABOVE | CF_DOTTED_CIRCLE,
2345 _cs = CC_CONSONANT_SHIFTER | CF_DOTTED_CIRCLE | CF_SHIFTER,
2346 _dl = CC_DEPENDENT_VOWEL | CF_POS_BEFORE | CF_DOTTED_CIRCLE,
2347 _db = CC_DEPENDENT_VOWEL | CF_POS_BELOW | CF_DOTTED_CIRCLE,
2348 _da = CC_DEPENDENT_VOWEL | CF_POS_ABOVE | CF_DOTTED_CIRCLE | CF_ABOVE_VOWEL,
2349 _dr = CC_DEPENDENT_VOWEL | CF_POS_AFTER | CF_DOTTED_CIRCLE,
2350 _co = CC_COENG | CF_COENG | CF_DOTTED_CIRCLE,
2351
2352 /* split vowel */
2353 _va = _da | CF_SPLIT_VOWEL,
2354 _vr = _dr | CF_SPLIT_VOWEL
2355};
2356
2357
2358/*
2359// Character class: a character class value
2360// ORed with character class flags.
2361*/
/* Wide enough for a class number in the low 16 bits plus the CF_* flag bits. */
using KhmerCharClass = unsigned long;
2363
2364
2365/*
2366// Character class tables
// _xx character does not combine into syllable, such as numbers, punctuation marks, non-Khmer signs...
2368// _sa Sign placed above the base
2369// _sp Sign placed after the base
2370// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2371// _c2 Consonant of type 2 (only RO)
2372// _c3 Consonant of type 3
2373// _rb Khmer sign robat u17CC. combining mark for subscript consonants
// _cs Consonant-shifter
2375// _dl Dependent vowel placed before the base (left of the base)
2376// _db Dependent vowel placed below the base
2377// _da Dependent vowel placed above the base
2378// _dr Dependent vowel placed behind the base (right of the base)
2379// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2380// it to create a subscript consonant or independent vowel
2381// _va Khmer split vowel in which the first part is before the base and the second one above the base
2382// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2383*/
2384static const KhmerCharClass khmerCharClasses[] = {
2385 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2386 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2387 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2388 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2389 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2390 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2391};
2392
2393/* this enum must reflect the range of khmerCharClasses */
2394enum KhmerCharClassesRange {
2395 KhmerFirstChar = 0x1780,
2396 KhmerLastChar = 0x17df
2397};
2398
2399/*
2400// Below we define how a character in the input string is either in the khmerCharClasses table
2401// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2402// within the syllable, but are not in the table) we also get their type back, or an unknown object
2403// in which case we get _xx (CC_RESERVED) back
2404*/
2405static KhmerCharClass getKhmerCharClass(ushort uc)
2406{
2407 if (uc == C_SIGN_ZWJ) {
2408 return CC_ZERO_WIDTH_J_MARK;
2409 }
2410
2411 if (uc == C_SIGN_ZWNJ) {
2412 return CC_ZERO_WIDTH_NJ_MARK;
2413 }
2414
2415 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2416 return CC_RESERVED;
2417 }
2418
2419 return khmerCharClasses[uc - KhmerFirstChar];
2420}
2421
2422
2423/*
2424// The stateTable is used to calculate the end (the length) of a well
2425// formed Khmer Syllable.
2426//
2427// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2428// CharClassValues. This coincidence of values allows the follow up of the table.
2429//
// Each line corresponds to a state, which does not necessarily need to be a type
// of component... for example, state 2 is a base, which is always a first character
// in the syllable, but the state could be produced by a consonant of any type when
// it is the first character that is analysed (in ground state).
2434//
2435// Differentiating 3 types of consonants is necessary in order to
2436// forbid the use of certain combinations, such as having a second
2437// coeng after a coeng RO,
2438// The inexistent possibility of having a type 3 after another type 3 is permitted,
2439// eliminating it would very much complicate the table, and it does not create typing
2440// problems, as the case above.
2441//
2442// The table is quite complex, in order to limit the number of coeng consonants
2443// to 2 (by means of the table).
2444//
// There is a peculiarity, as far as Unicode is concerned:
2446// - The consonant-shifter is considered in two possible different
2447// locations, the one considered in Unicode 3.0 and the one considered in
2448// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2449//
2450//
2451// xx independent character, such as a number, punctuation sign or non-khmer char
2452//
// c1 Khmer consonant of type 1 or an independent vowel
// that is, a letter in which the subscript form is only under the
// base, not taking any space to the right or to the left
2456//
2457// c2 Khmer consonant of type 2, the coeng form takes space under
2458// and to the left of the base (only RO is of this type)
2459//
2460// c3 Khmer consonant of type 3. Its subscript form takes space under
2461// and to the right of the base.
2462//
2463// cs Khmer consonant shifter
2464//
2465// rb Khmer robat
2466//
2467// co coeng character (u17D2)
2468//
2469// dv dependent vowel (including split vowels, they are treated in the same way).
2470// even if dv is not defined above, the component that is really tested for is
2471// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2472//
2473// zwj Zero Width joiner
2474//
2475// zwnj Zero width non joiner
2476//
2477// sa above sign
2478//
2479// sp post sign
2480//
2481// there are lines with equal content but for an easier understanding
2482// (and maybe change in the future) we did not join them
2483*/
2484static const signed char khmerStateTable[][CC_COUNT] =
2485{
2486 /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2487 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2488 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2489 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2490 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel */
2491 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2492 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2493 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2494 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2495 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2496 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant or type 3 after ceong */
2497 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2498 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2499 {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2500 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2501 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2502 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2503 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2504 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2505 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2506 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2507 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2508};
2509
2510
/* Uncomment the next line to route the Khmer state machine trace to qDebug. */
/* #define KHMER_DEBUG */
#if defined(KHMER_DEBUG)
#  define KHDEBUG qDebug
#else
/*
// Tracing disabled: the call is still compiled against printf, but it sits
// behind a constant-false condition, so it is never executed.
*/
#  define KHDEBUG if (false) printf
#endif
2519
2520/*
2521// Given an input string of characters and a location in which to start looking
2522// calculate, using the state table, which one is the last character of the syllable
2523// that starts in the starting position.
2524*/
2525static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2526{
2527 const char16_t *uc = s + start;
2528 int state = 0;
2529 qsizetype pos = start;
2530 *invalid = false;
2531
2532 while (pos < end) {
2533 KhmerCharClass charClass = getKhmerCharClass(uc: *uc);
2534 if (pos == start) {
2535 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2536 }
2537 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2538
2539 KHDEBUG(format: "state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2540 charClass, *uc );
2541
2542 if (state < 0) {
2543 break;
2544 }
2545 ++uc;
2546 ++pos;
2547 }
2548 return pos;
2549}
2550
2551static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2552{
2553 qsizetype end = from + len;
2554 qsizetype i = 0;
2555 Q_UNUSED(script);
2556 attributes += from;
2557 while ( i < len ) {
2558 bool invalid;
2559 qsizetype boundary = khmer_nextSyllableBoundary( s: text, start: from+i, end, invalid: &invalid ) - from;
2560
2561 attributes[i].graphemeBoundary = true;
2562
2563 if ( boundary > len-1 ) boundary = len;
2564 i++;
2565 while ( i < boundary ) {
2566 attributes[i].graphemeBoundary = false;
2567 ++i;
2568 }
2569 assert( i == boundary );
2570 }
2571}
2572
2573
2574static CharAttributeFunction charAttributeFunction(QChar::Script script)
2575{
2576 switch (script) {
2577 case QChar::Script_Unknown:
2578 case QChar::Script_Inherited:
2579 case QChar::Script_Common:
2580 case QChar::Script_Latin:
2581 case QChar::Script_Greek:
2582 case QChar::Script_Cyrillic:
2583 case QChar::Script_Armenian:
2584 case QChar::Script_Hebrew:
2585 case QChar::Script_Arabic:
2586 case QChar::Script_Syriac:
2587 case QChar::Script_Thaana:
2588 return nullptr;
2589 case QChar::Script_Devanagari:
2590 case QChar::Script_Bengali:
2591 case QChar::Script_Gurmukhi:
2592 case QChar::Script_Gujarati:
2593 case QChar::Script_Oriya:
2594 case QChar::Script_Tamil:
2595 case QChar::Script_Telugu:
2596 case QChar::Script_Kannada:
2597 case QChar::Script_Malayalam:
2598 case QChar::Script_Sinhala:
2599 return &indicAttributes;
2600 case QChar::Script_Thai:
2601 return &thaiAttributes;
2602 case QChar::Script_Lao:
2603 return nullptr;
2604 case QChar::Script_Tibetan:
2605 return &tibetanAttributes;
2606 case QChar::Script_Myanmar:
2607 return &myanmarAttributes;
2608 case QChar::Script_Georgian:
2609 case QChar::Script_Hangul:
2610 case QChar::Script_Ethiopic:
2611 case QChar::Script_Cherokee:
2612 case QChar::Script_CanadianAboriginal:
2613 case QChar::Script_Ogham:
2614 case QChar::Script_Runic:
2615 return nullptr;
2616 case QChar::Script_Khmer:
2617 return &khmerAttributes;
2618 case QChar::Script_Mongolian:
2619 case QChar::Script_Hiragana:
2620 case QChar::Script_Katakana:
2621 case QChar::Script_Bopomofo:
2622 case QChar::Script_Han:
2623 case QChar::Script_Yi:
2624 case QChar::Script_OldItalic:
2625 case QChar::Script_Gothic:
2626 case QChar::Script_Deseret:
2627 case QChar::Script_Tagalog:
2628 case QChar::Script_Hanunoo:
2629 case QChar::Script_Buhid:
2630 case QChar::Script_Tagbanwa:
2631 case QChar::Script_Coptic:
2632 case QChar::Script_Limbu:
2633 case QChar::Script_TaiLe:
2634 case QChar::Script_LinearB:
2635 case QChar::Script_Ugaritic:
2636 case QChar::Script_Shavian:
2637 case QChar::Script_Osmanya:
2638 case QChar::Script_Cypriot:
2639 case QChar::Script_Braille:
2640 case QChar::Script_Buginese:
2641 case QChar::Script_NewTaiLue:
2642 case QChar::Script_Glagolitic:
2643 case QChar::Script_Tifinagh:
2644 case QChar::Script_SylotiNagri:
2645 case QChar::Script_OldPersian:
2646 case QChar::Script_Kharoshthi:
2647 case QChar::Script_Balinese:
2648 case QChar::Script_Cuneiform:
2649 case QChar::Script_Phoenician:
2650 case QChar::Script_PhagsPa:
2651 case QChar::Script_Nko:
2652 case QChar::Script_Sundanese:
2653 case QChar::Script_Lepcha:
2654 case QChar::Script_OlChiki:
2655 case QChar::Script_Vai:
2656 case QChar::Script_Saurashtra:
2657 case QChar::Script_KayahLi:
2658 case QChar::Script_Rejang:
2659 case QChar::Script_Lycian:
2660 case QChar::Script_Carian:
2661 case QChar::Script_Lydian:
2662 case QChar::Script_Cham:
2663 case QChar::Script_TaiTham:
2664 case QChar::Script_TaiViet:
2665 case QChar::Script_Avestan:
2666 case QChar::Script_EgyptianHieroglyphs:
2667 case QChar::Script_Samaritan:
2668 case QChar::Script_Lisu:
2669 case QChar::Script_Bamum:
2670 case QChar::Script_Javanese:
2671 case QChar::Script_MeeteiMayek:
2672 case QChar::Script_ImperialAramaic:
2673 case QChar::Script_OldSouthArabian:
2674 case QChar::Script_InscriptionalParthian:
2675 case QChar::Script_InscriptionalPahlavi:
2676 case QChar::Script_OldTurkic:
2677 case QChar::Script_Kaithi:
2678 case QChar::Script_Batak:
2679 case QChar::Script_Brahmi:
2680 case QChar::Script_Mandaic:
2681 case QChar::Script_Chakma:
2682 case QChar::Script_MeroiticCursive:
2683 case QChar::Script_MeroiticHieroglyphs:
2684 case QChar::Script_Miao:
2685 case QChar::Script_Sharada:
2686 case QChar::Script_SoraSompeng:
2687 case QChar::Script_Takri:
2688 case QChar::Script_CaucasianAlbanian:
2689 case QChar::Script_BassaVah:
2690 case QChar::Script_Duployan:
2691 case QChar::Script_Elbasan:
2692 case QChar::Script_Grantha:
2693 case QChar::Script_PahawhHmong:
2694 case QChar::Script_Khojki:
2695 case QChar::Script_LinearA:
2696 case QChar::Script_Mahajani:
2697 case QChar::Script_Manichaean:
2698 case QChar::Script_MendeKikakui:
2699 case QChar::Script_Modi:
2700 case QChar::Script_Mro:
2701 case QChar::Script_OldNorthArabian:
2702 case QChar::Script_Nabataean:
2703 case QChar::Script_Palmyrene:
2704 case QChar::Script_PauCinHau:
2705 case QChar::Script_OldPermic:
2706 case QChar::Script_PsalterPahlavi:
2707 case QChar::Script_Siddham:
2708 case QChar::Script_Khudawadi:
2709 case QChar::Script_Tirhuta:
2710 case QChar::Script_WarangCiti:
2711 case QChar::Script_Ahom:
2712 case QChar::Script_AnatolianHieroglyphs:
2713 case QChar::Script_Hatran:
2714 case QChar::Script_Multani:
2715 case QChar::Script_OldHungarian:
2716 case QChar::Script_SignWriting:
2717 case QChar::Script_Adlam:
2718 case QChar::Script_Bhaiksuki:
2719 case QChar::Script_Marchen:
2720 case QChar::Script_Newa:
2721 case QChar::Script_Osage:
2722 case QChar::Script_Tangut:
2723 case QChar::Script_MasaramGondi:
2724 case QChar::Script_Nushu:
2725 case QChar::Script_Soyombo:
2726 case QChar::Script_ZanabazarSquare:
2727 case QChar::Script_Dogra:
2728 case QChar::Script_GunjalaGondi:
2729 case QChar::Script_HanifiRohingya:
2730 case QChar::Script_Makasar:
2731 case QChar::Script_Medefaidrin:
2732 case QChar::Script_OldSogdian:
2733 case QChar::Script_Sogdian:
2734 case QChar::Script_Elymaic:
2735 case QChar::Script_Nandinagari:
2736 case QChar::Script_NyiakengPuachueHmong:
2737 case QChar::Script_Wancho:
2738 case QChar::Script_Chorasmian:
2739 case QChar::Script_DivesAkuru:
2740 case QChar::Script_KhitanSmallScript:
2741 case QChar::Script_Yezidi:
2742 case QChar::Script_CyproMinoan:
2743 case QChar::Script_OldUyghur:
2744 case QChar::Script_Tangsa:
2745 case QChar::Script_Toto:
2746 case QChar::Script_Vithkuqi:
2747 case QChar::Script_Kawi:
2748 case QChar::Script_NagMundari:
2749 case QChar::Script_Garay:
2750 case QChar::Script_GurungKhema:
2751 case QChar::Script_KiratRai:
2752 case QChar::Script_OlOnal:
2753 case QChar::Script_Sunuwar:
2754 case QChar::Script_Todhri:
2755 case QChar::Script_TuluTigalari:
2756 return nullptr;
2757 case QChar::ScriptCount:
2758 // Don't Q_UNREACHABLE here; this might be a newer value in later Qt versions
2759 // (incl. patch releases)
2760 ;
2761 }
2762 return nullptr;
2763};
2764
2765static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2766 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2767 QCharAttributes *attributes)
2768{
2769 if (stringLength == 0)
2770 return;
2771 for (qsizetype i = 0; i < numItems; ++i) {
2772 QChar::Script script = items[i].script;
2773 CharAttributeFunction attributeFunction = charAttributeFunction(script);
2774 if (!attributeFunction)
2775 continue;
2776 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2777 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2778 }
2779}
2780
2781}
2782
2783Q_CORE_EXPORT void initCharAttributes(QStringView string,
2784 const ScriptItem *items, qsizetype numItems,
2785 QCharAttributes *attributes, CharAttributeOptions options)
2786{
2787 if (string.size() <= 0)
2788 return;
2789
2790 if (!(options & DontClearAttributes))
2791 ::memset(s: attributes, c: 0, n: (string.size() + 1) * sizeof(QCharAttributes));
2792
2793 if (options & GraphemeBreaks)
2794 getGraphemeBreaks(string: string.utf16(), len: string.size(), attributes);
2795 if (options & WordBreaks)
2796 getWordBreaks(string: string.utf16(), len: string.size(), attributes);
2797 if (options & SentenceBreaks)
2798 getSentenceBreaks(string: string.utf16(), len: string.size(), attributes);
2799 if (options & LineBreaks)
2800 getLineBreaks(string: string.utf16(), len: string.size(), attributes, options);
2801 if (options & WhiteSpaces)
2802 getWhiteSpaces(string: string.utf16(), len: string.size(), attributes);
2803
2804 if (!qt_initcharattributes_default_algorithm_only) {
2805 if (!items || numItems <= 0)
2806 return;
2807
2808 Tailored::getCharAttributes(string: string.utf16(), stringLength: string.size(), items, numItems, attributes);
2809 }
2810}
2811
2812
2813// ----------------------------------------------------------------------------
2814//
2815// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2816//
2817// ----------------------------------------------------------------------------
2818
2819Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2820{
2821 qsizetype sor = 0;
2822 qsizetype eor = 0;
2823 QChar::Script script = QChar::Script_Common;
2824
2825 for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
2826 char32_t ucs4 = string[i].unicode();
2827 if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2828 ushort low = string[i + 1].unicode();
2829 if (QChar::isLowSurrogate(ucs4: low)) {
2830 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
2831 ++i;
2832 }
2833 }
2834
2835 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2836
2837 QChar::Script nscript = QChar::Script(prop->script);
2838
2839 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2840 continue;
2841
2842 // inherit preceding Common-s
2843 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2844 // also covers a case where the base character of Common script followed
2845 // by one or more combining marks of non-Inherited, non-Common script
2846 script = nscript;
2847 continue;
2848 }
2849
2850 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2851 // Thus, a combining mark - whatever its script property value is - should inherit
2852 // the script property value of its base character.
2853 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2854 if (Q_UNLIKELY(FLAG(prop->category) & test))
2855 continue;
2856
2857 Q_ASSERT(script > QChar::Script_Common);
2858 Q_ASSERT(sor < eor);
2859 scripts->append(t: ScriptItem{.position: sor, .script: script});
2860 sor = eor;
2861
2862 script = nscript;
2863 }
2864
2865 Q_ASSERT(script >= QChar::Script_Common);
2866 Q_ASSERT(eor == string.size());
2867 scripts->append(t: ScriptItem{.position: sor, .script: script});
2868}
2869
2870} // namespace QUnicodeTools
2871
2872QT_END_NAMESPACE
2873

source code of qtbase/src/corelib/text/qunicodetools.cpp