1// Copyright (C) 2020 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qunicodetools_p.h"
5
6#include "qunicodetables_p.h"
7#include "qvarlengtharray.h"
8#if QT_CONFIG(library)
9#include "qlibrary.h"
10#endif
11
12#include <limits.h>
13
14#define FLAG(x) (1 << (x))
15
16QT_BEGIN_NAMESPACE
17
18using namespace Qt::StringLiterals;
19
20#ifdef QT_BUILD_INTERNAL
21Q_CONSTINIT Q_AUTOTEST_EXPORT
22#else
23constexpr
24#endif
25int qt_initcharattributes_default_algorithm_only = 0;
26
27namespace QUnicodeTools {
28
29// -----------------------------------------------------------------------------------------------------
30//
31// The text boundaries determination algorithm.
32// See https://www.unicode.org/reports/tr29/tr29-37.html
33//
34// -----------------------------------------------------------------------------------------------------
35
36namespace GB {
37
38// This table is indexed by the grapheme break classes of two
39// (adjacent) code points.
40// The class of the first code point selects an entry.
41// If the entry's bit at position second_cp_class is set
42// (in other words: if entry & (1u << second_cp_class) is non-zero)
43// then there is NO grapheme break between the two code points.
44
45using GBTableEntryType = quint16;
46
47// Check that we have enough bits in the table (in case
48// NumGraphemeBreakClasses grows too much).
49static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
50 "Internal error: increase the size in bits of GBTableEntryType");
51
52// GB9, GB9a
53static const GBTableEntryType Extend_SpacingMark_ZWJ =
54 FLAG(QUnicodeTables::GraphemeBreak_Extend)
55 | FLAG(QUnicodeTables::GraphemeBreak_SpacingMark)
56 | FLAG(QUnicodeTables::GraphemeBreak_ZWJ);
57
58static const GBTableEntryType HardBreak = 0u;
59
60static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses] = {
61 Extend_SpacingMark_ZWJ, // Any
62 FLAG(QUnicodeTables::GraphemeBreak_LF), // CR
63 HardBreak, // LF
64 HardBreak, // Control
65 Extend_SpacingMark_ZWJ, // Extend
66 Extend_SpacingMark_ZWJ, // ZWJ
67 Extend_SpacingMark_ZWJ, // RegionalIndicator
68 (Extend_SpacingMark_ZWJ
69 | FLAG(QUnicodeTables::GraphemeBreak_Any)
70 | FLAG(QUnicodeTables::GraphemeBreak_Prepend)
71 | FLAG(QUnicodeTables::GraphemeBreak_L)
72 | FLAG(QUnicodeTables::GraphemeBreak_V)
73 | FLAG(QUnicodeTables::GraphemeBreak_T)
74 | FLAG(QUnicodeTables::GraphemeBreak_LV)
75 | FLAG(QUnicodeTables::GraphemeBreak_LVT)
76 | FLAG(QUnicodeTables::GraphemeBreak_RegionalIndicator)
77 | FLAG(QUnicodeTables::GraphemeBreak_Extended_Pictographic)
78 ), // Prepend
79 Extend_SpacingMark_ZWJ, // SpacingMark
80 (Extend_SpacingMark_ZWJ
81 | FLAG(QUnicodeTables::GraphemeBreak_L)
82 | FLAG(QUnicodeTables::GraphemeBreak_V)
83 | FLAG(QUnicodeTables::GraphemeBreak_LV)
84 | FLAG(QUnicodeTables::GraphemeBreak_LVT)
85 ), // L
86 (Extend_SpacingMark_ZWJ
87 | FLAG(QUnicodeTables::GraphemeBreak_V)
88 | FLAG(QUnicodeTables::GraphemeBreak_T)
89 ), // V
90 (Extend_SpacingMark_ZWJ
91 | FLAG(QUnicodeTables::GraphemeBreak_T)
92 ), // T
93 (Extend_SpacingMark_ZWJ
94 | FLAG(QUnicodeTables::GraphemeBreak_V)
95 | FLAG(QUnicodeTables::GraphemeBreak_T)
96 ), // LV
97 (Extend_SpacingMark_ZWJ
98 | FLAG(QUnicodeTables::GraphemeBreak_T)
99 ), // LVT
100 Extend_SpacingMark_ZWJ // Extended_Pictographic
101};
102
103static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
104 QUnicodeTables::GraphemeBreakClass second)
105{
106 return (breakTable[first] & FLAG(second)) == 0;
107}
108
109// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
110// so we need to store some local state.
111enum class State : uchar {
112 Normal,
113 GB11_ExtPicExt, // saw a Extend after a Extended_Pictographic
114 GB11_ExtPicExtZWJ, // saw a ZWG after a Extended_Pictographic and zero or more Extend
115 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
116};
117
118} // namespace GB
119
120static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
121{
122 QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
123 GB::State state = GB::State::Normal;
124 for (qsizetype i = 0; i != len; ++i) {
125 qsizetype pos = i;
126 char32_t ucs4 = string[i];
127 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
128 ushort low = string[i + 1];
129 if (QChar::isLowSurrogate(ucs4: low)) {
130 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
131 ++i;
132 }
133 }
134
135 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
136 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
137
138 bool shouldBreak = GB::shouldBreakBetweenClasses(first: lcls, second: cls);
139 bool handled = false;
140
141 switch (state) {
142 case GB::State::Normal:
143 break; // will deal with it below
144
145 case GB::State::GB11_ExtPicExt:
146 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
147 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
148 // keep going in the current state
149 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150 handled = true;
151 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
152 state = GB::State::GB11_ExtPicExtZWJ;
153 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154 handled = true;
155 } else {
156 state = GB::State::Normal;
157 }
158 break;
159
160 case GB::State::GB11_ExtPicExtZWJ:
161 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
162 if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
163 shouldBreak = false;
164 handled = true;
165 }
166
167 state = GB::State::Normal;
168 break;
169
170 case GB::State::GB12_13_RI:
171 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
172 if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
173 shouldBreak = false;
174 handled = true;
175 }
176
177 state = GB::State::Normal;
178 break;
179 }
180
181 if (!handled) {
182 Q_ASSERT(state == GB::State::Normal);
183 if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
184 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
185 state = GB::State::GB11_ExtPicExt;
186 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
188 state = GB::State::GB11_ExtPicExtZWJ;
189 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190 }
191 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
192 state = GB::State::GB12_13_RI;
193 }
194 }
195
196 if (shouldBreak)
197 attributes[pos].graphemeBoundary = true;
198
199 lcls = cls;
200 }
201
202 attributes[len].graphemeBoundary = true; // GB2
203}
204
205
206namespace WB {
207
208enum Action {
209 NoBreak,
210 Break,
211 Lookup,
212 LookupW
213};
214
215static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
216// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
217 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
218 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
219 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
220 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
221 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
222 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
223 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
224 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
225 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
226 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // HebrewLetter
227 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // ALetter
228 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
229 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
230 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
231 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
232 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
233 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break }, // Numeric
234 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ExtendNumLet
235 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
236};
237
238} // namespace WB
239
240static void getWordBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
241{
242 enum WordType {
243 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244 } currentWordType = WordTypeNone;
245
246 QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
247 auto real_cls = cls; // Unaffected by WB4
248
249 for (qsizetype i = 0; i != len; ++i) {
250 qsizetype pos = i;
251 char32_t ucs4 = string[i];
252 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
253 ushort low = string[i + 1];
254 if (QChar::isLowSurrogate(ucs4: low)) {
255 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
256 ++i;
257 }
258 }
259
260 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
261 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
262 if (qt_initcharattributes_default_algorithm_only) {
263 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264 // which caused "hi.there" to be treated like if it were just a single word;
265 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
267 if (ucs4 == 0x002E) // FULL STOP
268 ncls = QUnicodeTables::WordBreak_MidNumLet;
269 else if (ucs4 == 0x003A) // COLON
270 ncls = QUnicodeTables::WordBreak_MidLetter;
271 }
272
273 uchar action = WB::breakTable[cls][ncls];
274 switch (action) {
275 case WB::Break:
276 if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
277 && prop->graphemeBreakClass
278 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
279 // WB3c: ZWJ × \p{Extended_Pictographic}
280 action = WB::NoBreak;
281 }
282 break;
283 case WB::NoBreak:
284 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
285 // WB4: X(Extend|Format)* -> X
286 real_cls = ncls;
287 continue;
288 }
289 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
290 // WB15/WB16: break between pairs of Regional indicator
291 ncls = QUnicodeTables::WordBreak_Any;
292 }
293 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
294 && real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295 // WB3d should not be affected by WB4
296 action = WB::Break;
297 }
298 break;
299 case WB::Lookup:
300 case WB::LookupW:
301 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
302 ucs4 = string[lookahead];
303 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
304 ushort low = string[lookahead + 1];
305 if (QChar::isLowSurrogate(ucs4: low)) {
306 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
307 ++lookahead;
308 }
309 }
310
311 prop = QUnicodeTables::properties(ucs4);
312 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
313
314 if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) {
315 // WB4: X(Extend|Format)* -> X
316 continue;
317 }
318
319 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
321 i = lookahead;
322 ncls = tcls;
323 action = WB::NoBreak;
324 }
325 break;
326 }
327 if (action != WB::NoBreak) {
328 action = WB::Break;
329 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
330 action = WB::NoBreak; // WB7a
331 }
332 break;
333 }
334
335 cls = ncls;
336 real_cls = ncls;
337
338 if (action == WB::Break) {
339 attributes[pos].wordBreak = true;
340 if (currentWordType != WordTypeNone)
341 attributes[pos].wordEnd = true;
342 switch (cls) {
343 case QUnicodeTables::WordBreak_Katakana:
344 currentWordType = WordTypeHiraganaKatakana;
345 attributes[pos].wordStart = true;
346 break;
347 case QUnicodeTables::WordBreak_HebrewLetter:
348 case QUnicodeTables::WordBreak_ALetter:
349 case QUnicodeTables::WordBreak_Numeric:
350 currentWordType = WordTypeAlphaNumeric;
351 attributes[pos].wordStart = true;
352 break;
353 default:
354 currentWordType = WordTypeNone;
355 break;
356 }
357 }
358 }
359
360 if (currentWordType != WordTypeNone)
361 attributes[len].wordEnd = true;
362 attributes[len].wordBreak = true; // WB2
363}
364
365
366namespace SB {
367
368enum State {
369 Initial,
370 Lower,
371 Upper,
372 LUATerm,
373 ATerm,
374 ATermC,
375 ACS,
376 STerm,
377 STermC,
378 SCS,
379 BAfterC,
380 BAfter,
381 Break,
382 Lookup
383};
384
385static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
386// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
387 { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
388 { Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
389 { Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, Initial, STerm , Initial }, // Upper
390
391 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
392 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
393 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
394 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
395
396 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
397 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
398 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
399 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
400 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
401};
402
403} // namespace SB
404
405static void getSentenceBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes)
406{
407 uchar state = SB::BAfter; // to meet SB1
408 for (qsizetype i = 0; i != len; ++i) {
409 qsizetype pos = i;
410 char32_t ucs4 = string[i];
411 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
412 ushort low = string[i + 1];
413 if (QChar::isLowSurrogate(ucs4: low)) {
414 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
415 ++i;
416 }
417 }
418
419 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
420 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
421
422 Q_ASSERT(state <= SB::BAfter);
423 state = SB::breakTable[state][ncls];
424 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
425 state = SB::Break;
426 for (qsizetype lookahead = i + 1; lookahead < len; ++lookahead) {
427 ucs4 = string[lookahead];
428 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
429 ushort low = string[lookahead + 1];
430 if (QChar::isLowSurrogate(ucs4: low)) {
431 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
432 ++lookahead;
433 }
434 }
435
436 prop = QUnicodeTables::properties(ucs4);
437 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
438 switch (tcls) {
439 case QUnicodeTables::SentenceBreak_Any:
440 case QUnicodeTables::SentenceBreak_Extend:
441 case QUnicodeTables::SentenceBreak_Sp:
442 case QUnicodeTables::SentenceBreak_Numeric:
443 case QUnicodeTables::SentenceBreak_SContinue:
444 case QUnicodeTables::SentenceBreak_Close:
445 continue;
446 case QUnicodeTables::SentenceBreak_Lower:
447 i = lookahead;
448 state = SB::Initial;
449 break;
450 default:
451 break;
452 }
453 break;
454 }
455 }
456 if (Q_UNLIKELY(state == SB::Break)) {
457 attributes[pos].sentenceBoundary = true;
458 state = SB::breakTable[SB::Initial][ncls];
459 }
460 }
461
462 attributes[len].sentenceBoundary = true; // SB2
463}
464
465
466// -----------------------------------------------------------------------------------------------------
467//
468// The line breaking algorithm.
469// See http://www.unicode.org/reports/tr14/tr14-39.html
470//
471// -----------------------------------------------------------------------------------------------------
472
473namespace LB {
474
475namespace NS { // Number Sequence
476
477// LB25 recommends to not break lines inside numbers of the form
478// described by the following regular expression:
479// (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
480
481enum Action {
482 None,
483 Start,
484 Continue,
485 Break
486};
487
488enum Class {
489 XX,
490 PRPO,
491 OPHY,
492 NU,
493 SYIS,
494 CLCP
495};
496
497static const uchar actionTable[CLCP + 1][CLCP + 1] = {
498// XX PRPO OPHY NU SYIS CLCP
499 { None , Start , Start , Start , None , None }, // XX
500 { None , Start , Continue, Continue, None , None }, // PRPO
501 { None , Start , Start , Continue, None , None }, // OPHY
502 { Break , Break , Break , Continue, Continue, Continue }, // NU
503 { Break , Break , Break , Continue, Continue, Continue }, // SYIS
504 { Break , Continue, Break , Break , Break , Break }, // CLCP
505};
506
507inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
508{
509 switch (lbc) {
510 case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
511 // resolve AI math symbols in numerical context to IS
512 if (category == QChar::Symbol_Math)
513 return SYIS;
514 break;
515 case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
516 return PRPO;
517 case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
518 return OPHY;
519 case QUnicodeTables::LineBreak_NU:
520 return NU;
521 case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
522 return SYIS;
523 case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
524 return CLCP;
525 default:
526 break;
527 }
528 return XX;
529}
530
531} // namespace NS
532
533/* In order to support the tailored implementation of LB25 properly
534 the following changes were made in the pair table to allow breaks
535 where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
536 (CL)(PO) from IB to DB
537 (CP)(PO) from IB to DB
538 (CL)(PR) from IB to DB
539 (CP)(PR) from IB to DB
540 (PO)(OP) from IB to DB
541 (PR)(OP) from IB to DB
542 (IS)(NU) from IB to DB
543 (SY)(NU) from IB to DB
544*/
545
546/* In order to implementat LB21a properly a special rule HH has been introduced and
547 the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
548 (HL)(HY|BA) from IB to CI
549 (HY|BA)(!CB) from DB to HH
550*/
551
552enum Action {
553 ProhibitedBreak, PB = ProhibitedBreak,
554 DirectBreak, DB = DirectBreak,
555 IndirectBreak, IB = IndirectBreak,
556 CombiningIndirectBreak, CI = CombiningIndirectBreak,
557 CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
558 ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen,
559 IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30
560};
561
562// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
563// about the table. It was removed in the later versions of the standard.
564static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = {
565/* 1↓ 2→ OP CL CP QU +Pi +Pf GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM*/
566/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
567/* CL */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
568/* CP */ { DB, PB, PB, IB, IB, PB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
569/* QU */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
570/* +Pi*/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
571/* +Pf*/ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
572/* GL */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
573/* NS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
574/* EX */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
575/* SY */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
576/* IS */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
577/* PR */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB },
578/* PO */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
579/* NU */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
580/* AL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
581/* HL */ { IN, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
582/* ID */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
583/* IN */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
584/* HY */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
585/* BA */ { HH, PB, PB, IB, IB, PB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB },
586/* BB */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB },
587/* B2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
588/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
589/* CM */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
590/* WJ */ { IB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
591/* H2 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
592/* H3 */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
593/* JL */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB },
594/* JV */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB },
595/* JT */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB },
596/* RI */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB },
597/* CB */ { DB, PB, PB, IB, IB, PB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
598/* EB */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
599/* EM */ { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
600};
601
602// The following line break classes are not treated by the pair table
603// and must be resolved outside:
604// AI, AK, AP, AS, BK, CB, CJ, CR, LF, NL, SA, SG, SP, VF, VI, XX, ZWJ
605
606} // namespace LB
607
608static void getLineBreaks(const char16_t *string, qsizetype len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
609{
610 qsizetype nestart = 0;
611 LB::NS::Class nelast = LB::NS::XX;
612
613 QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
614 QUnicodeTables::LineBreakClass cls = lcls;
615 const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(ucs4: U'\n');
616
617 for (qsizetype i = 0; i != len; ++i) {
618 qsizetype pos = i;
619 char32_t ucs4 = string[i];
620 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
621 ushort low = string[i + 1];
622 if (QChar::isLowSurrogate(ucs4: low)) {
623 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
624 ++i;
625 }
626 }
627
628 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
629 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
630 QUnicodeTables::LineBreakClass tcls;
631
632 if (options & QUnicodeTools::HangulLineBreakTailoring) {
633 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
634 && ncls <= QUnicodeTables::LineBreak_JT)
635 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
636 ) {
637 // LB27: use SPACE for line breaking
638 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
639 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
640 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
641 ncls = QUnicodeTables::LineBreak_AL;
642 } else {
643 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
644 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
645 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
646 if (FLAG(prop->category) & test)
647 ncls = QUnicodeTables::LineBreak_CM;
648 }
649 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
650 // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
651 if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
652 ncls = QUnicodeTables::LineBreak_AL;
653 }
654 }
655 }
656
657 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
658 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
659 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
660 if (FLAG(prop->category) & test)
661 ncls = QUnicodeTables::LineBreak_CM;
662 }
663
664 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
665 if (prop->category == QChar::Punctuation_InitialQuote) {
666 // LB15a: Do not break after an unresolved initial punctuation
667 // that lies at the start of the line, after a space, after
668 // opening punctuation, or after an unresolved quotation mark,
669 // even after spaces.
670 // (sot | BK | CR | LF | NL | OP | QU | GL | SP | ZW)
671 // [\p{Pi}&QU] SP* ×
672 // Note: sot is treated as LF here due to initial loop setup.
673 constexpr QUnicodeTables::LineBreakClass lb15a[] = {
674 QUnicodeTables::LineBreak_BK, QUnicodeTables::LineBreak_CR,
675 QUnicodeTables::LineBreak_LF, QUnicodeTables::LineBreak_OP,
676 QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
677 QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_GL,
678 QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_ZW};
679 if (std::any_of(first: std::begin(arr: lb15a), last: std::end(arr: lb15a),
680 pred: [lcls](auto x) { return x == lcls; })) {
681 ncls = QUnicodeTables::LineBreak_QU_Pi;
682 }
683 } else if (prop->category == QChar::Punctuation_FinalQuote) {
684 // LB15b: Do not break before an unresolved final punctuation
685 // that lies at the end of the line, before a space, before
686 // a prohibited break, or before an unresolved quotation mark,
687 // even after spaces.
688 // × [\p{Pf}&QU] ( SP | GL | WJ | CL | QU | CP | EX | IS
689 // | SY | BK | CR | LF | NL | ZW | eot)
690 auto nncls = QUnicodeTables::LineBreak_LF;
691
692 if (i + 1 < len) {
693 char32_t c = string[i + 1];
694 if (QChar::isHighSurrogate(ucs4: c) && i + 2 != len) {
695 ushort low = string[i + 2];
696 if (QChar::isLowSurrogate(ucs4: low))
697 c = QChar::surrogateToUcs4(high: c, low);
698 }
699 nncls = QUnicodeTables::LineBreakClass(
700 QUnicodeTables::properties(ucs4: c)->lineBreakClass);
701 }
702
703 constexpr QUnicodeTables::LineBreakClass lb15b[] = {
704 QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_GL,
705 QUnicodeTables::LineBreak_WJ, QUnicodeTables::LineBreak_CL,
706 QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
707 QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_CP,
708 QUnicodeTables::LineBreak_EX, QUnicodeTables::LineBreak_IS,
709 QUnicodeTables::LineBreak_SY, QUnicodeTables::LineBreak_BK,
710 QUnicodeTables::LineBreak_CR, QUnicodeTables::LineBreak_LF,
711 QUnicodeTables::LineBreak_ZW};
712 if (std::any_of(first: std::begin(arr: lb15b), last: std::end(arr: lb15b),
713 pred: [nncls](auto x) { return x == nncls; })) {
714 ncls = QUnicodeTables::LineBreak_QU_Pf;
715 }
716 }
717 }
718
719 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
720 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
721 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
722 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
723 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
724 cls = QUnicodeTables::LineBreak_AL;
725 goto next_no_cls_update;
726 }
727 goto next;
728 }
729
730 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
731 if (ncls > QUnicodeTables::LineBreak_SP)
732 goto next; // LB6: x(BK|CR|LF|NL)
733 goto next_no_cls_update; // LB7: xSP
734 }
735
736 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
737 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
738 if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
739 // don't update anything
740 goto next_no_cls_update;
741 }
742
743 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
744 // LB8a: ZWJ x
745 goto next;
746 }
747
748 // LB25: do not break lines inside numbers
749 {
750 LB::NS::Class necur = LB::NS::toClass(lbc: ncls, category: (QChar::Category)prop->category);
751 switch (LB::NS::actionTable[nelast][necur]) {
752 case LB::NS::Break:
753 // do not change breaks before and after the expression
754 for (qsizetype j = nestart + 1; j < pos; ++j)
755 attributes[j].lineBreak = false;
756 Q_FALLTHROUGH();
757 case LB::NS::None:
758 nelast = LB::NS::XX; // reset state
759 break;
760 case LB::NS::Start:
761 nestart = i;
762 Q_FALLTHROUGH();
763 default:
764 nelast = necur;
765 break;
766 }
767 }
768
769 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
770 // LB30a
771 ncls = QUnicodeTables::LineBreak_SP;
772 goto next;
773 }
774
775 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
776 && lastProp->category == QChar::Other_NotAssigned
777 && lastProp->graphemeBreakClass
778 == QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
779 // LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
780 goto next;
781 }
782
783 // for South East Asian chars that require a complex analysis, the Unicode
784 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
785 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
786 cls = QUnicodeTables::LineBreak_AL;
787
788 tcls = cls;
789 if (tcls == QUnicodeTables::LineBreak_CM || tcls == QUnicodeTables::LineBreak_ZWJ)
790 // LB10
791 tcls = QUnicodeTables::LineBreak_AL;
792 switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) {
793 case LB::DirectBreak:
794 attributes[pos].lineBreak = true;
795 break;
796 case LB::IndirectBreak:
797 if (lcls == QUnicodeTables::LineBreak_SP)
798 attributes[pos].lineBreak = true;
799 break;
800 case LB::CombiningIndirectBreak:
801 if (lcls != QUnicodeTables::LineBreak_SP)
802 goto next_no_cls_update;
803 attributes[pos].lineBreak = true;
804 break;
805 case LB::CombiningProhibitedBreak:
806 if (lcls != QUnicodeTables::LineBreak_SP)
807 goto next_no_cls_update;
808 break;
809 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
810 if (lcls != QUnicodeTables::LineBreak_HL)
811 attributes[pos].lineBreak = true;
812 break;
813 case LB::IndirectBreakIfNarrow:
814 switch (static_cast<QUnicodeTables::EastAsianWidth>(prop->eastAsianWidth)) {
815 default:
816 if (lcls != QUnicodeTables::LineBreak_SP)
817 break;
818 Q_FALLTHROUGH();
819 case QUnicodeTables::EastAsianWidth::F:
820 case QUnicodeTables::EastAsianWidth::W:
821 case QUnicodeTables::EastAsianWidth::H:
822 attributes[pos].lineBreak = true;
823 break;
824 }
825 break;
826 case LB::ProhibitedBreak:
827 // nothing to do
828 default:
829 break;
830 }
831
832 next:
833 cls = ncls;
834 lastProp = prop;
835 next_no_cls_update:
836 lcls = ncls;
837 }
838
839 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
840 // LB25: do not break lines inside numbers
841 for (qsizetype j = nestart + 1; j < len; ++j)
842 attributes[j].lineBreak = false;
843 }
844
845 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
846 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
847}
848
849
850static void getWhiteSpaces(const char16_t *string, qsizetype len, QCharAttributes *attributes)
851{
852 for (qsizetype i = 0; i != len; ++i) {
853 uint ucs4 = string[i];
854 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
855 ushort low = string[i + 1];
856 if (QChar::isLowSurrogate(ucs4: low)) {
857 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
858 ++i;
859 }
860 }
861
862 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
863 attributes[i].whiteSpace = true;
864 }
865}
866
867namespace Tailored {
868
869using CharAttributeFunction = void (*)(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes);
870
871
// Classification of a code unit for the Indic syllable scanner
// (see form() and indic_nextSyllableBoundary()).
// The values are stored as unsigned char in the indicForms table.
enum Form {
    Invalid = 0x0,
    UnknownForm = Invalid,  // alias: same value as Invalid
    Consonant,
    Nukta,
    Halant,
    Matra,
    VowelMark,
    StressMark,
    IndependentVowel,
    LengthMark,
    Control,                // returned by form() for U+200C and U+200D
    Other
};
886
887static const unsigned char indicForms[0xe00-0x900] = {
888 // Devangari
889 Invalid, VowelMark, VowelMark, VowelMark,
890 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
891 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
892 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
893
894 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
895 IndependentVowel, Consonant, Consonant, Consonant,
896 Consonant, Consonant, Consonant, Consonant,
897 Consonant, Consonant, Consonant, Consonant,
898
899 Consonant, Consonant, Consonant, Consonant,
900 Consonant, Consonant, Consonant, Consonant,
901 Consonant, Consonant, Consonant, Consonant,
902 Consonant, Consonant, Consonant, Consonant,
903
904 Consonant, Consonant, Consonant, Consonant,
905 Consonant, Consonant, Consonant, Consonant,
906 Consonant, Consonant, UnknownForm, UnknownForm,
907 Nukta, Other, Matra, Matra,
908
909 Matra, Matra, Matra, Matra,
910 Matra, Matra, Matra, Matra,
911 Matra, Matra, Matra, Matra,
912 Matra, Halant, UnknownForm, UnknownForm,
913
914 Other, StressMark, StressMark, StressMark,
915 StressMark, UnknownForm, UnknownForm, UnknownForm,
916 Consonant, Consonant, Consonant, Consonant,
917 Consonant, Consonant, Consonant, Consonant,
918
919 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
920 Other, Other, Other, Other,
921 Other, Other, Other, Other,
922 Other, Other, Other, Other,
923
924 Other, Other, Other, Other,
925 Other, Other, Other, Other,
926 Other, Other, Other, Consonant,
927 Consonant, Consonant /* ??? */, Consonant, Consonant,
928
929 // Bengali
930 Invalid, VowelMark, VowelMark, VowelMark,
931 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
932 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
933 IndependentVowel, Invalid, Invalid, IndependentVowel,
934
935 IndependentVowel, Invalid, Invalid, IndependentVowel,
936 IndependentVowel, Consonant, Consonant, Consonant,
937 Consonant, Consonant, Consonant, Consonant,
938 Consonant, Consonant, Consonant, Consonant,
939
940 Consonant, Consonant, Consonant, Consonant,
941 Consonant, Consonant, Consonant, Consonant,
942 Consonant, Invalid, Consonant, Consonant,
943 Consonant, Consonant, Consonant, Consonant,
944
945 Consonant, Invalid, Consonant, Invalid,
946 Invalid, Invalid, Consonant, Consonant,
947 Consonant, Consonant, UnknownForm, UnknownForm,
948 Nukta, Other, Matra, Matra,
949
950 Matra, Matra, Matra, Matra,
951 Matra, Invalid, Invalid, Matra,
952 Matra, Invalid, Invalid, Matra,
953 Matra, Halant, Consonant, UnknownForm,
954
955 Invalid, Invalid, Invalid, Invalid,
956 Invalid, Invalid, Invalid, VowelMark,
957 Invalid, Invalid, Invalid, Invalid,
958 Consonant, Consonant, Invalid, Consonant,
959
960 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
961 Other, Other, Other, Other,
962 Other, Other, Other, Other,
963 Other, Other, Other, Other,
964
965 Consonant, Consonant, Other, Other,
966 Other, Other, Other, Other,
967 Other, Other, Other, Other,
968 Other, Other, Other, Other,
969
970 // Gurmukhi
971 Invalid, VowelMark, VowelMark, VowelMark,
972 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
973 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
974 Invalid, Invalid, Invalid, IndependentVowel,
975
976 IndependentVowel, Invalid, Invalid, IndependentVowel,
977 IndependentVowel, Consonant, Consonant, Consonant,
978 Consonant, Consonant, Consonant, Consonant,
979 Consonant, Consonant, Consonant, Consonant,
980
981 Consonant, Consonant, Consonant, Consonant,
982 Consonant, Consonant, Consonant, Consonant,
983 Consonant, Invalid, Consonant, Consonant,
984 Consonant, Consonant, Consonant, Consonant,
985
986 Consonant, Invalid, Consonant, Consonant,
987 Invalid, Consonant, Consonant, Invalid,
988 Consonant, Consonant, UnknownForm, UnknownForm,
989 Nukta, Other, Matra, Matra,
990
991 Matra, Matra, Matra, Invalid,
992 Invalid, Invalid, Invalid, Matra,
993 Matra, Invalid, Invalid, Matra,
994 Matra, Halant, UnknownForm, UnknownForm,
995
996 Invalid, Invalid, Invalid, Invalid,
997 Invalid, UnknownForm, UnknownForm, UnknownForm,
998 Invalid, Consonant, Consonant, Consonant,
999 Consonant, Invalid, Consonant, Invalid,
1000
1001 Other, Other, Invalid, Invalid,
1002 Other, Other, Other, Other,
1003 Other, Other, Other, Other,
1004 Other, Other, Other, Other,
1005
1006 StressMark, StressMark, Consonant, Consonant,
1007 Other, Other, Other, Other,
1008 Other, Other, Other, Other,
1009 Other, Other, Other, Other,
1010
1011 // Gujarati
1012 Invalid, VowelMark, VowelMark, VowelMark,
1013 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1014 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1015 IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
1016
1017 IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
1018 IndependentVowel, Consonant, Consonant, Consonant,
1019 Consonant, Consonant, Consonant, Consonant,
1020 Consonant, Consonant, Consonant, Consonant,
1021
1022 Consonant, Consonant, Consonant, Consonant,
1023 Consonant, Consonant, Consonant, Consonant,
1024 Consonant, Invalid, Consonant, Consonant,
1025 Consonant, Consonant, Consonant, Consonant,
1026
1027 Consonant, Invalid, Consonant, Consonant,
1028 Invalid, Consonant, Consonant, Consonant,
1029 Consonant, Consonant, UnknownForm, UnknownForm,
1030 Nukta, Other, Matra, Matra,
1031
1032 Matra, Matra, Matra, Matra,
1033 Matra, Matra, Invalid, Matra,
1034 Matra, Matra, Invalid, Matra,
1035 Matra, Halant, UnknownForm, UnknownForm,
1036
1037 Other, UnknownForm, UnknownForm, UnknownForm,
1038 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1039 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1040 UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1041
1042 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1043 Other, Other, Other, Other,
1044 Other, Other, Other, Other,
1045 Other, Other, Other, Other,
1046
1047 Other, Other, Other, Other,
1048 Other, Other, Other, Other,
1049 Other, Other, Other, Other,
1050 Other, Other, Other, Other,
1051
1052 // Oriya
1053 Invalid, VowelMark, VowelMark, VowelMark,
1054 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1055 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1056 IndependentVowel, Invalid, Invalid, IndependentVowel,
1057
1058 IndependentVowel, Invalid, Invalid, IndependentVowel,
1059 IndependentVowel, Consonant, Consonant, Consonant,
1060 Consonant, Consonant, Consonant, Consonant,
1061 Consonant, Consonant, Consonant, Consonant,
1062
1063 Consonant, Consonant, Consonant, Consonant,
1064 Consonant, Consonant, Consonant, Consonant,
1065 Consonant, Invalid, Consonant, Consonant,
1066 Consonant, Consonant, Consonant, Consonant,
1067
1068 Consonant, Invalid, Consonant, Consonant,
1069 Invalid, Consonant, Consonant, Consonant,
1070 Consonant, Consonant, UnknownForm, UnknownForm,
1071 Nukta, Other, Matra, Matra,
1072
1073 Matra, Matra, Matra, Matra,
1074 Invalid, Invalid, Invalid, Matra,
1075 Matra, Invalid, Invalid, Matra,
1076 Matra, Halant, UnknownForm, UnknownForm,
1077
1078 Other, Invalid, Invalid, Invalid,
1079 Invalid, UnknownForm, LengthMark, LengthMark,
1080 Invalid, Invalid, Invalid, Invalid,
1081 Consonant, Consonant, Invalid, Consonant,
1082
1083 IndependentVowel, IndependentVowel, Invalid, Invalid,
1084 Invalid, Invalid, Other, Other,
1085 Other, Other, Other, Other,
1086 Other, Other, Other, Other,
1087
1088 Other, Consonant, Other, Other,
1089 Other, Other, Other, Other,
1090 Other, Other, Other, Other,
1091 Other, Other, Other, Other,
1092
1093 //Tamil
1094 Invalid, Invalid, VowelMark, Other,
1095 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1096 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1097 Invalid, Invalid, IndependentVowel, IndependentVowel,
1098
1099 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1100 IndependentVowel, Consonant, Invalid, Invalid,
1101 Invalid, Consonant, Consonant, Invalid,
1102 Consonant, Invalid, Consonant, Consonant,
1103
1104 Invalid, Invalid, Invalid, Consonant,
1105 Consonant, Invalid, Invalid, Invalid,
1106 Consonant, Consonant, Consonant, Invalid,
1107 Invalid, Invalid, Consonant, Consonant,
1108
1109 Consonant, Consonant, Consonant, Consonant,
1110 Consonant, Consonant, Consonant, Consonant,
1111 Consonant, Consonant, UnknownForm, UnknownForm,
1112 Invalid, Invalid, Matra, Matra,
1113
1114 Matra, Matra, Matra, Invalid,
1115 Invalid, Invalid, Matra, Matra,
1116 Matra, Invalid, Matra, Matra,
1117 Matra, Halant, Invalid, Invalid,
1118
1119 Invalid, Invalid, Invalid, Invalid,
1120 Invalid, Invalid, Invalid, LengthMark,
1121 Invalid, Invalid, Invalid, Invalid,
1122 Invalid, Invalid, Invalid, Invalid,
1123
1124 Invalid, Invalid, Invalid, Invalid,
1125 Invalid, Invalid, Other, Other,
1126 Other, Other, Other, Other,
1127 Other, Other, Other, Other,
1128
1129 Other, Other, Other, Other,
1130 Other, Other, Other, Other,
1131 Other, Other, Other, Other,
1132 Other, Other, Other, Other,
1133
1134 // Telugu
1135 Invalid, VowelMark, VowelMark, VowelMark,
1136 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1137 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1138 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1139
1140 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1141 IndependentVowel, Consonant, Consonant, Consonant,
1142 Consonant, Consonant, Consonant, Consonant,
1143 Consonant, Consonant, Consonant, Consonant,
1144
1145 Consonant, Consonant, Consonant, Consonant,
1146 Consonant, Consonant, Consonant, Consonant,
1147 Consonant, Invalid, Consonant, Consonant,
1148 Consonant, Consonant, Consonant, Consonant,
1149
1150 Consonant, Consonant, Consonant, Consonant,
1151 Invalid, Consonant, Consonant, Consonant,
1152 Consonant, Consonant, UnknownForm, UnknownForm,
1153 Invalid, Invalid, Matra, Matra,
1154
1155 Matra, Matra, Matra, Matra,
1156 Matra, Invalid, Matra, Matra,
1157 Matra, Invalid, Matra, Matra,
1158 Matra, Halant, Invalid, Invalid,
1159
1160 Invalid, Invalid, Invalid, Invalid,
1161 Invalid, LengthMark, Matra, Invalid,
1162 Invalid, Invalid, Invalid, Invalid,
1163 Invalid, Invalid, Invalid, Invalid,
1164
1165 IndependentVowel, IndependentVowel, Invalid, Invalid,
1166 Invalid, Invalid, Other, Other,
1167 Other, Other, Other, Other,
1168 Other, Other, Other, Other,
1169
1170 Other, Other, Other, Other,
1171 Other, Other, Other, Other,
1172 Other, Other, Other, Other,
1173 Other, Other, Other, Other,
1174
1175 // Kannada
1176 Invalid, Invalid, VowelMark, VowelMark,
1177 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1178 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1179 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1180
1181 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1182 IndependentVowel, Consonant, Consonant, Consonant,
1183 Consonant, Consonant, Consonant, Consonant,
1184 Consonant, Consonant, Consonant, Consonant,
1185
1186 Consonant, Consonant, Consonant, Consonant,
1187 Consonant, Consonant, Consonant, Consonant,
1188 Consonant, Invalid, Consonant, Consonant,
1189 Consonant, Consonant, Consonant, Consonant,
1190
1191 Consonant, Consonant, Consonant, Consonant,
1192 Invalid, Consonant, Consonant, Consonant,
1193 Consonant, Consonant, UnknownForm, UnknownForm,
1194 Nukta, Other, Matra, Matra,
1195
1196 Matra, Matra, Matra, Matra,
1197 Matra, Invalid, Matra, Matra,
1198 Matra, Invalid, Matra, Matra,
1199 Matra, Halant, Invalid, Invalid,
1200
1201 Invalid, Invalid, Invalid, Invalid,
1202 Invalid, LengthMark, LengthMark, Invalid,
1203 Invalid, Invalid, Invalid, Invalid,
1204 Invalid, Invalid, Consonant, Invalid,
1205
1206 IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1207 Invalid, Invalid, Other, Other,
1208 Other, Other, Other, Other,
1209 Other, Other, Other, Other,
1210
1211 Other, Other, Other, Other,
1212 Other, Other, Other, Other,
1213 Other, Other, Other, Other,
1214 Other, Other, Other, Other,
1215
1216 // Malayalam
1217 Invalid, Invalid, VowelMark, VowelMark,
1218 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1219 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1220 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1221
1222 IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1223 IndependentVowel, Consonant, Consonant, Consonant,
1224 Consonant, Consonant, Consonant, Consonant,
1225 Consonant, Consonant, Consonant, Consonant,
1226
1227 Consonant, Consonant, Consonant, Consonant,
1228 Consonant, Consonant, Consonant, Consonant,
1229 Consonant, Invalid, Consonant, Consonant,
1230 Consonant, Consonant, Consonant, Consonant,
1231
1232 Consonant, Consonant, Consonant, Consonant,
1233 Consonant, Consonant, Consonant, Consonant,
1234 Consonant, Consonant, UnknownForm, UnknownForm,
1235 Invalid, Invalid, Matra, Matra,
1236
1237 Matra, Matra, Matra, Matra,
1238 Invalid, Invalid, Matra, Matra,
1239 Matra, Invalid, Matra, Matra,
1240 Matra, Halant, Invalid, Invalid,
1241
1242 Invalid, Invalid, Invalid, Invalid,
1243 Invalid, Invalid, Invalid, Matra,
1244 Invalid, Invalid, Invalid, Invalid,
1245 Invalid, Invalid, Invalid, Invalid,
1246
1247 IndependentVowel, IndependentVowel, Invalid, Invalid,
1248 Invalid, Invalid, Other, Other,
1249 Other, Other, Other, Other,
1250 Other, Other, Other, Other,
1251
1252 Other, Other, Other, Other,
1253 Other, Other, Other, Other,
1254 Other, Other, Other, Other,
1255 Other, Other, Other, Other,
1256
1257 // Sinhala
1258 Invalid, Invalid, VowelMark, VowelMark,
1259 Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1260 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1261 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1262
1263 IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1264 IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1265 Invalid, Invalid, Consonant, Consonant,
1266 Consonant, Consonant, Consonant, Consonant,
1267
1268 Consonant, Consonant, Consonant, Consonant,
1269 Consonant, Consonant, Consonant, Consonant,
1270 Consonant, Consonant, Consonant, Consonant,
1271 Consonant, Consonant, Consonant, Consonant,
1272
1273 Consonant, Consonant, Invalid, Consonant,
1274 Consonant, Consonant, Consonant, Consonant,
1275 Consonant, Consonant, Consonant, Consonant,
1276 Invalid, Consonant, Invalid, Invalid,
1277
1278 Consonant, Consonant, Consonant, Consonant,
1279 Consonant, Consonant, Consonant, Invalid,
1280 Invalid, Invalid, Halant, Invalid,
1281 Invalid, Invalid, Invalid, Matra,
1282
1283 Matra, Matra, Matra, Matra,
1284 Matra, Invalid, Matra, Invalid,
1285 Matra, Matra, Matra, Matra,
1286 Matra, Matra, Matra, Matra,
1287
1288 Invalid, Invalid, Invalid, Invalid,
1289 Invalid, Invalid, Invalid, Invalid,
1290 Invalid, Invalid, Invalid, Invalid,
1291 Invalid, Invalid, Invalid, Invalid,
1292
1293 Invalid, Invalid, Matra, Matra,
1294 Other, Other, Other, Other,
1295 Other, Other, Other, Other,
1296 Other, Other, Other, Other,
1297};
1298
1299static inline Form form(unsigned short uc) {
1300 if (uc < 0x900 || uc > 0xdff) {
1301 if (uc == 0x25cc)
1302 return Consonant;
1303 if (uc == 0x200c || uc == 0x200d)
1304 return Control;
1305 return Other;
1306 }
1307 return (Form)indicForms[uc-0x900];
1308}
1309
// Uncomment the next line to get trace output from the Indic syllable scanner.
// #define INDIC_DEBUG
#ifdef INDIC_DEBUG
#define IDEBUG qDebug
#else
// Disabled form: the qDebug call sits in the else branch of an always-true
// `if constexpr`, so it is still parsed but never executed.
#define IDEBUG if constexpr (1) ; else qDebug
#endif
1316
1317/* syllables are of the form:
1318
1319 (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
1320 (Consonant Nukta? Halant)* Consonant Halant
1321 IndependentVowel VowelMark? StressMark?
1322
1323 We return syllable boundaries on invalid combinations as well
1324*/
1325static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1326{
1327 *invalid = false;
1328 IDEBUG(msg: "indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1329 const char16_t *uc = s+start;
1330
1331 qsizetype pos = 0;
1332 Form state = form(uc: uc[pos]);
1333 IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1334 pos++;
1335
1336 if (state != Consonant && state != IndependentVowel) {
1337 if (state != Other)
1338 *invalid = true;
1339 goto finish;
1340 }
1341
1342 while (pos < end - start) {
1343 Form newState = form(uc: uc[pos]);
1344 IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1345 switch (newState) {
1346 case Control:
1347 newState = state;
1348 if (state == Halant && uc[pos] == 0x200d /* ZWJ */)
1349 break;
1350 // the control character should be the last char in the item
1351 if (state == Consonant && script == QChar::Script_Bengali && uc[pos-1] == 0x09B0 && uc[pos] == 0x200d /* ZWJ */)
1352 break;
1353 if (state == Consonant && script == QChar::Script_Kannada && uc[pos-1] == 0x0CB0 && uc[pos] == 0x200d /* ZWJ */)
1354 break;
1355 // Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1356 ++pos;
1357 goto finish;
1358 case Consonant:
1359 if (state == Halant && (script != QChar::Script_Sinhala || uc[pos-1] == 0x200d /* ZWJ */))
1360 break;
1361 goto finish;
1362 case Halant:
1363 if (state == Nukta || state == Consonant)
1364 break;
1365 // Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1366 if (script == QChar::Script_Bengali && pos == 1 &&
1367 (uc[0] == 0x0985 || uc[0] == 0x098f))
1368 break;
1369 // Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1370 if (script == QChar::Script_Sinhala && state == Matra) {
1371 ++pos;
1372 continue;
1373 }
1374 if (script == QChar::Script_Malayalam && state == Matra && uc[pos-1] == 0x0d41) {
1375 ++pos;
1376 continue;
1377 }
1378 goto finish;
1379 case Nukta:
1380 if (state == Consonant)
1381 break;
1382 goto finish;
1383 case StressMark:
1384 if (state == VowelMark)
1385 break;
1386 Q_FALLTHROUGH();
1387 case VowelMark:
1388 if (state == Matra || state == LengthMark || state == IndependentVowel)
1389 break;
1390 Q_FALLTHROUGH();
1391 case Matra:
1392 if (state == Consonant || state == Nukta)
1393 break;
1394 if (state == Matra) {
1395 // ### needs proper testing for correct two/three part matras
1396 break;
1397 }
1398 // ### not sure if this is correct. If it is, does it apply only to Bengali or should
1399 // it work for all Indic languages?
1400 // the combination Independent_A + Vowel Sign AA is allowed.
1401 if (script == QChar::Script_Bengali && uc[pos] == 0x9be && uc[pos-1] == 0x985)
1402 break;
1403 if (script == QChar::Script_Tamil && state == Matra) {
1404 if (uc[pos-1] == 0x0bc6 &&
1405 (uc[pos] == 0xbbe || uc[pos] == 0xbd7))
1406 break;
1407 if (uc[pos-1] == 0x0bc7 && uc[pos] == 0xbbe)
1408 break;
1409 }
1410 goto finish;
1411
1412 case LengthMark:
1413 if (state == Matra) {
1414 // ### needs proper testing for correct two/three part matras
1415 break;
1416 }
1417 Q_FALLTHROUGH();
1418 case IndependentVowel:
1419 case Invalid:
1420 case Other:
1421 goto finish;
1422 }
1423 state = newState;
1424 pos++;
1425 }
1426 finish:
1427 return pos+start;
1428}
1429
1430static void indicAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1431{
1432 qsizetype end = from + len;
1433 attributes += from;
1434 qsizetype i = 0;
1435 while (i < len) {
1436 bool invalid;
1437 qsizetype boundary = indic_nextSyllableBoundary(script, s: text, start: from+i, end, invalid: &invalid) - from;
1438 attributes[i].graphemeBoundary = true;
1439
1440 if (boundary > len-1) boundary = len;
1441 i++;
1442 while (i < boundary) {
1443 attributes[i].graphemeBoundary = false;
1444 ++i;
1445 }
1446 assert(i == boundary);
1447 }
1448
1449
1450}
1451
1452#if QT_CONFIG(library)
1453
// Version number passed to QLibrary when loading the "thai" library.
#define LIBTHAI_MAJOR 0
1455
/*
 * Local copy of a libthai structure: if libthai changes, this definition
 * has to be updated as well.
 */
struct thcell_t {
    unsigned char base;      /**< base character */
    unsigned char hilo;      /**< upper/lower vowel/diacritic */
    unsigned char top;       /**< top-level mark */
};
1464
// Incomplete type: the code in this file only ever handles ThBrk through pointers.
using ThBrk = struct _ThBrk;
1466
1467namespace {
1468
1469class LibThai final
1470{
1471 Q_DISABLE_COPY_MOVE(LibThai)
1472
1473 using th_brk_new_def = ThBrk *(*)(const char *);
1474 using th_brk_delete_def = void (*)(ThBrk *);
1475 using th_brk_find_breaks_def = int (*)(ThBrk *, const unsigned char *, int *, size_t);
1476 using th_next_cell_def = size_t (*)(const unsigned char *, size_t, struct thcell_t *, int);
1477
1478public:
1479 LibThai() : m_library("thai"_L1, LIBTHAI_MAJOR)
1480 {
1481 m_th_brk_find_breaks =
1482 reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve(symbol: "th_brk_find_breaks"));
1483 m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve(symbol: "th_next_cell"));
1484
1485 auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve(symbol: "th_brk_new"));
1486 if (th_brk_new) {
1487 m_state = th_brk_new(nullptr);
1488 m_th_brk_delete =
1489 reinterpret_cast<th_brk_delete_def>(m_library.resolve(symbol: "th_brk_delete"));
1490 }
1491 }
1492
1493 ~LibThai()
1494 {
1495 if (m_state && m_th_brk_delete)
1496 m_th_brk_delete(m_state);
1497 m_library.unload();
1498 }
1499
1500 bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1501
1502 int brk_find_breaks(const unsigned char *s, int *pos, size_t pos_sz) const
1503 {
1504 Q_ASSERT(m_state);
1505 Q_ASSERT(m_th_brk_find_breaks);
1506 return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
1507 }
1508
1509 size_t next_cell(const unsigned char *s, size_t len, struct thcell_t *cell, int is_decomp_am)
1510 {
1511 Q_ASSERT(m_th_next_cell);
1512 return m_th_next_cell(s, len, cell, is_decomp_am);
1513 }
1514
1515private:
1516 QLibrary m_library;
1517
1518 // Global state for th_brk_find_breaks().
1519 // Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1520 // state is read-only, and so it is safe to use it from multiple threads after
1521 // initialization. This is also stated in the libthai documentation.
1522 ThBrk *m_state = nullptr;
1523
1524 th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
1525 th_next_cell_def m_th_next_cell = nullptr;
1526 th_brk_delete_def m_th_brk_delete = nullptr;
1527};
1528
1529} // unnamed namespace
1530
1531Q_GLOBAL_STATIC(LibThai, g_libThai)
1532
1533static void to_tis620(const char16_t *string, qsizetype len, char *cstr)
1534{
1535 qsizetype i;
1536 unsigned char *result = reinterpret_cast<unsigned char *>(cstr);
1537
1538 for (i = 0; i < len; ++i) {
1539 if (string[i] <= 0xa0)
1540 result[i] = static_cast<unsigned char>(string[i]);
1541 else if (string[i] >= 0xe01 && string[i] <= 0xe5b)
1542 result[i] = static_cast<unsigned char>(string[i] - 0xe00 + 0xa0);
1543 else
1544 result[i] = static_cast<unsigned char>(~0); // Same encoding as libthai uses for invalid chars
1545 }
1546
1547 result[len] = 0;
1548}
1549
1550/*
1551 * Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1552 */
1553static void thaiAssignAttributes(const char16_t *string, qsizetype len, QCharAttributes *attributes)
1554{
1555 constexpr qsizetype Prealloc = 128;
1556 QVarLengthArray<char, Prealloc + 1> s(len + 1);
1557 QVarLengthArray<int, Prealloc> break_positions(len);
1558 qsizetype numbreaks, i;
1559 struct thcell_t tis_cell;
1560
1561 LibThai *libThai = g_libThai;
1562 if (!libThai || !libThai->isInitialized())
1563 return;
1564
1565 to_tis620(string, len, cstr: s.data());
1566
1567 for (i = 0; i < len; ++i) {
1568 attributes[i].wordBreak = false;
1569 attributes[i].wordStart = false;
1570 attributes[i].wordEnd = false;
1571 attributes[i].lineBreak = false;
1572 }
1573
1574 attributes[0].wordBreak = true;
1575 attributes[0].wordStart = true;
1576 attributes[0].wordEnd = false;
1577 numbreaks = libThai->brk_find_breaks(s: reinterpret_cast<const unsigned char *>(s.data()),
1578 pos: break_positions.data(),
1579 pos_sz: static_cast<size_t>(break_positions.size()));
1580 for (i = 0; i < numbreaks; ++i) {
1581 attributes[break_positions[i]].wordBreak = true;
1582 attributes[break_positions[i]].wordStart = true;
1583 attributes[break_positions[i]].wordEnd = true;
1584 attributes[break_positions[i]].lineBreak = true;
1585 }
1586 if (numbreaks > 0)
1587 attributes[break_positions[numbreaks - 1]].wordStart = false;
1588
1589 /* manage grapheme boundaries */
1590 i = 0;
1591 while (i < len) {
1592 size_t cell_length =
1593 libThai->next_cell(s: reinterpret_cast<const unsigned char *>(s.data()) + i,
1594 len: size_t(len - i), cell: &tis_cell, is_decomp_am: true);
1595
1596 attributes[i].graphemeBoundary = true;
1597 for (size_t j = 1; j < cell_length; ++j)
1598 attributes[i + j].graphemeBoundary = false;
1599
1600 i += cell_length;
1601 }
1602}
1603
1604#endif // QT_CONFIG(library)
1605
1606static void thaiAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1607{
1608 assert(script == QChar::Script_Thai);
1609#if QT_CONFIG(library)
1610 const char16_t *uc = text + from;
1611 attributes += from;
1612 Q_UNUSED(script);
1613 thaiAssignAttributes(string: uc, len, attributes);
1614#else
1615 Q_UNUSED(script);
1616 Q_UNUSED(text);
1617 Q_UNUSED(from);
1618 Q_UNUSED(len);
1619 Q_UNUSED(attributes);
1620#endif
1621}
1622
1623/*
1624 tibetan syllables are of the form:
1625 head position consonant
1626 first sub-joined consonant
1627 ....intermediate sub-joined consonants (if any)
1628 last sub-joined consonant
1629 sub-joined vowel (a-chung U+0F71)
1630 standard or compound vowel sign (or 'virama' for devanagari transliteration)
1631*/
1632
/* Role a code point can play inside a Tibetan syllable. */
enum TibetanForm {
    TibetanOther,
    TibetanHeadConsonant,
    TibetanSubjoinedConsonant,
    TibetanSubjoinedVowel,
    TibetanVowel
};

/*
    Classification of the 0x80 code points starting at U+0F40: entry k
    describes U+0F40 + k.
    NOTE(review): no entry is TibetanSubjoinedVowel; U+0F71 is listed as
    TibetanVowel here.
*/
static const unsigned char tibetanForm[0x80] = {
    /* U+0F40 - U+0F4F */
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,

    /* U+0F50 - U+0F5F */
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,

    /* U+0F60 - U+0F6F */
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,

    /* U+0F70 - U+0F7F */
    TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,

    /* U+0F80 - U+0F8F */
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,
    TibetanOther, TibetanOther, TibetanOther, TibetanOther,

    /* U+0F90 - U+0F9F */
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,

    /* U+0FA0 - U+0FAF */
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,

    /* U+0FB0 - U+0FBF */
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
    TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther
};

/*
    Looks up the TibetanForm of code unit c; anything outside
    [U+0F40, U+0FC0) is TibetanOther. The argument is evaluated more than
    once, so it must not have side effects.
*/
#define tibetan_form(c) \
    (((c) >= 0x0f40 && (c) < 0x0fc0) ? static_cast<TibetanForm>(tibetanForm[(c) - 0x0f40]) \
                                     : TibetanOther)
1686
1687static qsizetype tibetan_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1688{
1689 const char16_t *uc = s + start;
1690
1691 qsizetype pos = 0;
1692 TibetanForm state = tibetan_form(*uc);
1693
1694/* qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);*/
1695 pos++;
1696
1697 if (state != TibetanHeadConsonant) {
1698 if (state != TibetanOther)
1699 *invalid = true;
1700 goto finish;
1701 }
1702
1703 while (pos < end - start) {
1704 TibetanForm newState = tibetan_form(uc[pos]);
1705 switch (newState) {
1706 case TibetanSubjoinedConsonant:
1707 case TibetanSubjoinedVowel:
1708 if (state != TibetanHeadConsonant &&
1709 state != TibetanSubjoinedConsonant)
1710 goto finish;
1711 state = newState;
1712 break;
1713 case TibetanVowel:
1714 if (state != TibetanHeadConsonant &&
1715 state != TibetanSubjoinedConsonant &&
1716 state != TibetanSubjoinedVowel)
1717 goto finish;
1718 break;
1719 case TibetanOther:
1720 case TibetanHeadConsonant:
1721 goto finish;
1722 }
1723 pos++;
1724 }
1725
1726finish:
1727 *invalid = false;
1728 return start+pos;
1729}
1730
1731static void tibetanAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1732{
1733 qsizetype end = from + len;
1734 qsizetype i = 0;
1735 Q_UNUSED(script);
1736 attributes += from;
1737 while (i < len) {
1738 bool invalid;
1739 qsizetype boundary = tibetan_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
1740
1741 attributes[i].graphemeBoundary = true;
1742
1743 if (boundary > len-1) boundary = len;
1744 i++;
1745 while (i < boundary) {
1746 attributes[i].graphemeBoundary = false;
1747 ++i;
1748 }
1749 assert(i == boundary);
1750 }
1751}
1752
/*
    Myanmar character classes. The numeric value of a class is the column it
    selects in mymrStateTable, so the order here must match that table.
*/
enum MymrCharClassValues {
    Mymr_CC_RESERVED           = 0,
    Mymr_CC_CONSONANT          = 1,  // consonant of type 1 (has a subscript form)
    Mymr_CC_CONSONANT2         = 2,  // consonant of type 2 (no subscript form)
    Mymr_CC_NGA                = 3,  // consonant NGA
    Mymr_CC_YA                 = 4,  // consonant YA
    Mymr_CC_RA                 = 5,  // consonant RA
    Mymr_CC_WA                 = 6,  // consonant WA
    Mymr_CC_HA                 = 7,  // consonant HA
    Mymr_CC_IND_VOWEL          = 8,  // independent vowel
    Mymr_CC_ZERO_WIDTH_NJ_MARK = 9,  // zero width non-joiner (0x200C)
    Mymr_CC_VIRAMA             = 10, // subscript consonant combining character
    Mymr_CC_PRE_VOWEL          = 11, // dependent vowel placed before the base (vowel e)
    Mymr_CC_BELOW_VOWEL        = 12, // dependent vowel placed below the base (vowel u, uu)
    Mymr_CC_ABOVE_VOWEL        = 13, // dependent vowel placed above the base (vowel i, ii, ai)
    Mymr_CC_POST_VOWEL         = 14, // dependent vowel placed after the base (vowel aa)
    Mymr_CC_SIGN_ABOVE         = 15,
    Mymr_CC_SIGN_BELOW         = 16,
    Mymr_CC_SIGN_AFTER         = 17,
    Mymr_CC_ZERO_WIDTH_J_MARK  = 18, // zero width joiner
    Mymr_CC_COUNT              = 19  // number of character classes
};
1775
/*
    Flag bits that are ORed onto a MymrCharClassValues value.
    The class value itself lives in the low 16 bits (Mymr_CF_CLASS_MASK).
*/
enum MymrCharClassFlags {
    Mymr_CF_CLASS_MASK    = 0x0000FFFF,

    /* position flags, one bit each, covered by Mymr_CF_POS_MASK */
    Mymr_CF_POS_AFTER     = 0x00010000,
    Mymr_CF_POS_ABOVE     = 0x00020000,
    Mymr_CF_POS_BELOW     = 0x00040000,
    Mymr_CF_POS_BEFORE    = 0x00080000,
    Mymr_CF_POS_MASK      = 0x000f0000,

    Mymr_CF_AFTER_KINZI   = 0x00100000,

    /* category flags to speed up comparing */
    Mymr_CF_CONSONANT     = 0x01000000,
    Mymr_CF_MEDIAL        = 0x02000000,
    Mymr_CF_IND_VOWEL     = 0x04000000,
    Mymr_CF_DEP_VOWEL     = 0x08000000,
    /* add a dotted circle if a character with this flag is the first in a syllable */
    Mymr_CF_DOTTED_CIRCLE = 0x10000000,
    Mymr_CF_VIRAMA        = 0x20000000
};
1796
1797Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
1798
/* Characters that get referred to by name */
enum MymrChar {
    Mymr_C_NGA           = 0x1004,
    Mymr_C_YA            = 0x101A,
    Mymr_C_RA            = 0x101B,
    Mymr_C_VOWEL_E       = 0x1031,
    Mymr_C_VIRAMA        = 0x1039,
    Mymr_C_SIGN_ZWNJ     = 0x200C,
    Mymr_C_SIGN_ZWJ      = 0x200D,
    Mymr_C_DOTTED_CIRCLE = 0x25CC
};
1811
1812enum
1813{
1814 Mymr_xx = Mymr_CC_RESERVED,
1815 Mymr_c1 = Mymr_CC_CONSONANT | Mymr_CF_CONSONANT | Mymr_CF_POS_BELOW,
1816 Mymr_c2 = Mymr_CC_CONSONANT2 | Mymr_CF_CONSONANT,
1817 Mymr_ng = Mymr_CC_NGA | Mymr_CF_CONSONANT | Mymr_CF_POS_ABOVE,
1818 Mymr_ya = Mymr_CC_YA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_AFTER | Mymr_CF_AFTER_KINZI,
1819 Mymr_ra = Mymr_CC_RA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BEFORE,
1820 Mymr_wa = Mymr_CC_WA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
1821 Mymr_ha = Mymr_CC_HA | Mymr_CF_CONSONANT | Mymr_CF_MEDIAL | Mymr_CF_POS_BELOW,
1822 Mymr_id = Mymr_CC_IND_VOWEL | Mymr_CF_IND_VOWEL,
1823 Mymr_vi = Mymr_CC_VIRAMA | Mymr_CF_VIRAMA | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE,
1824 Mymr_dl = Mymr_CC_PRE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BEFORE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
1825 Mymr_db = Mymr_CC_BELOW_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
1826 Mymr_da = Mymr_CC_ABOVE_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
1827 Mymr_dr = Mymr_CC_POST_VOWEL | Mymr_CF_DEP_VOWEL | Mymr_CF_POS_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI,
1828 Mymr_sa = Mymr_CC_SIGN_ABOVE | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_ABOVE | Mymr_CF_AFTER_KINZI,
1829 Mymr_sb = Mymr_CC_SIGN_BELOW | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_POS_BELOW | Mymr_CF_AFTER_KINZI,
1830 Mymr_sp = Mymr_CC_SIGN_AFTER | Mymr_CF_DOTTED_CIRCLE | Mymr_CF_AFTER_KINZI
1831};
1832
1833
1834typedef int MymrCharClass;
1835
1836
1837static const MymrCharClass mymrCharClasses[] =
1838{
1839 Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1,
1840 Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, /* 1000 - 100F */
1841 Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1,
1842 Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, /* 1010 - 101F */
1843 Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id,
1844 Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, /* 1020 - 102F */
1845 Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb,
1846 Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1030 - 103F */
1847 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
1848 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1040 - 104F */
1849 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
1850 Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, /* 1050 - 105F */
1851};
1852
1853static MymrCharClass
1854getMyanmarCharClass (ushort ch)
1855{
1856 if (ch == Mymr_C_SIGN_ZWJ)
1857 return Mymr_CC_ZERO_WIDTH_J_MARK;
1858
1859 if (ch == Mymr_C_SIGN_ZWNJ)
1860 return Mymr_CC_ZERO_WIDTH_NJ_MARK;
1861
1862 if (ch < 0x1000 || ch > 0x105f)
1863 return Mymr_CC_RESERVED;
1864
1865 return mymrCharClasses[ch - 0x1000];
1866}
1867
1868static const signed char mymrStateTable[][Mymr_CC_COUNT] =
1869{
1870/* xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj */
1871 { 1, 4, 4, 2, 4, 4, 4, 4, 24, 1, 27, 17, 18, 19, 20, 21, 1, 1, 4}, /* 0 - ground state */
1872 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sp to the right of the syllable) */
1873 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 3, 17, 18, 19, 20, 21, -1, -1, 4}, /* 2 - NGA */
1874 {-1, 4, 4, 4, 4, 4, 4, 4, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 3 - Virama after NGA */
1875 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 5, 17, 18, 19, 20, 21, 1, 1, -1}, /* 4 - Base consonant */
1876 {-2, 6, -2, -2, 7, 8, 9, 10, -2, 23, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 5 - First virama */
1877 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 25, 17, 18, 19, 20, 21, -1, -1, -1}, /* 6 - c1 after virama */
1878 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 7 - ya after virama */
1879 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 8 - ra after virama */
1880 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, -1, -1}, /* 9 - wa after virama */
1881 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 10 - ha after virama */
1882 {-1, -1, -1, -1, 7, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 11 - Virama after NGA+zwj */
1883 {-2, -2, -2, -2, -2, -2, 13, 14, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 12 - Second virama */
1884 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 17, 18, 19, 20, 21, -1, -1, -1}, /* 13 - wa after virama */
1885 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 14 - ha after virama */
1886 {-2, -2, -2, -2, -2, -2, -2, 16, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 15 - Third virama */
1887 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 18, 19, 20, 21, -1, -1, -1}, /* 16 - ha after virama */
1888 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 20, 21, 1, 1, -1}, /* 17 - dl, Dependent vowel e */
1889 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 19, -1, 21, 1, 1, -1}, /* 18 - db, Dependent vowel u,uu */
1890 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1}, /* 19 - da, Dependent vowel i,ii,ai */
1891 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 22, -1, -1, -1, -1, -1, 1, 1, -1}, /* 20 - dr, Dependent vowel aa */
1892 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 21 - sa, Sign anusvara */
1893 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 22 - atha */
1894 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1}, /* 23 - zwnj for atha */
1895 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 24 - Independent vowel */
1896 {-2, -2, -2, -2, 26, 26, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2, -2}, /* 25 - Virama after subscript consonant */
1897 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 17, 18, 19, 20, 21, -1, 1, -1}, /* 26 - ra/ya after subscript consonant + virama */
1898 {-1, 6, -1, -1, 7, 8, 9, 10, -1, 23, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 27 - Virama after ground state */
1899/* exit state -2 is for invalid order of medials and combination of invalids
1900 with virama where virama should treat as start of next syllable
1901 */
1902};
1903
/*
    Tracing for the Myanmar syllable scanner. Define MYANMAR_DEBUG to route
    MMDEBUG(...) to qDebug; otherwise the call sits behind "if (0)", so it is
    still compiled but never executed and its arguments are never evaluated.
*/
/* #define MYANMAR_DEBUG */
#ifndef MYANMAR_DEBUG
#  define MMDEBUG if (0) printf
#else
#  define MMDEBUG qDebug
#endif
1912
1913/*
1914// Given an input string of characters and a location in which to start looking
1915// calculate, using the state table, which one is the last character of the syllable
1916// that starts in the starting position.
1917*/
1918static qsizetype myanmar_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
1919{
1920 const char16_t *uc = s + start;
1921 int state = 0;
1922 qsizetype pos = start;
1923 *invalid = false;
1924
1925 while (pos < end) {
1926 MymrCharClass charClass = getMyanmarCharClass(ch: *uc);
1927 state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
1928 if (pos == start)
1929 *invalid = (bool)(charClass & Mymr_CF_DOTTED_CIRCLE);
1930
1931 MMDEBUG(format: "state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
1932
1933 if (state < 0) {
1934 if (state < -1)
1935 --pos;
1936 break;
1937 }
1938 ++uc;
1939 ++pos;
1940 }
1941 return pos;
1942}
1943
1944static void myanmarAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
1945{
1946 qsizetype end = from + len;
1947 qsizetype i = 0;
1948 Q_UNUSED(script);
1949 attributes += from;
1950 while (i < len) {
1951 bool invalid;
1952 qsizetype boundary = myanmar_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
1953
1954 attributes[i].graphemeBoundary = true;
1955 attributes[i].lineBreak = true;
1956
1957 if (boundary > len-1)
1958 boundary = len;
1959 i++;
1960 while (i < boundary) {
1961 attributes[i].graphemeBoundary = false;
1962 ++i;
1963 }
1964 assert(i == boundary);
1965 }
1966}
1967
1968/*
1969// Vocabulary
1970// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
1971// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
1972// split vowels, signs... but there is only one base in a syllable, it has to be coded as
1973// the first character of the syllable.
1974// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
1975// Khmer language has five of them. Khmer split vowels either have one part before the
1976// base and one after the base or they have a part before the base and a part above the base.
1977// The first part of all Khmer split vowels is the same character, identical to
1978// the glyph of Khmer dependent vowel SRA EI
1979// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
1980// Differently than indian languages, the coeng modifies the consonant that follows it,
1981// not the one preceding it Each consonant has two forms, the base form and the subscript form
1982// the base form is the normal one (using the consonants code-point), the subscript form is
1983// displayed when the combination coeng + consonant is encountered.
// Consonant of type 1 -> A consonant which has a subscript form that only occupies space under a base consonant
1985// Consonant of type 2.-> Its subscript form occupies space under and before the base (only one, RO)
1986// Consonant of Type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
//     if it is attached to a consonant of the first series or a consonant of the second series.
//     Most consonants have an equivalent in the other series, but some of them exist only in
//     one series (for example SA). If we want to use the consonant SA with a vowel sound that
//     can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
//     of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
//     x17C9 and x17CA. TRIISAP changes a first series consonant to second series sound and
//     MUSIKATOAN a second series consonant to have a first series vowel sound.
//     Consonant shifters are both normally superscript marks, but, when they are followed by a
//     superscript, they change shape and take the form of subscript dependent vowel SRA U.
//     If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
//     should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
//     be placed after the coeng consonant.
2000// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2001// Each vowel has its own position. Only one vowel per syllable is allowed.
2002// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2003// Allowed in a syllable.
2004//
2005//
2006// order is important here! This order must be the same that is found in each horizontal
2007// line in the statetable for Khmer (see khmerStateTable) .
2008*/
/*
    Khmer character classes. The numeric value of a class is the column it
    selects in khmerStateTable, so the order here must match that table.
*/
enum KhmerCharClassValues {
    CC_RESERVED           = 0,
    CC_CONSONANT          = 1,  // consonant of type 1 or independent vowel
    CC_CONSONANT2         = 2,  // consonant of type 2
    CC_CONSONANT3         = 3,  // consonant of type 3
    CC_ZERO_WIDTH_NJ_MARK = 4,  // zero width non-joiner (0x200C)
    CC_CONSONANT_SHIFTER  = 5,
    CC_ROBAT              = 6,  // Khmer special diacritic accent, treated differently in the state table
    CC_COENG              = 7,  // subscript consonant combining character
    CC_DEPENDENT_VOWEL    = 8,
    CC_SIGN_ABOVE         = 9,
    CC_SIGN_AFTER         = 10,
    CC_ZERO_WIDTH_J_MARK  = 11, // zero width joiner
    CC_COUNT              = 12  // number of character classes
};
2024
2025
/*
    Flag bits that are ORed onto a KhmerCharClassValues value.
    The class value itself lives in the low 16 bits (CF_CLASS_MASK).
*/
enum KhmerCharClassFlags {
    CF_CLASS_MASK    = 0x0000FFFF,

    /* position flags, one bit each, covered by CF_POS_MASK */
    CF_POS_AFTER     = 0x00010000,
    CF_POS_ABOVE     = 0x00020000,
    CF_POS_BELOW     = 0x00040000,
    CF_POS_BEFORE    = 0x00080000,
    CF_POS_MASK      = 0x000f0000,

    /* category flags; unless noted they only exist to speed up comparing */
    CF_CONSONANT     = 0x01000000,
    CF_SPLIT_VOWEL   = 0x02000000, // split vowel: the first part is added in front of the syllable
    CF_DOTTED_CIRCLE = 0x04000000, // add a dotted circle if such a character is the first in a syllable
    CF_COENG         = 0x08000000,
    CF_SHIFTER       = 0x10000000,
    CF_ABOVE_VOWEL   = 0x20000000
};
2043
2044Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
2045
/* Characters that get referred to by name */
enum KhmerChar {
    C_RO           = 0x179A,
    C_VOWEL_AA     = 0x17B6,
    C_VOWEL_E      = 0x17C1,
    C_SIGN_NIKAHIT = 0x17C6,
    C_COENG        = 0x17D2,
    C_SIGN_ZWNJ    = 0x200C,
    C_SIGN_ZWJ     = 0x200D
};
2056
2057
2058/*
2059// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2060// they are also used to know where a character should be placed (location in reference to the base character)
2061// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2062// indicate error in syllable construction
2063*/
2064enum {
2065 _xx = CC_RESERVED,
2066 _sa = CC_SIGN_ABOVE | CF_DOTTED_CIRCLE | CF_POS_ABOVE,
2067 _sp = CC_SIGN_AFTER | CF_DOTTED_CIRCLE| CF_POS_AFTER,
2068 _c1 = CC_CONSONANT | CF_CONSONANT,
2069 _c2 = CC_CONSONANT2 | CF_CONSONANT,
2070 _c3 = CC_CONSONANT3 | CF_CONSONANT,
2071 _rb = CC_ROBAT | CF_POS_ABOVE | CF_DOTTED_CIRCLE,
2072 _cs = CC_CONSONANT_SHIFTER | CF_DOTTED_CIRCLE | CF_SHIFTER,
2073 _dl = CC_DEPENDENT_VOWEL | CF_POS_BEFORE | CF_DOTTED_CIRCLE,
2074 _db = CC_DEPENDENT_VOWEL | CF_POS_BELOW | CF_DOTTED_CIRCLE,
2075 _da = CC_DEPENDENT_VOWEL | CF_POS_ABOVE | CF_DOTTED_CIRCLE | CF_ABOVE_VOWEL,
2076 _dr = CC_DEPENDENT_VOWEL | CF_POS_AFTER | CF_DOTTED_CIRCLE,
2077 _co = CC_COENG | CF_COENG | CF_DOTTED_CIRCLE,
2078
2079 /* split vowel */
2080 _va = _da | CF_SPLIT_VOWEL,
2081 _vr = _dr | CF_SPLIT_VOWEL
2082};
2083
2084
2085/*
2086// Character class: a character class value
2087// ORed with character class flags.
2088*/
2089typedef unsigned long KhmerCharClass;
2090
2091
2092/*
2093// Character class tables
2094// _xx character does not combine into syllable, such as numbers, puntuation marks, non-Khmer signs...
2095// _sa Sign placed above the base
2096// _sp Sign placed after the base
2097// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2098// _c2 Consonant of type 2 (only RO)
2099// _c3 Consonant of type 3
2100// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2101// _cd Consonant-shifter
2102// _dl Dependent vowel placed before the base (left of the base)
2103// _db Dependent vowel placed below the base
2104// _da Dependent vowel placed above the base
2105// _dr Dependent vowel placed behind the base (right of the base)
2106// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2107// it to create a subscript consonant or independent vowel
2108// _va Khmer split vowel in which the first part is before the base and the second one above the base
2109// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2110*/
2111static const KhmerCharClass khmerCharClasses[] = {
2112 _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, /* 1780 - 178F */
2113 _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, /* 1790 - 179F */
2114 _c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, /* 17A0 - 17AF */
2115 _c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, /* 17B0 - 17BF */
2116 _vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, /* 17C0 - 17CF */
2117 _sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx /* 17D0 - 17DF */
2118};
2119
/* First and last code point covered by khmerCharClasses; keep in sync with that table. */
enum KhmerCharClassesRange {
    KhmerFirstChar = 0x1780,
    KhmerLastChar  = 0x17df
};
2125
2126/*
2127// Below we define how a character in the input string is either in the khmerCharClasses table
2128// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2129// within the syllable, but are not in the table) we also get their type back, or an unknown object
2130// in which case we get _xx (CC_RESERVED) back
2131*/
2132static KhmerCharClass getKhmerCharClass(ushort uc)
2133{
2134 if (uc == C_SIGN_ZWJ) {
2135 return CC_ZERO_WIDTH_J_MARK;
2136 }
2137
2138 if (uc == C_SIGN_ZWNJ) {
2139 return CC_ZERO_WIDTH_NJ_MARK;
2140 }
2141
2142 if (uc < KhmerFirstChar || uc > KhmerLastChar) {
2143 return CC_RESERVED;
2144 }
2145
2146 return khmerCharClasses[uc - KhmerFirstChar];
2147}
2148
2149
2150/*
2151// The stateTable is used to calculate the end (the length) of a well
2152// formed Khmer Syllable.
2153//
2154// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2155// CharClassValues. This coincidence of values allows the follow up of the table.
2156//
2157// Each line corresponds to a state, which does not necessarily need to be a type
2158// of component... for example, state 2 is a base, with is always a first character
2159// in the syllable, but the state could be produced a consonant of any type when
2160// it is the first character that is analysed (in ground state).
2161//
2162// Differentiating 3 types of consonants is necessary in order to
2163// forbid the use of certain combinations, such as having a second
2164// coeng after a coeng RO,
2165// The inexistent possibility of having a type 3 after another type 3 is permitted,
2166// eliminating it would very much complicate the table, and it does not create typing
2167// problems, as the case above.
2168//
2169// The table is quite complex, in order to limit the number of coeng consonants
2170// to 2 (by means of the table).
2171//
2172// There a peculiarity, as far as Unicode is concerned:
2173// - The consonant-shifter is considered in two possible different
2174// locations, the one considered in Unicode 3.0 and the one considered in
2175// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2176//
2177//
2178// xx independent character, such as a number, punctuation sign or non-khmer char
2179//
2180// c1 Khmer consonant of type 1 or an independent vowel
2181// that is, a letter in which the subscript for is only under the
2182// base, not taking any space to the right or to the left
2183//
2184// c2 Khmer consonant of type 2, the coeng form takes space under
2185// and to the left of the base (only RO is of this type)
2186//
2187// c3 Khmer consonant of type 3. Its subscript form takes space under
2188// and to the right of the base.
2189//
2190// cs Khmer consonant shifter
2191//
2192// rb Khmer robat
2193//
2194// co coeng character (u17D2)
2195//
2196// dv dependent vowel (including split vowels, they are treated in the same way).
2197// even if dv is not defined above, the component that is really tested for is
2198// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2199//
2200// zwj Zero Width joiner
2201//
2202// zwnj Zero width non joiner
2203//
2204// sa above sign
2205//
2206// sp post sign
2207//
2208// there are lines with equal content but for an easier understanding
2209// (and maybe change in the future) we did not join them
2210*/
2211static const signed char khmerStateTable[][CC_COUNT] =
2212{
2213 /* xx c1 c2 c3 zwnj cs rb co dv sa sp zwj */
2214 { 1, 2, 2, 2, 1, 1, 1, 6, 1, 1, 1, 2}, /* 0 - ground state */
2215 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 1 - exit state (or sign to the right of the syllable) */
2216 {-1, -1, -1, -1, 3, 4, 5, 6, 16, 17, 1, -1}, /* 2 - Base consonant */
2217 {-1, -1, -1, -1, -1, 4, -1, -1, 16, -1, -1, -1}, /* 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel */
2218 {-1, -1, -1, -1, 15, -1, -1, 6, 16, 17, 1, 14}, /* 4 - First register shifter */
2219 {-1, -1, -1, -1, -1, -1, -1, -1, 20, -1, 1, -1}, /* 5 - Robat */
2220 {-1, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, -1}, /* 6 - First Coeng */
2221 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 7 - First consonant of type 1 after coeng */
2222 {-1, -1, -1, -1, 12, 13, -1, -1, 16, 17, 1, 14}, /* 8 - First consonant of type 2 after coeng */
2223 {-1, -1, -1, -1, 12, 13, -1, 10, 16, 17, 1, 14}, /* 9 - First consonant or type 3 after ceong */
2224 {-1, 11, 11, 11, -1, -1, -1, -1, -1, -1, -1, -1}, /* 10 - Second Coeng (no register shifter before) */
2225 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 11 - Second coeng consonant (or ind. vowel) no register shifter before */
2226 {-1, -1, -1, -1, -1, 13, -1, -1, 16, -1, -1, -1}, /* 12 - Second ZWNJ before a register shifter */
2227 {-1, -1, -1, -1, 15, -1, -1, -1, 16, 17, 1, 14}, /* 13 - Second register shifter */
2228 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 14 - ZWJ before vowel */
2229 {-1, -1, -1, -1, -1, -1, -1, -1, 16, -1, -1, -1}, /* 15 - ZWNJ before vowel */
2230 {-1, -1, -1, -1, -1, -1, -1, -1, -1, 17, 1, 18}, /* 16 - dependent vowel */
2231 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 18}, /* 17 - sign above */
2232 {-1, -1, -1, -1, -1, -1, -1, 19, -1, -1, -1, -1}, /* 18 - ZWJ after vowel */
2233 {-1, 1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1}, /* 19 - Third coeng */
2234 {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1}, /* 20 - dependent vowel after a Robat */
2235};
2236
2237
/*
    Tracing for the Khmer syllable scanner. Define KHMER_DEBUG to route
    KHDEBUG(...) to qDebug; otherwise the call sits behind "if (0)", so it is
    still compiled but never executed and its arguments are never evaluated.
*/
/* #define KHMER_DEBUG */
#ifndef KHMER_DEBUG
#  define KHDEBUG if (0) printf
#else
#  define KHDEBUG qDebug
#endif
2246
2247/*
2248// Given an input string of characters and a location in which to start looking
2249// calculate, using the state table, which one is the last character of the syllable
2250// that starts in the starting position.
2251*/
2252static qsizetype khmer_nextSyllableBoundary(const char16_t *s, qsizetype start, qsizetype end, bool *invalid)
2253{
2254 const char16_t *uc = s + start;
2255 int state = 0;
2256 qsizetype pos = start;
2257 *invalid = false;
2258
2259 while (pos < end) {
2260 KhmerCharClass charClass = getKhmerCharClass(uc: *uc);
2261 if (pos == start) {
2262 *invalid = (charClass > 0) && ! (charClass & CF_CONSONANT);
2263 }
2264 state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2265
2266 KHDEBUG(format: "state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2267 charClass, *uc );
2268
2269 if (state < 0) {
2270 break;
2271 }
2272 ++uc;
2273 ++pos;
2274 }
2275 return pos;
2276}
2277
2278static void khmerAttributes(QChar::Script script, const char16_t *text, qsizetype from, qsizetype len, QCharAttributes *attributes)
2279{
2280 qsizetype end = from + len;
2281 qsizetype i = 0;
2282 Q_UNUSED(script);
2283 attributes += from;
2284 while ( i < len ) {
2285 bool invalid;
2286 qsizetype boundary = khmer_nextSyllableBoundary( s: text, start: from+i, end, invalid: &invalid ) - from;
2287
2288 attributes[i].graphemeBoundary = true;
2289
2290 if ( boundary > len-1 ) boundary = len;
2291 i++;
2292 while ( i < boundary ) {
2293 attributes[i].graphemeBoundary = false;
2294 ++i;
2295 }
2296 assert( i == boundary );
2297 }
2298}
2299
2300
2301const CharAttributeFunction charAttributeFunction[] = {
2302// Script_Unknown,
2303 nullptr,
2304// Script_Inherited,
2305 nullptr,
2306// Script_Common,
2307 nullptr,
2308// Script_Latin,
2309 nullptr,
2310// Script_Greek,
2311 nullptr,
2312// Script_Cyrillic,
2313 nullptr,
2314// Script_Armenian,
2315 nullptr,
2316// Script_Hebrew,
2317 nullptr,
2318// Script_Arabic,
2319 nullptr,
2320// Script_Syriac,
2321 nullptr,
2322// Script_Thaana,
2323 nullptr,
2324// Script_Devanagari,
2325 indicAttributes,
2326// Script_Bengali,
2327 indicAttributes,
2328// Script_Gurmukhi,
2329 indicAttributes,
2330// Script_Gujarati,
2331 indicAttributes,
2332// Script_Oriya,
2333 indicAttributes,
2334// Script_Tamil,
2335 indicAttributes,
2336// Script_Telugu,
2337 indicAttributes,
2338// Script_Kannada,
2339 indicAttributes,
2340// Script_Malayalam,
2341 indicAttributes,
2342// Script_Sinhala,
2343 indicAttributes,
2344// Script_Thai,
2345 thaiAttributes,
2346// Script_Lao,
2347 nullptr,
2348// Script_Tibetan,
2349 tibetanAttributes,
2350// Script_Myanmar,
2351 myanmarAttributes,
2352// Script_Georgian,
2353 nullptr,
2354// Script_Hangul,
2355 nullptr,
2356// Script_Ethiopic,
2357 nullptr,
2358// Script_Cherokee,
2359 nullptr,
2360// Script_CanadianAboriginal,
2361 nullptr,
2362// Script_Ogham,
2363 nullptr,
2364// Script_Runic,
2365 nullptr,
2366// Script_Khmer,
2367 khmerAttributes
2368};
2369
2370static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2371 const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2372 QCharAttributes *attributes)
2373{
2374 if (stringLength == 0)
2375 return;
2376 for (qsizetype i = 0; i < numItems; ++i) {
2377 QChar::Script script = items[i].script;
2378 if (script > QChar::Script_Khmer)
2379 script = QChar::Script_Common;
2380 CharAttributeFunction attributeFunction = charAttributeFunction[script];
2381 if (!attributeFunction)
2382 continue;
2383 qsizetype end = i < numItems - 1 ? items[i + 1].position : stringLength;
2384 attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2385 }
2386}
2387
2388}
2389
2390Q_CORE_EXPORT void initCharAttributes(QStringView string,
2391 const ScriptItem *items, qsizetype numItems,
2392 QCharAttributes *attributes, CharAttributeOptions options)
2393{
2394 if (string.size() <= 0)
2395 return;
2396
2397 if (!(options & DontClearAttributes))
2398 ::memset(s: attributes, c: 0, n: (string.size() + 1) * sizeof(QCharAttributes));
2399
2400 if (options & GraphemeBreaks)
2401 getGraphemeBreaks(string: string.utf16(), len: string.size(), attributes);
2402 if (options & WordBreaks)
2403 getWordBreaks(string: string.utf16(), len: string.size(), attributes);
2404 if (options & SentenceBreaks)
2405 getSentenceBreaks(string: string.utf16(), len: string.size(), attributes);
2406 if (options & LineBreaks)
2407 getLineBreaks(string: string.utf16(), len: string.size(), attributes, options);
2408 if (options & WhiteSpaces)
2409 getWhiteSpaces(string: string.utf16(), len: string.size(), attributes);
2410
2411 if (!qt_initcharattributes_default_algorithm_only) {
2412 if (!items || numItems <= 0)
2413 return;
2414
2415 Tailored::getCharAttributes(string: string.utf16(), stringLength: string.size(), items, numItems, attributes);
2416 }
2417}
2418
2419
2420// ----------------------------------------------------------------------------
2421//
2422// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2423//
2424// ----------------------------------------------------------------------------
2425
2426Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2427{
2428 qsizetype sor = 0;
2429 qsizetype eor = 0;
2430 QChar::Script script = QChar::Script_Common;
2431
2432 for (qsizetype i = 0; i < string.size(); ++i, eor = i) {
2433 char32_t ucs4 = string[i].unicode();
2434 if (QChar::isHighSurrogate(ucs4) && i + 1 < string.size()) {
2435 ushort low = string[i + 1].unicode();
2436 if (QChar::isLowSurrogate(ucs4: low)) {
2437 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
2438 ++i;
2439 }
2440 }
2441
2442 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2443
2444 QChar::Script nscript = QChar::Script(prop->script);
2445
2446 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
2447 continue;
2448
2449 // inherit preceding Common-s
2450 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2451 // also covers a case where the base character of Common script followed
2452 // by one or more combining marks of non-Inherited, non-Common script
2453 script = nscript;
2454 continue;
2455 }
2456
2457 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2458 // Thus, a combining mark - whatever its script property value is - should inherit
2459 // the script property value of its base character.
2460 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
2461 if (Q_UNLIKELY(FLAG(prop->category) & test))
2462 continue;
2463
2464 Q_ASSERT(script > QChar::Script_Common);
2465 Q_ASSERT(sor < eor);
2466 scripts->append(t: ScriptItem{.position: sor, .script: script});
2467 sor = eor;
2468
2469 script = nscript;
2470 }
2471
2472 Q_ASSERT(script >= QChar::Script_Common);
2473 Q_ASSERT(eor == string.size());
2474 scripts->append(t: ScriptItem{.position: sor, .script: script});
2475}
2476
2477} // namespace QUnicodeTools
2478
2479QT_END_NAMESPACE
2480

source code of qtbase/src/corelib/text/qunicodetools.cpp