qunicodetools.cpp source code [qtbase/src/corelib/text/qunicodetools.cpp]

1	// Copyright (C) 2020 The Qt Company Ltd.
2	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4	#include "qunicodetools_p.h"
5
6	#include "qunicodetables_p.h"
7	#include "qvarlengtharray.h"
8	#if QT_CONFIG(library)
9	#include "qlibrary.h"
10	#endif
11
12	#include <limits.h>
13
// FLAG(x) expands to an int value with only bit number x set. It is used in
// this file to build and to query per-class bit masks.
#define FLAG(x) (1 << (x))
15
16	QT_BEGIN_NAMESPACE
17
18	using namespace Qt::StringLiterals;
19
// Integer flag, 0 by default. getWordBreaks() tests it and, when it is
// non-zero, gives FULL STOP (U+002E) and COLON (U+003A) the MidNumLet and
// MidLetter word break classes instead of the classes stored in the tables.
// In QT_BUILD_INTERNAL builds this is an exported (Q_AUTOTEST_EXPORT)
// variable; in all other builds it is a compile-time constant.
#ifdef QT_BUILD_INTERNAL
Q_CONSTINIT Q_AUTOTEST_EXPORT
#else
constexpr
#endif
int qt_initcharattributes_default_algorithm_only = 0;
26
27	namespace QUnicodeTools {
28
29	// -----------------------------------------------------------------------------------------------------
30	//
31	// The text boundaries determination algorithm.
32	// See https://www.unicode.org/reports/tr29/tr29-37.html
33	//
34	// -----------------------------------------------------------------------------------------------------
35
36	namespace GB {
37
38	// This table is indexed by the grapheme break classes of two
39	// (adjacent) code points.
40	// The class of the first code point selects an entry.
41	// If the entry's bit at position second_cp_class is set
42	// (in other words: if entry & (1u << second_cp_class) is non-zero)
43	// then there is NO grapheme break between the two code points.
44
45	using GBTableEntryType = quint16;
46
47	// Check that we have enough bits in the table (in case
48	// NumGraphemeBreakClasses grows too much).
49	static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
50	"Internal error: increase the size in bits of GBTableEntryType");
51
52	// GB9, GB9a
53	static const GBTableEntryType Extend_SpacingMark_ZWJ =
54	FLAG(QUnicodeTables::GraphemeBreak_Extend)
55	\| FLAG(QUnicodeTables::GraphemeBreak_SpacingMark)
56	\| FLAG(QUnicodeTables::GraphemeBreak_ZWJ);
57
58	static const GBTableEntryType HardBreak = `0u`;
59
60	static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses] = {
61	Extend_SpacingMark_ZWJ, // Any
62	FLAG(QUnicodeTables::GraphemeBreak_LF), // CR
63	HardBreak, // LF
64	HardBreak, // Control
65	Extend_SpacingMark_ZWJ, // Extend
66	Extend_SpacingMark_ZWJ, // ZWJ
67	Extend_SpacingMark_ZWJ, // RegionalIndicator
68	(Extend_SpacingMark_ZWJ
69	\| FLAG(QUnicodeTables::GraphemeBreak_Any)
70	\| FLAG(QUnicodeTables::GraphemeBreak_Prepend)
71	\| FLAG(QUnicodeTables::GraphemeBreak_L)
72	\| FLAG(QUnicodeTables::GraphemeBreak_V)
73	\| FLAG(QUnicodeTables::GraphemeBreak_T)
74	\| FLAG(QUnicodeTables::GraphemeBreak_LV)
75	\| FLAG(QUnicodeTables::GraphemeBreak_LVT)
76	\| FLAG(QUnicodeTables::GraphemeBreak_RegionalIndicator)
77	\| FLAG(QUnicodeTables::GraphemeBreak_Extended_Pictographic)
78	), // Prepend
79	Extend_SpacingMark_ZWJ, // SpacingMark
80	(Extend_SpacingMark_ZWJ
81	\| FLAG(QUnicodeTables::GraphemeBreak_L)
82	\| FLAG(QUnicodeTables::GraphemeBreak_V)
83	\| FLAG(QUnicodeTables::GraphemeBreak_LV)
84	\| FLAG(QUnicodeTables::GraphemeBreak_LVT)
85	), // L
86	(Extend_SpacingMark_ZWJ
87	\| FLAG(QUnicodeTables::GraphemeBreak_V)
88	\| FLAG(QUnicodeTables::GraphemeBreak_T)
89	), // V
90	(Extend_SpacingMark_ZWJ
91	\| FLAG(QUnicodeTables::GraphemeBreak_T)
92	), // T
93	(Extend_SpacingMark_ZWJ
94	\| FLAG(QUnicodeTables::GraphemeBreak_V)
95	\| FLAG(QUnicodeTables::GraphemeBreak_T)
96	), // LV
97	(Extend_SpacingMark_ZWJ
98	\| FLAG(QUnicodeTables::GraphemeBreak_T)
99	), // LVT
100	Extend_SpacingMark_ZWJ // Extended_Pictographic
101	};
102
103	static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
104	QUnicodeTables::GraphemeBreakClass second)
105	{
106	return (breakTable[first] & FLAG(second)) == `0`;
107	}
108
109	// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
110	// so we need to store some local state.
111	enum class State : uchar {
112	Normal,
113	GB11_ExtPicExt, // saw a Extend after a Extended_Pictographic
114	GB11_ExtPicExtZWJ, // saw a ZWG after a Extended_Pictographic and zero or more Extend
115	GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
116	};
117
118	} // namespace GB
119
120	static void getGraphemeBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
121	{
122	QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
123	GB::State state = GB::State::Normal;
124	for (qsizetype i = `0`; i != len; ++i) {
125	qsizetype pos = i;
126	char32_t ucs4 = string[i];
127	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
128	ushort low = string[i + `1`];
129	if (QChar::isLowSurrogate(ucs4: low)) {
130	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
131	++i;
132	}
133	}
134
135	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
136	QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
137
138	bool shouldBreak = GB::shouldBreakBetweenClasses(first: lcls, second: cls);
139	bool handled = false;
140
141	switch (state) {
142	case GB::State::Normal:
143	break; // will deal with it below
144
145	case GB::State::GB11_ExtPicExt:
146	Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
147	if (cls == QUnicodeTables::GraphemeBreak_Extend) {
148	// keep going in the current state
149	Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
150	handled = true;
151	} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
152	state = GB::State::GB11_ExtPicExtZWJ;
153	Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
154	handled = true;
155	} else {
156	state = GB::State::Normal;
157	}
158	break;
159
160	case GB::State::GB11_ExtPicExtZWJ:
161	Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
162	if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
163	shouldBreak = false;
164	handled = true;
165	}
166
167	state = GB::State::Normal;
168	break;
169
170	case GB::State::GB12_13_RI:
171	Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
172	if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
173	shouldBreak = false;
174	handled = true;
175	}
176
177	state = GB::State::Normal;
178	break;
179	}
180
181	if (!handled) {
182	Q_ASSERT(state == GB::State::Normal);
183	if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
184	if (cls == QUnicodeTables::GraphemeBreak_Extend) {
185	state = GB::State::GB11_ExtPicExt;
186	Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
187	} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
188	state = GB::State::GB11_ExtPicExtZWJ;
189	Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
190	}
191	} else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
192	state = GB::State::GB12_13_RI;
193	}
194	}
195
196	if (shouldBreak)
197	attributes[pos].graphemeBoundary = true;
198
199	lcls = cls;
200	}
201
202	attributes[len].graphemeBoundary = true; // GB2
203	}
204
205
206	namespace WB {
207
208	enum Action {
209	NoBreak,
210	Break,
211	Lookup,
212	LookupW
213	};
214
215	static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
216	// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet WSeg
217	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
218	{ Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
219	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
220	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
221	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
222	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // ZWJ
223	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
224	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
225	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break }, // Katakana
226	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // HebrewLetter
227	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break }, // ALetter
228	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
229	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
230	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
231	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
232	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
233	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break }, // Numeric
234	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ExtendNumLet
235	{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak }, // WSegSpace
236	};
237
238	} // namespace WB
239
240	static void getWordBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
241	{
242	enum WordType {
243	WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
244	} currentWordType = WordTypeNone;
245
246	QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
247	auto real_cls = cls; // Unaffected by WB4
248
249	for (qsizetype i = `0`; i != len; ++i) {
250	qsizetype pos = i;
251	char32_t ucs4 = string[i];
252	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
253	ushort low = string[i + `1`];
254	if (QChar::isLowSurrogate(ucs4: low)) {
255	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
256	++i;
257	}
258	}
259
260	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
261	QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
262	if (qt_initcharattributes_default_algorithm_only) {
263	// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
264	// which caused "hi.there" to be treated like if it were just a single word;
265	// we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
266	// and this code is needed to pass the coverage tests; remove once the issue is fixed.
267	if (ucs4 == `0x002E`) // FULL STOP
268	ncls = QUnicodeTables::WordBreak_MidNumLet;
269	else if (ucs4 == `0x003A`) // COLON
270	ncls = QUnicodeTables::WordBreak_MidLetter;
271	}
272
273	uchar action = WB::breakTable[cls][ncls];
274	switch (action) {
275	case WB::Break:
276	if (Q_UNLIKELY(real_cls == QUnicodeTables::WordBreak_ZWJ
277	&& prop->graphemeBreakClass
278	== QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
279	// WB3c: ZWJ × \p{Extended_Pictographic}
280	action = WB::NoBreak;
281	}
282	break;
283	case WB::NoBreak:
284	if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend \|\| ncls == QUnicodeTables::WordBreak_ZWJ \|\| ncls == QUnicodeTables::WordBreak_Format)) {
285	// WB4: X(Extend\|Format) -> X*
286	real_cls = ncls;
287	continue;
288	}
289	if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
290	// WB15/WB16: break between pairs of Regional indicator
291	ncls = QUnicodeTables::WordBreak_Any;
292	}
293	if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_WSegSpace
294	&& real_cls != QUnicodeTables::WordBreak_WSegSpace)) {
295	// WB3d should not be affected by WB4
296	action = WB::Break;
297	}
298	break;
299	case WB::Lookup:
300	case WB::LookupW:
301	for (qsizetype lookahead = i + `1`; lookahead < len; ++lookahead) {
302	ucs4 = string[lookahead];
303	if (QChar::isHighSurrogate(ucs4) && lookahead + `1` != len) {
304	ushort low = string[lookahead + `1`];
305	if (QChar::isLowSurrogate(ucs4: low)) {
306	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
307	++lookahead;
308	}
309	}
310
311	prop = QUnicodeTables::properties(ucs4);
312	QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
313
314	if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend \|\| tcls == QUnicodeTables::WordBreak_ZWJ \|\| tcls == QUnicodeTables::WordBreak_Format)) {
315	// WB4: X(Extend\|Format) -> X*
316	continue;
317	}
318
319	if (Q_LIKELY(tcls == cls \|\| (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
320	\|\| tcls == QUnicodeTables::WordBreak_ALetter)))) {
321	i = lookahead;
322	ncls = tcls;
323	action = WB::NoBreak;
324	}
325	break;
326	}
327	if (action != WB::NoBreak) {
328	action = WB::Break;
329	if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
330	action = WB::NoBreak; // WB7a
331	}
332	break;
333	}
334
335	cls = ncls;
336	real_cls = ncls;
337
338	if (action == WB::Break) {
339	attributes[pos].wordBreak = true;
340	if (currentWordType != WordTypeNone)
341	attributes[pos].wordEnd = true;
342	switch (cls) {
343	case QUnicodeTables::WordBreak_Katakana:
344	currentWordType = WordTypeHiraganaKatakana;
345	attributes[pos].wordStart = true;
346	break;
347	case QUnicodeTables::WordBreak_HebrewLetter:
348	case QUnicodeTables::WordBreak_ALetter:
349	case QUnicodeTables::WordBreak_Numeric:
350	currentWordType = WordTypeAlphaNumeric;
351	attributes[pos].wordStart = true;
352	break;
353	default:
354	currentWordType = WordTypeNone;
355	break;
356	}
357	}
358	}
359
360	if (currentWordType != WordTypeNone)
361	attributes[len].wordEnd = true;
362	attributes[len].wordBreak = true; // WB2
363	}
364
365
366	namespace SB {
367
368	enum State {
369	Initial,
370	Lower,
371	Upper,
372	LUATerm,
373	ATerm,
374	ATermC,
375	ACS,
376	STerm,
377	STermC,
378	SCS,
379	BAfterC,
380	BAfter,
381	Break,
382	Lookup
383	};
384
385	static const uchar breakTable[BAfter + `1`][QUnicodeTables::NumSentenceBreakClasses] = {
386	// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
387	{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
388	{ Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
389	{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, Initial, STerm , Initial }, // Upper
390
391	{ Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
392	{ Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
393	{ Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
394	{ Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
395
396	{ Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
397	{ Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
398	{ Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
399	{ Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
400	{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
401	};
402
403	} // namespace SB
404
405	static void getSentenceBreaks(const char16_t string, qsizetype len, QCharAttributes attributes)
406	{
407	uchar state = SB::BAfter; // to meet SB1
408	for (qsizetype i = `0`; i != len; ++i) {
409	qsizetype pos = i;
410	char32_t ucs4 = string[i];
411	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
412	ushort low = string[i + `1`];
413	if (QChar::isLowSurrogate(ucs4: low)) {
414	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
415	++i;
416	}
417	}
418
419	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
420	QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
421
422	Q_ASSERT(state <= SB::BAfter);
423	state = SB::breakTable[state][ncls];
424	if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
425	state = SB::Break;
426	for (qsizetype lookahead = i + `1`; lookahead < len; ++lookahead) {
427	ucs4 = string[lookahead];
428	if (QChar::isHighSurrogate(ucs4) && lookahead + `1` != len) {
429	ushort low = string[lookahead + `1`];
430	if (QChar::isLowSurrogate(ucs4: low)) {
431	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
432	++lookahead;
433	}
434	}
435
436	prop = QUnicodeTables::properties(ucs4);
437	QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
438	switch (tcls) {
439	case QUnicodeTables::SentenceBreak_Any:
440	case QUnicodeTables::SentenceBreak_Extend:
441	case QUnicodeTables::SentenceBreak_Sp:
442	case QUnicodeTables::SentenceBreak_Numeric:
443	case QUnicodeTables::SentenceBreak_SContinue:
444	case QUnicodeTables::SentenceBreak_Close:
445	continue;
446	case QUnicodeTables::SentenceBreak_Lower:
447	i = lookahead;
448	state = SB::Initial;
449	break;
450	default:
451	break;
452	}
453	break;
454	}
455	}
456	if (Q_UNLIKELY(state == SB::Break)) {
457	attributes[pos].sentenceBoundary = true;
458	state = SB::breakTable[SB::Initial][ncls];
459	}
460	}
461
462	attributes[len].sentenceBoundary = true; // SB2
463	}
464
465
466	// -----------------------------------------------------------------------------------------------------
467	//
468	// The line breaking algorithm.
469	// See http://www.unicode.org/reports/tr14/tr14-39.html
470	//
471	// -----------------------------------------------------------------------------------------------------
472
473	namespace LB {
474
475	namespace NS { // Number Sequence
476
477	// This namespace is used to implement LB25 which, as of Unicode 16, has this
478	// definition:
479	// NU ( SY \| IS ) CL × PO*
480	// NU ( SY \| IS ) CP × PO*
481	// NU ( SY \| IS ) CL × PR*
482	// NU ( SY \| IS ) CP × PR*
483	// NU ( SY \| IS ) × PO*
484	// NU ( SY \| IS ) × PR*
485	// PO × OP NU
486	// PO × OP IS NU
487	// PO × NU
488	// PR × OP NU
489	// PR × OP IS NU
490	// PR × NU
491	// HY × NU
492	// IS × NU
493	// NU ( SY \| IS ) × NU*
494
495	enum Action {
496	None,
497	Start,
498	Continue,
499	Break,
500	NeedOPNU, // Like Start, but must be followed by sequence `(OP (IS)?)? NU`
501	// These are 'synthetic' actions and are not used in the table but are
502	// tracked otherwise in the code for LB25, to track the state of specific
503	// sequences:
504	CNeedNU, // Like Continue, but must be followed by NU
505	CNeedISNU, // Like Continue, but must be followed by IS? NU
506	};
507
508	enum Class {
509	XX,
510	PRPO,
511	OP,
512	HY,
513	NU,
514	SY,
515	IS,
516	CLCP
517	};
518
519	static const uchar actionTable[CLCP + `1`][CLCP + `1`] = {
520	// XX PRPO OP HY NU SY IS CLCP
521	{ None , NeedOPNU, Start , None , Start , None , None , None }, // XX
522	{ None , NeedOPNU, Continue, Break , Start , None , None , None }, // PRPO
523	{ None , Start , Start , Break , Continue, None , Continue, None }, // OP
524	{ None , None , None , Start , Continue, None , None , None }, // HY
525	{ Break , Break , Break , Break , Continue, Continue, Continue, Continue }, // NU
526	{ Break , Break , Break , Break , Continue, Continue, Continue, Continue }, // SY
527	{ Break , Break , Break , Break , Continue, Continue, Continue, Continue }, // IS
528	{ Break , Continue, Break , Break , Break , Break , Break , Break }, // CLCP
529	};
530
531	inline Class toClass(QUnicodeTables::LineBreakClass lbc)
532	{
533	switch (lbc) {
534	case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
535	return PRPO;
536	case QUnicodeTables::LineBreak_OP:
537	return OP;
538	case QUnicodeTables::LineBreak_HY:
539	return HY;
540	case QUnicodeTables::LineBreak_NU:
541	return NU;
542	case QUnicodeTables::LineBreak_SY:
543	return SY;
544	case QUnicodeTables::LineBreak_IS:
545	return IS;
546	case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
547	return CLCP;
548	default:
549	break;
550	}
551	return XX;
552	}
553
554	} // namespace NS
555
556	namespace BRS { // Brahmic Sequence, used to implement LB28a
557	constexpr char32_t DottedCircle = U`'\u25CC'`;
558
559	// The LB28a_{n} value maps to the 'regex' on the nth line in LB28a
560	// The only special case is LB28a_2VI which is a direct match to the 2nd
561	// line, but it also leads to LB28a_3VIAK, the 3rd line.
562	enum State {
563	None,
564	Start, // => Have: `(AK \| [◌] \| AS)`
565	LB28a_2VF, // => Have: `(AK \| [◌] \| AS) VF`
566	LB28a_2VI, // => Have: `(AK \| [◌] \| AS) VI` May find: `(AK \| [◌])`
567	LB28a_3VIAK, // => Have: `(AK \| [◌] \| AS) VI (AK \| [◌])`
568	LB28a_4, // => Have: `(AK \| [◌] \| AS) (AK \| [◌] \| AS)` May find: `VF`
569	LB28a_4VF, // => Have: `(AK \| [◌] \| AS) (AK \| [◌] \| AS) VF`
570	Restart,
571	};
572	struct LinebreakUnit {
573	QUnicodeTables::LineBreakClass lbc;
574	char32_t ucs4;
575	};
576	struct ParseState {
577	State state = None;
578	qsizetype start = `0`;
579	};
580	State updateState(State state, LinebreakUnit lb)
581	{
582	using LBC = QUnicodeTables::LineBreakClass;
583	if (lb.lbc == LBC::LineBreak_CM)
584	return state;
585
586	switch (state) {
587	case Start:
588	if (lb.lbc == LBC::LineBreak_VF)
589	return LB28a_2VF;
590	if (lb.lbc == LBC::LineBreak_VI)
591	return LB28a_2VI;
592	if (lb.ucs4 == DottedCircle \|\| lb.lbc == LBC::LineBreak_AK
593	\|\| lb.lbc == LBC::LineBreak_AS)
594	return LB28a_4;
595	break;
596	case LB28a_2VI:
597	if (lb.ucs4 == DottedCircle \|\| lb.lbc == LBC::LineBreak_AK)
598	return LB28a_3VIAK;
599	break;
600	case LB28a_4:
601	if (lb.lbc == LBC::LineBreak_VF)
602	return LB28a_4VF;
603	// Had (AK \| [◌] \| AS) (AK \| [◌] \| AS), which could mean the 2nd capture is the start
604	// of a new sequence, so we need to check if it makes sense.
605	return Restart;
606	case None:
607	if (Q_UNLIKELY(lb.ucs4 == DottedCircle \|\| lb.lbc == LBC::LineBreak_AK
608	\|\| lb.lbc == LBC::LineBreak_AS)) {
609	return Start;
610	}
611	break;
612	case LB28a_2VF:
613	case LB28a_4VF:
614	case LB28a_3VIAK:
615	case Restart:
616	// These are all terminal states, so no need to update
617	Q_UNREACHABLE();
618	}
619	return None;
620	}
621	}
622
// Values stored in the line break pair table (breakTable). Each enumerator
// has a two-letter alias so that the table rows stay compact.
enum Action {
    ProhibitedBreak, PB = ProhibitedBreak,
    DirectBreak, DB = DirectBreak,
    IndirectBreak, IB = IndirectBreak,
    CombiningIndirectBreak, CI = CombiningIndirectBreak,
    CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
    ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen,
    IndirectBreakIfNarrow, IN = IndirectBreakIfNarrow, // For LB30
    DirectBreakOutsideNumericSequence, DN = DirectBreakOutsideNumericSequence, // For LB25
};
633
634	// See https://www.unicode.org/reports/tr14/tr14-37.html for the information
635	// about the table. It was removed in the later versions of the standard.
636	static const uchar breakTable[QUnicodeTables::LineBreak_ZWJ][QUnicodeTables::LineBreak_ZWJ] = {
637	/ 1↓ 2→ OP CL CP QU +Pi +Pf +19 GL NS EX SY IS PR PO NU AL HL ID IN HY +WS BA +WS HYBA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM AK AP AS VI VF/
638	/ OP / { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
639	/ CL / { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
640	/ CP / { DB, PB, PB, IB, IB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
641	/ QU / { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
642	/ +Pi/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
643	/ +Pf/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
644	/ +19/ { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
645	/ GL / { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
646	/ NS / { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
647	/ EX / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
648	/ SY / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
649	/ IS / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DN, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
650	/ PR / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, DB, DB, DB },
651	/ PO / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
652	/ NU / { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
653	/ AL / { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
654	/ HL / { IN, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, CI, CI, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
655	/ ID / { DB, PB, PB, DB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
656	/ IN / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
657	/ HY / { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
658	/ +WS/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
659	/ BA / { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
660	/ +WS/ { HH, PB, PB, IB, IB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, IB, HH, HH, IB, IB, IB, IB, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, HH, HH, HH, HH, HH },
661	/HYBA/ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, DB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
662	/ BB / { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, IB },
663	/ B2 / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
664	/ ZW / { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
665	/ CM / { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
666	/ WJ / { IB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
667	/ H2 / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
668	/ H3 / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
669	/ JL / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
670	/ JV / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
671	/ JT / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
672	/ RI / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB, DB, DB, DB },
673	/ CB / { DB, PB, PB, IB, IB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
674	/ EB / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, DB, DB, DB, DB },
675	/ EM / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
676	/ AK / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
677	/ AP / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, DB, IB, DB, DB },
678	/ AS / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
679	/ VI / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
680	/ VF / { DB, PB, PB, IB, IB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, IB, IB, IB, DB, DB, PB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
681	};
682
683	// The following line break classes are not treated by the pair table
684	// and must be resolved outside:
685	// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX, ZWJ
686
687	} // namespace LB
688
// Computes line-break opportunities for a run of UTF-16 text, following the
// rule numbers (LBnn) cited in the comments below.
//
// string     - UTF-16 code units; surrogate pairs are combined into one code
//              point before classification.
// len        - number of code units in string.
// attributes - per-code-unit records; lineBreak / mandatoryBreak are written
//              here. Index len is written at the end (LB3), so the array must
//              provide at least len + 1 entries.
// options    - only HangulLineBreakTailoring is consulted in this function.
//
// A break flag at index p means "a break is allowed (or required) before the
// code point that starts at p". For surrogate pairs p is the index of the
// high surrogate.
//
// NOTE(review): as listed, the signature declares `string` and `attributes`
// without pointer declarators although the body indexes both; presumably
// `const char16_t *string` and `QCharAttributes *attributes` - confirm
// against the upstream file.
689	static void getLineBreaks(const char16_t string, qsizetype len, QCharAttributes attributes, QUnicodeTools::CharAttributeOptions options)
690	{
// LB25 tracking: nestart is the index at which the current candidate numeric
// sequence started, nelast its last numeric-sequence class and neactlast the
// action chosen for the previous code point.
691	qsizetype nestart = `0`;
692	LB::NS::Class nelast = LB::NS::XX;
693	LB::NS::Action neactlast = LB::NS::None;
694
// LB28a tracking (state and start index of the current candidate sequence).
695	LB::BRS::ParseState brsState;
696
// lcls is the class assigned to the immediately preceding code point.
// cls / lastProp are only refreshed at the `next:` label and only when the
// current class is neither CM nor ZWJ, so they describe the last "base"
// code point. All three start out as if a line feed preceded the text.
697	QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
698	QUnicodeTables::LineBreakClass cls = lcls;
699	const QUnicodeTables::Properties *lastProp = QUnicodeTables::properties(ucs4: U`'\n'`);
700
701	constexpr static auto isEastAsian = [](QUnicodeTables::EastAsianWidth eaw) {
702	using EAW = QUnicodeTables::EastAsianWidth;
703	return eaw == EAW::W \|\| eaw == EAW::F \|\| eaw == EAW::H;
704	};
705
706	for (qsizetype i = `0`; i != len; ++i) {
// pos keeps the index of the first code unit of this code point; i itself is
// advanced onto the low surrogate when a valid pair is found.
707	qsizetype pos = i;
708	char32_t ucs4 = string[i];
709	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
710	ushort low = string[i + `1`];
711	if (QChar::isLowSurrogate(ucs4: low)) {
712	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
713	++i;
714	}
715	}
716
717	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
718	QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
719	QUnicodeTables::LineBreakClass tcls;
720
721	if (options & QUnicodeTools::HangulLineBreakTailoring) {
722	if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
723	&& ncls <= QUnicodeTables::LineBreak_JT)
724	\|\| (ucs4 >= `0x3130` && ucs4 <= `0x318F` && ncls == QUnicodeTables::LineBreak_ID))
725	) {
726	// LB27: use SPACE for line breaking
727	// "When Korean uses SPACE for line breaking, the classes in rule LB26,
728	// as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
729	// In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
730	ncls = QUnicodeTables::LineBreak_AL;
// NOTE(review): the else-branch below performs exactly the same SA -> CM
// check as the unconditional block that follows this `if`; it looks
// redundant and could probably be dropped without changing results.
731	} else {
732	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
733	// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
734	static const int test = FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining);
735	if (FLAG(prop->category) & test)
736	ncls = QUnicodeTables::LineBreak_CM;
737	}
738	}
739	}
740
741	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
742	// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
743	static const int test = FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining);
744	if (FLAG(prop->category) & test)
745	ncls = QUnicodeTables::LineBreak_CM;
746	}
747
748	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU)) {
749	if (prop->category == QChar::Punctuation_InitialQuote) {
750	// LB15a: Do not break after an unresolved initial punctuation
751	// that lies at the start of the line, after a space, after
752	// opening punctuation, or after an unresolved quotation mark,
753	// even after spaces.
754	// (sot \| BK \| CR \| LF \| NL \| OP \| QU \| GL \| SP \| ZW)
755	// [\p{Pi}&QU] SP ×*
756	// Note: sot is treated as LF here due to initial loop setup.
757	constexpr QUnicodeTables::LineBreakClass lb15a[] = {
758	QUnicodeTables::LineBreak_BK, QUnicodeTables::LineBreak_CR,
759	QUnicodeTables::LineBreak_LF, QUnicodeTables::LineBreak_OP,
760	QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
761	QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_GL,
762	QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_ZW};
763	if (std::any_of(first: std::begin(arr: lb15a), last: std::end(arr: lb15a),
764	pred: [lcls](auto x) { return x == lcls; })) {
765	ncls = QUnicodeTables::LineBreak_QU_Pi;
766	}
767	} else if (prop->category == QChar::Punctuation_FinalQuote) {
768	// LB15b: Do not break before an unresolved final punctuation
769	// that lies at the end of the line, before a space, before
770	// a prohibited break, or before an unresolved quotation mark,
771	// even after spaces.
772	// × [\p{Pf}&QU] ( SP \| GL \| WJ \| CL \| QU \| CP \| EX \| IS
773	// \| SY \| BK \| CR \| LF \| NL \| ZW \| eot)
// nncls is the class of the following code point; it stays LF when this
// quote is the last code point, so end of text matches the list below.
774	auto nncls = QUnicodeTables::LineBreak_LF;
775
776	if (i + `1` < len) {
777	char32_t c = string[i + `1`];
778	if (QChar::isHighSurrogate(ucs4: c) && i + `2` < len) {
779	ushort low = string[i + `2`];
780	if (QChar::isLowSurrogate(ucs4: low))
781	c = QChar::surrogateToUcs4(high: c, low);
782	}
783	nncls = QUnicodeTables::LineBreakClass(
784	QUnicodeTables::properties(ucs4: c)->lineBreakClass);
785	}
786
787	constexpr QUnicodeTables::LineBreakClass lb15b[] = {
788	QUnicodeTables::LineBreak_SP, QUnicodeTables::LineBreak_GL,
789	QUnicodeTables::LineBreak_WJ, QUnicodeTables::LineBreak_CL,
790	QUnicodeTables::LineBreak_QU, QUnicodeTables::LineBreak_QU_Pi,
791	QUnicodeTables::LineBreak_QU_Pf, QUnicodeTables::LineBreak_CP,
792	QUnicodeTables::LineBreak_EX, QUnicodeTables::LineBreak_IS,
793	QUnicodeTables::LineBreak_SY, QUnicodeTables::LineBreak_BK,
794	QUnicodeTables::LineBreak_CR, QUnicodeTables::LineBreak_LF,
795	QUnicodeTables::LineBreak_ZW};
796	if (std::any_of(first: std::begin(arr: lb15b), last: std::end(arr: lb15b),
797	pred: [nncls](auto x) { return x == nncls; })) {
798	ncls = QUnicodeTables::LineBreak_QU_Pf;
799	}
800	}
801	}
802
803	if (Q_UNLIKELY((lcls >= QUnicodeTables::LineBreak_SP \|\| lcls == QUnicodeTables::LineBreak_ZW
804	\|\| lcls == QUnicodeTables::LineBreak_GL
805	\|\| lcls == QUnicodeTables::LineBreak_CB)
806	&& (ncls == QUnicodeTables::LineBreak_HY \|\| ucs4 == u`'\u2010'`))) {
807	// LB20a: Do not break after a word-initial hyphen.
808	// ( sot \| BK \| CR \| LF \| NL \| SP \| ZW \| CB \| GL ) ( HY \| [\u2010] ) × AL
809
810	// Remap to the synthetic class WS_ (whitespace+), which is just
811	// like the current respective linebreak class but with an IB action
812	// if the next class is AL.
813	if (ucs4 == u`'\u2010'`)
814	ncls = QUnicodeTables::LineBreak_WS_BA;
815	else
816	ncls = QUnicodeTables::LineBreak_WS_HY;
817	}
818
819	if (Q_UNLIKELY(cls == QUnicodeTables::LineBreak_AP && ucs4 == LB::BRS::DottedCircle)) {
820	// LB28a: Do not break inside the orthographic syllables of Brahmic scripts
821	// AP × (AK \| [◌] \| AS)
822	// @note: AP × (AK \| AS) is checked by the breakTable
823	goto next;
824	}
825	while (true) { // May need to recheck once.
826	// LB28a cont'd
827	LB::BRS::State oldState = brsState.state;
828	brsState.state = LB::BRS::updateState(state: brsState.state, lb: {.lbc: ncls, .ucs4: ucs4});
829	if (Q_LIKELY(brsState.state == oldState))
830	break;
831	switch (brsState.state) {
832	case LB::BRS::Start:
833	brsState.start = i;
834	break;
835	case LB::BRS::LB28a_2VI: // Wait for more characters, but also valid sequence
836	// We may get another character, but this is already a complete
837	// sequence that should not have any breaks:
838	for (qsizetype j = brsState.start + `1`; j < i; ++j)
839	attributes[j].lineBreak = false;
840	// No need to mark this sequence again later, so move 'start'
841	// up to the current position:
842	brsState.start = i;
843	goto next;
844	case LB::BRS::Restart:
845	// The previous character was possibly the start of a new sequence
846	brsState.state = LB::BRS::Start;
847	brsState.start = pos - `1`;
848	continue; // Doing the loop again!
849	case LB::BRS::LB28a_2VF:
850	case LB::BRS::LB28a_4VF:
851	case LB::BRS::LB28a_3VIAK:
852	for (qsizetype j = brsState.start + `1`; j < i; ++j)
853	attributes[j].lineBreak = false;
854	if (brsState.state == LB::BRS::LB28a_3VIAK) {
855	// This might be the start of a new sequence
856	brsState.state = LB::BRS::Start;
857	brsState.start = i;
858	} else {
859	brsState.state = LB::BRS::None;
860	}
861	goto next;
862	case LB::BRS::LB28a_4: // Wait for more characters
863	case LB::BRS::None: // Nothing to do
864	break;
865	}
866	break;
867	}
868
869	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_IS)) {
870	// LB15c Break before a decimal mark that follows a space, for instance, in
871	// ‘subtract .5’.
872	if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_SP)) {
873	if (i + `1` < len) {
874	char32_t ch = string[i + `1`];
875	if (QChar::isHighSurrogate(ucs4: ch) && i + `2` < len) {
876	ushort low = string[i + `2`];
877	if (QChar::isLowSurrogate(ucs4: low))
878	ch = QChar::surrogateToUcs4(high: ch, low);
879	}
880	if (QUnicodeTables::properties(ucs4: ch)->lineBreakClass
881	== QUnicodeTables::LineBreak_NU) {
882	attributes[pos].lineBreak = true;
883	goto next;
884	}
885	}
886	}
887	}
888
889	if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_HL)) {
890	// LB21a Do not break after the hyphen in Hebrew + Hyphen + non-Hebrew
891	// HL (HY \| [ BA - $EastAsian ]) × [^HL]
892	auto eaw = QUnicodeTables::EastAsianWidth(prop->eastAsianWidth);
893	const bool isNonEaBA = ncls == QUnicodeTables::LineBreak_BA && !isEastAsian (eaw);
894	if (isNonEaBA \|\| ncls == QUnicodeTables::LineBreak_HY) {
895	// Remap to synthetic HYBA class which handles the next
896	// character. Generally (LB21) there are no breaks before
897	// HY or BA, so we can skip ahead to the next character.
898	ncls = QUnicodeTables::LineBreak_HYBA;
899	goto next;
900	}
901	}
902
903	// LB25: do not break lines inside numbers
904	{
// The table lookup yields the default action for (previous, current) class;
// the three "Need..." states of the previous step override it below.
905	LB::NS::Class necur = LB::NS::toClass(lbc: ncls);
906	LB::NS::Action neact = LB::NS::Action(LB::NS::actionTable[nelast][necur]);
907	if (Q_UNLIKELY(neactlast == LB::NS::CNeedNU && necur != LB::NS::NU)) {
908	neact = LB::NS::None;
909	} else if (Q_UNLIKELY(neactlast == LB::NS::NeedOPNU)) {
910	if (necur == LB::NS::OP)
911	neact = LB::NS::CNeedISNU;
912	else if (necur == LB::NS::NU)
913	neact = LB::NS::Continue;
914	else // Anything else and we ignore the sequence
915	neact = LB::NS::None;
916	} else if (Q_UNLIKELY(neactlast == LB::NS::CNeedISNU)) {
917	if (necur == LB::NS::IS)
918	neact = LB::NS::CNeedNU;
919	else if (necur == LB::NS::NU)
920	neact = LB::NS::Continue;
921	else // Anything else and we ignore the sequence
922	neact = LB::NS::None;
923	}
924	switch (neact) {
925	case LB::NS::Break:
926	// do not change breaks before and after the expression
927	for (qsizetype j = nestart + `1`; j < pos; ++j)
928	attributes[j].lineBreak = false;
929	Q_FALLTHROUGH();
930	case LB::NS::None:
931	nelast = LB::NS::XX; // reset state
932	break;
933	case LB::NS::NeedOPNU:
934	case LB::NS::Start:
935	if (neactlast == LB::NS::Start \|\| neactlast == LB::NS::Continue) {
936	// Apply the linebreaks for the previous stretch; we need to start a new one
937	for (qsizetype j = nestart + `1`; j < pos; ++j)
938	attributes[j].lineBreak = false;
939	}
940	nestart = i;
941	Q_FALLTHROUGH();
942	case LB::NS::CNeedNU:
943	case LB::NS::CNeedISNU:
944	case LB::NS::Continue:
945	nelast = necur;
946	break;
947	}
948	neactlast = neact;
949	}
950
951	// LB19a Unless surrounded by East Asian characters, do not break either side of any
952	// unresolved quotation marks
953	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_QU
954	&& lcls != QUnicodeTables::LineBreak_SP
955	&& lcls != QUnicodeTables::LineBreak_ZW)) {
956	using EAW = QUnicodeTables::EastAsianWidth;
// Looks at the code point that starts at string[0] (the text after the
// quote) and reports whether it counts as non-East-Asian for LB19a.
957	constexpr static auto nextCharNonEastAsian = [](const char16_t *string, qsizetype len) {
958	if (len > `0`) {
959	char32_t nch = string[`0`];
960	if (QChar::isHighSurrogate(ucs4: nch) && len > `1`) {
961	char16_t low = string[`1`];
962	if (QChar::isLowSurrogate(ucs4: low))
963	nch = QChar::surrogateToUcs4(high: char16_t(nch), low);
964	}
965	const auto *nextProp = QUnicodeTables::properties(ucs4: nch);
966	QUnicodeTables::LineBreakClass nncls = QUnicodeTables::LineBreakClass(
967	nextProp->lineBreakClass);
968	QUnicodeTables::EastAsianWidth neaw = EAW(nextProp->eastAsianWidth);
969	return nncls != QUnicodeTables::LineBreak_CM
970	&& nncls <= QUnicodeTables::LineBreak_SP
971	&& !isEastAsian (neaw);
972	}
973	return true; // end-of-text counts as non-East-Asian
974	};
975	if (Q_UNLIKELY(!isEastAsian(EAW(lastProp->eastAsianWidth))
976	\|\| nextCharNonEastAsian(string + i + `1`, len - i - `1`))) {
977	// Remap to the synthetic QU_19 class which has indirect breaks
978	// for most following classes.
979	ncls = QUnicodeTables::LineBreak_QU_19;
980	}
981	}
982
983	if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
984	// LB4: BK!, LB5: (CRxLF\|CR\|LF\|NL)!
985	if (lcls > QUnicodeTables::LineBreak_CR \|\| ncls != QUnicodeTables::LineBreak_LF)
986	attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
987	goto next;
988	}
989
990	if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
991	if (ncls > QUnicodeTables::LineBreak_SP)
992	goto next; // LB6: x(BK\|CR\|LF\|NL)
993	goto next_no_cls_update; // LB7: xSP
994	}
995
996	// LB19 - do not break before non-initial unresolved quotation marks, or after non-final
997	// unresolved quotation marks
998	if (Q_UNLIKELY(((ncls == QUnicodeTables::LineBreak_QU
999	\|\| ncls == QUnicodeTables::LineBreak_QU_19)
1000	&& prop->category != QChar::Punctuation_InitialQuote)
1001	\|\| (cls == QUnicodeTables::LineBreak_QU
1002	&& lastProp->category != QChar::Punctuation_FinalQuote))) {
1003	// Make sure the previous character is not one that we have to break after.
1004	// Also skip if ncls is CM so it can be treated as lcls (LB9)
1005	if (lcls != QUnicodeTables::LineBreak_SP && lcls != QUnicodeTables::LineBreak_ZW
1006	&& ncls != QUnicodeTables::LineBreak_CM) {
1007	goto next;
1008	}
1009	}
1010
1011	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM \|\| ncls == QUnicodeTables::LineBreak_ZWJ)) {
1012	// LB9: treat a CM (or ZWJ) that does not follow SP, BK, CR, LF, NL, or ZW as X
1013	if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
1014	// don't update anything
1015	goto next_no_cls_update;
1016	}
1017
1018	if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
1019	// LB8a: ZWJ x
1020	goto next;
1021	}
1022
1023	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
1024	// LB30a
// The second RI of a pair is recorded as SP, so a third RI in a row does
// not take this branch again.
1025	ncls = QUnicodeTables::LineBreak_SP;
1026	goto next;
1027	}
1028
1029	if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_EM
1030	&& lastProp->category == QChar::Other_NotAssigned
1031	&& lastProp->graphemeBreakClass
1032	== QUnicodeTables::GraphemeBreak_Extended_Pictographic)) {
1033	// LB30b: [\p{Extended_Pictographic}&\p{Cn}] × EM
1034	goto next;
1035	}
1036
1037	// for South East Asian chars that require a complex analysis, the Unicode
1038	// standard recommends to treat them as AL. tailoring that do dictionary analysis can override
1039	if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
1040	cls = QUnicodeTables::LineBreak_AL;
1041
1042	tcls = cls;
1043
1044	constexpr static auto remapToAL = [](QUnicodeTables::LineBreakClass &c, auto &property) {
1045	if (Q_UNLIKELY(c == QUnicodeTables::LineBreak_CM
1046	\|\| c == QUnicodeTables::LineBreak_ZWJ)) {
1047	c = QUnicodeTables::LineBreak_AL;
1048	property = QUnicodeTables::properties(ucs4: U`'\u0041'`);
1049	}
1050	};
1051	// LB10 Treat any remaining combining mark or ZWJ as AL,
1052	// as if it had the properties of U+0041 A LATIN CAPITAL LETTER
1053	remapToAL (tcls, lastProp);
1054	remapToAL (ncls, prop);
1055
// Pair-table lookup: row = previous (base) class, column = current class.
// Column classes at or beyond ZWJ are looked up as AL.
1056	switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_ZWJ ? ncls : QUnicodeTables::LineBreak_AL]) {
1057	case LB::DirectBreak:
1058	attributes[pos].lineBreak = true;
1059	break;
1060	case LB::IndirectBreak:
1061	if (lcls == QUnicodeTables::LineBreak_SP)
1062	attributes[pos].lineBreak = true;
1063	break;
1064	case LB::CombiningIndirectBreak:
1065	if (lcls != QUnicodeTables::LineBreak_SP)
1066	goto next_no_cls_update;
1067	attributes[pos].lineBreak = true;
1068	break;
1069	case LB::CombiningProhibitedBreak:
1070	if (lcls != QUnicodeTables::LineBreak_SP)
1071	goto next_no_cls_update;
1072	break;
1073	case LB::ProhibitedBreakAfterHebrewPlusHyphen:
1074	if (lcls != QUnicodeTables::LineBreak_HL)
1075	attributes[pos].lineBreak = true;
1076	break;
1077	case LB::IndirectBreakIfNarrow:
1078	using EAW = QUnicodeTables::EastAsianWidth;
// F/W/H widths always get a break here; any other width gets one only when
// the previous class was SP (the default label falls through in that case).
1079	switch (EAW(prop->eastAsianWidth)) {
1080	default:
1081	if (lcls != QUnicodeTables::LineBreak_SP)
1082	break;
1083	Q_FALLTHROUGH();
1084	case QUnicodeTables::EastAsianWidth::F:
1085	case QUnicodeTables::EastAsianWidth::W:
1086	case QUnicodeTables::EastAsianWidth::H:
1087	attributes[pos].lineBreak = true;
1088	break;
1089	}
1090	break;
1091	case LB::DirectBreakOutsideNumericSequence:
1092	if (neactlast == LB::NS::None \|\| neactlast > LB::NS::Break)
1093	attributes[pos].lineBreak = true;
1094	break;
1095	case LB::ProhibitedBreak:
1096	// nothing to do
1097	default:
1098	break;
1099	}
1100
// `next` records the current class/properties as the new base state unless
// the current class is CM or ZWJ; `next_no_cls_update` skips that and only
// records the current class as the immediately preceding one.
1101	next:
1102	if (ncls != QUnicodeTables::LineBreak_CM && ncls != QUnicodeTables::LineBreak_ZWJ) {
1103	cls = ncls;
1104	lastProp = prop;
1105	}
1106	next_no_cls_update:
1107	lcls = ncls;
1108	}
1109
// Close a numeric sequence that is still open at the end of the text by
// feeding the XX class into the action table.
1110	if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
1111	// LB25: do not break lines inside numbers
1112	for (qsizetype j = nestart + `1`; j < len; ++j)
1113	attributes[j].lineBreak = false;
1114	}
1115
// For len == 0 both statements hit index 0; the second one (LB3) wins.
1116	attributes[`0`].lineBreak = attributes[`0`].mandatoryBreak = false; // LB2
1117	attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
1118	}
1119
1120
1121	static void getWhiteSpaces(const char16_t string, qsizetype len, QCharAttributes attributes)
1122	{
1123	for (qsizetype i = `0`; i != len; ++i) {
1124	uint ucs4 = string[i];
1125	if (QChar::isHighSurrogate(ucs4) && i + `1` != len) {
1126	ushort low = string[i + `1`];
1127	if (QChar::isLowSurrogate(ucs4: low)) {
1128	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
1129	++i;
1130	}
1131	}
1132
1133	if (Q_UNLIKELY(QChar::isSpace(ucs4)))
1134	attributes[i].whiteSpace = true;
1135	}
1136	}
1137
1138	namespace Tailored {
1139
1140	using CharAttributeFunction = void ()(QChar::Script script, const* char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes);
1141
1142
// Per-code-point classification stored in the indicForms table below.
// The table's element type is unsigned char, so every enumerator has to fit
// into one byte. Enumerators after UnknownForm are numbered consecutively
// starting at 1.
enum Form {
    Invalid = 0x0,
    UnknownForm = Invalid, // alias: same value as Invalid
    Consonant,
    Nukta,
    Halant,
    Matra,
    VowelMark,
    StressMark,
    IndependentVowel,
    LengthMark,
    Control,
    Other
};
1157
1158	static const unsigned char indicForms[`0xe00`-`0x900`] = {
1159	// Devangari
1160	Invalid, VowelMark, VowelMark, VowelMark,
1161	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1162	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1163	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1164
1165	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1166	IndependentVowel, Consonant, Consonant, Consonant,
1167	Consonant, Consonant, Consonant, Consonant,
1168	Consonant, Consonant, Consonant, Consonant,
1169
1170	Consonant, Consonant, Consonant, Consonant,
1171	Consonant, Consonant, Consonant, Consonant,
1172	Consonant, Consonant, Consonant, Consonant,
1173	Consonant, Consonant, Consonant, Consonant,
1174
1175	Consonant, Consonant, Consonant, Consonant,
1176	Consonant, Consonant, Consonant, Consonant,
1177	Consonant, Consonant, UnknownForm, UnknownForm,
1178	Nukta, Other, Matra, Matra,
1179
1180	Matra, Matra, Matra, Matra,
1181	Matra, Matra, Matra, Matra,
1182	Matra, Matra, Matra, Matra,
1183	Matra, Halant, UnknownForm, UnknownForm,
1184
1185	Other, StressMark, StressMark, StressMark,
1186	StressMark, UnknownForm, UnknownForm, UnknownForm,
1187	Consonant, Consonant, Consonant, Consonant,
1188	Consonant, Consonant, Consonant, Consonant,
1189
1190	IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1191	Other, Other, Other, Other,
1192	Other, Other, Other, Other,
1193	Other, Other, Other, Other,
1194
1195	Other, Other, Other, Other,
1196	Other, Other, Other, Other,
1197	Other, Other, Other, Consonant,
1198	Consonant, Consonant / ??? /, Consonant, Consonant,
1199
1200	// Bengali
1201	Invalid, VowelMark, VowelMark, VowelMark,
1202	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1203	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1204	IndependentVowel, Invalid, Invalid, IndependentVowel,
1205
1206	IndependentVowel, Invalid, Invalid, IndependentVowel,
1207	IndependentVowel, Consonant, Consonant, Consonant,
1208	Consonant, Consonant, Consonant, Consonant,
1209	Consonant, Consonant, Consonant, Consonant,
1210
1211	Consonant, Consonant, Consonant, Consonant,
1212	Consonant, Consonant, Consonant, Consonant,
1213	Consonant, Invalid, Consonant, Consonant,
1214	Consonant, Consonant, Consonant, Consonant,
1215
1216	Consonant, Invalid, Consonant, Invalid,
1217	Invalid, Invalid, Consonant, Consonant,
1218	Consonant, Consonant, UnknownForm, UnknownForm,
1219	Nukta, Other, Matra, Matra,
1220
1221	Matra, Matra, Matra, Matra,
1222	Matra, Invalid, Invalid, Matra,
1223	Matra, Invalid, Invalid, Matra,
1224	Matra, Halant, Consonant, UnknownForm,
1225
1226	Invalid, Invalid, Invalid, Invalid,
1227	Invalid, Invalid, Invalid, VowelMark,
1228	Invalid, Invalid, Invalid, Invalid,
1229	Consonant, Consonant, Invalid, Consonant,
1230
1231	IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1232	Other, Other, Other, Other,
1233	Other, Other, Other, Other,
1234	Other, Other, Other, Other,
1235
1236	Consonant, Consonant, Other, Other,
1237	Other, Other, Other, Other,
1238	Other, Other, Other, Other,
1239	Other, Other, Other, Other,
1240
1241	// Gurmukhi
1242	Invalid, VowelMark, VowelMark, VowelMark,
1243	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1244	IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1245	Invalid, Invalid, Invalid, IndependentVowel,
1246
1247	IndependentVowel, Invalid, Invalid, IndependentVowel,
1248	IndependentVowel, Consonant, Consonant, Consonant,
1249	Consonant, Consonant, Consonant, Consonant,
1250	Consonant, Consonant, Consonant, Consonant,
1251
1252	Consonant, Consonant, Consonant, Consonant,
1253	Consonant, Consonant, Consonant, Consonant,
1254	Consonant, Invalid, Consonant, Consonant,
1255	Consonant, Consonant, Consonant, Consonant,
1256
1257	Consonant, Invalid, Consonant, Consonant,
1258	Invalid, Consonant, Consonant, Invalid,
1259	Consonant, Consonant, UnknownForm, UnknownForm,
1260	Nukta, Other, Matra, Matra,
1261
1262	Matra, Matra, Matra, Invalid,
1263	Invalid, Invalid, Invalid, Matra,
1264	Matra, Invalid, Invalid, Matra,
1265	Matra, Halant, UnknownForm, UnknownForm,
1266
1267	Invalid, Invalid, Invalid, Invalid,
1268	Invalid, UnknownForm, UnknownForm, UnknownForm,
1269	Invalid, Consonant, Consonant, Consonant,
1270	Consonant, Invalid, Consonant, Invalid,
1271
1272	Other, Other, Invalid, Invalid,
1273	Other, Other, Other, Other,
1274	Other, Other, Other, Other,
1275	Other, Other, Other, Other,
1276
1277	StressMark, StressMark, Consonant, Consonant,
1278	Other, Other, Other, Other,
1279	Other, Other, Other, Other,
1280	Other, Other, Other, Other,
1281
1282	// Gujarati
1283	Invalid, VowelMark, VowelMark, VowelMark,
1284	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1285	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1286	IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
1287
1288	IndependentVowel, IndependentVowel, Invalid, IndependentVowel,
1289	IndependentVowel, Consonant, Consonant, Consonant,
1290	Consonant, Consonant, Consonant, Consonant,
1291	Consonant, Consonant, Consonant, Consonant,
1292
1293	Consonant, Consonant, Consonant, Consonant,
1294	Consonant, Consonant, Consonant, Consonant,
1295	Consonant, Invalid, Consonant, Consonant,
1296	Consonant, Consonant, Consonant, Consonant,
1297
1298	Consonant, Invalid, Consonant, Consonant,
1299	Invalid, Consonant, Consonant, Consonant,
1300	Consonant, Consonant, UnknownForm, UnknownForm,
1301	Nukta, Other, Matra, Matra,
1302
1303	Matra, Matra, Matra, Matra,
1304	Matra, Matra, Invalid, Matra,
1305	Matra, Matra, Invalid, Matra,
1306	Matra, Halant, UnknownForm, UnknownForm,
1307
1308	Other, UnknownForm, UnknownForm, UnknownForm,
1309	UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1310	UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1311	UnknownForm, UnknownForm, UnknownForm, UnknownForm,
1312
1313	IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1314	Other, Other, Other, Other,
1315	Other, Other, Other, Other,
1316	Other, Other, Other, Other,
1317
1318	Other, Other, Other, Other,
1319	Other, Other, Other, Other,
1320	Other, Other, Other, Other,
1321	Other, Other, Other, Other,
1322
1323	// Oriya
1324	Invalid, VowelMark, VowelMark, VowelMark,
1325	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1326	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1327	IndependentVowel, Invalid, Invalid, IndependentVowel,
1328
1329	IndependentVowel, Invalid, Invalid, IndependentVowel,
1330	IndependentVowel, Consonant, Consonant, Consonant,
1331	Consonant, Consonant, Consonant, Consonant,
1332	Consonant, Consonant, Consonant, Consonant,
1333
1334	Consonant, Consonant, Consonant, Consonant,
1335	Consonant, Consonant, Consonant, Consonant,
1336	Consonant, Invalid, Consonant, Consonant,
1337	Consonant, Consonant, Consonant, Consonant,
1338
1339	Consonant, Invalid, Consonant, Consonant,
1340	Invalid, Consonant, Consonant, Consonant,
1341	Consonant, Consonant, UnknownForm, UnknownForm,
1342	Nukta, Other, Matra, Matra,
1343
1344	Matra, Matra, Matra, Matra,
1345	Invalid, Invalid, Invalid, Matra,
1346	Matra, Invalid, Invalid, Matra,
1347	Matra, Halant, UnknownForm, UnknownForm,
1348
1349	Other, Invalid, Invalid, Invalid,
1350	Invalid, UnknownForm, LengthMark, LengthMark,
1351	Invalid, Invalid, Invalid, Invalid,
1352	Consonant, Consonant, Invalid, Consonant,
1353
1354	IndependentVowel, IndependentVowel, Invalid, Invalid,
1355	Invalid, Invalid, Other, Other,
1356	Other, Other, Other, Other,
1357	Other, Other, Other, Other,
1358
1359	Other, Consonant, Other, Other,
1360	Other, Other, Other, Other,
1361	Other, Other, Other, Other,
1362	Other, Other, Other, Other,
1363
1364	//Tamil
1365	Invalid, Invalid, VowelMark, Other,
1366	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1367	IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1368	Invalid, Invalid, IndependentVowel, IndependentVowel,
1369
1370	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1371	IndependentVowel, Consonant, Invalid, Invalid,
1372	Invalid, Consonant, Consonant, Invalid,
1373	Consonant, Invalid, Consonant, Consonant,
1374
1375	Invalid, Invalid, Invalid, Consonant,
1376	Consonant, Invalid, Invalid, Invalid,
1377	Consonant, Consonant, Consonant, Invalid,
1378	Invalid, Invalid, Consonant, Consonant,
1379
1380	Consonant, Consonant, Consonant, Consonant,
1381	Consonant, Consonant, Consonant, Consonant,
1382	Consonant, Consonant, UnknownForm, UnknownForm,
1383	Invalid, Invalid, Matra, Matra,
1384
1385	Matra, Matra, Matra, Invalid,
1386	Invalid, Invalid, Matra, Matra,
1387	Matra, Invalid, Matra, Matra,
1388	Matra, Halant, Invalid, Invalid,
1389
1390	Invalid, Invalid, Invalid, Invalid,
1391	Invalid, Invalid, Invalid, LengthMark,
1392	Invalid, Invalid, Invalid, Invalid,
1393	Invalid, Invalid, Invalid, Invalid,
1394
1395	Invalid, Invalid, Invalid, Invalid,
1396	Invalid, Invalid, Other, Other,
1397	Other, Other, Other, Other,
1398	Other, Other, Other, Other,
1399
1400	Other, Other, Other, Other,
1401	Other, Other, Other, Other,
1402	Other, Other, Other, Other,
1403	Other, Other, Other, Other,
1404
1405	// Telugu
1406	Invalid, VowelMark, VowelMark, VowelMark,
1407	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1408	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1409	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1410
1411	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1412	IndependentVowel, Consonant, Consonant, Consonant,
1413	Consonant, Consonant, Consonant, Consonant,
1414	Consonant, Consonant, Consonant, Consonant,
1415
1416	Consonant, Consonant, Consonant, Consonant,
1417	Consonant, Consonant, Consonant, Consonant,
1418	Consonant, Invalid, Consonant, Consonant,
1419	Consonant, Consonant, Consonant, Consonant,
1420
1421	Consonant, Consonant, Consonant, Consonant,
1422	Invalid, Consonant, Consonant, Consonant,
1423	Consonant, Consonant, UnknownForm, UnknownForm,
1424	Invalid, Invalid, Matra, Matra,
1425
1426	Matra, Matra, Matra, Matra,
1427	Matra, Invalid, Matra, Matra,
1428	Matra, Invalid, Matra, Matra,
1429	Matra, Halant, Invalid, Invalid,
1430
1431	Invalid, Invalid, Invalid, Invalid,
1432	Invalid, LengthMark, Matra, Invalid,
1433	Invalid, Invalid, Invalid, Invalid,
1434	Invalid, Invalid, Invalid, Invalid,
1435
1436	IndependentVowel, IndependentVowel, Invalid, Invalid,
1437	Invalid, Invalid, Other, Other,
1438	Other, Other, Other, Other,
1439	Other, Other, Other, Other,
1440
1441	Other, Other, Other, Other,
1442	Other, Other, Other, Other,
1443	Other, Other, Other, Other,
1444	Other, Other, Other, Other,
1445
1446	// Kannada
1447	Invalid, Invalid, VowelMark, VowelMark,
1448	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1449	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1450	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1451
1452	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1453	IndependentVowel, Consonant, Consonant, Consonant,
1454	Consonant, Consonant, Consonant, Consonant,
1455	Consonant, Consonant, Consonant, Consonant,
1456
1457	Consonant, Consonant, Consonant, Consonant,
1458	Consonant, Consonant, Consonant, Consonant,
1459	Consonant, Invalid, Consonant, Consonant,
1460	Consonant, Consonant, Consonant, Consonant,
1461
1462	Consonant, Consonant, Consonant, Consonant,
1463	Invalid, Consonant, Consonant, Consonant,
1464	Consonant, Consonant, UnknownForm, UnknownForm,
1465	Nukta, Other, Matra, Matra,
1466
1467	Matra, Matra, Matra, Matra,
1468	Matra, Invalid, Matra, Matra,
1469	Matra, Invalid, Matra, Matra,
1470	Matra, Halant, Invalid, Invalid,
1471
1472	Invalid, Invalid, Invalid, Invalid,
1473	Invalid, LengthMark, LengthMark, Invalid,
1474	Invalid, Invalid, Invalid, Invalid,
1475	Invalid, Invalid, Consonant, Invalid,
1476
1477	IndependentVowel, IndependentVowel, VowelMark, VowelMark,
1478	Invalid, Invalid, Other, Other,
1479	Other, Other, Other, Other,
1480	Other, Other, Other, Other,
1481
1482	Other, Other, Other, Other,
1483	Other, Other, Other, Other,
1484	Other, Other, Other, Other,
1485	Other, Other, Other, Other,
1486
1487	// Malayalam
1488	Invalid, Invalid, VowelMark, VowelMark,
1489	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1490	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1491	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1492
1493	IndependentVowel, Invalid, IndependentVowel, IndependentVowel,
1494	IndependentVowel, Consonant, Consonant, Consonant,
1495	Consonant, Consonant, Consonant, Consonant,
1496	Consonant, Consonant, Consonant, Consonant,
1497
1498	Consonant, Consonant, Consonant, Consonant,
1499	Consonant, Consonant, Consonant, Consonant,
1500	Consonant, Invalid, Consonant, Consonant,
1501	Consonant, Consonant, Consonant, Consonant,
1502
1503	Consonant, Consonant, Consonant, Consonant,
1504	Consonant, Consonant, Consonant, Consonant,
1505	Consonant, Consonant, UnknownForm, UnknownForm,
1506	Invalid, Invalid, Matra, Matra,
1507
1508	Matra, Matra, Matra, Matra,
1509	Invalid, Invalid, Matra, Matra,
1510	Matra, Invalid, Matra, Matra,
1511	Matra, Halant, Invalid, Invalid,
1512
1513	Invalid, Invalid, Invalid, Invalid,
1514	Invalid, Invalid, Invalid, Matra,
1515	Invalid, Invalid, Invalid, Invalid,
1516	Invalid, Invalid, Invalid, Invalid,
1517
1518	IndependentVowel, IndependentVowel, Invalid, Invalid,
1519	Invalid, Invalid, Other, Other,
1520	Other, Other, Other, Other,
1521	Other, Other, Other, Other,
1522
1523	Other, Other, Other, Other,
1524	Other, Other, Other, Other,
1525	Other, Other, Other, Other,
1526	Other, Other, Other, Other,
1527
1528	// Sinhala
1529	Invalid, Invalid, VowelMark, VowelMark,
1530	Invalid, IndependentVowel, IndependentVowel, IndependentVowel,
1531	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1532	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1533
1534	IndependentVowel, IndependentVowel, IndependentVowel, IndependentVowel,
1535	IndependentVowel, IndependentVowel, IndependentVowel, Invalid,
1536	Invalid, Invalid, Consonant, Consonant,
1537	Consonant, Consonant, Consonant, Consonant,
1538
1539	Consonant, Consonant, Consonant, Consonant,
1540	Consonant, Consonant, Consonant, Consonant,
1541	Consonant, Consonant, Consonant, Consonant,
1542	Consonant, Consonant, Consonant, Consonant,
1543
1544	Consonant, Consonant, Invalid, Consonant,
1545	Consonant, Consonant, Consonant, Consonant,
1546	Consonant, Consonant, Consonant, Consonant,
1547	Invalid, Consonant, Invalid, Invalid,
1548
1549	Consonant, Consonant, Consonant, Consonant,
1550	Consonant, Consonant, Consonant, Invalid,
1551	Invalid, Invalid, Halant, Invalid,
1552	Invalid, Invalid, Invalid, Matra,
1553
1554	Matra, Matra, Matra, Matra,
1555	Matra, Invalid, Matra, Invalid,
1556	Matra, Matra, Matra, Matra,
1557	Matra, Matra, Matra, Matra,
1558
1559	Invalid, Invalid, Invalid, Invalid,
1560	Invalid, Invalid, Invalid, Invalid,
1561	Invalid, Invalid, Invalid, Invalid,
1562	Invalid, Invalid, Invalid, Invalid,
1563
1564	Invalid, Invalid, Matra, Matra,
1565	Other, Other, Other, Other,
1566	Other, Other, Other, Other,
1567	Other, Other, Other, Other,
1568	};
1569
1570	static inline Form form(unsigned short uc) {
1571	if (uc < `0x900` \|\| uc > `0xdff`) {
1572	if (uc == `0x25cc`)
1573	return Consonant;
1574	if (uc == `0x200c` \|\| uc == `0x200d`)
1575	return Control;
1576	return Other;
1577	}
1578	return (Form)indicForms[uc-`0x900`];
1579	}
1580
// Debug tracing for the Indic syllable scanner.
// Uncomment the following line to make IDEBUG(...) expand to qDebug(...).
// #define INDIC_DEBUG
#ifdef INDIC_DEBUG
#define IDEBUG qDebug
#else
// Tracing disabled: IDEBUG(...) expands to "if constexpr (1) ; else qDebug(...)",
// so the qDebug call sits in an else branch that is never taken.
#define IDEBUG if constexpr (1) ; else qDebug
#endif
1587
/* syllables are of the form:

   (Consonant Nukta? Halant)* Consonant Matra? VowelMark? StressMark?
   (Consonant Nukta? Halant)* Consonant Halant
   IndependentVowel VowelMark? StressMark?

   We return syllable boundaries on invalid combinations as well
*/
1596	static qsizetype indic_nextSyllableBoundary(QChar::Script script, const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
1597	{
1598	invalid = false*;
1599	IDEBUG(msg: "indic_nextSyllableBoundary: start=%lld, end=%lld", qlonglong(start), qlonglong(end));
1600	const char16_t *uc = s+start;
1601
1602	qsizetype pos = `0`;
1603	Form state = form(uc: uc[pos]);
1604	IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), state, uc[pos]);
1605	pos++;
1606
1607	if (state != Consonant && state != IndependentVowel) {
1608	if (state != Other)
1609	invalid = true*;
1610	goto finish;
1611	}
1612
1613	while (pos < end - start) {
1614	Form newState = form(uc: uc[pos]);
1615	IDEBUG(msg: "state[%lld]=%d (uc=%4x)", qlonglong(pos), newState, uc[pos]);
1616	switch (newState) {
1617	case Control:
1618	newState = state;
1619	if (state == Halant && uc[pos] == `0x200d` / ZWJ /)
1620	break;
1621	// the control character should be the last char in the item
1622	if (state == Consonant && script == QChar::Script_Bengali && uc[pos-`1`] == `0x09B0` && uc[pos] == `0x200d` / ZWJ /)
1623	break;
1624	if (state == Consonant && script == QChar::Script_Kannada && uc[pos-`1`] == `0x0CB0` && uc[pos] == `0x200d` / ZWJ /)
1625	break;
1626	// Bengali and Kannada has a special exception for rendering yaphala with ra (to avoid reph) see http://www.unicode.org/faq/indic.html#15
1627	++pos;
1628	goto finish;
1629	case Consonant:
1630	if (state == Halant && (script != QChar::Script_Sinhala \|\| uc[pos-`1`] == `0x200d` / ZWJ /))
1631	break;
1632	goto finish;
1633	case Halant:
1634	if (state == Nukta \|\| state == Consonant)
1635	break;
1636	// Bengali has a special exception allowing the combination Vowel_A/E + Halant + Ya
1637	if (script == QChar::Script_Bengali && pos == `1` &&
1638	(uc[`0`] == `0x0985` \|\| uc[`0`] == `0x098f`))
1639	break;
1640	// Sinhala uses the Halant as a component of certain matras. Allow these, but keep the state on Matra.
1641	if (script == QChar::Script_Sinhala && state == Matra) {
1642	++pos;
1643	continue;
1644	}
1645	if (script == QChar::Script_Malayalam && state == Matra && uc[pos-`1`] == `0x0d41`) {
1646	++pos;
1647	continue;
1648	}
1649	goto finish;
1650	case Nukta:
1651	if (state == Consonant)
1652	break;
1653	goto finish;
1654	case StressMark:
1655	if (state == VowelMark)
1656	break;
1657	Q_FALLTHROUGH();
1658	case VowelMark:
1659	if (state == Matra \|\| state == LengthMark \|\| state == IndependentVowel)
1660	break;
1661	Q_FALLTHROUGH();
1662	case Matra:
1663	if (state == Consonant \|\| state == Nukta)
1664	break;
1665	if (state == Matra) {
1666	// ### needs proper testing for correct two/three part matras
1667	break;
1668	}
1669	// ### not sure if this is correct. If it is, does it apply only to Bengali or should
1670	// it work for all Indic languages?
1671	// the combination Independent_A + Vowel Sign AA is allowed.
1672	if (script == QChar::Script_Bengali && uc[pos] == `0x9be` && uc[pos-`1`] == `0x985`)
1673	break;
1674	if (script == QChar::Script_Tamil && state == Matra) {
1675	if (uc[pos-`1`] == `0x0bc6` &&
1676	(uc[pos] == `0xbbe` \|\| uc[pos] == `0xbd7`))
1677	break;
1678	if (uc[pos-`1`] == `0x0bc7` && uc[pos] == `0xbbe`)
1679	break;
1680	}
1681	goto finish;
1682
1683	case LengthMark:
1684	if (state == Matra) {
1685	// ### needs proper testing for correct two/three part matras
1686	break;
1687	}
1688	Q_FALLTHROUGH();
1689	case IndependentVowel:
1690	case Invalid:
1691	case Other:
1692	goto finish;
1693	}
1694	state = newState;
1695	pos++;
1696	}
1697	finish:
1698	return pos+start;
1699	}
1700
1701	static void indicAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1702	{
1703	qsizetype end = from + len;
1704	attributes += from;
1705	qsizetype i = `0`;
1706	while (i < len) {
1707	bool invalid;
1708	qsizetype boundary = indic_nextSyllableBoundary(script, s: text, start: from+i, end, invalid: &invalid) - from;
1709	attributes[i].graphemeBoundary = true;
1710
1711	if (boundary > len-`1`) boundary = len;
1712	i++;
1713	while (i < boundary) {
1714	attributes[i].graphemeBoundary = false;
1715	++i;
1716	}
1717	assert(i == boundary);
1718	}
1719
1720
1721	}
1722
1723	#if QT_CONFIG(library)
1724
// Version number handed to QLibrary together with the name "thai" when
// LibThai loads the library.
#define LIBTHAI_MAJOR 0
1726
/*
 * Local copy of libthai's cell structure, filled in by th_next_cell().
 * If libthai changes, please update this definition too.
 */
struct thcell_t {
    unsigned char base;      /**< base character */
    unsigned char hilo;      /**< upper/lower vowel/diacritic */
    unsigned char top;       /**< top-level mark */
};
1735
// Alias for libthai's break-state type; _ThBrk is only forward-declared here,
// never defined in this file.
using ThBrk = struct _ThBrk;
1737
1738	namespace {
1739
1740	class LibThai final
1741	{
1742	Q_DISABLE_COPY_MOVE(LibThai)
1743
1744	using th_brk_new_def = ThBrk ()(const char *);
1745	using th_brk_delete_def = void ()(ThBrk );
1746	using th_brk_find_breaks_def = int ()(ThBrk , const unsigned char , int* *, size_t);
1747	using th_next_cell_def = size_t ()(const* unsigned char , size_t, struct* thcell_t , int*);
1748
1749	public:
1750	LibThai() : m_library ("thai"_L1, LIBTHAI_MAJOR)
1751	{
1752	m_th_brk_find_breaks =
1753	reinterpret_cast<th_brk_find_breaks_def>(m_library.resolve(symbol: "th_brk_find_breaks"));
1754	m_th_next_cell = reinterpret_cast<th_next_cell_def>(m_library.resolve(symbol: "th_next_cell"));
1755
1756	auto th_brk_new = reinterpret_cast<th_brk_new_def>(m_library.resolve(symbol: "th_brk_new"));
1757	if (th_brk_new) {
1758	m_state = th_brk_new(nullptr);
1759	m_th_brk_delete =
1760	reinterpret_cast<th_brk_delete_def>(m_library.resolve(symbol: "th_brk_delete"));
1761	}
1762	}
1763
1764	~LibThai()
1765	{
1766	if (m_state && m_th_brk_delete)
1767	m_th_brk_delete(m_state);
1768	m_library.unload();
1769	}
1770
1771	bool isInitialized() const { return m_th_brk_find_breaks && m_th_next_cell && m_state; }
1772
1773	int brk_find_breaks(const unsigned char s, int* pos, size_t pos_sz) const*
1774	{
1775	Q_ASSERT(m_state);
1776	Q_ASSERT(m_th_brk_find_breaks);
1777	return m_th_brk_find_breaks(m_state, s, pos, pos_sz);
1778	}
1779
1780	size_t next_cell(const unsigned char s, size_t len, struct* thcell_t cell, int* is_decomp_am)
1781	{
1782	Q_ASSERT(m_th_next_cell);
1783	return m_th_next_cell(s, len, cell, is_decomp_am);
1784	}
1785
1786	private:
1787	QLibrary m_library;
1788
1789	// Global state for th_brk_find_breaks().
1790	// Note: even if signature for th_brk_find_breaks() suggests otherwise, the
1791	// state is read-only, and so it is safe to use it from multiple threads after
1792	// initialization. This is also stated in the libthai documentation.
1793	ThBrk m_state = nullptr*;
1794
1795	th_brk_find_breaks_def m_th_brk_find_breaks = nullptr;
1796	th_next_cell_def m_th_next_cell = nullptr;
1797	th_brk_delete_def m_th_brk_delete = nullptr;
1798	};
1799
1800	} // unnamed namespace
1801
// Global LibThai instance, declared through Qt's Q_GLOBAL_STATIC.
// thaiAssignAttributes() checks the pointer for null before using it.
Q_GLOBAL_STATIC(LibThai, g_libThai)
1803
1804	static void to_tis620(const char16_t string, qsizetype len, char* *cstr)
1805	{
1806	qsizetype i;
1807	unsigned char result = reinterpret_cast<unsigned* char *>(cstr);
1808
1809	for (i = `0`; i < len; ++i) {
1810	if (string[i] <= `0xa0`)
1811	result[i] = static_cast<unsigned char>(string[i]);
1812	else if (string[i] >= `0xe01` && string[i] <= `0xe5b`)
1813	result[i] = static_cast<unsigned char>(string[i] - `0xe00` + `0xa0`);
1814	else
1815	result[i] = static_cast<unsigned char>(~`0`); // Same encoding as libthai uses for invalid chars
1816	}
1817
1818	result[len] = `0`;
1819	}
1820
1821	/*
1822	* Thai Attributes: computes Word Break, Word Boundary and Char stop for THAI.
1823	*/
1824	static void thaiAssignAttributes(const char16_t string, qsizetype len, QCharAttributes attributes)
1825	{
1826	constexpr qsizetype Prealloc = `128`;
1827	QVarLengthArray<char, Prealloc + `1`> s(len + `1`);
1828	QVarLengthArray<int, Prealloc> break_positions(len);
1829	qsizetype numbreaks, i;
1830	struct thcell_t tis_cell;
1831
1832	LibThai *libThai = g_libThai;
1833	if (!libThai \|\| !libThai->isInitialized())
1834	return;
1835
1836	to_tis620(string, len, cstr: s.data());
1837
1838	for (i = `0`; i < len; ++i) {
1839	attributes[i].wordBreak = false;
1840	attributes[i].wordStart = false;
1841	attributes[i].wordEnd = false;
1842	attributes[i].lineBreak = false;
1843	}
1844
1845	attributes[`0`].wordBreak = true;
1846	attributes[`0`].wordStart = true;
1847	attributes[`0`].wordEnd = false;
1848	numbreaks = libThai->brk_find_breaks(s: reinterpret_cast<const unsigned char *>(s.data()),
1849	pos: break_positions.data(),
1850	pos_sz: static_cast<size_t>(break_positions.size()));
1851	for (i = `0`; i < numbreaks; ++i) {
1852	attributes[break_positions [i]].wordBreak = true;
1853	attributes[break_positions [i]].wordStart = true;
1854	attributes[break_positions [i]].wordEnd = true;
1855	attributes[break_positions [i]].lineBreak = true;
1856	}
1857	if (numbreaks > `0`)
1858	attributes[break_positions [numbreaks - `1`]].wordStart = false;
1859
1860	/ manage grapheme boundaries /
1861	i = `0`;
1862	while (i < len) {
1863	size_t cell_length =
1864	libThai->next_cell(s: reinterpret_cast<const unsigned char *>(s.data()) + i,
1865	len: size_t(len - i), cell: &tis_cell, is_decomp_am: true);
1866
1867	attributes[i].graphemeBoundary = true;
1868	for (size_t j = `1`; j < cell_length; ++j)
1869	attributes[i + j].graphemeBoundary = false;
1870
1871	i += cell_length;
1872	}
1873	}
1874
1875	#endif // QT_CONFIG(library)
1876
1877	static void thaiAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
1878	{
1879	assert(script == QChar::Script_Thai);
1880	#if QT_CONFIG(library)
1881	const char16_t *uc = text + from;
1882	attributes += from;
1883	Q_UNUSED(script);
1884	thaiAssignAttributes(string: uc, len, attributes);
1885	#else
1886	Q_UNUSED(script);
1887	Q_UNUSED(text);
1888	Q_UNUSED(from);
1889	Q_UNUSED(len);
1890	Q_UNUSED(attributes);
1891	#endif
1892	}
1893
1894	/*
1895	tibetan syllables are of the form:
1896	head position consonant
1897	first sub-joined consonant
1898	....intermediate sub-joined consonants (if any)
1899	last sub-joined consonant
1900	sub-joined vowel (a-chung U+0F71)
1901	standard or compound vowel sign (or 'virama' for devanagari transliteration)
1902	*/
1903
// Character classes used by the Tibetan syllable scanner. The numeric values
// are stored in the tibetanForm lookup table.
enum TibetanForm {
    TibetanOther = 0,
    TibetanHeadConsonant = 1,
    TibetanSubjoinedConsonant = 2,
    TibetanSubjoinedVowel = 3,
    TibetanVowel = 4
};
1911
1912	/ this table starts at U+0f40 /
1913	static const unsigned char tibetanForm[`0x80`] = {
1914	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1915	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1916	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1917	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1918
1919	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1920	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1921	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1922	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1923
1924	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1925	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1926	TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant, TibetanHeadConsonant,
1927	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1928
1929	TibetanOther, TibetanVowel, TibetanVowel, TibetanVowel,
1930	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1931	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1932	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1933
1934	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1935	TibetanVowel, TibetanVowel, TibetanVowel, TibetanVowel,
1936	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1937	TibetanOther, TibetanOther, TibetanOther, TibetanOther,
1938
1939	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1940	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1941	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1942	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1943
1944	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1945	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1946	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1947	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1948
1949	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1950	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1951	TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant, TibetanSubjoinedConsonant,
1952	TibetanSubjoinedConsonant, TibetanOther, TibetanOther, TibetanOther
1953	};
1954
// Maps a code point to its TibetanForm: anything outside U+0F40..U+0FBF is
// TibetanOther, everything inside is looked up in tibetanForm.
#define tibetan_form(c) \
    ((c) < 0x0f40 || (c) >= 0x0fc0 ? TibetanOther : (TibetanForm)tibetanForm[(c) - 0x0f40])
1957
1958	static qsizetype tibetan_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
1959	{
1960	const char16_t *uc = s + start;
1961
1962	qsizetype pos = `0`;
1963	TibetanForm state = tibetan_form(*uc);
1964
1965	/ qDebug("state[%d]=%d (uc=%4x)", pos, state, uc[pos]);/
1966	pos++;
1967
1968	if (state != TibetanHeadConsonant) {
1969	if (state != TibetanOther)
1970	invalid = true*;
1971	goto finish;
1972	}
1973
1974	while (pos < end - start) {
1975	TibetanForm newState = tibetan_form(uc[pos]);
1976	switch (newState) {
1977	case TibetanSubjoinedConsonant:
1978	case TibetanSubjoinedVowel:
1979	if (state != TibetanHeadConsonant &&
1980	state != TibetanSubjoinedConsonant)
1981	goto finish;
1982	state = newState;
1983	break;
1984	case TibetanVowel:
1985	if (state != TibetanHeadConsonant &&
1986	state != TibetanSubjoinedConsonant &&
1987	state != TibetanSubjoinedVowel)
1988	goto finish;
1989	break;
1990	case TibetanOther:
1991	case TibetanHeadConsonant:
1992	goto finish;
1993	}
1994	pos++;
1995	}
1996
1997	finish:
1998	invalid = false*;
1999	return start+pos;
2000	}
2001
2002	static void tibetanAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
2003	{
2004	qsizetype end = from + len;
2005	qsizetype i = `0`;
2006	Q_UNUSED(script);
2007	attributes += from;
2008	while (i < len) {
2009	bool invalid;
2010	qsizetype boundary = tibetan_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
2011
2012	attributes[i].graphemeBoundary = true;
2013
2014	if (boundary > len-`1`) boundary = len;
2015	i++;
2016	while (i < boundary) {
2017	attributes[i].graphemeBoundary = false;
2018	++i;
2019	}
2020	assert(i == boundary);
2021	}
2022	}
2023
// Character classes of the Myanmar syllable state machine. The numeric value
// is the column index into mymrStateTable, so the order must not change.
enum MymrCharClassValues {
    Mymr_CC_RESERVED            =  0,
    Mymr_CC_CONSONANT           =  1, /* Consonant of type 1, that has subscript form */
    Mymr_CC_CONSONANT2          =  2, /* Consonant of type 2, that has no subscript form */
    Mymr_CC_NGA                 =  3, /* Consonant NGA */
    Mymr_CC_YA                  =  4, /* Consonant YA */
    Mymr_CC_RA                  =  5, /* Consonant RA */
    Mymr_CC_WA                  =  6, /* Consonant WA */
    Mymr_CC_HA                  =  7, /* Consonant HA */
    Mymr_CC_IND_VOWEL           =  8, /* Independent vowel */
    Mymr_CC_ZERO_WIDTH_NJ_MARK  =  9, /* Zero Width non joiner character (0x200C) */
    Mymr_CC_VIRAMA              = 10, /* Subscript consonant combining character */
    Mymr_CC_PRE_VOWEL           = 11, /* Dependent vowel, prebase (Vowel e) */
    Mymr_CC_BELOW_VOWEL         = 12, /* Dependent vowel, below base (Vowel u, uu) */
    Mymr_CC_ABOVE_VOWEL         = 13, /* Dependent vowel, above base (Vowel i, ii, ai) */
    Mymr_CC_POST_VOWEL          = 14, /* Dependent vowel, after base (Vowel aa) */
    Mymr_CC_SIGN_ABOVE          = 15,
    Mymr_CC_SIGN_BELOW          = 16,
    Mymr_CC_SIGN_AFTER          = 17,
    Mymr_CC_ZERO_WIDTH_J_MARK   = 18, /* Zero width joiner character */
    Mymr_CC_COUNT               = 19  /* This is the number of character classes */
};
2046
// Flag bits OR-ed onto a MymrCharClassValues value to form a MymrCharClass.
// The low 16 bits (Mymr_CF_CLASS_MASK) hold the class itself.
enum MymrCharClassFlags {
    Mymr_CF_CLASS_MASK    = 0x0000FFFF,

    Mymr_CF_CONSONANT     = 0x01000000,  /* flag to speed up comparing */
    Mymr_CF_MEDIAL        = 0x02000000,  /* flag to speed up comparing */
    Mymr_CF_IND_VOWEL     = 0x04000000,  /* flag to speed up comparing */
    Mymr_CF_DEP_VOWEL     = 0x08000000,  /* flag to speed up comparing */
    Mymr_CF_DOTTED_CIRCLE = 0x10000000,  /* add a dotted circle if a character with this flag is the
                                            first in a syllable */
    Mymr_CF_VIRAMA        = 0x20000000,  /* flag to speed up comparing */

    /* position flags */
    Mymr_CF_POS_BEFORE    = 0x00080000,
    Mymr_CF_POS_BELOW     = 0x00040000,
    Mymr_CF_POS_ABOVE     = 0x00020000,
    Mymr_CF_POS_AFTER     = 0x00010000,
    Mymr_CF_POS_MASK      = 0x000f0000,

    Mymr_CF_AFTER_KINZI   = 0x00100000
};
2067
// Declares int-valued operators that mix the two enums; the Mymr_xx .. Mymr_sp
// constants combine a MymrCharClassValues value with MymrCharClassFlags bits
// using '|'.
Q_DECLARE_MIXED_ENUM_OPERATORS(int, MymrCharClassValues, MymrCharClassFlags)
2069
/* Characters that get referred to by name */
enum MymrChar
{
    Mymr_C_SIGN_ZWNJ     = 0x200C,
    Mymr_C_SIGN_ZWJ      = 0x200D,
    Mymr_C_DOTTED_CIRCLE = 0x25CC,
    Mymr_C_RA            = 0x101B,
    Mymr_C_YA            = 0x101A,
    Mymr_C_NGA           = 0x1004,
    Mymr_C_VOWEL_E       = 0x1031,
    Mymr_C_VIRAMA        = 0x1039
};
2082
2083	enum
2084	{
2085	Mymr_xx = Mymr_CC_RESERVED,
2086	Mymr_c1 = Mymr_CC_CONSONANT \| Mymr_CF_CONSONANT \| Mymr_CF_POS_BELOW,
2087	Mymr_c2 = Mymr_CC_CONSONANT2 \| Mymr_CF_CONSONANT,
2088	Mymr_ng = Mymr_CC_NGA \| Mymr_CF_CONSONANT \| Mymr_CF_POS_ABOVE,
2089	Mymr_ya = Mymr_CC_YA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_AFTER \| Mymr_CF_AFTER_KINZI,
2090	Mymr_ra = Mymr_CC_RA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BEFORE,
2091	Mymr_wa = Mymr_CC_WA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BELOW,
2092	Mymr_ha = Mymr_CC_HA \| Mymr_CF_CONSONANT \| Mymr_CF_MEDIAL \| Mymr_CF_POS_BELOW,
2093	Mymr_id = Mymr_CC_IND_VOWEL \| Mymr_CF_IND_VOWEL,
2094	Mymr_vi = Mymr_CC_VIRAMA \| Mymr_CF_VIRAMA \| Mymr_CF_POS_ABOVE \| Mymr_CF_DOTTED_CIRCLE,
2095	Mymr_dl = Mymr_CC_PRE_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_BEFORE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
2096	Mymr_db = Mymr_CC_BELOW_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_BELOW \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
2097	Mymr_da = Mymr_CC_ABOVE_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_ABOVE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
2098	Mymr_dr = Mymr_CC_POST_VOWEL \| Mymr_CF_DEP_VOWEL \| Mymr_CF_POS_AFTER \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI,
2099	Mymr_sa = Mymr_CC_SIGN_ABOVE \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_POS_ABOVE \| Mymr_CF_AFTER_KINZI,
2100	Mymr_sb = Mymr_CC_SIGN_BELOW \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_POS_BELOW \| Mymr_CF_AFTER_KINZI,
2101	Mymr_sp = Mymr_CC_SIGN_AFTER \| Mymr_CF_DOTTED_CIRCLE \| Mymr_CF_AFTER_KINZI
2102	};
2103
2104
// A class value from MymrCharClassValues combined with MymrCharClassFlags bits.
using MymrCharClass = int;
2106
2107
2108	static const MymrCharClass mymrCharClasses[] =
2109	{
2110	Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_ng, Mymr_c1, Mymr_c1, Mymr_c1,
2111	Mymr_c1, Mymr_c1, Mymr_c2, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, / 1000 - 100F /
2112	Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1, Mymr_c1,
2113	Mymr_c1, Mymr_c1, Mymr_ya, Mymr_ra, Mymr_c1, Mymr_wa, Mymr_c1, Mymr_ha, / 1010 - 101F /
2114	Mymr_c2, Mymr_c2, Mymr_xx, Mymr_id, Mymr_id, Mymr_id, Mymr_id, Mymr_id,
2115	Mymr_xx, Mymr_id, Mymr_id, Mymr_xx, Mymr_dr, Mymr_da, Mymr_da, Mymr_db, / 1020 - 102F /
2116	Mymr_db, Mymr_dl, Mymr_da, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_sa, Mymr_sb,
2117	Mymr_sp, Mymr_vi, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1030 - 103F /
2118	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
2119	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1040 - 104F /
2120	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx,
2121	Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, Mymr_xx, / 1050 - 105F /
2122	};
2123
2124	static MymrCharClass
2125	getMyanmarCharClass (ushort ch)
2126	{
2127	if (ch == Mymr_C_SIGN_ZWJ)
2128	return Mymr_CC_ZERO_WIDTH_J_MARK;
2129
2130	if (ch == Mymr_C_SIGN_ZWNJ)
2131	return Mymr_CC_ZERO_WIDTH_NJ_MARK;
2132
2133	if (ch < `0x1000` \|\| ch > `0x105f`)
2134	return Mymr_CC_RESERVED;
2135
2136	return mymrCharClasses[ch - `0x1000`];
2137	}
2138
2139	static const signed char mymrStateTable[][Mymr_CC_COUNT] =
2140	{
2141	/ xx c1, c2 ng ya ra wa ha id zwnj vi dl db da dr sa sb sp zwj /
2142	{ `1`, `4`, `4`, `2`, `4`, `4`, `4`, `4`, `24`, `1`, `27`, `17`, `18`, `19`, `20`, `21`, `1`, `1`, `4`}, / 0 - ground state /
2143	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 1 - exit state (or sp to the right of the syllable) /
2144	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `3`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, `4`}, / 2 - NGA /
2145	{-`1`, `4`, `4`, `4`, `4`, `4`, `4`, `4`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 3 - Virama after NGA /
2146	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `5`, `17`, `18`, `19`, `20`, `21`, `1`, `1`, -`1`}, / 4 - Base consonant /
2147	{-`2`, `6`, -`2`, -`2`, `7`, `8`, `9`, `10`, -`2`, `23`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 5 - First virama /
2148	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `25`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 6 - c1 after virama /
2149	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 7 - ya after virama /
2150	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 8 - ra after virama /
2151	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 9 - wa after virama /
2152	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 10 - ha after virama /
2153	{-`1`, -`1`, -`1`, -`1`, `7`, `8`, `9`, `10`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 11 - Virama after NGA+zwj /
2154	{-`2`, -`2`, -`2`, -`2`, -`2`, -`2`, `13`, `14`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 12 - Second virama /
2155	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `15`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 13 - wa after virama /
2156	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 14 - ha after virama /
2157	{-`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, `16`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 15 - Third virama /
2158	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `18`, `19`, `20`, `21`, -`1`, -`1`, -`1`}, / 16 - ha after virama /
2159	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `20`, `21`, `1`, `1`, -`1`}, / 17 - dl, Dependent vowel e /
2160	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `19`, -`1`, `21`, `1`, `1`, -`1`}, / 18 - db, Dependent vowel u,uu /
2161	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, `1`, -`1`}, / 19 - da, Dependent vowel i,ii,ai /
2162	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `22`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 20 - dr, Dependent vowel aa /
2163	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 21 - sa, Sign anusvara /
2164	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 22 - atha /
2165	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `1`, -`1`}, / 23 - zwnj for atha /
2166	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, -`1`}, / 24 - Independent vowel /
2167	{-`2`, -`2`, -`2`, -`2`, `26`, `26`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`, -`2`}, / 25 - Virama after subscript consonant /
2168	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `12`, `17`, `18`, `19`, `20`, `21`, -`1`, `1`, -`1`}, / 26 - ra/ya after subscript consonant + virama /
2169	{-`1`, `6`, -`1`, -`1`, `7`, `8`, `9`, `10`, -`1`, `23`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 27 - Virama after ground state /
2170	/ exit state -2 is for invalid order of medials and combination of invalids*
2171	with virama where virama should treat as start of next syllable
2172	*/
2173	};
2174
// Debug tracing for the Myanmar syllable scanner.
// Uncomment the following line to make MMDEBUG(...) expand to qDebug(...).
// #define MYANMAR_DEBUG
#ifdef MYANMAR_DEBUG
#  define MMDEBUG qDebug
#else
// Tracing disabled: same shape as IDEBUG. The call sits in an else branch
// that is never taken, and the macro cannot capture a following "else".
#  define MMDEBUG if constexpr (1) ; else qDebug
#endif
2183
2184	/*
2185	// Given an input string of characters and a location in which to start looking
2186	// calculate, using the state table, which one is the last character of the syllable
2187	// that starts in the starting position.
2188	*/
2189	static qsizetype myanmar_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
2190	{
2191	const char16_t *uc = s + start;
2192	int state = `0`;
2193	qsizetype pos = start;
2194	invalid = false*;
2195
2196	while (pos < end) {
2197	MymrCharClass charClass = getMyanmarCharClass(ch: *uc);
2198	state = mymrStateTable[state][charClass & Mymr_CF_CLASS_MASK];
2199	if (pos == start)
2200	invalid = (bool*)(charClass & Mymr_CF_DOTTED_CIRCLE);
2201
2202	MMDEBUG(format: "state[%lld]=%d class=%8x (uc=%4x)", qlonglong(pos - start), state, charClass, *uc);
2203
2204	if (state < `0`) {
2205	if (state < -`1`)
2206	--pos;
2207	break;
2208	}
2209	++uc;
2210	++pos;
2211	}
2212	return pos;
2213	}
2214
2215	static void myanmarAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
2216	{
2217	qsizetype end = from + len;
2218	qsizetype i = `0`;
2219	Q_UNUSED(script);
2220	attributes += from;
2221	while (i < len) {
2222	bool invalid;
2223	qsizetype boundary = myanmar_nextSyllableBoundary(s: text, start: from+i, end, invalid: &invalid) - from;
2224
2225	attributes[i].graphemeBoundary = true;
2226	attributes[i].lineBreak = true;
2227
2228	if (boundary > len-`1`)
2229	boundary = len;
2230	i++;
2231	while (i < boundary) {
2232	attributes[i].graphemeBoundary = false;
2233	++i;
2234	}
2235	assert(i == boundary);
2236	}
2237	}
2238
2239	/*
2240	// Vocabulary
2241	// Base -> A consonant or an independent vowel in its full (not subscript) form. It is the
2242	// center of the syllable, it can be surrounded by coeng (subscript) consonants, vowels,
2243	// split vowels, signs... but there is only one base in a syllable, it has to be coded as
2244	// the first character of the syllable.
2245	// split vowel --> vowel that has two parts placed separately (e.g. Before and after the consonant).
2246	// Khmer language has five of them. Khmer split vowels either have one part before the
2247	// base and one after the base or they have a part before the base and a part above the base.
2248	// The first part of all Khmer split vowels is the same character, identical to
2249	// the glyph of Khmer dependent vowel SRA EI
2250	// coeng --> modifier used in Khmer to construct coeng (subscript) consonants
2251	// Differently than indian languages, the coeng modifies the consonant that follows it,
2252	// not the one preceding it Each consonant has two forms, the base form and the subscript form
2253	// the base form is the normal one (using the consonants code-point), the subscript form is
2254	// displayed when the combination coeng + consonant is encountered.
// Consonant of type 1 -> A consonant whose subscript form only occupies space under a base consonant
// Consonant of type 2 -> Its subscript form occupies space under and before the base (only one, RO)
// Consonant of type 3 -> Its subscript form occupies space under and after the base (KHO, CHHO, THHO, BA, YO, SA)
// Consonant shifter -> Khmer has two series of consonants. The same dependent vowel has different sounds
// if it is attached to a consonant of the first series or a consonant of the second series.
// Most consonants have an equivalent in the other series, but some of them exist only in
// one series (for example SA). If we want to use the consonant SA with a vowel sound that
// can only be done with a vowel sound that corresponds to a vowel accompanying a consonant
// of the other series, then we need to use a consonant shifter: TRIISAP or MUSIKATOAN
// (x17C9 and x17CA). TRIISAP changes a first series consonant to second series sound and
// MUSIKATOAN a second series consonant to have a first series vowel sound.
// Consonant shifters are both normally superscript marks, but, when they are followed by a
// superscript, they change shape and take the form of subscript dependent vowel SRA U.
2268	// If they are in the same syllable as a coeng consonant, Unicode 3.0 says that they
2269	// should be typed before the coeng. Unicode 4.0 breaks the standard and says that it should
2270	// be placed after the coeng consonant.
2271	// Dependent vowel -> In khmer dependent vowels can be placed above, below, before or after the base
2272	// Each vowel has its own position. Only one vowel per syllable is allowed.
2273	// Signs -> Khmer has above signs and post signs. Only one above sign and/or one post sign are
2274	// Allowed in a syllable.
2275	//
2276	//
2277	// order is important here! This order must be the same that is found in each horizontal
2278	// line in the statetable for Khmer (see khmerStateTable) .
2279	*/
2280	enum KhmerCharClassValues {
2281	CC_RESERVED = `0`,
2282	CC_CONSONANT = `1`, / Consonant of type 1 or independent vowel /
2283	CC_CONSONANT2 = `2`, / Consonant of type 2 /
2284	CC_CONSONANT3 = `3`, / Consonant of type 3 /
2285	CC_ZERO_WIDTH_NJ_MARK = `4`, / Zero Width non joiner character (0x200C) /
2286	CC_CONSONANT_SHIFTER = `5`,
2287	CC_ROBAT = `6`, / Khmer special diacritic accent -treated differently in state table /
2288	CC_COENG = `7`, / Subscript consonant combining character /
2289	CC_DEPENDENT_VOWEL = `8`,
2290	CC_SIGN_ABOVE = `9`,
2291	CC_SIGN_AFTER = `10`,
2292	CC_ZERO_WIDTH_J_MARK = `11`, / Zero width joiner character /
2293	CC_COUNT = `12` / This is the number of character classes /
2294	};
2295
2296
/*
//  Flag bits that are ORed onto a KhmerCharClassValues value to form a full
//  character class. The low 16 bits (CF_CLASS_MASK) hold the class value itself,
//  bits 16-19 (CF_POS_MASK) hold the position flags, and the bits from 24
//  upwards hold the remaining property flags.
*/
enum KhmerCharClassFlags {
    CF_CLASS_MASK = 0x0000FFFF,

    CF_CONSONANT = 0x01000000,      /* flag to speed up comparing */
    CF_SPLIT_VOWEL = 0x02000000,    /* flag for a split vowel -> the first part is added in front of the syllable */
    CF_DOTTED_CIRCLE = 0x04000000,  /* add a dotted circle if a character with this flag is the first in a syllable */
    CF_COENG = 0x08000000,          /* flag to speed up comparing */
    CF_SHIFTER = 0x10000000,        /* flag to speed up comparing */
    CF_ABOVE_VOWEL = 0x20000000,    /* flag to speed up comparing */

    /* position flags */
    CF_POS_BEFORE = 0x00080000,
    CF_POS_BELOW = 0x00040000,
    CF_POS_ABOVE = 0x00020000,
    CF_POS_AFTER = 0x00010000,
    CF_POS_MASK = 0x000f0000
};
2314
// The character class shorthands in this file OR a KhmerCharClassValues value with
// KhmerCharClassFlags bits (e.g. CC_SIGN_ABOVE | CF_DOTTED_CIRCLE). This presumably
// declares the bitwise operators for that mixed-enum combination with an int result,
// so such expressions do not rely on implicit arithmetic between two different
// enumeration types -- NOTE(review): confirm against the macro definition in Qt's qflags.h.
Q_DECLARE_MIXED_ENUM_OPERATORS(int, KhmerCharClassValues, KhmerCharClassFlags)
2316
/* Characters that get referred to by name (values are UTF-16 code units) */
enum KhmerChar {
    C_SIGN_ZWNJ = 0x200C,
    C_SIGN_ZWJ = 0x200D,
    C_RO = 0x179A,
    C_VOWEL_AA = 0x17B6,
    C_SIGN_NIKAHIT = 0x17C6,
    C_VOWEL_E = 0x17C1,
    C_COENG = 0x17D2
};
2327
2328
2329	/*
2330	// simple classes, they are used in the statetable (in this file) to control the length of a syllable
2331	// they are also used to know where a character should be placed (location in reference to the base character)
2332	// and also to know if a character, when independently displayed, should be displayed with a dotted-circle to
2333	// indicate error in syllable construction
2334	*/
2335	enum {
2336	_xx = CC_RESERVED,
2337	_sa = CC_SIGN_ABOVE \| CF_DOTTED_CIRCLE \| CF_POS_ABOVE,
2338	_sp = CC_SIGN_AFTER \| CF_DOTTED_CIRCLE\| CF_POS_AFTER,
2339	_c1 = CC_CONSONANT \| CF_CONSONANT,
2340	_c2 = CC_CONSONANT2 \| CF_CONSONANT,
2341	_c3 = CC_CONSONANT3 \| CF_CONSONANT,
2342	_rb = CC_ROBAT \| CF_POS_ABOVE \| CF_DOTTED_CIRCLE,
2343	_cs = CC_CONSONANT_SHIFTER \| CF_DOTTED_CIRCLE \| CF_SHIFTER,
2344	_dl = CC_DEPENDENT_VOWEL \| CF_POS_BEFORE \| CF_DOTTED_CIRCLE,
2345	_db = CC_DEPENDENT_VOWEL \| CF_POS_BELOW \| CF_DOTTED_CIRCLE,
2346	_da = CC_DEPENDENT_VOWEL \| CF_POS_ABOVE \| CF_DOTTED_CIRCLE \| CF_ABOVE_VOWEL,
2347	_dr = CC_DEPENDENT_VOWEL \| CF_POS_AFTER \| CF_DOTTED_CIRCLE,
2348	_co = CC_COENG \| CF_COENG \| CF_DOTTED_CIRCLE,
2349
2350	/ split vowel /
2351	_va = _da \| CF_SPLIT_VOWEL,
2352	_vr = _dr \| CF_SPLIT_VOWEL
2353	};
2354
2355
/*
//  Character class: a character class value (KhmerCharClassValues)
//  ORed with character class flags (KhmerCharClassFlags).
*/
using KhmerCharClass = unsigned long;
2361
2362
2363	/*
2364	// Character class tables
2365	// _xx character does not combine into syllable, such as numbers, puntuation marks, non-Khmer signs...
2366	// _sa Sign placed above the base
2367	// _sp Sign placed after the base
2368	// _c1 Consonant of type 1 or independent vowel (independent vowels behave as type 1 consonants)
2369	// _c2 Consonant of type 2 (only RO)
2370	// _c3 Consonant of type 3
2371	// _rb Khmer sign robat u17CC. combining mark for subscript consonants
2372	// _cd Consonant-shifter
2373	// _dl Dependent vowel placed before the base (left of the base)
2374	// _db Dependent vowel placed below the base
2375	// _da Dependent vowel placed above the base
2376	// _dr Dependent vowel placed behind the base (right of the base)
2377	// _co Khmer combining mark COENG u17D2, combines with the consonant or independent vowel following
2378	// it to create a subscript consonant or independent vowel
2379	// _va Khmer split vowel in which the first part is before the base and the second one above the base
2380	// _vr Khmer split vowel in which the first part is before the base and the second one behind (right of) the base
2381	*/
2382	static const KhmerCharClass khmerCharClasses[] = {
2383	_c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c1, _c1, / 1780 - 178F /
2384	_c1, _c1, _c1, _c1, _c3, _c1, _c1, _c1, _c1, _c3, _c2, _c1, _c1, _c1, _c3, _c3, / 1790 - 179F /
2385	_c1, _c3, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, _c1, / 17A0 - 17AF /
2386	_c1, _c1, _c1, _c1, _dr, _dr, _dr, _da, _da, _da, _da, _db, _db, _db, _va, _vr, / 17B0 - 17BF /
2387	_vr, _dl, _dl, _dl, _vr, _vr, _sa, _sp, _sp, _cs, _cs, _sa, _rb, _sa, _sa, _sa, / 17C0 - 17CF /
2388	_sa, _sa, _co, _sa, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _xx, _sa, _xx, _xx / 17D0 - 17DF /
2389	};
2390
2391	/ this enum must reflect the range of khmerCharClasses /
2392	enum KhmerCharClassesRange {
2393	KhmerFirstChar = `0x1780`,
2394	KhmerLastChar = `0x17df`
2395	};
2396
2397	/*
2398	// Below we define how a character in the input string is either in the khmerCharClasses table
2399	// (in which case we get its type back), a ZWJ or ZWNJ (two characters that may appear
2400	// within the syllable, but are not in the table) we also get their type back, or an unknown object
2401	// in which case we get _xx (CC_RESERVED) back
2402	*/
2403	static KhmerCharClass getKhmerCharClass(ushort uc)
2404	{
2405	if (uc == C_SIGN_ZWJ) {
2406	return CC_ZERO_WIDTH_J_MARK;
2407	}
2408
2409	if (uc == C_SIGN_ZWNJ) {
2410	return CC_ZERO_WIDTH_NJ_MARK;
2411	}
2412
2413	if (uc < KhmerFirstChar \|\| uc > KhmerLastChar) {
2414	return CC_RESERVED;
2415	}
2416
2417	return khmerCharClasses[uc - KhmerFirstChar];
2418	}
2419
2420
2421	/*
2422	// The stateTable is used to calculate the end (the length) of a well
2423	// formed Khmer Syllable.
2424	//
2425	// Each horizontal line is ordered exactly the same way as the values in KhmerClassTable
2426	// CharClassValues. This coincidence of values allows the follow up of the table.
2427	//
2428	// Each line corresponds to a state, which does not necessarily need to be a type
2429	// of component... for example, state 2 is a base, with is always a first character
2430	// in the syllable, but the state could be produced a consonant of any type when
2431	// it is the first character that is analysed (in ground state).
2432	//
2433	// Differentiating 3 types of consonants is necessary in order to
2434	// forbid the use of certain combinations, such as having a second
2435	// coeng after a coeng RO,
2436	// The inexistent possibility of having a type 3 after another type 3 is permitted,
2437	// eliminating it would very much complicate the table, and it does not create typing
2438	// problems, as the case above.
2439	//
2440	// The table is quite complex, in order to limit the number of coeng consonants
2441	// to 2 (by means of the table).
2442	//
2443	// There a peculiarity, as far as Unicode is concerned:
2444	// - The consonant-shifter is considered in two possible different
2445	// locations, the one considered in Unicode 3.0 and the one considered in
2446	// Unicode 4.0. (there is a backwards compatibility problem in this standard).
2447	//
2448	//
2449	// xx independent character, such as a number, punctuation sign or non-khmer char
2450	//
2451	// c1 Khmer consonant of type 1 or an independent vowel
2452	// that is, a letter in which the subscript for is only under the
2453	// base, not taking any space to the right or to the left
2454	//
2455	// c2 Khmer consonant of type 2, the coeng form takes space under
2456	// and to the left of the base (only RO is of this type)
2457	//
2458	// c3 Khmer consonant of type 3. Its subscript form takes space under
2459	// and to the right of the base.
2460	//
2461	// cs Khmer consonant shifter
2462	//
2463	// rb Khmer robat
2464	//
2465	// co coeng character (u17D2)
2466	//
2467	// dv dependent vowel (including split vowels, they are treated in the same way).
2468	// even if dv is not defined above, the component that is really tested for is
2469	// KhmerClassTable::CC_DEPENDENT_VOWEL, which is common to all dependent vowels
2470	//
2471	// zwj Zero Width joiner
2472	//
2473	// zwnj Zero width non joiner
2474	//
2475	// sa above sign
2476	//
2477	// sp post sign
2478	//
2479	// there are lines with equal content but for an easier understanding
2480	// (and maybe change in the future) we did not join them
2481	*/
2482	static const signed char khmerStateTable[][CC_COUNT] =
2483	{
2484	/ xx c1 c2 c3 zwnj cs rb co dv sa sp zwj /
2485	{ `1`, `2`, `2`, `2`, `1`, `1`, `1`, `6`, `1`, `1`, `1`, `2`}, / 0 - ground state /
2486	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 1 - exit state (or sign to the right of the syllable) /
2487	{-`1`, -`1`, -`1`, -`1`, `3`, `4`, `5`, `6`, `16`, `17`, `1`, -`1`}, / 2 - Base consonant /
2488	{-`1`, -`1`, -`1`, -`1`, -`1`, `4`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 3 - First ZWNJ before a register shifter It can only be followed by a shifter or a vowel /
2489	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, `6`, `16`, `17`, `1`, `14`}, / 4 - First register shifter /
2490	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `20`, -`1`, `1`, -`1`}, / 5 - Robat /
2491	{-`1`, `7`, `8`, `9`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 6 - First Coeng /
2492	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, `10`, `16`, `17`, `1`, `14`}, / 7 - First consonant of type 1 after coeng /
2493	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 8 - First consonant of type 2 after coeng /
2494	{-`1`, -`1`, -`1`, -`1`, `12`, `13`, -`1`, `10`, `16`, `17`, `1`, `14`}, / 9 - First consonant or type 3 after ceong /
2495	{-`1`, `11`, `11`, `11`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 10 - Second Coeng (no register shifter before) /
2496	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 11 - Second coeng consonant (or ind. vowel) no register shifter before /
2497	{-`1`, -`1`, -`1`, -`1`, -`1`, `13`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 12 - Second ZWNJ before a register shifter /
2498	{-`1`, -`1`, -`1`, -`1`, `15`, -`1`, -`1`, -`1`, `16`, `17`, `1`, `14`}, / 13 - Second register shifter /
2499	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 14 - ZWJ before vowel /
2500	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `16`, -`1`, -`1`, -`1`}, / 15 - ZWNJ before vowel /
2501	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `17`, `1`, `18`}, / 16 - dependent vowel /
2502	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, `18`}, / 17 - sign above /
2503	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `19`, -`1`, -`1`, -`1`, -`1`}, / 18 - ZWJ after vowel /
2504	{-`1`, `1`, -`1`, `1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`}, / 19 - Third coeng /
2505	{-`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, -`1`, `1`, -`1`}, / 20 - dependent vowel after a Robat /
2506	};
2507
2508
/*
//  Debug tracing for the Khmer syllable scanner.
//  Define KHMER_DEBUG to route the trace output to qDebug. Otherwise the call is still
//  type-checked against printf but never executed.
//
//  The disabled form expands to `while (false) printf` rather than `if (0) printf`, so that
//  a KHDEBUG(...) statement used as the body of an `if` cannot capture a following `else`.
*/
/* #define KHMER_DEBUG */
#ifdef KHMER_DEBUG
#define KHDEBUG qDebug
#else
#  define KHDEBUG \
     while (false) \
         printf
#endif
2517
2518	/*
2519	// Given an input string of characters and a location in which to start looking
2520	// calculate, using the state table, which one is the last character of the syllable
2521	// that starts in the starting position.
2522	*/
2523	static qsizetype khmer_nextSyllableBoundary(const char16_t s, qsizetype start, qsizetype end, bool* *invalid)
2524	{
2525	const char16_t *uc = s + start;
2526	int state = `0`;
2527	qsizetype pos = start;
2528	invalid = false*;
2529
2530	while (pos < end) {
2531	KhmerCharClass charClass = getKhmerCharClass(uc: *uc);
2532	if (pos == start) {
2533	*invalid = (charClass > `0`) && ! (charClass & CF_CONSONANT);
2534	}
2535	state = khmerStateTable[state][charClass & CF_CLASS_MASK];
2536
2537	KHDEBUG(format: "state[%lld]=%d class=%8lx (uc=%4x)", qlonglong(pos - start), state,
2538	charClass, *uc );
2539
2540	if (state < `0`) {
2541	break;
2542	}
2543	++uc;
2544	++pos;
2545	}
2546	return pos;
2547	}
2548
2549	static void khmerAttributes(QChar::Script script, const char16_t text, qsizetype from, qsizetype len, QCharAttributes attributes)
2550	{
2551	qsizetype end = from + len;
2552	qsizetype i = `0`;
2553	Q_UNUSED(script);
2554	attributes += from;
2555	while ( i < len ) {
2556	bool invalid;
2557	qsizetype boundary = khmer_nextSyllableBoundary( s: text, start: from+i, end, invalid: &invalid ) - from;
2558
2559	attributes[i].graphemeBoundary = true;
2560
2561	if ( boundary > len-`1` ) boundary = len;
2562	i++;
2563	while ( i < boundary ) {
2564	attributes[i].graphemeBoundary = false;
2565	++i;
2566	}
2567	assert( i == boundary );
2568	}
2569	}
2570
2571
2572	const CharAttributeFunction charAttributeFunction[] = {
2573	// Script_Unknown,
2574	nullptr,
2575	// Script_Inherited,
2576	nullptr,
2577	// Script_Common,
2578	nullptr,
2579	// Script_Latin,
2580	nullptr,
2581	// Script_Greek,
2582	nullptr,
2583	// Script_Cyrillic,
2584	nullptr,
2585	// Script_Armenian,
2586	nullptr,
2587	// Script_Hebrew,
2588	nullptr,
2589	// Script_Arabic,
2590	nullptr,
2591	// Script_Syriac,
2592	nullptr,
2593	// Script_Thaana,
2594	nullptr,
2595	// Script_Devanagari,
2596	indicAttributes,
2597	// Script_Bengali,
2598	indicAttributes,
2599	// Script_Gurmukhi,
2600	indicAttributes,
2601	// Script_Gujarati,
2602	indicAttributes,
2603	// Script_Oriya,
2604	indicAttributes,
2605	// Script_Tamil,
2606	indicAttributes,
2607	// Script_Telugu,
2608	indicAttributes,
2609	// Script_Kannada,
2610	indicAttributes,
2611	// Script_Malayalam,
2612	indicAttributes,
2613	// Script_Sinhala,
2614	indicAttributes,
2615	// Script_Thai,
2616	thaiAttributes,
2617	// Script_Lao,
2618	nullptr,
2619	// Script_Tibetan,
2620	tibetanAttributes,
2621	// Script_Myanmar,
2622	myanmarAttributes,
2623	// Script_Georgian,
2624	nullptr,
2625	// Script_Hangul,
2626	nullptr,
2627	// Script_Ethiopic,
2628	nullptr,
2629	// Script_Cherokee,
2630	nullptr,
2631	// Script_CanadianAboriginal,
2632	nullptr,
2633	// Script_Ogham,
2634	nullptr,
2635	// Script_Runic,
2636	nullptr,
2637	// Script_Khmer,
2638	khmerAttributes
2639	};
2640
2641	static void getCharAttributes(const char16_t *string, qsizetype stringLength,
2642	const QUnicodeTools::ScriptItem *items, qsizetype numItems,
2643	QCharAttributes *attributes)
2644	{
2645	if (stringLength == `0`)
2646	return;
2647	for (qsizetype i = `0`; i < numItems; ++i) {
2648	QChar::Script script = items[i].script;
2649	if (script > QChar::Script_Khmer)
2650	script = QChar::Script_Common;
2651	CharAttributeFunction attributeFunction = charAttributeFunction[script];
2652	if (!attributeFunction)
2653	continue;
2654	qsizetype end = i < numItems - `1` ? items[i + `1`].position : stringLength;
2655	attributeFunction(script, string, items[i].position, end - items[i].position, attributes);
2656	}
2657	}
2658
2659	}
2660
2661	Q_CORE_EXPORT void initCharAttributes(QStringView string,
2662	const ScriptItem *items, qsizetype numItems,
2663	QCharAttributes *attributes, CharAttributeOptions options)
2664	{
2665	if (string.size() <= `0`)
2666	return;
2667
2668	if (!(options & DontClearAttributes))
2669	::memset(s: attributes, c: `0`, n: (string.size() + `1`) * sizeof(QCharAttributes));
2670
2671	if (options & GraphemeBreaks)
2672	getGraphemeBreaks(string: string.utf16(), len: string.size(), attributes);
2673	if (options & WordBreaks)
2674	getWordBreaks(string: string.utf16(), len: string.size(), attributes);
2675	if (options & SentenceBreaks)
2676	getSentenceBreaks(string: string.utf16(), len: string.size(), attributes);
2677	if (options & LineBreaks)
2678	getLineBreaks(string: string.utf16(), len: string.size(), attributes, options);
2679	if (options & WhiteSpaces)
2680	getWhiteSpaces(string: string.utf16(), len: string.size(), attributes);
2681
2682	if (!qt_initcharattributes_default_algorithm_only) {
2683	if (!items \|\| numItems <= `0`)
2684	return;
2685
2686	Tailored::getCharAttributes(string: string.utf16(), stringLength: string.size(), items, numItems, attributes);
2687	}
2688	}
2689
2690
2691	// ----------------------------------------------------------------------------
2692	//
2693	// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
2694	//
2695	// ----------------------------------------------------------------------------
2696
2697	Q_CORE_EXPORT void initScripts(QStringView string, ScriptItemArray *scripts)
2698	{
2699	qsizetype sor = `0`;
2700	qsizetype eor = `0`;
2701	QChar::Script script = QChar::Script_Common;
2702
2703	for (qsizetype i = `0`; i < string.size(); ++i, eor = i) {
2704	char32_t ucs4 = string [i].unicode();
2705	if (QChar::isHighSurrogate(ucs4) && i + `1` < string.size()) {
2706	ushort low = string [i + `1`].unicode();
2707	if (QChar::isLowSurrogate(ucs4: low)) {
2708	ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
2709	++i;
2710	}
2711	}
2712
2713	const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
2714
2715	QChar::Script nscript = QChar::Script(prop->script);
2716
2717	if (Q_LIKELY(nscript == script \|\| nscript <= QChar::Script_Common))
2718	continue;
2719
2720	// inherit preceding Common-s
2721	if (Q_UNLIKELY(script <= QChar::Script_Common)) {
2722	// also covers a case where the base character of Common script followed
2723	// by one or more combining marks of non-Inherited, non-Common script
2724	script = nscript;
2725	continue;
2726	}
2727
2728	// Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
2729	// Thus, a combining mark - whatever its script property value is - should inherit
2730	// the script property value of its base character.
2731	static const int test = (FLAG(QChar::Mark_NonSpacing) \| FLAG(QChar::Mark_SpacingCombining) \| FLAG(QChar::Mark_Enclosing));
2732	if (Q_UNLIKELY(FLAG(prop->category) & test))
2733	continue;
2734
2735	Q_ASSERT(script > QChar::Script_Common);
2736	Q_ASSERT(sor < eor);
2737	scripts->append(t: ScriptItem{.position: sor, .script: script});
2738	sor = eor;
2739
2740	script = nscript;
2741	}
2742
2743	Q_ASSERT(script >= QChar::Script_Common);
2744	Q_ASSERT(eor == string.size());
2745	scripts->append(t: ScriptItem{.position: sor, .script: script});
2746	}
2747
2748	} // namespace QUnicodeTools
2749
2750	QT_END_NAMESPACE
2751

Provided by KDAB

Definitions

qt_initcharattributes_default_algorithm_only
Extend_SpacingMark_ZWJ
HardBreak
breakTable
shouldBreakBetweenClasses
State
getGraphemeBreaks
Action
breakTable
getWordBreaks
State
breakTable
getSentenceBreaks
Action
Class
actionTable
toClass
DottedCircle
State
LinebreakUnit
ParseState
updateState
Action
breakTable
getLineBreaks
getWhiteSpaces
Form
indicForms
form
indic_nextSyllableBoundary
indicAttributes
thcell_t
LibThai
LibThai
LibThai
~LibThai
isInitialized
brk_find_breaks
next_cell
g_libThai
to_tis620
thaiAssignAttributes
thaiAttributes
tibetanForm
tibetan_nextSyllableBoundary
tibetanAttributes
MymrCharClassValues
MymrCharClassFlags
MymrChar
mymrCharClasses
getMyanmarCharClass
mymrStateTable
myanmar_nextSyllableBoundary
myanmarAttributes
KhmerCharClassValues
KhmerCharClassFlags
KhmerChar
khmerCharClasses
KhmerCharClassesRange
getKhmerCharClass
khmerStateTable
khmer_nextSyllableBoundary
khmerAttributes
charAttributeFunction
getCharAttributes
initCharAttributes

Start learning QML with our Intro Training

Find out more

Definitions

source code of qtbase/src/corelib/text/qunicodetools.cpp