1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#include "qunicodetools_p.h"
41
42#include "qunicodetables_p.h"
43#include "qvarlengtharray.h"
44
45#include "qharfbuzz_p.h"
46
47#include <limits.h>
48
// Expands to an int with only bit number x set. Used in this file to build
// and to test per-class bit masks.
#define FLAG(x) (1 << (x))
50
51QT_BEGIN_NAMESPACE
52
// Autotest-exported switch, 0 by default. In QT_BUILD_INTERNAL builds a
// non-zero value makes getWordBreaks() classify U+002E FULL STOP as MidNumLet
// and U+003A COLON as MidLetter instead of using the classes from the tables.
Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
54
55namespace QUnicodeTools {
56
57// -----------------------------------------------------------------------------------------------------
58//
59// The text boundaries determination algorithm.
60// See https://www.unicode.org/reports/tr29/tr29-37.html
61//
62// -----------------------------------------------------------------------------------------------------
63
64namespace GB {
65
66// This table is indexed by the grapheme break classes of two
67// (adjacent) code points.
68// The class of the first code point selects an entry.
69// If the entry's bit at position second_cp_class is set
70// (in other words: if entry & (1u << second_cp_class) is non-zero)
71// then there is NO grapheme break between the two code points.
72
73using GBTableEntryType = quint16;
74
75// Check that we have enough bits in the table (in case
76// NumGraphemeBreakClasses grows too much).
77static_assert(sizeof(GBTableEntryType) * CHAR_BIT >= QUnicodeTables::NumGraphemeBreakClasses,
78 "Internal error: increase the size in bits of GBTableEntryType");
79
80// GB9, GB9a
81static const GBTableEntryType Extend_SpacingMark_ZWJ =
82 FLAG(QUnicodeTables::GraphemeBreak_Extend)
83 | FLAG(QUnicodeTables::GraphemeBreak_SpacingMark)
84 | FLAG(QUnicodeTables::GraphemeBreak_ZWJ);
85
86static const GBTableEntryType HardBreak = 0u;
87
88static const GBTableEntryType breakTable[QUnicodeTables::NumGraphemeBreakClasses] = {
89 Extend_SpacingMark_ZWJ, // Any
90 FLAG(QUnicodeTables::GraphemeBreak_LF), // CR
91 HardBreak, // LF
92 HardBreak, // Control
93 Extend_SpacingMark_ZWJ, // Extend
94 Extend_SpacingMark_ZWJ, // ZWJ
95 Extend_SpacingMark_ZWJ, // RegionalIndicator
96 (Extend_SpacingMark_ZWJ
97 | FLAG(QUnicodeTables::GraphemeBreak_Any)
98 | FLAG(QUnicodeTables::GraphemeBreak_Prepend)
99 | FLAG(QUnicodeTables::GraphemeBreak_L)
100 | FLAG(QUnicodeTables::GraphemeBreak_V)
101 | FLAG(QUnicodeTables::GraphemeBreak_T)
102 | FLAG(QUnicodeTables::GraphemeBreak_LV)
103 | FLAG(QUnicodeTables::GraphemeBreak_LVT)
104 | FLAG(QUnicodeTables::GraphemeBreak_RegionalIndicator)
105 | FLAG(QUnicodeTables::GraphemeBreak_Extended_Pictographic)
106 ), // Prepend
107 Extend_SpacingMark_ZWJ, // SpacingMark
108 (Extend_SpacingMark_ZWJ
109 | FLAG(QUnicodeTables::GraphemeBreak_L)
110 | FLAG(QUnicodeTables::GraphemeBreak_V)
111 | FLAG(QUnicodeTables::GraphemeBreak_LV)
112 | FLAG(QUnicodeTables::GraphemeBreak_LVT)
113 ), // L
114 (Extend_SpacingMark_ZWJ
115 | FLAG(QUnicodeTables::GraphemeBreak_V)
116 | FLAG(QUnicodeTables::GraphemeBreak_T)
117 ), // V
118 (Extend_SpacingMark_ZWJ
119 | FLAG(QUnicodeTables::GraphemeBreak_T)
120 ), // T
121 (Extend_SpacingMark_ZWJ
122 | FLAG(QUnicodeTables::GraphemeBreak_V)
123 | FLAG(QUnicodeTables::GraphemeBreak_T)
124 ), // LV
125 (Extend_SpacingMark_ZWJ
126 | FLAG(QUnicodeTables::GraphemeBreak_T)
127 ), // LVT
128 Extend_SpacingMark_ZWJ // Extended_Pictographic
129};
130
131static bool shouldBreakBetweenClasses(QUnicodeTables::GraphemeBreakClass first,
132 QUnicodeTables::GraphemeBreakClass second)
133{
134 return (breakTable[first] & FLAG(second)) == 0;
135}
136
137// Some rules (GB11, GB12, GB13) cannot be represented by the table alone,
138// so we need to store some local state.
139enum class State : uchar {
140 Normal,
141 GB11_ExtPicExt, // saw a Extend after a Extended_Pictographic
142 GB11_ExtPicExtZWJ, // saw a ZWG after a Extended_Pictographic and zero or more Extend
143 GB12_13_RI, // saw a RegionalIndicator following a non-RegionalIndicator
144};
145
146} // namespace GB
147
148static void getGraphemeBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
149{
150 QUnicodeTables::GraphemeBreakClass lcls = QUnicodeTables::GraphemeBreak_LF; // to meet GB1
151 GB::State state = GB::State::Normal;
152 for (quint32 i = 0; i != len; ++i) {
153 quint32 pos = i;
154 uint ucs4 = string[i];
155 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
156 ushort low = string[i + 1];
157 if (QChar::isLowSurrogate(ucs4: low)) {
158 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
159 ++i;
160 }
161 }
162
163 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
164 QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
165
166 bool shouldBreak = GB::shouldBreakBetweenClasses(first: lcls, second: cls);
167 bool handled = false;
168
169 switch (state) {
170 case GB::State::Normal:
171 break; // will deal with it below
172
173 case GB::State::GB11_ExtPicExt:
174 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
175 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
176 // keep going in the current state
177 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
178 handled = true;
179 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
180 state = GB::State::GB11_ExtPicExtZWJ;
181 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
182 handled = true;
183 } else {
184 state = GB::State::Normal;
185 }
186 break;
187
188 case GB::State::GB11_ExtPicExtZWJ:
189 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
190 if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
191 shouldBreak = false;
192 handled = true;
193 }
194
195 state = GB::State::Normal;
196 break;
197
198 case GB::State::GB12_13_RI:
199 Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
200 if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
201 shouldBreak = false;
202 handled = true;
203 }
204
205 state = GB::State::Normal;
206 break;
207 }
208
209 if (!handled) {
210 Q_ASSERT(state == GB::State::Normal);
211 if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
212 if (cls == QUnicodeTables::GraphemeBreak_Extend) {
213 state = GB::State::GB11_ExtPicExt;
214 Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
215 } else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
216 state = GB::State::GB11_ExtPicExtZWJ;
217 Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
218 }
219 } else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
220 state = GB::State::GB12_13_RI;
221 }
222 }
223
224 if (shouldBreak)
225 attributes[pos].graphemeBoundary = true;
226
227 lcls = cls;
228 }
229
230 attributes[len].graphemeBoundary = true; // GB2
231}
232
233
234namespace WB {
235
236enum Action {
237 NoBreak,
238 Break,
239 Lookup,
240 LookupW
241};
242
243static const uchar breakTable[QUnicodeTables::NumWordBreakClasses][QUnicodeTables::NumWordBreakClasses] = {
244// Any CR LF Newline Extend ZWJ Format RI Katakana HLetter ALetter SQuote DQuote MidNumLet MidLetter MidNum Numeric ExtNumLet E_Base E_Mod GAZ EBG WSeg
245 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Any
246 { Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
247 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
248 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
249 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Extend
250 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, NoBreak, Break }, // ZWJ
251 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Format
252 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // RegionalIndicator
253 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break }, // Katakana
254 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Lookup , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // HebrewLetter
255 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, LookupW, Break , LookupW, LookupW, Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ALetter
256 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // SingleQuote
257 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // DoubleQuote
258 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
259 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
260 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
261 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , NoBreak, NoBreak, Lookup , Break , Lookup , Break , Lookup , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // Numeric
262 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break }, // ExtendNumLet
263 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // E_Base
264 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // E_Mod
265 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // GAZ
266 { Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , NoBreak, Break , Break , Break }, // EBG
267 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // WSeg
268};
269
270} // namespace WB
271
272static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
273{
274 enum WordType {
275 WordTypeNone, WordTypeAlphaNumeric, WordTypeHiraganaKatakana
276 } currentWordType = WordTypeNone;
277
278 QUnicodeTables::WordBreakClass cls = QUnicodeTables::WordBreak_LF; // to meet WB1
279 for (quint32 i = 0; i != len; ++i) {
280 quint32 pos = i;
281 uint ucs4 = string[i];
282 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
283 ushort low = string[i + 1];
284 if (QChar::isLowSurrogate(ucs4: low)) {
285 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
286 ++i;
287 }
288 }
289
290 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
291 QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
292#ifdef QT_BUILD_INTERNAL
293 if (qt_initcharattributes_default_algorithm_only) {
294 // as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
295 // which caused "hi.there" to be treated like if it were just a single word;
296 // we keep the pre-5.1 behavior by remapping these characters in the Unicode tables generator
297 // and this code is needed to pass the coverage tests; remove once the issue is fixed.
298 if (ucs4 == 0x002E) // FULL STOP
299 ncls = QUnicodeTables::WordBreak_MidNumLet;
300 else if (ucs4 == 0x003A) // COLON
301 ncls = QUnicodeTables::WordBreak_MidLetter;
302 }
303#endif
304
305 uchar action = WB::breakTable[cls][ncls];
306 switch (action) {
307 case WB::Break:
308 break;
309 case WB::NoBreak:
310 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_Extend || ncls == QUnicodeTables::WordBreak_ZWJ || ncls == QUnicodeTables::WordBreak_Format)) {
311 // WB4: X(Extend|Format)* -> X
312 if (cls != QUnicodeTables::WordBreak_ZWJ) // WB3c
313 continue;
314 }
315 if (Q_UNLIKELY(cls == QUnicodeTables::WordBreak_RegionalIndicator)) {
316 // WB15/WB16: break between pairs of Regional indicator
317 ncls = QUnicodeTables::WordBreak_Any;
318 }
319 break;
320 case WB::Lookup:
321 case WB::LookupW:
322 for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
323 ucs4 = string[lookahead];
324 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
325 ushort low = string[lookahead + 1];
326 if (QChar::isLowSurrogate(ucs4: low)) {
327 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
328 ++lookahead;
329 }
330 }
331
332 prop = QUnicodeTables::properties(ucs4);
333 QUnicodeTables::WordBreakClass tcls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
334
335 if (Q_UNLIKELY(tcls == QUnicodeTables::WordBreak_Extend || tcls == QUnicodeTables::WordBreak_ZWJ || tcls == QUnicodeTables::WordBreak_Format)) {
336 // WB4: X(Extend|Format)* -> X
337 continue;
338 }
339
340 if (Q_LIKELY(tcls == cls || (action == WB::LookupW && (tcls == QUnicodeTables::WordBreak_HebrewLetter
341 || tcls == QUnicodeTables::WordBreak_ALetter)))) {
342 i = lookahead;
343 ncls = tcls;
344 action = WB::NoBreak;
345 }
346 break;
347 }
348 if (action != WB::NoBreak) {
349 action = WB::Break;
350 if (Q_UNLIKELY(ncls == QUnicodeTables::WordBreak_SingleQuote && cls == QUnicodeTables::WordBreak_HebrewLetter))
351 action = WB::NoBreak; // WB7a
352 }
353 break;
354 }
355
356 cls = ncls;
357 if (action == WB::Break) {
358 attributes[pos].wordBreak = true;
359 if (currentWordType != WordTypeNone)
360 attributes[pos].wordEnd = true;
361 switch (cls) {
362 case QUnicodeTables::WordBreak_Katakana:
363 currentWordType = WordTypeHiraganaKatakana;
364 attributes[pos].wordStart = true;
365 break;
366 case QUnicodeTables::WordBreak_HebrewLetter:
367 case QUnicodeTables::WordBreak_ALetter:
368 case QUnicodeTables::WordBreak_Numeric:
369 currentWordType = WordTypeAlphaNumeric;
370 attributes[pos].wordStart = true;
371 break;
372 default:
373 currentWordType = WordTypeNone;
374 break;
375 }
376 }
377 }
378
379 if (currentWordType != WordTypeNone)
380 attributes[len].wordEnd = true;
381 attributes[len].wordBreak = true; // WB2
382}
383
384
385namespace SB {
386
387enum State {
388 Initial,
389 Lower,
390 Upper,
391 LUATerm,
392 ATerm,
393 ATermC,
394 ACS,
395 STerm,
396 STermC,
397 SCS,
398 BAfterC,
399 BAfter,
400 Break,
401 Lookup
402};
403
404static const uchar breakTable[BAfter + 1][QUnicodeTables::NumSentenceBreakClasses] = {
405// Any CR LF Sep Extend Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
406 { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Lower , Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
407 { Initial, BAfterC, BAfter , BAfter , Lower , Initial, Initial, Initial, Initial, Initial, LUATerm, Initial, STerm , Initial }, // Lower
408 { Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, LUATerm, Initial, STerm , Initial }, // Upper
409
410 { Lookup , BAfterC, BAfter , BAfter , LUATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // LUATerm
411 { Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
412 { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
413 { Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
414
415 { Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
416 { Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
417 { Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
418 { Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
419 { Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
420};
421
422} // namespace SB
423
424static void getSentenceBreaks(const ushort *string, quint32 len, QCharAttributes *attributes)
425{
426 uchar state = SB::BAfter; // to meet SB1
427 for (quint32 i = 0; i != len; ++i) {
428 quint32 pos = i;
429 uint ucs4 = string[i];
430 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
431 ushort low = string[i + 1];
432 if (QChar::isLowSurrogate(ucs4: low)) {
433 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
434 ++i;
435 }
436 }
437
438 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
439 QUnicodeTables::SentenceBreakClass ncls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
440
441 Q_ASSERT(state <= SB::BAfter);
442 state = SB::breakTable[state][ncls];
443 if (Q_UNLIKELY(state == SB::Lookup)) { // SB8
444 state = SB::Break;
445 for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
446 ucs4 = string[lookahead];
447 if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
448 ushort low = string[lookahead + 1];
449 if (QChar::isLowSurrogate(ucs4: low)) {
450 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
451 ++lookahead;
452 }
453 }
454
455 prop = QUnicodeTables::properties(ucs4);
456 QUnicodeTables::SentenceBreakClass tcls = (QUnicodeTables::SentenceBreakClass) prop->sentenceBreakClass;
457 switch (tcls) {
458 case QUnicodeTables::SentenceBreak_Any:
459 case QUnicodeTables::SentenceBreak_Extend:
460 case QUnicodeTables::SentenceBreak_Sp:
461 case QUnicodeTables::SentenceBreak_Numeric:
462 case QUnicodeTables::SentenceBreak_SContinue:
463 case QUnicodeTables::SentenceBreak_Close:
464 continue;
465 case QUnicodeTables::SentenceBreak_Lower:
466 i = lookahead;
467 state = SB::Initial;
468 break;
469 default:
470 break;
471 }
472 break;
473 }
474 }
475 if (Q_UNLIKELY(state == SB::Break)) {
476 attributes[pos].sentenceBoundary = true;
477 state = SB::breakTable[SB::Initial][ncls];
478 }
479 }
480
481 attributes[len].sentenceBoundary = true; // SB2
482}
483
484
485// -----------------------------------------------------------------------------------------------------
486//
487// The line breaking algorithm.
488// See http://www.unicode.org/reports/tr14/tr14-39.html
489//
490// -----------------------------------------------------------------------------------------------------
491
492namespace LB {
493
494namespace NS { // Number Sequence
495
496// LB25 recommends to not break lines inside numbers of the form
497// described by the following regular expression:
498// (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
499
500enum Action {
501 None,
502 Start,
503 Continue,
504 Break
505};
506
507enum Class {
508 XX,
509 PRPO,
510 OPHY,
511 NU,
512 SYIS,
513 CLCP
514};
515
516static const uchar actionTable[CLCP + 1][CLCP + 1] = {
517// XX PRPO OPHY NU SYIS CLCP
518 { None , Start , Start , Start , None , None }, // XX
519 { None , Start , Continue, Continue, None , None }, // PRPO
520 { None , Start , Start , Continue, None , None }, // OPHY
521 { Break , Break , Break , Continue, Continue, Continue }, // NU
522 { Break , Break , Break , Continue, Continue, Continue }, // SYIS
523 { Break , Continue, Break , Break , Break , Break }, // CLCP
524};
525
526inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
527{
528 switch (lbc) {
529 case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
530 // resolve AI math symbols in numerical context to IS
531 if (category == QChar::Symbol_Math)
532 return SYIS;
533 break;
534 case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
535 return PRPO;
536 case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
537 return OPHY;
538 case QUnicodeTables::LineBreak_NU:
539 return NU;
540 case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
541 return SYIS;
542 case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
543 return CLCP;
544 default:
545 break;
546 }
547 return XX;
548}
549
550} // namespace NS
551
552/* In order to support the tailored implementation of LB25 properly
553 the following changes were made in the pair table to allow breaks
554 where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
555 (CL)(PO) from IB to DB
556 (CP)(PO) from IB to DB
557 (CL)(PR) from IB to DB
558 (CP)(PR) from IB to DB
559 (PO)(OP) from IB to DB
560 (PR)(OP) from IB to DB
561 (IS)(NU) from IB to DB
562 (SY)(NU) from IB to DB
563*/
564
565/* In order to implementat LB21a properly a special rule HH has been introduced and
566 the following changes were made in the pair table to disallow breaks after Hebrew + Hyphen:
567 (HL)(HY|BA) from IB to CI
568 (HY|BA)(!CB) from DB to HH
569*/
570
571enum Action {
572 ProhibitedBreak, PB = ProhibitedBreak,
573 DirectBreak, DB = DirectBreak,
574 IndirectBreak, IB = IndirectBreak,
575 CombiningIndirectBreak, CI = CombiningIndirectBreak,
576 CombiningProhibitedBreak, CP = CombiningProhibitedBreak,
577 ProhibitedBreakAfterHebrewPlusHyphen, HH = ProhibitedBreakAfterHebrewPlusHyphen
578};
579
580static const uchar breakTable[QUnicodeTables::LineBreak_SA][QUnicodeTables::LineBreak_SA] = {
581/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT RI CB EB EM ZWJ*/
582/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB },
583/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
584/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
585/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
586/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
587/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
588/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
589/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
590/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
591/* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, IB },
592/* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
593/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
594/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
595/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, CI, CI, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
596/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
597/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
598/* HY */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, IB, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
599/* BA */ { HH, PB, PB, IB, HH, IB, PB, PB, PB, HH, HH, HH, HH, HH, HH, HH, IB, IB, HH, HH, PB, CI, PB, HH, HH, HH, HH, HH, HH, DB, DB, DB, IB },
600/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, DB, IB, IB, IB },
601/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
602/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB },
603/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
604/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB },
605/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
606/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
607/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB, DB, DB, DB, DB, IB },
608/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB, DB, DB, DB, DB, IB },
609/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB, DB, DB, DB, DB, IB },
610/* RI */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, IB, DB, DB, DB, IB },
611/* CB */ { DB, PB, PB, IB, IB, DB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
612/* EB */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, IB, IB },
613/* EM */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, DB, DB, IB },
614/* ZWJ*/ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, IB }
615};
616
617// The following line break classes are not treated by the pair table
618// and must be resolved outside:
619// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
620
621} // namespace LB
622
623static void getLineBreaks(const ushort *string, quint32 len, QCharAttributes *attributes, QUnicodeTools::CharAttributeOptions options)
624{
625 quint32 nestart = 0;
626 LB::NS::Class nelast = LB::NS::XX;
627
628 QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
629 QUnicodeTables::LineBreakClass cls = lcls;
630 for (quint32 i = 0; i != len; ++i) {
631 quint32 pos = i;
632 uint ucs4 = string[i];
633 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
634 ushort low = string[i + 1];
635 if (QChar::isLowSurrogate(ucs4: low)) {
636 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
637 ++i;
638 }
639 }
640
641 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
642 QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->lineBreakClass;
643 QUnicodeTables::LineBreakClass tcls;
644
645 if (options & QUnicodeTools::HangulLineBreakTailoring) {
646 if (Q_UNLIKELY((ncls >= QUnicodeTables::LineBreak_H2
647 && ncls <= QUnicodeTables::LineBreak_JT)
648 || (ucs4 >= 0x3130 && ucs4 <= 0x318F && ncls == QUnicodeTables::LineBreak_ID))
649 ) {
650 // LB27: use SPACE for line breaking
651 // "When Korean uses SPACE for line breaking, the classes in rule LB26,
652 // as well as characters of class ID, are often tailored to AL; see Section 8, Customization."
653 // In case of Korean syllables: "3130..318F HANGUL COMPATIBILITY JAMO"
654 ncls = QUnicodeTables::LineBreak_AL;
655 } else {
656 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
657 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
658 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
659 if (FLAG(prop->category) & test)
660 ncls = QUnicodeTables::LineBreak_CM;
661 }
662 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM)) {
663 // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
664 if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
665 ncls = QUnicodeTables::LineBreak_AL;
666 }
667 }
668 }
669
670 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_SA)) {
671 // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
672 static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
673 if (FLAG(prop->category) & test)
674 ncls = QUnicodeTables::LineBreak_CM;
675 }
676
677 if (Q_UNLIKELY(lcls >= QUnicodeTables::LineBreak_CR)) {
678 // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
679 if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
680 attributes[pos].lineBreak = attributes[pos].mandatoryBreak = true;
681 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
682 cls = QUnicodeTables::LineBreak_AL;
683 goto next_no_cls_update;
684 }
685 goto next;
686 }
687
688 if (Q_UNLIKELY(ncls >= QUnicodeTables::LineBreak_SP)) {
689 if (ncls > QUnicodeTables::LineBreak_SP)
690 goto next; // LB6: x(BK|CR|LF|NL)
691 goto next_no_cls_update; // LB7: xSP
692 }
693
694 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_CM || ncls == QUnicodeTables::LineBreak_ZWJ)) {
695 // LB9: treat CM that don't follows SP, BK, CR, LF, NL, or ZW as X
696 if (lcls != QUnicodeTables::LineBreak_ZW && lcls < QUnicodeTables::LineBreak_SP)
697 // don't update anything
698 goto next_no_cls_update;
699 }
700
701 if (Q_UNLIKELY(lcls == QUnicodeTables::LineBreak_ZWJ)) {
702 // LB8a: ZWJ x (ID | EB | EM)
703 if (ncls == QUnicodeTables::LineBreak_ID || ncls == QUnicodeTables::LineBreak_EB || ncls == QUnicodeTables::LineBreak_EM)
704 goto next;
705 }
706
707 // LB25: do not break lines inside numbers
708 {
709 LB::NS::Class necur = LB::NS::toClass(lbc: ncls, category: (QChar::Category)prop->category);
710 switch (LB::NS::actionTable[nelast][necur]) {
711 case LB::NS::Break:
712 // do not change breaks before and after the expression
713 for (quint32 j = nestart + 1; j < pos; ++j)
714 attributes[j].lineBreak = false;
715 Q_FALLTHROUGH();
716 case LB::NS::None:
717 nelast = LB::NS::XX; // reset state
718 break;
719 case LB::NS::Start:
720 nestart = i;
721 Q_FALLTHROUGH();
722 default:
723 nelast = necur;
724 break;
725 }
726 }
727
728 if (Q_UNLIKELY(ncls == QUnicodeTables::LineBreak_RI && lcls == QUnicodeTables::LineBreak_RI)) {
729 // LB30a
730 ncls = QUnicodeTables::LineBreak_SP;
731 goto next;
732 }
733
734 // for South East Asian chars that require a complex analysis, the Unicode
735 // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
736 if (Q_UNLIKELY(cls >= QUnicodeTables::LineBreak_SA))
737 cls = QUnicodeTables::LineBreak_AL;
738
739 tcls = cls;
740 if (tcls == QUnicodeTables::LineBreak_CM)
741 // LB10
742 tcls = QUnicodeTables::LineBreak_AL;
743 switch (LB::breakTable[tcls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
744 case LB::DirectBreak:
745 attributes[pos].lineBreak = true;
746 break;
747 case LB::IndirectBreak:
748 if (lcls == QUnicodeTables::LineBreak_SP)
749 attributes[pos].lineBreak = true;
750 break;
751 case LB::CombiningIndirectBreak:
752 if (lcls != QUnicodeTables::LineBreak_SP)
753 goto next_no_cls_update;
754 attributes[pos].lineBreak = true;
755 break;
756 case LB::CombiningProhibitedBreak:
757 if (lcls != QUnicodeTables::LineBreak_SP)
758 goto next_no_cls_update;
759 break;
760 case LB::ProhibitedBreakAfterHebrewPlusHyphen:
761 if (lcls != QUnicodeTables::LineBreak_HL)
762 attributes[pos].lineBreak = true;
763 break;
764 case LB::ProhibitedBreak:
765 // nothing to do
766 default:
767 break;
768 }
769
770 next:
771 cls = ncls;
772 next_no_cls_update:
773 lcls = ncls;
774 }
775
776 if (Q_UNLIKELY(LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break)) {
777 // LB25: do not break lines inside numbers
778 for (quint32 j = nestart + 1; j < len; ++j)
779 attributes[j].lineBreak = false;
780 }
781
782 attributes[0].lineBreak = attributes[0].mandatoryBreak = false; // LB2
783 attributes[len].lineBreak = attributes[len].mandatoryBreak = true; // LB3
784}
785
786
787static void getWhiteSpaces(const ushort *string, quint32 len, QCharAttributes *attributes)
788{
789 for (quint32 i = 0; i != len; ++i) {
790 uint ucs4 = string[i];
791 if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
792 ushort low = string[i + 1];
793 if (QChar::isLowSurrogate(ucs4: low)) {
794 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
795 ++i;
796 }
797 }
798
799 if (Q_UNLIKELY(QChar::isSpace(ucs4)))
800 attributes[i].whiteSpace = true;
801 }
802}
803
804
805Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
806 const ScriptItem *items, int numItems,
807 QCharAttributes *attributes, CharAttributeOptions options)
808{
809 if (length <= 0)
810 return;
811
812 if (!(options & DontClearAttributes))
813 ::memset(s: attributes, c: 0, n: (length + 1) * sizeof(QCharAttributes));
814
815 if (options & GraphemeBreaks)
816 getGraphemeBreaks(string, len: length, attributes);
817 if (options & WordBreaks)
818 getWordBreaks(string, len: length, attributes);
819 if (options & SentenceBreaks)
820 getSentenceBreaks(string, len: length, attributes);
821 if (options & LineBreaks)
822 getLineBreaks(string, len: length, attributes, options);
823 if (options & WhiteSpaces)
824 getWhiteSpaces(string, len: length, attributes);
825
826 if (!qt_initcharattributes_default_algorithm_only) {
827 if (!items || numItems <= 0)
828 return;
829
830 QVarLengthArray<HB_ScriptItem, 64> scriptItems;
831 scriptItems.reserve(asize: numItems);
832 int start = 0;
833 HB_Script startScript = script_to_hbscript(script: items[start].script);
834 if (Q_UNLIKELY(startScript == HB_Script_Inherited))
835 startScript = HB_Script_Common;
836 for (int i = start + 1; i < numItems; ++i) {
837 HB_Script script = script_to_hbscript(script: items[i].script);
838 if (Q_LIKELY(script == startScript || script == HB_Script_Inherited))
839 continue;
840 Q_ASSERT(items[i].position > items[start].position);
841 HB_ScriptItem item;
842 item.pos = items[start].position;
843 item.length = items[i].position - items[start].position;
844 item.script = startScript;
845 item.bidiLevel = 0; // unused
846 scriptItems.append(t: item);
847 start = i;
848 startScript = script;
849 }
850 if (items[start].position + 1 < length) {
851 HB_ScriptItem item;
852 item.pos = items[start].position;
853 item.length = length - items[start].position;
854 item.script = startScript;
855 item.bidiLevel = 0; // unused
856 scriptItems.append(t: item);
857 }
858 Q_STATIC_ASSERT(sizeof(QCharAttributes) == sizeof(HB_CharAttributes));
859 HB_GetTailoredCharAttributes(string, stringLength: length,
860 items: scriptItems.constData(), numItems: scriptItems.size(),
861 attributes: reinterpret_cast<HB_CharAttributes *>(attributes));
862 }
863}
864
865
// ----------------------------------------------------------------------------
//
// The Unicode script property. See http://www.unicode.org/reports/tr24/tr24-24.html
//
// ----------------------------------------------------------------------------
871
872Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts)
873{
874 int sor = 0;
875 int eor = 0;
876 uchar script = QChar::Script_Common;
877
878 for (int i = 0; i < length; ++i, eor = i) {
879 uint ucs4 = string[i];
880 if (QChar::isHighSurrogate(ucs4) && i + 1 < length) {
881 ushort low = string[i + 1];
882 if (QChar::isLowSurrogate(ucs4: low)) {
883 ucs4 = QChar::surrogateToUcs4(high: ucs4, low);
884 ++i;
885 }
886 }
887
888 const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
889
890 uchar nscript = prop->script;
891
892 if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common))
893 continue;
894
895 // inherit preceding Common-s
896 if (Q_UNLIKELY(script <= QChar::Script_Common)) {
897 // also covers a case where the base character of Common script followed
898 // by one or more combining marks of non-Inherited, non-Common script
899 script = nscript;
900 continue;
901 }
902
903 // Never break between a combining mark (gc= Mc, Mn or Me) and its base character.
904 // Thus, a combining mark - whatever its script property value is - should inherit
905 // the script property value of its base character.
906 static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing));
907 if (Q_UNLIKELY(FLAG(prop->category) & test))
908 continue;
909
910 Q_ASSERT(script > QChar::Script_Common);
911 Q_ASSERT(sor < eor);
912 ::memset(s: scripts + sor, c: script, n: (eor - sor) * sizeof(uchar));
913 sor = eor;
914
915 script = nscript;
916 }
917
918 Q_ASSERT(script >= QChar::Script_Common);
919 Q_ASSERT(eor == length);
920 ::memset(s: scripts + sor, c: script, n: (eor - sor) * sizeof(uchar));
921}
922
923} // namespace QUnicodeTools
924
925QT_END_NAMESPACE
926

// Source: qtbase/src/corelib/text/qunicodetools.cpp