1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qchar.h"
5
6#include "qdatastream.h"
7
8#include "qunicodetables_p.h"
9#include "qunicodetables.cpp"
10
11#include <algorithm>
12
13QT_BEGIN_NAMESPACE
14
15#define FLAG(x) (1 << (x))
16
17/*!
18 \class QLatin1Char
19 \inmodule QtCore
20 \reentrant
21 \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
22
23 \ingroup string-processing
24
    This class is only useful for constructing a QChar from an 8-bit character.
26
27 \sa QChar, QLatin1StringView, QString
28*/
29
30/*!
31 \fn const char QLatin1Char::toLatin1() const
32
33 Converts a Latin-1 character to an 8-bit ASCII representation of the character.
34*/
35
36/*!
37 \fn QLatin1Char::unicode() const
38
    Converts a Latin-1 character to a 16-bit-encoded Unicode representation
40 of the character.
41*/
42
43/*!
44 \fn QLatin1Char::QLatin1Char(char c)
45
46 Constructs a Latin-1 character for \a c. This constructor should be
47 used when the encoding of the input character is known to be Latin-1.
48*/
49
50/*!
51 \class QChar
52 \inmodule QtCore
53 \brief The QChar class provides a 16-bit Unicode character.
54
55 \ingroup string-processing
56 \reentrant
57
58 \compares strong
59 \compareswith strong char16_t QString QStringView QLatin1StringView QUtf8StringView
60 \endcompareswith
61 \compareswith strong {const char *} QByteArray QByteArrayView
62 The contents of the byte array is interpreted as UTF-8.
63 \endcompareswith
64
65 In Qt, Unicode characters are 16-bit entities without any markup
66 or structure. This class represents such an entity. It is
67 lightweight, so it can be used everywhere. Most compilers treat
68 it like an \c{unsigned short}.
69
70 QChar provides a full complement of testing/classification
71 functions, converting to and from other formats, converting from
72 composed to decomposed Unicode, and trying to compare and
73 case-convert if you ask it to.
74
75 The classification functions include functions like those in the
76 standard C++ header \<cctype\> (formerly \<ctype.h\>), but
77 operating on the full range of Unicode characters, not just for the ASCII
78 range. They all return true if the character is a certain type of character;
79 otherwise they return false. These classification functions are
80 isNull() (returns \c true if the character is '\\0'), isPrint()
81 (true if the character is any sort of printable character,
    including whitespace), isPunct() (any sort of punctuation),
83 isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
84 sort of numeric character, not just 0-9), isLetterOrNumber(), and
85 isDigit() (decimal digits). All of these are wrappers around
    category(), which returns the Unicode-defined category of each
87 character. Some of these also calculate the derived properties
88 (for example isSpace() returns \c true if the character is of category
89 Separator_* or an exceptional code point from Other_Control category).
90
91 QChar also provides direction(), which indicates the "natural"
92 writing direction of this character. The joiningType() function
    indicates how the character joins with its neighbors (needed
    mostly for Arabic or Syriac) and finally hasMirrored(), which indicates
    whether the character needs to be mirrored when it is printed in
    its "unnatural" writing direction.
97
98 Composed Unicode characters (like \a ring) can be converted to
99 decomposed Unicode ("a" followed by "ring above") by using decomposition().
100
101 In Unicode, comparison is not necessarily possible and case
102 conversion is very difficult at best. Unicode, covering the
103 "entire" world, also includes most of the world's case and
104 sorting problems. operator==() and friends will do comparison
105 based purely on the numeric Unicode value (code point) of the
106 characters, and toUpper() and toLower() will do case changes when
107 the character has a well-defined uppercase/lowercase equivalent.
108 For locale-dependent comparisons, use QString::localeAwareCompare().
109
110 The conversion functions include unicode() (to a scalar),
111 toLatin1() (to scalar, but converts all non-Latin-1 characters to
112 0), row() (gives the Unicode row), cell() (gives the Unicode
113 cell), digitValue() (gives the integer value of any of the
114 numerous digit characters), and a host of constructors.
115
116 QChar provides constructors and cast operators that make it easy
117 to convert to and from traditional 8-bit \c{char}s. If you
118 defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
119 explained in the QString documentation, you will need to
120 explicitly call fromLatin1(), or use QLatin1Char,
121 to construct a QChar from an 8-bit \c char, and you will need to
122 call toLatin1() to get the 8-bit value back.
123
124 Starting with Qt 6.0, most QChar constructors are \c explicit. This
125 is done to avoid dangerous mistakes when accidentally mixing
126 integral types and strings.
127
128 For more information see
129 \l{https://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
130
131 \sa Unicode, QString, QLatin1Char
132*/
133
134/*!
135 \enum QChar::UnicodeVersion
136
137 Specifies which version of the \l{Unicode standard} introduced a certain
138 character.
139
140 \value Unicode_1_1 Version 1.1
141 \value Unicode_2_0 Version 2.0
142 \value Unicode_2_1_2 Version 2.1.2
143 \value Unicode_3_0 Version 3.0
144 \value Unicode_3_1 Version 3.1
145 \value Unicode_3_2 Version 3.2
146 \value Unicode_4_0 Version 4.0
147 \value Unicode_4_1 Version 4.1
148 \value Unicode_5_0 Version 5.0
149 \value Unicode_5_1 Version 5.1
150 \value Unicode_5_2 Version 5.2
151 \value Unicode_6_0 Version 6.0
152 \value Unicode_6_1 Version 6.1
153 \value Unicode_6_2 Version 6.2
154 \value [since 5.3] Unicode_6_3 Version 6.3
155 \value [since 5.5] Unicode_7_0 Version 7.0
156 \value [since 5.6] Unicode_8_0 Version 8.0
157 \value [since 5.11] Unicode_9_0 Version 9.0
158 \value [since 5.11] Unicode_10_0 Version 10.0
159 \value [since 5.15] Unicode_11_0 Version 11.0
160 \value [since 5.15] Unicode_12_0 Version 12.0
161 \value [since 5.15] Unicode_12_1 Version 12.1
162 \value [since 5.15] Unicode_13_0 Version 13.0
163 \value [since 6.3] Unicode_14_0 Version 14.0
164 \value [since 6.5] Unicode_15_0 Version 15.0
165 \value [since 6.8] Unicode_15_1 Version 15.1
166 \value [since 6.9] Unicode_16_0 Version 16.0
    \value Unicode_Unassigned  The value is not assigned to any character
                               in the version of Unicode supported by Qt
                               (see currentUnicodeVersion()).
169
170 \sa unicodeVersion(), currentUnicodeVersion()
171*/
172
173/*!
174 \enum QChar::Category
175
176 This enum maps the Unicode character categories.
177
    The following categories are normative in Unicode:
179
180 \value Mark_NonSpacing Unicode class name Mn
181
182 \value Mark_SpacingCombining Unicode class name Mc
183
184 \value Mark_Enclosing Unicode class name Me
185
186 \value Number_DecimalDigit Unicode class name Nd
187
188 \value Number_Letter Unicode class name Nl
189
190 \value Number_Other Unicode class name No
191
192 \value Separator_Space Unicode class name Zs
193
194 \value Separator_Line Unicode class name Zl
195
196 \value Separator_Paragraph Unicode class name Zp
197
198 \value Other_Control Unicode class name Cc
199
200 \value Other_Format Unicode class name Cf
201
202 \value Other_Surrogate Unicode class name Cs
203
204 \value Other_PrivateUse Unicode class name Co
205
206 \value Other_NotAssigned Unicode class name Cn
207
208
209 The following categories are informative in Unicode:
210
211 \value Letter_Uppercase Unicode class name Lu
212
213 \value Letter_Lowercase Unicode class name Ll
214
215 \value Letter_Titlecase Unicode class name Lt
216
217 \value Letter_Modifier Unicode class name Lm
218
219 \value Letter_Other Unicode class name Lo
220
221 \value Punctuation_Connector Unicode class name Pc
222
223 \value Punctuation_Dash Unicode class name Pd
224
225 \value Punctuation_Open Unicode class name Ps
226
227 \value Punctuation_Close Unicode class name Pe
228
229 \value Punctuation_InitialQuote Unicode class name Pi
230
231 \value Punctuation_FinalQuote Unicode class name Pf
232
233 \value Punctuation_Other Unicode class name Po
234
235 \value Symbol_Math Unicode class name Sm
236
237 \value Symbol_Currency Unicode class name Sc
238
239 \value Symbol_Modifier Unicode class name Sk
240
241 \value Symbol_Other Unicode class name So
242
243 \sa category()
244*/
245
246/*!
247 \enum QChar::Script
248 \since 5.1
249
250 This enum type defines the Unicode script property values.
251
252 For details about the Unicode script property values see
253 \l{https://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}.
254
255 In order to conform to C/C++ naming conventions "Script_" is prepended
256 to the codes used in the Unicode Standard.
257
258 \value Script_Unknown For unassigned, private-use, noncharacter, and surrogate code points.
259 \value Script_Inherited For characters that may be used with multiple scripts
260 and that inherit their script from the preceding characters.
261 These include nonspacing marks, enclosing marks,
262 and zero width joiner/non-joiner characters.
263 \value Script_Common For characters that may be used with multiple scripts
264 and that do not inherit their script from the preceding characters.
265
266 \value [since 5.11] Script_Adlam
267 \value [since 5.6] Script_Ahom
268 \value [since 5.6] Script_AnatolianHieroglyphs
269 \value Script_Arabic
270 \value Script_Armenian
271 \value Script_Avestan
272 \value Script_Balinese
273 \value Script_Bamum
274 \value [since 5.5] Script_BassaVah
275 \value Script_Batak
276 \value Script_Bengali
277 \value [since 5.11] Script_Bhaiksuki
278 \value Script_Bopomofo
279 \value Script_Brahmi
280 \value Script_Braille
281 \value Script_Buginese
282 \value Script_Buhid
283 \value Script_CanadianAboriginal
284 \value Script_Carian
285 \value [since 5.5] Script_CaucasianAlbanian
286 \value Script_Chakma
287 \value Script_Cham
288 \value Script_Cherokee
289 \value [since 5.15] Script_Chorasmian
290 \value Script_Coptic
291 \value Script_Cuneiform
292 \value Script_Cypriot
293 \value [since 6.3] Script_CyproMinoan
294 \value Script_Cyrillic
295 \value Script_Deseret
296 \value Script_Devanagari
297 \value [since 5.15] Script_DivesAkuru
298 \value [since 5.15] Script_Dogra
299 \value [since 5.5] Script_Duployan
300 \value Script_EgyptianHieroglyphs
301 \value [since 5.5] Script_Elbasan
302 \value [since 5.15] Script_Elymaic
303 \value Script_Ethiopic
304 \value [since 6.9] Script_Garay
305 \value Script_Georgian
306 \value Script_Glagolitic
307 \value Script_Gothic
308 \value [since 5.5] Script_Grantha
309 \value Script_Greek
310 \value Script_Gujarati
311 \value [since 5.15] Script_GunjalaGondi
312 \value Script_Gurmukhi
313 \value [since 6.9] Script_GurungKhema
314 \value Script_Han
315 \value Script_Hangul
316 \value [since 5.15] Script_HanifiRohingya
317 \value Script_Hanunoo
318 \value [since 5.6] Script_Hatran
319 \value Script_Hebrew
320 \value Script_Hiragana
321 \value Script_ImperialAramaic
322 \value Script_InscriptionalPahlavi
323 \value Script_InscriptionalParthian
324 \value Script_Javanese
325 \value Script_Kaithi
326 \value Script_Kannada
327 \value Script_Katakana
328 \value [since 6.5] Script_Kawi
329 \value Script_KayahLi
330 \value Script_Kharoshthi
331 \value [since 5.15] Script_KhitanSmallScript
332 \value Script_Khmer
333 \value [since 5.5] Script_Khojki
334 \value [since 5.5] Script_Khudawadi
335 \value [since 6.9] Script_KiratRai
336 \value Script_Lao
337 \value Script_Latin
338 \value Script_Lepcha
339 \value Script_Limbu
340 \value [since 5.5] Script_LinearA
341 \value Script_LinearB
342 \value Script_Lisu
343 \value Script_Lycian
344 \value Script_Lydian
345 \value [since 5.5] Script_Mahajani
346 \value [since 5.15] Script_Makasar
347 \value Script_Malayalam
348 \value Script_Mandaic
349 \value [since 5.5] Script_Manichaean
350 \value [since 5.11] Script_Marchen
351 \value [since 5.11] Script_MasaramGondi
352 \value [since 5.15] Script_Medefaidrin
353 \value Script_MeeteiMayek
354 \value [since 5.5] Script_MendeKikakui
355 \value Script_MeroiticCursive
356 \value Script_MeroiticHieroglyphs
357 \value Script_Miao
358 \value [since 5.5] Script_Modi
359 \value Script_Mongolian
360 \value [since 5.5] Script_Mro
361 \value [since 5.6] Script_Multani
362 \value Script_Myanmar
363 \value [since 5.5] Script_Nabataean
364 \value [since 6.3] Script_NagMundari
365 \value [since 5.15] Script_Nandinagari
366 \value [since 5.11] Script_Newa
367 \value Script_NewTaiLue
368 \value Script_Nko
369 \value [since 5.11] Script_Nushu
370 \value [since 5.15] Script_NyiakengPuachueHmong
371 \value Script_Ogham
372 \value Script_OlChiki
373 \value [since 6.9] Script_OlOnal
374 \value [since 5.6] Script_OldHungarian
375 \value Script_OldItalic
376 \value [since 5.5] Script_OldNorthArabian
377 \value [since 5.5] Script_OldPermic
378 \value Script_OldPersian
379 \value [since 5.15] Script_OldSogdian
380 \value Script_OldSouthArabian
381 \value Script_OldTurkic
382 \value [since 6.3] Script_OldUyghur
383 \value Script_Oriya
384 \value [since 5.11] Script_Osage
385 \value Script_Osmanya
386 \value [since 5.5] Script_PahawhHmong
387 \value [since 5.5] Script_Palmyrene
388 \value [since 5.5] Script_PauCinHau
389 \value Script_PhagsPa
390 \value Script_Phoenician
391 \value [since 5.5] Script_PsalterPahlavi
392 \value Script_Rejang
393 \value Script_Runic
394 \value Script_Samaritan
395 \value Script_Saurashtra
396 \value Script_Sharada
397 \value Script_Shavian
398 \value [since 5.5] Script_Siddham
399 \value [since 5.6] Script_SignWriting
400 \value Script_Sinhala
401 \value [since 5.15] Script_Sogdian
402 \value Script_SoraSompeng
403 \value [since 5.11] Script_Soyombo
404 \value Script_Sundanese
405 \value [since 6.9] Script_Sunuwar
406 \value Script_SylotiNagri
407 \value Script_Syriac
408 \value Script_Tagalog
409 \value Script_Tagbanwa
410 \value Script_TaiLe
411 \value Script_TaiTham
412 \value Script_TaiViet
413 \value Script_Takri
414 \value Script_Tamil
415 \value [since 5.11] Script_Tangut
416 \value [since 6.3] Script_Tangsa
417 \value Script_Telugu
418 \value Script_Thaana
419 \value Script_Thai
420 \value Script_Tibetan
421 \value Script_Tifinagh
422 \value [since 5.5] Script_Tirhuta
423 \value [since 6.9] Script_Todhri
424 \value [since 6.3] Script_Toto
425 \value [since 6.9] Script_TuluTigalari
426 \value Script_Ugaritic
427 \value Script_Vai
428 \value [since 6.3] Script_Vithkuqi
429 \value [since 5.15] Script_Wancho
430 \value [since 5.5] Script_WarangCiti
431 \value [since 5.15] Script_Yezidi
432 \value Script_Yi
433 \value [since 5.11] Script_ZanabazarSquare
434
435 \omitvalue ScriptCount
436
437 \sa script()
438*/
439
440/*!
441 \enum QChar::Direction
442
443 This enum type defines the Unicode direction attributes. See the
444 \l{https://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode
445 Standard} for a description of the values.
446
447 In order to conform to C/C++ naming conventions "Dir" is prepended
448 to the codes used in the Unicode Standard.
449
450 \value DirAL
451 \value DirAN
452 \value DirB
453 \value DirBN
454 \value DirCS
455 \value DirEN
456 \value DirES
457 \value DirET
458 \value [since 5.3] DirFSI
459 \value DirL
460 \value DirLRE
461 \value [since 5.3] DirLRI
462 \value DirLRO
463 \value DirNSM
464 \value DirON
465 \value DirPDF
466 \value [since 5.3] DirPDI
467 \value DirR
468 \value DirRLE
469 \value [since 5.3] DirRLI
470 \value DirRLO
471 \value DirS
472 \value DirWS
473
474 \sa direction()
475*/
476
477/*!
478 \enum QChar::Decomposition
479
480 This enum type defines the Unicode decomposition attributes. See
481 the \l{Unicode standard} for a description of the values.
482
483 \value NoDecomposition
484 \value Canonical
485 \value Circle
486 \value Compat
487 \value Final
488 \value Font
489 \value Fraction
490 \value Initial
491 \value Isolated
492 \value Medial
493 \value Narrow
494 \value NoBreak
495 \value Small
496 \value Square
497 \value Sub
498 \value Super
499 \value Vertical
500 \value Wide
501
502 \sa decomposition()
503*/
504
505/*!
506 \enum QChar::JoiningType
    \since 5.3
508
509 This enum type defines the Unicode joining type attributes. See the
510 \l{Unicode standard} for a description of the values.
511
512 In order to conform to C/C++ naming conventions "Joining_" is prepended
513 to the codes used in the Unicode Standard.
514
515 \value Joining_None
516 \value Joining_Causing
517 \value Joining_Dual
518 \value Joining_Right
519 \value Joining_Left
520 \value Joining_Transparent
521
522 \sa joiningType()
523*/
524
525/*!
526 \enum QChar::CombiningClass
527
528 \internal
529
530 This enum type defines names for some of the Unicode combining
531 classes. See the \l{Unicode Standard} for a description of the values.
532
533 \value Combining_Above
534 \value Combining_AboveAttached
535 \value Combining_AboveLeft
536 \value Combining_AboveLeftAttached
537 \value Combining_AboveRight
538 \value Combining_AboveRightAttached
539 \value Combining_Below
540 \value Combining_BelowAttached
541 \value Combining_BelowLeft
542 \value Combining_BelowLeftAttached
543 \value Combining_BelowRight
544 \value Combining_BelowRightAttached
545 \value Combining_DoubleAbove
546 \value Combining_DoubleBelow
547 \value Combining_IotaSubscript
548 \value Combining_Left
549 \value Combining_LeftAttached
550 \value Combining_Right
551 \value Combining_RightAttached
552*/
553
554/*!
555 \enum QChar::SpecialCharacter
556
557 \value Null A QChar with this value isNull().
558 \value Tabulation Character tabulation.
559 \value LineFeed
560 \value FormFeed
561 \value CarriageReturn
562 \value Space
563 \value Nbsp Non-breaking space.
564 \value SoftHyphen
565 \value ReplacementCharacter The character shown when a font has no glyph
566 for a certain codepoint. A special question mark character is often
567 used. Codecs use this codepoint when input data cannot be
568 represented in Unicode.
569 \value ObjectReplacementCharacter Used to represent an object such as an
570 image when such objects cannot be presented.
571 \value ByteOrderMark
572 \value ByteOrderSwapped
573 \value ParagraphSeparator
574 \value LineSeparator
575 \value [since 6.2] VisualTabCharacter Used to represent a tabulation as a horizontal arrow.
576 \value LastValidCodePoint
577*/
578
579/*!
580 \fn void QChar::setCell(uchar cell)
581 \internal
582*/
583
584/*!
585 \fn void QChar::setRow(uchar row)
586 \internal
587*/
588
589/*!
590 \fn QChar::QChar()
591
592 Constructs a null QChar ('\\0').
593
594 \sa isNull()
595*/
596
597/*!
598 \fn QChar::QChar(QLatin1Char ch)
599
600 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
601*/
602
603/*!
604 \fn QChar::QChar(SpecialCharacter ch)
605
606 Constructs a QChar for the predefined character value \a ch.
607*/
608
609/*!
610 \fn QChar::QChar(char16_t ch)
611 \since 5.10
612
613 Constructs a QChar corresponding to the UTF-16 character \a ch.
614*/
615
616/*!
617 \fn QChar::QChar(wchar_t ch)
618 \since 5.10
619
620 Constructs a QChar corresponding to the wide character \a ch.
621
622 \note This constructor is only available on Windows.
623*/
624
625/*!
626 \fn QChar::QChar(char ch)
627
628 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
629
630 \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
631 is defined.
632
633 \sa QT_NO_CAST_FROM_ASCII
634*/
635
636/*!
637 \fn QChar::QChar(uchar ch)
638
639 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
640
641 \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
642 or \c QT_RESTRICTED_CAST_FROM_ASCII is defined.
643
644 \sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII
645*/
646
647/*!
648 \fn QChar::QChar(uchar cell, uchar row)
649
650 Constructs a QChar for Unicode cell \a cell in row \a row.
651
652 \sa cell(), row()
653*/
654
655/*!
656 \fn QChar::QChar(ushort code)
657
658 Constructs a QChar for the character with Unicode code point \a code.
659*/
660
661/*!
662 \fn QChar::QChar(short code)
663
664 Constructs a QChar for the character with Unicode code point \a code.
665*/
666
667/*!
668 \fn QChar::QChar(uint code)
669
670 Constructs a QChar for the character with Unicode code point \a code.
671*/
672
673/*!
674 \fn QChar::QChar(int code)
675
676 Constructs a QChar for the character with Unicode code point \a code.
677*/
678
679/*!
680 \fn static QChar QChar::fromUcs2(char16_t c)
681 \since 6.0
682
683 Constructs a QChar from UTF-16 character \a c.
684
685 \sa fromUcs4()
686*/
687
688/*!
689 \fn static auto QChar::fromUcs4(char32_t c)
690 \since 6.0
691
692 Returns an anonymous struct that
693 \list
694 \li contains a \c{char16_t chars[2]} array,
695 \li can be implicitly converted to a QStringView, and
696 \li iterated over with a C++11 ranged for loop.
697 \endlist
698
699 If \a c requires surrogates, \c{chars[0]} contains the high surrogate
700 and \c{chars[1]} the low surrogate, and the QStringView has size 2.
701 Otherwise, \c{chars[0]} contains \a c and \c{chars[1]} is
702 \l{QChar::isNull}{null}, and the QStringView has size 1.
703
704 This allows easy use of the result:
705
706 \code
707 QString s;
708 s += QChar::fromUcs4(ch);
709 \endcode
710
711 \code
712 for (char16_t c16 : QChar::fromUcs4(ch))
713 use(c16);
714 \endcode
715
716 \sa fromUcs2(), requiresSurrogates()
717*/
718
719/*!
720 \fn bool QChar::isNull() const
721
722 Returns \c true if the character is the Unicode character 0x0000
723 ('\\0'); otherwise returns \c false.
724*/
725
726/*!
727 \fn uchar QChar::cell() const
728
729 Returns the cell (least significant byte) of the Unicode character.
730
731 \sa row()
732*/
733
734/*!
735 \fn uchar QChar::row() const
736
737 Returns the row (most significant byte) of the Unicode character.
738
739 \sa cell()
740*/
741
742/*!
743 \fn bool QChar::isPrint() const
744
745 Returns \c true if the character is a printable character; otherwise
746 returns \c false. This is any character not of category Other_*.
747
748 Note that this gives no indication of whether the character is
749 available in a particular font.
750*/
751
752/*!
753 \overload
754 \since 5.0
755
756 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
757 a printable character; otherwise returns \c false.
758 This is any character not of category Other_*.
759
760 Note that this gives no indication of whether the character is
761 available in a particular font.
762
763 \note Before Qt 6, this function took a \c uint argument.
764*/
765bool QChar::isPrint(char32_t ucs4) noexcept
766{
767 if (ucs4 > LastValidCodePoint)
768 return false;
769 const int test = FLAG(Other_Control) |
770 FLAG(Other_Format) |
771 FLAG(Other_Surrogate) |
772 FLAG(Other_PrivateUse) |
773 FLAG(Other_NotAssigned);
774 return !(FLAG(qGetProp(ucs4)->category) & test);
775}
776
777/*!
778 \fn bool QChar::isSpace() const
779
780 Returns \c true if the character is a separator character
781 (Separator_* categories or certain code points from Other_Control category);
782 otherwise returns \c false.
783*/
784
785/*!
786 \fn bool QChar::isSpace(char32_t ucs4)
787 \overload
788 \since 5.0
789
790 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
791 a separator character (Separator_* categories or certain code points
792 from Other_Control category); otherwise returns \c false.
793
794 \note Before Qt 6, this function took a \c uint argument.
795*/
796
797/*!
798 \internal
799*/
800bool QT_FASTCALL QChar::isSpace_helper(char32_t ucs4) noexcept
801{
802 if (ucs4 > LastValidCodePoint)
803 return false;
804 const int test = FLAG(Separator_Space) |
805 FLAG(Separator_Line) |
806 FLAG(Separator_Paragraph);
807 return FLAG(qGetProp(ucs4)->category) & test;
808}
809
810/*!
811 \fn bool QChar::isMark() const
812
813 Returns \c true if the character is a mark (Mark_* categories);
814 otherwise returns \c false.
815
816 See QChar::Category for more information regarding marks.
817*/
818
819/*!
820 \overload
821 \since 5.0
822
823 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
824 a mark (Mark_* categories); otherwise returns \c false.
825
826 \note Before Qt 6, this function took a \c uint argument.
827*/
828bool QChar::isMark(char32_t ucs4) noexcept
829{
830 if (ucs4 > LastValidCodePoint)
831 return false;
832 const int test = FLAG(Mark_NonSpacing) |
833 FLAG(Mark_SpacingCombining) |
834 FLAG(Mark_Enclosing);
835 return FLAG(qGetProp(ucs4)->category) & test;
836}
837
838/*!
839 \fn bool QChar::isPunct() const
840
841 Returns \c true if the character is a punctuation mark (Punctuation_*
842 categories); otherwise returns \c false.
843*/
844
845/*!
846 \overload
847 \since 5.0
848
849 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
850 a punctuation mark (Punctuation_* categories); otherwise returns \c false.
851
852 \note Before Qt 6, this function took a \c uint argument.
853*/
854bool QChar::isPunct(char32_t ucs4) noexcept
855{
856 if (ucs4 > LastValidCodePoint)
857 return false;
858 const int test = FLAG(Punctuation_Connector) |
859 FLAG(Punctuation_Dash) |
860 FLAG(Punctuation_Open) |
861 FLAG(Punctuation_Close) |
862 FLAG(Punctuation_InitialQuote) |
863 FLAG(Punctuation_FinalQuote) |
864 FLAG(Punctuation_Other);
865 return FLAG(qGetProp(ucs4)->category) & test;
866}
867
868/*!
869 \fn bool QChar::isSymbol() const
870
871 Returns \c true if the character is a symbol (Symbol_* categories);
872 otherwise returns \c false.
873*/
874
875/*!
876 \overload
877 \since 5.0
878
879 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
880 a symbol (Symbol_* categories); otherwise returns \c false.
881
882 \note Before Qt 6, this function took a \c uint argument.
883*/
884bool QChar::isSymbol(char32_t ucs4) noexcept
885{
886 if (ucs4 > LastValidCodePoint)
887 return false;
888 const int test = FLAG(Symbol_Math) |
889 FLAG(Symbol_Currency) |
890 FLAG(Symbol_Modifier) |
891 FLAG(Symbol_Other);
892 return FLAG(qGetProp(ucs4)->category) & test;
893}
894
895/*!
896 \fn bool QChar::isLetter() const
897
898 Returns \c true if the character is a letter (Letter_* categories);
899 otherwise returns \c false.
900*/
901
902/*!
903 \fn bool QChar::isLetter(char32_t ucs4)
904 \overload
905 \since 5.0
906
907 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
908 a letter (Letter_* categories); otherwise returns \c false.
909
910 \note Before Qt 6, this function took a \c uint argument.
911*/
912
913/*!
914 \internal
915*/
916bool QT_FASTCALL QChar::isLetter_helper(char32_t ucs4) noexcept
917{
918 if (ucs4 > LastValidCodePoint)
919 return false;
920 const int test = FLAG(Letter_Uppercase) |
921 FLAG(Letter_Lowercase) |
922 FLAG(Letter_Titlecase) |
923 FLAG(Letter_Modifier) |
924 FLAG(Letter_Other);
925 return FLAG(qGetProp(ucs4)->category) & test;
926}
927
928/*!
929 \fn bool QChar::isNumber() const
930
931 Returns \c true if the character is a number (Number_* categories,
932 not just 0-9); otherwise returns \c false.
933
934 \sa isDigit()
935*/
936
937/*!
938 \fn bool QChar::isNumber(char32_t ucs4)
939 \overload
940 \since 5.0
941
942 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
943 a number (Number_* categories, not just 0-9); otherwise returns \c false.
944
945 \note Before Qt 6, this function took a \c uint argument.
946
947 \sa isDigit()
948*/
949
950/*!
951 \internal
952*/
953bool QT_FASTCALL QChar::isNumber_helper(char32_t ucs4) noexcept
954{
955 if (ucs4 > LastValidCodePoint)
956 return false;
957 const int test = FLAG(Number_DecimalDigit) |
958 FLAG(Number_Letter) |
959 FLAG(Number_Other);
960 return FLAG(qGetProp(ucs4)->category) & test;
961}
962
963/*!
964 \fn bool QChar::isLetterOrNumber() const
965
966 Returns \c true if the character is a letter or number (Letter_* or
967 Number_* categories); otherwise returns \c false.
968*/
969
970/*!
971 \fn bool QChar::isLetterOrNumber(char32_t ucs4)
972 \overload
973 \since 5.0
974
975 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
976 a letter or number (Letter_* or Number_* categories); otherwise returns \c false.
977
978 \note Before Qt 6, this function took a \c uint argument.
979*/
980
981/*!
982 \internal
983*/
984bool QT_FASTCALL QChar::isLetterOrNumber_helper(char32_t ucs4) noexcept
985{
986 if (ucs4 > LastValidCodePoint)
987 return false;
988 const int test = FLAG(Letter_Uppercase) |
989 FLAG(Letter_Lowercase) |
990 FLAG(Letter_Titlecase) |
991 FLAG(Letter_Modifier) |
992 FLAG(Letter_Other) |
993 FLAG(Number_DecimalDigit) |
994 FLAG(Number_Letter) |
995 FLAG(Number_Other);
996 return FLAG(qGetProp(ucs4)->category) & test;
997}
998
999/*!
1000 \fn bool QChar::isDigit() const
1001
1002 Returns \c true if the character is a decimal digit
1003 (Number_DecimalDigit); otherwise returns \c false.
1004
1005 \sa isNumber()
1006*/
1007
1008/*!
1009 \fn bool QChar::isDigit(char32_t ucs4)
1010 \overload
1011 \since 5.0
1012
1013 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
1014 a decimal digit (Number_DecimalDigit); otherwise returns \c false.
1015
1016 \note Before Qt 6, this function took a \c uint argument.
1017
1018 \sa isNumber()
1019*/
1020
1021/*!
1022 \fn bool QChar::isNonCharacter() const
1023 \since 5.0
1024
1025 Returns \c true if the QChar is a non-character; false otherwise.
1026
1027 Unicode has a certain number of code points that are classified
1028 as "non-characters:" that is, they can be used for internal purposes
1029 in applications but cannot be used for text interchange.
    Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1031 [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1032*/
1033
1034/*!
1035 \fn bool QChar::isHighSurrogate() const
1036
1037 Returns \c true if the QChar is the high part of a UTF16 surrogate
1038 (for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1039*/
1040
1041/*!
1042 \fn bool QChar::isLowSurrogate() const
1043
1044 Returns \c true if the QChar is the low part of a UTF16 surrogate
1045 (for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1046*/
1047
1048/*!
1049 \fn bool QChar::isSurrogate() const
1050 \since 5.0
1051
1052 Returns \c true if the QChar contains a code point that is in either
1053 the high or the low part of the UTF-16 surrogate range
1054 (for example if its code point is in range [0xd800..0xdfff]); false otherwise.
1055*/
1056
1057/*!
1058 \fn static bool QChar::isNonCharacter(char32_t ucs4)
1059 \overload
1060 \since 5.0
1061
1062 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1063 is a non-character; false otherwise.
1064
1065 Unicode has a certain number of code points that are classified
1066 as "non-characters:" that is, they can be used for internal purposes
1067 in applications but cannot be used for text interchange.
    Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1069 [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1070
1071 \note Before Qt 6, this function took a \c uint argument.
1072*/
1073
1074/*!
1075 \fn static bool QChar::isHighSurrogate(char32_t ucs4)
1076 \overload
1077
1078 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1079 is the high part of a UTF16 surrogate
1080 (for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1081
1082 \note Before Qt 6, this function took a \c uint argument.
1083*/
1084
1085/*!
1086 \fn static bool QChar::isLowSurrogate(char32_t ucs4)
1087 \overload
1088
1089 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1090 is the low part of a UTF16 surrogate
1091 (for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1092
1093 \note Before Qt 6, this function took a \c uint argument.
1094*/
1095
1096/*!
1097 \fn static bool QChar::isSurrogate(char32_t ucs4)
1098 \overload
1099 \since 5.0
1100
1101 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1102 contains a code point that is in either the high or the low part of the
1103 UTF-16 surrogate range (for example if its code point is in range [0xd800..0xdfff]);
1104 false otherwise.
1105
1106 \note Before Qt 6, this function took a \c uint argument.
1107*/
1108
1109/*!
1110 \fn static bool QChar::requiresSurrogates(char32_t ucs4)
1111
1112 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1113 can be split into the high and low parts of a UTF16 surrogate
    (for example if its code point is greater than or equal to 0x10000);
1115 false otherwise.
1116
1117 \note Before Qt 6, this function took a \c uint argument.
1118*/
1119
1120/*!
1121 \fn static char32_t QChar::surrogateToUcs4(char16_t high, char16_t low)
1122
1123 Converts a UTF16 surrogate pair with the given \a high and \a low values
    to its UCS-4-encoded code point.
1125
1126 \note Before Qt 6, this function took \c ushort arguments and returned \c uint.
1127*/
1128
1129/*!
1130 \fn static char32_t QChar::surrogateToUcs4(QChar high, QChar low)
1131 \overload
1132
    Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
1134
1135 \note Before Qt 6, this function returned \c uint.
1136*/
1137
1138/*!
1139 \fn static char16_t QChar::highSurrogate(char32_t ucs4)
1140
1141 Returns the high surrogate part of a UCS-4-encoded code point.
1142 The returned result is undefined if \a ucs4 is smaller than 0x10000.
1143
1144 \note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1145*/
1146
1147/*!
1148 \fn static char16_t QChar::lowSurrogate(char32_t ucs4)
1149
1150 Returns the low surrogate part of a UCS-4-encoded code point.
1151 The returned result is undefined if \a ucs4 is smaller than 0x10000.
1152
1153 \note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1154*/
1155
1156/*!
1157 \fn int QChar::digitValue() const
1158
1159 Returns the numeric value of the digit, or -1 if the character is not a digit.
1160*/
1161
1162/*!
1163 \overload
1164 Returns the numeric value of the digit specified by the UCS-4-encoded
1165 character, \a ucs4, or -1 if the character is not a digit.
1166
1167 \note Before Qt 6, this function took a \c uint argument.
1168*/
1169int QChar::digitValue(char32_t ucs4) noexcept
1170{
1171 if (ucs4 > LastValidCodePoint)
1172 return -1;
1173 return qGetProp(ucs4)->digitValue;
1174}
1175
1176/*!
1177 \fn QChar::Category QChar::category() const
1178
1179 Returns the character's category.
1180*/
1181
1182/*!
1183 \overload
1184 Returns the category of the UCS-4-encoded character specified by \a ucs4.
1185
1186 \note Before Qt 6, this function took a \c uint argument.
1187*/
1188QChar::Category QChar::category(char32_t ucs4) noexcept
1189{
1190 if (ucs4 > LastValidCodePoint)
1191 return QChar::Other_NotAssigned;
1192 return (QChar::Category) qGetProp(ucs4)->category;
1193}
1194
1195/*!
1196 \fn QChar::Direction QChar::direction() const
1197
1198 Returns the character's direction.
1199*/
1200
1201/*!
1202 \overload
1203 Returns the direction of the UCS-4-encoded character specified by \a ucs4.
1204
1205 \note Before Qt 6, this function took a \c uint argument.
1206*/
1207QChar::Direction QChar::direction(char32_t ucs4) noexcept
1208{
1209 if (ucs4 > LastValidCodePoint)
1210 return QChar::DirL;
1211 return (QChar::Direction) qGetProp(ucs4)->direction;
1212}
1213
1214/*!
1215 \fn QChar::JoiningType QChar::joiningType() const
1216 \since 5.3
1217
1218 Returns information about the joining type attributes of the character
1219 (needed for certain languages such as Arabic or Syriac).
1220*/
1221
1222/*!
1223 \overload
1224 \since 5.3
1225
1226 Returns information about the joining type attributes of the UCS-4-encoded
1227 character specified by \a ucs4
1228 (needed for certain languages such as Arabic or Syriac).
1229
1230 \note Before Qt 6, this function took a \c uint argument.
1231*/
1232QChar::JoiningType QChar::joiningType(char32_t ucs4) noexcept
1233{
1234 if (ucs4 > LastValidCodePoint)
1235 return QChar::Joining_None;
1236 return QChar::JoiningType(qGetProp(ucs4)->joining);
1237}
1238
1239/*!
1240 \fn bool QChar::hasMirrored() const
1241
1242 Returns \c true if the character should be reversed if the text
1243 direction is reversed; otherwise returns \c false.
1244
1245 A bit faster equivalent of (ch.mirroredChar() != ch).
1246
1247 \sa mirroredChar()
1248*/
1249
1250/*!
1251 \overload
1252 \since 5.0
1253
1254 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1255 should be reversed if the text direction is reversed; otherwise returns \c false.
1256
1257 A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
1258
1259 \note Before Qt 6, this function took a \c uint argument.
1260
1261 \sa mirroredChar()
1262*/
1263bool QChar::hasMirrored(char32_t ucs4) noexcept
1264{
1265 if (ucs4 > LastValidCodePoint)
1266 return false;
1267 return qGetProp(ucs4)->mirrorDiff != 0;
1268}
1269
1270/*!
1271 \fn bool QChar::isLower() const
1272
1273 Returns \c true if the character is a lowercase letter, for example
1274 category() is Letter_Lowercase.
1275
1276 \sa isUpper(), toLower(), toUpper()
1277*/
1278
1279/*!
1280 \fn static bool QChar::isLower(char32_t ucs4)
1281 \overload
1282 \since 5.0
1283
1284 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1285 is a lowercase letter, for example category() is Letter_Lowercase.
1286
1287 \note Before Qt 6, this function took a \c uint argument.
1288
1289 \sa isUpper(), toLower(), toUpper()
1290*/
1291
1292/*!
1293 \fn bool QChar::isUpper() const
1294
1295 Returns \c true if the character is an uppercase letter, for example
1296 category() is Letter_Uppercase.
1297
1298 \sa isLower(), toUpper(), toLower()
1299*/
1300
1301/*!
1302 \fn static bool QChar::isUpper(char32_t ucs4)
1303 \overload
1304 \since 5.0
1305
1306 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1307 is an uppercase letter, for example category() is Letter_Uppercase.
1308
1309 \note Before Qt 6, this function took a \c uint argument.
1310
1311 \sa isLower(), toUpper(), toLower()
1312*/
1313
1314/*!
1315 \fn bool QChar::isTitleCase() const
1316
1317 Returns \c true if the character is a titlecase letter, for example
1318 category() is Letter_Titlecase.
1319
1320 \sa isLower(), toUpper(), toLower(), toTitleCase()
1321*/
1322
1323/*!
1324 \fn static bool QChar::isTitleCase(char32_t ucs4)
1325 \overload
1326 \since 5.0
1327
1328 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1329 is a titlecase letter, for example category() is Letter_Titlecase.
1330
1331 \note Before Qt 6, this function took a \c uint argument.
1332
1333 \sa isLower(), toUpper(), toLower(), toTitleCase()
1334*/
1335/*!
1336 \fn QChar QChar::mirroredChar() const
1337
1338 Returns the mirrored character if this character is a mirrored
1339 character; otherwise returns the character itself.
1340
1341 \sa hasMirrored()
1342*/
1343
1344/*!
1345 \overload
1346 Returns the mirrored character if the UCS-4-encoded character specified
1347 by \a ucs4 is a mirrored character; otherwise returns the character itself.
1348
1349 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1350
1351 \sa hasMirrored()
1352*/
1353char32_t QChar::mirroredChar(char32_t ucs4) noexcept
1354{
1355 if (ucs4 > LastValidCodePoint)
1356 return ucs4;
1357 return ucs4 + qGetProp(ucs4)->mirrorDiff;
1358}
1359
1360// Constants for Hangul (de)composition, see UAX #15:
1361static constexpr char32_t Hangul_SBase = 0xac00;
1362static constexpr char32_t Hangul_LBase = 0x1100;
1363static constexpr char32_t Hangul_VBase = 0x1161;
1364static constexpr char32_t Hangul_TBase = 0x11a7;
1365static constexpr quint32 Hangul_LCount = 19;
1366static constexpr quint32 Hangul_VCount = 21;
1367static constexpr quint32 Hangul_TCount = 28;
1368static constexpr quint32 Hangul_NCount = Hangul_VCount * Hangul_TCount;
1369static constexpr quint32 Hangul_SCount = Hangul_LCount * Hangul_NCount;
1370
1371// buffer has to have a length of 3. It's needed for Hangul decomposition
1372static const QChar * QT_FASTCALL decompositionHelper(
1373 char32_t ucs4, qsizetype *length, QChar::Decomposition *tag, QChar *buffer)
1374{
1375 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1376 // compute Hangul syllable decomposition as per UAX #15
1377 const char32_t SIndex = ucs4 - Hangul_SBase;
1378 buffer[0] = QChar(Hangul_LBase + SIndex / Hangul_NCount); // L
1379 buffer[1] = QChar(Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount); // V
1380 buffer[2] = QChar(Hangul_TBase + SIndex % Hangul_TCount); // T
1381 *length = buffer[2].unicode() == Hangul_TBase ? 2 : 3;
1382 *tag = QChar::Canonical;
1383 return buffer;
1384 }
1385
1386 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1387 if (index == 0xffff) {
1388 *length = 0;
1389 *tag = QChar::NoDecomposition;
1390 return nullptr;
1391 }
1392
1393 const unsigned short *decomposition = uc_decomposition_map+index;
1394 *tag = QChar::Decomposition((*decomposition) & 0xff);
1395 *length = (*decomposition) >> 8;
1396 return reinterpret_cast<const QChar *>(decomposition + 1);
1397}
1398
1399/*!
    Decomposes a character into its constituent parts. Returns an empty string
1401 if no decomposition exists.
1402*/
1403QString QChar::decomposition() const
1404{
1405 return QChar::decomposition(ucs4: ucs);
1406}
1407
1408/*!
1409 \overload
    Decomposes the UCS-4-encoded character specified by \a ucs4 into its
1411 constituent parts. Returns an empty string if no decomposition exists.
1412
1413 \note Before Qt 6, this function took a \c uint argument.
1414*/
1415QString QChar::decomposition(char32_t ucs4)
1416{
1417 QChar buffer[3];
1418 qsizetype length;
1419 QChar::Decomposition tag;
1420 const QChar *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1421 return QString(d, length);
1422}
1423
1424/*!
1425 \fn QChar::Decomposition QChar::decompositionTag() const
1426
1427 Returns the tag defining the composition of the character. Returns
1428 QChar::NoDecomposition if no decomposition exists.
1429*/
1430
1431/*!
1432 \overload
1433 Returns the tag defining the composition of the UCS-4-encoded character
1434 specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1435
1436 \note Before Qt 6, this function took a \c uint argument.
1437*/
1438QChar::Decomposition QChar::decompositionTag(char32_t ucs4) noexcept
1439{
1440 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1441 return QChar::Canonical;
1442 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1443 if (index == 0xffff)
1444 return QChar::NoDecomposition;
1445 return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
1446}
1447
1448/*!
1449 \fn unsigned char QChar::combiningClass() const
1450
1451 Returns the combining class for the character as defined in the
1452 Unicode standard. This is mainly useful as a positioning hint for
1453 marks attached to a base character.
1454
1455 The Qt text rendering engine uses this information to correctly
1456 position non-spacing marks around a base character.
1457*/
1458
1459/*!
1460 \overload
1461 Returns the combining class for the UCS-4-encoded character specified by
1462 \a ucs4, as defined in the Unicode standard.
1463
1464 \note Before Qt 6, this function took a \c uint argument.
1465*/
1466unsigned char QChar::combiningClass(char32_t ucs4) noexcept
1467{
1468 if (ucs4 > LastValidCodePoint)
1469 return 0;
1470 return (unsigned char) qGetProp(ucs4)->combiningClass;
1471}
1472
1473/*!
1474 \fn QChar::Script QChar::script() const
1475 \since 5.1
1476
1477 Returns the Unicode script property value for this character.
1478*/
1479
1480/*!
1481 \overload
1482 \since 5.1
1483
1484 Returns the Unicode script property value for the character specified in
1485 its UCS-4-encoded form as \a ucs4.
1486
1487 \note Before Qt 6, this function took a \c uint argument.
1488*/
1489QChar::Script QChar::script(char32_t ucs4) noexcept
1490{
1491 if (ucs4 > LastValidCodePoint)
1492 return QChar::Script_Unknown;
1493 return (QChar::Script) qGetProp(ucs4)->script;
1494}
1495
1496/*!
1497 \fn QChar::UnicodeVersion QChar::unicodeVersion() const
1498
1499 Returns the Unicode version that introduced this character.
1500*/
1501
1502/*!
1503 \overload
1504 Returns the Unicode version that introduced the character specified in
1505 its UCS-4-encoded form as \a ucs4.
1506
1507 \note Before Qt 6, this function took a \c uint argument.
1508*/
1509QChar::UnicodeVersion QChar::unicodeVersion(char32_t ucs4) noexcept
1510{
1511 if (ucs4 > LastValidCodePoint)
1512 return QChar::Unicode_Unassigned;
1513 return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1514}
1515
1516/*!
1517 Returns the most recent supported Unicode version.
1518*/
1519QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
1520{
1521 return UNICODE_DATA_VERSION;
1522}
1523
1524static auto fullConvertCase(char32_t uc, QUnicodeTables::Case which) noexcept
1525{
1526 struct R {
1527 char16_t chars[MaxSpecialCaseLength + 1];
1528 qint8 sz;
1529
1530 // iterable
1531 auto begin() const { return chars; }
1532 auto end() const { return chars + sz; }
1533 // QStringView-compatible
1534 auto data() const { return chars; }
1535 auto size() const { return sz; }
1536 } result;
1537 Q_ASSERT(uc <= QChar::LastValidCodePoint);
1538
1539 auto pp = result.chars;
1540
1541 const auto fold = qGetProp(ucs4: uc)->cases[which];
1542 const auto caseDiff = fold.diff;
1543
1544 if (Q_UNLIKELY(fold.special)) {
1545 const auto *specialCase = specialCaseMap + caseDiff;
1546 auto length = *specialCase++;
1547 while (length--)
1548 *pp++ = *specialCase++;
1549 } else {
1550 // so far, case conversion never changes planes (guaranteed by the qunicodetables generator)
1551 for (char16_t c : QChar::fromUcs4(c: uc + caseDiff))
1552 *pp++ = c;
1553 }
1554 result.sz = pp - result.chars;
1555 return result;
1556}
1557
1558template <typename T>
1559Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
1560{
1561 const auto fold = qGetProp(uc)->cases[which];
1562
1563 if (Q_UNLIKELY(fold.special)) {
1564 const ushort *specialCase = specialCaseMap + fold.diff;
1565 // so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator)
1566 return *specialCase == 1 ? specialCase[1] : uc;
1567 }
1568
1569 return uc + fold.diff;
1570}
1571
1572/*!
1573 \fn QChar QChar::toLower() const
1574
1575 Returns the lowercase equivalent if the character is uppercase or titlecase;
1576 otherwise returns the character itself.
1577*/
1578
1579/*!
1580 \overload
1581 Returns the lowercase equivalent of the UCS-4-encoded character specified
1582 by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1583 the character itself.
1584
1585 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1586*/
1587char32_t QChar::toLower(char32_t ucs4) noexcept
1588{
1589 if (ucs4 > LastValidCodePoint)
1590 return ucs4;
1591 return convertCase_helper(uc: ucs4, which: QUnicodeTables::LowerCase);
1592}
1593
1594/*!
1595 \fn QChar QChar::toUpper() const
1596
1597 Returns the uppercase equivalent if the character is lowercase or titlecase;
1598 otherwise returns the character itself.
1599
1600 \note This function also returns the original character in the rare case of
1601 the uppercase form of the character requiring two or more characters.
1602
1603 \sa QString::toUpper()
1604*/
1605
1606/*!
1607 \overload
1608 Returns the uppercase equivalent of the UCS-4-encoded character specified
1609 by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1610 the character itself.
1611
1612 \note This function also returns the original character in the rare case of
1613 the uppercase form of the character requiring two or more characters.
1614
1615 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1616
1617 \sa QString::toUpper()
1618*/
1619char32_t QChar::toUpper(char32_t ucs4) noexcept
1620{
1621 if (ucs4 > LastValidCodePoint)
1622 return ucs4;
1623 return convertCase_helper(uc: ucs4, which: QUnicodeTables::UpperCase);
1624}
1625
1626/*!
1627 \fn QChar QChar::toTitleCase() const
1628
1629 Returns the title case equivalent if the character is lowercase or uppercase;
1630 otherwise returns the character itself.
1631*/
1632
1633/*!
1634 \overload
1635 Returns the title case equivalent of the UCS-4-encoded character specified
1636 by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1637 the character itself.
1638
1639 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1640*/
1641char32_t QChar::toTitleCase(char32_t ucs4) noexcept
1642{
1643 if (ucs4 > LastValidCodePoint)
1644 return ucs4;
1645 return convertCase_helper(uc: ucs4, which: QUnicodeTables::TitleCase);
1646}
1647
1648static inline char32_t foldCase(const char16_t *ch, const char16_t *start)
1649{
1650 char32_t ucs4 = *ch;
1651 if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(ucs4: *(ch - 1)))
1652 ucs4 = QChar::surrogateToUcs4(high: *(ch - 1), low: ucs4);
1653 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1654}
1655
1656static inline char32_t foldCase(char32_t ch, char32_t &last) noexcept
1657{
1658 char32_t ucs4 = ch;
1659 if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(ucs4: last))
1660 ucs4 = QChar::surrogateToUcs4(high: last, low: ucs4);
1661 last = ch;
1662 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1663}
1664
1665static inline char16_t foldCase(char16_t ch) noexcept
1666{
1667 return convertCase_helper(uc: ch, which: QUnicodeTables::CaseFold);
1668}
1669
1670static inline QChar foldCase(QChar ch) noexcept
1671{
1672 return QChar(foldCase(ch: ch.unicode()));
1673}
1674
1675/*!
1676 \fn QChar QChar::toCaseFolded() const
1677
1678 Returns the case folded equivalent of the character.
1679 For most Unicode characters this is the same as toLower().
1680*/
1681
1682/*!
1683 \overload
1684 Returns the case folded equivalent of the UCS-4-encoded character specified
1685 by \a ucs4. For most Unicode characters this is the same as toLower().
1686
1687 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1688*/
1689char32_t QChar::toCaseFolded(char32_t ucs4) noexcept
1690{
1691 if (ucs4 > LastValidCodePoint)
1692 return ucs4;
1693 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1694}
1695
1696/*!
1697 \fn char QChar::toLatin1() const
1698
1699 Returns the Latin-1 character equivalent to the QChar, or 0. This
1700 is mainly useful for non-internationalized software.
1701
1702 \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1703 (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1704
1705 \sa unicode()
1706*/
1707
1708/*!
1709 \fn QChar QChar::fromLatin1(char)
1710
1711 Converts the Latin-1 character \a c to its equivalent QChar. This
1712 is mainly useful for non-internationalized software.
1713
1714 An alternative is to use QLatin1Char.
1715
1716 \sa toLatin1(), unicode()
1717*/
1718
1719#ifndef QT_NO_DATASTREAM
1720/*!
1721 \relates QChar
1722
1723 Writes the char \a chr to the stream \a out.
1724
1725 \sa {Serializing Qt Data Types}
1726*/
1727QDataStream &operator<<(QDataStream &out, QChar chr)
1728{
1729 out << quint16(chr.unicode());
1730 return out;
1731}
1732
1733/*!
1734 \relates QChar
1735
1736 Reads a char from the stream \a in into char \a chr.
1737
1738 \sa {Serializing Qt Data Types}
1739*/
1740QDataStream &operator>>(QDataStream &in, QChar &chr)
1741{
1742 quint16 u;
1743 in >> u;
1744 chr.unicode() = char16_t(u);
1745 return in;
1746}
1747#endif // QT_NO_DATASTREAM
1748
1749/*!
1750 \fn QChar::unicode()
1751
1752 Returns a reference to the numeric Unicode value of the QChar.
1753*/
1754
1755/*!
1756 \fn QChar::unicode() const
1757
1758 Returns the numeric Unicode value of the QChar.
1759*/
1760
1761/*****************************************************************************
1762 Documentation of QChar related functions
1763 *****************************************************************************/
1764
1765/*!
1766 \fn bool QChar::operator==(const QChar &c1, const QChar &c2)
1767
1768 Returns \c true if \a c1 and \a c2 are the same Unicode character;
1769 otherwise returns \c false.
1770*/
1771
1772/*!
1773 \fn bool QChar::operator!=(const QChar &c1, const QChar &c2)
1774
1775 Returns \c true if \a c1 and \a c2 are not the same Unicode
1776 character; otherwise returns \c false.
1777*/
1778
1779/*!
1780 \fn bool QChar::operator<=(const QChar &c1, const QChar &c2)
1781
1782 Returns \c true if the numeric Unicode value of \a c1 is less than
1783 or equal to that of \a c2; otherwise returns \c false.
1784*/
1785
1786/*!
1787 \fn bool QChar::operator>=(const QChar &c1, const QChar &c2)
1788
1789 Returns \c true if the numeric Unicode value of \a c1 is greater than
1790 or equal to that of \a c2; otherwise returns \c false.
1791*/
1792
1793/*!
1794 \fn bool QChar::operator<(const QChar &c1, const QChar &c2)
1795
1796 Returns \c true if the numeric Unicode value of \a c1 is less than
1797 that of \a c2; otherwise returns \c false.
1798*/
1799
1800/*!
1801 \fn bool QChar::operator>(const QChar &c1, const QChar &c2)
1802
1803 Returns \c true if the numeric Unicode value of \a c1 is greater than
1804 that of \a c2; otherwise returns \c false.
1805*/
1806
1807/*!
1808 \fn Qt::Literals::StringLiterals::operator""_L1(char ch)
1809
1810 \relates QLatin1Char
1811 \since 6.4
1812
1813 Literal operator that creates a QLatin1Char out of \a ch.
1814
1815 The following code creates a QLatin1Char:
1816 \code
1817 using namespace Qt::Literals::StringLiterals;
1818
1819 auto ch = 'a'_L1;
1820 \endcode
1821
1822 \sa Qt::Literals::StringLiterals
1823*/
1824
1825// ---------------------------------------------------------------------------
1826
1827
1828static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, qsizetype from)
1829{
1830 qsizetype length;
1831 QChar::Decomposition tag;
1832 QChar buffer[3];
1833
1834 QString &s = *str;
1835
1836 const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1837 const unsigned short *uc = utf16 + s.size();
1838 while (uc != utf16 + from) {
1839 char32_t ucs4 = *(--uc);
1840 if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1841 ushort high = *(uc - 1);
1842 if (QChar(high).isHighSurrogate()) {
1843 --uc;
1844 ucs4 = QChar::surrogateToUcs4(high, low: ucs4);
1845 }
1846 }
1847
1848 if (QChar::unicodeVersion(ucs4) > version)
1849 continue;
1850
1851 const QChar *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1852 if (!d || (canonical && tag != QChar::Canonical))
1853 continue;
1854
1855 qsizetype pos = uc - utf16;
1856 s.replace(i: pos, len: QChar::requiresSurrogates(ucs4) ? 2 : 1, s: d, slen: length);
1857 // since the replace invalidates the pointers and we do decomposition recursive
1858 utf16 = reinterpret_cast<unsigned short *>(s.data());
1859 uc = utf16 + pos + length;
1860 }
1861}
1862
1863
1864struct UCS2Pair {
1865 ushort u1;
1866 ushort u2;
1867};
1868
1869inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2)
1870{ return ligature1.u1 < ligature2.u1; }
1871inline bool operator<(ushort u1, const UCS2Pair &ligature)
1872{ return u1 < ligature.u1; }
1873inline bool operator<(const UCS2Pair &ligature, ushort u1)
1874{ return ligature.u1 < u1; }
1875
1876struct UCS2SurrogatePair {
1877 UCS2Pair p1;
1878 UCS2Pair p2;
1879};
1880
1881inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2)
1882{ return QChar::surrogateToUcs4(high: ligature1.p1.u1, low: ligature1.p1.u2) < QChar::surrogateToUcs4(high: ligature2.p1.u1, low: ligature2.p1.u2); }
1883inline bool operator<(char32_t u1, const UCS2SurrogatePair &ligature)
1884{ return u1 < QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2); }
1885inline bool operator<(const UCS2SurrogatePair &ligature, char32_t u1)
1886{ return QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2) < u1; }
1887
1888static char32_t inline ligatureHelper(char32_t u1, char32_t u2)
1889{
1890 if (u1 >= Hangul_LBase && u1 < Hangul_SBase + Hangul_SCount) {
1891 // compute Hangul syllable composition as per UAX #15
1892 // hangul L-V pair
1893 const char32_t LIndex = u1 - Hangul_LBase;
1894 if (LIndex < Hangul_LCount) {
1895 const char32_t VIndex = u2 - Hangul_VBase;
1896 if (VIndex < Hangul_VCount)
1897 return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1898 }
1899 // hangul LV-T pair
1900 const char32_t SIndex = u1 - Hangul_SBase;
1901 if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1902 const char32_t TIndex = u2 - Hangul_TBase;
1903 if (TIndex < Hangul_TCount && TIndex)
1904 return u1 + TIndex;
1905 }
1906 }
1907
1908 const unsigned short index = GET_LIGATURE_INDEX(u2);
1909 if (index == 0xffff)
1910 return 0;
1911 const unsigned short *ligatures = uc_ligature_map+index;
1912 ushort length = *ligatures++;
1913 if (QChar::requiresSurrogates(ucs4: u1)) {
1914 const UCS2SurrogatePair *data = reinterpret_cast<const UCS2SurrogatePair *>(ligatures);
1915 const UCS2SurrogatePair *r = std::lower_bound(first: data, last: data + length, val: u1);
1916 if (r != data + length && QChar::surrogateToUcs4(high: r->p1.u1, low: r->p1.u2) == u1)
1917 return QChar::surrogateToUcs4(high: r->p2.u1, low: r->p2.u2);
1918 } else {
1919 const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1920 const UCS2Pair *r = std::lower_bound(first: data, last: data + length, val: ushort(u1));
1921 if (r != data + length && r->u1 == ushort(u1))
1922 return r->u2;
1923 }
1924
1925 return 0;
1926}
1927
1928static void composeHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1929{
1930 QString &s = *str;
1931
1932 if (from < 0 || s.size() - from < 2)
1933 return;
1934
1935 char32_t stcode = 0; // starter code point
1936 qsizetype starter = -1; // starter position
1937 qsizetype next = -1; // to prevent i == next
1938 int lastCombining = 255; // to prevent combining > lastCombining
1939
1940 qsizetype pos = from;
1941 while (pos < s.size()) {
1942 qsizetype i = pos;
1943 char32_t uc = s.at(i: pos).unicode();
1944 if (QChar(uc).isHighSurrogate() && pos < s.size()-1) {
1945 ushort low = s.at(i: pos+1).unicode();
1946 if (QChar(low).isLowSurrogate()) {
1947 uc = QChar::surrogateToUcs4(high: uc, low);
1948 ++pos;
1949 }
1950 }
1951
1952 const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
1953 if (p->unicodeVersion > version) {
1954 starter = -1;
1955 next = -1; // to prevent i == next
1956 lastCombining = 255; // to prevent combining > lastCombining
1957 ++pos;
1958 continue;
1959 }
1960
1961 int combining = p->combiningClass;
1962 if ((i == next || combining > lastCombining) && starter >= from) {
1963 // allowed to form ligature with S
1964 char32_t ligature = ligatureHelper(u1: stcode, u2: uc);
1965 if (ligature) {
1966 stcode = ligature;
1967 QChar *d = s.data();
1968 // ligatureHelper() never changes planes
1969 qsizetype j = 0;
1970 for (QChar ch : QChar::fromUcs4(c: ligature))
1971 d[starter + j++] = ch;
1972 s.remove(i, len: j);
1973 continue;
1974 }
1975 }
1976 if (combining == 0) {
1977 starter = i;
1978 stcode = uc;
1979 next = pos + 1;
1980 }
1981 lastCombining = combining;
1982
1983 ++pos;
1984 }
1985}
1986
1987
1988static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1989{
1990 QString &s = *str;
1991 const qsizetype l = s.size()-1;
1992
1993 char32_t u1, u2;
1994 char16_t c1, c2;
1995
1996 qsizetype pos = from;
1997 while (pos < l) {
1998 qsizetype p2 = pos+1;
1999 u1 = s.at(i: pos).unicode();
2000 if (QChar::isHighSurrogate(ucs4: u1)) {
2001 const char16_t low = s.at(i: p2).unicode();
2002 if (QChar::isLowSurrogate(ucs4: low)) {
2003 u1 = QChar::surrogateToUcs4(high: u1, low);
2004 if (p2 >= l)
2005 break;
2006 ++p2;
2007 }
2008 }
2009 c1 = 0;
2010
2011 advance:
2012 u2 = s.at(i: p2).unicode();
2013 if (QChar::isHighSurrogate(ucs4: u2) && p2 < l) {
2014 const char16_t low = s.at(i: p2+1).unicode();
2015 if (QChar::isLowSurrogate(ucs4: low)) {
2016 u2 = QChar::surrogateToUcs4(high: u2, low);
2017 ++p2;
2018 }
2019 }
2020
2021 c2 = 0;
2022 {
2023 const QUnicodeTables::Properties *p = qGetProp(ucs4: u2);
2024 if (p->unicodeVersion <= version)
2025 c2 = p->combiningClass;
2026 }
2027 if (c2 == 0) {
2028 pos = p2+1;
2029 continue;
2030 }
2031
2032 if (c1 == 0) {
2033 const QUnicodeTables::Properties *p = qGetProp(ucs4: u1);
2034 if (p->unicodeVersion <= version)
2035 c1 = p->combiningClass;
2036 }
2037
2038 if (c1 > c2) {
2039 QChar *uc = s.data();
2040 qsizetype p = pos;
2041 // exchange characters
2042 for (QChar ch : QChar::fromUcs4(c: u2))
2043 uc[p++] = ch;
2044 for (QChar ch : QChar::fromUcs4(c: u1))
2045 uc[p++] = ch;
2046 if (pos > 0)
2047 --pos;
2048 if (pos > 0 && s.at(i: pos).isLowSurrogate())
2049 --pos;
2050 } else {
2051 ++pos;
2052 if (QChar::requiresSurrogates(ucs4: u1))
2053 ++pos;
2054
2055 u1 = u2;
2056 c1 = c2; // != 0
2057 p2 = pos + 1;
2058 if (QChar::requiresSurrogates(ucs4: u1))
2059 ++p2;
2060 if (p2 > l)
2061 break;
2062
2063 goto advance;
2064 }
2065 }
2066}
2067
2068// returns true if the text is in a desired Normalization Form already; false otherwise.
2069// sets lastStable to the position of the last stable code point
2070static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, qsizetype from, qsizetype *lastStable)
2071{
2072 static_assert(QString::NormalizationForm_D == 0);
2073 static_assert(QString::NormalizationForm_C == 1);
2074 static_assert(QString::NormalizationForm_KD == 2);
2075 static_assert(QString::NormalizationForm_KC == 3);
2076
2077 enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
2078
2079 const auto *string = reinterpret_cast<const char16_t *>(str->constData());
2080 qsizetype length = str->size();
2081
2082 // this avoids one out of bounds check in the loop
2083 while (length > from && QChar::isHighSurrogate(ucs4: string[length - 1]))
2084 --length;
2085
2086 uchar lastCombining = 0;
2087 for (qsizetype i = from; i < length; ++i) {
2088 qsizetype pos = i;
2089 char32_t uc = string[i];
2090 if (uc < 0x80) {
2091 // ASCII characters are stable code points
2092 lastCombining = 0;
2093 *lastStable = pos;
2094 continue;
2095 }
2096
2097 if (QChar::isHighSurrogate(ucs4: uc)) {
2098 ushort low = string[i + 1];
2099 if (!QChar::isLowSurrogate(ucs4: low)) {
2100 // treat surrogate like stable code point
2101 lastCombining = 0;
2102 *lastStable = pos;
2103 continue;
2104 }
2105 ++i;
2106 uc = QChar::surrogateToUcs4(high: uc, low);
2107 }
2108
2109 const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
2110
2111 if (p->combiningClass < lastCombining && p->combiningClass > 0)
2112 return false;
2113
2114 const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03;
2115 if (check != NFQC_YES)
2116 return false; // ### can we quick check NFQC_MAYBE ?
2117
2118 lastCombining = p->combiningClass;
2119 if (lastCombining == 0)
2120 *lastStable = pos;
2121 }
2122
2123 if (length != str->size()) // low surrogate parts at the end of text
2124 *lastStable = str->size() - 1;
2125
2126 return true;
2127}
2128
2129QT_END_NAMESPACE
2130

Provided by KDAB

Privacy Policy
Start learning QML with our Intro Training
Find out more

source code of qtbase/src/corelib/text/qchar.cpp