1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qchar.h"
5
6#include "qdatastream.h"
7
8#include "qunicodetables_p.h"
9#include "qunicodetables.cpp"
10
11#include <algorithm>
12
13QT_BEGIN_NAMESPACE
14
// Yields an int with only bit number 'bit' set. The classification functions
// below use it to turn a QChar::Category value into a mask bit, so that
// membership in a set of categories can be tested with a single AND.
#define FLAG(bit) (1 << (bit))
16
17/*!
18 \class QLatin1Char
19 \inmodule QtCore
20 \reentrant
21 \brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
22
23 \ingroup string-processing
24
    This class is only useful for constructing a QChar from an 8-bit character.
26
27 \sa QChar, QLatin1StringView, QString
28*/
29
30/*!
31 \fn const char QLatin1Char::toLatin1() const
32
33 Converts a Latin-1 character to an 8-bit ASCII representation of the character.
34*/
35
36/*!
37 \fn QLatin1Char::unicode() const
38
    Converts a Latin-1 character to a 16-bit-encoded Unicode representation
40 of the character.
41*/
42
43/*!
44 \fn QLatin1Char::QLatin1Char(char c)
45
46 Constructs a Latin-1 character for \a c. This constructor should be
47 used when the encoding of the input character is known to be Latin-1.
48*/
49
50/*!
51 \class QChar
52 \inmodule QtCore
53 \brief The QChar class provides a 16-bit Unicode character.
54
55 \ingroup string-processing
56 \reentrant
57
58 \compares strong
59 \compareswith strong char16_t QString QStringView QLatin1StringView QUtf8StringView
60 \endcompareswith
61 \compareswith strong {const char *} QByteArray QByteArrayView
    The contents of the byte array are interpreted as UTF-8.
63 \endcompareswith
64
65 In Qt, Unicode characters are 16-bit entities without any markup
66 or structure. This class represents such an entity. It is
67 lightweight, so it can be used everywhere. Most compilers treat
68 it like an \c{unsigned short}.
69
70 QChar provides a full complement of testing/classification
71 functions, converting to and from other formats, converting from
72 composed to decomposed Unicode, and trying to compare and
73 case-convert if you ask it to.
74
75 The classification functions include functions like those in the
76 standard C++ header \<cctype\> (formerly \<ctype.h\>), but
77 operating on the full range of Unicode characters, not just for the ASCII
78 range. They all return true if the character is a certain type of character;
79 otherwise they return false. These classification functions are
80 isNull() (returns \c true if the character is '\\0'), isPrint()
81 (true if the character is any sort of printable character,
    including whitespace), isPunct() (any sort of punctuation),
83 isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
84 sort of numeric character, not just 0-9), isLetterOrNumber(), and
85 isDigit() (decimal digits). All of these are wrappers around
    category() which returns the Unicode-defined category of each
87 character. Some of these also calculate the derived properties
88 (for example isSpace() returns \c true if the character is of category
89 Separator_* or an exceptional code point from Other_Control category).
90
91 QChar also provides direction(), which indicates the "natural"
92 writing direction of this character. The joiningType() function
    indicates how the character joins with its neighbors (needed
94 mostly for Arabic or Syriac) and finally hasMirrored(), which indicates
95 whether the character needs to be mirrored when it is printed in
    its "unnatural" writing direction.
97
98 Composed Unicode characters (like \a ring) can be converted to
99 decomposed Unicode ("a" followed by "ring above") by using decomposition().
100
101 In Unicode, comparison is not necessarily possible and case
102 conversion is very difficult at best. Unicode, covering the
103 "entire" world, also includes most of the world's case and
104 sorting problems. operator==() and friends will do comparison
105 based purely on the numeric Unicode value (code point) of the
106 characters, and toUpper() and toLower() will do case changes when
107 the character has a well-defined uppercase/lowercase equivalent.
108 For locale-dependent comparisons, use QString::localeAwareCompare().
109
110 The conversion functions include unicode() (to a scalar),
111 toLatin1() (to scalar, but converts all non-Latin-1 characters to
112 0), row() (gives the Unicode row), cell() (gives the Unicode
113 cell), digitValue() (gives the integer value of any of the
114 numerous digit characters), and a host of constructors.
115
116 QChar provides constructors and cast operators that make it easy
117 to convert to and from traditional 8-bit \c{char}s. If you
118 defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
119 explained in the QString documentation, you will need to
120 explicitly call fromLatin1(), or use QLatin1Char,
121 to construct a QChar from an 8-bit \c char, and you will need to
122 call toLatin1() to get the 8-bit value back.
123
124 Starting with Qt 6.0, most QChar constructors are \c explicit. This
125 is done to avoid dangerous mistakes when accidentally mixing
126 integral types and strings.
127
128 For more information see
129 \l{https://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
130
131 \sa Unicode, QString, QLatin1Char
132*/
133
134/*!
135 \enum QChar::UnicodeVersion
136
137 Specifies which version of the \l{Unicode standard} introduced a certain
138 character.
139
140 \value Unicode_1_1 Version 1.1
141 \value Unicode_2_0 Version 2.0
142 \value Unicode_2_1_2 Version 2.1.2
143 \value Unicode_3_0 Version 3.0
144 \value Unicode_3_1 Version 3.1
145 \value Unicode_3_2 Version 3.2
146 \value Unicode_4_0 Version 4.0
147 \value Unicode_4_1 Version 4.1
148 \value Unicode_5_0 Version 5.0
149 \value Unicode_5_1 Version 5.1
150 \value Unicode_5_2 Version 5.2
151 \value Unicode_6_0 Version 6.0
152 \value Unicode_6_1 Version 6.1
153 \value Unicode_6_2 Version 6.2
154 \value [since 5.3] Unicode_6_3 Version 6.3
155 \value [since 5.5] Unicode_7_0 Version 7.0
156 \value [since 5.6] Unicode_8_0 Version 8.0
157 \value [since 5.11] Unicode_9_0 Version 9.0
158 \value [since 5.11] Unicode_10_0 Version 10.0
159 \value [since 5.15] Unicode_11_0 Version 11.0
160 \value [since 5.15] Unicode_12_0 Version 12.0
161 \value [since 5.15] Unicode_12_1 Version 12.1
162 \value [since 5.15] Unicode_13_0 Version 13.0
163 \value [since 6.3] Unicode_14_0 Version 14.0
164 \value [since 6.5] Unicode_15_0 Version 15.0
165 \value [since 6.8] Unicode_15_1 Version 15.1
    \value Unicode_Unassigned  The value is not assigned to any character
                               in the version of Unicode supported by Qt
                               (see currentUnicodeVersion()).
168
169 \sa unicodeVersion(), currentUnicodeVersion()
170*/
171
172/*!
173 \enum QChar::Category
174
175 This enum maps the Unicode character categories.
176
    The following categories are normative in Unicode:
178
179 \value Mark_NonSpacing Unicode class name Mn
180
181 \value Mark_SpacingCombining Unicode class name Mc
182
183 \value Mark_Enclosing Unicode class name Me
184
185 \value Number_DecimalDigit Unicode class name Nd
186
187 \value Number_Letter Unicode class name Nl
188
189 \value Number_Other Unicode class name No
190
191 \value Separator_Space Unicode class name Zs
192
193 \value Separator_Line Unicode class name Zl
194
195 \value Separator_Paragraph Unicode class name Zp
196
197 \value Other_Control Unicode class name Cc
198
199 \value Other_Format Unicode class name Cf
200
201 \value Other_Surrogate Unicode class name Cs
202
203 \value Other_PrivateUse Unicode class name Co
204
205 \value Other_NotAssigned Unicode class name Cn
206
207
208 The following categories are informative in Unicode:
209
210 \value Letter_Uppercase Unicode class name Lu
211
212 \value Letter_Lowercase Unicode class name Ll
213
214 \value Letter_Titlecase Unicode class name Lt
215
216 \value Letter_Modifier Unicode class name Lm
217
218 \value Letter_Other Unicode class name Lo
219
220 \value Punctuation_Connector Unicode class name Pc
221
222 \value Punctuation_Dash Unicode class name Pd
223
224 \value Punctuation_Open Unicode class name Ps
225
226 \value Punctuation_Close Unicode class name Pe
227
228 \value Punctuation_InitialQuote Unicode class name Pi
229
230 \value Punctuation_FinalQuote Unicode class name Pf
231
232 \value Punctuation_Other Unicode class name Po
233
234 \value Symbol_Math Unicode class name Sm
235
236 \value Symbol_Currency Unicode class name Sc
237
238 \value Symbol_Modifier Unicode class name Sk
239
240 \value Symbol_Other Unicode class name So
241
242 \sa category()
243*/
244
245/*!
246 \enum QChar::Script
247 \since 5.1
248
249 This enum type defines the Unicode script property values.
250
251 For details about the Unicode script property values see
252 \l{https://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}.
253
254 In order to conform to C/C++ naming conventions "Script_" is prepended
255 to the codes used in the Unicode Standard.
256
257 \value Script_Unknown For unassigned, private-use, noncharacter, and surrogate code points.
258 \value Script_Inherited For characters that may be used with multiple scripts
259 and that inherit their script from the preceding characters.
260 These include nonspacing marks, enclosing marks,
261 and zero width joiner/non-joiner characters.
262 \value Script_Common For characters that may be used with multiple scripts
263 and that do not inherit their script from the preceding characters.
264
265 \value [since 5.11] Script_Adlam
266 \value [since 5.6] Script_Ahom
267 \value [since 5.6] Script_AnatolianHieroglyphs
268 \value Script_Arabic
269 \value Script_Armenian
270 \value Script_Avestan
271 \value Script_Balinese
272 \value Script_Bamum
273 \value [since 5.5] Script_BassaVah
274 \value Script_Batak
275 \value Script_Bengali
276 \value [since 5.11] Script_Bhaiksuki
277 \value Script_Bopomofo
278 \value Script_Brahmi
279 \value Script_Braille
280 \value Script_Buginese
281 \value Script_Buhid
282 \value Script_CanadianAboriginal
283 \value Script_Carian
284 \value [since 5.5] Script_CaucasianAlbanian
285 \value Script_Chakma
286 \value Script_Cham
287 \value Script_Cherokee
288 \value [since 5.15] Script_Chorasmian
289 \value Script_Coptic
290 \value Script_Cuneiform
291 \value Script_Cypriot
292 \value [since 6.3] Script_CyproMinoan
293 \value Script_Cyrillic
294 \value Script_Deseret
295 \value Script_Devanagari
296 \value [since 5.15] Script_DivesAkuru
297 \value [since 5.15] Script_Dogra
298 \value [since 5.5] Script_Duployan
299 \value Script_EgyptianHieroglyphs
300 \value [since 5.5] Script_Elbasan
301 \value [since 5.15] Script_Elymaic
302 \value Script_Ethiopic
303 \value Script_Georgian
304 \value Script_Glagolitic
305 \value Script_Gothic
306 \value [since 5.5] Script_Grantha
307 \value Script_Greek
308 \value Script_Gujarati
309 \value [since 5.15] Script_GunjalaGondi
310 \value Script_Gurmukhi
311 \value Script_Han
312 \value Script_Hangul
313 \value [since 5.15] Script_HanifiRohingya
314 \value Script_Hanunoo
315 \value [since 5.6] Script_Hatran
316 \value Script_Hebrew
317 \value Script_Hiragana
318 \value Script_ImperialAramaic
319 \value Script_InscriptionalPahlavi
320 \value Script_InscriptionalParthian
321 \value Script_Javanese
322 \value Script_Kaithi
323 \value Script_Kannada
324 \value Script_Katakana
325 \value [since 6.5] Script_Kawi
326 \value Script_KayahLi
327 \value Script_Kharoshthi
328 \value [since 5.15] Script_KhitanSmallScript
329 \value Script_Khmer
330 \value [since 5.5] Script_Khojki
331 \value [since 5.5] Script_Khudawadi
332 \value Script_Lao
333 \value Script_Latin
334 \value Script_Lepcha
335 \value Script_Limbu
336 \value [since 5.5] Script_LinearA
337 \value Script_LinearB
338 \value Script_Lisu
339 \value Script_Lycian
340 \value Script_Lydian
341 \value [since 5.5] Script_Mahajani
342 \value [since 5.15] Script_Makasar
343 \value Script_Malayalam
344 \value Script_Mandaic
345 \value [since 5.5] Script_Manichaean
346 \value [since 5.11] Script_Marchen
347 \value [since 5.11] Script_MasaramGondi
348 \value [since 5.15] Script_Medefaidrin
349 \value Script_MeeteiMayek
350 \value [since 5.5] Script_MendeKikakui
351 \value Script_MeroiticCursive
352 \value Script_MeroiticHieroglyphs
353 \value Script_Miao
354 \value [since 5.5] Script_Modi
355 \value Script_Mongolian
356 \value [since 5.5] Script_Mro
357 \value [since 5.6] Script_Multani
358 \value Script_Myanmar
359 \value [since 5.5] Script_Nabataean
360 \value [since 6.3] Script_NagMundari
361 \value [since 5.15] Script_Nandinagari
362 \value [since 5.11] Script_Newa
363 \value Script_NewTaiLue
364 \value Script_Nko
365 \value [since 5.11] Script_Nushu
366 \value [since 5.15] Script_NyiakengPuachueHmong
367 \value Script_Ogham
368 \value Script_OlChiki
369 \value [since 5.6] Script_OldHungarian
370 \value Script_OldItalic
371 \value [since 5.5] Script_OldNorthArabian
372 \value [since 5.5] Script_OldPermic
373 \value Script_OldPersian
374 \value [since 5.15] Script_OldSogdian
375 \value Script_OldSouthArabian
376 \value Script_OldTurkic
377 \value [since 6.3] Script_OldUyghur
378 \value Script_Oriya
379 \value [since 5.11] Script_Osage
380 \value Script_Osmanya
381 \value [since 5.5] Script_PahawhHmong
382 \value [since 5.5] Script_Palmyrene
383 \value [since 5.5] Script_PauCinHau
384 \value Script_PhagsPa
385 \value Script_Phoenician
386 \value [since 5.5] Script_PsalterPahlavi
387 \value Script_Rejang
388 \value Script_Runic
389 \value Script_Samaritan
390 \value Script_Saurashtra
391 \value Script_Sharada
392 \value Script_Shavian
393 \value [since 5.5] Script_Siddham
394 \value [since 5.6] Script_SignWriting
395 \value Script_Sinhala
396 \value [since 5.15] Script_Sogdian
397 \value Script_SoraSompeng
398 \value [since 5.11] Script_Soyombo
399 \value Script_Sundanese
400 \value Script_SylotiNagri
401 \value Script_Syriac
402 \value Script_Tagalog
403 \value Script_Tagbanwa
404 \value Script_TaiLe
405 \value Script_TaiTham
406 \value Script_TaiViet
407 \value Script_Takri
408 \value Script_Tamil
409 \value [since 5.11] Script_Tangut
410 \value [since 6.3] Script_Tangsa
411 \value Script_Telugu
412 \value Script_Thaana
413 \value Script_Thai
414 \value Script_Tibetan
415 \value Script_Tifinagh
416 \value [since 5.5] Script_Tirhuta
417 \value [since 6.3] Script_Toto
418 \value Script_Ugaritic
419 \value Script_Vai
420 \value [since 6.3] Script_Vithkuqi
421 \value [since 5.15] Script_Wancho
422 \value [since 5.5] Script_WarangCiti
423 \value [since 5.15] Script_Yezidi
424 \value Script_Yi
425 \value [since 5.11] Script_ZanabazarSquare
426
427 \omitvalue ScriptCount
428
429 \sa script()
430*/
431
432/*!
433 \enum QChar::Direction
434
435 This enum type defines the Unicode direction attributes. See the
436 \l{https://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode
437 Standard} for a description of the values.
438
439 In order to conform to C/C++ naming conventions "Dir" is prepended
440 to the codes used in the Unicode Standard.
441
442 \value DirAL
443 \value DirAN
444 \value DirB
445 \value DirBN
446 \value DirCS
447 \value DirEN
448 \value DirES
449 \value DirET
450 \value [since 5.3] DirFSI
451 \value DirL
452 \value DirLRE
453 \value [since 5.3] DirLRI
454 \value DirLRO
455 \value DirNSM
456 \value DirON
457 \value DirPDF
458 \value [since 5.3] DirPDI
459 \value DirR
460 \value DirRLE
461 \value [since 5.3] DirRLI
462 \value DirRLO
463 \value DirS
464 \value DirWS
465
466 \sa direction()
467*/
468
469/*!
470 \enum QChar::Decomposition
471
472 This enum type defines the Unicode decomposition attributes. See
473 the \l{Unicode standard} for a description of the values.
474
475 \value NoDecomposition
476 \value Canonical
477 \value Circle
478 \value Compat
479 \value Final
480 \value Font
481 \value Fraction
482 \value Initial
483 \value Isolated
484 \value Medial
485 \value Narrow
486 \value NoBreak
487 \value Small
488 \value Square
489 \value Sub
490 \value Super
491 \value Vertical
492 \value Wide
493
494 \sa decomposition()
495*/
496
497/*!
498 \enum QChar::JoiningType
    \since 5.3
500
501 This enum type defines the Unicode joining type attributes. See the
502 \l{Unicode standard} for a description of the values.
503
504 In order to conform to C/C++ naming conventions "Joining_" is prepended
505 to the codes used in the Unicode Standard.
506
507 \value Joining_None
508 \value Joining_Causing
509 \value Joining_Dual
510 \value Joining_Right
511 \value Joining_Left
512 \value Joining_Transparent
513
514 \sa joiningType()
515*/
516
517/*!
518 \enum QChar::CombiningClass
519
520 \internal
521
522 This enum type defines names for some of the Unicode combining
523 classes. See the \l{Unicode Standard} for a description of the values.
524
525 \value Combining_Above
526 \value Combining_AboveAttached
527 \value Combining_AboveLeft
528 \value Combining_AboveLeftAttached
529 \value Combining_AboveRight
530 \value Combining_AboveRightAttached
531 \value Combining_Below
532 \value Combining_BelowAttached
533 \value Combining_BelowLeft
534 \value Combining_BelowLeftAttached
535 \value Combining_BelowRight
536 \value Combining_BelowRightAttached
537 \value Combining_DoubleAbove
538 \value Combining_DoubleBelow
539 \value Combining_IotaSubscript
540 \value Combining_Left
541 \value Combining_LeftAttached
542 \value Combining_Right
543 \value Combining_RightAttached
544*/
545
546/*!
547 \enum QChar::SpecialCharacter
548
549 \value Null A QChar with this value isNull().
550 \value Tabulation Character tabulation.
551 \value LineFeed
552 \value FormFeed
553 \value CarriageReturn
554 \value Space
555 \value Nbsp Non-breaking space.
556 \value SoftHyphen
557 \value ReplacementCharacter The character shown when a font has no glyph
558 for a certain codepoint. A special question mark character is often
559 used. Codecs use this codepoint when input data cannot be
560 represented in Unicode.
561 \value ObjectReplacementCharacter Used to represent an object such as an
562 image when such objects cannot be presented.
563 \value ByteOrderMark
564 \value ByteOrderSwapped
565 \value ParagraphSeparator
566 \value LineSeparator
567 \value [since 6.2] VisualTabCharacter Used to represent a tabulation as a horizontal arrow.
568 \value LastValidCodePoint
569*/
570
571/*!
572 \fn void QChar::setCell(uchar cell)
573 \internal
574*/
575
576/*!
577 \fn void QChar::setRow(uchar row)
578 \internal
579*/
580
581/*!
582 \fn QChar::QChar()
583
584 Constructs a null QChar ('\\0').
585
586 \sa isNull()
587*/
588
589/*!
590 \fn QChar::QChar(QLatin1Char ch)
591
592 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
593*/
594
595/*!
596 \fn QChar::QChar(SpecialCharacter ch)
597
598 Constructs a QChar for the predefined character value \a ch.
599*/
600
601/*!
602 \fn QChar::QChar(char16_t ch)
603 \since 5.10
604
605 Constructs a QChar corresponding to the UTF-16 character \a ch.
606*/
607
608/*!
609 \fn QChar::QChar(wchar_t ch)
610 \since 5.10
611
612 Constructs a QChar corresponding to the wide character \a ch.
613
614 \note This constructor is only available on Windows.
615*/
616
617/*!
618 \fn QChar::QChar(char ch)
619
620 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
621
622 \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
623 is defined.
624
625 \sa QT_NO_CAST_FROM_ASCII
626*/
627
628/*!
629 \fn QChar::QChar(uchar ch)
630
631 Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
632
633 \note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
634 or \c QT_RESTRICTED_CAST_FROM_ASCII is defined.
635
636 \sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII
637*/
638
639/*!
640 \fn QChar::QChar(uchar cell, uchar row)
641
642 Constructs a QChar for Unicode cell \a cell in row \a row.
643
644 \sa cell(), row()
645*/
646
647/*!
648 \fn QChar::QChar(ushort code)
649
650 Constructs a QChar for the character with Unicode code point \a code.
651*/
652
653/*!
654 \fn QChar::QChar(short code)
655
656 Constructs a QChar for the character with Unicode code point \a code.
657*/
658
659/*!
660 \fn QChar::QChar(uint code)
661
662 Constructs a QChar for the character with Unicode code point \a code.
663*/
664
665/*!
666 \fn QChar::QChar(int code)
667
668 Constructs a QChar for the character with Unicode code point \a code.
669*/
670
671/*!
672 \fn static QChar QChar::fromUcs2(char16_t c)
673 \since 6.0
674
675 Constructs a QChar from UTF-16 character \a c.
676
677 \sa fromUcs4()
678*/
679
680/*!
681 \fn static auto QChar::fromUcs4(char32_t c)
682 \since 6.0
683
684 Returns an anonymous struct that
685 \list
686 \li contains a \c{char16_t chars[2]} array,
687 \li can be implicitly converted to a QStringView, and
688 \li iterated over with a C++11 ranged for loop.
689 \endlist
690
691 If \a c requires surrogates, \c{chars[0]} contains the high surrogate
692 and \c{chars[1]} the low surrogate, and the QStringView has size 2.
693 Otherwise, \c{chars[0]} contains \a c and \c{chars[1]} is
694 \l{QChar::isNull}{null}, and the QStringView has size 1.
695
696 This allows easy use of the result:
697
698 \code
699 QString s;
700 s += QChar::fromUcs4(ch);
701 \endcode
702
703 \code
704 for (char16_t c16 : QChar::fromUcs4(ch))
705 use(c16);
706 \endcode
707
708 \sa fromUcs2(), requiresSurrogates()
709*/
710
711/*!
712 \fn bool QChar::isNull() const
713
714 Returns \c true if the character is the Unicode character 0x0000
715 ('\\0'); otherwise returns \c false.
716*/
717
718/*!
719 \fn uchar QChar::cell() const
720
721 Returns the cell (least significant byte) of the Unicode character.
722
723 \sa row()
724*/
725
726/*!
727 \fn uchar QChar::row() const
728
729 Returns the row (most significant byte) of the Unicode character.
730
731 \sa cell()
732*/
733
734/*!
735 \fn bool QChar::isPrint() const
736
737 Returns \c true if the character is a printable character; otherwise
738 returns \c false. This is any character not of category Other_*.
739
740 Note that this gives no indication of whether the character is
741 available in a particular font.
742*/
743
744/*!
745 \overload
746 \since 5.0
747
748 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
749 a printable character; otherwise returns \c false.
750 This is any character not of category Other_*.
751
752 Note that this gives no indication of whether the character is
753 available in a particular font.
754
755 \note Before Qt 6, this function took a \c uint argument.
756*/
757bool QChar::isPrint(char32_t ucs4) noexcept
758{
759 if (ucs4 > LastValidCodePoint)
760 return false;
761 const int test = FLAG(Other_Control) |
762 FLAG(Other_Format) |
763 FLAG(Other_Surrogate) |
764 FLAG(Other_PrivateUse) |
765 FLAG(Other_NotAssigned);
766 return !(FLAG(qGetProp(ucs4)->category) & test);
767}
768
769/*!
770 \fn bool QChar::isSpace() const
771
772 Returns \c true if the character is a separator character
773 (Separator_* categories or certain code points from Other_Control category);
774 otherwise returns \c false.
775*/
776
777/*!
778 \fn bool QChar::isSpace(char32_t ucs4)
779 \overload
780 \since 5.0
781
782 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
783 a separator character (Separator_* categories or certain code points
784 from Other_Control category); otherwise returns \c false.
785
786 \note Before Qt 6, this function took a \c uint argument.
787*/
788
789/*!
790 \internal
791*/
792bool QT_FASTCALL QChar::isSpace_helper(char32_t ucs4) noexcept
793{
794 if (ucs4 > LastValidCodePoint)
795 return false;
796 const int test = FLAG(Separator_Space) |
797 FLAG(Separator_Line) |
798 FLAG(Separator_Paragraph);
799 return FLAG(qGetProp(ucs4)->category) & test;
800}
801
802/*!
803 \fn bool QChar::isMark() const
804
805 Returns \c true if the character is a mark (Mark_* categories);
806 otherwise returns \c false.
807
808 See QChar::Category for more information regarding marks.
809*/
810
811/*!
812 \overload
813 \since 5.0
814
815 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
816 a mark (Mark_* categories); otherwise returns \c false.
817
818 \note Before Qt 6, this function took a \c uint argument.
819*/
820bool QChar::isMark(char32_t ucs4) noexcept
821{
822 if (ucs4 > LastValidCodePoint)
823 return false;
824 const int test = FLAG(Mark_NonSpacing) |
825 FLAG(Mark_SpacingCombining) |
826 FLAG(Mark_Enclosing);
827 return FLAG(qGetProp(ucs4)->category) & test;
828}
829
830/*!
831 \fn bool QChar::isPunct() const
832
833 Returns \c true if the character is a punctuation mark (Punctuation_*
834 categories); otherwise returns \c false.
835*/
836
837/*!
838 \overload
839 \since 5.0
840
841 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
842 a punctuation mark (Punctuation_* categories); otherwise returns \c false.
843
844 \note Before Qt 6, this function took a \c uint argument.
845*/
846bool QChar::isPunct(char32_t ucs4) noexcept
847{
848 if (ucs4 > LastValidCodePoint)
849 return false;
850 const int test = FLAG(Punctuation_Connector) |
851 FLAG(Punctuation_Dash) |
852 FLAG(Punctuation_Open) |
853 FLAG(Punctuation_Close) |
854 FLAG(Punctuation_InitialQuote) |
855 FLAG(Punctuation_FinalQuote) |
856 FLAG(Punctuation_Other);
857 return FLAG(qGetProp(ucs4)->category) & test;
858}
859
860/*!
861 \fn bool QChar::isSymbol() const
862
863 Returns \c true if the character is a symbol (Symbol_* categories);
864 otherwise returns \c false.
865*/
866
867/*!
868 \overload
869 \since 5.0
870
871 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
872 a symbol (Symbol_* categories); otherwise returns \c false.
873
874 \note Before Qt 6, this function took a \c uint argument.
875*/
876bool QChar::isSymbol(char32_t ucs4) noexcept
877{
878 if (ucs4 > LastValidCodePoint)
879 return false;
880 const int test = FLAG(Symbol_Math) |
881 FLAG(Symbol_Currency) |
882 FLAG(Symbol_Modifier) |
883 FLAG(Symbol_Other);
884 return FLAG(qGetProp(ucs4)->category) & test;
885}
886
887/*!
888 \fn bool QChar::isLetter() const
889
890 Returns \c true if the character is a letter (Letter_* categories);
891 otherwise returns \c false.
892*/
893
894/*!
895 \fn bool QChar::isLetter(char32_t ucs4)
896 \overload
897 \since 5.0
898
899 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
900 a letter (Letter_* categories); otherwise returns \c false.
901
902 \note Before Qt 6, this function took a \c uint argument.
903*/
904
905/*!
906 \internal
907*/
908bool QT_FASTCALL QChar::isLetter_helper(char32_t ucs4) noexcept
909{
910 if (ucs4 > LastValidCodePoint)
911 return false;
912 const int test = FLAG(Letter_Uppercase) |
913 FLAG(Letter_Lowercase) |
914 FLAG(Letter_Titlecase) |
915 FLAG(Letter_Modifier) |
916 FLAG(Letter_Other);
917 return FLAG(qGetProp(ucs4)->category) & test;
918}
919
920/*!
921 \fn bool QChar::isNumber() const
922
923 Returns \c true if the character is a number (Number_* categories,
924 not just 0-9); otherwise returns \c false.
925
926 \sa isDigit()
927*/
928
929/*!
930 \fn bool QChar::isNumber(char32_t ucs4)
931 \overload
932 \since 5.0
933
934 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
935 a number (Number_* categories, not just 0-9); otherwise returns \c false.
936
937 \note Before Qt 6, this function took a \c uint argument.
938
939 \sa isDigit()
940*/
941
942/*!
943 \internal
944*/
945bool QT_FASTCALL QChar::isNumber_helper(char32_t ucs4) noexcept
946{
947 if (ucs4 > LastValidCodePoint)
948 return false;
949 const int test = FLAG(Number_DecimalDigit) |
950 FLAG(Number_Letter) |
951 FLAG(Number_Other);
952 return FLAG(qGetProp(ucs4)->category) & test;
953}
954
955/*!
956 \fn bool QChar::isLetterOrNumber() const
957
958 Returns \c true if the character is a letter or number (Letter_* or
959 Number_* categories); otherwise returns \c false.
960*/
961
962/*!
963 \fn bool QChar::isLetterOrNumber(char32_t ucs4)
964 \overload
965 \since 5.0
966
967 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
968 a letter or number (Letter_* or Number_* categories); otherwise returns \c false.
969
970 \note Before Qt 6, this function took a \c uint argument.
971*/
972
973/*!
974 \internal
975*/
976bool QT_FASTCALL QChar::isLetterOrNumber_helper(char32_t ucs4) noexcept
977{
978 if (ucs4 > LastValidCodePoint)
979 return false;
980 const int test = FLAG(Letter_Uppercase) |
981 FLAG(Letter_Lowercase) |
982 FLAG(Letter_Titlecase) |
983 FLAG(Letter_Modifier) |
984 FLAG(Letter_Other) |
985 FLAG(Number_DecimalDigit) |
986 FLAG(Number_Letter) |
987 FLAG(Number_Other);
988 return FLAG(qGetProp(ucs4)->category) & test;
989}
990
991/*!
992 \fn bool QChar::isDigit() const
993
994 Returns \c true if the character is a decimal digit
995 (Number_DecimalDigit); otherwise returns \c false.
996
997 \sa isNumber()
998*/
999
1000/*!
1001 \fn bool QChar::isDigit(char32_t ucs4)
1002 \overload
1003 \since 5.0
1004
1005 Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
1006 a decimal digit (Number_DecimalDigit); otherwise returns \c false.
1007
1008 \note Before Qt 6, this function took a \c uint argument.
1009
1010 \sa isNumber()
1011*/
1012
1013/*!
1014 \fn bool QChar::isNonCharacter() const
1015 \since 5.0
1016
1017 Returns \c true if the QChar is a non-character; false otherwise.
1018
    Unicode has a certain number of code points that are classified
    as "non-characters": that is, they can be used for internal purposes
    in applications but cannot be used for text interchange.
    Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
    [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1024*/
1025
1026/*!
1027 \fn bool QChar::isHighSurrogate() const
1028
    Returns \c true if the QChar is the high part of a UTF-16 surrogate
    (that is, if its code point is in range [0xd800..0xdbff]); false otherwise.
1031*/
1032
1033/*!
1034 \fn bool QChar::isLowSurrogate() const
1035
    Returns \c true if the QChar is the low part of a UTF-16 surrogate
    (that is, if its code point is in range [0xdc00..0xdfff]); false otherwise.
1038*/
1039
1040/*!
1041 \fn bool QChar::isSurrogate() const
1042 \since 5.0
1043
    Returns \c true if the QChar contains a code point that is in either
    the high or the low part of the UTF-16 surrogate range
    (that is, if its code point is in range [0xd800..0xdfff]); false otherwise.
1047*/
1048
1049/*!
1050 \fn static bool QChar::isNonCharacter(char32_t ucs4)
1051 \overload
1052 \since 5.0
1053
1054 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1055 is a non-character; false otherwise.
1056
    Unicode has a certain number of code points that are classified
    as "non-characters": that is, they can be used for internal purposes
    in applications but cannot be used for text interchange.
    Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
    [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1062
1063 \note Before Qt 6, this function took a \c uint argument.
1064*/
1065
1066/*!
1067 \fn static bool QChar::isHighSurrogate(char32_t ucs4)
1068 \overload
1069
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    is the high part of a UTF-16 surrogate
    (that is, if its code point is in range [0xd800..0xdbff]); false otherwise.
1073
1074 \note Before Qt 6, this function took a \c uint argument.
1075*/
1076
1077/*!
1078 \fn static bool QChar::isLowSurrogate(char32_t ucs4)
1079 \overload
1080
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    is the low part of a UTF-16 surrogate
    (that is, if its code point is in range [0xdc00..0xdfff]); false otherwise.
1084
1085 \note Before Qt 6, this function took a \c uint argument.
1086*/
1087
1088/*!
1089 \fn static bool QChar::isSurrogate(char32_t ucs4)
1090 \overload
1091 \since 5.0
1092
    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    contains a code point that is in either the high or the low part of the
    UTF-16 surrogate range (that is, if its code point is in range [0xd800..0xdfff]);
    false otherwise.
1097
1098 \note Before Qt 6, this function took a \c uint argument.
1099*/
1100
1101/*!
1102 \fn static bool QChar::requiresSurrogates(char32_t ucs4)
1103
1104 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1105 can be split into the high and low parts of a UTF16 surrogate
    (for example if its code point is greater than or equal to 0x10000);
1107 false otherwise.
1108
1109 \note Before Qt 6, this function took a \c uint argument.
1110*/
1111
1112/*!
1113 \fn static char32_t QChar::surrogateToUcs4(char16_t high, char16_t low)
1114
    Converts a UTF-16 surrogate pair with the given \a high and \a low values
    to its UCS-4-encoded code point.
1117
1118 \note Before Qt 6, this function took \c ushort arguments and returned \c uint.
1119*/
1120
1121/*!
1122 \fn static char32_t QChar::surrogateToUcs4(QChar high, QChar low)
1123 \overload
1124
    Converts a UTF-16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
1126
1127 \note Before Qt 6, this function returned \c uint.
1128*/
1129
1130/*!
1131 \fn static char16_t QChar::highSurrogate(char32_t ucs4)
1132
1133 Returns the high surrogate part of a UCS-4-encoded code point.
1134 The returned result is undefined if \a ucs4 is smaller than 0x10000.
1135
1136 \note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1137*/
1138
1139/*!
1140 \fn static char16_t QChar::lowSurrogate(char32_t ucs4)
1141
1142 Returns the low surrogate part of a UCS-4-encoded code point.
1143 The returned result is undefined if \a ucs4 is smaller than 0x10000.
1144
1145 \note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1146*/
1147
1148/*!
1149 \fn int QChar::digitValue() const
1150
1151 Returns the numeric value of the digit, or -1 if the character is not a digit.
1152*/
1153
1154/*!
1155 \overload
1156 Returns the numeric value of the digit specified by the UCS-4-encoded
1157 character, \a ucs4, or -1 if the character is not a digit.
1158
1159 \note Before Qt 6, this function took a \c uint argument.
1160*/
1161int QChar::digitValue(char32_t ucs4) noexcept
1162{
1163 if (ucs4 > LastValidCodePoint)
1164 return -1;
1165 return qGetProp(ucs4)->digitValue;
1166}
1167
1168/*!
1169 \fn QChar::Category QChar::category() const
1170
1171 Returns the character's category.
1172*/
1173
1174/*!
1175 \overload
1176 Returns the category of the UCS-4-encoded character specified by \a ucs4.
1177
1178 \note Before Qt 6, this function took a \c uint argument.
1179*/
1180QChar::Category QChar::category(char32_t ucs4) noexcept
1181{
1182 if (ucs4 > LastValidCodePoint)
1183 return QChar::Other_NotAssigned;
1184 return (QChar::Category) qGetProp(ucs4)->category;
1185}
1186
1187/*!
1188 \fn QChar::Direction QChar::direction() const
1189
1190 Returns the character's direction.
1191*/
1192
1193/*!
1194 \overload
1195 Returns the direction of the UCS-4-encoded character specified by \a ucs4.
1196
1197 \note Before Qt 6, this function took a \c uint argument.
1198*/
1199QChar::Direction QChar::direction(char32_t ucs4) noexcept
1200{
1201 if (ucs4 > LastValidCodePoint)
1202 return QChar::DirL;
1203 return (QChar::Direction) qGetProp(ucs4)->direction;
1204}
1205
1206/*!
1207 \fn QChar::JoiningType QChar::joiningType() const
1208 \since 5.3
1209
1210 Returns information about the joining type attributes of the character
1211 (needed for certain languages such as Arabic or Syriac).
1212*/
1213
1214/*!
1215 \overload
1216 \since 5.3
1217
1218 Returns information about the joining type attributes of the UCS-4-encoded
1219 character specified by \a ucs4
1220 (needed for certain languages such as Arabic or Syriac).
1221
1222 \note Before Qt 6, this function took a \c uint argument.
1223*/
1224QChar::JoiningType QChar::joiningType(char32_t ucs4) noexcept
1225{
1226 if (ucs4 > LastValidCodePoint)
1227 return QChar::Joining_None;
1228 return QChar::JoiningType(qGetProp(ucs4)->joining);
1229}
1230
1231/*!
1232 \fn bool QChar::hasMirrored() const
1233
1234 Returns \c true if the character should be reversed if the text
1235 direction is reversed; otherwise returns \c false.
1236
1237 A bit faster equivalent of (ch.mirroredChar() != ch).
1238
1239 \sa mirroredChar()
1240*/
1241
1242/*!
1243 \overload
1244 \since 5.0
1245
1246 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1247 should be reversed if the text direction is reversed; otherwise returns \c false.
1248
1249 A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
1250
1251 \note Before Qt 6, this function took a \c uint argument.
1252
1253 \sa mirroredChar()
1254*/
1255bool QChar::hasMirrored(char32_t ucs4) noexcept
1256{
1257 if (ucs4 > LastValidCodePoint)
1258 return false;
1259 return qGetProp(ucs4)->mirrorDiff != 0;
1260}
1261
1262/*!
1263 \fn bool QChar::isLower() const
1264
1265 Returns \c true if the character is a lowercase letter, for example
1266 category() is Letter_Lowercase.
1267
1268 \sa isUpper(), toLower(), toUpper()
1269*/
1270
1271/*!
1272 \fn static bool QChar::isLower(char32_t ucs4)
1273 \overload
1274 \since 5.0
1275
1276 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1277 is a lowercase letter, for example category() is Letter_Lowercase.
1278
1279 \note Before Qt 6, this function took a \c uint argument.
1280
1281 \sa isUpper(), toLower(), toUpper()
1282*/
1283
1284/*!
1285 \fn bool QChar::isUpper() const
1286
1287 Returns \c true if the character is an uppercase letter, for example
1288 category() is Letter_Uppercase.
1289
1290 \sa isLower(), toUpper(), toLower()
1291*/
1292
1293/*!
1294 \fn static bool QChar::isUpper(char32_t ucs4)
1295 \overload
1296 \since 5.0
1297
1298 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1299 is an uppercase letter, for example category() is Letter_Uppercase.
1300
1301 \note Before Qt 6, this function took a \c uint argument.
1302
1303 \sa isLower(), toUpper(), toLower()
1304*/
1305
1306/*!
1307 \fn bool QChar::isTitleCase() const
1308
1309 Returns \c true if the character is a titlecase letter, for example
1310 category() is Letter_Titlecase.
1311
1312 \sa isLower(), toUpper(), toLower(), toTitleCase()
1313*/
1314
1315/*!
1316 \fn static bool QChar::isTitleCase(char32_t ucs4)
1317 \overload
1318 \since 5.0
1319
1320 Returns \c true if the UCS-4-encoded character specified by \a ucs4
1321 is a titlecase letter, for example category() is Letter_Titlecase.
1322
1323 \note Before Qt 6, this function took a \c uint argument.
1324
1325 \sa isLower(), toUpper(), toLower(), toTitleCase()
1326*/
1327/*!
1328 \fn QChar QChar::mirroredChar() const
1329
1330 Returns the mirrored character if this character is a mirrored
1331 character; otherwise returns the character itself.
1332
1333 \sa hasMirrored()
1334*/
1335
1336/*!
1337 \overload
1338 Returns the mirrored character if the UCS-4-encoded character specified
1339 by \a ucs4 is a mirrored character; otherwise returns the character itself.
1340
1341 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1342
1343 \sa hasMirrored()
1344*/
1345char32_t QChar::mirroredChar(char32_t ucs4) noexcept
1346{
1347 if (ucs4 > LastValidCodePoint)
1348 return ucs4;
1349 return ucs4 + qGetProp(ucs4)->mirrorDiff;
1350}
1351
1352// Constants for Hangul (de)composition, see UAX #15:
1353static constexpr char32_t Hangul_SBase = 0xac00;
1354static constexpr char32_t Hangul_LBase = 0x1100;
1355static constexpr char32_t Hangul_VBase = 0x1161;
1356static constexpr char32_t Hangul_TBase = 0x11a7;
1357static constexpr quint32 Hangul_LCount = 19;
1358static constexpr quint32 Hangul_VCount = 21;
1359static constexpr quint32 Hangul_TCount = 28;
1360static constexpr quint32 Hangul_NCount = Hangul_VCount * Hangul_TCount;
1361static constexpr quint32 Hangul_SCount = Hangul_LCount * Hangul_NCount;
1362
1363// buffer has to have a length of 3. It's needed for Hangul decomposition
1364static const QChar * QT_FASTCALL decompositionHelper(
1365 char32_t ucs4, qsizetype *length, QChar::Decomposition *tag, QChar *buffer)
1366{
1367 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1368 // compute Hangul syllable decomposition as per UAX #15
1369 const char32_t SIndex = ucs4 - Hangul_SBase;
1370 buffer[0] = QChar(Hangul_LBase + SIndex / Hangul_NCount); // L
1371 buffer[1] = QChar(Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount); // V
1372 buffer[2] = QChar(Hangul_TBase + SIndex % Hangul_TCount); // T
1373 *length = buffer[2].unicode() == Hangul_TBase ? 2 : 3;
1374 *tag = QChar::Canonical;
1375 return buffer;
1376 }
1377
1378 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1379 if (index == 0xffff) {
1380 *length = 0;
1381 *tag = QChar::NoDecomposition;
1382 return nullptr;
1383 }
1384
1385 const unsigned short *decomposition = uc_decomposition_map+index;
1386 *tag = QChar::Decomposition((*decomposition) & 0xff);
1387 *length = (*decomposition) >> 8;
1388 return reinterpret_cast<const QChar *>(decomposition + 1);
1389}
1390
1391/*!
    Decomposes a character into its constituent parts. Returns an empty string
1393 if no decomposition exists.
1394*/
1395QString QChar::decomposition() const
1396{
1397 return QChar::decomposition(ucs4: ucs);
1398}
1399
1400/*!
1401 \overload
    Decomposes the UCS-4-encoded character specified by \a ucs4 into its
1403 constituent parts. Returns an empty string if no decomposition exists.
1404
1405 \note Before Qt 6, this function took a \c uint argument.
1406*/
1407QString QChar::decomposition(char32_t ucs4)
1408{
1409 QChar buffer[3];
1410 qsizetype length;
1411 QChar::Decomposition tag;
1412 const QChar *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1413 return QString(d, length);
1414}
1415
1416/*!
1417 \fn QChar::Decomposition QChar::decompositionTag() const
1418
    Returns the tag defining the decomposition of the character. Returns
1420 QChar::NoDecomposition if no decomposition exists.
1421*/
1422
1423/*!
1424 \overload
    Returns the tag defining the decomposition of the UCS-4-encoded character
1426 specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1427
1428 \note Before Qt 6, this function took a \c uint argument.
1429*/
1430QChar::Decomposition QChar::decompositionTag(char32_t ucs4) noexcept
1431{
1432 if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1433 return QChar::Canonical;
1434 const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1435 if (index == 0xffff)
1436 return QChar::NoDecomposition;
1437 return (QChar::Decomposition)(uc_decomposition_map[index] & 0xff);
1438}
1439
1440/*!
1441 \fn unsigned char QChar::combiningClass() const
1442
1443 Returns the combining class for the character as defined in the
1444 Unicode standard. This is mainly useful as a positioning hint for
1445 marks attached to a base character.
1446
1447 The Qt text rendering engine uses this information to correctly
1448 position non-spacing marks around a base character.
1449*/
1450
1451/*!
1452 \overload
1453 Returns the combining class for the UCS-4-encoded character specified by
1454 \a ucs4, as defined in the Unicode standard.
1455
1456 \note Before Qt 6, this function took a \c uint argument.
1457*/
1458unsigned char QChar::combiningClass(char32_t ucs4) noexcept
1459{
1460 if (ucs4 > LastValidCodePoint)
1461 return 0;
1462 return (unsigned char) qGetProp(ucs4)->combiningClass;
1463}
1464
1465/*!
1466 \fn QChar::Script QChar::script() const
1467 \since 5.1
1468
1469 Returns the Unicode script property value for this character.
1470*/
1471
1472/*!
1473 \overload
1474 \since 5.1
1475
1476 Returns the Unicode script property value for the character specified in
1477 its UCS-4-encoded form as \a ucs4.
1478
1479 \note Before Qt 6, this function took a \c uint argument.
1480*/
1481QChar::Script QChar::script(char32_t ucs4) noexcept
1482{
1483 if (ucs4 > LastValidCodePoint)
1484 return QChar::Script_Unknown;
1485 return (QChar::Script) qGetProp(ucs4)->script;
1486}
1487
1488/*!
1489 \fn QChar::UnicodeVersion QChar::unicodeVersion() const
1490
1491 Returns the Unicode version that introduced this character.
1492*/
1493
1494/*!
1495 \overload
1496 Returns the Unicode version that introduced the character specified in
1497 its UCS-4-encoded form as \a ucs4.
1498
1499 \note Before Qt 6, this function took a \c uint argument.
1500*/
1501QChar::UnicodeVersion QChar::unicodeVersion(char32_t ucs4) noexcept
1502{
1503 if (ucs4 > LastValidCodePoint)
1504 return QChar::Unicode_Unassigned;
1505 return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1506}
1507
1508/*!
1509 Returns the most recent supported Unicode version.
1510*/
1511QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
1512{
1513 return UNICODE_DATA_VERSION;
1514}
1515
1516static auto fullConvertCase(char32_t uc, QUnicodeTables::Case which) noexcept
1517{
1518 struct R {
1519 char16_t chars[MaxSpecialCaseLength + 1];
1520 qint8 sz;
1521
1522 // iterable
1523 auto begin() const { return chars; }
1524 auto end() const { return chars + sz; }
1525 // QStringView-compatible
1526 auto data() const { return chars; }
1527 auto size() const { return sz; }
1528 } result;
1529 Q_ASSERT(uc <= QChar::LastValidCodePoint);
1530
1531 auto pp = result.chars;
1532
1533 const auto fold = qGetProp(ucs4: uc)->cases[which];
1534 const auto caseDiff = fold.diff;
1535
1536 if (Q_UNLIKELY(fold.special)) {
1537 const auto *specialCase = specialCaseMap + caseDiff;
1538 auto length = *specialCase++;
1539 while (length--)
1540 *pp++ = *specialCase++;
1541 } else {
1542 // so far, case conversion never changes planes (guaranteed by the qunicodetables generator)
1543 for (char16_t c : QChar::fromUcs4(c: uc + caseDiff))
1544 *pp++ = c;
1545 }
1546 result.sz = pp - result.chars;
1547 return result;
1548}
1549
1550template <typename T>
1551Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
1552{
1553 const auto fold = qGetProp(uc)->cases[which];
1554
1555 if (Q_UNLIKELY(fold.special)) {
1556 const ushort *specialCase = specialCaseMap + fold.diff;
1557 // so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator)
1558 return *specialCase == 1 ? specialCase[1] : uc;
1559 }
1560
1561 return uc + fold.diff;
1562}
1563
1564/*!
1565 \fn QChar QChar::toLower() const
1566
1567 Returns the lowercase equivalent if the character is uppercase or titlecase;
1568 otherwise returns the character itself.
1569*/
1570
1571/*!
1572 \overload
1573 Returns the lowercase equivalent of the UCS-4-encoded character specified
1574 by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1575 the character itself.
1576
1577 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1578*/
1579char32_t QChar::toLower(char32_t ucs4) noexcept
1580{
1581 if (ucs4 > LastValidCodePoint)
1582 return ucs4;
1583 return convertCase_helper(uc: ucs4, which: QUnicodeTables::LowerCase);
1584}
1585
1586/*!
1587 \fn QChar QChar::toUpper() const
1588
1589 Returns the uppercase equivalent if the character is lowercase or titlecase;
1590 otherwise returns the character itself.
1591
1592 \note This function also returns the original character in the rare case of
1593 the uppercase form of the character requiring two or more characters.
1594
1595 \sa QString::toUpper()
1596*/
1597
1598/*!
1599 \overload
1600 Returns the uppercase equivalent of the UCS-4-encoded character specified
1601 by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1602 the character itself.
1603
1604 \note This function also returns the original character in the rare case of
1605 the uppercase form of the character requiring two or more characters.
1606
1607 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1608
1609 \sa QString::toUpper()
1610*/
1611char32_t QChar::toUpper(char32_t ucs4) noexcept
1612{
1613 if (ucs4 > LastValidCodePoint)
1614 return ucs4;
1615 return convertCase_helper(uc: ucs4, which: QUnicodeTables::UpperCase);
1616}
1617
1618/*!
1619 \fn QChar QChar::toTitleCase() const
1620
1621 Returns the title case equivalent if the character is lowercase or uppercase;
1622 otherwise returns the character itself.
1623*/
1624
1625/*!
1626 \overload
1627 Returns the title case equivalent of the UCS-4-encoded character specified
1628 by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1629 the character itself.
1630
1631 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1632*/
1633char32_t QChar::toTitleCase(char32_t ucs4) noexcept
1634{
1635 if (ucs4 > LastValidCodePoint)
1636 return ucs4;
1637 return convertCase_helper(uc: ucs4, which: QUnicodeTables::TitleCase);
1638}
1639
1640static inline char32_t foldCase(const char16_t *ch, const char16_t *start)
1641{
1642 char32_t ucs4 = *ch;
1643 if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(ucs4: *(ch - 1)))
1644 ucs4 = QChar::surrogateToUcs4(high: *(ch - 1), low: ucs4);
1645 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1646}
1647
1648static inline char32_t foldCase(char32_t ch, char32_t &last) noexcept
1649{
1650 char32_t ucs4 = ch;
1651 if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(ucs4: last))
1652 ucs4 = QChar::surrogateToUcs4(high: last, low: ucs4);
1653 last = ch;
1654 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1655}
1656
1657static inline char16_t foldCase(char16_t ch) noexcept
1658{
1659 return convertCase_helper(uc: ch, which: QUnicodeTables::CaseFold);
1660}
1661
1662static inline QChar foldCase(QChar ch) noexcept
1663{
1664 return QChar(foldCase(ch: ch.unicode()));
1665}
1666
1667/*!
1668 \fn QChar QChar::toCaseFolded() const
1669
1670 Returns the case folded equivalent of the character.
1671 For most Unicode characters this is the same as toLower().
1672*/
1673
1674/*!
1675 \overload
1676 Returns the case folded equivalent of the UCS-4-encoded character specified
1677 by \a ucs4. For most Unicode characters this is the same as toLower().
1678
1679 \note Before Qt 6, this function took a \c uint argument and returned \c uint.
1680*/
1681char32_t QChar::toCaseFolded(char32_t ucs4) noexcept
1682{
1683 if (ucs4 > LastValidCodePoint)
1684 return ucs4;
1685 return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1686}
1687
1688/*!
1689 \fn char QChar::toLatin1() const
1690
1691 Returns the Latin-1 character equivalent to the QChar, or 0. This
1692 is mainly useful for non-internationalized software.
1693
1694 \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1695 (NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1696
1697 \sa unicode()
1698*/
1699
1700/*!
    \fn QChar QChar::fromLatin1(char c)
1702
1703 Converts the Latin-1 character \a c to its equivalent QChar. This
1704 is mainly useful for non-internationalized software.
1705
1706 An alternative is to use QLatin1Char.
1707
1708 \sa toLatin1(), unicode()
1709*/
1710
1711#ifndef QT_NO_DATASTREAM
1712/*!
1713 \relates QChar
1714
1715 Writes the char \a chr to the stream \a out.
1716
1717 \sa {Serializing Qt Data Types}
1718*/
1719QDataStream &operator<<(QDataStream &out, QChar chr)
1720{
1721 out << quint16(chr.unicode());
1722 return out;
1723}
1724
1725/*!
1726 \relates QChar
1727
1728 Reads a char from the stream \a in into char \a chr.
1729
1730 \sa {Serializing Qt Data Types}
1731*/
1732QDataStream &operator>>(QDataStream &in, QChar &chr)
1733{
1734 quint16 u;
1735 in >> u;
1736 chr.unicode() = char16_t(u);
1737 return in;
1738}
1739#endif // QT_NO_DATASTREAM
1740
1741/*!
1742 \fn QChar::unicode()
1743
1744 Returns a reference to the numeric Unicode value of the QChar.
1745*/
1746
1747/*!
1748 \fn QChar::unicode() const
1749
1750 Returns the numeric Unicode value of the QChar.
1751*/
1752
1753/*****************************************************************************
1754 Documentation of QChar related functions
1755 *****************************************************************************/
1756
1757/*!
1758 \fn bool QChar::operator==(const QChar &c1, const QChar &c2)
1759
1760 Returns \c true if \a c1 and \a c2 are the same Unicode character;
1761 otherwise returns \c false.
1762*/
1763
1764/*!
1765 \fn bool QChar::operator!=(const QChar &c1, const QChar &c2)
1766
1767 Returns \c true if \a c1 and \a c2 are not the same Unicode
1768 character; otherwise returns \c false.
1769*/
1770
1771/*!
1772 \fn bool QChar::operator<=(const QChar &c1, const QChar &c2)
1773
1774 Returns \c true if the numeric Unicode value of \a c1 is less than
1775 or equal to that of \a c2; otherwise returns \c false.
1776*/
1777
1778/*!
1779 \fn bool QChar::operator>=(const QChar &c1, const QChar &c2)
1780
1781 Returns \c true if the numeric Unicode value of \a c1 is greater than
1782 or equal to that of \a c2; otherwise returns \c false.
1783*/
1784
1785/*!
1786 \fn bool QChar::operator<(const QChar &c1, const QChar &c2)
1787
1788 Returns \c true if the numeric Unicode value of \a c1 is less than
1789 that of \a c2; otherwise returns \c false.
1790*/
1791
1792/*!
1793 \fn bool QChar::operator>(const QChar &c1, const QChar &c2)
1794
1795 Returns \c true if the numeric Unicode value of \a c1 is greater than
1796 that of \a c2; otherwise returns \c false.
1797*/
1798
1799/*!
1800 \fn Qt::Literals::StringLiterals::operator""_L1(char ch)
1801
1802 \relates QLatin1Char
1803 \since 6.4
1804
1805 Literal operator that creates a QLatin1Char out of \a ch.
1806
1807 The following code creates a QLatin1Char:
1808 \code
1809 using namespace Qt::Literals::StringLiterals;
1810
1811 auto ch = 'a'_L1;
1812 \endcode
1813
1814 \sa Qt::Literals::StringLiterals
1815*/
1816
1817// ---------------------------------------------------------------------------
1818
1819
1820static void decomposeHelper(QString *str, bool canonical, QChar::UnicodeVersion version, qsizetype from)
1821{
1822 qsizetype length;
1823 QChar::Decomposition tag;
1824 QChar buffer[3];
1825
1826 QString &s = *str;
1827
1828 const unsigned short *utf16 = reinterpret_cast<unsigned short *>(s.data());
1829 const unsigned short *uc = utf16 + s.size();
1830 while (uc != utf16 + from) {
1831 char32_t ucs4 = *(--uc);
1832 if (QChar(ucs4).isLowSurrogate() && uc != utf16) {
1833 ushort high = *(uc - 1);
1834 if (QChar(high).isHighSurrogate()) {
1835 --uc;
1836 ucs4 = QChar::surrogateToUcs4(high, low: ucs4);
1837 }
1838 }
1839
1840 if (QChar::unicodeVersion(ucs4) > version)
1841 continue;
1842
1843 const QChar *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1844 if (!d || (canonical && tag != QChar::Canonical))
1845 continue;
1846
1847 qsizetype pos = uc - utf16;
1848 s.replace(i: pos, len: QChar::requiresSurrogates(ucs4) ? 2 : 1, s: d, slen: length);
1849 // since the replace invalidates the pointers and we do decomposition recursive
1850 utf16 = reinterpret_cast<unsigned short *>(s.data());
1851 uc = utf16 + pos + length;
1852 }
1853}
1854
1855
1856struct UCS2Pair {
1857 ushort u1;
1858 ushort u2;
1859};
1860
1861inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2)
1862{ return ligature1.u1 < ligature2.u1; }
1863inline bool operator<(ushort u1, const UCS2Pair &ligature)
1864{ return u1 < ligature.u1; }
1865inline bool operator<(const UCS2Pair &ligature, ushort u1)
1866{ return ligature.u1 < u1; }
1867
1868struct UCS2SurrogatePair {
1869 UCS2Pair p1;
1870 UCS2Pair p2;
1871};
1872
1873inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2)
1874{ return QChar::surrogateToUcs4(high: ligature1.p1.u1, low: ligature1.p1.u2) < QChar::surrogateToUcs4(high: ligature2.p1.u1, low: ligature2.p1.u2); }
1875inline bool operator<(char32_t u1, const UCS2SurrogatePair &ligature)
1876{ return u1 < QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2); }
1877inline bool operator<(const UCS2SurrogatePair &ligature, char32_t u1)
1878{ return QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2) < u1; }
1879
1880static char32_t inline ligatureHelper(char32_t u1, char32_t u2)
1881{
1882 if (u1 >= Hangul_LBase && u1 < Hangul_SBase + Hangul_SCount) {
1883 // compute Hangul syllable composition as per UAX #15
1884 // hangul L-V pair
1885 const char32_t LIndex = u1 - Hangul_LBase;
1886 if (LIndex < Hangul_LCount) {
1887 const char32_t VIndex = u2 - Hangul_VBase;
1888 if (VIndex < Hangul_VCount)
1889 return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1890 }
1891 // hangul LV-T pair
1892 const char32_t SIndex = u1 - Hangul_SBase;
1893 if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == 0) {
1894 const char32_t TIndex = u2 - Hangul_TBase;
1895 if (TIndex < Hangul_TCount && TIndex)
1896 return u1 + TIndex;
1897 }
1898 }
1899
1900 const unsigned short index = GET_LIGATURE_INDEX(u2);
1901 if (index == 0xffff)
1902 return 0;
1903 const unsigned short *ligatures = uc_ligature_map+index;
1904 ushort length = *ligatures++;
1905 if (QChar::requiresSurrogates(ucs4: u1)) {
1906 const UCS2SurrogatePair *data = reinterpret_cast<const UCS2SurrogatePair *>(ligatures);
1907 const UCS2SurrogatePair *r = std::lower_bound(first: data, last: data + length, val: u1);
1908 if (r != data + length && QChar::surrogateToUcs4(high: r->p1.u1, low: r->p1.u2) == u1)
1909 return QChar::surrogateToUcs4(high: r->p2.u1, low: r->p2.u2);
1910 } else {
1911 const UCS2Pair *data = reinterpret_cast<const UCS2Pair *>(ligatures);
1912 const UCS2Pair *r = std::lower_bound(first: data, last: data + length, val: ushort(u1));
1913 if (r != data + length && r->u1 == ushort(u1))
1914 return r->u2;
1915 }
1916
1917 return 0;
1918}
1919
1920static void composeHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1921{
1922 QString &s = *str;
1923
1924 if (from < 0 || s.size() - from < 2)
1925 return;
1926
1927 char32_t stcode = 0; // starter code point
1928 qsizetype starter = -1; // starter position
1929 qsizetype next = -1; // to prevent i == next
1930 int lastCombining = 255; // to prevent combining > lastCombining
1931
1932 qsizetype pos = from;
1933 while (pos < s.size()) {
1934 qsizetype i = pos;
1935 char32_t uc = s.at(i: pos).unicode();
1936 if (QChar(uc).isHighSurrogate() && pos < s.size()-1) {
1937 ushort low = s.at(i: pos+1).unicode();
1938 if (QChar(low).isLowSurrogate()) {
1939 uc = QChar::surrogateToUcs4(high: uc, low);
1940 ++pos;
1941 }
1942 }
1943
1944 const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
1945 if (p->unicodeVersion > version) {
1946 starter = -1;
1947 next = -1; // to prevent i == next
1948 lastCombining = 255; // to prevent combining > lastCombining
1949 ++pos;
1950 continue;
1951 }
1952
1953 int combining = p->combiningClass;
1954 if ((i == next || combining > lastCombining) && starter >= from) {
1955 // allowed to form ligature with S
1956 char32_t ligature = ligatureHelper(u1: stcode, u2: uc);
1957 if (ligature) {
1958 stcode = ligature;
1959 QChar *d = s.data();
1960 // ligatureHelper() never changes planes
1961 qsizetype j = 0;
1962 for (QChar ch : QChar::fromUcs4(c: ligature))
1963 d[starter + j++] = ch;
1964 s.remove(i, len: j);
1965 continue;
1966 }
1967 }
1968 if (combining == 0) {
1969 starter = i;
1970 stcode = uc;
1971 next = pos + 1;
1972 }
1973 lastCombining = combining;
1974
1975 ++pos;
1976 }
1977}
1978
1979
1980static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1981{
1982 QString &s = *str;
1983 const qsizetype l = s.size()-1;
1984
1985 char32_t u1, u2;
1986 char16_t c1, c2;
1987
1988 qsizetype pos = from;
1989 while (pos < l) {
1990 qsizetype p2 = pos+1;
1991 u1 = s.at(i: pos).unicode();
1992 if (QChar::isHighSurrogate(ucs4: u1)) {
1993 const char16_t low = s.at(i: p2).unicode();
1994 if (QChar::isLowSurrogate(ucs4: low)) {
1995 u1 = QChar::surrogateToUcs4(high: u1, low);
1996 if (p2 >= l)
1997 break;
1998 ++p2;
1999 }
2000 }
2001 c1 = 0;
2002
2003 advance:
2004 u2 = s.at(i: p2).unicode();
2005 if (QChar::isHighSurrogate(ucs4: u2) && p2 < l) {
2006 const char16_t low = s.at(i: p2+1).unicode();
2007 if (QChar::isLowSurrogate(ucs4: low)) {
2008 u2 = QChar::surrogateToUcs4(high: u2, low);
2009 ++p2;
2010 }
2011 }
2012
2013 c2 = 0;
2014 {
2015 const QUnicodeTables::Properties *p = qGetProp(ucs4: u2);
2016 if (p->unicodeVersion <= version)
2017 c2 = p->combiningClass;
2018 }
2019 if (c2 == 0) {
2020 pos = p2+1;
2021 continue;
2022 }
2023
2024 if (c1 == 0) {
2025 const QUnicodeTables::Properties *p = qGetProp(ucs4: u1);
2026 if (p->unicodeVersion <= version)
2027 c1 = p->combiningClass;
2028 }
2029
2030 if (c1 > c2) {
2031 QChar *uc = s.data();
2032 qsizetype p = pos;
2033 // exchange characters
2034 for (QChar ch : QChar::fromUcs4(c: u2))
2035 uc[p++] = ch;
2036 for (QChar ch : QChar::fromUcs4(c: u1))
2037 uc[p++] = ch;
2038 if (pos > 0)
2039 --pos;
2040 if (pos > 0 && s.at(i: pos).isLowSurrogate())
2041 --pos;
2042 } else {
2043 ++pos;
2044 if (QChar::requiresSurrogates(ucs4: u1))
2045 ++pos;
2046
2047 u1 = u2;
2048 c1 = c2; // != 0
2049 p2 = pos + 1;
2050 if (QChar::requiresSurrogates(ucs4: u1))
2051 ++p2;
2052 if (p2 > l)
2053 break;
2054
2055 goto advance;
2056 }
2057 }
2058}
2059
2060// returns true if the text is in a desired Normalization Form already; false otherwise.
2061// sets lastStable to the position of the last stable code point
2062static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, qsizetype from, qsizetype *lastStable)
2063{
2064 static_assert(QString::NormalizationForm_D == 0);
2065 static_assert(QString::NormalizationForm_C == 1);
2066 static_assert(QString::NormalizationForm_KD == 2);
2067 static_assert(QString::NormalizationForm_KC == 3);
2068
2069 enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
2070
2071 const auto *string = reinterpret_cast<const char16_t *>(str->constData());
2072 qsizetype length = str->size();
2073
2074 // this avoids one out of bounds check in the loop
2075 while (length > from && QChar::isHighSurrogate(ucs4: string[length - 1]))
2076 --length;
2077
2078 uchar lastCombining = 0;
2079 for (qsizetype i = from; i < length; ++i) {
2080 qsizetype pos = i;
2081 char32_t uc = string[i];
2082 if (uc < 0x80) {
2083 // ASCII characters are stable code points
2084 lastCombining = 0;
2085 *lastStable = pos;
2086 continue;
2087 }
2088
2089 if (QChar::isHighSurrogate(ucs4: uc)) {
2090 ushort low = string[i + 1];
2091 if (!QChar::isLowSurrogate(ucs4: low)) {
2092 // treat surrogate like stable code point
2093 lastCombining = 0;
2094 *lastStable = pos;
2095 continue;
2096 }
2097 ++i;
2098 uc = QChar::surrogateToUcs4(high: uc, low);
2099 }
2100
2101 const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
2102
2103 if (p->combiningClass < lastCombining && p->combiningClass > 0)
2104 return false;
2105
2106 const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03;
2107 if (check != NFQC_YES)
2108 return false; // ### can we quick check NFQC_MAYBE ?
2109
2110 lastCombining = p->combiningClass;
2111 if (lastCombining == 0)
2112 *lastStable = pos;
2113 }
2114
2115 if (length != str->size()) // low surrogate parts at the end of text
2116 *lastStable = str->size() - 1;
2117
2118 return true;
2119}
2120
2121QT_END_NAMESPACE
2122

Provided by KDAB

Privacy Policy
Start learning QML with our Intro Training
Find out more

source code of qtbase/src/corelib/text/qchar.cpp