qchar.cpp source code [qtbase/src/corelib/text/qchar.cpp]

1	// Copyright (C) 2022 The Qt Company Ltd.
2	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4	#include "qchar.h"
5
6	#include "qdatastream.h"
7
8	#include "qunicodetables_p.h"
9	#include "qunicodetables.cpp"
10
11	#include <algorithm>
12
13	QT_BEGIN_NAMESPACE
14
// Turns an enumerator value x (for example a QChar::Category) into a
// single-bit mask, so that several values can be OR-ed into one mask and
// membership can be tested with a single bitwise AND.
#define FLAG(x) (1 << (x))
16
17	/*!
18	\class QLatin1Char
19	\inmodule QtCore
20	\reentrant
21	\brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
22
23	\ingroup string-processing
24
25	This class is only useful for constructing a QChar from an 8-bit character.
26
27	\sa QChar, QLatin1StringView, QString
28	*/
29
30	/*!
31	\fn const char QLatin1Char::toLatin1() const
32
33	Converts a Latin-1 character to an 8-bit ASCII representation of the character.
34	*/
35
36	/*!
37	\fn QLatin1Char::unicode() const
38
39	Converts a Latin-1 character to a 16-bit-encoded Unicode representation
40	of the character.
41	*/
42
43	/*!
44	\fn QLatin1Char::QLatin1Char(char c)
45
46	Constructs a Latin-1 character for \a c. This constructor should be
47	used when the encoding of the input character is known to be Latin-1.
48	*/
49
50	/*!
51	\class QChar
52	\inmodule QtCore
53	\brief The QChar class provides a 16-bit Unicode character.
54
55	\ingroup string-processing
56	\reentrant
57
58	\compares strong
59	\compareswith strong char16_t QString QStringView QLatin1StringView QUtf8StringView
60	\endcompareswith
61	\compareswith strong {const char *} QByteArray QByteArrayView
62	The contents of the byte array are interpreted as UTF-8.
63	\endcompareswith
64
65	In Qt, Unicode characters are 16-bit entities without any markup
66	or structure. This class represents such an entity. It is
67	lightweight, so it can be used everywhere. Most compilers treat
68	it like an \c{unsigned short}.
69
70	QChar provides a full complement of testing/classification
71	functions, converting to and from other formats, converting from
72	composed to decomposed Unicode, and trying to compare and
73	case-convert if you ask it to.
74
75	The classification functions include functions like those in the
76	standard C++ header \<cctype\> (formerly \<ctype.h\>), but
77	operating on the full range of Unicode characters, not just for the ASCII
78	range. They all return true if the character is a certain type of character;
79	otherwise they return false. These classification functions are
80	isNull() (returns \c true if the character is '\\0'), isPrint()
81	(true if the character is any sort of printable character,
82	including whitespace), isPunct() (any sort of punctuation),
83	isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
84	sort of numeric character, not just 0-9), isLetterOrNumber(), and
85	isDigit() (decimal digits). All of these are wrappers around
86	category(), which returns the Unicode-defined category of each
87	character. Some of these also calculate the derived properties
88	(for example isSpace() returns \c true if the character is of category
89	Separator_* or an exceptional code point from Other_Control category).
90
91	QChar also provides direction(), which indicates the "natural"
92	writing direction of this character. The joiningType() function
93	indicates how the character joins with its neighbors (needed
94	mostly for Arabic or Syriac) and finally hasMirrored(), which indicates
95	whether the character needs to be mirrored when it is printed in
96	its "unnatural" writing direction.
97
98	Composed Unicode characters (like \a ring) can be converted to
99	decomposed Unicode ("a" followed by "ring above") by using decomposition().
100
101	In Unicode, comparison is not necessarily possible and case
102	conversion is very difficult at best. Unicode, covering the
103	"entire" world, also includes most of the world's case and
104	sorting problems. operator==() and friends will do comparison
105	based purely on the numeric Unicode value (code point) of the
106	characters, and toUpper() and toLower() will do case changes when
107	the character has a well-defined uppercase/lowercase equivalent.
108	For locale-dependent comparisons, use QString::localeAwareCompare().
109
110	The conversion functions include unicode() (to a scalar),
111	toLatin1() (to scalar, but converts all non-Latin-1 characters to
112	0), row() (gives the Unicode row), cell() (gives the Unicode
113	cell), digitValue() (gives the integer value of any of the
114	numerous digit characters), and a host of constructors.
115
116	QChar provides constructors and cast operators that make it easy
117	to convert to and from traditional 8-bit \c{char}s. If you
118	defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
119	explained in the QString documentation, you will need to
120	explicitly call fromLatin1(), or use QLatin1Char,
121	to construct a QChar from an 8-bit \c char, and you will need to
122	call toLatin1() to get the 8-bit value back.
123
124	Starting with Qt 6.0, most QChar constructors are \c explicit. This
125	is done to avoid dangerous mistakes when accidentally mixing
126	integral types and strings.
127
128	For more information see
129	\l{https://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
130
131	\sa Unicode, QString, QLatin1Char
132	*/
133
134	/*!
135	\enum QChar::UnicodeVersion
136
137	Specifies which version of the \l{Unicode standard} introduced a certain
138	character.
139
140	\value Unicode_1_1 Version 1.1
141	\value Unicode_2_0 Version 2.0
142	\value Unicode_2_1_2 Version 2.1.2
143	\value Unicode_3_0 Version 3.0
144	\value Unicode_3_1 Version 3.1
145	\value Unicode_3_2 Version 3.2
146	\value Unicode_4_0 Version 4.0
147	\value Unicode_4_1 Version 4.1
148	\value Unicode_5_0 Version 5.0
149	\value Unicode_5_1 Version 5.1
150	\value Unicode_5_2 Version 5.2
151	\value Unicode_6_0 Version 6.0
152	\value Unicode_6_1 Version 6.1
153	\value Unicode_6_2 Version 6.2
154	\value [since 5.3] Unicode_6_3 Version 6.3
155	\value [since 5.5] Unicode_7_0 Version 7.0
156	\value [since 5.6] Unicode_8_0 Version 8.0
157	\value [since 5.11] Unicode_9_0 Version 9.0
158	\value [since 5.11] Unicode_10_0 Version 10.0
159	\value [since 5.15] Unicode_11_0 Version 11.0
160	\value [since 5.15] Unicode_12_0 Version 12.0
161	\value [since 5.15] Unicode_12_1 Version 12.1
162	\value [since 5.15] Unicode_13_0 Version 13.0
163	\value [since 6.3] Unicode_14_0 Version 14.0
164	\value [since 6.5] Unicode_15_0 Version 15.0
165	\value [since 6.8] Unicode_15_1 Version 15.1
166	\value Unicode_Unassigned The value is not assigned to any character
167	in version 8.0 of Unicode.
168
169	\sa unicodeVersion(), currentUnicodeVersion()
170	*/
171
172	/*!
173	\enum QChar::Category
174
175	This enum maps the Unicode character categories.
176
177	The following categories are normative in Unicode:
178
179	\value Mark_NonSpacing Unicode class name Mn
180
181	\value Mark_SpacingCombining Unicode class name Mc
182
183	\value Mark_Enclosing Unicode class name Me
184
185	\value Number_DecimalDigit Unicode class name Nd
186
187	\value Number_Letter Unicode class name Nl
188
189	\value Number_Other Unicode class name No
190
191	\value Separator_Space Unicode class name Zs
192
193	\value Separator_Line Unicode class name Zl
194
195	\value Separator_Paragraph Unicode class name Zp
196
197	\value Other_Control Unicode class name Cc
198
199	\value Other_Format Unicode class name Cf
200
201	\value Other_Surrogate Unicode class name Cs
202
203	\value Other_PrivateUse Unicode class name Co
204
205	\value Other_NotAssigned Unicode class name Cn
206
207
208	The following categories are informative in Unicode:
209
210	\value Letter_Uppercase Unicode class name Lu
211
212	\value Letter_Lowercase Unicode class name Ll
213
214	\value Letter_Titlecase Unicode class name Lt
215
216	\value Letter_Modifier Unicode class name Lm
217
218	\value Letter_Other Unicode class name Lo
219
220	\value Punctuation_Connector Unicode class name Pc
221
222	\value Punctuation_Dash Unicode class name Pd
223
224	\value Punctuation_Open Unicode class name Ps
225
226	\value Punctuation_Close Unicode class name Pe
227
228	\value Punctuation_InitialQuote Unicode class name Pi
229
230	\value Punctuation_FinalQuote Unicode class name Pf
231
232	\value Punctuation_Other Unicode class name Po
233
234	\value Symbol_Math Unicode class name Sm
235
236	\value Symbol_Currency Unicode class name Sc
237
238	\value Symbol_Modifier Unicode class name Sk
239
240	\value Symbol_Other Unicode class name So
241
242	\sa category()
243	*/
244
245	/*!
246	\enum QChar::Script
247	\since 5.1
248
249	This enum type defines the Unicode script property values.
250
251	For details about the Unicode script property values see
252	\l{https://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}.
253
254	In order to conform to C/C++ naming conventions "Script_" is prepended
255	to the codes used in the Unicode Standard.
256
257	\value Script_Unknown For unassigned, private-use, noncharacter, and surrogate code points.
258	\value Script_Inherited For characters that may be used with multiple scripts
259	and that inherit their script from the preceding characters.
260	These include nonspacing marks, enclosing marks,
261	and zero width joiner/non-joiner characters.
262	\value Script_Common For characters that may be used with multiple scripts
263	and that do not inherit their script from the preceding characters.
264
265	\value [since 5.11] Script_Adlam
266	\value [since 5.6] Script_Ahom
267	\value [since 5.6] Script_AnatolianHieroglyphs
268	\value Script_Arabic
269	\value Script_Armenian
270	\value Script_Avestan
271	\value Script_Balinese
272	\value Script_Bamum
273	\value [since 5.5] Script_BassaVah
274	\value Script_Batak
275	\value Script_Bengali
276	\value [since 5.11] Script_Bhaiksuki
277	\value Script_Bopomofo
278	\value Script_Brahmi
279	\value Script_Braille
280	\value Script_Buginese
281	\value Script_Buhid
282	\value Script_CanadianAboriginal
283	\value Script_Carian
284	\value [since 5.5] Script_CaucasianAlbanian
285	\value Script_Chakma
286	\value Script_Cham
287	\value Script_Cherokee
288	\value [since 5.15] Script_Chorasmian
289	\value Script_Coptic
290	\value Script_Cuneiform
291	\value Script_Cypriot
292	\value [since 6.3] Script_CyproMinoan
293	\value Script_Cyrillic
294	\value Script_Deseret
295	\value Script_Devanagari
296	\value [since 5.15] Script_DivesAkuru
297	\value [since 5.15] Script_Dogra
298	\value [since 5.5] Script_Duployan
299	\value Script_EgyptianHieroglyphs
300	\value [since 5.5] Script_Elbasan
301	\value [since 5.15] Script_Elymaic
302	\value Script_Ethiopic
303	\value Script_Georgian
304	\value Script_Glagolitic
305	\value Script_Gothic
306	\value [since 5.5] Script_Grantha
307	\value Script_Greek
308	\value Script_Gujarati
309	\value [since 5.15] Script_GunjalaGondi
310	\value Script_Gurmukhi
311	\value Script_Han
312	\value Script_Hangul
313	\value [since 5.15] Script_HanifiRohingya
314	\value Script_Hanunoo
315	\value [since 5.6] Script_Hatran
316	\value Script_Hebrew
317	\value Script_Hiragana
318	\value Script_ImperialAramaic
319	\value Script_InscriptionalPahlavi
320	\value Script_InscriptionalParthian
321	\value Script_Javanese
322	\value Script_Kaithi
323	\value Script_Kannada
324	\value Script_Katakana
325	\value [since 6.5] Script_Kawi
326	\value Script_KayahLi
327	\value Script_Kharoshthi
328	\value [since 5.15] Script_KhitanSmallScript
329	\value Script_Khmer
330	\value [since 5.5] Script_Khojki
331	\value [since 5.5] Script_Khudawadi
332	\value Script_Lao
333	\value Script_Latin
334	\value Script_Lepcha
335	\value Script_Limbu
336	\value [since 5.5] Script_LinearA
337	\value Script_LinearB
338	\value Script_Lisu
339	\value Script_Lycian
340	\value Script_Lydian
341	\value [since 5.5] Script_Mahajani
342	\value [since 5.15] Script_Makasar
343	\value Script_Malayalam
344	\value Script_Mandaic
345	\value [since 5.5] Script_Manichaean
346	\value [since 5.11] Script_Marchen
347	\value [since 5.11] Script_MasaramGondi
348	\value [since 5.15] Script_Medefaidrin
349	\value Script_MeeteiMayek
350	\value [since 5.5] Script_MendeKikakui
351	\value Script_MeroiticCursive
352	\value Script_MeroiticHieroglyphs
353	\value Script_Miao
354	\value [since 5.5] Script_Modi
355	\value Script_Mongolian
356	\value [since 5.5] Script_Mro
357	\value [since 5.6] Script_Multani
358	\value Script_Myanmar
359	\value [since 5.5] Script_Nabataean
360	\value [since 6.3] Script_NagMundari
361	\value [since 5.15] Script_Nandinagari
362	\value [since 5.11] Script_Newa
363	\value Script_NewTaiLue
364	\value Script_Nko
365	\value [since 5.11] Script_Nushu
366	\value [since 5.15] Script_NyiakengPuachueHmong
367	\value Script_Ogham
368	\value Script_OlChiki
369	\value [since 5.6] Script_OldHungarian
370	\value Script_OldItalic
371	\value [since 5.5] Script_OldNorthArabian
372	\value [since 5.5] Script_OldPermic
373	\value Script_OldPersian
374	\value [since 5.15] Script_OldSogdian
375	\value Script_OldSouthArabian
376	\value Script_OldTurkic
377	\value [since 6.3] Script_OldUyghur
378	\value Script_Oriya
379	\value [since 5.11] Script_Osage
380	\value Script_Osmanya
381	\value [since 5.5] Script_PahawhHmong
382	\value [since 5.5] Script_Palmyrene
383	\value [since 5.5] Script_PauCinHau
384	\value Script_PhagsPa
385	\value Script_Phoenician
386	\value [since 5.5] Script_PsalterPahlavi
387	\value Script_Rejang
388	\value Script_Runic
389	\value Script_Samaritan
390	\value Script_Saurashtra
391	\value Script_Sharada
392	\value Script_Shavian
393	\value [since 5.5] Script_Siddham
394	\value [since 5.6] Script_SignWriting
395	\value Script_Sinhala
396	\value [since 5.15] Script_Sogdian
397	\value Script_SoraSompeng
398	\value [since 5.11] Script_Soyombo
399	\value Script_Sundanese
400	\value Script_SylotiNagri
401	\value Script_Syriac
402	\value Script_Tagalog
403	\value Script_Tagbanwa
404	\value Script_TaiLe
405	\value Script_TaiTham
406	\value Script_TaiViet
407	\value Script_Takri
408	\value Script_Tamil
409	\value [since 5.11] Script_Tangut
410	\value [since 6.3] Script_Tangsa
411	\value Script_Telugu
412	\value Script_Thaana
413	\value Script_Thai
414	\value Script_Tibetan
415	\value Script_Tifinagh
416	\value [since 5.5] Script_Tirhuta
417	\value [since 6.3] Script_Toto
418	\value Script_Ugaritic
419	\value Script_Vai
420	\value [since 6.3] Script_Vithkuqi
421	\value [since 5.15] Script_Wancho
422	\value [since 5.5] Script_WarangCiti
423	\value [since 5.15] Script_Yezidi
424	\value Script_Yi
425	\value [since 5.11] Script_ZanabazarSquare
426
427	\omitvalue ScriptCount
428
429	\sa script()
430	*/
431
432	/*!
433	\enum QChar::Direction
434
435	This enum type defines the Unicode direction attributes. See the
436	\l{https://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode
437	Standard} for a description of the values.
438
439	In order to conform to C/C++ naming conventions "Dir" is prepended
440	to the codes used in the Unicode Standard.
441
442	\value DirAL
443	\value DirAN
444	\value DirB
445	\value DirBN
446	\value DirCS
447	\value DirEN
448	\value DirES
449	\value DirET
450	\value [since 5.3] DirFSI
451	\value DirL
452	\value DirLRE
453	\value [since 5.3] DirLRI
454	\value DirLRO
455	\value DirNSM
456	\value DirON
457	\value DirPDF
458	\value [since 5.3] DirPDI
459	\value DirR
460	\value DirRLE
461	\value [since 5.3] DirRLI
462	\value DirRLO
463	\value DirS
464	\value DirWS
465
466	\sa direction()
467	*/
468
469	/*!
470	\enum QChar::Decomposition
471
472	This enum type defines the Unicode decomposition attributes. See
473	the \l{Unicode standard} for a description of the values.
474
475	\value NoDecomposition
476	\value Canonical
477	\value Circle
478	\value Compat
479	\value Final
480	\value Font
481	\value Fraction
482	\value Initial
483	\value Isolated
484	\value Medial
485	\value Narrow
486	\value NoBreak
487	\value Small
488	\value Square
489	\value Sub
490	\value Super
491	\value Vertical
492	\value Wide
493
494	\sa decomposition()
495	*/
496
497	/*!
498	\enum QChar::JoiningType
499	\since 5.3
500
501	This enum type defines the Unicode joining type attributes. See the
502	\l{Unicode standard} for a description of the values.
503
504	In order to conform to C/C++ naming conventions "Joining_" is prepended
505	to the codes used in the Unicode Standard.
506
507	\value Joining_None
508	\value Joining_Causing
509	\value Joining_Dual
510	\value Joining_Right
511	\value Joining_Left
512	\value Joining_Transparent
513
514	\sa joiningType()
515	*/
516
517	/*!
518	\enum QChar::CombiningClass
519
520	\internal
521
522	This enum type defines names for some of the Unicode combining
523	classes. See the \l{Unicode Standard} for a description of the values.
524
525	\value Combining_Above
526	\value Combining_AboveAttached
527	\value Combining_AboveLeft
528	\value Combining_AboveLeftAttached
529	\value Combining_AboveRight
530	\value Combining_AboveRightAttached
531	\value Combining_Below
532	\value Combining_BelowAttached
533	\value Combining_BelowLeft
534	\value Combining_BelowLeftAttached
535	\value Combining_BelowRight
536	\value Combining_BelowRightAttached
537	\value Combining_DoubleAbove
538	\value Combining_DoubleBelow
539	\value Combining_IotaSubscript
540	\value Combining_Left
541	\value Combining_LeftAttached
542	\value Combining_Right
543	\value Combining_RightAttached
544	*/
545
546	/*!
547	\enum QChar::SpecialCharacter
548
549	\value Null A QChar with this value isNull().
550	\value Tabulation Character tabulation.
551	\value LineFeed
552	\value FormFeed
553	\value CarriageReturn
554	\value Space
555	\value Nbsp Non-breaking space.
556	\value SoftHyphen
557	\value ReplacementCharacter The character shown when a font has no glyph
558	for a certain codepoint. A special question mark character is often
559	used. Codecs use this codepoint when input data cannot be
560	represented in Unicode.
561	\value ObjectReplacementCharacter Used to represent an object such as an
562	image when such objects cannot be presented.
563	\value ByteOrderMark
564	\value ByteOrderSwapped
565	\value ParagraphSeparator
566	\value LineSeparator
567	\value [since 6.2] VisualTabCharacter Used to represent a tabulation as a horizontal arrow.
568	\value LastValidCodePoint
569	*/
570
571	/*!
572	\fn void QChar::setCell(uchar cell)
573	\internal
574	*/
575
576	/*!
577	\fn void QChar::setRow(uchar row)
578	\internal
579	*/
580
581	/*!
582	\fn QChar::QChar()
583
584	Constructs a null QChar ('\\0').
585
586	\sa isNull()
587	*/
588
589	/*!
590	\fn QChar::QChar(QLatin1Char ch)
591
592	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
593	*/
594
595	/*!
596	\fn QChar::QChar(SpecialCharacter ch)
597
598	Constructs a QChar for the predefined character value \a ch.
599	*/
600
601	/*!
602	\fn QChar::QChar(char16_t ch)
603	\since 5.10
604
605	Constructs a QChar corresponding to the UTF-16 character \a ch.
606	*/
607
608	/*!
609	\fn QChar::QChar(wchar_t ch)
610	\since 5.10
611
612	Constructs a QChar corresponding to the wide character \a ch.
613
614	\note This constructor is only available on Windows.
615	*/
616
617	/*!
618	\fn QChar::QChar(char ch)
619
620	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
621
622	\note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
623	is defined.
624
625	\sa QT_NO_CAST_FROM_ASCII
626	*/
627
628	/*!
629	\fn QChar::QChar(uchar ch)
630
631	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
632
633	\note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
634	or \c QT_RESTRICTED_CAST_FROM_ASCII is defined.
635
636	\sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII
637	*/
638
639	/*!
640	\fn QChar::QChar(uchar cell, uchar row)
641
642	Constructs a QChar for Unicode cell \a cell in row \a row.
643
644	\sa cell(), row()
645	*/
646
647	/*!
648	\fn QChar::QChar(ushort code)
649
650	Constructs a QChar for the character with Unicode code point \a code.
651	*/
652
653	/*!
654	\fn QChar::QChar(short code)
655
656	Constructs a QChar for the character with Unicode code point \a code.
657	*/
658
659	/*!
660	\fn QChar::QChar(uint code)
661
662	Constructs a QChar for the character with Unicode code point \a code.
663	*/
664
665	/*!
666	\fn QChar::QChar(int code)
667
668	Constructs a QChar for the character with Unicode code point \a code.
669	*/
670
671	/*!
672	\fn static QChar QChar::fromUcs2(char16_t c)
673	\since 6.0
674
675	Constructs a QChar from UTF-16 character \a c.
676
677	\sa fromUcs4()
678	*/
679
680	/*!
681	\fn static auto QChar::fromUcs4(char32_t c)
682	\since 6.0
683
684	Returns an anonymous struct that
685	\list
686	\li contains a \c{char16_t chars[2]} array,
687	\li can be implicitly converted to a QStringView, and
688	\li iterated over with a C++11 ranged for loop.
689	\endlist
690
691	If \a c requires surrogates, \c{chars[0]} contains the high surrogate
692	and \c{chars[1]} the low surrogate, and the QStringView has size 2.
693	Otherwise, \c{chars[0]} contains \a c and \c{chars[1]} is
694	\l{QChar::isNull}{null}, and the QStringView has size 1.
695
696	This allows easy use of the result:
697
698	\code
699	QString s;
700	s += QChar::fromUcs4(ch);
701	\endcode
702
703	\code
704	for (char16_t c16 : QChar::fromUcs4(ch))
705	use(c16);
706	\endcode
707
708	\sa fromUcs2(), requiresSurrogates()
709	*/
710
711	/*!
712	\fn bool QChar::isNull() const
713
714	Returns \c true if the character is the Unicode character 0x0000
715	('\\0'); otherwise returns \c false.
716	*/
717
718	/*!
719	\fn uchar QChar::cell() const
720
721	Returns the cell (least significant byte) of the Unicode character.
722
723	\sa row()
724	*/
725
726	/*!
727	\fn uchar QChar::row() const
728
729	Returns the row (most significant byte) of the Unicode character.
730
731	\sa cell()
732	*/
733
734	/*!
735	\fn bool QChar::isPrint() const
736
737	Returns \c true if the character is a printable character; otherwise
738	returns \c false. This is any character not of category Other_*.
739
740	Note that this gives no indication of whether the character is
741	available in a particular font.
742	*/
743
744	/!*
745	\overload
746	\since 5.0
747
748	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
749	a printable character; otherwise returns \c false.
750	This is any character not of category Other_.*
751
752	Note that this gives no indication of whether the character is
753	available in a particular font.
754
755	\note Before Qt 6, this function took a \c uint argument.
756	*/
757	bool QChar::isPrint(char32_t ucs4) noexcept
758	{
759	if (ucs4 > LastValidCodePoint)
760	return false;
761	const int test = FLAG(Other_Control) \|
762	FLAG(Other_Format) \|
763	FLAG(Other_Surrogate) \|
764	FLAG(Other_PrivateUse) \|
765	FLAG(Other_NotAssigned);
766	return !(FLAG(qGetProp(ucs4)->category) & test);
767	}
768
769	/*!
770	\fn bool QChar::isSpace() const
771
772	Returns \c true if the character is a separator character
773	(Separator_* categories or certain code points from Other_Control category);
774	otherwise returns \c false.
775	*/
776
777	/*!
778	\fn bool QChar::isSpace(char32_t ucs4)
779	\overload
780	\since 5.0
781
782	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
783	a separator character (Separator_* categories or certain code points
784	from Other_Control category); otherwise returns \c false.
785
786	\note Before Qt 6, this function took a \c uint argument.
787	*/
788
789	/!*
790	\internal
791	*/
792	bool QT_FASTCALL QChar::isSpace_helper(char32_t ucs4) noexcept
793	{
794	if (ucs4 > LastValidCodePoint)
795	return false;
796	const int test = FLAG(Separator_Space) \|
797	FLAG(Separator_Line) \|
798	FLAG(Separator_Paragraph);
799	return FLAG(qGetProp(ucs4)->category) & test;
800	}
801
802	/*!
803	\fn bool QChar::isMark() const
804
805	Returns \c true if the character is a mark (Mark_* categories);
806	otherwise returns \c false.
807
808	See QChar::Category for more information regarding marks.
809	*/
810
811	/!*
812	\overload
813	\since 5.0
814
815	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
816	a mark (Mark_ categories); otherwise returns \c false.*
817
818	\note Before Qt 6, this function took a \c uint argument.
819	*/
820	bool QChar::isMark(char32_t ucs4) noexcept
821	{
822	if (ucs4 > LastValidCodePoint)
823	return false;
824	const int test = FLAG(Mark_NonSpacing) \|
825	FLAG(Mark_SpacingCombining) \|
826	FLAG(Mark_Enclosing);
827	return FLAG(qGetProp(ucs4)->category) & test;
828	}
829
830	/*!
831	\fn bool QChar::isPunct() const
832
833	Returns \c true if the character is a punctuation mark (Punctuation_*
834	categories); otherwise returns \c false.
835	*/
836
837	/!*
838	\overload
839	\since 5.0
840
841	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
842	a punctuation mark (Punctuation_ categories); otherwise returns \c false.*
843
844	\note Before Qt 6, this function took a \c uint argument.
845	*/
846	bool QChar::isPunct(char32_t ucs4) noexcept
847	{
848	if (ucs4 > LastValidCodePoint)
849	return false;
850	const int test = FLAG(Punctuation_Connector) \|
851	FLAG(Punctuation_Dash) \|
852	FLAG(Punctuation_Open) \|
853	FLAG(Punctuation_Close) \|
854	FLAG(Punctuation_InitialQuote) \|
855	FLAG(Punctuation_FinalQuote) \|
856	FLAG(Punctuation_Other);
857	return FLAG(qGetProp(ucs4)->category) & test;
858	}
859
860	/*!
861	\fn bool QChar::isSymbol() const
862
863	Returns \c true if the character is a symbol (Symbol_* categories);
864	otherwise returns \c false.
865	*/
866
867	/!*
868	\overload
869	\since 5.0
870
871	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
872	a symbol (Symbol_ categories); otherwise returns \c false.*
873
874	\note Before Qt 6, this function took a \c uint argument.
875	*/
876	bool QChar::isSymbol(char32_t ucs4) noexcept
877	{
878	if (ucs4 > LastValidCodePoint)
879	return false;
880	const int test = FLAG(Symbol_Math) \|
881	FLAG(Symbol_Currency) \|
882	FLAG(Symbol_Modifier) \|
883	FLAG(Symbol_Other);
884	return FLAG(qGetProp(ucs4)->category) & test;
885	}
886
887	/*!
888	\fn bool QChar::isLetter() const
889
890	Returns \c true if the character is a letter (Letter_* categories);
891	otherwise returns \c false.
892	*/
893
894	/*!
895	\fn bool QChar::isLetter(char32_t ucs4)
896	\overload
897	\since 5.0
898
899	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
900	a letter (Letter_* categories); otherwise returns \c false.
901
902	\note Before Qt 6, this function took a \c uint argument.
903	*/
904
905	/!*
906	\internal
907	*/
908	bool QT_FASTCALL QChar::isLetter_helper(char32_t ucs4) noexcept
909	{
910	if (ucs4 > LastValidCodePoint)
911	return false;
912	const int test = FLAG(Letter_Uppercase) \|
913	FLAG(Letter_Lowercase) \|
914	FLAG(Letter_Titlecase) \|
915	FLAG(Letter_Modifier) \|
916	FLAG(Letter_Other);
917	return FLAG(qGetProp(ucs4)->category) & test;
918	}
919
920	/*!
921	\fn bool QChar::isNumber() const
922
923	Returns \c true if the character is a number (Number_* categories,
924	not just 0-9); otherwise returns \c false.
925
926	\sa isDigit()
927	*/
928
929	/*!
930	\fn bool QChar::isNumber(char32_t ucs4)
931	\overload
932	\since 5.0
933
934	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
935	a number (Number_* categories, not just 0-9); otherwise returns \c false.
936
937	\note Before Qt 6, this function took a \c uint argument.
938
939	\sa isDigit()
940	*/
941
942	/!*
943	\internal
944	*/
945	bool QT_FASTCALL QChar::isNumber_helper(char32_t ucs4) noexcept
946	{
947	if (ucs4 > LastValidCodePoint)
948	return false;
949	const int test = FLAG(Number_DecimalDigit) \|
950	FLAG(Number_Letter) \|
951	FLAG(Number_Other);
952	return FLAG(qGetProp(ucs4)->category) & test;
953	}
954
955	/*!
956	\fn bool QChar::isLetterOrNumber() const
957
958	Returns \c true if the character is a letter or number (Letter_* or
959	Number_* categories); otherwise returns \c false.
960	*/
961
962	/*!
963	\fn bool QChar::isLetterOrNumber(char32_t ucs4)
964	\overload
965	\since 5.0
966
967	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
968	a letter or number (Letter_* or Number_* categories); otherwise returns \c false.
969
970	\note Before Qt 6, this function took a \c uint argument.
971	*/
972
973	/!*
974	\internal
975	*/
976	bool QT_FASTCALL QChar::isLetterOrNumber_helper(char32_t ucs4) noexcept
977	{
978	if (ucs4 > LastValidCodePoint)
979	return false;
980	const int test = FLAG(Letter_Uppercase) \|
981	FLAG(Letter_Lowercase) \|
982	FLAG(Letter_Titlecase) \|
983	FLAG(Letter_Modifier) \|
984	FLAG(Letter_Other) \|
985	FLAG(Number_DecimalDigit) \|
986	FLAG(Number_Letter) \|
987	FLAG(Number_Other);
988	return FLAG(qGetProp(ucs4)->category) & test;
989	}
990
991	/*!
992	\fn bool QChar::isDigit() const
993
994	Returns \c true if the character is a decimal digit
995	(Number_DecimalDigit); otherwise returns \c false.
996
997	\sa isNumber()
998	*/
999
1000	/*!
1001	\fn bool QChar::isDigit(char32_t ucs4)
1002	\overload
1003	\since 5.0
1004
1005	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
1006	a decimal digit (Number_DecimalDigit); otherwise returns \c false.
1007
1008	\note Before Qt 6, this function took a \c uint argument.
1009
1010	\sa isNumber()
1011	*/
1012
1013	/*!
1014	\fn bool QChar::isNonCharacter() const
1015	\since 5.0
1016
1017	Returns \c true if the QChar is a non-character; false otherwise.
1018
1019	Unicode has a certain number of code points that are classified
1020	as "non-characters": that is, they can be used for internal purposes
1021	in applications but cannot be used for text interchange.
1022	Those are the last two entries of each Unicode plane ([0xfffe..0xffff],
1023	[0x1fffe..0x1ffff], etc.) as well as the entries in the range [0xfdd0..0xfdef].
1024	*/
1025
1026	/*!
1027	\fn bool QChar::isHighSurrogate() const
1028
1029	Returns \c true if the QChar is the high part of a UTF-16 surrogate
1030	(that is, if its code point is in the range [0xd800..0xdbff]); false otherwise.
1031	*/
1032
1033	/*!
1034	\fn bool QChar::isLowSurrogate() const
1035
1036	Returns \c true if the QChar is the low part of a UTF-16 surrogate
1037	(that is, if its code point is in the range [0xdc00..0xdfff]); false otherwise.
1038	*/
1039
1040	/*!
1041	\fn bool QChar::isSurrogate() const
1042	\since 5.0
1043
1044	Returns \c true if the QChar contains a code point that is in either
1045	the high or the low part of the UTF-16 surrogate range
1046	(that is, if its code point is in the range [0xd800..0xdfff]); false otherwise.
1047	*/
1048
1049	/*!
1050	\fn static bool QChar::isNonCharacter(char32_t ucs4)
1051	\overload
1052	\since 5.0
1053
1054	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1055	is a non-character; false otherwise.
1056
1057	Unicode has a certain number of code points that are classified
1058	as "non-characters": that is, they can be used for internal purposes
1059	in applications but cannot be used for text interchange.
1060	Those are the last two entries of each Unicode plane ([0xfffe..0xffff],
1061	[0x1fffe..0x1ffff], etc.) as well as the entries in the range [0xfdd0..0xfdef].
1062
1063	\note Before Qt 6, this function took a \c uint argument.
1064	*/
1065
1066	/*!
1067	\fn static bool QChar::isHighSurrogate(char32_t ucs4)
1068	\overload
1069
1070	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1071	is the high part of a UTF16 surrogate
1072	(that is, if its code point is in the range [0xd800..0xdbff]); false otherwise.
1073
1074	\note Before Qt 6, this function took a \c uint argument.
1075	*/
1076
1077	/*!
1078	\fn static bool QChar::isLowSurrogate(char32_t ucs4)
1079	\overload
1080
1081	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1082	is the low part of a UTF16 surrogate
1083	(that is, if its code point is in the range [0xdc00..0xdfff]); false otherwise.
1084
1085	\note Before Qt 6, this function took a \c uint argument.
1086	*/
1087
1088	/!*
1089	\fn static bool QChar::isSurrogate(char32_t ucs4)
1090	\overload
1091	\since 5.0
1092
1093	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1094	contains a code point that is in either the high or the low part of the
1095	UTF-16 surrogate range (for example if its code point is in range [0xd800..0xdfff]);
1096	false otherwise.
1097
1098	\note Before Qt 6, this function took a \c uint argument.
1099	*/
1100
1101	/!*
1102	\fn static bool QChar::requiresSurrogates(char32_t ucs4)
1103
1104	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1105	can be split into the high and low parts of a UTF16 surrogate
1106	(for example if its code point is greater than or equal to 0x10000);
1107	false otherwise.
1108
1109	\note Before Qt 6, this function took a \c uint argument.
1110	*/
1111
1112	/!*
1113	\fn static char32_t QChar::surrogateToUcs4(char16_t high, char16_t low)
1114
1115	Converts a UTF16 surrogate pair with the given \a high and \a low values
1116	to its UCS-4-encoded code point.
1117
1118	\note Before Qt 6, this function took \c ushort arguments and returned \c uint.
1119	*/
1120
1121	/!*
1122	\fn static char32_t QChar::surrogateToUcs4(QChar high, QChar low)
1123	\overload
1124
1125	Converts a UTF16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
1126
1127	\note Before Qt 6, this function returned \c uint.
1128	*/
1129
1130	/!*
1131	\fn static char16_t QChar::highSurrogate(char32_t ucs4)
1132
1133	Returns the high surrogate part of a UCS-4-encoded code point.
1134	The returned result is undefined if \a ucs4 is smaller than 0x10000.
1135
1136	\note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1137	*/
1138
1139	/!*
1140	\fn static char16_t QChar::lowSurrogate(char32_t ucs4)
1141
1142	Returns the low surrogate part of a UCS-4-encoded code point.
1143	The returned result is undefined if \a ucs4 is smaller than 0x10000.
1144
1145	\note Before Qt 6, this function took a \c uint argument and returned \c ushort.
1146	*/
1147
1148	/!*
1149	\fn int QChar::digitValue() const
1150
1151	Returns the numeric value of the digit, or -1 if the character is not a digit.
1152	*/
1153
1154	/!*
1155	\overload
1156	Returns the numeric value of the digit specified by the UCS-4-encoded
1157	character, \a ucs4, or -1 if the character is not a digit.
1158
1159	\note Before Qt 6, this function took a \c uint argument.
1160	*/
1161	int QChar::digitValue(char32_t ucs4) noexcept
1162	{
1163	if (ucs4 > LastValidCodePoint)
1164	return -`1`;
1165	return qGetProp(ucs4)->digitValue;
1166	}
1167
1168	/!*
1169	\fn QChar::Category QChar::category() const
1170
1171	Returns the character's category.
1172	*/
1173
1174	/!*
1175	\overload
1176	Returns the category of the UCS-4-encoded character specified by \a ucs4.
1177
1178	\note Before Qt 6, this function took a \c uint argument.
1179	*/
1180	QChar::Category QChar::category(char32_t ucs4) noexcept
1181	{
1182	if (ucs4 > LastValidCodePoint)
1183	return QChar::Other_NotAssigned;
1184	return (QChar::Category) qGetProp(ucs4)->category;
1185	}
1186
1187	/!*
1188	\fn QChar::Direction QChar::direction() const
1189
1190	Returns the character's direction.
1191	*/
1192
1193	/!*
1194	\overload
1195	Returns the direction of the UCS-4-encoded character specified by \a ucs4.
1196
1197	\note Before Qt 6, this function took a \c uint argument.
1198	*/
1199	QChar::Direction QChar::direction(char32_t ucs4) noexcept
1200	{
1201	if (ucs4 > LastValidCodePoint)
1202	return QChar::DirL;
1203	return (QChar::Direction) qGetProp(ucs4)->direction;
1204	}
1205
1206	/!*
1207	\fn QChar::JoiningType QChar::joiningType() const
1208	\since 5.3
1209
1210	Returns information about the joining type attributes of the character
1211	(needed for certain languages such as Arabic or Syriac).
1212	*/
1213
1214	/!*
1215	\overload
1216	\since 5.3
1217
1218	Returns information about the joining type attributes of the UCS-4-encoded
1219	character specified by \a ucs4
1220	(needed for certain languages such as Arabic or Syriac).
1221
1222	\note Before Qt 6, this function took a \c uint argument.
1223	*/
1224	QChar::JoiningType QChar::joiningType(char32_t ucs4) noexcept
1225	{
1226	if (ucs4 > LastValidCodePoint)
1227	return QChar::Joining_None;
1228	return QChar::JoiningType(qGetProp(ucs4)->joining);
1229	}
1230
1231	/!*
1232	\fn bool QChar::hasMirrored() const
1233
1234	Returns \c true if the character should be reversed if the text
1235	direction is reversed; otherwise returns \c false.
1236
1237	A bit faster equivalent of (ch.mirroredChar() != ch).
1238
1239	\sa mirroredChar()
1240	*/
1241
1242	/!*
1243	\overload
1244	\since 5.0
1245
1246	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1247	should be reversed if the text direction is reversed; otherwise returns \c false.
1248
1249	A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
1250
1251	\note Before Qt 6, this function took a \c uint argument.
1252
1253	\sa mirroredChar()
1254	*/
1255	bool QChar::hasMirrored(char32_t ucs4) noexcept
1256	{
1257	if (ucs4 > LastValidCodePoint)
1258	return false;
1259	return qGetProp(ucs4)->mirrorDiff != `0`;
1260	}
1261
1262	/!*
1263	\fn bool QChar::isLower() const
1264
1265	Returns \c true if the character is a lowercase letter, for example
1266	category() is Letter_Lowercase.
1267
1268	\sa isUpper(), toLower(), toUpper()
1269	*/
1270
1271	/!*
1272	\fn static bool QChar::isLower(char32_t ucs4)
1273	\overload
1274	\since 5.0
1275
1276	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1277	is a lowercase letter, for example category() is Letter_Lowercase.
1278
1279	\note Before Qt 6, this function took a \c uint argument.
1280
1281	\sa isUpper(), toLower(), toUpper()
1282	*/
1283
1284	/!*
1285	\fn bool QChar::isUpper() const
1286
1287	Returns \c true if the character is an uppercase letter, for example
1288	category() is Letter_Uppercase.
1289
1290	\sa isLower(), toUpper(), toLower()
1291	*/
1292
1293	/!*
1294	\fn static bool QChar::isUpper(char32_t ucs4)
1295	\overload
1296	\since 5.0
1297
1298	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1299	is an uppercase letter, for example category() is Letter_Uppercase.
1300
1301	\note Before Qt 6, this function took a \c uint argument.
1302
1303	\sa isLower(), toUpper(), toLower()
1304	*/
1305
1306	/!*
1307	\fn bool QChar::isTitleCase() const
1308
1309	Returns \c true if the character is a titlecase letter, for example
1310	category() is Letter_Titlecase.
1311
1312	\sa isLower(), toUpper(), toLower(), toTitleCase()
1313	*/
1314
1315	/!*
1316	\fn static bool QChar::isTitleCase(char32_t ucs4)
1317	\overload
1318	\since 5.0
1319
1320	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1321	is a titlecase letter, for example category() is Letter_Titlecase.
1322
1323	\note Before Qt 6, this function took a \c uint argument.
1324
1325	\sa isLower(), toUpper(), toLower(), toTitleCase()
1326	*/
1327	/!*
1328	\fn QChar QChar::mirroredChar() const
1329
1330	Returns the mirrored character if this character is a mirrored
1331	character; otherwise returns the character itself.
1332
1333	\sa hasMirrored()
1334	*/
1335
1336	/!*
1337	\overload
1338	Returns the mirrored character if the UCS-4-encoded character specified
1339	by \a ucs4 is a mirrored character; otherwise returns the character itself.
1340
1341	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1342
1343	\sa hasMirrored()
1344	*/
1345	char32_t QChar::mirroredChar(char32_t ucs4) noexcept
1346	{
1347	if (ucs4 > LastValidCodePoint)
1348	return ucs4;
1349	return ucs4 + qGetProp(ucs4)->mirrorDiff;
1350	}
1351
1352	// Constants for Hangul (de)composition, see UAX #15:
1353	static constexpr char32_t Hangul_SBase = `0xac00`;
1354	static constexpr char32_t Hangul_LBase = `0x1100`;
1355	static constexpr char32_t Hangul_VBase = `0x1161`;
1356	static constexpr char32_t Hangul_TBase = `0x11a7`;
1357	static constexpr quint32 Hangul_LCount = `19`;
1358	static constexpr quint32 Hangul_VCount = `21`;
1359	static constexpr quint32 Hangul_TCount = `28`;
1360	static constexpr quint32 Hangul_NCount = Hangul_VCount * Hangul_TCount;
1361	static constexpr quint32 Hangul_SCount = Hangul_LCount * Hangul_NCount;
1362
1363	// buffer has to have a length of 3. It's needed for Hangul decomposition
1364	static const QChar * QT_FASTCALL decompositionHelper(
1365	char32_t ucs4, qsizetype length, QChar::Decomposition tag, QChar *buffer)
1366	{
1367	if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1368	// compute Hangul syllable decomposition as per UAX #15
1369	const char32_t SIndex = ucs4 - Hangul_SBase;
1370	buffer[`0`] = QChar (Hangul_LBase + SIndex / Hangul_NCount); // L
1371	buffer[`1`] = QChar (Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount); // V
1372	buffer[`2`] = QChar (Hangul_TBase + SIndex % Hangul_TCount); // T
1373	*length = buffer[`2`].unicode() == Hangul_TBase ? `2` : `3`;
1374	*tag = QChar::Canonical;
1375	return buffer;
1376	}
1377
1378	const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1379	if (index == `0xffff`) {
1380	*length = `0`;
1381	*tag = QChar::NoDecomposition;
1382	return nullptr;
1383	}
1384
1385	const unsigned short *decomposition = uc_decomposition_map+index;
1386	tag = QChar::Decomposition((decomposition) & `0xff`);
1387	length = (decomposition) >> `8`;
1388	return reinterpret_cast<const QChar *>(decomposition + `1`);
1389	}
1390
1391	/!*
1392	Decomposes a character into it's constituent parts. Returns an empty string
1393	if no decomposition exists.
1394	*/
1395	QString QChar::decomposition() const
1396	{
1397	return QChar::decomposition(ucs4: ucs);
1398	}
1399
1400	/!*
1401	\overload
1402	Decomposes the UCS-4-encoded character specified by \a ucs4 into it's
1403	constituent parts. Returns an empty string if no decomposition exists.
1404
1405	\note Before Qt 6, this function took a \c uint argument.
1406	*/
1407	QString QChar::decomposition(char32_t ucs4)
1408	{
1409	QChar buffer[`3`];
1410	qsizetype length;
1411	QChar::Decomposition tag;
1412	const QChar *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1413	return QString (d, length);
1414	}
1415
1416	/!*
1417	\fn QChar::Decomposition QChar::decompositionTag() const
1418
1419	Returns the tag defining the composition of the character. Returns
1420	QChar::NoDecomposition if no decomposition exists.
1421	*/
1422
1423	/!*
1424	\overload
1425	Returns the tag defining the composition of the UCS-4-encoded character
1426	specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1427
1428	\note Before Qt 6, this function took a \c uint argument.
1429	*/
1430	QChar::Decomposition QChar::decompositionTag(char32_t ucs4) noexcept
1431	{
1432	if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1433	return QChar::Canonical;
1434	const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1435	if (index == `0xffff`)
1436	return QChar::NoDecomposition;
1437	return (QChar::Decomposition)(uc_decomposition_map[index] & `0xff`);
1438	}
1439
1440	/!*
1441	\fn unsigned char QChar::combiningClass() const
1442
1443	Returns the combining class for the character as defined in the
1444	Unicode standard. This is mainly useful as a positioning hint for
1445	marks attached to a base character.
1446
1447	The Qt text rendering engine uses this information to correctly
1448	position non-spacing marks around a base character.
1449	*/
1450
1451	/!*
1452	\overload
1453	Returns the combining class for the UCS-4-encoded character specified by
1454	\a ucs4, as defined in the Unicode standard.
1455
1456	\note Before Qt 6, this function took a \c uint argument.
1457	*/
1458	unsigned char QChar::combiningClass(char32_t ucs4) noexcept
1459	{
1460	if (ucs4 > LastValidCodePoint)
1461	return `0`;
1462	return (unsigned char) qGetProp(ucs4)->combiningClass;
1463	}
1464
1465	/!*
1466	\fn QChar::Script QChar::script() const
1467	\since 5.1
1468
1469	Returns the Unicode script property value for this character.
1470	*/
1471
1472	/!*
1473	\overload
1474	\since 5.1
1475
1476	Returns the Unicode script property value for the character specified in
1477	its UCS-4-encoded form as \a ucs4.
1478
1479	\note Before Qt 6, this function took a \c uint argument.
1480	*/
1481	QChar::Script QChar::script(char32_t ucs4) noexcept
1482	{
1483	if (ucs4 > LastValidCodePoint)
1484	return QChar::Script_Unknown;
1485	return (QChar::Script) qGetProp(ucs4)->script;
1486	}
1487
1488	/!*
1489	\fn QChar::UnicodeVersion QChar::unicodeVersion() const
1490
1491	Returns the Unicode version that introduced this character.
1492	*/
1493
1494	/!*
1495	\overload
1496	Returns the Unicode version that introduced the character specified in
1497	its UCS-4-encoded form as \a ucs4.
1498
1499	\note Before Qt 6, this function took a \c uint argument.
1500	*/
1501	QChar::UnicodeVersion QChar::unicodeVersion(char32_t ucs4) noexcept
1502	{
1503	if (ucs4 > LastValidCodePoint)
1504	return QChar::Unicode_Unassigned;
1505	return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1506	}
1507
1508	/!*
1509	Returns the most recent supported Unicode version.
1510	*/
1511	QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
1512	{
1513	return UNICODE_DATA_VERSION;
1514	}
1515
1516	static auto fullConvertCase(char32_t uc, QUnicodeTables::Case which) noexcept
1517	{
1518	struct R {
1519	char16_t chars[MaxSpecialCaseLength + `1`];
1520	qint8 sz;
1521
1522	// iterable
1523	auto begin() const { return chars; }
1524	auto end() const { return chars + sz; }
1525	// QStringView-compatible
1526	auto data() const { return chars; }
1527	auto size() const { return sz; }
1528	} result;
1529	Q_ASSERT(uc <= QChar::LastValidCodePoint);
1530
1531	auto pp = result.chars;
1532
1533	const auto fold = qGetProp(ucs4: uc)->cases[which];
1534	const auto caseDiff = fold.diff;
1535
1536	if (Q_UNLIKELY(fold.special)) {
1537	const auto *specialCase = specialCaseMap + caseDiff;
1538	auto length = *specialCase++;
1539	while (length--)
1540	pp++ = specialCase++;
1541	} else {
1542	// so far, case conversion never changes planes (guaranteed by the qunicodetables generator)
1543	for (char16_t c : QChar::fromUcs4(c: uc + caseDiff))
1544	*pp++ = c;
1545	}
1546	result.sz = pp - result.chars;
1547	return result;
1548	}
1549
1550	template <typename T>
1551	Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
1552	{
1553	const auto fold = qGetProp(uc)->cases[which];
1554
1555	if (Q_UNLIKELY(fold.special)) {
1556	const ushort *specialCase = specialCaseMap + fold.diff;
1557	// so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator)
1558	return *specialCase == `1` ? specialCase[`1`] : uc;
1559	}
1560
1561	return uc + fold.diff;
1562	}
1563
1564	/!*
1565	\fn QChar QChar::toLower() const
1566
1567	Returns the lowercase equivalent if the character is uppercase or titlecase;
1568	otherwise returns the character itself.
1569	*/
1570
1571	/!*
1572	\overload
1573	Returns the lowercase equivalent of the UCS-4-encoded character specified
1574	by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1575	the character itself.
1576
1577	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1578	*/
1579	char32_t QChar::toLower(char32_t ucs4) noexcept
1580	{
1581	if (ucs4 > LastValidCodePoint)
1582	return ucs4;
1583	return convertCase_helper(uc: ucs4, which: QUnicodeTables::LowerCase);
1584	}
1585
1586	/!*
1587	\fn QChar QChar::toUpper() const
1588
1589	Returns the uppercase equivalent if the character is lowercase or titlecase;
1590	otherwise returns the character itself.
1591
1592	\note This function also returns the original character in the rare case of
1593	the uppercase form of the character requiring two or more characters.
1594
1595	\sa QString::toUpper()
1596	*/
1597
1598	/!*
1599	\overload
1600	Returns the uppercase equivalent of the UCS-4-encoded character specified
1601	by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1602	the character itself.
1603
1604	\note This function also returns the original character in the rare case of
1605	the uppercase form of the character requiring two or more characters.
1606
1607	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1608
1609	\sa QString::toUpper()
1610	*/
1611	char32_t QChar::toUpper(char32_t ucs4) noexcept
1612	{
1613	if (ucs4 > LastValidCodePoint)
1614	return ucs4;
1615	return convertCase_helper(uc: ucs4, which: QUnicodeTables::UpperCase);
1616	}
1617
1618	/!*
1619	\fn QChar QChar::toTitleCase() const
1620
1621	Returns the title case equivalent if the character is lowercase or uppercase;
1622	otherwise returns the character itself.
1623	*/
1624
1625	/!*
1626	\overload
1627	Returns the title case equivalent of the UCS-4-encoded character specified
1628	by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1629	the character itself.
1630
1631	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1632	*/
1633	char32_t QChar::toTitleCase(char32_t ucs4) noexcept
1634	{
1635	if (ucs4 > LastValidCodePoint)
1636	return ucs4;
1637	return convertCase_helper(uc: ucs4, which: QUnicodeTables::TitleCase);
1638	}
1639
1640	static inline char32_t foldCase(const char16_t ch, const* char16_t *start)
1641	{
1642	char32_t ucs4 = *ch;
1643	if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(ucs4: *(ch - `1`)))
1644	ucs4 = QChar::surrogateToUcs4(high: *(ch - `1`), low: ucs4);
1645	return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1646	}
1647
1648	static inline char32_t foldCase(char32_t ch, char32_t &last) noexcept
1649	{
1650	char32_t ucs4 = ch;
1651	if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(ucs4: last))
1652	ucs4 = QChar::surrogateToUcs4(high: last, low: ucs4);
1653	last = ch;
1654	return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1655	}
1656
1657	static inline char16_t foldCase(char16_t ch) noexcept
1658	{
1659	return convertCase_helper(uc: ch, which: QUnicodeTables::CaseFold);
1660	}
1661
1662	static inline QChar foldCase(QChar ch) noexcept
1663	{
1664	return QChar (foldCase(ch: ch.unicode()));
1665	}
1666
1667	/!*
1668	\fn QChar QChar::toCaseFolded() const
1669
1670	Returns the case folded equivalent of the character.
1671	For most Unicode characters this is the same as toLower().
1672	*/
1673
1674	/!*
1675	\overload
1676	Returns the case folded equivalent of the UCS-4-encoded character specified
1677	by \a ucs4. For most Unicode characters this is the same as toLower().
1678
1679	\note Before Qt 6, this function took a \c uint argument and returned \c uint.
1680	*/
1681	char32_t QChar::toCaseFolded(char32_t ucs4) noexcept
1682	{
1683	if (ucs4 > LastValidCodePoint)
1684	return ucs4;
1685	return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1686	}
1687
1688	/!*
1689	\fn char QChar::toLatin1() const
1690
1691	Returns the Latin-1 character equivalent to the QChar, or 0. This
1692	is mainly useful for non-internationalized software.
1693
1694	\note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
1695	(NUL) character. Prefer to use unicode(), which does not have this ambiguity.
1696
1697	\sa unicode()
1698	*/
1699
1700	/!*
1701	\fn QChar QChar::fromLatin1(char)
1702
1703	Converts the Latin-1 character \a c to its equivalent QChar. This
1704	is mainly useful for non-internationalized software.
1705
1706	An alternative is to use QLatin1Char.
1707
1708	\sa toLatin1(), unicode()
1709	*/
1710
1711	#ifndef QT_NO_DATASTREAM
1712	/!*
1713	\relates QChar
1714
1715	Writes the char \a chr to the stream \a out.
1716
1717	\sa {Serializing Qt Data Types}
1718	*/
1719	QDataStream &operator<<(QDataStream &out, QChar chr)
1720	{
1721	out << quint16(chr.unicode());
1722	return out;
1723	}
1724
1725	/!*
1726	\relates QChar
1727
1728	Reads a char from the stream \a in into char \a chr.
1729
1730	\sa {Serializing Qt Data Types}
1731	*/
1732	QDataStream &operator>>(QDataStream &in, QChar &chr)
1733	{
1734	quint16 u;
1735	in >> u;
1736	chr.unicode() = char16_t(u);
1737	return in;
1738	}
1739	#endif // QT_NO_DATASTREAM
1740
1741	/!*
1742	\fn QChar::unicode()
1743
1744	Returns a reference to the numeric Unicode value of the QChar.
1745	*/
1746
1747	/!*
1748	\fn QChar::unicode() const
1749
1750	Returns the numeric Unicode value of the QChar.
1751	*/
1752
1753	/*****************************************************************************
1754	Documentation of QChar related functions
1755	*****************************************************************************/
1756
1757	/!*
1758	\fn bool QChar::operator==(const QChar &c1, const QChar &c2)
1759
1760	Returns \c true if \a c1 and \a c2 are the same Unicode character;
1761	otherwise returns \c false.
1762	*/
1763
1764	/!*
1765	\fn bool QChar::operator!=(const QChar &c1, const QChar &c2)
1766
1767	Returns \c true if \a c1 and \a c2 are not the same Unicode
1768	character; otherwise returns \c false.
1769	*/
1770
1771	/!*
1772	\fn bool QChar::operator<=(const QChar &c1, const QChar &c2)
1773
1774	Returns \c true if the numeric Unicode value of \a c1 is less than
1775	or equal to that of \a c2; otherwise returns \c false.
1776	*/
1777
1778	/!*
1779	\fn bool QChar::operator>=(const QChar &c1, const QChar &c2)
1780
1781	Returns \c true if the numeric Unicode value of \a c1 is greater than
1782	or equal to that of \a c2; otherwise returns \c false.
1783	*/
1784
1785	/!*
1786	\fn bool QChar::operator<(const QChar &c1, const QChar &c2)
1787
1788	Returns \c true if the numeric Unicode value of \a c1 is less than
1789	that of \a c2; otherwise returns \c false.
1790	*/
1791
1792	/!*
1793	\fn bool QChar::operator>(const QChar &c1, const QChar &c2)
1794
1795	Returns \c true if the numeric Unicode value of \a c1 is greater than
1796	that of \a c2; otherwise returns \c false.
1797	*/
1798
1799	/!*
1800	\fn Qt::Literals::StringLiterals::operator""_L1(char ch)
1801
1802	\relates QLatin1Char
1803	\since 6.4
1804
1805	Literal operator that creates a QLatin1Char out of \a ch.
1806
1807	The following code creates a QLatin1Char:
1808	\code
1809	using namespace Qt::Literals::StringLiterals;
1810
1811	auto ch = 'a'_L1;
1812	\endcode
1813
1814	\sa Qt::Literals::StringLiterals
1815	*/
1816
1817	// ---------------------------------------------------------------------------
1818
1819
1820	static void decomposeHelper(QString str, bool* canonical, QChar::UnicodeVersion version, qsizetype from)
1821	{
1822	qsizetype length;
1823	QChar::Decomposition tag;
1824	QChar buffer[`3`];
1825
1826	QString &s = *str;
1827
1828	const unsigned short utf16 = reinterpret_cast<unsigned* short *>(s.data());
1829	const unsigned short *uc = utf16 + s.size();
1830	while (uc != utf16 + from) {
1831	char32_t ucs4 = *(--uc);
1832	if (QChar (ucs4).isLowSurrogate() && uc != utf16) {
1833	ushort high = *(uc - `1`);
1834	if (QChar (high).isHighSurrogate()) {
1835	--uc;
1836	ucs4 = QChar::surrogateToUcs4(high, low: ucs4);
1837	}
1838	}
1839
1840	if (QChar::unicodeVersion(ucs4) > version)
1841	continue;
1842
1843	const QChar *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1844	if (!d \|\| (canonical && tag != QChar::Canonical))
1845	continue;
1846
1847	qsizetype pos = uc - utf16;
1848	s.replace(i: pos, len: QChar::requiresSurrogates(ucs4) ? `2` : `1`, s: d, slen: length);
1849	// since the replace invalidates the pointers and we do decomposition recursive
1850	utf16 = reinterpret_cast<unsigned short *>(s.data());
1851	uc = utf16 + pos + length;
1852	}
1853	}
1854
1855
1856	struct UCS2Pair {
1857	ushort u1;
1858	ushort u2;
1859	};
1860
1861	inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2)
1862	{ return ligature1.u1 < ligature2.u1; }
1863	inline bool operator<(ushort u1, const UCS2Pair &ligature)
1864	{ return u1 < ligature.u1; }
1865	inline bool operator<(const UCS2Pair &ligature, ushort u1)
1866	{ return ligature.u1 < u1; }
1867
1868	struct UCS2SurrogatePair {
1869	UCS2Pair p1;
1870	UCS2Pair p2;
1871	};
1872
1873	inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2)
1874	{ return QChar::surrogateToUcs4(high: ligature1.p1.u1, low: ligature1.p1.u2) < QChar::surrogateToUcs4(high: ligature2.p1.u1, low: ligature2.p1.u2); }
1875	inline bool operator<(char32_t u1, const UCS2SurrogatePair &ligature)
1876	{ return u1 < QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2); }
1877	inline bool operator<(const UCS2SurrogatePair &ligature, char32_t u1)
1878	{ return QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2) < u1; }
1879
1880	static char32_t inline ligatureHelper(char32_t u1, char32_t u2)
1881	{
1882	if (u1 >= Hangul_LBase && u1 < Hangul_SBase + Hangul_SCount) {
1883	// compute Hangul syllable composition as per UAX #15
1884	// hangul L-V pair
1885	const char32_t LIndex = u1 - Hangul_LBase;
1886	if (LIndex < Hangul_LCount) {
1887	const char32_t VIndex = u2 - Hangul_VBase;
1888	if (VIndex < Hangul_VCount)
1889	return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1890	}
1891	// hangul LV-T pair
1892	const char32_t SIndex = u1 - Hangul_SBase;
1893	if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == `0`) {
1894	const char32_t TIndex = u2 - Hangul_TBase;
1895	if (TIndex < Hangul_TCount && TIndex)
1896	return u1 + TIndex;
1897	}
1898	}
1899
1900	const unsigned short index = GET_LIGATURE_INDEX(u2);
1901	if (index == `0xffff`)
1902	return `0`;
1903	const unsigned short *ligatures = uc_ligature_map+index;
1904	ushort length = *ligatures++;
1905	if (QChar::requiresSurrogates(ucs4: u1)) {
1906	const UCS2SurrogatePair data = reinterpret_cast<const* UCS2SurrogatePair *>(ligatures);
1907	const UCS2SurrogatePair *r = std::lower_bound(first: data, last: data + length, val: u1);
1908	if (r != data + length && QChar::surrogateToUcs4(high: r->p1.u1, low: r->p1.u2) == u1)
1909	return QChar::surrogateToUcs4(high: r->p2.u1, low: r->p2.u2);
1910	} else {
1911	const UCS2Pair data = reinterpret_cast<const* UCS2Pair *>(ligatures);
1912	const UCS2Pair *r = std::lower_bound(first: data, last: data + length, val: ushort(u1));
1913	if (r != data + length && r->u1 == ushort(u1))
1914	return r->u2;
1915	}
1916
1917	return `0`;
1918	}
1919
1920	static void composeHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1921	{
1922	QString &s = *str;
1923
1924	if (from < `0` \|\| s.size() - from < `2`)
1925	return;
1926
1927	char32_t stcode = `0`; // starter code point
1928	qsizetype starter = -`1`; // starter position
1929	qsizetype next = -`1`; // to prevent i == next
1930	int lastCombining = `255`; // to prevent combining > lastCombining
1931
1932	qsizetype pos = from;
1933	while (pos < s.size()) {
1934	qsizetype i = pos;
1935	char32_t uc = s.at(i: pos).unicode();
1936	if (QChar (uc).isHighSurrogate() && pos < s.size()-`1`) {
1937	ushort low = s.at(i: pos+`1`).unicode();
1938	if (QChar (low).isLowSurrogate()) {
1939	uc = QChar::surrogateToUcs4(high: uc, low);
1940	++pos;
1941	}
1942	}
1943
1944	const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
1945	if (p->unicodeVersion > version) {
1946	starter = -`1`;
1947	next = -`1`; // to prevent i == next
1948	lastCombining = `255`; // to prevent combining > lastCombining
1949	++pos;
1950	continue;
1951	}
1952
1953	int combining = p->combiningClass;
1954	if ((i == next \|\| combining > lastCombining) && starter >= from) {
1955	// allowed to form ligature with S
1956	char32_t ligature = ligatureHelper(u1: stcode, u2: uc);
1957	if (ligature) {
1958	stcode = ligature;
1959	QChar *d = s.data();
1960	// ligatureHelper() never changes planes
1961	qsizetype j = `0`;
1962	for (QChar ch : QChar::fromUcs4(c: ligature))
1963	d[starter + j++] = ch;
1964	s.remove(i, len: j);
1965	continue;
1966	}
1967	}
1968	if (combining == `0`) {
1969	starter = i;
1970	stcode = uc;
1971	next = pos + `1`;
1972	}
1973	lastCombining = combining;
1974
1975	++pos;
1976	}
1977	}
1978
1979
1980	static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, qsizetype from)
1981	{
1982	QString &s = *str;
1983	const qsizetype l = s.size()-`1`;
1984
1985	char32_t u1, u2;
1986	char16_t c1, c2;
1987
1988	qsizetype pos = from;
1989	while (pos < l) {
1990	qsizetype p2 = pos+`1`;
1991	u1 = s.at(i: pos).unicode();
1992	if (QChar::isHighSurrogate(ucs4: u1)) {
1993	const char16_t low = s.at(i: p2).unicode();
1994	if (QChar::isLowSurrogate(ucs4: low)) {
1995	u1 = QChar::surrogateToUcs4(high: u1, low);
1996	if (p2 >= l)
1997	break;
1998	++p2;
1999	}
2000	}
2001	c1 = `0`;
2002
2003	advance:
2004	u2 = s.at(i: p2).unicode();
2005	if (QChar::isHighSurrogate(ucs4: u2) && p2 < l) {
2006	const char16_t low = s.at(i: p2+`1`).unicode();
2007	if (QChar::isLowSurrogate(ucs4: low)) {
2008	u2 = QChar::surrogateToUcs4(high: u2, low);
2009	++p2;
2010	}
2011	}
2012
2013	c2 = `0`;
2014	{
2015	const QUnicodeTables::Properties *p = qGetProp(ucs4: u2);
2016	if (p->unicodeVersion <= version)
2017	c2 = p->combiningClass;
2018	}
2019	if (c2 == `0`) {
2020	pos = p2+`1`;
2021	continue;
2022	}
2023
2024	if (c1 == `0`) {
2025	const QUnicodeTables::Properties *p = qGetProp(ucs4: u1);
2026	if (p->unicodeVersion <= version)
2027	c1 = p->combiningClass;
2028	}
2029
2030	if (c1 > c2) {
2031	QChar *uc = s.data();
2032	qsizetype p = pos;
2033	// exchange characters
2034	for (QChar ch : QChar::fromUcs4(c: u2))
2035	uc[p++] = ch;
2036	for (QChar ch : QChar::fromUcs4(c: u1))
2037	uc[p++] = ch;
2038	if (pos > `0`)
2039	--pos;
2040	if (pos > `0` && s.at(i: pos).isLowSurrogate())
2041	--pos;
2042	} else {
2043	++pos;
2044	if (QChar::requiresSurrogates(ucs4: u1))
2045	++pos;
2046
2047	u1 = u2;
2048	c1 = c2; // != 0
2049	p2 = pos + `1`;
2050	if (QChar::requiresSurrogates(ucs4: u1))
2051	++p2;
2052	if (p2 > l)
2053	break;
2054
2055	goto advance;
2056	}
2057	}
2058	}
2059
2060	// returns true if the text is in a desired Normalization Form already; false otherwise.
2061	// sets lastStable to the position of the last stable code point
2062	static bool normalizationQuickCheckHelper(QString str, QString::NormalizationForm mode, qsizetype from, qsizetype lastStable)
2063	{
2064	static_assert(QString::NormalizationForm_D == `0`);
2065	static_assert(QString::NormalizationForm_C == `1`);
2066	static_assert(QString::NormalizationForm_KD == `2`);
2067	static_assert(QString::NormalizationForm_KC == `3`);
2068
2069	enum { NFQC_YES = `0`, NFQC_NO = `1`, NFQC_MAYBE = `3` };
2070
2071	const auto string = reinterpret_cast<const* char16_t *>(str->constData());
2072	qsizetype length = str->size();
2073
2074	// this avoids one out of bounds check in the loop
2075	while (length > from && QChar::isHighSurrogate(ucs4: string[length - `1`]))
2076	--length;
2077
2078	uchar lastCombining = `0`;
2079	for (qsizetype i = from; i < length; ++i) {
2080	qsizetype pos = i;
2081	char32_t uc = string[i];
2082	if (uc < `0x80`) {
2083	// ASCII characters are stable code points
2084	lastCombining = `0`;
2085	*lastStable = pos;
2086	continue;
2087	}
2088
2089	if (QChar::isHighSurrogate(ucs4: uc)) {
2090	ushort low = string[i + `1`];
2091	if (!QChar::isLowSurrogate(ucs4: low)) {
2092	// treat surrogate like stable code point
2093	lastCombining = `0`;
2094	*lastStable = pos;
2095	continue;
2096	}
2097	++i;
2098	uc = QChar::surrogateToUcs4(high: uc, low);
2099	}
2100
2101	const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
2102
2103	if (p->combiningClass < lastCombining && p->combiningClass > `0`)
2104	return false;
2105
2106	const uchar check = (p->nfQuickCheck >> (mode << `1`)) & `0x03`;
2107	if (check != NFQC_YES)
2108	return false; // ### can we quick check NFQC_MAYBE ?
2109
2110	lastCombining = p->combiningClass;
2111	if (lastCombining == `0`)
2112	*lastStable = pos;
2113	}
2114
2115	if (length != str->size()) // low surrogate parts at the end of text
2116	*lastStable = str->size() - `1`;
2117
2118	return true;
2119	}
2120
2121	QT_END_NAMESPACE
2122

Provided by KDAB

Definitions

source code of qtbase/src/corelib/text/qchar.cpp