qchar.cpp source code [qtbase/src/corelib/text/qchar.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2019 The Qt Company Ltd.
4	** Contact: https://www.qt.io/licensing/
5	**
6	** This file is part of the QtCore module of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial License Usage
10	** Licensees holding valid commercial Qt licenses may use this file in
11	** accordance with the commercial license agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and The Qt Company. For licensing terms
14	** and conditions see https://www.qt.io/terms-conditions. For further
15	** information use the contact form at https://www.qt.io/contact-us.
16	**
17	** GNU Lesser General Public License Usage
18	** Alternatively, this file may be used under the terms of the GNU Lesser
19	** General Public License version 3 as published by the Free Software
20	** Foundation and appearing in the file LICENSE.LGPL3 included in the
21	** packaging of this file. Please review the following information to
22	** ensure the GNU Lesser General Public License version 3 requirements
23	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24	**
25	** GNU General Public License Usage
26	** Alternatively, this file may be used under the terms of the GNU
27	** General Public License version 2.0 or (at your option) the GNU General
28	** Public license version 3 or any later version approved by the KDE Free
29	** Qt Foundation. The licenses are as published by the Free Software
30	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31	** included in the packaging of this file. Please review the following
32	** information to ensure the GNU General Public License requirements will
33	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34	** https://www.gnu.org/licenses/gpl-3.0.html.
35	**
36	** $QT_END_LICENSE$
37	**
38	****************************************************************************/
39
40	// Don't define it while compiling this module, or USERS of Qt will
41	// not be able to link.
42	#ifdef QT_NO_CAST_FROM_ASCII
43	# undef QT_NO_CAST_FROM_ASCII
44	#endif
45	#ifdef QT_NO_CAST_TO_ASCII
46	# undef QT_NO_CAST_TO_ASCII
47	#endif
48	#include "qchar.h"
49
50	#include "qdatastream.h"
51
52	#include "qunicodetables_p.h"
53	#include "qunicodetables.cpp"
54
55	#include <algorithm>
56
57	QT_BEGIN_NAMESPACE
58
59	#define FLAG(x) (1 << (x)) // int with only bit x set; used below to build and test QChar::Category bit masks
60
61	/*!
62	\class QLatin1Char
63	\inmodule QtCore
64	\reentrant
65	\brief The QLatin1Char class provides an 8-bit ASCII/Latin-1 character.
66
67	\ingroup string-processing
68
69	This class is only useful to construct a QChar with 8-bit character.
70
71	\sa QChar, QLatin1String, QString
72	*/
73
74	/*!
75	\fn const char QLatin1Char::toLatin1() const
76
77	Converts a Latin-1 character to an 8-bit ASCII representation of the character.
78	*/
79
80	/*!
81	\fn const ushort QLatin1Char::unicode() const
82
83	Converts a Latin-1 character to a 16-bit-encoded Unicode representation
84	of the character.
85	*/
86
87	/*!
88	\fn QLatin1Char::QLatin1Char(char c)
89
90	Constructs a Latin-1 character for \a c. This constructor should be
91	used when the encoding of the input character is known to be Latin-1.
92	*/
93
94	/*!
95	\class QChar
96	\inmodule QtCore
97	\brief The QChar class provides a 16-bit Unicode character.
98
99	\ingroup string-processing
100	\reentrant
101
102	In Qt, Unicode characters are 16-bit entities without any markup
103	or structure. This class represents such an entity. It is
104	lightweight, so it can be used everywhere. Most compilers treat
105	it like an \c{unsigned short}.
106
107	QChar provides a full complement of testing/classification
108	functions, converting to and from other formats, converting from
109	composed to decomposed Unicode, and trying to compare and
110	case-convert if you ask it to.
111
112	The classification functions include functions like those in the
113	standard C++ header \<cctype\> (formerly \<ctype.h\>), but
114	operating on the full range of Unicode characters, not just for the ASCII
115	range. They all return true if the character is a certain type of character;
116	otherwise they return false. These classification functions are
117	isNull() (returns \c true if the character is '\\0'), isPrint()
118	(true if the character is any sort of printable character,
119	including whitespace), isPunct() (any sort of punctuation),
120	isMark() (Unicode Mark), isLetter() (a letter), isNumber() (any
121	sort of numeric character, not just 0-9), isLetterOrNumber(), and
122	isDigit() (decimal digits). All of these are wrappers around
123	category() which return the Unicode-defined category of each
124	character. Some of these also calculate the derived properties
125	(for example isSpace() returns \c true if the character is of category
126	Separator_* or an exceptional code point from Other_Control category).
127
128	QChar also provides direction(), which indicates the "natural"
129	writing direction of this character. The joiningType() function
130	indicates how the character joins with its neighbors (needed
131	mostly for Arabic or Syriac) and finally hasMirrored(), which indicates
132	whether the character needs to be mirrored when it is printed in
133	its "unnatural" writing direction.
134
135	Composed Unicode characters (like \a ring) can be converted to
136	decomposed Unicode ("a" followed by "ring above") by using decomposition().
137
138	In Unicode, comparison is not necessarily possible and case
139	conversion is very difficult at best. Unicode, covering the
140	"entire" world, also includes most of the world's case and
141	sorting problems. operator==() and friends will do comparison
142	based purely on the numeric Unicode value (code point) of the
143	characters, and toUpper() and toLower() will do case changes when
144	the character has a well-defined uppercase/lowercase equivalent.
145	For locale-dependent comparisons, use QString::localeAwareCompare().
146
147	The conversion functions include unicode() (to a scalar),
148	toLatin1() (to scalar, but converts all non-Latin-1 characters to
149	0), row() (gives the Unicode row), cell() (gives the Unicode
150	cell), digitValue() (gives the integer value of any of the
151	numerous digit characters), and a host of constructors.
152
153	QChar provides constructors and cast operators that make it easy
154	to convert to and from traditional 8-bit \c{char}s. If you
155	defined \c QT_NO_CAST_FROM_ASCII and \c QT_NO_CAST_TO_ASCII, as
156	explained in the QString documentation, you will need to
157	explicitly call fromLatin1(), or use QLatin1Char,
158	to construct a QChar from an 8-bit \c char, and you will need to
159	call toLatin1() to get the 8-bit value back.
160
161	For more information see
162	\l{https://www.unicode.org/ucd/}{"About the Unicode Character Database"}.
163
164	\sa Unicode, QString, QLatin1Char
165	*/
166
167	/*!
168	\enum QChar::UnicodeVersion
169
170	Specifies which version of the \l{Unicode standard} introduced a certain
171	character.
172
173	\value Unicode_1_1 Version 1.1
174	\value Unicode_2_0 Version 2.0
175	\value Unicode_2_1_2 Version 2.1.2
176	\value Unicode_3_0 Version 3.0
177	\value Unicode_3_1 Version 3.1
178	\value Unicode_3_2 Version 3.2
179	\value Unicode_4_0 Version 4.0
180	\value Unicode_4_1 Version 4.1
181	\value Unicode_5_0 Version 5.0
182	\value Unicode_5_1 Version 5.1
183	\value Unicode_5_2 Version 5.2
184	\value Unicode_6_0 Version 6.0
185	\value Unicode_6_1 Version 6.1
186	\value Unicode_6_2 Version 6.2
187	\value Unicode_6_3 Version 6.3 Since Qt 5.3
188	\value Unicode_7_0 Version 7.0 Since Qt 5.5
189	\value Unicode_8_0 Version 8.0 Since Qt 5.6
190	\value Unicode_9_0 Version 9.0 Since Qt 5.11
191	\value Unicode_10_0 Version 10.0 Since Qt 5.11
192	\value Unicode_11_0 Version 11.0 Since Qt 5.15
193	\value Unicode_12_0 Version 12.0 Since Qt 5.15
194	\value Unicode_12_1 Version 12.1 Since Qt 5.15
195	\value Unicode_13_0 Version 13.0 Since Qt 5.15
196	\value Unicode_Unassigned The value is not assigned to any character
197	in version 13.0 of Unicode.
198
199	\sa unicodeVersion(), currentUnicodeVersion()
200	*/
201
202	/*!
203	\enum QChar::Category
204
205	This enum maps the Unicode character categories.
206
207	The following characters are normative in Unicode:
208
209	\value Mark_NonSpacing Unicode class name Mn
210
211	\value Mark_SpacingCombining Unicode class name Mc
212
213	\value Mark_Enclosing Unicode class name Me
214
215	\value Number_DecimalDigit Unicode class name Nd
216
217	\value Number_Letter Unicode class name Nl
218
219	\value Number_Other Unicode class name No
220
221	\value Separator_Space Unicode class name Zs
222
223	\value Separator_Line Unicode class name Zl
224
225	\value Separator_Paragraph Unicode class name Zp
226
227	\value Other_Control Unicode class name Cc
228
229	\value Other_Format Unicode class name Cf
230
231	\value Other_Surrogate Unicode class name Cs
232
233	\value Other_PrivateUse Unicode class name Co
234
235	\value Other_NotAssigned Unicode class name Cn
236
237
238	The following categories are informative in Unicode:
239
240	\value Letter_Uppercase Unicode class name Lu
241
242	\value Letter_Lowercase Unicode class name Ll
243
244	\value Letter_Titlecase Unicode class name Lt
245
246	\value Letter_Modifier Unicode class name Lm
247
248	\value Letter_Other Unicode class name Lo
249
250	\value Punctuation_Connector Unicode class name Pc
251
252	\value Punctuation_Dash Unicode class name Pd
253
254	\value Punctuation_Open Unicode class name Ps
255
256	\value Punctuation_Close Unicode class name Pe
257
258	\value Punctuation_InitialQuote Unicode class name Pi
259
260	\value Punctuation_FinalQuote Unicode class name Pf
261
262	\value Punctuation_Other Unicode class name Po
263
264	\value Symbol_Math Unicode class name Sm
265
266	\value Symbol_Currency Unicode class name Sc
267
268	\value Symbol_Modifier Unicode class name Sk
269
270	\value Symbol_Other Unicode class name So
271
272	\sa category()
273	*/
274
275	/*!
276	\enum QChar::Script
277	\since 5.1
278
279	This enum type defines the Unicode script property values.
280
281	For details about the Unicode script property values see
282	\l{https://www.unicode.org/reports/tr24/}{Unicode Standard Annex #24}.
283
284	In order to conform to C/C++ naming conventions "Script_" is prepended
285	to the codes used in the Unicode Standard.
286
287	\value Script_Unknown For unassigned, private-use, noncharacter, and surrogate code points.
288	\value Script_Inherited For characters that may be used with multiple scripts
289	and that inherit their script from the preceding characters.
290	These include nonspacing marks, enclosing marks,
291	and zero width joiner/non-joiner characters.
292	\value Script_Common For characters that may be used with multiple scripts
293	and that do not inherit their script from the preceding characters.
294
295	\value Script_Adlam Since Qt 5.11
296	\value Script_Ahom Since Qt 5.6
297	\value Script_AnatolianHieroglyphs Since Qt 5.6
298	\value Script_Arabic
299	\value Script_Armenian
300	\value Script_Avestan
301	\value Script_Balinese
302	\value Script_Bamum
303	\value Script_BassaVah Since Qt 5.5
304	\value Script_Batak
305	\value Script_Bengali
306	\value Script_Bhaiksuki Since Qt 5.11
307	\value Script_Bopomofo
308	\value Script_Brahmi
309	\value Script_Braille
310	\value Script_Buginese
311	\value Script_Buhid
312	\value Script_CanadianAboriginal
313	\value Script_Carian
314	\value Script_CaucasianAlbanian Since Qt 5.5
315	\value Script_Chakma
316	\value Script_Cham
317	\value Script_Cherokee
318	\value Script_Chorasmian Since Qt 5.15
319	\value Script_Coptic
320	\value Script_Cuneiform
321	\value Script_Cypriot
322	\value Script_Cyrillic
323	\value Script_Deseret
324	\value Script_Devanagari
325	\value Script_DivesAkuru Since Qt 5.15
326	\value Script_Dogra Since Qt 5.15
327	\value Script_Duployan Since Qt 5.5
328	\value Script_EgyptianHieroglyphs
329	\value Script_Elbasan Since Qt 5.5
330	\value Script_Elymaic Since Qt 5.15
331	\value Script_Ethiopic
332	\value Script_Georgian
333	\value Script_Glagolitic
334	\value Script_Gothic
335	\value Script_Grantha Since Qt 5.5
336	\value Script_Greek
337	\value Script_Gujarati
338	\value Script_GunjalaGondi Since Qt 5.15
339	\value Script_Gurmukhi
340	\value Script_Han
341	\value Script_Hangul
342	\value Script_HanifiRohingya Since Qt 5.15
343	\value Script_Hanunoo
344	\value Script_Hatran Since Qt 5.6
345	\value Script_Hebrew
346	\value Script_Hiragana
347	\value Script_ImperialAramaic
348	\value Script_InscriptionalPahlavi
349	\value Script_InscriptionalParthian
350	\value Script_Javanese
351	\value Script_Kaithi
352	\value Script_Kannada
353	\value Script_Katakana
354	\value Script_KayahLi
355	\value Script_Kharoshthi
356	\value Script_KhitanSmallScript Since Qt 5.15
357	\value Script_Khmer
358	\value Script_Khojki Since Qt 5.5
359	\value Script_Khudawadi Since Qt 5.5
360	\value Script_Lao
361	\value Script_Latin
362	\value Script_Lepcha
363	\value Script_Limbu
364	\value Script_LinearA Since Qt 5.5
365	\value Script_LinearB
366	\value Script_Lisu
367	\value Script_Lycian
368	\value Script_Lydian
369	\value Script_Mahajani Since Qt 5.5
370	\value Script_Makasar Since Qt 5.15
371	\value Script_Malayalam
372	\value Script_Mandaic
373	\value Script_Manichaean Since Qt 5.5
374	\value Script_Marchen Since Qt 5.11
375	\value Script_MasaramGondi Since Qt 5.11
376	\value Script_Medefaidrin Since Qt 5.15
377	\value Script_MeeteiMayek
378	\value Script_MendeKikakui Since Qt 5.5
379	\value Script_MeroiticCursive
380	\value Script_MeroiticHieroglyphs
381	\value Script_Miao
382	\value Script_Modi Since Qt 5.5
383	\value Script_Mongolian
384	\value Script_Mro Since Qt 5.5
385	\value Script_Multani Since Qt 5.6
386	\value Script_Myanmar
387	\value Script_Nabataean Since Qt 5.5
388	\value Script_Nandinagari Since Qt 5.15
389	\value Script_Newa Since Qt 5.11
390	\value Script_NewTaiLue
391	\value Script_Nko
392	\value Script_Nushu Since Qt 5.11
393	\value Script_NyiakengPuachueHmong Since Qt 5.15
394	\value Script_Ogham
395	\value Script_OlChiki
396	\value Script_OldHungarian Since Qt 5.6
397	\value Script_OldItalic
398	\value Script_OldNorthArabian Since Qt 5.5
399	\value Script_OldPermic Since Qt 5.5
400	\value Script_OldPersian
401	\value Script_OldSogdian Since Qt 5.15
402	\value Script_OldSouthArabian
403	\value Script_OldTurkic
404	\value Script_Oriya
405	\value Script_Osage Since Qt 5.11
406	\value Script_Osmanya
407	\value Script_PahawhHmong Since Qt 5.5
408	\value Script_Palmyrene Since Qt 5.5
409	\value Script_PauCinHau Since Qt 5.5
410	\value Script_PhagsPa
411	\value Script_Phoenician
412	\value Script_PsalterPahlavi Since Qt 5.5
413	\value Script_Rejang
414	\value Script_Runic
415	\value Script_Samaritan
416	\value Script_Saurashtra
417	\value Script_Sharada
418	\value Script_Shavian
419	\value Script_Siddham Since Qt 5.5
420	\value Script_SignWriting Since Qt 5.6
421	\value Script_Sinhala
422	\value Script_Sogdian Since Qt 5.15
423	\value Script_SoraSompeng
424	\value Script_Soyombo Since Qt 5.11
425	\value Script_Sundanese
426	\value Script_SylotiNagri
427	\value Script_Syriac
428	\value Script_Tagalog
429	\value Script_Tagbanwa
430	\value Script_TaiLe
431	\value Script_TaiTham
432	\value Script_TaiViet
433	\value Script_Takri
434	\value Script_Tamil
435	\value Script_Tangut Since Qt 5.11
436	\value Script_Telugu
437	\value Script_Thaana
438	\value Script_Thai
439	\value Script_Tibetan
440	\value Script_Tifinagh
441	\value Script_Tirhuta Since Qt 5.5
442	\value Script_Ugaritic
443	\value Script_Vai
444	\value Script_Wancho Since Qt 5.15
445	\value Script_WarangCiti Since Qt 5.5
446	\value Script_Yezidi Since Qt 5.15
447	\value Script_Yi
448	\value Script_ZanabazarSquare Since Qt 5.11
449
450	\omitvalue ScriptCount
451
452	\sa script()
453	*/
454
455	/*!
456	\enum QChar::Direction
457
458	This enum type defines the Unicode direction attributes. See the
459	\l{https://www.unicode.org/reports/tr9/tr9-35.html#Table_Bidirectional_Character_Types}{Unicode
460	Standard} for a description of the values.
461
462	In order to conform to C/C++ naming conventions "Dir" is prepended
463	to the codes used in the Unicode Standard.
464
465	\value DirAL
466	\value DirAN
467	\value DirB
468	\value DirBN
469	\value DirCS
470	\value DirEN
471	\value DirES
472	\value DirET
473	\value DirFSI Since Qt 5.3
474	\value DirL
475	\value DirLRE
476	\value DirLRI Since Qt 5.3
477	\value DirLRO
478	\value DirNSM
479	\value DirON
480	\value DirPDF
481	\value DirPDI Since Qt 5.3
482	\value DirR
483	\value DirRLE
484	\value DirRLI Since Qt 5.3
485	\value DirRLO
486	\value DirS
487	\value DirWS
488
489	\sa direction()
490	*/
491
492	/*!
493	\enum QChar::Decomposition
494
495	This enum type defines the Unicode decomposition attributes. See
496	the \l{Unicode Standard} for a description of the values.
497
498	\value NoDecomposition
499	\value Canonical
500	\value Circle
501	\value Compat
502	\value Final
503	\value Font
504	\value Fraction
505	\value Initial
506	\value Isolated
507	\value Medial
508	\value Narrow
509	\value NoBreak
510	\value Small
511	\value Square
512	\value Sub
513	\value Super
514	\value Vertical
515	\value Wide
516
517	\sa decomposition()
518	*/
519
520	/*!
521	\enum QChar::JoiningType
522	\since 5.3
523
524	This enum type defines the Unicode joining type attributes. See the
525	\l{Unicode Standard} for a description of the values.
526
527	In order to conform to C/C++ naming conventions "Joining_" is prepended
528	to the codes used in the Unicode Standard.
529
530	\value Joining_None
531	\value Joining_Causing
532	\value Joining_Dual
533	\value Joining_Right
534	\value Joining_Left
535	\value Joining_Transparent
536
537	\sa joiningType()
538	*/
539
540	#if QT_DEPRECATED_SINCE(5, 3)
541	/*!
542	\enum QChar::Joining
543	\deprecated in 5.3, use JoiningType instead.
544
545	This enum type defines the Unicode joining attributes. See the
546	\l{http://www.unicode.org/}{Unicode Standard} for a description
547	of the values.
548
549	\value Center
550	\value Dual
551	\value OtherJoining
552	\value Right
553
554	\sa joining()
555	*/
556	#endif
557
558	/*!
559	\enum QChar::CombiningClass
560
561	\internal
562
563	This enum type defines names for some of the Unicode combining
564	classes. See the \l{Unicode Standard} for a description of the values.
565
566	\value Combining_Above
567	\value Combining_AboveAttached
568	\value Combining_AboveLeft
569	\value Combining_AboveLeftAttached
570	\value Combining_AboveRight
571	\value Combining_AboveRightAttached
572	\value Combining_Below
573	\value Combining_BelowAttached
574	\value Combining_BelowLeft
575	\value Combining_BelowLeftAttached
576	\value Combining_BelowRight
577	\value Combining_BelowRightAttached
578	\value Combining_DoubleAbove
579	\value Combining_DoubleBelow
580	\value Combining_IotaSubscript
581	\value Combining_Left
582	\value Combining_LeftAttached
583	\value Combining_Right
584	\value Combining_RightAttached
585	*/
586
587	/*!
588	\enum QChar::SpecialCharacter
589
590	\value Null A QChar with this value isNull().
591	\value Tabulation Character tabulation.
592	\value LineFeed
593	\value FormFeed
594	\value CarriageReturn
595	\value Space
596	\value Nbsp Non-breaking space.
597	\value SoftHyphen
598	\value ReplacementCharacter The character shown when a font has no glyph
599	for a certain codepoint. A special question mark character is often
600	used. Codecs use this codepoint when input data cannot be
601	represented in Unicode.
602	\value ObjectReplacementCharacter Used to represent an object such as an
603	image when such objects cannot be presented.
604	\value ByteOrderMark
605	\value ByteOrderSwapped
606	\value ParagraphSeparator
607	\value LineSeparator
608	\value LastValidCodePoint
609	*/
610
611	/*!
612	\fn void QChar::setCell(uchar cell)
613	\internal
614	*/
615
616	/*!
617	\fn void QChar::setRow(uchar row)
618	\internal
619	*/
620
621	/*!
622	\fn QChar::QChar()
623
624	Constructs a null QChar ('\\0').
625
626	\sa isNull()
627	*/
628
629	/*!
630	\fn QChar::QChar(QLatin1Char ch)
631
632	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
633	*/
634
635	/*!
636	\fn QChar::QChar(SpecialCharacter ch)
637
638	Constructs a QChar for the predefined character value \a ch.
639	*/
640
641	/*!
642	\fn QChar::QChar(char16_t ch)
643	\since 5.10
644
645	Constructs a QChar corresponding to the UTF-16 character \a ch.
646	*/
647
648	/*!
649	\fn QChar::QChar(wchar_t ch)
650	\since 5.10
651
652	Constructs a QChar corresponding to the wide character \a ch.
653
654	\note This constructor is only available on Windows.
655	*/
656
657	/*!
658	\fn QChar::QChar(char ch)
659
660	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
661
662	\note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
663	is defined.
664
665	\sa QT_NO_CAST_FROM_ASCII
666	*/
667
668	/*!
669	\fn QChar::QChar(uchar ch)
670
671	Constructs a QChar corresponding to ASCII/Latin-1 character \a ch.
672
673	\note This constructor is not available when \c QT_NO_CAST_FROM_ASCII
674	or \c QT_RESTRICTED_CAST_FROM_ASCII is defined.
675
676	\sa QT_NO_CAST_FROM_ASCII, QT_RESTRICTED_CAST_FROM_ASCII
677	*/
678
679	/*!
680	\fn QChar::QChar(uchar cell, uchar row)
681
682	Constructs a QChar for Unicode cell \a cell in row \a row.
683
684	\sa cell(), row()
685	*/
686
687	/*!
688	\fn QChar::QChar(ushort code)
689
690	Constructs a QChar for the character with Unicode code point \a code.
691	*/
692
693	/*!
694	\fn QChar::QChar(short code)
695
696	Constructs a QChar for the character with Unicode code point \a code.
697	*/
698
699	/*!
700	\fn QChar::QChar(uint code)
701
702	Constructs a QChar for the character with Unicode code point \a code.
703	*/
704
705	/*!
706	\fn QChar::QChar(int code)
707
708	Constructs a QChar for the character with Unicode code point \a code.
709	*/
710
711	/*!
712	\fn bool QChar::isNull() const
713
714	Returns \c true if the character is the Unicode character 0x0000
715	('\\0'); otherwise returns \c false.
716	*/
717
718	/*!
719	\fn uchar QChar::cell() const
720
721	Returns the cell (least significant byte) of the Unicode character.
722
723	\sa row()
724	*/
725
726	/*!
727	\fn uchar QChar::row() const
728
729	Returns the row (most significant byte) of the Unicode character.
730
731	\sa cell()
732	*/
733
734	/*!
735	\fn bool QChar::isPrint() const
736
737	Returns \c true if the character is a printable character; otherwise
738	returns \c false. This is any character not of category Other_*.
739
740	Note that this gives no indication of whether the character is
741	available in a particular font.
742	*/
743
744	/!*
745	\overload
746	\since 5.0
747
748	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
749	a printable character; otherwise returns \c false.
750	This is any character not of category Other_.*
751
752	Note that this gives no indication of whether the character is
753	available in a particular font.
754	*/
755	bool QChar::isPrint(uint ucs4) noexcept
756	{
757	if (ucs4 > LastValidCodePoint)
758	return false;
759	const int test = FLAG(Other_Control) \|
760	FLAG(Other_Format) \|
761	FLAG(Other_Surrogate) \|
762	FLAG(Other_PrivateUse) \|
763	FLAG(Other_NotAssigned);
764	return !(FLAG(qGetProp(ucs4)->category) & test);
765	}
766
767	/*!
768	\fn bool QChar::isSpace() const
769
770	Returns \c true if the character is a separator character
771	(Separator_* categories or certain code points from Other_Control category);
772	otherwise returns \c false.
773	*/
774
775	/*!
776	\fn bool QChar::isSpace(uint ucs4)
777	\overload
778	\since 5.0
779
780	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
781	a separator character (Separator_* categories or certain code points
782	from Other_Control category); otherwise returns \c false.
783	*/
784
785	/!*
786	\internal
787	*/
788	bool QT_FASTCALL QChar::isSpace_helper(uint ucs4) noexcept
789	{
790	if (ucs4 > LastValidCodePoint)
791	return false;
792	const int test = FLAG(Separator_Space) \|
793	FLAG(Separator_Line) \|
794	FLAG(Separator_Paragraph);
795	return FLAG(qGetProp(ucs4)->category) & test;
796	}
797
798	/*!
799	\fn bool QChar::isMark() const
800
801	Returns \c true if the character is a mark (Mark_* categories);
802	otherwise returns \c false.
803
804	See QChar::Category for more information regarding marks.
805	*/
806
807	/!*
808	\overload
809	\since 5.0
810
811	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
812	a mark (Mark_ categories); otherwise returns \c false.*
813	*/
814	bool QChar::isMark(uint ucs4) noexcept
815	{
816	if (ucs4 > LastValidCodePoint)
817	return false;
818	const int test = FLAG(Mark_NonSpacing) \|
819	FLAG(Mark_SpacingCombining) \|
820	FLAG(Mark_Enclosing);
821	return FLAG(qGetProp(ucs4)->category) & test;
822	}
823
824	/*!
825	\fn bool QChar::isPunct() const
826
827	Returns \c true if the character is a punctuation mark (Punctuation_*
828	categories); otherwise returns \c false.
829	*/
830
831	/!*
832	\overload
833	\since 5.0
834
835	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
836	a punctuation mark (Punctuation_ categories); otherwise returns \c false.*
837	*/
838	bool QChar::isPunct(uint ucs4) noexcept
839	{
840	if (ucs4 > LastValidCodePoint)
841	return false;
842	const int test = FLAG(Punctuation_Connector) \|
843	FLAG(Punctuation_Dash) \|
844	FLAG(Punctuation_Open) \|
845	FLAG(Punctuation_Close) \|
846	FLAG(Punctuation_InitialQuote) \|
847	FLAG(Punctuation_FinalQuote) \|
848	FLAG(Punctuation_Other);
849	return FLAG(qGetProp(ucs4)->category) & test;
850	}
851
852	/*!
853	\fn bool QChar::isSymbol() const
854
855	Returns \c true if the character is a symbol (Symbol_* categories);
856	otherwise returns \c false.
857	*/
858
859	/!*
860	\overload
861	\since 5.0
862
863	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
864	a symbol (Symbol_ categories); otherwise returns \c false.*
865	*/
866	bool QChar::isSymbol(uint ucs4) noexcept
867	{
868	if (ucs4 > LastValidCodePoint)
869	return false;
870	const int test = FLAG(Symbol_Math) \|
871	FLAG(Symbol_Currency) \|
872	FLAG(Symbol_Modifier) \|
873	FLAG(Symbol_Other);
874	return FLAG(qGetProp(ucs4)->category) & test;
875	}
876
877	/*!
878	\fn bool QChar::isLetter() const
879
880	Returns \c true if the character is a letter (Letter_* categories);
881	otherwise returns \c false.
882	*/
883
884	/*!
885	\fn bool QChar::isLetter(uint ucs4)
886	\overload
887	\since 5.0
888
889	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
890	a letter (Letter_* categories); otherwise returns \c false.
891	*/
892
893	/!*
894	\internal
895	*/
896	bool QT_FASTCALL QChar::isLetter_helper(uint ucs4) noexcept
897	{
898	if (ucs4 > LastValidCodePoint)
899	return false;
900	const int test = FLAG(Letter_Uppercase) \|
901	FLAG(Letter_Lowercase) \|
902	FLAG(Letter_Titlecase) \|
903	FLAG(Letter_Modifier) \|
904	FLAG(Letter_Other);
905	return FLAG(qGetProp(ucs4)->category) & test;
906	}
907
908	/*!
909	\fn bool QChar::isNumber() const
910
911	Returns \c true if the character is a number (Number_* categories,
912	not just 0-9); otherwise returns \c false.
913
914	\sa isDigit()
915	*/
916
917	/*!
918	\fn bool QChar::isNumber(uint ucs4)
919	\overload
920	\since 5.0
921
922	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
923	a number (Number_* categories, not just 0-9); otherwise returns \c false.
924
925	\sa isDigit()
926	*/
927
928	/!*
929	\internal
930	*/
931	bool QT_FASTCALL QChar::isNumber_helper(uint ucs4) noexcept
932	{
933	if (ucs4 > LastValidCodePoint)
934	return false;
935	const int test = FLAG(Number_DecimalDigit) \|
936	FLAG(Number_Letter) \|
937	FLAG(Number_Other);
938	return FLAG(qGetProp(ucs4)->category) & test;
939	}
940
941	/*!
942	\fn bool QChar::isLetterOrNumber() const
943
944	Returns \c true if the character is a letter or number (Letter_* or
945	Number_* categories); otherwise returns \c false.
946	*/
947
948	/*!
949	\fn bool QChar::isLetterOrNumber(uint ucs4)
950	\overload
951	\since 5.0
952
953	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
954	a letter or number (Letter_* or Number_* categories); otherwise returns \c false.
955	*/
956
957	/!*
958	\internal
959	*/
960	bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) noexcept
961	{
962	if (ucs4 > LastValidCodePoint)
963	return false;
964	const int test = FLAG(Letter_Uppercase) \|
965	FLAG(Letter_Lowercase) \|
966	FLAG(Letter_Titlecase) \|
967	FLAG(Letter_Modifier) \|
968	FLAG(Letter_Other) \|
969	FLAG(Number_DecimalDigit) \|
970	FLAG(Number_Letter) \|
971	FLAG(Number_Other);
972	return FLAG(qGetProp(ucs4)->category) & test;
973	}
974
975	/*!
976	\fn bool QChar::isDigit() const
977
978	Returns \c true if the character is a decimal digit
979	(Number_DecimalDigit); otherwise returns \c false.
980
981	\sa isNumber()
982	*/
983
984	/*!
985	\fn bool QChar::isDigit(uint ucs4)
986	\overload
987	\since 5.0
988
989	Returns \c true if the UCS-4-encoded character specified by \a ucs4 is
990	a decimal digit (Number_DecimalDigit); otherwise returns \c false.
991
992	\sa isNumber()
993	*/
994
995	/*!
996	\fn bool QChar::isNonCharacter() const
997	\since 5.0
998
999	Returns \c true if the QChar is a non-character; false otherwise.
1000
1001	Unicode has a certain number of code points that are classified
1002	as "non-characters:" that is, they can be used for internal purposes
1003	in applications but cannot be used for text interchange.
1004	Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1005	[0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1006	*/
1007
1008	/*!
1009	\fn bool QChar::isHighSurrogate() const
1010
1011	Returns \c true if the QChar is the high part of a UTF16 surrogate
1012	(for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1013	*/
1014
1015	/*!
1016	\fn bool QChar::isLowSurrogate() const
1017
1018	Returns \c true if the QChar is the low part of a UTF16 surrogate
1019	(for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1020	*/
1021
1022	/*!
1023	\fn bool QChar::isSurrogate() const
1024	\since 5.0
1025
1026	Returns \c true if the QChar contains a code point that is in either
1027	the high or the low part of the UTF-16 surrogate range
1028	(for example if its code point is in range [0xd800..0xdfff]); false otherwise.
1029	*/
1030
1031	/*!
1032	\fn static bool QChar::isNonCharacter(uint ucs4)
1033	\overload
1034	\since 5.0
1035
1036	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1037	is a non-character; false otherwise.
1038
1039	Unicode has a certain number of code points that are classified
1040	as "non-characters:" that is, they can be used for internal purposes
1041	in applications but cannot be used for text interchange.
1042	Those are the last two entries of each Unicode Plane ([0xfffe..0xffff],
1043	[0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef].
1044	*/
1045
1046	/*!
1047	\fn static bool QChar::isHighSurrogate(uint ucs4)
1048	\overload
1049
1050	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1051	is the high part of a UTF16 surrogate
1052	(for example if its code point is in range [0xd800..0xdbff]); false otherwise.
1053	*/
1054
1055	/*!
1056	\fn static bool QChar::isLowSurrogate(uint ucs4)
1057	\overload
1058
1059	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1060	is the low part of a UTF16 surrogate
1061	(for example if its code point is in range [0xdc00..0xdfff]); false otherwise.
1062	*/
1063
1064	/*!
1065	\fn static bool QChar::isSurrogate(uint ucs4)
1066	\overload
1067	\since 5.0
1068
1069	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1070	contains a code point that is in either the high or the low part of the
1071	UTF-16 surrogate range (for example if its code point is in range [0xd800..0xdfff]);
1072	false otherwise.
1073	*/
1074
/*!
    \fn static bool QChar::requiresSurrogates(uint ucs4)

    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    can be split into the high and low parts of a UTF-16 surrogate pair
    (that is, if its code point is greater than or equal to 0x10000);
    false otherwise.
*/
1083
/*!
    \fn static uint QChar::surrogateToUcs4(ushort high, ushort low)

    Converts a UTF-16 surrogate pair with the given \a high and \a low values
    to its UCS-4-encoded code point.
*/
1090
/*!
    \fn static uint QChar::surrogateToUcs4(QChar high, QChar low)
    \overload

    Converts a UTF-16 surrogate pair (\a high, \a low) to its UCS-4-encoded code point.
*/
1097
/*!
    \fn static ushort QChar::highSurrogate(uint ucs4)

    Returns the high surrogate part of a UCS-4-encoded code point.
    The returned result is undefined if \a ucs4 is smaller than 0x10000.
*/
1104
/*!
    \fn static ushort QChar::lowSurrogate(uint ucs4)

    Returns the low surrogate part of a UCS-4-encoded code point.
    The returned result is undefined if \a ucs4 is smaller than 0x10000.
*/
1111
/*!
    \fn int QChar::digitValue() const

    Returns the numeric value of the digit, or -1 if the character is not a digit.
*/
1117
1118	/!*
1119	\overload
1120	Returns the numeric value of the digit specified by the UCS-4-encoded
1121	character, \a ucs4, or -1 if the character is not a digit.
1122	*/
1123	int QChar::digitValue(uint ucs4) noexcept
1124	{
1125	if (ucs4 > LastValidCodePoint)
1126	return -`1`;
1127	return qGetProp(ucs4)->digitValue;
1128	}
1129
/*!
    \fn QChar::Category QChar::category() const

    Returns the character's category.
*/
1135
1136	/!*
1137	\overload
1138	Returns the category of the UCS-4-encoded character specified by \a ucs4.
1139	*/
1140	QChar::Category QChar::category(uint ucs4) noexcept
1141	{
1142	if (ucs4 > LastValidCodePoint)
1143	return QChar::Other_NotAssigned;
1144	return (QChar::Category) qGetProp(ucs4)->category;
1145	}
1146
/*!
    \fn QChar::Direction QChar::direction() const

    Returns the character's direction.
*/
1152
1153	/!*
1154	\overload
1155	Returns the direction of the UCS-4-encoded character specified by \a ucs4.
1156	*/
1157	QChar::Direction QChar::direction(uint ucs4) noexcept
1158	{
1159	if (ucs4 > LastValidCodePoint)
1160	return QChar::DirL;
1161	return (QChar::Direction) qGetProp(ucs4)->direction;
1162	}
1163
/*!
    \fn QChar::JoiningType QChar::joiningType() const
    \since 5.3

    Returns information about the joining type attributes of the character
    (needed for certain languages such as Arabic or Syriac).
*/
1171
1172	/!*
1173	\overload
1174	\since 5.3
1175
1176	Returns information about the joining type attributes of the UCS-4-encoded
1177	character specified by \a ucs4
1178	(needed for certain languages such as Arabic or Syriac).
1179	*/
1180	QChar::JoiningType QChar::joiningType(uint ucs4) noexcept
1181	{
1182	if (ucs4 > LastValidCodePoint)
1183	return QChar::Joining_None;
1184	return QChar::JoiningType(qGetProp(ucs4)->joining);
1185	}
1186
1187	#if QT_DEPRECATED_SINCE(5, 3)
/*!
    \fn QChar::Joining QChar::joining() const
    \deprecated in 5.3, use joiningType() instead.

    Returns information about the joining properties of the character
    (needed for certain languages such as Arabic).
*/
1195
1196	/!*
1197	\overload
1198	\deprecated in 5.3, use joiningType() instead.
1199
1200	Returns information about the joining properties of the UCS-4-encoded
1201	character specified by \a ucs4 (needed for certain languages such as Arabic).
1202	*/
1203	QChar::Joining QChar::joining(uint ucs4) noexcept
1204	{
1205	if (ucs4 > LastValidCodePoint)
1206	return QChar::OtherJoining;
1207	switch (qGetProp(ucs4)->joining) {
1208	case QChar::Joining_Causing: return QChar::Center;
1209	case QChar::Joining_Dual: return QChar::Dual;
1210	case QChar::Joining_Right: return QChar::Right;
1211	default: break;
1212	}
1213	return QChar::OtherJoining;
1214	}
1215	#endif
1216
/*!
    \fn bool QChar::hasMirrored() const

    Returns \c true if the character should be reversed if the text
    direction is reversed; otherwise returns \c false.

    A bit faster equivalent of (ch.mirroredChar() != ch).

    \sa mirroredChar()
*/
1227
1228	/!*
1229	\overload
1230	\since 5.0
1231
1232	Returns \c true if the UCS-4-encoded character specified by \a ucs4
1233	should be reversed if the text direction is reversed; otherwise returns \c false.
1234
1235	A bit faster equivalent of (QChar::mirroredChar(ucs4) != ucs4).
1236
1237	\sa mirroredChar()
1238	*/
1239	bool QChar::hasMirrored(uint ucs4) noexcept
1240	{
1241	if (ucs4 > LastValidCodePoint)
1242	return false;
1243	return qGetProp(ucs4)->mirrorDiff != `0`;
1244	}
1245
/*!
    \fn bool QChar::isLower() const

    Returns \c true if the character is a lowercase letter, that is,
    category() is Letter_Lowercase.

    \sa isUpper(), toLower(), toUpper()
*/
1254
/*!
    \fn static bool QChar::isLower(uint ucs4)
    \overload
    \since 5.0

    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    is a lowercase letter, that is, category() is Letter_Lowercase.

    \sa isUpper(), toLower(), toUpper()
*/
1265
/*!
    \fn bool QChar::isUpper() const

    Returns \c true if the character is an uppercase letter, that is,
    category() is Letter_Uppercase.

    \sa isLower(), toUpper(), toLower()
*/
1274
/*!
    \fn static bool QChar::isUpper(uint ucs4)
    \overload
    \since 5.0

    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    is an uppercase letter, that is, category() is Letter_Uppercase.

    \sa isLower(), toUpper(), toLower()
*/
1285
/*!
    \fn bool QChar::isTitleCase() const

    Returns \c true if the character is a titlecase letter, that is,
    category() is Letter_Titlecase.

    \sa isLower(), toUpper(), toLower(), toTitleCase()
*/
1294
/*!
    \fn static bool QChar::isTitleCase(uint ucs4)
    \overload
    \since 5.0

    Returns \c true if the UCS-4-encoded character specified by \a ucs4
    is a titlecase letter, that is, category() is Letter_Titlecase.

    \sa isLower(), toUpper(), toLower(), toTitleCase()
*/
/*!
    \fn QChar QChar::mirroredChar() const

    Returns the mirrored character if this character is a mirrored
    character; otherwise returns the character itself.

    \sa hasMirrored()
*/
1313
1314	/!*
1315	\overload
1316	Returns the mirrored character if the UCS-4-encoded character specified
1317	by \a ucs4 is a mirrored character; otherwise returns the character itself.
1318
1319	\sa hasMirrored()
1320	*/
1321	uint QChar::mirroredChar(uint ucs4) noexcept
1322	{
1323	if (ucs4 > LastValidCodePoint)
1324	return ucs4;
1325	return ucs4 + qGetProp(ucs4)->mirrorDiff;
1326	}
1327
1328
// Constants for algorithmic Hangul syllable (de)composition, see UAX #15.
// The *Base values are the first code points of the precomposed syllable
// block (S) and of the leading (L), vowel (V) and trailing (T) jamo ranges;
// TBase is one below the first trailing jamo so that a T index of 0 means
// "no trailing consonant". The *Count values are the sizes of those ranges.
enum {
    Hangul_SBase = 0xac00,
    Hangul_LBase = 0x1100,
    Hangul_VBase = 0x1161,
    Hangul_TBase = 0x11a7,
    Hangul_LCount = 19,
    Hangul_VCount = 21,
    Hangul_TCount = 28,
    Hangul_NCount = Hangul_VCount * Hangul_TCount, // syllables per leading jamo
    Hangul_SCount = Hangul_LCount * Hangul_NCount  // total precomposed syllables
};
1341
1342	// buffer has to have a length of 3. It's needed for Hangul decomposition
1343	static const unsigned short * QT_FASTCALL decompositionHelper
1344	(uint ucs4, int length, int* tag, unsigned* short *buffer)
1345	{
1346	if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount) {
1347	// compute Hangul syllable decomposition as per UAX #15
1348	const uint SIndex = ucs4 - Hangul_SBase;
1349	buffer[`0`] = Hangul_LBase + SIndex / Hangul_NCount; // L
1350	buffer[`1`] = Hangul_VBase + (SIndex % Hangul_NCount) / Hangul_TCount; // V
1351	buffer[`2`] = Hangul_TBase + SIndex % Hangul_TCount; // T
1352	*length = buffer[`2`] == Hangul_TBase ? `2` : `3`;
1353	*tag = QChar::Canonical;
1354	return buffer;
1355	}
1356
1357	const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1358	if (index == `0xffff`) {
1359	*length = `0`;
1360	*tag = QChar::NoDecomposition;
1361	return nullptr;
1362	}
1363
1364	const unsigned short *decomposition = uc_decomposition_map+index;
1365	tag = (decomposition) & `0xff`;
1366	length = (decomposition) >> `8`;
1367	return decomposition+`1`;
1368	}
1369
1370	/!*
1371	Decomposes a character into it's constituent parts. Returns an empty string
1372	if no decomposition exists.
1373	*/
1374	QString QChar::decomposition() const
1375	{
1376	return QChar::decomposition(ucs4: ucs);
1377	}
1378
1379	/!*
1380	\overload
1381	Decomposes the UCS-4-encoded character specified by \a ucs4 into it's
1382	constituent parts. Returns an empty string if no decomposition exists.
1383	*/
1384	QString QChar::decomposition(uint ucs4)
1385	{
1386	unsigned short buffer[`3`];
1387	int length;
1388	int tag;
1389	const unsigned short *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1390	return QString (reinterpret_cast<const QChar *>(d), length);
1391	}
1392
/*!
    \fn QChar::Decomposition QChar::decompositionTag() const

    Returns the tag defining the decomposition of the character. Returns
    QChar::NoDecomposition if no decomposition exists.
*/
1399
1400	/!*
1401	\overload
1402	Returns the tag defining the composition of the UCS-4-encoded character
1403	specified by \a ucs4. Returns QChar::NoDecomposition if no decomposition exists.
1404	*/
1405	QChar::Decomposition QChar::decompositionTag(uint ucs4) noexcept
1406	{
1407	if (ucs4 >= Hangul_SBase && ucs4 < Hangul_SBase + Hangul_SCount)
1408	return QChar::Canonical;
1409	const unsigned short index = GET_DECOMPOSITION_INDEX(ucs4);
1410	if (index == `0xffff`)
1411	return QChar::NoDecomposition;
1412	return (QChar::Decomposition)(uc_decomposition_map[index] & `0xff`);
1413	}
1414
/*!
    \fn unsigned char QChar::combiningClass() const

    Returns the combining class for the character as defined in the
    Unicode standard. This is mainly useful as a positioning hint for
    marks attached to a base character.

    The Qt text rendering engine uses this information to correctly
    position non-spacing marks around a base character.
*/
1425
1426	/!*
1427	\overload
1428	Returns the combining class for the UCS-4-encoded character specified by
1429	\a ucs4, as defined in the Unicode standard.
1430	*/
1431	unsigned char QChar::combiningClass(uint ucs4) noexcept
1432	{
1433	if (ucs4 > LastValidCodePoint)
1434	return `0`;
1435	return (unsigned char) qGetProp(ucs4)->combiningClass;
1436	}
1437
/*!
    \fn QChar::Script QChar::script() const
    \since 5.1

    Returns the Unicode script property value for this character.
*/
1444
1445	/!*
1446	\overload
1447	\since 5.1
1448
1449	Returns the Unicode script property value for the character specified in
1450	its UCS-4-encoded form as \a ucs4.
1451	*/
1452	QChar::Script QChar::script(uint ucs4) noexcept
1453	{
1454	if (ucs4 > LastValidCodePoint)
1455	return QChar::Script_Unknown;
1456	return (QChar::Script) qGetProp(ucs4)->script;
1457	}
1458
/*!
    \fn QChar::UnicodeVersion QChar::unicodeVersion() const

    Returns the Unicode version that introduced this character.
*/
1464
1465	/!*
1466	\overload
1467	Returns the Unicode version that introduced the character specified in
1468	its UCS-4-encoded form as \a ucs4.
1469	*/
1470	QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4) noexcept
1471	{
1472	if (ucs4 > LastValidCodePoint)
1473	return QChar::Unicode_Unassigned;
1474	return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion;
1475	}
1476
1477	/!*
1478	Returns the most recent supported Unicode version.
1479	*/
1480	QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
1481	{
1482	return UNICODE_DATA_VERSION;
1483	}
1484
1485
1486	template <typename T>
1487	Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
1488	{
1489	const auto fold = qGetProp(uc)->cases[which];
1490
1491	if (Q_UNLIKELY(fold.special)) {
1492	const ushort *specialCase = specialCaseMap + fold.diff;
1493	// so far, there are no special cases beyond BMP (guaranteed by the qunicodetables generator)
1494	return *specialCase == `1` ? specialCase[`1`] : uc;
1495	}
1496
1497	return uc + fold.diff;
1498	}
1499
/*!
    \fn QChar QChar::toLower() const

    Returns the lowercase equivalent if the character is uppercase or titlecase;
    otherwise returns the character itself.
*/
1506
1507	/!*
1508	\overload
1509	Returns the lowercase equivalent of the UCS-4-encoded character specified
1510	by \a ucs4 if the character is uppercase or titlecase; otherwise returns
1511	the character itself.
1512	*/
1513	uint QChar::toLower(uint ucs4) noexcept
1514	{
1515	if (ucs4 > LastValidCodePoint)
1516	return ucs4;
1517	return convertCase_helper(uc: ucs4, which: QUnicodeTables::LowerCase);
1518	}
1519
/*!
    \fn QChar QChar::toUpper() const

    Returns the uppercase equivalent if the character is lowercase or titlecase;
    otherwise returns the character itself.
*/
1526
1527	/!*
1528	\overload
1529	Returns the uppercase equivalent of the UCS-4-encoded character specified
1530	by \a ucs4 if the character is lowercase or titlecase; otherwise returns
1531	the character itself.
1532	*/
1533	uint QChar::toUpper(uint ucs4) noexcept
1534	{
1535	if (ucs4 > LastValidCodePoint)
1536	return ucs4;
1537	return convertCase_helper(uc: ucs4, which: QUnicodeTables::UpperCase);
1538	}
1539
/*!
    \fn QChar QChar::toTitleCase() const

    Returns the title case equivalent if the character is lowercase or uppercase;
    otherwise returns the character itself.
*/
1546
1547	/!*
1548	\overload
1549	Returns the title case equivalent of the UCS-4-encoded character specified
1550	by \a ucs4 if the character is lowercase or uppercase; otherwise returns
1551	the character itself.
1552	*/
1553	uint QChar::toTitleCase(uint ucs4) noexcept
1554	{
1555	if (ucs4 > LastValidCodePoint)
1556	return ucs4;
1557	return convertCase_helper(uc: ucs4, which: QUnicodeTables::TitleCase);
1558	}
1559
1560	static inline uint foldCase(const ushort ch, const* ushort *start)
1561	{
1562	uint ucs4 = *ch;
1563	if (QChar::isLowSurrogate(ucs4) && ch > start && QChar::isHighSurrogate(ucs4: *(ch - `1`)))
1564	ucs4 = QChar::surrogateToUcs4(high: *(ch - `1`), low: ucs4);
1565	return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1566	}
1567
1568	static inline uint foldCase(uint ch, uint &last) noexcept
1569	{
1570	uint ucs4 = ch;
1571	if (QChar::isLowSurrogate(ucs4) && QChar::isHighSurrogate(ucs4: last))
1572	ucs4 = QChar::surrogateToUcs4(high: last, low: ucs4);
1573	last = ch;
1574	return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1575	}
1576
1577	static inline ushort foldCase(ushort ch) noexcept
1578	{
1579	return convertCase_helper(uc: ch, which: QUnicodeTables::CaseFold);
1580	}
1581
1582	static inline QChar foldCase(QChar ch) noexcept
1583	{
1584	return QChar (foldCase(ch: ch.unicode()));
1585	}
1586
/*!
    \fn QChar QChar::toCaseFolded() const

    Returns the case folded equivalent of the character.
    For most Unicode characters this is the same as toLower().
*/
1593
1594	/!*
1595	\overload
1596	Returns the case folded equivalent of the UCS-4-encoded character specified
1597	by \a ucs4. For most Unicode characters this is the same as toLower().
1598	*/
1599	uint QChar::toCaseFolded(uint ucs4) noexcept
1600	{
1601	if (ucs4 > LastValidCodePoint)
1602	return ucs4;
1603	return convertCase_helper(uc: ucs4, which: QUnicodeTables::CaseFold);
1604	}
1605
/*!
    \fn char QChar::toLatin1() const

    Returns the Latin-1 character equivalent to the QChar, or 0. This
    is mainly useful for non-internationalized software.

    \note It is not possible to distinguish a non-Latin-1 character from a Latin-1 0
    (NUL) character. Prefer to use unicode(), which does not have this ambiguity.

    \sa unicode()
*/
1617
/*!
    \fn QChar QChar::fromLatin1(char c)

    Converts the Latin-1 character \a c to its equivalent QChar. This
    is mainly useful for non-internationalized software.

    An alternative is to use QLatin1Char.

    \sa toLatin1(), unicode()
*/
1628
/*!
    \fn char QChar::toAscii() const
    \deprecated

    Returns the Latin-1 character value of the QChar, or 0 if the character is not
    representable.

    The main purpose of this function is to preserve ASCII characters used
    in C strings. This is mainly useful for developers of non-internationalized
    software.

    \note It is not possible to distinguish a non-Latin-1 character from an ASCII 0
    (NUL) character. Prefer to use unicode(), which does not have this ambiguity.

    \note This function does not check whether the character value is inside
    the valid range of US-ASCII.

    \sa toLatin1(), unicode()
*/
1648
/*!
    \fn QChar QChar::fromAscii(char c)
    \deprecated

    Converts the ASCII character \a c to its equivalent QChar. This
    is mainly useful for non-internationalized software.

    An alternative is to use QLatin1Char.

    \sa fromLatin1(), unicode()
*/
1660
1661	#ifndef QT_NO_DATASTREAM
1662	/!*
1663	\relates QChar
1664
1665	Writes the char \a chr to the stream \a out.
1666
1667	\sa {Serializing Qt Data Types}
1668	*/
1669	QDataStream &operator<<(QDataStream &out, QChar chr)
1670	{
1671	out << quint16(chr.unicode());
1672	return out;
1673	}
1674
1675	/!*
1676	\relates QChar
1677
1678	Reads a char from the stream \a in into char \a chr.
1679
1680	\sa {Serializing Qt Data Types}
1681	*/
1682	QDataStream &operator>>(QDataStream &in, QChar &chr)
1683	{
1684	quint16 u;
1685	in >> u;
1686	chr.unicode() = ushort(u);
1687	return in;
1688	}
1689	#endif // QT_NO_DATASTREAM
1690
/*!
    \fn ushort & QChar::unicode()

    Returns a reference to the numeric Unicode value of the QChar.
*/
1696
/*!
    \fn ushort QChar::unicode() const

    Returns the numeric Unicode value of the QChar.
*/
1702
1703	/*****************************************************************************
1704	Documentation of QChar related functions
1705	*****************************************************************************/
1706
/*!
    \fn bool operator==(QChar c1, QChar c2)

    \relates QChar

    Returns \c true if \a c1 and \a c2 are the same Unicode character;
    otherwise returns \c false.
*/
1715
/*!
    \fn bool operator!=(QChar c1, QChar c2)

    \relates QChar

    Returns \c true if \a c1 and \a c2 are not the same Unicode
    character; otherwise returns \c false.
*/
1724
/*!
    \fn bool operator<=(QChar c1, QChar c2)

    \relates QChar

    Returns \c true if the numeric Unicode value of \a c1 is less than
    or equal to that of \a c2; otherwise returns \c false.
*/
1733
/*!
    \fn bool operator>=(QChar c1, QChar c2)

    \relates QChar

    Returns \c true if the numeric Unicode value of \a c1 is greater than
    or equal to that of \a c2; otherwise returns \c false.
*/
1742
/*!
    \fn bool operator<(QChar c1, QChar c2)

    \relates QChar

    Returns \c true if the numeric Unicode value of \a c1 is less than
    that of \a c2; otherwise returns \c false.
*/
1751
/*!
    \fn bool operator>(QChar c1, QChar c2)

    \relates QChar

    Returns \c true if the numeric Unicode value of \a c1 is greater than
    that of \a c2; otherwise returns \c false.
*/
1760
1761
1762	// ---------------------------------------------------------------------------
1763
1764
1765	static void decomposeHelper(QString str, bool* canonical, QChar::UnicodeVersion version, int from)
1766	{
1767	int length;
1768	int tag;
1769	unsigned short buffer[`3`];
1770
1771	QString &s = *str;
1772
1773	const unsigned short utf16 = reinterpret_cast<unsigned* short *>(s.data());
1774	const unsigned short *uc = utf16 + s.length();
1775	while (uc != utf16 + from) {
1776	uint ucs4 = *(--uc);
1777	if (QChar (ucs4).isLowSurrogate() && uc != utf16) {
1778	ushort high = *(uc - `1`);
1779	if (QChar (high).isHighSurrogate()) {
1780	--uc;
1781	ucs4 = QChar::surrogateToUcs4(high, low: ucs4);
1782	}
1783	}
1784
1785	if (QChar::unicodeVersion(ucs4) > version)
1786	continue;
1787
1788	const unsigned short *d = decompositionHelper(ucs4, length: &length, tag: &tag, buffer);
1789	if (!d \|\| (canonical && tag != QChar::Canonical))
1790	continue;
1791
1792	int pos = uc - utf16;
1793	s.replace(i: pos, len: QChar::requiresSurrogates(ucs4) ? `2` : `1`, s: reinterpret_cast<const QChar *>(d), slen: length);
1794	// since the replace invalidates the pointers and we do decomposition recursive
1795	utf16 = reinterpret_cast<unsigned short *>(s.data());
1796	uc = utf16 + pos + length;
1797	}
1798	}
1799
1800
1801	struct UCS2Pair {
1802	ushort u1;
1803	ushort u2;
1804	};
1805
1806	inline bool operator<(const UCS2Pair &ligature1, const UCS2Pair &ligature2)
1807	{ return ligature1.u1 < ligature2.u1; }
1808	inline bool operator<(ushort u1, const UCS2Pair &ligature)
1809	{ return u1 < ligature.u1; }
1810	inline bool operator<(const UCS2Pair &ligature, ushort u1)
1811	{ return ligature.u1 < u1; }
1812
1813	struct UCS2SurrogatePair {
1814	UCS2Pair p1;
1815	UCS2Pair p2;
1816	};
1817
1818	inline bool operator<(const UCS2SurrogatePair &ligature1, const UCS2SurrogatePair &ligature2)
1819	{ return QChar::surrogateToUcs4(high: ligature1.p1.u1, low: ligature1.p1.u2) < QChar::surrogateToUcs4(high: ligature2.p1.u1, low: ligature2.p1.u2); }
1820	inline bool operator<(uint u1, const UCS2SurrogatePair &ligature)
1821	{ return u1 < QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2); }
1822	inline bool operator<(const UCS2SurrogatePair &ligature, uint u1)
1823	{ return QChar::surrogateToUcs4(high: ligature.p1.u1, low: ligature.p1.u2) < u1; }
1824
1825	static uint inline ligatureHelper(uint u1, uint u2)
1826	{
1827	if (u1 >= Hangul_LBase && u1 < Hangul_SBase + Hangul_SCount) {
1828	// compute Hangul syllable composition as per UAX #15
1829	// hangul L-V pair
1830	const uint LIndex = u1 - Hangul_LBase;
1831	if (LIndex < Hangul_LCount) {
1832	const uint VIndex = u2 - Hangul_VBase;
1833	if (VIndex < Hangul_VCount)
1834	return Hangul_SBase + (LIndex * Hangul_VCount + VIndex) * Hangul_TCount;
1835	}
1836	// hangul LV-T pair
1837	const uint SIndex = u1 - Hangul_SBase;
1838	if (SIndex < Hangul_SCount && (SIndex % Hangul_TCount) == `0`) {
1839	const uint TIndex = u2 - Hangul_TBase;
1840	if (TIndex < Hangul_TCount && TIndex)
1841	return u1 + TIndex;
1842	}
1843	}
1844
1845	const unsigned short index = GET_LIGATURE_INDEX(u2);
1846	if (index == `0xffff`)
1847	return `0`;
1848	const unsigned short *ligatures = uc_ligature_map+index;
1849	ushort length = *ligatures++;
1850	if (QChar::requiresSurrogates(ucs4: u1)) {
1851	const UCS2SurrogatePair data = reinterpret_cast<const* UCS2SurrogatePair *>(ligatures);
1852	const UCS2SurrogatePair *r = std::lower_bound(first: data, last: data + length, val: u1);
1853	if (r != data + length && QChar::surrogateToUcs4(high: r->p1.u1, low: r->p1.u2) == u1)
1854	return QChar::surrogateToUcs4(high: r->p2.u1, low: r->p2.u2);
1855	} else {
1856	const UCS2Pair data = reinterpret_cast<const* UCS2Pair *>(ligatures);
1857	const UCS2Pair *r = std::lower_bound(first: data, last: data + length, val: ushort(u1));
1858	if (r != data + length && r->u1 == ushort(u1))
1859	return r->u2;
1860	}
1861
1862	return `0`;
1863	}
1864
1865	static void composeHelper(QString str, QChar::UnicodeVersion version, int* from)
1866	{
1867	QString &s = *str;
1868
1869	if (from < `0` \|\| s.length() - from < `2`)
1870	return;
1871
1872	uint stcode = `0`; // starter code point
1873	int starter = -`1`; // starter position
1874	int next = -`1`; // to prevent i == next
1875	int lastCombining = `255`; // to prevent combining > lastCombining
1876
1877	int pos = from;
1878	while (pos < s.length()) {
1879	int i = pos;
1880	uint uc = s.at(i: pos).unicode();
1881	if (QChar (uc).isHighSurrogate() && pos < s.length()-`1`) {
1882	ushort low = s.at(i: pos+`1`).unicode();
1883	if (QChar (low).isLowSurrogate()) {
1884	uc = QChar::surrogateToUcs4(high: uc, low);
1885	++pos;
1886	}
1887	}
1888
1889	const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
1890	if (p->unicodeVersion > version) {
1891	starter = -`1`;
1892	next = -`1`; // to prevent i == next
1893	lastCombining = `255`; // to prevent combining > lastCombining
1894	++pos;
1895	continue;
1896	}
1897
1898	int combining = p->combiningClass;
1899	if ((i == next \|\| combining > lastCombining) && starter >= from) {
1900	// allowed to form ligature with S
1901	uint ligature = ligatureHelper(u1: stcode, u2: uc);
1902	if (ligature) {
1903	stcode = ligature;
1904	QChar *d = s.data();
1905	// ligatureHelper() never changes planes
1906	if (QChar::requiresSurrogates(ucs4: ligature)) {
1907	d[starter] = QChar (QChar::highSurrogate(ucs4: ligature));
1908	d[starter + `1`] = QChar (QChar::lowSurrogate(ucs4: ligature));
1909	s.remove(i, len: `2`);
1910	} else {
1911	d[starter] = QChar (ligature);
1912	s.remove(i, len: `1`);
1913	}
1914	continue;
1915	}
1916	}
1917	if (combining == `0`) {
1918	starter = i;
1919	stcode = uc;
1920	next = pos + `1`;
1921	}
1922	lastCombining = combining;
1923
1924	++pos;
1925	}
1926	}
1927
1928
1929	static void canonicalOrderHelper(QString str, QChar::UnicodeVersion version, int* from)
1930	{
1931	QString &s = *str;
1932	const int l = s.length()-`1`;
1933
1934	uint u1, u2;
1935	ushort c1, c2;
1936
1937	int pos = from;
1938	while (pos < l) {
1939	int p2 = pos+`1`;
1940	u1 = s.at(i: pos).unicode();
1941	if (QChar (u1).isHighSurrogate()) {
1942	ushort low = s.at(i: p2).unicode();
1943	if (QChar (low).isLowSurrogate()) {
1944	u1 = QChar::surrogateToUcs4(high: u1, low);
1945	if (p2 >= l)
1946	break;
1947	++p2;
1948	}
1949	}
1950	c1 = `0`;
1951
1952	advance:
1953	u2 = s.at(i: p2).unicode();
1954	if (QChar (u2).isHighSurrogate() && p2 < l) {
1955	ushort low = s.at(i: p2+`1`).unicode();
1956	if (QChar (low).isLowSurrogate()) {
1957	u2 = QChar::surrogateToUcs4(high: u2, low);
1958	++p2;
1959	}
1960	}
1961
1962	c2 = `0`;
1963	{
1964	const QUnicodeTables::Properties *p = qGetProp(ucs4: u2);
1965	if (p->unicodeVersion <= version)
1966	c2 = p->combiningClass;
1967	}
1968	if (c2 == `0`) {
1969	pos = p2+`1`;
1970	continue;
1971	}
1972
1973	if (c1 == `0`) {
1974	const QUnicodeTables::Properties *p = qGetProp(ucs4: u1);
1975	if (p->unicodeVersion <= version)
1976	c1 = p->combiningClass;
1977	}
1978
1979	if (c1 > c2) {
1980	QChar *uc = s.data();
1981	int p = pos;
1982	// exchange characters
1983	if (!QChar::requiresSurrogates(ucs4: u2)) {
1984	uc[p++] = QChar (u2);
1985	} else {
1986	uc[p++] = QChar (QChar::highSurrogate(ucs4: u2));
1987	uc[p++] = QChar (QChar::lowSurrogate(ucs4: u2));
1988	}
1989	if (!QChar::requiresSurrogates(ucs4: u1)) {
1990	uc[p++] = QChar (u1);
1991	} else {
1992	uc[p++] = QChar (QChar::highSurrogate(ucs4: u1));
1993	uc[p++] = QChar (QChar::lowSurrogate(ucs4: u1));
1994	}
1995	if (pos > `0`)
1996	--pos;
1997	if (pos > `0` && s.at(i: pos).isLowSurrogate())
1998	--pos;
1999	} else {
2000	++pos;
2001	if (QChar::requiresSurrogates(ucs4: u1))
2002	++pos;
2003
2004	u1 = u2;
2005	c1 = c2; // != 0
2006	p2 = pos + `1`;
2007	if (QChar::requiresSurrogates(ucs4: u1))
2008	++p2;
2009	if (p2 > l)
2010	break;
2011
2012	goto advance;
2013	}
2014	}
2015	}
2016
2017	// returns true if the text is in a desired Normalization Form already; false otherwise.
2018	// sets lastStable to the position of the last stable code point
2019	static bool normalizationQuickCheckHelper(QString str, QString::NormalizationForm mode, int* from, int *lastStable)
2020	{
2021	Q_STATIC_ASSERT(QString::NormalizationForm_D == `0`);
2022	Q_STATIC_ASSERT(QString::NormalizationForm_C == `1`);
2023	Q_STATIC_ASSERT(QString::NormalizationForm_KD == `2`);
2024	Q_STATIC_ASSERT(QString::NormalizationForm_KC == `3`);
2025
2026	enum { NFQC_YES = `0`, NFQC_NO = `1`, NFQC_MAYBE = `3` };
2027
2028	const ushort string = reinterpret_cast<const* ushort *>(str->constData());
2029	int length = str->length();
2030
2031	// this avoids one out of bounds check in the loop
2032	while (length > from && QChar::isHighSurrogate(ucs4: string[length - `1`]))
2033	--length;
2034
2035	uchar lastCombining = `0`;
2036	for (int i = from; i < length; ++i) {
2037	int pos = i;
2038	uint uc = string[i];
2039	if (uc < `0x80`) {
2040	// ASCII characters are stable code points
2041	lastCombining = `0`;
2042	*lastStable = pos;
2043	continue;
2044	}
2045
2046	if (QChar::isHighSurrogate(ucs4: uc)) {
2047	ushort low = string[i + `1`];
2048	if (!QChar::isLowSurrogate(ucs4: low)) {
2049	// treat surrogate like stable code point
2050	lastCombining = `0`;
2051	*lastStable = pos;
2052	continue;
2053	}
2054	++i;
2055	uc = QChar::surrogateToUcs4(high: uc, low);
2056	}
2057
2058	const QUnicodeTables::Properties *p = qGetProp(ucs4: uc);
2059
2060	if (p->combiningClass < lastCombining && p->combiningClass > `0`)
2061	return false;
2062
2063	const uchar check = (p->nfQuickCheck >> (mode << `1`)) & `0x03`;
2064	if (check != NFQC_YES)
2065	return false; // ### can we quick check NFQC_MAYBE ?
2066
2067	lastCombining = p->combiningClass;
2068	if (lastCombining == `0`)
2069	*lastStable = pos;
2070	}
2071
2072	if (length != str->length()) // low surrogate parts at the end of text
2073	*lastStable = str->length() - `1`;
2074
2075	return true;
2076	}
2077
2078	QT_END_NAMESPACE
2079

source code of qtbase/src/corelib/text/qchar.cpp