guniprop.c source code [gtk/subprojects/glib/glib/guniprop.c]

/* guniprop.c - Unicode character properties.
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
19
20	#include "config.h"
21
22	#include <stdlib.h>
23	#include <stddef.h>
24	#include <string.h>
25	#include <locale.h>
26
27	#include "gmem.h"
28	#include "gstring.h"
29	#include "gtestutils.h"
30	#include "gtypes.h"
31	#include "gunicode.h"
32	#include "gunichartables.h"
33	#include "gmirroringtable.h"
34	#include "gscripttable.h"
35	#include "gunicodeprivate.h"
36	#ifdef G_OS_WIN32
37	#include "gwin32.h"
38	#endif
39
/* Fullwidth Latin letters that the code below treats specially:
 * A-F and a-f are accepted as hexadecimal digits, I and J take part in
 * the Turkic and Lithuanian lowercasing rules. */
#define G_UNICHAR_FULLWIDTH_A 0xff21
#define G_UNICHAR_FULLWIDTH_F 0xff26
#define G_UNICHAR_FULLWIDTH_I 0xff29
#define G_UNICHAR_FULLWIDTH_J 0xff2a
#define G_UNICHAR_FULLWIDTH_a 0xff41
#define G_UNICHAR_FULLWIDTH_f 0xff46
46
/* Entry of the attribute page index for a 256-character page: pages up
 * to G_UNICODE_LAST_PAGE_PART1 are looked up in attr_table_part1, all
 * others in attr_table_part2 after rebasing the page number by 0xe00. */
#define ATTR_TABLE(Page) \
  (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
   ? attr_table_part1[Page] \
   : attr_table_part2[(Page) - 0xe00])

/* Attribute value of character Char within page Page; yields 0 when the
 * page entry equals G_UNICODE_MAX_TABLE_INDEX. */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE (Page) == G_UNICODE_MAX_TABLE_INDEX) \
   ? 0 \
   : (attr_data[ATTR_TABLE (Page)][Char]))

/* Type lookup in the first table part.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes one type for the whole page (the
 * entry minus G_UNICODE_MAX_TABLE_INDEX); any other entry selects a row
 * of type_data that is indexed by Char. */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[Page]][Char]))

/* Same lookup as TTYPE_PART1, against the second table part. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))

/* Type of Char: characters up to G_UNICODE_LAST_CHAR_PART1 use the first
 * table part, characters in 0xe0000..G_UNICODE_LAST_CHAR use the second
 * part (rebased by 0xe0000), everything else is G_UNICODE_UNASSIGNED. */
#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))
70
71
/* Non-zero when the bit belonging to Type is set in the bitmask Class. */
#define IS(Type, Class) (((guint)1 << (Type)) & (Class))
/* Bitmask Rest with the bit belonging to Type added. */
#define OR(Type, Rest)  (((guint)1 << (Type)) | (Rest))

/* Any letter type. */
#define ISALPHA(Type) \
  IS ((Type), \
      OR (G_UNICODE_LOWERCASE_LETTER, \
      OR (G_UNICODE_UPPERCASE_LETTER, \
      OR (G_UNICODE_TITLECASE_LETTER, \
      OR (G_UNICODE_MODIFIER_LETTER, \
      OR (G_UNICODE_OTHER_LETTER, 0))))))

/* Any letter type or any number type. */
#define ISALDIGIT(Type) \
  IS ((Type), \
      OR (G_UNICODE_DECIMAL_NUMBER, \
      OR (G_UNICODE_LETTER_NUMBER, \
      OR (G_UNICODE_OTHER_NUMBER, \
      OR (G_UNICODE_LOWERCASE_LETTER, \
      OR (G_UNICODE_UPPERCASE_LETTER, \
      OR (G_UNICODE_TITLECASE_LETTER, \
      OR (G_UNICODE_MODIFIER_LETTER, \
      OR (G_UNICODE_OTHER_LETTER, 0)))))))))

/* Non-spacing, spacing or enclosing mark. */
#define ISMARK(Type) \
  IS ((Type), \
      OR (G_UNICODE_NON_SPACING_MARK, \
      OR (G_UNICODE_SPACING_MARK, \
      OR (G_UNICODE_ENCLOSING_MARK, 0))))

/* Types that g_unichar_iszerowidth() treats as zero-width. */
#define ISZEROWIDTHTYPE(Type) \
  IS ((Type), \
      OR (G_UNICODE_NON_SPACING_MARK, \
      OR (G_UNICODE_ENCLOSING_MARK, \
      OR (G_UNICODE_FORMAT, 0))))
103
104	/**
105	* g_unichar_isalnum:
106	* @c: a Unicode character
107	*
108	* Determines whether a character is alphanumeric.
109	* Given some UTF-8 text, obtain a character value
110	* with g_utf8_get_char().
111	*
112	* Returns: %TRUE if @c is an alphanumeric character
113	**/
114	gboolean
115	g_unichar_isalnum (gunichar c)
116	{
117	return ISALDIGIT (TYPE (c)) ? TRUE : FALSE;
118	}
119
120	/**
121	* g_unichar_isalpha:
122	* @c: a Unicode character
123	*
124	* Determines whether a character is alphabetic (i.e. a letter).
125	* Given some UTF-8 text, obtain a character value with
126	* g_utf8_get_char().
127	*
128	* Returns: %TRUE if @c is an alphabetic character
129	**/
130	gboolean
131	g_unichar_isalpha (gunichar c)
132	{
133	return ISALPHA (TYPE (c)) ? TRUE : FALSE;
134	}
135
136
137	/**
138	* g_unichar_iscntrl:
139	* @c: a Unicode character
140	*
141	* Determines whether a character is a control character.
142	* Given some UTF-8 text, obtain a character value with
143	* g_utf8_get_char().
144	*
145	* Returns: %TRUE if @c is a control character
146	**/
147	gboolean
148	g_unichar_iscntrl (gunichar c)
149	{
150	return TYPE (c) == G_UNICODE_CONTROL;
151	}
152
153	/**
154	* g_unichar_isdigit:
155	* @c: a Unicode character
156	*
157	* Determines whether a character is numeric (i.e. a digit). This
158	* covers ASCII 0-9 and also digits in other languages/scripts. Given
159	* some UTF-8 text, obtain a character value with g_utf8_get_char().
160	*
161	* Returns: %TRUE if @c is a digit
162	**/
163	gboolean
164	g_unichar_isdigit (gunichar c)
165	{
166	return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
167	}
168
169
170	/**
171	* g_unichar_isgraph:
172	* @c: a Unicode character
173	*
174	* Determines whether a character is printable and not a space
175	* (returns %FALSE for control characters, format characters, and
176	* spaces). g_unichar_isprint() is similar, but returns %TRUE for
177	* spaces. Given some UTF-8 text, obtain a character value with
178	* g_utf8_get_char().
179	*
180	* Returns: %TRUE if @c is printable unless it's a space
181	**/
182	gboolean
183	g_unichar_isgraph (gunichar c)
184	{
185	return !IS (TYPE(c),
186	OR (G_UNICODE_CONTROL,
187	OR (G_UNICODE_FORMAT,
188	OR (G_UNICODE_UNASSIGNED,
189	OR (G_UNICODE_SURROGATE,
190	OR (G_UNICODE_SPACE_SEPARATOR,
191	`0`))))));
192	}
193
194	/**
195	* g_unichar_islower:
196	* @c: a Unicode character
197	*
198	* Determines whether a character is a lowercase letter.
199	* Given some UTF-8 text, obtain a character value with
200	* g_utf8_get_char().
201	*
202	* Returns: %TRUE if @c is a lowercase letter
203	**/
204	gboolean
205	g_unichar_islower (gunichar c)
206	{
207	return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
208	}
209
210
211	/**
212	* g_unichar_isprint:
213	* @c: a Unicode character
214	*
215	* Determines whether a character is printable.
216	* Unlike g_unichar_isgraph(), returns %TRUE for spaces.
217	* Given some UTF-8 text, obtain a character value with
218	* g_utf8_get_char().
219	*
220	* Returns: %TRUE if @c is printable
221	**/
222	gboolean
223	g_unichar_isprint (gunichar c)
224	{
225	return !IS (TYPE(c),
226	OR (G_UNICODE_CONTROL,
227	OR (G_UNICODE_FORMAT,
228	OR (G_UNICODE_UNASSIGNED,
229	OR (G_UNICODE_SURROGATE,
230	`0`)))));
231	}
232
233	/**
234	* g_unichar_ispunct:
235	* @c: a Unicode character
236	*
237	* Determines whether a character is punctuation or a symbol.
238	* Given some UTF-8 text, obtain a character value with
239	* g_utf8_get_char().
240	*
241	* Returns: %TRUE if @c is a punctuation or symbol character
242	**/
243	gboolean
244	g_unichar_ispunct (gunichar c)
245	{
246	return IS (TYPE(c),
247	OR (G_UNICODE_CONNECT_PUNCTUATION,
248	OR (G_UNICODE_DASH_PUNCTUATION,
249	OR (G_UNICODE_CLOSE_PUNCTUATION,
250	OR (G_UNICODE_FINAL_PUNCTUATION,
251	OR (G_UNICODE_INITIAL_PUNCTUATION,
252	OR (G_UNICODE_OTHER_PUNCTUATION,
253	OR (G_UNICODE_OPEN_PUNCTUATION,
254	OR (G_UNICODE_CURRENCY_SYMBOL,
255	OR (G_UNICODE_MODIFIER_SYMBOL,
256	OR (G_UNICODE_MATH_SYMBOL,
257	OR (G_UNICODE_OTHER_SYMBOL,
258	`0`)))))))))))) ? TRUE : FALSE;
259	}
260
261	/**
262	* g_unichar_isspace:
263	* @c: a Unicode character
264	*
265	* Determines whether a character is a space, tab, or line separator
266	* (newline, carriage return, etc.). Given some UTF-8 text, obtain a
267	* character value with g_utf8_get_char().
268	*
269	* (Note: don't use this to do word breaking; you have to use
270	* Pango or equivalent to get word breaking right, the algorithm
271	* is fairly complex.)
272	*
273	* Returns: %TRUE if @c is a space character
274	**/
275	gboolean
276	g_unichar_isspace (gunichar c)
277	{
278	switch (c)
279	{
280	/ special-case these since Unicode thinks they are not spaces /
281	case `'\t'`:
282	case `'\n'`:
283	case `'\r'`:
284	case `'\f'`:
285	return TRUE;
286	break;
287
288	default:
289	{
290	return IS (TYPE(c),
291	OR (G_UNICODE_SPACE_SEPARATOR,
292	OR (G_UNICODE_LINE_SEPARATOR,
293	OR (G_UNICODE_PARAGRAPH_SEPARATOR,
294	`0`)))) ? TRUE : FALSE;
295	}
296	break;
297	}
298	}
299
300	/**
301	* g_unichar_ismark:
302	* @c: a Unicode character
303	*
304	* Determines whether a character is a mark (non-spacing mark,
305	* combining mark, or enclosing mark in Unicode speak).
306	* Given some UTF-8 text, obtain a character value
307	* with g_utf8_get_char().
308	*
309	* Note: in most cases where isalpha characters are allowed,
310	* ismark characters should be allowed to as they are essential
311	* for writing most European languages as well as many non-Latin
312	* scripts.
313	*
314	* Returns: %TRUE if @c is a mark character
315	*
316	* Since: 2.14
317	**/
318	gboolean
319	g_unichar_ismark (gunichar c)
320	{
321	return ISMARK (TYPE (c));
322	}
323
324	/**
325	* g_unichar_isupper:
326	* @c: a Unicode character
327	*
328	* Determines if a character is uppercase.
329	*
330	* Returns: %TRUE if @c is an uppercase character
331	**/
332	gboolean
333	g_unichar_isupper (gunichar c)
334	{
335	return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
336	}
337
338	/**
339	* g_unichar_istitle:
340	* @c: a Unicode character
341	*
342	* Determines if a character is titlecase. Some characters in
343	* Unicode which are composites, such as the DZ digraph
344	* have three case variants instead of just two. The titlecase
345	* form is used at the beginning of a word where only the
346	* first letter is capitalized. The titlecase form of the DZ
347	* digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
348	*
349	* Returns: %TRUE if the character is titlecase
350	**/
351	gboolean
352	g_unichar_istitle (gunichar c)
353	{
354	unsigned int i;
355	for (i = `0`; i < G_N_ELEMENTS (title_table); ++i)
356	if (title_table[i][`0`] == c)
357	return TRUE;
358	return FALSE;
359	}
360
361	/**
362	* g_unichar_isxdigit:
363	* @c: a Unicode character.
364	*
365	* Determines if a character is a hexadecimal digit.
366	*
367	* Returns: %TRUE if the character is a hexadecimal digit
368	**/
369	gboolean
370	g_unichar_isxdigit (gunichar c)
371	{
372	return ((c >= `'a'` && c <= `'f'`) \|\|
373	(c >= `'A'` && c <= `'F'`) \|\|
374	(c >= G_UNICHAR_FULLWIDTH_a && c <= G_UNICHAR_FULLWIDTH_f) \|\|
375	(c >= G_UNICHAR_FULLWIDTH_A && c <= G_UNICHAR_FULLWIDTH_F) \|\|
376	(TYPE (c) == G_UNICODE_DECIMAL_NUMBER));
377	}
378
379	/**
380	* g_unichar_isdefined:
381	* @c: a Unicode character
382	*
383	* Determines if a given character is assigned in the Unicode
384	* standard.
385	*
386	* Returns: %TRUE if the character has an assigned value
387	**/
388	gboolean
389	g_unichar_isdefined (gunichar c)
390	{
391	return !IS (TYPE(c),
392	OR (G_UNICODE_UNASSIGNED,
393	OR (G_UNICODE_SURROGATE,
394	`0`)));
395	}
396
397	/**
398	* g_unichar_iszerowidth:
399	* @c: a Unicode character
400	*
401	* Determines if a given character typically takes zero width when rendered.
402	* The return value is %TRUE for all non-spacing and enclosing marks
403	* (e.g., combining accents), format characters, zero-width
404	* space, but not U+00AD SOFT HYPHEN.
405	*
406	* A typical use of this function is with one of g_unichar_iswide() or
407	* g_unichar_iswide_cjk() to determine the number of cells a string occupies
408	* when displayed on a grid display (terminals). However, note that not all
409	* terminals support zero-width rendering of zero-width marks.
410	*
411	* Returns: %TRUE if the character has zero width
412	*
413	* Since: 2.14
414	**/
415	gboolean
416	g_unichar_iszerowidth (gunichar c)
417	{
418	if (G_UNLIKELY (c == `0x00AD`))
419	return FALSE;
420
421	if (G_UNLIKELY (ISZEROWIDTHTYPE (TYPE (c))))
422	return TRUE;
423
424	if (G_UNLIKELY ((c >= `0x1160` && c < `0x1200`) \|\|
425	c == `0x200B`))
426	return TRUE;
427
428	return FALSE;
429	}
430
431	static int
432	interval_compare (const void key, const* void *elt)
433	{
434	gunichar c = GPOINTER_TO_UINT (key);
435	struct Interval interval = (struct* Interval *)elt;
436
437	if (c < interval->start)
438	return -`1`;
439	if (c > interval->end)
440	return +`1`;
441
442	return `0`;
443	}
444
445	#define G_WIDTH_TABLE_MIDPOINT (G_N_ELEMENTS (g_unicode_width_table_wide) / 2)
446
447	static inline gboolean
448	g_unichar_iswide_bsearch (gunichar ch)
449	{
450	int lower = `0`;
451	int upper = G_N_ELEMENTS (g_unicode_width_table_wide) - `1`;
452	static int saved_mid = G_WIDTH_TABLE_MIDPOINT;
453	int mid = saved_mid;
454
455	do
456	{
457	if (ch < g_unicode_width_table_wide[mid].start)
458	upper = mid - `1`;
459	else if (ch > g_unicode_width_table_wide[mid].end)
460	lower = mid + `1`;
461	else
462	return TRUE;
463
464	mid = (lower + upper) / `2`;
465	}
466	while (lower <= upper);
467
468	return FALSE;
469	}
470
471	/**
472	* g_unichar_iswide:
473	* @c: a Unicode character
474	*
475	* Determines if a character is typically rendered in a double-width
476	* cell.
477	*
478	* Returns: %TRUE if the character is wide
479	**/
480	gboolean
481	g_unichar_iswide (gunichar c)
482	{
483	if (c < g_unicode_width_table_wide[`0`].start)
484	return FALSE;
485	else
486	return g_unichar_iswide_bsearch (ch: c);
487	}
488
489
490	/**
491	* g_unichar_iswide_cjk:
492	* @c: a Unicode character
493	*
494	* Determines if a character is typically rendered in a double-width
495	* cell under legacy East Asian locales. If a character is wide according to
496	* g_unichar_iswide(), then it is also reported wide with this function, but
497	* the converse is not necessarily true. See the
498	* [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
499	* for details.
500	*
501	* If a character passes the g_unichar_iswide() test then it will also pass
502	* this test, but not the other way around. Note that some characters may
503	* pass both this test and g_unichar_iszerowidth().
504	*
505	* Returns: %TRUE if the character is wide in legacy East Asian locales
506	*
507	* Since: 2.12
508	*/
509	gboolean
510	g_unichar_iswide_cjk (gunichar c)
511	{
512	if (g_unichar_iswide (c))
513	return TRUE;
514
515	/ bsearch() is declared attribute(nonnull(1)) so we can't validly search*
516	* for a NULL key */
517	if (c == `0`)
518	return FALSE;
519
520	if (bsearch (GUINT_TO_POINTER (c),
521	base: g_unicode_width_table_ambiguous,
522	G_N_ELEMENTS (g_unicode_width_table_ambiguous),
523	size: sizeof g_unicode_width_table_ambiguous[`0`],
524	compar: interval_compare))
525	return TRUE;
526
527	return FALSE;
528	}
529
530
531	/**
532	* g_unichar_toupper:
533	* @c: a Unicode character
534	*
535	* Converts a character to uppercase.
536	*
537	* Returns: the result of converting @c to uppercase.
538	* If @c is not a lowercase or titlecase character,
539	* or has no upper case equivalent @c is returned unchanged.
540	**/
541	gunichar
542	g_unichar_toupper (gunichar c)
543	{
544	int t = TYPE (c);
545	if (t == G_UNICODE_LOWERCASE_LETTER)
546	{
547	gunichar val = ATTTABLE (c >> `8`, c & `0xff`);
548	if (val >= `0x1000000`)
549	{
550	const gchar *p = special_case_table + val - `0x1000000`;
551	val = g_utf8_get_char (p);
552	}
553	/ Some lowercase letters, e.g., U+000AA, FEMININE ORDINAL INDICATOR,*
554	* do not have an uppercase equivalent, in which case val will be
555	* zero.
556	*/
557	return val ? val : c;
558	}
559	else if (t == G_UNICODE_TITLECASE_LETTER)
560	{
561	unsigned int i;
562	for (i = `0`; i < G_N_ELEMENTS (title_table); ++i)
563	{
564	if (title_table[i][`0`] == c)
565	return title_table[i][`1`] ? title_table[i][`1`] : c;
566	}
567	}
568	return c;
569	}
570
571	/**
572	* g_unichar_tolower:
573	* @c: a Unicode character.
574	*
575	* Converts a character to lower case.
576	*
577	* Returns: the result of converting @c to lower case.
578	* If @c is not an upperlower or titlecase character,
579	* or has no lowercase equivalent @c is returned unchanged.
580	**/
581	gunichar
582	g_unichar_tolower (gunichar c)
583	{
584	int t = TYPE (c);
585	if (t == G_UNICODE_UPPERCASE_LETTER)
586	{
587	gunichar val = ATTTABLE (c >> `8`, c & `0xff`);
588	if (val >= `0x1000000`)
589	{
590	const gchar *p = special_case_table + val - `0x1000000`;
591	return g_utf8_get_char (p);
592	}
593	else
594	{
595	/ Not all uppercase letters are guaranteed to have a lowercase*
596	* equivalent. If this is the case, val will be zero. */
597	return val ? val : c;
598	}
599	}
600	else if (t == G_UNICODE_TITLECASE_LETTER)
601	{
602	unsigned int i;
603	for (i = `0`; i < G_N_ELEMENTS (title_table); ++i)
604	{
605	if (title_table[i][`0`] == c)
606	return title_table[i][`2`];
607	}
608	}
609	return c;
610	}
611
612	/**
613	* g_unichar_totitle:
614	* @c: a Unicode character
615	*
616	* Converts a character to the titlecase.
617	*
618	* Returns: the result of converting @c to titlecase.
619	* If @c is not an uppercase or lowercase character,
620	* @c is returned unchanged.
621	**/
622	gunichar
623	g_unichar_totitle (gunichar c)
624	{
625	unsigned int i;
626
627	/ We handle U+0000 explicitly because some elements in*
628	* title_table[i][1] may be null. */
629	if (c == `0`)
630	return c;
631
632	for (i = `0`; i < G_N_ELEMENTS (title_table); ++i)
633	{
634	if (title_table[i][`0`] == c \|\| title_table[i][`1`] == c
635	\|\| title_table[i][`2`] == c)
636	return title_table[i][`0`];
637	}
638
639	if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER)
640	return g_unichar_toupper (c);
641
642	return c;
643	}
644
645	/**
646	* g_unichar_digit_value:
647	* @c: a Unicode character
648	*
649	* Determines the numeric value of a character as a decimal
650	* digit.
651	*
652	* Returns: If @c is a decimal digit (according to
653	* g_unichar_isdigit()), its numeric value. Otherwise, -1.
654	**/
655	int
656	g_unichar_digit_value (gunichar c)
657	{
658	if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
659	return ATTTABLE (c >> `8`, c & `0xff`);
660	return -`1`;
661	}
662
663	/**
664	* g_unichar_xdigit_value:
665	* @c: a Unicode character
666	*
667	* Determines the numeric value of a character as a hexadecimal
668	* digit.
669	*
670	* Returns: If @c is a hex digit (according to
671	* g_unichar_isxdigit()), its numeric value. Otherwise, -1.
672	**/
673	int
674	g_unichar_xdigit_value (gunichar c)
675	{
676	if (c >= `'A'` && c <= `'F'`)
677	return c - `'A'` + `10`;
678	if (c >= `'a'` && c <= `'f'`)
679	return c - `'a'` + `10`;
680	if (c >= G_UNICHAR_FULLWIDTH_A && c <= G_UNICHAR_FULLWIDTH_F)
681	return c - G_UNICHAR_FULLWIDTH_A + `10`;
682	if (c >= G_UNICHAR_FULLWIDTH_a && c <= G_UNICHAR_FULLWIDTH_f)
683	return c - G_UNICHAR_FULLWIDTH_a + `10`;
684	if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
685	return ATTTABLE (c >> `8`, c & `0xff`);
686	return -`1`;
687	}
688
689	/**
690	* g_unichar_type:
691	* @c: a Unicode character
692	*
693	* Classifies a Unicode character by type.
694	*
695	* Returns: the type of the character.
696	**/
697	GUnicodeType
698	g_unichar_type (gunichar c)
699	{
700	return TYPE (c);
701	}
702
/*
 * Case mapping functions
 */

/* Locale families that need their own case-mapping rules; see
 * get_locale_type() for how the current locale is classified. */
typedef enum
{
  LOCALE_NORMAL,     /* no locale-specific rules */
  LOCALE_TURKIC,     /* "az" and "tr" locales */
  LOCALE_LITHUANIAN  /* "lt" locales */
} LocaleType;
712
713	static LocaleType
714	get_locale_type (void)
715	{
716	#ifdef G_OS_WIN32
717	char *tem = g_win32_getlocale ();
718	char locale[`2`];
719
720	locale[`0`] = tem[`0`];
721	locale[`1`] = tem[`1`];
722	g_free (tem);
723	#else
724	const char *locale = setlocale (LC_CTYPE, NULL);
725
726	if (locale == NULL)
727	return LOCALE_NORMAL;
728	#endif
729
730	switch (locale[`0`])
731	{
732	case `'a'`:
733	if (locale[`1`] == `'z'`)
734	return LOCALE_TURKIC;
735	break;
736	case `'l'`:
737	if (locale[`1`] == `'t'`)
738	return LOCALE_LITHUANIAN;
739	break;
740	case `'t'`:
741	if (locale[`1`] == `'r'`)
742	return LOCALE_TURKIC;
743	break;
744	}
745
746	return LOCALE_NORMAL;
747	}
748
749	static gint
750	output_marks (const char **p_inout,
751	char *out_buffer,
752	gboolean remove_dot)
753	{
754	const char p = p_inout;
755	gint len = `0`;
756
757	while (*p)
758	{
759	gunichar c = g_utf8_get_char (p);
760
761	if (ISMARK (TYPE (c)))
762	{
763	if (!remove_dot \|\| c != `0x307` / COMBINING DOT ABOVE /)
764	len += g_unichar_to_utf8 (c, outbuf: out_buffer ? out_buffer + len : NULL);
765	p = g_utf8_next_char (p);
766	}
767	else
768	break;
769	}
770
771	*p_inout = p;
772	return len;
773	}
774
775	static gint
776	output_special_case (gchar *out_buffer,
777	int offset,
778	int type,
779	int which)
780	{
781	const gchar *p = special_case_table + offset;
782	gint len;
783
784	if (type != G_UNICODE_TITLECASE_LETTER)
785	p = g_utf8_next_char (p);
786
787	if (which == `1`)
788	p += strlen (s: p) + `1`;
789
790	len = strlen (s: p);
791	if (out_buffer)
792	memcpy (dest: out_buffer, src: p, n: len);
793
794	return len;
795	}
796
797	static gsize
798	real_toupper (const gchar *str,
799	gssize max_len,
800	gchar *out_buffer,
801	LocaleType locale_type)
802	{
803	const gchar *p = str;
804	const char *last = NULL;
805	gsize len = `0`;
806	gboolean last_was_i = FALSE;
807
808	while ((max_len < `0` \|\| p < str + max_len) && *p)
809	{
810	gunichar c = g_utf8_get_char (p);
811	int t = TYPE (c);
812	gunichar val;
813
814	last = p;
815	p = g_utf8_next_char (p);
816
817	if (locale_type == LOCALE_LITHUANIAN)
818	{
819	if (c == `'i'`)
820	last_was_i = TRUE;
821	else
822	{
823	if (last_was_i)
824	{
825	/ Nasty, need to remove any dot above. Though*
826	* I think only E WITH DOT ABOVE occurs in practice
827	* which could simplify this considerably.
828	*/
829	gsize decomp_len, i;
830	gunichar decomp[G_UNICHAR_MAX_DECOMPOSITION_LENGTH];
831
832	decomp_len = g_unichar_fully_decompose (ch: c, FALSE, result: decomp, G_N_ELEMENTS (decomp));
833	for (i=`0`; i < decomp_len; i++)
834	{
835	if (decomp[i] != `0x307` / COMBINING DOT ABOVE /)
836	len += g_unichar_to_utf8 (c: g_unichar_toupper (c: decomp[i]), outbuf: out_buffer ? out_buffer + len : NULL);
837	}
838
839	len += output_marks (p_inout: &p, out_buffer: out_buffer ? out_buffer + len : NULL, TRUE);
840
841	continue;
842	}
843
844	if (!ISMARK (t))
845	last_was_i = FALSE;
846	}
847	}
848
849	if (locale_type == LOCALE_TURKIC && c == `'i'`)
850	{
851	/ i => LATIN CAPITAL LETTER I WITH DOT ABOVE /
852	len += g_unichar_to_utf8 (c: `0x130`, outbuf: out_buffer ? out_buffer + len : NULL);
853	}
854	else if (c == `0x0345`) / COMBINING GREEK YPOGEGRAMMENI /
855	{
856	/ Nasty, need to move it after other combining marks .. this would go away if*
857	* we normalized first.
858	*/
859	len += output_marks (p_inout: &p, out_buffer: out_buffer ? out_buffer + len : NULL, FALSE);
860
861	/ And output as GREEK CAPITAL LETTER IOTA /
862	len += g_unichar_to_utf8 (c: `0x399`, outbuf: out_buffer ? out_buffer + len : NULL);
863	}
864	else if (IS (t,
865	OR (G_UNICODE_LOWERCASE_LETTER,
866	OR (G_UNICODE_TITLECASE_LETTER,
867	`0`))))
868	{
869	val = ATTTABLE (c >> `8`, c & `0xff`);
870
871	if (val >= `0x1000000`)
872	{
873	len += output_special_case (out_buffer: out_buffer ? out_buffer + len : NULL, offset: val - `0x1000000`, type: t,
874	which: t == G_UNICODE_LOWERCASE_LETTER ? `0` : `1`);
875	}
876	else
877	{
878	if (t == G_UNICODE_TITLECASE_LETTER)
879	{
880	unsigned int i;
881	for (i = `0`; i < G_N_ELEMENTS (title_table); ++i)
882	{
883	if (title_table[i][`0`] == c)
884	{
885	val = title_table[i][`1`];
886	break;
887	}
888	}
889	}
890
891	/ Some lowercase letters, e.g., U+000AA, FEMININE ORDINAL INDICATOR,*
892	* do not have an uppercase equivalent, in which case val will be
893	* zero. */
894	len += g_unichar_to_utf8 (c: val ? val : c, outbuf: out_buffer ? out_buffer + len : NULL);
895	}
896	}
897	else
898	{
899	gsize char_len = g_utf8_skip[(guchar )last];
900
901	if (out_buffer)
902	memcpy (dest: out_buffer + len, src: last, n: char_len);
903
904	len += char_len;
905	}
906
907	}
908
909	return len;
910	}
911
912	/**
913	* g_utf8_strup:
914	* @str: a UTF-8 encoded string
915	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
916	*
917	* Converts all Unicode characters in the string that have a case
918	* to uppercase. The exact manner that this is done depends
919	* on the current locale, and may result in the number of
920	* characters in the string increasing. (For instance, the
921	* German ess-zet will be changed to SS.)
922	*
923	* Returns: a newly allocated string, with all characters
924	* converted to uppercase.
925	**/
926	gchar *
927	g_utf8_strup (const gchar *str,
928	gssize len)
929	{
930	gsize result_len;
931	LocaleType locale_type;
932	gchar *result;
933
934	g_return_val_if_fail (str != NULL, NULL);
935
936	locale_type = get_locale_type ();
937
938	/*
939	* We use a two pass approach to keep memory management simple
940	*/
941	result_len = real_toupper (str, max_len: len, NULL, locale_type);
942	result = g_malloc (n_bytes: result_len + `1`);
943	real_toupper (str, max_len: len, out_buffer: result, locale_type);
944	result[result_len] = `'\0'`;
945
946	return result;
947	}
948
949	/ traverses the string checking for characters with combining class == 230*
950	* until a base character is found */
951	static gboolean
952	has_more_above (const gchar *str)
953	{
954	const gchar *p = str;
955	gint combining_class;
956
957	while (*p)
958	{
959	combining_class = g_unichar_combining_class (uc: g_utf8_get_char (p));
960	if (combining_class == `230`)
961	return TRUE;
962	else if (combining_class == `0`)
963	break;
964
965	p = g_utf8_next_char (p);
966	}
967
968	return FALSE;
969	}
970
971	static gsize
972	real_tolower (const gchar *str,
973	gssize max_len,
974	gchar *out_buffer,
975	LocaleType locale_type)
976	{
977	const gchar *p = str;
978	const char *last = NULL;
979	gsize len = `0`;
980
981	while ((max_len < `0` \|\| p < str + max_len) && *p)
982	{
983	gunichar c = g_utf8_get_char (p);
984	int t = TYPE (c);
985	gunichar val;
986
987	last = p;
988	p = g_utf8_next_char (p);
989
990	if (locale_type == LOCALE_TURKIC && (c == `'I'` \|\| c == `0x130` \|\|
991	c == G_UNICHAR_FULLWIDTH_I))
992	{
993	gboolean combining_dot = (c == `'I'` \|\| c == G_UNICHAR_FULLWIDTH_I) &&
994	g_utf8_get_char (p) == `0x0307`;
995	if (combining_dot \|\| c == `0x130`)
996	{
997	/ I + COMBINING DOT ABOVE => i (U+0069)*
998	* LATIN CAPITAL LETTER I WITH DOT ABOVE => i (U+0069) */
999	len += g_unichar_to_utf8 (c: `0x0069`, outbuf: out_buffer ? out_buffer + len : NULL);
1000	if (combining_dot)
1001	p = g_utf8_next_char (p);
1002	}
1003	else
1004	{
1005	/ I => LATIN SMALL LETTER DOTLESS I /
1006	len += g_unichar_to_utf8 (c: `0x131`, outbuf: out_buffer ? out_buffer + len : NULL);
1007	}
1008	}
1009	/ Introduce an explicit dot above when lowercasing capital I's and J's*
1010	* whenever there are more accents above. [SpecialCasing.txt] */
1011	else if (locale_type == LOCALE_LITHUANIAN &&
1012	(c == `0x00cc` \|\| c == `0x00cd` \|\| c == `0x0128`))
1013	{
1014	len += g_unichar_to_utf8 (c: `0x0069`, outbuf: out_buffer ? out_buffer + len : NULL);
1015	len += g_unichar_to_utf8 (c: `0x0307`, outbuf: out_buffer ? out_buffer + len : NULL);
1016
1017	switch (c)
1018	{
1019	case `0x00cc`:
1020	len += g_unichar_to_utf8 (c: `0x0300`, outbuf: out_buffer ? out_buffer + len : NULL);
1021	break;
1022	case `0x00cd`:
1023	len += g_unichar_to_utf8 (c: `0x0301`, outbuf: out_buffer ? out_buffer + len : NULL);
1024	break;
1025	case `0x0128`:
1026	len += g_unichar_to_utf8 (c: `0x0303`, outbuf: out_buffer ? out_buffer + len : NULL);
1027	break;
1028	}
1029	}
1030	else if (locale_type == LOCALE_LITHUANIAN &&
1031	(c == `'I'` \|\| c == G_UNICHAR_FULLWIDTH_I \|\|
1032	c == `'J'` \|\| c == G_UNICHAR_FULLWIDTH_J \|\| c == `0x012e`) &&
1033	has_more_above (str: p))
1034	{
1035	len += g_unichar_to_utf8 (c: g_unichar_tolower (c), outbuf: out_buffer ? out_buffer + len : NULL);
1036	len += g_unichar_to_utf8 (c: `0x0307`, outbuf: out_buffer ? out_buffer + len : NULL);
1037	}
1038	else if (c == `0x03A3`) / GREEK CAPITAL LETTER SIGMA /
1039	{
1040	if ((max_len < `0` \|\| p < str + max_len) && *p)
1041	{
1042	gunichar next_c = g_utf8_get_char (p);
1043	int next_type = TYPE(next_c);
1044
1045	/ SIGMA mapps differently depending on whether it is*
1046	* final or not. The following simplified test would
1047	* fail in the case of combining marks following the
1048	* sigma, but I don't think that occurs in real text.
1049	* The test here matches that in ICU.
1050	*/
1051	if (ISALPHA (next_type)) / Lu,Ll,Lt,Lm,Lo /
1052	val = `0x3c3`; / GREEK SMALL SIGMA /
1053	else
1054	val = `0x3c2`; / GREEK SMALL FINAL SIGMA /
1055	}
1056	else
1057	val = `0x3c2`; / GREEK SMALL FINAL SIGMA /
1058
1059	len += g_unichar_to_utf8 (c: val, outbuf: out_buffer ? out_buffer + len : NULL);
1060	}
1061	else if (IS (t,
1062	OR (G_UNICODE_UPPERCASE_LETTER,
1063	OR (G_UNICODE_TITLECASE_LETTER,
1064	`0`))))
1065	{
1066	val = ATTTABLE (c >> `8`, c & `0xff`);
1067
1068	if (val >= `0x1000000`)
1069	{
1070	len += output_special_case (out_buffer: out_buffer ? out_buffer + len : NULL, offset: val - `0x1000000`, type: t, which: `0`);
1071	}
1072	else
1073	{
1074	if (t == G_UNICODE_TITLECASE_LETTER)
1075	{
1076	unsigned int i;
1077	for (i = `0`; i < G_N_ELEMENTS (title_table); ++i)
1078	{
1079	if (title_table[i][`0`] == c)
1080	{
1081	val = title_table[i][`2`];
1082	break;
1083	}
1084	}
1085	}
1086
1087	/ Not all uppercase letters are guaranteed to have a lowercase*
1088	* equivalent. If this is the case, val will be zero. */
1089	len += g_unichar_to_utf8 (c: val ? val : c, outbuf: out_buffer ? out_buffer + len : NULL);
1090	}
1091	}
1092	else
1093	{
1094	gsize char_len = g_utf8_skip[(guchar )last];
1095
1096	if (out_buffer)
1097	memcpy (dest: out_buffer + len, src: last, n: char_len);
1098
1099	len += char_len;
1100	}
1101
1102	}
1103
1104	return len;
1105	}
1106
1107	/**
1108	* g_utf8_strdown:
1109	* @str: a UTF-8 encoded string
1110	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1111	*
1112	* Converts all Unicode characters in the string that have a case
1113	* to lowercase. The exact manner that this is done depends
1114	* on the current locale, and may result in the number of
1115	* characters in the string changing.
1116	*
1117	* Returns: a newly allocated string, with all characters
1118	* converted to lowercase.
1119	**/
1120	gchar *
1121	g_utf8_strdown (const gchar *str,
1122	gssize len)
1123	{
1124	gsize result_len;
1125	LocaleType locale_type;
1126	gchar *result;
1127
1128	g_return_val_if_fail (str != NULL, NULL);
1129
1130	locale_type = get_locale_type ();
1131
1132	/*
1133	* We use a two pass approach to keep memory management simple
1134	*/
1135	result_len = real_tolower (str, max_len: len, NULL, locale_type);
1136	result = g_malloc (n_bytes: result_len + `1`);
1137	real_tolower (str, max_len: len, out_buffer: result, locale_type);
1138	result[result_len] = `'\0'`;
1139
1140	return result;
1141	}
1142
1143	/**
1144	* g_utf8_casefold:
1145	* @str: a UTF-8 encoded string
1146	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
1147	*
1148	* Converts a string into a form that is independent of case. The
1149	* result will not correspond to any particular case, but can be
1150	* compared for equality or ordered with the results of calling
1151	* g_utf8_casefold() on other strings.
1152	*
1153	* Note that calling g_utf8_casefold() followed by g_utf8_collate() is
1154	* only an approximation to the correct linguistic case insensitive
1155	* ordering, though it is a fairly good one. Getting this exactly
1156	* right would require a more sophisticated collation function that
1157	* takes case sensitivity into account. GLib does not currently
1158	* provide such a function.
1159	*
1160	* Returns: a newly allocated string, that is a
1161	* case independent form of @str.
1162	**/
1163	gchar *
1164	g_utf8_casefold (const gchar *str,
1165	gssize len)
1166	{
1167	GString *result;
1168	const char *p;
1169
1170	g_return_val_if_fail (str != NULL, NULL);
1171
1172	result = g_string_new (NULL);
1173	p = str;
1174	while ((len < `0` \|\| p < str + len) && *p)
1175	{
1176	gunichar ch = g_utf8_get_char (p);
1177
1178	int start = `0`;
1179	int end = G_N_ELEMENTS (casefold_table);
1180
1181	if (ch >= casefold_table[start].ch &&
1182	ch <= casefold_table[end - `1`].ch)
1183	{
1184	while (TRUE)
1185	{
1186	int half = (start + end) / `2`;
1187	if (ch == casefold_table[half].ch)
1188	{
1189	g_string_append (string: result, val: casefold_table[half].data);
1190	goto next;
1191	}
1192	else if (half == start)
1193	break;
1194	else if (ch > casefold_table[half].ch)
1195	start = half;
1196	else
1197	end = half;
1198	}
1199	}
1200
1201	g_string_append_unichar (string: result, wc: g_unichar_tolower (c: ch));
1202
1203	next:
1204	p = g_utf8_next_char (p);
1205	}
1206
1207	return g_string_free (string: result, FALSE);
1208	}
1209
1210	/**
1211	* g_unichar_get_mirror_char:
1212	* @ch: a Unicode character
1213	* @mirrored_ch: location to store the mirrored character
1214	*
1215	* In Unicode, some characters are "mirrored". This means that their
1216	* images are mirrored horizontally in text that is laid out from right
1217	* to left. For instance, "(" would become its mirror image, ")", in
1218	* right-to-left text.
1219	*
1220	* If @ch has the Unicode mirrored property and there is another unicode
1221	* character that typically has a glyph that is the mirror image of @ch's
1222	* glyph and @mirrored_ch is set, it puts that character in the address
1223	* pointed to by @mirrored_ch. Otherwise the original character is put.
1224	*
1225	* Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise
1226	*
1227	* Since: 2.4
1228	**/
1229	gboolean
1230	g_unichar_get_mirror_char (gunichar ch,
1231	gunichar *mirrored_ch)
1232	{
1233	gboolean found;
1234	gunichar mirrored;
1235
1236	mirrored = GLIB_GET_MIRRORING(ch);
1237
1238	found = ch != mirrored;
1239	if (mirrored_ch)
1240	*mirrored_ch = mirrored;
1241
1242	return found;
1243
1244	}
1245
1246	#define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2)
1247
1248	static inline GUnicodeScript
1249	g_unichar_get_script_bsearch (gunichar ch)
1250	{
1251	int lower = `0`;
1252	int upper = G_N_ELEMENTS (g_script_table) - `1`;
1253	static int saved_mid = G_SCRIPT_TABLE_MIDPOINT;
1254	int mid = saved_mid;
1255
1256
1257	do
1258	{
1259	if (ch < g_script_table[mid].start)
1260	upper = mid - `1`;
1261	else if (ch >= g_script_table[mid].start + g_script_table[mid].chars)
1262	lower = mid + `1`;
1263	else
1264	return g_script_table[saved_mid = mid].script;
1265
1266	mid = (lower + upper) / `2`;
1267	}
1268	while (lower <= upper);
1269
1270	return G_UNICODE_SCRIPT_UNKNOWN;
1271	}
1272
1273	/**
1274	* g_unichar_get_script:
1275	* @ch: a Unicode character
1276	*
1277	* Looks up the #GUnicodeScript for a particular character (as defined
1278	* by Unicode Standard Annex \#24). No check is made for @ch being a
1279	* valid Unicode character; if you pass in invalid character, the
1280	* result is undefined.
1281	*
1282	* This function is equivalent to pango_script_for_unichar() and the
1283	* two are interchangeable.
1284	*
1285	* Returns: the #GUnicodeScript for the character.
1286	*
1287	* Since: 2.14
1288	*/
1289	GUnicodeScript
1290	g_unichar_get_script (gunichar ch)
1291	{
1292	if (ch < G_EASY_SCRIPTS_RANGE)
1293	return g_script_easy_table[ch];
1294	else
1295	return g_unichar_get_script_bsearch (ch);
1296	}
1297
1298
1299	/ http://unicode.org/iso15924/ /
1300	static const guint32 iso15924_tags[] =
1301	{
1302	#define PACK(a,b,c,d) ((guint32)((((guint8)(a))<<24)\|(((guint8)(b))<<16)\|(((guint8)(c))<<8)\|((guint8)(d))))
1303
1304	PACK (`'Z'`,`'y'`,`'y'`,`'y'`), / G_UNICODE_SCRIPT_COMMON /
1305	PACK (`'Z'`,`'i'`,`'n'`,`'h'`), / G_UNICODE_SCRIPT_INHERITED /
1306	PACK (`'A'`,`'r'`,`'a'`,`'b'`), / G_UNICODE_SCRIPT_ARABIC /
1307	PACK (`'A'`,`'r'`,`'m'`,`'n'`), / G_UNICODE_SCRIPT_ARMENIAN /
1308	PACK (`'B'`,`'e'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_BENGALI /
1309	PACK (`'B'`,`'o'`,`'p'`,`'o'`), / G_UNICODE_SCRIPT_BOPOMOFO /
1310	PACK (`'C'`,`'h'`,`'e'`,`'r'`), / G_UNICODE_SCRIPT_CHEROKEE /
1311	PACK (`'C'`,`'o'`,`'p'`,`'t'`), / G_UNICODE_SCRIPT_COPTIC /
1312	PACK (`'C'`,`'y'`,`'r'`,`'l'`), / G_UNICODE_SCRIPT_CYRILLIC /
1313	PACK (`'D'`,`'s'`,`'r'`,`'t'`), / G_UNICODE_SCRIPT_DESERET /
1314	PACK (`'D'`,`'e'`,`'v'`,`'a'`), / G_UNICODE_SCRIPT_DEVANAGARI /
1315	PACK (`'E'`,`'t'`,`'h'`,`'i'`), / G_UNICODE_SCRIPT_ETHIOPIC /
1316	PACK (`'G'`,`'e'`,`'o'`,`'r'`), / G_UNICODE_SCRIPT_GEORGIAN /
1317	PACK (`'G'`,`'o'`,`'t'`,`'h'`), / G_UNICODE_SCRIPT_GOTHIC /
1318	PACK (`'G'`,`'r'`,`'e'`,`'k'`), / G_UNICODE_SCRIPT_GREEK /
1319	PACK (`'G'`,`'u'`,`'j'`,`'r'`), / G_UNICODE_SCRIPT_GUJARATI /
1320	PACK (`'G'`,`'u'`,`'r'`,`'u'`), / G_UNICODE_SCRIPT_GURMUKHI /
1321	PACK (`'H'`,`'a'`,`'n'`,`'i'`), / G_UNICODE_SCRIPT_HAN /
1322	PACK (`'H'`,`'a'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_HANGUL /
1323	PACK (`'H'`,`'e'`,`'b'`,`'r'`), / G_UNICODE_SCRIPT_HEBREW /
1324	PACK (`'H'`,`'i'`,`'r'`,`'a'`), / G_UNICODE_SCRIPT_HIRAGANA /
1325	PACK (`'K'`,`'n'`,`'d'`,`'a'`), / G_UNICODE_SCRIPT_KANNADA /
1326	PACK (`'K'`,`'a'`,`'n'`,`'a'`), / G_UNICODE_SCRIPT_KATAKANA /
1327	PACK (`'K'`,`'h'`,`'m'`,`'r'`), / G_UNICODE_SCRIPT_KHMER /
1328	PACK (`'L'`,`'a'`,`'o'`,`'o'`), / G_UNICODE_SCRIPT_LAO /
1329	PACK (`'L'`,`'a'`,`'t'`,`'n'`), / G_UNICODE_SCRIPT_LATIN /
1330	PACK (`'M'`,`'l'`,`'y'`,`'m'`), / G_UNICODE_SCRIPT_MALAYALAM /
1331	PACK (`'M'`,`'o'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_MONGOLIAN /
1332	PACK (`'M'`,`'y'`,`'m'`,`'r'`), / G_UNICODE_SCRIPT_MYANMAR /
1333	PACK (`'O'`,`'g'`,`'a'`,`'m'`), / G_UNICODE_SCRIPT_OGHAM /
1334	PACK (`'I'`,`'t'`,`'a'`,`'l'`), / G_UNICODE_SCRIPT_OLD_ITALIC /
1335	PACK (`'O'`,`'r'`,`'y'`,`'a'`), / G_UNICODE_SCRIPT_ORIYA /
1336	PACK (`'R'`,`'u'`,`'n'`,`'r'`), / G_UNICODE_SCRIPT_RUNIC /
1337	PACK (`'S'`,`'i'`,`'n'`,`'h'`), / G_UNICODE_SCRIPT_SINHALA /
1338	PACK (`'S'`,`'y'`,`'r'`,`'c'`), / G_UNICODE_SCRIPT_SYRIAC /
1339	PACK (`'T'`,`'a'`,`'m'`,`'l'`), / G_UNICODE_SCRIPT_TAMIL /
1340	PACK (`'T'`,`'e'`,`'l'`,`'u'`), / G_UNICODE_SCRIPT_TELUGU /
1341	PACK (`'T'`,`'h'`,`'a'`,`'a'`), / G_UNICODE_SCRIPT_THAANA /
1342	PACK (`'T'`,`'h'`,`'a'`,`'i'`), / G_UNICODE_SCRIPT_THAI /
1343	PACK (`'T'`,`'i'`,`'b'`,`'t'`), / G_UNICODE_SCRIPT_TIBETAN /
1344	PACK (`'C'`,`'a'`,`'n'`,`'s'`), / G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL /
1345	PACK (`'Y'`,`'i'`,`'i'`,`'i'`), / G_UNICODE_SCRIPT_YI /
1346	PACK (`'T'`,`'g'`,`'l'`,`'g'`), / G_UNICODE_SCRIPT_TAGALOG /
1347	PACK (`'H'`,`'a'`,`'n'`,`'o'`), / G_UNICODE_SCRIPT_HANUNOO /
1348	PACK (`'B'`,`'u'`,`'h'`,`'d'`), / G_UNICODE_SCRIPT_BUHID /
1349	PACK (`'T'`,`'a'`,`'g'`,`'b'`), / G_UNICODE_SCRIPT_TAGBANWA /
1350
1351	/ Unicode-4.0 additions /
1352	PACK (`'B'`,`'r'`,`'a'`,`'i'`), / G_UNICODE_SCRIPT_BRAILLE /
1353	PACK (`'C'`,`'p'`,`'r'`,`'t'`), / G_UNICODE_SCRIPT_CYPRIOT /
1354	PACK (`'L'`,`'i'`,`'m'`,`'b'`), / G_UNICODE_SCRIPT_LIMBU /
1355	PACK (`'O'`,`'s'`,`'m'`,`'a'`), / G_UNICODE_SCRIPT_OSMANYA /
1356	PACK (`'S'`,`'h'`,`'a'`,`'w'`), / G_UNICODE_SCRIPT_SHAVIAN /
1357	PACK (`'L'`,`'i'`,`'n'`,`'b'`), / G_UNICODE_SCRIPT_LINEAR_B /
1358	PACK (`'T'`,`'a'`,`'l'`,`'e'`), / G_UNICODE_SCRIPT_TAI_LE /
1359	PACK (`'U'`,`'g'`,`'a'`,`'r'`), / G_UNICODE_SCRIPT_UGARITIC /
1360
1361	/ Unicode-4.1 additions /
1362	PACK (`'T'`,`'a'`,`'l'`,`'u'`), / G_UNICODE_SCRIPT_NEW_TAI_LUE /
1363	PACK (`'B'`,`'u'`,`'g'`,`'i'`), / G_UNICODE_SCRIPT_BUGINESE /
1364	PACK (`'G'`,`'l'`,`'a'`,`'g'`), / G_UNICODE_SCRIPT_GLAGOLITIC /
1365	PACK (`'T'`,`'f'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_TIFINAGH /
1366	PACK (`'S'`,`'y'`,`'l'`,`'o'`), / G_UNICODE_SCRIPT_SYLOTI_NAGRI /
1367	PACK (`'X'`,`'p'`,`'e'`,`'o'`), / G_UNICODE_SCRIPT_OLD_PERSIAN /
1368	PACK (`'K'`,`'h'`,`'a'`,`'r'`), / G_UNICODE_SCRIPT_KHAROSHTHI /
1369
1370	/ Unicode-5.0 additions /
1371	PACK (`'Z'`,`'z'`,`'z'`,`'z'`), / G_UNICODE_SCRIPT_UNKNOWN /
1372	PACK (`'B'`,`'a'`,`'l'`,`'i'`), / G_UNICODE_SCRIPT_BALINESE /
1373	PACK (`'X'`,`'s'`,`'u'`,`'x'`), / G_UNICODE_SCRIPT_CUNEIFORM /
1374	PACK (`'P'`,`'h'`,`'n'`,`'x'`), / G_UNICODE_SCRIPT_PHOENICIAN /
1375	PACK (`'P'`,`'h'`,`'a'`,`'g'`), / G_UNICODE_SCRIPT_PHAGS_PA /
1376	PACK (`'N'`,`'k'`,`'o'`,`'o'`), / G_UNICODE_SCRIPT_NKO /
1377
1378	/ Unicode-5.1 additions /
1379	PACK (`'K'`,`'a'`,`'l'`,`'i'`), / G_UNICODE_SCRIPT_KAYAH_LI /
1380	PACK (`'L'`,`'e'`,`'p'`,`'c'`), / G_UNICODE_SCRIPT_LEPCHA /
1381	PACK (`'R'`,`'j'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_REJANG /
1382	PACK (`'S'`,`'u'`,`'n'`,`'d'`), / G_UNICODE_SCRIPT_SUNDANESE /
1383	PACK (`'S'`,`'a'`,`'u'`,`'r'`), / G_UNICODE_SCRIPT_SAURASHTRA /
1384	PACK (`'C'`,`'h'`,`'a'`,`'m'`), / G_UNICODE_SCRIPT_CHAM /
1385	PACK (`'O'`,`'l'`,`'c'`,`'k'`), / G_UNICODE_SCRIPT_OL_CHIKI /
1386	PACK (`'V'`,`'a'`,`'i'`,`'i'`), / G_UNICODE_SCRIPT_VAI /
1387	PACK (`'C'`,`'a'`,`'r'`,`'i'`), / G_UNICODE_SCRIPT_CARIAN /
1388	PACK (`'L'`,`'y'`,`'c'`,`'i'`), / G_UNICODE_SCRIPT_LYCIAN /
1389	PACK (`'L'`,`'y'`,`'d'`,`'i'`), / G_UNICODE_SCRIPT_LYDIAN /
1390
1391	/ Unicode-5.2 additions /
1392	PACK (`'A'`,`'v'`,`'s'`,`'t'`), / G_UNICODE_SCRIPT_AVESTAN /
1393	PACK (`'B'`,`'a'`,`'m'`,`'u'`), / G_UNICODE_SCRIPT_BAMUM /
1394	PACK (`'E'`,`'g'`,`'y'`,`'p'`), / G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS /
1395	PACK (`'A'`,`'r'`,`'m'`,`'i'`), / G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC /
1396	PACK (`'P'`,`'h'`,`'l'`,`'i'`), / G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI /
1397	PACK (`'P'`,`'r'`,`'t'`,`'i'`), / G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN /
1398	PACK (`'J'`,`'a'`,`'v'`,`'a'`), / G_UNICODE_SCRIPT_JAVANESE /
1399	PACK (`'K'`,`'t'`,`'h'`,`'i'`), / G_UNICODE_SCRIPT_KAITHI /
1400	PACK (`'L'`,`'i'`,`'s'`,`'u'`), / G_UNICODE_SCRIPT_LISU /
1401	PACK (`'M'`,`'t'`,`'e'`,`'i'`), / G_UNICODE_SCRIPT_MEETEI_MAYEK /
1402	PACK (`'S'`,`'a'`,`'r'`,`'b'`), / G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN /
1403	PACK (`'O'`,`'r'`,`'k'`,`'h'`), / G_UNICODE_SCRIPT_OLD_TURKIC /
1404	PACK (`'S'`,`'a'`,`'m'`,`'r'`), / G_UNICODE_SCRIPT_SAMARITAN /
1405	PACK (`'L'`,`'a'`,`'n'`,`'a'`), / G_UNICODE_SCRIPT_TAI_THAM /
1406	PACK (`'T'`,`'a'`,`'v'`,`'t'`), / G_UNICODE_SCRIPT_TAI_VIET /
1407
1408	/ Unicode-6.0 additions /
1409	PACK (`'B'`,`'a'`,`'t'`,`'k'`), / G_UNICODE_SCRIPT_BATAK /
1410	PACK (`'B'`,`'r'`,`'a'`,`'h'`), / G_UNICODE_SCRIPT_BRAHMI /
1411	PACK (`'M'`,`'a'`,`'n'`,`'d'`), / G_UNICODE_SCRIPT_MANDAIC /
1412
1413	/ Unicode-6.1 additions /
1414	PACK (`'C'`,`'a'`,`'k'`,`'m'`), / G_UNICODE_SCRIPT_CHAKMA /
1415	PACK (`'M'`,`'e'`,`'r'`,`'c'`), / G_UNICODE_SCRIPT_MEROITIC_CURSIVE /
1416	PACK (`'M'`,`'e'`,`'r'`,`'o'`), / G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS /
1417	PACK (`'P'`,`'l'`,`'r'`,`'d'`), / G_UNICODE_SCRIPT_MIAO /
1418	PACK (`'S'`,`'h'`,`'r'`,`'d'`), / G_UNICODE_SCRIPT_SHARADA /
1419	PACK (`'S'`,`'o'`,`'r'`,`'a'`), / G_UNICODE_SCRIPT_SORA_SOMPENG /
1420	PACK (`'T'`,`'a'`,`'k'`,`'r'`), / G_UNICODE_SCRIPT_TAKRI /
1421
1422	/ Unicode 7.0 additions /
1423	PACK (`'B'`,`'a'`,`'s'`,`'s'`), / G_UNICODE_SCRIPT_BASSA_VAH /
1424	PACK (`'A'`,`'g'`,`'h'`,`'b'`), / G_UNICODE_SCRIPT_CAUCASIAN_ALBANIAN /
1425	PACK (`'D'`,`'u'`,`'p'`,`'l'`), / G_UNICODE_SCRIPT_DUPLOYAN /
1426	PACK (`'E'`,`'l'`,`'b'`,`'a'`), / G_UNICODE_SCRIPT_ELBASAN /
1427	PACK (`'G'`,`'r'`,`'a'`,`'n'`), / G_UNICODE_SCRIPT_GRANTHA /
1428	PACK (`'K'`,`'h'`,`'o'`,`'j'`), / G_UNICODE_SCRIPT_KHOJKI/
1429	PACK (`'S'`,`'i'`,`'n'`,`'d'`), / G_UNICODE_SCRIPT_KHUDAWADI /
1430	PACK (`'L'`,`'i'`,`'n'`,`'a'`), / G_UNICODE_SCRIPT_LINEAR_A /
1431	PACK (`'M'`,`'a'`,`'h'`,`'j'`), / G_UNICODE_SCRIPT_MAHAJANI /
1432	PACK (`'M'`,`'a'`,`'n'`,`'i'`), / G_UNICODE_SCRIPT_MANICHAEAN /
1433	PACK (`'M'`,`'e'`,`'n'`,`'d'`), / G_UNICODE_SCRIPT_MENDE_KIKAKUI /
1434	PACK (`'M'`,`'o'`,`'d'`,`'i'`), / G_UNICODE_SCRIPT_MODI /
1435	PACK (`'M'`,`'r'`,`'o'`,`'o'`), / G_UNICODE_SCRIPT_MRO /
1436	PACK (`'N'`,`'b'`,`'a'`,`'t'`), / G_UNICODE_SCRIPT_NABATAEAN /
1437	PACK (`'N'`,`'a'`,`'r'`,`'b'`), / G_UNICODE_SCRIPT_OLD_NORTH_ARABIAN /
1438	PACK (`'P'`,`'e'`,`'r'`,`'m'`), / G_UNICODE_SCRIPT_OLD_PERMIC /
1439	PACK (`'H'`,`'m'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_PAHAWH_HMONG /
1440	PACK (`'P'`,`'a'`,`'l'`,`'m'`), / G_UNICODE_SCRIPT_PALMYRENE /
1441	PACK (`'P'`,`'a'`,`'u'`,`'c'`), / G_UNICODE_SCRIPT_PAU_CIN_HAU /
1442	PACK (`'P'`,`'h'`,`'l'`,`'p'`), / G_UNICODE_SCRIPT_PSALTER_PAHLAVI /
1443	PACK (`'S'`,`'i'`,`'d'`,`'d'`), / G_UNICODE_SCRIPT_SIDDHAM /
1444	PACK (`'T'`,`'i'`,`'r'`,`'h'`), / G_UNICODE_SCRIPT_TIRHUTA /
1445	PACK (`'W'`,`'a'`,`'r'`,`'a'`), / G_UNICODE_SCRIPT_WARANG_CITI /
1446
1447	/ Unicode 8.0 additions /
1448	PACK (`'A'`,`'h'`,`'o'`,`'m'`), / G_UNICODE_SCRIPT_AHOM /
1449	PACK (`'H'`,`'l'`,`'u'`,`'w'`), / G_UNICODE_SCRIPT_ANATOLIAN_HIEROGLYPHS /
1450	PACK (`'H'`,`'a'`,`'t'`,`'r'`), / G_UNICODE_SCRIPT_HATRAN /
1451	PACK (`'M'`,`'u'`,`'l'`,`'t'`), / G_UNICODE_SCRIPT_MULTANI /
1452	PACK (`'H'`,`'u'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_OLD_HUNGARIAN /
1453	PACK (`'S'`,`'g'`,`'n'`,`'w'`), / G_UNICODE_SCRIPT_SIGNWRITING /
1454
1455	/ Unicode 9.0 additions /
1456	PACK (`'A'`,`'d'`,`'l'`,`'m'`), / G_UNICODE_SCRIPT_ADLAM /
1457	PACK (`'B'`,`'h'`,`'k'`,`'s'`), / G_UNICODE_SCRIPT_BHAIKSUKI /
1458	PACK (`'M'`,`'a'`,`'r'`,`'c'`), / G_UNICODE_SCRIPT_MARCHEN /
1459	PACK (`'N'`,`'e'`,`'w'`,`'a'`), / G_UNICODE_SCRIPT_NEWA /
1460	PACK (`'O'`,`'s'`,`'g'`,`'e'`), / G_UNICODE_SCRIPT_OSAGE /
1461	PACK (`'T'`,`'a'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_TANGUT /
1462
1463	/ Unicode 10.0 additions /
1464	PACK (`'G'`,`'o'`,`'n'`,`'m'`), / G_UNICODE_SCRIPT_MASARAM_GONDI /
1465	PACK (`'N'`,`'s'`,`'h'`,`'u'`), / G_UNICODE_SCRIPT_NUSHU /
1466	PACK (`'S'`,`'o'`,`'y'`,`'o'`), / G_UNICODE_SCRIPT_SOYOMBO /
1467	PACK (`'Z'`,`'a'`,`'n'`,`'b'`), / G_UNICODE_SCRIPT_ZANABAZAR_SQUARE /
1468
1469	/ Unicode 11.0 additions /
1470	PACK (`'D'`,`'o'`,`'g'`,`'r'`), / G_UNICODE_SCRIPT_DOGRA /
1471	PACK (`'G'`,`'o'`,`'n'`,`'g'`), / G_UNICODE_SCRIPT_GUNJALA_GONDI /
1472	PACK (`'R'`,`'o'`,`'h'`,`'g'`), / G_UNICODE_SCRIPT_HANIFI_ROHINGYA /
1473	PACK (`'M'`,`'a'`,`'k'`,`'a'`), / G_UNICODE_SCRIPT_MAKASAR /
1474	PACK (`'M'`,`'e'`,`'d'`,`'f'`), / G_UNICODE_SCRIPT_MEDEFAIDRIN /
1475	PACK (`'S'`,`'o'`,`'g'`,`'o'`), / G_UNICODE_SCRIPT_OLD_SOGDIAN /
1476	PACK (`'S'`,`'o'`,`'g'`,`'d'`), / G_UNICODE_SCRIPT_SOGDIAN /
1477
1478	/ Unicode 12.0 additions /
1479	PACK (`'E'`,`'l'`,`'y'`,`'m'`), / G_UNICODE_SCRIPT_ELYMAIC /
1480	PACK (`'N'`,`'a'`,`'n'`,`'d'`), / G_UNICODE_SCRIPT_NANDINAGARI /
1481	PACK (`'H'`,`'m'`,`'n'`,`'p'`), / G_UNICODE_SCRIPT_NYIAKENG_PUACHUE_HMONG /
1482	PACK (`'W'`,`'c'`,`'h'`,`'o'`), / G_UNICODE_SCRIPT_WANCHO /
1483
1484	/ Unicode 13.0 additions /
1485	PACK (`'C'`, `'h'`, `'r'`, `'s'`), / G_UNICODE_SCRIPT_CHORASMIAN /
1486	PACK (`'D'`, `'i'`, `'a'`, `'k'`), / G_UNICODE_SCRIPT_DIVES_AKURU /
1487	PACK (`'K'`, `'i'`, `'t'`, `'s'`), / G_UNICODE_SCRIPT_KHITAN_SMALL_SCRIPT /
1488	PACK (`'Y'`, `'e'`, `'z'`, `'i'`), / G_UNICODE_SCRIPT_YEZIDI /
1489	#undef PACK
1490	};
1491
1492	/**
1493	* g_unicode_script_to_iso15924:
1494	* @script: a Unicode script
1495	*
1496	* Looks up the ISO 15924 code for @script. ISO 15924 assigns four-letter
1497	* codes to scripts. For example, the code for Arabic is 'Arab'. The
1498	* four letter codes are encoded as a @guint32 by this function in a
1499	* big-endian fashion. That is, the code returned for Arabic is
1500	* 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
1501	*
1502	* See
1503	* [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
1504	* for details.
1505	*
1506	* Returns: the ISO 15924 code for @script, encoded as an integer,
1507	* of zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE or
1508	* ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is not understood.
1509	*
1510	* Since: 2.30
1511	*/
1512	guint32
1513	g_unicode_script_to_iso15924 (GUnicodeScript script)
1514	{
1515	if (G_UNLIKELY (script == G_UNICODE_SCRIPT_INVALID_CODE))
1516	return `0`;
1517
1518	if (G_UNLIKELY (script < `0` \|\| script >= (int) G_N_ELEMENTS (iso15924_tags)))
1519	return `0x5A7A7A7A`;
1520
1521	return iso15924_tags[script];
1522	}
1523
1524	/**
1525	* g_unicode_script_from_iso15924:
1526	* @iso15924: a Unicode script
1527	*
1528	* Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter
1529	* codes to scripts. For example, the code for Arabic is 'Arab'.
1530	* This function accepts four letter codes encoded as a @guint32 in a
1531	* big-endian fashion. That is, the code expected for Arabic is
1532	* 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc).
1533	*
1534	* See
1535	* [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html)
1536	* for details.
1537	*
1538	* Returns: the Unicode script for @iso15924, or
1539	* of %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and
1540	* %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown.
1541	*
1542	* Since: 2.30
1543	*/
1544	GUnicodeScript
1545	g_unicode_script_from_iso15924 (guint32 iso15924)
1546	{
1547	unsigned int i;
1548
1549	if (!iso15924)
1550	return G_UNICODE_SCRIPT_INVALID_CODE;
1551
1552	for (i = `0`; i < G_N_ELEMENTS (iso15924_tags); i++)
1553	if (iso15924_tags[i] == iso15924)
1554	return (GUnicodeScript) i;
1555
1556	return G_UNICODE_SCRIPT_UNKNOWN;
1557	}
1558

source code of gtk/subprojects/glib/glib/guniprop.c