1 | /* guniprop.c - Unicode character properties. |
2 | * |
3 | * Copyright (C) 1999 Tom Tromey |
4 | * Copyright (C) 2000 Red Hat, Inc. |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2.1 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public |
17 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | */ |
19 | |
20 | #include "config.h" |
21 | |
22 | #include <stdlib.h> |
23 | #include <stddef.h> |
24 | #include <string.h> |
25 | #include <locale.h> |
26 | |
27 | #include "gmem.h" |
28 | #include "gstring.h" |
29 | #include "gtestutils.h" |
30 | #include "gtypes.h" |
31 | #include "gunicode.h" |
32 | #include "gunichartables.h" |
33 | #include "gmirroringtable.h" |
34 | #include "gscripttable.h" |
35 | #include "gunicodeprivate.h" |
36 | #ifdef G_OS_WIN32 |
37 | #include "gwin32.h" |
38 | #endif |
39 | |
/* Fullwidth Latin letters referenced by the hex-digit helpers and by the
 * locale-specific I/J lowercasing rules further down in this file. */
#define G_UNICHAR_FULLWIDTH_A 0xff21
#define G_UNICHAR_FULLWIDTH_F 0xff26
#define G_UNICHAR_FULLWIDTH_I 0xff29
#define G_UNICHAR_FULLWIDTH_J 0xff2a
#define G_UNICHAR_FULLWIDTH_a 0xff41
#define G_UNICHAR_FULLWIDTH_f 0xff46

/* Attribute-table entry for a 256-character page.  Pages up to
 * G_UNICODE_LAST_PAGE_PART1 are looked up in attr_table_part1; higher pages
 * are looked up in attr_table_part2 after rebasing the page by 0xe00. */
#define ATTR_TABLE(Page) \
  (((Page) <= G_UNICODE_LAST_PAGE_PART1) \
   ? attr_table_part1[Page] \
   : attr_table_part2[(Page) - 0xe00])

/* Attribute value of character Char within Page; evaluates to 0 when the
 * page entry is G_UNICODE_MAX_TABLE_INDEX. */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE(Page) == G_UNICODE_MAX_TABLE_INDEX) \
   ? 0 \
   : (attr_data[ATTR_TABLE(Page)][Char]))

/* Type lookup in the first table.  An entry at or above
 * G_UNICODE_MAX_TABLE_INDEX yields (entry - G_UNICODE_MAX_TABLE_INDEX) for
 * every character of the page; any other entry indexes type_data. */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part1[Page]][Char]))

/* Same lookup as TTYPE_PART1, but against the second table. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (type_data[type_table_part2[Page]][Char]))

/* General category of Char: the first table covers everything up to
 * G_UNICODE_LAST_CHAR_PART1, the second covers 0xe0000 up to
 * G_UNICODE_LAST_CHAR, and anything else is G_UNICODE_UNASSIGNED.
 * Char is evaluated more than once, so pass a side-effect-free expression. */
#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))


/* Bit-set helpers: IS tests whether the bit for Type is present in the mask
 * Class; OR adds the bit for Type to the mask Rest. */
#define IS(Type, Class) (((guint)1 << (Type)) & (Class))
#define OR(Type, Rest)  (((guint)1 << (Type)) | (Rest))



/* Non-zero when Type is one of the five letter categories. */
#define ISALPHA(Type) \
  IS ((Type), \
      OR (G_UNICODE_LOWERCASE_LETTER, \
      OR (G_UNICODE_UPPERCASE_LETTER, \
      OR (G_UNICODE_TITLECASE_LETTER, \
      OR (G_UNICODE_MODIFIER_LETTER, \
      OR (G_UNICODE_OTHER_LETTER, 0))))))

/* Non-zero when Type is a letter category or one of the three number
 * categories. */
#define ISALDIGIT(Type) \
  IS ((Type), \
      OR (G_UNICODE_LOWERCASE_LETTER, \
      OR (G_UNICODE_UPPERCASE_LETTER, \
      OR (G_UNICODE_TITLECASE_LETTER, \
      OR (G_UNICODE_MODIFIER_LETTER, \
      OR (G_UNICODE_OTHER_LETTER, \
      OR (G_UNICODE_DECIMAL_NUMBER, \
      OR (G_UNICODE_LETTER_NUMBER, \
      OR (G_UNICODE_OTHER_NUMBER, 0)))))))))

/* Non-zero when Type is a non-spacing, spacing or enclosing mark. */
#define ISMARK(Type) \
  IS ((Type), \
      OR (G_UNICODE_NON_SPACING_MARK, \
      OR (G_UNICODE_SPACING_MARK, \
      OR (G_UNICODE_ENCLOSING_MARK, 0))))

/* Non-zero when Type is a non-spacing mark, an enclosing mark or a format
 * character. */
#define ISZEROWIDTHTYPE(Type) \
  IS ((Type), \
      OR (G_UNICODE_NON_SPACING_MARK, \
      OR (G_UNICODE_ENCLOSING_MARK, \
      OR (G_UNICODE_FORMAT, 0))))
103 | |
104 | /** |
105 | * g_unichar_isalnum: |
106 | * @c: a Unicode character |
107 | * |
108 | * Determines whether a character is alphanumeric. |
109 | * Given some UTF-8 text, obtain a character value |
110 | * with g_utf8_get_char(). |
111 | * |
112 | * Returns: %TRUE if @c is an alphanumeric character |
113 | **/ |
114 | gboolean |
115 | g_unichar_isalnum (gunichar c) |
116 | { |
117 | return ISALDIGIT (TYPE (c)) ? TRUE : FALSE; |
118 | } |
119 | |
120 | /** |
121 | * g_unichar_isalpha: |
122 | * @c: a Unicode character |
123 | * |
124 | * Determines whether a character is alphabetic (i.e. a letter). |
125 | * Given some UTF-8 text, obtain a character value with |
126 | * g_utf8_get_char(). |
127 | * |
128 | * Returns: %TRUE if @c is an alphabetic character |
129 | **/ |
130 | gboolean |
131 | g_unichar_isalpha (gunichar c) |
132 | { |
133 | return ISALPHA (TYPE (c)) ? TRUE : FALSE; |
134 | } |
135 | |
136 | |
137 | /** |
138 | * g_unichar_iscntrl: |
139 | * @c: a Unicode character |
140 | * |
141 | * Determines whether a character is a control character. |
142 | * Given some UTF-8 text, obtain a character value with |
143 | * g_utf8_get_char(). |
144 | * |
145 | * Returns: %TRUE if @c is a control character |
146 | **/ |
147 | gboolean |
148 | g_unichar_iscntrl (gunichar c) |
149 | { |
150 | return TYPE (c) == G_UNICODE_CONTROL; |
151 | } |
152 | |
153 | /** |
154 | * g_unichar_isdigit: |
155 | * @c: a Unicode character |
156 | * |
157 | * Determines whether a character is numeric (i.e. a digit). This |
158 | * covers ASCII 0-9 and also digits in other languages/scripts. Given |
159 | * some UTF-8 text, obtain a character value with g_utf8_get_char(). |
160 | * |
161 | * Returns: %TRUE if @c is a digit |
162 | **/ |
163 | gboolean |
164 | g_unichar_isdigit (gunichar c) |
165 | { |
166 | return TYPE (c) == G_UNICODE_DECIMAL_NUMBER; |
167 | } |
168 | |
169 | |
170 | /** |
171 | * g_unichar_isgraph: |
172 | * @c: a Unicode character |
173 | * |
174 | * Determines whether a character is printable and not a space |
175 | * (returns %FALSE for control characters, format characters, and |
176 | * spaces). g_unichar_isprint() is similar, but returns %TRUE for |
177 | * spaces. Given some UTF-8 text, obtain a character value with |
178 | * g_utf8_get_char(). |
179 | * |
180 | * Returns: %TRUE if @c is printable unless it's a space |
181 | **/ |
182 | gboolean |
183 | g_unichar_isgraph (gunichar c) |
184 | { |
185 | return !IS (TYPE(c), |
186 | OR (G_UNICODE_CONTROL, |
187 | OR (G_UNICODE_FORMAT, |
188 | OR (G_UNICODE_UNASSIGNED, |
189 | OR (G_UNICODE_SURROGATE, |
190 | OR (G_UNICODE_SPACE_SEPARATOR, |
191 | 0)))))); |
192 | } |
193 | |
194 | /** |
195 | * g_unichar_islower: |
196 | * @c: a Unicode character |
197 | * |
198 | * Determines whether a character is a lowercase letter. |
199 | * Given some UTF-8 text, obtain a character value with |
200 | * g_utf8_get_char(). |
201 | * |
202 | * Returns: %TRUE if @c is a lowercase letter |
203 | **/ |
204 | gboolean |
205 | g_unichar_islower (gunichar c) |
206 | { |
207 | return TYPE (c) == G_UNICODE_LOWERCASE_LETTER; |
208 | } |
209 | |
210 | |
211 | /** |
212 | * g_unichar_isprint: |
213 | * @c: a Unicode character |
214 | * |
215 | * Determines whether a character is printable. |
216 | * Unlike g_unichar_isgraph(), returns %TRUE for spaces. |
217 | * Given some UTF-8 text, obtain a character value with |
218 | * g_utf8_get_char(). |
219 | * |
220 | * Returns: %TRUE if @c is printable |
221 | **/ |
222 | gboolean |
223 | g_unichar_isprint (gunichar c) |
224 | { |
225 | return !IS (TYPE(c), |
226 | OR (G_UNICODE_CONTROL, |
227 | OR (G_UNICODE_FORMAT, |
228 | OR (G_UNICODE_UNASSIGNED, |
229 | OR (G_UNICODE_SURROGATE, |
230 | 0))))); |
231 | } |
232 | |
233 | /** |
234 | * g_unichar_ispunct: |
235 | * @c: a Unicode character |
236 | * |
237 | * Determines whether a character is punctuation or a symbol. |
238 | * Given some UTF-8 text, obtain a character value with |
239 | * g_utf8_get_char(). |
240 | * |
241 | * Returns: %TRUE if @c is a punctuation or symbol character |
242 | **/ |
243 | gboolean |
244 | g_unichar_ispunct (gunichar c) |
245 | { |
246 | return IS (TYPE(c), |
247 | OR (G_UNICODE_CONNECT_PUNCTUATION, |
248 | OR (G_UNICODE_DASH_PUNCTUATION, |
249 | OR (G_UNICODE_CLOSE_PUNCTUATION, |
250 | OR (G_UNICODE_FINAL_PUNCTUATION, |
251 | OR (G_UNICODE_INITIAL_PUNCTUATION, |
252 | OR (G_UNICODE_OTHER_PUNCTUATION, |
253 | OR (G_UNICODE_OPEN_PUNCTUATION, |
254 | OR (G_UNICODE_CURRENCY_SYMBOL, |
255 | OR (G_UNICODE_MODIFIER_SYMBOL, |
256 | OR (G_UNICODE_MATH_SYMBOL, |
257 | OR (G_UNICODE_OTHER_SYMBOL, |
258 | 0)))))))))))) ? TRUE : FALSE; |
259 | } |
260 | |
261 | /** |
262 | * g_unichar_isspace: |
263 | * @c: a Unicode character |
264 | * |
265 | * Determines whether a character is a space, tab, or line separator |
266 | * (newline, carriage return, etc.). Given some UTF-8 text, obtain a |
267 | * character value with g_utf8_get_char(). |
268 | * |
269 | * (Note: don't use this to do word breaking; you have to use |
270 | * Pango or equivalent to get word breaking right, the algorithm |
271 | * is fairly complex.) |
272 | * |
273 | * Returns: %TRUE if @c is a space character |
274 | **/ |
275 | gboolean |
276 | g_unichar_isspace (gunichar c) |
277 | { |
278 | switch (c) |
279 | { |
280 | /* special-case these since Unicode thinks they are not spaces */ |
281 | case '\t': |
282 | case '\n': |
283 | case '\r': |
284 | case '\f': |
285 | return TRUE; |
286 | break; |
287 | |
288 | default: |
289 | { |
290 | return IS (TYPE(c), |
291 | OR (G_UNICODE_SPACE_SEPARATOR, |
292 | OR (G_UNICODE_LINE_SEPARATOR, |
293 | OR (G_UNICODE_PARAGRAPH_SEPARATOR, |
294 | 0)))) ? TRUE : FALSE; |
295 | } |
296 | break; |
297 | } |
298 | } |
299 | |
300 | /** |
301 | * g_unichar_ismark: |
302 | * @c: a Unicode character |
303 | * |
304 | * Determines whether a character is a mark (non-spacing mark, |
305 | * combining mark, or enclosing mark in Unicode speak). |
306 | * Given some UTF-8 text, obtain a character value |
307 | * with g_utf8_get_char(). |
308 | * |
309 | * Note: in most cases where isalpha characters are allowed, |
310 | * ismark characters should be allowed to as they are essential |
311 | * for writing most European languages as well as many non-Latin |
312 | * scripts. |
313 | * |
314 | * Returns: %TRUE if @c is a mark character |
315 | * |
316 | * Since: 2.14 |
317 | **/ |
318 | gboolean |
319 | g_unichar_ismark (gunichar c) |
320 | { |
321 | return ISMARK (TYPE (c)); |
322 | } |
323 | |
324 | /** |
325 | * g_unichar_isupper: |
326 | * @c: a Unicode character |
327 | * |
328 | * Determines if a character is uppercase. |
329 | * |
330 | * Returns: %TRUE if @c is an uppercase character |
331 | **/ |
332 | gboolean |
333 | g_unichar_isupper (gunichar c) |
334 | { |
335 | return TYPE (c) == G_UNICODE_UPPERCASE_LETTER; |
336 | } |
337 | |
338 | /** |
339 | * g_unichar_istitle: |
340 | * @c: a Unicode character |
341 | * |
342 | * Determines if a character is titlecase. Some characters in |
343 | * Unicode which are composites, such as the DZ digraph |
344 | * have three case variants instead of just two. The titlecase |
345 | * form is used at the beginning of a word where only the |
346 | * first letter is capitalized. The titlecase form of the DZ |
347 | * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z. |
348 | * |
349 | * Returns: %TRUE if the character is titlecase |
350 | **/ |
351 | gboolean |
352 | g_unichar_istitle (gunichar c) |
353 | { |
354 | unsigned int i; |
355 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
356 | if (title_table[i][0] == c) |
357 | return TRUE; |
358 | return FALSE; |
359 | } |
360 | |
361 | /** |
362 | * g_unichar_isxdigit: |
363 | * @c: a Unicode character. |
364 | * |
365 | * Determines if a character is a hexadecimal digit. |
366 | * |
367 | * Returns: %TRUE if the character is a hexadecimal digit |
368 | **/ |
369 | gboolean |
370 | g_unichar_isxdigit (gunichar c) |
371 | { |
372 | return ((c >= 'a' && c <= 'f') || |
373 | (c >= 'A' && c <= 'F') || |
374 | (c >= G_UNICHAR_FULLWIDTH_a && c <= G_UNICHAR_FULLWIDTH_f) || |
375 | (c >= G_UNICHAR_FULLWIDTH_A && c <= G_UNICHAR_FULLWIDTH_F) || |
376 | (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)); |
377 | } |
378 | |
379 | /** |
380 | * g_unichar_isdefined: |
381 | * @c: a Unicode character |
382 | * |
383 | * Determines if a given character is assigned in the Unicode |
384 | * standard. |
385 | * |
386 | * Returns: %TRUE if the character has an assigned value |
387 | **/ |
388 | gboolean |
389 | g_unichar_isdefined (gunichar c) |
390 | { |
391 | return !IS (TYPE(c), |
392 | OR (G_UNICODE_UNASSIGNED, |
393 | OR (G_UNICODE_SURROGATE, |
394 | 0))); |
395 | } |
396 | |
397 | /** |
398 | * g_unichar_iszerowidth: |
399 | * @c: a Unicode character |
400 | * |
401 | * Determines if a given character typically takes zero width when rendered. |
402 | * The return value is %TRUE for all non-spacing and enclosing marks |
403 | * (e.g., combining accents), format characters, zero-width |
404 | * space, but not U+00AD SOFT HYPHEN. |
405 | * |
406 | * A typical use of this function is with one of g_unichar_iswide() or |
407 | * g_unichar_iswide_cjk() to determine the number of cells a string occupies |
408 | * when displayed on a grid display (terminals). However, note that not all |
409 | * terminals support zero-width rendering of zero-width marks. |
410 | * |
411 | * Returns: %TRUE if the character has zero width |
412 | * |
413 | * Since: 2.14 |
414 | **/ |
415 | gboolean |
416 | g_unichar_iszerowidth (gunichar c) |
417 | { |
418 | if (G_UNLIKELY (c == 0x00AD)) |
419 | return FALSE; |
420 | |
421 | if (G_UNLIKELY (ISZEROWIDTHTYPE (TYPE (c)))) |
422 | return TRUE; |
423 | |
424 | if (G_UNLIKELY ((c >= 0x1160 && c < 0x1200) || |
425 | c == 0x200B)) |
426 | return TRUE; |
427 | |
428 | return FALSE; |
429 | } |
430 | |
431 | static int |
432 | interval_compare (const void *key, const void *elt) |
433 | { |
434 | gunichar c = GPOINTER_TO_UINT (key); |
435 | struct Interval *interval = (struct Interval *)elt; |
436 | |
437 | if (c < interval->start) |
438 | return -1; |
439 | if (c > interval->end) |
440 | return +1; |
441 | |
442 | return 0; |
443 | } |
444 | |
445 | #define G_WIDTH_TABLE_MIDPOINT (G_N_ELEMENTS (g_unicode_width_table_wide) / 2) |
446 | |
447 | static inline gboolean |
448 | g_unichar_iswide_bsearch (gunichar ch) |
449 | { |
450 | int lower = 0; |
451 | int upper = G_N_ELEMENTS (g_unicode_width_table_wide) - 1; |
452 | static int saved_mid = G_WIDTH_TABLE_MIDPOINT; |
453 | int mid = saved_mid; |
454 | |
455 | do |
456 | { |
457 | if (ch < g_unicode_width_table_wide[mid].start) |
458 | upper = mid - 1; |
459 | else if (ch > g_unicode_width_table_wide[mid].end) |
460 | lower = mid + 1; |
461 | else |
462 | return TRUE; |
463 | |
464 | mid = (lower + upper) / 2; |
465 | } |
466 | while (lower <= upper); |
467 | |
468 | return FALSE; |
469 | } |
470 | |
471 | /** |
472 | * g_unichar_iswide: |
473 | * @c: a Unicode character |
474 | * |
475 | * Determines if a character is typically rendered in a double-width |
476 | * cell. |
477 | * |
478 | * Returns: %TRUE if the character is wide |
479 | **/ |
480 | gboolean |
481 | g_unichar_iswide (gunichar c) |
482 | { |
483 | if (c < g_unicode_width_table_wide[0].start) |
484 | return FALSE; |
485 | else |
486 | return g_unichar_iswide_bsearch (ch: c); |
487 | } |
488 | |
489 | |
490 | /** |
491 | * g_unichar_iswide_cjk: |
492 | * @c: a Unicode character |
493 | * |
494 | * Determines if a character is typically rendered in a double-width |
495 | * cell under legacy East Asian locales. If a character is wide according to |
496 | * g_unichar_iswide(), then it is also reported wide with this function, but |
497 | * the converse is not necessarily true. See the |
498 | * [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/) |
499 | * for details. |
500 | * |
501 | * If a character passes the g_unichar_iswide() test then it will also pass |
502 | * this test, but not the other way around. Note that some characters may |
503 | * pass both this test and g_unichar_iszerowidth(). |
504 | * |
505 | * Returns: %TRUE if the character is wide in legacy East Asian locales |
506 | * |
507 | * Since: 2.12 |
508 | */ |
509 | gboolean |
510 | g_unichar_iswide_cjk (gunichar c) |
511 | { |
512 | if (g_unichar_iswide (c)) |
513 | return TRUE; |
514 | |
515 | /* bsearch() is declared attribute(nonnull(1)) so we can't validly search |
516 | * for a NULL key */ |
517 | if (c == 0) |
518 | return FALSE; |
519 | |
520 | if (bsearch (GUINT_TO_POINTER (c), |
521 | base: g_unicode_width_table_ambiguous, |
522 | G_N_ELEMENTS (g_unicode_width_table_ambiguous), |
523 | size: sizeof g_unicode_width_table_ambiguous[0], |
524 | compar: interval_compare)) |
525 | return TRUE; |
526 | |
527 | return FALSE; |
528 | } |
529 | |
530 | |
531 | /** |
532 | * g_unichar_toupper: |
533 | * @c: a Unicode character |
534 | * |
535 | * Converts a character to uppercase. |
536 | * |
537 | * Returns: the result of converting @c to uppercase. |
538 | * If @c is not a lowercase or titlecase character, |
539 | * or has no upper case equivalent @c is returned unchanged. |
540 | **/ |
541 | gunichar |
542 | g_unichar_toupper (gunichar c) |
543 | { |
544 | int t = TYPE (c); |
545 | if (t == G_UNICODE_LOWERCASE_LETTER) |
546 | { |
547 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
548 | if (val >= 0x1000000) |
549 | { |
550 | const gchar *p = special_case_table + val - 0x1000000; |
551 | val = g_utf8_get_char (p); |
552 | } |
553 | /* Some lowercase letters, e.g., U+000AA, FEMININE ORDINAL INDICATOR, |
554 | * do not have an uppercase equivalent, in which case val will be |
555 | * zero. |
556 | */ |
557 | return val ? val : c; |
558 | } |
559 | else if (t == G_UNICODE_TITLECASE_LETTER) |
560 | { |
561 | unsigned int i; |
562 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
563 | { |
564 | if (title_table[i][0] == c) |
565 | return title_table[i][1] ? title_table[i][1] : c; |
566 | } |
567 | } |
568 | return c; |
569 | } |
570 | |
571 | /** |
572 | * g_unichar_tolower: |
573 | * @c: a Unicode character. |
574 | * |
575 | * Converts a character to lower case. |
576 | * |
577 | * Returns: the result of converting @c to lower case. |
578 | * If @c is not an upperlower or titlecase character, |
579 | * or has no lowercase equivalent @c is returned unchanged. |
580 | **/ |
581 | gunichar |
582 | g_unichar_tolower (gunichar c) |
583 | { |
584 | int t = TYPE (c); |
585 | if (t == G_UNICODE_UPPERCASE_LETTER) |
586 | { |
587 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
588 | if (val >= 0x1000000) |
589 | { |
590 | const gchar *p = special_case_table + val - 0x1000000; |
591 | return g_utf8_get_char (p); |
592 | } |
593 | else |
594 | { |
595 | /* Not all uppercase letters are guaranteed to have a lowercase |
596 | * equivalent. If this is the case, val will be zero. */ |
597 | return val ? val : c; |
598 | } |
599 | } |
600 | else if (t == G_UNICODE_TITLECASE_LETTER) |
601 | { |
602 | unsigned int i; |
603 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
604 | { |
605 | if (title_table[i][0] == c) |
606 | return title_table[i][2]; |
607 | } |
608 | } |
609 | return c; |
610 | } |
611 | |
612 | /** |
613 | * g_unichar_totitle: |
614 | * @c: a Unicode character |
615 | * |
616 | * Converts a character to the titlecase. |
617 | * |
618 | * Returns: the result of converting @c to titlecase. |
619 | * If @c is not an uppercase or lowercase character, |
620 | * @c is returned unchanged. |
621 | **/ |
622 | gunichar |
623 | g_unichar_totitle (gunichar c) |
624 | { |
625 | unsigned int i; |
626 | |
627 | /* We handle U+0000 explicitly because some elements in |
628 | * title_table[i][1] may be null. */ |
629 | if (c == 0) |
630 | return c; |
631 | |
632 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
633 | { |
634 | if (title_table[i][0] == c || title_table[i][1] == c |
635 | || title_table[i][2] == c) |
636 | return title_table[i][0]; |
637 | } |
638 | |
639 | if (TYPE (c) == G_UNICODE_LOWERCASE_LETTER) |
640 | return g_unichar_toupper (c); |
641 | |
642 | return c; |
643 | } |
644 | |
645 | /** |
646 | * g_unichar_digit_value: |
647 | * @c: a Unicode character |
648 | * |
649 | * Determines the numeric value of a character as a decimal |
650 | * digit. |
651 | * |
652 | * Returns: If @c is a decimal digit (according to |
653 | * g_unichar_isdigit()), its numeric value. Otherwise, -1. |
654 | **/ |
655 | int |
656 | g_unichar_digit_value (gunichar c) |
657 | { |
658 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
659 | return ATTTABLE (c >> 8, c & 0xff); |
660 | return -1; |
661 | } |
662 | |
663 | /** |
664 | * g_unichar_xdigit_value: |
665 | * @c: a Unicode character |
666 | * |
667 | * Determines the numeric value of a character as a hexadecimal |
668 | * digit. |
669 | * |
670 | * Returns: If @c is a hex digit (according to |
671 | * g_unichar_isxdigit()), its numeric value. Otherwise, -1. |
672 | **/ |
673 | int |
674 | g_unichar_xdigit_value (gunichar c) |
675 | { |
676 | if (c >= 'A' && c <= 'F') |
677 | return c - 'A' + 10; |
678 | if (c >= 'a' && c <= 'f') |
679 | return c - 'a' + 10; |
680 | if (c >= G_UNICHAR_FULLWIDTH_A && c <= G_UNICHAR_FULLWIDTH_F) |
681 | return c - G_UNICHAR_FULLWIDTH_A + 10; |
682 | if (c >= G_UNICHAR_FULLWIDTH_a && c <= G_UNICHAR_FULLWIDTH_f) |
683 | return c - G_UNICHAR_FULLWIDTH_a + 10; |
684 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
685 | return ATTTABLE (c >> 8, c & 0xff); |
686 | return -1; |
687 | } |
688 | |
689 | /** |
690 | * g_unichar_type: |
691 | * @c: a Unicode character |
692 | * |
693 | * Classifies a Unicode character by type. |
694 | * |
695 | * Returns: the type of the character. |
696 | **/ |
697 | GUnicodeType |
698 | g_unichar_type (gunichar c) |
699 | { |
700 | return TYPE (c); |
701 | } |
702 | |
703 | /* |
704 | * Case mapping functions |
705 | */ |
706 | |
/* Locale families that need special treatment in the string case-mapping
 * code; see get_locale_type() for how the current locale is classified. */
typedef enum
{
  LOCALE_NORMAL,     /* no locale-specific case rules */
  LOCALE_TURKIC,     /* selected for "az" and "tr" locales */
  LOCALE_LITHUANIAN  /* selected for "lt" locales */
} LocaleType;
712 | |
713 | static LocaleType |
714 | get_locale_type (void) |
715 | { |
716 | #ifdef G_OS_WIN32 |
717 | char *tem = g_win32_getlocale (); |
718 | char locale[2]; |
719 | |
720 | locale[0] = tem[0]; |
721 | locale[1] = tem[1]; |
722 | g_free (tem); |
723 | #else |
724 | const char *locale = setlocale (LC_CTYPE, NULL); |
725 | |
726 | if (locale == NULL) |
727 | return LOCALE_NORMAL; |
728 | #endif |
729 | |
730 | switch (locale[0]) |
731 | { |
732 | case 'a': |
733 | if (locale[1] == 'z') |
734 | return LOCALE_TURKIC; |
735 | break; |
736 | case 'l': |
737 | if (locale[1] == 't') |
738 | return LOCALE_LITHUANIAN; |
739 | break; |
740 | case 't': |
741 | if (locale[1] == 'r') |
742 | return LOCALE_TURKIC; |
743 | break; |
744 | } |
745 | |
746 | return LOCALE_NORMAL; |
747 | } |
748 | |
749 | static gint |
750 | output_marks (const char **p_inout, |
751 | char *out_buffer, |
752 | gboolean remove_dot) |
753 | { |
754 | const char *p = *p_inout; |
755 | gint len = 0; |
756 | |
757 | while (*p) |
758 | { |
759 | gunichar c = g_utf8_get_char (p); |
760 | |
761 | if (ISMARK (TYPE (c))) |
762 | { |
763 | if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */) |
764 | len += g_unichar_to_utf8 (c, outbuf: out_buffer ? out_buffer + len : NULL); |
765 | p = g_utf8_next_char (p); |
766 | } |
767 | else |
768 | break; |
769 | } |
770 | |
771 | *p_inout = p; |
772 | return len; |
773 | } |
774 | |
775 | static gint |
776 | output_special_case (gchar *out_buffer, |
777 | int offset, |
778 | int type, |
779 | int which) |
780 | { |
781 | const gchar *p = special_case_table + offset; |
782 | gint len; |
783 | |
784 | if (type != G_UNICODE_TITLECASE_LETTER) |
785 | p = g_utf8_next_char (p); |
786 | |
787 | if (which == 1) |
788 | p += strlen (s: p) + 1; |
789 | |
790 | len = strlen (s: p); |
791 | if (out_buffer) |
792 | memcpy (dest: out_buffer, src: p, n: len); |
793 | |
794 | return len; |
795 | } |
796 | |
797 | static gsize |
798 | real_toupper (const gchar *str, |
799 | gssize max_len, |
800 | gchar *out_buffer, |
801 | LocaleType locale_type) |
802 | { |
803 | const gchar *p = str; |
804 | const char *last = NULL; |
805 | gsize len = 0; |
806 | gboolean last_was_i = FALSE; |
807 | |
808 | while ((max_len < 0 || p < str + max_len) && *p) |
809 | { |
810 | gunichar c = g_utf8_get_char (p); |
811 | int t = TYPE (c); |
812 | gunichar val; |
813 | |
814 | last = p; |
815 | p = g_utf8_next_char (p); |
816 | |
817 | if (locale_type == LOCALE_LITHUANIAN) |
818 | { |
819 | if (c == 'i') |
820 | last_was_i = TRUE; |
821 | else |
822 | { |
823 | if (last_was_i) |
824 | { |
825 | /* Nasty, need to remove any dot above. Though |
826 | * I think only E WITH DOT ABOVE occurs in practice |
827 | * which could simplify this considerably. |
828 | */ |
829 | gsize decomp_len, i; |
830 | gunichar decomp[G_UNICHAR_MAX_DECOMPOSITION_LENGTH]; |
831 | |
832 | decomp_len = g_unichar_fully_decompose (ch: c, FALSE, result: decomp, G_N_ELEMENTS (decomp)); |
833 | for (i=0; i < decomp_len; i++) |
834 | { |
835 | if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */) |
836 | len += g_unichar_to_utf8 (c: g_unichar_toupper (c: decomp[i]), outbuf: out_buffer ? out_buffer + len : NULL); |
837 | } |
838 | |
839 | len += output_marks (p_inout: &p, out_buffer: out_buffer ? out_buffer + len : NULL, TRUE); |
840 | |
841 | continue; |
842 | } |
843 | |
844 | if (!ISMARK (t)) |
845 | last_was_i = FALSE; |
846 | } |
847 | } |
848 | |
849 | if (locale_type == LOCALE_TURKIC && c == 'i') |
850 | { |
851 | /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
852 | len += g_unichar_to_utf8 (c: 0x130, outbuf: out_buffer ? out_buffer + len : NULL); |
853 | } |
854 | else if (c == 0x0345) /* COMBINING GREEK YPOGEGRAMMENI */ |
855 | { |
856 | /* Nasty, need to move it after other combining marks .. this would go away if |
857 | * we normalized first. |
858 | */ |
859 | len += output_marks (p_inout: &p, out_buffer: out_buffer ? out_buffer + len : NULL, FALSE); |
860 | |
861 | /* And output as GREEK CAPITAL LETTER IOTA */ |
862 | len += g_unichar_to_utf8 (c: 0x399, outbuf: out_buffer ? out_buffer + len : NULL); |
863 | } |
864 | else if (IS (t, |
865 | OR (G_UNICODE_LOWERCASE_LETTER, |
866 | OR (G_UNICODE_TITLECASE_LETTER, |
867 | 0)))) |
868 | { |
869 | val = ATTTABLE (c >> 8, c & 0xff); |
870 | |
871 | if (val >= 0x1000000) |
872 | { |
873 | len += output_special_case (out_buffer: out_buffer ? out_buffer + len : NULL, offset: val - 0x1000000, type: t, |
874 | which: t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1); |
875 | } |
876 | else |
877 | { |
878 | if (t == G_UNICODE_TITLECASE_LETTER) |
879 | { |
880 | unsigned int i; |
881 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
882 | { |
883 | if (title_table[i][0] == c) |
884 | { |
885 | val = title_table[i][1]; |
886 | break; |
887 | } |
888 | } |
889 | } |
890 | |
891 | /* Some lowercase letters, e.g., U+000AA, FEMININE ORDINAL INDICATOR, |
892 | * do not have an uppercase equivalent, in which case val will be |
893 | * zero. */ |
894 | len += g_unichar_to_utf8 (c: val ? val : c, outbuf: out_buffer ? out_buffer + len : NULL); |
895 | } |
896 | } |
897 | else |
898 | { |
899 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
900 | |
901 | if (out_buffer) |
902 | memcpy (dest: out_buffer + len, src: last, n: char_len); |
903 | |
904 | len += char_len; |
905 | } |
906 | |
907 | } |
908 | |
909 | return len; |
910 | } |
911 | |
912 | /** |
913 | * g_utf8_strup: |
914 | * @str: a UTF-8 encoded string |
915 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
916 | * |
917 | * Converts all Unicode characters in the string that have a case |
918 | * to uppercase. The exact manner that this is done depends |
919 | * on the current locale, and may result in the number of |
920 | * characters in the string increasing. (For instance, the |
921 | * German ess-zet will be changed to SS.) |
922 | * |
923 | * Returns: a newly allocated string, with all characters |
924 | * converted to uppercase. |
925 | **/ |
926 | gchar * |
927 | g_utf8_strup (const gchar *str, |
928 | gssize len) |
929 | { |
930 | gsize result_len; |
931 | LocaleType locale_type; |
932 | gchar *result; |
933 | |
934 | g_return_val_if_fail (str != NULL, NULL); |
935 | |
936 | locale_type = get_locale_type (); |
937 | |
938 | /* |
939 | * We use a two pass approach to keep memory management simple |
940 | */ |
941 | result_len = real_toupper (str, max_len: len, NULL, locale_type); |
942 | result = g_malloc (n_bytes: result_len + 1); |
943 | real_toupper (str, max_len: len, out_buffer: result, locale_type); |
944 | result[result_len] = '\0'; |
945 | |
946 | return result; |
947 | } |
948 | |
949 | /* traverses the string checking for characters with combining class == 230 |
950 | * until a base character is found */ |
951 | static gboolean |
952 | has_more_above (const gchar *str) |
953 | { |
954 | const gchar *p = str; |
955 | gint combining_class; |
956 | |
957 | while (*p) |
958 | { |
959 | combining_class = g_unichar_combining_class (uc: g_utf8_get_char (p)); |
960 | if (combining_class == 230) |
961 | return TRUE; |
962 | else if (combining_class == 0) |
963 | break; |
964 | |
965 | p = g_utf8_next_char (p); |
966 | } |
967 | |
968 | return FALSE; |
969 | } |
970 | |
971 | static gsize |
972 | real_tolower (const gchar *str, |
973 | gssize max_len, |
974 | gchar *out_buffer, |
975 | LocaleType locale_type) |
976 | { |
977 | const gchar *p = str; |
978 | const char *last = NULL; |
979 | gsize len = 0; |
980 | |
981 | while ((max_len < 0 || p < str + max_len) && *p) |
982 | { |
983 | gunichar c = g_utf8_get_char (p); |
984 | int t = TYPE (c); |
985 | gunichar val; |
986 | |
987 | last = p; |
988 | p = g_utf8_next_char (p); |
989 | |
990 | if (locale_type == LOCALE_TURKIC && (c == 'I' || c == 0x130 || |
991 | c == G_UNICHAR_FULLWIDTH_I)) |
992 | { |
993 | gboolean combining_dot = (c == 'I' || c == G_UNICHAR_FULLWIDTH_I) && |
994 | g_utf8_get_char (p) == 0x0307; |
995 | if (combining_dot || c == 0x130) |
996 | { |
997 | /* I + COMBINING DOT ABOVE => i (U+0069) |
998 | * LATIN CAPITAL LETTER I WITH DOT ABOVE => i (U+0069) */ |
999 | len += g_unichar_to_utf8 (c: 0x0069, outbuf: out_buffer ? out_buffer + len : NULL); |
1000 | if (combining_dot) |
1001 | p = g_utf8_next_char (p); |
1002 | } |
1003 | else |
1004 | { |
1005 | /* I => LATIN SMALL LETTER DOTLESS I */ |
1006 | len += g_unichar_to_utf8 (c: 0x131, outbuf: out_buffer ? out_buffer + len : NULL); |
1007 | } |
1008 | } |
1009 | /* Introduce an explicit dot above when lowercasing capital I's and J's |
1010 | * whenever there are more accents above. [SpecialCasing.txt] */ |
1011 | else if (locale_type == LOCALE_LITHUANIAN && |
1012 | (c == 0x00cc || c == 0x00cd || c == 0x0128)) |
1013 | { |
1014 | len += g_unichar_to_utf8 (c: 0x0069, outbuf: out_buffer ? out_buffer + len : NULL); |
1015 | len += g_unichar_to_utf8 (c: 0x0307, outbuf: out_buffer ? out_buffer + len : NULL); |
1016 | |
1017 | switch (c) |
1018 | { |
1019 | case 0x00cc: |
1020 | len += g_unichar_to_utf8 (c: 0x0300, outbuf: out_buffer ? out_buffer + len : NULL); |
1021 | break; |
1022 | case 0x00cd: |
1023 | len += g_unichar_to_utf8 (c: 0x0301, outbuf: out_buffer ? out_buffer + len : NULL); |
1024 | break; |
1025 | case 0x0128: |
1026 | len += g_unichar_to_utf8 (c: 0x0303, outbuf: out_buffer ? out_buffer + len : NULL); |
1027 | break; |
1028 | } |
1029 | } |
1030 | else if (locale_type == LOCALE_LITHUANIAN && |
1031 | (c == 'I' || c == G_UNICHAR_FULLWIDTH_I || |
1032 | c == 'J' || c == G_UNICHAR_FULLWIDTH_J || c == 0x012e) && |
1033 | has_more_above (str: p)) |
1034 | { |
1035 | len += g_unichar_to_utf8 (c: g_unichar_tolower (c), outbuf: out_buffer ? out_buffer + len : NULL); |
1036 | len += g_unichar_to_utf8 (c: 0x0307, outbuf: out_buffer ? out_buffer + len : NULL); |
1037 | } |
1038 | else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */ |
1039 | { |
1040 | if ((max_len < 0 || p < str + max_len) && *p) |
1041 | { |
1042 | gunichar next_c = g_utf8_get_char (p); |
1043 | int next_type = TYPE(next_c); |
1044 | |
1045 | /* SIGMA maps differently depending on whether it is
1046 | * final or not. The following simplified test would |
1047 | * fail in the case of combining marks following the |
1048 | * sigma, but I don't think that occurs in real text. |
1049 | * The test here matches that in ICU. |
1050 | */ |
1051 | if (ISALPHA (next_type)) /* Lu,Ll,Lt,Lm,Lo */ |
1052 | val = 0x3c3; /* GREEK SMALL SIGMA */ |
1053 | else |
1054 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
1055 | } |
1056 | else |
1057 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
1058 | |
1059 | len += g_unichar_to_utf8 (c: val, outbuf: out_buffer ? out_buffer + len : NULL); |
1060 | } |
1061 | else if (IS (t, |
1062 | OR (G_UNICODE_UPPERCASE_LETTER, |
1063 | OR (G_UNICODE_TITLECASE_LETTER, |
1064 | 0)))) |
1065 | { |
1066 | val = ATTTABLE (c >> 8, c & 0xff); |
1067 | |
1068 | if (val >= 0x1000000) |
1069 | { |
1070 | len += output_special_case (out_buffer: out_buffer ? out_buffer + len : NULL, offset: val - 0x1000000, type: t, which: 0); |
1071 | } |
1072 | else |
1073 | { |
1074 | if (t == G_UNICODE_TITLECASE_LETTER) |
1075 | { |
1076 | unsigned int i; |
1077 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
1078 | { |
1079 | if (title_table[i][0] == c) |
1080 | { |
1081 | val = title_table[i][2]; |
1082 | break; |
1083 | } |
1084 | } |
1085 | } |
1086 | |
1087 | /* Not all uppercase letters are guaranteed to have a lowercase |
1088 | * equivalent. If this is the case, val will be zero. */ |
1089 | len += g_unichar_to_utf8 (c: val ? val : c, outbuf: out_buffer ? out_buffer + len : NULL); |
1090 | } |
1091 | } |
1092 | else |
1093 | { |
1094 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
1095 | |
1096 | if (out_buffer) |
1097 | memcpy (dest: out_buffer + len, src: last, n: char_len); |
1098 | |
1099 | len += char_len; |
1100 | } |
1101 | |
1102 | } |
1103 | |
1104 | return len; |
1105 | } |
1106 | |
1107 | /** |
1108 | * g_utf8_strdown: |
1109 | * @str: a UTF-8 encoded string |
1110 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
1111 | * |
1112 | * Converts all Unicode characters in the string that have a case |
1113 | * to lowercase. The exact manner that this is done depends |
1114 | * on the current locale, and may result in the number of |
1115 | * characters in the string changing. |
1116 | * |
1117 | * Returns: a newly allocated string, with all characters |
1118 | * converted to lowercase. |
1119 | **/ |
1120 | gchar * |
1121 | g_utf8_strdown (const gchar *str, |
1122 | gssize len) |
1123 | { |
1124 | gsize result_len; |
1125 | LocaleType locale_type; |
1126 | gchar *result; |
1127 | |
1128 | g_return_val_if_fail (str != NULL, NULL); |
1129 | |
1130 | locale_type = get_locale_type (); |
1131 | |
1132 | /* |
1133 | * We use a two pass approach to keep memory management simple |
1134 | */ |
1135 | result_len = real_tolower (str, max_len: len, NULL, locale_type); |
1136 | result = g_malloc (n_bytes: result_len + 1); |
1137 | real_tolower (str, max_len: len, out_buffer: result, locale_type); |
1138 | result[result_len] = '\0'; |
1139 | |
1140 | return result; |
1141 | } |
1142 | |
1143 | /** |
1144 | * g_utf8_casefold: |
1145 | * @str: a UTF-8 encoded string |
1146 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
1147 | * |
1148 | * Converts a string into a form that is independent of case. The |
1149 | * result will not correspond to any particular case, but can be |
1150 | * compared for equality or ordered with the results of calling |
1151 | * g_utf8_casefold() on other strings. |
1152 | * |
1153 | * Note that calling g_utf8_casefold() followed by g_utf8_collate() is |
1154 | * only an approximation to the correct linguistic case insensitive |
1155 | * ordering, though it is a fairly good one. Getting this exactly |
1156 | * right would require a more sophisticated collation function that |
1157 | * takes case sensitivity into account. GLib does not currently |
1158 | * provide such a function. |
1159 | * |
1160 | * Returns: a newly allocated string, that is a |
1161 | * case independent form of @str. |
1162 | **/ |
1163 | gchar * |
1164 | g_utf8_casefold (const gchar *str, |
1165 | gssize len) |
1166 | { |
1167 | GString *result; |
1168 | const char *p; |
1169 | |
1170 | g_return_val_if_fail (str != NULL, NULL); |
1171 | |
1172 | result = g_string_new (NULL); |
1173 | p = str; |
1174 | while ((len < 0 || p < str + len) && *p) |
1175 | { |
1176 | gunichar ch = g_utf8_get_char (p); |
1177 | |
1178 | int start = 0; |
1179 | int end = G_N_ELEMENTS (casefold_table); |
1180 | |
1181 | if (ch >= casefold_table[start].ch && |
1182 | ch <= casefold_table[end - 1].ch) |
1183 | { |
1184 | while (TRUE) |
1185 | { |
1186 | int half = (start + end) / 2; |
1187 | if (ch == casefold_table[half].ch) |
1188 | { |
1189 | g_string_append (string: result, val: casefold_table[half].data); |
1190 | goto next; |
1191 | } |
1192 | else if (half == start) |
1193 | break; |
1194 | else if (ch > casefold_table[half].ch) |
1195 | start = half; |
1196 | else |
1197 | end = half; |
1198 | } |
1199 | } |
1200 | |
1201 | g_string_append_unichar (string: result, wc: g_unichar_tolower (c: ch)); |
1202 | |
1203 | next: |
1204 | p = g_utf8_next_char (p); |
1205 | } |
1206 | |
1207 | return g_string_free (string: result, FALSE); |
1208 | } |
1209 | |
1210 | /** |
1211 | * g_unichar_get_mirror_char: |
1212 | * @ch: a Unicode character |
1213 | * @mirrored_ch: location to store the mirrored character |
1214 | * |
1215 | * In Unicode, some characters are "mirrored". This means that their |
1216 | * images are mirrored horizontally in text that is laid out from right |
1217 | * to left. For instance, "(" would become its mirror image, ")", in |
1218 | * right-to-left text. |
1219 | * |
1220 | * If @ch has the Unicode mirrored property and there is another unicode |
1221 | * character that typically has a glyph that is the mirror image of @ch's |
1222 | * glyph and @mirrored_ch is set, it puts that character in the address |
1223 | * pointed to by @mirrored_ch. Otherwise the original character is put. |
1224 | * |
1225 | * Returns: %TRUE if @ch has a mirrored character, %FALSE otherwise |
1226 | * |
1227 | * Since: 2.4 |
1228 | **/ |
1229 | gboolean |
1230 | g_unichar_get_mirror_char (gunichar ch, |
1231 | gunichar *mirrored_ch) |
1232 | { |
1233 | gboolean found; |
1234 | gunichar mirrored; |
1235 | |
1236 | mirrored = GLIB_GET_MIRRORING(ch); |
1237 | |
1238 | found = ch != mirrored; |
1239 | if (mirrored_ch) |
1240 | *mirrored_ch = mirrored; |
1241 | |
1242 | return found; |
1243 | |
1244 | } |
1245 | |
1246 | #define G_SCRIPT_TABLE_MIDPOINT (G_N_ELEMENTS (g_script_table) / 2) |
1247 | |
1248 | static inline GUnicodeScript |
1249 | g_unichar_get_script_bsearch (gunichar ch) |
1250 | { |
1251 | int lower = 0; |
1252 | int upper = G_N_ELEMENTS (g_script_table) - 1; |
1253 | static int saved_mid = G_SCRIPT_TABLE_MIDPOINT; |
1254 | int mid = saved_mid; |
1255 | |
1256 | |
1257 | do |
1258 | { |
1259 | if (ch < g_script_table[mid].start) |
1260 | upper = mid - 1; |
1261 | else if (ch >= g_script_table[mid].start + g_script_table[mid].chars) |
1262 | lower = mid + 1; |
1263 | else |
1264 | return g_script_table[saved_mid = mid].script; |
1265 | |
1266 | mid = (lower + upper) / 2; |
1267 | } |
1268 | while (lower <= upper); |
1269 | |
1270 | return G_UNICODE_SCRIPT_UNKNOWN; |
1271 | } |
1272 | |
1273 | /** |
1274 | * g_unichar_get_script: |
1275 | * @ch: a Unicode character |
1276 | * |
1277 | * Looks up the #GUnicodeScript for a particular character (as defined |
1278 | * by Unicode Standard Annex \#24). No check is made for @ch being a |
1279 | * valid Unicode character; if you pass in invalid character, the |
1280 | * result is undefined. |
1281 | * |
1282 | * This function is equivalent to pango_script_for_unichar() and the |
1283 | * two are interchangeable. |
1284 | * |
1285 | * Returns: the #GUnicodeScript for the character. |
1286 | * |
1287 | * Since: 2.14 |
1288 | */ |
1289 | GUnicodeScript |
1290 | g_unichar_get_script (gunichar ch) |
1291 | { |
1292 | if (ch < G_EASY_SCRIPTS_RANGE) |
1293 | return g_script_easy_table[ch]; |
1294 | else |
1295 | return g_unichar_get_script_bsearch (ch); |
1296 | } |
1297 | |
1298 | |
1299 | /* http://unicode.org/iso15924/ */ |
1300 | static const guint32 iso15924_tags[] = |
1301 | { |
1302 | #define PACK(a,b,c,d) ((guint32)((((guint8)(a))<<24)|(((guint8)(b))<<16)|(((guint8)(c))<<8)|((guint8)(d)))) |
1303 | |
1304 | PACK ('Z','y','y','y'), /* G_UNICODE_SCRIPT_COMMON */ |
1305 | PACK ('Z','i','n','h'), /* G_UNICODE_SCRIPT_INHERITED */ |
1306 | PACK ('A','r','a','b'), /* G_UNICODE_SCRIPT_ARABIC */ |
1307 | PACK ('A','r','m','n'), /* G_UNICODE_SCRIPT_ARMENIAN */ |
1308 | PACK ('B','e','n','g'), /* G_UNICODE_SCRIPT_BENGALI */ |
1309 | PACK ('B','o','p','o'), /* G_UNICODE_SCRIPT_BOPOMOFO */ |
1310 | PACK ('C','h','e','r'), /* G_UNICODE_SCRIPT_CHEROKEE */ |
1311 | PACK ('C','o','p','t'), /* G_UNICODE_SCRIPT_COPTIC */ |
1312 | PACK ('C','y','r','l'), /* G_UNICODE_SCRIPT_CYRILLIC */ |
1313 | PACK ('D','s','r','t'), /* G_UNICODE_SCRIPT_DESERET */ |
1314 | PACK ('D','e','v','a'), /* G_UNICODE_SCRIPT_DEVANAGARI */ |
1315 | PACK ('E','t','h','i'), /* G_UNICODE_SCRIPT_ETHIOPIC */ |
1316 | PACK ('G','e','o','r'), /* G_UNICODE_SCRIPT_GEORGIAN */ |
1317 | PACK ('G','o','t','h'), /* G_UNICODE_SCRIPT_GOTHIC */ |
1318 | PACK ('G','r','e','k'), /* G_UNICODE_SCRIPT_GREEK */ |
1319 | PACK ('G','u','j','r'), /* G_UNICODE_SCRIPT_GUJARATI */ |
1320 | PACK ('G','u','r','u'), /* G_UNICODE_SCRIPT_GURMUKHI */ |
1321 | PACK ('H','a','n','i'), /* G_UNICODE_SCRIPT_HAN */ |
1322 | PACK ('H','a','n','g'), /* G_UNICODE_SCRIPT_HANGUL */ |
1323 | PACK ('H','e','b','r'), /* G_UNICODE_SCRIPT_HEBREW */ |
1324 | PACK ('H','i','r','a'), /* G_UNICODE_SCRIPT_HIRAGANA */ |
1325 | PACK ('K','n','d','a'), /* G_UNICODE_SCRIPT_KANNADA */ |
1326 | PACK ('K','a','n','a'), /* G_UNICODE_SCRIPT_KATAKANA */ |
1327 | PACK ('K','h','m','r'), /* G_UNICODE_SCRIPT_KHMER */ |
1328 | PACK ('L','a','o','o'), /* G_UNICODE_SCRIPT_LAO */ |
1329 | PACK ('L','a','t','n'), /* G_UNICODE_SCRIPT_LATIN */ |
1330 | PACK ('M','l','y','m'), /* G_UNICODE_SCRIPT_MALAYALAM */ |
1331 | PACK ('M','o','n','g'), /* G_UNICODE_SCRIPT_MONGOLIAN */ |
1332 | PACK ('M','y','m','r'), /* G_UNICODE_SCRIPT_MYANMAR */ |
1333 | PACK ('O','g','a','m'), /* G_UNICODE_SCRIPT_OGHAM */ |
1334 | PACK ('I','t','a','l'), /* G_UNICODE_SCRIPT_OLD_ITALIC */ |
1335 | PACK ('O','r','y','a'), /* G_UNICODE_SCRIPT_ORIYA */ |
1336 | PACK ('R','u','n','r'), /* G_UNICODE_SCRIPT_RUNIC */ |
1337 | PACK ('S','i','n','h'), /* G_UNICODE_SCRIPT_SINHALA */ |
1338 | PACK ('S','y','r','c'), /* G_UNICODE_SCRIPT_SYRIAC */ |
1339 | PACK ('T','a','m','l'), /* G_UNICODE_SCRIPT_TAMIL */ |
1340 | PACK ('T','e','l','u'), /* G_UNICODE_SCRIPT_TELUGU */ |
1341 | PACK ('T','h','a','a'), /* G_UNICODE_SCRIPT_THAANA */ |
1342 | PACK ('T','h','a','i'), /* G_UNICODE_SCRIPT_THAI */ |
1343 | PACK ('T','i','b','t'), /* G_UNICODE_SCRIPT_TIBETAN */ |
1344 | PACK ('C','a','n','s'), /* G_UNICODE_SCRIPT_CANADIAN_ABORIGINAL */ |
1345 | PACK ('Y','i','i','i'), /* G_UNICODE_SCRIPT_YI */ |
1346 | PACK ('T','g','l','g'), /* G_UNICODE_SCRIPT_TAGALOG */ |
1347 | PACK ('H','a','n','o'), /* G_UNICODE_SCRIPT_HANUNOO */ |
1348 | PACK ('B','u','h','d'), /* G_UNICODE_SCRIPT_BUHID */ |
1349 | PACK ('T','a','g','b'), /* G_UNICODE_SCRIPT_TAGBANWA */ |
1350 | |
1351 | /* Unicode-4.0 additions */ |
1352 | PACK ('B','r','a','i'), /* G_UNICODE_SCRIPT_BRAILLE */ |
1353 | PACK ('C','p','r','t'), /* G_UNICODE_SCRIPT_CYPRIOT */ |
1354 | PACK ('L','i','m','b'), /* G_UNICODE_SCRIPT_LIMBU */ |
1355 | PACK ('O','s','m','a'), /* G_UNICODE_SCRIPT_OSMANYA */ |
1356 | PACK ('S','h','a','w'), /* G_UNICODE_SCRIPT_SHAVIAN */ |
1357 | PACK ('L','i','n','b'), /* G_UNICODE_SCRIPT_LINEAR_B */ |
1358 | PACK ('T','a','l','e'), /* G_UNICODE_SCRIPT_TAI_LE */ |
1359 | PACK ('U','g','a','r'), /* G_UNICODE_SCRIPT_UGARITIC */ |
1360 | |
1361 | /* Unicode-4.1 additions */ |
1362 | PACK ('T','a','l','u'), /* G_UNICODE_SCRIPT_NEW_TAI_LUE */ |
1363 | PACK ('B','u','g','i'), /* G_UNICODE_SCRIPT_BUGINESE */ |
1364 | PACK ('G','l','a','g'), /* G_UNICODE_SCRIPT_GLAGOLITIC */ |
1365 | PACK ('T','f','n','g'), /* G_UNICODE_SCRIPT_TIFINAGH */ |
1366 | PACK ('S','y','l','o'), /* G_UNICODE_SCRIPT_SYLOTI_NAGRI */ |
1367 | PACK ('X','p','e','o'), /* G_UNICODE_SCRIPT_OLD_PERSIAN */ |
1368 | PACK ('K','h','a','r'), /* G_UNICODE_SCRIPT_KHAROSHTHI */ |
1369 | |
1370 | /* Unicode-5.0 additions */ |
1371 | PACK ('Z','z','z','z'), /* G_UNICODE_SCRIPT_UNKNOWN */ |
1372 | PACK ('B','a','l','i'), /* G_UNICODE_SCRIPT_BALINESE */ |
1373 | PACK ('X','s','u','x'), /* G_UNICODE_SCRIPT_CUNEIFORM */ |
1374 | PACK ('P','h','n','x'), /* G_UNICODE_SCRIPT_PHOENICIAN */ |
1375 | PACK ('P','h','a','g'), /* G_UNICODE_SCRIPT_PHAGS_PA */ |
1376 | PACK ('N','k','o','o'), /* G_UNICODE_SCRIPT_NKO */ |
1377 | |
1378 | /* Unicode-5.1 additions */ |
1379 | PACK ('K','a','l','i'), /* G_UNICODE_SCRIPT_KAYAH_LI */ |
1380 | PACK ('L','e','p','c'), /* G_UNICODE_SCRIPT_LEPCHA */ |
1381 | PACK ('R','j','n','g'), /* G_UNICODE_SCRIPT_REJANG */ |
1382 | PACK ('S','u','n','d'), /* G_UNICODE_SCRIPT_SUNDANESE */ |
1383 | PACK ('S','a','u','r'), /* G_UNICODE_SCRIPT_SAURASHTRA */ |
1384 | PACK ('C','h','a','m'), /* G_UNICODE_SCRIPT_CHAM */ |
1385 | PACK ('O','l','c','k'), /* G_UNICODE_SCRIPT_OL_CHIKI */ |
1386 | PACK ('V','a','i','i'), /* G_UNICODE_SCRIPT_VAI */ |
1387 | PACK ('C','a','r','i'), /* G_UNICODE_SCRIPT_CARIAN */ |
1388 | PACK ('L','y','c','i'), /* G_UNICODE_SCRIPT_LYCIAN */ |
1389 | PACK ('L','y','d','i'), /* G_UNICODE_SCRIPT_LYDIAN */ |
1390 | |
1391 | /* Unicode-5.2 additions */ |
1392 | PACK ('A','v','s','t'), /* G_UNICODE_SCRIPT_AVESTAN */ |
1393 | PACK ('B','a','m','u'), /* G_UNICODE_SCRIPT_BAMUM */ |
1394 | PACK ('E','g','y','p'), /* G_UNICODE_SCRIPT_EGYPTIAN_HIEROGLYPHS */ |
1395 | PACK ('A','r','m','i'), /* G_UNICODE_SCRIPT_IMPERIAL_ARAMAIC */ |
1396 | PACK ('P','h','l','i'), /* G_UNICODE_SCRIPT_INSCRIPTIONAL_PAHLAVI */ |
1397 | PACK ('P','r','t','i'), /* G_UNICODE_SCRIPT_INSCRIPTIONAL_PARTHIAN */ |
1398 | PACK ('J','a','v','a'), /* G_UNICODE_SCRIPT_JAVANESE */ |
1399 | PACK ('K','t','h','i'), /* G_UNICODE_SCRIPT_KAITHI */ |
1400 | PACK ('L','i','s','u'), /* G_UNICODE_SCRIPT_LISU */ |
1401 | PACK ('M','t','e','i'), /* G_UNICODE_SCRIPT_MEETEI_MAYEK */ |
1402 | PACK ('S','a','r','b'), /* G_UNICODE_SCRIPT_OLD_SOUTH_ARABIAN */ |
1403 | PACK ('O','r','k','h'), /* G_UNICODE_SCRIPT_OLD_TURKIC */ |
1404 | PACK ('S','a','m','r'), /* G_UNICODE_SCRIPT_SAMARITAN */ |
1405 | PACK ('L','a','n','a'), /* G_UNICODE_SCRIPT_TAI_THAM */ |
1406 | PACK ('T','a','v','t'), /* G_UNICODE_SCRIPT_TAI_VIET */ |
1407 | |
1408 | /* Unicode-6.0 additions */ |
1409 | PACK ('B','a','t','k'), /* G_UNICODE_SCRIPT_BATAK */ |
1410 | PACK ('B','r','a','h'), /* G_UNICODE_SCRIPT_BRAHMI */ |
1411 | PACK ('M','a','n','d'), /* G_UNICODE_SCRIPT_MANDAIC */ |
1412 | |
1413 | /* Unicode-6.1 additions */ |
1414 | PACK ('C','a','k','m'), /* G_UNICODE_SCRIPT_CHAKMA */ |
1415 | PACK ('M','e','r','c'), /* G_UNICODE_SCRIPT_MEROITIC_CURSIVE */ |
1416 | PACK ('M','e','r','o'), /* G_UNICODE_SCRIPT_MEROITIC_HIEROGLYPHS */ |
1417 | PACK ('P','l','r','d'), /* G_UNICODE_SCRIPT_MIAO */ |
1418 | PACK ('S','h','r','d'), /* G_UNICODE_SCRIPT_SHARADA */ |
1419 | PACK ('S','o','r','a'), /* G_UNICODE_SCRIPT_SORA_SOMPENG */ |
1420 | PACK ('T','a','k','r'), /* G_UNICODE_SCRIPT_TAKRI */ |
1421 | |
1422 | /* Unicode 7.0 additions */ |
1423 | PACK ('B','a','s','s'), /* G_UNICODE_SCRIPT_BASSA_VAH */ |
1424 | PACK ('A','g','h','b'), /* G_UNICODE_SCRIPT_CAUCASIAN_ALBANIAN */ |
1425 | PACK ('D','u','p','l'), /* G_UNICODE_SCRIPT_DUPLOYAN */ |
1426 | PACK ('E','l','b','a'), /* G_UNICODE_SCRIPT_ELBASAN */ |
1427 | PACK ('G','r','a','n'), /* G_UNICODE_SCRIPT_GRANTHA */ |
1428 | PACK ('K','h','o','j'), /* G_UNICODE_SCRIPT_KHOJKI*/ |
1429 | PACK ('S','i','n','d'), /* G_UNICODE_SCRIPT_KHUDAWADI */ |
1430 | PACK ('L','i','n','a'), /* G_UNICODE_SCRIPT_LINEAR_A */ |
1431 | PACK ('M','a','h','j'), /* G_UNICODE_SCRIPT_MAHAJANI */ |
1432 | PACK ('M','a','n','i'), /* G_UNICODE_SCRIPT_MANICHAEAN */ |
1433 | PACK ('M','e','n','d'), /* G_UNICODE_SCRIPT_MENDE_KIKAKUI */ |
1434 | PACK ('M','o','d','i'), /* G_UNICODE_SCRIPT_MODI */ |
1435 | PACK ('M','r','o','o'), /* G_UNICODE_SCRIPT_MRO */ |
1436 | PACK ('N','b','a','t'), /* G_UNICODE_SCRIPT_NABATAEAN */ |
1437 | PACK ('N','a','r','b'), /* G_UNICODE_SCRIPT_OLD_NORTH_ARABIAN */ |
1438 | PACK ('P','e','r','m'), /* G_UNICODE_SCRIPT_OLD_PERMIC */ |
1439 | PACK ('H','m','n','g'), /* G_UNICODE_SCRIPT_PAHAWH_HMONG */ |
1440 | PACK ('P','a','l','m'), /* G_UNICODE_SCRIPT_PALMYRENE */ |
1441 | PACK ('P','a','u','c'), /* G_UNICODE_SCRIPT_PAU_CIN_HAU */ |
1442 | PACK ('P','h','l','p'), /* G_UNICODE_SCRIPT_PSALTER_PAHLAVI */ |
1443 | PACK ('S','i','d','d'), /* G_UNICODE_SCRIPT_SIDDHAM */ |
1444 | PACK ('T','i','r','h'), /* G_UNICODE_SCRIPT_TIRHUTA */ |
1445 | PACK ('W','a','r','a'), /* G_UNICODE_SCRIPT_WARANG_CITI */ |
1446 | |
1447 | /* Unicode 8.0 additions */ |
1448 | PACK ('A','h','o','m'), /* G_UNICODE_SCRIPT_AHOM */ |
1449 | PACK ('H','l','u','w'), /* G_UNICODE_SCRIPT_ANATOLIAN_HIEROGLYPHS */ |
1450 | PACK ('H','a','t','r'), /* G_UNICODE_SCRIPT_HATRAN */ |
1451 | PACK ('M','u','l','t'), /* G_UNICODE_SCRIPT_MULTANI */ |
1452 | PACK ('H','u','n','g'), /* G_UNICODE_SCRIPT_OLD_HUNGARIAN */ |
1453 | PACK ('S','g','n','w'), /* G_UNICODE_SCRIPT_SIGNWRITING */ |
1454 | |
1455 | /* Unicode 9.0 additions */ |
1456 | PACK ('A','d','l','m'), /* G_UNICODE_SCRIPT_ADLAM */ |
1457 | PACK ('B','h','k','s'), /* G_UNICODE_SCRIPT_BHAIKSUKI */ |
1458 | PACK ('M','a','r','c'), /* G_UNICODE_SCRIPT_MARCHEN */ |
1459 | PACK ('N','e','w','a'), /* G_UNICODE_SCRIPT_NEWA */ |
1460 | PACK ('O','s','g','e'), /* G_UNICODE_SCRIPT_OSAGE */ |
1461 | PACK ('T','a','n','g'), /* G_UNICODE_SCRIPT_TANGUT */ |
1462 | |
1463 | /* Unicode 10.0 additions */ |
1464 | PACK ('G','o','n','m'), /* G_UNICODE_SCRIPT_MASARAM_GONDI */ |
1465 | PACK ('N','s','h','u'), /* G_UNICODE_SCRIPT_NUSHU */ |
1466 | PACK ('S','o','y','o'), /* G_UNICODE_SCRIPT_SOYOMBO */ |
1467 | PACK ('Z','a','n','b'), /* G_UNICODE_SCRIPT_ZANABAZAR_SQUARE */ |
1468 | |
1469 | /* Unicode 11.0 additions */ |
1470 | PACK ('D','o','g','r'), /* G_UNICODE_SCRIPT_DOGRA */ |
1471 | PACK ('G','o','n','g'), /* G_UNICODE_SCRIPT_GUNJALA_GONDI */ |
1472 | PACK ('R','o','h','g'), /* G_UNICODE_SCRIPT_HANIFI_ROHINGYA */ |
1473 | PACK ('M','a','k','a'), /* G_UNICODE_SCRIPT_MAKASAR */ |
1474 | PACK ('M','e','d','f'), /* G_UNICODE_SCRIPT_MEDEFAIDRIN */ |
1475 | PACK ('S','o','g','o'), /* G_UNICODE_SCRIPT_OLD_SOGDIAN */ |
1476 | PACK ('S','o','g','d'), /* G_UNICODE_SCRIPT_SOGDIAN */ |
1477 | |
1478 | /* Unicode 12.0 additions */ |
1479 | PACK ('E','l','y','m'), /* G_UNICODE_SCRIPT_ELYMAIC */ |
1480 | PACK ('N','a','n','d'), /* G_UNICODE_SCRIPT_NANDINAGARI */ |
1481 | PACK ('H','m','n','p'), /* G_UNICODE_SCRIPT_NYIAKENG_PUACHUE_HMONG */ |
1482 | PACK ('W','c','h','o'), /* G_UNICODE_SCRIPT_WANCHO */ |
1483 | |
1484 | /* Unicode 13.0 additions */ |
1485 | PACK ('C', 'h', 'r', 's'), /* G_UNICODE_SCRIPT_CHORASMIAN */ |
1486 | PACK ('D', 'i', 'a', 'k'), /* G_UNICODE_SCRIPT_DIVES_AKURU */ |
1487 | PACK ('K', 'i', 't', 's'), /* G_UNICODE_SCRIPT_KHITAN_SMALL_SCRIPT */ |
1488 | PACK ('Y', 'e', 'z', 'i'), /* G_UNICODE_SCRIPT_YEZIDI */ |
1489 | #undef PACK |
1490 | }; |
1491 | |
1492 | /** |
1493 | * g_unicode_script_to_iso15924: |
1494 | * @script: a Unicode script |
1495 | * |
1496 | * Looks up the ISO 15924 code for @script. ISO 15924 assigns four-letter |
1497 | * codes to scripts. For example, the code for Arabic is 'Arab'. The |
1498 | * four letter codes are encoded as a @guint32 by this function in a |
1499 | * big-endian fashion. That is, the code returned for Arabic is |
1500 | * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). |
1501 | * |
1502 | * See |
1503 | * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html) |
1504 | * for details. |
1505 | * |
1506 | * Returns: the ISO 15924 code for @script, encoded as an integer, |
1507 | * of zero if @script is %G_UNICODE_SCRIPT_INVALID_CODE or |
1508 | * ISO 15924 code 'Zzzz' (script code for UNKNOWN) if @script is not understood. |
1509 | * |
1510 | * Since: 2.30 |
1511 | */ |
1512 | guint32 |
1513 | g_unicode_script_to_iso15924 (GUnicodeScript script) |
1514 | { |
1515 | if (G_UNLIKELY (script == G_UNICODE_SCRIPT_INVALID_CODE)) |
1516 | return 0; |
1517 | |
1518 | if (G_UNLIKELY (script < 0 || script >= (int) G_N_ELEMENTS (iso15924_tags))) |
1519 | return 0x5A7A7A7A; |
1520 | |
1521 | return iso15924_tags[script]; |
1522 | } |
1523 | |
1524 | /** |
1525 | * g_unicode_script_from_iso15924: |
1526 | * @iso15924: a Unicode script |
1527 | * |
1528 | * Looks up the Unicode script for @iso15924. ISO 15924 assigns four-letter |
1529 | * codes to scripts. For example, the code for Arabic is 'Arab'. |
1530 | * This function accepts four letter codes encoded as a @guint32 in a |
1531 | * big-endian fashion. That is, the code expected for Arabic is |
1532 | * 0x41726162 (0x41 is ASCII code for 'A', 0x72 is ASCII code for 'r', etc). |
1533 | * |
1534 | * See |
1535 | * [Codes for the representation of names of scripts](http://unicode.org/iso15924/codelists.html) |
1536 | * for details. |
1537 | * |
1538 | * Returns: the Unicode script for @iso15924, or |
1539 | * of %G_UNICODE_SCRIPT_INVALID_CODE if @iso15924 is zero and |
1540 | * %G_UNICODE_SCRIPT_UNKNOWN if @iso15924 is unknown. |
1541 | * |
1542 | * Since: 2.30 |
1543 | */ |
1544 | GUnicodeScript |
1545 | g_unicode_script_from_iso15924 (guint32 iso15924) |
1546 | { |
1547 | unsigned int i; |
1548 | |
1549 | if (!iso15924) |
1550 | return G_UNICODE_SCRIPT_INVALID_CODE; |
1551 | |
1552 | for (i = 0; i < G_N_ELEMENTS (iso15924_tags); i++) |
1553 | if (iso15924_tags[i] == iso15924) |
1554 | return (GUnicodeScript) i; |
1555 | |
1556 | return G_UNICODE_SCRIPT_UNKNOWN; |
1557 | } |
1558 | |