1/* decomp.c - Character decomposition.
2 *
3 * Copyright (C) 1999, 2000 Tom Tromey
4 * Copyright 2000 Red Hat, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public License
17 * along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20/**
21 * SECTION:unicode
22 * @Title: Unicode Manipulation
23 * @Short_description: functions operating on Unicode characters and
24 * UTF-8 strings
25 * @See_also: g_locale_to_utf8(), g_locale_from_utf8()
26 *
27 * This section describes a number of functions for dealing with
28 * Unicode characters and strings. There are analogues of the
29 * traditional `ctype.h` character classification and case conversion
30 * functions, UTF-8 analogues of some string utility functions,
31 * functions to perform normalization, case conversion and collation
32 * on UTF-8 strings and finally functions to convert between the UTF-8,
33 * UTF-16 and UCS-4 encodings of Unicode.
34 *
35 * The implementations of the Unicode functions in GLib are based
36 * on the Unicode Character Data tables, which are available from
37 * [www.unicode.org](http://www.unicode.org/).
38 *
39 * * Unicode 4.0 was added in GLib 2.8
40 * * Unicode 4.1 was added in GLib 2.10
41 * * Unicode 5.0 was added in GLib 2.12
42 * * Unicode 5.1 was added in GLib 2.16.3
43 * * Unicode 6.0 was added in GLib 2.30
44 * * Unicode 6.1 was added in GLib 2.32
45 * * Unicode 6.2 was added in GLib 2.36
46 * * Unicode 6.3 was added in GLib 2.40
47 * * Unicode 7.0 was added in GLib 2.42
48 * * Unicode 8.0 was added in GLib 2.48
49 * * Unicode 9.0 was added in GLib 2.50.1
50 * * Unicode 10.0 was added in GLib 2.54
51 * * Unicode 11.10 was added in GLib 2.58
52 * * Unicode 12.0 was added in GLib 2.62
53 * * Unicode 12.1 was added in GLib 2.62
54 * * Unicode 13.0 was added in GLib 2.66
55 */
56
57#include "config.h"
58
59#include <stdlib.h>
60
61#include "gunicode.h"
62#include "gunidecomp.h"
63#include "gmem.h"
64#include "gunicomp.h"
65#include "gunicodeprivate.h"
66
67
68#define CC_PART1(Page, Char) \
69 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
70 ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
71 : (cclass_data[combining_class_table_part1[Page]][Char]))
72
73#define CC_PART2(Page, Char) \
74 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
75 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
76 : (cclass_data[combining_class_table_part2[Page]][Char]))
77
78#define COMBINING_CLASS(Char) \
79 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
80 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
81 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
82 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
83 : 0))
84
85/**
86 * g_unichar_combining_class:
87 * @uc: a Unicode character
88 *
89 * Determines the canonical combining class of a Unicode character.
90 *
91 * Returns: the combining class of the character
92 *
93 * Since: 2.14
94 **/
95gint
96g_unichar_combining_class (gunichar uc)
97{
98 return COMBINING_CLASS (uc);
99}
100
101/* constants for hangul syllable [de]composition */
102#define SBase 0xAC00
103#define LBase 0x1100
104#define VBase 0x1161
105#define TBase 0x11A7
106#define LCount 19
107#define VCount 21
108#define TCount 28
109#define NCount (VCount * TCount)
110#define SCount (LCount * NCount)
111
112/**
113 * g_unicode_canonical_ordering:
114 * @string: a UCS-4 encoded string.
115 * @len: the maximum length of @string to use.
116 *
117 * Computes the canonical ordering of a string in-place.
118 * This rearranges decomposed characters in the string
119 * according to their combining classes. See the Unicode
120 * manual for more information.
121 **/
122void
123g_unicode_canonical_ordering (gunichar *string,
124 gsize len)
125{
126 gsize i;
127 int swap = 1;
128
129 while (swap)
130 {
131 int last;
132 swap = 0;
133 last = COMBINING_CLASS (string[0]);
134 for (i = 0; i < len - 1; ++i)
135 {
136 int next = COMBINING_CLASS (string[i + 1]);
137 if (next != 0 && last > next)
138 {
139 gsize j;
140 /* Percolate item leftward through string. */
141 for (j = i + 1; j > 0; --j)
142 {
143 gunichar t;
144 if (COMBINING_CLASS (string[j - 1]) <= next)
145 break;
146 t = string[j];
147 string[j] = string[j - 1];
148 string[j - 1] = t;
149 swap = 1;
150 }
151 /* We're re-entering the loop looking at the old
152 character again. */
153 next = last;
154 }
155 last = next;
156 }
157 }
158}
159
160/* http://www.unicode.org/unicode/reports/tr15/#Hangul
161 * r should be null or have sufficient space. Calling with r == NULL will
162 * only calculate the result_len; however, a buffer with space for three
163 * characters will always be big enough. */
164static void
165decompose_hangul (gunichar s,
166 gunichar *r,
167 gsize *result_len)
168{
169 gint SIndex = s - SBase;
170 gint TIndex = SIndex % TCount;
171
172 if (r)
173 {
174 r[0] = LBase + SIndex / NCount;
175 r[1] = VBase + (SIndex % NCount) / TCount;
176 }
177
178 if (TIndex)
179 {
180 if (r)
181 r[2] = TBase + TIndex;
182 *result_len = 3;
183 }
184 else
185 *result_len = 2;
186}
187
188/* returns a pointer to a null-terminated UTF-8 string */
189static const gchar *
190find_decomposition (gunichar ch,
191 gboolean compat)
192{
193 int start = 0;
194 int end = G_N_ELEMENTS (decomp_table);
195
196 if (ch >= decomp_table[start].ch &&
197 ch <= decomp_table[end - 1].ch)
198 {
199 while (TRUE)
200 {
201 int half = (start + end) / 2;
202 if (ch == decomp_table[half].ch)
203 {
204 int offset;
205
206 if (compat)
207 {
208 offset = decomp_table[half].compat_offset;
209 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
210 offset = decomp_table[half].canon_offset;
211 }
212 else
213 {
214 offset = decomp_table[half].canon_offset;
215 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
216 return NULL;
217 }
218
219 return &(decomp_expansion_string[offset]);
220 }
221 else if (half == start)
222 break;
223 else if (ch > decomp_table[half].ch)
224 start = half;
225 else
226 end = half;
227 }
228 }
229
230 return NULL;
231}
232
233/**
234 * g_unicode_canonical_decomposition:
235 * @ch: a Unicode character.
236 * @result_len: location to store the length of the return value.
237 *
238 * Computes the canonical decomposition of a Unicode character.
239 *
240 * Returns: a newly allocated string of Unicode characters.
241 * @result_len is set to the resulting length of the string.
242 *
243 * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose()
244 * instead.
245 **/
246gunichar *
247g_unicode_canonical_decomposition (gunichar ch,
248 gsize *result_len)
249{
250 const gchar *decomp;
251 const gchar *p;
252 gunichar *r;
253
254 /* Hangul syllable */
255 if (ch >= SBase && ch < SBase + SCount)
256 {
257 decompose_hangul (s: ch, NULL, result_len);
258 r = g_malloc (n_bytes: *result_len * sizeof (gunichar));
259 decompose_hangul (s: ch, r, result_len);
260 }
261 else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
262 {
263 /* Found it. */
264 int i;
265
266 *result_len = g_utf8_strlen (p: decomp, max: -1);
267 r = g_malloc (n_bytes: *result_len * sizeof (gunichar));
268
269 for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
270 r[i] = g_utf8_get_char (p);
271 }
272 else
273 {
274 /* Not in our table. */
275 r = g_malloc (n_bytes: sizeof (gunichar));
276 *r = ch;
277 *result_len = 1;
278 }
279
280 return r;
281}
282
283/* L,V => LV and LV,T => LVT */
284static gboolean
285combine_hangul (gunichar a,
286 gunichar b,
287 gunichar *result)
288{
289 gint LIndex = a - LBase;
290 gint SIndex = a - SBase;
291
292 gint VIndex = b - VBase;
293 gint TIndex = b - TBase;
294
295 if (0 <= LIndex && LIndex < LCount
296 && 0 <= VIndex && VIndex < VCount)
297 {
298 *result = SBase + (LIndex * VCount + VIndex) * TCount;
299 return TRUE;
300 }
301 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
302 && 0 < TIndex && TIndex < TCount)
303 {
304 *result = a + TIndex;
305 return TRUE;
306 }
307
308 return FALSE;
309}
310
311#define CI(Page, Char) \
312 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
313 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
314 : (compose_data[compose_table[Page]][Char]))
315
316#define COMPOSE_INDEX(Char) \
317 (((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
318
319static gboolean
320combine (gunichar a,
321 gunichar b,
322 gunichar *result)
323{
324 gushort index_a, index_b;
325
326 if (combine_hangul (a, b, result))
327 return TRUE;
328
329 index_a = COMPOSE_INDEX(a);
330
331 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
332 {
333 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
334 {
335 *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
336 return TRUE;
337 }
338 else
339 return FALSE;
340 }
341
342 index_b = COMPOSE_INDEX(b);
343
344 if (index_b >= COMPOSE_SECOND_SINGLE_START)
345 {
346 if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
347 {
348 *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
349 return TRUE;
350 }
351 else
352 return FALSE;
353 }
354
355 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
356 index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
357 {
358 gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
359
360 if (res)
361 {
362 *result = res;
363 return TRUE;
364 }
365 }
366
367 return FALSE;
368}
369
370gunichar *
371_g_utf8_normalize_wc (const gchar *str,
372 gssize max_len,
373 GNormalizeMode mode)
374{
375 gsize n_wc;
376 gunichar *wc_buffer;
377 const char *p;
378 gsize last_start;
379 gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
380 mode == G_NORMALIZE_NFKD);
381 gboolean do_compose = (mode == G_NORMALIZE_NFC ||
382 mode == G_NORMALIZE_NFKC);
383
384 n_wc = 0;
385 p = str;
386 while ((max_len < 0 || p < str + max_len) && *p)
387 {
388 const gchar *decomp;
389 gunichar wc = g_utf8_get_char (p);
390
391 if (wc >= SBase && wc < SBase + SCount)
392 {
393 gsize result_len;
394 decompose_hangul (s: wc, NULL, result_len: &result_len);
395 n_wc += result_len;
396 }
397 else
398 {
399 decomp = find_decomposition (ch: wc, compat: do_compat);
400
401 if (decomp)
402 n_wc += g_utf8_strlen (p: decomp, max: -1);
403 else
404 n_wc++;
405 }
406
407 p = g_utf8_next_char (p);
408 }
409
410 wc_buffer = g_new (gunichar, n_wc + 1);
411
412 last_start = 0;
413 n_wc = 0;
414 p = str;
415 while ((max_len < 0 || p < str + max_len) && *p)
416 {
417 gunichar wc = g_utf8_get_char (p);
418 const gchar *decomp;
419 int cc;
420 gsize old_n_wc = n_wc;
421
422 if (wc >= SBase && wc < SBase + SCount)
423 {
424 gsize result_len;
425 decompose_hangul (s: wc, r: wc_buffer + n_wc, result_len: &result_len);
426 n_wc += result_len;
427 }
428 else
429 {
430 decomp = find_decomposition (ch: wc, compat: do_compat);
431
432 if (decomp)
433 {
434 const char *pd;
435 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
436 wc_buffer[n_wc++] = g_utf8_get_char (p: pd);
437 }
438 else
439 wc_buffer[n_wc++] = wc;
440 }
441
442 if (n_wc > 0)
443 {
444 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
445
446 if (cc == 0)
447 {
448 g_unicode_canonical_ordering (string: wc_buffer + last_start, len: n_wc - last_start);
449 last_start = old_n_wc;
450 }
451 }
452
453 p = g_utf8_next_char (p);
454 }
455
456 if (n_wc > 0)
457 {
458 g_unicode_canonical_ordering (string: wc_buffer + last_start, len: n_wc - last_start);
459 last_start = n_wc;
460 (void) last_start;
461 }
462
463 wc_buffer[n_wc] = 0;
464
465 /* All decomposed and reordered */
466
467 if (do_compose && n_wc > 0)
468 {
469 gsize i, j;
470 int last_cc = 0;
471 last_start = 0;
472
473 for (i = 0; i < n_wc; i++)
474 {
475 int cc = COMBINING_CLASS (wc_buffer[i]);
476
477 if (i > 0 &&
478 (last_cc == 0 || last_cc < cc) &&
479 combine (a: wc_buffer[last_start], b: wc_buffer[i],
480 result: &wc_buffer[last_start]))
481 {
482 for (j = i + 1; j < n_wc; j++)
483 wc_buffer[j-1] = wc_buffer[j];
484 n_wc--;
485 i--;
486
487 if (i == last_start)
488 last_cc = 0;
489 else
490 last_cc = COMBINING_CLASS (wc_buffer[i-1]);
491
492 continue;
493 }
494
495 if (cc == 0)
496 last_start = i;
497
498 last_cc = cc;
499 }
500 }
501
502 wc_buffer[n_wc] = 0;
503
504 return wc_buffer;
505}
506
507/**
508 * g_utf8_normalize:
509 * @str: a UTF-8 encoded string.
510 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
511 * @mode: the type of normalization to perform.
512 *
513 * Converts a string into canonical form, standardizing
514 * such issues as whether a character with an accent
515 * is represented as a base character and combining
516 * accent or as a single precomposed character. The
517 * string has to be valid UTF-8, otherwise %NULL is
518 * returned. You should generally call g_utf8_normalize()
519 * before comparing two Unicode strings.
520 *
521 * The normalization mode %G_NORMALIZE_DEFAULT only
522 * standardizes differences that do not affect the
523 * text content, such as the above-mentioned accent
524 * representation. %G_NORMALIZE_ALL also standardizes
525 * the "compatibility" characters in Unicode, such
526 * as SUPERSCRIPT THREE to the standard forms
527 * (in this case DIGIT THREE). Formatting information
528 * may be lost but for most text operations such
529 * characters should be considered the same.
530 *
531 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
532 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
533 * but returned a result with composed forms rather
534 * than a maximally decomposed form. This is often
535 * useful if you intend to convert the string to
536 * a legacy encoding or pass it to a system with
537 * less capable Unicode handling.
538 *
539 * Returns: (nullable): a newly allocated string, that
540 * is the normalized form of @str, or %NULL if @str
541 * is not valid UTF-8.
542 **/
543gchar *
544g_utf8_normalize (const gchar *str,
545 gssize len,
546 GNormalizeMode mode)
547{
548 gunichar *result_wc = _g_utf8_normalize_wc (str, max_len: len, mode);
549 gchar *result;
550
551 result = g_ucs4_to_utf8 (str: result_wc, len: -1, NULL, NULL, NULL);
552 g_free (mem: result_wc);
553
554 return result;
555}
556
557static gboolean
558decompose_hangul_step (gunichar ch,
559 gunichar *a,
560 gunichar *b)
561{
562 gint SIndex, TIndex;
563
564 if (ch < SBase || ch >= SBase + SCount)
565 return FALSE; /* not a hangul syllable */
566
567 SIndex = ch - SBase;
568 TIndex = SIndex % TCount;
569
570 if (TIndex)
571 {
572 /* split LVT -> LV,T */
573 *a = ch - TIndex;
574 *b = TBase + TIndex;
575 }
576 else
577 {
578 /* split LV -> L,V */
579 *a = LBase + SIndex / NCount;
580 *b = VBase + (SIndex % NCount) / TCount;
581 }
582
583 return TRUE;
584}
585
586/**
587 * g_unichar_decompose:
588 * @ch: a Unicode character
589 * @a: (out) (not optional): return location for the first component of @ch
590 * @b: (out) (not optional): return location for the second component of @ch
591 *
592 * Performs a single decomposition step of the
593 * Unicode canonical decomposition algorithm.
594 *
595 * This function does not include compatibility
596 * decompositions. It does, however, include algorithmic
597 * Hangul Jamo decomposition, as well as 'singleton'
598 * decompositions which replace a character by a single
599 * other character. In the case of singletons *@b will
600 * be set to zero.
601 *
602 * If @ch is not decomposable, *@a is set to @ch and *@b
603 * is set to zero.
604 *
605 * Note that the way Unicode decomposition pairs are
606 * defined, it is guaranteed that @b would not decompose
607 * further, but @a may itself decompose. To get the full
608 * canonical decomposition for @ch, one would need to
609 * recursively call this function on @a. Or use
610 * g_unichar_fully_decompose().
611 *
612 * See
613 * [UAX#15](http://unicode.org/reports/tr15/)
614 * for details.
615 *
616 * Returns: %TRUE if the character could be decomposed
617 *
618 * Since: 2.30
619 */
620gboolean
621g_unichar_decompose (gunichar ch,
622 gunichar *a,
623 gunichar *b)
624{
625 gint start = 0;
626 gint end = G_N_ELEMENTS (decomp_step_table);
627
628 if (decompose_hangul_step (ch, a, b))
629 return TRUE;
630
631 /* TODO use bsearch() */
632 if (ch >= decomp_step_table[start].ch &&
633 ch <= decomp_step_table[end - 1].ch)
634 {
635 while (TRUE)
636 {
637 gint half = (start + end) / 2;
638 const decomposition_step *p = &(decomp_step_table[half]);
639 if (ch == p->ch)
640 {
641 *a = p->a;
642 *b = p->b;
643 return TRUE;
644 }
645 else if (half == start)
646 break;
647 else if (ch > p->ch)
648 start = half;
649 else
650 end = half;
651 }
652 }
653
654 *a = ch;
655 *b = 0;
656
657 return FALSE;
658}
659
660/**
661 * g_unichar_compose:
662 * @a: a Unicode character
663 * @b: a Unicode character
664 * @ch: (out) (not optional): return location for the composed character
665 *
666 * Performs a single composition step of the
667 * Unicode canonical composition algorithm.
668 *
669 * This function includes algorithmic Hangul Jamo composition,
670 * but it is not exactly the inverse of g_unichar_decompose().
671 * No composition can have either of @a or @b equal to zero.
672 * To be precise, this function composes if and only if
673 * there exists a Primary Composite P which is canonically
674 * equivalent to the sequence <@a,@b>. See the Unicode
675 * Standard for the definition of Primary Composite.
676 *
677 * If @a and @b do not compose a new character, @ch is set to zero.
678 *
679 * See
680 * [UAX#15](http://unicode.org/reports/tr15/)
681 * for details.
682 *
683 * Returns: %TRUE if the characters could be composed
684 *
685 * Since: 2.30
686 */
687gboolean
688g_unichar_compose (gunichar a,
689 gunichar b,
690 gunichar *ch)
691{
692 if (combine (a, b, result: ch))
693 return TRUE;
694
695 *ch = 0;
696 return FALSE;
697}
698
699/**
700 * g_unichar_fully_decompose:
701 * @ch: a Unicode character.
702 * @compat: whether perform canonical or compatibility decomposition
703 * @result: (optional) (out caller-allocates): location to store decomposed result, or %NULL
704 * @result_len: length of @result
705 *
706 * Computes the canonical or compatibility decomposition of a
707 * Unicode character. For compatibility decomposition,
708 * pass %TRUE for @compat; for canonical decomposition
709 * pass %FALSE for @compat.
710 *
711 * The decomposed sequence is placed in @result. Only up to
712 * @result_len characters are written into @result. The length
713 * of the full decomposition (irrespective of @result_len) is
714 * returned by the function. For canonical decomposition,
715 * currently all decompositions are of length at most 4, but
716 * this may change in the future (very unlikely though).
717 * At any rate, Unicode does guarantee that a buffer of length
718 * 18 is always enough for both compatibility and canonical
719 * decompositions, so that is the size recommended. This is provided
720 * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH.
721 *
722 * See
723 * [UAX#15](http://unicode.org/reports/tr15/)
724 * for details.
725 *
726 * Returns: the length of the full decomposition.
727 *
728 * Since: 2.30
729 **/
730gsize
731g_unichar_fully_decompose (gunichar ch,
732 gboolean compat,
733 gunichar *result,
734 gsize result_len)
735{
736 const gchar *decomp;
737 const gchar *p;
738
739 /* Hangul syllable */
740 if (ch >= SBase && ch < SBase + SCount)
741 {
742 gsize len, i;
743 gunichar buffer[3];
744 decompose_hangul (s: ch, r: result ? buffer : NULL, result_len: &len);
745 if (result)
746 for (i = 0; i < len && i < result_len; i++)
747 result[i] = buffer[i];
748 return len;
749 }
750 else if ((decomp = find_decomposition (ch, compat)) != NULL)
751 {
752 /* Found it. */
753 gsize len, i;
754
755 len = g_utf8_strlen (p: decomp, max: -1);
756
757 for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++)
758 result[i] = g_utf8_get_char (p);
759
760 return len;
761 }
762
763 /* Does not decompose */
764 if (result && result_len >= 1)
765 *result = ch;
766 return 1;
767}
768

source code of gtk/subprojects/glib/glib/gunidecomp.c