1 | /* decomp.c - Character decomposition. |
2 | * |
3 | * Copyright (C) 1999, 2000 Tom Tromey |
4 | * Copyright 2000 Red Hat, Inc. |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2.1 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public License |
17 | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | */ |
19 | |
20 | /** |
21 | * SECTION:unicode |
22 | * @Title: Unicode Manipulation |
23 | * @Short_description: functions operating on Unicode characters and |
24 | * UTF-8 strings |
25 | * @See_also: g_locale_to_utf8(), g_locale_from_utf8() |
26 | * |
27 | * This section describes a number of functions for dealing with |
28 | * Unicode characters and strings. There are analogues of the |
29 | * traditional `ctype.h` character classification and case conversion |
30 | * functions, UTF-8 analogues of some string utility functions, |
31 | * functions to perform normalization, case conversion and collation |
32 | * on UTF-8 strings and finally functions to convert between the UTF-8, |
33 | * UTF-16 and UCS-4 encodings of Unicode. |
34 | * |
35 | * The implementations of the Unicode functions in GLib are based |
36 | * on the Unicode Character Data tables, which are available from |
37 | * [www.unicode.org](http://www.unicode.org/). |
38 | * |
39 | * * Unicode 4.0 was added in GLib 2.8 |
40 | * * Unicode 4.1 was added in GLib 2.10 |
41 | * * Unicode 5.0 was added in GLib 2.12 |
42 | * * Unicode 5.1 was added in GLib 2.16.3 |
43 | * * Unicode 6.0 was added in GLib 2.30 |
44 | * * Unicode 6.1 was added in GLib 2.32 |
45 | * * Unicode 6.2 was added in GLib 2.36 |
46 | * * Unicode 6.3 was added in GLib 2.40 |
47 | * * Unicode 7.0 was added in GLib 2.42 |
48 | * * Unicode 8.0 was added in GLib 2.48 |
49 | * * Unicode 9.0 was added in GLib 2.50.1 |
50 | * * Unicode 10.0 was added in GLib 2.54 |
51 | * * Unicode 11.10 was added in GLib 2.58 |
52 | * * Unicode 12.0 was added in GLib 2.62 |
53 | * * Unicode 12.1 was added in GLib 2.62 |
54 | * * Unicode 13.0 was added in GLib 2.66 |
55 | */ |
56 | |
57 | #include "config.h" |
58 | |
59 | #include <stdlib.h> |
60 | |
61 | #include "gunicode.h" |
62 | #include "gunidecomp.h" |
63 | #include "gmem.h" |
64 | #include "gunicomp.h" |
65 | #include "gunicodeprivate.h" |
66 | |
67 | |
68 | #define CC_PART1(Page, Char) \ |
69 | ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
70 | ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
71 | : (cclass_data[combining_class_table_part1[Page]][Char])) |
72 | |
73 | #define CC_PART2(Page, Char) \ |
74 | ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
75 | ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
76 | : (cclass_data[combining_class_table_part2[Page]][Char])) |
77 | |
78 | #define COMBINING_CLASS(Char) \ |
79 | (((Char) <= G_UNICODE_LAST_CHAR_PART1) \ |
80 | ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \ |
81 | : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \ |
82 | ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \ |
83 | : 0)) |
84 | |
85 | /** |
86 | * g_unichar_combining_class: |
87 | * @uc: a Unicode character |
88 | * |
89 | * Determines the canonical combining class of a Unicode character. |
90 | * |
91 | * Returns: the combining class of the character |
92 | * |
93 | * Since: 2.14 |
94 | **/ |
95 | gint |
96 | g_unichar_combining_class (gunichar uc) |
97 | { |
98 | return COMBINING_CLASS (uc); |
99 | } |
100 | |
101 | /* constants for hangul syllable [de]composition */ |
102 | #define SBase 0xAC00 |
103 | #define LBase 0x1100 |
104 | #define VBase 0x1161 |
105 | #define TBase 0x11A7 |
106 | #define LCount 19 |
107 | #define VCount 21 |
108 | #define TCount 28 |
109 | #define NCount (VCount * TCount) |
110 | #define SCount (LCount * NCount) |
111 | |
112 | /** |
113 | * g_unicode_canonical_ordering: |
114 | * @string: a UCS-4 encoded string. |
115 | * @len: the maximum length of @string to use. |
116 | * |
117 | * Computes the canonical ordering of a string in-place. |
118 | * This rearranges decomposed characters in the string |
119 | * according to their combining classes. See the Unicode |
120 | * manual for more information. |
121 | **/ |
122 | void |
123 | g_unicode_canonical_ordering (gunichar *string, |
124 | gsize len) |
125 | { |
126 | gsize i; |
127 | int swap = 1; |
128 | |
129 | while (swap) |
130 | { |
131 | int last; |
132 | swap = 0; |
133 | last = COMBINING_CLASS (string[0]); |
134 | for (i = 0; i < len - 1; ++i) |
135 | { |
136 | int next = COMBINING_CLASS (string[i + 1]); |
137 | if (next != 0 && last > next) |
138 | { |
139 | gsize j; |
140 | /* Percolate item leftward through string. */ |
141 | for (j = i + 1; j > 0; --j) |
142 | { |
143 | gunichar t; |
144 | if (COMBINING_CLASS (string[j - 1]) <= next) |
145 | break; |
146 | t = string[j]; |
147 | string[j] = string[j - 1]; |
148 | string[j - 1] = t; |
149 | swap = 1; |
150 | } |
151 | /* We're re-entering the loop looking at the old |
152 | character again. */ |
153 | next = last; |
154 | } |
155 | last = next; |
156 | } |
157 | } |
158 | } |
159 | |
160 | /* http://www.unicode.org/unicode/reports/tr15/#Hangul |
161 | * r should be null or have sufficient space. Calling with r == NULL will |
162 | * only calculate the result_len; however, a buffer with space for three |
163 | * characters will always be big enough. */ |
164 | static void |
165 | decompose_hangul (gunichar s, |
166 | gunichar *r, |
167 | gsize *result_len) |
168 | { |
169 | gint SIndex = s - SBase; |
170 | gint TIndex = SIndex % TCount; |
171 | |
172 | if (r) |
173 | { |
174 | r[0] = LBase + SIndex / NCount; |
175 | r[1] = VBase + (SIndex % NCount) / TCount; |
176 | } |
177 | |
178 | if (TIndex) |
179 | { |
180 | if (r) |
181 | r[2] = TBase + TIndex; |
182 | *result_len = 3; |
183 | } |
184 | else |
185 | *result_len = 2; |
186 | } |
187 | |
188 | /* returns a pointer to a null-terminated UTF-8 string */ |
189 | static const gchar * |
190 | find_decomposition (gunichar ch, |
191 | gboolean compat) |
192 | { |
193 | int start = 0; |
194 | int end = G_N_ELEMENTS (decomp_table); |
195 | |
196 | if (ch >= decomp_table[start].ch && |
197 | ch <= decomp_table[end - 1].ch) |
198 | { |
199 | while (TRUE) |
200 | { |
201 | int half = (start + end) / 2; |
202 | if (ch == decomp_table[half].ch) |
203 | { |
204 | int offset; |
205 | |
206 | if (compat) |
207 | { |
208 | offset = decomp_table[half].compat_offset; |
209 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
210 | offset = decomp_table[half].canon_offset; |
211 | } |
212 | else |
213 | { |
214 | offset = decomp_table[half].canon_offset; |
215 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
216 | return NULL; |
217 | } |
218 | |
219 | return &(decomp_expansion_string[offset]); |
220 | } |
221 | else if (half == start) |
222 | break; |
223 | else if (ch > decomp_table[half].ch) |
224 | start = half; |
225 | else |
226 | end = half; |
227 | } |
228 | } |
229 | |
230 | return NULL; |
231 | } |
232 | |
233 | /** |
234 | * g_unicode_canonical_decomposition: |
235 | * @ch: a Unicode character. |
236 | * @result_len: location to store the length of the return value. |
237 | * |
238 | * Computes the canonical decomposition of a Unicode character. |
239 | * |
240 | * Returns: a newly allocated string of Unicode characters. |
241 | * @result_len is set to the resulting length of the string. |
242 | * |
243 | * Deprecated: 2.30: Use the more flexible g_unichar_fully_decompose() |
244 | * instead. |
245 | **/ |
246 | gunichar * |
247 | g_unicode_canonical_decomposition (gunichar ch, |
248 | gsize *result_len) |
249 | { |
250 | const gchar *decomp; |
251 | const gchar *p; |
252 | gunichar *r; |
253 | |
254 | /* Hangul syllable */ |
255 | if (ch >= SBase && ch < SBase + SCount) |
256 | { |
257 | decompose_hangul (s: ch, NULL, result_len); |
258 | r = g_malloc (n_bytes: *result_len * sizeof (gunichar)); |
259 | decompose_hangul (s: ch, r, result_len); |
260 | } |
261 | else if ((decomp = find_decomposition (ch, FALSE)) != NULL) |
262 | { |
263 | /* Found it. */ |
264 | int i; |
265 | |
266 | *result_len = g_utf8_strlen (p: decomp, max: -1); |
267 | r = g_malloc (n_bytes: *result_len * sizeof (gunichar)); |
268 | |
269 | for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++) |
270 | r[i] = g_utf8_get_char (p); |
271 | } |
272 | else |
273 | { |
274 | /* Not in our table. */ |
275 | r = g_malloc (n_bytes: sizeof (gunichar)); |
276 | *r = ch; |
277 | *result_len = 1; |
278 | } |
279 | |
280 | return r; |
281 | } |
282 | |
283 | /* L,V => LV and LV,T => LVT */ |
284 | static gboolean |
285 | combine_hangul (gunichar a, |
286 | gunichar b, |
287 | gunichar *result) |
288 | { |
289 | gint LIndex = a - LBase; |
290 | gint SIndex = a - SBase; |
291 | |
292 | gint VIndex = b - VBase; |
293 | gint TIndex = b - TBase; |
294 | |
295 | if (0 <= LIndex && LIndex < LCount |
296 | && 0 <= VIndex && VIndex < VCount) |
297 | { |
298 | *result = SBase + (LIndex * VCount + VIndex) * TCount; |
299 | return TRUE; |
300 | } |
301 | else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 |
302 | && 0 < TIndex && TIndex < TCount) |
303 | { |
304 | *result = a + TIndex; |
305 | return TRUE; |
306 | } |
307 | |
308 | return FALSE; |
309 | } |
310 | |
311 | #define CI(Page, Char) \ |
312 | ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
313 | ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
314 | : (compose_data[compose_table[Page]][Char])) |
315 | |
316 | #define COMPOSE_INDEX(Char) \ |
317 | (((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff)) |
318 | |
319 | static gboolean |
320 | combine (gunichar a, |
321 | gunichar b, |
322 | gunichar *result) |
323 | { |
324 | gushort index_a, index_b; |
325 | |
326 | if (combine_hangul (a, b, result)) |
327 | return TRUE; |
328 | |
329 | index_a = COMPOSE_INDEX(a); |
330 | |
331 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) |
332 | { |
333 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) |
334 | { |
335 | *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; |
336 | return TRUE; |
337 | } |
338 | else |
339 | return FALSE; |
340 | } |
341 | |
342 | index_b = COMPOSE_INDEX(b); |
343 | |
344 | if (index_b >= COMPOSE_SECOND_SINGLE_START) |
345 | { |
346 | if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) |
347 | { |
348 | *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; |
349 | return TRUE; |
350 | } |
351 | else |
352 | return FALSE; |
353 | } |
354 | |
355 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START && |
356 | index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START) |
357 | { |
358 | gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START]; |
359 | |
360 | if (res) |
361 | { |
362 | *result = res; |
363 | return TRUE; |
364 | } |
365 | } |
366 | |
367 | return FALSE; |
368 | } |
369 | |
370 | gunichar * |
371 | _g_utf8_normalize_wc (const gchar *str, |
372 | gssize max_len, |
373 | GNormalizeMode mode) |
374 | { |
375 | gsize n_wc; |
376 | gunichar *wc_buffer; |
377 | const char *p; |
378 | gsize last_start; |
379 | gboolean do_compat = (mode == G_NORMALIZE_NFKC || |
380 | mode == G_NORMALIZE_NFKD); |
381 | gboolean do_compose = (mode == G_NORMALIZE_NFC || |
382 | mode == G_NORMALIZE_NFKC); |
383 | |
384 | n_wc = 0; |
385 | p = str; |
386 | while ((max_len < 0 || p < str + max_len) && *p) |
387 | { |
388 | const gchar *decomp; |
389 | gunichar wc = g_utf8_get_char (p); |
390 | |
391 | if (wc >= SBase && wc < SBase + SCount) |
392 | { |
393 | gsize result_len; |
394 | decompose_hangul (s: wc, NULL, result_len: &result_len); |
395 | n_wc += result_len; |
396 | } |
397 | else |
398 | { |
399 | decomp = find_decomposition (ch: wc, compat: do_compat); |
400 | |
401 | if (decomp) |
402 | n_wc += g_utf8_strlen (p: decomp, max: -1); |
403 | else |
404 | n_wc++; |
405 | } |
406 | |
407 | p = g_utf8_next_char (p); |
408 | } |
409 | |
410 | wc_buffer = g_new (gunichar, n_wc + 1); |
411 | |
412 | last_start = 0; |
413 | n_wc = 0; |
414 | p = str; |
415 | while ((max_len < 0 || p < str + max_len) && *p) |
416 | { |
417 | gunichar wc = g_utf8_get_char (p); |
418 | const gchar *decomp; |
419 | int cc; |
420 | gsize old_n_wc = n_wc; |
421 | |
422 | if (wc >= SBase && wc < SBase + SCount) |
423 | { |
424 | gsize result_len; |
425 | decompose_hangul (s: wc, r: wc_buffer + n_wc, result_len: &result_len); |
426 | n_wc += result_len; |
427 | } |
428 | else |
429 | { |
430 | decomp = find_decomposition (ch: wc, compat: do_compat); |
431 | |
432 | if (decomp) |
433 | { |
434 | const char *pd; |
435 | for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) |
436 | wc_buffer[n_wc++] = g_utf8_get_char (p: pd); |
437 | } |
438 | else |
439 | wc_buffer[n_wc++] = wc; |
440 | } |
441 | |
442 | if (n_wc > 0) |
443 | { |
444 | cc = COMBINING_CLASS (wc_buffer[old_n_wc]); |
445 | |
446 | if (cc == 0) |
447 | { |
448 | g_unicode_canonical_ordering (string: wc_buffer + last_start, len: n_wc - last_start); |
449 | last_start = old_n_wc; |
450 | } |
451 | } |
452 | |
453 | p = g_utf8_next_char (p); |
454 | } |
455 | |
456 | if (n_wc > 0) |
457 | { |
458 | g_unicode_canonical_ordering (string: wc_buffer + last_start, len: n_wc - last_start); |
459 | last_start = n_wc; |
460 | (void) last_start; |
461 | } |
462 | |
463 | wc_buffer[n_wc] = 0; |
464 | |
465 | /* All decomposed and reordered */ |
466 | |
467 | if (do_compose && n_wc > 0) |
468 | { |
469 | gsize i, j; |
470 | int last_cc = 0; |
471 | last_start = 0; |
472 | |
473 | for (i = 0; i < n_wc; i++) |
474 | { |
475 | int cc = COMBINING_CLASS (wc_buffer[i]); |
476 | |
477 | if (i > 0 && |
478 | (last_cc == 0 || last_cc < cc) && |
479 | combine (a: wc_buffer[last_start], b: wc_buffer[i], |
480 | result: &wc_buffer[last_start])) |
481 | { |
482 | for (j = i + 1; j < n_wc; j++) |
483 | wc_buffer[j-1] = wc_buffer[j]; |
484 | n_wc--; |
485 | i--; |
486 | |
487 | if (i == last_start) |
488 | last_cc = 0; |
489 | else |
490 | last_cc = COMBINING_CLASS (wc_buffer[i-1]); |
491 | |
492 | continue; |
493 | } |
494 | |
495 | if (cc == 0) |
496 | last_start = i; |
497 | |
498 | last_cc = cc; |
499 | } |
500 | } |
501 | |
502 | wc_buffer[n_wc] = 0; |
503 | |
504 | return wc_buffer; |
505 | } |
506 | |
507 | /** |
508 | * g_utf8_normalize: |
509 | * @str: a UTF-8 encoded string. |
510 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
511 | * @mode: the type of normalization to perform. |
512 | * |
513 | * Converts a string into canonical form, standardizing |
514 | * such issues as whether a character with an accent |
515 | * is represented as a base character and combining |
516 | * accent or as a single precomposed character. The |
517 | * string has to be valid UTF-8, otherwise %NULL is |
518 | * returned. You should generally call g_utf8_normalize() |
519 | * before comparing two Unicode strings. |
520 | * |
521 | * The normalization mode %G_NORMALIZE_DEFAULT only |
522 | * standardizes differences that do not affect the |
523 | * text content, such as the above-mentioned accent |
524 | * representation. %G_NORMALIZE_ALL also standardizes |
525 | * the "compatibility" characters in Unicode, such |
526 | * as SUPERSCRIPT THREE to the standard forms |
527 | * (in this case DIGIT THREE). Formatting information |
528 | * may be lost but for most text operations such |
529 | * characters should be considered the same. |
530 | * |
531 | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE |
532 | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, |
533 | * but returned a result with composed forms rather |
534 | * than a maximally decomposed form. This is often |
535 | * useful if you intend to convert the string to |
536 | * a legacy encoding or pass it to a system with |
537 | * less capable Unicode handling. |
538 | * |
539 | * Returns: (nullable): a newly allocated string, that |
540 | * is the normalized form of @str, or %NULL if @str |
541 | * is not valid UTF-8. |
542 | **/ |
543 | gchar * |
544 | g_utf8_normalize (const gchar *str, |
545 | gssize len, |
546 | GNormalizeMode mode) |
547 | { |
548 | gunichar *result_wc = _g_utf8_normalize_wc (str, max_len: len, mode); |
549 | gchar *result; |
550 | |
551 | result = g_ucs4_to_utf8 (str: result_wc, len: -1, NULL, NULL, NULL); |
552 | g_free (mem: result_wc); |
553 | |
554 | return result; |
555 | } |
556 | |
557 | static gboolean |
558 | decompose_hangul_step (gunichar ch, |
559 | gunichar *a, |
560 | gunichar *b) |
561 | { |
562 | gint SIndex, TIndex; |
563 | |
564 | if (ch < SBase || ch >= SBase + SCount) |
565 | return FALSE; /* not a hangul syllable */ |
566 | |
567 | SIndex = ch - SBase; |
568 | TIndex = SIndex % TCount; |
569 | |
570 | if (TIndex) |
571 | { |
572 | /* split LVT -> LV,T */ |
573 | *a = ch - TIndex; |
574 | *b = TBase + TIndex; |
575 | } |
576 | else |
577 | { |
578 | /* split LV -> L,V */ |
579 | *a = LBase + SIndex / NCount; |
580 | *b = VBase + (SIndex % NCount) / TCount; |
581 | } |
582 | |
583 | return TRUE; |
584 | } |
585 | |
586 | /** |
587 | * g_unichar_decompose: |
588 | * @ch: a Unicode character |
589 | * @a: (out) (not optional): return location for the first component of @ch |
590 | * @b: (out) (not optional): return location for the second component of @ch |
591 | * |
592 | * Performs a single decomposition step of the |
593 | * Unicode canonical decomposition algorithm. |
594 | * |
595 | * This function does not include compatibility |
596 | * decompositions. It does, however, include algorithmic |
597 | * Hangul Jamo decomposition, as well as 'singleton' |
598 | * decompositions which replace a character by a single |
599 | * other character. In the case of singletons *@b will |
600 | * be set to zero. |
601 | * |
602 | * If @ch is not decomposable, *@a is set to @ch and *@b |
603 | * is set to zero. |
604 | * |
605 | * Note that the way Unicode decomposition pairs are |
606 | * defined, it is guaranteed that @b would not decompose |
607 | * further, but @a may itself decompose. To get the full |
608 | * canonical decomposition for @ch, one would need to |
609 | * recursively call this function on @a. Or use |
610 | * g_unichar_fully_decompose(). |
611 | * |
612 | * See |
613 | * [UAX#15](http://unicode.org/reports/tr15/) |
614 | * for details. |
615 | * |
616 | * Returns: %TRUE if the character could be decomposed |
617 | * |
618 | * Since: 2.30 |
619 | */ |
620 | gboolean |
621 | g_unichar_decompose (gunichar ch, |
622 | gunichar *a, |
623 | gunichar *b) |
624 | { |
625 | gint start = 0; |
626 | gint end = G_N_ELEMENTS (decomp_step_table); |
627 | |
628 | if (decompose_hangul_step (ch, a, b)) |
629 | return TRUE; |
630 | |
631 | /* TODO use bsearch() */ |
632 | if (ch >= decomp_step_table[start].ch && |
633 | ch <= decomp_step_table[end - 1].ch) |
634 | { |
635 | while (TRUE) |
636 | { |
637 | gint half = (start + end) / 2; |
638 | const decomposition_step *p = &(decomp_step_table[half]); |
639 | if (ch == p->ch) |
640 | { |
641 | *a = p->a; |
642 | *b = p->b; |
643 | return TRUE; |
644 | } |
645 | else if (half == start) |
646 | break; |
647 | else if (ch > p->ch) |
648 | start = half; |
649 | else |
650 | end = half; |
651 | } |
652 | } |
653 | |
654 | *a = ch; |
655 | *b = 0; |
656 | |
657 | return FALSE; |
658 | } |
659 | |
660 | /** |
661 | * g_unichar_compose: |
662 | * @a: a Unicode character |
663 | * @b: a Unicode character |
664 | * @ch: (out) (not optional): return location for the composed character |
665 | * |
666 | * Performs a single composition step of the |
667 | * Unicode canonical composition algorithm. |
668 | * |
669 | * This function includes algorithmic Hangul Jamo composition, |
670 | * but it is not exactly the inverse of g_unichar_decompose(). |
671 | * No composition can have either of @a or @b equal to zero. |
672 | * To be precise, this function composes if and only if |
673 | * there exists a Primary Composite P which is canonically |
674 | * equivalent to the sequence <@a,@b>. See the Unicode |
675 | * Standard for the definition of Primary Composite. |
676 | * |
677 | * If @a and @b do not compose a new character, @ch is set to zero. |
678 | * |
679 | * See |
680 | * [UAX#15](http://unicode.org/reports/tr15/) |
681 | * for details. |
682 | * |
683 | * Returns: %TRUE if the characters could be composed |
684 | * |
685 | * Since: 2.30 |
686 | */ |
687 | gboolean |
688 | g_unichar_compose (gunichar a, |
689 | gunichar b, |
690 | gunichar *ch) |
691 | { |
692 | if (combine (a, b, result: ch)) |
693 | return TRUE; |
694 | |
695 | *ch = 0; |
696 | return FALSE; |
697 | } |
698 | |
699 | /** |
700 | * g_unichar_fully_decompose: |
701 | * @ch: a Unicode character. |
702 | * @compat: whether perform canonical or compatibility decomposition |
703 | * @result: (optional) (out caller-allocates): location to store decomposed result, or %NULL |
704 | * @result_len: length of @result |
705 | * |
706 | * Computes the canonical or compatibility decomposition of a |
707 | * Unicode character. For compatibility decomposition, |
708 | * pass %TRUE for @compat; for canonical decomposition |
709 | * pass %FALSE for @compat. |
710 | * |
711 | * The decomposed sequence is placed in @result. Only up to |
712 | * @result_len characters are written into @result. The length |
713 | * of the full decomposition (irrespective of @result_len) is |
714 | * returned by the function. For canonical decomposition, |
715 | * currently all decompositions are of length at most 4, but |
716 | * this may change in the future (very unlikely though). |
717 | * At any rate, Unicode does guarantee that a buffer of length |
718 | * 18 is always enough for both compatibility and canonical |
719 | * decompositions, so that is the size recommended. This is provided |
720 | * as %G_UNICHAR_MAX_DECOMPOSITION_LENGTH. |
721 | * |
722 | * See |
723 | * [UAX#15](http://unicode.org/reports/tr15/) |
724 | * for details. |
725 | * |
726 | * Returns: the length of the full decomposition. |
727 | * |
728 | * Since: 2.30 |
729 | **/ |
730 | gsize |
731 | g_unichar_fully_decompose (gunichar ch, |
732 | gboolean compat, |
733 | gunichar *result, |
734 | gsize result_len) |
735 | { |
736 | const gchar *decomp; |
737 | const gchar *p; |
738 | |
739 | /* Hangul syllable */ |
740 | if (ch >= SBase && ch < SBase + SCount) |
741 | { |
742 | gsize len, i; |
743 | gunichar buffer[3]; |
744 | decompose_hangul (s: ch, r: result ? buffer : NULL, result_len: &len); |
745 | if (result) |
746 | for (i = 0; i < len && i < result_len; i++) |
747 | result[i] = buffer[i]; |
748 | return len; |
749 | } |
750 | else if ((decomp = find_decomposition (ch, compat)) != NULL) |
751 | { |
752 | /* Found it. */ |
753 | gsize len, i; |
754 | |
755 | len = g_utf8_strlen (p: decomp, max: -1); |
756 | |
757 | for (p = decomp, i = 0; i < len && i < result_len; p = g_utf8_next_char (p), i++) |
758 | result[i] = g_utf8_get_char (p); |
759 | |
760 | return len; |
761 | } |
762 | |
763 | /* Does not decompose */ |
764 | if (result && result_len >= 1) |
765 | *result = ch; |
766 | return 1; |
767 | } |
768 | |