1 | /* |
2 | * Copyright © 2014 Canonical Limited |
3 | * |
4 | * This library is free software; you can redistribute it and/or |
5 | * modify it under the terms of the GNU Lesser General Public |
6 | * License as published by the Free Software Foundation; either |
7 | * version 2.1 of the License, or (at your option) any later version. |
8 | * |
9 | * This library is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | * Lesser General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU Lesser General Public |
15 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
16 | * |
17 | * Author: Ryan Lortie <desrt@desrt.ca> |
18 | */ |
19 | |
20 | #include <config.h> |
21 | |
22 | #include "gstrfuncs.h" |
23 | |
24 | #include <glib.h> |
25 | #include <locale.h> |
26 | #include <stdlib.h> |
27 | #include <string.h> |
28 | |
/* One transliteration rule: a source key and its ASCII replacement.
 *
 * Both fields are packed 16-bit codes that are unpacked with the
 * get_src_char(), get_length() and get_ascii_item() macros in this
 * file: when bit 0x8000 is clear the field holds the value directly,
 * otherwise the low 12 bits are an offset into a side table and bits
 * 0x7000 give the length.
 */
struct mapping_entry
{
  guint16 src;   /* key; decoded against src_table */
  guint16 ascii; /* replacement; decoded against ascii_table */
};
34 | |
/* A contiguous run of entries inside mappings_table that
 * lookup_in_item() hands to lookup_in_mapping() as one searchable unit.
 */
struct mapping_range
{
  guint16 start;  /* offset of the first entry, added to mappings_table */
  guint16 length; /* number of entries in the run */
};
40 | |
/* Associates a locale name with the item to use for that locale. */
struct locale_entry
{
  guint8 name_offset; /* offset of the name's first byte in locale_names */
  guint8 item_id;     /* item reported by lookup_item_id_for_one_locale() */
};
46 | |
47 | #include "gtranslit-data.h" |
48 | |
/* Helpers for the packed 16-bit codes stored in struct mapping_entry.
 *
 * A code with bit 0x8000 clear is a literal value of length 1.  With
 * bit 0x8000 set, bits 0x7000 hold the length and the low 12 bits are
 * an offset into the side table passed as 'array'.
 *
 * Every argument is parenthesised in the expansion so that compound
 * expressions can be passed safely.  'encoded' is still evaluated more
 * than once, so it must be free of side effects.
 */

/* Character number 'index' of the key described by 'encoded'. */
#define get_src_char(array, encoded, index) \
  (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))

/* Number of characters described by 'encoded'. */
#define get_length(encoded) \
  (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)

/* Pointer to the first ASCII byte described by 'encoded'.  For a
 * literal code the pointer refers to storage of 'encoded' itself, so
 * 'encoded' must be an lvalue; the byte offset into it is chosen
 * according to the host byte order (offset 1 on big-endian hosts,
 * offset 0 otherwise).
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define get_ascii_item(array, encoded) \
  (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
#else
#define get_ascii_item(array, encoded) \
  (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
#endif
57 | |
/* Declared ahead of its definition because lookup_in_chain() and
 * lookup_in_item() call each other.
 */
static const gchar * lookup_in_item (guint           item_id,
                                     const gunichar *key,
                                     gint           *result_len,
                                     gint           *key_consumed);
62 | |
63 | static gint |
64 | compare_mapping_entry (gconstpointer user_data, |
65 | gconstpointer data) |
66 | { |
67 | const struct mapping_entry *entry = data; |
68 | const gunichar *key = user_data; |
69 | gunichar src_0; |
70 | |
71 | G_STATIC_ASSERT(MAX_KEY_SIZE == 2); |
72 | |
73 | src_0 = get_src_char (src_table, entry->src, 0); |
74 | |
75 | if (key[0] > src_0) |
76 | return 1; |
77 | else if (key[0] < src_0) |
78 | return -1; |
79 | |
80 | if (get_length (entry->src) > 1) |
81 | { |
82 | gunichar src_1; |
83 | |
84 | src_1 = get_src_char (src_table, entry->src, 1); |
85 | |
86 | if (key[1] > src_1) |
87 | return 1; |
88 | else if (key[1] < src_1) |
89 | return -1; |
90 | } |
91 | else if (key[1]) |
92 | return 1; |
93 | |
94 | return 0; |
95 | } |
96 | |
97 | static const gchar * |
98 | lookup_in_mapping (const struct mapping_entry *mapping, |
99 | gint mapping_size, |
100 | const gunichar *key, |
101 | gint *result_len, |
102 | gint *key_consumed) |
103 | { |
104 | const struct mapping_entry *hit; |
105 | |
106 | hit = bsearch (key: key, base: mapping, nmemb: mapping_size, size: sizeof (struct mapping_entry), compar: compare_mapping_entry); |
107 | |
108 | if (hit == NULL) |
109 | return NULL; |
110 | |
111 | *key_consumed = get_length (hit->src); |
112 | *result_len = get_length (hit->ascii); |
113 | |
114 | return get_ascii_item(ascii_table, hit->ascii); |
115 | } |
116 | |
117 | static const gchar * |
118 | lookup_in_chain (const guint8 *chain, |
119 | const gunichar *key, |
120 | gint *result_len, |
121 | gint *key_consumed) |
122 | { |
123 | const gchar *result; |
124 | |
125 | while (*chain != 0xff) |
126 | { |
127 | result = lookup_in_item (item_id: *chain, key, result_len, key_consumed); |
128 | |
129 | if (result) |
130 | return result; |
131 | |
132 | chain++; |
133 | } |
134 | |
135 | return NULL; |
136 | } |
137 | |
138 | static const gchar * |
139 | lookup_in_item (guint item_id, |
140 | const gunichar *key, |
141 | gint *result_len, |
142 | gint *key_consumed) |
143 | { |
144 | if (item_id & 0x80) |
145 | { |
146 | const guint8 *chain = chains_table + chain_starts[item_id & 0x7f]; |
147 | |
148 | return lookup_in_chain (chain, key, result_len, key_consumed); |
149 | } |
150 | else |
151 | { |
152 | const struct mapping_range *range = &mapping_ranges[item_id]; |
153 | |
154 | return lookup_in_mapping (mapping: mappings_table + range->start, mapping_size: range->length, key, result_len, key_consumed); |
155 | } |
156 | } |
157 | |
158 | static gint |
159 | compare_locale_entry (gconstpointer user_data, |
160 | gconstpointer data) |
161 | { |
162 | const struct locale_entry *entry = data; |
163 | const gchar *key = user_data; |
164 | |
165 | return strcmp (s1: key, s2: &locale_names[entry->name_offset]); |
166 | } |
167 | |
168 | static gboolean |
169 | lookup_item_id_for_one_locale (const gchar *key, |
170 | guint *item_id) |
171 | { |
172 | const struct locale_entry *hit; |
173 | |
174 | hit = bsearch (key: key, base: locale_index, G_N_ELEMENTS (locale_index), size: sizeof (struct locale_entry), compar: compare_locale_entry); |
175 | |
176 | if (hit == NULL) |
177 | return FALSE; |
178 | |
179 | *item_id = hit->item_id; |
180 | return TRUE; |
181 | } |
182 | |
183 | static guint |
184 | lookup_item_id_for_locale (const gchar *locale) |
185 | { |
186 | gchar key[MAX_LOCALE_NAME + 1]; |
187 | const gchar *language; |
188 | guint language_len; |
189 | const gchar *territory = NULL; |
190 | guint territory_len = 0; |
191 | const gchar *modifier = NULL; |
192 | guint modifier_len = 0; |
193 | const gchar *next_char; |
194 | guint id; |
195 | |
196 | /* As per POSIX, a valid locale looks like: |
197 | * |
198 | * language[_territory][.codeset][@modifier] |
199 | */ |
200 | language = locale; |
201 | language_len = strcspn (s: language, reject: "_.@" ); |
202 | next_char = language + language_len; |
203 | |
204 | if (*next_char == '_') |
205 | { |
206 | territory = next_char; |
207 | territory_len = strcspn (s: territory + 1, reject: "_.@" ) + 1; |
208 | next_char = territory + territory_len; |
209 | } |
210 | |
211 | if (*next_char == '.') |
212 | { |
213 | const gchar *codeset; |
214 | guint codeset_len; |
215 | |
216 | codeset = next_char; |
217 | codeset_len = strcspn (s: codeset + 1, reject: "_.@" ) + 1; |
218 | next_char = codeset + codeset_len; |
219 | } |
220 | |
221 | if (*next_char == '@') |
222 | { |
223 | modifier = next_char; |
224 | modifier_len = strcspn (s: modifier + 1, reject: "_.@" ) + 1; |
225 | next_char = modifier + modifier_len; |
226 | } |
227 | |
228 | /* What madness is this? */ |
229 | if (language_len == 0 || *next_char) |
230 | return default_item_id; |
231 | |
232 | /* We are not interested in codeset. |
233 | * |
234 | * For this locale: |
235 | * |
236 | * aa_BB@cc |
237 | * |
238 | * try in this order: |
239 | * |
240 | * Note: we have no locales of the form aa_BB@cc in the database. |
241 | * |
242 | * 1. aa@cc |
243 | * 2. aa_BB |
244 | * 3. aa |
245 | */ |
246 | |
247 | /* 1. */ |
248 | if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME) |
249 | { |
250 | memcpy (dest: key, src: language, n: language_len); |
251 | memcpy (dest: key + language_len, src: modifier, n: modifier_len); |
252 | key[language_len + modifier_len] = '\0'; |
253 | |
254 | if (lookup_item_id_for_one_locale (key, item_id: &id)) |
255 | return id; |
256 | } |
257 | |
258 | /* 2. */ |
259 | if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME) |
260 | { |
261 | memcpy (dest: key, src: language, n: language_len); |
262 | memcpy (dest: key + language_len, src: territory, n: territory_len); |
263 | key[language_len + territory_len] = '\0'; |
264 | |
265 | if (lookup_item_id_for_one_locale (key, item_id: &id)) |
266 | return id; |
267 | } |
268 | |
269 | /* 3. */ |
270 | if (language_len <= MAX_LOCALE_NAME) |
271 | { |
272 | memcpy (dest: key, src: language, n: language_len); |
273 | key[language_len] = '\0'; |
274 | |
275 | if (lookup_item_id_for_one_locale (key, item_id: &id)) |
276 | return id; |
277 | } |
278 | |
279 | return default_item_id; |
280 | } |
281 | |
282 | static guint |
283 | get_default_item_id (void) |
284 | { |
285 | static guint item_id; |
286 | static gboolean done; |
287 | |
288 | /* Doesn't need to be locked -- no harm in doing it twice. */ |
289 | if (!done) |
290 | { |
291 | const gchar *locale; |
292 | |
293 | locale = setlocale (LC_CTYPE, NULL); |
294 | item_id = lookup_item_id_for_locale (locale); |
295 | done = TRUE; |
296 | } |
297 | |
298 | return item_id; |
299 | } |
300 | |
301 | /** |
302 | * g_str_to_ascii: |
303 | * @str: a string, in UTF-8 |
304 | * @from_locale: (nullable): the source locale, if known |
305 | * |
306 | * Transliterate @str to plain ASCII. |
307 | * |
308 | * For best results, @str should be in composed normalised form. |
309 | * |
310 | * This function performs a reasonably good set of character |
311 | * replacements. The particular set of replacements that is done may |
312 | * change by version or even by runtime environment. |
313 | * |
314 | * If the source language of @str is known, it can used to improve the |
315 | * accuracy of the translation by passing it as @from_locale. It should |
316 | * be a valid POSIX locale string (of the form |
317 | * `language[_territory][.codeset][@modifier]`). |
318 | * |
319 | * If @from_locale is %NULL then the current locale is used. |
320 | * |
321 | * If you want to do translation for no specific locale, and you want it |
322 | * to be done independently of the currently locale, specify `"C"` for |
323 | * @from_locale. |
324 | * |
325 | * Returns: a string in plain ASCII |
326 | * |
327 | * Since: 2.40 |
328 | **/ |
329 | gchar * |
330 | g_str_to_ascii (const gchar *str, |
331 | const gchar *from_locale) |
332 | { |
333 | GString *result; |
334 | guint item_id; |
335 | |
336 | g_return_val_if_fail (str != NULL, NULL); |
337 | |
338 | if (g_str_is_ascii (str)) |
339 | return g_strdup (str); |
340 | |
341 | if (from_locale) |
342 | item_id = lookup_item_id_for_locale (locale: from_locale); |
343 | else |
344 | item_id = get_default_item_id (); |
345 | |
346 | result = g_string_sized_new (dfl_size: strlen (s: str)); |
347 | |
348 | while (*str) |
349 | { |
350 | /* We only need to transliterate non-ASCII values... */ |
351 | if (*str & 0x80) |
352 | { |
353 | gunichar key[MAX_KEY_SIZE]; |
354 | const gchar *r; |
355 | gint consumed; |
356 | gint r_len; |
357 | gunichar c; |
358 | |
359 | G_STATIC_ASSERT(MAX_KEY_SIZE == 2); |
360 | |
361 | c = g_utf8_get_char (p: str); |
362 | |
363 | /* This is where it gets evil... |
364 | * |
365 | * We know that MAX_KEY_SIZE is 2. We also know that we |
366 | * only want to try another character if it's non-ascii. |
367 | */ |
368 | str = g_utf8_next_char (str); |
369 | |
370 | key[0] = c; |
371 | if (*str & 0x80) |
372 | key[1] = g_utf8_get_char (p: str); |
373 | else |
374 | key[1] = 0; |
375 | |
376 | r = lookup_in_item (item_id, key, result_len: &r_len, key_consumed: &consumed); |
377 | |
378 | /* If we failed to map two characters, try again with one. |
379 | * |
380 | * gconv behaviour is a bit weird here -- it seems to |
381 | * depend in the randomness of the binary search and the |
382 | * size of the input buffer as to what result we get here. |
383 | * |
384 | * Doing it this way is more work, but should be |
385 | * more-correct. |
386 | */ |
387 | if (r == NULL && key[1]) |
388 | { |
389 | key[1] = 0; |
390 | r = lookup_in_item (item_id, key, result_len: &r_len, key_consumed: &consumed); |
391 | } |
392 | |
393 | if (r != NULL) |
394 | { |
395 | g_string_append_len (string: result, val: r, len: r_len); |
396 | if (consumed == 2) |
397 | /* If it took both then skip again */ |
398 | str = g_utf8_next_char (str); |
399 | } |
400 | else /* no match found */ |
401 | g_string_append_c (result, '?'); |
402 | } |
403 | else /* ASCII case */ |
404 | g_string_append_c (result, *str++); |
405 | } |
406 | |
407 | return g_string_free (string: result, FALSE); |
408 | } |
409 | |