1/*
2 * Copyright © 2014 Canonical Limited
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
16 *
17 * Author: Ryan Lortie <desrt@desrt.ca>
18 */
19
20#include <config.h>
21
22#include "gstrfuncs.h"
23
24#include <glib.h>
25#include <locale.h>
26#include <stdlib.h>
27#include <string.h>
28
/* One transliteration rule: a source key and its ASCII replacement.
 *
 * Both fields use the same 16-bit encoding, decoded by get_length(),
 * get_src_char() and get_ascii_item(): when bit 0x8000 is clear the
 * field holds the value itself (length 1); when it is set, the low 12
 * bits are an offset into a side table and bits 12..14 hold the length.
 */
struct mapping_entry
{
  guint16 src;   /* source key; side table is src_table */
  guint16 ascii; /* replacement text; side table is ascii_table */
};
34
/* A run of consecutive entries inside mappings_table.
 *
 * lookup_in_item() binary-searches the `length` entries that begin at
 * index `start`.
 */
struct mapping_range
{
  guint16 start;  /* index of the first entry in mappings_table */
  guint16 length; /* number of entries in the run */
};
40
/* One row of locale_index: associates a locale name with the item id
 * that is later handed to lookup_in_item().
 */
struct locale_entry
{
  guint8 name_offset; /* offset of the NUL-terminated name in locale_names */
  guint8 item_id;     /* item id returned for this locale */
};
46
47#include "gtranslit-data.h"
48
/* Decoders for the 16-bit fields of struct mapping_entry.
 *
 * If bit 0x8000 of @encoded is clear, the field holds a single value
 * directly.  If it is set, the low 12 bits are an offset into @array
 * and bits 12..14 give the number of elements stored there.
 *
 * All arguments are fully parenthesised so that expression arguments
 * (e.g. `a | b`, `i + 1`) expand correctly.  Note that @encoded is
 * still evaluated more than once, so it must not have side effects.
 */

/* The @index'th source character described by @encoded. */
#define get_src_char(array, encoded, index) \
  (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))

/* Number of elements described by @encoded (1 for a direct value). */
#define get_length(encoded) \
  (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)

/* Pointer to the ASCII replacement bytes described by @encoded.
 *
 * For a direct value the single replacement byte lives in the low byte
 * of the 16-bit field itself, so a pointer into @encoded is returned:
 * that byte is at offset 1 on big-endian hosts and offset 0 otherwise.
 * @encoded must therefore be an lvalue that outlives the result.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define get_ascii_item(array, encoded) \
  (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) (((char *) &(encoded)) + 1))
#else
#define get_ascii_item(array, encoded) \
  (((encoded) & 0x8000) ? &(array)[(encoded) & 0xfff] : (gpointer) &(encoded))
#endif
57
/* Forward declaration: lookup_in_chain() calls lookup_in_item(), which
 * is defined after it and can itself call back into lookup_in_chain().
 */
static const gchar * lookup_in_item (guint           item_id,
                                     const gunichar *key,
                                     gint           *result_len,
                                     gint           *key_consumed);
62
63static gint
64compare_mapping_entry (gconstpointer user_data,
65 gconstpointer data)
66{
67 const struct mapping_entry *entry = data;
68 const gunichar *key = user_data;
69 gunichar src_0;
70
71 G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
72
73 src_0 = get_src_char (src_table, entry->src, 0);
74
75 if (key[0] > src_0)
76 return 1;
77 else if (key[0] < src_0)
78 return -1;
79
80 if (get_length (entry->src) > 1)
81 {
82 gunichar src_1;
83
84 src_1 = get_src_char (src_table, entry->src, 1);
85
86 if (key[1] > src_1)
87 return 1;
88 else if (key[1] < src_1)
89 return -1;
90 }
91 else if (key[1])
92 return 1;
93
94 return 0;
95}
96
97static const gchar *
98lookup_in_mapping (const struct mapping_entry *mapping,
99 gint mapping_size,
100 const gunichar *key,
101 gint *result_len,
102 gint *key_consumed)
103{
104 const struct mapping_entry *hit;
105
106 hit = bsearch (key: key, base: mapping, nmemb: mapping_size, size: sizeof (struct mapping_entry), compar: compare_mapping_entry);
107
108 if (hit == NULL)
109 return NULL;
110
111 *key_consumed = get_length (hit->src);
112 *result_len = get_length (hit->ascii);
113
114 return get_ascii_item(ascii_table, hit->ascii);
115}
116
117static const gchar *
118lookup_in_chain (const guint8 *chain,
119 const gunichar *key,
120 gint *result_len,
121 gint *key_consumed)
122{
123 const gchar *result;
124
125 while (*chain != 0xff)
126 {
127 result = lookup_in_item (item_id: *chain, key, result_len, key_consumed);
128
129 if (result)
130 return result;
131
132 chain++;
133 }
134
135 return NULL;
136}
137
138static const gchar *
139lookup_in_item (guint item_id,
140 const gunichar *key,
141 gint *result_len,
142 gint *key_consumed)
143{
144 if (item_id & 0x80)
145 {
146 const guint8 *chain = chains_table + chain_starts[item_id & 0x7f];
147
148 return lookup_in_chain (chain, key, result_len, key_consumed);
149 }
150 else
151 {
152 const struct mapping_range *range = &mapping_ranges[item_id];
153
154 return lookup_in_mapping (mapping: mappings_table + range->start, mapping_size: range->length, key, result_len, key_consumed);
155 }
156}
157
158static gint
159compare_locale_entry (gconstpointer user_data,
160 gconstpointer data)
161{
162 const struct locale_entry *entry = data;
163 const gchar *key = user_data;
164
165 return strcmp (s1: key, s2: &locale_names[entry->name_offset]);
166}
167
168static gboolean
169lookup_item_id_for_one_locale (const gchar *key,
170 guint *item_id)
171{
172 const struct locale_entry *hit;
173
174 hit = bsearch (key: key, base: locale_index, G_N_ELEMENTS (locale_index), size: sizeof (struct locale_entry), compar: compare_locale_entry);
175
176 if (hit == NULL)
177 return FALSE;
178
179 *item_id = hit->item_id;
180 return TRUE;
181}
182
183static guint
184lookup_item_id_for_locale (const gchar *locale)
185{
186 gchar key[MAX_LOCALE_NAME + 1];
187 const gchar *language;
188 guint language_len;
189 const gchar *territory = NULL;
190 guint territory_len = 0;
191 const gchar *modifier = NULL;
192 guint modifier_len = 0;
193 const gchar *next_char;
194 guint id;
195
196 /* As per POSIX, a valid locale looks like:
197 *
198 * language[_territory][.codeset][@modifier]
199 */
200 language = locale;
201 language_len = strcspn (s: language, reject: "_.@");
202 next_char = language + language_len;
203
204 if (*next_char == '_')
205 {
206 territory = next_char;
207 territory_len = strcspn (s: territory + 1, reject: "_.@") + 1;
208 next_char = territory + territory_len;
209 }
210
211 if (*next_char == '.')
212 {
213 const gchar *codeset;
214 guint codeset_len;
215
216 codeset = next_char;
217 codeset_len = strcspn (s: codeset + 1, reject: "_.@") + 1;
218 next_char = codeset + codeset_len;
219 }
220
221 if (*next_char == '@')
222 {
223 modifier = next_char;
224 modifier_len = strcspn (s: modifier + 1, reject: "_.@") + 1;
225 next_char = modifier + modifier_len;
226 }
227
228 /* What madness is this? */
229 if (language_len == 0 || *next_char)
230 return default_item_id;
231
232 /* We are not interested in codeset.
233 *
234 * For this locale:
235 *
236 * aa_BB@cc
237 *
238 * try in this order:
239 *
240 * Note: we have no locales of the form aa_BB@cc in the database.
241 *
242 * 1. aa@cc
243 * 2. aa_BB
244 * 3. aa
245 */
246
247 /* 1. */
248 if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
249 {
250 memcpy (dest: key, src: language, n: language_len);
251 memcpy (dest: key + language_len, src: modifier, n: modifier_len);
252 key[language_len + modifier_len] = '\0';
253
254 if (lookup_item_id_for_one_locale (key, item_id: &id))
255 return id;
256 }
257
258 /* 2. */
259 if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
260 {
261 memcpy (dest: key, src: language, n: language_len);
262 memcpy (dest: key + language_len, src: territory, n: territory_len);
263 key[language_len + territory_len] = '\0';
264
265 if (lookup_item_id_for_one_locale (key, item_id: &id))
266 return id;
267 }
268
269 /* 3. */
270 if (language_len <= MAX_LOCALE_NAME)
271 {
272 memcpy (dest: key, src: language, n: language_len);
273 key[language_len] = '\0';
274
275 if (lookup_item_id_for_one_locale (key, item_id: &id))
276 return id;
277 }
278
279 return default_item_id;
280}
281
282static guint
283get_default_item_id (void)
284{
285 static guint item_id;
286 static gboolean done;
287
288 /* Doesn't need to be locked -- no harm in doing it twice. */
289 if (!done)
290 {
291 const gchar *locale;
292
293 locale = setlocale (LC_CTYPE, NULL);
294 item_id = lookup_item_id_for_locale (locale);
295 done = TRUE;
296 }
297
298 return item_id;
299}
300
301/**
302 * g_str_to_ascii:
303 * @str: a string, in UTF-8
304 * @from_locale: (nullable): the source locale, if known
305 *
306 * Transliterate @str to plain ASCII.
307 *
308 * For best results, @str should be in composed normalised form.
309 *
310 * This function performs a reasonably good set of character
311 * replacements. The particular set of replacements that is done may
312 * change by version or even by runtime environment.
313 *
314 * If the source language of @str is known, it can used to improve the
315 * accuracy of the translation by passing it as @from_locale. It should
316 * be a valid POSIX locale string (of the form
317 * `language[_territory][.codeset][@modifier]`).
318 *
319 * If @from_locale is %NULL then the current locale is used.
320 *
321 * If you want to do translation for no specific locale, and you want it
322 * to be done independently of the currently locale, specify `"C"` for
323 * @from_locale.
324 *
325 * Returns: a string in plain ASCII
326 *
327 * Since: 2.40
328 **/
329gchar *
330g_str_to_ascii (const gchar *str,
331 const gchar *from_locale)
332{
333 GString *result;
334 guint item_id;
335
336 g_return_val_if_fail (str != NULL, NULL);
337
338 if (g_str_is_ascii (str))
339 return g_strdup (str);
340
341 if (from_locale)
342 item_id = lookup_item_id_for_locale (locale: from_locale);
343 else
344 item_id = get_default_item_id ();
345
346 result = g_string_sized_new (dfl_size: strlen (s: str));
347
348 while (*str)
349 {
350 /* We only need to transliterate non-ASCII values... */
351 if (*str & 0x80)
352 {
353 gunichar key[MAX_KEY_SIZE];
354 const gchar *r;
355 gint consumed;
356 gint r_len;
357 gunichar c;
358
359 G_STATIC_ASSERT(MAX_KEY_SIZE == 2);
360
361 c = g_utf8_get_char (p: str);
362
363 /* This is where it gets evil...
364 *
365 * We know that MAX_KEY_SIZE is 2. We also know that we
366 * only want to try another character if it's non-ascii.
367 */
368 str = g_utf8_next_char (str);
369
370 key[0] = c;
371 if (*str & 0x80)
372 key[1] = g_utf8_get_char (p: str);
373 else
374 key[1] = 0;
375
376 r = lookup_in_item (item_id, key, result_len: &r_len, key_consumed: &consumed);
377
378 /* If we failed to map two characters, try again with one.
379 *
380 * gconv behaviour is a bit weird here -- it seems to
381 * depend in the randomness of the binary search and the
382 * size of the input buffer as to what result we get here.
383 *
384 * Doing it this way is more work, but should be
385 * more-correct.
386 */
387 if (r == NULL && key[1])
388 {
389 key[1] = 0;
390 r = lookup_in_item (item_id, key, result_len: &r_len, key_consumed: &consumed);
391 }
392
393 if (r != NULL)
394 {
395 g_string_append_len (string: result, val: r, len: r_len);
396 if (consumed == 2)
397 /* If it took both then skip again */
398 str = g_utf8_next_char (str);
399 }
400 else /* no match found */
401 g_string_append_c (result, '?');
402 }
403 else /* ASCII case */
404 g_string_append_c (result, *str++);
405 }
406
407 return g_string_free (string: result, FALSE);
408}
409

source code of gtk/subprojects/glib/glib/gtranslit.c