gtranslit.c source code [gtk/subprojects/glib/glib/gtranslit.c]

1	/*
2	* Copyright © 2014 Canonical Limited
3	*
4	* This library is free software; you can redistribute it and/or
5	* modify it under the terms of the GNU Lesser General Public
6	* License as published by the Free Software Foundation; either
7	* version 2.1 of the License, or (at your option) any later version.
8	*
9	* This library is distributed in the hope that it will be useful,
10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12	* Lesser General Public License for more details.
13	*
14	* You should have received a copy of the GNU Lesser General Public
15	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
16	*
17	* Author: Ryan Lortie <desrt@desrt.ca>
18	*/
19
20	#include <config.h>
21
22	#include "gstrfuncs.h"
23
24	#include <glib.h>
25	#include <locale.h>
26	#include <stdlib.h>
27	#include <string.h>
28
29	struct mapping_entry
30	{
31	guint16 src;
32	guint16 ascii;
33	};
34
35	struct mapping_range
36	{
37	guint16 start;
38	guint16 length;
39	};
40
41	struct locale_entry
42	{
43	guint8 name_offset;
44	guint8 item_id;
45	};
46
47	#include "gtranslit-data.h"
48
/* Decoding helpers for the 16-bit encoded values stored in the tables.
 *
 * If bit 0x8000 of `encoded` is clear, `encoded` is itself the (single)
 * character.  If it is set, the low 12 bits are an offset into `array`
 * and bits 12-14 give the number of elements stored there.
 *
 * get_src_char(): character number `index` of the encoded value (for an
 *                 inline value `index` is ignored and `encoded` is returned).
 * get_length():   number of characters the encoded value stands for.
 *
 * Every macro argument is parenthesised so that expression arguments
 * (e.g. `a | b`, `i ^ 1`) are evaluated as a unit.  Arguments may be
 * evaluated more than once, so they must be free of side effects.
 */
#define get_src_char(array, encoded, index) \
  (((encoded) & 0x8000) ? (array)[((encoded) & 0xfff) + (index)] : (encoded))
#define get_length(encoded) \
  (((encoded) & 0x8000) ? (((encoded) & 0x7000) >> 12) : 1)
51
/* get_ascii_item(): pointer to the ASCII replacement text described by
 * `encoded`.
 *
 * With bit 0x8000 set, the low 12 bits are an offset into `array` and the
 * result points there.  Otherwise the replacement is stored inside
 * `encoded` itself and the result points at a byte of that object: byte 1
 * on big-endian hosts, byte 0 on little-endian hosts (the low-order byte
 * of the guint16 fields this is used with).
 *
 * `encoded` must therefore be an lvalue, and the returned pointer is only
 * valid for as long as that object is.
 */
#if G_BYTE_ORDER == G_BIG_ENDIAN
#define GET_ASCII_ITEM_INLINE_OFFSET 1
#else
#define GET_ASCII_ITEM_INLINE_OFFSET 0
#endif

#define get_ascii_item(array, encoded) \
  (((encoded) & 0x8000) \
     ? &(array)[(encoded) & 0xfff] \
     : (gpointer) (((char *) &(encoded)) + GET_ASCII_ITEM_INLINE_OFFSET))
57
58	static const gchar * lookup_in_item (guint item_id,
59	const gunichar *key,
60	gint *result_len,
61	gint *key_consumed);
62
63	static gint
64	compare_mapping_entry (gconstpointer user_data,
65	gconstpointer data)
66	{
67	const struct mapping_entry *entry = data;
68	const gunichar *key = user_data;
69	gunichar src_0;
70
71	G_STATIC_ASSERT(MAX_KEY_SIZE == `2`);
72
73	src_0 = get_src_char (src_table, entry->src, `0`);
74
75	if (key[`0`] > src_0)
76	return `1`;
77	else if (key[`0`] < src_0)
78	return -`1`;
79
80	if (get_length (entry->src) > `1`)
81	{
82	gunichar src_1;
83
84	src_1 = get_src_char (src_table, entry->src, `1`);
85
86	if (key[`1`] > src_1)
87	return `1`;
88	else if (key[`1`] < src_1)
89	return -`1`;
90	}
91	else if (key[`1`])
92	return `1`;
93
94	return `0`;
95	}
96
97	static const gchar *
98	lookup_in_mapping (const struct mapping_entry *mapping,
99	gint mapping_size,
100	const gunichar *key,
101	gint *result_len,
102	gint *key_consumed)
103	{
104	const struct mapping_entry *hit;
105
106	hit = bsearch (key: key, base: mapping, nmemb: mapping_size, size: sizeof (struct mapping_entry), compar: compare_mapping_entry);
107
108	if (hit == NULL)
109	return NULL;
110
111	*key_consumed = get_length (hit->src);
112	*result_len = get_length (hit->ascii);
113
114	return get_ascii_item(ascii_table, hit->ascii);
115	}
116
117	static const gchar *
118	lookup_in_chain (const guint8 *chain,
119	const gunichar *key,
120	gint *result_len,
121	gint *key_consumed)
122	{
123	const gchar *result;
124
125	while (*chain != `0xff`)
126	{
127	result = lookup_in_item (item_id: *chain, key, result_len, key_consumed);
128
129	if (result)
130	return result;
131
132	chain++;
133	}
134
135	return NULL;
136	}
137
138	static const gchar *
139	lookup_in_item (guint item_id,
140	const gunichar *key,
141	gint *result_len,
142	gint *key_consumed)
143	{
144	if (item_id & `0x80`)
145	{
146	const guint8 *chain = chains_table + chain_starts[item_id & `0x7f`];
147
148	return lookup_in_chain (chain, key, result_len, key_consumed);
149	}
150	else
151	{
152	const struct mapping_range *range = &mapping_ranges[item_id];
153
154	return lookup_in_mapping (mapping: mappings_table + range->start, mapping_size: range->length, key, result_len, key_consumed);
155	}
156	}
157
158	static gint
159	compare_locale_entry (gconstpointer user_data,
160	gconstpointer data)
161	{
162	const struct locale_entry *entry = data;
163	const gchar *key = user_data;
164
165	return strcmp (s1: key, s2: &locale_names[entry->name_offset]);
166	}
167
168	static gboolean
169	lookup_item_id_for_one_locale (const gchar *key,
170	guint *item_id)
171	{
172	const struct locale_entry *hit;
173
174	hit = bsearch (key: key, base: locale_index, G_N_ELEMENTS (locale_index), size: sizeof (struct locale_entry), compar: compare_locale_entry);
175
176	if (hit == NULL)
177	return FALSE;
178
179	*item_id = hit->item_id;
180	return TRUE;
181	}
182
183	static guint
184	lookup_item_id_for_locale (const gchar *locale)
185	{
186	gchar key[MAX_LOCALE_NAME + `1`];
187	const gchar *language;
188	guint language_len;
189	const gchar *territory = NULL;
190	guint territory_len = `0`;
191	const gchar *modifier = NULL;
192	guint modifier_len = `0`;
193	const gchar *next_char;
194	guint id;
195
196	/ As per POSIX, a valid locale looks like:*
197	*
198	* language[_territory][.codeset][@modifier]
199	*/
200	language = locale;
201	language_len = strcspn (s: language, reject: "_.@");
202	next_char = language + language_len;
203
204	if (*next_char == `'_'`)
205	{
206	territory = next_char;
207	territory_len = strcspn (s: territory + `1`, reject: "_.@") + `1`;
208	next_char = territory + territory_len;
209	}
210
211	if (*next_char == `'.'`)
212	{
213	const gchar *codeset;
214	guint codeset_len;
215
216	codeset = next_char;
217	codeset_len = strcspn (s: codeset + `1`, reject: "_.@") + `1`;
218	next_char = codeset + codeset_len;
219	}
220
221	if (*next_char == `'@'`)
222	{
223	modifier = next_char;
224	modifier_len = strcspn (s: modifier + `1`, reject: "_.@") + `1`;
225	next_char = modifier + modifier_len;
226	}
227
228	/ What madness is this? /
229	if (language_len == `0` \|\| *next_char)
230	return default_item_id;
231
232	/ We are not interested in codeset.*
233	*
234	* For this locale:
235	*
236	* aa_BB@cc
237	*
238	* try in this order:
239	*
240	* Note: we have no locales of the form aa_BB@cc in the database.
241	*
242	* 1. aa@cc
243	* 2. aa_BB
244	* 3. aa
245	*/
246
247	/ 1. /
248	if (modifier_len && language_len + modifier_len <= MAX_LOCALE_NAME)
249	{
250	memcpy (dest: key, src: language, n: language_len);
251	memcpy (dest: key + language_len, src: modifier, n: modifier_len);
252	key[language_len + modifier_len] = `'\0'`;
253
254	if (lookup_item_id_for_one_locale (key, item_id: &id))
255	return id;
256	}
257
258	/ 2. /
259	if (territory_len && language_len + territory_len <= MAX_LOCALE_NAME)
260	{
261	memcpy (dest: key, src: language, n: language_len);
262	memcpy (dest: key + language_len, src: territory, n: territory_len);
263	key[language_len + territory_len] = `'\0'`;
264
265	if (lookup_item_id_for_one_locale (key, item_id: &id))
266	return id;
267	}
268
269	/ 3. /
270	if (language_len <= MAX_LOCALE_NAME)
271	{
272	memcpy (dest: key, src: language, n: language_len);
273	key[language_len] = `'\0'`;
274
275	if (lookup_item_id_for_one_locale (key, item_id: &id))
276	return id;
277	}
278
279	return default_item_id;
280	}
281
282	static guint
283	get_default_item_id (void)
284	{
285	static guint item_id;
286	static gboolean done;
287
288	/ Doesn't need to be locked -- no harm in doing it twice. /
289	if (!done)
290	{
291	const gchar *locale;
292
293	locale = setlocale (LC_CTYPE, NULL);
294	item_id = lookup_item_id_for_locale (locale);
295	done = TRUE;
296	}
297
298	return item_id;
299	}
300
301	/**
302	* g_str_to_ascii:
303	* @str: a string, in UTF-8
304	* @from_locale: (nullable): the source locale, if known
305	*
306	* Transliterate @str to plain ASCII.
307	*
308	* For best results, @str should be in composed normalised form.
309	*
310	* This function performs a reasonably good set of character
311	* replacements. The particular set of replacements that is done may
312	* change by version or even by runtime environment.
313	*
314	* If the source language of @str is known, it can used to improve the
315	* accuracy of the translation by passing it as @from_locale. It should
316	* be a valid POSIX locale string (of the form
317	* `language[_territory][.codeset][@modifier]`).
318	*
319	* If @from_locale is %NULL then the current locale is used.
320	*
321	* If you want to do translation for no specific locale, and you want it
322	* to be done independently of the currently locale, specify `"C"` for
323	* @from_locale.
324	*
325	* Returns: a string in plain ASCII
326	*
327	* Since: 2.40
328	**/
329	gchar *
330	g_str_to_ascii (const gchar *str,
331	const gchar *from_locale)
332	{
333	GString *result;
334	guint item_id;
335
336	g_return_val_if_fail (str != NULL, NULL);
337
338	if (g_str_is_ascii (str))
339	return g_strdup (str);
340
341	if (from_locale)
342	item_id = lookup_item_id_for_locale (locale: from_locale);
343	else
344	item_id = get_default_item_id ();
345
346	result = g_string_sized_new (dfl_size: strlen (s: str));
347
348	while (*str)
349	{
350	/ We only need to transliterate non-ASCII values... /
351	if (*str & `0x80`)
352	{
353	gunichar key[MAX_KEY_SIZE];
354	const gchar *r;
355	gint consumed;
356	gint r_len;
357	gunichar c;
358
359	G_STATIC_ASSERT(MAX_KEY_SIZE == `2`);
360
361	c = g_utf8_get_char (p: str);
362
363	/ This is where it gets evil...*
364	*
365	* We know that MAX_KEY_SIZE is 2. We also know that we
366	* only want to try another character if it's non-ascii.
367	*/
368	str = g_utf8_next_char (str);
369
370	key[`0`] = c;
371	if (*str & `0x80`)
372	key[`1`] = g_utf8_get_char (p: str);
373	else
374	key[`1`] = `0`;
375
376	r = lookup_in_item (item_id, key, result_len: &r_len, key_consumed: &consumed);
377
378	/ If we failed to map two characters, try again with one.*
379	*
380	* gconv behaviour is a bit weird here -- it seems to
381	* depend in the randomness of the binary search and the
382	* size of the input buffer as to what result we get here.
383	*
384	* Doing it this way is more work, but should be
385	* more-correct.
386	*/
387	if (r == NULL && key[`1`])
388	{
389	key[`1`] = `0`;
390	r = lookup_in_item (item_id, key, result_len: &r_len, key_consumed: &consumed);
391	}
392
393	if (r != NULL)
394	{
395	g_string_append_len (string: result, val: r, len: r_len);
396	if (consumed == `2`)
397	/ If it took both then skip again /
398	str = g_utf8_next_char (str);
399	}
400	else / no match found /
401	g_string_append_c (result, `'?'`);
402	}
403	else / ASCII case /
404	g_string_append_c (result, *str++);
405	}
406
407	return g_string_free (string: result, FALSE);
408	}
409

source code of gtk/subprojects/glib/glib/gtranslit.c