gunicollate.c source code [gtk/subprojects/glib/glib/gunicollate.c]

1	/ gunicollate.c - Collation*
2	*
3	* Copyright 2001,2005 Red Hat, Inc.
4	*
5	* This library is free software; you can redistribute it and/or
6	* modify it under the terms of the GNU Lesser General Public
7	* License as published by the Free Software Foundation; either
8	* version 2.1 of the License, or (at your option) any later version.
9	*
10	* This library is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	* Lesser General Public License for more details.
14	*
15	* You should have received a copy of the GNU Lesser General Public License
16	* along with this library; if not, see <http://www.gnu.org/licenses/>.
17	*/
18
19	#include "config.h"
20
21	#include <locale.h>
22	#include <string.h>
23	#ifdef HAVE_WCHAR_H
24	#include <wchar.h>
25	#endif
26
27	#ifdef HAVE_CARBON
28	#include <CoreServices/CoreServices.h>
29	#endif
30
31	#include "gmem.h"
32	#include "gunicode.h"
33	#include "gunicodeprivate.h"
34	#include "gstring.h"
35	#include "gstrfuncs.h"
36	#include "gtestutils.h"
37	#include "gcharset.h"
38	#include "gconvert.h"
39
40	#if SIZEOF_WCHAR_T == 4 && defined(__STDC_ISO_10646__)
41	#define GUNICHAR_EQUALS_WCHAR_T 1
42	#endif
43
44	#ifdef _MSC_VER
45	/ Workaround for bug in MSVCR80.DLL /
46	static gsize
47	msc_strxfrm_wrapper (char *string1,
48	const char *string2,
49	gsize count)
50	{
51	if (!string1 \|\| count <= `0`)
52	{
53	char tmp;
54
55	return strxfrm (&tmp, string2, `1`);
56	}
57	return strxfrm (string1, string2, count);
58	}
59	#define strxfrm msc_strxfrm_wrapper
60	#endif
61
62	/**
63	* g_utf8_collate:
64	* @str1: a UTF-8 encoded string
65	* @str2: a UTF-8 encoded string
66	*
67	* Compares two strings for ordering using the linguistically
68	* correct rules for the [current locale][setlocale].
69	* When sorting a large number of strings, it will be significantly
70	* faster to obtain collation keys with g_utf8_collate_key() and
71	* compare the keys with strcmp() when sorting instead of sorting
72	* the original strings.
73	*
74	* Returns: < 0 if @str1 compares before @str2,
75	* 0 if they compare equal, > 0 if @str1 compares after @str2.
76	**/
77	gint
78	g_utf8_collate (const gchar *str1,
79	const gchar *str2)
80	{
81	gint result;
82
83	#ifdef HAVE_CARBON
84
85	UniChar *str1_utf16;
86	UniChar *str2_utf16;
87	glong len1;
88	glong len2;
89	SInt32 retval = `0`;
90
91	g_return_val_if_fail (str1 != NULL, `0`);
92	g_return_val_if_fail (str2 != NULL, `0`);
93
94	str1_utf16 = g_utf8_to_utf16 (str1, -`1`, NULL, &len1, NULL);
95	str2_utf16 = g_utf8_to_utf16 (str2, -`1`, NULL, &len2, NULL);
96
97	UCCompareTextDefault (kUCCollateStandardOptions,
98	str1_utf16, len1, str2_utf16, len2,
99	NULL, &retval);
100	result = retval;
101
102	g_free (str2_utf16);
103	g_free (str1_utf16);
104
105	#elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
106
107	gunichar *str1_norm;
108	gunichar *str2_norm;
109
110	g_return_val_if_fail (str1 != NULL, `0`);
111	g_return_val_if_fail (str2 != NULL, `0`);
112
113	str1_norm = _g_utf8_normalize_wc (str: str1, max_len: -`1`, mode: G_NORMALIZE_ALL_COMPOSE);
114	str2_norm = _g_utf8_normalize_wc (str: str2, max_len: -`1`, mode: G_NORMALIZE_ALL_COMPOSE);
115
116	result = wcscoll (s1: (wchar_t )str1_norm, s2: (wchar_t )str2_norm);
117
118	g_free (mem: str1_norm);
119	g_free (mem: str2_norm);
120
121	#else
122
123	const gchar *charset;
124	gchar *str1_norm;
125	gchar *str2_norm;
126
127	g_return_val_if_fail (str1 != NULL, `0`);
128	g_return_val_if_fail (str2 != NULL, `0`);
129
130	str1_norm = g_utf8_normalize (str1, -`1`, G_NORMALIZE_ALL_COMPOSE);
131	str2_norm = g_utf8_normalize (str2, -`1`, G_NORMALIZE_ALL_COMPOSE);
132
133	if (g_get_charset (&charset))
134	{
135	result = strcoll (str1_norm, str2_norm);
136	}
137	else
138	{
139	gchar *str1_locale = g_convert (str1_norm, -`1`, charset, "UTF-8", NULL, NULL, NULL);
140	gchar *str2_locale = g_convert (str2_norm, -`1`, charset, "UTF-8", NULL, NULL, NULL);
141
142	if (str1_locale && str2_locale)
143	result = strcoll (str1_locale, str2_locale);
144	else if (str1_locale)
145	result = -`1`;
146	else if (str2_locale)
147	result = `1`;
148	else
149	result = strcmp (str1_norm, str2_norm);
150
151	g_free (str1_locale);
152	g_free (str2_locale);
153	}
154
155	g_free (str1_norm);
156	g_free (str2_norm);
157
158	#endif
159
160	return result;
161	}
162
163	#if defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
164	/ We need UTF-8 encoding of numbers to encode the weights if*
165	* we are using wcsxfrm. However, we aren't encoding Unicode
166	* characters, so we can't simply use g_unichar_to_utf8.
167	*
168	* The following routine is taken (with modification) from GNU
169	* libc's strxfrm routine:
170	*
171	* Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
172	* Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
173	*/
174	static inline int
175	utf8_encode (char *buf, wchar_t val)
176	{
177	int retval;
178
179	if (val < `0x80`)
180	{
181	if (buf)
182	buf++ = (char*) val;
183	retval = `1`;
184	}
185	else
186	{
187	int step;
188
189	for (step = `2`; step < `6`; ++step)
190	if ((val & (~(guint32)`0` << (`5` * step + `1`))) == `0`)
191	break;
192	retval = step;
193
194	if (buf)
195	{
196	buf = (unsigned* char) (~`0xff` >> step);
197	--step;
198	do
199	{
200	buf[step] = `0x80` \| (val & `0x3f`);
201	val >>= `6`;
202	}
203	while (--step > `0`);
204	*buf \|= val;
205	}
206	}
207
208	return retval;
209	}
210	#endif
211
212	#ifdef HAVE_CARBON
213
214	static gchar *
215	collate_key_to_string (UCCollationValue *key,
216	gsize key_len)
217	{
218	gchar *result;
219	gsize result_len;
220	long lkey = (long* *) key;
221
222	/ UCCollationValue format:*
223	*
224	* UCCollateOptions (32/64 bits)
225	* SizeInBytes (32/64 bits)
226	* Value (8 bits arrey)
227	*
228	* UCCollateOptions: ordering option mask of the collator
229	* used to create the key. Size changes on 32-bit / 64-bit
230	* hosts. On 64-bits also the extra half-word seems to have
231	* some extra (unknown) meaning.
232	* SizeInBytes: size of the whole structure, in bytes
233	* (including UCCollateOptions and SizeInBytes fields). Size
234	* changes on 32-bit & 64-bit hosts.
235	* Value: array of bytes containing the comparison weights.
236	* Seems to have several sub-strings separated by \001 and \002
237	* chars. Also, experience shows this is directly strcmp-able.
238	*/
239
240	result_len = lkey[`1`];
241	result = g_malloc (result_len + `1`);
242	memcpy (result, &lkey[`2`], result_len);
243	result[result_len] = `'\0'`;
244
245	return result;
246	}
247
248	static gchar *
249	carbon_collate_key_with_collator (const gchar *str,
250	gssize len,
251	CollatorRef collator)
252	{
253	UniChar *str_utf16 = NULL;
254	glong len_utf16;
255	OSStatus ret;
256	UCCollationValue staticbuf[`512`];
257	UCCollationValue *freeme = NULL;
258	UCCollationValue *buf;
259	ItemCount buf_len;
260	ItemCount key_len;
261	ItemCount try_len;
262	gchar *result = NULL;
263
264	str_utf16 = g_utf8_to_utf16 (str, len, NULL, &len_utf16, NULL);
265	try_len = len_utf16 * `5` + `2`;
266
267	if (try_len <= sizeof staticbuf)
268	{
269	buf = staticbuf;
270	buf_len = sizeof staticbuf;
271	}
272	else
273	{
274	freeme = g_new (UCCollationValue, try_len);
275	buf = freeme;
276	buf_len = try_len;
277	}
278
279	ret = UCGetCollationKey (collator, str_utf16, len_utf16,
280	buf_len, &key_len, buf);
281
282	if (ret == kCollateBufferTooSmall)
283	{
284	freeme = g_renew (UCCollationValue, freeme, try_len * `2`);
285	buf = freeme;
286	buf_len = try_len * `2`;
287	ret = UCGetCollationKey (collator, str_utf16, len_utf16,
288	buf_len, &key_len, buf);
289	}
290
291	if (ret == `0`)
292	result = collate_key_to_string (buf, key_len);
293	else
294	result = g_strdup ("");
295
296	g_free (freeme);
297	g_free (str_utf16);
298	return result;
299	}
300
301	static gchar *
302	carbon_collate_key (const gchar *str,
303	gssize len)
304	{
305	static CollatorRef collator;
306
307	if (G_UNLIKELY (!collator))
308	{
309	UCCreateCollator (NULL, `0`, kUCCollateStandardOptions, &collator);
310
311	if (!collator)
312	{
313	static gboolean been_here;
314	if (!been_here)
315	g_warning ("%s: UCCreateCollator failed", G_STRLOC);
316	been_here = TRUE;
317	return g_strdup ("");
318	}
319	}
320
321	return carbon_collate_key_with_collator (str, len, collator);
322	}
323
324	static gchar *
325	carbon_collate_key_for_filename (const gchar *str,
326	gssize len)
327	{
328	static CollatorRef collator;
329
330	if (G_UNLIKELY (!collator))
331	{
332	/ http://developer.apple.com/qa/qa2004/qa1159.html /
333	UCCreateCollator (NULL, `0`,
334	kUCCollateComposeInsensitiveMask
335	\| kUCCollateWidthInsensitiveMask
336	\| kUCCollateCaseInsensitiveMask
337	\| kUCCollateDigitsOverrideMask
338	\| kUCCollateDigitsAsNumberMask
339	\| kUCCollatePunctuationSignificantMask,
340	&collator);
341
342	if (!collator)
343	{
344	static gboolean been_here;
345	if (!been_here)
346	g_warning ("%s: UCCreateCollator failed", G_STRLOC);
347	been_here = TRUE;
348	return g_strdup ("");
349	}
350	}
351
352	return carbon_collate_key_with_collator (str, len, collator);
353	}
354
355	#endif /* HAVE_CARBON */
356
357	/**
358	* g_utf8_collate_key:
359	* @str: a UTF-8 encoded string.
360	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
361	*
362	* Converts a string into a collation key that can be compared
363	* with other collation keys produced by the same function using
364	* strcmp().
365	*
366	* The results of comparing the collation keys of two strings
367	* with strcmp() will always be the same as comparing the two
368	* original keys with g_utf8_collate().
369	*
370	* Note that this function depends on the [current locale][setlocale].
371	*
372	* Returns: a newly allocated string. This string should
373	* be freed with g_free() when you are done with it.
374	**/
375	gchar *
376	g_utf8_collate_key (const gchar *str,
377	gssize len)
378	{
379	gchar *result;
380
381	#ifdef HAVE_CARBON
382
383	g_return_val_if_fail (str != NULL, NULL);
384	result = carbon_collate_key (str, len);
385
386	#elif defined(HAVE_WCHAR_H) && defined(GUNICHAR_EQUALS_WCHAR_T)
387
388	gsize xfrm_len;
389	gunichar *str_norm;
390	wchar_t *result_wc;
391	gsize i;
392	gsize result_len = `0`;
393
394	g_return_val_if_fail (str != NULL, NULL);
395
396	str_norm = _g_utf8_normalize_wc (str, max_len: len, mode: G_NORMALIZE_ALL_COMPOSE);
397
398	xfrm_len = wcsxfrm (NULL, s2: (wchar_t *)str_norm, n: `0`);
399	result_wc = g_new (wchar_t, xfrm_len + `1`);
400	wcsxfrm (s1: result_wc, s2: (wchar_t *)str_norm, n: xfrm_len + `1`);
401
402	for (i = `0`; i < xfrm_len; i++)
403	result_len += utf8_encode (NULL, val: result_wc[i]);
404
405	result = g_malloc (n_bytes: result_len + `1`);
406	result_len = `0`;
407	for (i = `0`; i < xfrm_len; i++)
408	result_len += utf8_encode (buf: result + result_len, val: result_wc[i]);
409
410	result[result_len] = `'\0'`;
411
412	g_free (mem: result_wc);
413	g_free (mem: str_norm);
414
415	return result;
416	#else
417
418	gsize xfrm_len;
419	const gchar *charset;
420	gchar *str_norm;
421
422	g_return_val_if_fail (str != NULL, NULL);
423
424	str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
425
426	result = NULL;
427
428	if (g_get_charset (&charset))
429	{
430	xfrm_len = strxfrm (NULL, str_norm, `0`);
431	if (xfrm_len < G_MAXINT - `2`)
432	{
433	result = g_malloc (xfrm_len + `1`);
434	strxfrm (result, str_norm, xfrm_len + `1`);
435	}
436	}
437	else
438	{
439	gchar *str_locale = g_convert (str_norm, -`1`, charset, "UTF-8", NULL, NULL, NULL);
440
441	if (str_locale)
442	{
443	xfrm_len = strxfrm (NULL, str_locale, `0`);
444	if (xfrm_len < `0` \|\| xfrm_len >= G_MAXINT - `2`)
445	{
446	g_free (str_locale);
447	str_locale = NULL;
448	}
449	}
450	if (str_locale)
451	{
452	result = g_malloc (xfrm_len + `2`);
453	result[`0`] = `'A'`;
454	strxfrm (result + `1`, str_locale, xfrm_len + `1`);
455
456	g_free (str_locale);
457	}
458	}
459
460	if (!result)
461	{
462	xfrm_len = strlen (str_norm);
463	result = g_malloc (xfrm_len + `2`);
464	result[`0`] = `'B'`;
465	memcpy (result + `1`, str_norm, xfrm_len);
466	result[xfrm_len+`1`] = `'\0'`;
467	}
468
469	g_free (str_norm);
470	#endif
471
472	return result;
473	}
474
475	/ This is a collation key that is very very likely to sort before any*
476	* collation key that libc strxfrm generates. We use this before any
477	* special case (dot or number) to make sure that its sorted before
478	* anything else.
479	*/
480	#define COLLATION_SENTINEL "\1\1\1"
481
482	/**
483	* g_utf8_collate_key_for_filename:
484	* @str: a UTF-8 encoded string.
485	* @len: length of @str, in bytes, or -1 if @str is nul-terminated.
486	*
487	* Converts a string into a collation key that can be compared
488	* with other collation keys produced by the same function using strcmp().
489	*
490	* In order to sort filenames correctly, this function treats the dot '.'
491	* as a special case. Most dictionary orderings seem to consider it
492	* insignificant, thus producing the ordering "event.c" "eventgenerator.c"
493	* "event.h" instead of "event.c" "event.h" "eventgenerator.c". Also, we
494	* would like to treat numbers intelligently so that "file1" "file10" "file5"
495	* is sorted as "file1" "file5" "file10".
496	*
497	* Note that this function depends on the [current locale][setlocale].
498	*
499	* Returns: a newly allocated string. This string should
500	* be freed with g_free() when you are done with it.
501	*
502	* Since: 2.8
503	*/
504	gchar *
505	g_utf8_collate_key_for_filename (const gchar *str,
506	gssize len)
507	{
508	#ifndef HAVE_CARBON
509	GString *result;
510	GString *append;
511	const gchar *p;
512	const gchar *prev;
513	const gchar *end;
514	gchar *collate_key;
515	gint digits;
516	gint leading_zeros;
517
518	/*
519	* How it works:
520	*
521	* Split the filename into collatable substrings which do
522	* not contain [.0-9] and special-cased substrings. The collatable
523	* substrings are run through the normal g_utf8_collate_key() and the
524	* resulting keys are concatenated with keys generated from the
525	* special-cased substrings.
526	*
527	* Special cases: Dots are handled by replacing them with '\1' which
528	* implies that short dot-delimited substrings are before long ones,
529	* e.g.
530	*
531	* a\1a (a.a)
532	* a-\1a (a-.a)
533	* aa\1a (aa.a)
534	*
535	* Numbers are handled by prepending to each number d-1 superdigits
536	* where d = number of digits in the number and SUPERDIGIT is a
537	* character with an integer value higher than any digit (for instance
538	* ':'). This ensures that single-digit numbers are sorted before
539	* double-digit numbers which in turn are sorted separately from
540	* triple-digit numbers, etc. To avoid strange side-effects when
541	* sorting strings that already contain SUPERDIGITs, a '\2'
542	* is also prepended, like this
543	*
544	* file\21 (file1)
545	* file\25 (file5)
546	* file\2:10 (file10)
547	* file\2:26 (file26)
548	* file\2::100 (file100)
549	* file:foo (file:foo)
550	*
551	* This has the side-effect of sorting numbers before everything else (except
552	* dots), but this is probably OK.
553	*
554	* Leading digits are ignored when doing the above. To discriminate
555	* numbers which differ only in the number of leading digits, we append
556	* the number of leading digits as a byte at the very end of the collation
557	* key.
558	*
559	* To try avoid conflict with any collation key sequence generated by libc we
560	* start each switch to a special cased part with a sentinel that hopefully
561	* will sort before anything libc will generate.
562	*/
563
564	if (len < `0`)
565	len = strlen (s: str);
566
567	result = g_string_sized_new (dfl_size: len * `2`);
568	append = g_string_sized_new (dfl_size: `0`);
569
570	end = str + len;
571
572	/ No need to use utf8 functions, since we're only looking for ascii chars /
573	for (prev = p = str; p < end; p++)
574	{
575	switch (*p)
576	{
577	case `'.'`:
578	if (prev != p)
579	{
580	collate_key = g_utf8_collate_key (str: prev, len: p - prev);
581	g_string_append (string: result, val: collate_key);
582	g_free (mem: collate_key);
583	}
584
585	g_string_append (string: result, COLLATION_SENTINEL "\1");
586
587	/ skip the dot /
588	prev = p + `1`;
589	break;
590
591	case `'0'`:
592	case `'1'`:
593	case `'2'`:
594	case `'3'`:
595	case `'4'`:
596	case `'5'`:
597	case `'6'`:
598	case `'7'`:
599	case `'8'`:
600	case `'9'`:
601	if (prev != p)
602	{
603	collate_key = g_utf8_collate_key (str: prev, len: p - prev);
604	g_string_append (string: result, val: collate_key);
605	g_free (mem: collate_key);
606	}
607
608	g_string_append (string: result, COLLATION_SENTINEL "\2");
609
610	prev = p;
611
612	/ write d-1 colons /
613	if (*p == `'0'`)
614	{
615	leading_zeros = `1`;
616	digits = `0`;
617	}
618	else
619	{
620	leading_zeros = `0`;
621	digits = `1`;
622	}
623
624	while (++p < end)
625	{
626	if (*p == `'0'` && !digits)
627	++leading_zeros;
628	else if (g_ascii_isdigit(*p))
629	++digits;
630	else
631	{
632	/ count an all-zero sequence as*
633	* one digit plus leading zeros
634	*/
635	if (!digits)
636	{
637	++digits;
638	--leading_zeros;
639	}
640	break;
641	}
642	}
643
644	while (digits > `1`)
645	{
646	g_string_append_c (result, `':'`);
647	--digits;
648	}
649
650	if (leading_zeros > `0`)
651	{
652	g_string_append_c (append, (char)leading_zeros);
653	prev += leading_zeros;
654	}
655
656	/ write the number itself /
657	g_string_append_len (string: result, val: prev, len: p - prev);
658
659	prev = p;
660	--p; / go one step back to avoid disturbing outer loop /
661	break;
662
663	default:
664	/ other characters just accumulate /
665	break;
666	}
667	}
668
669	if (prev != p)
670	{
671	collate_key = g_utf8_collate_key (str: prev, len: p - prev);
672	g_string_append (string: result, val: collate_key);
673	g_free (mem: collate_key);
674	}
675
676	g_string_append (string: result, val: append->str);
677	g_string_free (string: append, TRUE);
678
679	return g_string_free (string: result, FALSE);
680	#else /* HAVE_CARBON */
681	return carbon_collate_key_for_filename (str, len);
682	#endif
683	}
684

source code of gtk/subprojects/glib/glib/gunicollate.c