gconvert.c source code [gtk/subprojects/glib/glib/gconvert.c]

1	/* GLIB - Library of useful routines for C programming
2	*
3	* gconvert.c: Convert between character sets using iconv
4	* Copyright Red Hat Inc., 2000
5	* Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com>
6	*
7	* This library is free software; you can redistribute it and/or
8	* modify it under the terms of the GNU Lesser General Public
9	* License as published by the Free Software Foundation; either
10	* version 2.1 of the License, or (at your option) any later version.
11	*
12	* This library is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15	* Lesser General Public License for more details.
16	*
17	* You should have received a copy of the GNU Lesser General Public
18	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
19	*/
20
21	#include "config.h"
22	#include "glibconfig.h"
23
24	#ifndef G_OS_WIN32
25	#include <iconv.h>
26	#endif
27	#include <errno.h>
28	#include <stdio.h>
29	#include <string.h>
30	#include <stdlib.h>
31
32	#ifdef G_OS_WIN32
33	#include "win_iconv.c"
34	#endif
35
36	#ifdef G_PLATFORM_WIN32
37	#define STRICT
38	#include <windows.h>
39	#undef STRICT
40	#endif
41
42	#include "gconvert.h"
43
44	#include "gcharsetprivate.h"
45	#include "gslist.h"
46	#include "gstrfuncs.h"
47	#include "gtestutils.h"
48	#include "gthread.h"
49	#include "gthreadprivate.h"
50	#include "gunicode.h"
51	#include "gfileutils.h"
52	#include "genviron.h"
53
54	#include "glibintl.h"
55
56
57	/**
58	* SECTION:conversions
59	* @title: Character Set Conversion
60	* @short_description: convert strings between different character sets
61	*
62	* The g_convert() family of functions wraps the functionality of iconv().
63	* In addition to pure character set conversions, GLib has functions to
64	* deal with the extra complications of encodings for file names.
65	*
66	* ## File Name Encodings
67	*
68	* Historically, UNIX has not had a defined encoding for file names:
69	* a file name is valid as long as it does not have path separators
70	* in it ("/"). However, displaying file names may require conversion:
71	* from the character set in which they were created, to the character
72	* set in which the application operates. Consider the Spanish file name
73	* "Presentación.sxi". If the application which created it uses
74	* ISO-8859-1 for its encoding,
75	* |[
76	* Character: P r e s e n t a c i ó n . s x i
77	* Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69
78	* ]|
79	* However, if the application uses UTF-8, the actual file name on
80	* disk would look like this:
81	* |[
82	* Character: P r e s e n t a c i ó n . s x i
83	* Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69
84	* ]|
85	* GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use
86	* GLib do the same thing. If you get a file name from the file system,
87	* for example, from readdir() or from g_dir_read_name(), and you wish
88	* to display the file name to the user, you will need to convert it
89	* into UTF-8. The opposite case is when the user types the name of a
90	* file they wish to save: the toolkit will give you that string in
91	* UTF-8 encoding, and you will need to convert it to the character
92	* set used for file names before you can create the file with open()
93	* or fopen().
94	*
95	* By default, GLib assumes that file names on disk are in UTF-8
96	* encoding. This is a valid assumption for file systems which
97	* were created relatively recently: most applications use UTF-8
98	* encoding for their strings, and that is also what they use for
99	* the file names they create. However, older file systems may
100	* still contain file names created in "older" encodings, such as
101	* ISO-8859-1. In this case, for compatibility reasons, you may want
102	* to instruct GLib to use that particular encoding for file names
103	* rather than UTF-8. You can do this by specifying the encoding for
104	* file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING]
105	* environment variable. For example, if your installation uses
106	* ISO-8859-1 for file names, you can put this in your `~/.profile`:
107	* |[
108	* export G_FILENAME_ENCODING=ISO-8859-1
109	* ]|
110	* GLib provides the functions g_filename_to_utf8() and
111	* g_filename_from_utf8() to perform the necessary conversions.
112	* These functions convert file names from the encoding specified
113	* in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This
114	* [diagram][file-name-encodings-diagram] illustrates how
115	* these functions are used to convert between UTF-8 and the
116	* encoding for file names in the file system.
117	*
118	* ## Conversion between file name encodings # {#file-name-encodings-diagram}
119	*
120	* ![](file-name-encodings.png)
121	*
122	* ## Checklist for Application Writers
123	*
124	* This section is a practical summary of the detailed
125	* things to do to make sure your applications process file
126	* name encodings correctly.
127	*
128	* 1. If you get a file name from the file system from a function
129	* such as readdir() or gtk_file_chooser_get_filename(), you do
130	* not need to do any conversion to pass that file name to
131	* functions like open(), rename(), or fopen() -- those are "raw"
132	* file names which the file system understands.
133	*
134	* 2. If you need to display a file name, convert it to UTF-8 first
135	* by using g_filename_to_utf8(). If conversion fails, display a
136	* string like "Unknown file name". Do not convert this string back
137	* into the encoding used for file names if you wish to pass it to
138	* the file system; use the original file name instead.
139	*
140	* For example, the document window of a word processor could display
141	* "Unknown file name" in its title bar but still let the user save
142	* the file, as it would keep the raw file name internally. This
143	* can happen if the user has not set the `G_FILENAME_ENCODING`
144	* environment variable even though they have files whose names are
145	* not encoded in UTF-8.
146	*
147	* 3. If your user interface lets the user type a file name for saving
148	* or renaming, convert it to the encoding used for file names in
149	* the file system by using g_filename_from_utf8(). Pass the converted
150	* file name to functions like fopen(). If conversion fails, ask the
151	* user to enter a different file name. This can happen if the user
152	* types Japanese characters when `G_FILENAME_ENCODING` is set to
153	* `ISO-8859-1`, for example.
154	*/
155
/* We try to terminate strings in unknown charsets with this many zero bytes
 * to ensure that multibyte strings really are nul-terminated when we return
 * them from g_convert() and friends.
 */
#define NUL_TERMINATOR_LENGTH 4
161
162	G_DEFINE_QUARK (g_convert_error, g_convert_error)
163
164	static gboolean
165	try_conversion (const char *to_codeset,
166	const char *from_codeset,
167	iconv_t *cd)
168	{
169	*cd = iconv_open (tocode: to_codeset, fromcode: from_codeset);
170
171	if (*cd == (iconv_t)-`1` && errno == EINVAL)
172	return FALSE;
173	else
174	return TRUE;
175	}
176
177	static gboolean
178	try_to_aliases (const char **to_aliases,
179	const char *from_codeset,
180	iconv_t *cd)
181	{
182	if (to_aliases)
183	{
184	const char **p = to_aliases;
185	while (*p)
186	{
187	if (try_conversion (to_codeset: *p, from_codeset, cd))
188	return TRUE;
189
190	p++;
191	}
192	}
193
194	return FALSE;
195	}
196
197	/**
198	* g_iconv_open: (skip)
199	* @to_codeset: destination codeset
200	* @from_codeset: source codeset
201	*
202	* Same as the standard UNIX routine iconv_open(), but
203	* may be implemented via libiconv on UNIX flavors that lack
204	* a native implementation.
205	*
206	* GLib provides g_convert() and g_locale_to_utf8() which are likely
207	* more convenient than the raw iconv wrappers.
208	*
209	* Returns: a "conversion descriptor", or (GIConv)-1 if
210	* opening the converter failed.
211	**/
212	GIConv
213	g_iconv_open (const gchar *to_codeset,
214	const gchar *from_codeset)
215	{
216	iconv_t cd;
217
218	if (!try_conversion (to_codeset, from_codeset, cd: &cd))
219	{
220	const char **to_aliases = _g_charset_get_aliases (canonical_name: to_codeset);
221	const char **from_aliases = _g_charset_get_aliases (canonical_name: from_codeset);
222
223	if (from_aliases)
224	{
225	const char **p = from_aliases;
226	while (*p)
227	{
228	if (try_conversion (to_codeset, from_codeset: *p, cd: &cd))
229	goto out;
230
231	if (try_to_aliases (to_aliases, from_codeset: *p, cd: &cd))
232	goto out;
233
234	p++;
235	}
236	}
237
238	if (try_to_aliases (to_aliases, from_codeset, cd: &cd))
239	goto out;
240	}
241
242	out:
243	return (cd == (iconv_t)-`1`) ? (GIConv)-`1` : (GIConv)cd;
244	}
245
246	/**
247	* g_iconv: (skip)
248	* @converter: conversion descriptor from g_iconv_open()
249	* @inbuf: bytes to convert
250	* @inbytes_left: inout parameter, bytes remaining to convert in @inbuf
251	* @outbuf: converted output bytes
252	* @outbytes_left: inout parameter, bytes available to fill in @outbuf
253	*
254	* Same as the standard UNIX routine iconv(), but
255	* may be implemented via libiconv on UNIX flavors that lack
256	* a native implementation.
257	*
258	* GLib provides g_convert() and g_locale_to_utf8() which are likely
259	* more convenient than the raw iconv wrappers.
260	*
261	* Note that the behaviour of iconv() for characters which are valid in the
262	* input character set, but which have no representation in the output character
263	* set, is implementation defined. This function may return success (with a
264	* positive number of non-reversible conversions as replacement characters were
265	* used), or it may return -1 and set an error such as %EILSEQ, in such a
266	* situation.
267	*
268	* Returns: count of non-reversible conversions, or -1 on error
269	**/
270	gsize
271	g_iconv (GIConv converter,
272	gchar **inbuf,
273	gsize *inbytes_left,
274	gchar **outbuf,
275	gsize *outbytes_left)
276	{
277	iconv_t cd = (iconv_t)converter;
278
279	return iconv (cd: cd, inbuf: inbuf, inbytesleft: inbytes_left, outbuf: outbuf, outbytesleft: outbytes_left);
280	}
281
282	/**
283	* g_iconv_close: (skip)
284	* @converter: a conversion descriptor from g_iconv_open()
285	*
286	* Same as the standard UNIX routine iconv_close(), but
287	* may be implemented via libiconv on UNIX flavors that lack
288	* a native implementation. Should be called to clean up
289	* the conversion descriptor from g_iconv_open() when
290	* you are done converting things.
291	*
292	* GLib provides g_convert() and g_locale_to_utf8() which are likely
293	* more convenient than the raw iconv wrappers.
294	*
295	* Returns: -1 on error, 0 on success
296	**/
297	gint
298	g_iconv_close (GIConv converter)
299	{
300	iconv_t cd = (iconv_t)converter;
301
302	return iconv_close (cd: cd);
303	}
304
305	static GIConv
306	open_converter (const gchar *to_codeset,
307	const gchar *from_codeset,
308	GError **error)
309	{
310	GIConv cd;
311
312	cd = g_iconv_open (to_codeset, from_codeset);
313
314	if (cd == (GIConv) -`1`)
315	{
316	/ Something went wrong. /
317	if (error)
318	{
319	if (errno == EINVAL)
320	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_NO_CONVERSION,
321	_("Conversion from character set “%s” to “%s” is not supported"),
322	from_codeset, to_codeset);
323	else
324	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_FAILED,
325	_("Could not open converter from “%s” to “%s”"),
326	from_codeset, to_codeset);
327	}
328	}
329
330	return cd;
331	}
332
333	static int
334	close_converter (GIConv cd)
335	{
336	if (cd == (GIConv) -`1`)
337	return `0`;
338
339	return g_iconv_close (converter: cd);
340	}
341
342	/**
343	* g_convert_with_iconv: (skip)
344	* @str: (array length=len) (element-type guint8):
345	* the string to convert.
346	* @len: the length of the string in bytes, or -1 if the string is
347	* nul-terminated (Note that some encodings may allow nul
348	* bytes to occur inside strings. In that case, using -1
349	* for the @len parameter is unsafe)
350	* @converter: conversion descriptor from g_iconv_open()
351	* @bytes_read: (out) (optional): location to store the number of bytes in
352	* the input string that were successfully converted, or %NULL.
353	* Even if the conversion was successful, this may be
354	* less than @len if there were partial characters
355	* at the end of the input. If the error
356	* #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
357	* stored will be the byte offset after the last valid
358	* input sequence.
359	* @bytes_written: (out) (optional): the number of bytes stored in
360	* the output buffer (not including the terminating nul).
361	* @error: location to store the error occurring, or %NULL to ignore
362	* errors. Any of the errors in #GConvertError may occur.
363	*
364	* Converts a string from one character set to another.
365	*
366	* Note that you should use g_iconv() for streaming conversions.
367	* Despite the fact that @bytes_read can return information about partial
368	* characters, the g_convert_... functions are not generally suitable
369	* for streaming. If the underlying converter maintains internal state,
370	* then this won't be preserved across successive calls to g_convert(),
371	* g_convert_with_iconv() or g_convert_with_fallback(). (An example of
372	* this is the GNU C converter for CP1255 which does not emit a base
373	* character until it knows that the next character is not a mark that
374	* could combine with the base character.)
375	*
376	* Characters which are valid in the input character set, but which have no
377	* representation in the output character set will result in a
378	* %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv()
379	* specification, which leaves this behaviour implementation defined. Note that
380	* this is the same error code as is returned for an invalid byte sequence in
381	* the input character set. To get defined behaviour for conversion of
382	* unrepresentable characters, use g_convert_with_fallback().
383	*
384	* Returns: (array length=bytes_written) (element-type guint8) (transfer full):
385	* If the conversion was successful, a newly allocated buffer
386	* containing the converted string, which must be freed with
387	* g_free(). Otherwise %NULL and @error will be set.
388	**/
389	gchar*
390	g_convert_with_iconv (const gchar *str,
391	gssize len,
392	GIConv converter,
393	gsize *bytes_read,
394	gsize *bytes_written,
395	GError **error)
396	{
397	gchar *dest;
398	gchar *outp;
399	const gchar *p;
400	gsize inbytes_remaining;
401	gsize outbytes_remaining;
402	gsize err;
403	gsize outbuf_size;
404	gboolean have_error = FALSE;
405	gboolean done = FALSE;
406	gboolean reset = FALSE;
407
408	g_return_val_if_fail (converter != (GIConv) -`1`, NULL);
409
410	if (len < `0`)
411	len = strlen (s: str);
412
413	p = str;
414	inbytes_remaining = len;
415	outbuf_size = len + NUL_TERMINATOR_LENGTH;
416
417	outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
418	outp = dest = g_malloc (n_bytes: outbuf_size);
419
420	while (!done && !have_error)
421	{
422	if (reset)
423	err = g_iconv (converter, NULL, inbytes_left: &inbytes_remaining, outbuf: &outp, outbytes_left: &outbytes_remaining);
424	else
425	err = g_iconv (converter, inbuf: (char **)&p, inbytes_left: &inbytes_remaining, outbuf: &outp, outbytes_left: &outbytes_remaining);
426
427	if (err == (gsize) -`1`)
428	{
429	switch (errno)
430	{
431	case EINVAL:
432	/ Incomplete text, do not report an error /
433	done = TRUE;
434	break;
435	case E2BIG:
436	{
437	gsize used = outp - dest;
438
439	outbuf_size *= `2`;
440	dest = g_realloc (mem: dest, n_bytes: outbuf_size);
441
442	outp = dest + used;
443	outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
444	}
445	break;
446	case EILSEQ:
447	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
448	_("Invalid byte sequence in conversion input"));
449	have_error = TRUE;
450	break;
451	default:
452	{
453	int errsv = errno;
454
455	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_FAILED,
456	_("Error during conversion: %s"),
457	g_strerror (errnum: errsv));
458	}
459	have_error = TRUE;
460	break;
461	}
462	}
463	else if (err > `0`)
464	{
465	/ @err gives the number of replacement characters used. /
466	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
467	_("Unrepresentable character in conversion input"));
468	have_error = TRUE;
469	}
470	else
471	{
472	if (!reset)
473	{
474	/ call g_iconv with NULL inbuf to cleanup shift state /
475	reset = TRUE;
476	inbytes_remaining = `0`;
477	}
478	else
479	done = TRUE;
480	}
481	}
482
483	memset (s: outp, c: `0`, NUL_TERMINATOR_LENGTH);
484
485	if (bytes_read)
486	*bytes_read = p - str;
487	else
488	{
489	if ((p - str) != len)
490	{
491	if (!have_error)
492	{
493	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT,
494	_("Partial character sequence at end of input"));
495	have_error = TRUE;
496	}
497	}
498	}
499
500	if (bytes_written)
501	bytes_written = outp - dest; /* Doesn't include '\0' /
502
503	if (have_error)
504	{
505	g_free (mem: dest);
506	return NULL;
507	}
508	else
509	return dest;
510	}
511
512	/**
513	* g_convert:
514	* @str: (array length=len) (element-type guint8):
515	* the string to convert.
516	* @len: the length of the string in bytes, or -1 if the string is
517	* nul-terminated (Note that some encodings may allow nul
518	* bytes to occur inside strings. In that case, using -1
519	* for the @len parameter is unsafe)
520	* @to_codeset: name of character set into which to convert @str
521	* @from_codeset: character set of @str.
522	* @bytes_read: (out) (optional): location to store the number of bytes in
523	* the input string that were successfully converted, or %NULL.
524	* Even if the conversion was successful, this may be
525	* less than @len if there were partial characters
526	* at the end of the input. If the error
527	* #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
528	* stored will be the byte offset after the last valid
529	* input sequence.
530	* @bytes_written: (out) (optional): the number of bytes stored in
531	* the output buffer (not including the terminating nul).
532	* @error: location to store the error occurring, or %NULL to ignore
533	* errors. Any of the errors in #GConvertError may occur.
534	*
535	* Converts a string from one character set to another.
536	*
537	* Note that you should use g_iconv() for streaming conversions.
538	* Despite the fact that @bytes_read can return information about partial
539	* characters, the g_convert_... functions are not generally suitable
540	* for streaming. If the underlying converter maintains internal state,
541	* then this won't be preserved across successive calls to g_convert(),
542	* g_convert_with_iconv() or g_convert_with_fallback(). (An example of
543	* this is the GNU C converter for CP1255 which does not emit a base
544	* character until it knows that the next character is not a mark that
545	* could combine with the base character.)
546	*
547	* Using extensions such as "//TRANSLIT" may not work (or may not work
548	* well) on many platforms. Consider using g_str_to_ascii() instead.
549	*
550	* Returns: (array length=bytes_written) (element-type guint8) (transfer full):
551	* If the conversion was successful, a newly allocated buffer
552	* containing the converted string, which must be freed with g_free().
553	* Otherwise %NULL and @error will be set.
554	**/
555	gchar*
556	g_convert (const gchar *str,
557	gssize len,
558	const gchar *to_codeset,
559	const gchar *from_codeset,
560	gsize *bytes_read,
561	gsize *bytes_written,
562	GError **error)
563	{
564	gchar *res;
565	GIConv cd;
566
567	g_return_val_if_fail (str != NULL, NULL);
568	g_return_val_if_fail (to_codeset != NULL, NULL);
569	g_return_val_if_fail (from_codeset != NULL, NULL);
570
571	cd = open_converter (to_codeset, from_codeset, error);
572
573	if (cd == (GIConv) -`1`)
574	{
575	if (bytes_read)
576	*bytes_read = `0`;
577
578	if (bytes_written)
579	*bytes_written = `0`;
580
581	return NULL;
582	}
583
584	res = g_convert_with_iconv (str, len, converter: cd,
585	bytes_read, bytes_written,
586	error);
587
588	close_converter (cd);
589
590	return res;
591	}
592
593	/**
594	* g_convert_with_fallback:
595	* @str: (array length=len) (element-type guint8):
596	* the string to convert.
597	* @len: the length of the string in bytes, or -1 if the string is
598	* nul-terminated (Note that some encodings may allow nul
599	* bytes to occur inside strings. In that case, using -1
600	* for the @len parameter is unsafe)
601	* @to_codeset: name of character set into which to convert @str
602	* @from_codeset: character set of @str.
603	* @fallback: UTF-8 string to use in place of characters not
604	* present in the target encoding. (The string must be
605	* representable in the target encoding).
606	* If %NULL, characters not in the target encoding will
607	* be represented as Unicode escapes \uxxxx or \Uxxxxyyyy.
608	* @bytes_read: (out) (optional): location to store the number of bytes in
609	* the input string that were successfully converted, or %NULL.
610	* Even if the conversion was successful, this may be
611	* less than @len if there were partial characters
612	* at the end of the input.
613	* @bytes_written: (out) (optional): the number of bytes stored in
614	* the output buffer (not including the terminating nul).
615	* @error: location to store the error occurring, or %NULL to ignore
616	* errors. Any of the errors in #GConvertError may occur.
617	*
618	* Converts a string from one character set to another, possibly
619	* including fallback sequences for characters not representable
620	* in the output. Note that it is not guaranteed that the specification
621	* for the fallback sequences in @fallback will be honored. Some
622	* systems may do an approximate conversion from @from_codeset
623	* to @to_codeset in their iconv() functions,
624	* in which case GLib will simply return that approximate conversion.
625	*
626	* Note that you should use g_iconv() for streaming conversions.
627	* Despite the fact that @bytes_read can return information about partial
628	* characters, the g_convert_... functions are not generally suitable
629	* for streaming. If the underlying converter maintains internal state,
630	* then this won't be preserved across successive calls to g_convert(),
631	* g_convert_with_iconv() or g_convert_with_fallback(). (An example of
632	* this is the GNU C converter for CP1255 which does not emit a base
633	* character until it knows that the next character is not a mark that
634	* could combine with the base character.)
635	*
636	* Returns: (array length=bytes_written) (element-type guint8) (transfer full):
637	* If the conversion was successful, a newly allocated buffer
638	* containing the converted string, which must be freed with g_free().
639	* Otherwise %NULL and @error will be set.
640	**/
641	gchar*
642	g_convert_with_fallback (const gchar *str,
643	gssize len,
644	const gchar *to_codeset,
645	const gchar *from_codeset,
646	const gchar *fallback,
647	gsize *bytes_read,
648	gsize *bytes_written,
649	GError **error)
650	{
651	gchar *utf8;
652	gchar *dest;
653	gchar *outp;
654	const gchar *insert_str = NULL;
655	const gchar *p;
656	gsize inbytes_remaining;
657	const gchar *save_p = NULL;
658	gsize save_inbytes = `0`;
659	gsize outbytes_remaining;
660	gsize err;
661	GIConv cd;
662	gsize outbuf_size;
663	gboolean have_error = FALSE;
664	gboolean done = FALSE;
665
666	GError *local_error = NULL;
667
668	g_return_val_if_fail (str != NULL, NULL);
669	g_return_val_if_fail (to_codeset != NULL, NULL);
670	g_return_val_if_fail (from_codeset != NULL, NULL);
671
672	if (len < `0`)
673	len = strlen (s: str);
674
675	/ Try an exact conversion; we only proceed if this fails*
676	* due to an illegal sequence in the input string.
677	*/
678	dest = g_convert (str, len, to_codeset, from_codeset,
679	bytes_read, bytes_written, error: &local_error);
680	if (!local_error)
681	return dest;
682
683	if (!g_error_matches (error: local_error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
684	{
685	g_propagate_error (dest: error, src: local_error);
686	return NULL;
687	}
688	else
689	g_error_free (error: local_error);
690
691	local_error = NULL;
692
693	/ No go; to proceed, we need a converter from "UTF-8" to*
694	* to_codeset, and the string as UTF-8.
695	*/
696	cd = open_converter (to_codeset, from_codeset: "UTF-8", error);
697	if (cd == (GIConv) -`1`)
698	{
699	if (bytes_read)
700	*bytes_read = `0`;
701
702	if (bytes_written)
703	*bytes_written = `0`;
704
705	return NULL;
706	}
707
708	utf8 = g_convert (str, len, to_codeset: "UTF-8", from_codeset,
709	bytes_read, bytes_written: &inbytes_remaining, error);
710	if (!utf8)
711	{
712	close_converter (cd);
713	if (bytes_written)
714	*bytes_written = `0`;
715	return NULL;
716	}
717
718	/ Now the heart of the code. We loop through the UTF-8 string, and*
719	* whenever we hit an offending character, we form fallback, convert
720	* the fallback to the target codeset, and then go back to
721	* converting the original string after finishing with the fallback.
722	*
723	* The variables save_p and save_inbytes store the input state
724	* for the original string while we are converting the fallback
725	*/
726	p = utf8;
727
728	outbuf_size = len + NUL_TERMINATOR_LENGTH;
729	outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH;
730	outp = dest = g_malloc (n_bytes: outbuf_size);
731
732	while (!done && !have_error)
733	{
734	gsize inbytes_tmp = inbytes_remaining;
735	err = g_iconv (converter: cd, inbuf: (char **)&p, inbytes_left: &inbytes_tmp, outbuf: &outp, outbytes_left: &outbytes_remaining);
736	inbytes_remaining = inbytes_tmp;
737
738	if (err == (gsize) -`1`)
739	{
740	switch (errno)
741	{
742	case EINVAL:
743	g_assert_not_reached();
744	break;
745	case E2BIG:
746	{
747	gsize used = outp - dest;
748
749	outbuf_size *= `2`;
750	dest = g_realloc (mem: dest, n_bytes: outbuf_size);
751
752	outp = dest + used;
753	outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH;
754
755	break;
756	}
757	case EILSEQ:
758	if (save_p)
759	{
760	/ Error converting fallback string - fatal*
761	*/
762	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
763	_("Cannot convert fallback “%s” to codeset “%s”"),
764	insert_str, to_codeset);
765	have_error = TRUE;
766	break;
767	}
768	else if (p)
769	{
770	if (!fallback)
771	{
772	gunichar ch = g_utf8_get_char (p);
773	insert_str = g_strdup_printf (format: ch < `0x10000` ? "\\u%04x" : "\\U%08x",
774	ch);
775	}
776	else
777	insert_str = fallback;
778
779	save_p = g_utf8_next_char (p);
780	save_inbytes = inbytes_remaining - (save_p - p);
781	p = insert_str;
782	inbytes_remaining = strlen (s: p);
783	break;
784	}
785	/ if p is null /
786	G_GNUC_FALLTHROUGH;
787	default:
788	{
789	int errsv = errno;
790
791	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_FAILED,
792	_("Error during conversion: %s"),
793	g_strerror (errnum: errsv));
794	}
795
796	have_error = TRUE;
797	break;
798	}
799	}
800	else
801	{
802	if (save_p)
803	{
804	if (!fallback)
805	g_free (mem: (gchar *)insert_str);
806	p = save_p;
807	inbytes_remaining = save_inbytes;
808	save_p = NULL;
809	}
810	else if (p)
811	{
812	/ call g_iconv with NULL inbuf to cleanup shift state /
813	p = NULL;
814	inbytes_remaining = `0`;
815	}
816	else
817	done = TRUE;
818	}
819	}
820
821	/ Cleanup*
822	*/
823	memset (s: outp, c: `0`, NUL_TERMINATOR_LENGTH);
824
825	close_converter (cd);
826
827	if (bytes_written)
828	bytes_written = outp - dest; /* Doesn't include '\0' /
829
830	g_free (mem: utf8);
831
832	if (have_error)
833	{
834	if (save_p && !fallback)
835	g_free (mem: (gchar *)insert_str);
836	g_free (mem: dest);
837	return NULL;
838	}
839	else
840	return dest;
841	}
842
843	/*
844	* g_locale_to_utf8
845	*
846	*
847	*/
848
849	/*
850	* Validate @string as UTF-8. @len can be negative if @string is
851	* nul-terminated, or a non-negative value in bytes. If @string ends in an
852	* incomplete sequence, or contains any illegal sequences or nul codepoints,
853	* %NULL will be returned and the error set to
854	* %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
855	* On success, @bytes_read and @bytes_written, if provided, will be set to
856	* the number of bytes in @string up to @len or the terminating nul byte.
857	* On error, @bytes_read will be set to the byte offset after the last valid
858	* and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0.
859	*/
860	static gchar *
861	strdup_len (const gchar *string,
862	gssize len,
863	gsize *bytes_read,
864	gsize *bytes_written,
865	GError **error)
866	{
867	gsize real_len;
868	const gchar *end_valid;
869
870	if (!g_utf8_validate (str: string, max_len: len, end: &end_valid))
871	{
872	if (bytes_read)
873	*bytes_read = end_valid - string;
874	if (bytes_written)
875	*bytes_written = `0`;
876
877	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
878	_("Invalid byte sequence in conversion input"));
879	return NULL;
880	}
881
882	real_len = end_valid - string;
883
884	if (bytes_read)
885	*bytes_read = real_len;
886	if (bytes_written)
887	*bytes_written = real_len;
888
889	return g_strndup (str: string, n: real_len);
890	}
891
/* Bit flags selecting which nul-byte checks convert_checked() performs. */
typedef enum
{
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 1 << 0,  /* reject nul bytes in the input */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 1 << 1   /* reject nul bytes in the output */
} ConvertCheckFlags;
897
898	/*
899	* Convert from @string in the encoding identified by @from_codeset,
900	* returning a string in the encoding identifed by @to_codeset.
901	* @len can be negative if @string is nul-terminated, or a non-negative
902	* value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags
903	* to check the input, the output, or both, for embedded nul bytes.
904	* On success, @bytes_read, if provided, will be set to the number of bytes
905	* in @string up to @len or the terminating nul byte, and @bytes_written, if
906	* provided, will be set to the number of output bytes written into the
907	* returned buffer, excluding the terminating nul sequence.
908	* On error, @bytes_read will be set to the byte offset after the last valid
909	* sequence in @string, and @bytes_written will be set to 0.
910	*/
911	static gchar *
912	convert_checked (const gchar *string,
913	gssize len,
914	const gchar *to_codeset,
915	const gchar *from_codeset,
916	ConvertCheckFlags flags,
917	gsize *bytes_read,
918	gsize *bytes_written,
919	GError **error)
920	{
921	gchar *out;
922	gsize outbytes;
923
924	if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > `0`)
925	{
926	const gchar *early_nul = memchr (s: string, c: `'\0'`, n: len);
927	if (early_nul != NULL)
928	{
929	if (bytes_read)
930	*bytes_read = early_nul - string;
931	if (bytes_written)
932	*bytes_written = `0`;
933
934	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
935	_("Embedded NUL byte in conversion input"));
936	return NULL;
937	}
938	}
939
940	out = g_convert (str: string, len, to_codeset, from_codeset,
941	bytes_read, bytes_written: &outbytes, error);
942	if (out == NULL)
943	{
944	if (bytes_written)
945	*bytes_written = `0`;
946	return NULL;
947	}
948
949	if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT)
950	&& memchr (s: out, c: `'\0'`, n: outbytes) != NULL)
951	{
952	g_free (mem: out);
953	if (bytes_written)
954	*bytes_written = `0`;
955	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_EMBEDDED_NUL,
956	_("Embedded NUL byte in conversion output"));
957	return NULL;
958	}
959
960	if (bytes_written)
961	*bytes_written = outbytes;
962	return out;
963	}
964
965	/**
966	* g_locale_to_utf8:
967	* @opsysstring: (array length=len) (element-type guint8): a string in the
968	* encoding of the current locale. On Windows
969	* this means the system codepage.
970	* @len: the length of the string, or -1 if the string is
971	* nul-terminated (Note that some encodings may allow nul
972	* bytes to occur inside strings. In that case, using -1
973	* for the @len parameter is unsafe)
974	* @bytes_read: (out) (optional): location to store the number of bytes in the
975	* input string that were successfully converted, or %NULL.
976	* Even if the conversion was successful, this may be
977	* less than @len if there were partial characters
978	* at the end of the input. If the error
979	* %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
980	* stored will be the byte offset after the last valid
981	* input sequence.
982	* @bytes_written: (out) (optional): the number of bytes stored in the output
983	* buffer (not including the terminating nul).
984	* @error: location to store the error occurring, or %NULL to ignore
985	* errors. Any of the errors in #GConvertError may occur.
986	*
987	* Converts a string which is in the encoding used for strings by
988	* the C runtime (usually the same as that used by the operating
989	* system) in the [current locale][setlocale] into a UTF-8 string.
990	*
991	* If the source encoding is not UTF-8 and the conversion output contains a
992	* nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
993	* function returns %NULL.
994	* If the source encoding is UTF-8, an embedded nul character is treated with
995	* the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with
996	* earlier versions of this library. Use g_convert() to produce output that
997	* may contain embedded nul characters.
998	*
999	* Returns: (type utf8): The converted string, or %NULL on an error.
1000	**/
1001	gchar *
1002	g_locale_to_utf8 (const gchar *opsysstring,
1003	gssize len,
1004	gsize *bytes_read,
1005	gsize *bytes_written,
1006	GError **error)
1007	{
1008	const char *charset;
1009
1010	if (g_get_charset (charset: &charset))
1011	return strdup_len (string: opsysstring, len, bytes_read, bytes_written, error);
1012	else
1013	return convert_checked (string: opsysstring, len, to_codeset: "UTF-8", from_codeset: charset,
1014	flags: CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1015	bytes_read, bytes_written, error);
1016	}
1017
1018	/**
1019	* g_locale_from_utf8:
1020	* @utf8string: a UTF-8 encoded string
1021	* @len: the length of the string, or -1 if the string is
1022	* nul-terminated.
1023	* @bytes_read: (out) (optional): location to store the number of bytes in the
1024	* input string that were successfully converted, or %NULL.
1025	* Even if the conversion was successful, this may be
1026	* less than @len if there were partial characters
1027	* at the end of the input. If the error
1028	* %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1029	* stored will be the byte offset after the last valid
1030	* input sequence.
1031	* @bytes_written: (out) (optional): the number of bytes stored in the output
1032	* buffer (not including the terminating nul).
1033	* @error: location to store the error occurring, or %NULL to ignore
1034	* errors. Any of the errors in #GConvertError may occur.
1035	*
1036	* Converts a string from UTF-8 to the encoding used for strings by
1037	* the C runtime (usually the same as that used by the operating
1038	* system) in the [current locale][setlocale]. On Windows this means
1039	* the system codepage.
1040	*
1041	* The input string shall not contain nul characters even if the @len
1042	* argument is positive. A nul character found inside the string will result
1043	* in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert
1044	* input that may contain embedded nul characters.
1045	*
1046	* Returns: (array length=bytes_written) (element-type guint8) (transfer full):
1047	* A newly-allocated buffer containing the converted string,
1048	* or %NULL on an error, and error will be set.
1049	**/
1050	gchar *
1051	g_locale_from_utf8 (const gchar *utf8string,
1052	gssize len,
1053	gsize *bytes_read,
1054	gsize *bytes_written,
1055	GError **error)
1056	{
1057	const gchar *charset;
1058
1059	if (g_get_charset (charset: &charset))
1060	return strdup_len (string: utf8string, len, bytes_read, bytes_written, error);
1061	else
1062	return convert_checked (string: utf8string, len, to_codeset: charset, from_codeset: "UTF-8",
1063	flags: CONVERT_CHECK_NO_NULS_IN_INPUT,
1064	bytes_read, bytes_written, error);
1065	}
1066
1067	#ifndef G_PLATFORM_WIN32
1068
1069	typedef struct _GFilenameCharsetCache GFilenameCharsetCache;
1070
1071	struct _GFilenameCharsetCache {
1072	gboolean is_utf8;
1073	gchar *charset;
1074	gchar **filename_charsets;
1075	};
1076
1077	static void
1078	filename_charset_cache_free (gpointer data)
1079	{
1080	GFilenameCharsetCache *cache = data;
1081	g_free (mem: cache->charset);
1082	g_strfreev (str_array: cache->filename_charsets);
1083	g_free (mem: cache);
1084	}
1085
1086	/**
1087	* g_get_filename_charsets:
1088	* @filename_charsets: (out) (transfer none) (array zero-terminated=1):
1089	* return location for the %NULL-terminated list of encoding names
1090	*
1091	* Determines the preferred character sets used for filenames.
1092	* The first character set from @filename_charsets is the filename encoding, the
1093	* subsequent character sets are used when trying to generate a displayable
1094	* representation of a filename, see g_filename_display_name().
1095	*
1096	* On Unix, the character sets are determined by consulting the
1097	* environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`.
1098	* On Windows, the character set used in the GLib API is always UTF-8
1099	* and said environment variables have no effect.
1100	*
1101	* `G_FILENAME_ENCODING` may be set to a comma-separated list of
1102	* character set names. The special token "\@locale" is taken
1103	* to mean the character set for the [current locale][setlocale].
1104	* If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is,
1105	* the character set of the current locale is taken as the filename
1106	* encoding. If neither environment variable is set, UTF-8 is taken
1107	* as the filename encoding, but the character set of the current locale
1108	* is also put in the list of encodings.
1109	*
1110	* The returned @filename_charsets belong to GLib and must not be freed.
1111	*
1112	* Note that on Unix, regardless of the locale character set or
1113	* `G_FILENAME_ENCODING` value, the actual file names present
1114	* on a system might be in any random encoding or just gibberish.
1115	*
1116	* Returns: %TRUE if the filename encoding is UTF-8.
1117	*
1118	* Since: 2.6
1119	*/
1120	gboolean
1121	g_get_filename_charsets (const gchar ***filename_charsets)
1122	{
1123	static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free);
1124	GFilenameCharsetCache *cache = g_private_get (key: &cache_private);
1125	const gchar *charset;
1126
1127	if (!cache)
1128	cache = g_private_set_alloc0 (key: &cache_private, size: sizeof (GFilenameCharsetCache));
1129
1130	g_get_charset (charset: &charset);
1131
1132	if (!(cache->charset && strcmp (s1: cache->charset, s2: charset) == `0`))
1133	{
1134	const gchar *new_charset;
1135	const gchar *p;
1136	gint i;
1137
1138	g_free (mem: cache->charset);
1139	g_strfreev (str_array: cache->filename_charsets);
1140	cache->charset = g_strdup (str: charset);
1141
1142	p = g_getenv (variable: "G_FILENAME_ENCODING");
1143	if (p != NULL && p[`0`] != `'\0'`)
1144	{
1145	cache->filename_charsets = g_strsplit (string: p, delimiter: ",", max_tokens: `0`);
1146	cache->is_utf8 = (strcmp (s1: cache->filename_charsets[`0`], s2: "UTF-8") == `0`);
1147
1148	for (i = `0`; cache->filename_charsets[i]; i++)
1149	{
1150	if (strcmp (s1: "@locale", s2: cache->filename_charsets[i]) == `0`)
1151	{
1152	g_get_charset (charset: &new_charset);
1153	g_free (mem: cache->filename_charsets[i]);
1154	cache->filename_charsets[i] = g_strdup (str: new_charset);
1155	}
1156	}
1157	}
1158	else if (g_getenv (variable: "G_BROKEN_FILENAMES") != NULL)
1159	{
1160	cache->filename_charsets = g_new0 (gchar *, `2`);
1161	cache->is_utf8 = g_get_charset (charset: &new_charset);
1162	cache->filename_charsets[`0`] = g_strdup (str: new_charset);
1163	}
1164	else
1165	{
1166	cache->filename_charsets = g_new0 (gchar *, `3`);
1167	cache->is_utf8 = TRUE;
1168	cache->filename_charsets[`0`] = g_strdup (str: "UTF-8");
1169	if (!g_get_charset (charset: &new_charset))
1170	cache->filename_charsets[`1`] = g_strdup (str: new_charset);
1171	}
1172	}
1173
1174	if (filename_charsets)
1175	filename_charsets = (const* gchar **)cache->filename_charsets;
1176
1177	return cache->is_utf8;
1178	}
1179
1180	#else /* G_PLATFORM_WIN32 */
1181
1182	gboolean
1183	g_get_filename_charsets (const gchar ***filename_charsets)
1184	{
1185	static const gchar *charsets[] = {
1186	"UTF-8",
1187	NULL
1188	};
1189
1190	#ifdef G_OS_WIN32
1191	/ On Windows GLib pretends that the filename charset is UTF-8 /
1192	if (filename_charsets)
1193	*filename_charsets = charsets;
1194
1195	return TRUE;
1196	#else
1197	gboolean result;
1198
1199	/ Cygwin works like before /
1200	result = g_get_charset (&(charsets[`0`]));
1201
1202	if (filename_charsets)
1203	*filename_charsets = charsets;
1204
1205	return result;
1206	#endif
1207	}
1208
1209	#endif /* G_PLATFORM_WIN32 */
1210
1211	static gboolean
1212	get_filename_charset (const gchar **filename_charset)
1213	{
1214	const gchar **charsets;
1215	gboolean is_utf8;
1216
1217	is_utf8 = g_get_filename_charsets (filename_charsets: &charsets);
1218
1219	if (filename_charset)
1220	*filename_charset = charsets[`0`];
1221
1222	return is_utf8;
1223	}
1224
1225	/**
1226	* g_filename_to_utf8:
1227	* @opsysstring: (type filename): a string in the encoding for filenames
1228	* @len: the length of the string, or -1 if the string is
1229	* nul-terminated (Note that some encodings may allow nul
1230	* bytes to occur inside strings. In that case, using -1
1231	* for the @len parameter is unsafe)
1232	* @bytes_read: (out) (optional): location to store the number of bytes in the
1233	* input string that were successfully converted, or %NULL.
1234	* Even if the conversion was successful, this may be
1235	* less than @len if there were partial characters
1236	* at the end of the input. If the error
1237	* %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1238	* stored will be the byte offset after the last valid
1239	* input sequence.
1240	* @bytes_written: (out) (optional): the number of bytes stored in the output
1241	* buffer (not including the terminating nul).
1242	* @error: location to store the error occurring, or %NULL to ignore
1243	* errors. Any of the errors in #GConvertError may occur.
1244	*
1245	* Converts a string which is in the encoding used by GLib for
1246	* filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8
1247	* for filenames; on other platforms, this function indirectly depends on
1248	* the [current locale][setlocale].
1249	*
1250	* The input string shall not contain nul characters even if the @len
1251	* argument is positive. A nul character found inside the string will result
1252	* in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE.
1253	* If the source encoding is not UTF-8 and the conversion output contains a
1254	* nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the
1255	* function returns %NULL. Use g_convert() to produce output that
1256	* may contain embedded nul characters.
1257	*
1258	* Returns: (type utf8): The converted string, or %NULL on an error.
1259	**/
1260	gchar*
1261	g_filename_to_utf8 (const gchar *opsysstring,
1262	gssize len,
1263	gsize *bytes_read,
1264	gsize *bytes_written,
1265	GError **error)
1266	{
1267	const gchar *charset;
1268
1269	g_return_val_if_fail (opsysstring != NULL, NULL);
1270
1271	if (get_filename_charset (filename_charset: &charset))
1272	return strdup_len (string: opsysstring, len, bytes_read, bytes_written, error);
1273	else
1274	return convert_checked (string: opsysstring, len, to_codeset: "UTF-8", from_codeset: charset,
1275	flags: CONVERT_CHECK_NO_NULS_IN_INPUT \|
1276	CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1277	bytes_read, bytes_written, error);
1278	}
1279
1280	/**
1281	* g_filename_from_utf8:
1282	* @utf8string: (type utf8): a UTF-8 encoded string.
1283	* @len: the length of the string, or -1 if the string is
1284	* nul-terminated.
1285	* @bytes_read: (out) (optional): location to store the number of bytes in
1286	* the input string that were successfully converted, or %NULL.
1287	* Even if the conversion was successful, this may be
1288	* less than @len if there were partial characters
1289	* at the end of the input. If the error
1290	* %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value
1291	* stored will be the byte offset after the last valid
1292	* input sequence.
1293	* @bytes_written: (out) (optional): the number of bytes stored in
1294	* the output buffer (not including the terminating nul).
1295	* @error: location to store the error occurring, or %NULL to ignore
1296	* errors. Any of the errors in #GConvertError may occur.
1297	*
1298	* Converts a string from UTF-8 to the encoding GLib uses for
1299	* filenames. Note that on Windows GLib uses UTF-8 for filenames;
1300	* on other platforms, this function indirectly depends on the
1301	* [current locale][setlocale].
1302	*
1303	* The input string shall not contain nul characters even if the @len
1304	* argument is positive. A nul character found inside the string will result
1305	* in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is
1306	* not UTF-8 and the conversion output contains a nul character, the error
1307	* %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL.
1308	*
1309	* Returns: (type filename):
1310	* The converted string, or %NULL on an error.
1311	**/
1312	gchar*
1313	g_filename_from_utf8 (const gchar *utf8string,
1314	gssize len,
1315	gsize *bytes_read,
1316	gsize *bytes_written,
1317	GError **error)
1318	{
1319	const gchar *charset;
1320
1321	if (get_filename_charset (filename_charset: &charset))
1322	return strdup_len (string: utf8string, len, bytes_read, bytes_written, error);
1323	else
1324	return convert_checked (string: utf8string, len, to_codeset: charset, from_codeset: "UTF-8",
1325	flags: CONVERT_CHECK_NO_NULS_IN_INPUT \|
1326	CONVERT_CHECK_NO_NULS_IN_OUTPUT,
1327	bytes_read, bytes_written, error);
1328	}
1329
1330	/ Test of haystack has the needle prefix, comparing case*
1331	* insensitive. haystack may be UTF-8, but needle must
1332	* contain only ascii. */
1333	static gboolean
1334	has_case_prefix (const gchar haystack, const* gchar *needle)
1335	{
1336	const gchar h, n;
1337
1338	/ Eat one character at a time. /
1339	h = haystack;
1340	n = needle;
1341
1342	while (n && h &&
1343	g_ascii_tolower (c: n) == g_ascii_tolower (c: h))
1344	{
1345	n++;
1346	h++;
1347	}
1348
1349	return *n == `'\0'`;
1350	}
1351
/* Escaping policies for g_escape_uri_string(). Each value is a single bit
 * that is tested against the per-character masks in the acceptable[] table.
 */
typedef enum {
  /* Escape all unsafe characters */
  UNSAFE_ALL        = 1 << 0,
  /* Allows '+' */
  UNSAFE_ALLOW_PLUS = 1 << 1,
  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_PATH       = 1 << 3,
  /* Allows '/' and ':' and '@' */
  UNSAFE_HOST       = 1 << 4,
  /* Allows all characters except for '/' and '%' */
  UNSAFE_SLASHES    = 1 << 5
} UnsafeCharacterSet;
1359
1360	static const guchar acceptable[`96`] = {
1361	/ A table of the ASCII chars from space (32) to DEL (127) /
1362	/ ! " # $ % & ' ( ) * + , - . / /
1363	`0x00`,`0x3F`,`0x20`,`0x20`,`0x28`,`0x00`,`0x2C`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x2A`,`0x28`,`0x3F`,`0x3F`,`0x1C`,
1364	/ 0 1 2 3 4 5 6 7 8 9 : ; < = > ? /
1365	`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x38`,`0x20`,`0x20`,`0x2C`,`0x20`,`0x20`,
1366	/ @ A B C D E F G H I J K L M N O /
1367	`0x38`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,
1368	/ P Q R S T U V W X Y Z [ \ ] ^ _ /
1369	`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x20`,`0x20`,`0x20`,`0x20`,`0x3F`,
1370	/ ` a b c d e f g h i j k l m n o /
1371	`0x20`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,
1372	/ p q r s t u v w x y z { \| } ~ DEL /
1373	`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x3F`,`0x20`,`0x20`,`0x20`,`0x3F`,`0x20`
1374	};
1375
1376	static const gchar hex[`16`] = "0123456789ABCDEF";
1377
1378	/ Note: This escape function works on file: URIs, but if you want to*
1379	* escape something else, please read RFC-2396 */
1380	static gchar *
1381	g_escape_uri_string (const gchar *string,
1382	UnsafeCharacterSet mask)
1383	{
1384	#define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask))
1385
1386	const gchar *p;
1387	gchar *q;
1388	gchar *result;
1389	int c;
1390	gint unacceptable;
1391	UnsafeCharacterSet use_mask;
1392
1393	g_return_val_if_fail (mask == UNSAFE_ALL
1394	\|\| mask == UNSAFE_ALLOW_PLUS
1395	\|\| mask == UNSAFE_PATH
1396	\|\| mask == UNSAFE_HOST
1397	\|\| mask == UNSAFE_SLASHES, NULL);
1398
1399	unacceptable = `0`;
1400	use_mask = mask;
1401	for (p = string; *p != `'\0'`; p++)
1402	{
1403	c = (guchar) *p;
1404	if (!ACCEPTABLE (c))
1405	unacceptable++;
1406	}
1407
1408	result = g_malloc (n_bytes: p - string + unacceptable * `2` + `1`);
1409
1410	use_mask = mask;
1411	for (q = result, p = string; *p != `'\0'`; p++)
1412	{
1413	c = (guchar) *p;
1414
1415	if (!ACCEPTABLE (c))
1416	{
1417	q++ = `'%'`; /* means hex coming /
1418	*q++ = hex[c >> `4`];
1419	*q++ = hex[c & `15`];
1420	}
1421	else
1422	q++ = p;
1423	}
1424
1425	*q = `'\0'`;
1426
1427	return result;
1428	}
1429
1430
1431	static gchar *
1432	g_escape_file_uri (const gchar *hostname,
1433	const gchar *pathname)
1434	{
1435	char *escaped_hostname = NULL;
1436	char *escaped_path;
1437	char *res;
1438
1439	#ifdef G_OS_WIN32
1440	char p, backslash;
1441
1442	/ Turn backslashes into forward slashes. That's what Netscape*
1443	* does, and they are actually more or less equivalent in Windows.
1444	*/
1445
1446	pathname = g_strdup (pathname);
1447	p = (char *) pathname;
1448
1449	while ((backslash = strchr (p, `'\\'`)) != NULL)
1450	{
1451	*backslash = `'/'`;
1452	p = backslash + `1`;
1453	}
1454	#endif
1455
1456	if (hostname && *hostname != `'\0'`)
1457	{
1458	escaped_hostname = g_escape_uri_string (string: hostname, mask: UNSAFE_HOST);
1459	}
1460
1461	escaped_path = g_escape_uri_string (string: pathname, mask: UNSAFE_PATH);
1462
1463	res = g_strconcat (string1: "file://",
1464	(escaped_hostname) ? escaped_hostname : "",
1465	(*escaped_path != `'/'`) ? "/" : "",
1466	escaped_path,
1467	NULL);
1468
1469	#ifdef G_OS_WIN32
1470	g_free ((char *) pathname);
1471	#endif
1472
1473	g_free (mem: escaped_hostname);
1474	g_free (mem: escaped_path);
1475
1476	return res;
1477	}
1478
/* Decodes the two hex digits at scanner[0] and scanner[1] into a byte
 * value. Returns -1 if either character is not a hex digit. scanner[1] is
 * only examined once scanner[0] has been accepted.
 */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);

  return (low_nibble < 0) ? -1 : ((high_nibble << 4) | low_nibble);
}
1495
1496	static gchar *
1497	g_unescape_uri_string (const char *escaped,
1498	int len,
1499	const char *illegal_escaped_characters,
1500	gboolean ascii_must_not_be_escaped)
1501	{
1502	const gchar in, in_end;
1503	gchar out, result;
1504	int c;
1505
1506	if (escaped == NULL)
1507	return NULL;
1508
1509	if (len < `0`)
1510	len = strlen (s: escaped);
1511
1512	result = g_malloc (n_bytes: len + `1`);
1513
1514	out = result;
1515	for (in = escaped, in_end = escaped + len; in < in_end; in++)
1516	{
1517	c = *in;
1518
1519	if (c == `'%'`)
1520	{
1521	/ catch partial escape sequences past the end of the substring /
1522	if (in + `3` > in_end)
1523	break;
1524
1525	c = unescape_character (scanner: in + `1`);
1526
1527	/ catch bad escape sequences and NUL characters /
1528	if (c <= `0`)
1529	break;
1530
1531	/ catch escaped ASCII /
1532	if (ascii_must_not_be_escaped && c <= `0x7F`)
1533	break;
1534
1535	/ catch other illegal escaped characters /
1536	if (strchr (s: illegal_escaped_characters, c: c) != NULL)
1537	break;
1538
1539	in += `2`;
1540	}
1541
1542	*out++ = c;
1543	}
1544
1545	g_assert (out - result <= len);
1546	*out = `'\0'`;
1547
1548	if (in != in_end)
1549	{
1550	g_free (mem: result);
1551	return NULL;
1552	}
1553
1554	return result;
1555	}
1556
1557	static gboolean
1558	is_asciialphanum (gunichar c)
1559	{
1560	return c <= `0x7F` && g_ascii_isalnum (c);
1561	}
1562
1563	static gboolean
1564	is_asciialpha (gunichar c)
1565	{
1566	return c <= `0x7F` && g_ascii_isalpha (c);
1567	}
1568
1569	/ allows an empty string /
1570	static gboolean
1571	hostname_validate (const char *hostname)
1572	{
1573	const char *p;
1574	gunichar c, first_char, last_char;
1575
1576	p = hostname;
1577	if (*p == `'\0'`)
1578	return TRUE;
1579	do
1580	{
1581	/ read in a label /
1582	c = g_utf8_get_char (p);
1583	p = g_utf8_next_char (p);
1584	if (!is_asciialphanum (c))
1585	return FALSE;
1586	first_char = c;
1587	do
1588	{
1589	last_char = c;
1590	c = g_utf8_get_char (p);
1591	p = g_utf8_next_char (p);
1592	}
1593	while (is_asciialphanum (c) \|\| c == `'-'`);
1594	if (last_char == `'-'`)
1595	return FALSE;
1596
1597	/ if that was the last label, check that it was a toplabel /
1598	if (c == `'\0'` \|\| (c == `'.'` && *p == `'\0'`))
1599	return is_asciialpha (c: first_char);
1600	}
1601	while (c == `'.'`);
1602	return FALSE;
1603	}
1604
1605	/**
1606	* g_filename_from_uri:
1607	* @uri: a uri describing a filename (escaped, encoded in ASCII).
1608	* @hostname: (out) (optional) (nullable): Location to store hostname for the URI.
1609	* If there is no hostname in the URI, %NULL will be
1610	* stored in this location.
1611	* @error: location to store the error occurring, or %NULL to ignore
1612	* errors. Any of the errors in #GConvertError may occur.
1613	*
1614	* Converts an escaped ASCII-encoded URI to a local filename in the
1615	* encoding used for filenames.
1616	*
1617	* Returns: (type filename): a newly-allocated string holding
1618	* the resulting filename, or %NULL on an error.
1619	**/
1620	gchar *
1621	g_filename_from_uri (const gchar *uri,
1622	gchar **hostname,
1623	GError **error)
1624	{
1625	const char *path_part;
1626	const char *host_part;
1627	char *unescaped_hostname;
1628	char *result;
1629	char *filename;
1630	int offs;
1631	#ifdef G_OS_WIN32
1632	char p, slash;
1633	#endif
1634
1635	if (hostname)
1636	*hostname = NULL;
1637
1638	if (!has_case_prefix (haystack: uri, needle: "file:/"))
1639	{
1640	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI,
1641	_("The URI “%s” is not an absolute URI using the “file” scheme"),
1642	uri);
1643	return NULL;
1644	}
1645
1646	path_part = uri + strlen (s: "file:");
1647
1648	if (strchr (s: path_part, c: `'#'`) != NULL)
1649	{
1650	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI,
1651	_("The local file URI “%s” may not include a “#”"),
1652	uri);
1653	return NULL;
1654	}
1655
1656	if (has_case_prefix (haystack: path_part, needle: "///"))
1657	path_part += `2`;
1658	else if (has_case_prefix (haystack: path_part, needle: "//"))
1659	{
1660	path_part += `2`;
1661	host_part = path_part;
1662
1663	path_part = strchr (s: path_part, c: `'/'`);
1664
1665	if (path_part == NULL)
1666	{
1667	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI,
1668	_("The URI “%s” is invalid"),
1669	uri);
1670	return NULL;
1671	}
1672
1673	unescaped_hostname = g_unescape_uri_string (escaped: host_part, len: path_part - host_part, illegal_escaped_characters: "", TRUE);
1674
1675	if (unescaped_hostname == NULL \|\|
1676	!hostname_validate (hostname: unescaped_hostname))
1677	{
1678	g_free (mem: unescaped_hostname);
1679	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI,
1680	_("The hostname of the URI “%s” is invalid"),
1681	uri);
1682	return NULL;
1683	}
1684
1685	if (hostname)
1686	*hostname = unescaped_hostname;
1687	else
1688	g_free (mem: unescaped_hostname);
1689	}
1690
1691	filename = g_unescape_uri_string (escaped: path_part, len: -`1`, illegal_escaped_characters: "/", FALSE);
1692
1693	if (filename == NULL)
1694	{
1695	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI,
1696	_("The URI “%s” contains invalidly escaped characters"),
1697	uri);
1698	return NULL;
1699	}
1700
1701	offs = `0`;
1702	#ifdef G_OS_WIN32
1703	/ Drop localhost /
1704	if (hostname && *hostname != NULL &&
1705	g_ascii_strcasecmp (*hostname, "localhost") == `0`)
1706	{
1707	g_free (*hostname);
1708	*hostname = NULL;
1709	}
1710
1711	/ Turn slashes into backslashes, because that's the canonical spelling /
1712	p = filename;
1713	while ((slash = strchr (p, `'/'`)) != NULL)
1714	{
1715	*slash = `'\\'`;
1716	p = slash + `1`;
1717	}
1718
1719	/ Windows URIs with a drive letter can be like "file://host/c:/foo"*
1720	* or "file://host/c\|/foo" (some Netscape versions). In those cases, start
1721	* the filename from the drive letter.
1722	*/
1723	if (g_ascii_isalpha (filename[`1`]))
1724	{
1725	if (filename[`2`] == `':'`)
1726	offs = `1`;
1727	else if (filename[`2`] == `'\|'`)
1728	{
1729	filename[`2`] = `':'`;
1730	offs = `1`;
1731	}
1732	}
1733	#endif
1734
1735	result = g_strdup (str: filename + offs);
1736	g_free (mem: filename);
1737
1738	return result;
1739	}
1740
1741	/**
1742	* g_filename_to_uri:
1743	* @filename: (type filename): an absolute filename specified in the GLib file
1744	* name encoding, which is the on-disk file name bytes on Unix, and UTF-8
1745	* on Windows
1746	* @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none.
1747	* @error: location to store the error occurring, or %NULL to ignore
1748	* errors. Any of the errors in #GConvertError may occur.
1749	*
1750	* Converts an absolute filename to an escaped ASCII-encoded URI, with the path
1751	* component following Section 3.3. of RFC 2396.
1752	*
1753	* Returns: a newly-allocated string holding the resulting
1754	* URI, or %NULL on an error.
1755	**/
1756	gchar *
1757	g_filename_to_uri (const gchar *filename,
1758	const gchar *hostname,
1759	GError **error)
1760	{
1761	char *escaped_uri;
1762
1763	g_return_val_if_fail (filename != NULL, NULL);
1764
1765	if (!g_path_is_absolute (file_name: filename))
1766	{
1767	g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_NOT_ABSOLUTE_PATH,
1768	_("The pathname “%s” is not an absolute path"),
1769	filename);
1770	return NULL;
1771	}
1772
1773	if (hostname &&
1774	!(g_utf8_validate (str: hostname, max_len: -`1`, NULL)
1775	&& hostname_validate (hostname)))
1776	{
1777	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1778	_("Invalid hostname"));
1779	return NULL;
1780	}
1781
1782	#ifdef G_OS_WIN32
1783	/ Don't use localhost unnecessarily /
1784	if (hostname && g_ascii_strcasecmp (hostname, "localhost") == `0`)
1785	hostname = NULL;
1786	#endif
1787
1788	escaped_uri = g_escape_file_uri (hostname, pathname: filename);
1789
1790	return escaped_uri;
1791	}
1792
1793	/**
1794	* g_uri_list_extract_uris:
1795	* @uri_list: an URI list
1796	*
1797	* Splits an URI list conforming to the text/uri-list
1798	* mime type defined in RFC 2483 into individual URIs,
1799	* discarding any comments. The URIs are not validated.
1800	*
1801	* Returns: (transfer full): a newly allocated %NULL-terminated list
1802	* of strings holding the individual URIs. The array should be freed
1803	* with g_strfreev().
1804	*
1805	* Since: 2.6
1806	*/
1807	gchar **
1808	g_uri_list_extract_uris (const gchar *uri_list)
1809	{
1810	GPtrArray *uris;
1811	const gchar p, q;
1812
1813	uris = g_ptr_array_new ();
1814
1815	p = uri_list;
1816
1817	/ We don't actually try to validate the URI according to RFC*
1818	* 2396, or even check for allowed characters - we just ignore
1819	* comments and trim whitespace off the ends. We also
1820	* allow LF delimination as well as the specified CRLF.
1821	*
1822	* We do allow comments like specified in RFC 2483.
1823	*/
1824	while (p)
1825	{
1826	if (*p != `'#'`)
1827	{
1828	while (g_ascii_isspace (*p))
1829	p++;
1830
1831	q = p;
1832	while (q && (q != `'\n'`) && (*q != `'\r'`))
1833	q++;
1834
1835	if (q > p)
1836	{
1837	q--;
1838	while (q > p && g_ascii_isspace (*q))
1839	q--;
1840
1841	if (q > p)
1842	g_ptr_array_add (array: uris, data: g_strndup (str: p, n: q - p + `1`));
1843	}
1844	}
1845	p = strchr (s: p, c: `'\n'`);
1846	if (p)
1847	p++;
1848	}
1849
1850	g_ptr_array_add (array: uris, NULL);
1851
1852	return (gchar **) g_ptr_array_free (array: uris, FALSE);
1853	}
1854
1855	/**
1856	* g_filename_display_basename:
1857	* @filename: (type filename): an absolute pathname in the
1858	* GLib file name encoding
1859	*
1860	* Returns the display basename for the particular filename, guaranteed
1861	* to be valid UTF-8. The display name might not be identical to the filename,
1862	* for instance there might be problems converting it to UTF-8, and some files
1863	* can be translated in the display.
1864	*
1865	* If GLib cannot make sense of the encoding of @filename, as a last resort it
1866	* replaces unknown characters with U+FFFD, the Unicode replacement character.
1867	* You can search the result for the UTF-8 encoding of this character (which is
1868	* "\357\277\275" in octal notation) to find out if @filename was in an invalid
1869	* encoding.
1870	*
1871	* You must pass the whole absolute pathname to this function so that
1872	* translation of well known locations can be done.
1873	*
1874	* This function is preferred over g_filename_display_name() if you know the
1875	* whole path, as it allows translation.
1876	*
1877	* Returns: a newly allocated string containing
1878	* a rendition of the basename of the filename in valid UTF-8
1879	*
1880	* Since: 2.6
1881	**/
1882	gchar *
1883	g_filename_display_basename (const gchar *filename)
1884	{
1885	char *basename;
1886	char *display_name;
1887
1888	g_return_val_if_fail (filename != NULL, NULL);
1889
1890	basename = g_path_get_basename (file_name: filename);
1891	display_name = g_filename_display_name (filename: basename);
1892	g_free (mem: basename);
1893	return display_name;
1894	}
1895
1896	/**
1897	* g_filename_display_name:
1898	* @filename: (type filename): a pathname hopefully in the
1899	* GLib file name encoding
1900	*
1901	* Converts a filename into a valid UTF-8 string. The conversion is
1902	* not necessarily reversible, so you should keep the original around
1903	* and use the return value of this function only for display purposes.
1904	* Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL
1905	* even if the filename actually isn't in the GLib file name encoding.
1906	*
1907	* If GLib cannot make sense of the encoding of @filename, as a last resort it
1908	* replaces unknown characters with U+FFFD, the Unicode replacement character.
1909	* You can search the result for the UTF-8 encoding of this character (which is
1910	* "\357\277\275" in octal notation) to find out if @filename was in an invalid
1911	* encoding.
1912	*
1913	* If you know the whole pathname of the file you should use
1914	* g_filename_display_basename(), since that allows location-based
1915	* translation of filenames.
1916	*
1917	* Returns: a newly allocated string containing
1918	* a rendition of the filename in valid UTF-8
1919	*
1920	* Since: 2.6
1921	**/
1922	gchar *
1923	g_filename_display_name (const gchar *filename)
1924	{
1925	gint i;
1926	const gchar **charsets;
1927	gchar *display_name = NULL;
1928	gboolean is_utf8;
1929
1930	is_utf8 = g_get_filename_charsets (filename_charsets: &charsets);
1931
1932	if (is_utf8)
1933	{
1934	if (g_utf8_validate (str: filename, max_len: -`1`, NULL))
1935	display_name = g_strdup (str: filename);
1936	}
1937
1938	if (!display_name)
1939	{
1940	/ Try to convert from the filename charsets to UTF-8.*
1941	* Skip the first charset if it is UTF-8.
1942	*/
1943	for (i = is_utf8 ? `1` : `0`; charsets[i]; i++)
1944	{
1945	display_name = g_convert (str: filename, len: -`1`, to_codeset: "UTF-8", from_codeset: charsets[i],
1946	NULL, NULL, NULL);
1947
1948	if (display_name)
1949	break;
1950	}
1951	}
1952
1953	/ if all conversions failed, we replace invalid UTF-8*
1954	* by a question mark
1955	*/
1956	if (!display_name)
1957	display_name = g_utf8_make_valid (str: filename, len: -`1`);
1958
1959	return display_name;
1960	}
1961
1962	#ifdef G_OS_WIN32
1963
1964	/ Binary compatibility versions. Not for newly compiled code. /
1965
1966	_GLIB_EXTERN gchar g_filename_to_utf8_utf8 (const* gchar *opsysstring,
1967	gssize len,
1968	gsize *bytes_read,
1969	gsize *bytes_written,
1970	GError **error) G_GNUC_MALLOC;
1971	_GLIB_EXTERN gchar g_filename_from_utf8_utf8 (const* gchar *utf8string,
1972	gssize len,
1973	gsize *bytes_read,
1974	gsize *bytes_written,
1975	GError **error) G_GNUC_MALLOC;
1976	_GLIB_EXTERN gchar g_filename_from_uri_utf8 (const* gchar *uri,
1977	gchar **hostname,
1978	GError **error) G_GNUC_MALLOC;
1979	_GLIB_EXTERN gchar g_filename_to_uri_utf8 (const* gchar *filename,
1980	const gchar *hostname,
1981	GError **error) G_GNUC_MALLOC;
1982
1983	gchar *
1984	g_filename_to_utf8_utf8 (const gchar *opsysstring,
1985	gssize len,
1986	gsize *bytes_read,
1987	gsize *bytes_written,
1988	GError **error)
1989	{
1990	return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error);
1991	}
1992
1993	gchar *
1994	g_filename_from_utf8_utf8 (const gchar *utf8string,
1995	gssize len,
1996	gsize *bytes_read,
1997	gsize *bytes_written,
1998	GError **error)
1999	{
2000	return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error);
2001	}
2002
2003	gchar *
2004	g_filename_from_uri_utf8 (const gchar *uri,
2005	gchar **hostname,
2006	GError **error)
2007	{
2008	return g_filename_from_uri (uri, hostname, error);
2009	}
2010
2011	gchar *
2012	g_filename_to_uri_utf8 (const gchar *filename,
2013	const gchar *hostname,
2014	GError **error)
2015	{
2016	return g_filename_to_uri (filename, hostname, error);
2017	}
2018
2019	#endif
2020

/* source code of gtk/subprojects/glib/glib/gconvert.c */