1 | /* GLIB - Library of useful routines for C programming |
2 | * |
3 | * gconvert.c: Convert between character sets using iconv |
4 | * Copyright Red Hat Inc., 2000 |
5 | * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com> |
6 | * |
7 | * This library is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU Lesser General Public |
9 | * License as published by the Free Software Foundation; either |
10 | * version 2.1 of the License, or (at your option) any later version. |
11 | * |
12 | * This library is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | * Lesser General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU Lesser General Public |
18 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
19 | */ |
20 | |
21 | #include "config.h" |
22 | #include "glibconfig.h" |
23 | |
24 | #ifndef G_OS_WIN32 |
25 | #include <iconv.h> |
26 | #endif |
27 | #include <errno.h> |
28 | #include <stdio.h> |
29 | #include <string.h> |
30 | #include <stdlib.h> |
31 | |
32 | #ifdef G_OS_WIN32 |
33 | #include "win_iconv.c" |
34 | #endif |
35 | |
36 | #ifdef G_PLATFORM_WIN32 |
37 | #define STRICT |
38 | #include <windows.h> |
39 | #undef STRICT |
40 | #endif |
41 | |
42 | #include "gconvert.h" |
43 | |
44 | #include "gcharsetprivate.h" |
45 | #include "gslist.h" |
46 | #include "gstrfuncs.h" |
47 | #include "gtestutils.h" |
48 | #include "gthread.h" |
49 | #include "gthreadprivate.h" |
50 | #include "gunicode.h" |
51 | #include "gfileutils.h" |
52 | #include "genviron.h" |
53 | |
54 | #include "glibintl.h" |
55 | |
56 | |
57 | /** |
58 | * SECTION:conversions |
59 | * @title: Character Set Conversion |
60 | * @short_description: convert strings between different character sets |
61 | * |
62 | * The g_convert() family of functions wraps the functionality of iconv(). |
63 | * In addition to pure character set conversions, GLib has functions to |
64 | * deal with the extra complications of encodings for file names. |
65 | * |
66 | * ## File Name Encodings |
67 | * |
68 | * Historically, UNIX has not had a defined encoding for file names: |
69 | * a file name is valid as long as it does not have path separators |
70 | * in it ("/"). However, displaying file names may require conversion: |
71 | * from the character set in which they were created, to the character |
72 | * set in which the application operates. Consider the Spanish file name |
73 | * "Presentación.sxi". If the application which created it uses |
74 | * ISO-8859-1 for its encoding, |
75 | * |[ |
76 | * Character: P r e s e n t a c i ó n . s x i |
77 | * Hex code: 50 72 65 73 65 6e 74 61 63 69 f3 6e 2e 73 78 69 |
78 | * ]| |
79 | * However, if the application uses UTF-8, the actual file name on |
80 | * disk would look like this: |
81 | * |[ |
82 | * Character: P r e s e n t a c i ó n . s x i |
83 | * Hex code: 50 72 65 73 65 6e 74 61 63 69 c3 b3 6e 2e 73 78 69 |
84 | * ]| |
85 | * GLib uses UTF-8 for its strings, and GUI toolkits like GTK+ that use |
86 | * GLib do the same thing. If you get a file name from the file system, |
87 | * for example, from readdir() or from g_dir_read_name(), and you wish |
88 | * to display the file name to the user, you will need to convert it |
89 | * into UTF-8. The opposite case is when the user types the name of a |
90 | * file they wish to save: the toolkit will give you that string in |
91 | * UTF-8 encoding, and you will need to convert it to the character |
92 | * set used for file names before you can create the file with open() |
93 | * or fopen(). |
94 | * |
95 | * By default, GLib assumes that file names on disk are in UTF-8 |
96 | * encoding. This is a valid assumption for file systems which |
97 | * were created relatively recently: most applications use UTF-8 |
98 | * encoding for their strings, and that is also what they use for |
99 | * the file names they create. However, older file systems may |
100 | * still contain file names created in "older" encodings, such as |
101 | * ISO-8859-1. In this case, for compatibility reasons, you may want |
102 | * to instruct GLib to use that particular encoding for file names |
103 | * rather than UTF-8. You can do this by specifying the encoding for |
104 | * file names in the [`G_FILENAME_ENCODING`][G_FILENAME_ENCODING] |
105 | * environment variable. For example, if your installation uses |
106 | * ISO-8859-1 for file names, you can put this in your `~/.profile`: |
107 | * |[ |
108 | * export G_FILENAME_ENCODING=ISO-8859-1 |
109 | * ]| |
110 | * GLib provides the functions g_filename_to_utf8() and |
111 | * g_filename_from_utf8() to perform the necessary conversions. |
112 | * These functions convert file names from the encoding specified |
113 | * in `G_FILENAME_ENCODING` to UTF-8 and vice-versa. This |
114 | * [diagram][file-name-encodings-diagram] illustrates how |
115 | * these functions are used to convert between UTF-8 and the |
116 | * encoding for file names in the file system. |
117 | * |
118 | * ## Conversion between file name encodings # {#file-name-encodings-diagram} |
119 | * |
120 | * ![](file-name-encodings.png) |
121 | * |
122 | * ## Checklist for Application Writers |
123 | * |
124 | * This section is a practical summary of the detailed |
125 | * things to do to make sure your applications process file |
126 | * name encodings correctly. |
127 | * |
128 | * 1. If you get a file name from the file system from a function |
129 | * such as readdir() or gtk_file_chooser_get_filename(), you do |
130 | * not need to do any conversion to pass that file name to |
131 | * functions like open(), rename(), or fopen() -- those are "raw" |
132 | * file names which the file system understands. |
133 | * |
134 | * 2. If you need to display a file name, convert it to UTF-8 first |
135 | * by using g_filename_to_utf8(). If conversion fails, display a |
136 | * string like "Unknown file name". Do not convert this string back |
137 | * into the encoding used for file names if you wish to pass it to |
138 | * the file system; use the original file name instead. |
139 | * |
140 | * For example, the document window of a word processor could display |
141 | * "Unknown file name" in its title bar but still let the user save |
142 | * the file, as it would keep the raw file name internally. This |
143 | * can happen if the user has not set the `G_FILENAME_ENCODING` |
144 | * environment variable even though he has files whose names are |
145 | * not encoded in UTF-8. |
146 | * |
147 | * 3. If your user interface lets the user type a file name for saving |
148 | * or renaming, convert it to the encoding used for file names in |
149 | * the file system by using g_filename_from_utf8(). Pass the converted |
150 | * file name to functions like fopen(). If conversion fails, ask the |
151 | * user to enter a different file name. This can happen if the user |
152 | * types Japanese characters when `G_FILENAME_ENCODING` is set to |
153 | * `ISO-8859-1`, for example. |
154 | */ |
155 | |
156 | /* We try to terminate strings in unknown charsets with this many zero bytes |
157 | * to ensure that multibyte strings really are nul-terminated when we return |
158 | * them from g_convert() and friends. The converters in this file hold back |
159 | * this many bytes at the end of each output buffer and zero them on return. */ |
160 | #define NUL_TERMINATOR_LENGTH 4 |
161 | /* NOTE(review): presumably defines the quark function behind the G_CONVERT_ERROR domain -- confirm in gconvert.h */ |
162 | G_DEFINE_QUARK (g_convert_error, g_convert_error) |
163 | |
164 | static gboolean |
165 | try_conversion (const char *to_codeset, |
166 | const char *from_codeset, |
167 | iconv_t *cd) |
168 | { |
169 | *cd = iconv_open (tocode: to_codeset, fromcode: from_codeset); |
170 | |
171 | if (*cd == (iconv_t)-1 && errno == EINVAL) |
172 | return FALSE; |
173 | else |
174 | return TRUE; |
175 | } |
176 | |
177 | static gboolean |
178 | try_to_aliases (const char **to_aliases, |
179 | const char *from_codeset, |
180 | iconv_t *cd) |
181 | { |
182 | if (to_aliases) |
183 | { |
184 | const char **p = to_aliases; |
185 | while (*p) |
186 | { |
187 | if (try_conversion (to_codeset: *p, from_codeset, cd)) |
188 | return TRUE; |
189 | |
190 | p++; |
191 | } |
192 | } |
193 | |
194 | return FALSE; |
195 | } |
196 | |
197 | /** |
198 | * g_iconv_open: (skip) |
199 | * @to_codeset: destination codeset |
200 | * @from_codeset: source codeset |
201 | * |
202 | * Same as the standard UNIX routine iconv_open(), but |
203 | * may be implemented via libiconv on UNIX flavors that lack |
204 | * a native implementation. |
205 | * |
206 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
207 | * more convenient than the raw iconv wrappers. |
208 | * |
209 | * Returns: a "conversion descriptor", or (GIConv)-1 if |
210 | * opening the converter failed. |
211 | **/ |
212 | GIConv |
213 | g_iconv_open (const gchar *to_codeset, |
214 | const gchar *from_codeset) |
215 | { |
216 | iconv_t cd; |
217 | |
218 | if (!try_conversion (to_codeset, from_codeset, cd: &cd)) |
219 | { |
220 | const char **to_aliases = _g_charset_get_aliases (canonical_name: to_codeset); |
221 | const char **from_aliases = _g_charset_get_aliases (canonical_name: from_codeset); |
222 | |
223 | if (from_aliases) |
224 | { |
225 | const char **p = from_aliases; |
226 | while (*p) |
227 | { |
228 | if (try_conversion (to_codeset, from_codeset: *p, cd: &cd)) |
229 | goto out; |
230 | |
231 | if (try_to_aliases (to_aliases, from_codeset: *p, cd: &cd)) |
232 | goto out; |
233 | |
234 | p++; |
235 | } |
236 | } |
237 | |
238 | if (try_to_aliases (to_aliases, from_codeset, cd: &cd)) |
239 | goto out; |
240 | } |
241 | |
242 | out: |
243 | return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd; |
244 | } |
245 | |
246 | /** |
247 | * g_iconv: (skip) |
248 | * @converter: conversion descriptor from g_iconv_open() |
249 | * @inbuf: bytes to convert |
250 | * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf |
251 | * @outbuf: converted output bytes |
252 | * @outbytes_left: inout parameter, bytes available to fill in @outbuf |
253 | * |
254 | * Same as the standard UNIX routine iconv(), but |
255 | * may be implemented via libiconv on UNIX flavors that lack |
256 | * a native implementation. |
257 | * |
258 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
259 | * more convenient than the raw iconv wrappers. |
260 | * |
261 | * Note that the behaviour of iconv() for characters which are valid in the |
262 | * input character set, but which have no representation in the output character |
263 | * set, is implementation defined. This function may return success (with a |
264 | * positive number of non-reversible conversions as replacement characters were |
265 | * used), or it may return -1 and set an error such as %EILSEQ, in such a |
266 | * situation. |
267 | * |
268 | * Returns: count of non-reversible conversions, or -1 on error |
269 | **/ |
270 | gsize |
271 | g_iconv (GIConv converter, |
272 | gchar **inbuf, |
273 | gsize *inbytes_left, |
274 | gchar **outbuf, |
275 | gsize *outbytes_left) |
276 | { |
277 | iconv_t cd = (iconv_t)converter; |
278 | |
279 | return iconv (cd: cd, inbuf: inbuf, inbytesleft: inbytes_left, outbuf: outbuf, outbytesleft: outbytes_left); |
280 | } |
281 | |
282 | /** |
283 | * g_iconv_close: (skip) |
284 | * @converter: a conversion descriptor from g_iconv_open() |
285 | * |
286 | * Same as the standard UNIX routine iconv_close(), but |
287 | * may be implemented via libiconv on UNIX flavors that lack |
288 | * a native implementation. Should be called to clean up |
289 | * the conversion descriptor from g_iconv_open() when |
290 | * you are done converting things. |
291 | * |
292 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
293 | * more convenient than the raw iconv wrappers. |
294 | * |
295 | * Returns: -1 on error, 0 on success |
296 | **/ |
297 | gint |
298 | g_iconv_close (GIConv converter) |
299 | { |
300 | iconv_t cd = (iconv_t)converter; |
301 | |
302 | return iconv_close (cd: cd); |
303 | } |
304 | |
305 | static GIConv |
306 | open_converter (const gchar *to_codeset, |
307 | const gchar *from_codeset, |
308 | GError **error) |
309 | { |
310 | GIConv cd; |
311 | |
312 | cd = g_iconv_open (to_codeset, from_codeset); |
313 | |
314 | if (cd == (GIConv) -1) |
315 | { |
316 | /* Something went wrong. */ |
317 | if (error) |
318 | { |
319 | if (errno == EINVAL) |
320 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_NO_CONVERSION, |
321 | _("Conversion from character set “%s” to “%s” is not supported" ), |
322 | from_codeset, to_codeset); |
323 | else |
324 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_FAILED, |
325 | _("Could not open converter from “%s” to “%s”" ), |
326 | from_codeset, to_codeset); |
327 | } |
328 | } |
329 | |
330 | return cd; |
331 | } |
332 | |
333 | static int |
334 | close_converter (GIConv cd) |
335 | { |
336 | if (cd == (GIConv) -1) |
337 | return 0; |
338 | |
339 | return g_iconv_close (converter: cd); |
340 | } |
341 | |
342 | /** |
343 | * g_convert_with_iconv: (skip) |
344 | * @str: (array length=len) (element-type guint8): |
345 | * the string to convert. |
346 | * @len: the length of the string in bytes, or -1 if the string is |
347 | * nul-terminated (Note that some encodings may allow nul |
348 | * bytes to occur inside strings. In that case, using -1 |
349 | * for the @len parameter is unsafe) |
350 | * @converter: conversion descriptor from g_iconv_open() |
351 | * @bytes_read: (out) (optional): location to store the number of bytes in |
352 | * the input string that were successfully converted, or %NULL. |
353 | * Even if the conversion was successful, this may be |
354 | * less than @len if there were partial characters |
355 | * at the end of the input. If the error |
356 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
357 | * stored will be the byte offset after the last valid |
358 | * input sequence. |
359 | * @bytes_written: (out) (optional): the number of bytes stored in |
360 | * the output buffer (not including the terminating nul). |
361 | * @error: location to store the error occurring, or %NULL to ignore |
362 | * errors. Any of the errors in #GConvertError may occur. |
363 | * |
364 | * Converts a string from one character set to another. |
365 | * |
366 | * Note that you should use g_iconv() for streaming conversions. |
367 | * Despite the fact that @bytes_read can return information about partial |
368 | * characters, the g_convert_... functions are not generally suitable |
369 | * for streaming. If the underlying converter maintains internal state, |
370 | * then this won't be preserved across successive calls to g_convert(), |
371 | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
372 | * this is the GNU C converter for CP1255 which does not emit a base |
373 | * character until it knows that the next character is not a mark that |
374 | * could combine with the base character.) |
375 | * |
376 | * Characters which are valid in the input character set, but which have no |
377 | * representation in the output character set will result in a |
378 | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error. This is in contrast to the iconv() |
379 | * specification, which leaves this behaviour implementation defined. Note that |
380 | * this is the same error code as is returned for an invalid byte sequence in |
381 | * the input character set. To get defined behaviour for conversion of |
382 | * unrepresentable characters, use g_convert_with_fallback(). |
383 | * |
384 | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
385 | * If the conversion was successful, a newly allocated buffer |
386 | * containing the converted string, which must be freed with |
387 | * g_free(). Otherwise %NULL and @error will be set. |
388 | **/ |
389 | gchar* |
390 | g_convert_with_iconv (const gchar *str, |
391 | gssize len, |
392 | GIConv converter, |
393 | gsize *bytes_read, |
394 | gsize *bytes_written, |
395 | GError **error) |
396 | { |
397 | gchar *dest; |
398 | gchar *outp; |
399 | const gchar *p; |
400 | gsize inbytes_remaining; |
401 | gsize outbytes_remaining; |
402 | gsize err; |
403 | gsize outbuf_size; |
404 | gboolean have_error = FALSE; |
405 | gboolean done = FALSE; |
406 | gboolean reset = FALSE; |
407 | |
408 | g_return_val_if_fail (converter != (GIConv) -1, NULL); |
409 | |
410 | if (len < 0) |
411 | len = strlen (s: str); |
412 | |
413 | p = str; |
414 | inbytes_remaining = len; |
415 | outbuf_size = len + NUL_TERMINATOR_LENGTH; |
416 | |
417 | outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; |
418 | outp = dest = g_malloc (n_bytes: outbuf_size); |
419 | |
420 | while (!done && !have_error) |
421 | { |
422 | if (reset) |
423 | err = g_iconv (converter, NULL, inbytes_left: &inbytes_remaining, outbuf: &outp, outbytes_left: &outbytes_remaining); |
424 | else |
425 | err = g_iconv (converter, inbuf: (char **)&p, inbytes_left: &inbytes_remaining, outbuf: &outp, outbytes_left: &outbytes_remaining); |
426 | |
427 | if (err == (gsize) -1) |
428 | { |
429 | switch (errno) |
430 | { |
431 | case EINVAL: |
432 | /* Incomplete text, do not report an error */ |
433 | done = TRUE; |
434 | break; |
435 | case E2BIG: |
436 | { |
437 | gsize used = outp - dest; |
438 | |
439 | outbuf_size *= 2; |
440 | dest = g_realloc (mem: dest, n_bytes: outbuf_size); |
441 | |
442 | outp = dest + used; |
443 | outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; |
444 | } |
445 | break; |
446 | case EILSEQ: |
447 | g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
448 | _("Invalid byte sequence in conversion input" )); |
449 | have_error = TRUE; |
450 | break; |
451 | default: |
452 | { |
453 | int errsv = errno; |
454 | |
455 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_FAILED, |
456 | _("Error during conversion: %s" ), |
457 | g_strerror (errnum: errsv)); |
458 | } |
459 | have_error = TRUE; |
460 | break; |
461 | } |
462 | } |
463 | else if (err > 0) |
464 | { |
465 | /* @err gives the number of replacement characters used. */ |
466 | g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
467 | _("Unrepresentable character in conversion input" )); |
468 | have_error = TRUE; |
469 | } |
470 | else |
471 | { |
472 | if (!reset) |
473 | { |
474 | /* call g_iconv with NULL inbuf to cleanup shift state */ |
475 | reset = TRUE; |
476 | inbytes_remaining = 0; |
477 | } |
478 | else |
479 | done = TRUE; |
480 | } |
481 | } |
482 | |
483 | memset (s: outp, c: 0, NUL_TERMINATOR_LENGTH); |
484 | |
485 | if (bytes_read) |
486 | *bytes_read = p - str; |
487 | else |
488 | { |
489 | if ((p - str) != len) |
490 | { |
491 | if (!have_error) |
492 | { |
493 | g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT, |
494 | _("Partial character sequence at end of input" )); |
495 | have_error = TRUE; |
496 | } |
497 | } |
498 | } |
499 | |
500 | if (bytes_written) |
501 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
502 | |
503 | if (have_error) |
504 | { |
505 | g_free (mem: dest); |
506 | return NULL; |
507 | } |
508 | else |
509 | return dest; |
510 | } |
511 | |
512 | /** |
513 | * g_convert: |
514 | * @str: (array length=len) (element-type guint8): |
515 | * the string to convert. |
516 | * @len: the length of the string in bytes, or -1 if the string is |
517 | * nul-terminated (Note that some encodings may allow nul |
518 | * bytes to occur inside strings. In that case, using -1 |
519 | * for the @len parameter is unsafe) |
520 | * @to_codeset: name of character set into which to convert @str |
521 | * @from_codeset: character set of @str. |
522 | * @bytes_read: (out) (optional): location to store the number of bytes in |
523 | * the input string that were successfully converted, or %NULL. |
524 | * Even if the conversion was successful, this may be |
525 | * less than @len if there were partial characters |
526 | * at the end of the input. If the error |
527 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
528 | * stored will be the byte offset after the last valid |
529 | * input sequence. |
530 | * @bytes_written: (out) (optional): the number of bytes stored in |
531 | * the output buffer (not including the terminating nul). |
532 | * @error: location to store the error occurring, or %NULL to ignore |
533 | * errors. Any of the errors in #GConvertError may occur. |
534 | * |
535 | * Converts a string from one character set to another. |
536 | * |
537 | * Note that you should use g_iconv() for streaming conversions. |
538 | * Despite the fact that @bytes_read can return information about partial |
539 | * characters, the g_convert_... functions are not generally suitable |
540 | * for streaming. If the underlying converter maintains internal state, |
541 | * then this won't be preserved across successive calls to g_convert(), |
542 | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
543 | * this is the GNU C converter for CP1255 which does not emit a base |
544 | * character until it knows that the next character is not a mark that |
545 | * could combine with the base character.) |
546 | * |
547 | * Using extensions such as "//TRANSLIT" may not work (or may not work |
548 | * well) on many platforms. Consider using g_str_to_ascii() instead. |
549 | * |
550 | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
551 | * If the conversion was successful, a newly allocated buffer |
552 | * containing the converted string, which must be freed with g_free(). |
553 | * Otherwise %NULL and @error will be set. |
554 | **/ |
555 | gchar* |
556 | g_convert (const gchar *str, |
557 | gssize len, |
558 | const gchar *to_codeset, |
559 | const gchar *from_codeset, |
560 | gsize *bytes_read, |
561 | gsize *bytes_written, |
562 | GError **error) |
563 | { |
564 | gchar *res; |
565 | GIConv cd; |
566 | |
567 | g_return_val_if_fail (str != NULL, NULL); |
568 | g_return_val_if_fail (to_codeset != NULL, NULL); |
569 | g_return_val_if_fail (from_codeset != NULL, NULL); |
570 | |
571 | cd = open_converter (to_codeset, from_codeset, error); |
572 | |
573 | if (cd == (GIConv) -1) |
574 | { |
575 | if (bytes_read) |
576 | *bytes_read = 0; |
577 | |
578 | if (bytes_written) |
579 | *bytes_written = 0; |
580 | |
581 | return NULL; |
582 | } |
583 | |
584 | res = g_convert_with_iconv (str, len, converter: cd, |
585 | bytes_read, bytes_written, |
586 | error); |
587 | |
588 | close_converter (cd); |
589 | |
590 | return res; |
591 | } |
592 | |
593 | /** |
594 | * g_convert_with_fallback: |
595 | * @str: (array length=len) (element-type guint8): |
596 | * the string to convert. |
597 | * @len: the length of the string in bytes, or -1 if the string is |
598 | * nul-terminated (Note that some encodings may allow nul |
599 | * bytes to occur inside strings. In that case, using -1 |
600 | * for the @len parameter is unsafe) |
601 | * @to_codeset: name of character set into which to convert @str |
602 | * @from_codeset: character set of @str. |
603 | * @fallback: UTF-8 string to use in place of characters not |
604 | * present in the target encoding. (The string must be |
605 | * representable in the target encoding). |
606 | * If %NULL, characters not in the target encoding will |
607 | * be represented as Unicode escapes \uxxxx or \Uxxxxyyyy. |
608 | * @bytes_read: (out) (optional): location to store the number of bytes in |
609 | * the input string that were successfully converted, or %NULL. |
610 | * Even if the conversion was successful, this may be |
611 | * less than @len if there were partial characters |
612 | * at the end of the input. |
613 | * @bytes_written: (out) (optional): the number of bytes stored in |
614 | * the output buffer (not including the terminating nul). |
615 | * @error: location to store the error occurring, or %NULL to ignore |
616 | * errors. Any of the errors in #GConvertError may occur. |
617 | * |
618 | * Converts a string from one character set to another, possibly |
619 | * including fallback sequences for characters not representable |
620 | * in the output. Note that it is not guaranteed that the specification |
621 | * for the fallback sequences in @fallback will be honored. Some |
622 | * systems may do an approximate conversion from @from_codeset |
623 | * to @to_codeset in their iconv() functions, |
624 | * in which case GLib will simply return that approximate conversion. |
625 | * |
626 | * Note that you should use g_iconv() for streaming conversions. |
627 | * Despite the fact that @bytes_read can return information about partial |
628 | * characters, the g_convert_... functions are not generally suitable |
629 | * for streaming. If the underlying converter maintains internal state, |
630 | * then this won't be preserved across successive calls to g_convert(), |
631 | * g_convert_with_iconv() or g_convert_with_fallback(). (An example of |
632 | * this is the GNU C converter for CP1255 which does not emit a base |
633 | * character until it knows that the next character is not a mark that |
634 | * could combine with the base character.) |
635 | * |
636 | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
637 | * If the conversion was successful, a newly allocated buffer |
638 | * containing the converted string, which must be freed with g_free(). |
639 | * Otherwise %NULL and @error will be set. |
640 | **/ |
641 | gchar* |
642 | g_convert_with_fallback (const gchar *str, |
643 | gssize len, |
644 | const gchar *to_codeset, |
645 | const gchar *from_codeset, |
646 | const gchar *fallback, |
647 | gsize *bytes_read, |
648 | gsize *bytes_written, |
649 | GError **error) |
650 | { |
651 | gchar *utf8; |
652 | gchar *dest; |
653 | gchar *outp; |
654 | const gchar *insert_str = NULL; |
655 | const gchar *p; |
656 | gsize inbytes_remaining; |
657 | const gchar *save_p = NULL; |
658 | gsize save_inbytes = 0; |
659 | gsize outbytes_remaining; |
660 | gsize err; |
661 | GIConv cd; |
662 | gsize outbuf_size; |
663 | gboolean have_error = FALSE; |
664 | gboolean done = FALSE; |
665 | |
666 | GError *local_error = NULL; |
667 | |
668 | g_return_val_if_fail (str != NULL, NULL); |
669 | g_return_val_if_fail (to_codeset != NULL, NULL); |
670 | g_return_val_if_fail (from_codeset != NULL, NULL); |
671 | |
672 | if (len < 0) |
673 | len = strlen (s: str); |
674 | |
675 | /* Try an exact conversion; we only proceed if this fails |
676 | * due to an illegal sequence in the input string. |
677 | */ |
678 | dest = g_convert (str, len, to_codeset, from_codeset, |
679 | bytes_read, bytes_written, error: &local_error); |
680 | if (!local_error) |
681 | return dest; |
682 | |
683 | if (!g_error_matches (error: local_error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) |
684 | { |
685 | g_propagate_error (dest: error, src: local_error); |
686 | return NULL; |
687 | } |
688 | else |
689 | g_error_free (error: local_error); |
690 | |
691 | local_error = NULL; |
692 | |
693 | /* No go; to proceed, we need a converter from "UTF-8" to |
694 | * to_codeset, and the string as UTF-8. |
695 | */ |
696 | cd = open_converter (to_codeset, from_codeset: "UTF-8" , error); |
697 | if (cd == (GIConv) -1) |
698 | { |
699 | if (bytes_read) |
700 | *bytes_read = 0; |
701 | |
702 | if (bytes_written) |
703 | *bytes_written = 0; |
704 | |
705 | return NULL; |
706 | } |
707 | |
708 | utf8 = g_convert (str, len, to_codeset: "UTF-8" , from_codeset, |
709 | bytes_read, bytes_written: &inbytes_remaining, error); |
710 | if (!utf8) |
711 | { |
712 | close_converter (cd); |
713 | if (bytes_written) |
714 | *bytes_written = 0; |
715 | return NULL; |
716 | } |
717 | |
718 | /* Now the heart of the code. We loop through the UTF-8 string, and |
719 | * whenever we hit an offending character, we form fallback, convert |
720 | * the fallback to the target codeset, and then go back to |
721 | * converting the original string after finishing with the fallback. |
722 | * |
723 | * The variables save_p and save_inbytes store the input state |
724 | * for the original string while we are converting the fallback |
725 | */ |
726 | p = utf8; |
727 | |
728 | outbuf_size = len + NUL_TERMINATOR_LENGTH; |
729 | outbytes_remaining = outbuf_size - NUL_TERMINATOR_LENGTH; |
730 | outp = dest = g_malloc (n_bytes: outbuf_size); |
731 | |
732 | while (!done && !have_error) |
733 | { |
734 | gsize inbytes_tmp = inbytes_remaining; |
735 | err = g_iconv (converter: cd, inbuf: (char **)&p, inbytes_left: &inbytes_tmp, outbuf: &outp, outbytes_left: &outbytes_remaining); |
736 | inbytes_remaining = inbytes_tmp; |
737 | |
738 | if (err == (gsize) -1) |
739 | { |
740 | switch (errno) |
741 | { |
742 | case EINVAL: |
743 | g_assert_not_reached(); |
744 | break; |
745 | case E2BIG: |
746 | { |
747 | gsize used = outp - dest; |
748 | |
749 | outbuf_size *= 2; |
750 | dest = g_realloc (mem: dest, n_bytes: outbuf_size); |
751 | |
752 | outp = dest + used; |
753 | outbytes_remaining = outbuf_size - used - NUL_TERMINATOR_LENGTH; |
754 | |
755 | break; |
756 | } |
757 | case EILSEQ: |
758 | if (save_p) |
759 | { |
760 | /* Error converting fallback string - fatal |
761 | */ |
762 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
763 | _("Cannot convert fallback “%s” to codeset “%s”" ), |
764 | insert_str, to_codeset); |
765 | have_error = TRUE; |
766 | break; |
767 | } |
768 | else if (p) |
769 | { |
770 | if (!fallback) |
771 | { |
772 | gunichar ch = g_utf8_get_char (p); |
773 | insert_str = g_strdup_printf (format: ch < 0x10000 ? "\\u%04x" : "\\U%08x" , |
774 | ch); |
775 | } |
776 | else |
777 | insert_str = fallback; |
778 | |
779 | save_p = g_utf8_next_char (p); |
780 | save_inbytes = inbytes_remaining - (save_p - p); |
781 | p = insert_str; |
782 | inbytes_remaining = strlen (s: p); |
783 | break; |
784 | } |
785 | /* if p is null */ |
786 | G_GNUC_FALLTHROUGH; |
787 | default: |
788 | { |
789 | int errsv = errno; |
790 | |
791 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_FAILED, |
792 | _("Error during conversion: %s" ), |
793 | g_strerror (errnum: errsv)); |
794 | } |
795 | |
796 | have_error = TRUE; |
797 | break; |
798 | } |
799 | } |
800 | else |
801 | { |
802 | if (save_p) |
803 | { |
804 | if (!fallback) |
805 | g_free (mem: (gchar *)insert_str); |
806 | p = save_p; |
807 | inbytes_remaining = save_inbytes; |
808 | save_p = NULL; |
809 | } |
810 | else if (p) |
811 | { |
812 | /* call g_iconv with NULL inbuf to cleanup shift state */ |
813 | p = NULL; |
814 | inbytes_remaining = 0; |
815 | } |
816 | else |
817 | done = TRUE; |
818 | } |
819 | } |
820 | |
821 | /* Cleanup |
822 | */ |
823 | memset (s: outp, c: 0, NUL_TERMINATOR_LENGTH); |
824 | |
825 | close_converter (cd); |
826 | |
827 | if (bytes_written) |
828 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
829 | |
830 | g_free (mem: utf8); |
831 | |
832 | if (have_error) |
833 | { |
834 | if (save_p && !fallback) |
835 | g_free (mem: (gchar *)insert_str); |
836 | g_free (mem: dest); |
837 | return NULL; |
838 | } |
839 | else |
840 | return dest; |
841 | } |
842 | |
843 | /* |
844 | * g_locale_to_utf8 |
845 | * |
846 | * |
847 | */ |
848 | |
849 | /* |
850 | * Validate @string as UTF-8. @len can be negative if @string is |
851 | * nul-terminated, or a non-negative value in bytes. If @string ends in an |
852 | * incomplete sequence, or contains any illegal sequences or nul codepoints, |
853 | * %NULL will be returned and the error set to |
854 | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. |
855 | * On success, @bytes_read and @bytes_written, if provided, will be set to |
856 | * the number of bytes in @string up to @len or the terminating nul byte. |
857 | * On error, @bytes_read will be set to the byte offset after the last valid |
858 | * and non-nul UTF-8 sequence in @string, and @bytes_written will be set to 0. |
859 | */ |
860 | static gchar * |
861 | strdup_len (const gchar *string, |
862 | gssize len, |
863 | gsize *bytes_read, |
864 | gsize *bytes_written, |
865 | GError **error) |
866 | { |
867 | gsize real_len; |
868 | const gchar *end_valid; |
869 | |
870 | if (!g_utf8_validate (str: string, max_len: len, end: &end_valid)) |
871 | { |
872 | if (bytes_read) |
873 | *bytes_read = end_valid - string; |
874 | if (bytes_written) |
875 | *bytes_written = 0; |
876 | |
877 | g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
878 | _("Invalid byte sequence in conversion input" )); |
879 | return NULL; |
880 | } |
881 | |
882 | real_len = end_valid - string; |
883 | |
884 | if (bytes_read) |
885 | *bytes_read = real_len; |
886 | if (bytes_written) |
887 | *bytes_written = real_len; |
888 | |
889 | return g_strndup (str: string, n: real_len); |
890 | } |
891 | |
/* Bit flags selecting which embedded-nul checks convert_checked() performs. */
typedef enum
{
  CONVERT_CHECK_NO_NULS_IN_INPUT  = 0x1, /* reject nul bytes in the input */
  CONVERT_CHECK_NO_NULS_IN_OUTPUT = 0x2  /* reject nul bytes in the output */
} ConvertCheckFlags;
897 | |
898 | /* |
899 | * Convert from @string in the encoding identified by @from_codeset, |
900 | * returning a string in the encoding identifed by @to_codeset. |
901 | * @len can be negative if @string is nul-terminated, or a non-negative |
902 | * value in bytes. Flags defined in #ConvertCheckFlags can be set in @flags |
903 | * to check the input, the output, or both, for embedded nul bytes. |
904 | * On success, @bytes_read, if provided, will be set to the number of bytes |
905 | * in @string up to @len or the terminating nul byte, and @bytes_written, if |
906 | * provided, will be set to the number of output bytes written into the |
907 | * returned buffer, excluding the terminating nul sequence. |
908 | * On error, @bytes_read will be set to the byte offset after the last valid |
909 | * sequence in @string, and @bytes_written will be set to 0. |
910 | */ |
911 | static gchar * |
912 | convert_checked (const gchar *string, |
913 | gssize len, |
914 | const gchar *to_codeset, |
915 | const gchar *from_codeset, |
916 | ConvertCheckFlags flags, |
917 | gsize *bytes_read, |
918 | gsize *bytes_written, |
919 | GError **error) |
920 | { |
921 | gchar *out; |
922 | gsize outbytes; |
923 | |
924 | if ((flags & CONVERT_CHECK_NO_NULS_IN_INPUT) && len > 0) |
925 | { |
926 | const gchar *early_nul = memchr (s: string, c: '\0', n: len); |
927 | if (early_nul != NULL) |
928 | { |
929 | if (bytes_read) |
930 | *bytes_read = early_nul - string; |
931 | if (bytes_written) |
932 | *bytes_written = 0; |
933 | |
934 | g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
935 | _("Embedded NUL byte in conversion input" )); |
936 | return NULL; |
937 | } |
938 | } |
939 | |
940 | out = g_convert (str: string, len, to_codeset, from_codeset, |
941 | bytes_read, bytes_written: &outbytes, error); |
942 | if (out == NULL) |
943 | { |
944 | if (bytes_written) |
945 | *bytes_written = 0; |
946 | return NULL; |
947 | } |
948 | |
949 | if ((flags & CONVERT_CHECK_NO_NULS_IN_OUTPUT) |
950 | && memchr (s: out, c: '\0', n: outbytes) != NULL) |
951 | { |
952 | g_free (mem: out); |
953 | if (bytes_written) |
954 | *bytes_written = 0; |
955 | g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_EMBEDDED_NUL, |
956 | _("Embedded NUL byte in conversion output" )); |
957 | return NULL; |
958 | } |
959 | |
960 | if (bytes_written) |
961 | *bytes_written = outbytes; |
962 | return out; |
963 | } |
964 | |
965 | /** |
966 | * g_locale_to_utf8: |
967 | * @opsysstring: (array length=len) (element-type guint8): a string in the |
968 | * encoding of the current locale. On Windows |
969 | * this means the system codepage. |
970 | * @len: the length of the string, or -1 if the string is |
971 | * nul-terminated (Note that some encodings may allow nul |
972 | * bytes to occur inside strings. In that case, using -1 |
973 | * for the @len parameter is unsafe) |
974 | * @bytes_read: (out) (optional): location to store the number of bytes in the |
975 | * input string that were successfully converted, or %NULL. |
976 | * Even if the conversion was successful, this may be |
977 | * less than @len if there were partial characters |
978 | * at the end of the input. If the error |
979 | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
980 | * stored will be the byte offset after the last valid |
981 | * input sequence. |
982 | * @bytes_written: (out) (optional): the number of bytes stored in the output |
983 | * buffer (not including the terminating nul). |
984 | * @error: location to store the error occurring, or %NULL to ignore |
985 | * errors. Any of the errors in #GConvertError may occur. |
986 | * |
987 | * Converts a string which is in the encoding used for strings by |
988 | * the C runtime (usually the same as that used by the operating |
989 | * system) in the [current locale][setlocale] into a UTF-8 string. |
990 | * |
991 | * If the source encoding is not UTF-8 and the conversion output contains a |
992 | * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the |
993 | * function returns %NULL. |
994 | * If the source encoding is UTF-8, an embedded nul character is treated with |
995 | * the %G_CONVERT_ERROR_ILLEGAL_SEQUENCE error for backward compatibility with |
996 | * earlier versions of this library. Use g_convert() to produce output that |
997 | * may contain embedded nul characters. |
998 | * |
999 | * Returns: (type utf8): The converted string, or %NULL on an error. |
1000 | **/ |
1001 | gchar * |
1002 | g_locale_to_utf8 (const gchar *opsysstring, |
1003 | gssize len, |
1004 | gsize *bytes_read, |
1005 | gsize *bytes_written, |
1006 | GError **error) |
1007 | { |
1008 | const char *charset; |
1009 | |
1010 | if (g_get_charset (charset: &charset)) |
1011 | return strdup_len (string: opsysstring, len, bytes_read, bytes_written, error); |
1012 | else |
1013 | return convert_checked (string: opsysstring, len, to_codeset: "UTF-8" , from_codeset: charset, |
1014 | flags: CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1015 | bytes_read, bytes_written, error); |
1016 | } |
1017 | |
1018 | /** |
1019 | * g_locale_from_utf8: |
1020 | * @utf8string: a UTF-8 encoded string |
1021 | * @len: the length of the string, or -1 if the string is |
1022 | * nul-terminated. |
1023 | * @bytes_read: (out) (optional): location to store the number of bytes in the |
1024 | * input string that were successfully converted, or %NULL. |
1025 | * Even if the conversion was successful, this may be |
1026 | * less than @len if there were partial characters |
1027 | * at the end of the input. If the error |
1028 | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1029 | * stored will be the byte offset after the last valid |
1030 | * input sequence. |
1031 | * @bytes_written: (out) (optional): the number of bytes stored in the output |
1032 | * buffer (not including the terminating nul). |
1033 | * @error: location to store the error occurring, or %NULL to ignore |
1034 | * errors. Any of the errors in #GConvertError may occur. |
1035 | * |
1036 | * Converts a string from UTF-8 to the encoding used for strings by |
1037 | * the C runtime (usually the same as that used by the operating |
1038 | * system) in the [current locale][setlocale]. On Windows this means |
1039 | * the system codepage. |
1040 | * |
1041 | * The input string shall not contain nul characters even if the @len |
1042 | * argument is positive. A nul character found inside the string will result |
1043 | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. Use g_convert() to convert |
1044 | * input that may contain embedded nul characters. |
1045 | * |
1046 | * Returns: (array length=bytes_written) (element-type guint8) (transfer full): |
1047 | * A newly-allocated buffer containing the converted string, |
1048 | * or %NULL on an error, and error will be set. |
1049 | **/ |
1050 | gchar * |
1051 | g_locale_from_utf8 (const gchar *utf8string, |
1052 | gssize len, |
1053 | gsize *bytes_read, |
1054 | gsize *bytes_written, |
1055 | GError **error) |
1056 | { |
1057 | const gchar *charset; |
1058 | |
1059 | if (g_get_charset (charset: &charset)) |
1060 | return strdup_len (string: utf8string, len, bytes_read, bytes_written, error); |
1061 | else |
1062 | return convert_checked (string: utf8string, len, to_codeset: charset, from_codeset: "UTF-8" , |
1063 | flags: CONVERT_CHECK_NO_NULS_IN_INPUT, |
1064 | bytes_read, bytes_written, error); |
1065 | } |
1066 | |
1067 | #ifndef G_PLATFORM_WIN32 |
1068 | |
/* Cached answer of g_get_filename_charsets(), stored through a GPrivate. */
typedef struct _GFilenameCharsetCache GFilenameCharsetCache;

struct _GFilenameCharsetCache {
  /* Value returned by g_get_filename_charsets() for the cached list */
  gboolean is_utf8;
  /* Copy of the locale charset the cache was computed for; compared with
   * the current locale charset to decide whether to recompute */
  gchar *charset;
  /* NULL-terminated list of charset names handed out to callers */
  gchar **filename_charsets;
};
1076 | |
1077 | static void |
1078 | filename_charset_cache_free (gpointer data) |
1079 | { |
1080 | GFilenameCharsetCache *cache = data; |
1081 | g_free (mem: cache->charset); |
1082 | g_strfreev (str_array: cache->filename_charsets); |
1083 | g_free (mem: cache); |
1084 | } |
1085 | |
1086 | /** |
1087 | * g_get_filename_charsets: |
1088 | * @filename_charsets: (out) (transfer none) (array zero-terminated=1): |
1089 | * return location for the %NULL-terminated list of encoding names |
1090 | * |
1091 | * Determines the preferred character sets used for filenames. |
 * The first character set in @filename_charsets is the filename encoding, the
1093 | * subsequent character sets are used when trying to generate a displayable |
1094 | * representation of a filename, see g_filename_display_name(). |
1095 | * |
1096 | * On Unix, the character sets are determined by consulting the |
1097 | * environment variables `G_FILENAME_ENCODING` and `G_BROKEN_FILENAMES`. |
1098 | * On Windows, the character set used in the GLib API is always UTF-8 |
1099 | * and said environment variables have no effect. |
1100 | * |
1101 | * `G_FILENAME_ENCODING` may be set to a comma-separated list of |
1102 | * character set names. The special token "\@locale" is taken |
1103 | * to mean the character set for the [current locale][setlocale]. |
1104 | * If `G_FILENAME_ENCODING` is not set, but `G_BROKEN_FILENAMES` is, |
1105 | * the character set of the current locale is taken as the filename |
1106 | * encoding. If neither environment variable is set, UTF-8 is taken |
1107 | * as the filename encoding, but the character set of the current locale |
1108 | * is also put in the list of encodings. |
1109 | * |
 * The returned @filename_charsets belong to GLib and must not be freed.
1111 | * |
1112 | * Note that on Unix, regardless of the locale character set or |
1113 | * `G_FILENAME_ENCODING` value, the actual file names present |
1114 | * on a system might be in any random encoding or just gibberish. |
1115 | * |
1116 | * Returns: %TRUE if the filename encoding is UTF-8. |
1117 | * |
1118 | * Since: 2.6 |
1119 | */ |
1120 | gboolean |
1121 | g_get_filename_charsets (const gchar ***filename_charsets) |
1122 | { |
1123 | static GPrivate cache_private = G_PRIVATE_INIT (filename_charset_cache_free); |
1124 | GFilenameCharsetCache *cache = g_private_get (key: &cache_private); |
1125 | const gchar *charset; |
1126 | |
1127 | if (!cache) |
1128 | cache = g_private_set_alloc0 (key: &cache_private, size: sizeof (GFilenameCharsetCache)); |
1129 | |
1130 | g_get_charset (charset: &charset); |
1131 | |
1132 | if (!(cache->charset && strcmp (s1: cache->charset, s2: charset) == 0)) |
1133 | { |
1134 | const gchar *new_charset; |
1135 | const gchar *p; |
1136 | gint i; |
1137 | |
1138 | g_free (mem: cache->charset); |
1139 | g_strfreev (str_array: cache->filename_charsets); |
1140 | cache->charset = g_strdup (str: charset); |
1141 | |
1142 | p = g_getenv (variable: "G_FILENAME_ENCODING" ); |
1143 | if (p != NULL && p[0] != '\0') |
1144 | { |
1145 | cache->filename_charsets = g_strsplit (string: p, delimiter: "," , max_tokens: 0); |
1146 | cache->is_utf8 = (strcmp (s1: cache->filename_charsets[0], s2: "UTF-8" ) == 0); |
1147 | |
1148 | for (i = 0; cache->filename_charsets[i]; i++) |
1149 | { |
1150 | if (strcmp (s1: "@locale" , s2: cache->filename_charsets[i]) == 0) |
1151 | { |
1152 | g_get_charset (charset: &new_charset); |
1153 | g_free (mem: cache->filename_charsets[i]); |
1154 | cache->filename_charsets[i] = g_strdup (str: new_charset); |
1155 | } |
1156 | } |
1157 | } |
1158 | else if (g_getenv (variable: "G_BROKEN_FILENAMES" ) != NULL) |
1159 | { |
1160 | cache->filename_charsets = g_new0 (gchar *, 2); |
1161 | cache->is_utf8 = g_get_charset (charset: &new_charset); |
1162 | cache->filename_charsets[0] = g_strdup (str: new_charset); |
1163 | } |
1164 | else |
1165 | { |
1166 | cache->filename_charsets = g_new0 (gchar *, 3); |
1167 | cache->is_utf8 = TRUE; |
1168 | cache->filename_charsets[0] = g_strdup (str: "UTF-8" ); |
1169 | if (!g_get_charset (charset: &new_charset)) |
1170 | cache->filename_charsets[1] = g_strdup (str: new_charset); |
1171 | } |
1172 | } |
1173 | |
1174 | if (filename_charsets) |
1175 | *filename_charsets = (const gchar **)cache->filename_charsets; |
1176 | |
1177 | return cache->is_utf8; |
1178 | } |
1179 | |
1180 | #else /* G_PLATFORM_WIN32 */ |
1181 | |
1182 | gboolean |
1183 | g_get_filename_charsets (const gchar ***filename_charsets) |
1184 | { |
1185 | static const gchar *charsets[] = { |
1186 | "UTF-8" , |
1187 | NULL |
1188 | }; |
1189 | |
1190 | #ifdef G_OS_WIN32 |
1191 | /* On Windows GLib pretends that the filename charset is UTF-8 */ |
1192 | if (filename_charsets) |
1193 | *filename_charsets = charsets; |
1194 | |
1195 | return TRUE; |
1196 | #else |
1197 | gboolean result; |
1198 | |
1199 | /* Cygwin works like before */ |
1200 | result = g_get_charset (&(charsets[0])); |
1201 | |
1202 | if (filename_charsets) |
1203 | *filename_charsets = charsets; |
1204 | |
1205 | return result; |
1206 | #endif |
1207 | } |
1208 | |
1209 | #endif /* G_PLATFORM_WIN32 */ |
1210 | |
1211 | static gboolean |
1212 | get_filename_charset (const gchar **filename_charset) |
1213 | { |
1214 | const gchar **charsets; |
1215 | gboolean is_utf8; |
1216 | |
1217 | is_utf8 = g_get_filename_charsets (filename_charsets: &charsets); |
1218 | |
1219 | if (filename_charset) |
1220 | *filename_charset = charsets[0]; |
1221 | |
1222 | return is_utf8; |
1223 | } |
1224 | |
1225 | /** |
1226 | * g_filename_to_utf8: |
1227 | * @opsysstring: (type filename): a string in the encoding for filenames |
1228 | * @len: the length of the string, or -1 if the string is |
1229 | * nul-terminated (Note that some encodings may allow nul |
1230 | * bytes to occur inside strings. In that case, using -1 |
1231 | * for the @len parameter is unsafe) |
1232 | * @bytes_read: (out) (optional): location to store the number of bytes in the |
1233 | * input string that were successfully converted, or %NULL. |
1234 | * Even if the conversion was successful, this may be |
1235 | * less than @len if there were partial characters |
1236 | * at the end of the input. If the error |
1237 | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1238 | * stored will be the byte offset after the last valid |
1239 | * input sequence. |
1240 | * @bytes_written: (out) (optional): the number of bytes stored in the output |
1241 | * buffer (not including the terminating nul). |
1242 | * @error: location to store the error occurring, or %NULL to ignore |
1243 | * errors. Any of the errors in #GConvertError may occur. |
1244 | * |
1245 | * Converts a string which is in the encoding used by GLib for |
1246 | * filenames into a UTF-8 string. Note that on Windows GLib uses UTF-8 |
1247 | * for filenames; on other platforms, this function indirectly depends on |
1248 | * the [current locale][setlocale]. |
1249 | * |
1250 | * The input string shall not contain nul characters even if the @len |
1251 | * argument is positive. A nul character found inside the string will result |
1252 | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. |
1253 | * If the source encoding is not UTF-8 and the conversion output contains a |
1254 | * nul character, the error %G_CONVERT_ERROR_EMBEDDED_NUL is set and the |
1255 | * function returns %NULL. Use g_convert() to produce output that |
1256 | * may contain embedded nul characters. |
1257 | * |
1258 | * Returns: (type utf8): The converted string, or %NULL on an error. |
1259 | **/ |
1260 | gchar* |
1261 | g_filename_to_utf8 (const gchar *opsysstring, |
1262 | gssize len, |
1263 | gsize *bytes_read, |
1264 | gsize *bytes_written, |
1265 | GError **error) |
1266 | { |
1267 | const gchar *charset; |
1268 | |
1269 | g_return_val_if_fail (opsysstring != NULL, NULL); |
1270 | |
1271 | if (get_filename_charset (filename_charset: &charset)) |
1272 | return strdup_len (string: opsysstring, len, bytes_read, bytes_written, error); |
1273 | else |
1274 | return convert_checked (string: opsysstring, len, to_codeset: "UTF-8" , from_codeset: charset, |
1275 | flags: CONVERT_CHECK_NO_NULS_IN_INPUT | |
1276 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1277 | bytes_read, bytes_written, error); |
1278 | } |
1279 | |
1280 | /** |
1281 | * g_filename_from_utf8: |
1282 | * @utf8string: (type utf8): a UTF-8 encoded string. |
1283 | * @len: the length of the string, or -1 if the string is |
1284 | * nul-terminated. |
1285 | * @bytes_read: (out) (optional): location to store the number of bytes in |
1286 | * the input string that were successfully converted, or %NULL. |
1287 | * Even if the conversion was successful, this may be |
1288 | * less than @len if there were partial characters |
1289 | * at the end of the input. If the error |
1290 | * %G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
1291 | * stored will be the byte offset after the last valid |
1292 | * input sequence. |
1293 | * @bytes_written: (out) (optional): the number of bytes stored in |
1294 | * the output buffer (not including the terminating nul). |
1295 | * @error: location to store the error occurring, or %NULL to ignore |
1296 | * errors. Any of the errors in #GConvertError may occur. |
1297 | * |
1298 | * Converts a string from UTF-8 to the encoding GLib uses for |
1299 | * filenames. Note that on Windows GLib uses UTF-8 for filenames; |
1300 | * on other platforms, this function indirectly depends on the |
1301 | * [current locale][setlocale]. |
1302 | * |
1303 | * The input string shall not contain nul characters even if the @len |
1304 | * argument is positive. A nul character found inside the string will result |
1305 | * in error %G_CONVERT_ERROR_ILLEGAL_SEQUENCE. If the filename encoding is |
1306 | * not UTF-8 and the conversion output contains a nul character, the error |
1307 | * %G_CONVERT_ERROR_EMBEDDED_NUL is set and the function returns %NULL. |
1308 | * |
1309 | * Returns: (type filename): |
1310 | * The converted string, or %NULL on an error. |
1311 | **/ |
1312 | gchar* |
1313 | g_filename_from_utf8 (const gchar *utf8string, |
1314 | gssize len, |
1315 | gsize *bytes_read, |
1316 | gsize *bytes_written, |
1317 | GError **error) |
1318 | { |
1319 | const gchar *charset; |
1320 | |
1321 | if (get_filename_charset (filename_charset: &charset)) |
1322 | return strdup_len (string: utf8string, len, bytes_read, bytes_written, error); |
1323 | else |
1324 | return convert_checked (string: utf8string, len, to_codeset: charset, from_codeset: "UTF-8" , |
1325 | flags: CONVERT_CHECK_NO_NULS_IN_INPUT | |
1326 | CONVERT_CHECK_NO_NULS_IN_OUTPUT, |
1327 | bytes_read, bytes_written, error); |
1328 | } |
1329 | |
1330 | /* Test of haystack has the needle prefix, comparing case |
1331 | * insensitive. haystack may be UTF-8, but needle must |
1332 | * contain only ascii. */ |
1333 | static gboolean |
1334 | has_case_prefix (const gchar *haystack, const gchar *needle) |
1335 | { |
1336 | const gchar *h, *n; |
1337 | |
1338 | /* Eat one character at a time. */ |
1339 | h = haystack; |
1340 | n = needle; |
1341 | |
1342 | while (*n && *h && |
1343 | g_ascii_tolower (c: *n) == g_ascii_tolower (c: *h)) |
1344 | { |
1345 | n++; |
1346 | h++; |
1347 | } |
1348 | |
1349 | return *n == '\0'; |
1350 | } |
1351 | |
/* Escaping policies; each value is one bit looked up in acceptable[]. */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+'  */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
1359 | |
/* Per-character escaping table. Each entry is a bitmask that the
 * ACCEPTABLE() macro in g_escape_uri_string() tests against the
 * UnsafeCharacterSet value in use: a character is copied unescaped when
 * its entry has that bit set. The table is indexed by (character - 32). */
static const guchar acceptable[96] = {
  /* A table of the ASCII chars from space (32) to DEL (127) */
  /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */
  0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
  /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
  /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
  0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
  /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
};
1375 | |
/* Upper-case hexadecimal digits indexed by nibble value. The array is
 * exactly 16 bytes long, so it carries no terminating nul. */
static const gchar hex[16] = "0123456789ABCDEF";
1377 | |
1378 | /* Note: This escape function works on file: URIs, but if you want to |
1379 | * escape something else, please read RFC-2396 */ |
1380 | static gchar * |
1381 | g_escape_uri_string (const gchar *string, |
1382 | UnsafeCharacterSet mask) |
1383 | { |
1384 | #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask)) |
1385 | |
1386 | const gchar *p; |
1387 | gchar *q; |
1388 | gchar *result; |
1389 | int c; |
1390 | gint unacceptable; |
1391 | UnsafeCharacterSet use_mask; |
1392 | |
1393 | g_return_val_if_fail (mask == UNSAFE_ALL |
1394 | || mask == UNSAFE_ALLOW_PLUS |
1395 | || mask == UNSAFE_PATH |
1396 | || mask == UNSAFE_HOST |
1397 | || mask == UNSAFE_SLASHES, NULL); |
1398 | |
1399 | unacceptable = 0; |
1400 | use_mask = mask; |
1401 | for (p = string; *p != '\0'; p++) |
1402 | { |
1403 | c = (guchar) *p; |
1404 | if (!ACCEPTABLE (c)) |
1405 | unacceptable++; |
1406 | } |
1407 | |
1408 | result = g_malloc (n_bytes: p - string + unacceptable * 2 + 1); |
1409 | |
1410 | use_mask = mask; |
1411 | for (q = result, p = string; *p != '\0'; p++) |
1412 | { |
1413 | c = (guchar) *p; |
1414 | |
1415 | if (!ACCEPTABLE (c)) |
1416 | { |
1417 | *q++ = '%'; /* means hex coming */ |
1418 | *q++ = hex[c >> 4]; |
1419 | *q++ = hex[c & 15]; |
1420 | } |
1421 | else |
1422 | *q++ = *p; |
1423 | } |
1424 | |
1425 | *q = '\0'; |
1426 | |
1427 | return result; |
1428 | } |
1429 | |
1430 | |
1431 | static gchar * |
1432 | g_escape_file_uri (const gchar *hostname, |
1433 | const gchar *pathname) |
1434 | { |
1435 | char *escaped_hostname = NULL; |
1436 | char *escaped_path; |
1437 | char *res; |
1438 | |
1439 | #ifdef G_OS_WIN32 |
1440 | char *p, *backslash; |
1441 | |
1442 | /* Turn backslashes into forward slashes. That's what Netscape |
1443 | * does, and they are actually more or less equivalent in Windows. |
1444 | */ |
1445 | |
1446 | pathname = g_strdup (pathname); |
1447 | p = (char *) pathname; |
1448 | |
1449 | while ((backslash = strchr (p, '\\')) != NULL) |
1450 | { |
1451 | *backslash = '/'; |
1452 | p = backslash + 1; |
1453 | } |
1454 | #endif |
1455 | |
1456 | if (hostname && *hostname != '\0') |
1457 | { |
1458 | escaped_hostname = g_escape_uri_string (string: hostname, mask: UNSAFE_HOST); |
1459 | } |
1460 | |
1461 | escaped_path = g_escape_uri_string (string: pathname, mask: UNSAFE_PATH); |
1462 | |
1463 | res = g_strconcat (string1: "file://" , |
1464 | (escaped_hostname) ? escaped_hostname : "" , |
1465 | (*escaped_path != '/') ? "/" : "" , |
1466 | escaped_path, |
1467 | NULL); |
1468 | |
1469 | #ifdef G_OS_WIN32 |
1470 | g_free ((char *) pathname); |
1471 | #endif |
1472 | |
1473 | g_free (mem: escaped_hostname); |
1474 | g_free (mem: escaped_path); |
1475 | |
1476 | return res; |
1477 | } |
1478 | |
/* Decode the two hexadecimal digits at @scanner into a byte value.
 * Returns -1 when either character is not a hexadecimal digit; the second
 * character is not examined when the first one is already invalid. */
static int
unescape_character (const char *scanner)
{
  int high_nibble;
  int low_nibble;

  high_nibble = g_ascii_xdigit_value (scanner[0]);
  if (high_nibble < 0)
    return -1;

  low_nibble = g_ascii_xdigit_value (scanner[1]);
  if (low_nibble < 0)
    return -1;

  return (high_nibble << 4) | low_nibble;
}
1495 | |
1496 | static gchar * |
1497 | g_unescape_uri_string (const char *escaped, |
1498 | int len, |
1499 | const char *illegal_escaped_characters, |
1500 | gboolean ascii_must_not_be_escaped) |
1501 | { |
1502 | const gchar *in, *in_end; |
1503 | gchar *out, *result; |
1504 | int c; |
1505 | |
1506 | if (escaped == NULL) |
1507 | return NULL; |
1508 | |
1509 | if (len < 0) |
1510 | len = strlen (s: escaped); |
1511 | |
1512 | result = g_malloc (n_bytes: len + 1); |
1513 | |
1514 | out = result; |
1515 | for (in = escaped, in_end = escaped + len; in < in_end; in++) |
1516 | { |
1517 | c = *in; |
1518 | |
1519 | if (c == '%') |
1520 | { |
1521 | /* catch partial escape sequences past the end of the substring */ |
1522 | if (in + 3 > in_end) |
1523 | break; |
1524 | |
1525 | c = unescape_character (scanner: in + 1); |
1526 | |
1527 | /* catch bad escape sequences and NUL characters */ |
1528 | if (c <= 0) |
1529 | break; |
1530 | |
1531 | /* catch escaped ASCII */ |
1532 | if (ascii_must_not_be_escaped && c <= 0x7F) |
1533 | break; |
1534 | |
1535 | /* catch other illegal escaped characters */ |
1536 | if (strchr (s: illegal_escaped_characters, c: c) != NULL) |
1537 | break; |
1538 | |
1539 | in += 2; |
1540 | } |
1541 | |
1542 | *out++ = c; |
1543 | } |
1544 | |
1545 | g_assert (out - result <= len); |
1546 | *out = '\0'; |
1547 | |
1548 | if (in != in_end) |
1549 | { |
1550 | g_free (mem: result); |
1551 | return NULL; |
1552 | } |
1553 | |
1554 | return result; |
1555 | } |
1556 | |
1557 | static gboolean |
1558 | is_asciialphanum (gunichar c) |
1559 | { |
1560 | return c <= 0x7F && g_ascii_isalnum (c); |
1561 | } |
1562 | |
1563 | static gboolean |
1564 | is_asciialpha (gunichar c) |
1565 | { |
1566 | return c <= 0x7F && g_ascii_isalpha (c); |
1567 | } |
1568 | |
1569 | /* allows an empty string */ |
1570 | static gboolean |
1571 | hostname_validate (const char *hostname) |
1572 | { |
1573 | const char *p; |
1574 | gunichar c, first_char, last_char; |
1575 | |
1576 | p = hostname; |
1577 | if (*p == '\0') |
1578 | return TRUE; |
1579 | do |
1580 | { |
1581 | /* read in a label */ |
1582 | c = g_utf8_get_char (p); |
1583 | p = g_utf8_next_char (p); |
1584 | if (!is_asciialphanum (c)) |
1585 | return FALSE; |
1586 | first_char = c; |
1587 | do |
1588 | { |
1589 | last_char = c; |
1590 | c = g_utf8_get_char (p); |
1591 | p = g_utf8_next_char (p); |
1592 | } |
1593 | while (is_asciialphanum (c) || c == '-'); |
1594 | if (last_char == '-') |
1595 | return FALSE; |
1596 | |
1597 | /* if that was the last label, check that it was a toplabel */ |
1598 | if (c == '\0' || (c == '.' && *p == '\0')) |
1599 | return is_asciialpha (c: first_char); |
1600 | } |
1601 | while (c == '.'); |
1602 | return FALSE; |
1603 | } |
1604 | |
1605 | /** |
1606 | * g_filename_from_uri: |
1607 | * @uri: a uri describing a filename (escaped, encoded in ASCII). |
1608 | * @hostname: (out) (optional) (nullable): Location to store hostname for the URI. |
1609 | * If there is no hostname in the URI, %NULL will be |
1610 | * stored in this location. |
1611 | * @error: location to store the error occurring, or %NULL to ignore |
1612 | * errors. Any of the errors in #GConvertError may occur. |
1613 | * |
1614 | * Converts an escaped ASCII-encoded URI to a local filename in the |
1615 | * encoding used for filenames. |
1616 | * |
1617 | * Returns: (type filename): a newly-allocated string holding |
1618 | * the resulting filename, or %NULL on an error. |
1619 | **/ |
1620 | gchar * |
1621 | g_filename_from_uri (const gchar *uri, |
1622 | gchar **hostname, |
1623 | GError **error) |
1624 | { |
1625 | const char *path_part; |
1626 | const char *host_part; |
1627 | char *unescaped_hostname; |
1628 | char *result; |
1629 | char *filename; |
1630 | int offs; |
1631 | #ifdef G_OS_WIN32 |
1632 | char *p, *slash; |
1633 | #endif |
1634 | |
1635 | if (hostname) |
1636 | *hostname = NULL; |
1637 | |
1638 | if (!has_case_prefix (haystack: uri, needle: "file:/" )) |
1639 | { |
1640 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI, |
1641 | _("The URI “%s” is not an absolute URI using the “file” scheme" ), |
1642 | uri); |
1643 | return NULL; |
1644 | } |
1645 | |
1646 | path_part = uri + strlen (s: "file:" ); |
1647 | |
1648 | if (strchr (s: path_part, c: '#') != NULL) |
1649 | { |
1650 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI, |
1651 | _("The local file URI “%s” may not include a “#”" ), |
1652 | uri); |
1653 | return NULL; |
1654 | } |
1655 | |
1656 | if (has_case_prefix (haystack: path_part, needle: "///" )) |
1657 | path_part += 2; |
1658 | else if (has_case_prefix (haystack: path_part, needle: "//" )) |
1659 | { |
1660 | path_part += 2; |
1661 | host_part = path_part; |
1662 | |
1663 | path_part = strchr (s: path_part, c: '/'); |
1664 | |
1665 | if (path_part == NULL) |
1666 | { |
1667 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI, |
1668 | _("The URI “%s” is invalid" ), |
1669 | uri); |
1670 | return NULL; |
1671 | } |
1672 | |
1673 | unescaped_hostname = g_unescape_uri_string (escaped: host_part, len: path_part - host_part, illegal_escaped_characters: "" , TRUE); |
1674 | |
1675 | if (unescaped_hostname == NULL || |
1676 | !hostname_validate (hostname: unescaped_hostname)) |
1677 | { |
1678 | g_free (mem: unescaped_hostname); |
1679 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI, |
1680 | _("The hostname of the URI “%s” is invalid" ), |
1681 | uri); |
1682 | return NULL; |
1683 | } |
1684 | |
1685 | if (hostname) |
1686 | *hostname = unescaped_hostname; |
1687 | else |
1688 | g_free (mem: unescaped_hostname); |
1689 | } |
1690 | |
1691 | filename = g_unescape_uri_string (escaped: path_part, len: -1, illegal_escaped_characters: "/" , FALSE); |
1692 | |
1693 | if (filename == NULL) |
1694 | { |
1695 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_BAD_URI, |
1696 | _("The URI “%s” contains invalidly escaped characters" ), |
1697 | uri); |
1698 | return NULL; |
1699 | } |
1700 | |
1701 | offs = 0; |
1702 | #ifdef G_OS_WIN32 |
1703 | /* Drop localhost */ |
1704 | if (hostname && *hostname != NULL && |
1705 | g_ascii_strcasecmp (*hostname, "localhost" ) == 0) |
1706 | { |
1707 | g_free (*hostname); |
1708 | *hostname = NULL; |
1709 | } |
1710 | |
1711 | /* Turn slashes into backslashes, because that's the canonical spelling */ |
1712 | p = filename; |
1713 | while ((slash = strchr (p, '/')) != NULL) |
1714 | { |
1715 | *slash = '\\'; |
1716 | p = slash + 1; |
1717 | } |
1718 | |
1719 | /* Windows URIs with a drive letter can be like "file://host/c:/foo" |
1720 | * or "file://host/c|/foo" (some Netscape versions). In those cases, start |
1721 | * the filename from the drive letter. |
1722 | */ |
1723 | if (g_ascii_isalpha (filename[1])) |
1724 | { |
1725 | if (filename[2] == ':') |
1726 | offs = 1; |
1727 | else if (filename[2] == '|') |
1728 | { |
1729 | filename[2] = ':'; |
1730 | offs = 1; |
1731 | } |
1732 | } |
1733 | #endif |
1734 | |
1735 | result = g_strdup (str: filename + offs); |
1736 | g_free (mem: filename); |
1737 | |
1738 | return result; |
1739 | } |
1740 | |
1741 | /** |
1742 | * g_filename_to_uri: |
1743 | * @filename: (type filename): an absolute filename specified in the GLib file |
1744 | * name encoding, which is the on-disk file name bytes on Unix, and UTF-8 |
1745 | * on Windows |
1746 | * @hostname: (nullable): A UTF-8 encoded hostname, or %NULL for none. |
1747 | * @error: location to store the error occurring, or %NULL to ignore |
1748 | * errors. Any of the errors in #GConvertError may occur. |
1749 | * |
1750 | * Converts an absolute filename to an escaped ASCII-encoded URI, with the path |
1751 | * component following Section 3.3. of RFC 2396. |
1752 | * |
1753 | * Returns: a newly-allocated string holding the resulting |
1754 | * URI, or %NULL on an error. |
1755 | **/ |
1756 | gchar * |
1757 | g_filename_to_uri (const gchar *filename, |
1758 | const gchar *hostname, |
1759 | GError **error) |
1760 | { |
1761 | char *escaped_uri; |
1762 | |
1763 | g_return_val_if_fail (filename != NULL, NULL); |
1764 | |
1765 | if (!g_path_is_absolute (file_name: filename)) |
1766 | { |
1767 | g_set_error (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, |
1768 | _("The pathname “%s” is not an absolute path" ), |
1769 | filename); |
1770 | return NULL; |
1771 | } |
1772 | |
1773 | if (hostname && |
1774 | !(g_utf8_validate (str: hostname, max_len: -1, NULL) |
1775 | && hostname_validate (hostname))) |
1776 | { |
1777 | g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
1778 | _("Invalid hostname" )); |
1779 | return NULL; |
1780 | } |
1781 | |
1782 | #ifdef G_OS_WIN32 |
1783 | /* Don't use localhost unnecessarily */ |
1784 | if (hostname && g_ascii_strcasecmp (hostname, "localhost" ) == 0) |
1785 | hostname = NULL; |
1786 | #endif |
1787 | |
1788 | escaped_uri = g_escape_file_uri (hostname, pathname: filename); |
1789 | |
1790 | return escaped_uri; |
1791 | } |
1792 | |
1793 | /** |
1794 | * g_uri_list_extract_uris: |
1795 | * @uri_list: an URI list |
1796 | * |
1797 | * Splits an URI list conforming to the text/uri-list |
1798 | * mime type defined in RFC 2483 into individual URIs, |
1799 | * discarding any comments. The URIs are not validated. |
1800 | * |
1801 | * Returns: (transfer full): a newly allocated %NULL-terminated list |
1802 | * of strings holding the individual URIs. The array should be freed |
1803 | * with g_strfreev(). |
1804 | * |
1805 | * Since: 2.6 |
1806 | */ |
1807 | gchar ** |
1808 | (const gchar *uri_list) |
1809 | { |
1810 | GPtrArray *uris; |
1811 | const gchar *p, *q; |
1812 | |
1813 | uris = g_ptr_array_new (); |
1814 | |
1815 | p = uri_list; |
1816 | |
1817 | /* We don't actually try to validate the URI according to RFC |
1818 | * 2396, or even check for allowed characters - we just ignore |
1819 | * comments and trim whitespace off the ends. We also |
1820 | * allow LF delimination as well as the specified CRLF. |
1821 | * |
1822 | * We do allow comments like specified in RFC 2483. |
1823 | */ |
1824 | while (p) |
1825 | { |
1826 | if (*p != '#') |
1827 | { |
1828 | while (g_ascii_isspace (*p)) |
1829 | p++; |
1830 | |
1831 | q = p; |
1832 | while (*q && (*q != '\n') && (*q != '\r')) |
1833 | q++; |
1834 | |
1835 | if (q > p) |
1836 | { |
1837 | q--; |
1838 | while (q > p && g_ascii_isspace (*q)) |
1839 | q--; |
1840 | |
1841 | if (q > p) |
1842 | g_ptr_array_add (array: uris, data: g_strndup (str: p, n: q - p + 1)); |
1843 | } |
1844 | } |
1845 | p = strchr (s: p, c: '\n'); |
1846 | if (p) |
1847 | p++; |
1848 | } |
1849 | |
1850 | g_ptr_array_add (array: uris, NULL); |
1851 | |
1852 | return (gchar **) g_ptr_array_free (array: uris, FALSE); |
1853 | } |
1854 | |
1855 | /** |
1856 | * g_filename_display_basename: |
1857 | * @filename: (type filename): an absolute pathname in the |
1858 | * GLib file name encoding |
1859 | * |
1860 | * Returns the display basename for the particular filename, guaranteed |
1861 | * to be valid UTF-8. The display name might not be identical to the filename, |
1862 | * for instance there might be problems converting it to UTF-8, and some files |
1863 | * can be translated in the display. |
1864 | * |
1865 | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
1866 | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
1867 | * You can search the result for the UTF-8 encoding of this character (which is |
1868 | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
1869 | * encoding. |
1870 | * |
1871 | * You must pass the whole absolute pathname to this functions so that |
1872 | * translation of well known locations can be done. |
1873 | * |
1874 | * This function is preferred over g_filename_display_name() if you know the |
1875 | * whole path, as it allows translation. |
1876 | * |
1877 | * Returns: a newly allocated string containing |
1878 | * a rendition of the basename of the filename in valid UTF-8 |
1879 | * |
1880 | * Since: 2.6 |
1881 | **/ |
1882 | gchar * |
1883 | g_filename_display_basename (const gchar *filename) |
1884 | { |
1885 | char *basename; |
1886 | char *display_name; |
1887 | |
1888 | g_return_val_if_fail (filename != NULL, NULL); |
1889 | |
1890 | basename = g_path_get_basename (file_name: filename); |
1891 | display_name = g_filename_display_name (filename: basename); |
1892 | g_free (mem: basename); |
1893 | return display_name; |
1894 | } |
1895 | |
1896 | /** |
1897 | * g_filename_display_name: |
1898 | * @filename: (type filename): a pathname hopefully in the |
1899 | * GLib file name encoding |
1900 | * |
1901 | * Converts a filename into a valid UTF-8 string. The conversion is |
1902 | * not necessarily reversible, so you should keep the original around |
1903 | * and use the return value of this function only for display purposes. |
1904 | * Unlike g_filename_to_utf8(), the result is guaranteed to be non-%NULL |
1905 | * even if the filename actually isn't in the GLib file name encoding. |
1906 | * |
1907 | * If GLib cannot make sense of the encoding of @filename, as a last resort it |
1908 | * replaces unknown characters with U+FFFD, the Unicode replacement character. |
1909 | * You can search the result for the UTF-8 encoding of this character (which is |
1910 | * "\357\277\275" in octal notation) to find out if @filename was in an invalid |
1911 | * encoding. |
1912 | * |
1913 | * If you know the whole pathname of the file you should use |
1914 | * g_filename_display_basename(), since that allows location-based |
1915 | * translation of filenames. |
1916 | * |
1917 | * Returns: a newly allocated string containing |
1918 | * a rendition of the filename in valid UTF-8 |
1919 | * |
1920 | * Since: 2.6 |
1921 | **/ |
1922 | gchar * |
1923 | g_filename_display_name (const gchar *filename) |
1924 | { |
1925 | gint i; |
1926 | const gchar **charsets; |
1927 | gchar *display_name = NULL; |
1928 | gboolean is_utf8; |
1929 | |
1930 | is_utf8 = g_get_filename_charsets (filename_charsets: &charsets); |
1931 | |
1932 | if (is_utf8) |
1933 | { |
1934 | if (g_utf8_validate (str: filename, max_len: -1, NULL)) |
1935 | display_name = g_strdup (str: filename); |
1936 | } |
1937 | |
1938 | if (!display_name) |
1939 | { |
1940 | /* Try to convert from the filename charsets to UTF-8. |
1941 | * Skip the first charset if it is UTF-8. |
1942 | */ |
1943 | for (i = is_utf8 ? 1 : 0; charsets[i]; i++) |
1944 | { |
1945 | display_name = g_convert (str: filename, len: -1, to_codeset: "UTF-8" , from_codeset: charsets[i], |
1946 | NULL, NULL, NULL); |
1947 | |
1948 | if (display_name) |
1949 | break; |
1950 | } |
1951 | } |
1952 | |
1953 | /* if all conversions failed, we replace invalid UTF-8 |
1954 | * by a question mark |
1955 | */ |
1956 | if (!display_name) |
1957 | display_name = g_utf8_make_valid (str: filename, len: -1); |
1958 | |
1959 | return display_name; |
1960 | } |
1961 | |
1962 | #ifdef G_OS_WIN32 |
1963 | |
/* Binary compatibility versions. Not for newly compiled code.
 *
 * Prototypes for the "_utf8"-suffixed entry points defined further down
 * in this Windows-only section. Each of those definitions simply forwards
 * its arguments to the function of the same name without the suffix.
 */

_GLIB_EXTERN gchar *g_filename_to_utf8_utf8 (const gchar *opsysstring,
                                             gssize len,
                                             gsize *bytes_read,
                                             gsize *bytes_written,
                                             GError **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_from_utf8_utf8 (const gchar *utf8string,
                                               gssize len,
                                               gsize *bytes_read,
                                               gsize *bytes_written,
                                               GError **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_from_uri_utf8 (const gchar *uri,
                                              gchar **hostname,
                                              GError **error) G_GNUC_MALLOC;
_GLIB_EXTERN gchar *g_filename_to_uri_utf8 (const gchar *filename,
                                            const gchar *hostname,
                                            GError **error) G_GNUC_MALLOC;
1982 | |
1983 | gchar * |
1984 | g_filename_to_utf8_utf8 (const gchar *opsysstring, |
1985 | gssize len, |
1986 | gsize *bytes_read, |
1987 | gsize *bytes_written, |
1988 | GError **error) |
1989 | { |
1990 | return g_filename_to_utf8 (opsysstring, len, bytes_read, bytes_written, error); |
1991 | } |
1992 | |
1993 | gchar * |
1994 | g_filename_from_utf8_utf8 (const gchar *utf8string, |
1995 | gssize len, |
1996 | gsize *bytes_read, |
1997 | gsize *bytes_written, |
1998 | GError **error) |
1999 | { |
2000 | return g_filename_from_utf8 (utf8string, len, bytes_read, bytes_written, error); |
2001 | } |
2002 | |
2003 | gchar * |
2004 | g_filename_from_uri_utf8 (const gchar *uri, |
2005 | gchar **hostname, |
2006 | GError **error) |
2007 | { |
2008 | return g_filename_from_uri (uri, hostname, error); |
2009 | } |
2010 | |
2011 | gchar * |
2012 | g_filename_to_uri_utf8 (const gchar *filename, |
2013 | const gchar *hostname, |
2014 | GError **error) |
2015 | { |
2016 | return g_filename_to_uri (filename, hostname, error); |
2017 | } |
2018 | |
2019 | #endif |
2020 | |