1 | /* -*- mode: C; c-file-style: "gnu"; indent-tabs-mode: nil; -*- */ |
2 | |
3 | /* GLIB - Library of useful routines for C programming |
4 | * Copyright (C) 2008 Red Hat, Inc. |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2.1 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General |
17 | * Public License along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | */ |
19 | |
20 | #include "config.h" |
21 | #include "glibconfig.h" |
22 | |
23 | #include <string.h> |
24 | |
25 | #ifdef G_OS_UNIX |
26 | #include <unistd.h> |
27 | #endif |
28 | |
29 | #include "ghostutils.h" |
30 | |
31 | #include "garray.h" |
32 | #include "gmem.h" |
33 | #include "gstring.h" |
34 | #include "gstrfuncs.h" |
35 | #include "glibintl.h" |
36 | |
37 | #ifdef G_PLATFORM_WIN32 |
38 | #include <windows.h> |
39 | #endif |
40 | |
41 | |
42 | /** |
43 | * SECTION:ghostutils |
44 | * @short_description: Internet hostname utilities |
45 | * |
46 | * Functions for manipulating internet hostnames; in particular, for |
47 | * converting between Unicode and ASCII-encoded forms of |
48 | * Internationalized Domain Names (IDNs). |
49 | * |
50 | * The |
51 | * [Internationalized Domain Names for Applications (IDNA)](http://www.ietf.org/rfc/rfc3490.txt) |
52 | * standards allow for the use |
53 | * of Unicode domain names in applications, while providing |
54 | * backward-compatibility with the old ASCII-only DNS, by defining an |
55 | * ASCII-Compatible Encoding of any given Unicode name, which can be |
56 | * used with non-IDN-aware applications and protocols. (For example, |
57 | * "Παν語.org" maps to "xn--4wa8awb4637h.org".) |
58 | **/ |
59 | |
/* Prefix that marks a label as being in ASCII-Compatible Encoding (ACE),
 * i.e. Punycode-encoded, and the number of bytes in that prefix.
 */
#define IDNA_ACE_PREFIX "xn--"
#define IDNA_ACE_PREFIX_LEN 4

/* Punycode constants, from RFC 3492. */

#define PUNYCODE_BASE 36
#define PUNYCODE_TMIN 1
#define PUNYCODE_TMAX 26
#define PUNYCODE_SKEW 38
#define PUNYCODE_DAMP 700
#define PUNYCODE_INITIAL_BIAS 72
#define PUNYCODE_INITIAL_N 0x80

/* A "basic" code point is any code point below 0x80. */
#define PUNYCODE_IS_BASIC(cp) ((guint)(cp) < 0x80)
74 | |
75 | /* Encode/decode a single base-36 digit */ |
76 | static inline gchar |
77 | encode_digit (guint dig) |
78 | { |
79 | if (dig < 26) |
80 | return dig + 'a'; |
81 | else |
82 | return dig - 26 + '0'; |
83 | } |
84 | |
85 | static inline guint |
86 | decode_digit (gchar dig) |
87 | { |
88 | if (dig >= 'A' && dig <= 'Z') |
89 | return dig - 'A'; |
90 | else if (dig >= 'a' && dig <= 'z') |
91 | return dig - 'a'; |
92 | else if (dig >= '0' && dig <= '9') |
93 | return dig - '0' + 26; |
94 | else |
95 | return G_MAXUINT; |
96 | } |
97 | |
98 | /* Punycode bias adaptation algorithm, RFC 3492 section 6.1 */ |
99 | static guint |
100 | adapt (guint delta, |
101 | guint numpoints, |
102 | gboolean firsttime) |
103 | { |
104 | guint k; |
105 | |
106 | delta = firsttime ? delta / PUNYCODE_DAMP : delta / 2; |
107 | delta += delta / numpoints; |
108 | |
109 | k = 0; |
110 | while (delta > ((PUNYCODE_BASE - PUNYCODE_TMIN) * PUNYCODE_TMAX) / 2) |
111 | { |
112 | delta /= PUNYCODE_BASE - PUNYCODE_TMIN; |
113 | k += PUNYCODE_BASE; |
114 | } |
115 | |
116 | return k + ((PUNYCODE_BASE - PUNYCODE_TMIN + 1) * delta / |
117 | (delta + PUNYCODE_SKEW)); |
118 | } |
119 | |
120 | /* Punycode encoder, RFC 3492 section 6.3. The algorithm is |
121 | * sufficiently bizarre that it's not really worth trying to explain |
122 | * here. |
123 | */ |
124 | static gboolean |
125 | punycode_encode (const gchar *input_utf8, |
126 | gsize input_utf8_length, |
127 | GString *output) |
128 | { |
129 | guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit; |
130 | gunichar n, m, *input; |
131 | glong input_length; |
132 | gboolean success = FALSE; |
133 | |
134 | /* Convert from UTF-8 to Unicode code points */ |
135 | input = g_utf8_to_ucs4 (str: input_utf8, len: input_utf8_length, NULL, |
136 | items_written: &input_length, NULL); |
137 | if (!input) |
138 | return FALSE; |
139 | |
140 | /* Copy basic chars */ |
141 | for (j = num_basic_chars = 0; j < input_length; j++) |
142 | { |
143 | if (PUNYCODE_IS_BASIC (input[j])) |
144 | { |
145 | g_string_append_c (output, g_ascii_tolower (input[j])); |
146 | num_basic_chars++; |
147 | } |
148 | } |
149 | if (num_basic_chars) |
150 | g_string_append_c (output, '-'); |
151 | |
152 | handled_chars = num_basic_chars; |
153 | |
154 | /* Encode non-basic chars */ |
155 | delta = 0; |
156 | bias = PUNYCODE_INITIAL_BIAS; |
157 | n = PUNYCODE_INITIAL_N; |
158 | while (handled_chars < input_length) |
159 | { |
160 | /* let m = the minimum {non-basic} code point >= n in the input */ |
161 | for (m = G_MAXUINT, j = 0; j < input_length; j++) |
162 | { |
163 | if (input[j] >= n && input[j] < m) |
164 | m = input[j]; |
165 | } |
166 | |
167 | if (m - n > (G_MAXUINT - delta) / (handled_chars + 1)) |
168 | goto fail; |
169 | delta += (m - n) * (handled_chars + 1); |
170 | n = m; |
171 | |
172 | for (j = 0; j < input_length; j++) |
173 | { |
174 | if (input[j] < n) |
175 | { |
176 | if (++delta == 0) |
177 | goto fail; |
178 | } |
179 | else if (input[j] == n) |
180 | { |
181 | q = delta; |
182 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
183 | { |
184 | if (k <= bias) |
185 | t = PUNYCODE_TMIN; |
186 | else if (k >= bias + PUNYCODE_TMAX) |
187 | t = PUNYCODE_TMAX; |
188 | else |
189 | t = k - bias; |
190 | if (q < t) |
191 | break; |
192 | digit = t + (q - t) % (PUNYCODE_BASE - t); |
193 | g_string_append_c (output, encode_digit (digit)); |
194 | q = (q - t) / (PUNYCODE_BASE - t); |
195 | } |
196 | |
197 | g_string_append_c (output, encode_digit (q)); |
198 | bias = adapt (delta, numpoints: handled_chars + 1, firsttime: handled_chars == num_basic_chars); |
199 | delta = 0; |
200 | handled_chars++; |
201 | } |
202 | } |
203 | |
204 | delta++; |
205 | n++; |
206 | } |
207 | |
208 | success = TRUE; |
209 | |
210 | fail: |
211 | g_free (mem: input); |
212 | return success; |
213 | } |
214 | |
215 | /* From RFC 3454, Table B.1 */ |
216 | #define idna_is_junk(ch) ((ch) == 0x00AD || (ch) == 0x1806 || (ch) == 0x200B || (ch) == 0x2060 || (ch) == 0xFEFF || (ch) == 0x034F || (ch) == 0x180B || (ch) == 0x180C || (ch) == 0x180D || (ch) == 0x200C || (ch) == 0x200D || ((ch) >= 0xFE00 && (ch) <= 0xFE0F)) |
217 | |
218 | /* Scan @str for "junk" and return a cleaned-up string if any junk |
219 | * is found. Else return %NULL. |
220 | */ |
221 | static gchar * |
222 | remove_junk (const gchar *str, |
223 | gint len) |
224 | { |
225 | GString *cleaned = NULL; |
226 | const gchar *p; |
227 | gunichar ch; |
228 | |
229 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
230 | { |
231 | ch = g_utf8_get_char (p); |
232 | if (idna_is_junk (ch)) |
233 | { |
234 | if (!cleaned) |
235 | { |
236 | cleaned = g_string_new (NULL); |
237 | g_string_append_len (string: cleaned, val: str, len: p - str); |
238 | } |
239 | } |
240 | else if (cleaned) |
241 | g_string_append_unichar (string: cleaned, wc: ch); |
242 | } |
243 | |
244 | if (cleaned) |
245 | return g_string_free (string: cleaned, FALSE); |
246 | else |
247 | return NULL; |
248 | } |
249 | |
250 | static inline gboolean |
251 | contains_uppercase_letters (const gchar *str, |
252 | gint len) |
253 | { |
254 | const gchar *p; |
255 | |
256 | for (p = str; len == -1 ? *p : p < str + len; p = g_utf8_next_char (p)) |
257 | { |
258 | if (g_unichar_isupper (c: g_utf8_get_char (p))) |
259 | return TRUE; |
260 | } |
261 | return FALSE; |
262 | } |
263 | |
264 | static inline gboolean |
265 | contains_non_ascii (const gchar *str, |
266 | gint len) |
267 | { |
268 | const gchar *p; |
269 | |
270 | for (p = str; len == -1 ? *p : p < str + len; p++) |
271 | { |
272 | if ((guchar)*p > 0x80) |
273 | return TRUE; |
274 | } |
275 | return FALSE; |
276 | } |
277 | |
278 | /* RFC 3454, Appendix C. ish. */ |
279 | static inline gboolean |
280 | idna_is_prohibited (gunichar ch) |
281 | { |
282 | switch (g_unichar_type (c: ch)) |
283 | { |
284 | case G_UNICODE_CONTROL: |
285 | case G_UNICODE_FORMAT: |
286 | case G_UNICODE_UNASSIGNED: |
287 | case G_UNICODE_PRIVATE_USE: |
288 | case G_UNICODE_SURROGATE: |
289 | case G_UNICODE_LINE_SEPARATOR: |
290 | case G_UNICODE_PARAGRAPH_SEPARATOR: |
291 | case G_UNICODE_SPACE_SEPARATOR: |
292 | return TRUE; |
293 | |
294 | case G_UNICODE_OTHER_SYMBOL: |
295 | if (ch == 0xFFFC || ch == 0xFFFD || |
296 | (ch >= 0x2FF0 && ch <= 0x2FFB)) |
297 | return TRUE; |
298 | return FALSE; |
299 | |
300 | case G_UNICODE_NON_SPACING_MARK: |
301 | if (ch == 0x0340 || ch == 0x0341) |
302 | return TRUE; |
303 | return FALSE; |
304 | |
305 | default: |
306 | return FALSE; |
307 | } |
308 | } |
309 | |
310 | /* RFC 3491 IDN cleanup algorithm. */ |
311 | static gchar * |
312 | nameprep (const gchar *hostname, |
313 | gint len, |
314 | gboolean *is_unicode) |
315 | { |
316 | gchar *name, *tmp = NULL, *p; |
317 | |
318 | /* It would be nice if we could do this without repeatedly |
319 | * allocating strings and converting back and forth between |
320 | * gunichars and UTF-8... The code does at least avoid doing most of |
321 | * the sub-operations when they would just be equivalent to a |
322 | * g_strdup(). |
323 | */ |
324 | |
325 | /* Remove presentation-only characters */ |
326 | name = remove_junk (str: hostname, len); |
327 | if (name) |
328 | { |
329 | tmp = name; |
330 | len = -1; |
331 | } |
332 | else |
333 | name = (gchar *)hostname; |
334 | |
335 | /* Convert to lowercase */ |
336 | if (contains_uppercase_letters (str: name, len)) |
337 | { |
338 | name = g_utf8_strdown (str: name, len); |
339 | g_free (mem: tmp); |
340 | tmp = name; |
341 | len = -1; |
342 | } |
343 | |
344 | /* If there are no UTF8 characters, we're done. */ |
345 | if (!contains_non_ascii (str: name, len)) |
346 | { |
347 | *is_unicode = FALSE; |
348 | if (name == (gchar *)hostname) |
349 | return len == -1 ? g_strdup (str: hostname) : g_strndup (str: hostname, n: len); |
350 | else |
351 | return name; |
352 | } |
353 | |
354 | *is_unicode = TRUE; |
355 | |
356 | /* Normalize */ |
357 | name = g_utf8_normalize (str: name, len, mode: G_NORMALIZE_NFKC); |
358 | g_free (mem: tmp); |
359 | tmp = name; |
360 | |
361 | if (!name) |
362 | return NULL; |
363 | |
364 | /* KC normalization may have created more capital letters (eg, |
365 | * angstrom -> capital A with ring). So we have to lowercasify a |
366 | * second time. (This is more-or-less how the nameprep algorithm |
367 | * does it. If tolower(nfkc(tolower(X))) is guaranteed to be the |
368 | * same as tolower(nfkc(X)), then we could skip the first tolower, |
369 | * but I'm not sure it is.) |
370 | */ |
371 | if (contains_uppercase_letters (str: name, len: -1)) |
372 | { |
373 | name = g_utf8_strdown (str: name, len: -1); |
374 | g_free (mem: tmp); |
375 | tmp = name; |
376 | } |
377 | |
378 | /* Check for prohibited characters */ |
379 | for (p = name; *p; p = g_utf8_next_char (p)) |
380 | { |
381 | if (idna_is_prohibited (ch: g_utf8_get_char (p))) |
382 | { |
383 | name = NULL; |
384 | g_free (mem: tmp); |
385 | goto done; |
386 | } |
387 | } |
388 | |
389 | /* FIXME: We're supposed to verify certain constraints on bidi |
390 | * characters, but glib does not appear to have that information. |
391 | */ |
392 | |
393 | done: |
394 | return name; |
395 | } |
396 | |
397 | /* RFC 3490, section 3.1 says '.', 0x3002, 0xFF0E, and 0xFF61 count as |
398 | * label-separating dots. @str must be '\0'-terminated. |
399 | */ |
400 | #define idna_is_dot(str) ( \ |
401 | ((guchar)(str)[0] == '.') || \ |
402 | ((guchar)(str)[0] == 0xE3 && (guchar)(str)[1] == 0x80 && (guchar)(str)[2] == 0x82) || \ |
403 | ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBC && (guchar)(str)[2] == 0x8E) || \ |
404 | ((guchar)(str)[0] == 0xEF && (guchar)(str)[1] == 0xBD && (guchar)(str)[2] == 0xA1) ) |
405 | |
406 | static const gchar * |
407 | idna_end_of_label (const gchar *str) |
408 | { |
409 | for (; *str; str = g_utf8_next_char (str)) |
410 | { |
411 | if (idna_is_dot (str)) |
412 | return str; |
413 | } |
414 | return str; |
415 | } |
416 | |
417 | static gsize |
418 | get_hostname_max_length_bytes (void) |
419 | { |
420 | #if defined(G_OS_WIN32) |
421 | wchar_t tmp[MAX_COMPUTERNAME_LENGTH]; |
422 | return sizeof (tmp) / sizeof (tmp[0]); |
423 | #elif defined(_SC_HOST_NAME_MAX) |
424 | glong max = sysconf (_SC_HOST_NAME_MAX); |
425 | if (max > 0) |
426 | return (gsize) max; |
427 | |
428 | #ifdef HOST_NAME_MAX |
429 | return HOST_NAME_MAX; |
430 | #else |
431 | return _POSIX_HOST_NAME_MAX; |
432 | #endif /* HOST_NAME_MAX */ |
433 | #else |
434 | /* Fallback to some reasonable value |
435 | * See https://stackoverflow.com/questions/8724954/what-is-the-maximum-number-of-characters-for-a-host-name-in-unix/28918017#28918017 */ |
436 | return 255; |
437 | #endif |
438 | } |
439 | |
440 | /* Returns %TRUE if `strlen (str) > comparison_length`, but without actually |
441 | * running `strlen(str)`, as that would take a very long time for long |
442 | * (untrusted) input strings. */ |
443 | static gboolean |
444 | strlen_greater_than (const gchar *str, |
445 | gsize comparison_length) |
446 | { |
447 | gsize i; |
448 | |
449 | for (i = 0; str[i] != '\0'; i++) |
450 | if (i > comparison_length) |
451 | return TRUE; |
452 | |
453 | return FALSE; |
454 | } |
455 | |
456 | /** |
457 | * g_hostname_to_ascii: |
458 | * @hostname: a valid UTF-8 or ASCII hostname |
459 | * |
460 | * Converts @hostname to its canonical ASCII form; an ASCII-only |
461 | * string containing no uppercase letters and not ending with a |
462 | * trailing dot. |
463 | * |
464 | * Returns: (nullable) (transfer full): an ASCII hostname, which must be freed, |
465 | * or %NULL if @hostname is in some way invalid. |
466 | * |
467 | * Since: 2.22 |
468 | **/ |
469 | gchar * |
470 | g_hostname_to_ascii (const gchar *hostname) |
471 | { |
472 | gchar *name, *label, *p; |
473 | GString *out; |
474 | gssize llen, oldlen; |
475 | gboolean unicode; |
476 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
477 | |
478 | /* Do an initial check on the hostname length, as overlong hostnames take a |
479 | * long time in the IDN cleanup algorithm in nameprep(). The ultimate |
480 | * restriction is that the IDN-decoded (i.e. pure ASCII) hostname cannot be |
481 | * longer than 255 bytes. That’s the least restrictive limit on hostname |
482 | * length of all the ways hostnames can be interpreted. Typically, the |
483 | * hostname will be an FQDN, which is limited to 253 bytes long. POSIX |
484 | * hostnames are limited to `get_hostname_max_length_bytes()` (typically 255 |
485 | * bytes). |
486 | * |
487 | * See https://stackoverflow.com/a/28918017/2931197 |
488 | * |
489 | * It’s possible for a hostname to be %-encoded, in which case its decoded |
490 | * length will be as much as 3× shorter. |
491 | * |
492 | * It’s also possible for a hostname to use overlong UTF-8 encodings, in which |
493 | * case its decoded length will be as much as 4× shorter. |
494 | * |
495 | * Note: This check is not intended as an absolute guarantee that a hostname |
496 | * is the right length and will be accepted by other systems. It’s intended to |
497 | * stop wildly-invalid hostnames from taking forever in nameprep(). |
498 | */ |
499 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
500 | strlen_greater_than (str: hostname, comparison_length: 4 * MAX (255, hostname_max_length_bytes))) |
501 | return NULL; |
502 | |
503 | label = name = nameprep (hostname, len: -1, is_unicode: &unicode); |
504 | if (!name || !unicode) |
505 | return name; |
506 | |
507 | out = g_string_new (NULL); |
508 | |
509 | do |
510 | { |
511 | unicode = FALSE; |
512 | for (p = label; *p && !idna_is_dot (p); p++) |
513 | { |
514 | if ((guchar)*p > 0x80) |
515 | unicode = TRUE; |
516 | } |
517 | |
518 | oldlen = out->len; |
519 | llen = p - label; |
520 | if (unicode) |
521 | { |
522 | if (!strncmp (s1: label, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
523 | goto fail; |
524 | |
525 | g_string_append (string: out, IDNA_ACE_PREFIX); |
526 | if (!punycode_encode (input_utf8: label, input_utf8_length: llen, output: out)) |
527 | goto fail; |
528 | } |
529 | else |
530 | g_string_append_len (string: out, val: label, len: llen); |
531 | |
532 | if (out->len - oldlen > 63) |
533 | goto fail; |
534 | |
535 | label += llen; |
536 | if (*label) |
537 | label = g_utf8_next_char (label); |
538 | if (*label) |
539 | g_string_append_c (out, '.'); |
540 | } |
541 | while (*label); |
542 | |
543 | g_free (mem: name); |
544 | return g_string_free (string: out, FALSE); |
545 | |
546 | fail: |
547 | g_free (mem: name); |
548 | g_string_free (string: out, TRUE); |
549 | return NULL; |
550 | } |
551 | |
552 | /** |
553 | * g_hostname_is_non_ascii: |
554 | * @hostname: a hostname |
555 | * |
556 | * Tests if @hostname contains Unicode characters. If this returns |
557 | * %TRUE, you need to encode the hostname with g_hostname_to_ascii() |
558 | * before using it in non-IDN-aware contexts. |
559 | * |
560 | * Note that a hostname might contain a mix of encoded and unencoded |
561 | * segments, and so it is possible for g_hostname_is_non_ascii() and |
562 | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
563 | * |
564 | * Returns: %TRUE if @hostname contains any non-ASCII characters |
565 | * |
566 | * Since: 2.22 |
567 | **/ |
568 | gboolean |
569 | g_hostname_is_non_ascii (const gchar *hostname) |
570 | { |
571 | return contains_non_ascii (str: hostname, len: -1); |
572 | } |
573 | |
574 | /* Punycode decoder, RFC 3492 section 6.2. As with punycode_encode(), |
575 | * read the RFC if you want to understand what this is actually doing. |
576 | */ |
577 | static gboolean |
578 | punycode_decode (const gchar *input, |
579 | gsize input_length, |
580 | GString *output) |
581 | { |
582 | GArray *output_chars; |
583 | gunichar n; |
584 | guint i, bias; |
585 | guint oldi, w, k, digit, t; |
586 | const gchar *split; |
587 | |
588 | n = PUNYCODE_INITIAL_N; |
589 | i = 0; |
590 | bias = PUNYCODE_INITIAL_BIAS; |
591 | |
592 | split = input + input_length - 1; |
593 | while (split > input && *split != '-') |
594 | split--; |
595 | if (split > input) |
596 | { |
597 | output_chars = g_array_sized_new (FALSE, FALSE, element_size: sizeof (gunichar), |
598 | reserved_size: split - input); |
599 | input_length -= (split - input) + 1; |
600 | while (input < split) |
601 | { |
602 | gunichar ch = (gunichar)*input++; |
603 | if (!PUNYCODE_IS_BASIC (ch)) |
604 | goto fail; |
605 | g_array_append_val (output_chars, ch); |
606 | } |
607 | input++; |
608 | } |
609 | else |
610 | output_chars = g_array_new (FALSE, FALSE, element_size: sizeof (gunichar)); |
611 | |
612 | while (input_length) |
613 | { |
614 | oldi = i; |
615 | w = 1; |
616 | for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE) |
617 | { |
618 | if (!input_length--) |
619 | goto fail; |
620 | digit = decode_digit (dig: *input++); |
621 | if (digit >= PUNYCODE_BASE) |
622 | goto fail; |
623 | if (digit > (G_MAXUINT - i) / w) |
624 | goto fail; |
625 | i += digit * w; |
626 | if (k <= bias) |
627 | t = PUNYCODE_TMIN; |
628 | else if (k >= bias + PUNYCODE_TMAX) |
629 | t = PUNYCODE_TMAX; |
630 | else |
631 | t = k - bias; |
632 | if (digit < t) |
633 | break; |
634 | if (w > G_MAXUINT / (PUNYCODE_BASE - t)) |
635 | goto fail; |
636 | w *= (PUNYCODE_BASE - t); |
637 | } |
638 | |
639 | bias = adapt (delta: i - oldi, numpoints: output_chars->len + 1, firsttime: oldi == 0); |
640 | |
641 | if (i / (output_chars->len + 1) > G_MAXUINT - n) |
642 | goto fail; |
643 | n += i / (output_chars->len + 1); |
644 | i %= (output_chars->len + 1); |
645 | |
646 | g_array_insert_val (output_chars, i++, n); |
647 | } |
648 | |
649 | for (i = 0; i < output_chars->len; i++) |
650 | g_string_append_unichar (string: output, g_array_index (output_chars, gunichar, i)); |
651 | g_array_free (array: output_chars, TRUE); |
652 | return TRUE; |
653 | |
654 | fail: |
655 | g_array_free (array: output_chars, TRUE); |
656 | return FALSE; |
657 | } |
658 | |
659 | /** |
660 | * g_hostname_to_unicode: |
661 | * @hostname: a valid UTF-8 or ASCII hostname |
662 | * |
663 | * Converts @hostname to its canonical presentation form; a UTF-8 |
664 | * string in Unicode normalization form C, containing no uppercase |
665 | * letters, no forbidden characters, and no ASCII-encoded segments, |
666 | * and not ending with a trailing dot. |
667 | * |
668 | * Of course if @hostname is not an internationalized hostname, then |
669 | * the canonical presentation form will be entirely ASCII. |
670 | * |
671 | * Returns: (nullable) (transfer full): a UTF-8 hostname, which must be freed, |
672 | * or %NULL if @hostname is in some way invalid. |
673 | * |
674 | * Since: 2.22 |
675 | **/ |
676 | gchar * |
677 | g_hostname_to_unicode (const gchar *hostname) |
678 | { |
679 | GString *out; |
680 | gssize llen; |
681 | gsize hostname_max_length_bytes = get_hostname_max_length_bytes (); |
682 | |
683 | /* See the comment at the top of g_hostname_to_ascii(). */ |
684 | if (hostname_max_length_bytes <= G_MAXSIZE / 4 && |
685 | strlen_greater_than (str: hostname, comparison_length: 4 * MAX (255, hostname_max_length_bytes))) |
686 | return NULL; |
687 | |
688 | out = g_string_new (NULL); |
689 | |
690 | do |
691 | { |
692 | llen = idna_end_of_label (str: hostname) - hostname; |
693 | if (!g_ascii_strncasecmp (s1: hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
694 | { |
695 | hostname += IDNA_ACE_PREFIX_LEN; |
696 | llen -= IDNA_ACE_PREFIX_LEN; |
697 | if (!punycode_decode (input: hostname, input_length: llen, output: out)) |
698 | { |
699 | g_string_free (string: out, TRUE); |
700 | return NULL; |
701 | } |
702 | } |
703 | else |
704 | { |
705 | gboolean unicode; |
706 | gchar *canonicalized = nameprep (hostname, len: llen, is_unicode: &unicode); |
707 | |
708 | if (!canonicalized) |
709 | { |
710 | g_string_free (string: out, TRUE); |
711 | return NULL; |
712 | } |
713 | g_string_append (string: out, val: canonicalized); |
714 | g_free (mem: canonicalized); |
715 | } |
716 | |
717 | hostname += llen; |
718 | if (*hostname) |
719 | hostname = g_utf8_next_char (hostname); |
720 | if (*hostname) |
721 | g_string_append_c (out, '.'); |
722 | } |
723 | while (*hostname); |
724 | |
725 | return g_string_free (string: out, FALSE); |
726 | } |
727 | |
728 | /** |
729 | * g_hostname_is_ascii_encoded: |
730 | * @hostname: a hostname |
731 | * |
732 | * Tests if @hostname contains segments with an ASCII-compatible |
733 | * encoding of an Internationalized Domain Name. If this returns |
734 | * %TRUE, you should decode the hostname with g_hostname_to_unicode() |
735 | * before displaying it to the user. |
736 | * |
737 | * Note that a hostname might contain a mix of encoded and unencoded |
738 | * segments, and so it is possible for g_hostname_is_non_ascii() and |
739 | * g_hostname_is_ascii_encoded() to both return %TRUE for a name. |
740 | * |
741 | * Returns: %TRUE if @hostname contains any ASCII-encoded |
742 | * segments. |
743 | * |
744 | * Since: 2.22 |
745 | **/ |
746 | gboolean |
747 | g_hostname_is_ascii_encoded (const gchar *hostname) |
748 | { |
749 | while (1) |
750 | { |
751 | if (!g_ascii_strncasecmp (s1: hostname, IDNA_ACE_PREFIX, IDNA_ACE_PREFIX_LEN)) |
752 | return TRUE; |
753 | hostname = idna_end_of_label (str: hostname); |
754 | if (*hostname) |
755 | hostname = g_utf8_next_char (hostname); |
756 | if (!*hostname) |
757 | return FALSE; |
758 | } |
759 | } |
760 | |
761 | /** |
762 | * g_hostname_is_ip_address: |
763 | * @hostname: a hostname (or IP address in string form) |
764 | * |
765 | * Tests if @hostname is the string form of an IPv4 or IPv6 address. |
766 | * (Eg, "192.168.0.1".) |
767 | * |
768 | * Since 2.66, IPv6 addresses with a zone-id are accepted (RFC6874). |
769 | * |
770 | * Returns: %TRUE if @hostname is an IP address |
771 | * |
772 | * Since: 2.22 |
773 | **/ |
774 | gboolean |
775 | g_hostname_is_ip_address (const gchar *hostname) |
776 | { |
777 | gchar *p, *end; |
778 | gint nsegments, octet; |
779 | |
780 | /* On Linux we could implement this using inet_pton, but the Windows |
781 | * equivalent of that requires linking against winsock, so we just |
782 | * figure this out ourselves. Tested by tests/hostutils.c. |
783 | */ |
784 | |
785 | p = (char *)hostname; |
786 | |
787 | if (strchr (s: p, c: ':')) |
788 | { |
789 | gboolean skipped; |
790 | |
791 | /* If it contains a ':', it's an IPv6 address (assuming it's an |
792 | * IP address at all). This consists of eight ':'-separated |
793 | * segments, each containing a 1-4 digit hex number, except that |
794 | * optionally: (a) the last two segments can be replaced by an |
795 | * IPv4 address, and (b) a single span of 1 to 8 "0000" segments |
796 | * can be replaced with just "::". |
797 | */ |
798 | |
799 | nsegments = 0; |
800 | skipped = FALSE; |
801 | while (*p && *p != '%' && nsegments < 8) |
802 | { |
803 | /* Each segment after the first must be preceded by a ':'. |
804 | * (We also handle half of the "string starts with ::" case |
805 | * here.) |
806 | */ |
807 | if (p != (char *)hostname || (p[0] == ':' && p[1] == ':')) |
808 | { |
809 | if (*p != ':') |
810 | return FALSE; |
811 | p++; |
812 | } |
813 | |
814 | /* If there's another ':', it means we're skipping some segments */ |
815 | if (*p == ':' && !skipped) |
816 | { |
817 | skipped = TRUE; |
818 | nsegments++; |
819 | |
820 | /* Handle the "string ends with ::" case */ |
821 | if (!p[1]) |
822 | p++; |
823 | |
824 | continue; |
825 | } |
826 | |
827 | /* Read the segment, make sure it's valid. */ |
828 | for (end = p; g_ascii_isxdigit (*end); end++) |
829 | ; |
830 | if (end == p || end > p + 4) |
831 | return FALSE; |
832 | |
833 | if (*end == '.') |
834 | { |
835 | if ((nsegments == 6 && !skipped) || (nsegments <= 6 && skipped)) |
836 | goto parse_ipv4; |
837 | else |
838 | return FALSE; |
839 | } |
840 | |
841 | nsegments++; |
842 | p = end; |
843 | } |
844 | |
845 | return (!*p || (p[0] == '%' && p[1])) && (nsegments == 8 || skipped); |
846 | } |
847 | |
848 | parse_ipv4: |
849 | |
850 | /* Parse IPv4: N.N.N.N, where each N <= 255 and doesn't have leading 0s. */ |
851 | for (nsegments = 0; nsegments < 4; nsegments++) |
852 | { |
853 | if (nsegments != 0) |
854 | { |
855 | if (*p != '.') |
856 | return FALSE; |
857 | p++; |
858 | } |
859 | |
860 | /* Check the segment; a little tricker than the IPv6 case since |
861 | * we can't allow extra leading 0s, and we can't assume that all |
862 | * strings of valid length are within range. |
863 | */ |
864 | octet = 0; |
865 | if (*p == '0') |
866 | end = p + 1; |
867 | else |
868 | { |
869 | for (end = p; g_ascii_isdigit (*end); end++) |
870 | { |
871 | octet = 10 * octet + (*end - '0'); |
872 | |
873 | if (octet > 255) |
874 | break; |
875 | } |
876 | } |
877 | if (end == p || end > p + 3 || octet > 255) |
878 | return FALSE; |
879 | |
880 | p = end; |
881 | } |
882 | |
883 | /* If there's nothing left to parse, then it's ok. */ |
884 | return !*p; |
885 | } |
886 | |