gutf8.c source code [gtk/subprojects/glib/glib/gutf8.c]

/* gutf8.c - Operations on UTF-8 strings.
2	*
3	* Copyright (C) 1999 Tom Tromey
4	* Copyright (C) 2000 Red Hat, Inc.
5	*
6	* This library is free software; you can redistribute it and/or
7	* modify it under the terms of the GNU Lesser General Public
8	* License as published by the Free Software Foundation; either
9	* version 2.1 of the License, or (at your option) any later version.
10	*
11	* This library is distributed in the hope that it will be useful,
12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	* Lesser General Public License for more details.
15	*
16	* You should have received a copy of the GNU Lesser General Public
17	* License along with this library; if not, see <http://www.gnu.org/licenses/>.
18	*/
19
20	#include "config.h"
21
22	#include <stdlib.h>
23	#ifdef HAVE_CODESET
24	#include <langinfo.h>
25	#endif
26	#include <string.h>
27
28	#ifdef G_PLATFORM_WIN32
29	#include <stdio.h>
30	#define STRICT
31	#include <windows.h>
32	#undef STRICT
33	#endif
34
35	#include "gconvert.h"
36	#include "ghash.h"
37	#include "gstrfuncs.h"
38	#include "gtestutils.h"
39	#include "gtypes.h"
40	#include "gthread.h"
41	#include "glibintl.h"
42
43	#define UTF8_COMPUTE(Char, Mask, Len) \
44	if (Char < 128) \
45	{ \
46	Len = 1; \
47	Mask = 0x7f; \
48	} \
49	else if ((Char & 0xe0) == 0xc0) \
50	{ \
51	Len = 2; \
52	Mask = 0x1f; \
53	} \
54	else if ((Char & 0xf0) == 0xe0) \
55	{ \
56	Len = 3; \
57	Mask = 0x0f; \
58	} \
59	else if ((Char & 0xf8) == 0xf0) \
60	{ \
61	Len = 4; \
62	Mask = 0x07; \
63	} \
64	else if ((Char & 0xfc) == 0xf8) \
65	{ \
66	Len = 5; \
67	Mask = 0x03; \
68	} \
69	else if ((Char & 0xfe) == 0xfc) \
70	{ \
71	Len = 6; \
72	Mask = 0x01; \
73	} \
74	else \
75	Len = -1;
76
77	#define UTF8_LENGTH(Char) \
78	((Char) < 0x80 ? 1 : \
79	((Char) < 0x800 ? 2 : \
80	((Char) < 0x10000 ? 3 : \
81	((Char) < 0x200000 ? 4 : \
82	((Char) < 0x4000000 ? 5 : 6)))))
83
84
85	#define UTF8_GET(Result, Chars, Count, Mask, Len) \
86	(Result) = (Chars)[0] & (Mask); \
87	for ((Count) = 1; (Count) < (Len); ++(Count)) \
88	{ \
89	if (((Chars)[(Count)] & 0xc0) != 0x80) \
90	{ \
91	(Result) = -1; \
92	break; \
93	} \
94	(Result) <<= 6; \
95	(Result) \|= ((Chars)[(Count)] & 0x3f); \
96	}
97
98	/*
99	* Check whether a Unicode (5.2) char is in a valid range.
100	*
101	* The first check comes from the Unicode guarantee to never encode
102	* a point above 0x0010ffff, since UTF-16 couldn't represent it.
103	*
104	* The second check covers surrogate pairs (category Cs).
105	*
106	* @param Char the character
107	*/
108	#define UNICODE_VALID(Char) \
109	((Char) < 0x110000 && \
110	(((Char) & 0xFFFFF800) != 0xD800))
111
112
113	static const gchar utf8_skip_data[`256`] = {
114	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
115	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
116	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
117	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
118	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
119	`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,`1`,
120	`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,`2`,
121	`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`3`,`4`,`4`,`4`,`4`,`4`,`4`,`4`,`4`,`5`,`5`,`5`,`5`,`6`,`6`,`1`,`1`
122	};
123
124	const gchar * const g_utf8_skip = utf8_skip_data;
125
126	/**
127	* g_utf8_find_prev_char:
128	* @str: pointer to the beginning of a UTF-8 encoded string
129	* @p: pointer to some position within @str
130	*
131	* Given a position @p with a UTF-8 encoded string @str, find the start
132	* of the previous UTF-8 character starting before @p. Returns %NULL if no
133	* UTF-8 characters are present in @str before @p.
134	*
135	* @p does not have to be at the beginning of a UTF-8 character. No check
136	* is made to see if the character found is actually valid other than
137	* it starts with an appropriate byte.
138	*
139	* Returns: (transfer none) (nullable): a pointer to the found character or %NULL.
140	*/
141	gchar *
142	g_utf8_find_prev_char (const gchar *str,
143	const gchar *p)
144	{
145	while (p > str)
146	{
147	--p;
148	if ((*p & `0xc0`) != `0x80`)
149	return (gchar *)p;
150	}
151	return NULL;
152	}
153
154	/**
155	* g_utf8_find_next_char:
156	* @p: a pointer to a position within a UTF-8 encoded string
157	* @end: (nullable): a pointer to the byte following the end of the string,
158	* or %NULL to indicate that the string is nul-terminated
159	*
160	* Finds the start of the next UTF-8 character in the string after @p.
161	*
162	* @p does not have to be at the beginning of a UTF-8 character. No check
163	* is made to see if the character found is actually valid other than
164	* it starts with an appropriate byte.
165	*
166	* If @end is %NULL, the return value will never be %NULL: if the end of the
167	* string is reached, a pointer to the terminating nul byte is returned. If
168	* @end is non-%NULL, the return value will be %NULL if the end of the string
169	* is reached.
170	*
171	* Returns: (transfer none) (nullable): a pointer to the found character or %NULL if @end is
172	* set and is reached
173	*/
174	gchar *
175	g_utf8_find_next_char (const gchar *p,
176	const gchar *end)
177	{
178	if (end)
179	{
180	for (++p; p < end && (*p & `0xc0`) == `0x80`; ++p)
181	;
182	return (p >= end) ? NULL : (gchar *)p;
183	}
184	else
185	{
186	for (++p; (*p & `0xc0`) == `0x80`; ++p)
187	;
188	return (gchar *)p;
189	}
190	}
191
192	/**
193	* g_utf8_prev_char:
194	* @p: a pointer to a position within a UTF-8 encoded string
195	*
196	* Finds the previous UTF-8 character in the string before @p.
197	*
198	* @p does not have to be at the beginning of a UTF-8 character. No check
199	* is made to see if the character found is actually valid other than
200	* it starts with an appropriate byte. If @p might be the first
201	* character of the string, you must use g_utf8_find_prev_char() instead.
202	*
203	* Returns: (transfer none) (not nullable): a pointer to the found character
204	*/
205	gchar *
206	g_utf8_prev_char (const gchar *p)
207	{
208	while (TRUE)
209	{
210	p--;
211	if ((*p & `0xc0`) != `0x80`)
212	return (gchar *)p;
213	}
214	}
215
216	/**
217	* g_utf8_strlen:
218	* @p: pointer to the start of a UTF-8 encoded string
219	* @max: the maximum number of bytes to examine. If @max
220	* is less than 0, then the string is assumed to be
221	* nul-terminated. If @max is 0, @p will not be examined and
222	* may be %NULL. If @max is greater than 0, up to @max
223	* bytes are examined
224	*
225	* Computes the length of the string in characters, not including
226	* the terminating nul character. If the @max'th byte falls in the
227	* middle of a character, the last (partial) character is not counted.
228	*
229	* Returns: the length of the string in characters
230	*/
231	glong
232	g_utf8_strlen (const gchar *p,
233	gssize max)
234	{
235	glong len = `0`;
236	const gchar *start = p;
237	g_return_val_if_fail (p != NULL \|\| max == `0`, `0`);
238
239	if (max < `0`)
240	{
241	while (*p)
242	{
243	p = g_utf8_next_char (p);
244	++len;
245	}
246	}
247	else
248	{
249	if (max == `0` \|\| !*p)
250	return `0`;
251
252	p = g_utf8_next_char (p);
253
254	while (p - start < max && *p)
255	{
256	++len;
257	p = g_utf8_next_char (p);
258	}
259
260	/ only do the last len increment if we got a complete*
261	* char (don't count partial chars)
262	*/
263	if (p - start <= max)
264	++len;
265	}
266
267	return len;
268	}
269
270	/**
271	* g_utf8_substring:
272	* @str: a UTF-8 encoded string
273	* @start_pos: a character offset within @str
274	* @end_pos: another character offset within @str
275	*
276	* Copies a substring out of a UTF-8 encoded string.
277	* The substring will contain @end_pos - @start_pos characters.
278	*
279	* Returns: (transfer full): a newly allocated copy of the requested
280	* substring. Free with g_free() when no longer needed.
281	*
282	* Since: 2.30
283	*/
284	gchar *
285	g_utf8_substring (const gchar *str,
286	glong start_pos,
287	glong end_pos)
288	{
289	gchar start, end, *out;
290
291	start = g_utf8_offset_to_pointer (str, offset: start_pos);
292	end = g_utf8_offset_to_pointer (str: start, offset: end_pos - start_pos);
293
294	out = g_malloc (n_bytes: end - start + `1`);
295	memcpy (dest: out, src: start, n: end - start);
296	out[end - start] = `0`;
297
298	return out;
299	}
300
301	/**
302	* g_utf8_get_char:
303	* @p: a pointer to Unicode character encoded as UTF-8
304	*
305	* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
306	*
307	* If @p does not point to a valid UTF-8 encoded character, results
308	* are undefined. If you are not sure that the bytes are complete
309	* valid Unicode characters, you should use g_utf8_get_char_validated()
310	* instead.
311	*
312	* Returns: the resulting character
313	*/
314	gunichar
315	g_utf8_get_char (const gchar *p)
316	{
317	int i, mask = `0`, len;
318	gunichar result;
319	unsigned char c = (unsigned char) *p;
320
321	UTF8_COMPUTE (c, mask, len);
322	if (len == -`1`)
323	return (gunichar)-`1`;
324	UTF8_GET (result, p, i, mask, len);
325
326	return result;
327	}
328
329	/**
330	* g_utf8_offset_to_pointer:
331	* @str: a UTF-8 encoded string
332	* @offset: a character offset within @str
333	*
334	* Converts from an integer character offset to a pointer to a position
335	* within the string.
336	*
337	* Since 2.10, this function allows to pass a negative @offset to
338	* step backwards. It is usually worth stepping backwards from the end
339	* instead of forwards if @offset is in the last fourth of the string,
340	* since moving forward is about 3 times faster than moving backward.
341	*
342	* Note that this function doesn't abort when reaching the end of @str.
343	* Therefore you should be sure that @offset is within string boundaries
344	* before calling that function. Call g_utf8_strlen() when unsure.
345	* This limitation exists as this function is called frequently during
346	* text rendering and therefore has to be as fast as possible.
347	*
348	* Returns: (transfer none): the resulting pointer
349	*/
350	gchar *
351	g_utf8_offset_to_pointer (const gchar *str,
352	glong offset)
353	{
354	const gchar *s = str;
355
356	if (offset > `0`)
357	while (offset--)
358	s = g_utf8_next_char (s);
359	else
360	{
361	const char *s1;
362
363	/ This nice technique for fast backwards stepping*
364	* through a UTF-8 string was dubbed "stutter stepping"
365	* by its inventor, Larry Ewing.
366	*/
367	while (offset)
368	{
369	s1 = s;
370	s += offset;
371	while ((*s & `0xc0`) == `0x80`)
372	s--;
373
374	offset += g_utf8_pointer_to_offset (str: s, pos: s1);
375	}
376	}
377
378	return (gchar *)s;
379	}
380
381	/**
382	* g_utf8_pointer_to_offset:
383	* @str: a UTF-8 encoded string
384	* @pos: a pointer to a position within @str
385	*
386	* Converts from a pointer to position within a string to an integer
387	* character offset.
388	*
389	* Since 2.10, this function allows @pos to be before @str, and returns
390	* a negative offset in this case.
391	*
392	* Returns: the resulting character offset
393	*/
394	glong
395	g_utf8_pointer_to_offset (const gchar *str,
396	const gchar *pos)
397	{
398	const gchar *s = str;
399	glong offset = `0`;
400
401	if (pos < str)
402	offset = - g_utf8_pointer_to_offset (str: pos, pos: str);
403	else
404	while (s < pos)
405	{
406	s = g_utf8_next_char (s);
407	offset++;
408	}
409
410	return offset;
411	}
412
413
414	/**
415	* g_utf8_strncpy:
416	* @dest: (transfer none): buffer to fill with characters from @src
417	* @src: UTF-8 encoded string
418	* @n: character count
419	*
420	* Like the standard C strncpy() function, but copies a given number
421	* of characters instead of a given number of bytes. The @src string
422	* must be valid UTF-8 encoded text. (Use g_utf8_validate() on all
423	* text before trying to use UTF-8 utility functions with it.)
424	*
425	* Note you must ensure @dest is at least 4 * @n to fit the
426	* largest possible UTF-8 characters
427	*
428	* Returns: (transfer none): @dest
429	*/
430	gchar *
431	g_utf8_strncpy (gchar *dest,
432	const gchar *src,
433	gsize n)
434	{
435	const gchar *s = src;
436	while (n && *s)
437	{
438	s = g_utf8_next_char(s);
439	n--;
440	}
441	strncpy(dest: dest, src: src, n: s - src);
442	dest[s - src] = `0`;
443	return dest;
444	}
445
/* unicode_strchr */
447
448	/**
449	* g_unichar_to_utf8:
450	* @c: a Unicode character code
451	* @outbuf: (out caller-allocates) (optional): output buffer, must have at
452	* least 6 bytes of space. If %NULL, the length will be computed and
453	* returned and nothing will be written to @outbuf.
454	*
455	* Converts a single character to UTF-8.
456	*
457	* Returns: number of bytes written
458	*/
459	int
460	g_unichar_to_utf8 (gunichar c,
461	gchar *outbuf)
462	{
463	/ If this gets modified, also update the copy in g_string_insert_unichar() /
464	guint len = `0`;
465	int first;
466	int i;
467
468	if (c < `0x80`)
469	{
470	first = `0`;
471	len = `1`;
472	}
473	else if (c < `0x800`)
474	{
475	first = `0xc0`;
476	len = `2`;
477	}
478	else if (c < `0x10000`)
479	{
480	first = `0xe0`;
481	len = `3`;
482	}
483	else if (c < `0x200000`)
484	{
485	first = `0xf0`;
486	len = `4`;
487	}
488	else if (c < `0x4000000`)
489	{
490	first = `0xf8`;
491	len = `5`;
492	}
493	else
494	{
495	first = `0xfc`;
496	len = `6`;
497	}
498
499	if (outbuf)
500	{
501	for (i = len - `1`; i > `0`; --i)
502	{
503	outbuf[i] = (c & `0x3f`) \| `0x80`;
504	c >>= `6`;
505	}
506	outbuf[`0`] = c \| first;
507	}
508
509	return len;
510	}
511
512	/**
513	* g_utf8_strchr:
514	* @p: a nul-terminated UTF-8 encoded string
515	* @len: the maximum length of @p
516	* @c: a Unicode character
517	*
518	* Finds the leftmost occurrence of the given Unicode character
519	* in a UTF-8 encoded string, while limiting the search to @len bytes.
520	* If @len is -1, allow unbounded search.
521	*
522	* Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
523	* otherwise, a pointer to the start of the leftmost occurrence
524	* of the character in the string.
525	*/
526	gchar *
527	g_utf8_strchr (const char *p,
528	gssize len,
529	gunichar c)
530	{
531	gchar ch[`10`];
532
533	gint charlen = g_unichar_to_utf8 (c, outbuf: ch);
534	ch[charlen] = `'\0'`;
535
536	return g_strstr_len (haystack: p, haystack_len: len, needle: ch);
537	}
538
539
540	/**
541	* g_utf8_strrchr:
542	* @p: a nul-terminated UTF-8 encoded string
543	* @len: the maximum length of @p
544	* @c: a Unicode character
545	*
546	* Find the rightmost occurrence of the given Unicode character
547	* in a UTF-8 encoded string, while limiting the search to @len bytes.
548	* If @len is -1, allow unbounded search.
549	*
550	* Returns: (transfer none) (nullable): %NULL if the string does not contain the character,
551	* otherwise, a pointer to the start of the rightmost occurrence
552	* of the character in the string.
553	*/
554	gchar *
555	g_utf8_strrchr (const char *p,
556	gssize len,
557	gunichar c)
558	{
559	gchar ch[`10`];
560
561	gint charlen = g_unichar_to_utf8 (c, outbuf: ch);
562	ch[charlen] = `'\0'`;
563
564	return g_strrstr_len (haystack: p, haystack_len: len, needle: ch);
565	}
566
567
568	/ Like g_utf8_get_char, but take a maximum length*
569	* and return (gunichar)-2 on incomplete trailing character;
570	* also check for malformed or overlong sequences
571	* and return (gunichar)-1 in this case.
572	*/
573	static inline gunichar
574	g_utf8_get_char_extended (const gchar *p,
575	gssize max_len)
576	{
577	guint i, len;
578	gunichar min_code;
579	gunichar wc = (guchar) *p;
580	const gunichar partial_sequence = (gunichar) -`2`;
581	const gunichar malformed_sequence = (gunichar) -`1`;
582
583	if (wc < `0x80`)
584	{
585	return wc;
586	}
587	else if (G_UNLIKELY (wc < `0xc0`))
588	{
589	return malformed_sequence;
590	}
591	else if (wc < `0xe0`)
592	{
593	len = `2`;
594	wc &= `0x1f`;
595	min_code = `1` << `7`;
596	}
597	else if (wc < `0xf0`)
598	{
599	len = `3`;
600	wc &= `0x0f`;
601	min_code = `1` << `11`;
602	}
603	else if (wc < `0xf8`)
604	{
605	len = `4`;
606	wc &= `0x07`;
607	min_code = `1` << `16`;
608	}
609	else if (wc < `0xfc`)
610	{
611	len = `5`;
612	wc &= `0x03`;
613	min_code = `1` << `21`;
614	}
615	else if (wc < `0xfe`)
616	{
617	len = `6`;
618	wc &= `0x01`;
619	min_code = `1` << `26`;
620	}
621	else
622	{
623	return malformed_sequence;
624	}
625
626	if (G_UNLIKELY (max_len >= `0` && len > max_len))
627	{
628	for (i = `1`; i < max_len; i++)
629	{
630	if ((((guchar *)p)[i] & `0xc0`) != `0x80`)
631	return malformed_sequence;
632	}
633	return partial_sequence;
634	}
635
636	for (i = `1`; i < len; ++i)
637	{
638	gunichar ch = ((guchar *)p)[i];
639
640	if (G_UNLIKELY ((ch & `0xc0`) != `0x80`))
641	{
642	if (ch)
643	return malformed_sequence;
644	else
645	return partial_sequence;
646	}
647
648	wc <<= `6`;
649	wc \|= (ch & `0x3f`);
650	}
651
652	if (G_UNLIKELY (wc < min_code))
653	return malformed_sequence;
654
655	return wc;
656	}
657
658	/**
659	* g_utf8_get_char_validated:
660	* @p: a pointer to Unicode character encoded as UTF-8
661	* @max_len: the maximum number of bytes to read, or -1 if @p is nul-terminated
662	*
663	* Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
664	* This function checks for incomplete characters, for invalid characters
665	* such as characters that are out of the range of Unicode, and for
666	* overlong encodings of valid characters.
667	*
668	* Note that g_utf8_get_char_validated() returns (gunichar)-2 if
669	* @max_len is positive and any of the bytes in the first UTF-8 character
670	* sequence are nul.
671	*
672	* Returns: the resulting character. If @p points to a partial
673	* sequence at the end of a string that could begin a valid
674	* character (or if @max_len is zero), returns (gunichar)-2;
675	* otherwise, if @p does not point to a valid UTF-8 encoded
676	* Unicode character, returns (gunichar)-1.
677	*/
678	gunichar
679	g_utf8_get_char_validated (const gchar *p,
680	gssize max_len)
681	{
682	gunichar result;
683
684	if (max_len == `0`)
685	return (gunichar)-`2`;
686
687	result = g_utf8_get_char_extended (p, max_len);
688
689	/ Disallow codepoint U+0000 as it’s a nul byte,*
690	* and all string handling in GLib is nul-terminated */
691	if (result == `0` && max_len > `0`)
692	return (gunichar) -`2`;
693
694	if (result & `0x80000000`)
695	return result;
696	else if (!UNICODE_VALID (result))
697	return (gunichar)-`1`;
698	else
699	return result;
700	}
701
702	#define CONT_BYTE_FAST(p) ((guchar)*p++ & 0x3f)
703
704	/**
705	* g_utf8_to_ucs4_fast:
706	* @str: a UTF-8 encoded string
707	* @len: the maximum length of @str to use, in bytes. If @len < 0,
708	* then the string is nul-terminated.
709	* @items_written: (out caller-allocates) (optional): location to store the
710	* number of characters in the result, or %NULL.
711	*
712	* Convert a string from UTF-8 to a 32-bit fixed width
713	* representation as UCS-4, assuming valid UTF-8 input.
714	* This function is roughly twice as fast as g_utf8_to_ucs4()
715	* but does no error checking on the input. A trailing 0 character
716	* will be added to the string after the converted text.
717	*
718	* Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
719	* This value must be freed with g_free().
720	*/
721	gunichar *
722	g_utf8_to_ucs4_fast (const gchar *str,
723	glong len,
724	glong *items_written)
725	{
726	gunichar *result;
727	gint n_chars, i;
728	const gchar *p;
729
730	g_return_val_if_fail (str != NULL, NULL);
731
732	p = str;
733	n_chars = `0`;
734	if (len < `0`)
735	{
736	while (*p)
737	{
738	p = g_utf8_next_char (p);
739	++n_chars;
740	}
741	}
742	else
743	{
744	while (p < str + len && *p)
745	{
746	p = g_utf8_next_char (p);
747	++n_chars;
748	}
749	}
750
751	result = g_new (gunichar, n_chars + `1`);
752
753	p = str;
754	for (i=`0`; i < n_chars; i++)
755	{
756	guchar first = (guchar)*p++;
757	gunichar wc;
758
759	if (first < `0xc0`)
760	{
761	/ We really hope first < 0x80, but we don't want to test an*
762	* extra branch for invalid input, which this function
763	* does not care about. Handling unexpected continuation bytes
764	* here will do the least damage. */
765	wc = first;
766	}
767	else
768	{
769	gunichar c1 = CONT_BYTE_FAST(p);
770	if (first < `0xe0`)
771	{
772	wc = ((first & `0x1f`) << `6`) \| c1;
773	}
774	else
775	{
776	gunichar c2 = CONT_BYTE_FAST(p);
777	if (first < `0xf0`)
778	{
779	wc = ((first & `0x0f`) << `12`) \| (c1 << `6`) \| c2;
780	}
781	else
782	{
783	gunichar c3 = CONT_BYTE_FAST(p);
784	wc = ((first & `0x07`) << `18`) \| (c1 << `12`) \| (c2 << `6`) \| c3;
785	if (G_UNLIKELY (first >= `0xf8`))
786	{
787	/ This can't be valid UTF-8, but g_utf8_next_char()*
788	* and company allow out-of-range sequences */
789	gunichar mask = `1` << `20`;
790	while ((wc & mask) != `0`)
791	{
792	wc <<= `6`;
793	wc \|= CONT_BYTE_FAST(p);
794	mask <<= `5`;
795	}
796	wc &= mask - `1`;
797	}
798	}
799	}
800	}
801	result[i] = wc;
802	}
803	result[i] = `0`;
804
805	if (items_written)
806	*items_written = i;
807
808	return result;
809	}
810
811	static gpointer
812	try_malloc_n (gsize n_blocks, gsize n_block_bytes, GError **error)
813	{
814	gpointer ptr = g_try_malloc_n (n_blocks, n_block_bytes);
815	if (ptr == NULL)
816	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_NO_MEMORY,
817	_("Failed to allocate memory"));
818	return ptr;
819	}
820
821	/**
822	* g_utf8_to_ucs4:
823	* @str: a UTF-8 encoded string
824	* @len: the maximum length of @str to use, in bytes. If @len < 0,
825	* then the string is nul-terminated.
826	* @items_read: (out caller-allocates) (optional): location to store number of
827	* bytes read, or %NULL.
828	* If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
829	* returned in case @str contains a trailing partial
830	* character. If an error occurs then the index of the
831	* invalid input is stored here.
832	* @items_written: (out caller-allocates) (optional): location to store number
833	* of characters written or %NULL. The value here stored does not include
834	* the trailing 0 character.
835	* @error: location to store the error occurring, or %NULL to ignore
836	* errors. Any of the errors in #GConvertError other than
837	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
838	*
839	* Convert a string from UTF-8 to a 32-bit fixed width
840	* representation as UCS-4. A trailing 0 character will be added to the
841	* string after the converted text.
842	*
843	* Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
844	* This value must be freed with g_free(). If an error occurs,
845	* %NULL will be returned and @error set.
846	*/
847	gunichar *
848	g_utf8_to_ucs4 (const gchar *str,
849	glong len,
850	glong *items_read,
851	glong *items_written,
852	GError **error)
853	{
854	gunichar *result = NULL;
855	gint n_chars, i;
856	const gchar *in;
857
858	in = str;
859	n_chars = `0`;
860	while ((len < `0` \|\| str + len - in > `0`) && *in)
861	{
862	gunichar wc = g_utf8_get_char_extended (p: in, max_len: len < `0` ? `6` : str + len - in);
863	if (wc & `0x80000000`)
864	{
865	if (wc == (gunichar)-`2`)
866	{
867	if (items_read)
868	break;
869	else
870	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT,
871	_("Partial character sequence at end of input"));
872	}
873	else
874	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
875	_("Invalid byte sequence in conversion input"));
876
877	goto err_out;
878	}
879
880	n_chars++;
881
882	in = g_utf8_next_char (in);
883	}
884
885	result = try_malloc_n (n_blocks: n_chars + `1`, n_block_bytes: sizeof (gunichar), error);
886	if (result == NULL)
887	goto err_out;
888
889	in = str;
890	for (i=`0`; i < n_chars; i++)
891	{
892	result[i] = g_utf8_get_char (p: in);
893	in = g_utf8_next_char (in);
894	}
895	result[i] = `0`;
896
897	if (items_written)
898	*items_written = n_chars;
899
900	err_out:
901	if (items_read)
902	*items_read = in - str;
903
904	return result;
905	}
906
907	/**
908	* g_ucs4_to_utf8:
909	* @str: a UCS-4 encoded string
910	* @len: the maximum length (number of characters) of @str to use.
911	* If @len < 0, then the string is nul-terminated.
912	* @items_read: (out caller-allocates) (optional): location to store number of
913	* characters read, or %NULL.
914	* @items_written: (out caller-allocates) (optional): location to store number
915	* of bytes written or %NULL. The value here stored does not include the
916	* trailing 0 byte.
917	* @error: location to store the error occurring, or %NULL to ignore
918	* errors. Any of the errors in #GConvertError other than
919	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
920	*
921	* Convert a string from a 32-bit fixed width representation as UCS-4.
922	* to UTF-8. The result will be terminated with a 0 byte.
923	*
924	* Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
925	* This value must be freed with g_free(). If an error occurs,
926	* %NULL will be returned and @error set. In that case, @items_read
927	* will be set to the position of the first invalid input character.
928	*/
929	gchar *
930	g_ucs4_to_utf8 (const gunichar *str,
931	glong len,
932	glong *items_read,
933	glong *items_written,
934	GError **error)
935	{
936	gint result_length;
937	gchar *result = NULL;
938	gchar *p;
939	gint i;
940
941	result_length = `0`;
942	for (i = `0`; len < `0` \|\| i < len ; i++)
943	{
944	if (!str[i])
945	break;
946
947	if (str[i] >= `0x80000000`)
948	{
949	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
950	_("Character out of range for UTF-8"));
951	goto err_out;
952	}
953
954	result_length += UTF8_LENGTH (str[i]);
955	}
956
957	result = try_malloc_n (n_blocks: result_length + `1`, n_block_bytes: `1`, error);
958	if (result == NULL)
959	goto err_out;
960
961	p = result;
962
963	i = `0`;
964	while (p < result + result_length)
965	p += g_unichar_to_utf8 (c: str[i++], outbuf: p);
966
967	*p = `'\0'`;
968
969	if (items_written)
970	*items_written = p - result;
971
972	err_out:
973	if (items_read)
974	*items_read = i;
975
976	return result;
977	}
978
979	#define SURROGATE_VALUE(h,l) (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
980
981	/**
982	* g_utf16_to_utf8:
983	* @str: a UTF-16 encoded string
984	* @len: the maximum length (number of #gunichar2) of @str to use.
985	* If @len < 0, then the string is nul-terminated.
986	* @items_read: (out caller-allocates) (optional): location to store number of
987	* words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
988	* be returned in case @str contains a trailing partial character. If
989	* an error occurs then the index of the invalid input is stored here.
990	* @items_written: (out caller-allocates) (optional): location to store number
991	* of bytes written, or %NULL. The value stored here does not include the
992	* trailing 0 byte.
993	* @error: location to store the error occurring, or %NULL to ignore
994	* errors. Any of the errors in #GConvertError other than
995	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
996	*
997	* Convert a string from UTF-16 to UTF-8. The result will be
998	* terminated with a 0 byte.
999	*
1000	* Note that the input is expected to be already in native endianness,
1001	* an initial byte-order-mark character is not handled specially.
1002	* g_convert() can be used to convert a byte buffer of UTF-16 data of
1003	* ambiguous endianness.
1004	*
1005	* Further note that this function does not validate the result
1006	* string; it may e.g. include embedded NUL characters. The only
1007	* validation done by this function is to ensure that the input can
1008	* be correctly interpreted as UTF-16, i.e. it doesn't contain
1009	* unpaired surrogates or partial character sequences.
1010	*
1011	* Returns: (transfer full): a pointer to a newly allocated UTF-8 string.
1012	* This value must be freed with g_free(). If an error occurs,
1013	* %NULL will be returned and @error set.
1014	**/
1015	gchar *
1016	g_utf16_to_utf8 (const gunichar2 *str,
1017	glong len,
1018	glong *items_read,
1019	glong *items_written,
1020	GError **error)
1021	{
1022	/ This function and g_utf16_to_ucs4 are almost exactly identical -*
1023	* The lines that differ are marked.
1024	*/
1025	const gunichar2 *in;
1026	gchar *out;
1027	gchar *result = NULL;
1028	gint n_bytes;
1029	gunichar high_surrogate;
1030
1031	g_return_val_if_fail (str != NULL, NULL);
1032
1033	n_bytes = `0`;
1034	in = str;
1035	high_surrogate = `0`;
1036	while ((len < `0` \|\| in - str < len) && *in)
1037	{
1038	gunichar2 c = *in;
1039	gunichar wc;
1040
1041	if (c >= `0xdc00` && c < `0xe000`) / low surrogate /
1042	{
1043	if (high_surrogate)
1044	{
1045	wc = SURROGATE_VALUE (high_surrogate, c);
1046	high_surrogate = `0`;
1047	}
1048	else
1049	{
1050	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1051	_("Invalid sequence in conversion input"));
1052	goto err_out;
1053	}
1054	}
1055	else
1056	{
1057	if (high_surrogate)
1058	{
1059	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1060	_("Invalid sequence in conversion input"));
1061	goto err_out;
1062	}
1063
1064	if (c >= `0xd800` && c < `0xdc00`) / high surrogate /
1065	{
1066	high_surrogate = c;
1067	goto next1;
1068	}
1069	else
1070	wc = c;
1071	}
1072
1073	/******* DIFFERENT for UTF8/UCS4 *******/
1074	n_bytes += UTF8_LENGTH (wc);
1075
1076	next1:
1077	in++;
1078	}
1079
1080	if (high_surrogate && !items_read)
1081	{
1082	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT,
1083	_("Partial character sequence at end of input"));
1084	goto err_out;
1085	}
1086
1087	/ At this point, everything is valid, and we just need to convert*
1088	*/
1089	/******* DIFFERENT for UTF8/UCS4 *******/
1090	result = try_malloc_n (n_blocks: n_bytes + `1`, n_block_bytes: `1`, error);
1091	if (result == NULL)
1092	goto err_out;
1093
1094	high_surrogate = `0`;
1095	out = result;
1096	in = str;
1097	while (out < result + n_bytes)
1098	{
1099	gunichar2 c = *in;
1100	gunichar wc;
1101
1102	if (c >= `0xdc00` && c < `0xe000`) / low surrogate /
1103	{
1104	wc = SURROGATE_VALUE (high_surrogate, c);
1105	high_surrogate = `0`;
1106	}
1107	else if (c >= `0xd800` && c < `0xdc00`) / high surrogate /
1108	{
1109	high_surrogate = c;
1110	goto next2;
1111	}
1112	else
1113	wc = c;
1114
1115	/******* DIFFERENT for UTF8/UCS4 *******/
1116	out += g_unichar_to_utf8 (c: wc, outbuf: out);
1117
1118	next2:
1119	in++;
1120	}
1121
1122	/******* DIFFERENT for UTF8/UCS4 *******/
1123	*out = `'\0'`;
1124
1125	if (items_written)
1126	/******* DIFFERENT for UTF8/UCS4 *******/
1127	*items_written = out - result;
1128
1129	err_out:
1130	if (items_read)
1131	*items_read = in - str;
1132
1133	return result;
1134	}
1135
1136	/**
1137	* g_utf16_to_ucs4:
1138	* @str: a UTF-16 encoded string
1139	* @len: the maximum length (number of #gunichar2) of @str to use.
1140	* If @len < 0, then the string is nul-terminated.
1141	* @items_read: (out caller-allocates) (optional): location to store number of
1142	* words read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
1143	* be returned in case @str contains a trailing partial character. If
1144	* an error occurs then the index of the invalid input is stored here.
1145	* @items_written: (out caller-allocates) (optional): location to store number
1146	* of characters written, or %NULL. The value stored here does not include
1147	* the trailing 0 character.
1148	* @error: location to store the error occurring, or %NULL to ignore
1149	* errors. Any of the errors in #GConvertError other than
1150	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
1151	*
1152	* Convert a string from UTF-16 to UCS-4. The result will be
1153	* nul-terminated.
1154	*
1155	* Returns: (transfer full): a pointer to a newly allocated UCS-4 string.
1156	* This value must be freed with g_free(). If an error occurs,
1157	* %NULL will be returned and @error set.
1158	*/
1159	gunichar *
1160	g_utf16_to_ucs4 (const gunichar2 *str,
1161	glong len,
1162	glong *items_read,
1163	glong *items_written,
1164	GError **error)
1165	{
1166	const gunichar2 *in;
1167	gchar *out;
1168	gchar *result = NULL;
1169	gint n_bytes;
1170	gunichar high_surrogate;
1171
1172	g_return_val_if_fail (str != NULL, NULL);
1173
1174	n_bytes = `0`;
1175	in = str;
1176	high_surrogate = `0`;
1177	while ((len < `0` \|\| in - str < len) && *in)
1178	{
1179	gunichar2 c = *in;
1180
1181	if (c >= `0xdc00` && c < `0xe000`) / low surrogate /
1182	{
1183	if (high_surrogate)
1184	{
1185	high_surrogate = `0`;
1186	}
1187	else
1188	{
1189	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1190	_("Invalid sequence in conversion input"));
1191	goto err_out;
1192	}
1193	}
1194	else
1195	{
1196	if (high_surrogate)
1197	{
1198	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1199	_("Invalid sequence in conversion input"));
1200	goto err_out;
1201	}
1202
1203	if (c >= `0xd800` && c < `0xdc00`) / high surrogate /
1204	{
1205	high_surrogate = c;
1206	goto next1;
1207	}
1208	}
1209
1210	/******* DIFFERENT for UTF8/UCS4 *******/
1211	n_bytes += sizeof (gunichar);
1212
1213	next1:
1214	in++;
1215	}
1216
1217	if (high_surrogate && !items_read)
1218	{
1219	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT,
1220	_("Partial character sequence at end of input"));
1221	goto err_out;
1222	}
1223
1224	/ At this point, everything is valid, and we just need to convert*
1225	*/
1226	/******* DIFFERENT for UTF8/UCS4 *******/
1227	result = try_malloc_n (n_blocks: n_bytes + `4`, n_block_bytes: `1`, error);
1228	if (result == NULL)
1229	goto err_out;
1230
1231	high_surrogate = `0`;
1232	out = result;
1233	in = str;
1234	while (out < result + n_bytes)
1235	{
1236	gunichar2 c = *in;
1237	gunichar wc;
1238
1239	if (c >= `0xdc00` && c < `0xe000`) / low surrogate /
1240	{
1241	wc = SURROGATE_VALUE (high_surrogate, c);
1242	high_surrogate = `0`;
1243	}
1244	else if (c >= `0xd800` && c < `0xdc00`) / high surrogate /
1245	{
1246	high_surrogate = c;
1247	goto next2;
1248	}
1249	else
1250	wc = c;
1251
1252	/******* DIFFERENT for UTF8/UCS4 *******/
1253	(gunichar )out = wc;
1254	out += sizeof (gunichar);
1255
1256	next2:
1257	in++;
1258	}
1259
1260	/******* DIFFERENT for UTF8/UCS4 *******/
1261	(gunichar )out = `0`;
1262
1263	if (items_written)
1264	/******* DIFFERENT for UTF8/UCS4 *******/
1265	items_written = (out - result) / sizeof* (gunichar);
1266
1267	err_out:
1268	if (items_read)
1269	*items_read = in - str;
1270
1271	return (gunichar *)result;
1272	}
1273
1274	/**
1275	* g_utf8_to_utf16:
1276	* @str: a UTF-8 encoded string
1277	* @len: the maximum length (number of bytes) of @str to use.
1278	* If @len < 0, then the string is nul-terminated.
1279	* @items_read: (out caller-allocates) (optional): location to store number of
1280	* bytes read, or %NULL. If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will
1281	* be returned in case @str contains a trailing partial character. If
1282	* an error occurs then the index of the invalid input is stored here.
1283	* @items_written: (out caller-allocates) (optional): location to store number
1284	* of #gunichar2 written, or %NULL. The value stored here does not include
1285	* the trailing 0.
1286	* @error: location to store the error occurring, or %NULL to ignore
1287	* errors. Any of the errors in #GConvertError other than
1288	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
1289	*
1290	* Convert a string from UTF-8 to UTF-16. A 0 character will be
1291	* added to the result after the converted text.
1292	*
1293	* Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
1294	* This value must be freed with g_free(). If an error occurs,
1295	* %NULL will be returned and @error set.
1296	*/
1297	gunichar2 *
1298	g_utf8_to_utf16 (const gchar *str,
1299	glong len,
1300	glong *items_read,
1301	glong *items_written,
1302	GError **error)
1303	{
1304	gunichar2 *result = NULL;
1305	gint n16;
1306	const gchar *in;
1307	gint i;
1308
1309	g_return_val_if_fail (str != NULL, NULL);
1310
1311	in = str;
1312	n16 = `0`;
1313	while ((len < `0` \|\| str + len - in > `0`) && *in)
1314	{
1315	gunichar wc = g_utf8_get_char_extended (p: in, max_len: len < `0` ? `6` : str + len - in);
1316	if (wc & `0x80000000`)
1317	{
1318	if (wc == (gunichar)-`2`)
1319	{
1320	if (items_read)
1321	break;
1322	else
1323	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_PARTIAL_INPUT,
1324	_("Partial character sequence at end of input"));
1325	}
1326	else
1327	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1328	_("Invalid byte sequence in conversion input"));
1329
1330	goto err_out;
1331	}
1332
1333	if (wc < `0xd800`)
1334	n16 += `1`;
1335	else if (wc < `0xe000`)
1336	{
1337	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1338	_("Invalid sequence in conversion input"));
1339
1340	goto err_out;
1341	}
1342	else if (wc < `0x10000`)
1343	n16 += `1`;
1344	else if (wc < `0x110000`)
1345	n16 += `2`;
1346	else
1347	{
1348	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1349	_("Character out of range for UTF-16"));
1350
1351	goto err_out;
1352	}
1353
1354	in = g_utf8_next_char (in);
1355	}
1356
1357	result = try_malloc_n (n_blocks: n16 + `1`, n_block_bytes: sizeof (gunichar2), error);
1358	if (result == NULL)
1359	goto err_out;
1360
1361	in = str;
1362	for (i = `0`; i < n16;)
1363	{
1364	gunichar wc = g_utf8_get_char (p: in);
1365
1366	if (wc < `0x10000`)
1367	{
1368	result[i++] = wc;
1369	}
1370	else
1371	{
1372	result[i++] = (wc - `0x10000`) / `0x400` + `0xd800`;
1373	result[i++] = (wc - `0x10000`) % `0x400` + `0xdc00`;
1374	}
1375
1376	in = g_utf8_next_char (in);
1377	}
1378
1379	result[i] = `0`;
1380
1381	if (items_written)
1382	*items_written = n16;
1383
1384	err_out:
1385	if (items_read)
1386	*items_read = in - str;
1387
1388	return result;
1389	}
1390
1391	/**
1392	* g_ucs4_to_utf16:
1393	* @str: a UCS-4 encoded string
1394	* @len: the maximum length (number of characters) of @str to use.
1395	* If @len < 0, then the string is nul-terminated.
1396	* @items_read: (out caller-allocates) (optional): location to store number of
1397	* bytes read, or %NULL. If an error occurs then the index of the invalid
1398	* input is stored here.
1399	* @items_written: (out caller-allocates) (optional): location to store number
1400	* of #gunichar2 written, or %NULL. The value stored here does not include
1401	* the trailing 0.
1402	* @error: location to store the error occurring, or %NULL to ignore
1403	* errors. Any of the errors in #GConvertError other than
1404	* %G_CONVERT_ERROR_NO_CONVERSION may occur.
1405	*
1406	* Convert a string from UCS-4 to UTF-16. A 0 character will be
1407	* added to the result after the converted text.
1408	*
1409	* Returns: (transfer full): a pointer to a newly allocated UTF-16 string.
1410	* This value must be freed with g_free(). If an error occurs,
1411	* %NULL will be returned and @error set.
1412	*/
1413	gunichar2 *
1414	g_ucs4_to_utf16 (const gunichar *str,
1415	glong len,
1416	glong *items_read,
1417	glong *items_written,
1418	GError **error)
1419	{
1420	gunichar2 *result = NULL;
1421	gint n16;
1422	gint i, j;
1423
1424	n16 = `0`;
1425	i = `0`;
1426	while ((len < `0` \|\| i < len) && str[i])
1427	{
1428	gunichar wc = str[i];
1429
1430	if (wc < `0xd800`)
1431	n16 += `1`;
1432	else if (wc < `0xe000`)
1433	{
1434	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1435	_("Invalid sequence in conversion input"));
1436
1437	goto err_out;
1438	}
1439	else if (wc < `0x10000`)
1440	n16 += `1`;
1441	else if (wc < `0x110000`)
1442	n16 += `2`;
1443	else
1444	{
1445	g_set_error_literal (err: error, G_CONVERT_ERROR, code: G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1446	_("Character out of range for UTF-16"));
1447
1448	goto err_out;
1449	}
1450
1451	i++;
1452	}
1453
1454	result = try_malloc_n (n_blocks: n16 + `1`, n_block_bytes: sizeof (gunichar2), error);
1455	if (result == NULL)
1456	goto err_out;
1457
1458	for (i = `0`, j = `0`; j < n16; i++)
1459	{
1460	gunichar wc = str[i];
1461
1462	if (wc < `0x10000`)
1463	{
1464	result[j++] = wc;
1465	}
1466	else
1467	{
1468	result[j++] = (wc - `0x10000`) / `0x400` + `0xd800`;
1469	result[j++] = (wc - `0x10000`) % `0x400` + `0xdc00`;
1470	}
1471	}
1472	result[j] = `0`;
1473
1474	if (items_written)
1475	*items_written = n16;
1476
1477	err_out:
1478	if (items_read)
1479	*items_read = i;
1480
1481	return result;
1482	}
1483
/* Tests the byte at the cursor `p` of the function the macro is expanded in:
 * when (byte & mask) != expect, control jumps to that function's `error`
 * label.  Both `p` and `error` must be in scope at the expansion site. */
#define VALIDATE_BYTE(mask, expect)                      \
  G_STMT_START {                                         \
    if (G_UNLIKELY((*(guchar *)p & (mask)) != (expect))) \
      goto error;                                        \
  } G_STMT_END
1489
1490	/ see IETF RFC 3629 Section 4 /
1491
1492	static const gchar *
1493	fast_validate (const char *str)
1494
1495	{
1496	const gchar *p;
1497
1498	for (p = str; *p; p++)
1499	{
1500	if ((guchar )p < `128`)
1501	/ done /;
1502	else
1503	{
1504	const gchar *last;
1505
1506	last = p;
1507	if ((guchar )p < `0xe0`) / 110xxxxx /
1508	{
1509	if (G_UNLIKELY ((guchar )p < `0xc2`))
1510	goto error;
1511	}
1512	else
1513	{
1514	if ((guchar )p < `0xf0`) / 1110xxxx /
1515	{
1516	switch ((guchar )p++ & `0x0f`)
1517	{
1518	case `0`:
1519	VALIDATE_BYTE(`0xe0`, `0xa0`); / 0xa0 ... 0xbf /
1520	break;
1521	case `0x0d`:
1522	VALIDATE_BYTE(`0xe0`, `0x80`); / 0x80 ... 0x9f /
1523	break;
1524	default:
1525	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1526	}
1527	}
1528	else if ((guchar )p < `0xf5`) / 11110xxx excluding out-of-range /
1529	{
1530	switch ((guchar )p++ & `0x07`)
1531	{
1532	case `0`:
1533	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1534	if (G_UNLIKELY(((guchar )p & `0x30`) == `0`))
1535	goto error;
1536	break;
1537	case `4`:
1538	VALIDATE_BYTE(`0xf0`, `0x80`); / 0x80 ... 0x8f /
1539	break;
1540	default:
1541	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1542	}
1543	p++;
1544	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1545	}
1546	else
1547	goto error;
1548	}
1549
1550	p++;
1551	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1552
1553	continue;
1554
1555	error:
1556	return last;
1557	}
1558	}
1559
1560	return p;
1561	}
1562
1563	static const gchar *
1564	fast_validate_len (const char *str,
1565	gssize max_len)
1566
1567	{
1568	const gchar *p;
1569
1570	g_assert (max_len >= `0`);
1571
1572	for (p = str; ((p - str) < max_len) && *p; p++)
1573	{
1574	if ((guchar )p < `128`)
1575	/ done /;
1576	else
1577	{
1578	const gchar *last;
1579
1580	last = p;
1581	if ((guchar )p < `0xe0`) / 110xxxxx /
1582	{
1583	if (G_UNLIKELY (max_len - (p - str) < `2`))
1584	goto error;
1585
1586	if (G_UNLIKELY ((guchar )p < `0xc2`))
1587	goto error;
1588	}
1589	else
1590	{
1591	if ((guchar )p < `0xf0`) / 1110xxxx /
1592	{
1593	if (G_UNLIKELY (max_len - (p - str) < `3`))
1594	goto error;
1595
1596	switch ((guchar )p++ & `0x0f`)
1597	{
1598	case `0`:
1599	VALIDATE_BYTE(`0xe0`, `0xa0`); / 0xa0 ... 0xbf /
1600	break;
1601	case `0x0d`:
1602	VALIDATE_BYTE(`0xe0`, `0x80`); / 0x80 ... 0x9f /
1603	break;
1604	default:
1605	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1606	}
1607	}
1608	else if ((guchar )p < `0xf5`) / 11110xxx excluding out-of-range /
1609	{
1610	if (G_UNLIKELY (max_len - (p - str) < `4`))
1611	goto error;
1612
1613	switch ((guchar )p++ & `0x07`)
1614	{
1615	case `0`:
1616	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1617	if (G_UNLIKELY(((guchar )p & `0x30`) == `0`))
1618	goto error;
1619	break;
1620	case `4`:
1621	VALIDATE_BYTE(`0xf0`, `0x80`); / 0x80 ... 0x8f /
1622	break;
1623	default:
1624	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1625	}
1626	p++;
1627	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1628	}
1629	else
1630	goto error;
1631	}
1632
1633	p++;
1634	VALIDATE_BYTE(`0xc0`, `0x80`); / 10xxxxxx /
1635
1636	continue;
1637
1638	error:
1639	return last;
1640	}
1641	}
1642
1643	return p;
1644	}
1645
1646	/**
1647	* g_utf8_validate:
1648	* @str: (array length=max_len) (element-type guint8): a pointer to character data
1649	* @max_len: max bytes to validate, or -1 to go until NUL
1650	* @end: (out) (optional) (transfer none): return location for end of valid data
1651	*
1652	* Validates UTF-8 encoded text. @str is the text to validate;
1653	* if @str is nul-terminated, then @max_len can be -1, otherwise
1654	* @max_len should be the number of bytes to validate.
1655	* If @end is non-%NULL, then the end of the valid range
1656	* will be stored there (i.e. the start of the first invalid
1657	* character if some bytes were invalid, or the end of the text
1658	* being validated otherwise).
1659	*
1660	* Note that g_utf8_validate() returns %FALSE if @max_len is
1661	* positive and any of the @max_len bytes are nul.
1662	*
1663	* Returns %TRUE if all of @str was valid. Many GLib and GTK+
1664	* routines require valid UTF-8 as input; so data read from a file
1665	* or the network should be checked with g_utf8_validate() before
1666	* doing anything else with it.
1667	*
1668	* Returns: %TRUE if the text was valid UTF-8
1669	*/
1670	gboolean
1671	g_utf8_validate (const char *str,
1672	gssize max_len,
1673	const gchar **end)
1674
1675	{
1676	const gchar *p;
1677
1678	if (max_len >= `0`)
1679	return g_utf8_validate_len (str, max_len, end);
1680
1681	p = fast_validate (str);
1682
1683	if (end)
1684	*end = p;
1685
1686	if (*p != `'\0'`)
1687	return FALSE;
1688	else
1689	return TRUE;
1690	}
1691
1692	/**
1693	* g_utf8_validate_len:
1694	* @str: (array length=max_len) (element-type guint8): a pointer to character data
1695	* @max_len: max bytes to validate
1696	* @end: (out) (optional) (transfer none): return location for end of valid data
1697	*
1698	* Validates UTF-8 encoded text.
1699	*
1700	* As with g_utf8_validate(), but @max_len must be set, and hence this function
1701	* will always return %FALSE if any of the bytes of @str are nul.
1702	*
1703	* Returns: %TRUE if the text was valid UTF-8
1704	* Since: 2.60
1705	*/
1706	gboolean
1707	g_utf8_validate_len (const char *str,
1708	gsize max_len,
1709	const gchar **end)
1710
1711	{
1712	const gchar *p;
1713
1714	p = fast_validate_len (str, max_len);
1715
1716	if (end)
1717	*end = p;
1718
1719	if (p != str + max_len)
1720	return FALSE;
1721	else
1722	return TRUE;
1723	}
1724
1725	/**
1726	* g_unichar_validate:
1727	* @ch: a Unicode character
1728	*
1729	* Checks whether @ch is a valid Unicode character. Some possible
1730	* integer values of @ch will not be valid. 0 is considered a valid
1731	* character, though it's normally a string terminator.
1732	*
1733	* Returns: %TRUE if @ch is a valid Unicode character
1734	**/
1735	gboolean
1736	g_unichar_validate (gunichar ch)
1737	{
1738	return UNICODE_VALID (ch);
1739	}
1740
1741	/**
1742	* g_utf8_strreverse:
1743	* @str: a UTF-8 encoded string
1744	* @len: the maximum length of @str to use, in bytes. If @len < 0,
1745	* then the string is nul-terminated.
1746	*
1747	* Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
1748	* (Use g_utf8_validate() on all text before trying to use UTF-8
1749	* utility functions with it.)
1750	*
1751	* This function is intended for programmatic uses of reversed strings.
1752	* It pays no attention to decomposed characters, combining marks, byte
1753	* order marks, directional indicators (LRM, LRO, etc) and similar
1754	* characters which might need special handling when reversing a string
1755	* for display purposes.
1756	*
1757	* Note that unlike g_strreverse(), this function returns
1758	* newly-allocated memory, which should be freed with g_free() when
1759	* no longer needed.
1760	*
1761	* Returns: (transfer full): a newly-allocated string which is the reverse of @str
1762	*
1763	* Since: 2.2
1764	*/
1765	gchar *
1766	g_utf8_strreverse (const gchar *str,
1767	gssize len)
1768	{
1769	gchar r, result;
1770	const gchar *p;
1771
1772	if (len < `0`)
1773	len = strlen (s: str);
1774
1775	result = g_new (gchar, len + `1`);
1776	r = result + len;
1777	p = str;
1778	while (r > result)
1779	{
1780	gchar m, skip = g_utf8_skip[(guchar*) p];
1781	r -= skip;
1782	g_assert (r >= result);
1783	for (m = r; skip; skip--)
1784	m++ = p++;
1785	}
1786	result[len] = `0`;
1787
1788	return result;
1789	}
1790
1791	/**
1792	* g_utf8_make_valid:
1793	* @str: string to coerce into UTF-8
1794	* @len: the maximum length of @str to use, in bytes. If @len < 0,
1795	* then the string is nul-terminated.
1796	*
1797	* If the provided string is valid UTF-8, return a copy of it. If not,
1798	* return a copy in which bytes that could not be interpreted as valid Unicode
1799	* are replaced with the Unicode replacement character (U+FFFD).
1800	*
1801	* For example, this is an appropriate function to use if you have received
1802	* a string that was incorrectly declared to be UTF-8, and you need a valid
1803	* UTF-8 version of it that can be logged or displayed to the user, with the
1804	* assumption that it is close enough to ASCII or UTF-8 to be mostly
1805	* readable as-is.
1806	*
1807	* Returns: (transfer full): a valid UTF-8 string whose content resembles @str
1808	*
1809	* Since: 2.52
1810	*/
1811	gchar *
1812	g_utf8_make_valid (const gchar *str,
1813	gssize len)
1814	{
1815	GString *string;
1816	const gchar remainder, invalid;
1817	gsize remaining_bytes, valid_bytes;
1818
1819	g_return_val_if_fail (str != NULL, NULL);
1820
1821	if (len < `0`)
1822	len = strlen (s: str);
1823
1824	string = NULL;
1825	remainder = str;
1826	remaining_bytes = len;
1827
1828	while (remaining_bytes != `0`)
1829	{
1830	if (g_utf8_validate (str: remainder, max_len: remaining_bytes, end: &invalid))
1831	break;
1832	valid_bytes = invalid - remainder;
1833
1834	if (string == NULL)
1835	string = g_string_sized_new (dfl_size: remaining_bytes);
1836
1837	g_string_append_len (string, val: remainder, len: valid_bytes);
1838	/ append U+FFFD REPLACEMENT CHARACTER /
1839	g_string_append (string, val: "\357\277\275");
1840
1841	remaining_bytes -= valid_bytes + `1`;
1842	remainder = invalid + `1`;
1843	}
1844
1845	if (string == NULL)
1846	return g_strndup (str, n: len);
1847
1848	g_string_append_len (string, val: remainder, len: remaining_bytes);
1849	g_string_append_c (string, `'\0'`);
1850
1851	g_assert (g_utf8_validate (string->str, -`1`, NULL));
1852
1853	return g_string_free (string, FALSE);
1854	}
1855

source code of gtk/subprojects/glib/glib/gutf8.c