utf-7.c source code [glibc/iconvdata/utf-7.c]

/* Conversion module for UTF-7.
   Copyright (C) 2000-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* UTF-7 is a legacy encoding used for transmitting Unicode within the
   ASCII character set, used primarily by mail agents.  New programs
   are encouraged to use UTF-8 instead.

   UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642).  The
   original Base64 encoding is defined in RFC 2045.  */

#include <dlfcn.h>
#include <gconv.h>
#include <stdint.h>
#include <stdlib.h>


/* Define this to 1 if you want the so-called "optional direct" characters
      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
   to be encoded.  Define to 0 if you want them to be passed straight
   through, like the so-called "direct" characters.
   We set this to 1 because it's safer.

   The macro is consulted only by the UCS4 -> UTF-7 direction, where it
   selects between isdirect and isxdirect as the "may be copied
   verbatim" predicate.  */
#define UTF7_ENCODE_OPTIONAL_CHARS 1


/* The set of "direct characters":
   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr

   The set is stored as a 128-bit bitmap: bit (ch & 7) of byte (ch >> 3)
   is set iff the ASCII character ch belongs to the set.  */

static const unsigned char direct_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
  };

/* Return nonzero iff CH is a "direct character".  Any value of 128 or
   above is rejected before the table is indexed, so the lookup never
   reads outside direct_tab.  */
static int
isdirect (uint32_t ch)
{
  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
}


/* The set of "direct and optional direct characters":
   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }

   Same bitmap layout as the other character-class tables in this file:
   bit (ch & 7) of byte (ch >> 3) is set iff ch belongs to the set.
   Notably '+', '\' and '~' are not members.  */

static const unsigned char xdirect_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
  };

/* Return nonzero iff CH is a "direct" or "optional direct" character.
   Values of 128 or above are rejected before the table is indexed.  */
static int
isxdirect (uint32_t ch)
{
  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
}


/* The set of "extended base64 characters":
   A-Z a-z 0-9 + / -

   That is the base64 alphabet plus '-'.  Same bitmap layout as the
   other character-class tables in this file: bit (ch & 7) of byte
   (ch >> 3) is set iff ch belongs to the set.  */

static const unsigned char xbase64_tab[128 / 8] =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
  };

/* Return nonzero iff CH is an "extended base64 character".  Values of
   128 or above are rejected before the table is indexed.  */
static int
isxbase64 (uint32_t ch)
{
  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
}


/* Converts a value in the range 0..63 to a base64 encoded char.
   The mapping is 0..25 -> 'A'..'Z', 26..51 -> 'a'..'z',
   52..61 -> '0'..'9', 62 -> '+', 63 -> '/'.
   Any other argument is a programming error and aborts the process.  */
static unsigned char
base64 (unsigned int i)
{
  if (i < 26)
    return i + 'A';
  else if (i < 52)
    return i - 26 + 'a';
  else if (i < 62)
    return i - 52 + '0';
  else if (i == 62)
    return '+';
  else if (i == 63)
    return '/';
  else
    abort ();
}


/* Definitions used in the body of the `gconv' function.
   The FROM direction consumes UTF-7 bytes (1 to 6 per step); the TO
   side of the module works in 4-byte units, which the loop bodies
   read and write with get32/put32.  */
#define CHARSET_NAME		"UTF-7//"
#define DEFINE_INIT		1
#define DEFINE_FINI		1
#define FROM_LOOP		from_utf7_loop
#define TO_LOOP			to_utf7_loop
#define MIN_NEEDED_FROM		1
#define MAX_NEEDED_FROM		6
#define MIN_NEEDED_TO		4
#define MAX_NEEDED_TO		4
#define ONE_DIRECTION		0
/* Both loops need the conversion state, and a scratch copy of it so
   that SAVE_RESET_STATE can roll it back.  */
#define PREPARE_LOOP \
  mbstate_t saved_state;						      \
  mbstate_t *statep = data->__statep;
#define EXTRA_LOOP_ARGS		, statep


/* Since we might have to reset input pointer we must be able to save
   and restore the state.
   With a nonzero argument the current state (*statep) is copied into
   saved_state; with a zero argument the saved copy is written back.
   The expansion deliberately has no trailing semicolon: the user of
   the macro supplies it.  */
#define SAVE_RESET_STATE(Save) \
  if (Save)								      \
    saved_state = *statep;						      \
  else									      \
    *statep = saved_state


/* First define the conversion function from UTF-7 to UCS4.
   The state is structured as follows:
     __count bit 2..0: zero
     __count bit 8..3: shift
     __wch: data
   Precise meaning:
     shift      data
       0         --          not inside base64 encoding
     1..32  XX..XX00..00     inside base64, (32 - shift) bits pending
   This state layout is simpler than relying on STORE_REST/UNPACK_BYTES.

   When shift = 0, __wch needs to store at most one lookahead byte (see
   __GCONV_INCOMPLETE_INPUT below).

   Each base64 character contributes 6 bits, which are or-ed into the
   accumulator just below the bits already collected.  As soon as the
   top 16 bits hold a complete UTF-16 unit that is not a High Surrogate
   it is emitted and the accumulator is shifted up by 16; a High
   Surrogate is kept until the following Low Surrogate is complete too.
*/
#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT	MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_TO
#define LOOPFCT			FROM_LOOP
#define BODY \
  { \
    uint_fast8_t ch = *inptr; \
    \
    if ((statep->__count >> 3) == 0) \
      { \
        /* base64 encoding inactive.  */ \
        if (isxdirect (ch)) \
          { \
            inptr++; \
            put32 (outptr, ch); \
            outptr += 4; \
          } \
        else if (__glibc_likely (ch == '+')) \
          { \
            /* One byte of lookahead is needed to tell the escape "+-" \
               (a literal '+') from the start of a base64 run.  */ \
            if (__glibc_unlikely (inptr + 2 > inend)) \
              { \
                /* Not enough input available.  */ \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            if (inptr[1] == '-') \
              { \
                inptr += 2; \
                put32 (outptr, ch); \
                outptr += 4; \
              } \
            else \
              { \
                /* Switch into base64 mode.  */ \
                inptr++; \
                statep->__count = (32 << 3); \
                statep->__value.__wch = 0; \
              } \
          } \
        else \
          { \
            /* The input is invalid.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
      } \
    else \
      { \
        /* base64 encoding active.  */ \
        uint32_t i; \
        int shift; \
        \
        if (ch >= 'A' && ch <= 'Z') \
          i = ch - 'A'; \
        else if (ch >= 'a' && ch <= 'z') \
          i = ch - 'a' + 26; \
        else if (ch >= '0' && ch <= '9') \
          i = ch - '0' + 52; \
        else if (ch == '+') \
          i = 62; \
        else if (ch == '/') \
          i = 63; \
        else \
          { \
            /* Terminate base64 encoding.  */ \
            \
            /* If accumulated data is nonzero, the input is invalid.  */ \
            /* Also, partial UTF-16 characters are invalid.  */ \
            if (__builtin_expect (statep->__value.__wch != 0, 0) \
                || __builtin_expect ((statep->__count >> 3) <= 26, 0)) \
              { \
                STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1)); \
              } \
            \
            /* A terminating '-' is consumed; any other character is \
               left in place and handled by the next iteration in \
               non-base64 mode.  */ \
            if (ch == '-') \
              inptr++; \
            \
            statep->__count = 0; \
            continue; \
          } \
        \
        /* Concatenate the base64 integer i to the accumulator.  */ \
        shift = (statep->__count >> 3); \
        if (shift > 6) \
          { \
            uint32_t wch; \
            \
            shift -= 6; \
            wch = statep->__value.__wch | (i << shift); \
            \
            if (shift <= 16 && shift > 10) \
              { \
                /* An UTF-16 character has just been completed.  */ \
                uint32_t wc1 = wch >> 16; \
                \
                /* UTF-16: When we see a High Surrogate, we must also \
                   decode the following Low Surrogate.  */ \
                if (!(wc1 >= 0xd800 && wc1 < 0xdc00)) \
                  { \
                    wch = wch << 16; \
                    shift += 16; \
                    put32 (outptr, wc1); \
                    outptr += 4; \
                  } \
              } \
            else if (shift <= 10 && shift > 4) \
              { \
                /* After a High Surrogate, verify that the next 16 bit \
                   indeed form a Low Surrogate.  */ \
                uint32_t wc2 = wch & 0xffff; \
                \
                if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1)) \
                  { \
                    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1)); \
                  } \
              } \
            \
            statep->__value.__wch = wch; \
          } \
        else \
          { \
            /* An UTF-16 surrogate pair has just been completed.  */ \
            uint32_t wc1 = (uint32_t) statep->__value.__wch >> 16; \
            uint32_t wc2 = ((uint32_t) statep->__value.__wch & 0xffff) \
                           | (i >> (6 - shift)); \
            \
            /* The low bits of i that did not fit into the pair become \
               the top bits of the fresh accumulator.  */ \
            statep->__value.__wch = (i << shift) << 26; \
            shift += 26; \
            \
            assert (wc1 >= 0xd800 && wc1 < 0xdc00); \
            assert (wc2 >= 0xdc00 && wc2 < 0xe000); \
            put32 (outptr, \
                   0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00)); \
            outptr += 4; \
          } \
        \
        statep->__count = shift << 3; \
        \
        /* Now that we digested the input increment the input pointer.  */ \
        inptr++; \
      } \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, mbstate_t *statep
#include <iconv/loop.c>


/* Next, define the conversion from UCS4 to UTF-7.
   The state is structured as follows:
     __count bit 2..0: zero
     __count bit 4..3: shift
     __count bit 8..5: data
   Precise meaning:
     shift      data
       0         0           not inside base64 encoding
       1         0           inside base64, no pending bits
       2       XX00          inside base64, 2 bits known for next byte
       3       XXXX          inside base64, 4 bits known for next byte

   __count bit 2..0 and __wch are always zero, because this direction
   never returns __GCONV_INCOMPLETE_INPUT.

   Because the data field sits directly above the two shift bits,
   (__count >> 3) & ~3 is the pending bits already positioned as the
   high bits of the next 6-bit base64 value.
*/
#define MIN_NEEDED_INPUT	MIN_NEEDED_TO
#define MAX_NEEDED_INPUT	MAX_NEEDED_TO
#define MIN_NEEDED_OUTPUT	MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT	MAX_NEEDED_FROM
#define LOOPFCT			TO_LOOP
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
    \
    if ((statep->__count & 0x18) == 0) \
      { \
        /* base64 encoding inactive */ \
        if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \
          { \
            *outptr++ = (unsigned char) ch; \
          } \
        else \
          { \
            size_t count; \
            \
            /* Number of output bytes needed: "+-" for a literal '+', \
               otherwise '+' followed by the base64 characters that \
               can already be completed.  */ \
            if (ch == '+') \
              count = 2; \
            else if (ch < 0x10000) \
              count = 3; \
            else if (ch < 0x110000) \
              count = 6; \
            else \
              STANDARD_TO_LOOP_ERR_HANDLER (4); \
            \
            if (__glibc_unlikely (outptr + count > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            *outptr++ = '+'; \
            if (ch == '+') \
              *outptr++ = '-'; \
            else if (ch < 0x10000) \
              { \
                *outptr++ = base64 (ch >> 10); \
                *outptr++ = base64 ((ch >> 4) & 0x3f); \
                statep->__count = ((ch & 15) << 5) | (3 << 3); \
              } \
            else if (ch < 0x110000) \
              { \
                /* Split into a UTF-16 surrogate pair and treat the \
                   pair as one 32-bit quantity.  */ \
                uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \
                uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \
                \
                ch = (ch1 << 16) | ch2; \
                *outptr++ = base64 (ch >> 26); \
                *outptr++ = base64 ((ch >> 20) & 0x3f); \
                *outptr++ = base64 ((ch >> 14) & 0x3f); \
                *outptr++ = base64 ((ch >> 8) & 0x3f); \
                *outptr++ = base64 ((ch >> 2) & 0x3f); \
                statep->__count = ((ch & 3) << 7) | (2 << 3); \
              } \
            else \
              abort (); \
          } \
      } \
    else \
      { \
        /* base64 encoding active */ \
        if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \
          { \
            /* deactivate base64 encoding */ \
            size_t count; \
            \
            /* One byte to flush pending bits (if any), one '-' if the \
               character could otherwise be taken for part of the \
               base64 run, and the character itself.  */ \
            count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1; \
            if (__glibc_unlikely (outptr + count > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            if ((statep->__count & 0x18) >= 0x10) \
              *outptr++ = base64 ((statep->__count >> 3) & ~3); \
            if (isxbase64 (ch)) \
              *outptr++ = '-'; \
            *outptr++ = (unsigned char) ch; \
            statep->__count = 0; \
          } \
        else \
          { \
            size_t count; \
            \
            if (ch < 0x10000) \
              count = ((statep->__count & 0x18) >= 0x10 ? 3 : 2); \
            else if (ch < 0x110000) \
              count = ((statep->__count & 0x18) >= 0x18 ? 6 : 5); \
            else \
              STANDARD_TO_LOOP_ERR_HANDLER (4); \
            \
            if (__glibc_unlikely (outptr + count > outend)) \
              { \
                result = __GCONV_FULL_OUTPUT; \
                break; \
              } \
            \
            if (ch < 0x10000) \
              { \
                switch ((statep->__count >> 3) & 3) \
                  { \
                  case 1: \
                    *outptr++ = base64 (ch >> 10); \
                    *outptr++ = base64 ((ch >> 4) & 0x3f); \
                    statep->__count = ((ch & 15) << 5) | (3 << 3); \
                    break; \
                  case 2: \
                    *outptr++ = \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 12)); \
                    *outptr++ = base64 ((ch >> 6) & 0x3f); \
                    *outptr++ = base64 (ch & 0x3f); \
                    statep->__count = (1 << 3); \
                    break; \
                  case 3: \
                    *outptr++ = \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 14)); \
                    *outptr++ = base64 ((ch >> 8) & 0x3f); \
                    *outptr++ = base64 ((ch >> 2) & 0x3f); \
                    statep->__count = ((ch & 3) << 7) | (2 << 3); \
                    break; \
                  default: \
                    abort (); \
                  } \
              } \
            else if (ch < 0x110000) \
              { \
                uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10); \
                uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff); \
                \
                ch = (ch1 << 16) | ch2; \
                switch ((statep->__count >> 3) & 3) \
                  { \
                  case 1: \
                    *outptr++ = base64 (ch >> 26); \
                    *outptr++ = base64 ((ch >> 20) & 0x3f); \
                    *outptr++ = base64 ((ch >> 14) & 0x3f); \
                    *outptr++ = base64 ((ch >> 8) & 0x3f); \
                    *outptr++ = base64 ((ch >> 2) & 0x3f); \
                    statep->__count = ((ch & 3) << 7) | (2 << 3); \
                    break; \
                  case 2: \
                    *outptr++ = \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 28)); \
                    *outptr++ = base64 ((ch >> 22) & 0x3f); \
                    *outptr++ = base64 ((ch >> 16) & 0x3f); \
                    *outptr++ = base64 ((ch >> 10) & 0x3f); \
                    *outptr++ = base64 ((ch >> 4) & 0x3f); \
                    statep->__count = ((ch & 15) << 5) | (3 << 3); \
                    break; \
                  case 3: \
                    *outptr++ = \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 30)); \
                    *outptr++ = base64 ((ch >> 24) & 0x3f); \
                    *outptr++ = base64 ((ch >> 18) & 0x3f); \
                    *outptr++ = base64 ((ch >> 12) & 0x3f); \
                    *outptr++ = base64 ((ch >> 6) & 0x3f); \
                    *outptr++ = base64 (ch & 0x3f); \
                    statep->__count = (1 << 3); \
                    break; \
                  default: \
                    abort (); \
                  } \
              } \
            else \
              abort (); \
          } \
      } \
    \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS	, mbstate_t *statep
#include <iconv/loop.c>


/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   In the "from UTF-7" direction the state is simply cleared.  In the
   "to UTF-7" direction an open base64 run is closed: the pending bits
   (if any) are written as one final base64 character, followed by '-'.
   If the output buffer cannot take those bytes, status is set to
   __GCONV_FULL_OUTPUT and the state is left untouched.  */
#define EMIT_SHIFT_TO_INIT \
  if (FROM_DIRECTION) \
    /* Nothing to emit.  */ \
    memset (data->__statep, '\0', sizeof (mbstate_t)); \
  else \
    { \
      /* The "to UTF-7" direction.  Flush the remaining bits and terminate \
         with a '-' byte.  This will guarantee correct decoding if more \
         UTF-7 encoded text is added afterwards.  */ \
      int state = data->__statep->__count; \
      \
      if (state & 0x18) \
        { \
          /* Deactivate base64 encoding.  */ \
          size_t count = ((state & 0x18) >= 0x10) + 1; \
          \
          if (__glibc_unlikely (outbuf + count > outend)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              if ((state & 0x18) >= 0x10) \
                *outbuf++ = base64 ((state >> 3) & ~3); \
              *outbuf++ = '-'; \
              \
              data->__statep->__count = 0; \
            } \
        } \
      else \
        data->__statep->__count = 0; \
    }


/* Now define the toplevel functions.  */
#include <iconv/skeleton.c>


source code of glibc/iconvdata/utf-7.c