1 | /* Conversion module for UTF-7. |
2 | Copyright (C) 2000-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* UTF-7 is a legacy encoding used for transmitting Unicode within the |
20 | ASCII character set, used primarily by mail agents. New programs |
21 | are encouraged to use UTF-8 instead. |
22 | |
23 | UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642). The |
24 | original Base64 encoding is defined in RFC 2045. */ |
25 | |
26 | #include <dlfcn.h> |
27 | #include <gconv.h> |
28 | #include <stdint.h> |
29 | #include <stdlib.h> |
30 | |
31 | |
/* Selects how the encoder treats the so-called "optional direct"
   characters
     ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
   With the value 1 they are base64-encoded; with the value 0 they are
   passed straight through, exactly like the "direct" characters.
   1 is used here because it is the safer choice.  */
#define UTF7_ENCODE_OPTIONAL_CHARS 1
39 | |
40 | |
/* Bitmap of the "direct characters"
     A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
   Code point N is a member when bit (N % 8) of byte (N / 8) is set.  */
static const unsigned char direct_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00,     /* 0x00..0x1f: tab lf cr */
    0x81, 0xf3, 0xff, 0x87,     /* 0x20..0x3f: space ' ( ) , - . / 0-9 : ? */
    0xfe, 0xff, 0xff, 0x07,     /* 0x40..0x5f: A-Z */
    0xfe, 0xff, 0xff, 0x07      /* 0x60..0x7f: a-z */
  };

/* Return 1 if CH is a direct character and 0 otherwise.  Values outside
   the 7-bit range are never direct.  */
static int
isdirect (uint32_t ch)
{
  if (ch >= 128)
    return 0;

  return (direct_tab[ch / 8] & (1u << (ch % 8))) != 0;
}
56 | |
57 | |
/* Bitmap of the "direct and optional direct characters"
     A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
     ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
   Code point N is a member when bit (N % 8) of byte (N / 8) is set.  */
static const unsigned char xdirect_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00,     /* 0x00..0x1f: tab lf cr */
    0xff, 0xf7, 0xff, 0xff,     /* 0x20..0x3f: everything except + */
    0xff, 0xff, 0xff, 0xef,     /* 0x40..0x5f: everything except \ */
    0xff, 0xff, 0xff, 0x3f      /* 0x60..0x7f: everything except ~ and DEL */
  };

/* Return 1 if CH is a direct or optional direct character and 0
   otherwise.  Values outside the 7-bit range never qualify.  */
static int
isxdirect (uint32_t ch)
{
  if (ch >= 128)
    return 0;

  return (xdirect_tab[ch / 8] & (1u << (ch % 8))) != 0;
}
74 | |
75 | |
/* Bitmap of the "extended base64 characters"
     A-Z a-z 0-9 + / -
   Code point N is a member when bit (N % 8) of byte (N / 8) is set.  */
static const unsigned char xbase64_tab[128 / 8] =
  {
    0x00, 0x00, 0x00, 0x00,     /* 0x00..0x1f: none */
    0x00, 0xa8, 0xff, 0x03,     /* 0x20..0x3f: + - / 0-9 */
    0xfe, 0xff, 0xff, 0x07,     /* 0x40..0x5f: A-Z */
    0xfe, 0xff, 0xff, 0x07      /* 0x60..0x7f: a-z */
  };

/* Return 1 if CH is an extended base64 character and 0 otherwise.
   Values outside the 7-bit range never qualify.  */
static int
isxbase64 (uint32_t ch)
{
  if (ch >= 128)
    return 0;

  return (xbase64_tab[ch / 8] & (1u << (ch % 8))) != 0;
}
91 | |
92 | |
/* Map a value in the range 0..63 to its character in the base64
   alphabet A-Z a-z 0-9 + /.  Any larger value aborts the program.  */
static unsigned char
base64 (unsigned int i)
{
  static const char alphabet[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";

  if (i > 63)
    abort ();

  return alphabet[i];
}
110 | |
111 | |
/* Definitions used in the body of the `gconv' function.

   FROM_LOOP and TO_LOOP name the two conversion loops defined further
   down (they are what LOOPFCT is set to).  The MIN_/MAX_NEEDED_FROM
   values are byte counts on the UTF-7 side, the MIN_/MAX_NEEDED_TO
   values byte counts on the UCS4 side.  */
#define CHARSET_NAME            "UTF-7//"
#define DEFINE_INIT             1
#define DEFINE_FINI             1
#define FROM_LOOP               from_utf7_loop
#define TO_LOOP                 to_utf7_loop
#define MIN_NEEDED_FROM         1
#define MAX_NEEDED_FROM         6
#define MIN_NEEDED_TO           4
#define MAX_NEEDED_TO           4
#define ONE_DIRECTION           0
/* Locals the conversion needs: SAVED_STATE is the snapshot used by
   SAVE_RESET_STATE, STATEP points at the shift state of this step.  */
#define PREPARE_LOOP \
  mbstate_t saved_state;                                                      \
  mbstate_t *statep = data->__statep;
/* Extra argument handed to every loop function call; it matches the
   `mbstate_t *statep' parameter the loops declare through
   EXTRA_LOOP_DECLS.  The macro name must be present: a bare
   `#define , statep' is not valid preprocessor input.  */
#define EXTRA_LOOP_ARGS         , statep
127 | |
128 | |
/* The input pointer might have to be reset, so the shift state must be
   recoverable as well.  A nonzero SAVE copies *statep into saved_state;
   a zero SAVE copies saved_state back into *statep.  */
#define SAVE_RESET_STATE(Save) \
  do                                                                          \
    {                                                                         \
      if (Save)                                                               \
        saved_state = *statep;                                                \
      else                                                                    \
        *statep = saved_state;                                                \
    }                                                                         \
  while (0)
136 | |
137 | |
138 | /* First define the conversion function from UTF-7 to UCS4. |
139 | The state is structured as follows: |
140 | __count bit 2..0: zero |
141 | __count bit 8..3: shift |
142 | __wch: data |
143 | Precise meaning: |
144 | shift data |
145 | 0 -- not inside base64 encoding |
146 | 1..32 XX..XX00..00 inside base64, (32 - shift) bits pending |
147 | This state layout is simpler than relying on STORE_REST/UNPACK_BYTES. |
148 | |
149 | When shift = 0, __wch needs to store at most one lookahead byte (see |
150 | __GCONV_INCOMPLETE_INPUT below). |
151 | */ |
#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT       MAX_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
/* One step of the UTF-7 -> UCS4 conversion.  It looks at the byte at
   INPTR and either copies a directly encoded character, handles the
   '+' introducer, or feeds one base64 character into the 32-bit
   accumulator kept in statep->__value.__wch.  statep->__count >> 3 is
   zero outside base64; inside base64 it is the bit position at which
   the next six bits are inserted into the accumulator.  */
#define BODY \
  { \
    uint_fast8_t ch = *inptr; \
 \
    if ((statep->__count >> 3) != 0) \
      { \
        /* base64 encoding active.  Translate the byte into its 6-bit \
           value; 64 marks a byte outside the base64 alphabet.  */ \
        uint32_t sextet = 64; \
        int shift; \
 \
        if (ch == '+') \
          sextet = 62; \
        else if (ch == '/') \
          sextet = 63; \
        else if (ch >= '0' && ch <= '9') \
          sextet = ch - '0' + 52; \
        else if (ch >= 'a' && ch <= 'z') \
          sextet = ch - 'a' + 26; \
        else if (ch >= 'A' && ch <= 'Z') \
          sextet = ch - 'A'; \
 \
        if (sextet > 63) \
          { \
            /* Every other byte terminates the base64 run.  Leftover \
               nonzero bits in the accumulator, or a 16-bit unit that \
               was only partially received, make the input invalid.  */ \
            if (__builtin_expect (statep->__value.__wch != 0, 0) \
                || __builtin_expect ((statep->__count >> 3) <= 26, 0)) \
              { \
                STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1)); \
              } \
 \
            /* A '-' terminator is consumed; any other byte is left in \
               place and looked at again in direct mode.  */ \
            if (ch == '-') \
              inptr++; \
 \
            statep->__count = 0; \
            continue; \
          } \
 \
        /* Append the six new bits to the accumulator.  */ \
        shift = statep->__count >> 3; \
        if (shift <= 6) \
          { \
            /* These bits complete the second unit of a UTF-16 \
               surrogate pair held in the accumulator.  */ \
            uint32_t pair = statep->__value.__wch; \
            uint32_t high = pair >> 16; \
            uint32_t low = (pair & 0xffff) | (sextet >> (6 - shift)); \
 \
            /* Bits of the sextet not used by the pair become the top \
               bits of the next unit.  */ \
            statep->__value.__wch = (sextet << shift) << 26; \
            shift += 26; \
 \
            assert (high >= 0xd800 && high < 0xdc00); \
            assert (low >= 0xdc00 && low < 0xe000); \
            put32 (outptr, \
                   0x10000 + ((high - 0xd800) << 10) + (low - 0xdc00)); \
            outptr += 4; \
          } \
        else \
          { \
            uint32_t acc; \
 \
            shift -= 6; \
            acc = statep->__value.__wch | (sextet << shift); \
 \
            if (shift > 10 && shift <= 16) \
              { \
                /* The upper 16 bits now hold a complete UTF-16 unit. \
                   Anything but a High Surrogate is output right away; \
                   a High Surrogate stays in the accumulator until the \
                   following Low Surrogate has been decoded too.  */ \
                uint32_t unit = acc >> 16; \
 \
                if (unit < 0xd800 || unit >= 0xdc00) \
                  { \
                    put32 (outptr, unit); \
                    outptr += 4; \
                    acc <<= 16; \
                    shift += 16; \
                  } \
              } \
            else if (shift > 4 && shift <= 10) \
              { \
                /* A High Surrogate is pending: check that the next 16 \
                   bits indeed form a Low Surrogate.  */ \
                uint32_t low = acc & 0xffff; \
 \
                if (__glibc_unlikely (low < 0xdc00 || low >= 0xe000)) \
                  { \
                    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1)); \
                  } \
              } \
 \
            statep->__value.__wch = acc; \
          } \
 \
        statep->__count = shift << 3; \
 \
        /* The base64 character has been digested.  */ \
        inptr++; \
      } \
    else \
      { \
        /* base64 encoding inactive.  */ \
        if (isxdirect (ch)) \
          { \
            put32 (outptr, ch); \
            outptr += 4; \
            inptr++; \
          } \
        else if (__glibc_likely (ch == '+')) \
          { \
            /* The byte after '+' decides what the '+' means.  */ \
            if (__glibc_unlikely (inptr + 2 > inend)) \
              { \
                /* Not enough input available.  */ \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            if (inptr[1] != '-') \
              { \
                /* Switch into base64 mode with an empty accumulator.  */ \
                statep->__count = (32 << 3); \
                statep->__value.__wch = 0; \
                inptr++; \
              } \
            else \
              { \
                /* "+-" stands for a literal '+'.  */ \
                put32 (outptr, ch); \
                outptr += 4; \
                inptr += 2; \
              } \
          } \
        else \
          { \
            /* The input is invalid.  */ \
            STANDARD_FROM_LOOP_ERR_HANDLER (1); \
          } \
      } \
  }
/* Configuration for the inclusion of <iconv/loop.c> that follows.
   EXTRA_LOOP_DECLS supplies the additional `statep' parameter which
   BODY dereferences; the macro name is required, `#define ,' alone is
   not valid preprocessor input.  */
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS        , mbstate_t *statep
296 | #include <iconv/loop.c> |
297 | |
298 | |
299 | /* Next, define the conversion from UCS4 to UTF-7. |
300 | The state is structured as follows: |
301 | __count bit 2..0: zero |
302 | __count bit 4..3: shift |
303 | __count bit 8..5: data |
304 | Precise meaning: |
305 | shift data |
306 | 0 0 not inside base64 encoding |
307 | 1 0 inside base64, no pending bits |
308 | 2 XX00 inside base64, 2 bits known for next byte |
309 | 3 XXXX inside base64, 4 bits known for next byte |
310 | |
311 | __count bit 2..0 and __wch are always zero, because this direction |
312 | never returns __GCONV_INCOMPLETE_INPUT. |
313 | */ |
#define MIN_NEEDED_INPUT        MIN_NEEDED_TO
#define MAX_NEEDED_INPUT        MAX_NEEDED_TO
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
#define LOOPFCT                 TO_LOOP
/* One step of the UCS4 -> UTF-7 conversion.  It reads one 32-bit
   character and writes it either directly or as base64 characters,
   carrying the bits that do not yet fill a base64 character in
   statep->__count.  */
#define BODY \
  { \
    uint32_t ch = get32 (inptr); \
    /* SHIFT is 0 outside base64 and 1..3 inside (0, 2 or 4 bits \
       pending).  CARRY holds the pending bits already moved to the top \
       of a 6-bit value.  */ \
    const int shift = (statep->__count >> 3) & 3; \
    const int carry = (statep->__count >> 3) & ~3; \
    const int plain = (UTF7_ENCODE_OPTIONAL_CHARS \
                       ? isdirect (ch) : isxdirect (ch)); \
 \
    if (plain && shift == 0) \
      { \
        /* base64 encoding inactive, character needs no encoding.  */ \
        *outptr++ = (unsigned char) ch; \
      } \
    else if (plain) \
      { \
        /* Deactivate base64 encoding: flush the pending bits, write a \
           '-' if the character could otherwise be taken for part of \
           the base64 run, then write the character itself.  */ \
        const int dash = isxbase64 (ch); \
        const size_t need = (shift >= 2) + dash + 1; \
 \
        if (__glibc_unlikely (outptr + need > outend)) \
          { \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
 \
        if (shift >= 2) \
          *outptr++ = base64 (carry); \
        if (dash) \
          *outptr++ = '-'; \
        *outptr++ = (unsigned char) ch; \
        statep->__count = 0; \
      } \
    else if (shift == 0) \
      { \
        /* base64 encoding inactive, character must be encoded.  */ \
        size_t need; \
 \
        if (ch == '+') \
          need = 2; \
        else if (ch < 0x10000) \
          need = 3; \
        else if (ch < 0x110000) \
          need = 6; \
        else \
          STANDARD_TO_LOOP_ERR_HANDLER (4); \
 \
        if (__glibc_unlikely (outptr + need > outend)) \
          { \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
 \
        *outptr++ = '+'; \
        if (ch == '+') \
          *outptr++ = '-'; \
        else if (ch < 0x10000) \
          { \
            /* 16 bits: two full base64 characters, 4 bits pending.  */ \
            *outptr++ = base64 (ch >> 10); \
            *outptr++ = base64 ((ch >> 4) & 0x3f); \
            statep->__count = ((ch & 15) << 5) | (3 << 3); \
          } \
        else if (ch < 0x110000) \
          { \
            /* Surrogate pair, 32 bits: five full base64 characters, \
               2 bits pending.  */ \
            const uint32_t off = ch - 0x10000; \
            const uint32_t pair = (((0xd800 + (off >> 10)) << 16) \
                                   | (0xdc00 + (off & 0x3ff))); \
            int bit; \
 \
            for (bit = 26; bit >= 2; bit -= 6) \
              *outptr++ = base64 ((pair >> bit) & 0x3f); \
            statep->__count = ((pair & 3) << 7) | (2 << 3); \
          } \
        else \
          abort (); \
      } \
    else \
      { \
        /* base64 encoding active, character must be encoded.  */ \
        size_t need; \
 \
        if (ch < 0x10000) \
          need = (shift >= 2 ? 3 : 2); \
        else if (ch < 0x110000) \
          need = (shift == 3 ? 6 : 5); \
        else \
          STANDARD_TO_LOOP_ERR_HANDLER (4); \
 \
        if (__glibc_unlikely (outptr + need > outend)) \
          { \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
 \
        if (ch < 0x10000) \
          { \
            switch (shift) \
              { \
              case 1: \
                *outptr++ = base64 (ch >> 10); \
                *outptr++ = base64 ((ch >> 4) & 0x3f); \
                statep->__count = ((ch & 15) << 5) | (3 << 3); \
                break; \
              case 2: \
                *outptr++ = base64 (carry | (ch >> 12)); \
                *outptr++ = base64 ((ch >> 6) & 0x3f); \
                *outptr++ = base64 (ch & 0x3f); \
                statep->__count = (1 << 3); \
                break; \
              case 3: \
                *outptr++ = base64 (carry | (ch >> 14)); \
                *outptr++ = base64 ((ch >> 8) & 0x3f); \
                *outptr++ = base64 ((ch >> 2) & 0x3f); \
                statep->__count = ((ch & 3) << 7) | (2 << 3); \
                break; \
              default: \
                abort (); \
              } \
          } \
        else if (ch < 0x110000) \
          { \
            const uint32_t off = ch - 0x10000; \
            const uint32_t pair = (((0xd800 + (off >> 10)) << 16) \
                                   | (0xdc00 + (off & 0x3ff))); \
            int bit; \
 \
            switch (shift) \
              { \
              case 1: \
                for (bit = 26; bit >= 2; bit -= 6) \
                  *outptr++ = base64 ((pair >> bit) & 0x3f); \
                statep->__count = ((pair & 3) << 7) | (2 << 3); \
                break; \
              case 2: \
                *outptr++ = base64 (carry | (pair >> 28)); \
                for (bit = 22; bit >= 4; bit -= 6) \
                  *outptr++ = base64 ((pair >> bit) & 0x3f); \
                statep->__count = ((pair & 15) << 5) | (3 << 3); \
                break; \
              case 3: \
                *outptr++ = base64 (carry | (pair >> 30)); \
                for (bit = 24; bit >= 0; bit -= 6) \
                  *outptr++ = base64 ((pair >> bit) & 0x3f); \
                statep->__count = (1 << 3); \
                break; \
              default: \
                abort (); \
              } \
          } \
        else \
          abort (); \
      } \
 \
    /* Now that we wrote the output increment the input pointer.  */ \
    inptr += 4; \
  }
/* Configuration for the inclusion of <iconv/loop.c> that follows.
   EXTRA_LOOP_DECLS supplies the additional `statep' parameter which
   BODY dereferences; the macro name is required, `#define ,' alone is
   not valid preprocessor input.  */
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS        , mbstate_t *statep
490 | #include <iconv/loop.c> |
491 | |
492 | |
/* UTF-7 is a stateful encoding, so flushing has to bring the output
   back to the initial state.  In the "from UTF-7" direction this only
   clears the state object.  In the "to UTF-7" direction an open base64
   run is closed: the pending bits (if any) are written as one final
   base64 character, followed by '-'.  If the output buffer cannot take
   these bytes, STATUS becomes __GCONV_FULL_OUTPUT and the state is
   left as it was.  */
#define EMIT_SHIFT_TO_INIT \
  if (FROM_DIRECTION) \
    /* Nothing to emit.  */ \
    memset (data->__statep, '\0', sizeof (mbstate_t)); \
  else \
    { \
      /* The "to UTF-7" direction.  Terminating with a '-' byte \
         guarantees correct decoding if more UTF-7 encoded text is \
         added afterwards.  */ \
      const int pending = data->__statep->__count; \
 \
      if ((pending & 0x18) == 0) \
        /* Not inside base64.  */ \
        data->__statep->__count = 0; \
      else \
        { \
          /* Deactivate base64 encoding.  */ \
          const int has_bits = (pending & 0x18) >= 0x10; \
          const size_t needed = has_bits + 1; \
 \
          if (__glibc_unlikely (outbuf + needed > outend)) \
            /* We don't have enough room in the output buffer.  */ \
            status = __GCONV_FULL_OUTPUT; \
          else \
            { \
              /* Write out the shift sequence.  */ \
              if (has_bits) \
                *outbuf++ = base64 ((pending >> 3) & ~3); \
              *outbuf++ = '-'; \
 \
              data->__statep->__count = 0; \
            } \
        } \
    }
528 | |
529 | |
530 | /* Now define the toplevel functions. */ |
531 | #include <iconv/skeleton.c> |
532 | |