1/* Simple transformations functions.
2 Copyright (C) 1997-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <byteswap.h>
20#include <dlfcn.h>
21#include <endian.h>
22#include <errno.h>
23#include <gconv.h>
24#include <stdint.h>
25#include <stdlib.h>
26#include <string.h>
27#include <wchar.h>
28#include <sys/param.h>
29#include <gconv_int.h>
30
/* Declare the built-in conversion functions.  "gconv_builtin.h" is
   included with BUILTIN_ALIAS expanding to nothing and
   BUILTIN_TRANSFORMATION expanding to an extern prototype for its Fct
   argument.  NOTE(review): presumably gconv_builtin.h consists of
   invocations of these two macros -- confirm against that header.  */
#define BUILTIN_ALIAS(s1, s2) /* nothing */
#define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
                               MinF, MaxF, MinT, MaxT) \
  extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \
                  const unsigned char **, const unsigned char *, \
                  unsigned char **, size_t *, int, int);
#include "gconv_builtin.h"
38
39
/* Fall back to EINVAL when no EILSEQ macro has been defined by the
   headers included above.  */
#ifndef EILSEQ
# define EILSEQ EINVAL
#endif
43
44
/* Single-byte to INTERNAL conversion that recognizes only ASCII: a byte
   below 0x80 maps to the wide character with the same value, any other
   byte yields WEOF.  STEP is not used.  */
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  return c < 0x80 ? (wint_t) c : WEOF;
}
55
56
/* Transform from the internal, UCS4-like format, to UCS4.  The
   difference between the internal UCS4 format and the real UCS4
   format is, if any, the endianness.  Unicode/ISO 10646 says that
   unless some higher protocol specifies it differently, the byte
   order is big endian.

   The macros below configure this conversion step: both sides use
   four-byte units and FROM_LOOP names the hand-written loop defined
   below.  NOTE(review): they are presumably consumed by the
   <iconv/skeleton.c> template included after the loop functions, which
   would then emit FUNCTION_NAME -- confirm against that file.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ucs4_loop
#define TO_LOOP internal_ucs4_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_internal_ucs4
#define ONE_DIRECTION 0
71
72
73static inline int
74__attribute ((always_inline))
75internal_ucs4_loop (struct __gconv_step *step,
76 struct __gconv_step_data *step_data,
77 const unsigned char **inptrp, const unsigned char *inend,
78 unsigned char **outptrp, const unsigned char *outend,
79 size_t *irreversible)
80{
81 const unsigned char *inptr = *inptrp;
82 unsigned char *outptr = *outptrp;
83 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
84 int result;
85
86#if __BYTE_ORDER == __LITTLE_ENDIAN
87 /* Sigh, we have to do some real work. */
88 size_t cnt;
89 uint32_t *outptr32 = (uint32_t *) outptr;
90
91 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
92 *outptr32++ = bswap_32 (*(const uint32_t *) inptr);
93
94 *inptrp = inptr;
95 *outptrp = (unsigned char *) outptr32;
96#elif __BYTE_ORDER == __BIG_ENDIAN
97 /* Simply copy the data. */
98 *inptrp = inptr + n_convert * 4;
99 *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
100#else
101# error "This endianess is not supported."
102#endif
103
104 /* Determine the status. */
105 if (*inptrp == inend)
106 result = __GCONV_EMPTY_INPUT;
107 else if (*outptrp + 4 > outend)
108 result = __GCONV_FULL_OUTPUT;
109 else
110 result = __GCONV_INCOMPLETE_INPUT;
111
112 return result;
113}
114
115#if !_STRING_ARCH_unaligned
116static inline int
117__attribute ((always_inline))
118internal_ucs4_loop_unaligned (struct __gconv_step *step,
119 struct __gconv_step_data *step_data,
120 const unsigned char **inptrp,
121 const unsigned char *inend,
122 unsigned char **outptrp,
123 const unsigned char *outend,
124 size_t *irreversible)
125{
126 const unsigned char *inptr = *inptrp;
127 unsigned char *outptr = *outptrp;
128 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
129 int result;
130
131# if __BYTE_ORDER == __LITTLE_ENDIAN
132 /* Sigh, we have to do some real work. */
133 size_t cnt;
134
135 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
136 {
137 outptr[0] = inptr[3];
138 outptr[1] = inptr[2];
139 outptr[2] = inptr[1];
140 outptr[3] = inptr[0];
141 }
142
143 *inptrp = inptr;
144 *outptrp = outptr;
145# elif __BYTE_ORDER == __BIG_ENDIAN
146 /* Simply copy the data. */
147 *inptrp = inptr + n_convert * 4;
148 *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
149# else
150# error "This endianess is not supported."
151# endif
152
153 /* Determine the status. */
154 if (*inptrp == inend)
155 result = __GCONV_EMPTY_INPUT;
156 else if (*outptrp + 4 > outend)
157 result = __GCONV_FULL_OUTPUT;
158 else
159 result = __GCONV_INCOMPLETE_INPUT;
160
161 return result;
162}
163#endif
164
165
166static inline int
167__attribute ((always_inline))
168internal_ucs4_loop_single (struct __gconv_step *step,
169 struct __gconv_step_data *step_data,
170 const unsigned char **inptrp,
171 const unsigned char *inend,
172 unsigned char **outptrp,
173 const unsigned char *outend,
174 size_t *irreversible)
175{
176 mbstate_t *state = step_data->__statep;
177 size_t cnt = state->__count & 7;
178
179 while (*inptrp < inend && cnt < 4)
180 state->__value.__wchb[cnt++] = *(*inptrp)++;
181
182 if (__glibc_unlikely (cnt < 4))
183 {
184 /* Still not enough bytes. Store the ones in the input buffer. */
185 state->__count &= ~7;
186 state->__count |= cnt;
187
188 return __GCONV_INCOMPLETE_INPUT;
189 }
190
191#if __BYTE_ORDER == __LITTLE_ENDIAN
192 (*outptrp)[0] = state->__value.__wchb[3];
193 (*outptrp)[1] = state->__value.__wchb[2];
194 (*outptrp)[2] = state->__value.__wchb[1];
195 (*outptrp)[3] = state->__value.__wchb[0];
196
197#elif __BYTE_ORDER == __BIG_ENDIAN
198 /* XXX unaligned */
199 (*outptrp)[0] = state->__value.__wchb[0];
200 (*outptrp)[1] = state->__value.__wchb[1];
201 (*outptrp)[2] = state->__value.__wchb[2];
202 (*outptrp)[3] = state->__value.__wchb[3];
203#else
204# error "This endianess is not supported."
205#endif
206 *outptrp += 4;
207
208 /* Clear the state buffer. */
209 state->__count &= ~7;
210
211 return __GCONV_OK;
212}
213
214#include <iconv/skeleton.c>
215
216
/* Transform from UCS4 to the internal, UCS4-like format.  Unlike
   for the other direction we have to check for correct values here.

   The macros below configure this conversion step: both sides use
   four-byte units and FROM_LOOP names the hand-written loop defined
   below.  NOTE(review): they are presumably consumed by the
   <iconv/skeleton.c> template included after the loop functions --
   confirm against that file.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ucs4_internal_loop
#define TO_LOOP ucs4_internal_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_ucs4_internal
#define ONE_DIRECTION 0
228
229
230static inline int
231__attribute ((always_inline))
232ucs4_internal_loop (struct __gconv_step *step,
233 struct __gconv_step_data *step_data,
234 const unsigned char **inptrp, const unsigned char *inend,
235 unsigned char **outptrp, const unsigned char *outend,
236 size_t *irreversible)
237{
238 int flags = step_data->__flags;
239 const unsigned char *inptr = *inptrp;
240 unsigned char *outptr = *outptrp;
241 int result;
242
243 for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
244 {
245 uint32_t inval;
246
247#if __BYTE_ORDER == __LITTLE_ENDIAN
248 inval = bswap_32 (*(const uint32_t *) inptr);
249#else
250 inval = *(const uint32_t *) inptr;
251#endif
252
253 if (__glibc_unlikely (inval > 0x7fffffff))
254 {
255 /* The value is too large. We don't try transliteration here since
256 this is not an error because of the lack of possibilities to
257 represent the result. This is a genuine bug in the input since
258 UCS4 does not allow such values. */
259 if (irreversible == NULL)
260 /* We are transliterating, don't try to correct anything. */
261 return __GCONV_ILLEGAL_INPUT;
262
263 if (flags & __GCONV_IGNORE_ERRORS)
264 {
265 /* Just ignore this character. */
266 ++*irreversible;
267 continue;
268 }
269
270 *inptrp = inptr;
271 *outptrp = outptr;
272 return __GCONV_ILLEGAL_INPUT;
273 }
274
275 *((uint32_t *) outptr) = inval;
276 outptr += sizeof (uint32_t);
277 }
278
279 *inptrp = inptr;
280 *outptrp = outptr;
281
282 /* Determine the status. */
283 if (*inptrp == inend)
284 result = __GCONV_EMPTY_INPUT;
285 else if (*outptrp + 4 > outend)
286 result = __GCONV_FULL_OUTPUT;
287 else
288 result = __GCONV_INCOMPLETE_INPUT;
289
290 return result;
291}
292
293#if !_STRING_ARCH_unaligned
294static inline int
295__attribute ((always_inline))
296ucs4_internal_loop_unaligned (struct __gconv_step *step,
297 struct __gconv_step_data *step_data,
298 const unsigned char **inptrp,
299 const unsigned char *inend,
300 unsigned char **outptrp,
301 const unsigned char *outend,
302 size_t *irreversible)
303{
304 int flags = step_data->__flags;
305 const unsigned char *inptr = *inptrp;
306 unsigned char *outptr = *outptrp;
307 int result;
308
309 for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
310 {
311 if (__glibc_unlikely (inptr[0] > 0x80))
312 {
313 /* The value is too large. We don't try transliteration here since
314 this is not an error because of the lack of possibilities to
315 represent the result. This is a genuine bug in the input since
316 UCS4 does not allow such values. */
317 if (irreversible == NULL)
318 /* We are transliterating, don't try to correct anything. */
319 return __GCONV_ILLEGAL_INPUT;
320
321 if (flags & __GCONV_IGNORE_ERRORS)
322 {
323 /* Just ignore this character. */
324 ++*irreversible;
325 continue;
326 }
327
328 *inptrp = inptr;
329 *outptrp = outptr;
330 return __GCONV_ILLEGAL_INPUT;
331 }
332
333# if __BYTE_ORDER == __LITTLE_ENDIAN
334 outptr[3] = inptr[0];
335 outptr[2] = inptr[1];
336 outptr[1] = inptr[2];
337 outptr[0] = inptr[3];
338# else
339 outptr[0] = inptr[0];
340 outptr[1] = inptr[1];
341 outptr[2] = inptr[2];
342 outptr[3] = inptr[3];
343# endif
344 outptr += 4;
345 }
346
347 *inptrp = inptr;
348 *outptrp = outptr;
349
350 /* Determine the status. */
351 if (*inptrp == inend)
352 result = __GCONV_EMPTY_INPUT;
353 else if (*outptrp + 4 > outend)
354 result = __GCONV_FULL_OUTPUT;
355 else
356 result = __GCONV_INCOMPLETE_INPUT;
357
358 return result;
359}
360#endif
361
362
363static inline int
364__attribute ((always_inline))
365ucs4_internal_loop_single (struct __gconv_step *step,
366 struct __gconv_step_data *step_data,
367 const unsigned char **inptrp,
368 const unsigned char *inend,
369 unsigned char **outptrp,
370 const unsigned char *outend,
371 size_t *irreversible)
372{
373 mbstate_t *state = step_data->__statep;
374 int flags = step_data->__flags;
375 size_t cnt = state->__count & 7;
376
377 while (*inptrp < inend && cnt < 4)
378 state->__value.__wchb[cnt++] = *(*inptrp)++;
379
380 if (__glibc_unlikely (cnt < 4))
381 {
382 /* Still not enough bytes. Store the ones in the input buffer. */
383 state->__count &= ~7;
384 state->__count |= cnt;
385
386 return __GCONV_INCOMPLETE_INPUT;
387 }
388
389 if (__builtin_expect (((unsigned char *) state->__value.__wchb)[0] > 0x80,
390 0))
391 {
392 /* The value is too large. We don't try transliteration here since
393 this is not an error because of the lack of possibilities to
394 represent the result. This is a genuine bug in the input since
395 UCS4 does not allow such values. */
396 if (!(flags & __GCONV_IGNORE_ERRORS))
397 {
398 *inptrp -= cnt - (state->__count & 7);
399 return __GCONV_ILLEGAL_INPUT;
400 }
401 }
402 else
403 {
404#if __BYTE_ORDER == __LITTLE_ENDIAN
405 (*outptrp)[0] = state->__value.__wchb[3];
406 (*outptrp)[1] = state->__value.__wchb[2];
407 (*outptrp)[2] = state->__value.__wchb[1];
408 (*outptrp)[3] = state->__value.__wchb[0];
409#elif __BYTE_ORDER == __BIG_ENDIAN
410 (*outptrp)[0] = state->__value.__wchb[0];
411 (*outptrp)[1] = state->__value.__wchb[1];
412 (*outptrp)[2] = state->__value.__wchb[2];
413 (*outptrp)[3] = state->__value.__wchb[3];
414#endif
415
416 *outptrp += 4;
417 }
418
419 /* Clear the state buffer. */
420 state->__count &= ~7;
421
422 return __GCONV_OK;
423}
424
425#include <iconv/skeleton.c>
426
427
/* Similarly for the little endian form: transform from the internal,
   UCS4-like format to little-endian UCS4.

   The macros below configure this conversion step: both sides use
   four-byte units and FROM_LOOP names the hand-written loop defined
   below.  NOTE(review): they are presumably consumed by the
   <iconv/skeleton.c> template included after the loop functions --
   confirm against that file.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ucs4le_loop
#define TO_LOOP internal_ucs4le_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_internal_ucs4le
#define ONE_DIRECTION 0
438
439
440static inline int
441__attribute ((always_inline))
442internal_ucs4le_loop (struct __gconv_step *step,
443 struct __gconv_step_data *step_data,
444 const unsigned char **inptrp, const unsigned char *inend,
445 unsigned char **outptrp, const unsigned char *outend,
446 size_t *irreversible)
447{
448 const unsigned char *inptr = *inptrp;
449 unsigned char *outptr = *outptrp;
450 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
451 int result;
452
453#if __BYTE_ORDER == __BIG_ENDIAN
454 /* Sigh, we have to do some real work. */
455 size_t cnt;
456 uint32_t *outptr32 = (uint32_t *) outptr;
457
458 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
459 *outptr32++ = bswap_32 (*(const uint32_t *) inptr);
460 outptr = (unsigned char *) outptr32;
461
462 *inptrp = inptr;
463 *outptrp = outptr;
464#elif __BYTE_ORDER == __LITTLE_ENDIAN
465 /* Simply copy the data. */
466 *inptrp = inptr + n_convert * 4;
467 *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
468#else
469# error "This endianess is not supported."
470#endif
471
472 /* Determine the status. */
473 if (*inptrp == inend)
474 result = __GCONV_EMPTY_INPUT;
475 else if (*outptrp + 4 > outend)
476 result = __GCONV_FULL_OUTPUT;
477 else
478 result = __GCONV_INCOMPLETE_INPUT;
479
480 return result;
481}
482
483#if !_STRING_ARCH_unaligned
484static inline int
485__attribute ((always_inline))
486internal_ucs4le_loop_unaligned (struct __gconv_step *step,
487 struct __gconv_step_data *step_data,
488 const unsigned char **inptrp,
489 const unsigned char *inend,
490 unsigned char **outptrp,
491 const unsigned char *outend,
492 size_t *irreversible)
493{
494 const unsigned char *inptr = *inptrp;
495 unsigned char *outptr = *outptrp;
496 size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
497 int result;
498
499# if __BYTE_ORDER == __BIG_ENDIAN
500 /* Sigh, we have to do some real work. */
501 size_t cnt;
502
503 for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
504 {
505 outptr[0] = inptr[3];
506 outptr[1] = inptr[2];
507 outptr[2] = inptr[1];
508 outptr[3] = inptr[0];
509 }
510
511 *inptrp = inptr;
512 *outptrp = outptr;
513# elif __BYTE_ORDER == __LITTLE_ENDIAN
514 /* Simply copy the data. */
515 *inptrp = inptr + n_convert * 4;
516 *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
517# else
518# error "This endianess is not supported."
519# endif
520
521 /* Determine the status. */
522 if (*inptrp == inend)
523 result = __GCONV_EMPTY_INPUT;
524 else if (*inptrp + 4 > inend)
525 result = __GCONV_INCOMPLETE_INPUT;
526 else
527 {
528 assert (*outptrp + 4 > outend);
529 result = __GCONV_FULL_OUTPUT;
530 }
531
532 return result;
533}
534#endif
535
536
537static inline int
538__attribute ((always_inline))
539internal_ucs4le_loop_single (struct __gconv_step *step,
540 struct __gconv_step_data *step_data,
541 const unsigned char **inptrp,
542 const unsigned char *inend,
543 unsigned char **outptrp,
544 const unsigned char *outend,
545 size_t *irreversible)
546{
547 mbstate_t *state = step_data->__statep;
548 size_t cnt = state->__count & 7;
549
550 while (*inptrp < inend && cnt < 4)
551 state->__value.__wchb[cnt++] = *(*inptrp)++;
552
553 if (__glibc_unlikely (cnt < 4))
554 {
555 /* Still not enough bytes. Store the ones in the input buffer. */
556 state->__count &= ~7;
557 state->__count |= cnt;
558
559 return __GCONV_INCOMPLETE_INPUT;
560 }
561
562#if __BYTE_ORDER == __BIG_ENDIAN
563 (*outptrp)[0] = state->__value.__wchb[3];
564 (*outptrp)[1] = state->__value.__wchb[2];
565 (*outptrp)[2] = state->__value.__wchb[1];
566 (*outptrp)[3] = state->__value.__wchb[0];
567
568#else
569 /* XXX unaligned */
570 (*outptrp)[0] = state->__value.__wchb[0];
571 (*outptrp)[1] = state->__value.__wchb[1];
572 (*outptrp)[2] = state->__value.__wchb[2];
573 (*outptrp)[3] = state->__value.__wchb[3];
574
575#endif
576
577 *outptrp += 4;
578
579 /* Clear the state buffer. */
580 state->__count &= ~7;
581
582 return __GCONV_OK;
583}
584
585#include <iconv/skeleton.c>
586
587
/* And finally from UCS4-LE to the internal encoding.

   The macros below configure this conversion step: both sides use
   four-byte units and FROM_LOOP names the hand-written loop defined
   below.  NOTE(review): they are presumably consumed by the
   <iconv/skeleton.c> template included after the loop functions --
   confirm against that file.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ucs4le_internal_loop
#define TO_LOOP ucs4le_internal_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_ucs4le_internal
#define ONE_DIRECTION 0
598
599
600static inline int
601__attribute ((always_inline))
602ucs4le_internal_loop (struct __gconv_step *step,
603 struct __gconv_step_data *step_data,
604 const unsigned char **inptrp, const unsigned char *inend,
605 unsigned char **outptrp, const unsigned char *outend,
606 size_t *irreversible)
607{
608 int flags = step_data->__flags;
609 const unsigned char *inptr = *inptrp;
610 unsigned char *outptr = *outptrp;
611 int result;
612
613 for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
614 {
615 uint32_t inval;
616
617#if __BYTE_ORDER == __BIG_ENDIAN
618 inval = bswap_32 (*(const uint32_t *) inptr);
619#else
620 inval = *(const uint32_t *) inptr;
621#endif
622
623 if (__glibc_unlikely (inval > 0x7fffffff))
624 {
625 /* The value is too large. We don't try transliteration here since
626 this is not an error because of the lack of possibilities to
627 represent the result. This is a genuine bug in the input since
628 UCS4 does not allow such values. */
629 if (irreversible == NULL)
630 /* We are transliterating, don't try to correct anything. */
631 return __GCONV_ILLEGAL_INPUT;
632
633 if (flags & __GCONV_IGNORE_ERRORS)
634 {
635 /* Just ignore this character. */
636 ++*irreversible;
637 continue;
638 }
639
640 *inptrp = inptr;
641 *outptrp = outptr;
642 return __GCONV_ILLEGAL_INPUT;
643 }
644
645 *((uint32_t *) outptr) = inval;
646 outptr += sizeof (uint32_t);
647 }
648
649 *inptrp = inptr;
650 *outptrp = outptr;
651
652 /* Determine the status. */
653 if (*inptrp == inend)
654 result = __GCONV_EMPTY_INPUT;
655 else if (*inptrp + 4 > inend)
656 result = __GCONV_INCOMPLETE_INPUT;
657 else
658 {
659 assert (*outptrp + 4 > outend);
660 result = __GCONV_FULL_OUTPUT;
661 }
662
663 return result;
664}
665
666#if !_STRING_ARCH_unaligned
667static inline int
668__attribute ((always_inline))
669ucs4le_internal_loop_unaligned (struct __gconv_step *step,
670 struct __gconv_step_data *step_data,
671 const unsigned char **inptrp,
672 const unsigned char *inend,
673 unsigned char **outptrp,
674 const unsigned char *outend,
675 size_t *irreversible)
676{
677 int flags = step_data->__flags;
678 const unsigned char *inptr = *inptrp;
679 unsigned char *outptr = *outptrp;
680 int result;
681
682 for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
683 {
684 if (__glibc_unlikely (inptr[3] > 0x80))
685 {
686 /* The value is too large. We don't try transliteration here since
687 this is not an error because of the lack of possibilities to
688 represent the result. This is a genuine bug in the input since
689 UCS4 does not allow such values. */
690 if (irreversible == NULL)
691 /* We are transliterating, don't try to correct anything. */
692 return __GCONV_ILLEGAL_INPUT;
693
694 if (flags & __GCONV_IGNORE_ERRORS)
695 {
696 /* Just ignore this character. */
697 ++*irreversible;
698 continue;
699 }
700
701 *inptrp = inptr;
702 *outptrp = outptr;
703 return __GCONV_ILLEGAL_INPUT;
704 }
705
706# if __BYTE_ORDER == __BIG_ENDIAN
707 outptr[3] = inptr[0];
708 outptr[2] = inptr[1];
709 outptr[1] = inptr[2];
710 outptr[0] = inptr[3];
711# else
712 outptr[0] = inptr[0];
713 outptr[1] = inptr[1];
714 outptr[2] = inptr[2];
715 outptr[3] = inptr[3];
716# endif
717
718 outptr += 4;
719 }
720
721 *inptrp = inptr;
722 *outptrp = outptr;
723
724 /* Determine the status. */
725 if (*inptrp == inend)
726 result = __GCONV_EMPTY_INPUT;
727 else if (*inptrp + 4 > inend)
728 result = __GCONV_INCOMPLETE_INPUT;
729 else
730 {
731 assert (*outptrp + 4 > outend);
732 result = __GCONV_FULL_OUTPUT;
733 }
734
735 return result;
736}
737#endif
738
739
740static inline int
741__attribute ((always_inline))
742ucs4le_internal_loop_single (struct __gconv_step *step,
743 struct __gconv_step_data *step_data,
744 const unsigned char **inptrp,
745 const unsigned char *inend,
746 unsigned char **outptrp,
747 const unsigned char *outend,
748 size_t *irreversible)
749{
750 mbstate_t *state = step_data->__statep;
751 int flags = step_data->__flags;
752 size_t cnt = state->__count & 7;
753
754 while (*inptrp < inend && cnt < 4)
755 state->__value.__wchb[cnt++] = *(*inptrp)++;
756
757 if (__glibc_unlikely (cnt < 4))
758 {
759 /* Still not enough bytes. Store the ones in the input buffer. */
760 state->__count &= ~7;
761 state->__count |= cnt;
762
763 return __GCONV_INCOMPLETE_INPUT;
764 }
765
766 if (__builtin_expect (((unsigned char *) state->__value.__wchb)[3] > 0x80,
767 0))
768 {
769 /* The value is too large. We don't try transliteration here since
770 this is not an error because of the lack of possibilities to
771 represent the result. This is a genuine bug in the input since
772 UCS4 does not allow such values. */
773 if (!(flags & __GCONV_IGNORE_ERRORS))
774 return __GCONV_ILLEGAL_INPUT;
775 }
776 else
777 {
778#if __BYTE_ORDER == __BIG_ENDIAN
779 (*outptrp)[0] = state->__value.__wchb[3];
780 (*outptrp)[1] = state->__value.__wchb[2];
781 (*outptrp)[2] = state->__value.__wchb[1];
782 (*outptrp)[3] = state->__value.__wchb[0];
783#else
784 (*outptrp)[0] = state->__value.__wchb[0];
785 (*outptrp)[1] = state->__value.__wchb[1];
786 (*outptrp)[2] = state->__value.__wchb[2];
787 (*outptrp)[3] = state->__value.__wchb[3];
788#endif
789
790 *outptrp += 4;
791 }
792
793 /* Clear the state buffer. */
794 state->__count &= ~7;
795
796 return __GCONV_OK;
797}
798
799#include <iconv/skeleton.c>
800
801
/* Convert from ISO 646-IRV to the internal (UCS4-like) format.  Input
   units are one byte, output units four bytes.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 1
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ascii_internal_loop
#define TO_LOOP ascii_internal_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_ascii_internal
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* Conversion of one character: a byte above 0x7f is handed to
   STANDARD_FROM_LOOP_ERR_HANDLER with a length of 1; any other byte is
   consumed and stored zero-extended as one uint32_t.
   NOTE(review): inptr, outptr and the error-handler macro are not
   defined in this file; presumably <iconv/loop.c> supplies them when
   it expands BODY -- confirm there.  */
#define BODY \
  { \
    if (__glibc_unlikely (*inptr > '\x7f')) \
      { \
        /* The value is too large. We don't try transliteration here since \
           this is not an error because of the lack of possibilities to \
           represent the result. This is a genuine bug in the input since \
           ASCII does not allow such values. */ \
        STANDARD_FROM_LOOP_ERR_HANDLER (1); \
      } \
    else \
      { \
        /* It's an one byte sequence. */ \
        *((uint32_t *) outptr) = *inptr++; \
        outptr += sizeof (uint32_t); \
      } \
  }
#define LOOP_NEED_FLAGS
834#include <iconv/loop.c>
835#include <iconv/skeleton.c>
836
837
/* Convert from the internal (UCS4-like) format to ISO 646-IRV.  Input
   units are four bytes, output units one byte.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 1
#define FROM_DIRECTION 1
#define FROM_LOOP internal_ascii_loop
#define TO_LOOP internal_ascii_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_internal_ascii
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* Conversion of one character: a 32-bit value above 0x7f is passed to
   UNICODE_TAG_HANDLER and then to STANDARD_TO_LOOP_ERR_HANDLER (both
   with a length of 4); any other value is written as a single byte and
   the input advances by one uint32_t.
   NOTE(review): inptr, outptr and the two handler macros are not
   defined in this file; presumably <iconv/loop.c> supplies them when
   it expands BODY -- confirm there.  */
#define BODY \
  { \
    if (__glibc_unlikely (*((const uint32_t *) inptr) > 0x7f)) \
      { \
        UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4); \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    else \
      { \
        /* It's an one byte sequence. */ \
        *outptr++ = *((const uint32_t *) inptr); \
        inptr += sizeof (uint32_t); \
      } \
  }
#define LOOP_NEED_FLAGS
867#include <iconv/loop.c>
868#include <iconv/skeleton.c>
869
870
/* Convert from the internal (UCS4-like) format to UTF-8.  Input units
   are four bytes; one character produces between one and six output
   bytes.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 4
#define MIN_NEEDED_TO 1
#define MAX_NEEDED_TO 6
#define FROM_DIRECTION 1
#define FROM_LOOP internal_utf8_loop
#define TO_LOOP internal_utf8_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_internal_utf8
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* Conversion of one character WC:
     - WC below 0x80 is written as a single byte;
     - WC up to 0x7fffffff that is not in the surrogate range
       0xd800..0xdfff is written as a multi-byte sequence.  Its length
       STEP is the first value in 2..5 for which WC fits into
       5 * STEP + 1 bits, or 6 if none does.  If STEP bytes do not fit
       before OUTEND, result is set to __GCONV_FULL_OUTPUT and the
       enclosing loop is left.  Otherwise the trailing bytes are filled
       from the end with six payload bits each and the remaining bits
       are merged into the length marker in the first byte;
     - anything else goes to STANDARD_TO_LOOP_ERR_HANDLER with a length
       of 4.
   NOTE(review): inptr, outptr, outend, result and the error-handler
   macro are not defined in this file; presumably <iconv/loop.c>
   supplies them when it expands BODY -- confirm there.  */
#define BODY \
  { \
    uint32_t wc = *((const uint32_t *) inptr); \
    \
    if (__glibc_likely (wc < 0x80)) \
      /* It's an one byte sequence. */ \
      *outptr++ = (unsigned char) wc; \
    else if (__glibc_likely (wc <= 0x7fffffff \
                             && (wc < 0xd800 || wc > 0xdfff))) \
      { \
        size_t step; \
        unsigned char *start; \
        \
        for (step = 2; step < 6; ++step) \
          if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
            break; \
        \
        if (__glibc_unlikely (outptr + step > outend)) \
          { \
            /* Too long. */ \
            result = __GCONV_FULL_OUTPUT; \
            break; \
          } \
        \
        start = outptr; \
        *outptr = (unsigned char) (~0xff >> step); \
        outptr += step; \
        do \
          { \
            start[--step] = 0x80 | (wc & 0x3f); \
            wc >>= 6; \
          } \
        while (step > 1); \
        start[0] |= wc; \
      } \
    else \
      { \
        STANDARD_TO_LOOP_ERR_HANDLER (4); \
      } \
    \
    inptr += 4; \
  }
#define LOOP_NEED_FLAGS
930#include <iconv/loop.c>
931#include <iconv/skeleton.c>
932
933
/* Convert from UTF-8 to the internal (UCS4-like) format.  One character
   consumes between one and six input bytes and produces one four-byte
   output unit.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 6
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP utf8_internal_loop
#define TO_LOOP utf8_internal_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_utf8_internal
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* Conversion of one character.

   A first byte below 0x80 is a complete character.  Otherwise the
   first byte selects the sequence length CNT (2 to 6) and contributes
   its payload bits to CH.  A first byte that matches none of the
   lead-byte patterns (this includes 0xc0 and 0xc1) is an error: up to
   four following continuation bytes are counted into I and control
   reaches the `errout' label, which invokes
   STANDARD_FROM_LOOP_ERR_HANDLER (I).

   If fewer than CNT bytes are available, the available bytes are
   checked: when all of them are continuation bytes, result is set to
   __GCONV_INCOMPLETE_INPUT and the enclosing loop is left, otherwise
   control jumps to `errout' with I at the first bad byte.

   With CNT bytes available the continuation bytes are merged into CH
   six bits at a time.  The sequence is rejected through `errout' when
   a trailing byte is not a continuation byte, when CNT > 2 and the
   value would have fit into a shorter sequence, or when the value is a
   UTF-16 surrogate (0xd800..0xdfff).  A valid CH is stored as one
   uint32_t.

   NOTE(review): inptr, inend, outptr, result and the error-handler
   macro are not defined in this file; presumably <iconv/loop.c>
   supplies them when it expands BODY -- confirm there.  */
#define BODY \
  { \
    /* Next input byte. */ \
    uint32_t ch = *inptr; \
    \
    if (__glibc_likely (ch < 0x80)) \
      { \
        /* One byte sequence. */ \
        ++inptr; \
      } \
    else \
      { \
        uint_fast32_t cnt; \
        uint_fast32_t i; \
        \
        if (ch >= 0xc2 && ch < 0xe0) \
          { \
            /* We expect two bytes. The first byte cannot be 0xc0 or 0xc1, \
               otherwise the wide character could have been represented \
               using a single byte. */ \
            cnt = 2; \
            ch &= 0x1f; \
          } \
        else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
          { \
            /* We expect three bytes. */ \
            cnt = 3; \
            ch &= 0x0f; \
          } \
        else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
          { \
            /* We expect four bytes. */ \
            cnt = 4; \
            ch &= 0x07; \
          } \
        else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
          { \
            /* We expect five bytes. */ \
            cnt = 5; \
            ch &= 0x03; \
          } \
        else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
          { \
            /* We expect six bytes. */ \
            cnt = 6; \
            ch &= 0x01; \
          } \
        else \
          { \
            /* Search the end of this ill-formed UTF-8 character. This \
               is the next byte with (x & 0xc0) != 0x80. */ \
            i = 0; \
            do \
              ++i; \
            while (inptr + i < inend \
                   && (*(inptr + i) & 0xc0) == 0x80 \
                   && i < 5); \
            \
          errout: \
            STANDARD_FROM_LOOP_ERR_HANDLER (i); \
          } \
        \
        if (__glibc_unlikely (inptr + cnt > inend)) \
          { \
            /* We don't have enough input. But before we report that check \
               that all the bytes are correct. */ \
            for (i = 1; inptr + i < inend; ++i) \
              if ((inptr[i] & 0xc0) != 0x80) \
                break; \
            \
            if (__glibc_likely (inptr + i == inend)) \
              { \
                result = __GCONV_INCOMPLETE_INPUT; \
                break; \
              } \
            \
            goto errout; \
          } \
        \
        /* Read the possible remaining bytes. */ \
        for (i = 1; i < cnt; ++i) \
          { \
            uint32_t byte = inptr[i]; \
            \
            if ((byte & 0xc0) != 0x80) \
              /* This is an illegal encoding. */ \
              break; \
            \
            ch <<= 6; \
            ch |= byte & 0x3f; \
          } \
        \
        /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
           If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
           have been represented with fewer than cnt bytes. */ \
        if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
            /* Do not accept UTF-16 surrogates. */ \
            || (ch >= 0xd800 && ch <= 0xdfff)) \
          { \
            /* This is an illegal encoding. */ \
            goto errout; \
          } \
        \
        inptr += cnt; \
      } \
    \
    /* Now adjust the pointers and store the result. */ \
    *((uint32_t *) outptr) = ch; \
    outptr += sizeof (uint32_t); \
  }
#define LOOP_NEED_FLAGS
1061
/* Save an incomplete UTF-8 sequence in the conversion state.

   The number of bytes available (inend - *inptrp) is stored in
   state->__count and the expected total length CNT, derived from the
   first byte, is or-ed in shifted left by eight bits.  The payload
   bits of the first byte and of every available following byte are
   accumulated in CH, which is then shifted left by six bits for each
   byte still missing and stored in state->__value.__wch.  *inptrp is
   advanced to inend as a side effect.

   NOTE(review): state, inptrp and inend are not defined in this file;
   presumably <iconv/loop.c> supplies them where it expands STORE_REST
   -- confirm there.  */
#define STORE_REST \
  { \
    /* We store the remaining bytes while converting them into the UCS4 \
       format. We can assume that the first byte in the buffer is \
       correct and that it requires a larger number of bytes than there \
       are in the input buffer. */ \
    wint_t ch = **inptrp; \
    size_t cnt, r; \
    \
    state->__count = inend - *inptrp; \
    \
    assert (ch != 0xc0 && ch != 0xc1); \
    if (ch >= 0xc2 && ch < 0xe0) \
      { \
        /* We expect two bytes. The first byte cannot be 0xc0 or \
           0xc1, otherwise the wide character could have been \
           represented using a single byte. */ \
        cnt = 2; \
        ch &= 0x1f; \
      } \
    else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
      { \
        /* We expect three bytes. */ \
        cnt = 3; \
        ch &= 0x0f; \
      } \
    else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
      { \
        /* We expect four bytes. */ \
        cnt = 4; \
        ch &= 0x07; \
      } \
    else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
      { \
        /* We expect five bytes. */ \
        cnt = 5; \
        ch &= 0x03; \
      } \
    else \
      { \
        /* We expect six bytes. */ \
        cnt = 6; \
        ch &= 0x01; \
      } \
    \
    /* The first byte is already consumed. */ \
    r = cnt - 1; \
    while (++(*inptrp) < inend) \
      { \
        ch <<= 6; \
        ch |= **inptrp & 0x3f; \
        --r; \
      } \
    \
    /* Shift for the so far missing bytes. */ \
    ch <<= r * 6; \
    \
    /* Store the number of bytes expected for the entire sequence. */ \
    state->__count |= cnt << 8; \
    \
    /* Store the value. */ \
    state->__value.__wch = ch; \
  }
1125
/* Rebuild the bytes of a partially read UTF-8 sequence from the
   conversion state (the layout written by STORE_REST).

   The total sequence length is taken from state->__count >> 8 and the
   number of bytes seen so far (state->__count & 255) is stored in
   inlen.  bytebuf[0] receives the length marker for that total length
   merged with the top payload bits; bytebuf[1] .. bytebuf[inlen - 1]
   receive continuation bytes rebuilt from six-bit groups of
   state->__value.__wch.  Positions at or beyond inlen are left
   untouched.

   NOTE(review): state, inlen and bytebuf are not defined in this file;
   presumably <iconv/loop.c> supplies them where it expands
   UNPACK_BYTES -- confirm there.  */
#define UNPACK_BYTES \
  { \
    static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
    wint_t wch = state->__value.__wch; \
    size_t ntotal = state->__count >> 8; \
    \
    inlen = state->__count & 255; \
    \
    bytebuf[0] = inmask[ntotal - 2]; \
    \
    do \
      { \
        if (--ntotal < inlen) \
          bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
        wch >>= 6; \
      } \
    while (ntotal > 1); \
    \
    bytebuf[0] |= wch; \
  }
1146
/* Reset the conversion state: zeroing __count discards both the stored
   byte count and the expected sequence length kept there by
   STORE_REST.  */
#define CLEAR_STATE \
  state->__count = 0
1149
1150
1151#include <iconv/loop.c>
1152#include <iconv/skeleton.c>
1153
1154
/* Convert from UCS2 to the internal (UCS4-like) format.  Input units
   are two bytes, output units four bytes.  */
#define DEFINE_INIT 0
#define DEFINE_FINI 0
#define MIN_NEEDED_FROM 2
#define MIN_NEEDED_TO 4
#define FROM_DIRECTION 1
#define FROM_LOOP ucs2_internal_loop
#define TO_LOOP ucs2_internal_loop /* This is not used. */
#define FUNCTION_NAME __gconv_transform_ucs2_internal
#define ONE_DIRECTION 1

#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
#define LOOPFCT FROM_LOOP
/* Conversion of one character: the 16-bit unit is read with get16; a
   value in the surrogate range 0xd800..0xdfff is handed to
   STANDARD_FROM_LOOP_ERR_HANDLER with a length of 2, any other value is
   stored zero-extended as one uint32_t and the input advances by two
   bytes.
   NOTE(review): get16, inptr, outptr and the error-handler macro are
   not defined in this file; presumably <iconv/loop.c> or the headers
   it pulls in supply them -- confirm there.  */
#define BODY \
  { \
    uint16_t u1 = get16 (inptr); \
    \
    if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
      { \
        /* Surrogate characters in UCS-2 input are not valid. Reject \
           them. (Catching this here is not security relevant.) */ \
        STANDARD_FROM_LOOP_ERR_HANDLER (2); \
      } \
    \
    *((uint32_t *) outptr) = u1; \
    outptr += sizeof (uint32_t); \
    inptr += 2; \
  }
#define LOOP_NEED_FLAGS
1185#include <iconv/loop.c>
1186#include <iconv/skeleton.c>
1187
1188
1189/* Convert from the internal (UCS4-like) format to UCS2. */
1190#define DEFINE_INIT 0
1191#define DEFINE_FINI 0
1192#define MIN_NEEDED_FROM 4
1193#define MIN_NEEDED_TO 2
1194#define FROM_DIRECTION 1
1195#define FROM_LOOP internal_ucs2_loop
1196#define TO_LOOP internal_ucs2_loop /* This is not used. */
1197#define FUNCTION_NAME __gconv_transform_internal_ucs2
1198#define ONE_DIRECTION 1
1199
1200#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1201#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1202#define LOOPFCT FROM_LOOP
1203#define BODY \
1204 { \
1205 uint32_t val = *((const uint32_t *) inptr); \
1206 \
1207 if (__glibc_unlikely (val >= 0x10000)) \
1208 { \
1209 UNICODE_TAG_HANDLER (val, 4); \
1210 STANDARD_TO_LOOP_ERR_HANDLER (4); \
1211 } \
1212 else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1213 { \
1214 /* Surrogate characters in UCS-4 input are not valid. \
1215 We must catch this, because the UCS-2 output might be \
1216 interpreted as UTF-16 by other programs. If we let \
1217 surrogates pass through, attackers could make a security \
1218 hole exploit by synthesizing any desired plane 1-16 \
1219 character. */ \
1220 result = __GCONV_ILLEGAL_INPUT; \
1221 if (! ignore_errors_p ()) \
1222 break; \
1223 inptr += 4; \
1224 ++*irreversible; \
1225 continue; \
1226 } \
1227 else \
1228 { \
1229 put16 (outptr, val); \
1230 outptr += sizeof (uint16_t); \
1231 inptr += 4; \
1232 } \
1233 }
1234#define LOOP_NEED_FLAGS
1235#include <iconv/loop.c>
1236#include <iconv/skeleton.c>
1237
1238
1239/* Convert from UCS2 in other endianness to the internal (UCS4-like) format. */
1240#define DEFINE_INIT 0
1241#define DEFINE_FINI 0
1242#define MIN_NEEDED_FROM 2
1243#define MIN_NEEDED_TO 4
1244#define FROM_DIRECTION 1
1245#define FROM_LOOP ucs2reverse_internal_loop
1246#define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/
1247#define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
1248#define ONE_DIRECTION 1
1249
1250#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1251#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1252#define LOOPFCT FROM_LOOP
1253#define BODY \
1254 { \
1255 uint16_t u1 = bswap_16 (get16 (inptr)); \
1256 \
1257 if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
1258 { \
1259 /* Surrogate characters in UCS-2 input are not valid. Reject \
1260 them. (Catching this here is not security relevant.) */ \
1261 if (! ignore_errors_p ()) \
1262 { \
1263 result = __GCONV_ILLEGAL_INPUT; \
1264 break; \
1265 } \
1266 inptr += 2; \
1267 ++*irreversible; \
1268 continue; \
1269 } \
1270 \
1271 *((uint32_t *) outptr) = u1; \
1272 outptr += sizeof (uint32_t); \
1273 inptr += 2; \
1274 }
1275#define LOOP_NEED_FLAGS
1276#include <iconv/loop.c>
1277#include <iconv/skeleton.c>
1278
1279
1280/* Convert from the internal (UCS4-like) format to UCS2 in other endianness. */
1281#define DEFINE_INIT 0
1282#define DEFINE_FINI 0
1283#define MIN_NEEDED_FROM 4
1284#define MIN_NEEDED_TO 2
1285#define FROM_DIRECTION 1
1286#define FROM_LOOP internal_ucs2reverse_loop
1287#define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/
1288#define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
1289#define ONE_DIRECTION 1
1290
1291#define MIN_NEEDED_INPUT MIN_NEEDED_FROM
1292#define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
1293#define LOOPFCT FROM_LOOP
1294#define BODY \
1295 { \
1296 uint32_t val = *((const uint32_t *) inptr); \
1297 if (__glibc_unlikely (val >= 0x10000)) \
1298 { \
1299 UNICODE_TAG_HANDLER (val, 4); \
1300 STANDARD_TO_LOOP_ERR_HANDLER (4); \
1301 } \
1302 else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
1303 { \
1304 /* Surrogate characters in UCS-4 input are not valid. \
1305 We must catch this, because the UCS-2 output might be \
1306 interpreted as UTF-16 by other programs. If we let \
1307 surrogates pass through, attackers could make a security \
1308 hole exploit by synthesizing any desired plane 1-16 \
1309 character. */ \
1310 if (! ignore_errors_p ()) \
1311 { \
1312 result = __GCONV_ILLEGAL_INPUT; \
1313 break; \
1314 } \
1315 inptr += 4; \
1316 ++*irreversible; \
1317 continue; \
1318 } \
1319 else \
1320 { \
1321 put16 (outptr, bswap_16 (val)); \
1322 outptr += sizeof (uint16_t); \
1323 inptr += 4; \
1324 } \
1325 }
1326#define LOOP_NEED_FLAGS
1327#include <iconv/loop.c>
1328#include <iconv/skeleton.c>
1329

source code of glibc/iconv/gconv_simple.c