1/* UTF-8 to multibyte conversion.
2 Copyright (C) 2022-2024 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <errno.h>
20#include <uchar.h>
21#include <wchar.h>
22
23
/* Conversion state used by c8rtomb when the caller passes a null PS.
   As a file-scope static object it starts out zeroed, which c8rtomb
   treats as "no sequence in progress" (top bit of __count clear), and
   it is shared by every call that passes a null PS.  */
static mbstate_t state;
26
27size_t
28c8rtomb (char *s, char8_t c8, mbstate_t *ps)
29{
30 /* This implementation depends on the converter invoked by wcrtomb not
31 needing to retain state in either the top most bit of ps->__count or
32 in ps->__value between invocations. This implementation uses the
33 top most bit of ps->__count to indicate that trailing code units are
34 expected and uses ps->__value to store previously seen code units. */
35
36 wchar_t wc;
37
38 if (ps == NULL)
39 ps = &state;
40
41 if (s == NULL)
42 {
43 /* if 's' is a null pointer, behave as if u8'\0' was passed as 'c8'. If
44 this occurs for an incomplete code unit sequence, then an error will
45 be reported below. */
46 c8 = u8""[0];
47 }
48
49 if (! (ps->__count & 0x80000000))
50 {
51 /* Initial state. */
52 if ((c8 >= 0x80 && c8 <= 0xC1) || c8 >= 0xF5)
53 {
54 /* An invalid lead code unit. */
55 __set_errno (EILSEQ);
56 return -1;
57 }
58 if (c8 >= 0xC2)
59 {
60 /* A valid lead code unit. */
61 ps->__count |= 0x80000000;
62 ps->__value.__wchb[0] = c8;
63 ps->__value.__wchb[3] = 1;
64 return 0;
65 }
66 /* A single byte (ASCII) code unit. */
67 wc = c8;
68 }
69 else
70 {
71 char8_t cu1 = ps->__value.__wchb[0];
72 if (ps->__value.__wchb[3] == 1)
73 {
74 /* A single lead code unit was previously seen. */
75 if ((c8 < 0x80 || c8 > 0xBF)
76 || (cu1 == 0xE0 && c8 < 0xA0)
77 || (cu1 == 0xED && c8 > 0x9F)
78 || (cu1 == 0xF0 && c8 < 0x90)
79 || (cu1 == 0xF4 && c8 > 0x8F))
80 {
81 /* An invalid second code unit. */
82 __set_errno (EILSEQ);
83 return -1;
84 }
85 if (cu1 >= 0xE0)
86 {
87 /* A three or four code unit sequence. */
88 ps->__value.__wchb[1] = c8;
89 ++ps->__value.__wchb[3];
90 return 0;
91 }
92 wc = ((cu1 & 0x1F) << 6)
93 + (c8 & 0x3F);
94 }
95 else
96 {
97 char8_t cu2 = ps->__value.__wchb[1];
98 /* A three or four byte code unit sequence. */
99 if (c8 < 0x80 || c8 > 0xBF)
100 {
101 /* An invalid third or fourth code unit. */
102 __set_errno (EILSEQ);
103 return -1;
104 }
105 if (ps->__value.__wchb[3] == 2 && cu1 >= 0xF0)
106 {
107 /* A four code unit sequence. */
108 ps->__value.__wchb[2] = c8;
109 ++ps->__value.__wchb[3];
110 return 0;
111 }
112 if (cu1 < 0xF0)
113 {
114 wc = ((cu1 & 0x0F) << 12)
115 + ((cu2 & 0x3F) << 6)
116 + (c8 & 0x3F);
117 }
118 else
119 {
120 char8_t cu3 = ps->__value.__wchb[2];
121 wc = ((cu1 & 0x07) << 18)
122 + ((cu2 & 0x3F) << 12)
123 + ((cu3 & 0x3F) << 6)
124 + (c8 & 0x3F);
125 }
126 }
127 ps->__count &= 0x7fffffff;
128 ps->__value.__wch = 0;
129 }
130
131 return wcrtomb (s, wc, ps);
132}
133

/* Source: glibc/wcsmbs/c8rtomb.c  */