1 | /* Copyright (C) 2000-2022 Free Software Foundation, Inc. |
2 | This file is part of the GNU C Library. |
3 | |
4 | The GNU C Library is free software; you can redistribute it and/or |
5 | modify it under the terms of the GNU Lesser General Public |
6 | License as published by the Free Software Foundation; either |
7 | version 2.1 of the License, or (at your option) any later version. |
8 | |
9 | The GNU C Library is distributed in the hope that it will be useful, |
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | Lesser General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU Lesser General Public |
15 | License along with the GNU C Library; if not, see |
16 | <https://www.gnu.org/licenses/>. */ |
17 | |
18 | /* Create a table from CHARSET to Unicode. |
19 | This is a good test for CHARSET's iconv() module, in particular the |
20 | FROM_LOOP BODY macro. */ |
21 | |
22 | #include <stddef.h> |
23 | #include <stdio.h> |
24 | #include <stdlib.h> |
25 | #include <string.h> |
26 | #include <iconv.h> |
27 | #include <errno.h> |
28 | |
/* If nonzero, ignore conversions outside Unicode plane 0: utf8_decode
   then returns NULL for any result containing a code point above 0xFFFF.
   Set once by main according to the charset under test.  */
static int bmp_only;
31 | |
/* Formats the BUFLEN bytes of BUF as "0x" followed by two uppercase
   hexadecimal digits per byte, e.g. { 0x8E, 0xA1 } gives "0x8EA1".

   BUFLEN must be between 1 and 4; any other length aborts the program.
   The result lives in a static buffer that the next call overwrites.  */
static const char *
hexbuf (const unsigned char buf[], unsigned int buflen)
{
  static char msg[50];
  size_t used;
  unsigned int i;

  if (buflen < 1 || buflen > 4)
    abort ();

  /* At most 2 + 2 * 4 characters plus the terminator are produced, so
     the bounded writes below can never truncate.  */
  used = (size_t) snprintf (msg, sizeof msg, "0x");
  for (i = 0; i < buflen; i++)
    used += (size_t) snprintf (msg + used, sizeof msg - used, "%02X",
                               (unsigned int) buf[i]);

  return msg;
}
57 | |
/* Attempts to convert the byte buffer BUF (BUFLEN bytes) to OUT (room for
   12 bytes) using the conversion descriptor CD.

   CD is reset to its initial state before the conversion.  When the
   conversion itself succeeds, a second iconv call with a null input
   buffer lets the converter write any pending output to OUT as well.

   Returns the number of bytes written to OUT, or 0 if iconv fails with
   EINVAL (the input is only the beginning of a sequence), or -1 if it
   fails with EILSEQ (the input is invalid).  Any other iconv failure, or
   a successful conversion that leaves input bytes unconsumed, prints a
   diagnostic to stderr and exits with status 1.  */
static int
try (iconv_t cd, unsigned char buf[], unsigned int buflen, unsigned char *out)
{
  enum { OUT_SIZE = 12 };
  char *inbuf = (char *) buf;
  size_t inbytesleft = buflen;
  char *outbuf = (char *) out;
  size_t outbytesleft = OUT_SIZE;
  size_t result;

  /* Start every attempt from the initial conversion state.  */
  (void) iconv (cd, NULL, NULL, NULL, NULL);

  result = iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
  if (result != (size_t) -1)
    result = iconv (cd, NULL, NULL, &outbuf, &outbytesleft);

  if (result == (size_t) -1)
    {
      /* Capture errno before any stdio call has a chance to change it.  */
      int saved_errno = errno;

      if (saved_errno == EILSEQ)
        return -1;
      if (saved_errno == EINVAL)
        return 0;

      fprintf (stderr, "%s: iconv error: %s\n",
               hexbuf (buf, buflen), strerror (saved_errno));
      exit (1);
    }

  if (inbytesleft != 0)
    {
      fprintf (stderr, "%s: inbytes = %ld, outbytes = %ld\n",
               hexbuf (buf, buflen),
               (long) (buflen - inbytesleft),
               (long) (OUT_SIZE - outbytesleft));
      exit (1);
    }

  return (int) (OUT_SIZE - outbytesleft);
}
107 | |
108 | /* Returns the out[] buffer as a Unicode value, formatted as 0x%04X. */ |
109 | static const char * |
110 | utf8_decode (const unsigned char *out, unsigned int outlen) |
111 | { |
112 | static char hexbuf[84]; |
113 | char *p = hexbuf; |
114 | |
115 | while (outlen > 0) |
116 | { |
117 | if (p > hexbuf) |
118 | *p++ = ' '; |
119 | |
120 | if (out[0] < 0x80) |
121 | { |
122 | sprintf (s: p, format: "0x%04X" , out[0]); |
123 | out += 1; outlen -= 1; |
124 | } |
125 | else if (out[0] >= 0xc0 && out[0] < 0xe0 && outlen >= 2) |
126 | { |
127 | sprintf (s: p, format: "0x%04X" , ((out[0] & 0x1f) << 6) + (out[1] & 0x3f)); |
128 | out += 2; outlen -= 2; |
129 | } |
130 | else if (out[0] >= 0xe0 && out[0] < 0xf0 && outlen >= 3) |
131 | { |
132 | sprintf (s: p, format: "0x%04X" , ((out[0] & 0x0f) << 12) |
133 | + ((out[1] & 0x3f) << 6) + (out[2] & 0x3f)); |
134 | out += 3; outlen -= 3; |
135 | } |
136 | else if (out[0] >= 0xf0 && out[0] < 0xf8 && outlen >= 4) |
137 | { |
138 | sprintf (s: p, format: "0x%04X" , ((out[0] & 0x07) << 18) |
139 | + ((out[1] & 0x3f) << 12) |
140 | + ((out[2] & 0x3f) << 6) + (out[3] & 0x3f)); |
141 | out += 4; outlen -= 4; |
142 | } |
143 | else if (out[0] >= 0xf8 && out[0] < 0xfc && outlen >= 5) |
144 | { |
145 | sprintf (s: p, format: "0x%04X" , ((out[0] & 0x03) << 24) |
146 | + ((out[1] & 0x3f) << 18) |
147 | + ((out[2] & 0x3f) << 12) |
148 | + ((out[3] & 0x3f) << 6) + (out[4] & 0x3f)); |
149 | out += 5; outlen -= 5; |
150 | } |
151 | else if (out[0] >= 0xfc && out[0] < 0xfe && outlen >= 6) |
152 | { |
153 | sprintf (s: p, format: "0x%04X" , ((out[0] & 0x01) << 30) |
154 | + ((out[1] & 0x3f) << 24) |
155 | + ((out[2] & 0x3f) << 18) |
156 | + ((out[3] & 0x3f) << 12) |
157 | + ((out[4] & 0x3f) << 6) + (out[5] & 0x3f)); |
158 | out += 6; outlen -= 6; |
159 | } |
160 | else |
161 | { |
162 | sprintf (s: p, format: "0x????" ); |
163 | out += 1; outlen -= 1; |
164 | } |
165 | |
166 | if (bmp_only && strlen (s: p) > 6) |
167 | /* Ignore conversions outside Unicode plane 0. */ |
168 | return NULL; |
169 | |
170 | p += strlen (s: p); |
171 | } |
172 | |
173 | return hexbuf; |
174 | } |
175 | |
176 | int |
177 | main (int argc, char *argv[]) |
178 | { |
179 | const char *charset; |
180 | iconv_t cd; |
181 | int search_depth; |
182 | |
183 | if (argc != 2) |
184 | { |
185 | fprintf (stderr, format: "Usage: tst-table-from charset\n" ); |
186 | exit (status: 1); |
187 | } |
188 | charset = argv[1]; |
189 | |
190 | cd = iconv_open (tocode: "UTF-8" , fromcode: charset); |
191 | if (cd == (iconv_t)(-1)) |
192 | { |
193 | perror (s: "iconv_open" ); |
194 | exit (status: 1); |
195 | } |
196 | |
197 | /* When testing UTF-8 or GB18030, stop at 0x10000, otherwise the output |
198 | file gets too big. */ |
199 | bmp_only = (strcmp (s1: charset, s2: "UTF-8" ) == 0 |
200 | || strcmp (s1: charset, s2: "GB18030" ) == 0); |
201 | search_depth = (strcmp (s1: charset, s2: "UTF-8" ) == 0 ? 3 : 4); |
202 | |
203 | { |
204 | unsigned char out[12]; |
205 | unsigned char buf[4]; |
206 | unsigned int i0, i1, i2, i3; |
207 | int result; |
208 | |
209 | for (i0 = 0; i0 < 0x100; i0++) |
210 | { |
211 | buf[0] = i0; |
212 | result = try (cd, buf, buflen: 1, out); |
213 | if (result < 0) |
214 | { |
215 | } |
216 | else if (result > 0) |
217 | { |
218 | const char *unicode = utf8_decode (out, outlen: result); |
219 | if (unicode != NULL) |
220 | printf (format: "0x%02X\t%s\n" , i0, unicode); |
221 | } |
222 | else |
223 | { |
224 | for (i1 = 0; i1 < 0x100; i1++) |
225 | { |
226 | buf[1] = i1; |
227 | result = try (cd, buf, buflen: 2, out); |
228 | if (result < 0) |
229 | { |
230 | } |
231 | else if (result > 0) |
232 | { |
233 | const char *unicode = utf8_decode (out, outlen: result); |
234 | if (unicode != NULL) |
235 | printf (format: "0x%02X%02X\t%s\n" , i0, i1, unicode); |
236 | } |
237 | else |
238 | { |
239 | for (i2 = 0; i2 < 0x100; i2++) |
240 | { |
241 | buf[2] = i2; |
242 | result = try (cd, buf, buflen: 3, out); |
243 | if (result < 0) |
244 | { |
245 | } |
246 | else if (result > 0) |
247 | { |
248 | const char *unicode = utf8_decode (out, outlen: result); |
249 | if (unicode != NULL) |
250 | printf (format: "0x%02X%02X%02X\t%s\n" , |
251 | i0, i1, i2, unicode); |
252 | } |
253 | else if (search_depth > 3) |
254 | { |
255 | for (i3 = 0; i3 < 0x100; i3++) |
256 | { |
257 | buf[3] = i3; |
258 | result = try (cd, buf, buflen: 4, out); |
259 | if (result < 0) |
260 | { |
261 | } |
262 | else if (result > 0) |
263 | { |
264 | const char *unicode = |
265 | utf8_decode (out, outlen: result); |
266 | if (unicode != NULL) |
267 | printf (format: "0x%02X%02X%02X%02X\t%s\n" , |
268 | i0, i1, i2, i3, unicode); |
269 | } |
270 | else |
271 | { |
272 | fprintf (stderr, |
273 | format: "%s: incomplete byte sequence\n" , |
274 | hexbuf (buf, buflen: 4)); |
275 | exit (status: 1); |
276 | } |
277 | } |
278 | } |
279 | } |
280 | } |
281 | } |
282 | } |
283 | } |
284 | } |
285 | |
286 | if (iconv_close (cd: cd) < 0) |
287 | { |
288 | perror (s: "iconv_close" ); |
289 | exit (status: 1); |
290 | } |
291 | |
292 | if (ferror (stdin) || fflush (stdout) || ferror (stdout)) |
293 | { |
294 | fprintf (stderr, format: "I/O error\n" ); |
295 | exit (status: 1); |
296 | } |
297 | |
298 | return 0; |
299 | } |
300 | |