| 1 | /* bug 19727: Testing UTF conversions with UTF16 surrogates as input. |
| 2 | Copyright (C) 2016-2024 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library; if not, see |
| 17 | <https://www.gnu.org/licenses/>. */ |
| 18 | |
| 19 | #include <stdio.h> |
| 20 | #include <stdlib.h> |
| 21 | #include <errno.h> |
| 22 | #include <string.h> |
| 23 | #include <inttypes.h> |
| 24 | #include <iconv.h> |
| 25 | #include <byteswap.h> |
| 26 | |
/* Convert the INBUFLEN bytes at INBUF from charset FROM to charset TO with
   a single iconv call and compare the outcome against EXP_ERRNO.

   If EXP_ERRNO is 0, the call must not return (size_t) -1 and errno must
   still be 0 afterwards.  Otherwise the call must return (size_t) -1 and
   errno must equal EXP_ERRNO.  LINE is only used in the diagnostics to
   identify the caller.

   Returns the number of checks that failed (0 if the conversion behaved
   as expected), or 1 if the conversion descriptor could not be opened.  */
static int
run_conversion (const char *from, const char *to, char *inbuf, size_t inbuflen,
                int exp_errno, int line)
{
  char outbuf[16];
  int fails = 0;

  iconv_t cd = iconv_open (to, from);
  if (cd == (iconv_t) -1)
    {
      printf ("line %d: cannot convert from %s to %s: %m\n", line, from, to);
      return 1;
    }

  /* iconv advances the pointers and decrements the lengths, so hand it
     local copies.  */
  char *inptr = inbuf;
  size_t inlen = inbuflen;
  char *outptr = outbuf;
  size_t outlen = sizeof (outbuf);

  /* Clear errno first so that a successful call can be told apart from a
     stale error value, and save it right away before any other library
     call has a chance to overwrite it.  */
  errno = 0;
  size_t n = iconv (cd, &inptr, &inlen, &outptr, &outlen);
  int e = errno;

  if (exp_errno == 0)
    {
      if (n == (size_t) -1)
        {
          puts ("n should be >= 0, but n == -1");
          fails++;
        }

      if (e != 0)
        {
          printf ("errno should be 0: 'Success', but errno == %d: '%s'\n",
                  e, strerror (e));
          fails++;
        }
    }
  else
    {
      if (n != (size_t) -1)
        {
          /* n is a size_t, hence %zu and not %zd.  */
          printf ("n should be -1, but n == %zu\n", n);
          fails++;
        }

      if (e != exp_errno)
        {
          printf ("errno should be %d: '%s', but errno == %d: '%s'\n",
                  exp_errno, strerror (exp_errno), e, strerror (e));
          fails++;
        }
    }

  iconv_close (cd);

  if (fails > 0)
    {
      printf ("Errors in line %d while converting %s to %s.\n\n",
              line, from, to);
    }

  return fails;
}
| 98 | |
/* Feed code points and code units at the edges of the UTF-16 surrogate
   range through run_conversion for a set of UCS4/UTF-8/UTF-16/UTF-32
   charset pairs and accumulate the number of failed checks.

   Returns EXIT_SUCCESS if every check passed and EXIT_FAILURE
   otherwise.  */
static int
do_test (void)
{
  int fails = 0;
  char buf[4];

  /* This test runs iconv() with a UTF character in the range of a UTF-16
     surrogate.
     The UTF-16 high surrogate is in range 0xD800..0xDBFF and
     the UTF-16 low surrogate is in range 0xDC00..0xDFFF.
     Converting from or to UTF-xx has to report errors in those cases.
     In UTF-16, a surrogate pair with a high surrogate in front of a low
     surrogate is valid.  */

  /* Use RUN_UCS4_UTF32_INPUT to test conversion ...

     ... from INTERNAL to UTF-xx[LE|BE]:
     Converting from UCS4 to UTF-xx[LE|BE] first converts UCS4 to INTERNAL
     without checking for UTF-16 surrogate values
     and then converts from INTERNAL to UTF-xx[LE|BE].
     The latter conversion has to report an error in those cases.

     ... from UTF-32[LE|BE] to INTERNAL:
     Converting directly from UTF-32LE to UTF-8|16 is needed,
     because e.g. s390x has iconv-modules which convert directly.

     The bytes are given in big-endian order; the macro reverses them
     before running the UTF-32LE conversions.  The body is wrapped in
     do { } while (0) so that an invocation is a single statement.  */
#define RUN_UCS4_UTF32_INPUT(b0, b1, b2, b3, err, line) \
  do \
    { \
      buf[0] = (b0); \
      buf[1] = (b1); \
      buf[2] = (b2); \
      buf[3] = (b3); \
      fails += run_conversion ("UCS4", "UTF-8", buf, 4, (err), (line)); \
      fails += run_conversion ("UCS4", "UTF-16LE", buf, 4, (err), (line)); \
      fails += run_conversion ("UCS4", "UTF-16BE", buf, 4, (err), (line)); \
      fails += run_conversion ("UCS4", "UTF-32LE", buf, 4, (err), (line)); \
      fails += run_conversion ("UCS4", "UTF-32BE", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32BE", "WCHAR_T", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32BE", "UTF-8", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32BE", "UTF-16LE", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32BE", "UTF-16BE", buf, 4, (err), (line)); \
      buf[0] = (b3); \
      buf[1] = (b2); \
      buf[2] = (b1); \
      buf[3] = (b0); \
      fails += run_conversion ("UTF-32LE", "WCHAR_T", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32LE", "UTF-8", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32LE", "UTF-16LE", buf, 4, (err), (line)); \
      fails += run_conversion ("UTF-32LE", "UTF-16BE", buf, 4, (err), (line)); \
    } \
  while (0)

  /* Use UCS4/UTF32 input of 0xD7FF.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xD7, 0xFF, 0, __LINE__);

  /* Use UCS4/UTF32 input of 0xD800.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xD8, 0x00, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDBFF.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDB, 0xFF, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDC00.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDC, 0x00, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xDFFF.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xDF, 0xFF, EILSEQ, __LINE__);

  /* Use UCS4/UTF32 input of 0xE000.  */
  RUN_UCS4_UTF32_INPUT (0x0, 0x0, 0xE0, 0x00, 0, __LINE__);


  /* Use RUN_UTF16_INPUT to test conversion from UTF16[LE|BE] to INTERNAL.
     Converting directly from UTF-16 to UTF-8|32 is needed,
     because e.g. s390x has iconv-modules which convert directly.
     Use len == 2 or 4 to specify one or two UTF-16 characters.

     The bytes are given in big-endian order; the macro swaps the bytes of
     each 16-bit unit before running the UTF-16LE conversions.  */
#define RUN_UTF16_INPUT(b0, b1, b2, b3, len, err, line) \
  do \
    { \
      buf[0] = (b0); \
      buf[1] = (b1); \
      buf[2] = (b2); \
      buf[3] = (b3); \
      fails += run_conversion ("UTF-16BE", "WCHAR_T", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16BE", "UTF-8", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16BE", "UTF-32LE", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16BE", "UTF-32BE", buf, (len), (err), (line)); \
      buf[0] = (b1); \
      buf[1] = (b0); \
      buf[2] = (b3); \
      buf[3] = (b2); \
      fails += run_conversion ("UTF-16LE", "WCHAR_T", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16LE", "UTF-8", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16LE", "UTF-32LE", buf, (len), (err), (line)); \
      fails += run_conversion ("UTF-16LE", "UTF-32BE", buf, (len), (err), (line)); \
    } \
  while (0)

  /* Use UTF16 input of 0xD7FF.  */
  RUN_UTF16_INPUT (0xD7, 0xFF, 0xD7, 0xFF, 4, 0, __LINE__);

  /* Use [single] UTF16 high surrogate 0xD800 [with a valid character behind].
     And check a UTF16 surrogate pair [without valid low surrogate].  */
  RUN_UTF16_INPUT (0xD8, 0x0, 0x0, 0x0, 2, EINVAL, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xD8, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xE0, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);

  /* Use [single] UTF16 high surrogate 0xDBFF [with a valid character behind].
     And check a UTF16 surrogate pair [without valid low surrogate].  */
  RUN_UTF16_INPUT (0xDB, 0xFF, 0x0, 0x0, 2, EINVAL, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xDB, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xE0, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);

  /* Use single UTF16 low surrogate 0xDC00 [with a valid character behind].
     And check a UTF16 surrogate pair [without valid high surrogate].  */
  RUN_UTF16_INPUT (0xDC, 0x0, 0x0, 0x0, 2, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDC, 0x0, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xD8, 0x0, 0xDC, 0x0, 4, 0, __LINE__);
  RUN_UTF16_INPUT (0xD7, 0xFF, 0xDC, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDC, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xE0, 0x0, 0xDC, 0x0, 4, EILSEQ, __LINE__);

  /* Use single UTF16 low surrogate 0xDFFF [with a valid character behind].
     And check a UTF16 surrogate pair [without valid high surrogate].  */
  RUN_UTF16_INPUT (0xDF, 0xFF, 0x0, 0x0, 2, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDF, 0xFF, 0xD7, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDB, 0xFF, 0xDF, 0xFF, 4, 0, __LINE__);
  RUN_UTF16_INPUT (0xD7, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xDF, 0xFF, 0xDF, 0xFF, 4, EILSEQ, __LINE__);
  RUN_UTF16_INPUT (0xE0, 0x0, 0xDF, 0xFF, 4, EILSEQ, __LINE__);

  /* Use UTF16 input of 0xE000.  */
  RUN_UTF16_INPUT (0xE0, 0x0, 0xE0, 0x0, 4, 0, __LINE__);


  /* Use RUN_UTF8_3BYTE_INPUT to test conversion from UTF-8 to INTERNAL.
     Converting directly from UTF-8 to UTF-16|32 is needed,
     because e.g. s390x has iconv-modules which convert directly.  */
#define RUN_UTF8_3BYTE_INPUT(b0, b1, b2, err, line) \
  do \
    { \
      buf[0] = (b0); \
      buf[1] = (b1); \
      buf[2] = (b2); \
      fails += run_conversion ("UTF-8", "WCHAR_T", buf, 3, (err), (line)); \
      fails += run_conversion ("UTF-8", "UTF-16LE", buf, 3, (err), (line)); \
      fails += run_conversion ("UTF-8", "UTF-16BE", buf, 3, (err), (line)); \
      fails += run_conversion ("UTF-8", "UTF-32LE", buf, 3, (err), (line)); \
      fails += run_conversion ("UTF-8", "UTF-32BE", buf, 3, (err), (line)); \
    } \
  while (0)

  /* Use UTF-8 input of 0xD7FF.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0x9F, 0xBF, 0, __LINE__);

  /* Use UTF-8 input of 0xD800.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0xA0, 0x80, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDBFF.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0xAF, 0xBF, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDC00.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0xB0, 0x80, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xDFFF.  */
  RUN_UTF8_3BYTE_INPUT (0xED, 0xBF, 0xBF, EILSEQ, __LINE__);

  /* Use UTF-8 input of 0xF000.  */
  RUN_UTF8_3BYTE_INPUT (0xEF, 0x80, 0x80, 0, __LINE__);

  return fails > 0 ? EXIT_FAILURE : EXIT_SUCCESS;
}
| 261 | |
| 262 | #define TEST_FUNCTION do_test () |
| 263 | #include "../test-skeleton.c" |
| 264 | |