1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Some of the source code in this file came from fs/cifs/cifs_unicode.c |
4 | * |
5 | * Copyright (c) International Business Machines Corp., 2000,2009 |
6 | * Modified by Steve French (sfrench@us.ibm.com) |
7 | * Modified by Namjae Jeon (linkinjeon@kernel.org) |
8 | */ |
9 | #include <linux/fs.h> |
10 | #include <linux/slab.h> |
11 | #include <asm/unaligned.h> |
12 | #include "glob.h" |
13 | #include "unicode.h" |
14 | #include "smb_common.h" |
15 | |
16 | /* |
17 | * cifs_mapchar() - convert a host-endian char to proper char in codepage |
18 | * @target: where converted character should be copied |
19 | * @from: host-endian source string |
20 | * @cp: codepage to which character should be converted |
21 | * @mapchar: should character be mapped according to mapchars mount option? |
22 | * |
23 | * This function handles the conversion of a single character. It is the |
24 | * responsibility of the caller to ensure that the target buffer is large |
25 | * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). |
26 | * |
27 | * Return: string length after conversion |
28 | */ |
29 | static int |
30 | cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, |
31 | bool mapchar) |
32 | { |
33 | int len = 1; |
34 | __u16 src_char; |
35 | |
36 | src_char = *from; |
37 | |
38 | if (!mapchar) |
39 | goto cp_convert; |
40 | |
41 | /* |
42 | * BB: Cannot handle remapping UNI_SLASH until all the calls to |
43 | * build_path_from_dentry are modified, as they use slash as |
44 | * separator. |
45 | */ |
46 | switch (src_char) { |
47 | case UNI_COLON: |
48 | *target = ':'; |
49 | break; |
50 | case UNI_ASTERISK: |
51 | *target = '*'; |
52 | break; |
53 | case UNI_QUESTION: |
54 | *target = '?'; |
55 | break; |
56 | case UNI_PIPE: |
57 | *target = '|'; |
58 | break; |
59 | case UNI_GRTRTHAN: |
60 | *target = '>'; |
61 | break; |
62 | case UNI_LESSTHAN: |
63 | *target = '<'; |
64 | break; |
65 | default: |
66 | goto cp_convert; |
67 | } |
68 | |
69 | out: |
70 | return len; |
71 | |
72 | cp_convert: |
73 | len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); |
74 | if (len <= 0) |
75 | goto surrogate_pair; |
76 | |
77 | goto out; |
78 | |
79 | surrogate_pair: |
80 | /* convert SURROGATE_PAIR and IVS */ |
81 | if (strcmp(cp->charset, "utf8" )) |
82 | goto unknown; |
83 | len = utf16s_to_utf8s(pwcs: from, len: 3, endian: UTF16_LITTLE_ENDIAN, s: target, maxlen: 6); |
84 | if (len <= 0) |
85 | goto unknown; |
86 | return len; |
87 | |
88 | unknown: |
89 | *target = '?'; |
90 | len = 1; |
91 | goto out; |
92 | } |
93 | |
94 | /* |
95 | * smb_utf16_bytes() - compute converted string length |
96 | * @from: pointer to input string |
97 | * @maxbytes: input string length |
98 | * @codepage: destination codepage |
99 | * |
100 | * Walk a utf16le string and return the number of bytes that the string will |
101 | * be after being converted to the given charset, not including any null |
102 | * termination required. Don't walk past maxbytes in the source buffer. |
103 | * |
104 | * Return: string length after conversion |
105 | */ |
106 | static int smb_utf16_bytes(const __le16 *from, int maxbytes, |
107 | const struct nls_table *codepage) |
108 | { |
109 | int i, j; |
110 | int charlen, outlen = 0; |
111 | int maxwords = maxbytes / 2; |
112 | char tmp[NLS_MAX_CHARSET_SIZE]; |
113 | __u16 ftmp[3]; |
114 | |
115 | for (i = 0; i < maxwords; i++) { |
116 | ftmp[0] = get_unaligned_le16(p: &from[i]); |
117 | if (ftmp[0] == 0) |
118 | break; |
119 | for (j = 1; j <= 2; j++) { |
120 | if (i + j < maxwords) |
121 | ftmp[j] = get_unaligned_le16(p: &from[i + j]); |
122 | else |
123 | ftmp[j] = 0; |
124 | } |
125 | |
126 | charlen = cifs_mapchar(target: tmp, from: ftmp, cp: codepage, mapchar: 0); |
127 | if (charlen > 0) |
128 | outlen += charlen; |
129 | else |
130 | outlen++; |
131 | } |
132 | |
133 | return outlen; |
134 | } |
135 | |
136 | /* |
137 | * smb_from_utf16() - convert utf16le string to local charset |
138 | * @to: destination buffer |
139 | * @from: source buffer |
140 | * @tolen: destination buffer size (in bytes) |
141 | * @fromlen: source buffer size (in bytes) |
142 | * @codepage: codepage to which characters should be converted |
143 | * @mapchar: should characters be remapped according to the mapchars option? |
144 | * |
145 | * Convert a little-endian utf16le string (as sent by the server) to a string |
146 | * in the provided codepage. The tolen and fromlen parameters are to ensure |
147 | * that the code doesn't walk off of the end of the buffer (which is always |
148 | * a danger if the alignment of the source buffer is off). The destination |
149 | * string is always properly null terminated and fits in the destination |
150 | * buffer. Returns the length of the destination string in bytes (including |
151 | * null terminator). |
152 | * |
153 | * Note that some windows versions actually send multiword UTF-16 characters |
154 | * instead of straight UTF16-2. The linux nls routines however aren't able to |
155 | * deal with those characters properly. In the event that we get some of |
156 | * those characters, they won't be translated properly. |
157 | * |
158 | * Return: string length after conversion |
159 | */ |
160 | static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, |
161 | const struct nls_table *codepage, bool mapchar) |
162 | { |
163 | int i, j, charlen, safelen; |
164 | int outlen = 0; |
165 | int nullsize = nls_nullsize(codepage); |
166 | int fromwords = fromlen / 2; |
167 | char tmp[NLS_MAX_CHARSET_SIZE]; |
168 | __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ |
169 | |
170 | /* |
171 | * because the chars can be of varying widths, we need to take care |
172 | * not to overflow the destination buffer when we get close to the |
173 | * end of it. Until we get to this offset, we don't need to check |
174 | * for overflow however. |
175 | */ |
176 | safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); |
177 | |
178 | for (i = 0; i < fromwords; i++) { |
179 | ftmp[0] = get_unaligned_le16(p: &from[i]); |
180 | if (ftmp[0] == 0) |
181 | break; |
182 | for (j = 1; j <= 2; j++) { |
183 | if (i + j < fromwords) |
184 | ftmp[j] = get_unaligned_le16(p: &from[i + j]); |
185 | else |
186 | ftmp[j] = 0; |
187 | } |
188 | |
189 | /* |
190 | * check to see if converting this character might make the |
191 | * conversion bleed into the null terminator |
192 | */ |
193 | if (outlen >= safelen) { |
194 | charlen = cifs_mapchar(target: tmp, from: ftmp, cp: codepage, mapchar); |
195 | if ((outlen + charlen) > (tolen - nullsize)) |
196 | break; |
197 | } |
198 | |
199 | /* put converted char into 'to' buffer */ |
200 | charlen = cifs_mapchar(target: &to[outlen], from: ftmp, cp: codepage, mapchar); |
201 | outlen += charlen; |
202 | |
203 | /* |
204 | * charlen (=bytes of UTF-8 for 1 character) |
205 | * 4bytes UTF-8(surrogate pair) is charlen=4 |
206 | * (4bytes UTF-16 code) |
207 | * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 |
208 | * (2 UTF-8 pairs divided to 2 UTF-16 pairs) |
209 | */ |
210 | if (charlen == 4) |
211 | i++; |
212 | else if (charlen >= 5) |
213 | /* 5-6bytes UTF-8 */ |
214 | i += 2; |
215 | } |
216 | |
217 | /* properly null-terminate string */ |
218 | for (i = 0; i < nullsize; i++) |
219 | to[outlen++] = 0; |
220 | |
221 | return outlen; |
222 | } |
223 | |
224 | /* |
225 | * smb_strtoUTF16() - Convert character string to unicode string |
226 | * @to: destination buffer |
227 | * @from: source buffer |
228 | * @len: destination buffer size (in bytes) |
229 | * @codepage: codepage to which characters should be converted |
230 | * |
231 | * Return: string length after conversion |
232 | */ |
233 | int smb_strtoUTF16(__le16 *to, const char *from, int len, |
234 | const struct nls_table *codepage) |
235 | { |
236 | int charlen; |
237 | int i; |
238 | wchar_t wchar_to; /* needed to quiet sparse */ |
239 | |
240 | /* special case for utf8 to handle no plane0 chars */ |
241 | if (!strcmp(codepage->charset, "utf8" )) { |
242 | /* |
243 | * convert utf8 -> utf16, we assume we have enough space |
244 | * as caller should have assumed conversion does not overflow |
245 | * in destination len is length in wchar_t units (16bits) |
246 | */ |
247 | i = utf8s_to_utf16s(s: from, len, endian: UTF16_LITTLE_ENDIAN, |
248 | pwcs: (wchar_t *)to, maxlen: len); |
249 | |
250 | /* if success terminate and exit */ |
251 | if (i >= 0) |
252 | goto success; |
253 | /* |
254 | * if fails fall back to UCS encoding as this |
255 | * function should not return negative values |
256 | * currently can fail only if source contains |
257 | * invalid encoded characters |
258 | */ |
259 | } |
260 | |
261 | for (i = 0; len > 0 && *from; i++, from += charlen, len -= charlen) { |
262 | charlen = codepage->char2uni(from, len, &wchar_to); |
263 | if (charlen < 1) { |
264 | /* A question mark */ |
265 | wchar_to = 0x003f; |
266 | charlen = 1; |
267 | } |
268 | put_unaligned_le16(val: wchar_to, p: &to[i]); |
269 | } |
270 | |
271 | success: |
272 | put_unaligned_le16(val: 0, p: &to[i]); |
273 | return i; |
274 | } |
275 | |
276 | /* |
277 | * smb_strndup_from_utf16() - copy a string from wire format to the local |
278 | * codepage |
279 | * @src: source string |
280 | * @maxlen: don't walk past this many bytes in the source string |
281 | * @is_unicode: is this a unicode string? |
282 | * @codepage: destination codepage |
283 | * |
284 | * Take a string given by the server, convert it to the local codepage and |
285 | * put it in a new buffer. Returns a pointer to the new string or NULL on |
286 | * error. |
287 | * |
288 | * Return: destination string buffer or error ptr |
289 | */ |
290 | char *smb_strndup_from_utf16(const char *src, const int maxlen, |
291 | const bool is_unicode, |
292 | const struct nls_table *codepage) |
293 | { |
294 | int len, ret; |
295 | char *dst; |
296 | |
297 | if (is_unicode) { |
298 | len = smb_utf16_bytes(from: (__le16 *)src, maxbytes: maxlen, codepage); |
299 | len += nls_nullsize(codepage); |
300 | dst = kmalloc(size: len, GFP_KERNEL); |
301 | if (!dst) |
302 | return ERR_PTR(error: -ENOMEM); |
303 | ret = smb_from_utf16(to: dst, from: (__le16 *)src, tolen: len, fromlen: maxlen, codepage, |
304 | mapchar: false); |
305 | if (ret < 0) { |
306 | kfree(objp: dst); |
307 | return ERR_PTR(error: -EINVAL); |
308 | } |
309 | } else { |
310 | len = strnlen(p: src, maxlen); |
311 | len++; |
312 | dst = kmalloc(size: len, GFP_KERNEL); |
313 | if (!dst) |
314 | return ERR_PTR(error: -ENOMEM); |
315 | strscpy(dst, src, len); |
316 | } |
317 | |
318 | return dst; |
319 | } |
320 | |
321 | /* |
322 | * Convert 16 bit Unicode pathname to wire format from string in current code |
323 | * page. Conversion may involve remapping up the six characters that are |
324 | * only legal in POSIX-like OS (if they are present in the string). Path |
325 | * names are little endian 16 bit Unicode on the wire |
326 | */ |
327 | /* |
328 | * smbConvertToUTF16() - convert string from local charset to utf16 |
329 | * @target: destination buffer |
330 | * @source: source buffer |
331 | * @srclen: source buffer size (in bytes) |
332 | * @cp: codepage to which characters should be converted |
333 | * @mapchar: should characters be remapped according to the mapchars option? |
334 | * |
335 | * Convert 16 bit Unicode pathname to wire format from string in current code |
336 | * page. Conversion may involve remapping up the six characters that are |
337 | * only legal in POSIX-like OS (if they are present in the string). Path |
338 | * names are little endian 16 bit Unicode on the wire |
339 | * |
340 | * Return: char length after conversion |
341 | */ |
342 | int smbConvertToUTF16(__le16 *target, const char *source, int srclen, |
343 | const struct nls_table *cp, int mapchars) |
344 | { |
345 | int i, j, charlen; |
346 | char src_char; |
347 | __le16 dst_char; |
348 | wchar_t tmp; |
349 | wchar_t wchar_to[6]; /* UTF-16 */ |
350 | int ret; |
351 | unicode_t u; |
352 | |
353 | if (!mapchars) |
354 | return smb_strtoUTF16(to: target, from: source, len: srclen, codepage: cp); |
355 | |
356 | for (i = 0, j = 0; i < srclen; j++) { |
357 | src_char = source[i]; |
358 | charlen = 1; |
359 | switch (src_char) { |
360 | case 0: |
361 | put_unaligned(0, &target[j]); |
362 | return j; |
363 | case ':': |
364 | dst_char = cpu_to_le16(UNI_COLON); |
365 | break; |
366 | case '*': |
367 | dst_char = cpu_to_le16(UNI_ASTERISK); |
368 | break; |
369 | case '?': |
370 | dst_char = cpu_to_le16(UNI_QUESTION); |
371 | break; |
372 | case '<': |
373 | dst_char = cpu_to_le16(UNI_LESSTHAN); |
374 | break; |
375 | case '>': |
376 | dst_char = cpu_to_le16(UNI_GRTRTHAN); |
377 | break; |
378 | case '|': |
379 | dst_char = cpu_to_le16(UNI_PIPE); |
380 | break; |
381 | /* |
382 | * FIXME: We can not handle remapping backslash (UNI_SLASH) |
383 | * until all the calls to build_path_from_dentry are modified, |
384 | * as they use backslash as separator. |
385 | */ |
386 | default: |
387 | charlen = cp->char2uni(source + i, srclen - i, &tmp); |
388 | dst_char = cpu_to_le16(tmp); |
389 | |
390 | /* |
391 | * if no match, use question mark, which at least in |
392 | * some cases serves as wild card |
393 | */ |
394 | if (charlen > 0) |
395 | goto ctoUTF16; |
396 | |
397 | /* convert SURROGATE_PAIR */ |
398 | if (strcmp(cp->charset, "utf8" )) |
399 | goto unknown; |
400 | if (*(source + i) & 0x80) { |
401 | charlen = utf8_to_utf32(s: source + i, len: 6, pu: &u); |
402 | if (charlen < 0) |
403 | goto unknown; |
404 | } else |
405 | goto unknown; |
406 | ret = utf8s_to_utf16s(s: source + i, len: charlen, |
407 | endian: UTF16_LITTLE_ENDIAN, |
408 | pwcs: wchar_to, maxlen: 6); |
409 | if (ret < 0) |
410 | goto unknown; |
411 | |
412 | i += charlen; |
413 | dst_char = cpu_to_le16(*wchar_to); |
414 | if (charlen <= 3) |
415 | /* 1-3bytes UTF-8 to 2bytes UTF-16 */ |
416 | put_unaligned(dst_char, &target[j]); |
417 | else if (charlen == 4) { |
418 | /* |
419 | * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 |
420 | * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 |
421 | * (charlen=3+4 or 4+4) |
422 | */ |
423 | put_unaligned(dst_char, &target[j]); |
424 | dst_char = cpu_to_le16(*(wchar_to + 1)); |
425 | j++; |
426 | put_unaligned(dst_char, &target[j]); |
427 | } else if (charlen >= 5) { |
428 | /* 5-6bytes UTF-8 to 6bytes UTF-16 */ |
429 | put_unaligned(dst_char, &target[j]); |
430 | dst_char = cpu_to_le16(*(wchar_to + 1)); |
431 | j++; |
432 | put_unaligned(dst_char, &target[j]); |
433 | dst_char = cpu_to_le16(*(wchar_to + 2)); |
434 | j++; |
435 | put_unaligned(dst_char, &target[j]); |
436 | } |
437 | continue; |
438 | |
439 | unknown: |
440 | dst_char = cpu_to_le16(0x003f); |
441 | charlen = 1; |
442 | } |
443 | |
444 | ctoUTF16: |
445 | /* |
446 | * character may take more than one byte in the source string, |
447 | * but will take exactly two bytes in the target string |
448 | */ |
449 | i += charlen; |
450 | put_unaligned(dst_char, &target[j]); |
451 | } |
452 | |
453 | return j; |
454 | } |
455 | |