unicode.c source code [linux/fs/smb/server/unicode.c]

1	// SPDX-License-Identifier: GPL-2.0-or-later
2	/*
3	* Some of the source code in this file came from fs/cifs/cifs_unicode.c
4	*
5	* Copyright (c) International Business Machines Corp., 2000,2009
6	* Modified by Steve French (sfrench@us.ibm.com)
7	* Modified by Namjae Jeon (linkinjeon@kernel.org)
8	*/
9	#include <linux/fs.h>
10	#include <linux/slab.h>
11	#include <asm/unaligned.h>
12	#include "glob.h"
13	#include "unicode.h"
14	#include "smb_common.h"
15
16	/*
17	* cifs_mapchar() - convert a host-endian char to proper char in codepage
18	* @target: where converted character should be copied
19	* @from: host-endian source string
20	* @cp: codepage to which character should be converted
21	* @mapchar: should character be mapped according to mapchars mount option?
22	*
23	* This function handles the conversion of a single character. It is the
24	* responsibility of the caller to ensure that the target buffer is large
25	* enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE).
26	*
27	* Return: string length after conversion
28	*/
29	static int
30	cifs_mapchar(char target, const* __u16 from, const* struct nls_table *cp,
31	bool mapchar)
32	{
33	int len = `1`;
34	__u16 src_char;
35
36	src_char = *from;
37
38	if (!mapchar)
39	goto cp_convert;
40
41	/*
42	* BB: Cannot handle remapping UNI_SLASH until all the calls to
43	* build_path_from_dentry are modified, as they use slash as
44	* separator.
45	*/
46	switch (src_char) {
47	case UNI_COLON:
48	*target = `':'`;
49	break;
50	case UNI_ASTERISK:
51	target = `''`;
52	break;
53	case UNI_QUESTION:
54	*target = `'?'`;
55	break;
56	case UNI_PIPE:
57	*target = `'\|'`;
58	break;
59	case UNI_GRTRTHAN:
60	*target = `'>'`;
61	break;
62	case UNI_LESSTHAN:
63	*target = `'<'`;
64	break;
65	default:
66	goto cp_convert;
67	}
68
69	out:
70	return len;
71
72	cp_convert:
73	len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE);
74	if (len <= `0`)
75	goto surrogate_pair;
76
77	goto out;
78
79	surrogate_pair:
80	/ convert SURROGATE_PAIR and IVS /
81	if (strcmp(cp->charset, "utf8"))
82	goto unknown;
83	len = utf16s_to_utf8s(pwcs: from, len: `3`, endian: UTF16_LITTLE_ENDIAN, s: target, maxlen: `6`);
84	if (len <= `0`)
85	goto unknown;
86	return len;
87
88	unknown:
89	*target = `'?'`;
90	len = `1`;
91	goto out;
92	}
93
94	/*
95	* smb_utf16_bytes() - compute converted string length
96	* @from: pointer to input string
97	* @maxbytes: input string length
98	* @codepage: destination codepage
99	*
100	* Walk a utf16le string and return the number of bytes that the string will
101	* be after being converted to the given charset, not including any null
102	* termination required. Don't walk past maxbytes in the source buffer.
103	*
104	* Return: string length after conversion
105	*/
106	static int smb_utf16_bytes(const __le16 from, int* maxbytes,
107	const struct nls_table *codepage)
108	{
109	int i, j;
110	int charlen, outlen = `0`;
111	int maxwords = maxbytes / `2`;
112	char tmp[NLS_MAX_CHARSET_SIZE];
113	__u16 ftmp[`3`];
114
115	for (i = `0`; i < maxwords; i++) {
116	ftmp[`0`] = get_unaligned_le16(p: &from[i]);
117	if (ftmp[`0`] == `0`)
118	break;
119	for (j = `1`; j <= `2`; j++) {
120	if (i + j < maxwords)
121	ftmp[j] = get_unaligned_le16(p: &from[i + j]);
122	else
123	ftmp[j] = `0`;
124	}
125
126	charlen = cifs_mapchar(target: tmp, from: ftmp, cp: codepage, mapchar: `0`);
127	if (charlen > `0`)
128	outlen += charlen;
129	else
130	outlen++;
131	}
132
133	return outlen;
134	}
135
136	/*
137	* smb_from_utf16() - convert utf16le string to local charset
138	* @to: destination buffer
139	* @from: source buffer
140	* @tolen: destination buffer size (in bytes)
141	* @fromlen: source buffer size (in bytes)
142	* @codepage: codepage to which characters should be converted
143	* @mapchar: should characters be remapped according to the mapchars option?
144	*
145	* Convert a little-endian utf16le string (as sent by the server) to a string
146	* in the provided codepage. The tolen and fromlen parameters are to ensure
147	* that the code doesn't walk off of the end of the buffer (which is always
148	* a danger if the alignment of the source buffer is off). The destination
149	* string is always properly null terminated and fits in the destination
150	* buffer. Returns the length of the destination string in bytes (including
151	* null terminator).
152	*
153	* Note that some windows versions actually send multiword UTF-16 characters
154	* instead of straight UTF16-2. The linux nls routines however aren't able to
155	* deal with those characters properly. In the event that we get some of
156	* those characters, they won't be translated properly.
157	*
158	* Return: string length after conversion
159	*/
160	static int smb_from_utf16(char to, const* __le16 from, int* tolen, int fromlen,
161	const struct nls_table *codepage, bool mapchar)
162	{
163	int i, j, charlen, safelen;
164	int outlen = `0`;
165	int nullsize = nls_nullsize(codepage);
166	int fromwords = fromlen / `2`;
167	char tmp[NLS_MAX_CHARSET_SIZE];
168	__u16 ftmp[`3`]; / ftmp[3] = 3array x 2bytes = 6bytes UTF-16 /
169
170	/*
171	* because the chars can be of varying widths, we need to take care
172	* not to overflow the destination buffer when we get close to the
173	* end of it. Until we get to this offset, we don't need to check
174	* for overflow however.
175	*/
176	safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize);
177
178	for (i = `0`; i < fromwords; i++) {
179	ftmp[`0`] = get_unaligned_le16(p: &from[i]);
180	if (ftmp[`0`] == `0`)
181	break;
182	for (j = `1`; j <= `2`; j++) {
183	if (i + j < fromwords)
184	ftmp[j] = get_unaligned_le16(p: &from[i + j]);
185	else
186	ftmp[j] = `0`;
187	}
188
189	/*
190	* check to see if converting this character might make the
191	* conversion bleed into the null terminator
192	*/
193	if (outlen >= safelen) {
194	charlen = cifs_mapchar(target: tmp, from: ftmp, cp: codepage, mapchar);
195	if ((outlen + charlen) > (tolen - nullsize))
196	break;
197	}
198
199	/ put converted char into 'to' buffer /
200	charlen = cifs_mapchar(target: &to[outlen], from: ftmp, cp: codepage, mapchar);
201	outlen += charlen;
202
203	/*
204	* charlen (=bytes of UTF-8 for 1 character)
205	* 4bytes UTF-8(surrogate pair) is charlen=4
206	* (4bytes UTF-16 code)
207	* 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4
208	* (2 UTF-8 pairs divided to 2 UTF-16 pairs)
209	*/
210	if (charlen == `4`)
211	i++;
212	else if (charlen >= `5`)
213	/ 5-6bytes UTF-8 /
214	i += `2`;
215	}
216
217	/ properly null-terminate string /
218	for (i = `0`; i < nullsize; i++)
219	to[outlen++] = `0`;
220
221	return outlen;
222	}
223
224	/*
225	* smb_strtoUTF16() - Convert character string to unicode string
226	* @to: destination buffer
227	* @from: source buffer
228	* @len: destination buffer size (in bytes)
229	* @codepage: codepage to which characters should be converted
230	*
231	* Return: string length after conversion
232	*/
233	int smb_strtoUTF16(__le16 to, const* char from, int* len,
234	const struct nls_table *codepage)
235	{
236	int charlen;
237	int i;
238	wchar_t wchar_to; / needed to quiet sparse /
239
240	/ special case for utf8 to handle no plane0 chars /
241	if (!strcmp(codepage->charset, "utf8")) {
242	/*
243	* convert utf8 -> utf16, we assume we have enough space
244	* as caller should have assumed conversion does not overflow
245	* in destination len is length in wchar_t units (16bits)
246	*/
247	i = utf8s_to_utf16s(s: from, len, endian: UTF16_LITTLE_ENDIAN,
248	pwcs: (wchar_t *)to, maxlen: len);
249
250	/ if success terminate and exit /
251	if (i >= `0`)
252	goto success;
253	/*
254	* if fails fall back to UCS encoding as this
255	* function should not return negative values
256	* currently can fail only if source contains
257	* invalid encoded characters
258	*/
259	}
260
261	for (i = `0`; len > `0` && *from; i++, from += charlen, len -= charlen) {
262	charlen = codepage->char2uni(from, len, &wchar_to);
263	if (charlen < `1`) {
264	/ A question mark /
265	wchar_to = `0x003f`;
266	charlen = `1`;
267	}
268	put_unaligned_le16(val: wchar_to, p: &to[i]);
269	}
270
271	success:
272	put_unaligned_le16(val: `0`, p: &to[i]);
273	return i;
274	}
275
276	/*
277	* smb_strndup_from_utf16() - copy a string from wire format to the local
278	* codepage
279	* @src: source string
280	* @maxlen: don't walk past this many bytes in the source string
281	* @is_unicode: is this a unicode string?
282	* @codepage: destination codepage
283	*
284	* Take a string given by the server, convert it to the local codepage and
285	* put it in a new buffer. Returns a pointer to the new string or NULL on
286	* error.
287	*
288	* Return: destination string buffer or error ptr
289	*/
290	char smb_strndup_from_utf16(const* char src, const* int maxlen,
291	const bool is_unicode,
292	const struct nls_table *codepage)
293	{
294	int len, ret;
295	char *dst;
296
297	if (is_unicode) {
298	len = smb_utf16_bytes(from: (__le16 *)src, maxbytes: maxlen, codepage);
299	len += nls_nullsize(codepage);
300	dst = kmalloc(size: len, GFP_KERNEL);
301	if (!dst)
302	return ERR_PTR(error: -ENOMEM);
303	ret = smb_from_utf16(to: dst, from: (__le16 *)src, tolen: len, fromlen: maxlen, codepage,
304	mapchar: false);
305	if (ret < `0`) {
306	kfree(objp: dst);
307	return ERR_PTR(error: -EINVAL);
308	}
309	} else {
310	len = strnlen(p: src, maxlen);
311	len++;
312	dst = kmalloc(size: len, GFP_KERNEL);
313	if (!dst)
314	return ERR_PTR(error: -ENOMEM);
315	strscpy(dst, src, len);
316	}
317
318	return dst;
319	}
320
/*
 * Convert a pathname from a string in the current code page to 16 bit
 * Unicode wire format. Conversion may involve remapping the six characters
 * that are only legal in POSIX-like OS (if they are present in the string).
 * Path names are little endian 16 bit Unicode on the wire.
 */
327	/*
328	* smbConvertToUTF16() - convert string from local charset to utf16
329	* @target: destination buffer
330	* @source: source buffer
331	* @srclen: source buffer size (in bytes)
332	* @cp: codepage to which characters should be converted
333	* @mapchar: should characters be remapped according to the mapchars option?
334	*
335	* Convert 16 bit Unicode pathname to wire format from string in current code
336	* page. Conversion may involve remapping up the six characters that are
337	* only legal in POSIX-like OS (if they are present in the string). Path
338	* names are little endian 16 bit Unicode on the wire
339	*
340	* Return: char length after conversion
341	*/
342	int smbConvertToUTF16(__le16 target, const* char source, int* srclen,
343	const struct nls_table cp, int* mapchars)
344	{
345	int i, j, charlen;
346	char src_char;
347	__le16 dst_char;
348	wchar_t tmp;
349	wchar_t wchar_to[`6`]; / UTF-16 /
350	int ret;
351	unicode_t u;
352
353	if (!mapchars)
354	return smb_strtoUTF16(to: target, from: source, len: srclen, codepage: cp);
355
356	for (i = `0`, j = `0`; i < srclen; j++) {
357	src_char = source[i];
358	charlen = `1`;
359	switch (src_char) {
360	case `0`:
361	put_unaligned(`0`, &target[j]);
362	return j;
363	case `':'`:
364	dst_char = cpu_to_le16(UNI_COLON);
365	break;
366	case `'*'`:
367	dst_char = cpu_to_le16(UNI_ASTERISK);
368	break;
369	case `'?'`:
370	dst_char = cpu_to_le16(UNI_QUESTION);
371	break;
372	case `'<'`:
373	dst_char = cpu_to_le16(UNI_LESSTHAN);
374	break;
375	case `'>'`:
376	dst_char = cpu_to_le16(UNI_GRTRTHAN);
377	break;
378	case `'\|'`:
379	dst_char = cpu_to_le16(UNI_PIPE);
380	break;
381	/*
382	* FIXME: We can not handle remapping backslash (UNI_SLASH)
383	* until all the calls to build_path_from_dentry are modified,
384	* as they use backslash as separator.
385	*/
386	default:
387	charlen = cp->char2uni(source + i, srclen - i, &tmp);
388	dst_char = cpu_to_le16(tmp);
389
390	/*
391	* if no match, use question mark, which at least in
392	* some cases serves as wild card
393	*/
394	if (charlen > `0`)
395	goto ctoUTF16;
396
397	/ convert SURROGATE_PAIR /
398	if (strcmp(cp->charset, "utf8"))
399	goto unknown;
400	if (*(source + i) & `0x80`) {
401	charlen = utf8_to_utf32(s: source + i, len: `6`, pu: &u);
402	if (charlen < `0`)
403	goto unknown;
404	} else
405	goto unknown;
406	ret = utf8s_to_utf16s(s: source + i, len: charlen,
407	endian: UTF16_LITTLE_ENDIAN,
408	pwcs: wchar_to, maxlen: `6`);
409	if (ret < `0`)
410	goto unknown;
411
412	i += charlen;
413	dst_char = cpu_to_le16(*wchar_to);
414	if (charlen <= `3`)
415	/ 1-3bytes UTF-8 to 2bytes UTF-16 /
416	put_unaligned(dst_char, &target[j]);
417	else if (charlen == `4`) {
418	/*
419	* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16
420	* 7-8bytes UTF-8(IVS) divided to 2 UTF-16
421	* (charlen=3+4 or 4+4)
422	*/
423	put_unaligned(dst_char, &target[j]);
424	dst_char = cpu_to_le16(*(wchar_to + `1`));
425	j++;
426	put_unaligned(dst_char, &target[j]);
427	} else if (charlen >= `5`) {
428	/ 5-6bytes UTF-8 to 6bytes UTF-16 /
429	put_unaligned(dst_char, &target[j]);
430	dst_char = cpu_to_le16(*(wchar_to + `1`));
431	j++;
432	put_unaligned(dst_char, &target[j]);
433	dst_char = cpu_to_le16(*(wchar_to + `2`));
434	j++;
435	put_unaligned(dst_char, &target[j]);
436	}
437	continue;
438
439	unknown:
440	dst_char = cpu_to_le16(`0x003f`);
441	charlen = `1`;
442	}
443
444	ctoUTF16:
445	/*
446	* character may take more than one byte in the source string,
447	* but will take exactly two bytes in the target string
448	*/
449	i += charlen;
450	put_unaligned(dst_char, &target[j]);
451	}
452
453	return j;
454	}
455

source code of linux/fs/smb/server/unicode.c