utf_16.rs source code [crates/encoding_rs/src/utf_16.rs]

// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9
10	use super::*;
11	use crate::handles::*;
12	use crate::variant::*;
13
/// Streaming decoder state for UTF-16 in either byte order.
///
/// The decoder consumes input one byte at a time when it is not on its
/// fast path, so it has to remember a half-read code unit and a code unit
/// that could not be emitted yet.
pub struct Utf16Decoder {
    /// `true` when the first byte of each code unit is the high byte
    /// (big-endian); `false` when it is the low byte (little-endian).
    be: bool,
    /// First byte of a code unit whose second byte has not been seen yet.
    lead_byte: Option<u8>,
    /// A held-back code unit. When non-zero and `pending_bmp` is `false`,
    /// this is a lead surrogate waiting for its trail surrogate. When
    /// `pending_bmp` is `true`, this is a BMP code unit waiting to be
    /// written.
    lead_surrogate: u16,
    /// Marks `lead_surrogate` as holding a pending BMP code unit rather
    /// than a pending lead surrogate.
    pending_bmp: bool,
}
20
21	impl Utf16Decoder {
22	pub fn new(big_endian: bool) -> VariantDecoder {
23	VariantDecoder::Utf16(Utf16Decoder {
24	lead_surrogate: `0`,
25	lead_byte: None,
26	be: big_endian,
27	pending_bmp: `false`,
28	})
29	}
30
31	pub fn additional_from_state(&self) -> usize {
32	`1` + if self.lead_byte.is_some() { `1` } else { `0` }
33	+ if self.lead_surrogate == `0` { `0` } else { `2` }
34	}
35
36	pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
37	checked_add(
38	`1`,
39	checked_div(byte_length.checked_add(self.additional_from_state()), `2`),
40	)
41	}
42
43	pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
44	checked_add(
45	`1`,
46	checked_mul(
47	`3`,
48	checked_div(byte_length.checked_add(self.additional_from_state()), `2`),
49	),
50	)
51	}
52
53	pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
54	checked_add(
55	`1`,
56	checked_mul(
57	`3`,
58	checked_div(byte_length.checked_add(self.additional_from_state()), `2`),
59	),
60	)
61	}
62
63	decoder_functions!(
64	{
65	if self.pending_bmp {
66	match dest.check_space_bmp() {
67	Space::Full(_) => {
68	return (DecoderResult::OutputFull, `0`, `0`);
69	}
70	Space::Available(destination_handle) => {
71	destination_handle.write_bmp(self.lead_surrogate);
72	self.pending_bmp = `false`;
73	self.lead_surrogate = `0`;
74	}
75	}
76	}
77	},
78	{
79	// This is the fast path. The rest runs only at the
80	// start and end for partial sequences.
81	if self.lead_byte.is_none() && self.lead_surrogate == `0` {
82	if let Some((read, written)) = if self.be {
83	dest.copy_utf16_from::<BigEndian>(&mut source)
84	} else {
85	dest.copy_utf16_from::<LittleEndian>(&mut source)
86	} {
87	return (DecoderResult::Malformed(`2`, `0`), read, written);
88	}
89	}
90	},
91	{
92	debug_assert!(!self.pending_bmp);
93	if self.lead_surrogate != `0` \|\| self.lead_byte.is_some() {
94	// We need to check space without intent to write in order to
95	// make sure that there is space for the replacement character.
96	match dest.check_space_bmp() {
97	Space::Full(_) => {
98	return (DecoderResult::OutputFull, `0`, `0`);
99	}
100	Space::Available(_) => {
101	if self.lead_surrogate != `0` {
102	self.lead_surrogate = `0`;
103	match self.lead_byte {
104	None => {
105	return (
106	DecoderResult::Malformed(`2`, `0`),
107	src_consumed,
108	dest.written(),
109	);
110	}
111	Some(_) => {
112	self.lead_byte = None;
113	return (
114	DecoderResult::Malformed(`3`, `0`),
115	src_consumed,
116	dest.written(),
117	);
118	}
119	}
120	}
121	debug_assert!(self.lead_byte.is_some());
122	self.lead_byte = None;
123	return (DecoderResult::Malformed(`1`, `0`), src_consumed, dest.written());
124	}
125	}
126	}
127	},
128	{
129	match self.lead_byte {
130	None => {
131	self.lead_byte = Some(b);
132	continue;
133	}
134	Some(lead) => {
135	self.lead_byte = None;
136	let code_unit = if self.be {
137	u16::from(lead) << `8` \| u16::from(b)
138	} else {
139	u16::from(b) << `8` \| u16::from(lead)
140	};
141	let high_bits = code_unit & `0xFC00u16`;
142	if high_bits == `0xD800u16` {
143	// high surrogate
144	if self.lead_surrogate != `0` {
145	// The previous high surrogate was in
146	// error and this one becomes the new
147	// pending one.
148	self.lead_surrogate = code_unit as u16;
149	return (
150	DecoderResult::Malformed(`2`, `2`),
151	unread_handle.consumed(),
152	destination_handle.written(),
153	);
154	}
155	self.lead_surrogate = code_unit;
156	continue;
157	}
158	if high_bits == `0xDC00u16` {
159	// low surrogate
160	if self.lead_surrogate == `0` {
161	return (
162	DecoderResult::Malformed(`2`, `0`),
163	unread_handle.consumed(),
164	destination_handle.written(),
165	);
166	}
167	destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
168	self.lead_surrogate = `0`;
169	continue;
170	}
171	// bmp
172	if self.lead_surrogate != `0` {
173	// The previous high surrogate was in
174	// error and this code unit becomes a
175	// pending BMP character.
176	self.lead_surrogate = code_unit;
177	self.pending_bmp = `true`;
178	return (
179	DecoderResult::Malformed(`2`, `2`),
180	unread_handle.consumed(),
181	destination_handle.written(),
182	);
183	}
184	destination_handle.write_bmp(code_unit);
185	continue;
186	}
187	}
188	},
189	self,
190	src_consumed,
191	dest,
192	source,
193	b,
194	destination_handle,
195	unread_handle,
196	check_space_astral
197	);
198	}
199
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
202
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `input` as UTF-16LE and compares against `expected`.
    fn decode_utf_16le(input: &[u8], expected: &str) {
        decode_without_padding(UTF_16LE, input, expected);
    }

    /// Decodes `input` as UTF-16BE and compares against `expected`.
    fn decode_utf_16be(input: &[u8], expected: &str) {
        decode_without_padding(UTF_16BE, input, expected);
    }

    /// Encodes `input` with the UTF-16LE encoding's encoder and compares
    /// against `expected`.
    fn encode_utf_16le(input: &str, expected: &[u8]) {
        encode(UTF_16LE, input, expected);
    }

    /// Encodes `input` with the UTF-16BE encoding's encoder and compares
    /// against `expected`.
    fn encode_utf_16be(input: &str, expected: &[u8]) {
        encode(UTF_16BE, input, expected);
    }

    /// Feeds `input` to a fresh decoder for `encoding` in pieces of exactly
    /// `chunk_len` bytes (never flagging the last piece as final) and checks
    /// that each piece is consumed completely and without errors.
    fn assert_chunks_decode_cleanly(encoding: &'static Encoding, input: &[u8], chunk_len: usize) {
        let mut output = [0u16; 20];
        let mut decoder = encoding.new_decoder();
        for chunk in input.chunks(chunk_len) {
            assert_eq!(chunk.len(), chunk_len);
            let needed = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (result, read, _, had_errors) =
                decoder.decode_to_utf16(chunk, &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, chunk.len());
            assert!(!had_errors);
        }
    }

    #[test]
    fn test_utf_16_decode() {
        // Empty input
        decode_utf_16le(b"", "");
        decode_utf_16be(b"", "");

        // Plain BMP text
        decode_utf_16le(b"\x61\x00\x62\x00", "\u{0061}\u{0062}");
        decode_utf_16be(b"\x00\x61\x00\x62", "\u{0061}\u{0062}");

        // Input that starts with the other byte order's BOM and continues
        // in that byte order
        decode_utf_16le(b"\xFE\xFF\x00\x61\x00\x62", "\u{0061}\u{0062}");
        decode_utf_16be(b"\xFF\xFE\x61\x00\x62\x00", "\u{0061}\u{0062}");

        // Odd trailing byte
        decode_utf_16le(b"\x61\x00\x62", "\u{0061}\u{FFFD}");
        decode_utf_16be(b"\x00\x61\x00", "\u{0061}\u{FFFD}");

        // Lead surrogate followed by an odd trailing byte
        decode_utf_16le(b"\x3D\xD8\xA9", "\u{FFFD}");
        decode_utf_16be(b"\xD8\x3D\xDC", "\u{FFFD}");

        // Surrogate pair followed by a BMP character
        decode_utf_16le(b"\x3D\xD8\xA9\xDC\x03\x26", "\u{1F4A9}\u{2603}");
        decode_utf_16be(b"\xD8\x3D\xDC\xA9\x26\x03", "\u{1F4A9}\u{2603}");

        // Trail surrogate without a lead
        decode_utf_16le(b"\xA9\xDC\x03\x26", "\u{FFFD}\u{2603}");
        decode_utf_16be(b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}");

        // Lead surrogate without a trail
        decode_utf_16le(b"\x3D\xD8\x03\x26", "\u{FFFD}\u{2603}");
        decode_utf_16be(b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}");

        // The \xFF makes sure that the parts before and after have different alignment
        let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
        let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
        let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";
        let le_half = long_le.len() / 2;
        let be_half = long_be.len() / 2;
        decode_utf_16le(&long_le[..le_half], long_expect);
        decode_utf_16be(&long_be[..be_half], long_expect);
        decode_utf_16le(&long_le[le_half + 1..], long_expect);
        decode_utf_16be(&long_be[be_half + 1..], long_expect);
    }

    #[test]
    fn test_utf_16_encode() {
        // Empty
        encode_utf_16be("", b"");
        encode_utf_16le("", b"");

        // Encodes as UTF-8
        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
        let text = "\u{1F4A9}\u{2603}";
        encode_utf_16le("\u{1F4A9}\u{2603}", text.as_bytes());
        encode_utf_16be("\u{1F4A9}\u{2603}", text.as_bytes());
    }

    #[test]
    fn test_utf_16be_decode_one_by_one() {
        assert_chunks_decode_cleanly(UTF_16BE, b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9", 1);
    }

    #[test]
    fn test_utf_16le_decode_one_by_one() {
        assert_chunks_decode_cleanly(UTF_16LE, b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC", 1);
    }

    #[test]
    fn test_utf_16be_decode_three_at_a_time() {
        assert_chunks_decode_cleanly(
            UTF_16BE,
            b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4",
            3,
        );
    }

    #[test]
    fn test_utf_16le_decode_three_at_a_time() {
        assert_chunks_decode_cleanly(
            UTF_16LE,
            b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00",
            3,
        );
    }

    #[test]
    fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();

        // First byte alone: consumed, nothing written yet.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let first = decoder.decode_to_utf16(b"\xFF", &mut output[..needed], false);
        assert_eq!(first, (CoderResult::InputEmpty, 1, 0, false));

        // Second byte completes a code unit that is not a BOM.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let second = decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
        assert_eq!(second, (CoderResult::InputEmpty, 1, 1, false));
        assert_eq!(output[0], 0xFDFF);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();

        // First byte alone: consumed, nothing written yet.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let first = decoder.decode_to_utf16(b"\xFE", &mut output[..needed], false);
        assert_eq!(first, (CoderResult::InputEmpty, 1, 0, false));

        // Second byte completes a code unit that is not a BOM.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let second = decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
        assert_eq!(second, (CoderResult::InputEmpty, 1, 1, false));
        assert_eq!(output[0], 0xFEFD);
    }

    #[test]
    fn test_utf_16le_decode_bom_prefix() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();

        // A single byte flagged as the last input is reported as an error
        // and yields U+FFFD.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let outcome = decoder.decode_to_utf16(b"\xFF", &mut output[..needed], true);
        assert_eq!(outcome, (CoderResult::InputEmpty, 1, 1, true));
        assert_eq!(output[0], 0xFFFD);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefix() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();

        // A single byte flagged as the last input is reported as an error
        // and yields U+FFFD.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let outcome = decoder.decode_to_utf16(b"\xFE", &mut output[..needed], true);
        assert_eq!(outcome, (CoderResult::InputEmpty, 1, 1, true));
        assert_eq!(output[0], 0xFFFD);
    }

    #[test]
    fn test_utf_16le_decode_near_end() {
        let mut output = [0u8; 4];
        let mut decoder = UTF_16LE.new_decoder();

        // Half a code unit: consumed, nothing written.
        let first = decoder.decode_to_utf8(&[0x03], &mut output[..], false);
        assert_eq!(first, (CoderResult::InputEmpty, 1, 0, false));
        assert_eq!(output[0], 0x0);

        // One byte is read to complete U+2603 (three UTF-8 bytes written);
        // the call then reports a full output buffer with one byte left.
        let second = decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut output[..], false);
        assert_eq!(second, (CoderResult::OutputFull, 1, 3, false));
        assert_eq!(output, [0xE2, 0x98, 0x83, 0x00]);
    }

    #[test]
    fn test_utf_16be_decode_near_end() {
        let mut output = [0u8; 4];
        let mut decoder = UTF_16BE.new_decoder();

        // Half a code unit: consumed, nothing written.
        let first = decoder.decode_to_utf8(&[0x26], &mut output[..], false);
        assert_eq!(first, (CoderResult::InputEmpty, 1, 0, false));
        assert_eq!(output[0], 0x0);

        // One byte is read to complete U+2603 (three UTF-8 bytes written);
        // the call then reports a full output buffer with one byte left.
        let second = decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut output[..], false);
        assert_eq!(second, (CoderResult::OutputFull, 1, 3, false));
        assert_eq!(output, [0xE2, 0x98, 0x83, 0x00]);
    }
}
473