1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::handles::*;
12use crate::variant::*;
13
/// Streaming state of a UTF-16 decoder.
///
/// The decoder consumes input one byte at a time outside its fast path, so
/// it has to remember a half-read code unit and an unpaired lead surrogate
/// between bytes and between calls.
pub struct Utf16Decoder {
    /// Dual-purpose slot for one buffered code unit. While `pending_bmp` is
    /// `false`, a non-zero value is a lead surrogate waiting for its trail
    /// surrogate and zero means nothing is buffered. While `pending_bmp` is
    /// `true`, the value is a BMP code unit that still has to be written to
    /// the output.
    lead_surrogate: u16,
    /// First byte of a two-byte code unit whose second byte has not been
    /// seen yet; `None` when the decoder sits on a code unit boundary.
    lead_byte: Option<u8>,
    /// Byte order: `true` combines byte pairs big-endian, `false`
    /// little-endian.
    be: bool,
    /// When `true`, `lead_surrogate` holds a pending BMP code unit rather
    /// than a lead surrogate.
    pending_bmp: bool,
}
20
21impl Utf16Decoder {
22 pub fn new(big_endian: bool) -> VariantDecoder {
23 VariantDecoder::Utf16(Utf16Decoder {
24 lead_surrogate: 0,
25 lead_byte: None,
26 be: big_endian,
27 pending_bmp: false,
28 })
29 }
30
31 pub fn additional_from_state(&self) -> usize {
32 1 + if self.lead_byte.is_some() { 1 } else { 0 }
33 + if self.lead_surrogate == 0 { 0 } else { 2 }
34 }
35
36 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
37 checked_add(
38 1,
39 checked_div(byte_length.checked_add(self.additional_from_state()), 2),
40 )
41 }
42
43 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
44 checked_add(
45 1,
46 checked_mul(
47 3,
48 checked_div(byte_length.checked_add(self.additional_from_state()), 2),
49 ),
50 )
51 }
52
53 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
54 checked_add(
55 1,
56 checked_mul(
57 3,
58 checked_div(byte_length.checked_add(self.additional_from_state()), 2),
59 ),
60 )
61 }
62
63 decoder_functions!(
64 {
65 if self.pending_bmp {
66 match dest.check_space_bmp() {
67 Space::Full(_) => {
68 return (DecoderResult::OutputFull, 0, 0);
69 }
70 Space::Available(destination_handle) => {
71 destination_handle.write_bmp(self.lead_surrogate);
72 self.pending_bmp = false;
73 self.lead_surrogate = 0;
74 }
75 }
76 }
77 },
78 {
79 // This is the fast path. The rest runs only at the
80 // start and end for partial sequences.
81 if self.lead_byte.is_none() && self.lead_surrogate == 0 {
82 if let Some((read, written)) = if self.be {
83 dest.copy_utf16_from::<BigEndian>(&mut source)
84 } else {
85 dest.copy_utf16_from::<LittleEndian>(&mut source)
86 } {
87 return (DecoderResult::Malformed(2, 0), read, written);
88 }
89 }
90 },
91 {
92 debug_assert!(!self.pending_bmp);
93 if self.lead_surrogate != 0 || self.lead_byte.is_some() {
94 // We need to check space without intent to write in order to
95 // make sure that there is space for the replacement character.
96 match dest.check_space_bmp() {
97 Space::Full(_) => {
98 return (DecoderResult::OutputFull, 0, 0);
99 }
100 Space::Available(_) => {
101 if self.lead_surrogate != 0 {
102 self.lead_surrogate = 0;
103 match self.lead_byte {
104 None => {
105 return (
106 DecoderResult::Malformed(2, 0),
107 src_consumed,
108 dest.written(),
109 );
110 }
111 Some(_) => {
112 self.lead_byte = None;
113 return (
114 DecoderResult::Malformed(3, 0),
115 src_consumed,
116 dest.written(),
117 );
118 }
119 }
120 }
121 debug_assert!(self.lead_byte.is_some());
122 self.lead_byte = None;
123 return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
124 }
125 }
126 }
127 },
128 {
129 match self.lead_byte {
130 None => {
131 self.lead_byte = Some(b);
132 continue;
133 }
134 Some(lead) => {
135 self.lead_byte = None;
136 let code_unit = if self.be {
137 u16::from(lead) << 8 | u16::from(b)
138 } else {
139 u16::from(b) << 8 | u16::from(lead)
140 };
141 let high_bits = code_unit & 0xFC00u16;
142 if high_bits == 0xD800u16 {
143 // high surrogate
144 if self.lead_surrogate != 0 {
145 // The previous high surrogate was in
146 // error and this one becomes the new
147 // pending one.
148 self.lead_surrogate = code_unit as u16;
149 return (
150 DecoderResult::Malformed(2, 2),
151 unread_handle.consumed(),
152 destination_handle.written(),
153 );
154 }
155 self.lead_surrogate = code_unit;
156 continue;
157 }
158 if high_bits == 0xDC00u16 {
159 // low surrogate
160 if self.lead_surrogate == 0 {
161 return (
162 DecoderResult::Malformed(2, 0),
163 unread_handle.consumed(),
164 destination_handle.written(),
165 );
166 }
167 destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit);
168 self.lead_surrogate = 0;
169 continue;
170 }
171 // bmp
172 if self.lead_surrogate != 0 {
173 // The previous high surrogate was in
174 // error and this code unit becomes a
175 // pending BMP character.
176 self.lead_surrogate = code_unit;
177 self.pending_bmp = true;
178 return (
179 DecoderResult::Malformed(2, 2),
180 unread_handle.consumed(),
181 destination_handle.written(),
182 );
183 }
184 destination_handle.write_bmp(code_unit);
185 continue;
186 }
187 }
188 },
189 self,
190 src_consumed,
191 dest,
192 source,
193 b,
194 destination_handle,
195 unread_handle,
196 check_space_astral
197 );
198}
199
200// Any copyright to the test code below this comment is dedicated to the
201// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
202
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Forwards to `decode_without_padding` with the `UTF_16LE` encoding.
    fn decode_utf_16le(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16LE, bytes, expect);
    }

    /// Forwards to `decode_without_padding` with the `UTF_16BE` encoding.
    fn decode_utf_16be(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16BE, bytes, expect);
    }

    /// Forwards to `encode` with the `UTF_16LE` encoding.
    fn encode_utf_16le(string: &str, expect: &[u8]) {
        encode(UTF_16LE, string, expect);
    }

    /// Forwards to `encode` with the `UTF_16BE` encoding.
    fn encode_utf_16be(string: &str, expect: &[u8]) {
        encode(UTF_16BE, string, expect);
    }

    #[test]
    fn test_utf_16_decode() {
        // Each row: little-endian input, big-endian input, expected text.
        let cases: [(&[u8], &[u8], &str); 8] = [
            // Empty input
            (b"", b"", ""),
            // Plain BMP text
            (b"\x61\x00\x62\x00", b"\x00\x61\x00\x62", "\u{0061}\u{0062}"),
            // Input that starts with the other byte order's BOM
            (
                b"\xFE\xFF\x00\x61\x00\x62",
                b"\xFF\xFE\x61\x00\x62\x00",
                "\u{0061}\u{0062}",
            ),
            // Odd trailing byte
            (b"\x61\x00\x62", b"\x00\x61\x00", "\u{0061}\u{FFFD}"),
            // Lead surrogate followed by a single trailing byte
            (b"\x3D\xD8\xA9", b"\xD8\x3D\xDC", "\u{FFFD}"),
            // Surrogate pair followed by a BMP character
            (
                b"\x3D\xD8\xA9\xDC\x03\x26",
                b"\xD8\x3D\xDC\xA9\x26\x03",
                "\u{1F4A9}\u{2603}",
            ),
            // Trail surrogate without a lead surrogate
            (b"\xA9\xDC\x03\x26", b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}"),
            // Lead surrogate without a trail surrogate
            (b"\x3D\xD8\x03\x26", b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}"),
        ];
        for &(le, be, expect) in cases.iter() {
            decode_utf_16le(le, expect);
            decode_utf_16be(be, expect);
        }

        // The \xFF makes sure that the parts before and after have different alignment
        let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
        let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
        let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";

        // Decode the part before the \xFF and the part after it separately.
        let le_mid = long_le.len() / 2;
        let be_mid = long_be.len() / 2;
        decode_utf_16le(&long_le[..le_mid], long_expect);
        decode_utf_16be(&long_be[..be_mid], long_expect);
        decode_utf_16le(&long_le[le_mid + 1..], long_expect);
        decode_utf_16be(&long_be[be_mid + 1..], long_expect);
    }

    #[test]
    fn test_utf_16_encode() {
        // Empty
        encode_utf_16be("", b"");
        encode_utf_16le("", b"");

        // Encodes as UTF-8
        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
        let text = "\u{1F4A9}\u{2603}";
        encode_utf_16le(text, text.as_bytes());
        encode_utf_16be(text, text.as_bytes());
    }

    #[test]
    fn test_utf_16be_decode_one_by_one() {
        let input = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9";
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();
        // Feed one byte per call; every call must consume it without error.
        for chunk in input.chunks(1) {
            assert_eq!(chunk.len(), 1);
            let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, errors) =
                decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, 1);
            assert!(!errors);
        }
    }

    #[test]
    fn test_utf_16le_decode_one_by_one() {
        let input = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC";
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();
        // Feed one byte per call; every call must consume it without error.
        for chunk in input.chunks(1) {
            assert_eq!(chunk.len(), 1);
            let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, errors) =
                decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, 1);
            assert!(!errors);
        }
    }

    #[test]
    fn test_utf_16be_decode_three_at_a_time() {
        let input = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4";
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();
        // Three-byte chunks split code units across calls.
        for chunk in input.chunks(3) {
            assert_eq!(chunk.len(), 3);
            let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, errors) =
                decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, chunk.len());
            assert!(!errors);
        }
    }

    #[test]
    fn test_utf_16le_decode_three_at_a_time() {
        let input = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00";
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();
        // Three-byte chunks split code units across calls.
        for chunk in input.chunks(3) {
            assert_eq!(chunk.len(), 3);
            let capacity = decoder.max_utf16_buffer_length(chunk.len()).unwrap();
            let (status, consumed, _, errors) =
                decoder.decode_to_utf16(chunk, &mut output[..capacity], false);
            assert_eq!(status, CoderResult::InputEmpty);
            assert_eq!(consumed, chunk.len());
            assert!(!errors);
        }
    }

    #[test]
    fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();

        // First call: a lone 0xFF is consumed but produces no output yet.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf16(b"\xFF", &mut output[..capacity], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!errors);

        // Second (last) call: 0xFD completes the code unit 0xFDFF.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf16(b"\xFD", &mut output[..capacity], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(!errors);
        assert_eq!(output[0], 0xFDFF);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();

        // First call: a lone 0xFE is consumed but produces no output yet.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf16(b"\xFE", &mut output[..capacity], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!errors);

        // Second (last) call: 0xFD completes the code unit 0xFEFD.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf16(b"\xFD", &mut output[..capacity], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(!errors);
        assert_eq!(output[0], 0xFEFD);
    }

    #[test]
    fn test_utf_16le_decode_bom_prefix() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();

        // A lone 0xFF as the entire (last) input is an error and yields U+FFFD.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf16(b"\xFF", &mut output[..capacity], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(errors);
        assert_eq!(output[0], 0xFFFD);
    }

    #[test]
    fn test_utf_16be_decode_bom_prefix() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();

        // A lone 0xFE as the entire (last) input is an error and yields U+FFFD.
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf16(b"\xFE", &mut output[..capacity], true);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 1);
        assert!(errors);
        assert_eq!(output[0], 0xFFFD);
    }

    #[test]
    fn test_utf_16le_decode_near_end() {
        let mut output = [0u8; 4];
        let mut decoder = UTF_16LE.new_decoder();

        // A single byte is buffered; nothing is written.
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf8(b"\x03", &mut output[..], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!errors);
        assert_eq!(output[0], 0x0);

        // Only one of the three bytes is read: it completes the code unit,
        // three UTF-8 bytes are written, and the decoder reports OutputFull
        // with the fourth output byte untouched.
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf8(b"\x26\x03\x26", &mut output[..], false);
        assert_eq!(status, CoderResult::OutputFull);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 3);
        assert!(!errors);
        assert_eq!(output, [0xE2, 0x98, 0x83, 0x00]);
    }

    #[test]
    fn test_utf_16be_decode_near_end() {
        let mut output = [0u8; 4];
        let mut decoder = UTF_16BE.new_decoder();

        // A single byte is buffered; nothing is written.
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf8(b"\x26", &mut output[..], false);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 0);
        assert!(!errors);
        assert_eq!(output[0], 0x0);

        // Only one of the three bytes is read: it completes the code unit,
        // three UTF-8 bytes are written, and the decoder reports OutputFull
        // with the fourth output byte untouched.
        let (status, consumed, produced, errors) =
            decoder.decode_to_utf8(b"\x03\x26\x03", &mut output[..], false);
        assert_eq!(status, CoderResult::OutputFull);
        assert_eq!(consumed, 1);
        assert_eq!(produced, 3);
        assert!(!errors);
        assert_eq!(output, [0xE2, 0x98, 0x83, 0x00]);
    }
}
473