1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use super::*; |
11 | use crate::handles::*; |
12 | use crate::variant::*; |
13 | |
/// Streaming decoder state for UTF-16 input in either byte order.
///
/// Input arrives as bytes, so the decoder may be holding half of a code unit
/// (`lead_byte`) and/or a whole code unit (`lead_surrogate`) between calls.
pub struct Utf16Decoder {
    // --- configuration ---
    /// Byte order: when `true` the first byte of each pair is the high-order
    /// byte (big-endian); when `false` it is the low-order byte.
    be: bool,

    // --- carried-over state ---
    /// First byte of a code unit whose second byte has not been seen yet.
    lead_byte: Option<u8>,
    /// Zero when no code unit is pending. When non-zero and `pending_bmp` is
    /// `false`, this is a lead (high) surrogate waiting for its trail
    /// surrogate; when `pending_bmp` is `true`, it is a BMP code unit that
    /// still has to be written to the output.
    lead_surrogate: u16,
    /// If `true`, `lead_surrogate` actually holds a pending BMP code unit
    /// rather than a surrogate.
    pending_bmp: bool,
}
20 | |
21 | impl Utf16Decoder { |
22 | pub fn new(big_endian: bool) -> VariantDecoder { |
23 | VariantDecoder::Utf16(Utf16Decoder { |
24 | lead_surrogate: 0, |
25 | lead_byte: None, |
26 | be: big_endian, |
27 | pending_bmp: false, |
28 | }) |
29 | } |
30 | |
31 | pub fn additional_from_state(&self) -> usize { |
32 | 1 + if self.lead_byte.is_some() { 1 } else { 0 } |
33 | + if self.lead_surrogate == 0 { 0 } else { 2 } |
34 | } |
35 | |
36 | pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
37 | checked_add( |
38 | 1, |
39 | checked_div(byte_length.checked_add(self.additional_from_state()), 2), |
40 | ) |
41 | } |
42 | |
43 | pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
44 | checked_add( |
45 | 1, |
46 | checked_mul( |
47 | 3, |
48 | checked_div(byte_length.checked_add(self.additional_from_state()), 2), |
49 | ), |
50 | ) |
51 | } |
52 | |
53 | pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
54 | checked_add( |
55 | 1, |
56 | checked_mul( |
57 | 3, |
58 | checked_div(byte_length.checked_add(self.additional_from_state()), 2), |
59 | ), |
60 | ) |
61 | } |
62 | |
63 | decoder_functions!( |
64 | { |
65 | if self.pending_bmp { |
66 | match dest.check_space_bmp() { |
67 | Space::Full(_) => { |
68 | return (DecoderResult::OutputFull, 0, 0); |
69 | } |
70 | Space::Available(destination_handle) => { |
71 | destination_handle.write_bmp(self.lead_surrogate); |
72 | self.pending_bmp = false; |
73 | self.lead_surrogate = 0; |
74 | } |
75 | } |
76 | } |
77 | }, |
78 | { |
79 | // This is the fast path. The rest runs only at the |
80 | // start and end for partial sequences. |
81 | if self.lead_byte.is_none() && self.lead_surrogate == 0 { |
82 | if let Some((read, written)) = if self.be { |
83 | dest.copy_utf16_from::<BigEndian>(&mut source) |
84 | } else { |
85 | dest.copy_utf16_from::<LittleEndian>(&mut source) |
86 | } { |
87 | return (DecoderResult::Malformed(2, 0), read, written); |
88 | } |
89 | } |
90 | }, |
91 | { |
92 | debug_assert!(!self.pending_bmp); |
93 | if self.lead_surrogate != 0 || self.lead_byte.is_some() { |
94 | // We need to check space without intent to write in order to |
95 | // make sure that there is space for the replacement character. |
96 | match dest.check_space_bmp() { |
97 | Space::Full(_) => { |
98 | return (DecoderResult::OutputFull, 0, 0); |
99 | } |
100 | Space::Available(_) => { |
101 | if self.lead_surrogate != 0 { |
102 | self.lead_surrogate = 0; |
103 | match self.lead_byte { |
104 | None => { |
105 | return ( |
106 | DecoderResult::Malformed(2, 0), |
107 | src_consumed, |
108 | dest.written(), |
109 | ); |
110 | } |
111 | Some(_) => { |
112 | self.lead_byte = None; |
113 | return ( |
114 | DecoderResult::Malformed(3, 0), |
115 | src_consumed, |
116 | dest.written(), |
117 | ); |
118 | } |
119 | } |
120 | } |
121 | debug_assert!(self.lead_byte.is_some()); |
122 | self.lead_byte = None; |
123 | return (DecoderResult::Malformed(1, 0), src_consumed, dest.written()); |
124 | } |
125 | } |
126 | } |
127 | }, |
128 | { |
129 | match self.lead_byte { |
130 | None => { |
131 | self.lead_byte = Some(b); |
132 | continue; |
133 | } |
134 | Some(lead) => { |
135 | self.lead_byte = None; |
136 | let code_unit = if self.be { |
137 | u16::from(lead) << 8 | u16::from(b) |
138 | } else { |
139 | u16::from(b) << 8 | u16::from(lead) |
140 | }; |
141 | let high_bits = code_unit & 0xFC00u16; |
142 | if high_bits == 0xD800u16 { |
143 | // high surrogate |
144 | if self.lead_surrogate != 0 { |
145 | // The previous high surrogate was in |
146 | // error and this one becomes the new |
147 | // pending one. |
148 | self.lead_surrogate = code_unit as u16; |
149 | return ( |
150 | DecoderResult::Malformed(2, 2), |
151 | unread_handle.consumed(), |
152 | destination_handle.written(), |
153 | ); |
154 | } |
155 | self.lead_surrogate = code_unit; |
156 | continue; |
157 | } |
158 | if high_bits == 0xDC00u16 { |
159 | // low surrogate |
160 | if self.lead_surrogate == 0 { |
161 | return ( |
162 | DecoderResult::Malformed(2, 0), |
163 | unread_handle.consumed(), |
164 | destination_handle.written(), |
165 | ); |
166 | } |
167 | destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit); |
168 | self.lead_surrogate = 0; |
169 | continue; |
170 | } |
171 | // bmp |
172 | if self.lead_surrogate != 0 { |
173 | // The previous high surrogate was in |
174 | // error and this code unit becomes a |
175 | // pending BMP character. |
176 | self.lead_surrogate = code_unit; |
177 | self.pending_bmp = true; |
178 | return ( |
179 | DecoderResult::Malformed(2, 2), |
180 | unread_handle.consumed(), |
181 | destination_handle.written(), |
182 | ); |
183 | } |
184 | destination_handle.write_bmp(code_unit); |
185 | continue; |
186 | } |
187 | } |
188 | }, |
189 | self, |
190 | src_consumed, |
191 | dest, |
192 | source, |
193 | b, |
194 | destination_handle, |
195 | unread_handle, |
196 | check_space_astral |
197 | ); |
198 | } |
199 | |
200 | // Any copyright to the test code below this comment is dedicated to the |
201 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
202 | |
/// Unit tests for the UTF-16 decoder (and the UTF-16 "encoder", which the
/// tests below show to produce UTF-8).
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as UTF-16LE via the shared test helper and compares
    /// the result with `expect`.
    fn decode_utf_16le(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16LE, bytes, expect);
    }

    /// Decodes `bytes` as UTF-16BE via the shared test helper and compares
    /// the result with `expect`.
    fn decode_utf_16be(bytes: &[u8], expect: &str) {
        decode_without_padding(UTF_16BE, bytes, expect);
    }

    /// Encodes `string` with the UTF-16LE encoding's encoder and compares the
    /// output with `expect`.
    fn encode_utf_16le(string: &str, expect: &[u8]) {
        encode(UTF_16LE, string, expect);
    }

    /// Encodes `string` with the UTF-16BE encoding's encoder and compares the
    /// output with `expect`.
    fn encode_utf_16be(string: &str, expect: &[u8]) {
        encode(UTF_16BE, string, expect);
    }

    #[test]
    fn test_utf_16_decode() {
        // Empty input
        decode_utf_16le(b"", "");
        decode_utf_16be(b"", "");

        // Plain BMP text
        decode_utf_16le(b"\x61\x00\x62\x00", "\u{0061}\u{0062}");
        decode_utf_16be(b"\x00\x61\x00\x62", "\u{0061}\u{0062}");

        // Input that starts with the other byte order's BOM.
        // NOTE(review): presumably the helper honors the BOM and switches
        // byte order -- confirm against `decode_without_padding`.
        decode_utf_16le(b"\xFE\xFF\x00\x61\x00\x62", "\u{0061}\u{0062}");
        decode_utf_16be(b"\xFF\xFE\x61\x00\x62\x00", "\u{0061}\u{0062}");

        // Dangling single byte at the end
        decode_utf_16le(b"\x61\x00\x62", "\u{0061}\u{FFFD}");
        decode_utf_16be(b"\x00\x61\x00", "\u{0061}\u{FFFD}");

        // Lead surrogate followed by a dangling byte
        decode_utf_16le(b"\x3D\xD8\xA9", "\u{FFFD}");
        decode_utf_16be(b"\xD8\x3D\xDC", "\u{FFFD}");

        // Valid surrogate pair followed by a BMP character
        decode_utf_16le(b"\x3D\xD8\xA9\xDC\x03\x26", "\u{1F4A9}\u{2603}");
        decode_utf_16be(b"\xD8\x3D\xDC\xA9\x26\x03", "\u{1F4A9}\u{2603}");

        // Unpaired trail surrogate
        decode_utf_16le(b"\xA9\xDC\x03\x26", "\u{FFFD}\u{2603}");
        decode_utf_16be(b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}");

        // Unpaired lead surrogate
        decode_utf_16le(b"\x3D\xD8\x03\x26", "\u{FFFD}\u{2603}");
        decode_utf_16be(b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}");

        // The \xFF makes sure that the parts before and after have different alignment
        let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8";
        let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D";
        let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}";
        // The first half ends just before the \xFF pivot; the second half
        // starts just after it.
        decode_utf_16le(&long_le[..long_le.len() / 2], long_expect);
        decode_utf_16be(&long_be[..long_be.len() / 2], long_expect);
        decode_utf_16le(&long_le[long_le.len() / 2 + 1..], long_expect);
        decode_utf_16be(&long_be[long_be.len() / 2 + 1..], long_expect);
    }

    #[test]
    fn test_utf_16_encode() {
        // Empty
        encode_utf_16be("", b"");
        encode_utf_16le("", b"");

        // Encodes as UTF-8
        assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8);
        assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8);
        encode_utf_16le("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes());
        encode_utf_16be("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes());
    }

    /// Feeds valid UTF-16BE one byte per call; every call must consume its
    /// byte without reporting an error.
    #[test]
    fn test_utf_16be_decode_one_by_one() {
        let input = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9";
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();
        for b in input.chunks(1) {
            assert_eq!(b.len(), 1);
            let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
            let (result, read, _, had_errors) =
                decoder.decode_to_utf16(b, &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert!(!had_errors);
        }
    }

    /// Feeds valid UTF-16LE one byte per call; every call must consume its
    /// byte without reporting an error.
    #[test]
    fn test_utf_16le_decode_one_by_one() {
        let input = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC";
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();
        for b in input.chunks(1) {
            assert_eq!(b.len(), 1);
            let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
            let (result, read, _, had_errors) =
                decoder.decode_to_utf16(b, &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert!(!had_errors);
        }
    }

    /// Feeds valid UTF-16BE three bytes per call so that code units straddle
    /// call boundaries; the 12-byte input splits into exactly four chunks.
    #[test]
    fn test_utf_16be_decode_three_at_a_time() {
        let input = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4";
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();
        for b in input.chunks(3) {
            assert_eq!(b.len(), 3);
            let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
            let (result, read, _, had_errors) =
                decoder.decode_to_utf16(b, &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, b.len());
            assert!(!had_errors);
        }
    }

    /// Feeds valid UTF-16LE three bytes per call so that code units straddle
    /// call boundaries; the 12-byte input splits into exactly four chunks.
    #[test]
    fn test_utf_16le_decode_three_at_a_time() {
        let input = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00";
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();
        for b in input.chunks(3) {
            assert_eq!(b.len(), 3);
            let needed = decoder.max_utf16_buffer_length(b.len()).unwrap();
            let (result, read, _, had_errors) =
                decoder.decode_to_utf16(b, &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, b.len());
            assert!(!had_errors);
        }
    }

    /// The first byte of a little-endian BOM followed by a byte that does not
    /// complete the BOM must decode as the ordinary code unit 0xFDFF.
    #[test]
    fn test_utf_16le_decode_bom_prefixed_split_byte_pair() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xFF", &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
        }
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 1);
            assert!(!had_errors);
            assert_eq!(output[0], 0xFDFF);
        }
    }

    /// The first byte of a big-endian BOM followed by a byte that does not
    /// complete the BOM must decode as the ordinary code unit 0xFEFD.
    #[test]
    fn test_utf_16be_decode_bom_prefixed_split_byte_pair() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xFE", &mut output[..needed], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
        }
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 1);
            assert!(!had_errors);
            assert_eq!(output[0], 0xFEFD);
        }
    }

    /// A stream consisting solely of the first BOM byte is a truncated code
    /// unit and must yield a single U+FFFD with the error flag set.
    #[test]
    fn test_utf_16le_decode_bom_prefix() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16LE.new_decoder();
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xFF", &mut output[..needed], true);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 1);
            assert!(had_errors);
            assert_eq!(output[0], 0xFFFD);
        }
    }

    /// A stream consisting solely of the first BOM byte is a truncated code
    /// unit and must yield a single U+FFFD with the error flag set.
    #[test]
    fn test_utf_16be_decode_bom_prefix() {
        let mut output = [0u16; 20];
        let mut decoder = UTF_16BE.new_decoder();
        {
            let needed = decoder.max_utf16_buffer_length(1).unwrap();
            let (result, read, written, had_errors) =
                decoder.decode_to_utf16(b"\xFE", &mut output[..needed], true);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0 + 1);
            assert!(had_errors);
            assert_eq!(output[0], 0xFFFD);
        }
    }

    /// With a 4-byte UTF-8 output buffer, one 3-byte character fits and the
    /// decoder must then report `OutputFull` having consumed only the byte
    /// that completed that character.
    #[test]
    fn test_utf_16le_decode_near_end() {
        let mut output = [0u8; 4];
        let mut decoder = UTF_16LE.new_decoder();
        {
            // First half of U+2603: nothing can be written yet.
            let (result, read, written, had_errors) =
                decoder.decode_to_utf8(&[0x03], &mut output[..], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
            assert_eq!(output[0], 0x0);
        }
        {
            // Completes U+2603 (E2 98 83 in UTF-8); the second U+2603 does
            // not fit in the remaining space.
            let (result, read, written, had_errors) =
                decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut output[..], false);
            assert_eq!(result, CoderResult::OutputFull);
            assert_eq!(read, 1);
            assert_eq!(written, 3);
            assert!(!had_errors);
            assert_eq!(output[0], 0xE2);
            assert_eq!(output[1], 0x98);
            assert_eq!(output[2], 0x83);
            assert_eq!(output[3], 0x00);
        }
    }

    /// Big-endian counterpart of `test_utf_16le_decode_near_end`.
    #[test]
    fn test_utf_16be_decode_near_end() {
        let mut output = [0u8; 4];
        let mut decoder = UTF_16BE.new_decoder();
        {
            // First half of U+2603: nothing can be written yet.
            let (result, read, written, had_errors) =
                decoder.decode_to_utf8(&[0x26], &mut output[..], false);
            assert_eq!(result, CoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 0);
            assert!(!had_errors);
            assert_eq!(output[0], 0x0);
        }
        {
            // Completes U+2603 (E2 98 83 in UTF-8); the second U+2603 does
            // not fit in the remaining space.
            let (result, read, written, had_errors) =
                decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut output[..], false);
            assert_eq!(result, CoderResult::OutputFull);
            assert_eq!(read, 1);
            assert_eq!(written, 3);
            assert!(!had_errors);
            assert_eq!(output[0], 0xE2);
            assert_eq!(output[1], 0x98);
            assert_eq!(output[2], 0x83);
            assert_eq!(output[3], 0x00);
        }
    }
}
473 | |