1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use super::*; |
11 | use crate::ascii::ascii_to_basic_latin; |
12 | use crate::ascii::basic_latin_to_ascii; |
13 | use crate::ascii::validate_ascii; |
14 | use crate::handles::*; |
15 | use crate::mem::convert_utf16_to_utf8_partial; |
16 | use crate::variant::*; |
17 | |
// Branch-prediction hints used by the hot loops in this file.
// With the `simd-accel` feature the `core::intrinsics` versions are
// imported; otherwise `likely`/`unlikely` are identity functions so that
// the call sites compile unchanged.
cfg_if! {
    if #[cfg(feature = "simd-accel" )] {
        use ::core::intrinsics::unlikely;
        use ::core::intrinsics::likely;
    } else {
        // Fallback for `unlikely`: returns `b` unchanged.
        #[inline (always)]
        fn unlikely(b: bool) -> bool {
            b
        }
        // Fallback for `likely`: returns `b` unchanged.
        #[inline (always)]
        fn likely(b: bool) -> bool {
            b
        }
    }
}
33 | |
/// Wrapper around the 384-entry bit-mask table that the UTF-8 validation
/// and conversion loops in this file use to check the byte following a
/// three- or four-byte lead.
///
/// The loops index `table` in two ways: directly by the second byte of a
/// sequence (entries `0..256`), and by `lead as usize + 0x80` for a
/// non-ASCII lead byte (entries `256..384`). The two looked-up values are
/// ANDed together; a zero result means the pair is acceptable.
#[repr (align(64))] // Align to cache lines
pub struct Utf8Data {
    /// 256 masks indexed by second byte followed by 128 masks indexed by
    /// lead byte (offset by `0x80`).
    pub table: [u8; 384],
}
38 | |
39 | // BEGIN GENERATED CODE. PLEASE DO NOT EDIT. |
40 | // Instead, please regenerate using generate-encoding-data.py |
41 | |
/// The table instance used by the loops in this file.
///
/// Entries `0..256` are looked up with the second byte of a sequence and
/// entries `256..384` with `lead + 0x80`. Every entry is a multiple of 4,
/// so the two low bits of `table[second] & table[lead + 0x80]` are always
/// zero; the callers rely on this when they OR `third >> 6` into the same
/// value and compare the result against a single constant.
pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    ],
};
65 | |
66 | // END GENERATED CODE |
67 | |
/// Returns the number of bytes at the start of `src` that form valid UTF-8.
///
/// The result is `src.len()` when the whole slice is accepted. Otherwise it
/// is the index of the lead byte of the first sequence that is rejected,
/// which includes a sequence cut short by the end of `src`.
///
/// Structure: `'outer` skips ASCII runs via `validate_ascii`; `'inner`
/// handles multi-byte sequences while at least four bytes remain (so the
/// unchecked reads are in bounds); `'tail` handles the last one to three
/// bytes with checked indexing.
pub fn utf8_valid_up_to(src: &[u8]) -> usize {
    let mut read = 0;
    'outer: loop {
        // Skip the ASCII run starting at `read`. As used here,
        // `validate_ascii` yields `None` when the rest is all ASCII, or the
        // first non-ASCII byte together with the count of ASCII bytes
        // before it.
        let mut byte = {
            let src_remaining = &src[read..];
            match validate_ascii(src_remaining) {
                None => {
                    return src.len();
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if likely(read + 4 <= src.len()) {
            // Invariant for every iteration of `'inner` (and `'three`):
            // `read + 4 <= src.len()` and `byte == src[read]` with
            // `byte >= 0x80`. Each `continue 'inner` / `continue 'three`
            // below is preceded by a fresh `read + 4 <= src.len()` check.
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that the UTF-8 sequence is valid.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                if likely(in_inclusive_range8(byte, 0xC2, 0xDF)) {
                    // Two-byte
                    // SAFETY: `read + 4 <= src.len()`, so `read + 1` is in bounds.
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    read += 2;

                    // Next lead (manually inlined)
                    if likely(read + 4 <= src.len()) {
                        // SAFETY: `read + 4 <= src.len()` was just checked.
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            read += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if likely(byte < 0xF0) {
                    'three: loop {
                        // Three-byte
                        // SAFETY: `read + 4 <= src.len()`, so `read + 1` and
                        // `read + 2` are in bounds.
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // The sequence is accepted only if the lead/second
                        // masks do not intersect and the top two bits of
                        // `third` are `10` (i.e. `third >> 6 == 2`).
                        // SAFETY (table lookup): `byte` is a `u8`, so
                        // `byte as usize + 0x80 <= 0x17F < 384`.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        read += 3;

                        // Next lead (manually inlined)
                        if likely(read + 4 <= src.len()) {
                            // SAFETY: `read + 4 <= src.len()` was just checked.
                            byte = unsafe { *(src.get_unchecked(read)) };
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                continue 'three;
                            }
                            if likely(byte < 0x80) {
                                read += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                // SAFETY: `read + 4 <= src.len()`, so `read + 1 ..= read + 3`
                // are in bounds.
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // Accepted only if the lead/second masks do not intersect,
                // `third >> 6 == 2` and `fourth & 0xC0 == 0x80`; the three
                // checks are packed into one value compared against `0x202`.
                // SAFETY (table lookup): `byte as usize + 0x80 < 384`.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                read += 4;

                // Next lead
                if likely(read + 4 <= src.len()) {
                    // SAFETY: `read + 4 <= src.len()` was just checked.
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        read += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() {
                break 'outer;
            }
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that the UTF-8 sequence is valid.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                read += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Sequence is truncated by the end of `src`.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                read += 2;
                continue 'tail;
            }
            // Four-byte lead bytes must be excluded explicitly here: the
            // `UTF8_DATA.table` check below only relates the lead byte to
            // the second byte, so it would also pass for a four-byte lead
            // followed by a permitted second byte.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    // Sequence is truncated by the end of `src`.
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // SAFETY (table lookup): `byte as usize + 0x80 < 384`.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                read += 3;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            // A four-byte (or invalid) lead with fewer than four bytes left.
            break 'outer;
        }
    }
    read
}
228 | |
/// Converts UTF-8 from `src` into UTF-16 code units in `dst`.
///
/// Conversion stops at the first sequence that is rejected (including one
/// truncated by the end of `src`), at the end of `src`, when `dst` is full,
/// or when an astral code point would need two code units but only one
/// slot remains.
///
/// Returns `(read, written)`: the number of bytes consumed from `src` and
/// the number of code units stored in `dst`. Only complete, accepted
/// sequences are counted in `read`.
#[cfg_attr (feature = "cargo-clippy" , allow(never_loop, cyclomatic_complexity))]
pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Copy the ASCII run. `length` is the smaller of the two remaining
        // lengths, so both pointers are valid for `length` elements.
        // As used here, `None` means all `length` units were ASCII and were
        // copied; otherwise the first non-ASCII byte and the number of
        // ASCII bytes copied before it are returned.
        let mut byte = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            let length = ::core::cmp::min(src_remaining.len(), dst_remaining.len());
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    read += length;
                    written += length;
                    break 'outer;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if likely(read + 4 <= src.len()) {
            // Invariants for every iteration of `'inner` (and `'three`):
            // `read + 4 <= src.len()`, `written < dst.len()`, and
            // `byte == src[read]` with `byte >= 0x80`. Each `continue` back
            // into these loops is preceded by a `written == dst.len()`
            // exit and a fresh `read + 4 <= src.len()` check.
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that a) the UTF-8 sequence is valid and b) that there
                // is output space if it is an astral sequence.
                // We know, thanks to `ascii_to_basic_latin` that there is output
                // space for at least one UTF-16 code unit, so no need to check
                // for output space in the BMP cases.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                if likely(in_inclusive_range8(byte, 0xC2, 0xDF)) {
                    // Two-byte
                    // SAFETY: `read + 4 <= src.len()`, so `read + 1` is in bounds.
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    // SAFETY: `written < dst.len()` per the loop invariant.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) =
                            ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
                    };
                    read += 2;
                    written += 1;

                    // Next lead (manually inlined)
                    if written == dst.len() {
                        break 'outer;
                    }
                    if likely(read + 4 <= src.len()) {
                        // SAFETY: `read + 4 <= src.len()` was just checked.
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            // SAFETY: `written != dst.len()` was checked above.
                            unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                            read += 1;
                            written += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if likely(byte < 0xF0) {
                    'three: loop {
                        // Three-byte
                        // SAFETY: `read + 4 <= src.len()`, so `read + 1` and
                        // `read + 2` are in bounds.
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // Accepted only if the lead/second masks do not
                        // intersect and `third >> 6 == 2`.
                        // SAFETY (table lookup): `byte` is a `u8`, so
                        // `byte as usize + 0x80 <= 0x17F < 384`.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        let point = ((u16::from(byte) & 0xF) << 12)
                            | ((u16::from(second) & 0x3F) << 6)
                            | (u16::from(third) & 0x3F);
                        // SAFETY: `written < dst.len()` per the loop invariant.
                        unsafe { *(dst.get_unchecked_mut(written)) = point };
                        read += 3;
                        written += 1;

                        // Next lead (manually inlined)
                        if written == dst.len() {
                            break 'outer;
                        }
                        if likely(read + 4 <= src.len()) {
                            // SAFETY: `read + 4 <= src.len()` was just checked.
                            byte = unsafe { *(src.get_unchecked(read)) };
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                continue 'three;
                            }
                            if likely(byte < 0x80) {
                                // SAFETY: `written != dst.len()` was checked above.
                                unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                                read += 1;
                                written += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                // An astral code point needs a surrogate pair; stop if only
                // one output slot is left.
                if written + 1 == dst.len() {
                    break 'outer;
                }
                // SAFETY: `read + 4 <= src.len()`, so `read + 1 ..= read + 3`
                // are in bounds.
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // Accepted only if the lead/second masks do not intersect,
                // `third >> 6 == 2` and `fourth & 0xC0 == 0x80`; the three
                // checks are packed into one value compared against `0x202`.
                // SAFETY (table lookup): `byte as usize + 0x80 < 384`.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // `0xD7C0 + (point >> 10)` equals `0xD800 + ((point - 0x10000) >> 10)`,
                // i.e. the high surrogate; the low surrogate carries the low 10 bits.
                // SAFETY: `written < dst.len()` and `written + 1 != dst.len()`,
                // so both `written` and `written + 1` are in bounds.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;

                // Next lead
                if written == dst.len() {
                    break 'outer;
                }
                if likely(read + 4 <= src.len()) {
                    // SAFETY: `read + 4 <= src.len()` was just checked.
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        // SAFETY: `written != dst.len()` was checked above.
                        unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                        read += 1;
                        written += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() || written >= dst.len() {
                break 'outer;
            }
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that the UTF-8 sequence is valid. Every
            // sequence handled in `'tail` produces exactly one code unit,
            // and `written < dst.len()` was checked above.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                dst[written] = u16::from(byte);
                read += 1;
                written += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Sequence is truncated by the end of `src`.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                read += 2;
                written += 1;
                continue 'tail;
            }
            // Four-byte lead bytes must be excluded explicitly here: the
            // `UTF8_DATA.table` check below only relates the lead byte to
            // the second byte, so it would also pass for a four-byte lead
            // followed by a permitted second byte.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    // Sequence is truncated by the end of `src`.
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // SAFETY (table lookup): `byte as usize + 0x80 < 384`.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                dst[written] = point;
                read += 3;
                written += 1;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            // A four-byte (or invalid) lead with fewer than four bytes left.
            break 'outer;
        }
    }
    (read, written)
}
447 | |
/// State of the byte-at-a-time UTF-8 decoder.
///
/// The fields describe a multi-byte sequence that has been started but not
/// yet completed; `bytes_needed == 0` means no sequence is pending.
pub struct Utf8Decoder {
    /// Bits of the code point accumulated so far from the lead byte and
    /// the continuation bytes already accepted.
    code_point: u32,
    bytes_seen: usize,   // 1, 2 or 3: counts continuations only
    bytes_needed: usize, // 1, 2 or 3: counts continuations only
    /// Smallest value accepted for the next continuation byte. Normally
    /// `0x80`; raised after the lead bytes `0xE0` and `0xF0`.
    lower_boundary: u8,
    /// Largest value accepted for the next continuation byte. Normally
    /// `0xBF`; lowered after the lead bytes `0xED` and `0xF4`.
    upper_boundary: u8,
}
455 | |
impl Utf8Decoder {
    /// Creates a decoder with no pending sequence and the default
    /// continuation-byte range `0x80..=0xBF`.
    pub fn new_inner() -> Utf8Decoder {
        Utf8Decoder {
            code_point: 0,
            bytes_seen: 0,
            bytes_needed: 0,
            lower_boundary: 0x80u8,
            upper_boundary: 0xBFu8,
        }
    }

    /// Creates a fresh decoder wrapped in `VariantDecoder::Utf8`.
    pub fn new() -> VariantDecoder {
        VariantDecoder::Utf8(Utf8Decoder::new_inner())
    }

    /// Returns `true` when no multi-byte sequence is in progress.
    pub fn in_neutral_state(&self) -> bool {
        self.bytes_needed == 0
    }

    /// Number of bytes of a pending partial sequence already held in the
    /// decoder state: the lead byte plus the continuation bytes seen, or
    /// zero when no sequence is pending.
    fn extra_from_state(&self) -> usize {
        if self.bytes_needed == 0 {
            0
        } else {
            self.bytes_seen + 1
        }
    }

    /// Returns `byte_length + 1 + extra_from_state()`, or `None` if that
    /// overflows `usize`.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(1 + self.extra_from_state())
    }

    /// Returns `byte_length + 3 + extra_from_state()`, or `None` if that
    /// overflows `usize`.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(3 + self.extra_from_state())
    }

    /// Returns `3 + 3 * (byte_length + extra_from_state())`, or `None` if
    /// any step overflows `usize`.
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        checked_add(
            3,
            checked_mul(3, byte_length.checked_add(self.extra_from_state())),
        )
    }

    // NOTE(review): the roles given to the blocks below are inferred from
    // their contents; confirm against the `decoder_functions!` definition.
    decoder_functions!(
        // First block: intentionally empty.
        {},
        // Second block: bulk fast path, taken only when no partial
        // sequence is pending.
        {
            // This is the fast path. The rest runs only at the
            // start and end for partial sequences.
            if self.bytes_needed == 0 {
                dest.copy_utf8_up_to_invalid_from(&mut source);
            }
        },
        // Third block: presumably runs when the input has ended. A pending
        // partial sequence is reported as malformed; its length is the lead
        // byte plus the continuation bytes seen so far.
        // NOTE(review): unlike the mid-stream error path further down, this
        // path does not reset `lower_boundary`/`upper_boundary`. Presumably
        // harmless because no more input follows - confirm.
        {
            if self.bytes_needed != 0 {
                let bad_bytes = (self.bytes_seen + 1) as u8;
                self.code_point = 0;
                self.bytes_needed = 0;
                self.bytes_seen = 0;
                return (
                    DecoderResult::Malformed(bad_bytes, 0),
                    src_consumed,
                    dest.written(),
                );
            }
        },
        // Fourth block: per-byte state machine for input byte `b`.
        {
            if self.bytes_needed == 0 {
                // No sequence pending: `b` is treated as a lead byte.
                if b < 0x80u8 {
                    destination_handle.write_ascii(b);
                    continue;
                }
                if b < 0xC2u8 {
                    // 0x80..=0xC1 cannot start a sequence.
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                if b < 0xE0u8 {
                    // Two-byte lead: one continuation byte needed.
                    self.bytes_needed = 1;
                    self.code_point = u32::from(b) & 0x1F;
                    continue;
                }
                if b < 0xF0u8 {
                    // Three-byte lead: 0xE0 and 0xED narrow the range
                    // allowed for the first continuation byte.
                    if b == 0xE0u8 {
                        self.lower_boundary = 0xA0u8;
                    } else if b == 0xEDu8 {
                        self.upper_boundary = 0x9Fu8;
                    }
                    self.bytes_needed = 2;
                    self.code_point = u32::from(b) & 0xF;
                    continue;
                }
                if b < 0xF5u8 {
                    // Four-byte lead: 0xF0 and 0xF4 narrow the range
                    // allowed for the first continuation byte.
                    if b == 0xF0u8 {
                        self.lower_boundary = 0x90u8;
                    } else if b == 0xF4u8 {
                        self.upper_boundary = 0x8Fu8;
                    }
                    self.bytes_needed = 3;
                    self.code_point = u32::from(b) & 0x7;
                    continue;
                }
                // 0xF5..=0xFF cannot start a sequence.
                return (
                    DecoderResult::Malformed(1, 0),
                    unread_handle.consumed(),
                    destination_handle.written(),
                );
            }
            // self.bytes_needed != 0
            if !(b >= self.lower_boundary && b <= self.upper_boundary) {
                // `b` is not an acceptable continuation byte. The bytes of
                // the abandoned sequence are reported as malformed and the
                // state is reset. `unread()` is used here instead of
                // `consumed()`, presumably so that `b` is processed again
                // as a potential lead byte - confirm against the handle type.
                let bad_bytes = (self.bytes_seen + 1) as u8;
                self.code_point = 0;
                self.bytes_needed = 0;
                self.bytes_seen = 0;
                self.lower_boundary = 0x80u8;
                self.upper_boundary = 0xBFu8;
                return (
                    DecoderResult::Malformed(bad_bytes, 0),
                    unread_handle.unread(),
                    destination_handle.written(),
                );
            }
            // Narrowed boundaries apply to the first continuation byte
            // only, so restore the defaults once a byte has been accepted.
            self.lower_boundary = 0x80u8;
            self.upper_boundary = 0xBFu8;
            self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
            self.bytes_seen += 1;
            if self.bytes_seen != self.bytes_needed {
                continue;
            }
            // Sequence complete: three continuation bytes means a four-byte
            // sequence, which is written as an astral code point.
            if self.bytes_needed == 3 {
                destination_handle.write_astral(self.code_point);
            } else {
                destination_handle.write_bmp_excl_ascii(self.code_point as u16);
            }
            self.code_point = 0;
            self.bytes_needed = 0;
            self.bytes_seen = 0;
            continue;
        },
        self,
        src_consumed,
        dest,
        source,
        b,
        destination_handle,
        unread_handle,
        check_space_astral
    );
}
605 | |
/// Converts UTF-16 from `src` into UTF-8 in `dst` for as long as at least
/// four bytes of output space remain before each non-ASCII code point.
///
/// Unpaired surrogates are written as the three bytes `EF BF BD`
/// (U+FFFD). Returns `(read, written)`: code units consumed from `src` and
/// bytes stored in `dst`. The function returns early, with input possibly
/// left unconsumed, when fewer than four output bytes remain ahead of a
/// non-ASCII unit or when `dst` is full ahead of an ASCII unit.
///
/// # Panics
///
/// Panics if `written + 4` overflows `usize`.
#[cfg_attr (feature = "cargo-clippy" , allow(never_loop))]
#[inline (never)]
pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Copy the Basic Latin run. `length` is the smaller of the two
        // remaining lengths, so both pointers are valid for `length`
        // elements. As used here, `None` means all `length` units were
        // copied; otherwise the first non-ASCII unit and the number of
        // units copied before it are returned.
        let mut unit = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            let length = if dst_remaining.len() < src_remaining.len() {
                dst_remaining.len()
            } else {
                src_remaining.len()
            };
            match unsafe {
                basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    read += length;
                    written += length;
                    return (read, written);
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // On entry to each iteration, `unit == src[read]` and `unit >= 0x80`.
        'inner: loop {
            // The following loop is only broken out of as a goto forward.
            loop {
                // Unfortunately, this check isn't enough for the compiler to elide
                // the bound checks on writes to dst, which is why they are manually
                // elided, which makes a measurable difference.
                if written.checked_add(4).unwrap() > dst.len() {
                    return (read, written);
                }
                // SAFETY (all unchecked writes in this loop body):
                // `written + 4 <= dst.len()` was just checked and each path
                // below stores at most four bytes before leaving the loop.
                read += 1;
                if unit < 0x800 {
                    // Two-byte sequence.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
                        written += 1;
                    }
                    break;
                }
                let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
                if likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) {
                    // Not a surrogate: three-byte sequence.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
                        written += 1;
                    }
                    break;
                }
                if likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) {
                    // high surrogate
                    // read > src.len() is impossible, but using
                    // >= instead of == allows the compiler to elide a bound check.
                    if read >= src.len() {
                        debug_assert_eq!(read, src.len());
                        // Unpaired surrogate at the end of the buffer.
                        unsafe {
                            *(dst.get_unchecked_mut(written)) = 0xEFu8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = 0xBFu8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = 0xBDu8;
                            written += 1;
                        }
                        return (read, written);
                    }
                    let second = src[read];
                    let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
                    if likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) {
                        // The next code unit is a low surrogate. Advance position.
                        read += 1;
                        // Combine the pair into a scalar value:
                        // ((unit - 0xD800) << 10) + (second - 0xDC00) + 0x10000.
                        let astral = (u32::from(unit) << 10) + u32::from(second)
                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                        unsafe {
                            *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) =
                                ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) =
                                ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
                            written += 1;
                        }
                        break;
                    }
                    // The next code unit is not a low surrogate. Don't advance
                    // position and treat the high surrogate as unpaired.
                    // Fall through
                }
                // Unpaired low surrogate
                // (also reached by the fall-through above for an unpaired
                // high surrogate): write U+FFFD.
                unsafe {
                    *(dst.get_unchecked_mut(written)) = 0xEFu8;
                    written += 1;
                    *(dst.get_unchecked_mut(written)) = 0xBFu8;
                    written += 1;
                    *(dst.get_unchecked_mut(written)) = 0xBDu8;
                    written += 1;
                }
                break;
            }
            // Now see if the next unit is Basic Latin
            // read > src.len() is impossible, but using
            // >= instead of == allows the compiler to elide a bound check.
            if read >= src.len() {
                debug_assert_eq!(read, src.len());
                return (read, written);
            }
            unit = src[read];
            if unlikely(unit < 0x80) {
                // written > dst.len() is impossible, but using
                // >= instead of == allows the compiler to elide a bound check.
                if written >= dst.len() {
                    debug_assert_eq!(written, dst.len());
                    return (read, written);
                }
                dst[written] = unit as u8;
                read += 1;
                written += 1;
                // Mysteriously, adding a punctuation check here makes
                // the expected beneficiary cases *slower*!
                continue 'outer;
            }
            continue 'inner;
        }
    }
}
745 | |
746 | #[inline (never)] |
747 | pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) { |
748 | // Everything below is cold code! |
749 | let mut read = 0; |
750 | let mut written = 0; |
751 | let mut unit = src[read]; |
752 | // We now have up to 3 output slots, so an astral character |
753 | // will not fit. |
754 | if unit < 0x800 { |
755 | loop { |
756 | if unit < 0x80 { |
757 | if written >= dst.len() { |
758 | return (read, written); |
759 | } |
760 | read += 1; |
761 | dst[written] = unit as u8; |
762 | written += 1; |
763 | } else if unit < 0x800 { |
764 | if written + 2 > dst.len() { |
765 | return (read, written); |
766 | } |
767 | read += 1; |
768 | dst[written] = (unit >> 6) as u8 | 0xC0u8; |
769 | written += 1; |
770 | dst[written] = (unit & 0x3F) as u8 | 0x80u8; |
771 | written += 1; |
772 | } else { |
773 | return (read, written); |
774 | } |
775 | // read > src.len() is impossible, but using |
776 | // >= instead of == allows the compiler to elide a bound check. |
777 | if read >= src.len() { |
778 | debug_assert_eq!(read, src.len()); |
779 | return (read, written); |
780 | } |
781 | unit = src[read]; |
782 | } |
783 | } |
784 | // Could be an unpaired surrogate, but we'll need 3 output |
785 | // slots in any case. |
786 | if written + 3 > dst.len() { |
787 | return (read, written); |
788 | } |
789 | read += 1; |
790 | let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); |
791 | if unit_minus_surrogate_start <= (0xDFFF - 0xD800) { |
792 | // Got surrogate |
793 | if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { |
794 | // Got high surrogate |
795 | if read >= src.len() { |
796 | // Unpaired high surrogate |
797 | unit = 0xFFFD; |
798 | } else { |
799 | let second = src[read]; |
800 | if in_inclusive_range16(second, 0xDC00, 0xDFFF) { |
801 | // Valid surrogate pair, but we know it won't fit. |
802 | read -= 1; |
803 | return (read, written); |
804 | } |
805 | // Unpaired high |
806 | unit = 0xFFFD; |
807 | } |
808 | } else { |
809 | // Unpaired low |
810 | unit = 0xFFFD; |
811 | } |
812 | } |
813 | dst[written] = (unit >> 12) as u8 | 0xE0u8; |
814 | written += 1; |
815 | dst[written] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8; |
816 | written += 1; |
817 | dst[written] = (unit & 0x3F) as u8 | 0x80u8; |
818 | written += 1; |
819 | debug_assert_eq!(written, dst.len()); |
820 | (read, written) |
821 | } |
822 | |
823 | pub struct Utf8Encoder; |
824 | |
825 | impl Utf8Encoder { |
826 | pub fn new(encoding: &'static Encoding) -> Encoder { |
827 | Encoder::new(encoding, VariantEncoder::Utf8(Utf8Encoder)) |
828 | } |
829 | |
830 | pub fn max_buffer_length_from_utf16_without_replacement( |
831 | &self, |
832 | u16_length: usize, |
833 | ) -> Option<usize> { |
834 | u16_length.checked_mul(3) |
835 | } |
836 | |
837 | pub fn max_buffer_length_from_utf8_without_replacement( |
838 | &self, |
839 | byte_length: usize, |
840 | ) -> Option<usize> { |
841 | Some(byte_length) |
842 | } |
843 | |
844 | pub fn encode_from_utf16_raw( |
845 | &mut self, |
846 | src: &[u16], |
847 | dst: &mut [u8], |
848 | _last: bool, |
849 | ) -> (EncoderResult, usize, usize) { |
850 | let (read, written) = convert_utf16_to_utf8_partial(src, dst); |
851 | ( |
852 | if read == src.len() { |
853 | EncoderResult::InputEmpty |
854 | } else { |
855 | EncoderResult::OutputFull |
856 | }, |
857 | read, |
858 | written, |
859 | ) |
860 | } |
861 | |
862 | pub fn encode_from_utf8_raw( |
863 | &mut self, |
864 | src: &str, |
865 | dst: &mut [u8], |
866 | _last: bool, |
867 | ) -> (EncoderResult, usize, usize) { |
868 | let bytes = src.as_bytes(); |
869 | let mut to_write = bytes.len(); |
870 | if to_write <= dst.len() { |
871 | (&mut dst[..to_write]).copy_from_slice(bytes); |
872 | return (EncoderResult::InputEmpty, to_write, to_write); |
873 | } |
874 | to_write = dst.len(); |
875 | // Move back until we find a UTF-8 sequence boundary. |
876 | while (bytes[to_write] & 0xC0) == 0x80 { |
877 | to_write -= 1; |
878 | } |
879 | (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]); |
880 | (EncoderResult::OutputFull, to_write, to_write) |
881 | } |
882 | } |
883 | |
884 | // Any copyright to the test code below this comment is dedicated to the |
885 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
886 | |
887 | #[cfg (all(test, feature = "alloc" ))] |
888 | mod tests { |
889 | use super::super::testing::*; |
890 | use super::super::*; |
891 | |
892 | // fn decode_utf8_to_utf16(bytes: &[u8], expect: &[u16]) { |
893 | // decode_to_utf16_without_replacement(UTF_8, bytes, expect); |
894 | // } |
895 | |
// Test helper: forwards `bytes` and the expected output `expect` to the
// shared `decode_to_utf8` helper with the `UTF_8` encoding.
fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) {
    decode_to_utf8(UTF_8, bytes, expect);
}
899 | |
// Test helper: checks that decoding the bytes of `string` as UTF-8 yields
// `string` itself.
fn decode_valid_utf8(string: &str) {
    decode_utf8_to_utf8(string.as_bytes(), string);
}
903 | |
// Test helper: forwards the UTF-16 input `string` and the expected bytes
// `expect` to the shared `encode_from_utf16` helper with the `UTF_8`
// encoding.
fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) {
    encode_from_utf16(UTF_8, string, expect);
}
907 | |
// Test helper: forwards the UTF-8 input `string` and the expected bytes
// `expect` to the shared `encode_from_utf8` helper with the `UTF_8`
// encoding.
fn encode_utf8_from_utf8(string: &str, expect: &[u8]) {
    encode_from_utf8(UTF_8, string, expect);
}
911 | |
912 | fn encode_utf8_from_utf16_with_output_limit( |
913 | string: &[u16], |
914 | expect: &str, |
915 | limit: usize, |
916 | expect_result: EncoderResult, |
917 | ) { |
918 | let mut dst = Vec::new(); |
919 | { |
920 | dst.resize(limit, 0u8); |
921 | let mut encoder = UTF_8.new_encoder(); |
922 | let (result, read, written) = |
923 | encoder.encode_from_utf16_without_replacement(string, &mut dst, false); |
924 | assert_eq!(result, expect_result); |
925 | if expect_result == EncoderResult::InputEmpty { |
926 | assert_eq!(read, string.len()); |
927 | } |
928 | assert_eq!(&dst[..written], expect.as_bytes()); |
929 | } |
930 | { |
931 | dst.resize(64, 0u8); |
932 | for (i, elem) in dst.iter_mut().enumerate() { |
933 | *elem = i as u8; |
934 | } |
935 | let mut encoder = UTF_8.new_encoder(); |
936 | let (_, _, mut j) = |
937 | encoder.encode_from_utf16_without_replacement(string, &mut dst, false); |
938 | while j < dst.len() { |
939 | assert_eq!(usize::from(dst[j]), j); |
940 | j += 1; |
941 | } |
942 | } |
943 | } |
944 | |
/// Table of decoder expectations passed to `decode_valid_utf8` (input must
/// come back unchanged) and `decode_utf8_to_utf8` (raw bytes plus the text
/// they are expected to decode to).
///
/// Most cases come in pairs: the sequence under test at the very end of the
/// input, and the same sequence followed by an ASCII `Z`. The expectations
/// encode one U+FFFD for a truncated but otherwise valid prefix, and one
/// U+FFFD per byte for overlong forms, surrogates and lone continuations.
#[test ]
fn test_utf8_decode() {
    // Empty
    decode_valid_utf8("" );
    // ASCII
    decode_valid_utf8("ab" );
    // Low BMP (two-byte sequence)
    decode_valid_utf8("a \u{E4}Z" );
    // High BMP (three-byte sequence)
    decode_valid_utf8("a \u{2603}Z" );
    // Astral (four-byte sequence)
    decode_valid_utf8("a \u{1F4A9}Z" );
    // Low BMP with last byte missing
    decode_utf8_to_utf8(b"a \xC3Z" , "a \u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xC3" , "a \u{FFFD}" );
    // High BMP with last byte missing
    decode_utf8_to_utf8(b"a \xE2\x98Z" , "a \u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xE2\x98" , "a \u{FFFD}" );
    // Astral with last byte missing
    decode_utf8_to_utf8(b"a \xF0\x9F\x92Z" , "a \u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xF0\x9F\x92" , "a \u{FFFD}" );
    // Lone highest continuation
    decode_utf8_to_utf8(b"a \xBFZ" , "a \u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xBF" , "a \u{FFFD}" );
    // Two lone highest continuations
    decode_utf8_to_utf8(b"a \xBF\xBFZ" , "a \u{FFFD}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xBF\xBF" , "a \u{FFFD}\u{FFFD}" );
    // Low BMP followed by lowest lone continuation
    decode_utf8_to_utf8(b"a \xC3\xA4\x80Z" , "a \u{E4}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xC3\xA4\x80" , "a \u{E4}\u{FFFD}" );
    // Low BMP followed by highest lone continuation
    decode_utf8_to_utf8(b"a \xC3\xA4\xBFZ" , "a \u{E4}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xC3\xA4\xBF" , "a \u{E4}\u{FFFD}" );
    // High BMP followed by lowest lone continuation
    decode_utf8_to_utf8(b"a \xE2\x98\x83\x80Z" , "a \u{2603}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xE2\x98\x83\x80" , "a \u{2603}\u{FFFD}" );
    // High BMP followed by highest lone continuation
    decode_utf8_to_utf8(b"a \xE2\x98\x83\xBFZ" , "a \u{2603}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xE2\x98\x83\xBF" , "a \u{2603}\u{FFFD}" );
    // Astral followed by lowest lone continuation
    decode_utf8_to_utf8(b"a \xF0\x9F\x92\xA9\x80Z" , "a \u{1F4A9}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xF0\x9F\x92\xA9\x80" , "a \u{1F4A9}\u{FFFD}" );
    // Astral followed by highest lone continuation
    decode_utf8_to_utf8(b"a \xF0\x9F\x92\xA9\xBFZ" , "a \u{1F4A9}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \xF0\x9F\x92\xA9\xBF" , "a \u{1F4A9}\u{FFFD}" );

    // Boundary conditions
    // Lowest single-byte (U+0000)
    decode_valid_utf8("Z \x00" );
    decode_valid_utf8("Z \x00Z" );
    // Lowest single-byte as two-byte overlong sequence
    decode_utf8_to_utf8(b"a \xC0\x80" , "a \u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xC0\x80Z" , "a \u{FFFD}\u{FFFD}Z" );
    // Lowest single-byte as three-byte overlong sequence
    decode_utf8_to_utf8(b"a \xE0\x80\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xE0\x80\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Lowest single-byte as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x80\x80\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x80\x80\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // One below lowest single-byte (0x00 - 1 wraps around to 0xFF as a byte)
    decode_utf8_to_utf8(b"a \xFF" , "a \u{FFFD}" );
    decode_utf8_to_utf8(b"a \xFFZ" , "a \u{FFFD}Z" );
    // Highest single-byte (U+007F)
    decode_valid_utf8("a \x7F" );
    decode_valid_utf8("a \x7FZ" );
    // Highest single-byte as two-byte overlong sequence
    decode_utf8_to_utf8(b"a \xC1\xBF" , "a \u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xC1\xBFZ" , "a \u{FFFD}\u{FFFD}Z" );
    // Highest single-byte as three-byte overlong sequence
    decode_utf8_to_utf8(b"a \xE0\x81\xBF" , "a \u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xE0\x81\xBFZ" , "a \u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Highest single-byte as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x80\x81\xBF" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x80\x81\xBFZ" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // One past highest single byte (also lone continuation)
    decode_utf8_to_utf8(b"a \x80Z" , "a \u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \x80" , "a \u{FFFD}" );
    // Two lone continuations
    decode_utf8_to_utf8(b"a \x80\x80Z" , "a \u{FFFD}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \x80\x80" , "a \u{FFFD}\u{FFFD}" );
    // Three lone continuations
    decode_utf8_to_utf8(b"a \x80\x80\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \x80\x80\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}" );
    // Four lone continuations
    decode_utf8_to_utf8(b"a \x80\x80\x80\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    decode_utf8_to_utf8(b"a \x80\x80\x80\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    // Lowest two-byte (U+0080)
    decode_utf8_to_utf8(b"a \xC2\x80" , "a \u{0080}" );
    decode_utf8_to_utf8(b"a \xC2\x80Z" , "a \u{0080}Z" );
    // Lowest two-byte as three-byte overlong sequence
    decode_utf8_to_utf8(b"a \xE0\x82\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xE0\x82\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Lowest two-byte as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x80\x82\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x80\x82\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Lead one below lowest two-byte
    decode_utf8_to_utf8(b"a \xC1\x80" , "a \u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xC1\x80Z" , "a \u{FFFD}\u{FFFD}Z" );
    // Trail one below lowest two-byte: the lead is replaced and the ASCII
    // byte 0x7F is expected to survive as itself
    decode_utf8_to_utf8(b"a \xC2\x7F" , "a \u{FFFD}\u{007F}" );
    decode_utf8_to_utf8(b"a \xC2\x7FZ" , "a \u{FFFD}\u{007F}Z" );
    // Highest two-byte (U+07FF)
    decode_utf8_to_utf8(b"a \xDF\xBF" , "a \u{07FF}" );
    decode_utf8_to_utf8(b"a \xDF\xBFZ" , "a \u{07FF}Z" );
    // Highest two-byte as three-byte overlong sequence
    decode_utf8_to_utf8(b"a \xE0\x9F\xBF" , "a \u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xE0\x9F\xBFZ" , "a \u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Highest two-byte as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x80\x9F\xBF" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x80\x9F\xBFZ" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Lowest three-byte (U+0800)
    decode_utf8_to_utf8(b"a \xE0\xA0\x80" , "a \u{0800}" );
    decode_utf8_to_utf8(b"a \xE0\xA0\x80Z" , "a \u{0800}Z" );
    // Lowest three-byte as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x80\xA0\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x80\xA0\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Highest below surrogates (U+D7FF)
    decode_utf8_to_utf8(b"a \xED\x9F\xBF" , "a \u{D7FF}" );
    decode_utf8_to_utf8(b"a \xED\x9F\xBFZ" , "a \u{D7FF}Z" );
    // Highest below surrogates as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x8D\x9F\xBF" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x8D\x9F\xBFZ" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // First surrogate (U+D800 encoded as three bytes)
    decode_utf8_to_utf8(b"a \xED\xA0\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xED\xA0\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // First surrogate as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x8D\xA0\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x8D\xA0\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Last surrogate (U+DFFF encoded as three bytes)
    decode_utf8_to_utf8(b"a \xED\xBF\xBF" , "a \u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xED\xBF\xBFZ" , "a \u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Last surrogate as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x8D\xBF\xBF" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x8D\xBF\xBFZ" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Lowest above surrogates (U+E000)
    decode_utf8_to_utf8(b"a \xEE\x80\x80" , "a \u{E000}" );
    decode_utf8_to_utf8(b"a \xEE\x80\x80Z" , "a \u{E000}Z" );
    // Lowest above surrogates as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x8E\x80\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x8E\x80\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Highest three-byte (U+FFFF)
    decode_utf8_to_utf8(b"a \xEF\xBF\xBF" , "a \u{FFFF}" );
    decode_utf8_to_utf8(b"a \xEF\xBF\xBFZ" , "a \u{FFFF}Z" );
    // Highest three-byte as four-byte overlong sequence
    decode_utf8_to_utf8(b"a \xF0\x8F\xBF\xBF" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF0\x8F\xBF\xBFZ" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );
    // Lowest four-byte (U+10000)
    decode_utf8_to_utf8(b"a \xF0\x90\x80\x80" , "a \u{10000}" );
    decode_utf8_to_utf8(b"a \xF0\x90\x80\x80Z" , "a \u{10000}Z" );
    // Highest four-byte (U+10FFFF)
    decode_utf8_to_utf8(b"a \xF4\x8F\xBF\xBF" , "a \u{10FFFF}" );
    decode_utf8_to_utf8(b"a \xF4\x8F\xBF\xBFZ" , "a \u{10FFFF}Z" );
    // One past highest four-byte
    decode_utf8_to_utf8(b"a \xF4\x90\x80\x80" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF4\x90\x80\x80Z" , "a \u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z" );

    // Highest four-byte with last byte replaced with 0xFF: the three-byte
    // prefix is expected to collapse into one U+FFFD and 0xFF into another
    decode_utf8_to_utf8(b"a \xF4\x8F\xBF\xFF" , "a \u{FFFD}\u{FFFD}" );
    decode_utf8_to_utf8(b"a \xF4\x8F\xBF\xFFZ" , "a \u{FFFD}\u{FFFD}Z" );
}
1105 | |
/// Checks UTF-8 encoding of UTF-16 input at the code point boundaries between
/// one-, two-, three- and four-byte sequences, and the expectation that
/// unpaired surrogates are emitted as U+FFFD.
///
/// The expected strings must contain exactly the code points of the input;
/// a stray leading space in these literals made the comparison against the
/// encoder output impossible to satisfy.
#[test]
fn test_utf8_encode() {
    // Empty
    encode_utf8_from_utf16(&[], b"");
    encode_utf8_from_utf8("", b"");

    // One-byte range boundaries
    encode_utf8_from_utf16(&[0x0000], "\u{0000}".as_bytes());
    encode_utf8_from_utf16(&[0x007F], "\u{007F}".as_bytes());
    // Two-byte range boundaries
    encode_utf8_from_utf16(&[0x0080], "\u{0080}".as_bytes());
    encode_utf8_from_utf16(&[0x07FF], "\u{07FF}".as_bytes());
    // Three-byte range, up to just below the surrogates
    encode_utf8_from_utf16(&[0x0800], "\u{0800}".as_bytes());
    encode_utf8_from_utf16(&[0xD7FF], "\u{D7FF}".as_bytes());
    // Unpaired surrogates, alone and followed by ASCII
    encode_utf8_from_utf16(&[0xD800], "\u{FFFD}".as_bytes());
    encode_utf8_from_utf16(&[0xD800, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
    encode_utf8_from_utf16(&[0xDFFF], "\u{FFFD}".as_bytes());
    encode_utf8_from_utf16(&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
    // Three-byte range above the surrogates
    encode_utf8_from_utf16(&[0xE000], "\u{E000}".as_bytes());
    encode_utf8_from_utf16(&[0xFFFF], "\u{FFFF}".as_bytes());
    // Lowest and highest surrogate pairs (four-byte range)
    encode_utf8_from_utf16(&[0xD800, 0xDC00], "\u{10000}".as_bytes());
    encode_utf8_from_utf16(&[0xDBFF, 0xDFFF], "\u{10FFFF}".as_bytes());
    // Two low surrogates in a row: each one is replaced separately
    encode_utf8_from_utf16(&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}".as_bytes());
}
1128 | |
/// Drives `encode_utf8_from_utf16_with_output_limit` with output buffers that
/// are either exactly large enough (`InputEmpty`) or one byte too small
/// (`OutputFull`) for mixes of one-, two-, three- and four-byte code points.
///
/// Each expected string holds exactly the code points that fit into `limit`
/// bytes. A stray leading space in these literals contradicted the byte
/// counts implied by the limits and has been removed.
#[test]
fn test_encode_utf8_from_utf16_with_output_limit() {
    // Single code point, buffer exactly as long as its UTF-8 form.
    encode_utf8_from_utf16_with_output_limit(&[0x0062], "\u{62}", 1, EncoderResult::InputEmpty);
    encode_utf8_from_utf16_with_output_limit(&[0x00A7], "\u{A7}", 2, EncoderResult::InputEmpty);
    encode_utf8_from_utf16_with_output_limit(
        &[0x2603],
        "\u{2603}",
        3,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDCA9],
        "\u{1F4A9}",
        4,
        EncoderResult::InputEmpty,
    );

    // Single multi-byte code point, buffer one byte short: nothing is written.
    encode_utf8_from_utf16_with_output_limit(&[0x00A7], "", 1, EncoderResult::OutputFull);
    encode_utf8_from_utf16_with_output_limit(&[0x2603], "", 2, EncoderResult::OutputFull);
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDCA9],
        "",
        3,
        EncoderResult::OutputFull,
    );

    // ASCII followed by a second code point, exact fit.
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x0062],
        "\u{63}\u{62}",
        2,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00A7],
        "\u{63}\u{A7}",
        3,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x2603],
        "\u{63}\u{2603}",
        4,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0xD83D, 0xDCA9],
        "\u{63}\u{1F4A9}",
        5,
        EncoderResult::InputEmpty,
    );

    // ASCII followed by a second code point, one byte short.
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00A7],
        "\u{63}",
        2,
        EncoderResult::OutputFull,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x2603],
        "\u{63}",
        3,
        EncoderResult::OutputFull,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0xD83D, 0xDCA9],
        "\u{63}",
        4,
        EncoderResult::OutputFull,
    );

    // Two-byte code point followed by a second code point, exact fit.
    encode_utf8_from_utf16_with_output_limit(
        &[0x00B6, 0x0062],
        "\u{B6}\u{62}",
        3,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x00B6, 0x00A7],
        "\u{B6}\u{A7}",
        4,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x00B6, 0x2603],
        "\u{B6}\u{2603}",
        5,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x00B6, 0xD83D, 0xDCA9],
        "\u{B6}\u{1F4A9}",
        6,
        EncoderResult::InputEmpty,
    );

    // Two-byte code point followed by a second code point, one byte short.
    encode_utf8_from_utf16_with_output_limit(
        &[0x00B6, 0x00A7],
        "\u{B6}",
        3,
        EncoderResult::OutputFull,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x00B6, 0x2603],
        "\u{B6}",
        4,
        EncoderResult::OutputFull,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x00B6, 0xD83D, 0xDCA9],
        "\u{B6}",
        5,
        EncoderResult::OutputFull,
    );

    // Three-byte code point followed by a second code point, exact fit.
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x0062],
        "\u{263A}\u{62}",
        4,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x00A7],
        "\u{263A}\u{A7}",
        5,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x2603],
        "\u{263A}\u{2603}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0xD83D, 0xDCA9],
        "\u{263A}\u{1F4A9}",
        7,
        EncoderResult::InputEmpty,
    );

    // Three-byte code point followed by a second code point, one byte short.
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x00A7],
        "\u{263A}",
        4,
        EncoderResult::OutputFull,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x2603],
        "\u{263A}",
        5,
        EncoderResult::OutputFull,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0xD83D, 0xDCA9],
        "\u{263A}",
        6,
        EncoderResult::OutputFull,
    );

    // Surrogate pair followed by a second code point, exact fit.
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDE0E, 0x0062],
        "\u{1F60E}\u{62}",
        5,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDE0E, 0x00A7],
        "\u{1F60E}\u{A7}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDE0E, 0x2603],
        "\u{1F60E}\u{2603}",
        7,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
        "\u{1F60E}\u{1F4A9}",
        8,
        EncoderResult::InputEmpty,
    );

    // Surrogate pair followed by a second code point, one byte short.
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDE0E, 0x00A7],
        "\u{1F60E}",
        5,
        EncoderResult::OutputFull,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDE0E, 0x2603],
        "\u{1F60E}",
        6,
        EncoderResult::OutputFull,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
        "\u{1F60E}",
        7,
        EncoderResult::OutputFull,
    );

    // Longer runs ending in ASCII: exact fit, then one byte short.
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x0062, 0x0062],
        "\u{63}\u{B6}\u{62}\u{62}",
        5,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x0062, 0x0062],
        "\u{63}\u{B6}\u{62}",
        4,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
        "\u{63}\u{B6}\u{62}\u{62}\u{62}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
        "\u{63}\u{B6}\u{62}\u{62}",
        5,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x0062, 0x0062],
        "\u{263A}\u{62}\u{62}",
        5,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x0062, 0x0062],
        "\u{263A}\u{62}",
        4,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x0062, 0x0062, 0x0062],
        "\u{263A}\u{62}\u{62}\u{62}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x0062, 0x0062, 0x0062],
        "\u{263A}\u{62}\u{62}",
        5,
        EncoderResult::OutputFull,
    );

    // Longer runs containing a two-byte code point near the end.
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x00A7],
        "\u{63}\u{B6}\u{A7}",
        5,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x00A7],
        "\u{63}\u{B6}",
        4,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x00A7, 0x0062],
        "\u{63}\u{B6}\u{A7}\u{62}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x00A7, 0x0062],
        "\u{63}\u{B6}\u{A7}",
        5,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x00A7, 0x0062],
        "\u{263A}\u{A7}\u{62}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x00A7, 0x0062],
        "\u{263A}\u{A7}",
        5,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x0062, 0x00A7],
        "\u{63}\u{B6}\u{62}\u{A7}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x0062, 0x00A7],
        "\u{63}\u{B6}\u{62}",
        5,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x0062, 0x00A7],
        "\u{263A}\u{62}\u{A7}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x0062, 0x00A7],
        "\u{263A}\u{62}",
        5,
        EncoderResult::OutputFull,
    );

    // Longer runs ending in a three-byte code point.
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x2603],
        "\u{63}\u{B6}\u{2603}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0x2603],
        "\u{63}\u{B6}",
        5,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x2603],
        "\u{263A}\u{2603}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0x2603],
        "\u{263A}",
        5,
        EncoderResult::OutputFull,
    );

    // Runs ending in an unpaired high surrogate, expected as U+FFFD.
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0xD83D],
        "\u{63}\u{B6}\u{FFFD}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0xD83D],
        "\u{63}\u{B6}",
        5,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0xD83D],
        "\u{263A}\u{FFFD}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0xD83D],
        "\u{263A}",
        5,
        EncoderResult::OutputFull,
    );

    // Runs ending in an unpaired low surrogate, expected as U+FFFD.
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0xDCA9],
        "\u{63}\u{B6}\u{FFFD}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x0063, 0x00B6, 0xDCA9],
        "\u{63}\u{B6}",
        5,
        EncoderResult::OutputFull,
    );

    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0xDCA9],
        "\u{263A}\u{FFFD}",
        6,
        EncoderResult::InputEmpty,
    );
    encode_utf8_from_utf16_with_output_limit(
        &[0x263A, 0xDCA9],
        "\u{263A}",
        5,
        EncoderResult::OutputFull,
    );
}
1526 | |
/// Checks that the length reported by
/// `max_buffer_length_from_utf16_without_replacement` is enough for the
/// encoder to consume four three-byte code points in a single call.
#[test]
fn test_utf8_max_length_from_utf16() {
    let input = [0x2C9Fu16, 0x2CA9u16, 0x2CA3u16, 0x2C9Fu16];
    let mut output = [0u8; 13];
    let mut encoder = UTF_8.new_encoder();
    // Slicing panics if the reported length exceeds the 13-byte backing array.
    let needed = encoder
        .max_buffer_length_from_utf16_without_replacement(input.len())
        .unwrap();
    let destination = &mut output[..needed];
    let (result, _read, _written) =
        encoder.encode_from_utf16_without_replacement(&input, destination, true);
    assert_eq!(result, EncoderResult::InputEmpty);
}
1539 | |
/// Feeds the three bytes EF BF BE (U+FFFE) to a decoder from `new_decoder()`
/// one byte per call. The first byte is also the first byte of the UTF-8 BOM
/// (EF BB BF), so the decoder cannot tell yet whether a BOM is coming.
///
/// Each call must pass exactly one byte, because the assertions pin
/// `read == 1`. The stray leading space in the byte-string literals made
/// every input two bytes long and has been removed.
#[test]
fn test_decode_bom_prefixed_split_byte_triple() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_8.new_decoder();
    {
        // First byte: consumed, nothing can be emitted yet.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 0);
        assert!(!had_errors);
    }
    {
        // Second byte: still an incomplete sequence, still no output.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xBF", &mut output[..needed], false);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 0);
        assert!(!had_errors);
    }
    {
        // Final byte (last = true): the sequence completes as U+FFFE.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xBE", &mut output[..needed], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert!(!had_errors);
        assert_eq!(output[0], 0xFFFE);
    }
}
1573 | |
/// Feeds EF and then BC, one byte per call, to a decoder from `new_decoder()`
/// and ends the stream after the second byte. The truncated sequence is
/// expected to produce a single U+FFFD with the error flag set.
///
/// Each call must pass exactly one byte, because the assertions pin
/// `read == 1`. The stray leading space in the byte-string literals made
/// every input two bytes long and has been removed.
#[test]
fn test_decode_bom_prefixed_split_byte_pair() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_8.new_decoder();
    {
        // First byte (also the first byte of the UTF-8 BOM): no output yet.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 0);
        assert!(!had_errors);
    }
    {
        // Second byte with last = true: the stream ends mid-sequence.
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xBC", &mut output[..needed], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert!(had_errors);
        assert_eq!(output[0], 0xFFFD);
    }
}
1598 | |
/// Ends the stream right after a lone EF byte (the first byte of the UTF-8
/// BOM) and expects a single U+FFFD with the error flag set.
///
/// The input must be exactly one byte, because the assertions pin
/// `read == 1` and `output[0] == 0xFFFD`. The stray leading space in the
/// byte-string literal has been removed.
#[test]
fn test_decode_bom_prefix() {
    let mut output = [0u16; 20];
    let mut decoder = UTF_8.new_decoder();
    {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(b"\xEF", &mut output[..needed], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, 1);
        assert!(had_errors);
        assert_eq!(output[0], 0xFFFD);
    }
}
1614 | |
/// Decodes U+00E4 followed by ASCII `a` into a one-unit UTF-16 buffer using a
/// decoder created without BOM handling. The two-byte sequence must fill the
/// only output slot, leaving the trailing `a` unread (`OutputFull`).
///
/// The input must start with the two-byte sequence, because the assertions
/// pin `read == 2` and `output[0] == 0x00E4`. The stray leading space in the
/// string literal has been removed.
#[test]
fn test_tail() {
    let mut output = [0u16; 1];
    let mut decoder = UTF_8.new_decoder_without_bom_handling();
    {
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16("\u{E4}a".as_bytes(), &mut output[..], false);
        assert_eq!(result, CoderResult::OutputFull);
        assert_eq!(read, 2);
        assert_eq!(written, 1);
        assert!(!had_errors);
        assert_eq!(output[0], 0x00E4);
    }
}
1629 | } |
1630 | |