1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::ascii::ascii_to_basic_latin;
12use crate::ascii::basic_latin_to_ascii;
13use crate::ascii::validate_ascii;
14use crate::handles::*;
15use crate::mem::convert_utf16_to_utf8_partial;
16use crate::variant::*;
17
// Branch-prediction hints used by the hot loops in this file. With the
// `simd-accel` feature enabled they are the `core::intrinsics` versions;
// otherwise they are identity functions, so call sites are written the
// same way in both configurations.
#[cfg(feature = "simd-accel")]
use ::core::intrinsics::likely;
#[cfg(feature = "simd-accel")]
use ::core::intrinsics::unlikely;

/// Fallback for the `unlikely` hint: returns `b` unchanged.
#[cfg(not(feature = "simd-accel"))]
#[inline(always)]
fn unlikely(b: bool) -> bool {
    b
}

/// Fallback for the `likely` hint: returns `b` unchanged.
#[cfg(not(feature = "simd-accel"))]
#[inline(always)]
fn likely(b: bool) -> bool {
    b
}
33
/// Lookup table used to validate the bytes that follow a UTF-8 lead byte.
///
/// The validators in this file read `table` in two ways and AND the two
/// entries together:
///
/// * `table[second]`, indexed by the second byte of a sequence
///   (indices `0x00..=0xFF`), and
/// * `table[lead + 0x80]`, indexed by a non-ASCII lead byte
///   (indices `0x100..=0x17F`).
///
/// The 64-byte alignment is there to align the data to cache lines.
#[repr(align(64))]
pub struct Utf8Data {
    pub table: [u8; 384],
}
38
39// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
40// Instead, please regenerate using generate-encoding-data.py
41
42pub static UTF8_DATA: Utf8Data = Utf8Data {
43 table: [
44 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
45 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
46 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
47 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
48 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
49 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
50 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
51 252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
52 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
53 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
54 164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
55 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
56 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
57 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
58 252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
60 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
61 8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
62 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
63 ],
64};
65
66// END GENERATED CODE
67
68pub fn utf8_valid_up_to(src: &[u8]) -> usize {
69 let mut read = 0;
70 'outer: loop {
71 let mut byte = {
72 let src_remaining = &src[read..];
73 match validate_ascii(src_remaining) {
74 None => {
75 return src.len();
76 }
77 Some((non_ascii, consumed)) => {
78 read += consumed;
79 non_ascii
80 }
81 }
82 };
83 // Check for the longest sequence to avoid checking twice for the
84 // multi-byte sequences. This can't overflow with 64-bit address space,
85 // because full 64 bits aren't in use. In the 32-bit PAE case, for this
86 // to overflow would mean that the source slice would be so large that
87 // the address space of the process would not have space for any code.
88 // Therefore, the slice cannot be so long that this would overflow.
89 if likely(read + 4 <= src.len()) {
90 'inner: loop {
91 // At this point, `byte` is not included in `read`, because we
92 // don't yet know that a) the UTF-8 sequence is valid and b) that there
93 // is output space if it is an astral sequence.
94 // Inspecting the lead byte directly is faster than what the
95 // std lib does!
96 if likely(in_inclusive_range8(byte, 0xC2, 0xDF)) {
97 // Two-byte
98 let second = unsafe { *(src.get_unchecked(read + 1)) };
99 if !in_inclusive_range8(second, 0x80, 0xBF) {
100 break 'outer;
101 }
102 read += 2;
103
104 // Next lead (manually inlined)
105 if likely(read + 4 <= src.len()) {
106 byte = unsafe { *(src.get_unchecked(read)) };
107 if byte < 0x80 {
108 read += 1;
109 continue 'outer;
110 }
111 continue 'inner;
112 }
113 break 'inner;
114 }
115 if likely(byte < 0xF0) {
116 'three: loop {
117 // Three-byte
118 let second = unsafe { *(src.get_unchecked(read + 1)) };
119 let third = unsafe { *(src.get_unchecked(read + 2)) };
120 if ((UTF8_DATA.table[usize::from(second)]
121 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
122 | (third >> 6))
123 != 2
124 {
125 break 'outer;
126 }
127 read += 3;
128
129 // Next lead (manually inlined)
130 if likely(read + 4 <= src.len()) {
131 byte = unsafe { *(src.get_unchecked(read)) };
132 if in_inclusive_range8(byte, 0xE0, 0xEF) {
133 continue 'three;
134 }
135 if likely(byte < 0x80) {
136 read += 1;
137 continue 'outer;
138 }
139 continue 'inner;
140 }
141 break 'inner;
142 }
143 }
144 // Four-byte
145 let second = unsafe { *(src.get_unchecked(read + 1)) };
146 let third = unsafe { *(src.get_unchecked(read + 2)) };
147 let fourth = unsafe { *(src.get_unchecked(read + 3)) };
148 if (u16::from(
149 UTF8_DATA.table[usize::from(second)]
150 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
151 ) | u16::from(third >> 6)
152 | (u16::from(fourth & 0xC0) << 2))
153 != 0x202
154 {
155 break 'outer;
156 }
157 read += 4;
158
159 // Next lead
160 if likely(read + 4 <= src.len()) {
161 byte = unsafe { *(src.get_unchecked(read)) };
162 if byte < 0x80 {
163 read += 1;
164 continue 'outer;
165 }
166 continue 'inner;
167 }
168 break 'inner;
169 }
170 }
171 // We can't have a complete 4-byte sequence, but we could still have
172 // one to three shorter sequences.
173 'tail: loop {
174 // >= is better for bound check elision than ==
175 if read >= src.len() {
176 break 'outer;
177 }
178 byte = src[read];
179 // At this point, `byte` is not included in `read`, because we
180 // don't yet know that a) the UTF-8 sequence is valid and b) that there
181 // is output space if it is an astral sequence.
182 // Inspecting the lead byte directly is faster than what the
183 // std lib does!
184 if byte < 0x80 {
185 read += 1;
186 continue 'tail;
187 }
188 if in_inclusive_range8(byte, 0xC2, 0xDF) {
189 // Two-byte
190 let new_read = read + 2;
191 if new_read > src.len() {
192 break 'outer;
193 }
194 let second = src[read + 1];
195 if !in_inclusive_range8(second, 0x80, 0xBF) {
196 break 'outer;
197 }
198 read += 2;
199 continue 'tail;
200 }
201 // We need to exclude valid four byte lead bytes, because
202 // `UTF8_DATA.second_mask` covers
203 if byte < 0xF0 {
204 // Three-byte
205 let new_read = read + 3;
206 if new_read > src.len() {
207 break 'outer;
208 }
209 let second = src[read + 1];
210 let third = src[read + 2];
211 if ((UTF8_DATA.table[usize::from(second)]
212 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
213 | (third >> 6))
214 != 2
215 {
216 break 'outer;
217 }
218 read += 3;
219 // `'tail` handles sequences shorter than 4, so
220 // there can't be another sequence after this one.
221 break 'outer;
222 }
223 break 'outer;
224 }
225 }
226 read
227}
228
229#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
230pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
231 let mut read = 0;
232 let mut written = 0;
233 'outer: loop {
234 let mut byte = {
235 let src_remaining = &src[read..];
236 let dst_remaining = &mut dst[written..];
237 let length = ::core::cmp::min(src_remaining.len(), dst_remaining.len());
238 match unsafe {
239 ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
240 } {
241 None => {
242 read += length;
243 written += length;
244 break 'outer;
245 }
246 Some((non_ascii, consumed)) => {
247 read += consumed;
248 written += consumed;
249 non_ascii
250 }
251 }
252 };
253 // Check for the longest sequence to avoid checking twice for the
254 // multi-byte sequences. This can't overflow with 64-bit address space,
255 // because full 64 bits aren't in use. In the 32-bit PAE case, for this
256 // to overflow would mean that the source slice would be so large that
257 // the address space of the process would not have space for any code.
258 // Therefore, the slice cannot be so long that this would overflow.
259 if likely(read + 4 <= src.len()) {
260 'inner: loop {
261 // At this point, `byte` is not included in `read`, because we
262 // don't yet know that a) the UTF-8 sequence is valid and b) that there
263 // is output space if it is an astral sequence.
264 // We know, thanks to `ascii_to_basic_latin` that there is output
265 // space for at least one UTF-16 code unit, so no need to check
266 // for output space in the BMP cases.
267 // Inspecting the lead byte directly is faster than what the
268 // std lib does!
269 if likely(in_inclusive_range8(byte, 0xC2, 0xDF)) {
270 // Two-byte
271 let second = unsafe { *(src.get_unchecked(read + 1)) };
272 if !in_inclusive_range8(second, 0x80, 0xBF) {
273 break 'outer;
274 }
275 unsafe {
276 *(dst.get_unchecked_mut(written)) =
277 ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
278 };
279 read += 2;
280 written += 1;
281
282 // Next lead (manually inlined)
283 if written == dst.len() {
284 break 'outer;
285 }
286 if likely(read + 4 <= src.len()) {
287 byte = unsafe { *(src.get_unchecked(read)) };
288 if byte < 0x80 {
289 unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
290 read += 1;
291 written += 1;
292 continue 'outer;
293 }
294 continue 'inner;
295 }
296 break 'inner;
297 }
298 if likely(byte < 0xF0) {
299 'three: loop {
300 // Three-byte
301 let second = unsafe { *(src.get_unchecked(read + 1)) };
302 let third = unsafe { *(src.get_unchecked(read + 2)) };
303 if ((UTF8_DATA.table[usize::from(second)]
304 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
305 | (third >> 6))
306 != 2
307 {
308 break 'outer;
309 }
310 let point = ((u16::from(byte) & 0xF) << 12)
311 | ((u16::from(second) & 0x3F) << 6)
312 | (u16::from(third) & 0x3F);
313 unsafe { *(dst.get_unchecked_mut(written)) = point };
314 read += 3;
315 written += 1;
316
317 // Next lead (manually inlined)
318 if written == dst.len() {
319 break 'outer;
320 }
321 if likely(read + 4 <= src.len()) {
322 byte = unsafe { *(src.get_unchecked(read)) };
323 if in_inclusive_range8(byte, 0xE0, 0xEF) {
324 continue 'three;
325 }
326 if likely(byte < 0x80) {
327 unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
328 read += 1;
329 written += 1;
330 continue 'outer;
331 }
332 continue 'inner;
333 }
334 break 'inner;
335 }
336 }
337 // Four-byte
338 if written + 1 == dst.len() {
339 break 'outer;
340 }
341 let second = unsafe { *(src.get_unchecked(read + 1)) };
342 let third = unsafe { *(src.get_unchecked(read + 2)) };
343 let fourth = unsafe { *(src.get_unchecked(read + 3)) };
344 if (u16::from(
345 UTF8_DATA.table[usize::from(second)]
346 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
347 ) | u16::from(third >> 6)
348 | (u16::from(fourth & 0xC0) << 2))
349 != 0x202
350 {
351 break 'outer;
352 }
353 let point = ((u32::from(byte) & 0x7) << 18)
354 | ((u32::from(second) & 0x3F) << 12)
355 | ((u32::from(third) & 0x3F) << 6)
356 | (u32::from(fourth) & 0x3F);
357 unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
358 unsafe {
359 *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
360 };
361 read += 4;
362 written += 2;
363
364 // Next lead
365 if written == dst.len() {
366 break 'outer;
367 }
368 if likely(read + 4 <= src.len()) {
369 byte = unsafe { *(src.get_unchecked(read)) };
370 if byte < 0x80 {
371 unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
372 read += 1;
373 written += 1;
374 continue 'outer;
375 }
376 continue 'inner;
377 }
378 break 'inner;
379 }
380 }
381 // We can't have a complete 4-byte sequence, but we could still have
382 // one to three shorter sequences.
383 'tail: loop {
384 // >= is better for bound check elision than ==
385 if read >= src.len() || written >= dst.len() {
386 break 'outer;
387 }
388 byte = src[read];
389 // At this point, `byte` is not included in `read`, because we
390 // don't yet know that a) the UTF-8 sequence is valid and b) that there
391 // is output space if it is an astral sequence.
392 // Inspecting the lead byte directly is faster than what the
393 // std lib does!
394 if byte < 0x80 {
395 dst[written] = u16::from(byte);
396 read += 1;
397 written += 1;
398 continue 'tail;
399 }
400 if in_inclusive_range8(byte, 0xC2, 0xDF) {
401 // Two-byte
402 let new_read = read + 2;
403 if new_read > src.len() {
404 break 'outer;
405 }
406 let second = src[read + 1];
407 if !in_inclusive_range8(second, 0x80, 0xBF) {
408 break 'outer;
409 }
410 dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
411 read += 2;
412 written += 1;
413 continue 'tail;
414 }
415 // We need to exclude valid four byte lead bytes, because
416 // `UTF8_DATA.second_mask` covers
417 if byte < 0xF0 {
418 // Three-byte
419 let new_read = read + 3;
420 if new_read > src.len() {
421 break 'outer;
422 }
423 let second = src[read + 1];
424 let third = src[read + 2];
425 if ((UTF8_DATA.table[usize::from(second)]
426 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
427 | (third >> 6))
428 != 2
429 {
430 break 'outer;
431 }
432 let point = ((u16::from(byte) & 0xF) << 12)
433 | ((u16::from(second) & 0x3F) << 6)
434 | (u16::from(third) & 0x3F);
435 dst[written] = point;
436 read += 3;
437 written += 1;
438 // `'tail` handles sequences shorter than 4, so
439 // there can't be another sequence after this one.
440 break 'outer;
441 }
442 break 'outer;
443 }
444 }
445 (read, written)
446}
447
448pub struct Utf8Decoder {
449 code_point: u32,
450 bytes_seen: usize, // 1, 2 or 3: counts continuations only
451 bytes_needed: usize, // 1, 2 or 3: counts continuations only
452 lower_boundary: u8,
453 upper_boundary: u8,
454}
455
456impl Utf8Decoder {
457 pub fn new_inner() -> Utf8Decoder {
458 Utf8Decoder {
459 code_point: 0,
460 bytes_seen: 0,
461 bytes_needed: 0,
462 lower_boundary: 0x80u8,
463 upper_boundary: 0xBFu8,
464 }
465 }
466
467 pub fn new() -> VariantDecoder {
468 VariantDecoder::Utf8(Utf8Decoder::new_inner())
469 }
470
471 pub fn in_neutral_state(&self) -> bool {
472 self.bytes_needed == 0
473 }
474
475 fn extra_from_state(&self) -> usize {
476 if self.bytes_needed == 0 {
477 0
478 } else {
479 self.bytes_seen + 1
480 }
481 }
482
483 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
484 byte_length.checked_add(1 + self.extra_from_state())
485 }
486
487 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
488 byte_length.checked_add(3 + self.extra_from_state())
489 }
490
491 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
492 checked_add(
493 3,
494 checked_mul(3, byte_length.checked_add(self.extra_from_state())),
495 )
496 }
497
498 decoder_functions!(
499 {},
500 {
501 // This is the fast path. The rest runs only at the
502 // start and end for partial sequences.
503 if self.bytes_needed == 0 {
504 dest.copy_utf8_up_to_invalid_from(&mut source);
505 }
506 },
507 {
508 if self.bytes_needed != 0 {
509 let bad_bytes = (self.bytes_seen + 1) as u8;
510 self.code_point = 0;
511 self.bytes_needed = 0;
512 self.bytes_seen = 0;
513 return (
514 DecoderResult::Malformed(bad_bytes, 0),
515 src_consumed,
516 dest.written(),
517 );
518 }
519 },
520 {
521 if self.bytes_needed == 0 {
522 if b < 0x80u8 {
523 destination_handle.write_ascii(b);
524 continue;
525 }
526 if b < 0xC2u8 {
527 return (
528 DecoderResult::Malformed(1, 0),
529 unread_handle.consumed(),
530 destination_handle.written(),
531 );
532 }
533 if b < 0xE0u8 {
534 self.bytes_needed = 1;
535 self.code_point = u32::from(b) & 0x1F;
536 continue;
537 }
538 if b < 0xF0u8 {
539 if b == 0xE0u8 {
540 self.lower_boundary = 0xA0u8;
541 } else if b == 0xEDu8 {
542 self.upper_boundary = 0x9Fu8;
543 }
544 self.bytes_needed = 2;
545 self.code_point = u32::from(b) & 0xF;
546 continue;
547 }
548 if b < 0xF5u8 {
549 if b == 0xF0u8 {
550 self.lower_boundary = 0x90u8;
551 } else if b == 0xF4u8 {
552 self.upper_boundary = 0x8Fu8;
553 }
554 self.bytes_needed = 3;
555 self.code_point = u32::from(b) & 0x7;
556 continue;
557 }
558 return (
559 DecoderResult::Malformed(1, 0),
560 unread_handle.consumed(),
561 destination_handle.written(),
562 );
563 }
564 // self.bytes_needed != 0
565 if !(b >= self.lower_boundary && b <= self.upper_boundary) {
566 let bad_bytes = (self.bytes_seen + 1) as u8;
567 self.code_point = 0;
568 self.bytes_needed = 0;
569 self.bytes_seen = 0;
570 self.lower_boundary = 0x80u8;
571 self.upper_boundary = 0xBFu8;
572 return (
573 DecoderResult::Malformed(bad_bytes, 0),
574 unread_handle.unread(),
575 destination_handle.written(),
576 );
577 }
578 self.lower_boundary = 0x80u8;
579 self.upper_boundary = 0xBFu8;
580 self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
581 self.bytes_seen += 1;
582 if self.bytes_seen != self.bytes_needed {
583 continue;
584 }
585 if self.bytes_needed == 3 {
586 destination_handle.write_astral(self.code_point);
587 } else {
588 destination_handle.write_bmp_excl_ascii(self.code_point as u16);
589 }
590 self.code_point = 0;
591 self.bytes_needed = 0;
592 self.bytes_seen = 0;
593 continue;
594 },
595 self,
596 src_consumed,
597 dest,
598 source,
599 b,
600 destination_handle,
601 unread_handle,
602 check_space_astral
603 );
604}
605
606#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
607#[inline(never)]
608pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
609 let mut read = 0;
610 let mut written = 0;
611 'outer: loop {
612 let mut unit = {
613 let src_remaining = &src[read..];
614 let dst_remaining = &mut dst[written..];
615 let length = if dst_remaining.len() < src_remaining.len() {
616 dst_remaining.len()
617 } else {
618 src_remaining.len()
619 };
620 match unsafe {
621 basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
622 } {
623 None => {
624 read += length;
625 written += length;
626 return (read, written);
627 }
628 Some((non_ascii, consumed)) => {
629 read += consumed;
630 written += consumed;
631 non_ascii
632 }
633 }
634 };
635 'inner: loop {
636 // The following loop is only broken out of as a goto forward.
637 loop {
638 // Unfortunately, this check isn't enough for the compiler to elide
639 // the bound checks on writes to dst, which is why they are manually
640 // elided, which makes a measurable difference.
641 if written.checked_add(4).unwrap() > dst.len() {
642 return (read, written);
643 }
644 read += 1;
645 if unit < 0x800 {
646 unsafe {
647 *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8;
648 written += 1;
649 *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
650 written += 1;
651 }
652 break;
653 }
654 let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
655 if likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) {
656 unsafe {
657 *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
658 written += 1;
659 *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
660 written += 1;
661 *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
662 written += 1;
663 }
664 break;
665 }
666 if likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) {
667 // high surrogate
668 // read > src.len() is impossible, but using
669 // >= instead of == allows the compiler to elide a bound check.
670 if read >= src.len() {
671 debug_assert_eq!(read, src.len());
672 // Unpaired surrogate at the end of the buffer.
673 unsafe {
674 *(dst.get_unchecked_mut(written)) = 0xEFu8;
675 written += 1;
676 *(dst.get_unchecked_mut(written)) = 0xBFu8;
677 written += 1;
678 *(dst.get_unchecked_mut(written)) = 0xBDu8;
679 written += 1;
680 }
681 return (read, written);
682 }
683 let second = src[read];
684 let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
685 if likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) {
686 // The next code unit is a low surrogate. Advance position.
687 read += 1;
688 let astral = (u32::from(unit) << 10) + u32::from(second)
689 - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
690 unsafe {
691 *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
692 written += 1;
693 *(dst.get_unchecked_mut(written)) =
694 ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
695 written += 1;
696 *(dst.get_unchecked_mut(written)) =
697 ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
698 written += 1;
699 *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
700 written += 1;
701 }
702 break;
703 }
704 // The next code unit is not a low surrogate. Don't advance
705 // position and treat the high surrogate as unpaired.
706 // Fall through
707 }
708 // Unpaired low surrogate
709 unsafe {
710 *(dst.get_unchecked_mut(written)) = 0xEFu8;
711 written += 1;
712 *(dst.get_unchecked_mut(written)) = 0xBFu8;
713 written += 1;
714 *(dst.get_unchecked_mut(written)) = 0xBDu8;
715 written += 1;
716 }
717 break;
718 }
719 // Now see if the next unit is Basic Latin
720 // read > src.len() is impossible, but using
721 // >= instead of == allows the compiler to elide a bound check.
722 if read >= src.len() {
723 debug_assert_eq!(read, src.len());
724 return (read, written);
725 }
726 unit = src[read];
727 if unlikely(unit < 0x80) {
728 // written > dst.len() is impossible, but using
729 // >= instead of == allows the compiler to elide a bound check.
730 if written >= dst.len() {
731 debug_assert_eq!(written, dst.len());
732 return (read, written);
733 }
734 dst[written] = unit as u8;
735 read += 1;
736 written += 1;
737 // Mysteriously, adding a punctuation check here makes
738 // the expected benificiary cases *slower*!
739 continue 'outer;
740 }
741 continue 'inner;
742 }
743 }
744}
745
/// Cold part of UTF-16 to UTF-8 conversion, for when fewer than four bytes
/// of output space are left.
///
/// Converts as many leading code units of `src` as fit into `dst` and
/// returns `(read, written)`: the number of `u16` units consumed and the
/// number of bytes stored.
///
/// * If `src` starts with a unit below 0x800, a run of one- and two-byte
///   sequences is converted until the input ends, the output is full, or a
///   unit of 0x800 or above is reached.
/// * Otherwise a single three-byte sequence is written if three bytes are
///   available. An unpaired surrogate is written as U+FFFD. A valid
///   surrogate pair needs four bytes and is left unconsumed.
///
/// An empty `src` yields `(0, 0)`.
///
/// In debug builds the three-byte path asserts that it filled `dst`
/// exactly, so `dst` is expected to hold at most three bytes.
#[inline(never)]
pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    // Everything below is cold code!
    let mut read = 0;
    let mut written = 0;
    // Nothing to convert; indexing `src[0]` would panic on empty input.
    let mut unit = match src.first() {
        Some(&first) => first,
        None => return (read, written),
    };
    // We now have up to 3 output slots, so an astral character
    // will not fit.
    if unit < 0x800 {
        loop {
            if unit < 0x80 {
                if written >= dst.len() {
                    return (read, written);
                }
                read += 1;
                dst[written] = unit as u8;
                written += 1;
            } else if unit < 0x800 {
                if written + 2 > dst.len() {
                    return (read, written);
                }
                read += 1;
                dst[written] = (unit >> 6) as u8 | 0xC0u8;
                dst[written + 1] = (unit & 0x3F) as u8 | 0x80u8;
                written += 2;
            } else {
                // Three-byte territory; leave it to the next call.
                return (read, written);
            }
            // read > src.len() is impossible, but using
            // >= instead of == allows the compiler to elide a bound check.
            if read >= src.len() {
                debug_assert_eq!(read, src.len());
                return (read, written);
            }
            unit = src[read];
        }
    }
    // Could be an unpaired surrogate, but we'll need 3 output
    // slots in any case.
    if written + 3 > dst.len() {
        return (read, written);
    }
    read += 1;
    let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
    if unit_minus_surrogate_start <= (0xDFFF - 0xD800) {
        // Got surrogate
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
            // Got high surrogate
            if read < src.len() {
                let second = src[read];
                if second.wrapping_sub(0xDC00) <= (0xDFFF - 0xDC00) {
                    // Valid surrogate pair, but we know it won't fit.
                    read -= 1;
                    return (read, written);
                }
            }
            // Unpaired high surrogate: either nothing follows it or the
            // next unit is not a low surrogate.
        }
        // Any surrogate that reaches this point is unpaired.
        unit = 0xFFFD;
    }
    dst[written] = (unit >> 12) as u8 | 0xE0u8;
    dst[written + 1] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
    dst[written + 2] = (unit & 0x3F) as u8 | 0x80u8;
    written += 3;
    debug_assert_eq!(written, dst.len());
    (read, written)
}
822
823pub struct Utf8Encoder;
824
825impl Utf8Encoder {
826 pub fn new(encoding: &'static Encoding) -> Encoder {
827 Encoder::new(encoding, VariantEncoder::Utf8(Utf8Encoder))
828 }
829
830 pub fn max_buffer_length_from_utf16_without_replacement(
831 &self,
832 u16_length: usize,
833 ) -> Option<usize> {
834 u16_length.checked_mul(3)
835 }
836
837 pub fn max_buffer_length_from_utf8_without_replacement(
838 &self,
839 byte_length: usize,
840 ) -> Option<usize> {
841 Some(byte_length)
842 }
843
844 pub fn encode_from_utf16_raw(
845 &mut self,
846 src: &[u16],
847 dst: &mut [u8],
848 _last: bool,
849 ) -> (EncoderResult, usize, usize) {
850 let (read, written) = convert_utf16_to_utf8_partial(src, dst);
851 (
852 if read == src.len() {
853 EncoderResult::InputEmpty
854 } else {
855 EncoderResult::OutputFull
856 },
857 read,
858 written,
859 )
860 }
861
862 pub fn encode_from_utf8_raw(
863 &mut self,
864 src: &str,
865 dst: &mut [u8],
866 _last: bool,
867 ) -> (EncoderResult, usize, usize) {
868 let bytes = src.as_bytes();
869 let mut to_write = bytes.len();
870 if to_write <= dst.len() {
871 (&mut dst[..to_write]).copy_from_slice(bytes);
872 return (EncoderResult::InputEmpty, to_write, to_write);
873 }
874 to_write = dst.len();
875 // Move back until we find a UTF-8 sequence boundary.
876 while (bytes[to_write] & 0xC0) == 0x80 {
877 to_write -= 1;
878 }
879 (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]);
880 (EncoderResult::OutputFull, to_write, to_write)
881 }
882}
883
884// Any copyright to the test code below this comment is dedicated to the
885// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
886
887#[cfg(all(test, feature = "alloc"))]
888mod tests {
889 use super::super::testing::*;
890 use super::super::*;
891
892 // fn decode_utf8_to_utf16(bytes: &[u8], expect: &[u16]) {
893 // decode_to_utf16_without_replacement(UTF_8, bytes, expect);
894 // }
895
896 fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) {
897 decode_to_utf8(UTF_8, bytes, expect);
898 }
899
900 fn decode_valid_utf8(string: &str) {
901 decode_utf8_to_utf8(string.as_bytes(), string);
902 }
903
904 fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) {
905 encode_from_utf16(UTF_8, string, expect);
906 }
907
908 fn encode_utf8_from_utf8(string: &str, expect: &[u8]) {
909 encode_from_utf8(UTF_8, string, expect);
910 }
911
912 fn encode_utf8_from_utf16_with_output_limit(
913 string: &[u16],
914 expect: &str,
915 limit: usize,
916 expect_result: EncoderResult,
917 ) {
918 let mut dst = Vec::new();
919 {
920 dst.resize(limit, 0u8);
921 let mut encoder = UTF_8.new_encoder();
922 let (result, read, written) =
923 encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
924 assert_eq!(result, expect_result);
925 if expect_result == EncoderResult::InputEmpty {
926 assert_eq!(read, string.len());
927 }
928 assert_eq!(&dst[..written], expect.as_bytes());
929 }
930 {
931 dst.resize(64, 0u8);
932 for (i, elem) in dst.iter_mut().enumerate() {
933 *elem = i as u8;
934 }
935 let mut encoder = UTF_8.new_encoder();
936 let (_, _, mut j) =
937 encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
938 while j < dst.len() {
939 assert_eq!(usize::from(dst[j]), j);
940 j += 1;
941 }
942 }
943 }
944
945 #[test]
946 fn test_utf8_decode() {
947 // Empty
948 decode_valid_utf8("");
949 // ASCII
950 decode_valid_utf8("ab");
951 // Low BMP
952 decode_valid_utf8("a\u{E4}Z");
953 // High BMP
954 decode_valid_utf8("a\u{2603}Z");
955 // Astral
956 decode_valid_utf8("a\u{1F4A9}Z");
957 // Low BMP with last byte missing
958 decode_utf8_to_utf8(b"a\xC3Z", "a\u{FFFD}Z");
959 decode_utf8_to_utf8(b"a\xC3", "a\u{FFFD}");
960 // High BMP with last byte missing
961 decode_utf8_to_utf8(b"a\xE2\x98Z", "a\u{FFFD}Z");
962 decode_utf8_to_utf8(b"a\xE2\x98", "a\u{FFFD}");
963 // Astral with last byte missing
964 decode_utf8_to_utf8(b"a\xF0\x9F\x92Z", "a\u{FFFD}Z");
965 decode_utf8_to_utf8(b"a\xF0\x9F\x92", "a\u{FFFD}");
966 // Lone highest continuation
967 decode_utf8_to_utf8(b"a\xBFZ", "a\u{FFFD}Z");
968 decode_utf8_to_utf8(b"a\xBF", "a\u{FFFD}");
969 // Two lone highest continuations
970 decode_utf8_to_utf8(b"a\xBF\xBFZ", "a\u{FFFD}\u{FFFD}Z");
971 decode_utf8_to_utf8(b"a\xBF\xBF", "a\u{FFFD}\u{FFFD}");
972 // Low BMP followed by lowest lone continuation
973 decode_utf8_to_utf8(b"a\xC3\xA4\x80Z", "a\u{E4}\u{FFFD}Z");
974 decode_utf8_to_utf8(b"a\xC3\xA4\x80", "a\u{E4}\u{FFFD}");
975 // Low BMP followed by highest lone continuation
976 decode_utf8_to_utf8(b"a\xC3\xA4\xBFZ", "a\u{E4}\u{FFFD}Z");
977 decode_utf8_to_utf8(b"a\xC3\xA4\xBF", "a\u{E4}\u{FFFD}");
978 // High BMP followed by lowest lone continuation
979 decode_utf8_to_utf8(b"a\xE2\x98\x83\x80Z", "a\u{2603}\u{FFFD}Z");
980 decode_utf8_to_utf8(b"a\xE2\x98\x83\x80", "a\u{2603}\u{FFFD}");
981 // High BMP followed by highest lone continuation
982 decode_utf8_to_utf8(b"a\xE2\x98\x83\xBFZ", "a\u{2603}\u{FFFD}Z");
983 decode_utf8_to_utf8(b"a\xE2\x98\x83\xBF", "a\u{2603}\u{FFFD}");
984 // Astral followed by lowest lone continuation
985 decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80Z", "a\u{1F4A9}\u{FFFD}Z");
986 decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80", "a\u{1F4A9}\u{FFFD}");
987 // Astral followed by highest lone continuation
988 decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBFZ", "a\u{1F4A9}\u{FFFD}Z");
989 decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBF", "a\u{1F4A9}\u{FFFD}");
990
991 // Boundary conditions
992 // Lowest single-byte
993 decode_valid_utf8("Z\x00");
994 decode_valid_utf8("Z\x00Z");
995 // Lowest single-byte as two-byte overlong sequence
996 decode_utf8_to_utf8(b"a\xC0\x80", "a\u{FFFD}\u{FFFD}");
997 decode_utf8_to_utf8(b"a\xC0\x80Z", "a\u{FFFD}\u{FFFD}Z");
998 // Lowest single-byte as three-byte overlong sequence
999 decode_utf8_to_utf8(b"a\xE0\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1000 decode_utf8_to_utf8(b"a\xE0\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1001 // Lowest single-byte as four-byte overlong sequence
1002 decode_utf8_to_utf8(b"a\xF0\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1003 decode_utf8_to_utf8(b"a\xF0\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1004 // One below lowest single-byte
1005 decode_utf8_to_utf8(b"a\xFF", "a\u{FFFD}");
1006 decode_utf8_to_utf8(b"a\xFFZ", "a\u{FFFD}Z");
1007 // Highest single-byte
1008 decode_valid_utf8("a\x7F");
1009 decode_valid_utf8("a\x7FZ");
1010 // Highest single-byte as two-byte overlong sequence
1011 decode_utf8_to_utf8(b"a\xC1\xBF", "a\u{FFFD}\u{FFFD}");
1012 decode_utf8_to_utf8(b"a\xC1\xBFZ", "a\u{FFFD}\u{FFFD}Z");
1013 // Highest single-byte as three-byte overlong sequence
1014 decode_utf8_to_utf8(b"a\xE0\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1015 decode_utf8_to_utf8(b"a\xE0\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1016 // Highest single-byte as four-byte overlong sequence
1017 decode_utf8_to_utf8(b"a\xF0\x80\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1018 decode_utf8_to_utf8(b"a\xF0\x80\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1019 // One past highest single byte (also lone continuation)
1020 decode_utf8_to_utf8(b"a\x80Z", "a\u{FFFD}Z");
1021 decode_utf8_to_utf8(b"a\x80", "a\u{FFFD}");
1022 // Two lone continuations
1023 decode_utf8_to_utf8(b"a\x80\x80Z", "a\u{FFFD}\u{FFFD}Z");
1024 decode_utf8_to_utf8(b"a\x80\x80", "a\u{FFFD}\u{FFFD}");
1025 // Three lone continuations
1026 decode_utf8_to_utf8(b"a\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1027 decode_utf8_to_utf8(b"a\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1028 // Four lone continuations
1029 decode_utf8_to_utf8(b"a\x80\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1030 decode_utf8_to_utf8(b"a\x80\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1031 // Lowest two-byte
1032 decode_utf8_to_utf8(b"a\xC2\x80", "a\u{0080}");
1033 decode_utf8_to_utf8(b"a\xC2\x80Z", "a\u{0080}Z");
1034 // Lowest two-byte as three-byte overlong sequence
1035 decode_utf8_to_utf8(b"a\xE0\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1036 decode_utf8_to_utf8(b"a\xE0\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1037 // Lowest two-byte as four-byte overlong sequence
1038 decode_utf8_to_utf8(b"a\xF0\x80\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1039 decode_utf8_to_utf8(b"a\xF0\x80\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1040 // Lead one below lowest two-byte
1041 decode_utf8_to_utf8(b"a\xC1\x80", "a\u{FFFD}\u{FFFD}");
1042 decode_utf8_to_utf8(b"a\xC1\x80Z", "a\u{FFFD}\u{FFFD}Z");
1043 // Trail one below lowest two-byte
1044 decode_utf8_to_utf8(b"a\xC2\x7F", "a\u{FFFD}\u{007F}");
1045 decode_utf8_to_utf8(b"a\xC2\x7FZ", "a\u{FFFD}\u{007F}Z");
1046 // Highest two-byte
1047 decode_utf8_to_utf8(b"a\xDF\xBF", "a\u{07FF}");
1048 decode_utf8_to_utf8(b"a\xDF\xBFZ", "a\u{07FF}Z");
1049 // Highest two-byte as three-byte overlong sequence
1050 decode_utf8_to_utf8(b"a\xE0\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1051 decode_utf8_to_utf8(b"a\xE0\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1052 // Highest two-byte as four-byte overlong sequence
1053 decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1054 decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1055 // Lowest three-byte
1056 decode_utf8_to_utf8(b"a\xE0\xA0\x80", "a\u{0800}");
1057 decode_utf8_to_utf8(b"a\xE0\xA0\x80Z", "a\u{0800}Z");
1058 // Lowest three-byte as four-byte overlong sequence
1059 decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1060 decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1061 // Highest below surrogates
1062 decode_utf8_to_utf8(b"a\xED\x9F\xBF", "a\u{D7FF}");
1063 decode_utf8_to_utf8(b"a\xED\x9F\xBFZ", "a\u{D7FF}Z");
1064 // Highest below surrogates as four-byte overlong sequence
1065 decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1066 decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1067 // First surrogate
1068 decode_utf8_to_utf8(b"a\xED\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1069 decode_utf8_to_utf8(b"a\xED\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1070 // First surrogate as four-byte overlong sequence
1071 decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1072 decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1073 // Last surrogate
1074 decode_utf8_to_utf8(b"a\xED\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1075 decode_utf8_to_utf8(b"a\xED\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1076 // Last surrogate as four-byte overlong sequence
1077 decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1078 decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1079 // Lowest above surrogates
1080 decode_utf8_to_utf8(b"a\xEE\x80\x80", "a\u{E000}");
1081 decode_utf8_to_utf8(b"a\xEE\x80\x80Z", "a\u{E000}Z");
1082 // Lowest above surrogates as four-byte overlong sequence
1083 decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1084 decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1085 // Highest three-byte
1086 decode_utf8_to_utf8(b"a\xEF\xBF\xBF", "a\u{FFFF}");
1087 decode_utf8_to_utf8(b"a\xEF\xBF\xBFZ", "a\u{FFFF}Z");
1088 // Highest three-byte as four-byte overlong sequence
1089 decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1090 decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1091 // Lowest four-byte
1092 decode_utf8_to_utf8(b"a\xF0\x90\x80\x80", "a\u{10000}");
1093 decode_utf8_to_utf8(b"a\xF0\x90\x80\x80Z", "a\u{10000}Z");
1094 // Highest four-byte
1095 decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBF", "a\u{10FFFF}");
1096 decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBFZ", "a\u{10FFFF}Z");
1097 // One past highest four-byte
1098 decode_utf8_to_utf8(b"a\xF4\x90\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1099 decode_utf8_to_utf8(b"a\xF4\x90\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1100
1101 // Highest four-byte with last byte replaced with 0xFF
1102 decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFF", "a\u{FFFD}\u{FFFD}");
1103 decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFFZ", "a\u{FFFD}\u{FFFD}Z");
1104 }
1105
#[test]
fn test_utf8_encode() {
    // Empty input through both the UTF-16 and the UTF-8 entry points.
    encode_utf8_from_utf16(&[], b"");
    encode_utf8_from_utf8("", b"");

    // Each row pairs UTF-16 code units with the text whose UTF-8 bytes the
    // helper is expected to see. Rows containing unpaired surrogates expect
    // U+FFFD in their place.
    let utf16_cases: &[(&[u16], &str)] = &[
        (&[0x0000], "\u{0000}"),
        (&[0x007F], "\u{007F}"),
        (&[0x0080], "\u{0080}"),
        (&[0x07FF], "\u{07FF}"),
        (&[0x0800], "\u{0800}"),
        (&[0xD7FF], "\u{D7FF}"),
        (&[0xD800], "\u{FFFD}"),
        (&[0xD800, 0x0062], "\u{FFFD}\u{0062}"),
        (&[0xDFFF], "\u{FFFD}"),
        (&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}"),
        (&[0xE000], "\u{E000}"),
        (&[0xFFFF], "\u{FFFF}"),
        (&[0xD800, 0xDC00], "\u{10000}"),
        (&[0xDBFF, 0xDFFF], "\u{10FFFF}"),
        (&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}"),
    ];

    for &(units, expected) in utf16_cases.iter() {
        encode_utf8_from_utf16(units, expected.as_bytes());
    }
}
1128
#[test]
fn test_encode_utf8_from_utf16_with_output_limit() {
    // Readable markers for the last column of the table below:
    // FITS maps to `EncoderResult::InputEmpty`, FULL to `EncoderResult::OutputFull`.
    const FITS: bool = true;
    const FULL: bool = false;

    // Each row: (UTF-16 input, expected output text, output limit handed to
    // the helper, expected outcome). Rows run in the order listed.
    let cases: &[(&[u16], &str, usize, bool)] = &[
        // One scalar value per UTF-8 length, limit equal to the encoded length.
        (&[0x0062], "\u{62}", 1, FITS),
        (&[0x00A7], "\u{A7}", 2, FITS),
        (&[0x2603], "\u{2603}", 3, FITS),
        (&[0xD83D, 0xDCA9], "\u{1F4A9}", 4, FITS),
        // Same multi-byte scalars with the limit one short: nothing is expected.
        (&[0x00A7], "", 1, FULL),
        (&[0x2603], "", 2, FULL),
        (&[0xD83D, 0xDCA9], "", 3, FULL),
        // ASCII lead followed by each length.
        (&[0x0063, 0x0062], "\u{63}\u{62}", 2, FITS),
        (&[0x0063, 0x00A7], "\u{63}\u{A7}", 3, FITS),
        (&[0x0063, 0x2603], "\u{63}\u{2603}", 4, FITS),
        (&[0x0063, 0xD83D, 0xDCA9], "\u{63}\u{1F4A9}", 5, FITS),
        (&[0x0063, 0x00A7], "\u{63}", 2, FULL),
        (&[0x0063, 0x2603], "\u{63}", 3, FULL),
        (&[0x0063, 0xD83D, 0xDCA9], "\u{63}", 4, FULL),
        // Two-byte lead followed by each length.
        (&[0x00B6, 0x0062], "\u{B6}\u{62}", 3, FITS),
        (&[0x00B6, 0x00A7], "\u{B6}\u{A7}", 4, FITS),
        (&[0x00B6, 0x2603], "\u{B6}\u{2603}", 5, FITS),
        (&[0x00B6, 0xD83D, 0xDCA9], "\u{B6}\u{1F4A9}", 6, FITS),
        (&[0x00B6, 0x00A7], "\u{B6}", 3, FULL),
        (&[0x00B6, 0x2603], "\u{B6}", 4, FULL),
        (&[0x00B6, 0xD83D, 0xDCA9], "\u{B6}", 5, FULL),
        // Three-byte lead followed by each length.
        (&[0x263A, 0x0062], "\u{263A}\u{62}", 4, FITS),
        (&[0x263A, 0x00A7], "\u{263A}\u{A7}", 5, FITS),
        (&[0x263A, 0x2603], "\u{263A}\u{2603}", 6, FITS),
        (&[0x263A, 0xD83D, 0xDCA9], "\u{263A}\u{1F4A9}", 7, FITS),
        (&[0x263A, 0x00A7], "\u{263A}", 4, FULL),
        (&[0x263A, 0x2603], "\u{263A}", 5, FULL),
        (&[0x263A, 0xD83D, 0xDCA9], "\u{263A}", 6, FULL),
        // Surrogate-pair (four-byte) lead followed by each length.
        (&[0xD83D, 0xDE0E, 0x0062], "\u{1F60E}\u{62}", 5, FITS),
        (&[0xD83D, 0xDE0E, 0x00A7], "\u{1F60E}\u{A7}", 6, FITS),
        (&[0xD83D, 0xDE0E, 0x2603], "\u{1F60E}\u{2603}", 7, FITS),
        (&[0xD83D, 0xDE0E, 0xD83D, 0xDCA9], "\u{1F60E}\u{1F4A9}", 8, FITS),
        (&[0xD83D, 0xDE0E, 0x00A7], "\u{1F60E}", 5, FULL),
        (&[0xD83D, 0xDE0E, 0x2603], "\u{1F60E}", 6, FULL),
        (&[0xD83D, 0xDE0E, 0xD83D, 0xDCA9], "\u{1F60E}", 7, FULL),
        // From here on rows come in pairs: the same input first at a limit
        // where it fits, then at a limit one smaller where it does not.
        // Non-ASCII prefix followed by a run of ASCII.
        (&[0x0063, 0x00B6, 0x0062, 0x0062], "\u{63}\u{B6}\u{62}\u{62}", 5, FITS),
        (&[0x0063, 0x00B6, 0x0062, 0x0062], "\u{63}\u{B6}\u{62}", 4, FULL),
        (
            &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
            "\u{63}\u{B6}\u{62}\u{62}\u{62}",
            6,
            FITS,
        ),
        (
            &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
            "\u{63}\u{B6}\u{62}\u{62}",
            5,
            FULL,
        ),
        (&[0x263A, 0x0062, 0x0062], "\u{263A}\u{62}\u{62}", 5, FITS),
        (&[0x263A, 0x0062, 0x0062], "\u{263A}\u{62}", 4, FULL),
        (&[0x263A, 0x0062, 0x0062, 0x0062], "\u{263A}\u{62}\u{62}\u{62}", 6, FITS),
        (&[0x263A, 0x0062, 0x0062, 0x0062], "\u{263A}\u{62}\u{62}", 5, FULL),
        // Two-byte scalar near the end of the input.
        (&[0x0063, 0x00B6, 0x00A7], "\u{63}\u{B6}\u{A7}", 5, FITS),
        (&[0x0063, 0x00B6, 0x00A7], "\u{63}\u{B6}", 4, FULL),
        (&[0x0063, 0x00B6, 0x00A7, 0x0062], "\u{63}\u{B6}\u{A7}\u{62}", 6, FITS),
        (&[0x0063, 0x00B6, 0x00A7, 0x0062], "\u{63}\u{B6}\u{A7}", 5, FULL),
        (&[0x263A, 0x00A7, 0x0062], "\u{263A}\u{A7}\u{62}", 6, FITS),
        (&[0x263A, 0x00A7, 0x0062], "\u{263A}\u{A7}", 5, FULL),
        (&[0x0063, 0x00B6, 0x0062, 0x00A7], "\u{63}\u{B6}\u{62}\u{A7}", 6, FITS),
        (&[0x0063, 0x00B6, 0x0062, 0x00A7], "\u{63}\u{B6}\u{62}", 5, FULL),
        (&[0x263A, 0x0062, 0x00A7], "\u{263A}\u{62}\u{A7}", 6, FITS),
        (&[0x263A, 0x0062, 0x00A7], "\u{263A}\u{62}", 5, FULL),
        // Three-byte scalar at the end of the input.
        (&[0x0063, 0x00B6, 0x2603], "\u{63}\u{B6}\u{2603}", 6, FITS),
        (&[0x0063, 0x00B6, 0x2603], "\u{63}\u{B6}", 5, FULL),
        (&[0x263A, 0x2603], "\u{263A}\u{2603}", 6, FITS),
        (&[0x263A, 0x2603], "\u{263A}", 5, FULL),
        // Unpaired high surrogate at the end: expected as U+FFFD when it fits.
        (&[0x0063, 0x00B6, 0xD83D], "\u{63}\u{B6}\u{FFFD}", 6, FITS),
        (&[0x0063, 0x00B6, 0xD83D], "\u{63}\u{B6}", 5, FULL),
        (&[0x263A, 0xD83D], "\u{263A}\u{FFFD}", 6, FITS),
        (&[0x263A, 0xD83D], "\u{263A}", 5, FULL),
        // Unpaired low surrogate at the end: expected as U+FFFD when it fits.
        (&[0x0063, 0x00B6, 0xDCA9], "\u{63}\u{B6}\u{FFFD}", 6, FITS),
        (&[0x0063, 0x00B6, 0xDCA9], "\u{63}\u{B6}", 5, FULL),
        (&[0x263A, 0xDCA9], "\u{263A}\u{FFFD}", 6, FITS),
        (&[0x263A, 0xDCA9], "\u{263A}", 5, FULL),
    ];

    for &(units, expected, limit, fits) in cases.iter() {
        let outcome = if fits {
            EncoderResult::InputEmpty
        } else {
            EncoderResult::OutputFull
        };
        encode_utf8_from_utf16_with_output_limit(units, expected, limit, outcome);
    }
}
1526
#[test]
fn test_utf8_max_length_from_utf16() {
    // Encoding into a buffer of exactly the length the encoder itself reports
    // as the maximum must consume the whole input.
    let units: [u16; 4] = [0x2C9F, 0x2CA9, 0x2CA3, 0x2C9F];
    let mut buffer = [0u8; 13];
    let mut encoder = UTF_8.new_encoder();

    let capacity = encoder
        .max_buffer_length_from_utf16_without_replacement(units.len())
        .unwrap();
    let (status, _read, _written) =
        encoder.encode_from_utf16_without_replacement(&units, &mut buffer[..capacity], true);

    assert_eq!(status, EncoderResult::InputEmpty);
}
1539
#[test]
fn test_decode_bom_prefixed_split_byte_triple() {
    // Feeds the three-byte sequence EF BF BE one byte per call. Nothing is
    // written until the final byte arrives, at which point a single 0xFFFE
    // code unit is expected and no errors are reported.
    //
    // Each step: (input chunk, `last` flag, expected code units written).
    let steps = [
        (b"\xEF", false, 0usize),
        (b"\xBF", false, 0),
        (b"\xBE", true, 1),
    ];

    let mut decoder = UTF_8.new_decoder();
    let mut output = [0u16; 20];

    for &(chunk, last, expected_written) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, read, written, had_errors) =
            decoder.decode_to_utf16(chunk, &mut output[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 1);
        assert_eq!(written, expected_written);
        assert!(!had_errors);
    }

    assert_eq!(output[0], 0xFFFE);
}
1573
#[test]
fn test_decode_bom_prefixed_split_byte_pair() {
    // Feeds EF and then BC (with `last` set) in separate calls. The stream
    // ends mid-sequence, so one U+FFFD with the error flag set is expected.
    let mut output = [0u16; 20];
    let mut decoder = UTF_8.new_decoder();

    // One decode step: size the destination for a single input byte, then decode.
    let mut feed = |chunk: &[u8], last: bool| {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        decoder.decode_to_utf16(chunk, &mut output[..needed], last)
    };

    // First byte alone: consumed, nothing emitted yet, no error.
    let (result, read, written, had_errors) = feed(b"\xEF", false);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 1);
    assert_eq!(written, 0);
    assert!(!had_errors);

    // Second byte with end of stream: one replacement character and an error.
    let (result, read, written, had_errors) = feed(b"\xBC", true);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
    assert!(had_errors);

    assert_eq!(output[0], 0xFFFD);
}
1598
#[test]
fn test_decode_bom_prefix() {
    // A lone EF byte delivered as the entire (final) input is expected to
    // come out as a single U+FFFD with the error flag set.
    let mut decoder = UTF_8.new_decoder();
    let mut output = [0u16; 20];

    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let (status, consumed, produced, malformed) =
        decoder.decode_to_utf16(b"\xEF", &mut output[..capacity], true);

    assert_eq!(status, CoderResult::InputEmpty);
    assert_eq!(consumed, 1);
    assert_eq!(produced, 1);
    assert!(malformed);
    assert_eq!(output[0], 0xFFFD);
}
1614
#[test]
fn test_tail() {
    // The destination holds a single code unit while the input carries two
    // characters. The decoder is expected to emit U+00E4, report two bytes
    // read, and stop with `OutputFull` before the trailing `a`.
    let mut decoder = UTF_8.new_decoder_without_bom_handling();
    let mut output = [0u16; 1];

    let (status, consumed, produced, malformed) =
        decoder.decode_to_utf16("\u{E4}a".as_bytes(), &mut output, false);

    assert_eq!(status, CoderResult::OutputFull);
    assert_eq!(consumed, 2);
    assert_eq!(produced, 1);
    assert!(!malformed);
    assert_eq!(output[0], 0x00E4);
}
1629}
1630