1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! Functions for converting between different in-RAM representations of text
11//! and for quickly checking if the Unicode Bidirectional Algorithm can be
12//! avoided.
13//!
14//! By using slices for output, the functions here seek to enable by-register
15//! (ALU register or SIMD register as available) operations in order to
16//! outperform iterator-based conversions available in the Rust standard
17//! library.
18//!
19//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21//! in-memory encoding is sometimes used as a storage optimization of text
22//! when UTF-16 indexing and length semantics are exposed.
23//!
//! The FFI bindings for this module are in the
25//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26
27#[cfg(feature = "alloc")]
28use alloc::borrow::Cow;
29#[cfg(feature = "alloc")]
30use alloc::string::String;
31#[cfg(feature = "alloc")]
32use alloc::vec::Vec;
33
34use super::in_inclusive_range16;
35use super::in_inclusive_range32;
36use super::in_inclusive_range8;
37use super::in_range16;
38use super::in_range32;
39use super::DecoderResult;
40use crate::ascii::*;
41use crate::utf_8::*;
42
/// Behaves like `debug_assert!`, except that the assertion is skipped when
/// the crate is compiled with `--cfg fuzzing`.
///
/// All tokens passed to the macro are forwarded verbatim to `debug_assert!`.
macro_rules! non_fuzz_debug_assert {
    ($($tokens:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($tokens)*);
        }
    };
}
46
// Branch-prediction hints.
//
// With the `simd-accel` feature enabled, the `likely` / `unlikely`
// intrinsics from `core::intrinsics` are imported. Otherwise, local
// functions with the same names are defined that simply return their
// argument unchanged, so call sites compile identically in both
// configurations.
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::core::intrinsics::likely;
        use ::core::intrinsics::unlikely;
    } else {
        /// Fallback for the `likely` intrinsic: returns `b` unchanged.
        #[inline(always)]
        fn likely(b: bool) -> bool {
            b
        }
        /// Fallback for the `unlikely` intrinsic: returns `b` unchanged.
        #[inline(always)]
        fn unlikely(b: bool) -> bool {
            b
        }
    }
}
62
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The type is `#[repr(C)]` with explicit discriminants (0, 1 and 2) and is
/// marked `#[must_use]`, so ignoring a returned classification produces a
/// compiler warning.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
78
// Mask that selects the high byte of every 16-bit lane of a `usize`. When a
// word of UTF-16 code units ANDed with this mask is non-zero, at least one
// of the code units is above U+00FF.
//
// `as` truncates, so works on 32-bit, too.
//
// Only referenced from the non-SIMD code paths in this file, hence the
// `dead_code` allowance.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
82
// Generates `fn $name(buffer: &[$unit]) -> bool`, a check that processes the
// buffer one `usize` word at a time.
//
// Parameters:
// * `$name`  - name of the generated function.
// * `$unit`  - code unit type of the buffer (`u8` or `u16` at the call sites
//              in this file).
// * `$bound` - exclusive upper bound for a passing unit; used for the
//              unit-at-a-time checks.
// * `$mask`  - name of a `usize` constant whose set bits are the bits that
//              must be zero in every unit; used for the word-at-a-time checks.
//
// The generated function returns `true` when no unit sets a bit in `$mask`.
// It is not guaranteed to return at the first failing unit: outside the
// unrolled loop, units are ORed into an accumulator that is tested later.
//
// NOTE(review): `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK` are not defined in
// this part of the file; the unsafe reads below assume `ALU_ALIGNMENT` equals
// `size_of::<usize>()` and `ALU_ALIGNMENT_MASK == ALU_ALIGNMENT - 1` -
// confirm against their definitions.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index (in units) of the next unit to examine.
            let mut offset = 0usize;
            // Bitwise OR of everything examined so far outside the unrolled loop.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only attempt word-sized reads if at least one whole word fits.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to step over before `src.add(offset)` is
                // word-aligned (zero if it already is).
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a whole word remains after the alignment prefix.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Unaligned prefix: accumulate unit by unit, then test once.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one whole word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Largest offset at which four whole words can still be read.
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            // SAFETY: `offset <= len_minus_unroll` holds here (checked
                            // before the loop and before each further iteration), so
                            // the four consecutive word reads stay within `buffer`.
                            // The pointer is word-aligned because of the prefix
                            // handling above (see the NOTE on the macro).
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            // Fail as soon as a group of four words has a masked bit set.
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole words (fewer than four), one at a time.
                    while offset <= len_minus_stride {
                        // SAFETY: `offset <= len_minus_stride` guarantees that one
                        // whole word starting at `offset` lies within `buffer`.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Trailing units that do not fill a word (or the whole buffer when
            // the word-wise path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
155
// Generates `fn $name(buffer: &[$unit]) -> bool`, a check that processes the
// buffer one SIMD vector at a time.
//
// Parameters:
// * `$name`    - name of the generated function.
// * `$unit`    - code unit type of the buffer.
// * `$splat`   - expression producing the initial (all-zero at the call sites
//                in this file) SIMD accumulator.
// * `$simd_ty` - SIMD vector type used for the loads.
// * `$bound`   - exclusive upper bound for a passing unit; used for the
//                unit-at-a-time checks.
// * `$func`    - predicate applied to an ORed SIMD accumulator; the generated
//                function returns `false` when the predicate returns `false`.
//
// The generated function is not guaranteed to return at the first failing
// unit: outside the unrolled loop, values are ORed into accumulators that
// are tested later.
//
// NOTE(review): `SIMD_STRIDE_SIZE` is not defined in this part of the file;
// the unsafe loads below assume it equals the size of `$simd_ty` and that
// `SIMD_ALIGNMENT` is a sufficient alignment for `$simd_ty` - confirm.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index (in units) of the next unit to examine.
            let mut offset = 0usize;
            // Bitwise OR of the units examined one at a time.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // Only attempt vector loads if at least one whole stride fits.
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to step over before `src.add(offset)` is
                // aligned to `SIMD_ALIGNMENT` (zero if it already is).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a whole stride remains after the alignment prefix.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Unaligned prefix: accumulate unit by unit, then test once.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one whole stride can still be loaded.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Largest offset at which four whole strides can still be loaded.
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            // SAFETY: `offset <= len_minus_unroll` holds here (checked
                            // before the loop and before each further iteration), so
                            // the four consecutive stride-sized loads stay within
                            // `buffer`. Alignment follows from the prefix handling
                            // above (see the NOTE on the macro).
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            // Fail as soon as a group of four vectors fails the predicate.
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides (fewer than four) are ORed together
                    // and tested once.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // SAFETY: `offset <= len_minus_stride` guarantees that one
                        // whole stride starting at `offset` lies within `buffer`.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Trailing units that do not fill a stride (or the whole buffer when
            // the vector path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
233
// Selects the implementations of `is_ascii_impl`, `is_basic_latin_impl`,
// `is_utf16_latin1_impl` and `utf16_valid_up_to_impl`.
//
// With `simd-accel` on a supported target (SSE2, little-endian aarch64 or
// little-endian NEON) the SIMD variants are instantiated; otherwise the
// word-at-a-time ALU variants are used.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Alignment, in bytes, required for the aligned SIMD loads in this file.
        const SIMD_ALIGNMENT: usize = 16;

        // `address & SIMD_ALIGNMENT_MASK` is the misalignment of `address`.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        // Returns the index of the first unpaired surrogate in `buffer`, or
        // `buffer.len()` if there is none.
        //
        // Strides that `contains_surrogates` reports as surrogate-free are
        // skipped with a single aligned SIMD load; everything else is
        // delegated to `utf16_valid_up_to_alu`.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            'outer: loop {
                // Code units between `offset` and the next aligned address.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: bail out to the scalar tail if a whole
                    // stride no longer fits.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    // One extra code unit is examined so that a pair straddling
                    // the alignment boundary can be recognized; make sure a whole
                    // stride fits even after that extra unit.
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        // Unpaired surrogate found in the unaligned prefix.
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit was the low half of a valid pair, so it
                        // is consumed; this leaves `offset` unaligned again, so
                        // recompute the alignment.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                // Largest offset at which a whole stride can still be loaded.
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    // SAFETY: both paths into this loop have established that a
                    // whole stride starting at `offset` fits in `buffer`, and
                    // `offset` is at an aligned address.
                    // NOTE(review): assumes `SIMD_STRIDE_SIZE` is the byte size of
                    // `u16x8` - confirm against its definition.
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // No unit follows this stride; let the scalar tail
                            // below validate it.
                            break 'outer;
                        }
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        // Validate the stride plus one extra unit, so that a pair
                        // straddling the stride boundary is recognized.
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // The extra unit was consumed as part of a pair, which
                            // makes `offset` unaligned: restart the outer loop.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar validation of whatever remains.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        // Returns the index of the first unpaired surrogate in `buffer`, or
        // `buffer.len()` if there is none. Delegates to the scalar routine.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
319
/// Scans `buffer` one code unit at a time and reports how much of it is
/// valid UTF-16.
///
/// The first return value is the index of the first unpaired surrogate, or
/// `buffer.len()` if the whole slice is valid.
///
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let total = buffer.len();
    let mut pos = 0usize;
    while pos < total {
        match buffer[pos] {
            0xD800..=0xDBFF => {
                // High surrogate: valid only when immediately followed by a
                // low surrogate.
                let followed_by_low = match buffer.get(pos + 1) {
                    Some(&trail) => trail >= 0xDC00 && trail <= 0xDFFF,
                    None => false,
                };
                if !followed_by_low {
                    return (pos, false);
                }
                pos += 2;
                if pos == total {
                    // The slice ends exactly on the low half of a valid pair.
                    return (pos, true);
                }
            }
            0xDC00..=0xDFFF => {
                // Low surrogate without a preceding high surrogate.
                return (pos, false);
            }
            _ => {
                // Not a surrogate.
                pos += 1;
            }
        }
    }
    (pos, false)
}
365
// `is_str_latin1_impl` returns `None` when `buffer` contains only code
// points up to U+00FF, and `Some(index)` otherwise. Because the input is a
// `&str` (valid UTF-8), any byte greater than 0xC3 means a code point above
// U+00FF is present.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD variant. When a whole stride fails the SIMD check, the
        // returned index is the start of that stride advanced past any
        // leading continuation bytes, which is not necessarily the index of
        // the offending byte itself.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                // Bytes to step over before `src.add(offset)` is aligned.
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK;
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    // Unaligned prefix, checked byte by byte.
                    while until_alignment != 0 {
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a whole stride can still be loaded.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        // SAFETY: `offset <= len_minus_stride` holds here, so a
                        // whole stride starting at `offset` lies within `bytes`;
                        // the address is aligned due to the prefix loop above.
                        // NOTE(review): assumes `SIMD_STRIDE_SIZE` is the byte
                        // size of `u8x16` - confirm against its definition.
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            // Move forward to a character boundary before reporting.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input when the SIMD path was not taken).
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        // Scalar variant built on `validate_ascii`.
        // NOTE(review): `validate_ascii` is defined elsewhere; from its use
        // here it presumably returns the first non-ASCII byte together with
        // its index, or `None` if the slice is all ASCII - confirm.
        #[inline(always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut bytes = buffer.as_bytes();
            // Number of bytes of `buffer` already consumed.
            let mut total = 0;
            loop {
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    // In valid UTF-8, a non-ASCII lead byte that is at most 0xC3
                    // starts a two-byte sequence, so skipping two bytes is in
                    // bounds and lands on a character boundary.
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}
428
429#[inline(always)]
430fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
431 let mut bytes: &[u8] = buffer;
432 let mut total: usize = 0;
433 loop {
434 if let Some((byte: u8, offset: usize)) = validate_ascii(slice:bytes) {
435 total += offset;
436 if in_inclusive_range8(i:byte, start:0xC2, end:0xC3) {
437 let next: usize = offset + 1;
438 if next == bytes.len() {
439 return Some(total);
440 }
441 if bytes[next] & 0xC0 != 0x80 {
442 return Some(total);
443 }
444 bytes = &bytes[offset + 2..];
445 total += 2;
446 } else {
447 return Some(total);
448 }
449 } else {
450 return None;
451 }
452 }
453}
454
// `is_utf16_bidi_impl` returns `true` as soon as a code unit (or SIMD stride)
// is reported as right-to-left by `is_utf16_code_unit_bidi` /
// `is_u16x8_bidi`, and `false` if the whole buffer is examined without that
// happening.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        // SIMD variant: scalar checks for the unaligned prefix and for the
        // tail, aligned eight-unit loads in between.
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            let mut offset = 0usize;
            let len = buffer.len();
            // `SIMD_STRIDE_SIZE / 2` is the number of `u16` units per stride.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src.add(offset)` is aligned.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if is_utf16_code_unit_bidi(buffer[offset]) {
                            return true;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a whole stride can still be loaded.
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        // SAFETY: `offset <= len_minus_stride` holds here, so a
                        // whole stride starting at `offset` lies within `buffer`;
                        // the address is aligned due to the prefix loop above.
                        // NOTE(review): assumes `SIMD_STRIDE_SIZE` is the byte
                        // size of `u16x8` - confirm against its definition.
                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
                            return true;
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input when the SIMD path was not taken).
            for &u in &buffer[offset..] {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    } else {
        // Scalar variant: checks every code unit in order.
        #[inline(always)]
        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
            for &u in buffer {
                if is_utf16_code_unit_bidi(u) {
                    return true;
                }
            }
            false
        }
    }
}
504
// `check_utf16_for_latin1_and_bidi_impl` classifies a UTF-16 buffer:
//
// * `Latin1Bidi::Latin1` if no code unit is above 0xFF,
// * `Latin1Bidi::Bidi` if, from the first code unit (or stride / word)
//   containing something above 0xFF onwards, a right-to-left code unit is
//   found,
// * `Latin1Bidi::LeftToRight` otherwise.
//
// Both variants start in a "still Latin1" phase and switch to a bidi scan of
// the remainder once a non-Latin1 value has been seen.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `SIMD_STRIDE_SIZE / 2` is the number of `u16` units per stride.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src.add(offset)` is aligned.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                    SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment is
                            // recomputed, but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a whole stride can still be loaded.
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        // SAFETY: `offset <= len_minus_stride` holds here, so a
                        // whole stride starting at `offset` lies within `buffer`;
                        // the address is aligned due to the prefix loop above.
                        // NOTE(review): assumes `SIMD_STRIDE_SIZE` is the byte
                        // size of `u16x8` - confirm against its definition.
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // Non-Latin1 seen: from here on only look for bidi,
                            // reusing the stride that was already loaded.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // Fewer than a stride left: finish the bidi
                                    // scan unit by unit.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                // SAFETY: `offset <= len_minus_stride` was just
                                // checked, so another whole stride is in bounds.
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole input when the SIMD path was not taken).
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // Non-Latin1 seen: scan this and all remaining units for
                        // bidi only.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    // Reached the end without seeing anything above 0xFF.
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // `ALU_ALIGNMENT / 2` is the number of `u16` units per word read.
            // NOTE(review): assumes `ALU_ALIGNMENT` equals `size_of::<usize>()`
            // and `ALU_ALIGNMENT_MASK == ALU_ALIGNMENT - 1` - confirm.
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                // Code units to step over before `src.add(offset)` is word-aligned.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                    ALU_ALIGNMENT_MASK) / 2;
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which a whole word can still be read.
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        // SAFETY: `offset <= len_minus_stride` holds here, so a
                        // whole word starting at `offset` lies within `buffer`;
                        // the address is aligned due to the prefix loop above.
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            // Some unit in this word is above 0xFF: scan the rest
                            // (starting with this word) for bidi only.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole input when the word path was not taken).
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // Non-Latin1 seen: scan this and all remaining units for
                        // bidi only.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    // Reached the end without seeing anything above 0xFF.
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}
635
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` if every byte is below 0x80 (which includes the empty
/// buffer) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
643
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` if every code unit is below 0x80 (which includes the empty
/// buffer) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
652
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation reports the index of the first offending byte;
    // `None` means that no such byte exists.
    is_utf8_latin1_impl(buffer).is_none()
}
661
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation returns an index when a code point above U+00FF is
    // found; `None` means that none was found.
    is_str_latin1_impl(buffer).is_none()
}
670
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` if every code unit is below 0x100 (which includes the
/// empty buffer) and `false` otherwise.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
679
/// Checks whether a potentially-invalid UTF-8 buffer contains code points
/// that trigger right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input is invalid UTF-8 or the input contains an
/// RTL character. Returns `false` if the input is valid UTF-8 and contains
/// no RTL characters.
///
/// Structure: runs of ASCII are skipped with `validate_ascii`. While at
/// least four bytes remain, non-ASCII sequences are handled in an inner
/// loop that reads trail bytes without bounds checks. The last (at most
/// three) bytes are handled by a second `match` that checks the remaining
/// length explicitly.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
    // than UTF-8 validation followed by `is_str_bidi()` for German,
    // Russian and Japanese. However, this is considerably slower for Thai.
    // Chances are that the compiler makes some branch predictions that are
    // unfortunate for Thai. Not spending the time to manually optimize
    // further at this time, since it's unclear if this variant even has
    // use cases. However, this is worth revisiting once Rust gets the
    // ability to annotate relative priorities of match arms.

    // Byte sequences at the boundaries of the ranges treated as bidi:
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    //
    // NOTE(review): the multi-byte validity checks below combine entries of
    // `UTF8_DATA.table` (defined elsewhere) with the top bits of the trail
    // bytes. The unchecked lookups at `byte as usize + 0x80` assume the table
    // has more than 0x80 + 0xF4 entries - confirm against its definition.
    let mut src = buffer;
    'outer: loop {
        // `byte` is the first non-ASCII byte of `src` and `read` is its index.
        if let Some((mut byte, mut read)) = validate_ascii(src) {
            // Check for the longest sequence to avoid checking twice for the
            // multi-byte sequences.
            if read + 4 <= src.len() {
                // SAFETY (for every `src.get_unchecked` in this inner loop):
                // `read + 4 <= src.len()` holds on entry and is re-established
                // by the check at the bottom of the loop before each further
                // iteration, so indices `read + 1` through `read + 3` are in
                // bounds.
                'inner: loop {
                    // At this point, `byte` is not included in `read`.
                    match byte {
                        0..=0x7F => {
                            // ASCII: go back to SIMD.
                            read += 1;
                            src = &src[read..];
                            continue 'outer;
                        }
                        0xC2..=0xD5 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            read += 2;
                        }
                        0xD6 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            // XXX consider folding the above and below checks
                            // D6 90 is U+0590, the start of the bidi range.
                            if second > 0x8F {
                                return true;
                            }
                            read += 2;
                        }
                        // two-byte starting with 0xD7 and above is bidi
                        0xE1 | 0xE3..=0xEC | 0xEE => {
                            // Three-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xE2 => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // U+200F, U+202B, U+202E and U+2067.
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xEF => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                // U+FB1D through U+FDFF.
                                if second == 0xAC {
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                // U+FE70 through U+FEFE; U+FEFF is excluded.
                                if second == 0xB9 {
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xE0 => {
                            // Three-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // XXX can this be folded into the above validity check
                            // E0 A4 80 is U+0900; everything below is bidi.
                            if second < 0xA4 {
                                return true;
                            }
                            read += 3;
                        }
                        0xED => {
                            // Three-byte special upper bound
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xF1..=0xF4 => {
                            // Four-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            read += 4;
                        }
                        0xF0 => {
                            // Four-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            // U+10800 through U+10FFF and U+1E800 through U+1EFFF.
                            if unlikely(second == 0x90 || second == 0x9E) {
                                let third = src[read + 2];
                                if third >= 0xA0 {
                                    return true;
                                }
                            }
                            read += 4;
                        }
                        _ => {
                            // Invalid lead or bidi-only lead
                            return true;
                        }
                    }
                    // Fewer than four bytes left: either the input is exhausted
                    // or the remainder is handled by the tail `match` below.
                    if read + 4 > src.len() {
                        if read == src.len() {
                            return false;
                        }
                        byte = src[read];
                        break 'inner;
                    }
                    byte = src[read];
                    continue 'inner;
                }
            }
            // We can't have a complete 4-byte sequence, but we could still have
            // a complete shorter sequence.

            // At this point, `byte` is not included in `read`.
            //
            // SAFETY (for every `src.get_unchecked` in this `match`): each arm
            // first returns unless `new_read <= src.len()`, where `new_read` is
            // one past the last trail byte that the arm reads.
            match byte {
                0..=0x7F => {
                    // ASCII: go back to SIMD.
                    read += 1;
                    src = &src[read..];
                    continue 'outer;
                }
                0xC2..=0xD5 => {
                    // Two-byte
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                0xD6 => {
                    // Two-byte, potentially bidi
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    // XXX consider folding the above and below checks
                    // D6 90 is U+0590, the start of the bidi range.
                    if second > 0x8F {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                // two-byte starting with 0xD7 and above is bidi
                0xE1 | 0xE3..=0xEC | 0xEE => {
                    // Three-byte normal
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                0xE2 => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // U+200F, U+202B, U+202E and U+2067.
                    if second == 0x80 {
                        if third == 0x8F || third == 0xAB || third == 0xAE {
                            return true;
                        }
                    } else if second == 0x81 {
                        if third == 0xA7 {
                            return true;
                        }
                    }
                }
                0xEF => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    if in_inclusive_range8(second, 0xAC, 0xB7) {
                        // U+FB1D through U+FDFF.
                        if second == 0xAC {
                            if third > 0x9C {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                        // U+FE70 through U+FEFE; U+FEFF is excluded.
                        if second == 0xB9 {
                            if third > 0xAF {
                                return true;
                            }
                        } else if second == 0xBB {
                            if third != 0xBF {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    }
                }
                0xE0 => {
                    // Three-byte special lower bound, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // XXX can this be folded into the above validity check
                    // E0 A4 80 is U+0900; everything below is bidi.
                    if second < 0xA4 {
                        return true;
                    }
                }
                0xED => {
                    // Three-byte special upper bound
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                _ => {
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
                    return true;
                }
            }
            // Only the three-byte arms fall through to here. At most three
            // bytes were left, so a valid three-byte sequence consumed all of
            // them: the input is valid and contains no RTL.
            return false;
        } else {
            // The rest of the input is ASCII.
            return false;
        }
    }
}
1113
1114/// Checks whether a valid UTF-8 buffer contains code points that trigger
1115/// right-to-left processing.
1116///
1117/// The check is done on a Unicode block basis without regard to assigned
1118/// vs. unassigned code points in the block. Hebrew presentation forms in
1119/// the Alphabetic Presentation Forms block are treated as if they formed
1120/// a block on their own (i.e. it treated as right-to-left). Additionally,
1121/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1122/// for. Control characters that are technically bidi controls but do not
1123/// cause right-to-left behavior without the presence of right-to-left
1124/// characters or right-to-left controls are not checked for. As a special
1125/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1126#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1127#[inline]
1128pub fn is_str_bidi(buffer: &str) -> bool {
1129 // U+058F: D6 8F
1130 // U+0590: D6 90
1131 // U+08FF: E0 A3 BF
1132 // U+0900: E0 A4 80
1133 //
1134 // U+200F: E2 80 8F
1135 // U+202B: E2 80 AB
1136 // U+202E: E2 80 AE
1137 // U+2067: E2 81 A7
1138 //
1139 // U+FB1C: EF AC 9C
1140 // U+FB1D: EF AC 9D
1141 // U+FDFF: EF B7 BF
1142 // U+FE00: EF B8 80
1143 //
1144 // U+FE6F: EF B9 AF
1145 // U+FE70: EF B9 B0
1146 // U+FEFE: EF BB BE
1147 // U+FEFF: EF BB BF
1148 //
1149 // U+107FF: F0 90 9F BF
1150 // U+10800: F0 90 A0 80
1151 // U+10FFF: F0 90 BF BF
1152 // U+11000: F0 91 80 80
1153 //
1154 // U+1E7FF: F0 9E 9F BF
1155 // U+1E800: F0 9E A0 80
1156 // U+1EFFF: F0 9E BF BF
1157 // U+1F000: F0 9F 80 80
1158 let mut bytes = buffer.as_bytes();
1159 'outer: loop {
1160 // TODO: Instead of just validating ASCII using SIMD, use SIMD
1161 // to check for non-ASCII lead bytes, too, to quickly conclude
1162 // that the vector consist entirely of CJK and below-Hebrew
1163 // code points.
1164 // Unfortunately, scripts above Arabic but below CJK share
1165 // lead bytes with RTL.
1166 if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1167 'inner: loop {
1168 // At this point, `byte` is not included in `read`.
1169 if byte < 0xE0 {
1170 if byte >= 0x80 {
1171 // Two-byte
1172 // Adding `unlikely` here improved throughput on
1173 // Russian plain text by 33%!
1174 if unlikely(byte >= 0xD6) {
1175 if byte == 0xD6 {
1176 let second = bytes[read + 1];
1177 if second > 0x8F {
1178 return true;
1179 }
1180 } else {
1181 return true;
1182 }
1183 }
1184 read += 2;
1185 } else {
1186 // ASCII: write and go back to SIMD.
1187 read += 1;
1188 // Intuitively, we should go back to the outer loop only
1189 // if byte is 0x30 or above, so as to avoid trashing on
1190 // ASCII space, comma and period in non-Latin context.
1191 // However, the extra branch seems to cost more than it's
1192 // worth.
1193 bytes = &bytes[read..];
1194 continue 'outer;
1195 }
1196 } else if byte < 0xF0 {
1197 // Three-byte
1198 if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
1199 let second = bytes[read + 1];
1200 if byte == 0xE0 {
1201 if second < 0xA4 {
1202 return true;
1203 }
1204 } else if byte == 0xE2 {
1205 let third = bytes[read + 2];
1206 if second == 0x80 {
1207 if third == 0x8F || third == 0xAB || third == 0xAE {
1208 return true;
1209 }
1210 } else if second == 0x81 {
1211 if third == 0xA7 {
1212 return true;
1213 }
1214 }
1215 } else {
1216 debug_assert_eq!(byte, 0xEF);
1217 if in_inclusive_range8(second, 0xAC, 0xB7) {
1218 if second == 0xAC {
1219 let third = bytes[read + 2];
1220 if third > 0x9C {
1221 return true;
1222 }
1223 } else {
1224 return true;
1225 }
1226 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1227 if second == 0xB9 {
1228 let third = bytes[read + 2];
1229 if third > 0xAF {
1230 return true;
1231 }
1232 } else if second == 0xBB {
1233 let third = bytes[read + 2];
1234 if third != 0xBF {
1235 return true;
1236 }
1237 } else {
1238 return true;
1239 }
1240 }
1241 }
1242 }
1243 read += 3;
1244 } else {
1245 // Four-byte
1246 let second = bytes[read + 1];
1247 if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
1248 let third = bytes[read + 2];
1249 if third >= 0xA0 {
1250 return true;
1251 }
1252 }
1253 read += 4;
1254 }
1255 // The comparison is always < or == and never >, but including
1256 // > here to let the compiler assume that < is true if this
1257 // comparison is false.
1258 if read >= bytes.len() {
1259 return false;
1260 }
1261 byte = bytes[read];
1262 continue 'inner;
1263 }
1264 } else {
1265 return false;
1266 }
1267 }
1268}
1269
1270/// Checks whether a UTF-16 buffer contains code points that trigger
1271/// right-to-left processing.
1272///
1273/// The check is done on a Unicode block basis without regard to assigned
1274/// vs. unassigned code points in the block. Hebrew presentation forms in
1275/// the Alphabetic Presentation Forms block are treated as if they formed
1276/// a block on their own (i.e. it treated as right-to-left). Additionally,
1277/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1278/// for. Control characters that are technically bidi controls but do not
1279/// cause right-to-left behavior without the presence of right-to-left
1280/// characters or right-to-left controls are not checked for. As a special
1281/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1282///
1283/// Returns `true` if the input contains an RTL character or an unpaired
1284/// high surrogate that could be the high half of an RTL character.
1285/// Returns `false` if the input contains neither RTL characters nor
1286/// unpaired high surrogates that could be higher halves of RTL characters.
1287pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
1288 is_utf16_bidi_impl(buffer)
1289}
1290
1291/// Checks whether a scalar value triggers right-to-left processing.
1292///
1293/// The check is done on a Unicode block basis without regard to assigned
1294/// vs. unassigned code points in the block. Hebrew presentation forms in
1295/// the Alphabetic Presentation Forms block are treated as if they formed
1296/// a block on their own (i.e. it treated as right-to-left). Additionally,
1297/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1298/// for. Control characters that are technically bidi controls but do not
1299/// cause right-to-left behavior without the presence of right-to-left
1300/// characters or right-to-left controls are not checked for. As a special
1301/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1302#[inline(always)]
1303pub fn is_char_bidi(c: char) -> bool {
1304 // Controls:
1305 // Every control with RIGHT-TO-LEFT in its name in
1306 // https://www.unicode.org/charts/PDF/U2000.pdf
1307 // U+200F RLM
1308 // U+202B RLE
1309 // U+202E RLO
1310 // U+2067 RLI
1311 //
1312 // BMP RTL:
1313 // https://www.unicode.org/roadmaps/bmp/
1314 // U+0590...U+08FF
1315 // U+FB1D...U+FDFF Hebrew presentation forms and
1316 // Arabic Presentation Forms A
1317 // U+FE70...U+FEFE Arabic Presentation Forms B (excl. BOM)
1318 //
1319 // Supplementary RTL:
1320 // https://www.unicode.org/roadmaps/smp/
1321 // U+10800...U+10FFF (Lead surrogate U+D802 or U+D803)
1322 // U+1E800...U+1EFFF (Lead surrogate U+D83A or U+D83B)
1323 let code_point = u32::from(c);
1324 if code_point < 0x0590 {
1325 // Below Hebrew
1326 return false;
1327 }
1328 if in_range32(code_point, 0x0900, 0xFB1D) {
1329 // Above Arabic Extended-A and below Hebrew presentation forms
1330 if in_inclusive_range32(code_point, 0x200F, 0x2067) {
1331 // In the range that contains the RTL controls
1332 return code_point == 0x200F
1333 || code_point == 0x202B
1334 || code_point == 0x202E
1335 || code_point == 0x2067;
1336 }
1337 return false;
1338 }
1339 if code_point > 0x1EFFF {
1340 // Above second astral RTL. (Emoji is here.)
1341 return false;
1342 }
1343 if in_range32(code_point, 0x11000, 0x1E800) {
1344 // Between astral RTL blocks
1345 return false;
1346 }
1347 if in_range32(code_point, 0xFEFF, 0x10800) {
1348 // Above Arabic Presentations Forms B (excl. BOM) and below first
1349 // astral RTL
1350 return false;
1351 }
1352 if in_range32(code_point, 0xFE00, 0xFE70) {
1353 // Between Arabic Presentations Forms
1354 return false;
1355 }
1356 true
1357}
1358
1359/// Checks whether a UTF-16 code unit triggers right-to-left processing.
1360///
1361/// The check is done on a Unicode block basis without regard to assigned
1362/// vs. unassigned code points in the block. Hebrew presentation forms in
1363/// the Alphabetic Presentation Forms block are treated as if they formed
1364/// a block on their own (i.e. it treated as right-to-left). Additionally,
1365/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1366/// for. Control characters that are technically bidi controls but do not
1367/// cause right-to-left behavior without the presence of right-to-left
1368/// characters or right-to-left controls are not checked for. As a special
1369/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1370///
1371/// Since supplementary-plane right-to-left blocks are identifiable from the
1372/// high surrogate without examining the low surrogate, this function returns
1373/// `true` for such high surrogates making the function suitable for handling
1374/// supplementary-plane text without decoding surrogate pairs to scalar
1375/// values. Obviously, such high surrogates are then reported as right-to-left
1376/// even if actually unpaired.
1377#[inline(always)]
1378pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
1379 if u < 0x0590 {
1380 // Below Hebrew
1381 return false;
1382 }
1383 if in_range16(u, 0x0900, 0xD802) {
1384 // Above Arabic Extended-A and below first RTL surrogate
1385 if in_inclusive_range16(u, 0x200F, 0x2067) {
1386 // In the range that contains the RTL controls
1387 return u == 0x200F || u == 0x202B || u == 0x202E || u == 0x2067;
1388 }
1389 return false;
1390 }
1391 if in_range16(u, 0xD83C, 0xFB1D) {
1392 // Between astral RTL high surrogates and Hebrew presentation forms
1393 // (Emoji is here)
1394 return false;
1395 }
1396 if in_range16(u, 0xD804, 0xD83A) {
1397 // Between RTL high surragates
1398 return false;
1399 }
1400 if u > 0xFEFE {
1401 // Above Arabic Presentation Forms (excl. BOM)
1402 return false;
1403 }
1404 if in_range16(u, 0xFE00, 0xFE70) {
1405 // Between Arabic Presentations Forms
1406 return false;
1407 }
1408 true
1409}
1410
1411/// Checks whether a potentially invalid UTF-8 buffer contains code points
1412/// that trigger right-to-left processing or is all-Latin1.
1413///
1414/// Possibly more efficient than performing the checks separately.
1415///
1416/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1417/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1418/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1419pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1420 if let Some(offset: usize) = is_utf8_latin1_impl(buffer) {
1421 if is_utf8_bidi(&buffer[offset..]) {
1422 Latin1Bidi::Bidi
1423 } else {
1424 Latin1Bidi::LeftToRight
1425 }
1426 } else {
1427 Latin1Bidi::Latin1
1428 }
1429}
1430
1431/// Checks whether a valid UTF-8 buffer contains code points
1432/// that trigger right-to-left processing or is all-Latin1.
1433///
1434/// Possibly more efficient than performing the checks separately.
1435///
1436/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1437/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1438/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1439pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1440 // The transition from the latin1 check to the bidi check isn't
1441 // optimal but not tweaking it to perfection today.
1442 if let Some(offset: usize) = is_str_latin1_impl(buffer) {
1443 if is_str_bidi(&buffer[offset..]) {
1444 Latin1Bidi::Bidi
1445 } else {
1446 Latin1Bidi::LeftToRight
1447 }
1448 } else {
1449 Latin1Bidi::Latin1
1450 }
1451}
1452
1453/// Checks whether a potentially invalid UTF-16 buffer contains code points
1454/// that trigger right-to-left processing or is all-Latin1.
1455///
1456/// Possibly more efficient than performing the checks separately.
1457///
1458/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
1459/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
1460/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1461pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
1462 check_utf16_for_latin1_and_bidi_impl(buffer)
1463}
1464
1465/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1466/// with the REPLACEMENT CHARACTER.
1467///
1468/// The length of the destination buffer must be at least the length of the
1469/// source buffer _plus one_.
1470///
1471/// Returns the number of `u16`s written.
1472///
1473/// # Panics
1474///
1475/// Panics if the destination buffer is shorter than stated above.
1476pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1477 // TODO: Can the requirement for dst to be at least one unit longer
1478 // be eliminated?
1479 assert!(dst.len() > src.len());
1480 let mut decoder = Utf8Decoder::new_inner();
1481 let mut total_read = 0usize;
1482 let mut total_written = 0usize;
1483 loop {
1484 let (result, read, written) =
1485 decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1486 total_read += read;
1487 total_written += written;
1488 match result {
1489 DecoderResult::InputEmpty => {
1490 return total_written;
1491 }
1492 DecoderResult::OutputFull => {
1493 unreachable!("The assert at the top of the function should have caught this.");
1494 }
1495 DecoderResult::Malformed(_, _) => {
1496 // There should always be space for the U+FFFD, because
1497 // otherwise we'd have gotten OutputFull already.
1498 dst[total_written] = 0xFFFD;
1499 total_written += 1;
1500 }
1501 }
1502 }
1503}
1504
1505/// Converts valid UTF-8 to valid UTF-16.
1506///
1507/// The length of the destination buffer must be at least the length of the
1508/// source buffer.
1509///
1510/// Returns the number of `u16`s written.
1511///
1512/// # Panics
1513///
1514/// Panics if the destination buffer is shorter than stated above.
1515pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
1516 assert!(
1517 dst.len() >= src.len(),
1518 "Destination must not be shorter than the source."
1519 );
1520 let bytes = src.as_bytes();
1521 let mut read = 0;
1522 let mut written = 0;
1523 'outer: loop {
1524 let mut byte = {
1525 let src_remaining = &bytes[read..];
1526 let dst_remaining = &mut dst[written..];
1527 let length = src_remaining.len();
1528 match unsafe {
1529 ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
1530 } {
1531 None => {
1532 written += length;
1533 return written;
1534 }
1535 Some((non_ascii, consumed)) => {
1536 read += consumed;
1537 written += consumed;
1538 non_ascii
1539 }
1540 }
1541 };
1542 'inner: loop {
1543 // At this point, `byte` is not included in `read`.
1544 if byte < 0xE0 {
1545 if byte >= 0x80 {
1546 // Two-byte
1547 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1548 let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
1549 unsafe { *(dst.get_unchecked_mut(written)) = point };
1550 read += 2;
1551 written += 1;
1552 } else {
1553 // ASCII: write and go back to SIMD.
1554 unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
1555 read += 1;
1556 written += 1;
1557 // Intuitively, we should go back to the outer loop only
1558 // if byte is 0x30 or above, so as to avoid trashing on
1559 // ASCII space, comma and period in non-Latin context.
1560 // However, the extra branch seems to cost more than it's
1561 // worth.
1562 continue 'outer;
1563 }
1564 } else if byte < 0xF0 {
1565 // Three-byte
1566 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1567 let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1568 let point = ((u16::from(byte) & 0xF) << 12)
1569 | ((u16::from(second) & 0x3F) << 6)
1570 | (u16::from(third) & 0x3F);
1571 unsafe { *(dst.get_unchecked_mut(written)) = point };
1572 read += 3;
1573 written += 1;
1574 } else {
1575 // Four-byte
1576 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1577 let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1578 let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
1579 let point = ((u32::from(byte) & 0x7) << 18)
1580 | ((u32::from(second) & 0x3F) << 12)
1581 | ((u32::from(third) & 0x3F) << 6)
1582 | (u32::from(fourth) & 0x3F);
1583 unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
1584 unsafe {
1585 *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
1586 };
1587 read += 4;
1588 written += 2;
1589 }
1590 // The comparison is always < or == and never >, but including
1591 // > here to let the compiler assume that < is true if this
1592 // comparison is false.
1593 if read >= src.len() {
1594 return written;
1595 }
1596 byte = bytes[read];
1597 continue 'inner;
1598 }
1599 }
1600}
1601
1602/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1603///
1604/// The length of the destination buffer must be at least the length of the
1605/// source buffer.
1606///
1607/// Returns the number of `u16`s written or `None` if the input was invalid.
1608///
1609/// When the input was invalid, some output may have been written.
1610///
1611/// # Panics
1612///
1613/// Panics if the destination buffer is shorter than stated above.
1614pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1615 assert!(
1616 dst.len() >= src.len(),
1617 "Destination must not be shorter than the source."
1618 );
1619 let (read: usize, written: usize) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1620 if read == src.len() {
1621 return Some(written);
1622 }
1623 None
1624}
1625
1626/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1627/// with the REPLACEMENT CHARACTER with potentially insufficient output
1628/// space.
1629///
1630/// Returns the number of code units read and the number of bytes written.
1631///
1632/// Guarantees that the bytes in the destination beyond the number of
1633/// bytes claimed as written by the second item of the return tuple
1634/// are left unmodified.
1635///
1636/// Not all code units are read if there isn't enough output space.
1637///
1638/// Note that this method isn't designed for general streamability but for
1639/// not allocating memory for the worst case up front. Specifically,
1640/// if the input starts with or ends with an unpaired surrogate, those are
1641/// replaced with the REPLACEMENT CHARACTER.
1642///
1643/// Matches the semantics of `TextEncoder.encodeInto()` from the
1644/// Encoding Standard.
1645///
1646/// # Safety
1647///
1648/// If you want to convert into a `&mut str`, use
1649/// `convert_utf16_to_str_partial()` instead of using this function
1650/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1651#[inline(always)]
1652pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1653 // The two functions called below are marked `inline(never)` to make
1654 // transitions from the hot part (first function) into the cold part
1655 // (second function) go through a return and another call to discouge
1656 // the CPU from speculating from the hot code into the cold code.
1657 // Letting the transitions be mere intra-function jumps, even to
1658 // basic blocks out-of-lined to the end of the function would wipe
1659 // away a quarter of Arabic encode performance on Haswell!
1660 let (read: usize, written: usize) = convert_utf16_to_utf8_partial_inner(src, dst);
1661 if likely(read == src.len()) {
1662 return (read, written);
1663 }
1664 let (tail_read: usize, tail_written: usize) =
1665 convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1666 (read + tail_read, written + tail_written)
1667}
1668
1669/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1670/// with the REPLACEMENT CHARACTER.
1671///
1672/// The length of the destination buffer must be at least the length of the
1673/// source buffer times three.
1674///
1675/// Returns the number of bytes written.
1676///
1677/// # Panics
1678///
1679/// Panics if the destination buffer is shorter than stated above.
1680///
1681/// # Safety
1682///
1683/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1684/// instead of using this function together with the `unsafe` method
1685/// `as_bytes_mut()` on `&mut str`.
1686#[inline(always)]
1687pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1688 assert!(dst.len() >= src.len() * 3);
1689 let (read: usize, written: usize) = convert_utf16_to_utf8_partial(src, dst);
1690 debug_assert_eq!(read, src.len());
1691 written
1692}
1693
1694/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1695/// with the REPLACEMENT CHARACTER such that the validity of the output is
1696/// signaled using the Rust type system with potentially insufficient output
1697/// space.
1698///
1699/// Returns the number of code units read and the number of bytes written.
1700///
1701/// Not all code units are read if there isn't enough output space.
1702///
1703/// Note that this method isn't designed for general streamability but for
1704/// not allocating memory for the worst case up front. Specifically,
1705/// if the input starts with or ends with an unpaired surrogate, those are
1706/// replaced with the REPLACEMENT CHARACTER.
1707pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1708 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1709 let (read: usize, written: usize) = convert_utf16_to_utf8_partial(src, dst:bytes);
1710 let len: usize = bytes.len();
1711 let mut trail: usize = written;
1712 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1713 bytes[trail] = 0;
1714 trail += 1;
1715 }
1716 (read, written)
1717}
1718
1719/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1720/// with the REPLACEMENT CHARACTER such that the validity of the output is
1721/// signaled using the Rust type system.
1722///
1723/// The length of the destination buffer must be at least the length of the
1724/// source buffer times three.
1725///
1726/// Returns the number of bytes written.
1727///
1728/// # Panics
1729///
1730/// Panics if the destination buffer is shorter than stated above.
1731#[inline(always)]
1732pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1733 assert!(dst.len() >= src.len() * 3);
1734 let (read: usize, written: usize) = convert_utf16_to_str_partial(src, dst);
1735 debug_assert_eq!(read, src.len());
1736 written
1737}
1738
1739/// Converts bytes whose unsigned value is interpreted as Unicode code point
1740/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1741///
1742/// The length of the destination buffer must be at least the length of the
1743/// source buffer.
1744///
1745/// The number of `u16`s written equals the length of the source buffer.
1746///
1747/// # Panics
1748///
1749/// Panics if the destination buffer is shorter than stated above.
1750pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1751 assert!(
1752 dst.len() >= src.len(),
1753 "Destination must not be shorter than the source."
1754 );
1755 // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1756 // instructions and this code, but, yet, the autovectorized version is
1757 // faster.
1758 unsafe {
1759 unpack_latin1(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len());
1760 }
1761}
1762
1763/// Converts bytes whose unsigned value is interpreted as Unicode code point
1764/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1765/// output space.
1766///
1767/// Returns the number of bytes read and the number of bytes written.
1768///
1769/// If the output isn't large enough, not all input is consumed.
1770///
1771/// # Safety
1772///
1773/// If you want to convert into a `&mut str`, use
1774/// `convert_utf16_to_str_partial()` instead of using this function
1775/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1776pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1777 let src_len = src.len();
1778 let src_ptr = src.as_ptr();
1779 let dst_ptr = dst.as_mut_ptr();
1780 let dst_len = dst.len();
1781 let mut total_read = 0usize;
1782 let mut total_written = 0usize;
1783 loop {
1784 // src can't advance more than dst
1785 let src_left = src_len - total_read;
1786 let dst_left = dst_len - total_written;
1787 let min_left = ::core::cmp::min(src_left, dst_left);
1788 if let Some((non_ascii, consumed)) = unsafe {
1789 ascii_to_ascii(
1790 src_ptr.add(total_read),
1791 dst_ptr.add(total_written),
1792 min_left,
1793 )
1794 } {
1795 total_read += consumed;
1796 total_written += consumed;
1797 if total_written.checked_add(2).unwrap() > dst_len {
1798 return (total_read, total_written);
1799 }
1800
1801 total_read += 1; // consume `non_ascii`
1802
1803 dst[total_written] = (non_ascii >> 6) | 0xC0;
1804 total_written += 1;
1805 dst[total_written] = (non_ascii & 0x3F) | 0x80;
1806 total_written += 1;
1807 continue;
1808 }
1809 return (total_read + min_left, total_written + min_left);
1810 }
1811}
1812
1813/// Converts bytes whose unsigned value is interpreted as Unicode code point
1814/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1815///
1816/// The length of the destination buffer must be at least the length of the
1817/// source buffer times two.
1818///
1819/// Returns the number of bytes written.
1820///
1821/// # Panics
1822///
1823/// Panics if the destination buffer is shorter than stated above.
1824///
1825/// # Safety
1826///
1827/// Note that this function may write garbage beyond the number of bytes
1828/// indicated by the return value, so using a `&mut str` interpreted as
1829/// `&mut [u8]` as the destination is not safe. If you want to convert into
1830/// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1831#[inline]
1832pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1833 assert!(
1834 dst.len() >= src.len() * 2,
1835 "Destination must not be shorter than the source times two."
1836 );
1837 let (read: usize, written: usize) = convert_latin1_to_utf8_partial(src, dst);
1838 debug_assert_eq!(read, src.len());
1839 written
1840}
1841
1842/// Converts bytes whose unsigned value is interpreted as Unicode code point
1843/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1844/// output is signaled using the Rust type system with potentially insufficient
1845/// output space.
1846///
1847/// Returns the number of bytes read and the number of bytes written.
1848///
1849/// If the output isn't large enough, not all input is consumed.
1850#[inline]
1851pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1852 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1853 let (read: usize, written: usize) = convert_latin1_to_utf8_partial(src, dst:bytes);
1854 let len: usize = bytes.len();
1855 let mut trail: usize = written;
1856 let max: usize = ::core::cmp::min(v1:len, v2:trail + MAX_STRIDE_SIZE);
1857 while trail < max {
1858 bytes[trail] = 0;
1859 trail += 1;
1860 }
1861 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1862 bytes[trail] = 0;
1863 trail += 1;
1864 }
1865 (read, written)
1866}
1867
1868/// Converts bytes whose unsigned value is interpreted as Unicode code point
1869/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1870/// output is signaled using the Rust type system.
1871///
1872/// The length of the destination buffer must be at least the length of the
1873/// source buffer times two.
1874///
1875/// Returns the number of bytes written.
1876///
1877/// # Panics
1878///
1879/// Panics if the destination buffer is shorter than stated above.
1880#[inline]
1881pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1882 assert!(
1883 dst.len() >= src.len() * 2,
1884 "Destination must not be shorter than the source times two."
1885 );
1886 let (read: usize, written: usize) = convert_latin1_to_str_partial(src, dst);
1887 debug_assert_eq!(read, src.len());
1888 written
1889}
1890
1891/// If the input is valid UTF-8 representing only Unicode code points from
1892/// U+0000 to U+00FF, inclusive, converts the input into output that
1893/// represents the value of each code point as the unsigned byte value of
1894/// each output byte.
1895///
1896/// If the input does not fulfill the condition stated above, this function
1897/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1898/// does something that is memory-safe without any promises about any
1899/// properties of the output. In particular, callers shouldn't assume the
1900/// output to be the same across crate versions or CPU architectures and
1901/// should not assume that non-ASCII input can't map to ASCII output.
1902///
1903/// The length of the destination buffer must be at least the length of the
1904/// source buffer.
1905///
1906/// Returns the number of bytes written.
1907///
1908/// # Panics
1909///
1910/// Panics if the destination buffer is shorter than stated above.
1911///
1912/// If debug assertions are enabled (and not fuzzing) and the input is
1913/// not in the range U+0000 to U+00FF, inclusive.
1914pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1915 assert!(
1916 dst.len() >= src.len(),
1917 "Destination must not be shorter than the source."
1918 );
1919 non_fuzz_debug_assert!(is_utf8_latin1(src));
1920 let src_len = src.len();
1921 let src_ptr = src.as_ptr();
1922 let dst_ptr = dst.as_mut_ptr();
1923 let mut total_read = 0usize;
1924 let mut total_written = 0usize;
1925 loop {
1926 // dst can't advance more than src
1927 let src_left = src_len - total_read;
1928 if let Some((non_ascii, consumed)) = unsafe {
1929 ascii_to_ascii(
1930 src_ptr.add(total_read),
1931 dst_ptr.add(total_written),
1932 src_left,
1933 )
1934 } {
1935 total_read += consumed + 1;
1936 total_written += consumed;
1937
1938 if total_read == src_len {
1939 return total_written;
1940 }
1941
1942 let trail = src[total_read];
1943 total_read += 1;
1944
1945 dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1946 total_written += 1;
1947 continue;
1948 }
1949 return total_written + src_left;
1950 }
1951}
1952
1953/// If the input is valid UTF-16 representing only Unicode code points from
1954/// U+0000 to U+00FF, inclusive, converts the input into output that
1955/// represents the value of each code point as the unsigned byte value of
1956/// each output byte.
1957///
1958/// If the input does not fulfill the condition stated above, does something
1959/// that is memory-safe without any promises about any properties of the
1960/// output and will probably assert in debug builds in future versions.
1961/// In particular, callers shouldn't assume the output to be the same across
1962/// crate versions or CPU architectures and should not assume that non-ASCII
1963/// input can't map to ASCII output.
1964///
1965/// The length of the destination buffer must be at least the length of the
1966/// source buffer.
1967///
1968/// The number of bytes written equals the length of the source buffer.
1969///
1970/// # Panics
1971///
1972/// Panics if the destination buffer is shorter than stated above.
1973///
1974/// (Probably in future versions if debug assertions are enabled (and not
1975/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
1976pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1977 assert!(
1978 dst.len() >= src.len(),
1979 "Destination must not be shorter than the source."
1980 );
1981 // non_fuzz_debug_assert!(is_utf16_latin1(src));
1982 unsafe {
1983 pack_latin1(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len());
1984 }
1985}
1986
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // SAFETY: the whole input was reported as ASCII, and ASCII is valid
        // UTF-8.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    // `head` is the ASCII prefix, which is copied verbatim; `tail` starts at
    // the first non-ASCII byte and goes through the converter.
    let (head, tail) = bytes.split_at(up_to);
    // Worst case for the tail: two output bytes per input byte.
    let capacity = head.len() + tail.len() * 2;
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // NOTE(review): this exposes not-yet-written bytes as `u8`. The code
    // below overwrites `..up_to` and truncates to `up_to + written`; it
    // relies on `convert_latin1_to_utf8` only writing to (never reading from)
    // its destination -- confirm against its definition.
    unsafe {
        vec.set_len(capacity);
    }
    (&mut vec[..up_to]).copy_from_slice(head);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    // NOTE(review): relies on `convert_latin1_to_utf8` producing valid UTF-8
    // for the `written` bytes it reports -- confirm against its definition.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
2014
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    // `head` is the ASCII prefix, which is copied verbatim; `tail` starts at
    // the first non-ASCII byte and goes through the converter.
    let (head, tail) = bytes.split_at(up_to);
    // The output is sized to the input length and truncated afterwards.
    let capacity = bytes.len();
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // NOTE(review): this exposes not-yet-written bytes as `u8`. The code
    // below overwrites `..up_to` and truncates to `up_to + written`; it
    // relies on `convert_utf8_to_latin1_lossy` only writing to (never reading
    // from) its destination -- confirm against its definition.
    unsafe {
        vec.set_len(capacity);
    }
    (&mut vec[..up_to]).copy_from_slice(head);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
2051
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
///
/// This is a thin public wrapper that forwards to `utf16_valid_up_to_impl`.
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
    utf16_valid_up_to_impl(buffer)
}
2057
2058/// Returns the index of first byte that starts an invalid byte
2059/// sequence or a non-Latin1 byte sequence, or the length of the
2060/// string if there are neither.
2061pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2062 is_utf8_latin1_impl(buffer).unwrap_or(default:buffer.len())
2063}
2064
2065/// Returns the index of first byte that starts a non-Latin1 byte
2066/// sequence, or the length of the string if there are none.
2067pub fn str_latin1_up_to(buffer: &str) -> usize {
2068 is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len())
2069}
2070
2071/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2072#[inline]
2073pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2074 let mut offset: usize = 0;
2075 loop {
2076 offset += utf16_valid_up_to(&buffer[offset..]);
2077 if offset == buffer.len() {
2078 return;
2079 }
2080 buffer[offset] = 0xFFFD;
2081 offset += 1;
2082 }
2083}
2084
2085/// Copies ASCII from source to destination up to the first non-ASCII byte
2086/// (or the end of the input if it is ASCII in its entirety).
2087///
2088/// The length of the destination buffer must be at least the length of the
2089/// source buffer.
2090///
2091/// Returns the number of bytes written.
2092///
2093/// # Panics
2094///
2095/// Panics if the destination buffer is shorter than stated above.
2096pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2097 assert!(
2098 dst.len() >= src.len(),
2099 "Destination must not be shorter than the source."
2100 );
2101 if let Some((_, consumed: usize)) =
2102 unsafe { ascii_to_ascii(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) }
2103 {
2104 consumed
2105 } else {
2106 src.len()
2107 }
2108}
2109
2110/// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2111/// the first non-ASCII byte (or the end of the input if it is ASCII in its
2112/// entirety).
2113///
2114/// The length of the destination buffer must be at least the length of the
2115/// source buffer.
2116///
2117/// Returns the number of `u16`s written.
2118///
2119/// # Panics
2120///
2121/// Panics if the destination buffer is shorter than stated above.
2122pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2123 assert!(
2124 dst.len() >= src.len(),
2125 "Destination must not be shorter than the source."
2126 );
2127 if let Some((_, consumed: usize)) =
2128 unsafe { ascii_to_basic_latin(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) }
2129 {
2130 consumed
2131 } else {
2132 src.len()
2133 }
2134}
2135
2136/// Copies Basic Latin from source to destination narrowing it to ASCII up to
2137/// the first non-Basic Latin code unit (or the end of the input if it is
2138/// Basic Latin in its entirety).
2139///
2140/// The length of the destination buffer must be at least the length of the
2141/// source buffer.
2142///
2143/// Returns the number of bytes written.
2144///
2145/// # Panics
2146///
2147/// Panics if the destination buffer is shorter than stated above.
2148pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2149 assert!(
2150 dst.len() >= src.len(),
2151 "Destination must not be shorter than the source."
2152 );
2153 if let Some((_, consumed: usize)) =
2154 unsafe { basic_latin_to_ascii(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) }
2155 {
2156 consumed
2157 } else {
2158 src.len()
2159 }
2160}
2161
2162// Any copyright to the test code below this comment is dedicated to the
2163// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2164
2165#[cfg(all(test, feature = "alloc"))]
2166mod tests {
2167 use super::*;
2168
#[test]
fn test_is_ascii_success() {
    // All 128 ASCII byte values in ascending order.
    let ascii: Vec<u8> = (0..128).map(|value| value as u8).collect();
    // Every suffix is checked so that different start offsets are exercised.
    for start in 0..ascii.len() {
        assert!(is_ascii(&ascii[start..]));
    }
}
2180
#[test]
fn test_is_ascii_fail() {
    let mut bytes: Vec<u8> = (0..128).map(|value| value as u8).collect();
    for start in 0..bytes.len() {
        let suffix = &mut bytes[start..];
        // Poison one more position on each step; the suffix must be rejected
        // as soon as it contains any non-ASCII byte. The poisoned bytes stay
        // in place for later iterations of the outer loop.
        for pos in 0..suffix.len() {
            suffix[pos] = 0xA0;
            assert!(!is_ascii(suffix));
        }
    }
}
2196
#[test]
fn test_is_basic_latin_success() {
    // Code units U+0000..U+007F in ascending order.
    let units: Vec<u16> = (0..128).map(|value| value as u16).collect();
    // Every suffix is checked so that different start offsets are exercised.
    for start in 0..units.len() {
        assert!(is_basic_latin(&units[start..]));
    }
}
2208
#[test]
fn test_is_basic_latin_fail() {
    let mut units: Vec<u16> = (0..128).map(|value| value as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Poison one more position on each step; the suffix must be rejected
        // as soon as it contains any code unit above U+007F. The poisoned
        // units stay in place for later iterations of the outer loop.
        for pos in 0..suffix.len() {
            suffix[pos] = 0xA0;
            assert!(!is_basic_latin(suffix));
        }
    }
}
2224
#[test]
fn test_is_utf16_latin1_success() {
    // Code units U+0000..U+00FF in ascending order.
    let units: Vec<u16> = (0..256).map(|value| value as u16).collect();
    for start in 0..units.len() {
        let suffix = &units[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
2240
#[test]
fn test_is_utf16_latin1_fail() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Each step places one more code unit >= U+0100 into the suffix; the
        // overwritten units stay in place for later outer iterations.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            assert!(!is_utf16_latin1(suffix));
            assert_ne!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
        }
    }
}
2258
#[test]
fn test_is_str_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..units.len() {
        // Go through UTF-16 to obtain a `String` holding the code points.
        let text = String::from_utf16(&units[start..]).unwrap();
        assert!(is_str_latin1(&text[..]));
        assert_eq!(check_str_for_latin1_and_bidi(&text[..]), Latin1Bidi::Latin1);
    }
}
2273
#[test]
fn test_is_str_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Each step places one more code unit >= U+0100 into the suffix; the
        // overwritten units stay in place for later outer iterations.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(suffix).unwrap();
            assert!(!is_str_latin1(&text[..]));
            assert_ne!(check_str_for_latin1_and_bidi(&text[..]), Latin1Bidi::Latin1);
        }
    }
}
2292
#[test]
fn test_is_utf8_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..units.len() {
        // Go through UTF-16 to obtain the UTF-8 encoding of the code points.
        let text = String::from_utf16(&units[start..]).unwrap();
        let utf8 = text.as_bytes();
        assert!(is_utf8_latin1(utf8));
        assert_eq!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
    }
}
2310
#[test]
fn test_is_utf8_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|value| value as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Each step places one more code unit >= U+0100 into the suffix; the
        // overwritten units stay in place for later outer iterations.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(suffix).unwrap();
            let utf8 = text.as_bytes();
            assert!(!is_utf8_latin1(utf8));
            assert_ne!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
        }
    }
}
2332
#[test]
fn test_is_utf8_latin1_invalid() {
    // Truncated or malformed sequences, with and without an ASCII prefix.
    let malformed: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for &input in malformed.iter() {
        assert!(!is_utf8_latin1(input), "accepted {:?}", input);
    }
}
2342
#[test]
fn test_convert_utf8_to_utf16() {
    let input = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // The destination is one code unit longer than the UTF-8 byte length.
    let mut output = vec![0u16; input.len() + 1];
    let written = convert_utf8_to_utf16(input.as_bytes(), &mut output[..]);
    output.truncate(written);
    let expected: Vec<u16> = input.encode_utf16().collect();
    assert_eq!(output, expected);
}
2353
#[test]
fn test_convert_str_to_utf16() {
    let input = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // The destination has as many code units as the input has UTF-8 bytes.
    let mut output = vec![0u16; input.len()];
    let written = convert_str_to_utf16(input, &mut output[..]);
    output.truncate(written);
    let expected: Vec<u16> = input.encode_utf16().collect();
    assert_eq!(output, expected);
}
2364
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let input: Vec<u16> = expected.encode_utf16().collect();
    let mut output = vec![0u8; input.len() * 3 + 1];
    // First pass: only the first 24 bytes of the destination are offered, so
    // the conversion has to stop early.
    let (read, written) = convert_utf16_to_utf8_partial(&input[..], &mut output[..24]);
    // Second pass: convert whatever was not consumed into the remaining space.
    let total = written + convert_utf16_to_utf8(&input[read..], &mut output[written..]);
    output.truncate(total);
    assert_eq!(output, expected.as_bytes());
}
2376
#[test]
fn test_convert_utf16_to_utf8() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let input: Vec<u16> = expected.encode_utf16().collect();
    let mut output = vec![0u8; input.len() * 3 + 1];
    let written = convert_utf16_to_utf8(&input[..], &mut output[..]);
    output.truncate(written);
    assert_eq!(output, expected.as_bytes());
}
2387
#[test]
fn test_convert_latin1_to_utf16() {
    // Every byte value once; the expected output is the same values widened.
    let input: Vec<u8> = (0..256).map(|value| value as u8).collect();
    let expected: Vec<u16> = (0..256).map(|value| value as u16).collect();
    let mut output = vec![0u16; input.len()];
    convert_latin1_to_utf16(&input[..], &mut output[..]);
    assert_eq!(output, expected);
}
2403
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // A zeroed two-byte destination. The previous `[0u8, 2]` built the array
    // `[0, 2]` -- same length, but clearly a typo for the repeat form.
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    // Only the ASCII byte is expected to be consumed and written.
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
2411
#[test]
fn test_convert_latin1_to_utf8() {
    // Every byte value once, plus the same code points as UTF-16 so that the
    // standard library can produce the expected UTF-8.
    let input: Vec<u8> = (0..256).map(|value| value as u8).collect();
    let as_utf16: Vec<u16> = (0..256).map(|value| value as u16).collect();
    let expected = String::from_utf16(&as_utf16[..]).unwrap();
    let mut output = vec![0u8; input.len() * 2];
    let written = convert_latin1_to_utf8(&input[..], &mut output[..]);
    output.truncate(written);
    assert_eq!(&output[..], expected.as_bytes());
}
2429
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    // U+0000..U+00FF as UTF-8 input; the expected output is one byte per
    // code point.
    let expected: Vec<u8> = (0..256).map(|value| value as u8).collect();
    let as_utf16: Vec<u16> = (0..256).map(|value| value as u16).collect();
    let input = String::from_utf16(&as_utf16[..]).unwrap();
    let mut output = vec![0u8; input.len()];
    let written = convert_utf8_to_latin1_lossy(input.as_bytes(), &mut output[..]);
    output.truncate(written);
    assert_eq!(output, expected);
}
2447
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    // U+0100 is the first code point outside the Latin1 range.
    let input = "\u{100}";
    let mut output = [0u8; 16];
    let _ = convert_utf8_to_latin1_lossy(input.as_bytes(), &mut output[..]);
}
2455
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    // U+0000..U+00FF as UTF-16 input; the expected output is the same values
    // narrowed to bytes.
    let input: Vec<u16> = (0..256).map(|value| value as u16).collect();
    let expected: Vec<u8> = (0..256).map(|value| value as u8).collect();
    let mut output = vec![0u8; input.len()];
    convert_utf16_to_latin1_lossy(&input[..], &mut output[..]);
    assert_eq!(output, expected);
}
2471
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    // With `should_panic` commented out above, this only checks that
    // out-of-range input (U+0100) is handled without panicking.
    let input = [0x0100u16];
    let mut output = [0u8; 16];
    convert_utf16_to_latin1_lossy(&input, &mut output[..]);
}
2478
#[test]
fn test_utf16_valid_up_to() {
    // Each case: (number of leading zero code units, trailing code units,
    // expected result). All inputs are 16 code units long.
    let cases: [(usize, &[u16], usize); 4] = [
        // Fully valid, including a surrogate pair.
        (12, &[0x2603, 0xD83D, 0xDCA9, 0x00B6], 16),
        // Lone high surrogate.
        (13, &[0x2603, 0xD83D, 0x00B6], 14),
        // Lone low surrogate.
        (13, &[0x2603, 0xDCA9, 0x00B6], 14),
        // Lone high surrogate as the very last code unit.
        (13, &[0x2603, 0x00B6, 0xD83D], 15),
    ];
    for &(zeros, tail, expected) in cases.iter() {
        let mut input = vec![0u16; zeros];
        input.extend_from_slice(tail);
        assert_eq!(utf16_valid_up_to(&input[..]), expected);
    }
}
2502
#[test]
fn test_ensure_utf16_validity() {
    // 31 code units, all zero except for the surrogates placed below.
    let mut input = vec![0u16; 31];
    input[1] = 0xD83D; // lone high surrogate
    input[5] = 0xD83D; // properly paired ...
    input[6] = 0xDCA9; // ... with this low surrogate
    input[13] = 0xDCA9; // lone low surrogate

    // Only the two lone surrogates are expected to change.
    let mut expected = input.clone();
    expected[1] = 0xFFFD;
    expected[13] = 0xFFFD;

    ensure_utf16_validity(&mut input[..]);
    assert_eq!(input, expected);
}
2518
#[test]
fn test_is_char_bidi() {
    // Characters that must not be reported as bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    // Characters that must be reported as bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c), "U+{:04X} reported as bidi", c as u32);
    }
    for &c in bidi.iter() {
        assert!(is_char_bidi(c), "U+{:04X} not reported as bidi", c as u32);
    }
}
2544
#[test]
fn test_is_utf16_code_unit_bidi() {
    // Code units that must not be reported as bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    // Code units that must be reported as bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_code_unit_bidi(unit), "0x{:04X} reported as bidi", unit);
    }
    for &unit in bidi.iter() {
        assert!(is_utf16_code_unit_bidi(unit), "0x{:04X} not reported as bidi", unit);
    }
}
2571
#[test]
fn test_is_str_bidi() {
    // Strings that must not be reported as bidi.
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    // Strings that must be reported as bidi.
    let bidi: [&str; 15] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert!(!is_str_bidi(text), "{:?} reported as bidi", text);
    }
    for &text in bidi.iter() {
        assert!(is_str_bidi(text), "{:?} not reported as bidi", text);
    }
}
2597
#[test]
fn test_is_utf8_bidi() {
    // Inputs (passed as UTF-8 bytes) that must not be reported as bidi.
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    // Inputs (passed as UTF-8 bytes) that must be reported as bidi.
    let bidi: [&str; 15] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert!(!is_utf8_bidi(text.as_bytes()), "{:?} reported as bidi", text);
    }
    for &text in bidi.iter() {
        assert!(is_utf8_bidi(text.as_bytes()), "{:?} not reported as bidi", text);
    }
}
2667
#[test]
fn test_is_utf16_bidi() {
    // Builds the 17-unit input used by every single-unit case: the code unit
    // under test surrounded by eight Basic Latin units on each side.
    let surround = |mid: u16| -> [u16; 17] {
        [
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, mid, 0x62, 0x63, 0x64, 0x65, 0x66,
            0x67, 0x68, 0x69,
        ]
    };

    // Middle code units that must not make the buffer bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    // Middle code units that must make the buffer bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &mid in not_bidi.iter() {
        assert!(!is_utf16_bidi(&surround(mid)), "0x{:04X} reported as bidi", mid);
    }
    for &mid in bidi.iter() {
        assert!(is_utf16_bidi(&surround(mid)), "0x{:04X} not reported as bidi", mid);
    }

    // A bidi code unit immediately followed by a non-bidi, non-Latin one.
    assert!(is_utf16_bidi(&[
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ]));
}
2768
#[test]
fn test_check_str_for_latin1_and_bidi() {
    // Strings for which the result must be anything but `Bidi`.
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    // Strings for which the result must be exactly `Bidi`.
    let bidi: [&str; 15] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert_ne!(
            check_str_for_latin1_and_bidi(text),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
    for &text in bidi.iter() {
        assert_eq!(
            check_str_for_latin1_and_bidi(text),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
2860
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Inputs (passed as UTF-8 bytes) for which the result must be anything
    // but `Bidi`.
    let not_bidi: [&str; 7] = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop\u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop\u{3041}abcdefghijklmnop",
        "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop\u{202C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFF}abcdefghijklmnop",
    ];
    // Inputs (passed as UTF-8 bytes) for which the result must be exactly
    // `Bidi`.
    let bidi: [&str; 15] = [
        "abcdefghijklmnop\u{0590}abcdefghijklmnop",
        "abcdefghijklmnop\u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop\u{061C}abcdefghijklmnop",
        "abcdefghijklmnop\u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop\u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop\u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop\u{200F}abcdefghijklmnop",
        "abcdefghijklmnop\u{202B}abcdefghijklmnop",
        "abcdefghijklmnop\u{202E}abcdefghijklmnop",
        "abcdefghijklmnop\u{2067}abcdefghijklmnop",
        "abcdefghijklmnop\u{10800}abcdefghijklmnop",
        "abcdefghijklmnop\u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop\u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop",
    ];
    for &text in not_bidi.iter() {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
    for &text in bidi.iter() {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
2952
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // Classifies a 17-unit buffer: the code units 0x62..=0x69, then `middle`,
    // then 0x62..=0x69 again.
    let classify_around = |middle: u16| {
        check_utf16_for_latin1_and_bidi(&[
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, middle, 0x62, 0x63, 0x64, 0x65, 0x66,
            0x67, 0x68, 0x69,
        ])
    };

    // Middle code units for which the result must be something other than `Bidi`.
    let without_bidi = [0x0062u16, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in without_bidi.iter() {
        assert_ne!(
            classify_around(unit),
            Latin1Bidi::Bidi,
            "code unit: {:#06X}",
            unit
        );
    }

    // Middle code units for which the result must be `Bidi`.
    let with_bidi = [
        0x0590u16, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in with_bidi.iter() {
        assert_eq!(
            classify_around(unit),
            Latin1Bidi::Bidi,
            "code unit: {:#06X}",
            unit
        );
    }

    // An 18-unit buffer where 0x0590 is directly followed by 0x3041 (a code
    // unit that on its own is expected not to yield `Bidi`): the overall
    // result must still be `Bidi`.
    assert_eq!(
        check_utf16_for_latin1_and_bidi(&[
            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64,
            0x65, 0x66, 0x67, 0x68, 0x69,
        ]),
        Latin1Bidi::Bidi
    );
}
3125
/// Straightforward oracle against which the exhaustive tests compare the
/// per-character and per-string bidi checks.
///
/// Returns `true` when `c` lies in one of the listed code point ranges or is
/// one of the four listed individual code points, and `false` otherwise.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // Contiguous ranges that count as bidi in their entirety.
    let in_listed_range = match c {
        '\u{0590}'..='\u{08FF}' => true,
        '\u{FB1D}'..='\u{FDFF}' => true,
        '\u{FE70}'..='\u{FEFE}' => true,
        '\u{10800}'..='\u{10FFF}' => true,
        '\u{1E800}'..='\u{1EFFF}' => true,
        _ => false,
    };
    // Individual code points: U+200F RIGHT-TO-LEFT MARK, U+202B RIGHT-TO-LEFT
    // EMBEDDING, U+202E RIGHT-TO-LEFT OVERRIDE, U+2067 RIGHT-TO-LEFT ISOLATE.
    let is_listed_control = match c {
        '\u{200F}' | '\u{202B}' | '\u{202E}' | '\u{2067}' => true,
        _ => false,
    };
    in_listed_range || is_listed_control
}
3141
/// Straightforward oracle against which the exhaustive tests compare the
/// UTF-16 code-unit bidi checks.
///
/// Returns `true` when `u` falls in one of the listed Basic Multilingual
/// Plane ranges, is one of the four listed high surrogates, or is one of the
/// four listed individual code units; returns `false` otherwise.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    match u {
        // Contiguous BMP ranges.
        0x0590..=0x08FF | 0xFB1D..=0xFDFF | 0xFE70..=0xFEFE => true,
        // High surrogates that lead the pairs for U+10800..=U+10FFF
        // (0xD802, 0xD803) and U+1E800..=U+1EFFF (0xD83A, 0xD83B).
        0xD802 | 0xD803 | 0xD83A | 0xD83B => true,
        // U+200F RIGHT-TO-LEFT MARK, U+202B RIGHT-TO-LEFT EMBEDDING,
        // U+202E RIGHT-TO-LEFT OVERRIDE, U+2067 RIGHT-TO-LEFT ISOLATE.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        _ => false,
    }
}
3159
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    // Visit 0..0xD800 and then 0xE000..0x110000, i.e. everything below
    // 0x110000 except the surrogate values 0xD800..=0xDFFF, and compare
    // `is_char_bidi` with the reference classifier for each `char`.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|i| ::core::char::from_u32(i).unwrap()) {
        assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
    }
}
3172
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    // Compare against the reference classifier for every possible `u16`.
    for u in 0..=0xFFFFu16 {
        let actual = is_utf16_code_unit_bidi(u);
        let expected = reference_is_utf16_code_unit_bidi(u);
        assert_eq!(actual, expected);
    }
}
3184
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Scratch space for one UTF-8 encoded character.
    let mut buf = [0u8; 4];
    // Visit 0..0xD800 and then 0xE000..0x110000 (the surrogate values in
    // between are skipped) and check the single-character string made from
    // each `char` against the reference classifier.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|i| ::core::char::from_u32(i).unwrap()) {
        let encoded: &str = c.encode_utf8(&mut buf[..]);
        assert_eq!(is_str_bidi(encoded), reference_is_char_bidi(c));
    }
}
3204
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    let mut buf = [0u8; 8];
    // Visit 0..0xD800 and then 0xE000..0x110000; the surrogate values in
    // between are skipped.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|i| ::core::char::from_u32(i).unwrap()) {
        let expect = reference_is_char_bidi(c);

        // First check: exactly the bytes of the encoded character.
        let len = c.encode_utf8(&mut buf[..]).len();
        assert_eq!(is_utf8_bidi(&buf[..len]), expect);

        // Zero everything after the encoded character so that the second
        // check sees the character followed only by zero bytes.
        for stale in buf[len..].iter_mut() {
            *stale = 0;
        }

        // Second check: the whole eight-byte buffer.
        assert_eq!(is_utf8_bidi(&buf[..]), expect);
    }
}
3246
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    // A 32-unit buffer of zeros in which only index 15 varies; every
    // possible `u16` is placed there in turn.
    let mut buf = [0u16; 32];
    for u in 0..=0xFFFFu16 {
        buf[15] = u;
        let expected = reference_is_utf16_code_unit_bidi(u);
        assert_eq!(is_utf16_bidi(&buf[..]), expected);
    }
}
3260
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // Inputs ending in the ASCII letter 'a' (0x61) or made of ASCII only:
    // expected to be reported as not bidi.
    let not_bidi: [&[u8]; 3] = [b"\xD5\xBF\x61", b"\xD6\x80\x61", b"abc"];
    for &bytes in not_bidi.iter() {
        assert!(!is_utf8_bidi(bytes), "input: {:?}", bytes);
    }

    // The same prefixes, but ending in a lone 0xC2 lead byte with no
    // continuation byte after it: expected to be reported as bidi.
    let bidi: [&[u8]; 3] = [b"\xD5\xBF\xC2", b"\xD6\x80\xC2", b"ab\xC2"];
    for &bytes in bidi.iter() {
        assert!(is_utf8_bidi(bytes), "input: {:?}", bytes);
    }
}
3270
#[test]
fn test_decode_latin1() {
    // For the input b"ab" the result must be the borrowed variant.
    if let Cow::Borrowed(s) = decode_latin1(b"ab") {
        assert_eq!(s, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // The byte 0xE4 must decode to U+00E4.
    let decoded = decode_latin1(b"a\xE4");
    assert_eq!(decoded, "a\u{E4}");
}
3283
#[test]
fn test_encode_latin1_lossy() {
    // For the input "ab" the result must be the borrowed variant.
    if let Cow::Borrowed(s) = encode_latin1_lossy("ab") {
        assert_eq!(s, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // U+00E4 must encode to the single byte 0xE4.
    let encoded = encode_latin1_lossy("a\u{E4}");
    assert_eq!(encoded, &(b"a\xE4")[..]);
}
3296
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    // One buffer is reused for all calls, so slots that a call does not
    // write keep whatever an earlier call left there.
    let mut buf = [0u16; 5];

    // Two ASCII bytes.
    let written = convert_utf8_to_utf16_without_replacement(b"ab", &mut buf[..2]);
    assert_eq!(written, Some(2));
    assert_eq!(&buf[..3], &[u16::from(b'a'), u16::from(b'b'), 0][..]);

    // A two-byte sequence followed by ASCII.
    let written = convert_utf8_to_utf16_without_replacement(b"\xC3\xA4c", &mut buf[..3]);
    assert_eq!(written, Some(2));
    assert_eq!(&buf[..3], &[0xE4, u16::from(b'c'), 0][..]);

    // A lone three-byte sequence; slot 1 still holds the 'c' from the
    // previous call.
    let written = convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83", &mut buf[..3]);
    assert_eq!(written, Some(1));
    assert_eq!(&buf[..3], &[0x2603, u16::from(b'c'), 0][..]);

    // A three-byte sequence followed by ASCII.
    let written = convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83d", &mut buf[..4]);
    assert_eq!(written, Some(2));
    assert_eq!(&buf[..3], &[0x2603, u16::from(b'd'), 0][..]);

    // A three-byte sequence followed by a two-byte sequence.
    let written =
        convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83\xC3\xA4", &mut buf[..5]);
    assert_eq!(written, Some(2));
    assert_eq!(&buf[..3], &[0x2603, 0xE4, 0][..]);

    // A four-byte sequence, expected as the pair 0xD83D 0xDCCE.
    let written = convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8E", &mut buf[..4]);
    assert_eq!(written, Some(2));
    assert_eq!(&buf[..3], &[0xD83D, 0xDCCE, 0][..]);

    // A four-byte sequence followed by ASCII.
    let written = convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8Ee", &mut buf[..5]);
    assert_eq!(written, Some(3));
    assert_eq!(&buf[..3], &[0xD83D, 0xDCCE, u16::from(b'e')][..]);

    // A four-byte sequence cut off after three bytes is expected to yield `None`.
    let written = convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]);
    assert_eq!(written, None);
}
3354}
3355