1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! Functions for converting between different in-RAM representations of text
11//! and for quickly checking if the Unicode Bidirectional Algorithm can be
12//! avoided.
13//!
14//! By using slices for output, the functions here seek to enable by-register
15//! (ALU register or SIMD register as available) operations in order to
16//! outperform iterator-based conversions available in the Rust standard
17//! library.
18//!
19//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21//! in-memory encoding is sometimes used as a storage optimization of text
22//! when UTF-16 indexing and length semantics are exposed.
23//!
//! The FFI bindings for this module are in the
//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26
27#[cfg(feature = "alloc")]
28use alloc::borrow::Cow;
29#[cfg(feature = "alloc")]
30use alloc::string::String;
31#[cfg(feature = "alloc")]
32use alloc::vec::Vec;
33
34use super::in_inclusive_range16;
35use super::in_inclusive_range32;
36use super::in_inclusive_range8;
37use super::in_range16;
38use super::in_range32;
39use super::DecoderResult;
40use crate::ascii::*;
41use crate::utf_8::*;
42
// Forwards its arguments to `debug_assert!`, except that the assertion is
// skipped when the crate is compiled with `--cfg fuzzing`.
macro_rules! non_fuzz_debug_assert {
    ($($arg:tt)*) => (if !cfg!(fuzzing) { debug_assert!($($arg)*); })
}
46
47cfg_if! {
48 if #[cfg(feature = "simd-accel")] {
49 use ::core::intrinsics::likely;
50 use ::core::intrinsics::unlikely;
51 } else {
52 #[inline(always)]
53 fn likely(b: bool) -> bool {
54 b
55 }
56 #[inline(always)]
57 fn unlikely(b: bool) -> bool {
58 b
59 }
60 }
61}
62
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The discriminants are explicit and the layout is `repr(C)`, so the
/// numeric values 0, 1 and 2 are stable.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
78
// Word-sized mask with the high byte of every 16-bit lane set: a word of
// UTF-16 code units ANDed with this is zero iff every unit is below 0x100.
// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
82
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff no
// code unit in `buffer` has a bit of `$mask` set.
//
// * `$bound` is compared against single units (and against the OR of the
//   unaligned prefix) for early rejection.
// * `$mask` names a `usize` constant that is ANDed with whole-word reads.
//
// The expansion refers to `ALU_ALIGNMENT` and `ALU_ALIGNMENT_MASK`, which must
// be in scope at the invocation site. The function is not fail-fast: apart
// from the early exits below it ORs units together and tests at the end.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to consume one by one before `src + offset`
                // is aligned for word-sized reads.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // SAFETY: the check above guarantees that four consecutive
                        // reads of `ALU_ALIGNMENT / unit_size` units each stay inside
                        // `buffer`. `ALU_ALIGNMENT` is the size of `usize` and
                        // `unit_size` is the size of the pointee of `src`, so this
                        // is the same as four `usize` reads. The check at the bottom
                        // of the loop re-establishes this before every iteration.
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            // This check is what allows the next round of four reads.
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    while offset <= len_minus_stride {
                        // SAFETY: `offset <= len - stride`, so one more `usize`
                        // read stays inside `buffer`.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for word reads).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
162
// SIMD counterpart of the by-unit ALU check. Generates
// `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff every code
// unit in `buffer` passes the test.
//
// * `$splat` is the all-zero starting value of type `$simd_ty`.
// * `$bound` is the smallest unit value that fails the test; it is used for
//   the scalar prefix and tail.
// * `$func` takes a `$simd_ty` and returns `true` when all lanes pass.
//
// The expansion refers to `SIMD_STRIDE_SIZE`, `SIMD_ALIGNMENT` and
// `SIMD_ALIGNMENT_MASK`, which must be in scope at the invocation site.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            let mut offset = 0usize;
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to consume one by one before `src + offset`
                // is aligned for SIMD reads.
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // SAFETY: the check above guarantees that four consecutive
                        // reads of `SIMD_STRIDE_SIZE / unit_size` units each stay
                        // inside `buffer`. `SIMD_STRIDE_SIZE` is the size of
                        // `$simd_ty` and `unit_size` is the size of the pointee of
                        // `src`, so this is the same as four `$simd_ty` reads. The
                        // check at the bottom of the loop re-establishes this before
                        // every iteration.
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            // This check is what allows the next round of four reads.
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // SAFETY: `offset <= len - stride`, so one more `$simd_ty`
                        // read stays inside `buffer`.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Tail (or the whole buffer when it was too short for SIMD reads).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
247
248cfg_if! {
249 if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
250 use crate::simd_funcs::*;
251 use core::simd::u8x16;
252 use core::simd::u16x8;
253
254 const SIMD_ALIGNMENT: usize = 16;
255
256 const SIMD_ALIGNMENT_MASK: usize = 15;
257
258 by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
259 by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
260 by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);
261
262 #[inline(always)]
263 fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
264 // This function is a mess, because it simultaneously tries to do
265 // only aligned SIMD (perhaps misguidedly) and needs to deal with
266 // the last code unit in a SIMD stride being part of a valid
267 // surrogate pair.
268 let unit_size = ::core::mem::size_of::<u16>();
269 let src = buffer.as_ptr();
270 let len = buffer.len();
271 let mut offset = 0usize;
272 'outer: loop {
273 let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
274 SIMD_ALIGNMENT_MASK) / unit_size;
275 if until_alignment == 0 {
276 if offset + SIMD_STRIDE_SIZE / unit_size > len {
277 break;
278 }
279 } else {
280 let offset_plus_until_alignment = offset + until_alignment;
281 let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
282 if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
283 break;
284 }
285 let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
286 if up_to < until_alignment {
287 return offset + up_to;
288 }
289 if last_valid_low {
290 offset = offset_plus_until_alignment_plus_one;
291 continue;
292 }
293 offset = offset_plus_until_alignment;
294 }
295 let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
296 loop {
297 let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
298 if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
299 if offset_plus_stride == len {
300 break 'outer;
301 }
302 let offset_plus_stride_plus_one = offset_plus_stride + 1;
303 let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
304 if up_to < SIMD_STRIDE_SIZE / unit_size {
305 return offset + up_to;
306 }
307 if last_valid_low {
308 offset = offset_plus_stride_plus_one;
309 continue 'outer;
310 }
311 }
312 offset = offset_plus_stride;
313 if offset > len_minus_stride {
314 break 'outer;
315 }
316 }
317 }
318 let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
319 offset + up_to
320 }
321 } else {
322 by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
323 by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
324 by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);
325
326 #[inline(always)]
327 fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
328 let (up_to, _) = utf16_valid_up_to_alu(buffer);
329 up_to
330 }
331 }
332}
333
/// Scans `buffer` one code unit at a time and returns how many leading code
/// units form valid UTF-16, i.e. the index of the first unpaired surrogate
/// or `buffer.len()` if there is none.
///
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let len = buffer.len();
    if len == 0 {
        return (0, false);
    }
    let mut offset = 0usize;
    loop {
        let unit = buffer[offset];
        let next = offset + 1;
        // Wrapping subtraction folds the two-sided range test into a single
        // unsigned comparison.
        let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
        if unit_minus_surrogate_start > (0xDFFF - 0xD800) {
            // Not a surrogate
            offset = next;
            if offset == len {
                return (offset, false);
            }
            continue;
        }
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
            // high surrogate
            if next < len {
                let second = buffer[next];
                let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
                if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
                    // The next code unit is a low surrogate. Advance position.
                    offset = next + 1;
                    if offset == len {
                        return (offset, true);
                    }
                    continue;
                }
                // The next code unit is not a low surrogate. Don't advance
                // position and treat the high surrogate as unpaired.
                // fall through
            }
            // Unpaired, fall through
        }
        // Unpaired surrogate
        return (offset, false);
    }
}
379
380cfg_if! {
381 if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
382 #[inline(always)]
383 fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
384 let mut offset = 0usize;
385 let bytes = buffer.as_bytes();
386 let len = bytes.len();
387 if len >= SIMD_STRIDE_SIZE {
388 let src = bytes.as_ptr();
389 let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
390 SIMD_ALIGNMENT_MASK;
391 if until_alignment + SIMD_STRIDE_SIZE <= len {
392 while until_alignment != 0 {
393 if bytes[offset] > 0xC3 {
394 return Some(offset);
395 }
396 offset += 1;
397 until_alignment -= 1;
398 }
399 let len_minus_stride = len - SIMD_STRIDE_SIZE;
400 loop {
401 if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
402 // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
403 while bytes[offset] & 0xC0 == 0x80 {
404 offset += 1;
405 }
406 return Some(offset);
407 }
408 offset += SIMD_STRIDE_SIZE;
409 if offset > len_minus_stride {
410 break;
411 }
412 }
413 }
414 }
415 for i in offset..len {
416 if bytes[i] > 0xC3 {
417 return Some(i);
418 }
419 }
420 None
421 }
422 } else {
423 #[inline(always)]
424 fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
425 let mut bytes = buffer.as_bytes();
426 let mut total = 0;
427 loop {
428 if let Some((byte, offset)) = validate_ascii(bytes) {
429 total += offset;
430 if byte > 0xC3 {
431 return Some(total);
432 }
433 bytes = &bytes[offset + 2..];
434 total += 2;
435 } else {
436 return None;
437 }
438 }
439 }
440 }
441}
442
443#[inline(always)]
444fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
445 let mut bytes: &[u8] = buffer;
446 let mut total: usize = 0;
447 loop {
448 if let Some((byte: u8, offset: usize)) = validate_ascii(slice:bytes) {
449 total += offset;
450 if in_inclusive_range8(i:byte, start:0xC2, end:0xC3) {
451 let next: usize = offset + 1;
452 if next == bytes.len() {
453 return Some(total);
454 }
455 if bytes[next] & 0xC0 != 0x80 {
456 return Some(total);
457 }
458 bytes = &bytes[offset + 2..];
459 total += 2;
460 } else {
461 return Some(total);
462 }
463 } else {
464 return None;
465 }
466 }
467}
468
469cfg_if! {
470 if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
471 #[inline(always)]
472 fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
473 let mut offset = 0usize;
474 let len = buffer.len();
475 if len >= SIMD_STRIDE_SIZE / 2 {
476 let src = buffer.as_ptr();
477 let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
478 SIMD_ALIGNMENT_MASK) / 2;
479 if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
480 while until_alignment != 0 {
481 if is_utf16_code_unit_bidi(buffer[offset]) {
482 return true;
483 }
484 offset += 1;
485 until_alignment -= 1;
486 }
487 let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
488 loop {
489 if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
490 return true;
491 }
492 offset += SIMD_STRIDE_SIZE / 2;
493 if offset > len_minus_stride {
494 break;
495 }
496 }
497 }
498 }
499 for &u in &buffer[offset..] {
500 if is_utf16_code_unit_bidi(u) {
501 return true;
502 }
503 }
504 false
505 }
506 } else {
507 #[inline(always)]
508 fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
509 for &u in buffer {
510 if is_utf16_code_unit_bidi(u) {
511 return true;
512 }
513 }
514 false
515 }
516 }
517}
518
519cfg_if! {
520 if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
521 #[inline(always)]
522 fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
523 let mut offset = 0usize;
524 let len = buffer.len();
525 if len >= SIMD_STRIDE_SIZE / 2 {
526 let src = buffer.as_ptr();
527 let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
528 SIMD_ALIGNMENT_MASK) / 2;
529 if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
530 while until_alignment != 0 {
531 if buffer[offset] > 0xFF {
532 // This transition isn't optimal, since the aligment is recomputing
533 // but not tweaking further today.
534 if is_utf16_bidi_impl(&buffer[offset..]) {
535 return Latin1Bidi::Bidi;
536 }
537 return Latin1Bidi::LeftToRight;
538 }
539 offset += 1;
540 until_alignment -= 1;
541 }
542 let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
543 loop {
544 let mut s = unsafe { *(src.add(offset) as *const u16x8) };
545 if !simd_is_latin1(s) {
546 loop {
547 if is_u16x8_bidi(s) {
548 return Latin1Bidi::Bidi;
549 }
550 offset += SIMD_STRIDE_SIZE / 2;
551 if offset > len_minus_stride {
552 for &u in &buffer[offset..] {
553 if is_utf16_code_unit_bidi(u) {
554 return Latin1Bidi::Bidi;
555 }
556 }
557 return Latin1Bidi::LeftToRight;
558 }
559 s = unsafe { *(src.add(offset) as *const u16x8) };
560 }
561 }
562 offset += SIMD_STRIDE_SIZE / 2;
563 if offset > len_minus_stride {
564 break;
565 }
566 }
567 }
568 }
569 let mut iter = (&buffer[offset..]).iter();
570 loop {
571 if let Some(&u) = iter.next() {
572 if u > 0xFF {
573 let mut inner_u = u;
574 loop {
575 if is_utf16_code_unit_bidi(inner_u) {
576 return Latin1Bidi::Bidi;
577 }
578 if let Some(&code_unit) = iter.next() {
579 inner_u = code_unit;
580 } else {
581 return Latin1Bidi::LeftToRight;
582 }
583 }
584 }
585 } else {
586 return Latin1Bidi::Latin1;
587 }
588 }
589 }
590 } else {
591 #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
592 #[inline(always)]
593 fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
594 let mut offset = 0usize;
595 let len = buffer.len();
596 if len >= ALU_ALIGNMENT / 2 {
597 let src = buffer.as_ptr();
598 let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
599 ALU_ALIGNMENT_MASK) / 2;
600 if until_alignment + ALU_ALIGNMENT / 2 <= len {
601 while until_alignment != 0 {
602 if buffer[offset] > 0xFF {
603 if is_utf16_bidi_impl(&buffer[offset..]) {
604 return Latin1Bidi::Bidi;
605 }
606 return Latin1Bidi::LeftToRight;
607 }
608 offset += 1;
609 until_alignment -= 1;
610 }
611 let len_minus_stride = len - ALU_ALIGNMENT / 2;
612 loop {
613 if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
614 if is_utf16_bidi_impl(&buffer[offset..]) {
615 return Latin1Bidi::Bidi;
616 }
617 return Latin1Bidi::LeftToRight;
618 }
619 offset += ALU_ALIGNMENT / 2;
620 if offset > len_minus_stride {
621 break;
622 }
623 }
624 }
625 }
626 let mut iter = (&buffer[offset..]).iter();
627 loop {
628 if let Some(&u) = iter.next() {
629 if u > 0xFF {
630 let mut inner_u = u;
631 loop {
632 if is_utf16_code_unit_bidi(inner_u) {
633 return Latin1Bidi::Bidi;
634 }
635 if let Some(&code_unit) = iter.next() {
636 inner_u = code_unit;
637 } else {
638 return Latin1Bidi::LeftToRight;
639 }
640 }
641 }
642 } else {
643 return Latin1Bidi::Latin1;
644 }
645 }
646 }
647 }
648}
649
650/// Checks whether the buffer is all-ASCII.
651///
652/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
653/// is not guaranteed to fail fast.)
654pub fn is_ascii(buffer: &[u8]) -> bool {
655 is_ascii_impl(buffer)
656}
657
658/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
659/// only ASCII characters).
660///
661/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
662/// is not guaranteed to fail fast.)
663pub fn is_basic_latin(buffer: &[u16]) -> bool {
664 is_basic_latin_impl(buffer)
665}
666
667/// Checks whether the buffer is valid UTF-8 representing only code points
668/// less than or equal to U+00FF.
669///
670/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
671/// invalidity or code points above U+00FF are discovered.
672pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
673 is_utf8_latin1_impl(buffer).is_none()
674}
675
676/// Checks whether the buffer represents only code points less than or equal
677/// to U+00FF.
678///
679/// Fails fast. (I.e. returns before having read the whole buffer if code
680/// points above U+00FF are discovered.
681pub fn is_str_latin1(buffer: &str) -> bool {
682 is_str_latin1_impl(buffer).is_none()
683}
684
685/// Checks whether the buffer represents only code point less than or equal
686/// to U+00FF.
687///
688/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
689/// is not guaranteed to fail fast.)
690pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
691 is_utf16_latin1_impl(buffer)
692}
693
694/// Checks whether a potentially-invalid UTF-8 buffer contains code points
695/// that trigger right-to-left processing.
696///
697/// The check is done on a Unicode block basis without regard to assigned
698/// vs. unassigned code points in the block. Hebrew presentation forms in
699/// the Alphabetic Presentation Forms block are treated as if they formed
700/// a block on their own (i.e. it treated as right-to-left). Additionally,
701/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
702/// for. Control characters that are technically bidi controls but do not
703/// cause right-to-left behavior without the presence of right-to-left
704/// characters or right-to-left controls are not checked for. As a special
705/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
706///
707/// Returns `true` if the input is invalid UTF-8 or the input contains an
708/// RTL character. Returns `false` if the input is valid UTF-8 and contains
709/// no RTL characters.
710#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
711#[inline]
712pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
713 // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
714 // than UTF-8 validation followed by `is_str_bidi()` for German,
715 // Russian and Japanese. However, this is considerably slower for Thai.
716 // Chances are that the compiler makes some branch predictions that are
717 // unfortunate for Thai. Not spending the time to manually optimize
718 // further at this time, since it's unclear if this variant even has
719 // use cases. However, this is worth revisiting once Rust gets the
720 // ability to annotate relative priorities of match arms.
721
722 // U+058F: D6 8F
723 // U+0590: D6 90
724 // U+08FF: E0 A3 BF
725 // U+0900: E0 A4 80
726 //
727 // U+200F: E2 80 8F
728 // U+202B: E2 80 AB
729 // U+202E: E2 80 AE
730 // U+2067: E2 81 A7
731 //
732 // U+FB1C: EF AC 9C
733 // U+FB1D: EF AC 9D
734 // U+FDFF: EF B7 BF
735 // U+FE00: EF B8 80
736 //
737 // U+FE6F: EF B9 AF
738 // U+FE70: EF B9 B0
739 // U+FEFE: EF BB BE
740 // U+FEFF: EF BB BF
741 //
742 // U+107FF: F0 90 9F BF
743 // U+10800: F0 90 A0 80
744 // U+10FFF: F0 90 BF BF
745 // U+11000: F0 91 80 80
746 //
747 // U+1E7FF: F0 9E 9F BF
748 // U+1E800: F0 9E A0 80
749 // U+1EFFF: F0 9E BF BF
750 // U+1F000: F0 9F 80 80
751 let mut src = buffer;
752 'outer: loop {
753 if let Some((mut byte, mut read)) = validate_ascii(src) {
754 // Check for the longest sequence to avoid checking twice for the
755 // multi-byte sequences.
756 if read + 4 <= src.len() {
757 'inner: loop {
758 // At this point, `byte` is not included in `read`.
759 match byte {
760 0..=0x7F => {
761 // ASCII: go back to SIMD.
762 read += 1;
763 src = &src[read..];
764 continue 'outer;
765 }
766 0xC2..=0xD5 => {
767 // Two-byte
768 let second = unsafe { *(src.get_unchecked(read + 1)) };
769 if !in_inclusive_range8(second, 0x80, 0xBF) {
770 return true;
771 }
772 read += 2;
773 }
774 0xD6 => {
775 // Two-byte
776 let second = unsafe { *(src.get_unchecked(read + 1)) };
777 if !in_inclusive_range8(second, 0x80, 0xBF) {
778 return true;
779 }
780 // XXX consider folding the above and below checks
781 if second > 0x8F {
782 return true;
783 }
784 read += 2;
785 }
786 // two-byte starting with 0xD7 and above is bidi
787 0xE1 | 0xE3..=0xEC | 0xEE => {
788 // Three-byte normal
789 let second = unsafe { *(src.get_unchecked(read + 1)) };
790 let third = unsafe { *(src.get_unchecked(read + 2)) };
791 if ((UTF8_DATA.table[usize::from(second)]
792 & unsafe {
793 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
794 })
795 | (third >> 6))
796 != 2
797 {
798 return true;
799 }
800 read += 3;
801 }
802 0xE2 => {
803 // Three-byte normal, potentially bidi
804 let second = unsafe { *(src.get_unchecked(read + 1)) };
805 let third = unsafe { *(src.get_unchecked(read + 2)) };
806 if ((UTF8_DATA.table[usize::from(second)]
807 & unsafe {
808 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
809 })
810 | (third >> 6))
811 != 2
812 {
813 return true;
814 }
815 if second == 0x80 {
816 if third == 0x8F || third == 0xAB || third == 0xAE {
817 return true;
818 }
819 } else if second == 0x81 {
820 if third == 0xA7 {
821 return true;
822 }
823 }
824 read += 3;
825 }
826 0xEF => {
827 // Three-byte normal, potentially bidi
828 let second = unsafe { *(src.get_unchecked(read + 1)) };
829 let third = unsafe { *(src.get_unchecked(read + 2)) };
830 if ((UTF8_DATA.table[usize::from(second)]
831 & unsafe {
832 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
833 })
834 | (third >> 6))
835 != 2
836 {
837 return true;
838 }
839 if in_inclusive_range8(second, 0xAC, 0xB7) {
840 if second == 0xAC {
841 if third > 0x9C {
842 return true;
843 }
844 } else {
845 return true;
846 }
847 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
848 if second == 0xB9 {
849 if third > 0xAF {
850 return true;
851 }
852 } else if second == 0xBB {
853 if third != 0xBF {
854 return true;
855 }
856 } else {
857 return true;
858 }
859 }
860 read += 3;
861 }
862 0xE0 => {
863 // Three-byte special lower bound, potentially bidi
864 let second = unsafe { *(src.get_unchecked(read + 1)) };
865 let third = unsafe { *(src.get_unchecked(read + 2)) };
866 if ((UTF8_DATA.table[usize::from(second)]
867 & unsafe {
868 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
869 })
870 | (third >> 6))
871 != 2
872 {
873 return true;
874 }
875 // XXX can this be folded into the above validity check
876 if second < 0xA4 {
877 return true;
878 }
879 read += 3;
880 }
881 0xED => {
882 // Three-byte special upper bound
883 let second = unsafe { *(src.get_unchecked(read + 1)) };
884 let third = unsafe { *(src.get_unchecked(read + 2)) };
885 if ((UTF8_DATA.table[usize::from(second)]
886 & unsafe {
887 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
888 })
889 | (third >> 6))
890 != 2
891 {
892 return true;
893 }
894 read += 3;
895 }
896 0xF1..=0xF4 => {
897 // Four-byte normal
898 let second = unsafe { *(src.get_unchecked(read + 1)) };
899 let third = unsafe { *(src.get_unchecked(read + 2)) };
900 let fourth = unsafe { *(src.get_unchecked(read + 3)) };
901 if (u16::from(
902 UTF8_DATA.table[usize::from(second)]
903 & unsafe {
904 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
905 },
906 ) | u16::from(third >> 6)
907 | (u16::from(fourth & 0xC0) << 2))
908 != 0x202
909 {
910 return true;
911 }
912 read += 4;
913 }
914 0xF0 => {
915 // Four-byte special lower bound, potentially bidi
916 let second = unsafe { *(src.get_unchecked(read + 1)) };
917 let third = unsafe { *(src.get_unchecked(read + 2)) };
918 let fourth = unsafe { *(src.get_unchecked(read + 3)) };
919 if (u16::from(
920 UTF8_DATA.table[usize::from(second)]
921 & unsafe {
922 *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
923 },
924 ) | u16::from(third >> 6)
925 | (u16::from(fourth & 0xC0) << 2))
926 != 0x202
927 {
928 return true;
929 }
930 if unlikely(second == 0x90 || second == 0x9E) {
931 let third = src[read + 2];
932 if third >= 0xA0 {
933 return true;
934 }
935 }
936 read += 4;
937 }
938 _ => {
939 // Invalid lead or bidi-only lead
940 return true;
941 }
942 }
943 if read + 4 > src.len() {
944 if read == src.len() {
945 return false;
946 }
947 byte = src[read];
948 break 'inner;
949 }
950 byte = src[read];
951 continue 'inner;
952 }
953 }
954 // We can't have a complete 4-byte sequence, but we could still have
955 // a complete shorter sequence.
956
957 // At this point, `byte` is not included in `read`.
958 match byte {
959 0..=0x7F => {
960 // ASCII: go back to SIMD.
961 read += 1;
962 src = &src[read..];
963 continue 'outer;
964 }
965 0xC2..=0xD5 => {
966 // Two-byte
967 let new_read = read + 2;
968 if new_read > src.len() {
969 return true;
970 }
971 let second = unsafe { *(src.get_unchecked(read + 1)) };
972 if !in_inclusive_range8(second, 0x80, 0xBF) {
973 return true;
974 }
975 read = new_read;
976 // We need to deal with the case where we came here with 3 bytes
977 // left, so we need to take a look at the last one.
978 src = &src[read..];
979 continue 'outer;
980 }
981 0xD6 => {
982 // Two-byte, potentially bidi
983 let new_read = read + 2;
984 if new_read > src.len() {
985 return true;
986 }
987 let second = unsafe { *(src.get_unchecked(read + 1)) };
988 if !in_inclusive_range8(second, 0x80, 0xBF) {
989 return true;
990 }
991 // XXX consider folding the above and below checks
992 if second > 0x8F {
993 return true;
994 }
995 read = new_read;
996 // We need to deal with the case where we came here with 3 bytes
997 // left, so we need to take a look at the last one.
998 src = &src[read..];
999 continue 'outer;
1000 }
1001 // two-byte starting with 0xD7 and above is bidi
1002 0xE1 | 0xE3..=0xEC | 0xEE => {
1003 // Three-byte normal
1004 let new_read = read + 3;
1005 if new_read > src.len() {
1006 return true;
1007 }
1008 let second = unsafe { *(src.get_unchecked(read + 1)) };
1009 let third = unsafe { *(src.get_unchecked(read + 2)) };
1010 if ((UTF8_DATA.table[usize::from(second)]
1011 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1012 | (third >> 6))
1013 != 2
1014 {
1015 return true;
1016 }
1017 }
1018 0xE2 => {
1019 // Three-byte normal, potentially bidi
1020 let new_read = read + 3;
1021 if new_read > src.len() {
1022 return true;
1023 }
1024 let second = unsafe { *(src.get_unchecked(read + 1)) };
1025 let third = unsafe { *(src.get_unchecked(read + 2)) };
1026 if ((UTF8_DATA.table[usize::from(second)]
1027 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1028 | (third >> 6))
1029 != 2
1030 {
1031 return true;
1032 }
1033 if second == 0x80 {
1034 if third == 0x8F || third == 0xAB || third == 0xAE {
1035 return true;
1036 }
1037 } else if second == 0x81 {
1038 if third == 0xA7 {
1039 return true;
1040 }
1041 }
1042 }
1043 0xEF => {
1044 // Three-byte normal, potentially bidi
1045 let new_read = read + 3;
1046 if new_read > src.len() {
1047 return true;
1048 }
1049 let second = unsafe { *(src.get_unchecked(read + 1)) };
1050 let third = unsafe { *(src.get_unchecked(read + 2)) };
1051 if ((UTF8_DATA.table[usize::from(second)]
1052 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1053 | (third >> 6))
1054 != 2
1055 {
1056 return true;
1057 }
1058 if in_inclusive_range8(second, 0xAC, 0xB7) {
1059 if second == 0xAC {
1060 if third > 0x9C {
1061 return true;
1062 }
1063 } else {
1064 return true;
1065 }
1066 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1067 if second == 0xB9 {
1068 if third > 0xAF {
1069 return true;
1070 }
1071 } else if second == 0xBB {
1072 if third != 0xBF {
1073 return true;
1074 }
1075 } else {
1076 return true;
1077 }
1078 }
1079 }
1080 0xE0 => {
1081 // Three-byte special lower bound, potentially bidi
1082 let new_read = read + 3;
1083 if new_read > src.len() {
1084 return true;
1085 }
1086 let second = unsafe { *(src.get_unchecked(read + 1)) };
1087 let third = unsafe { *(src.get_unchecked(read + 2)) };
1088 if ((UTF8_DATA.table[usize::from(second)]
1089 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1090 | (third >> 6))
1091 != 2
1092 {
1093 return true;
1094 }
1095 // XXX can this be folded into the above validity check
1096 if second < 0xA4 {
1097 return true;
1098 }
1099 }
1100 0xED => {
1101 // Three-byte special upper bound
1102 let new_read = read + 3;
1103 if new_read > src.len() {
1104 return true;
1105 }
1106 let second = unsafe { *(src.get_unchecked(read + 1)) };
1107 let third = unsafe { *(src.get_unchecked(read + 2)) };
1108 if ((UTF8_DATA.table[usize::from(second)]
1109 & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
1110 | (third >> 6))
1111 != 2
1112 {
1113 return true;
1114 }
1115 }
1116 _ => {
1117 // Invalid lead, 4-byte lead or 2-byte bidi-only lead
1118 return true;
1119 }
1120 }
1121 return false;
1122 } else {
1123 return false;
1124 }
1125 }
1126}
1127
1128/// Checks whether a valid UTF-8 buffer contains code points that trigger
1129/// right-to-left processing.
1130///
1131/// The check is done on a Unicode block basis without regard to assigned
1132/// vs. unassigned code points in the block. Hebrew presentation forms in
1133/// the Alphabetic Presentation Forms block are treated as if they formed
1134/// a block on their own (i.e. it treated as right-to-left). Additionally,
1135/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1136/// for. Control characters that are technically bidi controls but do not
1137/// cause right-to-left behavior without the presence of right-to-left
1138/// characters or right-to-left controls are not checked for. As a special
1139/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1140#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
1141#[inline]
1142pub fn is_str_bidi(buffer: &str) -> bool {
1143 // U+058F: D6 8F
1144 // U+0590: D6 90
1145 // U+08FF: E0 A3 BF
1146 // U+0900: E0 A4 80
1147 //
1148 // U+200F: E2 80 8F
1149 // U+202B: E2 80 AB
1150 // U+202E: E2 80 AE
1151 // U+2067: E2 81 A7
1152 //
1153 // U+FB1C: EF AC 9C
1154 // U+FB1D: EF AC 9D
1155 // U+FDFF: EF B7 BF
1156 // U+FE00: EF B8 80
1157 //
1158 // U+FE6F: EF B9 AF
1159 // U+FE70: EF B9 B0
1160 // U+FEFE: EF BB BE
1161 // U+FEFF: EF BB BF
1162 //
1163 // U+107FF: F0 90 9F BF
1164 // U+10800: F0 90 A0 80
1165 // U+10FFF: F0 90 BF BF
1166 // U+11000: F0 91 80 80
1167 //
1168 // U+1E7FF: F0 9E 9F BF
1169 // U+1E800: F0 9E A0 80
1170 // U+1EFFF: F0 9E BF BF
1171 // U+1F000: F0 9F 80 80
1172 let mut bytes = buffer.as_bytes();
1173 'outer: loop {
1174 // TODO: Instead of just validating ASCII using SIMD, use SIMD
1175 // to check for non-ASCII lead bytes, too, to quickly conclude
1176 // that the vector consist entirely of CJK and below-Hebrew
1177 // code points.
1178 // Unfortunately, scripts above Arabic but below CJK share
1179 // lead bytes with RTL.
1180 if let Some((mut byte, mut read)) = validate_ascii(bytes) {
1181 'inner: loop {
1182 // At this point, `byte` is not included in `read`.
1183 if byte < 0xE0 {
1184 if byte >= 0x80 {
1185 // Two-byte
1186 // Adding `unlikely` here improved throughput on
1187 // Russian plain text by 33%!
1188 if unlikely(byte >= 0xD6) {
1189 if byte == 0xD6 {
1190 let second = bytes[read + 1];
1191 if second > 0x8F {
1192 return true;
1193 }
1194 } else {
1195 return true;
1196 }
1197 }
1198 read += 2;
1199 } else {
1200 // ASCII: write and go back to SIMD.
1201 read += 1;
1202 // Intuitively, we should go back to the outer loop only
1203 // if byte is 0x30 or above, so as to avoid trashing on
1204 // ASCII space, comma and period in non-Latin context.
1205 // However, the extra branch seems to cost more than it's
1206 // worth.
1207 bytes = &bytes[read..];
1208 continue 'outer;
1209 }
1210 } else if byte < 0xF0 {
1211 // Three-byte
1212 if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
1213 let second = bytes[read + 1];
1214 if byte == 0xE0 {
1215 if second < 0xA4 {
1216 return true;
1217 }
1218 } else if byte == 0xE2 {
1219 let third = bytes[read + 2];
1220 if second == 0x80 {
1221 if third == 0x8F || third == 0xAB || third == 0xAE {
1222 return true;
1223 }
1224 } else if second == 0x81 {
1225 if third == 0xA7 {
1226 return true;
1227 }
1228 }
1229 } else {
1230 debug_assert_eq!(byte, 0xEF);
1231 if in_inclusive_range8(second, 0xAC, 0xB7) {
1232 if second == 0xAC {
1233 let third = bytes[read + 2];
1234 if third > 0x9C {
1235 return true;
1236 }
1237 } else {
1238 return true;
1239 }
1240 } else if in_inclusive_range8(second, 0xB9, 0xBB) {
1241 if second == 0xB9 {
1242 let third = bytes[read + 2];
1243 if third > 0xAF {
1244 return true;
1245 }
1246 } else if second == 0xBB {
1247 let third = bytes[read + 2];
1248 if third != 0xBF {
1249 return true;
1250 }
1251 } else {
1252 return true;
1253 }
1254 }
1255 }
1256 }
1257 read += 3;
1258 } else {
1259 // Four-byte
1260 let second = bytes[read + 1];
1261 if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
1262 let third = bytes[read + 2];
1263 if third >= 0xA0 {
1264 return true;
1265 }
1266 }
1267 read += 4;
1268 }
1269 // The comparison is always < or == and never >, but including
1270 // > here to let the compiler assume that < is true if this
1271 // comparison is false.
1272 if read >= bytes.len() {
1273 return false;
1274 }
1275 byte = bytes[read];
1276 continue 'inner;
1277 }
1278 } else {
1279 return false;
1280 }
1281 }
1282}
1283
1284/// Checks whether a UTF-16 buffer contains code points that trigger
1285/// right-to-left processing.
1286///
1287/// The check is done on a Unicode block basis without regard to assigned
1288/// vs. unassigned code points in the block. Hebrew presentation forms in
1289/// the Alphabetic Presentation Forms block are treated as if they formed
1290/// a block on their own (i.e. it treated as right-to-left). Additionally,
1291/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
1292/// for. Control characters that are technically bidi controls but do not
1293/// cause right-to-left behavior without the presence of right-to-left
1294/// characters or right-to-left controls are not checked for. As a special
1295/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
1296///
1297/// Returns `true` if the input contains an RTL character or an unpaired
1298/// high surrogate that could be the high half of an RTL character.
1299/// Returns `false` if the input contains neither RTL characters nor
1300/// unpaired high surrogates that could be higher halves of RTL characters.
1301pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
1302 is_utf16_bidi_impl(buffer)
1303}
1304
/// Reports whether a scalar value triggers right-to-left processing.
///
/// Detection works at Unicode-block granularity and does not distinguish
/// assigned from unassigned code points within a block. The Hebrew
/// presentation forms in the Alphabetic Presentation Forms block are handled
/// as though they were a block of their own (i.e. they are treated as
/// right-to-left). The four RIGHT-TO-LEFT FOO controls in General
/// Punctuation are detected as well. Characters that are formally bidi
/// controls but cannot cause right-to-left behavior unless right-to-left
/// characters or right-to-left controls are also present are ignored.
/// U+FEFF is special-cased: it is not counted as part of Arabic
/// Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    // Sources for the ranges:
    // https://www.unicode.org/charts/PDF/U2000.pdf (controls)
    // https://www.unicode.org/roadmaps/bmp/
    // https://www.unicode.org/roadmaps/smp/
    match u32::from(c) {
        // BMP RTL: Hebrew through Arabic Extended-A.
        0x0590..=0x08FF => true,
        // Every control with RIGHT-TO-LEFT in its name:
        // RLM, RLE, RLO and RLI.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // Hebrew presentation forms and Arabic Presentation Forms A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms B (excl. BOM).
        0xFE70..=0xFEFE => true,
        // Supplementary RTL (lead surrogate U+D802 or U+D803).
        0x10800..=0x10FFF => true,
        // Supplementary RTL (lead surrogate U+D83A or U+D83B).
        0x1E800..=0x1EFFF => true,
        // Below Hebrew, the gaps between the ranges above, and everything
        // above the second astral RTL range. (Emoji is here.)
        _ => false,
    }
}
1372
/// Reports whether a UTF-16 code unit triggers right-to-left processing.
///
/// Detection works at Unicode-block granularity and does not distinguish
/// assigned from unassigned code points within a block. The Hebrew
/// presentation forms in the Alphabetic Presentation Forms block are handled
/// as though they were a block of their own (i.e. they are treated as
/// right-to-left). The four RIGHT-TO-LEFT FOO controls in General
/// Punctuation are detected as well. Characters that are formally bidi
/// controls but cannot cause right-to-left behavior unless right-to-left
/// characters or right-to-left controls are also present are ignored.
/// U+FEFF is special-cased: it is not counted as part of Arabic
/// Presentation Forms-B.
///
/// The supplementary-plane right-to-left blocks can be recognized from the
/// high surrogate alone, so this function answers `true` for those high
/// surrogates without looking at a low surrogate. That makes it usable on
/// supplementary-plane text without decoding surrogate pairs into scalar
/// values, at the price of reporting such a high surrogate as right-to-left
/// even when it is actually unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    match u {
        // Hebrew through Arabic Extended-A.
        0x0590..=0x08FF => true,
        // The RTL controls: RLM, RLE, RLO and RLI.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // High surrogates of the astral RTL ranges.
        0xD802 | 0xD803 | 0xD83A | 0xD83B => true,
        // Hebrew presentation forms and Arabic Presentation Forms A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms B (excl. BOM).
        0xFE70..=0xFEFE => true,
        // Below Hebrew, between the ranges above (Emoji high surrogates are
        // here), and above Arabic Presentation Forms.
        _ => false,
    }
}
1424
1425/// Checks whether a potentially invalid UTF-8 buffer contains code points
1426/// that trigger right-to-left processing or is all-Latin1.
1427///
1428/// Possibly more efficient than performing the checks separately.
1429///
1430/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1431/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1432/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1433pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1434 if let Some(offset: usize) = is_utf8_latin1_impl(buffer) {
1435 if is_utf8_bidi(&buffer[offset..]) {
1436 Latin1Bidi::Bidi
1437 } else {
1438 Latin1Bidi::LeftToRight
1439 }
1440 } else {
1441 Latin1Bidi::Latin1
1442 }
1443}
1444
1445/// Checks whether a valid UTF-8 buffer contains code points
1446/// that trigger right-to-left processing or is all-Latin1.
1447///
1448/// Possibly more efficient than performing the checks separately.
1449///
1450/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1451/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1452/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1453pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1454 // The transition from the latin1 check to the bidi check isn't
1455 // optimal but not tweaking it to perfection today.
1456 if let Some(offset: usize) = is_str_latin1_impl(buffer) {
1457 if is_str_bidi(&buffer[offset..]) {
1458 Latin1Bidi::Bidi
1459 } else {
1460 Latin1Bidi::LeftToRight
1461 }
1462 } else {
1463 Latin1Bidi::Latin1
1464 }
1465}
1466
1467/// Checks whether a potentially invalid UTF-16 buffer contains code points
1468/// that trigger right-to-left processing or is all-Latin1.
1469///
1470/// Possibly more efficient than performing the checks separately.
1471///
1472/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
1473/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
1474/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1475pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
1476 check_utf16_for_latin1_and_bidi_impl(buffer)
1477}
1478
1479/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1480/// with the REPLACEMENT CHARACTER.
1481///
1482/// The length of the destination buffer must be at least the length of the
1483/// source buffer _plus one_.
1484///
1485/// Returns the number of `u16`s written.
1486///
1487/// # Panics
1488///
1489/// Panics if the destination buffer is shorter than stated above.
1490pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1491 // TODO: Can the requirement for dst to be at least one unit longer
1492 // be eliminated?
1493 assert!(dst.len() > src.len());
1494 let mut decoder = Utf8Decoder::new_inner();
1495 let mut total_read = 0usize;
1496 let mut total_written = 0usize;
1497 loop {
1498 let (result, read, written) =
1499 decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1500 total_read += read;
1501 total_written += written;
1502 match result {
1503 DecoderResult::InputEmpty => {
1504 return total_written;
1505 }
1506 DecoderResult::OutputFull => {
1507 unreachable!("The assert at the top of the function should have caught this.");
1508 }
1509 DecoderResult::Malformed(_, _) => {
1510 // There should always be space for the U+FFFD, because
1511 // otherwise we'd have gotten OutputFull already.
1512 dst[total_written] = 0xFFFD;
1513 total_written += 1;
1514 }
1515 }
1516 }
1517}
1518
1519/// Converts valid UTF-8 to valid UTF-16.
1520///
1521/// The length of the destination buffer must be at least the length of the
1522/// source buffer.
1523///
1524/// Returns the number of `u16`s written.
1525///
1526/// # Panics
1527///
1528/// Panics if the destination buffer is shorter than stated above.
1529pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
1530 assert!(
1531 dst.len() >= src.len(),
1532 "Destination must not be shorter than the source."
1533 );
1534 let bytes = src.as_bytes();
1535 let mut read = 0;
1536 let mut written = 0;
1537 'outer: loop {
1538 let mut byte = {
1539 let src_remaining = &bytes[read..];
1540 let dst_remaining = &mut dst[written..];
1541 let length = src_remaining.len();
1542 match unsafe {
1543 ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
1544 } {
1545 None => {
1546 written += length;
1547 return written;
1548 }
1549 Some((non_ascii, consumed)) => {
1550 read += consumed;
1551 written += consumed;
1552 non_ascii
1553 }
1554 }
1555 };
1556 'inner: loop {
1557 // At this point, `byte` is not included in `read`.
1558 if byte < 0xE0 {
1559 if byte >= 0x80 {
1560 // Two-byte
1561 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1562 let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
1563 unsafe { *(dst.get_unchecked_mut(written)) = point };
1564 read += 2;
1565 written += 1;
1566 } else {
1567 // ASCII: write and go back to SIMD.
1568 unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
1569 read += 1;
1570 written += 1;
1571 // Intuitively, we should go back to the outer loop only
1572 // if byte is 0x30 or above, so as to avoid trashing on
1573 // ASCII space, comma and period in non-Latin context.
1574 // However, the extra branch seems to cost more than it's
1575 // worth.
1576 continue 'outer;
1577 }
1578 } else if byte < 0xF0 {
1579 // Three-byte
1580 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1581 let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1582 let point = ((u16::from(byte) & 0xF) << 12)
1583 | ((u16::from(second) & 0x3F) << 6)
1584 | (u16::from(third) & 0x3F);
1585 unsafe { *(dst.get_unchecked_mut(written)) = point };
1586 read += 3;
1587 written += 1;
1588 } else {
1589 // Four-byte
1590 let second = unsafe { *(bytes.get_unchecked(read + 1)) };
1591 let third = unsafe { *(bytes.get_unchecked(read + 2)) };
1592 let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
1593 let point = ((u32::from(byte) & 0x7) << 18)
1594 | ((u32::from(second) & 0x3F) << 12)
1595 | ((u32::from(third) & 0x3F) << 6)
1596 | (u32::from(fourth) & 0x3F);
1597 unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
1598 unsafe {
1599 *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
1600 };
1601 read += 4;
1602 written += 2;
1603 }
1604 // The comparison is always < or == and never >, but including
1605 // > here to let the compiler assume that < is true if this
1606 // comparison is false.
1607 if read >= src.len() {
1608 return written;
1609 }
1610 byte = bytes[read];
1611 continue 'inner;
1612 }
1613 }
1614}
1615
1616/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1617///
1618/// The length of the destination buffer must be at least the length of the
1619/// source buffer.
1620///
1621/// Returns the number of `u16`s written or `None` if the input was invalid.
1622///
1623/// When the input was invalid, some output may have been written.
1624///
1625/// # Panics
1626///
1627/// Panics if the destination buffer is shorter than stated above.
1628pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1629 assert!(
1630 dst.len() >= src.len(),
1631 "Destination must not be shorter than the source."
1632 );
1633 let (read: usize, written: usize) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1634 if read == src.len() {
1635 return Some(written);
1636 }
1637 None
1638}
1639
1640/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1641/// with the REPLACEMENT CHARACTER with potentially insufficient output
1642/// space.
1643///
1644/// Returns the number of code units read and the number of bytes written.
1645///
1646/// Guarantees that the bytes in the destination beyond the number of
1647/// bytes claimed as written by the second item of the return tuple
1648/// are left unmodified.
1649///
1650/// Not all code units are read if there isn't enough output space.
1651///
1652/// Note that this method isn't designed for general streamability but for
1653/// not allocating memory for the worst case up front. Specifically,
1654/// if the input starts with or ends with an unpaired surrogate, those are
1655/// replaced with the REPLACEMENT CHARACTER.
1656///
1657/// Matches the semantics of `TextEncoder.encodeInto()` from the
1658/// Encoding Standard.
1659///
1660/// # Safety
1661///
1662/// If you want to convert into a `&mut str`, use
1663/// `convert_utf16_to_str_partial()` instead of using this function
1664/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1665#[inline(always)]
1666pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
1667 // The two functions called below are marked `inline(never)` to make
1668 // transitions from the hot part (first function) into the cold part
1669 // (second function) go through a return and another call to discouge
1670 // the CPU from speculating from the hot code into the cold code.
1671 // Letting the transitions be mere intra-function jumps, even to
1672 // basic blocks out-of-lined to the end of the function would wipe
1673 // away a quarter of Arabic encode performance on Haswell!
1674 let (read: usize, written: usize) = convert_utf16_to_utf8_partial_inner(src, dst);
1675 if likely(read == src.len()) {
1676 return (read, written);
1677 }
1678 let (tail_read: usize, tail_written: usize) =
1679 convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
1680 (read + tail_read, written + tail_written)
1681}
1682
1683/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1684/// with the REPLACEMENT CHARACTER.
1685///
1686/// The length of the destination buffer must be at least the length of the
1687/// source buffer times three.
1688///
1689/// Returns the number of bytes written.
1690///
1691/// # Panics
1692///
1693/// Panics if the destination buffer is shorter than stated above.
1694///
1695/// # Safety
1696///
1697/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1698/// instead of using this function together with the `unsafe` method
1699/// `as_bytes_mut()` on `&mut str`.
1700#[inline(always)]
1701pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1702 assert!(dst.len() >= src.len() * 3);
1703 let (read: usize, written: usize) = convert_utf16_to_utf8_partial(src, dst);
1704 debug_assert_eq!(read, src.len());
1705 written
1706}
1707
1708/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1709/// with the REPLACEMENT CHARACTER such that the validity of the output is
1710/// signaled using the Rust type system with potentially insufficient output
1711/// space.
1712///
1713/// Returns the number of code units read and the number of bytes written.
1714///
1715/// Not all code units are read if there isn't enough output space.
1716///
1717/// Note that this method isn't designed for general streamability but for
1718/// not allocating memory for the worst case up front. Specifically,
1719/// if the input starts with or ends with an unpaired surrogate, those are
1720/// replaced with the REPLACEMENT CHARACTER.
1721pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1722 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1723 let (read: usize, written: usize) = convert_utf16_to_utf8_partial(src, dst:bytes);
1724 let len: usize = bytes.len();
1725 let mut trail: usize = written;
1726 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1727 bytes[trail] = 0;
1728 trail += 1;
1729 }
1730 (read, written)
1731}
1732
1733/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1734/// with the REPLACEMENT CHARACTER such that the validity of the output is
1735/// signaled using the Rust type system.
1736///
1737/// The length of the destination buffer must be at least the length of the
1738/// source buffer times three.
1739///
1740/// Returns the number of bytes written.
1741///
1742/// # Panics
1743///
1744/// Panics if the destination buffer is shorter than stated above.
1745#[inline(always)]
1746pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1747 assert!(dst.len() >= src.len() * 3);
1748 let (read: usize, written: usize) = convert_utf16_to_str_partial(src, dst);
1749 debug_assert_eq!(read, src.len());
1750 written
1751}
1752
1753/// Converts bytes whose unsigned value is interpreted as Unicode code point
1754/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1755///
1756/// The length of the destination buffer must be at least the length of the
1757/// source buffer.
1758///
1759/// The number of `u16`s written equals the length of the source buffer.
1760///
1761/// # Panics
1762///
1763/// Panics if the destination buffer is shorter than stated above.
1764pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1765 assert!(
1766 dst.len() >= src.len(),
1767 "Destination must not be shorter than the source."
1768 );
1769 // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1770 // instructions and this code, but, yet, the autovectorized version is
1771 // faster.
1772 unsafe {
1773 unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1774 }
1775}
1776
1777/// Converts bytes whose unsigned value is interpreted as Unicode code point
1778/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
1779/// output space.
1780///
1781/// Returns the number of bytes read and the number of bytes written.
1782///
1783/// If the output isn't large enough, not all input is consumed.
1784///
1785/// # Safety
1786///
1787/// If you want to convert into a `&mut str`, use
1788/// `convert_utf16_to_str_partial()` instead of using this function
1789/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
1790pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
1791 let src_len = src.len();
1792 let src_ptr = src.as_ptr();
1793 let dst_ptr = dst.as_mut_ptr();
1794 let dst_len = dst.len();
1795 let mut total_read = 0usize;
1796 let mut total_written = 0usize;
1797 loop {
1798 // src can't advance more than dst
1799 let src_left = src_len - total_read;
1800 let dst_left = dst_len - total_written;
1801 let min_left = ::core::cmp::min(src_left, dst_left);
1802 if let Some((non_ascii, consumed)) = unsafe {
1803 ascii_to_ascii(
1804 src_ptr.add(total_read),
1805 dst_ptr.add(total_written),
1806 min_left,
1807 )
1808 } {
1809 total_read += consumed;
1810 total_written += consumed;
1811 if total_written.checked_add(2).unwrap() > dst_len {
1812 return (total_read, total_written);
1813 }
1814
1815 total_read += 1; // consume `non_ascii`
1816
1817 dst[total_written] = (non_ascii >> 6) | 0xC0;
1818 total_written += 1;
1819 dst[total_written] = (non_ascii & 0x3F) | 0x80;
1820 total_written += 1;
1821 continue;
1822 }
1823 return (total_read + min_left, total_written + min_left);
1824 }
1825}
1826
1827/// Converts bytes whose unsigned value is interpreted as Unicode code point
1828/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1829///
1830/// The length of the destination buffer must be at least the length of the
1831/// source buffer times two.
1832///
1833/// Returns the number of bytes written.
1834///
1835/// # Panics
1836///
1837/// Panics if the destination buffer is shorter than stated above.
1838///
1839/// # Safety
1840///
1841/// Note that this function may write garbage beyond the number of bytes
1842/// indicated by the return value, so using a `&mut str` interpreted as
1843/// `&mut [u8]` as the destination is not safe. If you want to convert into
1844/// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1845#[inline]
1846pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1847 assert!(
1848 dst.len() >= src.len() * 2,
1849 "Destination must not be shorter than the source times two."
1850 );
1851 let (read: usize, written: usize) = convert_latin1_to_utf8_partial(src, dst);
1852 debug_assert_eq!(read, src.len());
1853 written
1854}
1855
1856/// Converts bytes whose unsigned value is interpreted as Unicode code point
1857/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1858/// output is signaled using the Rust type system with potentially insufficient
1859/// output space.
1860///
1861/// Returns the number of bytes read and the number of bytes written.
1862///
1863/// If the output isn't large enough, not all input is consumed.
1864#[inline]
1865pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1866 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1867 let (read: usize, written: usize) = convert_latin1_to_utf8_partial(src, dst:bytes);
1868 let len: usize = bytes.len();
1869 let mut trail: usize = written;
1870 let max: usize = ::core::cmp::min(v1:len, v2:trail + MAX_STRIDE_SIZE);
1871 while trail < max {
1872 bytes[trail] = 0;
1873 trail += 1;
1874 }
1875 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1876 bytes[trail] = 0;
1877 trail += 1;
1878 }
1879 (read, written)
1880}
1881
1882/// Converts bytes whose unsigned value is interpreted as Unicode code point
1883/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1884/// output is signaled using the Rust type system.
1885///
1886/// The length of the destination buffer must be at least the length of the
1887/// source buffer times two.
1888///
1889/// Returns the number of bytes written.
1890///
1891/// # Panics
1892///
1893/// Panics if the destination buffer is shorter than stated above.
1894#[inline]
1895pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1896 assert!(
1897 dst.len() >= src.len() * 2,
1898 "Destination must not be shorter than the source times two."
1899 );
1900 let (read: usize, written: usize) = convert_latin1_to_str_partial(src, dst);
1901 debug_assert_eq!(read, src.len());
1902 written
1903}
1904
1905/// If the input is valid UTF-8 representing only Unicode code points from
1906/// U+0000 to U+00FF, inclusive, converts the input into output that
1907/// represents the value of each code point as the unsigned byte value of
1908/// each output byte.
1909///
1910/// If the input does not fulfill the condition stated above, this function
1911/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1912/// does something that is memory-safe without any promises about any
1913/// properties of the output. In particular, callers shouldn't assume the
1914/// output to be the same across crate versions or CPU architectures and
1915/// should not assume that non-ASCII input can't map to ASCII output.
1916///
1917/// The length of the destination buffer must be at least the length of the
1918/// source buffer.
1919///
1920/// Returns the number of bytes written.
1921///
1922/// # Panics
1923///
1924/// Panics if the destination buffer is shorter than stated above.
1925///
1926/// If debug assertions are enabled (and not fuzzing) and the input is
1927/// not in the range U+0000 to U+00FF, inclusive.
1928pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1929 assert!(
1930 dst.len() >= src.len(),
1931 "Destination must not be shorter than the source."
1932 );
1933 non_fuzz_debug_assert!(is_utf8_latin1(src));
1934 let src_len = src.len();
1935 let src_ptr = src.as_ptr();
1936 let dst_ptr = dst.as_mut_ptr();
1937 let mut total_read = 0usize;
1938 let mut total_written = 0usize;
1939 loop {
1940 // dst can't advance more than src
1941 let src_left = src_len - total_read;
1942 if let Some((non_ascii, consumed)) = unsafe {
1943 ascii_to_ascii(
1944 src_ptr.add(total_read),
1945 dst_ptr.add(total_written),
1946 src_left,
1947 )
1948 } {
1949 total_read += consumed + 1;
1950 total_written += consumed;
1951
1952 if total_read == src_len {
1953 return total_written;
1954 }
1955
1956 let trail = src[total_read];
1957 total_read += 1;
1958
1959 dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1960 total_written += 1;
1961 continue;
1962 }
1963 return total_written + src_left;
1964 }
1965}
1966
1967/// If the input is valid UTF-16 representing only Unicode code points from
1968/// U+0000 to U+00FF, inclusive, converts the input into output that
1969/// represents the value of each code point as the unsigned byte value of
1970/// each output byte.
1971///
1972/// If the input does not fulfill the condition stated above, does something
1973/// that is memory-safe without any promises about any properties of the
1974/// output and will probably assert in debug builds in future versions.
1975/// In particular, callers shouldn't assume the output to be the same across
1976/// crate versions or CPU architectures and should not assume that non-ASCII
1977/// input can't map to ASCII output.
1978///
1979/// The length of the destination buffer must be at least the length of the
1980/// source buffer.
1981///
1982/// The number of bytes written equals the length of the source buffer.
1983///
1984/// # Panics
1985///
1986/// Panics if the destination buffer is shorter than stated above.
1987///
1988/// (Probably in future versions if debug assertions are enabled (and not
1989/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
1990pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1991 assert!(
1992 dst.len() >= src.len(),
1993 "Destination must not be shorter than the source."
1994 );
1995 // non_fuzz_debug_assert!(is_utf16_latin1(src));
1996 unsafe {
1997 pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1998 }
1999}
2000
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // The whole input was reported as ASCII, and ASCII is valid UTF-8.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The ASCII head is copied byte for byte; each byte of the tail takes
    // at most two bytes in UTF-8 (code points up to U+00FF).
    let capacity = head.len() + tail.len() * 2;
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // Zero-fill instead of `set_len` on uninitialized memory: `set_len`
    // requires the newly exposed elements to be initialized. Since the
    // capacity was reserved above, this does not reallocate.
    vec.resize(capacity, 0);
    vec[..up_to].copy_from_slice(head);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    // Relies on the ASCII head plus the output of `convert_latin1_to_utf8`
    // being valid UTF-8.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
2028
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    // `convert_utf8_to_latin1_lossy` needs a destination at least as long as
    // its source, so the output buffer is sized like the input.
    let capacity = bytes.len();
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    // Zero-fill instead of `set_len` on uninitialized memory: `set_len`
    // requires the newly exposed elements to be initialized. Since the
    // capacity was reserved above, this does not reallocate.
    vec.resize(capacity, 0);
    vec[..up_to].copy_from_slice(head);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    // Drop the part of the buffer that the conversion did not fill.
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
2065
2066/// Returns the index of the first unpaired surrogate or, if the input is
2067/// valid UTF-16 in its entirety, the length of the input.
2068pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
2069 utf16_valid_up_to_impl(buffer)
2070}
2071
2072/// Returns the index of first byte that starts an invalid byte
2073/// sequence or a non-Latin1 byte sequence, or the length of the
2074/// string if there are neither.
2075pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2076 is_utf8_latin1_impl(buffer).unwrap_or(default:buffer.len())
2077}
2078
2079/// Returns the index of first byte that starts a non-Latin1 byte
2080/// sequence, or the length of the string if there are none.
2081pub fn str_latin1_up_to(buffer: &str) -> usize {
2082 is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len())
2083}
2084
2085/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2086#[inline]
2087pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2088 let mut offset: usize = 0;
2089 loop {
2090 offset += utf16_valid_up_to(&buffer[offset..]);
2091 if offset == buffer.len() {
2092 return;
2093 }
2094 buffer[offset] = 0xFFFD;
2095 offset += 1;
2096 }
2097}
2098
2099/// Copies ASCII from source to destination up to the first non-ASCII byte
2100/// (or the end of the input if it is ASCII in its entirety).
2101///
2102/// The length of the destination buffer must be at least the length of the
2103/// source buffer.
2104///
2105/// Returns the number of bytes written.
2106///
2107/// # Panics
2108///
2109/// Panics if the destination buffer is shorter than stated above.
2110pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2111 assert!(
2112 dst.len() >= src.len(),
2113 "Destination must not be shorter than the source."
2114 );
2115 if let Some((_, consumed: usize)) =
2116 unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2117 {
2118 consumed
2119 } else {
2120 src.len()
2121 }
2122}
2123
2124/// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2125/// the first non-ASCII byte (or the end of the input if it is ASCII in its
2126/// entirety).
2127///
2128/// The length of the destination buffer must be at least the length of the
2129/// source buffer.
2130///
2131/// Returns the number of `u16`s written.
2132///
2133/// # Panics
2134///
2135/// Panics if the destination buffer is shorter than stated above.
2136pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2137 assert!(
2138 dst.len() >= src.len(),
2139 "Destination must not be shorter than the source."
2140 );
2141 if let Some((_, consumed: usize)) =
2142 unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2143 {
2144 consumed
2145 } else {
2146 src.len()
2147 }
2148}
2149
2150/// Copies Basic Latin from source to destination narrowing it to ASCII up to
2151/// the first non-Basic Latin code unit (or the end of the input if it is
2152/// Basic Latin in its entirety).
2153///
2154/// The length of the destination buffer must be at least the length of the
2155/// source buffer.
2156///
2157/// Returns the number of bytes written.
2158///
2159/// # Panics
2160///
2161/// Panics if the destination buffer is shorter than stated above.
2162pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2163 assert!(
2164 dst.len() >= src.len(),
2165 "Destination must not be shorter than the source."
2166 );
2167 if let Some((_, consumed: usize)) =
2168 unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2169 {
2170 consumed
2171 } else {
2172 src.len()
2173 }
2174}
2175
2176// Any copyright to the test code below this comment is dedicated to the
2177// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2178
2179#[cfg(all(test, feature = "alloc"))]
2180mod tests {
2181 use super::*;
2182
#[test]
fn test_is_ascii_success() {
    // The bytes 0x00..=0x7F in ascending order.
    let ascii: Vec<u8> = (0..128u8).collect();
    // Every suffix of the buffer has to be accepted.
    for start in 0..ascii.len() {
        assert!(is_ascii(&ascii[start..]));
    }
}
2194
#[test]
fn test_is_ascii_fail() {
    let mut bytes: Vec<u8> = (0..128u8).collect();
    for start in 0..bytes.len() {
        let suffix = &mut bytes[start..];
        // Poison one more position on each step (the writes accumulate) and
        // check that the suffix is rejected every time.
        for pos in 0..suffix.len() {
            suffix[pos] = 0xA0;
            assert!(!is_ascii(suffix));
        }
    }
}
2210
#[test]
fn test_is_basic_latin_success() {
    // The code units 0x0000..=0x007F in ascending order.
    let units: Vec<u16> = (0..128u16).collect();
    // Every suffix of the buffer has to be accepted.
    for start in 0..units.len() {
        assert!(is_basic_latin(&units[start..]));
    }
}
2222
#[test]
fn test_is_basic_latin_fail() {
    let mut units: Vec<u16> = (0..128u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Poison one more position on each step (the writes accumulate) and
        // check that the suffix is rejected every time.
        for pos in 0..suffix.len() {
            suffix[pos] = 0xA0;
            assert!(!is_basic_latin(suffix));
        }
    }
}
2238
#[test]
fn test_is_utf16_latin1_success() {
    // The code units 0x0000..=0x00FF in ascending order.
    let units: Vec<u16> = (0..256u16).collect();
    for start in 0..units.len() {
        let suffix = &units[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
2254
#[test]
fn test_is_utf16_latin1_fail() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Push one more code unit past U+00FF on each step (the writes
        // accumulate) and check that the suffix is rejected every time.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            assert!(!is_utf16_latin1(suffix));
            assert_ne!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
        }
    }
}
2272
#[test]
fn test_is_str_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        // Go through UTF-16 to obtain a `String` holding the code points.
        let text = String::from_utf16(&units[start..]).unwrap();
        assert!(is_str_latin1(text.as_str()));
        assert_eq!(
            check_str_for_latin1_and_bidi(text.as_str()),
            Latin1Bidi::Latin1
        );
    }
}
2287
#[test]
fn test_is_str_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Push one more code unit past U+00FF on each step (the writes
        // accumulate) and check that the string is rejected every time.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(suffix).unwrap();
            assert!(!is_str_latin1(text.as_str()));
            assert_ne!(
                check_str_for_latin1_and_bidi(text.as_str()),
                Latin1Bidi::Latin1
            );
        }
    }
}
2306
#[test]
fn test_is_utf8_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        // Go through UTF-16 to obtain the UTF-8 bytes of the code points.
        let text = String::from_utf16(&units[start..]).unwrap();
        let utf8 = text.as_bytes();
        assert!(is_utf8_latin1(utf8));
        assert_eq!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
    }
}
2324
#[test]
fn test_is_utf8_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut units: Vec<u16> = (0..len).map(|i| i as u16).collect();
    for start in 0..units.len() {
        let suffix = &mut units[start..];
        // Push one more code unit past U+00FF on each step (the writes
        // accumulate) and check that the UTF-8 form is rejected every time.
        for pos in 0..suffix.len() {
            suffix[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(suffix).unwrap();
            let utf8 = text.as_bytes();
            assert!(!is_utf8_latin1(utf8));
            assert_ne!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
        }
    }
}
2346
#[test]
fn test_is_utf8_latin1_invalid() {
    // Byte sequences that are not valid UTF-8: a lead byte with no trail,
    // the byte 0xFF, and a lead byte followed by 0xFF, each with and
    // without an ASCII prefix.
    let malformed: [&[u8]; 6] = [
        b"\xC3",
        b"a\xC3",
        b"\xFF",
        b"a\xFF",
        b"\xC3\xFF",
        b"a\xC3\xFF",
    ];
    for &bytes in malformed.iter() {
        assert!(!is_utf8_latin1(bytes));
    }
}
2356
#[test]
fn test_convert_utf8_to_utf16() {
    let text = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // One code unit more than the number of input bytes.
    let mut output = vec![0u16; text.len() + 1];
    let written = convert_utf8_to_utf16(text.as_bytes(), output.as_mut_slice());
    output.truncate(written);
    // The standard library's encoder serves as the oracle.
    let expected: Vec<u16> = text.encode_utf16().collect();
    assert_eq!(output, expected);
}
2367
#[test]
fn test_convert_str_to_utf16() {
    let text = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    // As many code units as there are input bytes.
    let mut output = vec![0u16; text.len()];
    let written = convert_str_to_utf16(text, output.as_mut_slice());
    output.truncate(written);
    // The standard library's encoder serves as the oracle.
    let expected: Vec<u16> = text.encode_utf16().collect();
    assert_eq!(output, expected);
}
2378
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let units: Vec<u16> = expected.encode_utf16().collect();
    let mut output = vec![0u8; units.len() * 3 + 1];
    // First pass: only 24 bytes of output space, fewer than the full UTF-8
    // form needs, so the partial conversion has to stop early.
    let (read, written) = convert_utf16_to_utf8_partial(&units[..], &mut output[..24]);
    // Second pass: convert whatever is left into the rest of the buffer.
    let rest = convert_utf16_to_utf8(&units[read..], &mut output[written..]);
    output.truncate(written + rest);
    assert_eq!(output, expected.as_bytes());
}
2390
#[test]
fn test_convert_utf16_to_utf8() {
    let expected = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
    let units: Vec<u16> = expected.encode_utf16().collect();
    // Three bytes per code unit plus one.
    let mut output = vec![0u8; units.len() * 3 + 1];
    let written = convert_utf16_to_utf8(&units[..], output.as_mut_slice());
    output.truncate(written);
    assert_eq!(output, expected.as_bytes());
}
2401
#[test]
fn test_convert_latin1_to_utf16() {
    // All 256 byte values and the code units of the same numeric value.
    let latin1: Vec<u8> = (0..=255u8).collect();
    let expected: Vec<u16> = (0..256u16).collect();
    let mut output = vec![0u16; latin1.len()];
    convert_latin1_to_utf16(&latin1[..], output.as_mut_slice());
    assert_eq!(output, expected);
}
2417
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // A zeroed two-byte destination. (The previous `[0u8, 2]` was a typo
    // for the repeat expression: it had the right length but held a stray
    // `2`.)
    let mut dst = [0u8; 2];
    // `a` takes one byte of output; 0xFF (U+00FF) takes two bytes in UTF-8,
    // which do not fit in the one byte that is left, so the conversion is
    // expected to stop after the first input byte.
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
2425
#[test]
fn test_convert_latin1_to_utf8() {
    // All 256 byte values and the code units of the same numeric value.
    let latin1: Vec<u8> = (0..=255u8).collect();
    let units: Vec<u16> = (0..256u16).collect();
    // The standard library's UTF-16 decoder yields the expected UTF-8.
    let expected = String::from_utf16(&units[..]).unwrap();
    // Two bytes of output space per input byte.
    let mut output = vec![0u8; latin1.len() * 2];
    let written = convert_latin1_to_utf8(&latin1[..], output.as_mut_slice());
    output.truncate(written);
    assert_eq!(&output[..], expected.as_bytes());
}
2443
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    // All 256 byte values and the code units of the same numeric value.
    let expected: Vec<u8> = (0..=255u8).collect();
    let units: Vec<u16> = (0..256u16).collect();
    // The standard library's UTF-16 decoder yields the UTF-8 input.
    let text = String::from_utf16(&units[..]).unwrap();
    let mut output = vec![0u8; text.len()];
    let written = convert_utf8_to_latin1_lossy(text.as_bytes(), output.as_mut_slice());
    output.truncate(written);
    assert_eq!(output, expected);
}
2461
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    // U+0100 is the first code point past the Latin1 range, so the debug
    // assertion in the conversion is expected to fire.
    let beyond_latin1 = "\u{100}";
    let mut output = [0u8; 16];
    let _ = convert_utf8_to_latin1_lossy(beyond_latin1.as_bytes(), &mut output[..]);
}
2469
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    // All 256 Latin1 code units and the bytes of the same numeric value.
    let units: Vec<u16> = (0..256u16).collect();
    let expected: Vec<u8> = (0..=255u8).collect();
    let mut output = vec![0u8; units.len()];
    convert_utf16_to_latin1_lossy(&units[..], output.as_mut_slice());
    assert_eq!(output, expected);
}
2485
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    // U+0100 is outside the Latin1 range. `should_panic` stays commented
    // out above, so for now this only checks that the call returns.
    let beyond_latin1 = [0x0100u16];
    let mut output = [0u8; 16];
    convert_utf16_to_latin1_lossy(&beyond_latin1, &mut output[..]);
}
2492
#[test]
fn test_utf16_valid_up_to() {
    // Builds a 16-unit heap buffer: a run of zeros followed by `tail`.
    let padded = |zeros: usize, tail: &[u16]| -> Vec<u16> {
        let mut units = vec![0u16; zeros];
        units.extend_from_slice(tail);
        units
    };

    // A properly paired surrogate: the whole buffer is valid.
    let valid = padded(12, &[0x2603, 0xD83D, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&valid[..]), 16);

    // A high surrogate followed by a non-surrogate.
    let lone_high = padded(13, &[0x2603, 0xD83D, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);

    // A low surrogate with no preceding high surrogate.
    let lone_low = padded(13, &[0x2603, 0xDCA9, 0x00B6]);
    assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);

    // A high surrogate as the very last code unit.
    let lone_high_at_end = padded(13, &[0x2603, 0x00B6, 0xD83D]);
    assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
}
2516
#[test]
fn test_ensure_utf16_validity() {
    // Lone high surrogate at index 1, a proper pair at indices 5 and 6, and
    // a lone low surrogate at index 13.
    let mut units: Vec<u16> = vec![
        0, 0xD83D, 0, 0, 0, 0xD83D, 0xDCA9, 0, 0, 0, 0, 0, //
        0, 0xDCA9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //
        0, 0, 0, 0, 0, 0,
    ];
    // Only the two unpaired surrogates may change.
    let mut expected = units.clone();
    expected[1] = 0xFFFD;
    expected[13] = 0xFFFD;
    ensure_utf16_validity(units.as_mut_slice());
    assert_eq!(units, expected);
}
2532
#[test]
fn test_is_char_bidi() {
    // Characters that must not be reported as bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c));
    }

    // Characters that must be reported as bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_char_bidi(c));
    }
}
2558
#[test]
fn test_is_utf16_code_unit_bidi() {
    // Code units that must not be reported as bidi.
    let not_bidi = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_code_unit_bidi(unit));
    }

    // Code units that must be reported as bidi.
    let bidi = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(is_utf16_code_unit_bidi(unit));
    }
}
2585
#[test]
fn test_is_str_bidi() {
    const FLANK: &str = "abcdefghijklmnop";
    // Puts the character under test between two 16-character ASCII runs.
    let sandwich = |middle: char| -> String {
        let mut text = String::with_capacity(FLANK.len() * 2 + middle.len_utf8());
        text.push_str(FLANK);
        text.push(middle);
        text.push_str(FLANK);
        text
    };

    // Characters that must not make the string bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_str_bidi(sandwich(c).as_str()));
    }

    // Characters that must make the string bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_str_bidi(sandwich(c).as_str()));
    }
}
2611
#[test]
fn test_is_utf8_bidi() {
    const FLANK: &str = "abcdefghijklmnop";
    // Puts the character under test between two 16-character ASCII runs.
    let sandwich = |middle: char| -> String {
        let mut text = String::with_capacity(FLANK.len() * 2 + middle.len_utf8());
        text.push_str(FLANK);
        text.push(middle);
        text.push_str(FLANK);
        text
    };

    // Characters that must not make the UTF-8 buffer bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_utf8_bidi(sandwich(c).as_bytes()));
    }

    // Characters that must make the UTF-8 buffer bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_utf8_bidi(sandwich(c).as_bytes()));
    }
}
2681
#[test]
fn test_is_utf16_bidi() {
    // Eight ASCII code units that flank the code unit(s) under test.
    let flank: [u16; 8] = [0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69];
    let sandwich = |middle: &[u16]| -> Vec<u16> {
        let mut units = Vec::with_capacity(flank.len() * 2 + middle.len());
        units.extend_from_slice(&flank);
        units.extend_from_slice(middle);
        units.extend_from_slice(&flank);
        units
    };

    // Code units that must not make the buffer bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_bidi(&sandwich(&[unit])[..]));
    }

    // Code units that must make the buffer bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(is_utf16_bidi(&sandwich(&[unit])[..]));
    }

    // A bidi code unit directly followed by a non-bidi, non-ASCII one.
    assert!(is_utf16_bidi(&sandwich(&[0x0590, 0x3041])[..]));
}
2782
#[test]
fn test_check_str_for_latin1_and_bidi() {
    const FLANK: &str = "abcdefghijklmnop";
    // Puts the character under test between two 16-character ASCII runs.
    let sandwich = |middle: char| -> String {
        let mut text = String::with_capacity(FLANK.len() * 2 + middle.len_utf8());
        text.push_str(FLANK);
        text.push(middle);
        text.push_str(FLANK);
        text
    };

    // Characters for which the verdict must be anything but `Bidi`.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert_ne!(
            check_str_for_latin1_and_bidi(sandwich(c).as_str()),
            Latin1Bidi::Bidi
        );
    }

    // Characters for which the verdict must be `Bidi`.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert_eq!(
            check_str_for_latin1_and_bidi(sandwich(c).as_str()),
            Latin1Bidi::Bidi
        );
    }
}
2874
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Sixteen ASCII bytes placed on both sides of the character under test.
    const PADDING: &[u8; 16] = b"abcdefghijklmnop";

    /// Encodes `probe` as UTF-8 between two copies of `PADDING` and returns
    /// the classification of the resulting byte sequence.
    fn classify(probe: char) -> Latin1Bidi {
        // 16 bytes of padding + at most 4 bytes of probe + 16 bytes of padding.
        let mut bytes = [0u8; 36];
        bytes[..16].copy_from_slice(PADDING);
        let probe_len = probe.encode_utf8(&mut bytes[16..20]).len();
        let end = 32 + probe_len;
        bytes[16 + probe_len..end].copy_from_slice(PADDING);
        check_utf8_for_latin1_and_bidi(&bytes[..end])
    }

    // Characters that must not cause the input to be classified as bidi.
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &probe in not_bidi.iter() {
        assert_ne!(
            classify(probe),
            Latin1Bidi::Bidi,
            "U+{:04X}",
            probe as u32
        );
    }

    // Characters that must cause the input to be classified as bidi.
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &probe in bidi.iter() {
        assert_eq!(
            classify(probe),
            Latin1Bidi::Bidi,
            "U+{:04X}",
            probe as u32
        );
    }
}
2966
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // Eight ASCII code units placed on both sides of the code unit under test.
    const PADDING: [u16; 8] = [0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69];

    /// Classifies a 17-unit buffer consisting of `PADDING`, `probe`, `PADDING`.
    fn classify(probe: u16) -> Latin1Bidi {
        let mut units = [0u16; 17];
        units[..8].copy_from_slice(&PADDING);
        units[8] = probe;
        units[9..].copy_from_slice(&PADDING);
        check_utf16_for_latin1_and_bidi(&units[..])
    }

    // Code units that must not cause the input to be classified as bidi.
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &probe in not_bidi.iter() {
        assert_ne!(classify(probe), Latin1Bidi::Bidi, "0x{:04X}", probe);
    }

    // Code units that must cause the input to be classified as bidi.
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &probe in bidi.iter() {
        assert_eq!(classify(probe), Latin1Bidi::Bidi, "0x{:04X}", probe);
    }

    // A bidi code unit immediately followed by a non-Latin1, non-bidi one
    // must still yield `Bidi`.
    let bidi_then_non_latin1: [u16; 18] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ];
    assert_eq!(
        check_utf16_for_latin1_and_bidi(&bidi_then_non_latin1[..]),
        Latin1Bidi::Bidi
    );
}
3139
/// Table-based reference predicate that the exhaustive tests compare the
/// optimized per-character bidi check against.
///
/// Returns `true` when `c` lies in one of the inclusive ranges in
/// `RTL_RANGES` or equals one of the code points in `RTL_SINGLES`, and
/// `false` otherwise.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // Inclusive (first, last) pairs.
    const RTL_RANGES: [(char, char); 5] = [
        ('\u{0590}', '\u{08FF}'),
        ('\u{FB1D}', '\u{FDFF}'),
        ('\u{FE70}', '\u{FEFE}'),
        ('\u{10800}', '\u{10FFF}'),
        ('\u{1E800}', '\u{1EFFF}'),
    ];
    // Individual code points outside the ranges above.
    const RTL_SINGLES: [char; 4] = ['\u{200F}', '\u{202B}', '\u{202E}', '\u{2067}'];

    RTL_SINGLES.contains(&c)
        || RTL_RANGES
            .iter()
            .any(|&(first, last)| first <= c && c <= last)
}
3155
/// Table-based reference predicate that the exhaustive tests compare the
/// optimized per-code-unit bidi check against.
///
/// Returns `true` when `u` lies in one of the inclusive ranges in
/// `RTL_RANGES` or equals one of the values in `RTL_SINGLES`, and `false`
/// otherwise.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    // Inclusive (first, last) pairs.
    const RTL_RANGES: [(u16, u16); 3] = [(0x0590, 0x08FF), (0xFB1D, 0xFDFF), (0xFE70, 0xFEFE)];
    // The first four entries are the high surrogates that lead the
    // U+10800..=U+10FFF and U+1E800..=U+1EFFF ranges; the rest are individual
    // BMP code points.
    const RTL_SINGLES: [u16; 8] = [
        0xD802, 0xD803, 0xD83A, 0xD83B, 0x200F, 0x202B, 0x202E, 0x2067,
    ];

    RTL_SINGLES.contains(&u)
        || RTL_RANGES
            .iter()
            .any(|&(first, last)| first <= u && u <= last)
}
3173
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    // Within 0..0x110000, `from_u32` yields `None` only for the surrogate
    // range 0xD800..0xE000, so this visits every `char` exactly once, in
    // ascending order.
    for c in (0..0x110000u32).filter_map(::core::char::from_u32) {
        assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
    }
}
3186
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    // Compare against the reference predicate for every possible code unit.
    for unit in 0..=0xFFFFu16 {
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_code_unit_bidi(unit), expected);
    }
}
3198
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Scratch space large enough for the UTF-8 form of any single `char`.
    let mut scratch = [0u8; 4];
    // `from_u32` filters out the surrogate range, leaving every valid `char`.
    for c in (0..0x110000u32).filter_map(::core::char::from_u32) {
        let expected = reference_is_char_bidi(c);
        let encoded: &str = c.encode_utf8(&mut scratch[..]);
        assert_eq!(is_str_bidi(encoded), expected);
    }
}
3218
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    let mut scratch = [0u8; 8];
    // `from_u32` filters out the surrogate range, leaving every valid `char`.
    for c in (0..0x110000u32).filter_map(::core::char::from_u32) {
        let expected = reference_is_char_bidi(c);

        // First check exactly the bytes of the encoded character.
        let encoded_len = c.encode_utf8(&mut scratch[..]).len();
        assert_eq!(is_utf8_bidi(&scratch[..encoded_len]), expected);

        // Then zero everything after the character and check the whole
        // eight-byte buffer, i.e. the character followed by U+0000 padding.
        scratch[encoded_len..].iter_mut().for_each(|byte| *byte = 0);
        assert_eq!(is_utf8_bidi(&scratch[..]), expected);
    }
}
3260
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    // A 32-unit buffer of zeros in which only index 15 varies.
    let mut units = [0u16; 32];
    for unit in 0..=0xFFFFu16 {
        units[15] = unit;
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_bidi(&units[..]), expected);
    }
}
3274
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // U+057F and U+0580 (just below U+0590) followed by ASCII, and plain
    // ASCII: expected to be reported as not bidi.
    let expect_false = [b"\xD5\xBF\x61", b"\xD6\x80\x61", b"abc"];
    for &input in expect_false.iter() {
        assert!(!is_utf8_bidi(input));
    }

    // The same prefixes ending in a lone 0xC2 lead byte: expected to be
    // reported as bidi.
    let expect_true = [b"\xD5\xBF\xC2", b"\xD6\x80\xC2", b"ab\xC2"];
    for &input in expect_true.iter() {
        assert!(is_utf8_bidi(input));
    }
}
3284
#[test]
fn test_decode_latin1() {
    // ASCII-only input is expected to come back borrowed.
    if let Cow::Borrowed(s) = decode_latin1(b"ab") {
        assert_eq!(s, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // Byte 0xE4 is expected to decode to U+00E4.
    let decoded = decode_latin1(b"a\xE4");
    assert_eq!(decoded, "a\u{E4}");
}
3297
#[test]
fn test_encode_latin1_lossy() {
    // ASCII-only input is expected to come back borrowed.
    if let Cow::Borrowed(s) = encode_latin1_lossy("ab") {
        assert_eq!(s, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // U+00E4 is expected to encode to the single byte 0xE4.
    let encoded = encode_latin1_lossy("a\u{E4}");
    assert_eq!(encoded, &(b"a\xE4")[..]);
}
3310
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    // Each case: (UTF-8 input, length of the destination slice, expected
    // number of code units written, expected first three code units of the
    // buffer afterwards).
    //
    // The buffer is deliberately shared between cases and the order matters:
    // code units beyond what a call writes keep the values left by earlier
    // calls (e.g. the `c` still present at index 1 in the third case).
    let cases: [(&[u8], usize, usize, [u16; 3]); 7] = [
        (&b"ab"[..], 2, 2, [u16::from(b'a'), u16::from(b'b'), 0]),
        (&b"\xC3\xA4c"[..], 3, 2, [0xE4, u16::from(b'c'), 0]),
        (&b"\xE2\x98\x83"[..], 3, 1, [0x2603, u16::from(b'c'), 0]),
        (&b"\xE2\x98\x83d"[..], 4, 2, [0x2603, u16::from(b'd'), 0]),
        (&b"\xE2\x98\x83\xC3\xA4"[..], 5, 2, [0x2603, 0xE4, 0]),
        (&b"\xF0\x9F\x93\x8E"[..], 4, 2, [0xD83D, 0xDCCE, 0]),
        (
            &b"\xF0\x9F\x93\x8Ee"[..],
            5,
            3,
            [0xD83D, 0xDCCE, u16::from(b'e')],
        ),
    ];

    let mut buf = [0u16; 5];
    for &(src, dst_len, written, head) in cases.iter() {
        assert_eq!(
            convert_utf8_to_utf16_without_replacement(src, &mut buf[..dst_len]),
            Some(written)
        );
        assert_eq!(&buf[..3], &head[..]);
    }

    // A four-byte sequence missing its last byte must be rejected.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
3368}
3369