1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! Functions for converting between different in-RAM representations of text |
11 | //! and for quickly checking if the Unicode Bidirectional Algorithm can be |
12 | //! avoided. |
13 | //! |
14 | //! By using slices for output, the functions here seek to enable by-register |
15 | //! (ALU register or SIMD register as available) operations in order to |
16 | //! outperform iterator-based conversions available in the Rust standard |
17 | //! library. |
18 | //! |
19 | //! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to |
20 | //! U+00FF, inclusive, and does not refer to the windows-1252 range. This |
21 | //! in-memory encoding is sometimes used as a storage optimization of text |
22 | //! when UTF-16 indexing and length semantics are exposed. |
23 | //! |
//! The FFI bindings for this module are in the
25 | //! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem). |
26 | |
27 | #[cfg (feature = "alloc" )] |
28 | use alloc::borrow::Cow; |
29 | #[cfg (feature = "alloc" )] |
30 | use alloc::string::String; |
31 | #[cfg (feature = "alloc" )] |
32 | use alloc::vec::Vec; |
33 | |
34 | use super::in_inclusive_range16; |
35 | use super::in_inclusive_range32; |
36 | use super::in_inclusive_range8; |
37 | use super::in_range16; |
38 | use super::in_range32; |
39 | use super::DecoderResult; |
40 | use crate::ascii::*; |
41 | use crate::utf_8::*; |
42 | |
// Forwards its arguments to `debug_assert!`, but only when the crate is not
// being compiled with `--cfg fuzzing`; under fuzzing the assertion is skipped.
macro_rules! non_fuzz_debug_assert {
    ($($tokens:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($tokens)*);
        }
    };
}
46 | |
47 | cfg_if! { |
48 | if #[cfg(feature = "simd-accel" )] { |
49 | use ::core::intrinsics::likely; |
50 | use ::core::intrinsics::unlikely; |
51 | } else { |
52 | #[inline (always)] |
53 | fn likely(b: bool) -> bool { |
54 | b |
55 | } |
56 | #[inline (always)] |
57 | fn unlikely(b: bool) -> bool { |
58 | b |
59 | } |
60 | } |
61 | } |
62 | |
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The discriminants are fixed (`0`, `1`, `2`) and the layout is `repr(C)`.
/// The type is a plain fieldless enum, so it is `Copy` and can be passed
/// around and compared by value.
#[must_use]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
78 | |
// Mask with the high byte of every 16-bit lane of a machine word set.
// Dividing the all-ones word by 0xFFFF leaves 0x0001 in each lane, so the
// product adapts to the width of `usize` without a truncating cast.
#[allow(dead_code)]
const LATIN1_MASK: usize = (!0usize / 0xFFFF) * 0xFF00;
82 | |
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff
// no unit in `buffer` reaches `$bound`. Rather than comparing unit by unit,
// the generated function ORs units (and, in the aligned middle of the buffer,
// whole `usize` words) into an accumulator and tests the accumulator.
//
// * `$name`  - name of the generated function.
// * `$unit`  - element type of the slice.
// * `$bound` - first disallowed unit value.
// * `$mask`  - name of a `usize` constant that is ANDed with accumulated
//              words; a non-zero result means "disallowed unit seen".
//
// NOTE(review): ORing before comparing only matches a per-unit comparison
// when `$bound` is a power of two; the invocations visible in this file pass
// 0x80 and 0x100.
#[allow (unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy" , allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index of the next unit that has not been examined yet.
            let mut offset = 0usize;
            // Bitwise OR of every unit / word examined but not yet judged.
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // The word-at-a-time path is only attempted when the buffer holds
            // at least `ALU_ALIGNMENT / unit_size` units.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to consume one by one before `src.add(offset)`
                // sits on an `ALU_ALIGNMENT` boundary (zero if it already does).
                // Presumably ALU_ALIGNMENT_MASK == ALU_ALIGNMENT - 1; both are
                // defined outside this view.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a whole word remains after the unaligned head.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Unaligned head: accumulate unit by unit.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // So far `accu` only holds single units, so it can be
                        // compared against the bound directly.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one whole word still fits.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Unrolled loop: four words per iteration. The guard
                        // above and the exit check at the bottom keep all four
                        // reads inside the buffer.
                        // Largest offset at which four whole words still fit.
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            // Fail as soon as a block of four words contains a
                            // disallowed bit.
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole words, one at a time; they are judged by
                    // the final mask test below.
                    while offset <= len_minus_stride {
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Tail (or the whole buffer when the word path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            // `accu` may now mix single units and whole words; the mask test
            // covers both.
            accu & $mask == 0
        }
    };
}
155 | |
// Generates `fn $name(buffer: &[$unit]) -> bool`, which returns `true` iff
// no unit in `buffer` reaches `$bound`. The aligned middle of the buffer is
// read as `$simd_ty` vectors that are ORed together and judged by `$func`;
// the unaligned head and the tail are ORed unit by unit into a scalar.
//
// * `$name`    - name of the generated function.
// * `$unit`    - element type of the slice.
// * `$splat`   - initial value of the vector accumulator (the invocations
//                visible in this file pass an all-zero splat).
// * `$simd_ty` - vector type read from the buffer.
// * `$bound`   - first disallowed unit value.
// * `$func`    - predicate on `$simd_ty`; `false` makes the generated
//                function return `false`.
//
// NOTE(review): ORing before comparing only matches a per-unit comparison
// when `$bound` is a power of two; the invocations visible in this file pass
// 0x80 and 0x100.
#[allow (unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index of the next unit that has not been examined yet.
            let mut offset = 0usize;
            // Bitwise OR of the units examined one by one (head and tail).
            let mut accu = 0usize;
            let unit_size = ::core::mem::size_of::<$unit>();
            let len = buffer.len();
            // The vector path is only attempted when the buffer holds at
            // least one stride's worth of units.
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units to consume one by one before `src.add(offset)`
                // sits on a `SIMD_ALIGNMENT` boundary (zero if it already does).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Proceed only if a whole stride remains after the unaligned head.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Unaligned head: accumulate unit by unit.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one whole stride still fits.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Unrolled loop: four vectors per iteration. The guard
                        // above and the exit check at the bottom keep all four
                        // reads inside the buffer.
                        // Largest offset at which four whole strides still fit.
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            // Fail as soon as a block of four vectors is rejected.
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides are ORed into one vector and
                    // judged once after the loop.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Tail (or the whole buffer when the vector path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            // `accu` only ever holds single units here, so a direct comparison
            // against the bound is sufficient.
            accu < $bound
        }
    };
}
233 | |
// Instantiates `is_ascii_impl`, `is_basic_latin_impl` and
// `is_utf16_latin1_impl`, and defines `utf16_valid_up_to_impl`, either on top
// of SIMD (first arm) or on top of plain ALU operations (second arm).
cfg_if! {
    if #[cfg(all(feature = "simd-accel" , any(target_feature = "sse2" , all(target_endian = "little" , target_arch = "aarch64" ), all(target_endian = "little" , target_feature = "neon" ))))] {
        use crate::simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Byte alignment required before vectors are read from a buffer.
        const SIMD_ALIGNMENT: usize = 16;

        // `address & SIMD_ALIGNMENT_MASK` is the misalignment in bytes.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        /// Returns the index of the first unpaired surrogate in `buffer`, or
        /// `buffer.len()` if there is none.
        ///
        /// Aligned strides that contain no surrogates are skipped with a
        /// single SIMD test; everything else is delegated to
        /// `utf16_valid_up_to_alu`.
        #[inline (always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::core::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            // Each iteration of the outer loop (re)establishes alignment at
            // `offset` and then runs the aligned SIMD loop.
            'outer: loop {
                // Code units between `offset` and the next aligned address.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: leave if no whole stride remains.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    // The scalar check looks at one unit past the alignment
                    // point so that a high surrogate just before it can be
                    // matched with its low surrogate.
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        // Unpaired surrogate inside the unaligned head.
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The extra unit was consumed as the low half of a
                        // pair; resume after it and recompute the alignment.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                // Largest offset at which one whole stride still fits.
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // The stride ends the buffer: let the scalar tail
                            // below handle it from `offset`.
                            break 'outer;
                        }
                        // Re-check this stride with the scalar code, again with
                        // one extra unit for a pair straddling the stride end.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // The pair straddled the stride boundary, so the
                            // next position is no longer aligned.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Scalar check of whatever is left after the SIMD part.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        /// Returns the index of the first unpaired surrogate in `buffer`, or
        /// `buffer.len()` if there is none, using the scalar implementation.
        #[inline (always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
319 | |
/// Scans `buffer` from the start and returns, as the first value, the number
/// of leading code units that form valid UTF-16: the index of the first
/// unpaired surrogate, or the slice length if there is none.
///
/// The second value is `true` only when the scan reached the end of the
/// slice and the final code unit was the low half of a well-formed
/// surrogate pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let mut pos = 0usize;
    // Whether the most recently consumed unit completed a surrogate pair.
    let mut ended_on_paired_low = false;
    while let Some(&unit) = buffer.get(pos) {
        match unit {
            // High surrogate: valid only if a low surrogate follows.
            0xD800..=0xDBFF => match buffer.get(pos + 1).cloned() {
                Some(0xDC00..=0xDFFF) => {
                    pos += 2;
                    ended_on_paired_low = true;
                }
                _ => return (pos, false),
            },
            // Low surrogate without a preceding high surrogate.
            0xDC00..=0xDFFF => return (pos, false),
            // Any non-surrogate code unit.
            _ => {
                pos += 1;
                ended_on_paired_low = false;
            }
        }
    }
    (pos, ended_on_paired_low)
}
365 | |
// `is_str_latin1_impl` in a SIMD flavor (first arm) and a scalar flavor
// (second arm). Both return `None` when no byte of the string is above 0xC3,
// i.e. when every code point is at most U+00FF, and `Some(index)` otherwise.
// Because the input is a `str`, UTF-8 validity is taken for granted.
cfg_if! {
    if #[cfg(all(feature = "simd-accel" , any(target_feature = "sse2" , all(target_endian = "little" , target_arch = "aarch64" ), all(target_endian = "little" , target_feature = "neon" ))))] {
        /// Returns `None` if `buffer` contains only code points up to U+00FF.
        ///
        /// Otherwise returns `Some(index)`. In the scalar head and tail loops
        /// the index is that of the first byte above 0xC3; when a SIMD stride
        /// is rejected, the index is the start of that stride moved forward
        /// past any leading continuation bytes.
        #[inline (always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut offset = 0usize;
            let bytes = buffer.as_bytes();
            let len = bytes.len();
            if len >= SIMD_STRIDE_SIZE {
                let src = bytes.as_ptr();
                // Bytes to check one by one before `src.add(offset)` is aligned.
                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                SIMD_ALIGNMENT_MASK;
                // Proceed only if a whole stride remains after the unaligned head.
                if until_alignment + SIMD_STRIDE_SIZE <= len {
                    while until_alignment != 0 {
                        // A byte above 0xC3 starts a code point above U+00FF.
                        if bytes[offset] > 0xC3 {
                            return Some(offset);
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which one whole stride still fits.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
                    loop {
                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
                            // Skip continuation bytes (10xxxxxx) so that the
                            // reported index is on a character boundary.
                            while bytes[offset] & 0xC0 == 0x80 {
                                offset += 1;
                            }
                            return Some(offset);
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole input when the SIMD path was not taken).
            for i in offset..len {
                if bytes[i] > 0xC3 {
                    return Some(i);
                }
            }
            None
        }
    } else {
        /// Returns `None` if `buffer` contains only code points up to U+00FF,
        /// otherwise `Some(index)` of the first byte above 0xC3.
        #[inline (always)]
        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
            let mut bytes = buffer.as_bytes();
            // Number of bytes of `buffer` already accepted.
            let mut total = 0;
            loop {
                // Presumably `validate_ascii` yields the first non-ASCII byte
                // and its offset, or `None` for all-ASCII input; it is defined
                // outside this view.
                if let Some((byte, offset)) = validate_ascii(bytes) {
                    total += offset;
                    if byte > 0xC3 {
                        return Some(total);
                    }
                    // The byte is a two-byte lead; since the input is a `str`,
                    // its continuation byte is skipped without inspection.
                    bytes = &bytes[offset + 2..];
                    total += 2;
                } else {
                    return None;
                }
            }
        }
    }
}
428 | |
429 | #[inline (always)] |
430 | fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> { |
431 | let mut bytes: &[u8] = buffer; |
432 | let mut total: usize = 0; |
433 | loop { |
434 | if let Some((byte: u8, offset: usize)) = validate_ascii(slice:bytes) { |
435 | total += offset; |
436 | if in_inclusive_range8(i:byte, start:0xC2, end:0xC3) { |
437 | let next: usize = offset + 1; |
438 | if next == bytes.len() { |
439 | return Some(total); |
440 | } |
441 | if bytes[next] & 0xC0 != 0x80 { |
442 | return Some(total); |
443 | } |
444 | bytes = &bytes[offset + 2..]; |
445 | total += 2; |
446 | } else { |
447 | return Some(total); |
448 | } |
449 | } else { |
450 | return None; |
451 | } |
452 | } |
453 | } |
454 | |
455 | cfg_if! { |
456 | if #[cfg(all(feature = "simd-accel" , any(target_feature = "sse2" , all(target_endian = "little" , target_arch = "aarch64" ), all(target_endian = "little" , target_feature = "neon" ))))] { |
457 | #[inline (always)] |
458 | fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { |
459 | let mut offset = 0usize; |
460 | let len = buffer.len(); |
461 | if len >= SIMD_STRIDE_SIZE / 2 { |
462 | let src = buffer.as_ptr(); |
463 | let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) & |
464 | SIMD_ALIGNMENT_MASK) / 2; |
465 | if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len { |
466 | while until_alignment != 0 { |
467 | if is_utf16_code_unit_bidi(buffer[offset]) { |
468 | return true; |
469 | } |
470 | offset += 1; |
471 | until_alignment -= 1; |
472 | } |
473 | let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2); |
474 | loop { |
475 | if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) { |
476 | return true; |
477 | } |
478 | offset += SIMD_STRIDE_SIZE / 2; |
479 | if offset > len_minus_stride { |
480 | break; |
481 | } |
482 | } |
483 | } |
484 | } |
485 | for &u in &buffer[offset..] { |
486 | if is_utf16_code_unit_bidi(u) { |
487 | return true; |
488 | } |
489 | } |
490 | false |
491 | } |
492 | } else { |
493 | #[inline (always)] |
494 | fn is_utf16_bidi_impl(buffer: &[u16]) -> bool { |
495 | for &u in buffer { |
496 | if is_utf16_code_unit_bidi(u) { |
497 | return true; |
498 | } |
499 | } |
500 | false |
501 | } |
502 | } |
503 | } |
504 | |
// `check_utf16_for_latin1_and_bidi_impl` in a SIMD flavor (first arm) and a
// word-at-a-time ALU flavor (second arm). Both scan for the first code unit
// above 0xFF; from that point on they only look for right-to-left code units.
cfg_if! {
    if #[cfg(all(feature = "simd-accel" , any(target_feature = "sse2" , all(target_endian = "little" , target_arch = "aarch64" ), all(target_endian = "little" , target_feature = "neon" ))))] {
        /// Classifies `buffer`: `Latin1` if no code unit exceeds 0xFF,
        /// `Bidi` if a right-to-left code unit occurs at or after the first
        /// code unit above 0xFF, and `LeftToRight` otherwise.
        #[inline (always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units to check one by one before `src.add(offset)` is aligned.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                SIMD_ALIGNMENT_MASK) / 2;
                // Proceed only if a whole stride remains after the unaligned head.
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment is
                            // recomputed, but not tweaking further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which one whole stride still fits.
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    loop {
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // Non-Latin1 found: from here on only the bidi
                            // question is open. Start with the stride already
                            // loaded and never return to the Latin1 check.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // No whole stride left: finish unit by unit.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole buffer when the SIMD path was not taken).
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // Switch to the bidi-only scan, starting with the unit
                        // that broke the Latin1 property.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    // End reached without seeing anything above 0xFF.
                    return Latin1Bidi::Latin1;
                }
            }
        }
    } else {
        /// Classifies `buffer`: `Latin1` if no code unit exceeds 0xFF,
        /// `Bidi` if a right-to-left code unit occurs at or after the first
        /// code unit (or aligned word) found to be non-Latin1, and
        /// `LeftToRight` otherwise.
        #[cfg_attr (feature = "cargo-clippy" , allow(cast_ptr_alignment))]
        #[inline (always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            if len >= ALU_ALIGNMENT / 2 {
                let src = buffer.as_ptr();
                // Code units to check one by one before `src.add(offset)` is aligned.
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
                ALU_ALIGNMENT_MASK) / 2;
                // Proceed only if a whole word remains after the unaligned head.
                if until_alignment + ALU_ALIGNMENT / 2 <= len {
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // Non-Latin1 found: the rest only needs the bidi scan.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which one whole word still fits.
                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
                    loop {
                        // A bit under `LATIN1_MASK` means some code unit in
                        // this word is above 0xFF.
                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
                            // The bidi scan restarts at the beginning of the word.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += ALU_ALIGNMENT / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Tail (or the whole buffer when the word path was not taken).
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // Switch to the bidi-only scan, starting with the unit
                        // that broke the Latin1 property.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    // End reached without seeing anything above 0xFF.
                    return Latin1Bidi::Latin1;
                }
            }
        }
    }
}
635 | |
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` if every byte in `buffer` is below 0x80. An empty buffer
/// is all-ASCII.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
643 | |
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` if every code unit in `buffer` is below 0x80. An empty
/// buffer is all-Basic Latin.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
652 | |
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Returns `true` only if both conditions hold for the whole buffer.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation reports the position of the first problem, if any;
    // only its absence matters here.
    is_utf8_latin1_impl(buffer).is_none()
}
661 | |
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation reports a position when a code point above U+00FF
    // is found; only its absence matters here.
    is_str_latin1_impl(buffer).is_none()
}
670 | |
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` if every code unit in `buffer` is below 0x100. An empty
/// buffer qualifies.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
679 | |
680 | /// Checks whether a potentially-invalid UTF-8 buffer contains code points |
681 | /// that trigger right-to-left processing. |
682 | /// |
683 | /// The check is done on a Unicode block basis without regard to assigned |
684 | /// vs. unassigned code points in the block. Hebrew presentation forms in |
685 | /// the Alphabetic Presentation Forms block are treated as if they formed |
686 | /// a block on their own (i.e. it treated as right-to-left). Additionally, |
687 | /// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked |
688 | /// for. Control characters that are technically bidi controls but do not |
689 | /// cause right-to-left behavior without the presence of right-to-left |
690 | /// characters or right-to-left controls are not checked for. As a special |
691 | /// case, U+FEFF is excluded from Arabic Presentation Forms-B. |
692 | /// |
693 | /// Returns `true` if the input is invalid UTF-8 or the input contains an |
694 | /// RTL character. Returns `false` if the input is valid UTF-8 and contains |
695 | /// no RTL characters. |
696 | #[cfg_attr (feature = "cargo-clippy" , allow(collapsible_if, cyclomatic_complexity))] |
697 | #[inline ] |
698 | pub fn is_utf8_bidi(buffer: &[u8]) -> bool { |
699 | // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster |
700 | // than UTF-8 validation followed by `is_str_bidi()` for German, |
701 | // Russian and Japanese. However, this is considerably slower for Thai. |
702 | // Chances are that the compiler makes some branch predictions that are |
703 | // unfortunate for Thai. Not spending the time to manually optimize |
704 | // further at this time, since it's unclear if this variant even has |
705 | // use cases. However, this is worth revisiting once Rust gets the |
706 | // ability to annotate relative priorities of match arms. |
707 | |
708 | // U+058F: D6 8F |
709 | // U+0590: D6 90 |
710 | // U+08FF: E0 A3 BF |
711 | // U+0900: E0 A4 80 |
712 | // |
713 | // U+200F: E2 80 8F |
714 | // U+202B: E2 80 AB |
715 | // U+202E: E2 80 AE |
716 | // U+2067: E2 81 A7 |
717 | // |
718 | // U+FB1C: EF AC 9C |
719 | // U+FB1D: EF AC 9D |
720 | // U+FDFF: EF B7 BF |
721 | // U+FE00: EF B8 80 |
722 | // |
723 | // U+FE6F: EF B9 AF |
724 | // U+FE70: EF B9 B0 |
725 | // U+FEFE: EF BB BE |
726 | // U+FEFF: EF BB BF |
727 | // |
728 | // U+107FF: F0 90 9F BF |
729 | // U+10800: F0 90 A0 80 |
730 | // U+10FFF: F0 90 BF BF |
731 | // U+11000: F0 91 80 80 |
732 | // |
733 | // U+1E7FF: F0 9E 9F BF |
734 | // U+1E800: F0 9E A0 80 |
735 | // U+1EFFF: F0 9E BF BF |
736 | // U+1F000: F0 9F 80 80 |
737 | let mut src = buffer; |
738 | 'outer: loop { |
739 | if let Some((mut byte, mut read)) = validate_ascii(src) { |
740 | // Check for the longest sequence to avoid checking twice for the |
741 | // multi-byte sequences. |
742 | if read + 4 <= src.len() { |
743 | 'inner: loop { |
744 | // At this point, `byte` is not included in `read`. |
745 | match byte { |
746 | 0..=0x7F => { |
747 | // ASCII: go back to SIMD. |
748 | read += 1; |
749 | src = &src[read..]; |
750 | continue 'outer; |
751 | } |
752 | 0xC2..=0xD5 => { |
753 | // Two-byte |
754 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
755 | if !in_inclusive_range8(second, 0x80, 0xBF) { |
756 | return true; |
757 | } |
758 | read += 2; |
759 | } |
760 | 0xD6 => { |
761 | // Two-byte |
762 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
763 | if !in_inclusive_range8(second, 0x80, 0xBF) { |
764 | return true; |
765 | } |
766 | // XXX consider folding the above and below checks |
767 | if second > 0x8F { |
768 | return true; |
769 | } |
770 | read += 2; |
771 | } |
772 | // two-byte starting with 0xD7 and above is bidi |
773 | 0xE1 | 0xE3..=0xEC | 0xEE => { |
774 | // Three-byte normal |
775 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
776 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
777 | if ((UTF8_DATA.table[usize::from(second)] |
778 | & unsafe { |
779 | *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
780 | }) |
781 | | (third >> 6)) |
782 | != 2 |
783 | { |
784 | return true; |
785 | } |
786 | read += 3; |
787 | } |
788 | 0xE2 => { |
789 | // Three-byte normal, potentially bidi |
790 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
791 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
792 | if ((UTF8_DATA.table[usize::from(second)] |
793 | & unsafe { |
794 | *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
795 | }) |
796 | | (third >> 6)) |
797 | != 2 |
798 | { |
799 | return true; |
800 | } |
801 | if second == 0x80 { |
802 | if third == 0x8F || third == 0xAB || third == 0xAE { |
803 | return true; |
804 | } |
805 | } else if second == 0x81 { |
806 | if third == 0xA7 { |
807 | return true; |
808 | } |
809 | } |
810 | read += 3; |
811 | } |
812 | 0xEF => { |
813 | // Three-byte normal, potentially bidi |
814 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
815 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
816 | if ((UTF8_DATA.table[usize::from(second)] |
817 | & unsafe { |
818 | *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
819 | }) |
820 | | (third >> 6)) |
821 | != 2 |
822 | { |
823 | return true; |
824 | } |
825 | if in_inclusive_range8(second, 0xAC, 0xB7) { |
826 | if second == 0xAC { |
827 | if third > 0x9C { |
828 | return true; |
829 | } |
830 | } else { |
831 | return true; |
832 | } |
833 | } else if in_inclusive_range8(second, 0xB9, 0xBB) { |
834 | if second == 0xB9 { |
835 | if third > 0xAF { |
836 | return true; |
837 | } |
838 | } else if second == 0xBB { |
839 | if third != 0xBF { |
840 | return true; |
841 | } |
842 | } else { |
843 | return true; |
844 | } |
845 | } |
846 | read += 3; |
847 | } |
848 | 0xE0 => { |
849 | // Three-byte special lower bound, potentially bidi |
850 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
851 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
852 | if ((UTF8_DATA.table[usize::from(second)] |
853 | & unsafe { |
854 | *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
855 | }) |
856 | | (third >> 6)) |
857 | != 2 |
858 | { |
859 | return true; |
860 | } |
861 | // XXX can this be folded into the above validity check |
862 | if second < 0xA4 { |
863 | return true; |
864 | } |
865 | read += 3; |
866 | } |
867 | 0xED => { |
868 | // Three-byte special upper bound |
869 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
870 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
871 | if ((UTF8_DATA.table[usize::from(second)] |
872 | & unsafe { |
873 | *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
874 | }) |
875 | | (third >> 6)) |
876 | != 2 |
877 | { |
878 | return true; |
879 | } |
880 | read += 3; |
881 | } |
882 | 0xF1..=0xF4 => { |
883 | // Four-byte normal |
884 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
885 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
886 | let fourth = unsafe { *(src.get_unchecked(read + 3)) }; |
887 | if (u16::from( |
888 | UTF8_DATA.table[usize::from(second)] |
889 | & unsafe { |
890 | *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
891 | }, |
892 | ) | u16::from(third >> 6) |
893 | | (u16::from(fourth & 0xC0) << 2)) |
894 | != 0x202 |
895 | { |
896 | return true; |
897 | } |
898 | read += 4; |
899 | } |
900 | 0xF0 => { |
901 | // Four-byte special lower bound, potentially bidi |
902 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
903 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
904 | let fourth = unsafe { *(src.get_unchecked(read + 3)) }; |
905 | if (u16::from( |
906 | UTF8_DATA.table[usize::from(second)] |
907 | & unsafe { |
908 | *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) |
909 | }, |
910 | ) | u16::from(third >> 6) |
911 | | (u16::from(fourth & 0xC0) << 2)) |
912 | != 0x202 |
913 | { |
914 | return true; |
915 | } |
916 | if unlikely(second == 0x90 || second == 0x9E) { |
917 | let third = src[read + 2]; |
918 | if third >= 0xA0 { |
919 | return true; |
920 | } |
921 | } |
922 | read += 4; |
923 | } |
924 | _ => { |
925 | // Invalid lead or bidi-only lead |
926 | return true; |
927 | } |
928 | } |
929 | if read + 4 > src.len() { |
930 | if read == src.len() { |
931 | return false; |
932 | } |
933 | byte = src[read]; |
934 | break 'inner; |
935 | } |
936 | byte = src[read]; |
937 | continue 'inner; |
938 | } |
939 | } |
940 | // We can't have a complete 4-byte sequence, but we could still have |
941 | // a complete shorter sequence. |
942 | |
943 | // At this point, `byte` is not included in `read`. |
944 | match byte { |
945 | 0..=0x7F => { |
946 | // ASCII: go back to SIMD. |
947 | read += 1; |
948 | src = &src[read..]; |
949 | continue 'outer; |
950 | } |
951 | 0xC2..=0xD5 => { |
952 | // Two-byte |
953 | let new_read = read + 2; |
954 | if new_read > src.len() { |
955 | return true; |
956 | } |
957 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
958 | if !in_inclusive_range8(second, 0x80, 0xBF) { |
959 | return true; |
960 | } |
961 | read = new_read; |
962 | // We need to deal with the case where we came here with 3 bytes |
963 | // left, so we need to take a look at the last one. |
964 | src = &src[read..]; |
965 | continue 'outer; |
966 | } |
967 | 0xD6 => { |
968 | // Two-byte, potentially bidi |
969 | let new_read = read + 2; |
970 | if new_read > src.len() { |
971 | return true; |
972 | } |
973 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
974 | if !in_inclusive_range8(second, 0x80, 0xBF) { |
975 | return true; |
976 | } |
977 | // XXX consider folding the above and below checks |
978 | if second > 0x8F { |
979 | return true; |
980 | } |
981 | read = new_read; |
982 | // We need to deal with the case where we came here with 3 bytes |
983 | // left, so we need to take a look at the last one. |
984 | src = &src[read..]; |
985 | continue 'outer; |
986 | } |
987 | // two-byte starting with 0xD7 and above is bidi |
988 | 0xE1 | 0xE3..=0xEC | 0xEE => { |
989 | // Three-byte normal |
990 | let new_read = read + 3; |
991 | if new_read > src.len() { |
992 | return true; |
993 | } |
994 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
995 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
996 | if ((UTF8_DATA.table[usize::from(second)] |
997 | & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
998 | | (third >> 6)) |
999 | != 2 |
1000 | { |
1001 | return true; |
1002 | } |
1003 | } |
1004 | 0xE2 => { |
1005 | // Three-byte normal, potentially bidi |
1006 | let new_read = read + 3; |
1007 | if new_read > src.len() { |
1008 | return true; |
1009 | } |
1010 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
1011 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
1012 | if ((UTF8_DATA.table[usize::from(second)] |
1013 | & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
1014 | | (third >> 6)) |
1015 | != 2 |
1016 | { |
1017 | return true; |
1018 | } |
1019 | if second == 0x80 { |
1020 | if third == 0x8F || third == 0xAB || third == 0xAE { |
1021 | return true; |
1022 | } |
1023 | } else if second == 0x81 { |
1024 | if third == 0xA7 { |
1025 | return true; |
1026 | } |
1027 | } |
1028 | } |
1029 | 0xEF => { |
1030 | // Three-byte normal, potentially bidi |
1031 | let new_read = read + 3; |
1032 | if new_read > src.len() { |
1033 | return true; |
1034 | } |
1035 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
1036 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
1037 | if ((UTF8_DATA.table[usize::from(second)] |
1038 | & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
1039 | | (third >> 6)) |
1040 | != 2 |
1041 | { |
1042 | return true; |
1043 | } |
1044 | if in_inclusive_range8(second, 0xAC, 0xB7) { |
1045 | if second == 0xAC { |
1046 | if third > 0x9C { |
1047 | return true; |
1048 | } |
1049 | } else { |
1050 | return true; |
1051 | } |
1052 | } else if in_inclusive_range8(second, 0xB9, 0xBB) { |
1053 | if second == 0xB9 { |
1054 | if third > 0xAF { |
1055 | return true; |
1056 | } |
1057 | } else if second == 0xBB { |
1058 | if third != 0xBF { |
1059 | return true; |
1060 | } |
1061 | } else { |
1062 | return true; |
1063 | } |
1064 | } |
1065 | } |
1066 | 0xE0 => { |
1067 | // Three-byte special lower bound, potentially bidi |
1068 | let new_read = read + 3; |
1069 | if new_read > src.len() { |
1070 | return true; |
1071 | } |
1072 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
1073 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
1074 | if ((UTF8_DATA.table[usize::from(second)] |
1075 | & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
1076 | | (third >> 6)) |
1077 | != 2 |
1078 | { |
1079 | return true; |
1080 | } |
1081 | // XXX can this be folded into the above validity check |
1082 | if second < 0xA4 { |
1083 | return true; |
1084 | } |
1085 | } |
1086 | 0xED => { |
1087 | // Three-byte special upper bound |
1088 | let new_read = read + 3; |
1089 | if new_read > src.len() { |
1090 | return true; |
1091 | } |
1092 | let second = unsafe { *(src.get_unchecked(read + 1)) }; |
1093 | let third = unsafe { *(src.get_unchecked(read + 2)) }; |
1094 | if ((UTF8_DATA.table[usize::from(second)] |
1095 | & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) }) |
1096 | | (third >> 6)) |
1097 | != 2 |
1098 | { |
1099 | return true; |
1100 | } |
1101 | } |
1102 | _ => { |
1103 | // Invalid lead, 4-byte lead or 2-byte bidi-only lead |
1104 | return true; |
1105 | } |
1106 | } |
1107 | return false; |
1108 | } else { |
1109 | return false; |
1110 | } |
1111 | } |
1112 | } |
1113 | |
/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` as soon as a right-to-left code point is seen and `false`
/// if the end of the buffer is reached without seeing one.
#[cfg_attr (feature = "cargo-clippy" , allow(collapsible_if))]
#[inline ]
pub fn is_str_bidi(buffer: &str) -> bool {
    // UTF-8 byte sequences on either side of each boundary tested below:
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    //
    // Because the input is a `&str`, it is already well-formed UTF-8: no
    // validation is performed here, the lead byte alone determines the
    // sequence length, and the trail bytes read with `read + 1` / `read + 2`
    // are always inside the buffer.
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consists entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        //
        // NOTE(review): `validate_ascii` is defined elsewhere; from its use
        // here it presumably yields the first non-ASCII byte together with
        // its index, or `None` when the whole slice is ASCII.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unlikely(byte >= 0xD6) {
                            if byte == 0xD6 {
                                // D6 90 is U+0590, the first RTL code point.
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                // Two-byte leads 0xD7 and above encode only
                                // code points inside U+0590...U+08FF.
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: go back to SIMD.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid thrashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    // Only the leads 0xE0, 0xE2 and 0xEF can start an RTL
                    // code point; every other three-byte lead is skipped
                    // without looking at the trail bytes.
                    if unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            // E0 A4 80 is U+0900; anything lower is at or
                            // below U+08FF and therefore RTL.
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            // The four RIGHT-TO-LEFT controls:
                            // U+200F, U+202B, U+202E and U+2067.
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                // U+FB00...U+FDFF; RTL starts at U+FB1D
                                // (EF AC 9D).
                                if second == 0xAC {
                                    let third = bytes[read + 2];
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                // U+FE40...U+FEFF; RTL is U+FE70...U+FEFE.
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    // Everything here is RTL except
                                    // U+FEFF (EF BB BF).
                                    let third = bytes[read + 2];
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    // F0 90 A0 80 is U+10800 and F0 9E A0 80 is U+1E800; with
                    // those second bytes a third byte of 0xA0 or above lands
                    // in U+10800...U+10FFF or U+1E800...U+1EFFF.
                    let second = bytes[read + 1];
                    if unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            // Nothing but ASCII remained.
            return false;
        }
    }
}
1269 | |
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
/// for. Control characters that are technically bidi controls but do not
/// cause right-to-left behavior without the presence of right-to-left
/// characters or right-to-left controls are not checked for. As a special
/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    // Public entry point only; the scan itself is done by
    // `is_utf16_bidi_impl`, which is defined elsewhere in this file.
    is_utf16_bidi_impl(buffer)
}
1290 | |
/// Tells whether a single scalar value forces right-to-left processing.
///
/// Classification is by Unicode block, ignoring whether an individual
/// code point within the block is assigned. The Hebrew presentation forms
/// inside the Alphabetic Presentation Forms block are handled as if they
/// were a right-to-left block of their own. The four controls in General
/// Punctuation that have RIGHT-TO-LEFT in their name are also reported.
/// Bidi controls that cannot cause right-to-left behavior on their own
/// (i.e. without right-to-left characters or right-to-left controls being
/// present) are not reported. U+FEFF is deliberately left out of Arabic
/// Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    // Sources for the ranges:
    //   controls: https://www.unicode.org/charts/PDF/U2000.pdf
    //   BMP:      https://www.unicode.org/roadmaps/bmp/
    //   SMP:      https://www.unicode.org/roadmaps/smp/
    match u32::from(c) {
        // Hebrew up to and including Arabic Extended-A.
        0x0590..=0x08FF => true,
        // U+200F RLM, U+202B RLE, U+202E RLO, U+2067 RLI.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // Hebrew presentation forms and Arabic Presentation Forms-A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms-B without the BOM (U+FEFF).
        0xFE70..=0xFEFE => true,
        // First astral RTL range (lead surrogate U+D802 or U+D803).
        0x10800..=0x10FFF => true,
        // Second astral RTL range (lead surrogate U+D83A or U+D83B).
        0x1E800..=0x1EFFF => true,
        // Everything else, including emoji above U+1EFFF.
        _ => false,
    }
}
1358 | |
/// Tells whether a single UTF-16 code unit forces right-to-left processing.
///
/// Classification is by Unicode block, ignoring whether an individual
/// code point within the block is assigned. The Hebrew presentation forms
/// inside the Alphabetic Presentation Forms block are handled as if they
/// were a right-to-left block of their own. The four controls in General
/// Punctuation that have RIGHT-TO-LEFT in their name are also reported.
/// Bidi controls that cannot cause right-to-left behavior on their own
/// (i.e. without right-to-left characters or right-to-left controls being
/// present) are not reported. U+FEFF is deliberately left out of Arabic
/// Presentation Forms-B.
///
/// The supplementary-plane right-to-left blocks can be recognized from the
/// high surrogate alone, so this function answers `true` for those high
/// surrogates. That makes it usable on supplementary-plane text without
/// first combining surrogate pairs into scalar values, at the price of
/// reporting such a high surrogate as right-to-left even when it is
/// actually unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    match u {
        // Hebrew up to and including Arabic Extended-A.
        0x0590..=0x08FF => true,
        // U+200F RLM, U+202B RLE, U+202E RLO, U+2067 RLI.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // High surrogates that lead into the astral RTL ranges.
        0xD802 | 0xD803 | 0xD83A | 0xD83B => true,
        // Hebrew presentation forms and Arabic Presentation Forms-A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms-B without the BOM (U+FEFF).
        0xFE70..=0xFEFE => true,
        // Everything else, including the surrogates used for emoji.
        _ => false,
    }
}
1410 | |
1411 | /// Checks whether a potentially invalid UTF-8 buffer contains code points |
1412 | /// that trigger right-to-left processing or is all-Latin1. |
1413 | /// |
1414 | /// Possibly more efficient than performing the checks separately. |
1415 | /// |
1416 | /// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`. |
1417 | /// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return |
1418 | /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. |
1419 | pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi { |
1420 | if let Some(offset: usize) = is_utf8_latin1_impl(buffer) { |
1421 | if is_utf8_bidi(&buffer[offset..]) { |
1422 | Latin1Bidi::Bidi |
1423 | } else { |
1424 | Latin1Bidi::LeftToRight |
1425 | } |
1426 | } else { |
1427 | Latin1Bidi::Latin1 |
1428 | } |
1429 | } |
1430 | |
1431 | /// Checks whether a valid UTF-8 buffer contains code points |
1432 | /// that trigger right-to-left processing or is all-Latin1. |
1433 | /// |
1434 | /// Possibly more efficient than performing the checks separately. |
1435 | /// |
1436 | /// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`. |
1437 | /// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return |
1438 | /// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`. |
1439 | pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi { |
1440 | // The transition from the latin1 check to the bidi check isn't |
1441 | // optimal but not tweaking it to perfection today. |
1442 | if let Some(offset: usize) = is_str_latin1_impl(buffer) { |
1443 | if is_str_bidi(&buffer[offset..]) { |
1444 | Latin1Bidi::Bidi |
1445 | } else { |
1446 | Latin1Bidi::LeftToRight |
1447 | } |
1448 | } else { |
1449 | Latin1Bidi::Latin1 |
1450 | } |
1451 | } |
1452 | |
/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // Public entry point only; the combined scan is done by
    // `check_utf16_for_latin1_and_bidi_impl`, defined elsewhere in this file.
    check_utf16_for_latin1_and_bidi_impl(buffer)
}
1464 | |
1465 | /// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced |
1466 | /// with the REPLACEMENT CHARACTER. |
1467 | /// |
1468 | /// The length of the destination buffer must be at least the length of the |
1469 | /// source buffer _plus one_. |
1470 | /// |
1471 | /// Returns the number of `u16`s written. |
1472 | /// |
1473 | /// # Panics |
1474 | /// |
1475 | /// Panics if the destination buffer is shorter than stated above. |
1476 | pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize { |
1477 | // TODO: Can the requirement for dst to be at least one unit longer |
1478 | // be eliminated? |
1479 | assert!(dst.len() > src.len()); |
1480 | let mut decoder = Utf8Decoder::new_inner(); |
1481 | let mut total_read = 0usize; |
1482 | let mut total_written = 0usize; |
1483 | loop { |
1484 | let (result, read, written) = |
1485 | decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true); |
1486 | total_read += read; |
1487 | total_written += written; |
1488 | match result { |
1489 | DecoderResult::InputEmpty => { |
1490 | return total_written; |
1491 | } |
1492 | DecoderResult::OutputFull => { |
1493 | unreachable!("The assert at the top of the function should have caught this." ); |
1494 | } |
1495 | DecoderResult::Malformed(_, _) => { |
1496 | // There should always be space for the U+FFFD, because |
1497 | // otherwise we'd have gotten OutputFull already. |
1498 | dst[total_written] = 0xFFFD; |
1499 | total_written += 1; |
1500 | } |
1501 | } |
1502 | } |
1503 | } |
1504 | |
/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    // Invariant for the unchecked accesses below: `written <= read` at all
    // times, because every branch advances `read` by at least as much as
    // `written` (1/1 for ASCII, 2/1 and 3/1 for two- and three-byte
    // sequences, 4/2 for four-byte sequences). Together with the assert
    // above this keeps every write into `dst` in bounds. Reads of trail
    // bytes stay in bounds because `src` is a `&str` and therefore
    // well-formed UTF-8: each lead byte is followed by all of its trail
    // bytes.
    let bytes = src.as_bytes();
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk-convert the ASCII prefix of what is left; `byte` becomes the
        // first non-ASCII byte that stopped the bulk conversion.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            // NOTE(review): `ascii_to_basic_latin` is defined elsewhere; this
            // call assumes it reads and writes at most `length` units, which
            // fits because `dst_remaining` is at least as long as
            // `src_remaining` (see the invariant above) — confirm against
            // its contract.
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    // Everything that was left was ASCII.
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    // 5 payload bits from the lead, 6 from the trail.
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid thrashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                // 4 payload bits from the lead, 6 from each trail.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                // 3 payload bits from the lead, 6 from each trail; the result
                // is above U+FFFF and is stored as a surrogate pair.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // 0xD7C0 is 0xD800 - (0x10000 >> 10): the subtraction of
                // 0x10000 is folded into the high-surrogate base.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1601 | |
1602 | /// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error. |
1603 | /// |
1604 | /// The length of the destination buffer must be at least the length of the |
1605 | /// source buffer. |
1606 | /// |
1607 | /// Returns the number of `u16`s written or `None` if the input was invalid. |
1608 | /// |
1609 | /// When the input was invalid, some output may have been written. |
1610 | /// |
1611 | /// # Panics |
1612 | /// |
1613 | /// Panics if the destination buffer is shorter than stated above. |
1614 | pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> { |
1615 | assert!( |
1616 | dst.len() >= src.len(), |
1617 | "Destination must not be shorter than the source." |
1618 | ); |
1619 | let (read: usize, written: usize) = convert_utf8_to_utf16_up_to_invalid(src, dst); |
1620 | if read == src.len() { |
1621 | return Some(written); |
1622 | } |
1623 | None |
1624 | } |
1625 | |
1626 | /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
1627 | /// with the REPLACEMENT CHARACTER with potentially insufficient output |
1628 | /// space. |
1629 | /// |
1630 | /// Returns the number of code units read and the number of bytes written. |
1631 | /// |
1632 | /// Guarantees that the bytes in the destination beyond the number of |
1633 | /// bytes claimed as written by the second item of the return tuple |
1634 | /// are left unmodified. |
1635 | /// |
1636 | /// Not all code units are read if there isn't enough output space. |
1637 | /// |
1638 | /// Note that this method isn't designed for general streamability but for |
1639 | /// not allocating memory for the worst case up front. Specifically, |
1640 | /// if the input starts with or ends with an unpaired surrogate, those are |
1641 | /// replaced with the REPLACEMENT CHARACTER. |
1642 | /// |
1643 | /// Matches the semantics of `TextEncoder.encodeInto()` from the |
1644 | /// Encoding Standard. |
1645 | /// |
1646 | /// # Safety |
1647 | /// |
1648 | /// If you want to convert into a `&mut str`, use |
1649 | /// `convert_utf16_to_str_partial()` instead of using this function |
1650 | /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. |
1651 | #[inline (always)] |
1652 | pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) { |
1653 | // The two functions called below are marked `inline(never)` to make |
1654 | // transitions from the hot part (first function) into the cold part |
1655 | // (second function) go through a return and another call to discouge |
1656 | // the CPU from speculating from the hot code into the cold code. |
1657 | // Letting the transitions be mere intra-function jumps, even to |
1658 | // basic blocks out-of-lined to the end of the function would wipe |
1659 | // away a quarter of Arabic encode performance on Haswell! |
1660 | let (read: usize, written: usize) = convert_utf16_to_utf8_partial_inner(src, dst); |
1661 | if likely(read == src.len()) { |
1662 | return (read, written); |
1663 | } |
1664 | let (tail_read: usize, tail_written: usize) = |
1665 | convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]); |
1666 | (read + tail_read, written + tail_written) |
1667 | } |
1668 | |
1669 | /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
1670 | /// with the REPLACEMENT CHARACTER. |
1671 | /// |
1672 | /// The length of the destination buffer must be at least the length of the |
1673 | /// source buffer times three. |
1674 | /// |
1675 | /// Returns the number of bytes written. |
1676 | /// |
1677 | /// # Panics |
1678 | /// |
1679 | /// Panics if the destination buffer is shorter than stated above. |
1680 | /// |
1681 | /// # Safety |
1682 | /// |
1683 | /// If you want to convert into a `&mut str`, use `convert_utf16_to_str()` |
1684 | /// instead of using this function together with the `unsafe` method |
1685 | /// `as_bytes_mut()` on `&mut str`. |
1686 | #[inline (always)] |
1687 | pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize { |
1688 | assert!(dst.len() >= src.len() * 3); |
1689 | let (read: usize, written: usize) = convert_utf16_to_utf8_partial(src, dst); |
1690 | debug_assert_eq!(read, src.len()); |
1691 | written |
1692 | } |
1693 | |
1694 | /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
1695 | /// with the REPLACEMENT CHARACTER such that the validity of the output is |
1696 | /// signaled using the Rust type system with potentially insufficient output |
1697 | /// space. |
1698 | /// |
1699 | /// Returns the number of code units read and the number of bytes written. |
1700 | /// |
1701 | /// Not all code units are read if there isn't enough output space. |
1702 | /// |
1703 | /// Note that this method isn't designed for general streamability but for |
1704 | /// not allocating memory for the worst case up front. Specifically, |
1705 | /// if the input starts with or ends with an unpaired surrogate, those are |
1706 | /// replaced with the REPLACEMENT CHARACTER. |
1707 | pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) { |
1708 | let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; |
1709 | let (read: usize, written: usize) = convert_utf16_to_utf8_partial(src, dst:bytes); |
1710 | let len: usize = bytes.len(); |
1711 | let mut trail: usize = written; |
1712 | while trail < len && ((bytes[trail] & 0xC0) == 0x80) { |
1713 | bytes[trail] = 0; |
1714 | trail += 1; |
1715 | } |
1716 | (read, written) |
1717 | } |
1718 | |
1719 | /// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced |
1720 | /// with the REPLACEMENT CHARACTER such that the validity of the output is |
1721 | /// signaled using the Rust type system. |
1722 | /// |
1723 | /// The length of the destination buffer must be at least the length of the |
1724 | /// source buffer times three. |
1725 | /// |
1726 | /// Returns the number of bytes written. |
1727 | /// |
1728 | /// # Panics |
1729 | /// |
1730 | /// Panics if the destination buffer is shorter than stated above. |
1731 | #[inline (always)] |
1732 | pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize { |
1733 | assert!(dst.len() >= src.len() * 3); |
1734 | let (read: usize, written: usize) = convert_utf16_to_str_partial(src, dst); |
1735 | debug_assert_eq!(read, src.len()); |
1736 | written |
1737 | } |
1738 | |
1739 | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1740 | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-16. |
1741 | /// |
1742 | /// The length of the destination buffer must be at least the length of the |
1743 | /// source buffer. |
1744 | /// |
1745 | /// The number of `u16`s written equals the length of the source buffer. |
1746 | /// |
1747 | /// # Panics |
1748 | /// |
1749 | /// Panics if the destination buffer is shorter than stated above. |
1750 | pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) { |
1751 | assert!( |
1752 | dst.len() >= src.len(), |
1753 | "Destination must not be shorter than the source." |
1754 | ); |
1755 | // TODO: On aarch64, the safe version autovectorizes to the same unpacking |
1756 | // instructions and this code, but, yet, the autovectorized version is |
1757 | // faster. |
1758 | unsafe { |
1759 | unpack_latin1(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()); |
1760 | } |
1761 | } |
1762 | |
1763 | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1764 | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient |
1765 | /// output space. |
1766 | /// |
1767 | /// Returns the number of bytes read and the number of bytes written. |
1768 | /// |
1769 | /// If the output isn't large enough, not all input is consumed. |
1770 | /// |
1771 | /// # Safety |
1772 | /// |
1773 | /// If you want to convert into a `&mut str`, use |
1774 | /// `convert_utf16_to_str_partial()` instead of using this function |
1775 | /// together with the `unsafe` method `as_bytes_mut()` on `&mut str`. |
1776 | pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) { |
1777 | let src_len = src.len(); |
1778 | let src_ptr = src.as_ptr(); |
1779 | let dst_ptr = dst.as_mut_ptr(); |
1780 | let dst_len = dst.len(); |
1781 | let mut total_read = 0usize; |
1782 | let mut total_written = 0usize; |
1783 | loop { |
1784 | // src can't advance more than dst |
1785 | let src_left = src_len - total_read; |
1786 | let dst_left = dst_len - total_written; |
1787 | let min_left = ::core::cmp::min(src_left, dst_left); |
1788 | if let Some((non_ascii, consumed)) = unsafe { |
1789 | ascii_to_ascii( |
1790 | src_ptr.add(total_read), |
1791 | dst_ptr.add(total_written), |
1792 | min_left, |
1793 | ) |
1794 | } { |
1795 | total_read += consumed; |
1796 | total_written += consumed; |
1797 | if total_written.checked_add(2).unwrap() > dst_len { |
1798 | return (total_read, total_written); |
1799 | } |
1800 | |
1801 | total_read += 1; // consume `non_ascii` |
1802 | |
1803 | dst[total_written] = (non_ascii >> 6) | 0xC0; |
1804 | total_written += 1; |
1805 | dst[total_written] = (non_ascii & 0x3F) | 0x80; |
1806 | total_written += 1; |
1807 | continue; |
1808 | } |
1809 | return (total_read + min_left, total_written + min_left); |
1810 | } |
1811 | } |
1812 | |
1813 | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1814 | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8. |
1815 | /// |
1816 | /// The length of the destination buffer must be at least the length of the |
1817 | /// source buffer times two. |
1818 | /// |
1819 | /// Returns the number of bytes written. |
1820 | /// |
1821 | /// # Panics |
1822 | /// |
1823 | /// Panics if the destination buffer is shorter than stated above. |
1824 | /// |
1825 | /// # Safety |
1826 | /// |
1827 | /// Note that this function may write garbage beyond the number of bytes |
1828 | /// indicated by the return value, so using a `&mut str` interpreted as |
1829 | /// `&mut [u8]` as the destination is not safe. If you want to convert into |
1830 | /// a `&mut str`, use `convert_utf16_to_str()` instead of this function. |
1831 | #[inline ] |
1832 | pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize { |
1833 | assert!( |
1834 | dst.len() >= src.len() * 2, |
1835 | "Destination must not be shorter than the source times two." |
1836 | ); |
1837 | let (read: usize, written: usize) = convert_latin1_to_utf8_partial(src, dst); |
1838 | debug_assert_eq!(read, src.len()); |
1839 | written |
1840 | } |
1841 | |
1842 | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1843 | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the |
1844 | /// output is signaled using the Rust type system with potentially insufficient |
1845 | /// output space. |
1846 | /// |
1847 | /// Returns the number of bytes read and the number of bytes written. |
1848 | /// |
1849 | /// If the output isn't large enough, not all input is consumed. |
1850 | #[inline ] |
1851 | pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) { |
1852 | let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() }; |
1853 | let (read: usize, written: usize) = convert_latin1_to_utf8_partial(src, dst:bytes); |
1854 | let len: usize = bytes.len(); |
1855 | let mut trail: usize = written; |
1856 | let max: usize = ::core::cmp::min(v1:len, v2:trail + MAX_STRIDE_SIZE); |
1857 | while trail < max { |
1858 | bytes[trail] = 0; |
1859 | trail += 1; |
1860 | } |
1861 | while trail < len && ((bytes[trail] & 0xC0) == 0x80) { |
1862 | bytes[trail] = 0; |
1863 | trail += 1; |
1864 | } |
1865 | (read, written) |
1866 | } |
1867 | |
1868 | /// Converts bytes whose unsigned value is interpreted as Unicode code point |
1869 | /// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the |
1870 | /// output is signaled using the Rust type system. |
1871 | /// |
1872 | /// The length of the destination buffer must be at least the length of the |
1873 | /// source buffer times two. |
1874 | /// |
1875 | /// Returns the number of bytes written. |
1876 | /// |
1877 | /// # Panics |
1878 | /// |
1879 | /// Panics if the destination buffer is shorter than stated above. |
1880 | #[inline ] |
1881 | pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize { |
1882 | assert!( |
1883 | dst.len() >= src.len() * 2, |
1884 | "Destination must not be shorter than the source times two." |
1885 | ); |
1886 | let (read: usize, written: usize) = convert_latin1_to_str_partial(src, dst); |
1887 | debug_assert_eq!(read, src.len()); |
1888 | written |
1889 | } |
1890 | |
1891 | /// If the input is valid UTF-8 representing only Unicode code points from |
1892 | /// U+0000 to U+00FF, inclusive, converts the input into output that |
1893 | /// represents the value of each code point as the unsigned byte value of |
1894 | /// each output byte. |
1895 | /// |
1896 | /// If the input does not fulfill the condition stated above, this function |
1897 | /// panics if debug assertions are enabled (and fuzzing isn't) and otherwise |
1898 | /// does something that is memory-safe without any promises about any |
1899 | /// properties of the output. In particular, callers shouldn't assume the |
1900 | /// output to be the same across crate versions or CPU architectures and |
1901 | /// should not assume that non-ASCII input can't map to ASCII output. |
1902 | /// |
1903 | /// The length of the destination buffer must be at least the length of the |
1904 | /// source buffer. |
1905 | /// |
1906 | /// Returns the number of bytes written. |
1907 | /// |
1908 | /// # Panics |
1909 | /// |
1910 | /// Panics if the destination buffer is shorter than stated above. |
1911 | /// |
1912 | /// If debug assertions are enabled (and not fuzzing) and the input is |
1913 | /// not in the range U+0000 to U+00FF, inclusive. |
1914 | pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize { |
1915 | assert!( |
1916 | dst.len() >= src.len(), |
1917 | "Destination must not be shorter than the source." |
1918 | ); |
1919 | non_fuzz_debug_assert!(is_utf8_latin1(src)); |
1920 | let src_len = src.len(); |
1921 | let src_ptr = src.as_ptr(); |
1922 | let dst_ptr = dst.as_mut_ptr(); |
1923 | let mut total_read = 0usize; |
1924 | let mut total_written = 0usize; |
1925 | loop { |
1926 | // dst can't advance more than src |
1927 | let src_left = src_len - total_read; |
1928 | if let Some((non_ascii, consumed)) = unsafe { |
1929 | ascii_to_ascii( |
1930 | src_ptr.add(total_read), |
1931 | dst_ptr.add(total_written), |
1932 | src_left, |
1933 | ) |
1934 | } { |
1935 | total_read += consumed + 1; |
1936 | total_written += consumed; |
1937 | |
1938 | if total_read == src_len { |
1939 | return total_written; |
1940 | } |
1941 | |
1942 | let trail = src[total_read]; |
1943 | total_read += 1; |
1944 | |
1945 | dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F); |
1946 | total_written += 1; |
1947 | continue; |
1948 | } |
1949 | return total_written + src_left; |
1950 | } |
1951 | } |
1952 | |
1953 | /// If the input is valid UTF-16 representing only Unicode code points from |
1954 | /// U+0000 to U+00FF, inclusive, converts the input into output that |
1955 | /// represents the value of each code point as the unsigned byte value of |
1956 | /// each output byte. |
1957 | /// |
1958 | /// If the input does not fulfill the condition stated above, does something |
1959 | /// that is memory-safe without any promises about any properties of the |
1960 | /// output and will probably assert in debug builds in future versions. |
1961 | /// In particular, callers shouldn't assume the output to be the same across |
1962 | /// crate versions or CPU architectures and should not assume that non-ASCII |
1963 | /// input can't map to ASCII output. |
1964 | /// |
1965 | /// The length of the destination buffer must be at least the length of the |
1966 | /// source buffer. |
1967 | /// |
1968 | /// The number of bytes written equals the length of the source buffer. |
1969 | /// |
1970 | /// # Panics |
1971 | /// |
1972 | /// Panics if the destination buffer is shorter than stated above. |
1973 | /// |
1974 | /// (Probably in future versions if debug assertions are enabled (and not |
1975 | /// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.) |
1976 | pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) { |
1977 | assert!( |
1978 | dst.len() >= src.len(), |
1979 | "Destination must not be shorter than the source." |
1980 | ); |
1981 | // non_fuzz_debug_assert!(is_utf16_latin1(src)); |
1982 | unsafe { |
1983 | pack_latin1(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()); |
1984 | } |
1985 | } |
1986 | |
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        // SAFETY: `ascii_valid_up_to` reported that the ASCII prefix spans
        // the whole input, and ASCII is valid UTF-8.
        let s: &str = unsafe { ::core::str::from_utf8_unchecked(bytes) };
        return Cow::Borrowed(s);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The ASCII head is copied as-is; each byte of the tail can expand to at
    // most two bytes.
    let capacity = head.len() + tail.len() * 2;
    // Allocate once, then initialize the full length safely instead of
    // exposing uninitialized memory through `set_len`. Neither call below
    // exceeds the reserved capacity, so no reallocation takes place.
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    vec.extend_from_slice(head);
    vec.resize(capacity, 0);
    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    // SAFETY: The first `up_to` bytes are ASCII, and the following `written`
    // bytes are the UTF-8 output of `convert_latin1_to_utf8`; everything
    // after that was just truncated away.
    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
}
2014 | |
/// If the input is valid UTF-8 representing only Unicode code points from
/// U+0000 to U+00FF, inclusive, converts the input into output that
/// represents the value of each code point as the unsigned byte value of
/// each output byte.
///
/// If the input does not fulfill the condition stated above, this function
/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
/// does something that is memory-safe without any promises about any
/// properties of the output. In particular, callers shouldn't assume the
/// output to be the same across crate versions or CPU architectures and
/// should not assume that non-ASCII input can't map to ASCII output.
///
/// Borrows if input is ASCII-only. Performs a single heap allocation
/// otherwise.
///
/// Only available if the `alloc` feature is enabled (enabled by default).
#[cfg(feature = "alloc")]
pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
    let bytes = string.as_bytes();
    let up_to = ascii_valid_up_to(bytes);
    // >= makes later things optimize better than ==
    if up_to >= bytes.len() {
        debug_assert_eq!(up_to, bytes.len());
        return Cow::Borrowed(bytes);
    }
    let (head, tail) = bytes.split_at(up_to);
    // The output is never longer than the input, so the input length is a
    // sufficient capacity.
    let capacity = bytes.len();
    // Allocate once, then initialize the full length safely instead of
    // exposing uninitialized memory through `set_len`. Neither call below
    // exceeds the reserved capacity, so no reallocation takes place.
    let mut vec: Vec<u8> = Vec::with_capacity(capacity);
    vec.extend_from_slice(head);
    vec.resize(capacity, 0);
    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
    vec.truncate(up_to + written);
    Cow::Owned(vec)
}
2051 | |
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
///
/// This is the public entry point; the scan itself is delegated to
/// `utf16_valid_up_to_impl`.
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
    utf16_valid_up_to_impl(buffer)
}
2057 | |
2058 | /// Returns the index of first byte that starts an invalid byte |
2059 | /// sequence or a non-Latin1 byte sequence, or the length of the |
2060 | /// string if there are neither. |
2061 | pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize { |
2062 | is_utf8_latin1_impl(buffer).unwrap_or(default:buffer.len()) |
2063 | } |
2064 | |
2065 | /// Returns the index of first byte that starts a non-Latin1 byte |
2066 | /// sequence, or the length of the string if there are none. |
2067 | pub fn str_latin1_up_to(buffer: &str) -> usize { |
2068 | is_str_latin1_impl(buffer).unwrap_or_else(|| buffer.len()) |
2069 | } |
2070 | |
2071 | /// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER. |
2072 | #[inline ] |
2073 | pub fn ensure_utf16_validity(buffer: &mut [u16]) { |
2074 | let mut offset: usize = 0; |
2075 | loop { |
2076 | offset += utf16_valid_up_to(&buffer[offset..]); |
2077 | if offset == buffer.len() { |
2078 | return; |
2079 | } |
2080 | buffer[offset] = 0xFFFD; |
2081 | offset += 1; |
2082 | } |
2083 | } |
2084 | |
2085 | /// Copies ASCII from source to destination up to the first non-ASCII byte |
2086 | /// (or the end of the input if it is ASCII in its entirety). |
2087 | /// |
2088 | /// The length of the destination buffer must be at least the length of the |
2089 | /// source buffer. |
2090 | /// |
2091 | /// Returns the number of bytes written. |
2092 | /// |
2093 | /// # Panics |
2094 | /// |
2095 | /// Panics if the destination buffer is shorter than stated above. |
2096 | pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize { |
2097 | assert!( |
2098 | dst.len() >= src.len(), |
2099 | "Destination must not be shorter than the source." |
2100 | ); |
2101 | if let Some((_, consumed: usize)) = |
2102 | unsafe { ascii_to_ascii(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) } |
2103 | { |
2104 | consumed |
2105 | } else { |
2106 | src.len() |
2107 | } |
2108 | } |
2109 | |
2110 | /// Copies ASCII from source to destination zero-extending it to UTF-16 up to |
2111 | /// the first non-ASCII byte (or the end of the input if it is ASCII in its |
2112 | /// entirety). |
2113 | /// |
2114 | /// The length of the destination buffer must be at least the length of the |
2115 | /// source buffer. |
2116 | /// |
2117 | /// Returns the number of `u16`s written. |
2118 | /// |
2119 | /// # Panics |
2120 | /// |
2121 | /// Panics if the destination buffer is shorter than stated above. |
2122 | pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize { |
2123 | assert!( |
2124 | dst.len() >= src.len(), |
2125 | "Destination must not be shorter than the source." |
2126 | ); |
2127 | if let Some((_, consumed: usize)) = |
2128 | unsafe { ascii_to_basic_latin(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) } |
2129 | { |
2130 | consumed |
2131 | } else { |
2132 | src.len() |
2133 | } |
2134 | } |
2135 | |
2136 | /// Copies Basic Latin from source to destination narrowing it to ASCII up to |
2137 | /// the first non-Basic Latin code unit (or the end of the input if it is |
2138 | /// Basic Latin in its entirety). |
2139 | /// |
2140 | /// The length of the destination buffer must be at least the length of the |
2141 | /// source buffer. |
2142 | /// |
2143 | /// Returns the number of bytes written. |
2144 | /// |
2145 | /// # Panics |
2146 | /// |
2147 | /// Panics if the destination buffer is shorter than stated above. |
2148 | pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize { |
2149 | assert!( |
2150 | dst.len() >= src.len(), |
2151 | "Destination must not be shorter than the source." |
2152 | ); |
2153 | if let Some((_, consumed: usize)) = |
2154 | unsafe { basic_latin_to_ascii(src:src.as_ptr(), dst:dst.as_mut_ptr(), src.len()) } |
2155 | { |
2156 | consumed |
2157 | } else { |
2158 | src.len() |
2159 | } |
2160 | } |
2161 | |
2162 | // Any copyright to the test code below this comment is dedicated to the |
2163 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
2164 | |
2165 | #[cfg (all(test, feature = "alloc" ))] |
2166 | mod tests { |
2167 | use super::*; |
2168 | |
#[test]
fn test_is_ascii_success() {
    // All 128 ASCII byte values in ascending order.
    let src: Vec<u8> = (0u8..128).collect();
    // Checking every suffix varies both the start offset and the length.
    for start in 0..src.len() {
        assert!(is_ascii(&src[start..]));
    }
}
2180 | |
#[test]
fn test_is_ascii_fail() {
    // Start from all 128 ASCII byte values in ascending order.
    let mut src: Vec<u8> = (0u8..128).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Poison one more position with a non-ASCII byte on each step. The
        // writes persist, so after the first outer pass every later suffix
        // consists of non-ASCII bytes only.
        for pos in 0..tail.len() {
            tail[pos] = 0xA0;
            assert!(!is_ascii(tail));
        }
    }
}
2196 | |
#[test]
fn test_is_basic_latin_success() {
    // All 128 Basic Latin code units in ascending order.
    let src: Vec<u16> = (0u16..128).collect();
    // Checking every suffix varies both the start offset and the length.
    for start in 0..src.len() {
        assert!(is_basic_latin(&src[start..]));
    }
}
2208 | |
#[test]
fn test_is_basic_latin_fail() {
    // Start from all 128 Basic Latin code units in ascending order.
    let mut src: Vec<u16> = (0u16..128).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Poison one more position with a code unit outside Basic Latin on
        // each step. The writes persist across outer iterations.
        for pos in 0..tail.len() {
            tail[pos] = 0xA0;
            assert!(!is_basic_latin(tail));
        }
    }
}
2224 | |
#[test]
fn test_is_utf16_latin1_success() {
    // Every code unit from U+0000 to U+00FF in ascending order.
    let src: Vec<u16> = (0u16..256).collect();
    for start in 0..src.len() {
        let suffix = &src[start..];
        assert!(is_utf16_latin1(suffix));
        assert_eq!(check_utf16_for_latin1_and_bidi(suffix), Latin1Bidi::Latin1);
    }
}
2240 | |
#[test]
fn test_is_utf16_latin1_fail() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let mut src: Vec<u16> = (0..len).map(|unit| unit as u16).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Push one more position above U+00FF on each step; the writes
        // persist across outer iterations.
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            assert!(!is_utf16_latin1(tail));
            assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
        }
    }
}
2258 | |
#[test]
fn test_is_str_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let src: Vec<u16> = (0..len).map(|unit| unit as u16).collect();
    for start in 0..src.len() {
        // Go through UTF-16 to obtain a `String` holding the same code points.
        let text = String::from_utf16(&src[start..]).unwrap();
        assert!(is_str_latin1(text.as_str()));
        assert_eq!(
            check_str_for_latin1_and_bidi(text.as_str()),
            Latin1Bidi::Latin1
        );
    }
}
2273 | |
#[test]
fn test_is_str_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut src: Vec<u16> = (0..len).map(|unit| unit as u16).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Push one more position above U+00FF on each step; the writes
        // persist across outer iterations.
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(tail).unwrap();
            assert!(!is_str_latin1(text.as_str()));
            assert_ne!(
                check_str_for_latin1_and_bidi(text.as_str()),
                Latin1Bidi::Latin1
            );
        }
    }
}
2292 | |
#[test]
fn test_is_utf8_latin1_success() {
    let len: usize = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
    let src: Vec<u16> = (0..len).map(|unit| unit as u16).collect();
    for start in 0..src.len() {
        // Go through UTF-16 to obtain the UTF-8 form of the same code points.
        let text = String::from_utf16(&src[start..]).unwrap();
        let utf8 = text.as_bytes();
        assert!(is_utf8_latin1(utf8));
        assert_eq!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
    }
}
2310 | |
#[test]
fn test_is_utf8_latin1_fail() {
    let len: usize = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
    let mut src: Vec<u16> = (0..len).map(|unit| unit as u16).collect();
    for start in 0..src.len() {
        let tail = &mut src[start..];
        // Push one more position above U+00FF on each step; the writes
        // persist across outer iterations.
        for pos in 0..tail.len() {
            tail[pos] = 0x100 + pos as u16;
            let text = String::from_utf16(tail).unwrap();
            let utf8 = text.as_bytes();
            assert!(!is_utf8_latin1(utf8));
            assert_ne!(check_utf8_for_latin1_and_bidi(utf8), Latin1Bidi::Latin1);
        }
    }
}
2332 | |
#[test]
fn test_is_utf8_latin1_invalid() {
    // Byte sequences that are not valid UTF-8: a lead byte without its
    // trail, the byte 0xFF, and a lead byte followed by 0xFF.
    let malformed: [&[u8]; 6] = [
        b" \xC3",
        b"a \xC3",
        b" \xFF",
        b"a \xFF",
        b" \xC3\xFF",
        b"a \xC3\xFF",
    ];
    for &input in malformed.iter() {
        assert!(!is_utf8_latin1(input), "accepted {:?}", input);
    }
}
2342 | |
#[test]
fn test_convert_utf8_to_utf16() {
    let input = "abcdefghijklmnopqrstu \u{1F4A9}v \u{2603}w \u{00B6}xyzz";
    // The standard library's encoder serves as the reference.
    let expected: Vec<u16> = input.encode_utf16().collect();
    // Output buffer: one code unit more than the input has bytes.
    let mut output = vec![0u16; input.len() + 1];
    let written = convert_utf8_to_utf16(input.as_bytes(), &mut output[..]);
    assert_eq!(&output[..written], &expected[..]);
}
2353 | |
#[test]
fn test_convert_str_to_utf16() {
    let input = "abcdefghijklmnopqrstu \u{1F4A9}v \u{2603}w \u{00B6}xyzz";
    // The standard library's encoder serves as the reference.
    let expected: Vec<u16> = input.encode_utf16().collect();
    // Output buffer: as many code units as the input has bytes.
    let mut output = vec![0u16; input.len()];
    let written = convert_str_to_utf16(input, &mut output[..]);
    assert_eq!(&output[..written], &expected[..]);
}
2364 | |
#[test]
fn test_convert_utf16_to_utf8_partial() {
    let expected = "abcdefghijklmnopqrstu \u{1F4A9}v \u{2603}w \u{00B6}xyzz";
    let input: Vec<u16> = expected.encode_utf16().collect();
    let mut output = vec![0u8; input.len() * 3 + 1];
    // First pass: only the first 24 bytes of the output are made available,
    // so the conversion has to stop part-way through the input.
    let (read, head_len) = convert_utf16_to_utf8_partial(&input[..], &mut output[..24]);
    // Second pass: convert the rest of the input into the rest of the output.
    let tail_len = convert_utf16_to_utf8(&input[read..], &mut output[head_len..]);
    assert_eq!(&output[..head_len + tail_len], expected.as_bytes());
}
2376 | |
#[test]
fn test_convert_utf16_to_utf8() {
    let expected = "abcdefghijklmnopqrstu \u{1F4A9}v \u{2603}w \u{00B6}xyzz";
    let input: Vec<u16> = expected.encode_utf16().collect();
    // Output buffer: three bytes per code unit plus one.
    let mut output = vec![0u8; input.len() * 3 + 1];
    let written = convert_utf16_to_utf8(&input[..], &mut output[..]);
    assert_eq!(&output[..written], expected.as_bytes());
}
2387 | |
#[test]
fn test_convert_latin1_to_utf16() {
    // Every byte value, and the same values as UTF-16 code units.
    let input: Vec<u8> = (0..256).map(|value| value as u8).collect();
    let expected: Vec<u16> = (0u16..256).collect();
    let mut output = vec![0u16; input.len()];
    convert_latin1_to_utf16(&input[..], &mut output[..]);
    assert_eq!(output, expected);
}
2403 | |
#[test]
fn test_convert_latin1_to_utf8_partial() {
    // A two-byte output buffer: after the ASCII 'a' only one byte of room is
    // left, which is too little for the two-byte encoding of 0xFF.
    let mut dst = [0u8; 2];
    let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
    // Only the 'a' may be consumed and written.
    assert_eq!(read, 1);
    assert_eq!(written, 1);
}
2411 | |
#[test]
fn test_convert_latin1_to_utf8() {
    // Every byte value once, in ascending order.
    let input: Vec<u8> = (0..256).map(|value| value as u8).collect();
    // Reference: the same values taken as code points U+0000 to U+00FF and
    // encoded by the standard library.
    let expected: String = input.iter().map(|&byte| byte as char).collect();
    let mut output = vec![0u8; input.len() * 2];
    let written = convert_latin1_to_utf8(&input[..], &mut output[..]);
    assert_eq!(&output[..written], expected.as_bytes());
}
2429 | |
#[test]
fn test_convert_utf8_to_latin1_lossy() {
    // Every byte value once, in ascending order.
    let expected: Vec<u8> = (0..256).map(|value| value as u8).collect();
    // Input: the same values taken as code points U+0000 to U+00FF and
    // encoded as UTF-8 by the standard library.
    let input: String = expected.iter().map(|&byte| byte as char).collect();
    let mut output = vec![0u8; input.len()];
    let written = convert_utf8_to_latin1_lossy(input.as_bytes(), &mut output[..]);
    assert_eq!(&output[..written], &expected[..]);
}
2447 | |
// Only meaningful when the debug assertion inside the function is compiled in.
#[cfg(all(debug_assertions, not(fuzzing)))]
#[test]
#[should_panic]
fn test_convert_utf8_to_latin1_lossy_panics() {
    // U+0100 is the first code point outside the Latin1 range.
    let input = " \u{100}";
    let mut sink = [0u8; 16];
    let _ = convert_utf8_to_latin1_lossy(input.as_bytes(), &mut sink[..]);
}
2455 | |
#[test]
fn test_convert_utf16_to_latin1_lossy() {
    // Code units U+0000 to U+00FF, and the same values as bytes.
    let input: Vec<u16> = (0u16..256).collect();
    let expected: Vec<u8> = (0..256).map(|value| value as u8).collect();
    let mut output = vec![0u8; input.len()];
    convert_utf16_to_latin1_lossy(&input[..], &mut output[..]);
    assert_eq!(output, expected);
}
2471 | |
#[test]
// #[should_panic]
fn test_convert_utf16_to_latin1_lossy_panics() {
    // With `should_panic` disabled, this only checks that out-of-range input
    // (U+0100) is handled without panicking.
    let out_of_range = [0x0100u16];
    let mut sink = [0u8; 16];
    convert_utf16_to_latin1_lossy(&out_of_range, &mut sink[..]);
}
2478 | |
#[test]
fn test_utf16_valid_up_to() {
    // Well-formed throughout, including one surrogate pair: the result is
    // the full length.
    let valid = [
        0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16,
        0xD83Du16, 0xDCA9u16, 0x00B6u16,
    ];
    assert_eq!(utf16_valid_up_to(&valid), 16);

    // High surrogate followed by a non-surrogate.
    let lone_high = [
        0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0x2603u16, 0xD83Du16, 0x00B6u16,
    ];
    assert_eq!(utf16_valid_up_to(&lone_high), 14);

    // Low surrogate without a preceding high surrogate.
    let lone_low = [
        0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0x2603u16, 0xDCA9u16, 0x00B6u16,
    ];
    assert_eq!(utf16_valid_up_to(&lone_low), 14);

    // High surrogate as the very last code unit.
    let lone_high_at_end = [
        0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0x2603u16, 0x00B6u16, 0xD83Du16,
    ];
    assert_eq!(utf16_valid_up_to(&lone_high_at_end), 15);
}
2502 | |
#[test]
fn test_ensure_utf16_validity() {
    // Input with a lone high surrogate, a proper surrogate pair and a lone
    // low surrogate.
    let mut buffer = [
        0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
    ];
    // Same content with both lone surrogates replaced by U+FFFD; the pair is
    // left alone.
    let expected = [
        0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
        0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
    ];
    ensure_utf16_validity(&mut buffer);
    assert_eq!(&buffer[..], &expected[..]);
}
2518 | |
#[test]
fn test_is_char_bidi() {
    // Characters for which the check has to answer "no".
    let not_bidi = [
        'a',
        '\u{03B1}',
        '\u{3041}',
        '\u{1F4A9}',
        '\u{FE00}',
        '\u{202C}',
        '\u{FEFF}',
    ];
    for &c in not_bidi.iter() {
        assert!(!is_char_bidi(c), "U+{:04X} reported as bidi", c as u32);
    }

    // Characters for which the check has to answer "yes".
    let bidi = [
        '\u{0590}',
        '\u{08FF}',
        '\u{061C}',
        '\u{FB50}',
        '\u{FDFF}',
        '\u{FE70}',
        '\u{FEFE}',
        '\u{200F}',
        '\u{202B}',
        '\u{202E}',
        '\u{2067}',
        '\u{10800}',
        '\u{10FFF}',
        '\u{1E800}',
        '\u{1EFFF}',
    ];
    for &c in bidi.iter() {
        assert!(is_char_bidi(c), "U+{:04X} not reported as bidi", c as u32);
    }
}
2544 | |
#[test]
fn test_is_utf16_code_unit_bidi() {
    // Code units for which the check has to answer "no".
    let not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &unit in not_bidi.iter() {
        assert!(!is_utf16_code_unit_bidi(unit), "{:#06X} reported as bidi", unit);
    }

    // Code units for which the check has to answer "yes".
    let bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &unit in bidi.iter() {
        assert!(is_utf16_code_unit_bidi(unit), "{:#06X} not reported as bidi", unit);
    }
}
2571 | |
#[test]
fn test_is_str_bidi() {
    // Each sample places one character of interest between two runs of
    // ASCII letters.

    // Samples for which the check has to answer "no".
    let not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop \u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop \u{3041}abcdefghijklmnop",
        "abcdefghijklmnop \u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop \u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop \u{202C}abcdefghijklmnop",
        "abcdefghijklmnop \u{FEFF}abcdefghijklmnop",
    ];
    for &sample in not_bidi.iter() {
        assert!(!is_str_bidi(sample), "{:?} reported as bidi", sample);
    }

    // Samples for which the check has to answer "yes".
    let bidi = [
        "abcdefghijklmnop \u{0590}abcdefghijklmnop",
        "abcdefghijklmnop \u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop \u{061C}abcdefghijklmnop",
        "abcdefghijklmnop \u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop \u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop \u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop \u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop \u{200F}abcdefghijklmnop",
        "abcdefghijklmnop \u{202B}abcdefghijklmnop",
        "abcdefghijklmnop \u{202E}abcdefghijklmnop",
        "abcdefghijklmnop \u{2067}abcdefghijklmnop",
        "abcdefghijklmnop \u{10800}abcdefghijklmnop",
        "abcdefghijklmnop \u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop \u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop \u{1EFFF}abcdefghijklmnop",
    ];
    for &sample in bidi.iter() {
        assert!(is_str_bidi(sample), "{:?} not reported as bidi", sample);
    }
}
2597 | |
#[test]
fn test_is_utf8_bidi() {
    // Each input is handed to `is_utf8_bidi` as its UTF-8 byte sequence.

    // Inputs for which the answer must be `false`.
    let expect_false = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop \u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop \u{3041}abcdefghijklmnop",
        "abcdefghijklmnop \u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop \u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop \u{202C}abcdefghijklmnop",
        "abcdefghijklmnop \u{FEFF}abcdefghijklmnop",
    ];
    for &text in expect_false.iter() {
        assert!(!is_utf8_bidi(text.as_bytes()), "expected false for {:?}", text);
    }

    // Inputs for which the answer must be `true`.
    let expect_true = [
        "abcdefghijklmnop \u{0590}abcdefghijklmnop",
        "abcdefghijklmnop \u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop \u{061C}abcdefghijklmnop",
        "abcdefghijklmnop \u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop \u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop \u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop \u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop \u{200F}abcdefghijklmnop",
        "abcdefghijklmnop \u{202B}abcdefghijklmnop",
        "abcdefghijklmnop \u{202E}abcdefghijklmnop",
        "abcdefghijklmnop \u{2067}abcdefghijklmnop",
        "abcdefghijklmnop \u{10800}abcdefghijklmnop",
        "abcdefghijklmnop \u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop \u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop \u{1EFFF}abcdefghijklmnop",
    ];
    for &text in expect_true.iter() {
        assert!(is_utf8_bidi(text.as_bytes()), "expected true for {:?}", text);
    }
}
2667 | |
#[test]
fn test_is_utf16_bidi() {
    // Builds the 17-unit fixture: eight ASCII code units (0x62..=0x69),
    // then `probe`, then the same eight ASCII code units again.
    fn padded(probe: u16) -> [u16; 17] {
        const PAD: [u16; 8] = [0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69];
        let mut units = [0u16; 17];
        units[..8].copy_from_slice(&PAD);
        units[8] = probe;
        units[9..].copy_from_slice(&PAD);
        units
    }

    // Probes for which `is_utf16_bidi` must answer `false`.
    let expect_false: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &probe in expect_false.iter() {
        assert!(
            !is_utf16_bidi(&padded(probe)),
            "expected false for code unit 0x{:04X}",
            probe
        );
    }

    // Probes for which `is_utf16_bidi` must answer `true`.
    let expect_true: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &probe in expect_true.iter() {
        assert!(
            is_utf16_bidi(&padded(probe)),
            "expected true for code unit 0x{:04X}",
            probe
        );
    }

    // A `true` probe (0x0590) immediately followed by a `false` one (0x3041).
    let mixed: [u16; 18] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ];
    assert!(is_utf16_bidi(&mixed));
}
2768 | |
#[test]
fn test_check_str_for_latin1_and_bidi() {
    // Inputs whose classification must be anything other than `Bidi`.
    let expect_not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop \u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop \u{3041}abcdefghijklmnop",
        "abcdefghijklmnop \u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop \u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop \u{202C}abcdefghijklmnop",
        "abcdefghijklmnop \u{FEFF}abcdefghijklmnop",
    ];
    for &text in expect_not_bidi.iter() {
        assert_ne!(
            check_str_for_latin1_and_bidi(text),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }

    // Inputs whose classification must be exactly `Bidi`.
    let expect_bidi = [
        "abcdefghijklmnop \u{0590}abcdefghijklmnop",
        "abcdefghijklmnop \u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop \u{061C}abcdefghijklmnop",
        "abcdefghijklmnop \u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop \u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop \u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop \u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop \u{200F}abcdefghijklmnop",
        "abcdefghijklmnop \u{202B}abcdefghijklmnop",
        "abcdefghijklmnop \u{202E}abcdefghijklmnop",
        "abcdefghijklmnop \u{2067}abcdefghijklmnop",
        "abcdefghijklmnop \u{10800}abcdefghijklmnop",
        "abcdefghijklmnop \u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop \u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop \u{1EFFF}abcdefghijklmnop",
    ];
    for &text in expect_bidi.iter() {
        assert_eq!(
            check_str_for_latin1_and_bidi(text),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
2860 | |
#[test]
fn test_check_utf8_for_latin1_and_bidi() {
    // Each input is handed to `check_utf8_for_latin1_and_bidi` as its UTF-8
    // byte sequence.

    // Inputs whose classification must be anything other than `Bidi`.
    let expect_not_bidi = [
        "abcdefghijklmnopaabcdefghijklmnop",
        "abcdefghijklmnop \u{03B1}abcdefghijklmnop",
        "abcdefghijklmnop \u{3041}abcdefghijklmnop",
        "abcdefghijklmnop \u{1F4A9}abcdefghijklmnop",
        "abcdefghijklmnop \u{FE00}abcdefghijklmnop",
        "abcdefghijklmnop \u{202C}abcdefghijklmnop",
        "abcdefghijklmnop \u{FEFF}abcdefghijklmnop",
    ];
    for &text in expect_not_bidi.iter() {
        assert_ne!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }

    // Inputs whose classification must be exactly `Bidi`.
    let expect_bidi = [
        "abcdefghijklmnop \u{0590}abcdefghijklmnop",
        "abcdefghijklmnop \u{08FF}abcdefghijklmnop",
        "abcdefghijklmnop \u{061C}abcdefghijklmnop",
        "abcdefghijklmnop \u{FB50}abcdefghijklmnop",
        "abcdefghijklmnop \u{FDFF}abcdefghijklmnop",
        "abcdefghijklmnop \u{FE70}abcdefghijklmnop",
        "abcdefghijklmnop \u{FEFE}abcdefghijklmnop",
        "abcdefghijklmnop \u{200F}abcdefghijklmnop",
        "abcdefghijklmnop \u{202B}abcdefghijklmnop",
        "abcdefghijklmnop \u{202E}abcdefghijklmnop",
        "abcdefghijklmnop \u{2067}abcdefghijklmnop",
        "abcdefghijklmnop \u{10800}abcdefghijklmnop",
        "abcdefghijklmnop \u{10FFF}abcdefghijklmnop",
        "abcdefghijklmnop \u{1E800}abcdefghijklmnop",
        "abcdefghijklmnop \u{1EFFF}abcdefghijklmnop",
    ];
    for &text in expect_bidi.iter() {
        assert_eq!(
            check_utf8_for_latin1_and_bidi(text.as_bytes()),
            Latin1Bidi::Bidi,
            "input: {:?}",
            text
        );
    }
}
2952 | |
#[test]
fn test_check_utf16_for_latin1_and_bidi() {
    // Builds the 17-unit fixture: eight ASCII code units (0x62..=0x69),
    // then `probe`, then the same eight ASCII code units again.
    fn padded(probe: u16) -> [u16; 17] {
        const PAD: [u16; 8] = [0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69];
        let mut units = [0u16; 17];
        units[..8].copy_from_slice(&PAD);
        units[8] = probe;
        units[9..].copy_from_slice(&PAD);
        units
    }

    // Probes whose classification must be anything other than `Bidi`.
    let expect_not_bidi: [u16; 7] = [0x0062, 0x03B1, 0x3041, 0xD801, 0xFE00, 0x202C, 0xFEFF];
    for &probe in expect_not_bidi.iter() {
        assert_ne!(
            check_utf16_for_latin1_and_bidi(&padded(probe)),
            Latin1Bidi::Bidi,
            "code unit 0x{:04X}",
            probe
        );
    }

    // Probes whose classification must be exactly `Bidi`.
    let expect_bidi: [u16; 16] = [
        0x0590, 0x08FF, 0x061C, 0xFB1D, 0xFB50, 0xFDFF, 0xFE70, 0xFEFE, 0x200F, 0x202B, 0x202E,
        0x2067, 0xD802, 0xD803, 0xD83A, 0xD83B,
    ];
    for &probe in expect_bidi.iter() {
        assert_eq!(
            check_utf16_for_latin1_and_bidi(&padded(probe)),
            Latin1Bidi::Bidi,
            "code unit 0x{:04X}",
            probe
        );
    }

    // A `Bidi` probe (0x0590) immediately followed by a non-`Bidi` one
    // (0x3041).
    let mixed: [u16; 18] = [
        0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
        0x66, 0x67, 0x68, 0x69,
    ];
    assert_eq!(check_utf16_for_latin1_and_bidi(&mixed), Latin1Bidi::Bidi);
}
3125 | |
/// Straightforward reference classifier that the exhaustive tests in this
/// module compare `is_char_bidi` and the string/UTF-8 checks against.
///
/// Returns `true` when `c` lies in one of the inclusive ranges
/// U+0590..=U+08FF, U+FB1D..=U+FDFF, U+FE70..=U+FEFE, U+10800..=U+10FFF,
/// U+1E800..=U+1EFFF, or is one of the individual code points U+200F,
/// U+202B, U+202E, U+2067; returns `false` for every other `char`.
#[inline(always)]
pub fn reference_is_char_bidi(c: char) -> bool {
    // Each bound must be a single-scalar `char` literal; any extra character
    // inside the quotes (such as a leading space) does not compile.
    match c {
        '\u{0590}'..='\u{08FF}'
        | '\u{FB1D}'..='\u{FDFF}'
        | '\u{FE70}'..='\u{FEFE}'
        | '\u{10800}'..='\u{10FFF}'
        | '\u{1E800}'..='\u{1EFFF}'
        | '\u{200F}'
        | '\u{202B}'
        | '\u{202E}'
        | '\u{2067}' => true,
        _ => false,
    }
}
3141 | |
/// Straightforward reference classifier for a single UTF-16 code unit, used
/// as the expected value by the exhaustive UTF-16 tests in this module.
///
/// Returns `true` when `u` lies in one of the inclusive ranges
/// 0x0590..=0x08FF, 0xFB1D..=0xFDFF, 0xFE70..=0xFEFE, or equals one of
/// 0xD802, 0xD803, 0xD83A, 0xD83B (the high surrogates that encode
/// U+10800..=U+10FFF and U+1E800..=U+1EFFF), 0x200F, 0x202B, 0x202E, 0x2067.
#[inline(always)]
pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
    // Inclusive (low, high) bounds.
    const RANGES: [(u16, u16); 3] = [(0x0590, 0x08FF), (0xFB1D, 0xFDFF), (0xFE70, 0xFEFE)];
    // Individually listed code units.
    const SINGLES: [u16; 8] = [
        0xD802, 0xD803, 0xD83A, 0xD83B, 0x200F, 0x202B, 0x202E, 0x2067,
    ];

    let in_a_range = RANGES.iter().any(|&(low, high)| low <= u && u <= high);
    in_a_range || SINGLES.contains(&u)
}
3159 | |
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_char_bidi_thoroughly() {
    // Compare `is_char_bidi` with the reference for every `char`. The gap
    // 0xD800..0xE000 is left out because `char::from_u32` has no value for
    // surrogates.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|i| ::core::char::from_u32(i).unwrap()) {
        assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
    }
}
3172 | |
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_code_unit_bidi_thoroughly() {
    // Every possible 16-bit code unit, surrogates included.
    for unit in 0..=0xFFFFu16 {
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_code_unit_bidi(unit), expected);
    }
}
3184 | |
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_str_bidi_thoroughly() {
    // Scratch space large enough for the UTF-8 form of any single `char`.
    let mut scratch = [0u8; 4];
    // Every `char`: the two ranges together skip only the surrogates.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|i| ::core::char::from_u32(i).unwrap()) {
        let expected = reference_is_char_bidi(c);
        let one_char_str = c.encode_utf8(&mut scratch[..]);
        assert_eq!(is_str_bidi(one_char_str), expected);
    }
}
3204 | |
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf8_bidi_thoroughly() {
    let mut scratch = [0u8; 8];
    // Every `char`: the two ranges together skip only the surrogates.
    let scalar_values = (0..0xD800u32).chain(0xE000..0x110000u32);
    for c in scalar_values.map(|i| ::core::char::from_u32(i).unwrap()) {
        let expected = reference_is_char_bidi(c);

        // First check: exactly the encoded bytes of `c`.
        let encoded_len = c.encode_utf8(&mut scratch[..]).len();
        assert_eq!(is_utf8_bidi(&scratch[..encoded_len]), expected);

        // Second check: the whole 8-byte buffer, with everything after the
        // encoded character reset to zero so that no bytes from a previous,
        // longer character remain.
        for byte in scratch[encoded_len..].iter_mut() {
            *byte = 0;
        }
        assert_eq!(is_utf8_bidi(&scratch[..]), expected);
    }
}
3246 | |
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_is_utf16_bidi_thoroughly() {
    // 32 zeroed code units; only slot 15 varies between iterations.
    let mut units = [0u16; 32];
    for unit in 0..=0xFFFFu16 {
        units[15] = unit;
        let expected = reference_is_utf16_code_unit_bidi(unit);
        assert_eq!(is_utf16_bidi(&units[..]), expected);
    }
}
3260 | |
#[test]
fn test_is_utf8_bidi_edge_cases() {
    // Short inputs ending in the ASCII byte 0x61 (or plain "abc"):
    // `is_utf8_bidi` must answer `false`.
    let expect_false: [&[u8]; 3] = [b" \xD5\xBF\x61", b" \xD6\x80\x61", b"abc"];
    for &bytes in expect_false.iter() {
        assert!(!is_utf8_bidi(bytes), "expected false for {:?}", bytes);
    }

    // The same shapes, but ending in a lone 0xC2 byte: `is_utf8_bidi` must
    // answer `true`.
    let expect_true: [&[u8]; 3] = [b" \xD5\xBF\xC2", b" \xD6\x80\xC2", b"ab \xC2"];
    for &bytes in expect_true.iter() {
        assert!(is_utf8_bidi(bytes), "expected true for {:?}", bytes);
    }
}
3270 | |
#[test]
fn test_decode_latin1() {
    // For the input `b"ab"` the result must be the borrowed variant.
    if let Cow::Borrowed(text) = decode_latin1(b"ab") {
        assert_eq!(text, "ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // Byte 0xE4 must come out as U+00E4.
    let decoded = decode_latin1(b"a \xE4");
    assert_eq!(decoded, "a \u{E4}");
}
3283 | |
#[test]
fn test_encode_latin1_lossy() {
    // For the input `"ab"` the result must be the borrowed variant.
    if let Cow::Borrowed(bytes) = encode_latin1_lossy("ab") {
        assert_eq!(bytes, b"ab");
    } else {
        unreachable!("Should have borrowed");
    }

    // U+00E4 must come out as the single byte 0xE4.
    let encoded = encode_latin1_lossy("a \u{E4}");
    assert_eq!(encoded, &(b"a \xE4")[..]);
}
3296 | |
#[test]
fn test_convert_utf8_to_utf16_without_replacement() {
    // `buf` is reused across all calls and never cleared. Assertions on slots
    // at or beyond the returned length therefore observe what an earlier
    // conversion (or the initial zero fill) left there.
    //
    // Every input literal starts directly with its first byte (no leading
    // padding), so the destination slice length matches the input length and
    // `buf[0]` receives the first decoded code unit.
    let mut buf = [0u16; 5];

    // Two ASCII bytes.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"ab", &mut buf[..2]),
        Some(2)
    );
    assert_eq!(buf[0], u16::from(b'a'));
    assert_eq!(buf[1], u16::from(b'b'));
    assert_eq!(buf[2], 0);

    // Two-byte sequence (U+00E4) followed by ASCII.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xC3\xA4c", &mut buf[..3]),
        Some(2)
    );
    assert_eq!(buf[0], 0xE4);
    assert_eq!(buf[1], u16::from(b'c'));
    assert_eq!(buf[2], 0);

    // Three-byte sequence (U+2603) alone. Only one unit is reported, and
    // slot 1 still holds the `c` stored by the previous conversion.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83", &mut buf[..3]),
        Some(1)
    );
    assert_eq!(buf[0], 0x2603);
    assert_eq!(buf[1], u16::from(b'c'));
    assert_eq!(buf[2], 0);

    // Three-byte sequence followed by ASCII.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83d", &mut buf[..4]),
        Some(2)
    );
    assert_eq!(buf[0], 0x2603);
    assert_eq!(buf[1], u16::from(b'd'));
    assert_eq!(buf[2], 0);

    // Three-byte sequence followed by a two-byte sequence.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83\xC3\xA4", &mut buf[..5]),
        Some(2)
    );
    assert_eq!(buf[0], 0x2603);
    assert_eq!(buf[1], 0xE4);
    assert_eq!(buf[2], 0);

    // Four-byte sequence (U+1F4CE) becomes a surrogate pair.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8E", &mut buf[..4]),
        Some(2)
    );
    assert_eq!(buf[0], 0xD83D);
    assert_eq!(buf[1], 0xDCCE);
    assert_eq!(buf[2], 0);

    // Four-byte sequence followed by ASCII.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8Ee", &mut buf[..5]),
        Some(3)
    );
    assert_eq!(buf[0], 0xD83D);
    assert_eq!(buf[1], 0xDCCE);
    assert_eq!(buf[2], u16::from(b'e'));

    // Four-byte sequence cut off after three bytes: no result.
    assert_eq!(
        convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
        None
    );
}
3354 | } |
3355 | |