// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// It's assumed that in due course Rust will have explicit SIMD but will not
// be good at run-time selection of SIMD vs. no-SIMD. In such a future,
// x86_64 will always use SSE2 and 32-bit x86 will use SSE2 when compiled with
// a Mozilla-shipped rustc. SIMD support and especially detection on ARM is a
// mess. Under the circumstances, it seems to make sense to optimize the ALU
// case for ARMv7 rather than x86. Annoyingly, I was unable to get useful
// numbers out of the actual ARMv7 CPU I have access to, because (thermal?)
// throttling kept interfering. Since Raspberry Pi 3 (ARMv8 core but running
// ARMv7 code) produced reproducible performance numbers, that's the ARM
// computer that this code ended up being optimized for in the ALU case.
// Less popular CPU architectures simply get the approach that was chosen
// based on Raspberry Pi 3 measurements. The UTF-16 and UTF-8 ALU cases take
// different approaches based on benchmarking on Raspberry Pi 3.
#[cfg(all(
    feature = "simd-accel",
    any(
        target_feature = "sse2",
        all(target_endian = "little", target_arch = "aarch64"),
        all(target_endian = "little", target_feature = "neon")
    )
))]
use crate::simd_funcs::*;

cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        #[allow(unused_imports)]
        use ::core::intrinsics::unlikely;
        #[allow(unused_imports)]
        use ::core::intrinsics::likely;
    } else {
        #[allow(dead_code)]
        #[inline(always)]
        fn unlikely(b: bool) -> bool {
            b
        }
        #[allow(dead_code)]
        #[inline(always)]
        fn likely(b: bool) -> bool {
            b
        }
    }
}

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;

// `as` truncates, so works on 32-bit, too.
#[allow(dead_code)]
pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize;
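// For reference: on a 32-bit target these truncate to 0x8080_8080 and
// 0xFF80_FF80, i.e. the high bit of each byte and the upper nine bits of
// each u16 lane, respectively.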

#[allow(unused_macros)]
macro_rules! ascii_naive {
    ($name:ident, $src_unit:ty, $dst_unit:ty) => {
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            // Yes, manually omitting the bounds check here matters
            // a lot for perf.
            for i in 0..len {
                let code_unit = *(src.add(i));
                if code_unit > 127 {
                    return Some((code_unit, i));
                }
                *(dst.add(i)) = code_unit as $dst_unit;
            }
            return None;
        }
    };
}
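// For example, `ascii_naive!(ascii_to_ascii, u8, u8)` (used in the fallback
// branch of the `cfg_if!` below) expands to a function that copies code units
// one at a time and reports the first non-ASCII code unit and its index.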

#[allow(unused_macros)]
macro_rules! ascii_alu {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty,
     $stride_fn:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment.
                    // if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_ascii
                    let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
                    let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
                    if src_alignment != dst_alignment {
                        break;
                    }
                    (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
                    // } else if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_basic_latin
                    // let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    // if (dst.add(src_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
                    //     break;
                    // }
                    // src_until_alignment
                    // } else {
                    // basic_latin_to_ascii
                    // let dst_until_alignment = (ALIGNMENT - ((dst as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
                    // if (src.add(dst_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
                    //     break;
                    // }
                    // dst_until_alignment
                    // }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), apart
                    // from mixed results when encoding from UTF-16, and since x86
                    // and x86_64 should be using SSE2 in due course, the move to
                    // alignment is kept here. It would be good to test on more
                    // ARM CPUs and on real MIPS and POWER hardware.
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        if let Some(num_ascii) = $stride_fn(
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            offset += num_ascii;
                            return Some((*(src.add(offset)), offset));
                        }
                        offset += ALU_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! basic_latin_alu {
    ($name:ident,
     $src_unit:ty,
     $dst_unit:ty,
     $stride_fn:ident) => {
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                let mut until_alignment = {
                    // Check if the other unit aligns if we move the narrower unit
                    // to alignment.
                    // if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
                    // ascii_to_ascii
                    // let src_alignment = (src as usize) & ALIGNMENT_MASK;
                    // let dst_alignment = (dst as usize) & ALIGNMENT_MASK;
                    // if src_alignment != dst_alignment {
                    //     break;
                    // }
                    // (ALIGNMENT - src_alignment) & ALIGNMENT_MASK
                    // } else
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // ascii_to_basic_latin
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // basic_latin_to_ascii
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    // Moving pointers to alignment seems to be a pessimization on
                    // x86_64 for operations that have UTF-16 as the internal
                    // Unicode representation. However, since it seems to be a win
                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), apart
                    // from mixed results when encoding from UTF-16, and since x86
                    // and x86_64 should be using SSE2 in due course, the move to
                    // alignment is kept here. It would be good to test on more
                    // ARM CPUs and on real MIPS and POWER hardware.
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        if code_unit > 127 {
                            return Some((code_unit, offset));
                        }
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        if !$stride_fn(
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        ) {
                            break;
                        }
                        offset += ALU_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
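
// A concrete example of the alignment precondition above (illustrative
// numbers): for ascii_to_basic_latin (u8 -> u16) with ALU_ALIGNMENT == 8,
// if `src` needs 3 more bytes to become aligned, the stride loop is entered
// only if `dst` advanced by 3 u16s (6 bytes) is itself 8-byte aligned;
// otherwise the scalar tail loop in the expanded function handles the whole
// buffer one code unit at a time.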

#[allow(unused_macros)]
macro_rules! latin1_alu {
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => {
        #[cfg_attr(
            feature = "cargo-clippy",
            allow(never_loop, cast_ptr_alignment, cast_lossless)
        )]
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            // This loop is only broken out of as a `goto` forward
            loop {
                let mut until_alignment = {
                    if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
                        // unpack
                        let src_until_alignment = (ALU_ALIGNMENT
                            - ((src as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        src_until_alignment
                    } else {
                        // pack
                        let dst_until_alignment = (ALU_ALIGNMENT
                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
                            & ALU_ALIGNMENT_MASK;
                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
                            != 0
                        {
                            break;
                        }
                        dst_until_alignment
                    }
                };
                if until_alignment + ALU_STRIDE_SIZE <= len {
                    while until_alignment != 0 {
                        let code_unit = *(src.add(offset));
                        *(dst.add(offset)) = code_unit as $dst_unit;
                        offset += 1;
                        until_alignment -= 1;
                    }
                    let len_minus_stride = len - ALU_STRIDE_SIZE;
                    loop {
                        $stride_fn(
                            src.add(offset) as *const usize,
                            dst.add(offset) as *mut usize,
                        );
                        offset += ALU_STRIDE_SIZE;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
                break;
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        $stride_both_aligned:ident,
        $stride_src_aligned:ident,
        $stride_dst_aligned:ident,
        $stride_neither_aligned:ident
    ) => {
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                // XXX Should we first process one stride unconditionally as unaligned to
                // avoid the cost of the branchiness below if the first stride fails anyway?
                // XXX Should we just use unaligned SSE2 access unconditionally? It seems that
                // on Haswell, it would make sense to just use unaligned and not bother
                // checking. Need to benchmark older architectures before deciding.
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    if dst_masked == 0 {
                        loop {
                            if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else {
                    if dst_masked == 0 {
                        loop {
                            if !$stride_dst_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                }
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
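
// Note: the four stride variants differ only in which of `src` and `dst` are
// SIMD-aligned; the alignment check is done once per call so that each inner
// loop uses a single load/store flavor.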

#[allow(unused_macros)]
macro_rules! ascii_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        $stride_both_aligned:ident,
        $stride_src_aligned:ident,
        $stride_neither_aligned:ident,
        $double_stride_both_aligned:ident,
        $double_stride_src_aligned:ident
    ) => {
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            // This loop is only broken out of as a goto forward without
            // actually looping
            'outer: loop {
                if SIMD_STRIDE_SIZE <= len {
                    // First, process one stride unaligned
                    if !$stride_neither_aligned(src, dst) {
                        break 'outer;
                    }
                    offset = SIMD_STRIDE_SIZE;

                    // We have now seen 16 ASCII bytes. Let's guess that
                    // there will be enough more to justify more expense
                    // in the case of non-ASCII.
                    // Use aligned reads for the sake of old microarchitectures.
                    let until_alignment = ((SIMD_ALIGNMENT
                        - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK))
                        & SIMD_ALIGNMENT_MASK)
                        / unit_size;
                    // This addition won't overflow, because even in the 32-bit PAE case the
                    // address space holds enough code that the slice length can't be that
                    // close to address space size.
                    // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
                    if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
                        if until_alignment != 0 {
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += until_alignment;
                        }
                        let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
                        let dst_masked = (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK;
                        if dst_masked == 0 {
                            loop {
                                if let Some(advance) =
                                    $double_stride_both_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        } else {
                            loop {
                                if let Some(advance) =
                                    $double_stride_src_aligned(src.add(offset), dst.add(offset))
                                {
                                    offset += advance;
                                    let code_unit = *(src.add(offset));
                                    return Some((code_unit, offset));
                                }
                                offset += SIMD_STRIDE_SIZE * 2;
                                if offset > len_minus_stride_times_two {
                                    break;
                                }
                            }
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
                                    break 'outer;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    } else {
                        // At most two iterations, so unroll
                        if offset + SIMD_STRIDE_SIZE <= len {
                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                break;
                            }
                            offset += SIMD_STRIDE_SIZE;
                            if offset + SIMD_STRIDE_SIZE <= len {
                                if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                                    break;
                                }
                                offset += SIMD_STRIDE_SIZE;
                            }
                        }
                    }
                }
                break 'outer;
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_simd_check_align {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        $stride_both_aligned:ident,
        $stride_src_aligned:ident,
        $stride_dst_aligned:ident,
        $stride_neither_aligned:ident
    ) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
                    if dst_masked == 0 {
                        loop {
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                } else {
                    if dst_masked == 0 {
                        loop {
                            $stride_dst_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    } else {
                        loop {
                            $stride_neither_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride {
                                break;
                            }
                        }
                    }
                }
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_simd_check_align_unrolled {
    (
        $name:ident,
        $src_unit:ty,
        $dst_unit:ty,
        $stride_both_aligned:ident,
        $stride_src_aligned:ident,
        $stride_dst_aligned:ident,
        $stride_neither_aligned:ident
    ) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let unit_size = ::core::mem::size_of::<$src_unit>();
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                let mut until_alignment = ((SIMD_STRIDE_SIZE
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                while until_alignment != 0 {
                    *(dst.add(offset)) = *(src.add(offset)) as $dst_unit;
                    offset += 1;
                    until_alignment -= 1;
                }
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                if offset + SIMD_STRIDE_SIZE * 2 <= len {
                    let len_minus_stride_times_two = len_minus_stride - SIMD_STRIDE_SIZE;
                    if (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK == 0 {
                        loop {
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_both_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    } else {
                        loop {
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            $stride_src_aligned(src.add(offset), dst.add(offset));
                            offset += SIMD_STRIDE_SIZE;
                            if offset > len_minus_stride_times_two {
                                break;
                            }
                        }
                    }
                }
                if offset < len_minus_stride {
                    $stride_src_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                }
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                // On x86_64, this loop autovectorizes but in the pack
                // case there are instructions whose purpose is to make sure
                // each u16 in the vector is truncated before packing. However,
                // since we don't care about the saturating behavior of SSE2
                // packing when the input isn't Latin1, those instructions are
                // useless. Unfortunately, using the `assume` intrinsic to lie
                // to the optimizer doesn't make LLVM omit the truncation that
                // we don't need. Possibly this loop could be manually optimized
                // to do the sort of thing that LLVM does but without ANDing
                // the read vectors of u16 with a constant that discards
                // the high half of each u16. As far as I can tell, the
                // optimization assumes that doing a SIMD read past the end of
                // the array is OK.
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_simd_unalign {
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! latin1_simd_unalign {
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    $stride_neither_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            $store(dst, simd);
            true
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_to_ascii_simd_double_stride {
    ($name:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            $store(dst, first);
            if unlikely(!simd_is_ascii(first | second)) {
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    return Some(mask_first.trailing_zeros() as usize);
                }
                $store(dst.add(SIMD_STRIDE_SIZE), second);
                let mask_second = mask_ascii(second);
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            $store(dst.add(SIMD_STRIDE_SIZE), second);
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_to_basic_latin_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            $store(dst.add(8), second);
            true
        }
    };
}

#[allow(unused_macros)]
macro_rules! ascii_to_basic_latin_simd_double_stride {
    ($name:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            let (a, b) = simd_unpack(first);
            $store(dst, a);
            $store(dst.add(SIMD_STRIDE_SIZE / 2), b);
            if unlikely(!simd_is_ascii(first | second)) {
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    return Some(mask_first.trailing_zeros() as usize);
                }
                let (c, d) = simd_unpack(second);
                $store(dst.add(SIMD_STRIDE_SIZE), c);
                $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
                let mask_second = mask_ascii(second);
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            let (c, d) = simd_unpack(second);
            $store(dst.add(SIMD_STRIDE_SIZE), c);
            $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
            None
        }
    };
}

#[allow(unused_macros)]
macro_rules! unpack_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) {
            let simd = $load(src);
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            $store(dst.add(8), second);
        }
    };
}

#[allow(unused_macros)]
macro_rules! basic_latin_to_ascii_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
            let first = $load(src);
            let second = $load(src.add(8));
            if simd_is_basic_latin(first | second) {
                $store(dst, simd_pack(first, second));
                true
            } else {
                false
            }
        }
    };
}

#[allow(unused_macros)]
macro_rules! pack_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) {
            let first = $load(src);
            let second = $load(src.add(8));
            $store(dst, simd_pack(first, second));
        }
    };
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "aarch64"))] {
        // SIMD with the same instructions for aligned and unaligned loads and stores

        pub const SIMD_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

//        pub const ALIGNMENT: usize = 8;

        pub const ALU_STRIDE_SIZE: usize = 16;

        pub const ALU_ALIGNMENT: usize = 8;

        pub const ALU_ALIGNMENT_MASK: usize = 7;

        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
        unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
        pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);

        ascii_simd_unalign!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_neither_aligned);
        ascii_simd_unalign!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_neither_aligned);
        ascii_simd_unalign!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_unalign!(unpack_latin1, u8, u16, unpack_stride_neither_aligned);
        latin1_simd_unalign!(pack_latin1, u16, u8, pack_stride_neither_aligned);
    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
        // SIMD with different instructions for aligned and unaligned loads and stores.
        //
        // Newer microarchitectures are not supposed to have a performance difference between
        // aligned and unaligned SSE2 loads and stores when the address is actually aligned,
        // but the benchmark results I see don't agree.

        pub const SIMD_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const SIMD_ALIGNMENT_MASK: usize = 15;

        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_dst_aligned, load16_unaligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);

        unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
        unpack_simd_stride!(unpack_stride_dst_aligned, load16_unaligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);

        pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
        pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
        pack_simd_stride!(pack_stride_dst_aligned, load8_unaligned, store16_aligned);
        pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);

        ascii_simd_check_align!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned);
        ascii_simd_check_align!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned);
        ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_check_align!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
        latin1_simd_check_align!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
        // SIMD with different instructions for aligned and unaligned loads and stores.
        //
        // Newer microarchitectures are not supposed to have a performance difference between
        // aligned and unaligned SSE2 loads and stores when the address is actually aligned,
        // but the benchmark results I see don't agree.

        pub const SIMD_STRIDE_SIZE: usize = 16;

        pub const SIMD_ALIGNMENT: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const SIMD_ALIGNMENT_MASK: usize = 15;

        ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_both_aligned, store16_aligned);
        ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_src_aligned, store16_unaligned);

        ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_both_aligned, store8_aligned);
        ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_src_aligned, store8_unaligned);

        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);

        unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);

        pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
        pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);

        ascii_simd_check_align_unrolled!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_neither_aligned, ascii_to_ascii_simd_double_stride_both_aligned, ascii_to_ascii_simd_double_stride_src_aligned);
        ascii_simd_check_align_unrolled!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_neither_aligned, ascii_to_basic_latin_simd_double_stride_both_aligned, ascii_to_basic_latin_simd_double_stride_src_aligned);

        ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_check_align_unrolled!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
        latin1_simd_check_align_unrolled!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
    } else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] {
        // Aligned ALU word, little-endian, 64-bit

        pub const ALU_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const ALU_ALIGNMENT: usize = 8;

        pub const ALU_ALIGNMENT_MASK: usize = 7;

        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0x0000_0000_FF00_0000usize & word) << 24) |
                        ((0x0000_0000_00FF_0000usize & word) << 16) |
                        ((0x0000_0000_0000_FF00usize & word) << 8) |
                        (0x0000_0000_0000_00FFusize & word);
            let second = ((0xFF00_0000_0000_0000usize & word) >> 8) |
                         ((0x00FF_0000_0000_0000usize & word) >> 16) |
                         ((0x0000_FF00_0000_0000usize & word) >> 24) |
                         ((0x0000_00FF_0000_0000usize & word) >> 32);
            let third = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
                        ((0x0000_0000_00FF_0000usize & second_word) << 16) |
                        ((0x0000_0000_0000_FF00usize & second_word) << 8) |
                        (0x0000_0000_0000_00FFusize & second_word);
            let fourth = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
                         ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
                         ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
                         ((0x0000_00FF_0000_0000usize & second_word) >> 32);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }
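
        // Worked example for the little-endian 64-bit case above: for
        // word = 0x6867_6665_6463_6261 (the bytes of b"abcdefgh"),
        // `unpack_alu` writes first = 0x0064_0063_0062_0061 and
        // second = 0x0068_0067_0066_0065, i.e. "abcd" and "efgh" as
        // little-endian u16 lanes.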

        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000_0000_0000usize & second) << 8) |
                       ((0x0000_00FF_0000_0000usize & second) << 16) |
                       ((0x0000_0000_00FF_0000usize & second) << 24) |
                       ((0x0000_0000_0000_00FFusize & second) << 32) |
                       ((0x00FF_0000_0000_0000usize & first) >> 24) |
                       ((0x0000_00FF_0000_0000usize & first) >> 16) |
                       ((0x0000_0000_00FF_0000usize & first) >> 8) |
                       (0x0000_0000_0000_00FFusize & first);
            let second_word = ((0x00FF_0000_0000_0000usize & fourth) << 8) |
                              ((0x0000_00FF_0000_0000usize & fourth) << 16) |
                              ((0x0000_0000_00FF_0000usize & fourth) << 24) |
                              ((0x0000_0000_0000_00FFusize & fourth) << 32) |
                              ((0x00FF_0000_0000_0000usize & third) >> 24) |
                              ((0x0000_00FF_0000_0000usize & third) >> 16) |
                              ((0x0000_0000_00FF_0000usize & third) >> 8) |
                              (0x0000_0000_0000_00FFusize & third);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
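
        // `pack_alu` is the inverse of `unpack_alu` for input known to be
        // Basic Latin (high byte of every u16 lane zero): the masks discard
        // the high byte of each of the 16 u16 lanes and the 16 remaining
        // bytes are written out as two words.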
    } else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] {
        // Aligned ALU word, little-endian, 32-bit

        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const MAX_STRIDE_SIZE: usize = 8;

        pub const ALU_ALIGNMENT: usize = 4;

        pub const ALU_ALIGNMENT_MASK: usize = 3;

        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0x0000_FF00usize & word) << 8) |
                        (0x0000_00FFusize & word);
            let second = ((0xFF00_0000usize & word) >> 8) |
                         ((0x00FF_0000usize & word) >> 16);
            let third = ((0x0000_FF00usize & second_word) << 8) |
                        (0x0000_00FFusize & second_word);
            let fourth = ((0xFF00_0000usize & second_word) >> 8) |
                         ((0x00FF_0000usize & second_word) >> 16);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000usize & second) << 8) |
                       ((0x0000_00FFusize & second) << 16) |
                       ((0x00FF_0000usize & first) >> 8) |
                       (0x0000_00FFusize & first);
            let second_word = ((0x00FF_0000usize & fourth) << 8) |
                              ((0x0000_00FFusize & fourth) << 16) |
                              ((0x00FF_0000usize & third) >> 8) |
                              (0x0000_00FFusize & third);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] {
        // Aligned ALU word, big-endian, 64-bit

        pub const ALU_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const ALU_ALIGNMENT: usize = 8;

        pub const ALU_ALIGNMENT_MASK: usize = 7;

        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0xFF00_0000_0000_0000usize & word) >> 8) |
                        ((0x00FF_0000_0000_0000usize & word) >> 16) |
                        ((0x0000_FF00_0000_0000usize & word) >> 24) |
                        ((0x0000_00FF_0000_0000usize & word) >> 32);
            let second = ((0x0000_0000_FF00_0000usize & word) << 24) |
                         ((0x0000_0000_00FF_0000usize & word) << 16) |
                         ((0x0000_0000_0000_FF00usize & word) << 8) |
                         (0x0000_0000_0000_00FFusize & word);
            let third = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
                        ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
                        ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
                        ((0x0000_00FF_0000_0000usize & second_word) >> 32);
            let fourth = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
                         ((0x0000_0000_00FF_0000usize & second_word) << 16) |
                         ((0x0000_0000_0000_FF00usize & second_word) << 8) |
                         (0x0000_0000_0000_00FFusize & second_word);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000_0000_0000usize & first) << 8) |
                       ((0x0000_00FF_0000_0000usize & first) << 16) |
                       ((0x0000_0000_00FF_0000usize & first) << 24) |
                       ((0x0000_0000_0000_00FFusize & first) << 32) |
                       ((0x00FF_0000_0000_0000usize & second) >> 24) |
                       ((0x0000_00FF_0000_0000usize & second) >> 16) |
                       ((0x0000_0000_00FF_0000usize & second) >> 8) |
                       (0x0000_0000_0000_00FFusize & second);
            let second_word = ((0x00FF_0000_0000_0000usize & third) << 8) |
                              ((0x0000_00FF_0000_0000usize & third) << 16) |
                              ((0x0000_0000_00FF_0000usize & third) << 24) |
                              ((0x0000_0000_0000_00FFusize & third) << 32) |
                              ((0x00FF_0000_0000_0000usize & fourth) >> 24) |
                              ((0x0000_00FF_0000_0000usize & fourth) >> 16) |
                              ((0x0000_0000_00FF_0000usize & fourth) >> 8) |
                              (0x0000_0000_0000_00FFusize & fourth);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] {
        // Aligned ALU word, big-endian, 32-bit

        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const MAX_STRIDE_SIZE: usize = 8;

        pub const ALU_ALIGNMENT: usize = 4;

        pub const ALU_ALIGNMENT_MASK: usize = 3;

        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0xFF00_0000usize & word) >> 8) |
                        ((0x00FF_0000usize & word) >> 16);
            let second = ((0x0000_FF00usize & word) << 8) |
                         (0x0000_00FFusize & word);
            let third = ((0xFF00_0000usize & second_word) >> 8) |
                        ((0x00FF_0000usize & second_word) >> 16);
            let fourth = ((0x0000_FF00usize & second_word) << 8) |
                         (0x0000_00FFusize & second_word);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000usize & first) << 8) |
                       ((0x0000_00FFusize & first) << 16) |
                       ((0x00FF_0000usize & second) >> 8) |
                       (0x0000_00FFusize & second);
            let second_word = ((0x00FF_0000usize & third) << 8) |
                              ((0x0000_00FFusize & third) << 16) |
                              ((0x00FF_0000usize & fourth) >> 8) |
                              (0x0000_00FFusize & fourth);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else {
        ascii_naive!(ascii_to_ascii, u8, u8);
        ascii_naive!(ascii_to_basic_latin, u8, u16);
        ascii_naive!(basic_latin_to_ascii, u16, u8);
    }
}

cfg_if! {
    if #[cfg(target_endian = "little")] {
        #[allow(dead_code)]
        #[inline(always)]
        fn count_zeros(word: usize) -> u32 {
            word.trailing_zeros()
        }
    } else {
        #[allow(dead_code)]
        #[inline(always)]
        fn count_zeros(word: usize) -> u32 {
            word.leading_zeros()
        }
    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "disabled"))] {
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    let simd = unsafe { load16_unaligned(src.add(offset)) };
                    if !simd_is_ascii(simd) {
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                let code_unit = slice[offset];
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }
    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // First, process one unaligned vector
                let simd = unsafe { load16_unaligned(src) };
                let mask = mask_ascii(simd);
                if mask != 0 {
                    offset = mask.trailing_zeros() as usize;
                    let non_ascii = unsafe { *src.add(offset) };
                    return Some((non_ascii, offset));
                }
                offset = SIMD_STRIDE_SIZE;

                // We have now seen 16 ASCII bytes. Let's guess that
                // there will be enough more to justify more expense
                // in the case of non-ASCII.
                // Use aligned reads for the sake of old microarchitectures.
                let until_alignment = unsafe { (SIMD_ALIGNMENT - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK };
                // This addition won't overflow, because even in the 32-bit PAE case the
                // address space holds enough code that the slice length can't be that
                // close to address space size.
                // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
                if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
                    if until_alignment != 0 {
                        let simd = unsafe { load16_unaligned(src.add(offset)) };
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += until_alignment;
                    }
                    let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
                    loop {
                        let first = unsafe { load16_aligned(src.add(offset)) };
                        let second = unsafe { load16_aligned(src.add(offset + SIMD_STRIDE_SIZE)) };
                        if !simd_is_ascii(first | second) {
                            let mask_first = mask_ascii(first);
                            if mask_first != 0 {
                                offset += mask_first.trailing_zeros() as usize;
                            } else {
                                let mask_second = mask_ascii(second);
                                offset += SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize;
                            }
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE * 2;
                        if offset > len_minus_stride_times_two {
                            break;
                        }
                    }
                    if offset + SIMD_STRIDE_SIZE <= len {
                        let simd = unsafe { load16_aligned(src.add(offset)) };
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE;
                    }
                } else {
                    // At most two iterations, so unroll
                    if offset + SIMD_STRIDE_SIZE <= len {
                        let simd = unsafe { load16_unaligned(src.add(offset)) };
                        let mask = mask_ascii(simd);
                        if mask != 0 {
                            offset += mask.trailing_zeros() as usize;
                            let non_ascii = unsafe { *src.add(offset) };
                            return Some((non_ascii, offset));
                        }
                        offset += SIMD_STRIDE_SIZE;
                        if offset + SIMD_STRIDE_SIZE <= len {
                            let simd = unsafe { load16_unaligned(src.add(offset)) };
                            let mask = mask_ascii(simd);
                            if mask != 0 {
                                offset += mask.trailing_zeros() as usize;
                                let non_ascii = unsafe { *src.add(offset) };
                                return Some((non_ascii, offset));
                            }
                            offset += SIMD_STRIDE_SIZE;
                        }
                    }
                }
            }
            while offset < len {
                let code_unit = unsafe { *(src.add(offset)) };
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }
    } else {
        #[inline(always)]
        fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
            let word_masked = word & ASCII_MASK;
            let second_masked = second_word & ASCII_MASK;
            if (word_masked | second_masked) == 0 {
                return None;
            }
            if word_masked != 0 {
                let zeros = count_zeros(word_masked);
                // `zeros` is 8 times the number of ASCII bytes that precede
                // the non-ASCII byte in text order, plus (in the little-endian
                // case) 7 for the seven bits below the mask bit. Either way,
                // dividing by 8 yields the number of preceding ASCII bytes.
                let num_ascii = (zeros >> 3) as usize;
                return Some(num_ascii);
            }
            let zeros = count_zeros(second_masked);
            // As above: dividing by 8 yields the number of ASCII bytes that
            // precede the non-ASCII byte in text order.
            let num_ascii = (zeros >> 3) as usize;
            Some(ALU_ALIGNMENT + num_ascii)
        }
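
        // Worked example (little-endian): for a word holding b"ab\x80d..."
        // the masked word has only bit 23 set (the high bit of the byte at
        // index 2), so `count_zeros` returns 23 and 23 >> 3 == 2, the index
        // of the first non-ASCII byte.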

        #[inline(always)]
        unsafe fn validate_ascii_stride(src: *const usize) -> Option<usize> {
            let word = *src;
            let second_word = *(src.add(1));
            find_non_ascii(word, second_word)
        }

        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            let mut until_alignment = (ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK;
            if until_alignment + ALU_STRIDE_SIZE <= len {
                while until_alignment != 0 {
                    let code_unit = slice[offset];
                    if code_unit > 127 {
                        return Some((code_unit, offset));
                    }
                    offset += 1;
                    until_alignment -= 1;
                }
                let len_minus_stride = len - ALU_STRIDE_SIZE;
                loop {
                    let ptr = unsafe { src.add(offset) as *const usize };
                    if let Some(num_ascii) = unsafe { validate_ascii_stride(ptr) } {
                        offset += num_ascii;
                        return Some((unsafe { *(src.add(offset)) }, offset));
                    }
                    offset += ALU_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            while offset < len {
                let code_unit = slice[offset];
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                offset += 1;
            }
            None
        }
    }
}

cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] {

    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
        // Even with NEON enabled, we use the ALU path for ASCII validation, because testing
        // on Exynos 5 indicated that using NEON isn't worthwhile where there are only
        // vector reads without vector writes.

        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const ALU_ALIGNMENT: usize = 4;

        pub const ALU_ALIGNMENT_MASK: usize = 3;
    } else {
        #[inline(always)]
        unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
            let word = *src;
            let second_word = *(src.add(1));
            unpack_alu(word, second_word, dst);
        }

        #[inline(always)]
        unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
            let first = *src;
            let second = *(src.add(1));
            let third = *(src.add(2));
            let fourth = *(src.add(3));
            pack_alu(first, second, third, fourth, dst);
        }

        #[inline(always)]
        unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool {
            let word = *src;
            let second_word = *(src.add(1));
            // Check if the words contain non-ASCII
            if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
                return false;
            }
            unpack_alu(word, second_word, dst);
            true
        }

        #[inline(always)]
        unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool {
            let first = *src;
            let second = *(src.add(1));
            let third = *(src.add(2));
            let fourth = *(src.add(3));
            if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
                return false;
            }
            pack_alu(first, second, third, fourth, dst);
            true
        }

        #[inline(always)]
        unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
            let word = *src;
            let second_word = *(src.add(1));
            *dst = word;
            *(dst.add(1)) = second_word;
            find_non_ascii(word, second_word)
        }

        basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu);
        basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_alu);
        latin1_alu!(unpack_latin1, u8, u16, unpack_latin1_stride_alu);
        latin1_alu!(pack_latin1, u16, u8, pack_latin1_stride_alu);
        ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);
    }
}

pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
    match validate_ascii(bytes) {
        None => bytes.len(),
        Some((_, num_valid)) => num_valid,
    }
}

pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    for (i, b_ref) in bytes.iter().enumerate() {
        let b = *b_ref;
        // Bytes at or above 0x80 are non-ASCII; ESC (0x1B), SO (0x0E) and
        // SI (0x0F) are significant to ISO-2022-JP escape/shift handling,
        // so they terminate the run, too.
        if b >= 0x80 || b == 0x1B || b == 0x0E || b == 0x0F {
            return i;
        }
    }
    bytes.len()
}

// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::*;
    use alloc::vec::Vec;

    macro_rules! test_ascii {
        ($test_name:ident, $fn_tested:ident, $src_unit:ty, $dst_unit:ty) => {
            #[test]
            fn $test_name() {
                let mut src: Vec<$src_unit> = Vec::with_capacity(32);
                let mut dst: Vec<$dst_unit> = Vec::with_capacity(32);
                for i in 0..32 {
                    src.clear();
                    dst.clear();
                    dst.resize(32, 0);
                    for j in 0..32 {
                        let c = if i == j { 0xAA } else { j + 0x40 };
                        src.push(c as $src_unit);
                    }
                    match unsafe { $fn_tested(src.as_ptr(), dst.as_mut_ptr(), 32) } {
                        None => unreachable!("Should always find non-ASCII"),
                        Some((non_ascii, num_ascii)) => {
                            assert_eq!(non_ascii, 0xAA);
                            assert_eq!(num_ascii, i);
                            for j in 0..i {
                                assert_eq!(dst[j], (j + 0x40) as $dst_unit);
                            }
                        }
                    }
                }
            }
        };
    }

    test_ascii!(test_ascii_to_ascii, ascii_to_ascii, u8, u8);
    test_ascii!(test_ascii_to_basic_latin, ascii_to_basic_latin, u8, u16);
    test_ascii!(test_basic_latin_to_ascii, basic_latin_to_ascii, u16, u8);
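
    // Illustrative extra tests, not part of the original suite: they pin
    // down the behavior of the public validation helpers as implemented
    // above.
    #[test]
    fn test_ascii_valid_up_to() {
        // All-ASCII input: the whole slice is valid.
        let ascii = b"Hello, world!";
        assert_eq!(ascii_valid_up_to(ascii), ascii.len());
        // A non-ASCII byte stops validation at its index.
        let mixed = b"abc\x80def";
        assert_eq!(ascii_valid_up_to(mixed), 3);
    }

    #[test]
    fn test_iso_2022_jp_ascii_valid_up_to() {
        // ESC (0x1B) terminates the usable ASCII run for ISO-2022-JP.
        let with_esc = b"abc\x1Bdef";
        assert_eq!(iso_2022_jp_ascii_valid_up_to(with_esc), 3);
        let plain = b"plain text";
        assert_eq!(iso_2022_jp_ascii_valid_up_to(plain), plain.len());
    }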
}