#[cfg(not(feature = "runtime-dispatch-simd"))]
use core::{mem, ptr, usize};
#[cfg(feature = "runtime-dispatch-simd")]
use std::{mem, ptr, usize};

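/// Copies `byte` into every byte lane of a `usize`; e.g. `splat(0x2C)` on a
/// 64-bit target yields `0x2C2C_2C2C_2C2C_2C2C`. `usize::MAX / 0xFF` is the
/// constant with `0x01` in every byte lane.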
fn splat(byte: u8) -> usize {
    let lo: usize = usize::MAX / 0xFF;
    lo * byte as usize
}

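/// Loads a `usize` from `bytes` at `offset` via an unaligned read.
///
/// # Safety
///
/// The caller must guarantee `offset + mem::size_of::<usize>() <= bytes.len()`.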
unsafe fn usize_load_unchecked(bytes: &[u8], offset: usize) -> usize {
    let mut output: usize = 0;
    ptr::copy_nonoverlapping(
        bytes.as_ptr().add(offset),
        &mut output as *mut usize as *mut u8,
        mem::size_of::<usize>(),
    );
    output
}

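/// SWAR byte-wise equality: returns a `usize` with `0x01` in every byte lane
/// where `lhs` and `rhs` agree and `0x00` elsewhere. Adding `!hi` (`0x7F` per
/// lane) to `x & !hi` carries into bit 7 exactly when the low seven bits of a
/// lane are nonzero; OR-ing `x` back in catches a set high bit, so bit 7 ends
/// up set iff the lanes differ, and the final shift-negate-mask inverts that
/// into a per-lane 0/1 flag.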
fn bytewise_equal(lhs: usize, rhs: usize) -> usize {
    let lo: usize = usize::MAX / 0xFF;
    let hi: usize = lo << 7;

    let x: usize = lhs ^ rhs;
    !((((x & !hi) + !hi) | x) >> 7) & lo
}

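/// Horizontal sum of the eight (or, on 32-bit, four) byte lanes of `values`.
/// Adjacent lanes are first paired into 16-bit lanes so no intermediate sum
/// can overflow a lane; multiplying by `0x0001_0001...` then accumulates all
/// 16-bit lanes into the top two bytes, which the final shift extracts.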
fn sum_usize(values: usize) -> usize {
    let every_other_byte_lo: usize = usize::MAX / 0xFFFF;
    let every_other_byte: usize = every_other_byte_lo * 0xFF;

    // Pairwise reduction to avoid overflow on next step.
    let pair_sum: usize = (values & every_other_byte) + ((values >> 8) & every_other_byte);

    // Multiplication results in top two bytes holding sum.
    pair_sum.wrapping_mul(every_other_byte_lo) >> ((mem::size_of::<usize>() - 2) * 8)
}

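/// Returns `0x01` in every byte lane that holds a leading UTF-8 byte, i.e.
/// any byte that is not a `0b10xxxxxx` continuation byte. In valid UTF-8,
/// counting leading bytes counts the characters.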
fn is_leading_utf8_byte(values: usize) -> usize {
    // A leading UTF-8 byte is one which does not start with the bits 10.
    ((!values >> 7) | (values >> 6)) & splat(1)
}

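/// Counts occurrences of `needle` in `haystack` one `usize` word at a time.
/// Requires `haystack.len() >= mem::size_of::<usize>()` so the masked tail
/// load below never reads out of bounds.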
pub fn chunk_count(haystack: &[u8], needle: u8) -> usize {
    let chunksize = mem::size_of::<usize>();
    assert!(haystack.len() >= chunksize);

    unsafe {
        let mut offset = 0;
        let mut count = 0;

        let needles = splat(needle);

        // Process blocks of 255 words (255 * 8 = 2040 bytes on 64-bit): each
        // byte lane of `counts` holds at most 255, so it cannot overflow.
        while haystack.len() >= offset + chunksize * 255 {
            let mut counts = 0;
            for _ in 0..255 {
                counts += bytewise_equal(usize_load_unchecked(haystack, offset), needles);
                offset += chunksize;
            }
            count += sum_usize(counts);
        }

        // Process the remaining whole words.
        let mut counts = 0;
        for i in 0..(haystack.len() - offset) / chunksize {
            counts += bytewise_equal(
                usize_load_unchecked(haystack, offset + i * chunksize),
                needles,
            );
        }
        // Handle the tail with one final (overlapping) word load, masking off
        // the bytes that were already counted.
        if haystack.len() % chunksize != 0 {
            let mask = usize::from_le(!(!0 >> ((haystack.len() % chunksize) * 8)));
            counts += bytewise_equal(
                usize_load_unchecked(haystack, haystack.len() - chunksize),
                needles,
            ) & mask;
        }
        count += sum_usize(counts);

        count
    }
}

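/// Counts UTF-8 leading bytes (i.e. `char`s, assuming valid UTF-8) in
/// `utf8_chars` one `usize` word at a time, with the same block structure
/// as `chunk_count` above.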
pub fn chunk_num_chars(utf8_chars: &[u8]) -> usize {
    let chunksize = mem::size_of::<usize>();
    assert!(utf8_chars.len() >= chunksize);

    unsafe {
        let mut offset = 0;
        let mut count = 0;

        // Process blocks of 255 words so the per-byte counters cannot overflow.
        while utf8_chars.len() >= offset + chunksize * 255 {
            let mut counts = 0;
            for _ in 0..255 {
                counts += is_leading_utf8_byte(usize_load_unchecked(utf8_chars, offset));
                offset += chunksize;
            }
            count += sum_usize(counts);
        }

        // Process the remaining whole words.
        let mut counts = 0;
        for i in 0..(utf8_chars.len() - offset) / chunksize {
            counts +=
                is_leading_utf8_byte(usize_load_unchecked(utf8_chars, offset + i * chunksize));
        }
        // Handle the tail with one final (overlapping) word load, masking off
        // the bytes that were already counted.
        if utf8_chars.len() % chunksize != 0 {
            let mask = usize::from_le(!(!0 >> ((utf8_chars.len() % chunksize) * 8)));
            counts += is_leading_utf8_byte(usize_load_unchecked(
                utf8_chars,
                utf8_chars.len() - chunksize,
            )) & mask;
        }
        count += sum_usize(counts);

        count
    }
}

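// A minimal sanity-check sketch, not part of the original source: it compares
// the word-at-a-time entry points above against naive reference counts. The
// test names and inputs are ours; inputs are sized so that both the 255-word
// block loop and the masked tail path are exercised.
#[cfg(test)]
mod tests {
    use super::{chunk_count, chunk_num_chars};

    // Naive reference implementation to compare against.
    fn naive_count(haystack: &[u8], needle: u8) -> usize {
        haystack.iter().filter(|&&b| b == needle).count()
    }

    #[test]
    fn counts_match_naive() {
        // 3001 is a multiple of neither 8 nor 4, exercising the masked tail,
        // and exceeds 2040 bytes, exercising the block loop.
        let haystack: Vec<u8> = (0..3001).map(|i| (i % 7) as u8).collect();
        assert_eq!(chunk_count(&haystack, 3), naive_count(&haystack, 3));
        assert_eq!(chunk_count(&haystack, 9), 0);
    }

    #[test]
    fn num_chars_matches_chars_count() {
        // Mixed 1- to 4-byte characters; chars().count() is the reference.
        let s = "héllo wörld 🦀✓".repeat(100);
        assert_eq!(chunk_num_chars(s.as_bytes()), s.chars().count());
    }
}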