1 | /// SWAR: SIMD Within A Register |
2 | /// SIMD validator backend that validates register-sized chunks of data at a time. |
3 | use crate::{is_header_name_token, is_header_value_token, is_uri_token, Bytes}; |
4 | |
5 | // Adapt block-size to match native register size, i.e: 32bit => 4, 64bit => 8 |
6 | const BLOCK_SIZE: usize = core::mem::size_of::<usize>(); |
7 | type ByteBlock = [u8; BLOCK_SIZE]; |
8 | |
9 | #[inline ] |
10 | pub fn match_uri_vectored(bytes: &mut Bytes) { |
11 | loop { |
12 | if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { |
13 | let n = match_uri_char_8_swar(bytes8); |
14 | // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes |
15 | // in `bytes`, so calling `advance(n)` is safe. |
16 | unsafe { |
17 | bytes.advance(n); |
18 | } |
19 | if n == BLOCK_SIZE { |
20 | continue; |
21 | } |
22 | } |
23 | if let Some(b) = bytes.peek() { |
24 | if is_uri_token(b) { |
25 | // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte |
26 | // in bytes, so calling advance is safe. |
27 | unsafe { |
28 | bytes.advance(1); |
29 | } |
30 | continue; |
31 | } |
32 | } |
33 | break; |
34 | } |
35 | } |
36 | |
37 | #[inline ] |
38 | pub fn match_header_value_vectored(bytes: &mut Bytes) { |
39 | loop { |
40 | if let Some(bytes8) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { |
41 | let n = match_header_value_char_8_swar(bytes8); |
42 | // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes |
43 | // in `bytes`, so calling `advance(n)` is safe. |
44 | unsafe { |
45 | bytes.advance(n); |
46 | } |
47 | if n == BLOCK_SIZE { |
48 | continue; |
49 | } |
50 | } |
51 | if let Some(b) = bytes.peek() { |
52 | if is_header_value_token(b) { |
53 | // SAFETY: using peek to retrieve the byte ensures that there is at least 1 more byte |
54 | // in bytes, so calling advance is safe. |
55 | unsafe { |
56 | bytes.advance(1); |
57 | } |
58 | continue; |
59 | } |
60 | } |
61 | break; |
62 | } |
63 | } |
64 | |
65 | #[inline ] |
66 | pub fn match_header_name_vectored(bytes: &mut Bytes) { |
67 | while let Some(block: [u8; 8]) = bytes.peek_n::<ByteBlock>(BLOCK_SIZE) { |
68 | let n: usize = match_block(f:is_header_name_token, block); |
69 | // SAFETY: using peek_n to retrieve the bytes ensures that there are at least n more bytes |
70 | // in `bytes`, so calling `advance(n)` is safe. |
71 | unsafe { |
72 | bytes.advance(n); |
73 | } |
74 | if n != BLOCK_SIZE { |
75 | return; |
76 | } |
77 | } |
78 | // SAFETY: match_tail processes at most the remaining data in `bytes`. advances `bytes` to the |
79 | // end, but no further. |
80 | unsafe { bytes.advance(match_tail(f:is_header_name_token, bytes.as_ref())) }; |
81 | } |
82 | |
83 | // Matches "tail", i.e: when we have <BLOCK_SIZE bytes in the buffer, should be uncommon |
84 | #[cold ] |
85 | #[inline ] |
86 | fn match_tail(f: impl Fn(u8) -> bool, bytes: &[u8]) -> usize { |
87 | for (i: usize, &b: u8) in bytes.iter().enumerate() { |
88 | if !f(b) { |
89 | return i; |
90 | } |
91 | } |
92 | bytes.len() |
93 | } |
94 | |
95 | // Naive fallback block matcher |
96 | #[inline (always)] |
97 | fn match_block(f: impl Fn(u8) -> bool, block: ByteBlock) -> usize { |
98 | for (i: usize, &b: u8) in block.iter().enumerate() { |
99 | if !f(b) { |
100 | return i; |
101 | } |
102 | } |
103 | BLOCK_SIZE |
104 | } |
105 | |
106 | // A const alternative to u64::from_ne_bytes to avoid bumping MSRV (1.36 => 1.44) |
107 | // creates a u64 whose bytes are each equal to b |
108 | const fn uniform_block(b: u8) -> usize { |
109 | (b as u64 * 0x01_01_01_01_01_01_01_01 /* [1_u8; 8] */) as usize |
110 | } |
111 | |
112 | // A byte-wise range-check on an entire word/block, |
113 | // ensuring all bytes in the word satisfy `33 <= (x != 127) <= 255` |
114 | #[inline ] |
115 | fn match_uri_char_8_swar(block: ByteBlock) -> usize { |
116 | // 33 <= (x != 127) <= 255 |
117 | const M: u8 = 0x21; |
118 | // uniform block full of exclamation mark (!) (33). |
119 | const BM: usize = uniform_block(M); |
120 | // uniform block full of 1. |
121 | const ONE: usize = uniform_block(0x01); |
122 | // uniform block full of DEL (127). |
123 | const DEL: usize = uniform_block(0x7f); |
124 | // uniform block full of 128. |
125 | const M128: usize = uniform_block(128); |
126 | |
127 | let x: usize = usize::from_ne_bytes(block); // Really just a transmute |
128 | let lt: usize = x.wrapping_sub(BM) & !x; // <= m |
129 | |
130 | let xor_del: usize = x ^ DEL; |
131 | let eq_del: usize = xor_del.wrapping_sub(ONE) & !xor_del; // == DEL |
132 | |
133 | offsetnz((lt | eq_del) & M128) |
134 | } |
135 | |
136 | // A byte-wise range-check on an entire word/block, |
137 | // ensuring all bytes in the word satisfy `32 <= (x != 127) <= 255` |
138 | #[inline ] |
139 | fn match_header_value_char_8_swar(block: ByteBlock) -> usize { |
140 | // 32 <= (x != 127) <= 255 |
141 | const M: u8 = 0x20; |
142 | // uniform block full of exclamation mark (!) (33). |
143 | const BM: usize = uniform_block(M); |
144 | // uniform block full of 1. |
145 | const ONE: usize = uniform_block(0x01); |
146 | // uniform block full of DEL (127). |
147 | const DEL: usize = uniform_block(0x7f); |
148 | // uniform block full of 128. |
149 | const M128: usize = uniform_block(128); |
150 | |
151 | let x: usize = usize::from_ne_bytes(block); // Really just a transmute |
152 | let lt: usize = x.wrapping_sub(BM) & !x; // <= m |
153 | |
154 | let xor_del: usize = x ^ DEL; |
155 | let eq_del: usize = xor_del.wrapping_sub(ONE) & !xor_del; // == DEL |
156 | |
157 | offsetnz((lt | eq_del) & M128) |
158 | } |
159 | |
160 | /// Check block to find offset of first non-zero byte |
161 | // NOTE: Curiously `block.trailing_zeros() >> 3` appears to be slower, maybe revisit |
162 | #[inline ] |
163 | fn offsetnz(block: usize) -> usize { |
164 | // fast path optimistic case (common for long valid sequences) |
165 | if block == 0 { |
166 | return BLOCK_SIZE; |
167 | } |
168 | |
169 | // perf: rust will unroll this loop |
170 | for (i: usize, b: u8) in block.to_ne_bytes().iter().copied().enumerate() { |
171 | if b != 0 { |
172 | return i; |
173 | } |
174 | } |
175 | unreachable!() |
176 | } |
177 | |
178 | #[test ] |
179 | fn test_is_header_value_block() { |
180 | let is_header_value_block = |b| match_header_value_char_8_swar(b) == BLOCK_SIZE; |
181 | |
182 | // 0..32 => false |
183 | for b in 0..32_u8 { |
184 | assert!(!is_header_value_block([b; BLOCK_SIZE]), "b={}" , b); |
185 | } |
186 | // 32..=126 => true |
187 | for b in 32..=126_u8 { |
188 | assert!(is_header_value_block([b; BLOCK_SIZE]), "b={}" , b); |
189 | } |
190 | // 127 => false |
191 | assert!(!is_header_value_block([b' \x7F' ; BLOCK_SIZE]), "b={}" , b' \x7F' ); |
192 | // 128..=255 => true |
193 | for b in 128..=255_u8 { |
194 | assert!(is_header_value_block([b; BLOCK_SIZE]), "b={}" , b); |
195 | } |
196 | |
197 | |
198 | #[cfg (target_pointer_width = "64" )] |
199 | { |
200 | // A few sanity checks on non-uniform bytes for safe-measure |
201 | assert!(!is_header_value_block(*b"foo.com \n" )); |
202 | assert!(!is_header_value_block(*b"o.com \r\nU" )); |
203 | } |
204 | } |
205 | |
206 | #[test ] |
207 | fn test_is_uri_block() { |
208 | let is_uri_block = |b| match_uri_char_8_swar(b) == BLOCK_SIZE; |
209 | |
210 | // 0..33 => false |
211 | for b in 0..33_u8 { |
212 | assert!(!is_uri_block([b; BLOCK_SIZE]), "b={}" , b); |
213 | } |
214 | // 33..=126 => true |
215 | for b in 33..=126_u8 { |
216 | assert!(is_uri_block([b; BLOCK_SIZE]), "b={}" , b); |
217 | } |
218 | // 127 => false |
219 | assert!(!is_uri_block([b' \x7F' ; BLOCK_SIZE]), "b={}" , b' \x7F' ); |
220 | // 128..=255 => true |
221 | for b in 128..=255_u8 { |
222 | assert!(is_uri_block([b; BLOCK_SIZE]), "b={}" , b); |
223 | } |
224 | } |
225 | |
226 | #[test ] |
227 | fn test_offsetnz() { |
228 | let seq = [0_u8; BLOCK_SIZE]; |
229 | for i in 0..BLOCK_SIZE { |
230 | let mut seq = seq; |
231 | seq[i] = 1; |
232 | let x = usize::from_ne_bytes(seq); |
233 | assert_eq!(offsetnz(x), i); |
234 | } |
235 | } |
236 | |