use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
use arrayref::{array_mut_ref, array_ref};

cfg_if::cfg_if! {
    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if::cfg_if! {
            if #[cfg(blake3_avx512_ffi)] {
                pub const MAX_SIMD_DEGREE: usize = 16;
            } else {
                pub const MAX_SIMD_DEGREE: usize = 8;
            }
        }
    } else if #[cfg(blake3_neon)] {
        pub const MAX_SIMD_DEGREE: usize = 4;
    } else {
        pub const MAX_SIMD_DEGREE: usize = 1;
    }
}

// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
// allowed to use cmp::max, so we have to hardcode this additional constant
// value. Get rid of this once cmp::max is a const fn.
cfg_if::cfg_if! {
    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if::cfg_if! {
            if #[cfg(blake3_avx512_ffi)] {
                pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
            } else {
                pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
            }
        }
    } else if #[cfg(blake3_neon)] {
        pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
    } else {
        pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
    }
}
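
// A minimal sketch of what the block above could collapse to once a const
// max is usable here (not compiled in this module; MAX_SIMD_DEGREE is the
// constant defined above):
//
//     pub const MAX_SIMD_DEGREE_OR_2: usize =
//         if MAX_SIMD_DEGREE > 2 { MAX_SIMD_DEGREE } else { 2 };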

#[derive(Clone, Copy, Debug)]
pub enum Platform {
    Portable,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE2,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE41,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX2,
    #[cfg(blake3_avx512_ffi)]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX512,
    #[cfg(blake3_neon)]
    NEON,
}

impl Platform {
    #[allow(unreachable_code)]
    pub fn detect() -> Self {
        #[cfg(miri)]
        {
            return Platform::Portable;
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            #[cfg(blake3_avx512_ffi)]
            {
                if avx512_detected() {
                    return Platform::AVX512;
                }
            }
            if avx2_detected() {
                return Platform::AVX2;
            }
            if sse41_detected() {
                return Platform::SSE41;
            }
            if sse2_detected() {
                return Platform::SSE2;
            }
        }
        // We don't use dynamic feature detection for NEON. If the "neon"
        // feature is on, NEON is assumed to be supported.
        #[cfg(blake3_neon)]
        {
            return Platform::NEON;
        }
        Platform::Portable
    }
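
    // A caller-side sketch (hypothetical caller code, not part of this
    // module): detection is cheap but not free, so callers typically run it
    // once and reuse the result for every compression call.
    //
    //     let platform = Platform::detect();
    //     debug_assert!(platform.simd_degree() <= MAX_SIMD_DEGREE);
    //     // ... pass `platform` to compress_* and hash_many() as needed ...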

    pub fn simd_degree(&self) -> usize {
        // Each degree is the number of 32-bit words per SIMD vector: 128-bit
        // SSE2/SSE4.1/NEON registers hold 4, 256-bit AVX2 holds 8, and
        // 512-bit AVX-512 holds 16.
        let degree = match self {
            Platform::Portable => 1,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE2 => 4,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 => 4,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => 8,
            #[cfg(blake3_avx512_ffi)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => 16,
            #[cfg(blake3_neon)]
            Platform::NEON => 4,
        };
        debug_assert!(degree <= MAX_SIMD_DEGREE);
        degree
    }

    pub fn compress_in_place(
        &self,
        cv: &mut CVWords,
        block: &[u8; BLOCK_LEN],
        block_len: u8,
        counter: u64,
        flags: u8,
    ) {
        match self {
            Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE2 => unsafe {
                crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 | Platform::AVX2 => unsafe {
                crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
            #[cfg(blake3_avx512_ffi)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
                crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
            },
            // No NEON compress_in_place() implementation yet.
            #[cfg(blake3_neon)]
            Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
        }
    }

    pub fn compress_xof(
        &self,
        cv: &CVWords,
        block: &[u8; BLOCK_LEN],
        block_len: u8,
        counter: u64,
        flags: u8,
    ) -> [u8; 64] {
        match self {
            Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE2 => unsafe {
                crate::sse2::compress_xof(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 | Platform::AVX2 => unsafe {
                crate::sse41::compress_xof(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
            #[cfg(blake3_avx512_ffi)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
                crate::avx512::compress_xof(cv, block, block_len, counter, flags)
            },
            // No NEON compress_xof() implementation yet.
            #[cfg(blake3_neon)]
            Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
        }
    }
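
    // Relationship between the two compress entry points above (a sketch,
    // not compiled here): the first 32 bytes of compress_xof()'s 64-byte
    // output are the updated chaining value that compress_in_place() leaves
    // in `cv`, serialized little-endian.
    //
    //     let mut cv2 = *cv;
    //     platform.compress_in_place(&mut cv2, block, block_len, counter, flags);
    //     let xof = platform.compress_xof(cv, block, block_len, counter, flags);
    //     assert_eq!(&le_bytes_from_words_32(&cv2)[..], &xof[..32]);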

    // IMPLEMENTATION NOTE
    // ===================
    // hash_many() applies two optimizations. The critically important
    // optimization is the high-performance parallel SIMD hashing mode,
    // described in detail in the spec. This more than doubles throughput per
    // thread. Another optimization is keeping the state vectors transposed
    // from block to block within a chunk. When state vectors are transposed
    // after every block, there's a small but measurable performance loss.
    // Compressing chunks with a dedicated loop avoids this.

    pub fn hash_many<const N: usize>(
        &self,
        inputs: &[&[u8; N]],
        key: &CVWords,
        counter: u64,
        increment_counter: IncrementCounter,
        flags: u8,
        flags_start: u8,
        flags_end: u8,
        out: &mut [u8],
    ) {
        match self {
            Platform::Portable => portable::hash_many(
                inputs,
                key,
                counter,
                increment_counter,
                flags,
                flags_start,
                flags_end,
                out,
            ),
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE2 => unsafe {
                crate::sse2::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 => unsafe {
                crate::sse41::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => unsafe {
                crate::avx2::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
            // Safe because detect() checked for platform support.
            #[cfg(blake3_avx512_ffi)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
                crate::avx512::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
            // Assumed to be safe if the "neon" feature is on.
            #[cfg(blake3_neon)]
            Platform::NEON => unsafe {
                crate::neon::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
        }
    }
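
    // A caller-side sketch of hash_many() (hypothetical caller code;
    // CHUNK_LEN, CHUNK_START, and CHUNK_END are constants defined elsewhere
    // in this crate): each input produces one 32-byte chaining value, so
    // `out` must hold inputs.len() * 32 bytes, and IncrementCounter::Yes
    // bumps the counter once per input, as chunk hashing requires.
    //
    //     let inputs: &[&[u8; CHUNK_LEN]] = &[&chunk0, &chunk1, &chunk2, &chunk3];
    //     let mut out = [0u8; 4 * 32];
    //     platform.hash_many(
    //         inputs,
    //         key,
    //         chunk_counter,
    //         IncrementCounter::Yes,
    //         flags,
    //         CHUNK_START,
    //         CHUNK_END,
    //         &mut out,
    //     );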

    pub fn xof_many(
        &self,
        cv: &CVWords,
        block: &[u8; BLOCK_LEN],
        block_len: u8,
        mut counter: u64,
        flags: u8,
        out: &mut [u8],
    ) {
        debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only");
        if out.is_empty() {
            // The current assembly implementation always outputs at least 1 block.
            return;
        }
        match self {
            // Safe because detect() checked for platform support.
            #[cfg(blake3_avx512_ffi)]
            #[cfg(unix)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
                crate::avx512::xof_many(cv, block, block_len, counter, flags, out)
            },
            _ => {
                // For platforms without an optimized xof_many, fall back to a loop over
                // compress_xof. This is still faster than portable code.
                for out_block in out.chunks_exact_mut(BLOCK_LEN) {
                    // TODO: Use array_chunks_mut here once that's stable.
                    let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap();
                    *out_array = self.compress_xof(cv, block, block_len, counter, flags);
                    counter += 1;
                }
            }
        }
    }
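
    // A caller-side sketch of xof_many() (hypothetical caller code): filling
    // a multi-block output buffer in one call is equivalent to calling
    // compress_xof() repeatedly with an incrementing counter, as the fallback
    // arm above does.
    //
    //     let mut out = [0u8; 4 * BLOCK_LEN];
    //     platform.xof_many(cv, block, block_len, counter, flags, &mut out);
    //     // out[0..64] matches compress_xof(.., counter, ..),
    //     // out[64..128] matches compress_xof(.., counter + 1, ..), and so on.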

    // Explicit platform constructors, for benchmarks.

    pub fn portable() -> Self {
        Self::Portable
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn sse2() -> Option<Self> {
        if sse2_detected() {
            Some(Self::SSE2)
        } else {
            None
        }
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn sse41() -> Option<Self> {
        if sse41_detected() {
            Some(Self::SSE41)
        } else {
            None
        }
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn avx2() -> Option<Self> {
        if avx2_detected() {
            Some(Self::AVX2)
        } else {
            None
        }
    }

    #[cfg(blake3_avx512_ffi)]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn avx512() -> Option<Self> {
        if avx512_detected() {
            Some(Self::AVX512)
        } else {
            None
        }
    }

    #[cfg(blake3_neon)]
    pub fn neon() -> Option<Self> {
        // Assumed to be safe if the "neon" feature is on.
        Some(Self::NEON)
    }
}
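
// A benchmark-side sketch for the explicit constructors above (hypothetical
// caller code): each constructor returns None when the corresponding
// instruction set isn't available, so a benchmark can skip that code path.
//
//     if let Some(platform) = Platform::avx2() {
//         // ... benchmark the AVX2 implementation via `platform` ...
//     }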

// Note that AVX-512 is divided into multiple featuresets, and we use two of
// them, F and VL.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx512_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_avx512") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            return true;
        }
    }
    false
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx2_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_avx2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "avx2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("avx2") {
            return true;
        }
    }
    false
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_sse41") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse4.1")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse4.1") {
            return true;
        }
    }
    false
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse2_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_sse2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse2") {
            return true;
        }
    }
    false
}

#[inline(always)]
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
    let mut out: [u32; 8] = [0; 8];
    out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
    out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
    out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
    out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
    out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
    out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
    out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
    out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
    out
}

#[inline(always)]
pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
    let mut out: [u32; 16] = [0; 16];
    out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
    out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
    out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
    out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
    out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
    out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
    out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
    out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
    out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4));
    out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4));
    out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4));
    out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4));
    out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4));
    out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4));
    out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4));
    out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4));
    out
}

#[inline(always)]
pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
    let mut out: [u8; 32] = [0; 32];
    *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
    *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
    *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
    *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
    *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
    *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
    *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
    *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
    out
}
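
// A round-trip sanity sketch for the 32-byte conversions above (hypothetical
// test code, not part of this module): serializing words to little-endian
// bytes and parsing them back is the identity.
//
//     let words: [u32; 8] = [0x03020100; 8];
//     assert_eq!(words, words_from_le_bytes_32(&le_bytes_from_words_32(&words)));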

#[inline(always)]
pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
    let mut out: [u8; 64] = [0; 64];
    *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
    *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
    *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
    *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
    *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
    *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
    *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
    *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
    *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes();
    *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes();
    *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes();
    *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes();
    *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes();
    *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes();
    *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes();
    *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes();
    out
}