use crate::{portable, CVWords, IncrementCounter, BLOCK_LEN};
use arrayref::{array_mut_ref, array_ref};

cfg_if::cfg_if! {
    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if::cfg_if! {
            if #[cfg(blake3_avx512_ffi)] {
                pub const MAX_SIMD_DEGREE: usize = 16;
            } else {
                pub const MAX_SIMD_DEGREE: usize = 8;
            }
        }
    } else if #[cfg(blake3_neon)] {
        pub const MAX_SIMD_DEGREE: usize = 4;
    } else {
        pub const MAX_SIMD_DEGREE: usize = 1;
    }
}
19
// There are some places where we want a static size that's equal to the
// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently
// allowed to use cmp::max, so we have to hardcode this additional constant
// value. Get rid of this once cmp::max is a const fn.
cfg_if::cfg_if! {
    if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if::cfg_if! {
            if #[cfg(blake3_avx512_ffi)] {
                pub const MAX_SIMD_DEGREE_OR_2: usize = 16;
            } else {
                pub const MAX_SIMD_DEGREE_OR_2: usize = 8;
            }
        }
    } else if #[cfg(blake3_neon)] {
        pub const MAX_SIMD_DEGREE_OR_2: usize = 4;
    } else {
        pub const MAX_SIMD_DEGREE_OR_2: usize = 2;
    }
}
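// A compile-time sanity check, added here as a hedged sketch rather than part
// of the original code: it asserts that MAX_SIMD_DEGREE_OR_2 really is
// max(MAX_SIMD_DEGREE, 2) for whatever target this builds on. The anonymous
// const evaluates the assertion at compile time, so a mismatch between the two
// cfg_if blocks above would fail the build.
const _: () = assert!(
    MAX_SIMD_DEGREE_OR_2 == if MAX_SIMD_DEGREE < 2 { 2 } else { MAX_SIMD_DEGREE }
);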

#[derive(Clone, Copy, Debug)]
pub enum Platform {
    Portable,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE2,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    SSE41,
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX2,
    #[cfg(blake3_avx512_ffi)]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    AVX512,
    #[cfg(blake3_neon)]
    NEON,
}

impl Platform {
    #[allow(unreachable_code)]
    pub fn detect() -> Self {
        #[cfg(miri)]
        {
            return Platform::Portable;
        }

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        {
            #[cfg(blake3_avx512_ffi)]
            {
                if avx512_detected() {
                    return Platform::AVX512;
                }
            }
            if avx2_detected() {
                return Platform::AVX2;
            }
            if sse41_detected() {
                return Platform::SSE41;
            }
            if sse2_detected() {
                return Platform::SSE2;
            }
        }
        // We don't use dynamic feature detection for NEON. If the "neon"
        // feature is on, NEON is assumed to be supported.
        #[cfg(blake3_neon)]
        {
            return Platform::NEON;
        }
        Platform::Portable
    }

    pub fn simd_degree(&self) -> usize {
        let degree = match self {
            Platform::Portable => 1,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE2 => 4,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 => 4,
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => 8,
            #[cfg(blake3_avx512_ffi)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => 16,
            #[cfg(blake3_neon)]
            Platform::NEON => 4,
        };
        debug_assert!(degree <= MAX_SIMD_DEGREE);
        degree
    }
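    // A hedged usage sketch, not part of the original code: callers typically
    // detect the platform once and then size their buffers by its SIMD degree,
    // which is the number of inputs hash_many() can process in parallel. This
    // is illustration only; the real call sites live in the hashing code.
    #[cfg(test)]
    #[allow(dead_code)]
    fn detection_sketch() {
        let platform = Platform::detect();
        let degree = platform.simd_degree();
        assert!(degree >= 1 && degree <= MAX_SIMD_DEGREE);
    }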

    pub fn compress_in_place(
        &self,
        cv: &mut CVWords,
        block: &[u8; BLOCK_LEN],
        block_len: u8,
        counter: u64,
        flags: u8,
    ) {
        match self {
            Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags),
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE2 => unsafe {
                crate::sse2::compress_in_place(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 | Platform::AVX2 => unsafe {
                crate::sse41::compress_in_place(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
            #[cfg(blake3_avx512_ffi)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
                crate::avx512::compress_in_place(cv, block, block_len, counter, flags)
            },
            // No NEON compress_in_place() implementation yet.
            #[cfg(blake3_neon)]
            Platform::NEON => portable::compress_in_place(cv, block, block_len, counter, flags),
        }
    }

    pub fn compress_xof(
        &self,
        cv: &CVWords,
        block: &[u8; BLOCK_LEN],
        block_len: u8,
        counter: u64,
        flags: u8,
    ) -> [u8; 64] {
        match self {
            Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags),
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE2 => unsafe {
                crate::sse2::compress_xof(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 | Platform::AVX2 => unsafe {
                crate::sse41::compress_xof(cv, block, block_len, counter, flags)
            },
            // Safe because detect() checked for platform support.
            #[cfg(blake3_avx512_ffi)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
                crate::avx512::compress_xof(cv, block, block_len, counter, flags)
            },
            // No NEON compress_xof() implementation yet.
            #[cfg(blake3_neon)]
            Platform::NEON => portable::compress_xof(cv, block, block_len, counter, flags),
        }
    }
    // IMPLEMENTATION NOTE
    // ===================
    // hash_many() applies two optimizations. The critically important
    // optimization is the high-performance parallel SIMD hashing mode,
    // described in detail in the spec. This more than doubles throughput per
    // thread. Another optimization is keeping the state vectors transposed
    // from block to block within a chunk. Re-transposing the state vectors
    // after every block costs a small but measurable amount of performance,
    // and compressing whole chunks with a dedicated loop avoids it. (A hedged
    // usage sketch follows the hash_many() definition below.)

    pub fn hash_many<const N: usize>(
        &self,
        inputs: &[&[u8; N]],
        key: &CVWords,
        counter: u64,
        increment_counter: IncrementCounter,
        flags: u8,
        flags_start: u8,
        flags_end: u8,
        out: &mut [u8],
    ) {
        match self {
            Platform::Portable => portable::hash_many(
                inputs,
                key,
                counter,
                increment_counter,
                flags,
                flags_start,
                flags_end,
                out,
            ),
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE2 => unsafe {
                crate::sse2::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::SSE41 => unsafe {
                crate::sse41::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
            // Safe because detect() checked for platform support.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX2 => unsafe {
                crate::avx2::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
            // Safe because detect() checked for platform support.
            #[cfg(blake3_avx512_ffi)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
                crate::avx512::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
            // Assumed to be safe if the "neon" feature is on.
            #[cfg(blake3_neon)]
            Platform::NEON => unsafe {
                crate::neon::hash_many(
                    inputs,
                    key,
                    counter,
                    increment_counter,
                    flags,
                    flags_start,
                    flags_end,
                    out,
                )
            },
        }
    }
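    // A hedged usage sketch, not part of the original code and referenced by
    // the implementation note above: hashing up to MAX_SIMD_DEGREE whole
    // chunks with a single hash_many() call. It assumes the crate-level items
    // IV, CHUNK_LEN, CHUNK_START, and CHUNK_END exist as they're used
    // elsewhere in this crate; the real call sites live in the main hashing
    // loop, not here.
    #[cfg(test)]
    #[allow(dead_code)]
    fn hash_many_chunks_sketch(
        &self,
        chunks: &[&[u8; crate::CHUNK_LEN]],
    ) -> [u8; MAX_SIMD_DEGREE * 32] {
        debug_assert!(chunks.len() <= MAX_SIMD_DEGREE);
        let mut out = [0u8; MAX_SIMD_DEGREE * 32];
        self.hash_many(
            chunks,
            crate::IV,             // unkeyed hashing starts from the IV words
            0,                     // chunk counter of the first chunk
            IncrementCounter::Yes, // each chunk gets the next counter value
            0,                     // no extra flags (e.g. no KEYED_HASH)
            crate::CHUNK_START,    // set on each chunk's first block
            crate::CHUNK_END,      // set on each chunk's last block
            &mut out[..chunks.len() * 32],
        );
        out // 32-byte chaining values, one per input chunk
    }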

    pub fn xof_many(
        &self,
        cv: &CVWords,
        block: &[u8; BLOCK_LEN],
        block_len: u8,
        mut counter: u64,
        flags: u8,
        out: &mut [u8],
    ) {
        debug_assert_eq!(0, out.len() % BLOCK_LEN, "whole blocks only");
        if out.is_empty() {
            // The current assembly implementation always outputs at least 1 block.
            return;
        }
        match self {
            // Safe because detect() checked for platform support.
            #[cfg(blake3_avx512_ffi)]
            #[cfg(unix)]
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Platform::AVX512 => unsafe {
                crate::avx512::xof_many(cv, block, block_len, counter, flags, out)
            },
            _ => {
                // For platforms without an optimized xof_many, fall back to a loop over
                // compress_xof. This is still faster than portable code.
                for out_block in out.chunks_exact_mut(BLOCK_LEN) {
                    // TODO: Use array_chunks_mut here once that's stable.
                    let out_array: &mut [u8; BLOCK_LEN] = out_block.try_into().unwrap();
                    *out_array = self.compress_xof(cv, block, block_len, counter, flags);
                    counter += 1;
                }
            }
        }
    }
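    // A hedged illustration, not part of the original code: the fallback arm
    // above is equivalent to one compress_xof() call per 64-byte output block,
    // with the counter incremented between blocks. The test-only sketch below
    // spells out that equivalence for two blocks of output.
    #[cfg(test)]
    #[allow(dead_code)]
    fn xof_many_fallback_sketch(&self, cv: &CVWords, block: &[u8; BLOCK_LEN]) {
        let mut out = [0u8; 2 * BLOCK_LEN];
        self.xof_many(cv, block, BLOCK_LEN as u8, 0, 0, &mut out);
        // Output block i must match compress_xof() at counter i.
        assert_eq!(out[..BLOCK_LEN], self.compress_xof(cv, block, BLOCK_LEN as u8, 0, 0)[..]);
        assert_eq!(out[BLOCK_LEN..], self.compress_xof(cv, block, BLOCK_LEN as u8, 1, 0)[..]);
    }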

    // Explicit platform constructors, for benchmarks.

    pub fn portable() -> Self {
        Self::Portable
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn sse2() -> Option<Self> {
        if sse2_detected() {
            Some(Self::SSE2)
        } else {
            None
        }
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn sse41() -> Option<Self> {
        if sse41_detected() {
            Some(Self::SSE41)
        } else {
            None
        }
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn avx2() -> Option<Self> {
        if avx2_detected() {
            Some(Self::AVX2)
        } else {
            None
        }
    }

    #[cfg(blake3_avx512_ffi)]
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    pub fn avx512() -> Option<Self> {
        if avx512_detected() {
            Some(Self::AVX512)
        } else {
            None
        }
    }

    #[cfg(blake3_neon)]
    pub fn neon() -> Option<Self> {
        // Assumed to be safe if the "neon" feature is on.
        Some(Self::NEON)
    }
}

// Note that AVX-512 is divided into multiple featuresets, and we use two of
// them, F and VL.
#[cfg(blake3_avx512_ffi)]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx512_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_avx512") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(all(target_feature = "avx512f", target_feature = "avx512vl"))]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
            return true;
        }
    }
    false
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn avx2_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_avx2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "avx2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("avx2") {
            return true;
        }
    }
    false
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse41_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_sse41") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse4.1")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse4.1") {
            return true;
        }
    }
    false
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[inline(always)]
#[allow(unreachable_code)]
pub fn sse2_detected() -> bool {
    if cfg!(miri) {
        return false;
    }

    // A testing-only short-circuit.
    if cfg!(feature = "no_sse2") {
        return false;
    }
    // Static check, e.g. for building with target-cpu=native.
    #[cfg(target_feature = "sse2")]
    {
        return true;
    }
    // Dynamic check, if std is enabled.
    #[cfg(feature = "std")]
    {
        if is_x86_feature_detected!("sse2") {
            return true;
        }
    }
    false
}

#[inline(always)]
pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] {
    let mut out: [u32; 8] = [0; 8];
    out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
    out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
    out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
    out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
    out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
    out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
    out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
    out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
    out
}

#[inline(always)]
pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] {
    let mut out: [u32; 16] = [0; 16];
    out[0] = u32::from_le_bytes(*array_ref!(bytes, 0 * 4, 4));
    out[1] = u32::from_le_bytes(*array_ref!(bytes, 1 * 4, 4));
    out[2] = u32::from_le_bytes(*array_ref!(bytes, 2 * 4, 4));
    out[3] = u32::from_le_bytes(*array_ref!(bytes, 3 * 4, 4));
    out[4] = u32::from_le_bytes(*array_ref!(bytes, 4 * 4, 4));
    out[5] = u32::from_le_bytes(*array_ref!(bytes, 5 * 4, 4));
    out[6] = u32::from_le_bytes(*array_ref!(bytes, 6 * 4, 4));
    out[7] = u32::from_le_bytes(*array_ref!(bytes, 7 * 4, 4));
    out[8] = u32::from_le_bytes(*array_ref!(bytes, 8 * 4, 4));
    out[9] = u32::from_le_bytes(*array_ref!(bytes, 9 * 4, 4));
    out[10] = u32::from_le_bytes(*array_ref!(bytes, 10 * 4, 4));
    out[11] = u32::from_le_bytes(*array_ref!(bytes, 11 * 4, 4));
    out[12] = u32::from_le_bytes(*array_ref!(bytes, 12 * 4, 4));
    out[13] = u32::from_le_bytes(*array_ref!(bytes, 13 * 4, 4));
    out[14] = u32::from_le_bytes(*array_ref!(bytes, 14 * 4, 4));
    out[15] = u32::from_le_bytes(*array_ref!(bytes, 15 * 4, 4));
    out
}

#[inline(always)]
pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] {
    let mut out: [u8; 32] = [0; 32];
    *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
    *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
    *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
    *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
    *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
    *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
    *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
    *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
    out
}

#[inline(always)]
pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] {
    let mut out: [u8; 64] = [0; 64];
    *array_mut_ref!(out, 0 * 4, 4) = words[0].to_le_bytes();
    *array_mut_ref!(out, 1 * 4, 4) = words[1].to_le_bytes();
    *array_mut_ref!(out, 2 * 4, 4) = words[2].to_le_bytes();
    *array_mut_ref!(out, 3 * 4, 4) = words[3].to_le_bytes();
    *array_mut_ref!(out, 4 * 4, 4) = words[4].to_le_bytes();
    *array_mut_ref!(out, 5 * 4, 4) = words[5].to_le_bytes();
    *array_mut_ref!(out, 6 * 4, 4) = words[6].to_le_bytes();
    *array_mut_ref!(out, 7 * 4, 4) = words[7].to_le_bytes();
    *array_mut_ref!(out, 8 * 4, 4) = words[8].to_le_bytes();
    *array_mut_ref!(out, 9 * 4, 4) = words[9].to_le_bytes();
    *array_mut_ref!(out, 10 * 4, 4) = words[10].to_le_bytes();
    *array_mut_ref!(out, 11 * 4, 4) = words[11].to_le_bytes();
    *array_mut_ref!(out, 12 * 4, 4) = words[12].to_le_bytes();
    *array_mut_ref!(out, 13 * 4, 4) = words[13].to_le_bytes();
    *array_mut_ref!(out, 14 * 4, 4) = words[14].to_le_bytes();
    *array_mut_ref!(out, 15 * 4, 4) = words[15].to_le_bytes();
    out
}
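
// A hedged round-trip sketch, not part of the original code: the byte/word
// helpers above are inverses of each other, which a test-only check can spell
// out for the 32-byte pair (the 64-byte pair works the same way).
#[cfg(test)]
#[allow(dead_code)]
fn le_bytes_roundtrip_sketch() {
    let bytes = [0x42u8; 32];
    let words = words_from_le_bytes_32(&bytes);
    assert_eq!(le_bytes_from_words_32(&words), bytes);
}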