| 1 | //! SHA-256 `x86`/`x86_64` backend |
| 2 | |
| 3 | #![allow (clippy::many_single_char_names)] |
| 4 | |
| 5 | #[cfg (target_arch = "x86" )] |
| 6 | use core::arch::x86::*; |
| 7 | #[cfg (target_arch = "x86_64" )] |
| 8 | use core::arch::x86_64::*; |
| 9 | |
/// Extends the SHA-256 message schedule: given the sixteen most recent
/// schedule words packed four per vector (`v0` oldest … `v3` newest, one
/// 32-bit word per lane), returns the next four words.
///
/// # Safety
///
/// The caller must ensure the CPU supports the `sha` and `sse2` target
/// features before calling this function.
unsafe fn schedule(v0: __m128i, v1: __m128i, v2: __m128i, v3: __m128i) -> __m128i {
    // w[t-16] + sigma0(w[t-15]) for the four lanes.
    let t1: __m128i = _mm_sha256msg1_epu32(v0, v1);
    // Shift a 4-byte (one word) window across v2:v3 to obtain the w[t-7] terms.
    let t2: __m128i = _mm_alignr_epi8(v3, v2, 4);
    let t3: __m128i = _mm_add_epi32(t1, t2);
    // Finishes with the sigma1(w[t-2]) terms (cross-lane dependent).
    _mm_sha256msg2_epu32(t3, v3)
}
| 16 | |
/// Runs four SHA-256 rounds on the working state (`$abef`, `$cdgh`) using the
/// four message-schedule words in `$rest` and round-constant group `$i`.
///
/// Each `sha256rnds2` instruction performs two rounds, so the macro issues it
/// twice: first updating CDGH, then ABEF.
macro_rules! rounds4 {
    ($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
        // NOTE(review): `_mm_set_epi32` places k[0] in the *highest* lane, so
        // this assumes `K32X4` stores each constant group pre-reversed —
        // confirm against the `consts` module.
        let k = crate::consts::K32X4[$i];
        let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
        // t1 = message words + round constants; `sha256rnds2` consumes the
        // two low lanes per invocation.
        let t1 = _mm_add_epi32($rest, kv);
        $cdgh = _mm_sha256rnds2_epu32($cdgh, $abef, t1);
        // Move the high two lanes of t1 into the low positions for the
        // second pair of rounds.
        let t2 = _mm_shuffle_epi32(t1, 0x0E);
        $abef = _mm_sha256rnds2_epu32($abef, $cdgh, t2);
    }};
}
| 27 | |
/// Extends the message schedule by four words — computed from `$w0`..`$w3`
/// and stored into `$w4` — then runs four rounds with them via `rounds4!`.
///
/// Callers rotate the five `w*` registers between invocations so each call
/// sees the sixteen most recent schedule words.
macro_rules! schedule_rounds4 {
    (
        $abef:ident, $cdgh:ident,
        $w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
        $i: expr
    ) => {{
        $w4 = schedule($w0, $w1, $w2, $w3);
        rounds4!($abef, $cdgh, $w4, $i);
    }};
}
| 38 | |
/// Compresses `blocks` (64 bytes each) into the SHA-256 `state` in place
/// using the SHA-NI instruction set.
///
/// # Safety
///
/// The caller must verify that the CPU supports the `sha`, `sse2`, `ssse3`
/// and `sse4.1` target features before calling (see `compress`).
// we use unaligned loads with `__m128i` pointers
#[allow(clippy::cast_ptr_alignment)]
#[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
unsafe fn digest_blocks(state: &mut [u32; 8], blocks: &[[u8; 64]]) {
    // Byte-shuffle mask for `_mm_shuffle_epi8`: byte-swaps each 32-bit lane,
    // converting the big-endian message words into native integers.
    #[allow(non_snake_case)]
    let MASK: __m128i = _mm_set_epi64x(
        0x0C0D_0E0F_0809_0A0Bu64 as i64,
        0x0405_0607_0001_0203u64 as i64,
    );

    // Load the eight state words as two vectors and repack them into the
    // (ABEF, CDGH) lane layout that `sha256rnds2` operates on.
    let state_ptr = state.as_ptr() as *const __m128i;
    let dcba = _mm_loadu_si128(state_ptr.add(0));
    let efgh = _mm_loadu_si128(state_ptr.add(1));

    let cdab = _mm_shuffle_epi32(dcba, 0xB1);
    let efgh = _mm_shuffle_epi32(efgh, 0x1B);
    let mut abef = _mm_alignr_epi8(cdab, efgh, 8);
    let mut cdgh = _mm_blend_epi16(efgh, cdab, 0xF0);

    for block in blocks {
        // Save the incoming state for the feed-forward addition after the
        // 64 rounds (Merkle–Damgård construction).
        let abef_save = abef;
        let cdgh_save = cdgh;

        // Load the 64-byte block as four vectors of four message words each,
        // byte-swapped to native order via MASK.
        let data_ptr = block.as_ptr() as *const __m128i;
        let mut w0 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(0)), MASK);
        let mut w1 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(1)), MASK);
        let mut w2 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(2)), MASK);
        let mut w3 = _mm_shuffle_epi8(_mm_loadu_si128(data_ptr.add(3)), MASK);
        let mut w4;

        // Rounds 0-15 consume the raw message words; rounds 16-63 extend the
        // schedule four words at a time, rotating w0..w4 between groups.
        rounds4!(abef, cdgh, w0, 0);
        rounds4!(abef, cdgh, w1, 1);
        rounds4!(abef, cdgh, w2, 2);
        rounds4!(abef, cdgh, w3, 3);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
        schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
        schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
        schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
        schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
        schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
        schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
        schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
        schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);

        // Feed-forward: add the saved state into the new working variables.
        abef = _mm_add_epi32(abef, abef_save);
        cdgh = _mm_add_epi32(cdgh, cdgh_save);
    }

    // Undo the (ABEF, CDGH) packing and store the state back in memory order.
    let feba = _mm_shuffle_epi32(abef, 0x1B);
    let dchg = _mm_shuffle_epi32(cdgh, 0xB1);
    let dcba = _mm_blend_epi16(feba, dchg, 0xF0);
    let hgef = _mm_alignr_epi8(dchg, feba, 8);

    let state_ptr_mut = state.as_mut_ptr() as *mut __m128i;
    _mm_storeu_si128(state_ptr_mut.add(0), dcba);
    _mm_storeu_si128(state_ptr_mut.add(1), hgef);
}
| 99 | |
// Generates the `shani_cpuid` module with a cached runtime check (`get()`)
// for the CPU features required by `digest_blocks`.
cpufeatures::new!(shani_cpuid, "sha", "sse2", "ssse3", "sse4.1");
| 101 | |
| 102 | pub fn compress(state: &mut [u32; 8], blocks: &[[u8; 64]]) { |
| 103 | // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 |
| 104 | // after stabilization |
| 105 | if shani_cpuid::get() { |
| 106 | unsafe { |
| 107 | digest_blocks(state, blocks); |
| 108 | } |
| 109 | } else { |
| 110 | super::soft::compress(state, blocks); |
| 111 | } |
| 112 | } |
| 113 | |