1//! SHA-512 `x86`/`x86_64` backend
2
3#![allow(clippy::many_single_char_names)]
4
5use core::mem::size_of;
6
7#[cfg(target_arch = "x86")]
8use core::arch::x86::*;
9#[cfg(target_arch = "x86_64")]
10use core::arch::x86_64::*;
11
12use crate::consts::K64;
13
14cpufeatures::new!(avx2_cpuid, "avx2");
15
16pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
17 // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
18 // after stabilization
19 if avx2_cpuid::get() {
20 unsafe {
21 sha512_compress_x86_64_avx2(state, blocks);
22 }
23 } else {
24 super::soft::compress(state, blocks);
25 }
26}
27
28#[target_feature(enable = "avx2")]
29unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
30 let mut start_block = 0;
31
32 if blocks.len() & 0b1 != 0 {
33 sha512_compress_x86_64_avx(state, &blocks[0]);
34 start_block += 1;
35 }
36
37 let mut ms: MsgSchedule = [_mm_setzero_si128(); 8];
38 let mut t2: RoundStates = [_mm_setzero_si128(); 40];
39 let mut x = [_mm256_setzero_si256(); 8];
40
41 for i in (start_block..blocks.len()).step_by(2) {
42 load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _);
43
44 // First block
45 let mut current_state = *state;
46 rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2);
47 rounds_64_79(&mut current_state, &ms);
48 accumulate_state(state, &current_state);
49
50 // Second block
51 current_state = *state;
52 process_second_block(&mut current_state, &t2);
53 accumulate_state(state, &current_state);
54 }
55}
56
57#[inline(always)]
58unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
59 let mut ms: [__m128i; 8] = [_mm_setzero_si128(); 8];
60 let mut x: [__m128i; 8] = [_mm_setzero_si128(); 8];
61
62 // Reduced to single iteration
63 let mut current_state: [u64; 8] = *state;
64 load_data_avx(&mut x, &mut ms, data:block.as_ptr() as *const _);
65 rounds_0_63_avx(&mut current_state, &mut x, &mut ms);
66 rounds_64_79(&mut current_state, &ms);
67 accumulate_state(dst:state, &current_state);
68}
69
70#[inline(always)]
71unsafe fn load_data_avx(x: &mut [__m128i; 8], ms: &mut MsgSchedule, data: *const __m128i) {
72 #[allow(non_snake_case)]
73 let MASK: __m128i = _mm_setr_epi32(e3:0x04050607, e2:0x00010203, e1:0x0c0d0e0f, e0:0x08090a0b);
74
75 macro_rules! unrolled_iterations {
76 ($($i:literal),*) => {$(
77 x[$i] = _mm_loadu_si128(data.add($i) as *const _);
78 x[$i] = _mm_shuffle_epi8(x[$i], MASK);
79
80 let y = _mm_add_epi64(
81 x[$i],
82 _mm_loadu_si128(&K64[2 * $i] as *const u64 as *const _),
83 );
84
85 ms[$i] = y;
86 )*};
87 }
88
89 unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
90}
91
92#[inline(always)]
93unsafe fn load_data_avx2(
94 x: &mut [__m256i; 8],
95 ms: &mut MsgSchedule,
96 t2: &mut RoundStates,
97 data: *const __m128i,
98) {
99 #[allow(non_snake_case)]
100 let MASK = _mm256_set_epi64x(
101 0x0809_0A0B_0C0D_0E0F_i64,
102 0x0001_0203_0405_0607_i64,
103 0x0809_0A0B_0C0D_0E0F_i64,
104 0x0001_0203_0405_0607_i64,
105 );
106
107 macro_rules! unrolled_iterations {
108 ($($i:literal),*) => {$(
109 x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add(8 + $i) as *const _), 1);
110 x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i) as *const _), 0);
111
112 x[$i] = _mm256_shuffle_epi8(x[$i], MASK);
113
114 let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _);
115 let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t));
116
117 ms[$i] = _mm256_extracti128_si256(y, 0);
118 t2[$i] = _mm256_extracti128_si256(y, 1);
119 )*};
120 }
121
122 unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
123}
124
125#[inline(always)]
126unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &mut MsgSchedule) {
127 let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM;
128
129 for _ in 0..4 {
130 for j: usize in 0..8 {
131 let k64: __m128i = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _);
132 let y: __m128i = sha512_update_x_avx(x, k64);
133
134 {
135 let ms: &[u64; 16] = cast_ms(ms);
136 sha_round(s:current_state, x:ms[2 * j]);
137 sha_round(s:current_state, x:ms[2 * j + 1]);
138 }
139
140 ms[j] = y;
141 k64_idx += 2;
142 }
143 }
144}
145
146#[inline(always)]
147unsafe fn rounds_0_63_avx2(
148 current_state: &mut State,
149 x: &mut [__m256i; 8],
150 ms: &mut MsgSchedule,
151 t2: &mut RoundStates,
152) {
153 let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM;
154
155 for i: usize in 1..5 {
156 for j: usize in 0..8 {
157 let t: __m128i = _mm_loadu_si128(K64.as_ptr().add(count:k64x4_idx) as *const u64 as *const _);
158 let y: __m256i = sha512_update_x_avx2(x, k64:_mm256_set_m128i(hi:t, lo:t));
159
160 {
161 let ms: &[u64; 16] = cast_ms(ms);
162 sha_round(s:current_state, x:ms[2 * j]);
163 sha_round(s:current_state, x:ms[2 * j + 1]);
164 }
165
166 ms[j] = _mm256_extracti128_si256(y, 0);
167 t2[8 * i + j] = _mm256_extracti128_si256(y, 1);
168
169 k64x4_idx += 2;
170 }
171 }
172}
173
174#[inline(always)]
175fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) {
176 let ms: &[u64; 16] = cast_ms(ms);
177 for i: usize in 64..80 {
178 sha_round(s:current_state, x:ms[i & 0xf]);
179 }
180}
181
182#[inline(always)]
183fn process_second_block(current_state: &mut State, t2: &RoundStates) {
184 for t2: &u64 in cast_rs(t2).iter() {
185 sha_round(s:current_state, *t2);
186 }
187}
188
189#[inline(always)]
190fn sha_round(s: &mut State, x: u64) {
191 macro_rules! big_sigma0 {
192 ($a:expr) => {
193 $a.rotate_right(28) ^ $a.rotate_right(34) ^ $a.rotate_right(39)
194 };
195 }
196 macro_rules! big_sigma1 {
197 ($a:expr) => {
198 $a.rotate_right(14) ^ $a.rotate_right(18) ^ $a.rotate_right(41)
199 };
200 }
201 macro_rules! bool3ary_202 {
202 ($a:expr, $b:expr, $c:expr) => {
203 $c ^ ($a & ($b ^ $c))
204 };
205 } // Choose, MD5F, SHA1C
206 macro_rules! bool3ary_232 {
207 ($a:expr, $b:expr, $c:expr) => {
208 ($a & $b) ^ ($a & $c) ^ ($b & $c)
209 };
210 } // Majority, SHA1M
211
212 macro_rules! rotate_state {
213 ($s:ident) => {{
214 let tmp = $s[7];
215 $s[7] = $s[6];
216 $s[6] = $s[5];
217 $s[5] = $s[4];
218 $s[4] = $s[3];
219 $s[3] = $s[2];
220 $s[2] = $s[1];
221 $s[1] = $s[0];
222 $s[0] = tmp;
223 }};
224 }
225
226 let t = x
227 .wrapping_add(s[7])
228 .wrapping_add(big_sigma1!(s[4]))
229 .wrapping_add(bool3ary_202!(s[4], s[5], s[6]));
230
231 s[7] = t
232 .wrapping_add(big_sigma0!(s[0]))
233 .wrapping_add(bool3ary_232!(s[0], s[1], s[2]));
234 s[3] = s[3].wrapping_add(t);
235
236 rotate_state!(s);
237}
238
239#[inline(always)]
240fn accumulate_state(dst: &mut State, src: &State) {
241 for i: usize in 0..SHA512_HASH_WORDS_NUM {
242 dst[i] = dst[i].wrapping_add(src[i]);
243 }
244}
245
/// Generates a message-schedule update function for one vector width.
///
/// The generated `unsafe fn $name(x, k64)` treats `x` as a sliding window of eight
/// word pairs `q[1:0] ..= q[15:14]` (per 128-bit lane) and:
///
/// 1. computes the next pair
///    `q[17:16] = q[1:0] + q[10:9] + sigma0(q[2:1]) + sigma1(q[15:14])`;
/// 2. slides the window by one pair, so the new pair ends up in `x[7]`;
/// 3. returns the new pair plus `k64`.
///
/// The intrinsics for 64-bit add, byte-wise align-right, 64-bit logical shifts and
/// XOR are supplied by the invocation so the same body serves both widths.
macro_rules! fn_sha512_update_x {
    ($name:ident, $ty:ident, {
        ADD64 = $ADD64:ident,
        ALIGNR8 = $ALIGNR8:ident,
        SRL64 = $SRL64:ident,
        SLL64 = $SLL64:ident,
        XOR = $XOR:ident,
    }) => {
        unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty {
            // Pairs that straddle two window entries are assembled by shifting
            // the concatenation of the neighbours right by 8 bytes.
            let q2_1 = $ALIGNR8(x[1], x[0], 8);
            let q10_9 = $ALIGNR8(x[5], x[4], 8);
            let q15_14 = x[7];

            // sigma0(w) = rotr(w, 1) ^ rotr(w, 8) ^ (w >> 7); each rotation is a
            // right shift combined with the complementary left shift.
            let s0_rotr1 = $XOR($SRL64(q2_1, 1), $SLL64(q2_1, 63));
            let s0_rotr8 = $XOR($SRL64(q2_1, 8), $SLL64(q2_1, 56));
            let sigma0 = $XOR($XOR(s0_rotr1, s0_rotr8), $SRL64(q2_1, 7));

            // sigma1(w) = rotr(w, 19) ^ rotr(w, 61) ^ (w >> 6)
            let s1_rotr19 = $XOR($SRL64(q15_14, 19), $SLL64(q15_14, 45));
            let s1_rotr61 = $XOR($SRL64(q15_14, 61), $SLL64(q15_14, 3));
            let sigma1 = $XOR($XOR(s1_rotr19, s1_rotr61), $SRL64(q15_14, 6));

            // q[1:0] + q[10:9] + sigma0(q[2:1]) + sigma1(q[15:14])
            let q17_16 = $ADD64($ADD64($ADD64(x[0], q10_9), sigma0), sigma1);

            // Slide the window: drop q[1:0], append the freshly computed pair.
            *x = [x[1], x[2], x[3], x[4], x[5], x[6], x[7], q17_16];

            $ADD64(q17_16, k64)
        }
    };
}

// 128-bit instantiation: schedules one block.
fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
    ADD64 = _mm_add_epi64,
    ALIGNR8 = _mm_alignr_epi8,
    SRL64 = _mm_srli_epi64,
    SLL64 = _mm_slli_epi64,
    XOR = _mm_xor_si128,
});

// 256-bit instantiation: schedules two blocks, one per 128-bit half.
fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
    ADD64 = _mm256_add_epi64,
    ALIGNR8 = _mm256_alignr_epi8,
    SRL64 = _mm256_srli_epi64,
    SLL64 = _mm256_slli_epi64,
    XOR = _mm256_xor_si256,
});
338
339#[inline(always)]
340fn cast_ms(ms: &MsgSchedule) -> &[u64; SHA512_BLOCK_WORDS_NUM] {
341 unsafe { &*(ms as *const MsgSchedule as *const _) }
342}
343
344#[inline(always)]
345fn cast_rs(rs: &RoundStates) -> &[u64; SHA512_ROUNDS_NUM] {
346 unsafe { &*(rs as *const RoundStates as *const _) }
347}
348
/// Size of one SHA-512 message block in bytes.
const SHA512_BLOCK_BYTE_LEN: usize = 128;
/// Size of the SHA-512 chaining state in bytes.
const SHA512_HASH_BYTE_LEN: usize = 64;
/// Number of rounds applied per block.
const SHA512_ROUNDS_NUM: usize = 80;
/// Number of 64-bit words in one message block.
const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>();
/// Number of 64-bit words in the chaining state.
const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>();

/// Working variables / chaining state as 64-bit words.
type State = [u64; SHA512_HASH_WORDS_NUM];
/// One block's worth of schedule words, two per 128-bit vector.
type MsgSchedule = [__m128i; SHA512_BLOCK_WORDS_NUM / 2];
/// One value per round, two per 128-bit vector.
type RoundStates = [__m128i; SHA512_ROUNDS_NUM / 2];
358