//! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit)
//! adapted from the C implementation.
//!
//! All implementations are fully bitsliced and do not rely on any
//! Look-Up Table (LUT).
//!
//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
//!
//! # Author (original C code)
//!
//! Alexandre Adomnicai, Nanyang Technological University, Singapore
//! <alexandre.adomnicai@ntu.edu.sg>
//!
//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.

#![allow(clippy::unreadable_literal)]

use crate::Block;
use cipher::{consts::U4, generic_array::GenericArray};

/// AES block batch size for this implementation
pub(crate) type FixsliceBlocks = U4;

pub(crate) type BatchBlocks = GenericArray<Block, FixsliceBlocks>;

/// AES-128 round keys
pub(crate) type FixsliceKeys128 = [u64; 88];

/// AES-192 round keys
pub(crate) type FixsliceKeys192 = [u64; 104];

/// AES-256 round keys
pub(crate) type FixsliceKeys256 = [u64; 120];

/// 512-bit internal state
pub(crate) type State = [u64; 8];

/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 {
    let mut rkeys = [0u64; 88];

    bitslice(&mut rkeys[..8], key, key, key, key);

    let mut rk_off = 0;
    for rcon in 0..10 {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        if rcon < 8 {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        } else {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
        }

        xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..88).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (8..72).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[72..80]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..11 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}
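// Layout note (informal): `rkeys` above holds 11 round keys of 8 words each. Every
// 8-word group is one bitsliced round key with the key material replicated across
// all four block slots, so it can be XORed directly onto a batch of four blocks.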

/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 {
    let mut rkeys = [0u64; 104];
    let mut tmp = [0u64; 8];

    bitslice(
        &mut rkeys[..8],
        &key[..16],
        &key[..16],
        &key[..16],
        &key[..16],
    );
    bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]);

    let mut rcon = 0;
    let mut rk_off = 8;

    loop {
        for i in 0..8 {
            rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8))
                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
        }

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = rkeys[rk_off + i];
            ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1));
            ti ^= 0xf000f000f000f000 & (ti << 4);
            tmp[i] = ti;
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        for i in 0..8 {
            let ui = tmp[i];
            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
                | (0xff00ff00ff00ff00 & (ui << 8));
            ti ^= 0x000f000f000f000f & (ui >> 12);
            tmp[i] = ti
                ^ (0xfff0fff0fff0fff0 & (ti << 4))
                ^ (0xff00ff00ff00ff00 & (ti << 8))
                ^ (0xf000f000f000f000 & (ti << 12));
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8))
                | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8));
            ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3));
            rkeys[rk_off + i] = ti
                ^ (0xfff0fff0fff0fff0 & (ti << 4))
                ^ (0xff00ff00ff00ff00 & (ti << 8))
                ^ (0xf000f000f000f000 & (ti << 12));
        }
        rk_off += 8;

        if rcon >= 8 {
            break;
        }

        for i in 0..8 {
            let ui = rkeys[(rk_off - 8) + i];
            let mut ti = rkeys[(rk_off - 16) + i];
            ti ^= 0x0f000f000f000f00 & (ui >> 4);
            ti ^= 0xf000f000f000f000 & (ti << 4);
            tmp[i] = ti;
        }
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..104).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (0..96).step_by(32) {
            inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
            inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
        }
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..13 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 {
    let mut rkeys = [0u64; 120];

    bitslice(
        &mut rkeys[..8],
        &key[..16],
        &key[..16],
        &key[..16],
        &key[..16],
    );
    bitslice(
        &mut rkeys[8..16],
        &key[16..],
        &key[16..],
        &key[16..],
        &key[16..],
    );

    let mut rk_off = 8;

    let mut rcon = 0;
    loop {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
        rcon += 1;

        if rcon == 7 {
            break;
        }

        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..120).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (8..104).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[104..112]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..15 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in parallel, returning the decrypted blocks.
pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[80..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(aes_compact))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 72;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in parallel, returning the encrypted blocks.
pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 80 {
            break;
        }

        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(aes_compact))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[80..]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in parallel, returning the decrypted blocks.
pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[96..]);
    inv_sub_bytes(&mut state);

    let mut rk_off = 88;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }
        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in parallel, returning the encrypted blocks.
pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }
        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        if rk_off == 96 {
            break;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[96..]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts four blocks in parallel, returning the decrypted blocks.
pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[112..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(aes_compact))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 104;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted).
///
/// Encrypts four blocks in parallel, returning the encrypted blocks.
pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 112 {
            break;
        }

        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(aes_compact))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[112..]);

    inv_bitslice(&state)
}

/// Bitsliced implementation of the inverse AES S-box.
///
/// Note that the 4 bitwise NOTs (^= 0xffffffffffffffff) are accounted for here so that it is a
/// true inverse of 'sub_bytes'.
fn inv_sub_bytes(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let t23 = u0 ^ u3;
    let t8 = u1 ^ t23;
    let m2 = t23 & t8;
    let t4 = u4 ^ t8;
    let t22 = u1 ^ u3;
    let t2 = u0 ^ u1;
    let t1 = u3 ^ u4;
    // t23 -> stack
    let t9 = u7 ^ t1;
    // t8 -> stack
    let m7 = t22 & t9;
    // t9 -> stack
    let t24 = u4 ^ u7;
    // m7 -> stack
    let t10 = t2 ^ t24;
    // u4 -> stack
    let m14 = t2 & t10;
    let r5 = u6 ^ u7;
    // m2 -> stack
    let t3 = t1 ^ r5;
    // t2 -> stack
    let t13 = t2 ^ r5;
    let t19 = t22 ^ r5;
    // t3 -> stack
    let t17 = u2 ^ t19;
    // t4 -> stack
    let t25 = u2 ^ t1;
    let r13 = u1 ^ u6;
    // t25 -> stack
    let t20 = t24 ^ r13;
    // t17 -> stack
    let m9 = t20 & t17;
    // t20 -> stack
    let r17 = u2 ^ u5;
    // t22 -> stack
    let t6 = t22 ^ r17;
    // t13 -> stack
    let m1 = t13 & t6;
    let y5 = u0 ^ r17;
    let m4 = t19 & y5;
    let m5 = m4 ^ m1;
    let m17 = m5 ^ t24;
    let r18 = u5 ^ u6;
    let t27 = t1 ^ r18;
    let t15 = t10 ^ t27;
    // t6 -> stack
    let m11 = t1 & t15;
    let m15 = m14 ^ m11;
    let m21 = m17 ^ m15;
    // t1 -> stack
    // t4 <- stack
    let m12 = t4 & t27;
    let m13 = m12 ^ m11;
    let t14 = t10 ^ r18;
    let m3 = t14 ^ m1;
    // m2 <- stack
    let m16 = m3 ^ m2;
    let m20 = m16 ^ m13;
    // u4 <- stack
    let r19 = u2 ^ u4;
    let t16 = r13 ^ r19;
    // t3 <- stack
    let t26 = t3 ^ t16;
    let m6 = t3 & t16;
    let m8 = t26 ^ m6;
    // t10 -> stack
    // m7 <- stack
    let m18 = m8 ^ m7;
    let m22 = m18 ^ m13;
    let m25 = m22 & m20;
    let m26 = m21 ^ m25;
    let m10 = m9 ^ m6;
    let m19 = m10 ^ m15;
    // t25 <- stack
    let m23 = m19 ^ t25;
    let m28 = m23 ^ m25;
    let m24 = m22 ^ m23;
    let m30 = m26 & m24;
    let m39 = m23 ^ m30;
    let m48 = m39 & y5;
    let m57 = m39 & t19;
    // m48 -> stack
    let m36 = m24 ^ m25;
    let m31 = m20 & m23;
    let m27 = m20 ^ m21;
    let m32 = m27 & m31;
    let m29 = m28 & m27;
    let m37 = m21 ^ m29;
    // m39 -> stack
    let m42 = m37 ^ m39;
    let m52 = m42 & t15;
    // t27 -> stack
    // t1 <- stack
    let m61 = m42 & t1;
    let p0 = m52 ^ m61;
    let p16 = m57 ^ m61;
    // m57 -> stack
    // t20 <- stack
    let m60 = m37 & t20;
    // p16 -> stack
    // t17 <- stack
    let m51 = m37 & t17;
    let m33 = m27 ^ m25;
    let m38 = m32 ^ m33;
    let m43 = m37 ^ m38;
    let m49 = m43 & t16;
    let p6 = m49 ^ m60;
    let p13 = m49 ^ m51;
    let m58 = m43 & t3;
    // t9 <- stack
    let m50 = m38 & t9;
    // t22 <- stack
    let m59 = m38 & t22;
    // p6 -> stack
    let p1 = m58 ^ m59;
    let p7 = p0 ^ p1;
    let m34 = m21 & m22;
    let m35 = m24 & m34;
    let m40 = m35 ^ m36;
    let m41 = m38 ^ m40;
    let m45 = m42 ^ m41;
    // t27 <- stack
    let m53 = m45 & t27;
    let p8 = m50 ^ m53;
    let p23 = p7 ^ p8;
    // t4 <- stack
    let m62 = m45 & t4;
    let p14 = m49 ^ m62;
    let s6 = p14 ^ p23;
    // t10 <- stack
    let m54 = m41 & t10;
    let p2 = m54 ^ m62;
    let p22 = p2 ^ p7;
    let s0 = p13 ^ p22;
    let p17 = m58 ^ p2;
    let p15 = m54 ^ m59;
    // t2 <- stack
    let m63 = m41 & t2;
    // m39 <- stack
    let m44 = m39 ^ m40;
    // p17 -> stack
    // t6 <- stack
    let m46 = m44 & t6;
    let p5 = m46 ^ m51;
    // p23 -> stack
    let p18 = m63 ^ p5;
    let p24 = p5 ^ p7;
    // m48 <- stack
    let p12 = m46 ^ m48;
    let s3 = p12 ^ p22;
    // t13 <- stack
    let m55 = m44 & t13;
    let p9 = m55 ^ m63;
    // p16 <- stack
    let s7 = p9 ^ p16;
    // t8 <- stack
    let m47 = m40 & t8;
    let p3 = m47 ^ m50;
    let p19 = p2 ^ p3;
    let s5 = p19 ^ p24;
    let p11 = p0 ^ p3;
    let p26 = p9 ^ p11;
    // t23 <- stack
    let m56 = m40 & t23;
    let p4 = m48 ^ m56;
    // p6 <- stack
    let p20 = p4 ^ p6;
    let p29 = p15 ^ p20;
    let s1 = p26 ^ p29;
    // m57 <- stack
    let p10 = m57 ^ p4;
    let p27 = p10 ^ p18;
    // p23 <- stack
    let s4 = p23 ^ p27;
    let p25 = p6 ^ p10;
    let p28 = p11 ^ p25;
    // p17 <- stack
    let s2 = p17 ^ p28;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik.
///
/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
///
/// Note that the 4 bitwise NOT (^= 0xffffffffffffffff) are moved to the key schedule.
fn sub_bytes(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let y14 = u3 ^ u5;
    let y13 = u0 ^ u6;
    let y12 = y13 ^ y14;
    let t1 = u4 ^ y12;
    let y15 = t1 ^ u5;
    let t2 = y12 & y15;
    let y6 = y15 ^ u7;
    let y20 = t1 ^ u1;
    // y12 -> stack
    let y9 = u0 ^ u3;
    // y20 -> stack
    let y11 = y20 ^ y9;
    // y9 -> stack
    let t12 = y9 & y11;
    // y6 -> stack
    let y7 = u7 ^ y11;
    let y8 = u0 ^ u5;
    let t0 = u1 ^ u2;
    let y10 = y15 ^ t0;
    // y15 -> stack
    let y17 = y10 ^ y11;
    // y14 -> stack
    let t13 = y14 & y17;
    let t14 = t13 ^ t12;
    // y17 -> stack
    let y19 = y10 ^ y8;
    // y10 -> stack
    let t15 = y8 & y10;
    let t16 = t15 ^ t12;
    let y16 = t0 ^ y11;
    // y11 -> stack
    let y21 = y13 ^ y16;
    // y13 -> stack
    let t7 = y13 & y16;
    // y16 -> stack
    let y18 = u0 ^ y16;
    let y1 = t0 ^ u7;
    let y4 = y1 ^ u3;
    // u7 -> stack
    let t5 = y4 & u7;
    let t6 = t5 ^ t2;
    let t18 = t6 ^ t16;
    let t22 = t18 ^ y19;
    let y2 = y1 ^ u0;
    let t10 = y2 & y7;
    let t11 = t10 ^ t7;
    let t20 = t11 ^ t16;
    let t24 = t20 ^ y18;
    let y5 = y1 ^ u6;
    let t8 = y5 & y1;
    let t9 = t8 ^ t7;
    let t19 = t9 ^ t14;
    let t23 = t19 ^ y21;
    let y3 = y5 ^ y8;
    // y6 <- stack
    let t3 = y3 & y6;
    let t4 = t3 ^ t2;
    // y20 <- stack
    let t17 = t4 ^ y20;
    let t21 = t17 ^ t14;
    let t26 = t21 & t23;
    let t27 = t24 ^ t26;
    let t31 = t22 ^ t26;
    let t25 = t21 ^ t22;
    // y4 -> stack
    let t28 = t25 & t27;
    let t29 = t28 ^ t22;
    let z14 = t29 & y2;
    let z5 = t29 & y7;
    let t30 = t23 ^ t24;
    let t32 = t31 & t30;
    let t33 = t32 ^ t24;
    let t35 = t27 ^ t33;
    let t36 = t24 & t35;
    let t38 = t27 ^ t36;
    let t39 = t29 & t38;
    let t40 = t25 ^ t39;
    let t43 = t29 ^ t40;
    // y16 <- stack
    let z3 = t43 & y16;
    let tc12 = z3 ^ z5;
    // tc12 -> stack
    // y13 <- stack
    let z12 = t43 & y13;
    let z13 = t40 & y5;
    let z4 = t40 & y1;
    let tc6 = z3 ^ z4;
    let t34 = t23 ^ t33;
    let t37 = t36 ^ t34;
    let t41 = t40 ^ t37;
    // y10 <- stack
    let z8 = t41 & y10;
    let z17 = t41 & y8;
    let t44 = t33 ^ t37;
    // y15 <- stack
    let z0 = t44 & y15;
    // z17 -> stack
    // y12 <- stack
    let z9 = t44 & y12;
    let z10 = t37 & y3;
    let z1 = t37 & y6;
    let tc5 = z1 ^ z0;
    let tc11 = tc6 ^ tc5;
    // y4 <- stack
    let z11 = t33 & y4;
    let t42 = t29 ^ t33;
    let t45 = t42 ^ t41;
    // y17 <- stack
    let z7 = t45 & y17;
    let tc8 = z7 ^ tc6;
    // y14 <- stack
    let z16 = t45 & y14;
    // y11 <- stack
    let z6 = t42 & y11;
    let tc16 = z6 ^ tc8;
    // z14 -> stack
    // y9 <- stack
    let z15 = t42 & y9;
    let tc20 = z15 ^ tc16;
    let tc1 = z15 ^ z16;
    let tc2 = z10 ^ tc1;
    let tc21 = tc2 ^ z11;
    let tc3 = z9 ^ tc2;
    let s0 = tc3 ^ tc16;
    let s3 = tc3 ^ tc11;
    let s1 = s3 ^ tc16;
    let tc13 = z13 ^ tc1;
    // u7 <- stack
    let z2 = t33 & u7;
    let tc4 = z0 ^ z2;
    let tc7 = z12 ^ tc4;
    let tc9 = z8 ^ tc7;
    let tc10 = tc8 ^ tc9;
    // z14 <- stack
    let tc17 = z14 ^ tc10;
    let s5 = tc21 ^ tc17;
    let tc26 = tc17 ^ tc20;
    // z17 <- stack
    let s2 = tc26 ^ z17;
    // tc12 <- stack
    let tc14 = tc4 ^ tc12;
    let tc18 = tc13 ^ tc14;
    let s6 = tc10 ^ tc18;
    let s7 = z12 ^ tc18;
    let s4 = tc14 ^ s3;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// NOT operations that are omitted in S-box
#[inline]
fn sub_bytes_nots(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    state[0] ^= 0xffffffffffffffff;
    state[1] ^= 0xffffffffffffffff;
    state[5] ^= 0xffffffffffffffff;
    state[6] ^= 0xffffffffffffffff;
}
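// Informal note: the four complements removed from `sub_bytes` are XORs with an
// all-ones word, and every step between the S-box output and the next AddRoundKey
// (ShiftRows, MixColumns) is XOR-linear, so their net effect is a fixed pattern that
// the key schedules pre-fold into the stored round keys (the trailing
// `sub_bytes_nots` loops there).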

/// Computation of the MixColumns transformation in the fixsliced representation, with different
/// rotations used according to the round number mod 4.
///
/// Based on Käsper-Schwabe, similar to <https://github.com/Ko-/aes-armcortexm>.
macro_rules! define_mix_columns {
    (
        $name:ident,
        $name_inv:ident,
        $first_rotate:path,
        $second_rotate:path
    ) => {
        #[rustfmt::skip]
        fn $name(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            state[0] = b0 ^ c7 ^ $second_rotate(c0);
            state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
            state[2] = b2 ^ c1 ^ $second_rotate(c2);
            state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
            state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
            state[5] = b5 ^ c4 ^ $second_rotate(c5);
            state[6] = b6 ^ c5 ^ $second_rotate(c6);
            state[7] = b7 ^ c6 ^ $second_rotate(c7);
        }

        #[rustfmt::skip]
        fn $name_inv(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            let (d0, d1, d2, d3, d4, d5, d6, d7) = (
                a0 ^ c7,
                a1 ^ c0 ^ c7,
                a2 ^ c1,
                a3 ^ c2 ^ c7,
                a4 ^ c3 ^ c7,
                a5 ^ c4,
                a6 ^ c5,
                a7 ^ c6,
            );
            let (e0, e1, e2, e3, e4, e5, e6, e7) = (
                c0 ^ d6,
                c1 ^ d6 ^ d7,
                c2 ^ d0 ^ d7,
                c3 ^ d1 ^ d6,
                c4 ^ d2 ^ d6 ^ d7,
                c5 ^ d3 ^ d7,
                c6 ^ d4,
                c7 ^ d5,
            );
            state[0] = d0 ^ e0 ^ $second_rotate(e0);
            state[1] = d1 ^ e1 ^ $second_rotate(e1);
            state[2] = d2 ^ e2 ^ $second_rotate(e2);
            state[3] = d3 ^ e3 ^ $second_rotate(e3);
            state[4] = d4 ^ e4 ^ $second_rotate(e4);
            state[5] = d5 ^ e5 ^ $second_rotate(e5);
            state[6] = d6 ^ e6 ^ $second_rotate(e6);
            state[7] = d7 ^ e7 ^ $second_rotate(e7);
        }
    }
}

define_mix_columns!(
    mix_columns_0,
    inv_mix_columns_0,
    rotate_rows_1,
    rotate_rows_2
);

define_mix_columns!(
    mix_columns_1,
    inv_mix_columns_1,
    rotate_rows_and_columns_1_1,
    rotate_rows_and_columns_2_2
);

#[cfg(not(aes_compact))]
define_mix_columns!(
    mix_columns_2,
    inv_mix_columns_2,
    rotate_rows_and_columns_1_2,
    rotate_rows_2
);

#[cfg(not(aes_compact))]
define_mix_columns!(
    mix_columns_3,
    inv_mix_columns_3,
    rotate_rows_and_columns_1_3,
    rotate_rows_and_columns_2_2
);

#[inline]
fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) {
    let t = (*a ^ ((*a) >> shift)) & mask;
    *a ^= t ^ (t << shift);
}

#[inline]
fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) {
    let t = (*a ^ ((*b) >> shift)) & mask;
    *a ^= t;
    *b ^= t << shift;
}
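// Informal worked example: `delta_swap_1` exchanges, for every set bit `i` of `mask`,
// bit `i` of `a` with bit `i + shift` of `a` (the classic "delta swap" bit trick);
// e.g. with shift = 8 and mask = 0x00ff it swaps the two low bytes of a word.
// `delta_swap_2` performs the same exchange, but between two different words.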

/// Applies ShiftRows once on an AES state (or key).
#[cfg(any(not(aes_compact), feature = "hazmat"))]
#[inline]
fn shift_rows_1(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x00f000ff000f0000);
        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
}

/// Applies ShiftRows twice on an AES state (or key).
#[inline]
fn shift_rows_2(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x00ff000000ff0000);
    }
}

/// Applies ShiftRows three times on an AES state (or key).
#[inline]
fn shift_rows_3(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x000f00ff00f00000);
        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
}

#[inline(always)]
fn inv_shift_rows_1(state: &mut [u64]) {
    shift_rows_3(state);
}

#[inline(always)]
fn inv_shift_rows_2(state: &mut [u64]) {
    shift_rows_2(state);
}

#[cfg(not(aes_compact))]
#[inline(always)]
fn inv_shift_rows_3(state: &mut [u64]) {
    shift_rows_1(state);
}

/// XOR the columns after the S-box during the key schedule round function.
///
/// The `idx_xor` parameter refers to the index of the previous round key that is
/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
/// respectively).
///
/// The `idx_ror` parameter refers to the rotation value, which varies between the
/// different key schedules.
fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) {
    for i in 0..8 {
        let off_i = offset + i;
        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
        rkeys[off_i] = rk
            ^ (0xfff0fff0fff0fff0 & (rk << 4))
            ^ (0xff00ff00ff00ff00 & (rk << 8))
            ^ (0xf000f000f000f000 & (rk << 12));
    }
}
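// Informal note: within each 16-bit row segment of `rk`, the shift-and-mask cascade
// above turns the four 4-bit column slices into running XORs (column j becomes the
// XOR of columns 0..=j), which is, roughly speaking, the unrolled form of the column
// recurrence used by the AES key expansion.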

/// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state.
fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) {
    debug_assert_eq!(output.len(), 8);
    debug_assert_eq!(input0.len(), 16);
    debug_assert_eq!(input1.len(), 16);
    debug_assert_eq!(input2.len(), 16);
    debug_assert_eq!(input3.len(), 16);

    // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a
    // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
    // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The desired bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0

    #[rustfmt::skip]
    fn read_reordered(input: &[u8]) -> u64 {
        (u64::from(input[0x0])        ) |
        (u64::from(input[0x1]) << 0x10) |
        (u64::from(input[0x2]) << 0x20) |
        (u64::from(input[0x3]) << 0x30) |
        (u64::from(input[0x8]) << 0x08) |
        (u64::from(input[0x9]) << 0x18) |
        (u64::from(input[0xa]) << 0x28) |
        (u64::from(input[0xb]) << 0x38)
    }

    // Reorder each block's bytes on input
    //     __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __
    // Reorder by relabeling (note the order of input)
    //     b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __
    let mut t0 = read_reordered(&input0[0x00..0x0c]);
    let mut t4 = read_reordered(&input0[0x04..0x10]);
    let mut t1 = read_reordered(&input1[0x00..0x0c]);
    let mut t5 = read_reordered(&input1[0x04..0x10]);
    let mut t2 = read_reordered(&input2[0x00..0x0c]);
    let mut t6 = read_reordered(&input2[0x04..0x10]);
    let mut t3 = read_reordered(&input3[0x00..0x0c]);
    let mut t7 = read_reordered(&input3[0x04..0x10]);

    // Bit Index Swap 6 <-> 0:
    //     __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0
    let m0 = 0x5555555555555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 7 <-> 1:
    //     __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __
    let m1 = 0x3333333333333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 8 <-> 2:
    //     c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __
    let m2 = 0x0f0f0f0f0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    // Final bitsliced bit index, as desired:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0
    output[0] = t0;
    output[1] = t1;
    output[2] = t2;
    output[3] = t3;
    output[4] = t4;
    output[5] = t5;
    output[6] = t6;
    output[7] = t7;
}

/// Un-bitslice a 512-bit internal state into four 128-bit blocks of output.
fn inv_bitslice(input: &[u64]) -> BatchBlocks {
    debug_assert_eq!(input.len(), 8);

    // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at
    // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the
    // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The initially bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b1 b0

    let mut t0 = input[0];
    let mut t1 = input[1];
    let mut t2 = input[2];
    let mut t3 = input[3];
    let mut t4 = input[4];
    let mut t5 = input[5];
    let mut t6 = input[6];
    let mut t7 = input[7];

    // TODO: these bit index swaps are identical to those in 'packing'

    // Bit Index Swap 6 <-> 0:
    //     __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0
    let m0 = 0x5555555555555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 7 <-> 1:
    //     __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __
    let m1 = 0x3333333333333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 8 <-> 2:
    //     p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __
    let m2 = 0x0f0f0f0f0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    #[rustfmt::skip]
    fn write_reordered(columns: u64, output: &mut [u8]) {
        output[0x0] = (columns        ) as u8;
        output[0x1] = (columns >> 0x10) as u8;
        output[0x2] = (columns >> 0x20) as u8;
        output[0x3] = (columns >> 0x30) as u8;
        output[0x8] = (columns >> 0x08) as u8;
        output[0x9] = (columns >> 0x18) as u8;
        output[0xa] = (columns >> 0x28) as u8;
        output[0xb] = (columns >> 0x38) as u8;
    }

    let mut output = BatchBlocks::default();
    // Reorder by relabeling (note the order of output)
    //     c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __
    // Reorder each block's bytes on output
    //     __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __
    write_reordered(t0, &mut output[0][0x00..0x0c]);
    write_reordered(t4, &mut output[0][0x04..0x10]);
    write_reordered(t1, &mut output[1][0x00..0x0c]);
    write_reordered(t5, &mut output[1][0x04..0x10]);
    write_reordered(t2, &mut output[2][0x00..0x0c]);
    write_reordered(t6, &mut output[2][0x04..0x10]);
    write_reordered(t3, &mut output[3][0x00..0x0c]);
    write_reordered(t7, &mut output[3][0x04..0x10]);

    // Final AES bit index, as desired:
    //     b1 b0 c1 c0 r1 r0 p2 p1 p0
    output
}

/// Copy one 8-word (bitsliced) round key within the provided slice to the next
/// 8-word offset.
fn memshift32(buffer: &mut [u64], src_offset: usize) {
    debug_assert_eq!(src_offset % 8, 0);

    let dst_offset = src_offset + 8;
    debug_assert!(dst_offset + 8 <= buffer.len());

    for i in (0..8).rev() {
        buffer[dst_offset + i] = buffer[src_offset + i];
    }
}

/// XOR the round key to the internal state. The round keys are expected to be
/// pre-computed and to be packed in the fixsliced representation.
#[inline]
fn add_round_key(state: &mut State, rkey: &[u64]) {
    debug_assert_eq!(rkey.len(), 8);
    for (a, b) in state.iter_mut().zip(rkey) {
        *a ^= b;
    }
}

#[inline(always)]
fn add_round_constant_bit(state: &mut [u64], bit: usize) {
    state[bit] ^= 0x00000000f0000000;
}
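// Informal note: 0x00000000f0000000 sets four bits that differ only in their block
// index, i.e. it flips the chosen bit plane at a single byte position of the state in
// all four block slots at once. For round constants beyond 0x80 the key schedule XORs
// several planes (e.g. 0x1b = bits 0, 1, 3 and 4), as in `aes128_key_schedule`.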

#[inline(always)]
fn ror(x: u64, y: u32) -> u64 {
    x.rotate_right(y)
}

#[inline(always)]
fn ror_distance(rows: u32, cols: u32) -> u32 {
    (rows << 4) + (cols << 2)
}
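// Informal note: within one 64-bit slice the bit index is laid out as
// r1 r0 c1 c0 b1 b0 (see `bitslice`), so a one-row step is 16 bit positions and a
// one-column step is 4, which is exactly (rows << 4) + (cols << 2).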

#[inline(always)]
fn rotate_rows_1(x: u64) -> u64 {
    ror(x, ror_distance(1, 0))
}

#[inline(always)]
fn rotate_rows_2(x: u64) -> u64 {
    ror(x, ror_distance(2, 0))
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
    (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
    (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
}

#[cfg(not(aes_compact))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
    (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
    (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
}

#[cfg(not(aes_compact))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
    (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
    (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
    (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
    (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
}

/// Low-level "hazmat" AES functions.
///
/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
/// implementations in this crate, but instead provides raw access to
/// the AES round function gated under the `hazmat` crate feature.
#[cfg(feature = "hazmat")]
pub(crate) mod hazmat {
    use super::{
        bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0,
        shift_rows_1, sub_bytes, sub_bytes_nots, State,
    };
    use crate::{Block, Block8};

    /// XOR the `src` block into the `dst` block in-place.
    fn xor_in_place(dst: &mut Block, src: &Block) {
        for (a, b) in dst.iter_mut().zip(src.as_slice()) {
            *a ^= *b;
        }
    }

    /// Perform a bitslice operation, loading a single block.
    fn bitslice_block(block: &Block) -> State {
        let mut state = State::default();
        bitslice(&mut state, block, block, block, block);
        state
    }

    /// Perform an inverse bitslice operation, extracting a single block.
    fn inv_bitslice_block(block: &mut Block, state: &State) {
        block.copy_from_slice(&inv_bitslice(state)[0]);
    }

    /// AES cipher (encrypt) round function.
    #[inline]
    pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) {
        let mut state = bitslice_block(block);
        sub_bytes(&mut state);
        sub_bytes_nots(&mut state);
        shift_rows_1(&mut state);
        mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
        xor_in_place(block, round_key);
    }

    /// AES cipher (encrypt) round function: parallel version.
    #[inline]
    pub(crate) fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
        for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
            let mut state = State::default();
            bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
            sub_bytes(&mut state);
            sub_bytes_nots(&mut state);
            shift_rows_1(&mut state);
            mix_columns_0(&mut state);
            let res = inv_bitslice(&state);

            for i in 0..4 {
                chunk[i] = res[i];
                xor_in_place(&mut chunk[i], &keys[i]);
            }
        }
    }

    /// AES equivalent inverse cipher (decrypt) round function.
    #[inline]
    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
        let mut state = State::default();
        bitslice(&mut state, block, block, block, block);
        sub_bytes_nots(&mut state);
        inv_sub_bytes(&mut state);
        inv_shift_rows_1(&mut state);
        inv_mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
        xor_in_place(block, round_key);
    }

    /// AES equivalent inverse cipher (decrypt) round function: parallel version.
    #[inline]
    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
        for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) {
            let mut state = State::default();
            bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]);
            sub_bytes_nots(&mut state);
            inv_sub_bytes(&mut state);
            inv_shift_rows_1(&mut state);
            inv_mix_columns_0(&mut state);
            let res = inv_bitslice(&state);

            for i in 0..4 {
                chunk[i] = res[i];
                xor_in_place(&mut chunk[i], &keys[i]);
            }
        }
    }

    /// AES mix columns function.
    #[inline]
    pub(crate) fn mix_columns(block: &mut Block) {
        let mut state = bitslice_block(block);
        mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
    }

    /// AES inverse mix columns function.
    #[inline]
    pub(crate) fn inv_mix_columns(block: &mut Block) {
        let mut state = bitslice_block(block);
        inv_mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
    }
}

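// A minimal sanity sketch (not part of the original code): encrypts the FIPS-197
// Appendix C.1 vector in all four parallel slots and round-trips it through the
// decryption path. Assumes `Block` is a 16-byte `GenericArray`, as used above.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn aes128_fips197_vector_roundtrip() {
        let key = [
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
            0x0e, 0x0f,
        ];
        let plaintext = [
            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd,
            0xee, 0xff,
        ];
        let expected_ciphertext = [
            0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30, 0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4,
            0xc5, 0x5a,
        ];

        let rkeys = aes128_key_schedule(&key);

        // Fill the whole batch with the same block; the four slots are independent.
        let mut blocks = BatchBlocks::default();
        for block in blocks.iter_mut() {
            block.copy_from_slice(&plaintext);
        }

        let ct = aes128_encrypt(&rkeys, &blocks);
        for block in ct.iter() {
            assert_eq!(block.as_slice(), &expected_ciphertext[..]);
        }

        let pt = aes128_decrypt(&rkeys, &ct);
        for block in pt.iter() {
            assert_eq!(block.as_slice(), &plaintext[..]);
        }
    }
}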