1 | //! Fixsliced implementations of AES-128, AES-192 and AES-256 (64-bit) |
2 | //! adapted from the C implementation. |
3 | //! |
4 | //! All implementations are fully bitsliced and do not rely on any |
5 | //! Look-Up Table (LUT). |
6 | //! |
7 | //! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details. |
8 | //! |
9 | //! # Author (original C code) |
10 | //! |
11 | //! Alexandre Adomnicai, Nanyang Technological University, Singapore |
12 | //! <alexandre.adomnicai@ntu.edu.sg> |
13 | //! |
14 | //! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission. |
15 | |
16 | #![allow (clippy::unreadable_literal)] |
17 | |
18 | use crate::Block; |
19 | use cipher::{consts::U4, generic_array::GenericArray}; |
20 | |
21 | /// AES block batch size for this implementation |
22 | pub(crate) type FixsliceBlocks = U4; |
23 | |
24 | pub(crate) type BatchBlocks = GenericArray<Block, FixsliceBlocks>; |
25 | |
26 | /// AES-128 round keys |
27 | pub(crate) type FixsliceKeys128 = [u64; 88]; |
28 | |
29 | /// AES-192 round keys |
30 | pub(crate) type FixsliceKeys192 = [u64; 104]; |
31 | |
32 | /// AES-256 round keys |
33 | pub(crate) type FixsliceKeys256 = [u64; 120]; |
34 | |
/// 512-bit internal state: eight 64-bit words, each holding one bit position of every byte across four parallel blocks
36 | pub(crate) type State = [u64; 8]; |
37 | |
38 | /// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation. |
39 | pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 { |
40 | let mut rkeys = [0u64; 88]; |
41 | |
42 | bitslice(&mut rkeys[..8], key, key, key, key); |
43 | |
44 | let mut rk_off = 0; |
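    // Each iteration derives the next round key from the previous one: the previous key is
    // copied forward, run through the S-box (with the NOTs that were removed from `sub_bytes`
    // reapplied), the round constant is mixed in, and `xor_columns` performs the word-by-word
    // XOR chain of the standard key expansion.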
45 | for rcon in 0..10 { |
46 | memshift32(&mut rkeys, rk_off); |
47 | rk_off += 8; |
48 | |
49 | sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); |
50 | sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); |
51 | |
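        // Round constants 0x01..=0x80 have a single bit set (bit `rcon`); the last two,
        // 0x1b and 0x36, have four bits set each, hence the extra bit-plane XORs below.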
52 | if rcon < 8 { |
53 | add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); |
54 | } else { |
55 | add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8); |
56 | add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7); |
57 | add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5); |
58 | add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4); |
59 | } |
60 | |
61 | xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3)); |
62 | } |
63 | |
64 | // Adjust to match fixslicing format |
65 | #[cfg (aes_compact)] |
66 | { |
67 | for i in (8..88).step_by(16) { |
68 | inv_shift_rows_1(&mut rkeys[i..(i + 8)]); |
69 | } |
70 | } |
71 | #[cfg (not(aes_compact))] |
72 | { |
73 | for i in (8..72).step_by(32) { |
74 | inv_shift_rows_1(&mut rkeys[i..(i + 8)]); |
75 | inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); |
76 | inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); |
77 | } |
78 | inv_shift_rows_1(&mut rkeys[72..80]); |
79 | } |
80 | |
81 | // Account for NOTs removed from sub_bytes |
82 | for i in 1..11 { |
83 | sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); |
84 | } |
85 | |
86 | rkeys |
87 | } |
88 | |
89 | /// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation. |
90 | pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 { |
91 | let mut rkeys = [0u64; 104]; |
92 | let mut tmp = [0u64; 8]; |
93 | |
94 | bitslice( |
95 | &mut rkeys[..8], |
96 | &key[..16], |
97 | &key[..16], |
98 | &key[..16], |
99 | &key[..16], |
100 | ); |
101 | bitslice(&mut tmp, &key[8..], &key[8..], &key[8..], &key[8..]); |
102 | |
103 | let mut rcon = 0; |
104 | let mut rk_off = 8; |
105 | |
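    // AES-192 expands six 32-bit words per application of the round function, which does not
    // line up with the 128-bit round-key slots. Each pass of this loop therefore emits three
    // round keys, stitching them together from halves of consecutive six-word chunks with the
    // masked shifts below, and applies SubWord plus a round constant twice along the way.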
106 | loop { |
107 | for i in 0..8 { |
108 | rkeys[rk_off + i] = (0x00ff00ff00ff00ff & (tmp[i] >> 8)) |
109 | | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); |
110 | } |
111 | |
112 | sub_bytes(&mut tmp); |
113 | sub_bytes_nots(&mut tmp); |
114 | |
115 | add_round_constant_bit(&mut tmp, rcon); |
116 | rcon += 1; |
117 | |
118 | for i in 0..8 { |
119 | let mut ti = rkeys[rk_off + i]; |
120 | ti ^= 0x0f000f000f000f00 & ror(tmp[i], ror_distance(1, 1)); |
121 | ti ^= 0xf000f000f000f000 & (ti << 4); |
122 | tmp[i] = ti; |
123 | } |
124 | rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); |
125 | rk_off += 8; |
126 | |
127 | for i in 0..8 { |
128 | let ui = tmp[i]; |
129 | let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) |
130 | | (0xff00ff00ff00ff00 & (ui << 8)); |
131 | ti ^= 0x000f000f000f000f & (ui >> 12); |
132 | tmp[i] = ti |
133 | ^ (0xfff0fff0fff0fff0 & (ti << 4)) |
134 | ^ (0xff00ff00ff00ff00 & (ti << 8)) |
135 | ^ (0xf000f000f000f000 & (ti << 12)); |
136 | } |
137 | rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp); |
138 | rk_off += 8; |
139 | |
140 | sub_bytes(&mut tmp); |
141 | sub_bytes_nots(&mut tmp); |
142 | |
143 | add_round_constant_bit(&mut tmp, rcon); |
144 | rcon += 1; |
145 | |
146 | for i in 0..8 { |
147 | let mut ti = (0x00ff00ff00ff00ff & (rkeys[(rk_off - 16) + i] >> 8)) |
148 | | (0xff00ff00ff00ff00 & (rkeys[(rk_off - 8) + i] << 8)); |
149 | ti ^= 0x000f000f000f000f & ror(tmp[i], ror_distance(1, 3)); |
150 | rkeys[rk_off + i] = ti |
151 | ^ (0xfff0fff0fff0fff0 & (ti << 4)) |
152 | ^ (0xff00ff00ff00ff00 & (ti << 8)) |
153 | ^ (0xf000f000f000f000 & (ti << 12)); |
154 | } |
155 | rk_off += 8; |
156 | |
157 | if rcon >= 8 { |
158 | break; |
159 | } |
160 | |
161 | for i in 0..8 { |
162 | let ui = rkeys[(rk_off - 8) + i]; |
163 | let mut ti = rkeys[(rk_off - 16) + i]; |
164 | ti ^= 0x0f000f000f000f00 & (ui >> 4); |
165 | ti ^= 0xf000f000f000f000 & (ti << 4); |
166 | tmp[i] = ti; |
167 | } |
168 | } |
169 | |
170 | // Adjust to match fixslicing format |
171 | #[cfg (aes_compact)] |
172 | { |
173 | for i in (8..104).step_by(16) { |
174 | inv_shift_rows_1(&mut rkeys[i..(i + 8)]); |
175 | } |
176 | } |
177 | #[cfg (not(aes_compact))] |
178 | { |
179 | for i in (0..96).step_by(32) { |
180 | inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]); |
181 | inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]); |
182 | inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]); |
183 | } |
184 | } |
185 | |
186 | // Account for NOTs removed from sub_bytes |
187 | for i in 1..13 { |
188 | sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); |
189 | } |
190 | |
191 | rkeys |
192 | } |
193 | |
194 | /// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation. |
195 | pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 { |
196 | let mut rkeys = [0u64; 120]; |
197 | |
198 | bitslice( |
199 | &mut rkeys[..8], |
200 | &key[..16], |
201 | &key[..16], |
202 | &key[..16], |
203 | &key[..16], |
204 | ); |
205 | bitslice( |
206 | &mut rkeys[8..16], |
207 | &key[16..], |
208 | &key[16..], |
209 | &key[16..], |
210 | &key[16..], |
211 | ); |
212 | |
213 | let mut rk_off = 8; |
214 | |
215 | let mut rcon = 0; |
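    // The loop alternates the two AES-256 expansion steps: RotWord + SubWord + Rcon for one
    // round key (rotation `ror_distance(1, 3)`), then SubWord only for the next
    // (rotation `ror_distance(0, 3)`, no round constant), as in the standard key schedule.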
216 | loop { |
217 | memshift32(&mut rkeys, rk_off); |
218 | rk_off += 8; |
219 | |
220 | sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); |
221 | sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); |
222 | |
223 | add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon); |
224 | xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3)); |
225 | rcon += 1; |
226 | |
227 | if rcon == 7 { |
228 | break; |
229 | } |
230 | |
231 | memshift32(&mut rkeys, rk_off); |
232 | rk_off += 8; |
233 | |
234 | sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]); |
235 | sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]); |
236 | |
237 | xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3)); |
238 | } |
239 | |
240 | // Adjust to match fixslicing format |
241 | #[cfg (aes_compact)] |
242 | { |
243 | for i in (8..120).step_by(16) { |
244 | inv_shift_rows_1(&mut rkeys[i..(i + 8)]); |
245 | } |
246 | } |
247 | #[cfg (not(aes_compact))] |
248 | { |
249 | for i in (8..104).step_by(32) { |
250 | inv_shift_rows_1(&mut rkeys[i..(i + 8)]); |
251 | inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]); |
252 | inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]); |
253 | } |
254 | inv_shift_rows_1(&mut rkeys[104..112]); |
255 | } |
256 | |
257 | // Account for NOTs removed from sub_bytes |
258 | for i in 1..15 { |
259 | sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]); |
260 | } |
261 | |
262 | rkeys |
263 | } |
264 | |
265 | /// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted). |
266 | /// |
/// Decrypts four blocks in parallel and returns them as a new batch.
268 | pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { |
269 | let mut state = State::default(); |
270 | |
271 | bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); |
272 | |
273 | add_round_key(&mut state, &rkeys[80..]); |
274 | inv_sub_bytes(&mut state); |
275 | |
276 | #[cfg (not(aes_compact))] |
277 | { |
278 | inv_shift_rows_2(&mut state); |
279 | } |
280 | |
281 | let mut rk_off = 72; |
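    // Walk the key schedule backwards, pairing each round key with the inverse MixColumns
    // variant that matches that round's fixsliced representation.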
282 | loop { |
283 | #[cfg (aes_compact)] |
284 | { |
285 | inv_shift_rows_2(&mut state); |
286 | } |
287 | |
288 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
289 | inv_mix_columns_1(&mut state); |
290 | inv_sub_bytes(&mut state); |
291 | rk_off -= 8; |
292 | |
293 | if rk_off == 0 { |
294 | break; |
295 | } |
296 | |
297 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
298 | inv_mix_columns_0(&mut state); |
299 | inv_sub_bytes(&mut state); |
300 | rk_off -= 8; |
301 | |
302 | #[cfg (not(aes_compact))] |
303 | { |
304 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
305 | inv_mix_columns_3(&mut state); |
306 | inv_sub_bytes(&mut state); |
307 | rk_off -= 8; |
308 | |
309 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
310 | inv_mix_columns_2(&mut state); |
311 | inv_sub_bytes(&mut state); |
312 | rk_off -= 8; |
313 | } |
314 | } |
315 | |
316 | add_round_key(&mut state, &rkeys[..8]); |
317 | |
318 | inv_bitslice(&state) |
319 | } |
320 | |
321 | /// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted). |
322 | /// |
/// Encrypts four blocks in parallel and returns them as a new batch.
324 | pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks { |
325 | let mut state = State::default(); |
326 | |
327 | bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); |
328 | |
329 | add_round_key(&mut state, &rkeys[..8]); |
330 | |
331 | let mut rk_off = 8; |
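    // Fixslicing: instead of applying ShiftRows every round, the four MixColumns variants
    // (mix_columns_0..3) absorb the accumulated row rotations of successive rounds. The
    // compact variant keeps only two of them and realigns the state with an explicit
    // shift_rows_2 every other round.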
332 | loop { |
333 | sub_bytes(&mut state); |
334 | mix_columns_1(&mut state); |
335 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
336 | rk_off += 8; |
337 | |
338 | #[cfg (aes_compact)] |
339 | { |
340 | shift_rows_2(&mut state); |
341 | } |
342 | |
343 | if rk_off == 80 { |
344 | break; |
345 | } |
346 | |
347 | #[cfg (not(aes_compact))] |
348 | { |
349 | sub_bytes(&mut state); |
350 | mix_columns_2(&mut state); |
351 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
352 | rk_off += 8; |
353 | |
354 | sub_bytes(&mut state); |
355 | mix_columns_3(&mut state); |
356 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
357 | rk_off += 8; |
358 | } |
359 | |
360 | sub_bytes(&mut state); |
361 | mix_columns_0(&mut state); |
362 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
363 | rk_off += 8; |
364 | } |
365 | |
366 | #[cfg (not(aes_compact))] |
367 | { |
368 | shift_rows_2(&mut state); |
369 | } |
370 | |
371 | sub_bytes(&mut state); |
372 | add_round_key(&mut state, &rkeys[80..]); |
373 | |
374 | inv_bitslice(&state) |
375 | } |
376 | |
377 | /// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted). |
378 | /// |
/// Decrypts four blocks in parallel and returns them as a new batch.
380 | pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { |
381 | let mut state = State::default(); |
382 | |
383 | bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); |
384 | |
385 | add_round_key(&mut state, &rkeys[96..]); |
386 | inv_sub_bytes(&mut state); |
387 | |
388 | let mut rk_off = 88; |
389 | loop { |
390 | #[cfg (aes_compact)] |
391 | { |
392 | inv_shift_rows_2(&mut state); |
393 | } |
394 | #[cfg (not(aes_compact))] |
395 | { |
396 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
397 | inv_mix_columns_3(&mut state); |
398 | inv_sub_bytes(&mut state); |
399 | rk_off -= 8; |
400 | |
401 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
402 | inv_mix_columns_2(&mut state); |
403 | inv_sub_bytes(&mut state); |
404 | rk_off -= 8; |
405 | } |
406 | |
407 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
408 | inv_mix_columns_1(&mut state); |
409 | inv_sub_bytes(&mut state); |
410 | rk_off -= 8; |
411 | |
412 | if rk_off == 0 { |
413 | break; |
414 | } |
415 | |
416 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
417 | inv_mix_columns_0(&mut state); |
418 | inv_sub_bytes(&mut state); |
419 | rk_off -= 8; |
420 | } |
421 | |
422 | add_round_key(&mut state, &rkeys[..8]); |
423 | |
424 | inv_bitslice(&state) |
425 | } |
426 | |
427 | /// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted). |
428 | /// |
/// Encrypts four blocks in parallel and returns them as a new batch.
430 | pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks { |
431 | let mut state = State::default(); |
432 | |
433 | bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); |
434 | |
435 | add_round_key(&mut state, &rkeys[..8]); |
436 | |
437 | let mut rk_off = 8; |
438 | loop { |
439 | sub_bytes(&mut state); |
440 | mix_columns_1(&mut state); |
441 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
442 | rk_off += 8; |
443 | |
444 | #[cfg (aes_compact)] |
445 | { |
446 | shift_rows_2(&mut state); |
447 | } |
448 | #[cfg (not(aes_compact))] |
449 | { |
450 | sub_bytes(&mut state); |
451 | mix_columns_2(&mut state); |
452 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
453 | rk_off += 8; |
454 | |
455 | sub_bytes(&mut state); |
456 | mix_columns_3(&mut state); |
457 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
458 | rk_off += 8; |
459 | } |
460 | |
461 | if rk_off == 96 { |
462 | break; |
463 | } |
464 | |
465 | sub_bytes(&mut state); |
466 | mix_columns_0(&mut state); |
467 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
468 | rk_off += 8; |
469 | } |
470 | |
471 | sub_bytes(&mut state); |
472 | add_round_key(&mut state, &rkeys[96..]); |
473 | |
474 | inv_bitslice(&state) |
475 | } |
476 | |
477 | /// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted). |
478 | /// |
/// Decrypts four blocks in parallel and returns them as a new batch.
480 | pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { |
481 | let mut state = State::default(); |
482 | |
483 | bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); |
484 | |
485 | add_round_key(&mut state, &rkeys[112..]); |
486 | inv_sub_bytes(&mut state); |
487 | |
488 | #[cfg (not(aes_compact))] |
489 | { |
490 | inv_shift_rows_2(&mut state); |
491 | } |
492 | |
493 | let mut rk_off = 104; |
494 | loop { |
495 | #[cfg (aes_compact)] |
496 | { |
497 | inv_shift_rows_2(&mut state); |
498 | } |
499 | |
500 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
501 | inv_mix_columns_1(&mut state); |
502 | inv_sub_bytes(&mut state); |
503 | rk_off -= 8; |
504 | |
505 | if rk_off == 0 { |
506 | break; |
507 | } |
508 | |
509 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
510 | inv_mix_columns_0(&mut state); |
511 | inv_sub_bytes(&mut state); |
512 | rk_off -= 8; |
513 | |
514 | #[cfg (not(aes_compact))] |
515 | { |
516 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
517 | inv_mix_columns_3(&mut state); |
518 | inv_sub_bytes(&mut state); |
519 | rk_off -= 8; |
520 | |
521 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
522 | inv_mix_columns_2(&mut state); |
523 | inv_sub_bytes(&mut state); |
524 | rk_off -= 8; |
525 | } |
526 | } |
527 | |
528 | add_round_key(&mut state, &rkeys[..8]); |
529 | |
530 | inv_bitslice(&state) |
531 | } |
532 | |
533 | /// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted). |
534 | /// |
/// Encrypts four blocks in parallel and returns them as a new batch.
536 | pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks { |
537 | let mut state = State::default(); |
538 | |
539 | bitslice(&mut state, &blocks[0], &blocks[1], &blocks[2], &blocks[3]); |
540 | |
541 | add_round_key(&mut state, &rkeys[..8]); |
542 | |
543 | let mut rk_off = 8; |
544 | loop { |
545 | sub_bytes(&mut state); |
546 | mix_columns_1(&mut state); |
547 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
548 | rk_off += 8; |
549 | |
550 | #[cfg (aes_compact)] |
551 | { |
552 | shift_rows_2(&mut state); |
553 | } |
554 | |
555 | if rk_off == 112 { |
556 | break; |
557 | } |
558 | |
559 | #[cfg (not(aes_compact))] |
560 | { |
561 | sub_bytes(&mut state); |
562 | mix_columns_2(&mut state); |
563 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
564 | rk_off += 8; |
565 | |
566 | sub_bytes(&mut state); |
567 | mix_columns_3(&mut state); |
568 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
569 | rk_off += 8; |
570 | } |
571 | |
572 | sub_bytes(&mut state); |
573 | mix_columns_0(&mut state); |
574 | add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]); |
575 | rk_off += 8; |
576 | } |
577 | |
578 | #[cfg (not(aes_compact))] |
579 | { |
580 | shift_rows_2(&mut state); |
581 | } |
582 | |
583 | sub_bytes(&mut state); |
584 | add_round_key(&mut state, &rkeys[112..]); |
585 | |
586 | inv_bitslice(&state) |
587 | } |
588 | |
/// Bitsliced implementation of the inverse AES S-box.
///
/// Note that the 4 bitwise NOTs (^= 0xffffffffffffffff) are accounted for here so that it is a true
/// inverse of 'sub_bytes'.
591 | fn inv_sub_bytes(state: &mut [u64]) { |
592 | debug_assert_eq!(state.len(), 8); |
593 | |
594 | // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler |
595 | // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) |
596 | |
597 | let u7 = state[0]; |
598 | let u6 = state[1]; |
599 | let u5 = state[2]; |
600 | let u4 = state[3]; |
601 | let u3 = state[4]; |
602 | let u2 = state[5]; |
603 | let u1 = state[6]; |
604 | let u0 = state[7]; |
605 | |
606 | let t23 = u0 ^ u3; |
607 | let t8 = u1 ^ t23; |
608 | let m2 = t23 & t8; |
609 | let t4 = u4 ^ t8; |
610 | let t22 = u1 ^ u3; |
611 | let t2 = u0 ^ u1; |
612 | let t1 = u3 ^ u4; |
613 | // t23 -> stack |
614 | let t9 = u7 ^ t1; |
615 | // t8 -> stack |
616 | let m7 = t22 & t9; |
617 | // t9 -> stack |
618 | let t24 = u4 ^ u7; |
619 | // m7 -> stack |
620 | let t10 = t2 ^ t24; |
621 | // u4 -> stack |
622 | let m14 = t2 & t10; |
623 | let r5 = u6 ^ u7; |
624 | // m2 -> stack |
625 | let t3 = t1 ^ r5; |
626 | // t2 -> stack |
627 | let t13 = t2 ^ r5; |
628 | let t19 = t22 ^ r5; |
629 | // t3 -> stack |
630 | let t17 = u2 ^ t19; |
631 | // t4 -> stack |
632 | let t25 = u2 ^ t1; |
633 | let r13 = u1 ^ u6; |
634 | // t25 -> stack |
635 | let t20 = t24 ^ r13; |
636 | // t17 -> stack |
637 | let m9 = t20 & t17; |
638 | // t20 -> stack |
639 | let r17 = u2 ^ u5; |
640 | // t22 -> stack |
641 | let t6 = t22 ^ r17; |
642 | // t13 -> stack |
643 | let m1 = t13 & t6; |
644 | let y5 = u0 ^ r17; |
645 | let m4 = t19 & y5; |
646 | let m5 = m4 ^ m1; |
647 | let m17 = m5 ^ t24; |
648 | let r18 = u5 ^ u6; |
649 | let t27 = t1 ^ r18; |
650 | let t15 = t10 ^ t27; |
651 | // t6 -> stack |
652 | let m11 = t1 & t15; |
653 | let m15 = m14 ^ m11; |
654 | let m21 = m17 ^ m15; |
655 | // t1 -> stack |
656 | // t4 <- stack |
657 | let m12 = t4 & t27; |
658 | let m13 = m12 ^ m11; |
659 | let t14 = t10 ^ r18; |
660 | let m3 = t14 ^ m1; |
661 | // m2 <- stack |
662 | let m16 = m3 ^ m2; |
663 | let m20 = m16 ^ m13; |
664 | // u4 <- stack |
665 | let r19 = u2 ^ u4; |
666 | let t16 = r13 ^ r19; |
667 | // t3 <- stack |
668 | let t26 = t3 ^ t16; |
669 | let m6 = t3 & t16; |
670 | let m8 = t26 ^ m6; |
671 | // t10 -> stack |
672 | // m7 <- stack |
673 | let m18 = m8 ^ m7; |
674 | let m22 = m18 ^ m13; |
675 | let m25 = m22 & m20; |
676 | let m26 = m21 ^ m25; |
677 | let m10 = m9 ^ m6; |
678 | let m19 = m10 ^ m15; |
679 | // t25 <- stack |
680 | let m23 = m19 ^ t25; |
681 | let m28 = m23 ^ m25; |
682 | let m24 = m22 ^ m23; |
683 | let m30 = m26 & m24; |
684 | let m39 = m23 ^ m30; |
685 | let m48 = m39 & y5; |
686 | let m57 = m39 & t19; |
687 | // m48 -> stack |
688 | let m36 = m24 ^ m25; |
689 | let m31 = m20 & m23; |
690 | let m27 = m20 ^ m21; |
691 | let m32 = m27 & m31; |
692 | let m29 = m28 & m27; |
693 | let m37 = m21 ^ m29; |
694 | // m39 -> stack |
695 | let m42 = m37 ^ m39; |
696 | let m52 = m42 & t15; |
697 | // t27 -> stack |
698 | // t1 <- stack |
699 | let m61 = m42 & t1; |
700 | let p0 = m52 ^ m61; |
701 | let p16 = m57 ^ m61; |
702 | // m57 -> stack |
703 | // t20 <- stack |
704 | let m60 = m37 & t20; |
705 | // p16 -> stack |
706 | // t17 <- stack |
707 | let m51 = m37 & t17; |
708 | let m33 = m27 ^ m25; |
709 | let m38 = m32 ^ m33; |
710 | let m43 = m37 ^ m38; |
711 | let m49 = m43 & t16; |
712 | let p6 = m49 ^ m60; |
713 | let p13 = m49 ^ m51; |
714 | let m58 = m43 & t3; |
715 | // t9 <- stack |
716 | let m50 = m38 & t9; |
717 | // t22 <- stack |
718 | let m59 = m38 & t22; |
719 | // p6 -> stack |
720 | let p1 = m58 ^ m59; |
721 | let p7 = p0 ^ p1; |
722 | let m34 = m21 & m22; |
723 | let m35 = m24 & m34; |
724 | let m40 = m35 ^ m36; |
725 | let m41 = m38 ^ m40; |
726 | let m45 = m42 ^ m41; |
727 | // t27 <- stack |
728 | let m53 = m45 & t27; |
729 | let p8 = m50 ^ m53; |
730 | let p23 = p7 ^ p8; |
731 | // t4 <- stack |
732 | let m62 = m45 & t4; |
733 | let p14 = m49 ^ m62; |
734 | let s6 = p14 ^ p23; |
735 | // t10 <- stack |
736 | let m54 = m41 & t10; |
737 | let p2 = m54 ^ m62; |
738 | let p22 = p2 ^ p7; |
739 | let s0 = p13 ^ p22; |
740 | let p17 = m58 ^ p2; |
741 | let p15 = m54 ^ m59; |
742 | // t2 <- stack |
743 | let m63 = m41 & t2; |
744 | // m39 <- stack |
745 | let m44 = m39 ^ m40; |
746 | // p17 -> stack |
747 | // t6 <- stack |
748 | let m46 = m44 & t6; |
749 | let p5 = m46 ^ m51; |
750 | // p23 -> stack |
751 | let p18 = m63 ^ p5; |
752 | let p24 = p5 ^ p7; |
753 | // m48 <- stack |
754 | let p12 = m46 ^ m48; |
755 | let s3 = p12 ^ p22; |
756 | // t13 <- stack |
757 | let m55 = m44 & t13; |
758 | let p9 = m55 ^ m63; |
759 | // p16 <- stack |
760 | let s7 = p9 ^ p16; |
761 | // t8 <- stack |
762 | let m47 = m40 & t8; |
763 | let p3 = m47 ^ m50; |
764 | let p19 = p2 ^ p3; |
765 | let s5 = p19 ^ p24; |
766 | let p11 = p0 ^ p3; |
767 | let p26 = p9 ^ p11; |
768 | // t23 <- stack |
769 | let m56 = m40 & t23; |
770 | let p4 = m48 ^ m56; |
771 | // p6 <- stack |
772 | let p20 = p4 ^ p6; |
773 | let p29 = p15 ^ p20; |
774 | let s1 = p26 ^ p29; |
775 | // m57 <- stack |
776 | let p10 = m57 ^ p4; |
777 | let p27 = p10 ^ p18; |
778 | // p23 <- stack |
779 | let s4 = p23 ^ p27; |
780 | let p25 = p6 ^ p10; |
781 | let p28 = p11 ^ p25; |
782 | // p17 <- stack |
783 | let s2 = p17 ^ p28; |
784 | |
785 | state[0] = s7; |
786 | state[1] = s6; |
787 | state[2] = s5; |
788 | state[3] = s4; |
789 | state[4] = s3; |
790 | state[5] = s2; |
791 | state[6] = s1; |
792 | state[7] = s0; |
793 | } |
794 | |
795 | /// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik. |
796 | /// |
797 | /// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt> |
798 | /// |
/// Note that the 4 bitwise NOTs (^= 0xffffffffffffffff) are moved to the key schedule.
800 | fn sub_bytes(state: &mut [u64]) { |
801 | debug_assert_eq!(state.len(), 8); |
802 | |
803 | // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler |
804 | // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4) |
805 | |
806 | let u7 = state[0]; |
807 | let u6 = state[1]; |
808 | let u5 = state[2]; |
809 | let u4 = state[3]; |
810 | let u3 = state[4]; |
811 | let u2 = state[5]; |
812 | let u1 = state[6]; |
813 | let u0 = state[7]; |
814 | |
815 | let y14 = u3 ^ u5; |
816 | let y13 = u0 ^ u6; |
817 | let y12 = y13 ^ y14; |
818 | let t1 = u4 ^ y12; |
819 | let y15 = t1 ^ u5; |
820 | let t2 = y12 & y15; |
821 | let y6 = y15 ^ u7; |
822 | let y20 = t1 ^ u1; |
823 | // y12 -> stack |
824 | let y9 = u0 ^ u3; |
825 | // y20 -> stack |
826 | let y11 = y20 ^ y9; |
827 | // y9 -> stack |
828 | let t12 = y9 & y11; |
829 | // y6 -> stack |
830 | let y7 = u7 ^ y11; |
831 | let y8 = u0 ^ u5; |
832 | let t0 = u1 ^ u2; |
833 | let y10 = y15 ^ t0; |
834 | // y15 -> stack |
835 | let y17 = y10 ^ y11; |
836 | // y14 -> stack |
837 | let t13 = y14 & y17; |
838 | let t14 = t13 ^ t12; |
839 | // y17 -> stack |
840 | let y19 = y10 ^ y8; |
841 | // y10 -> stack |
842 | let t15 = y8 & y10; |
843 | let t16 = t15 ^ t12; |
844 | let y16 = t0 ^ y11; |
845 | // y11 -> stack |
846 | let y21 = y13 ^ y16; |
847 | // y13 -> stack |
848 | let t7 = y13 & y16; |
849 | // y16 -> stack |
850 | let y18 = u0 ^ y16; |
851 | let y1 = t0 ^ u7; |
852 | let y4 = y1 ^ u3; |
853 | // u7 -> stack |
854 | let t5 = y4 & u7; |
855 | let t6 = t5 ^ t2; |
856 | let t18 = t6 ^ t16; |
857 | let t22 = t18 ^ y19; |
858 | let y2 = y1 ^ u0; |
859 | let t10 = y2 & y7; |
860 | let t11 = t10 ^ t7; |
861 | let t20 = t11 ^ t16; |
862 | let t24 = t20 ^ y18; |
863 | let y5 = y1 ^ u6; |
864 | let t8 = y5 & y1; |
865 | let t9 = t8 ^ t7; |
866 | let t19 = t9 ^ t14; |
867 | let t23 = t19 ^ y21; |
868 | let y3 = y5 ^ y8; |
869 | // y6 <- stack |
870 | let t3 = y3 & y6; |
871 | let t4 = t3 ^ t2; |
872 | // y20 <- stack |
873 | let t17 = t4 ^ y20; |
874 | let t21 = t17 ^ t14; |
875 | let t26 = t21 & t23; |
876 | let t27 = t24 ^ t26; |
877 | let t31 = t22 ^ t26; |
878 | let t25 = t21 ^ t22; |
879 | // y4 -> stack |
880 | let t28 = t25 & t27; |
881 | let t29 = t28 ^ t22; |
882 | let z14 = t29 & y2; |
883 | let z5 = t29 & y7; |
884 | let t30 = t23 ^ t24; |
885 | let t32 = t31 & t30; |
886 | let t33 = t32 ^ t24; |
887 | let t35 = t27 ^ t33; |
888 | let t36 = t24 & t35; |
889 | let t38 = t27 ^ t36; |
890 | let t39 = t29 & t38; |
891 | let t40 = t25 ^ t39; |
892 | let t43 = t29 ^ t40; |
893 | // y16 <- stack |
894 | let z3 = t43 & y16; |
895 | let tc12 = z3 ^ z5; |
896 | // tc12 -> stack |
897 | // y13 <- stack |
898 | let z12 = t43 & y13; |
899 | let z13 = t40 & y5; |
900 | let z4 = t40 & y1; |
901 | let tc6 = z3 ^ z4; |
902 | let t34 = t23 ^ t33; |
903 | let t37 = t36 ^ t34; |
904 | let t41 = t40 ^ t37; |
905 | // y10 <- stack |
906 | let z8 = t41 & y10; |
907 | let z17 = t41 & y8; |
908 | let t44 = t33 ^ t37; |
909 | // y15 <- stack |
910 | let z0 = t44 & y15; |
911 | // z17 -> stack |
912 | // y12 <- stack |
913 | let z9 = t44 & y12; |
914 | let z10 = t37 & y3; |
915 | let z1 = t37 & y6; |
916 | let tc5 = z1 ^ z0; |
917 | let tc11 = tc6 ^ tc5; |
918 | // y4 <- stack |
919 | let z11 = t33 & y4; |
920 | let t42 = t29 ^ t33; |
921 | let t45 = t42 ^ t41; |
922 | // y17 <- stack |
923 | let z7 = t45 & y17; |
924 | let tc8 = z7 ^ tc6; |
925 | // y14 <- stack |
926 | let z16 = t45 & y14; |
927 | // y11 <- stack |
928 | let z6 = t42 & y11; |
929 | let tc16 = z6 ^ tc8; |
930 | // z14 -> stack |
931 | // y9 <- stack |
932 | let z15 = t42 & y9; |
933 | let tc20 = z15 ^ tc16; |
934 | let tc1 = z15 ^ z16; |
935 | let tc2 = z10 ^ tc1; |
936 | let tc21 = tc2 ^ z11; |
937 | let tc3 = z9 ^ tc2; |
938 | let s0 = tc3 ^ tc16; |
939 | let s3 = tc3 ^ tc11; |
940 | let s1 = s3 ^ tc16; |
941 | let tc13 = z13 ^ tc1; |
942 | // u7 <- stack |
943 | let z2 = t33 & u7; |
944 | let tc4 = z0 ^ z2; |
945 | let tc7 = z12 ^ tc4; |
946 | let tc9 = z8 ^ tc7; |
947 | let tc10 = tc8 ^ tc9; |
948 | // z14 <- stack |
949 | let tc17 = z14 ^ tc10; |
950 | let s5 = tc21 ^ tc17; |
951 | let tc26 = tc17 ^ tc20; |
952 | // z17 <- stack |
953 | let s2 = tc26 ^ z17; |
954 | // tc12 <- stack |
955 | let tc14 = tc4 ^ tc12; |
956 | let tc18 = tc13 ^ tc14; |
957 | let s6 = tc10 ^ tc18; |
958 | let s7 = z12 ^ tc18; |
959 | let s4 = tc14 ^ s3; |
960 | |
961 | state[0] = s7; |
962 | state[1] = s6; |
963 | state[2] = s5; |
964 | state[3] = s4; |
965 | state[4] = s3; |
966 | state[5] = s2; |
967 | state[6] = s1; |
968 | state[7] = s0; |
969 | } |
970 | |
/// NOT operations omitted from 'sub_bytes'; they correspond to the constant 0x63 added by the S-box affine transform.
972 | #[inline ] |
973 | fn sub_bytes_nots(state: &mut [u64]) { |
974 | debug_assert_eq!(state.len(), 8); |
975 | state[0] ^= 0xffffffffffffffff; |
976 | state[1] ^= 0xffffffffffffffff; |
977 | state[5] ^= 0xffffffffffffffff; |
978 | state[6] ^= 0xffffffffffffffff; |
979 | } |
980 | |
981 | /// Computation of the MixColumns transformation in the fixsliced representation, with different |
982 | /// rotations used according to the round number mod 4. |
983 | /// |
/// Based on Käsper-Schwabe, similar to <https://github.com/Ko-/aes-armcortexm>.
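///
/// Writing `b = first_rotate(a)` (which aligns each byte with the next byte of its column,
/// including the column offset of the deferred ShiftRows) and `c = a ^ b`, the `c7` terms fed
/// back into bit-planes 0, 1, 3 and 4 correspond to reduction by the AES polynomial
/// x^8 + x^4 + x^3 + x + 1 when multiplying by x.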
985 | macro_rules! define_mix_columns { |
986 | ( |
987 | $name:ident, |
988 | $name_inv:ident, |
989 | $first_rotate:path, |
990 | $second_rotate:path |
991 | ) => { |
992 | #[rustfmt::skip] |
993 | fn $name(state: &mut State) { |
994 | let (a0, a1, a2, a3, a4, a5, a6, a7) = ( |
995 | state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] |
996 | ); |
997 | let (b0, b1, b2, b3, b4, b5, b6, b7) = ( |
998 | $first_rotate(a0), |
999 | $first_rotate(a1), |
1000 | $first_rotate(a2), |
1001 | $first_rotate(a3), |
1002 | $first_rotate(a4), |
1003 | $first_rotate(a5), |
1004 | $first_rotate(a6), |
1005 | $first_rotate(a7), |
1006 | ); |
1007 | let (c0, c1, c2, c3, c4, c5, c6, c7) = ( |
1008 | a0 ^ b0, |
1009 | a1 ^ b1, |
1010 | a2 ^ b2, |
1011 | a3 ^ b3, |
1012 | a4 ^ b4, |
1013 | a5 ^ b5, |
1014 | a6 ^ b6, |
1015 | a7 ^ b7, |
1016 | ); |
1017 | state[0] = b0 ^ c7 ^ $second_rotate(c0); |
1018 | state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1); |
1019 | state[2] = b2 ^ c1 ^ $second_rotate(c2); |
1020 | state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3); |
1021 | state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4); |
1022 | state[5] = b5 ^ c4 ^ $second_rotate(c5); |
1023 | state[6] = b6 ^ c5 ^ $second_rotate(c6); |
1024 | state[7] = b7 ^ c6 ^ $second_rotate(c7); |
1025 | } |
1026 | |
1027 | #[rustfmt::skip] |
1028 | fn $name_inv(state: &mut State) { |
1029 | let (a0, a1, a2, a3, a4, a5, a6, a7) = ( |
1030 | state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7] |
1031 | ); |
1032 | let (b0, b1, b2, b3, b4, b5, b6, b7) = ( |
1033 | $first_rotate(a0), |
1034 | $first_rotate(a1), |
1035 | $first_rotate(a2), |
1036 | $first_rotate(a3), |
1037 | $first_rotate(a4), |
1038 | $first_rotate(a5), |
1039 | $first_rotate(a6), |
1040 | $first_rotate(a7), |
1041 | ); |
1042 | let (c0, c1, c2, c3, c4, c5, c6, c7) = ( |
1043 | a0 ^ b0, |
1044 | a1 ^ b1, |
1045 | a2 ^ b2, |
1046 | a3 ^ b3, |
1047 | a4 ^ b4, |
1048 | a5 ^ b5, |
1049 | a6 ^ b6, |
1050 | a7 ^ b7, |
1051 | ); |
1052 | let (d0, d1, d2, d3, d4, d5, d6, d7) = ( |
1053 | a0 ^ c7, |
1054 | a1 ^ c0 ^ c7, |
1055 | a2 ^ c1, |
1056 | a3 ^ c2 ^ c7, |
1057 | a4 ^ c3 ^ c7, |
1058 | a5 ^ c4, |
1059 | a6 ^ c5, |
1060 | a7 ^ c6, |
1061 | ); |
1062 | let (e0, e1, e2, e3, e4, e5, e6, e7) = ( |
1063 | c0 ^ d6, |
1064 | c1 ^ d6 ^ d7, |
1065 | c2 ^ d0 ^ d7, |
1066 | c3 ^ d1 ^ d6, |
1067 | c4 ^ d2 ^ d6 ^ d7, |
1068 | c5 ^ d3 ^ d7, |
1069 | c6 ^ d4, |
1070 | c7 ^ d5, |
1071 | ); |
1072 | state[0] = d0 ^ e0 ^ $second_rotate(e0); |
1073 | state[1] = d1 ^ e1 ^ $second_rotate(e1); |
1074 | state[2] = d2 ^ e2 ^ $second_rotate(e2); |
1075 | state[3] = d3 ^ e3 ^ $second_rotate(e3); |
1076 | state[4] = d4 ^ e4 ^ $second_rotate(e4); |
1077 | state[5] = d5 ^ e5 ^ $second_rotate(e5); |
1078 | state[6] = d6 ^ e6 ^ $second_rotate(e6); |
1079 | state[7] = d7 ^ e7 ^ $second_rotate(e7); |
1080 | } |
1081 | } |
1082 | } |
1083 | |
1084 | define_mix_columns!( |
1085 | mix_columns_0, |
1086 | inv_mix_columns_0, |
1087 | rotate_rows_1, |
1088 | rotate_rows_2 |
1089 | ); |
1090 | |
1091 | define_mix_columns!( |
1092 | mix_columns_1, |
1093 | inv_mix_columns_1, |
1094 | rotate_rows_and_columns_1_1, |
1095 | rotate_rows_and_columns_2_2 |
1096 | ); |
1097 | |
1098 | #[cfg (not(aes_compact))] |
1099 | define_mix_columns!( |
1100 | mix_columns_2, |
1101 | inv_mix_columns_2, |
1102 | rotate_rows_and_columns_1_2, |
1103 | rotate_rows_2 |
1104 | ); |
1105 | |
1106 | #[cfg (not(aes_compact))] |
1107 | define_mix_columns!( |
1108 | mix_columns_3, |
1109 | inv_mix_columns_3, |
1110 | rotate_rows_and_columns_1_3, |
1111 | rotate_rows_and_columns_2_2 |
1112 | ); |
1113 | |
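/// Swap the bits of `a` selected by `mask` with the bits `shift` positions to their left
/// (the classic delta-swap bit permutation; `delta_swap_2` is the two-word variant).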
1114 | #[inline ] |
1115 | fn delta_swap_1(a: &mut u64, shift: u32, mask: u64) { |
1116 | let t: u64 = (*a ^ ((*a) >> shift)) & mask; |
1117 | *a ^= t ^ (t << shift); |
1118 | } |
1119 | |
1120 | #[inline ] |
1121 | fn delta_swap_2(a: &mut u64, b: &mut u64, shift: u32, mask: u64) { |
1122 | let t: u64 = (*a ^ ((*b) >> shift)) & mask; |
1123 | *a ^= t; |
1124 | *b ^= t << shift; |
1125 | } |
1126 | |
1127 | /// Applies ShiftRows once on an AES state (or key). |
#[cfg(any(not(aes_compact), feature = "hazmat"))]
#[inline]
fn shift_rows_1(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x00f000ff000f0000);
        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
}
1137 | |
1138 | /// Applies ShiftRows twice on an AES state (or key). |
#[inline]
fn shift_rows_2(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x00ff000000ff0000);
    }
}
1146 | |
1147 | /// Applies ShiftRows three times on an AES state (or key). |
#[inline]
fn shift_rows_3(state: &mut [u64]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 8, 0x000f00ff00f00000);
        delta_swap_1(x, 4, 0x0f0f00000f0f0000);
    }
}
1156 | |
1157 | #[inline (always)] |
1158 | fn inv_shift_rows_1(state: &mut [u64]) { |
1159 | shift_rows_3(state); |
1160 | } |
1161 | |
1162 | #[inline (always)] |
1163 | fn inv_shift_rows_2(state: &mut [u64]) { |
1164 | shift_rows_2(state); |
1165 | } |
1166 | |
1167 | #[cfg (not(aes_compact))] |
1168 | #[inline (always)] |
1169 | fn inv_shift_rows_3(state: &mut [u64]) { |
1170 | shift_rows_1(state); |
1171 | } |
1172 | |
1173 | /// XOR the columns after the S-box during the key schedule round function. |
1174 | /// |
1175 | /// The `idx_xor` parameter refers to the index of the previous round key that is |
1176 | /// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256, |
1177 | /// respectively). |
1178 | /// |
1179 | /// The `idx_ror` parameter refers to the rotation value, which varies between the |
1180 | /// different key schedules. |
1181 | fn xor_columns(rkeys: &mut [u64], offset: usize, idx_xor: usize, idx_ror: u32) { |
    for i in 0..8 {
        let off_i = offset + i;
        let rk = rkeys[off_i - idx_xor] ^ (0x000f000f000f000f & ror(rkeys[off_i], idx_ror));
1185 | rkeys[off_i] = rk |
1186 | ^ (0xfff0fff0fff0fff0 & (rk << 4)) |
1187 | ^ (0xff00ff00ff00ff00 & (rk << 8)) |
1188 | ^ (0xf000f000f000f000 & (rk << 12)); |
1189 | } |
1190 | } |
1191 | |
1192 | /// Bitslice four 128-bit input blocks input0, input1, input2, input3 into a 512-bit internal state. |
1193 | fn bitslice(output: &mut [u64], input0: &[u8], input1: &[u8], input2: &[u8], input3: &[u8]) { |
1194 | debug_assert_eq!(output.len(), 8); |
1195 | debug_assert_eq!(input0.len(), 16); |
1196 | debug_assert_eq!(input1.len(), 16); |
1197 | debug_assert_eq!(input2.len(), 16); |
1198 | debug_assert_eq!(input3.len(), 16); |
1199 | |
1200 | // Bitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at a |
1201 | // 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the |
1202 | // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition): |
1203 | // b1 b0 c1 c0 r1 r0 p2 p1 p0 |
1204 | // |
1205 | // The desired bitsliced data groups first by bit position, then row, column, block: |
1206 | // p2 p1 p0 r1 r0 c1 c0 b1 b0 |
1207 | |
1208 | #[rustfmt::skip] |
1209 | fn read_reordered(input: &[u8]) -> u64 { |
1210 | (u64::from(input[0x0]) ) | |
1211 | (u64::from(input[0x1]) << 0x10) | |
1212 | (u64::from(input[0x2]) << 0x20) | |
1213 | (u64::from(input[0x3]) << 0x30) | |
1214 | (u64::from(input[0x8]) << 0x08) | |
1215 | (u64::from(input[0x9]) << 0x18) | |
1216 | (u64::from(input[0xa]) << 0x28) | |
1217 | (u64::from(input[0xb]) << 0x38) |
1218 | } |
1219 | |
1220 | // Reorder each block's bytes on input |
1221 | // __ __ c1 c0 r1 r0 __ __ __ => __ __ c0 r1 r0 c1 __ __ __ |
1222 | // Reorder by relabeling (note the order of input) |
1223 | // b1 b0 c0 __ __ __ __ __ __ => c0 b1 b0 __ __ __ __ __ __ |
1224 | let mut t0 = read_reordered(&input0[0x00..0x0c]); |
1225 | let mut t4 = read_reordered(&input0[0x04..0x10]); |
1226 | let mut t1 = read_reordered(&input1[0x00..0x0c]); |
1227 | let mut t5 = read_reordered(&input1[0x04..0x10]); |
1228 | let mut t2 = read_reordered(&input2[0x00..0x0c]); |
1229 | let mut t6 = read_reordered(&input2[0x04..0x10]); |
1230 | let mut t3 = read_reordered(&input3[0x00..0x0c]); |
1231 | let mut t7 = read_reordered(&input3[0x04..0x10]); |
1232 | |
1233 | // Bit Index Swap 6 <-> 0: |
1234 | // __ __ b0 __ __ __ __ __ p0 => __ __ p0 __ __ __ __ __ b0 |
1235 | let m0 = 0x5555555555555555; |
1236 | delta_swap_2(&mut t1, &mut t0, 1, m0); |
1237 | delta_swap_2(&mut t3, &mut t2, 1, m0); |
1238 | delta_swap_2(&mut t5, &mut t4, 1, m0); |
1239 | delta_swap_2(&mut t7, &mut t6, 1, m0); |
1240 | |
1241 | // Bit Index Swap 7 <-> 1: |
1242 | // __ b1 __ __ __ __ __ p1 __ => __ p1 __ __ __ __ __ b1 __ |
1243 | let m1 = 0x3333333333333333; |
1244 | delta_swap_2(&mut t2, &mut t0, 2, m1); |
1245 | delta_swap_2(&mut t3, &mut t1, 2, m1); |
1246 | delta_swap_2(&mut t6, &mut t4, 2, m1); |
1247 | delta_swap_2(&mut t7, &mut t5, 2, m1); |
1248 | |
1249 | // Bit Index Swap 8 <-> 2: |
1250 | // c0 __ __ __ __ __ p2 __ __ => p2 __ __ __ __ __ c0 __ __ |
1251 | let m2 = 0x0f0f0f0f0f0f0f0f; |
1252 | delta_swap_2(&mut t4, &mut t0, 4, m2); |
1253 | delta_swap_2(&mut t5, &mut t1, 4, m2); |
1254 | delta_swap_2(&mut t6, &mut t2, 4, m2); |
1255 | delta_swap_2(&mut t7, &mut t3, 4, m2); |
1256 | |
1257 | // Final bitsliced bit index, as desired: |
1258 | // p2 p1 p0 r1 r0 c1 c0 b1 b0 |
1259 | output[0] = t0; |
1260 | output[1] = t1; |
1261 | output[2] = t2; |
1262 | output[3] = t3; |
1263 | output[4] = t4; |
1264 | output[5] = t5; |
1265 | output[6] = t6; |
1266 | output[7] = t7; |
1267 | } |
1268 | |
1269 | /// Un-bitslice a 512-bit internal state into four 128-bit blocks of output. |
1270 | fn inv_bitslice(input: &[u64]) -> BatchBlocks { |
1271 | debug_assert_eq!(input.len(), 8); |
1272 | |
1273 | // Unbitslicing is a bit index manipulation. 512 bits of data means each bit is positioned at |
1274 | // a 9-bit index. AES data is 4 blocks, each one a 4x4 column-major matrix of bytes, so the |
1275 | // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition): |
1276 | // b1 b0 c1 c0 r1 r0 p2 p1 p0 |
1277 | // |
1278 | // The initially bitsliced data groups first by bit position, then row, column, block: |
1279 | // p2 p1 p0 r1 r0 c1 c0 b1 b0 |
1280 | |
1281 | let mut t0 = input[0]; |
1282 | let mut t1 = input[1]; |
1283 | let mut t2 = input[2]; |
1284 | let mut t3 = input[3]; |
1285 | let mut t4 = input[4]; |
1286 | let mut t5 = input[5]; |
1287 | let mut t6 = input[6]; |
1288 | let mut t7 = input[7]; |
1289 | |
1290 | // TODO: these bit index swaps are identical to those in 'packing' |
1291 | |
1292 | // Bit Index Swap 6 <-> 0: |
1293 | // __ __ p0 __ __ __ __ __ b0 => __ __ b0 __ __ __ __ __ p0 |
1294 | let m0 = 0x5555555555555555; |
1295 | delta_swap_2(&mut t1, &mut t0, 1, m0); |
1296 | delta_swap_2(&mut t3, &mut t2, 1, m0); |
1297 | delta_swap_2(&mut t5, &mut t4, 1, m0); |
1298 | delta_swap_2(&mut t7, &mut t6, 1, m0); |
1299 | |
1300 | // Bit Index Swap 7 <-> 1: |
1301 | // __ p1 __ __ __ __ __ b1 __ => __ b1 __ __ __ __ __ p1 __ |
1302 | let m1 = 0x3333333333333333; |
1303 | delta_swap_2(&mut t2, &mut t0, 2, m1); |
1304 | delta_swap_2(&mut t3, &mut t1, 2, m1); |
1305 | delta_swap_2(&mut t6, &mut t4, 2, m1); |
1306 | delta_swap_2(&mut t7, &mut t5, 2, m1); |
1307 | |
1308 | // Bit Index Swap 8 <-> 2: |
1309 | // p2 __ __ __ __ __ c0 __ __ => c0 __ __ __ __ __ p2 __ __ |
1310 | let m2 = 0x0f0f0f0f0f0f0f0f; |
1311 | delta_swap_2(&mut t4, &mut t0, 4, m2); |
1312 | delta_swap_2(&mut t5, &mut t1, 4, m2); |
1313 | delta_swap_2(&mut t6, &mut t2, 4, m2); |
1314 | delta_swap_2(&mut t7, &mut t3, 4, m2); |
1315 | |
1316 | #[rustfmt::skip] |
1317 | fn write_reordered(columns: u64, output: &mut [u8]) { |
1318 | output[0x0] = (columns ) as u8; |
1319 | output[0x1] = (columns >> 0x10) as u8; |
1320 | output[0x2] = (columns >> 0x20) as u8; |
1321 | output[0x3] = (columns >> 0x30) as u8; |
1322 | output[0x8] = (columns >> 0x08) as u8; |
1323 | output[0x9] = (columns >> 0x18) as u8; |
1324 | output[0xa] = (columns >> 0x28) as u8; |
1325 | output[0xb] = (columns >> 0x38) as u8; |
1326 | } |
1327 | |
1328 | let mut output = BatchBlocks::default(); |
1329 | // Reorder by relabeling (note the order of output) |
1330 | // c0 b1 b0 __ __ __ __ __ __ => b1 b0 c0 __ __ __ __ __ __ |
1331 | // Reorder each block's bytes on output |
1332 | // __ __ c0 r1 r0 c1 __ __ __ => __ __ c1 c0 r1 r0 __ __ __ |
1333 | write_reordered(t0, &mut output[0][0x00..0x0c]); |
1334 | write_reordered(t4, &mut output[0][0x04..0x10]); |
1335 | write_reordered(t1, &mut output[1][0x00..0x0c]); |
1336 | write_reordered(t5, &mut output[1][0x04..0x10]); |
1337 | write_reordered(t2, &mut output[2][0x00..0x0c]); |
1338 | write_reordered(t6, &mut output[2][0x04..0x10]); |
1339 | write_reordered(t3, &mut output[3][0x00..0x0c]); |
1340 | write_reordered(t7, &mut output[3][0x04..0x10]); |
1341 | |
1342 | // Final AES bit index, as desired: |
1343 | // b1 b0 c1 c0 r1 r0 p2 p1 p0 |
1344 | output |
1345 | } |
1346 | |
/// Copy an eight-word (bitsliced) round key within the provided slice forward by eight words.
1348 | fn memshift32(buffer: &mut [u64], src_offset: usize) { |
1349 | debug_assert_eq!(src_offset % 8, 0); |
1350 | |
1351 | let dst_offset: usize = src_offset + 8; |
1352 | debug_assert!(dst_offset + 8 <= buffer.len()); |
1353 | |
    for i in (0..8).rev() {
1355 | buffer[dst_offset + i] = buffer[src_offset + i]; |
1356 | } |
1357 | } |
1358 | |
1359 | /// XOR the round key to the internal state. The round keys are expected to be |
1360 | /// pre-computed and to be packed in the fixsliced representation. |
1361 | #[inline ] |
1362 | fn add_round_key(state: &mut State, rkey: &[u64]) { |
1363 | debug_assert_eq!(rkey.len(), 8); |
    for (a, b) in state.iter_mut().zip(rkey) {
1365 | *a ^= b; |
1366 | } |
1367 | } |
1368 | |
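/// XOR the round constant into bit-plane `bit` of a bitsliced round key. The mask selects the
/// same byte position in each of the four interleaved key copies created by `bitslice`.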
1369 | #[inline (always)] |
1370 | fn add_round_constant_bit(state: &mut [u64], bit: usize) { |
1371 | state[bit] ^= 0x00000000f0000000; |
1372 | } |
1373 | |
1374 | #[inline (always)] |
1375 | fn ror(x: u64, y: u32) -> u64 { |
1376 | x.rotate_right(y) |
1377 | } |
1378 | |
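/// Within each 64-bit bit-plane the bits are indexed as `r1 r0 c1 c0 b1 b0`, so a row spans
/// 16 bits and a column 4 bits; `(rows << 4) + (cols << 2)` is therefore the rotation amount
/// that moves the state by `rows` rows and `cols` columns.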
1379 | #[inline (always)] |
1380 | fn ror_distance(rows: u32, cols: u32) -> u32 { |
1381 | (rows << 4) + (cols << 2) |
1382 | } |
1383 | |
#[inline(always)]
fn rotate_rows_1(x: u64) -> u64 {
    ror(x, ror_distance(1, 0))
}

#[inline(always)]
fn rotate_rows_2(x: u64) -> u64 {
    ror(x, ror_distance(2, 0))
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_1(x: u64) -> u64 {
    (ror(x, ror_distance(1, 1)) & 0x0fff0fff0fff0fff) |
    (ror(x, ror_distance(0, 1)) & 0xf000f000f000f000)
}

#[cfg(not(aes_compact))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_2(x: u64) -> u64 {
    (ror(x, ror_distance(1, 2)) & 0x00ff00ff00ff00ff) |
    (ror(x, ror_distance(0, 2)) & 0xff00ff00ff00ff00)
}

#[cfg(not(aes_compact))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_3(x: u64) -> u64 {
    (ror(x, ror_distance(1, 3)) & 0x000f000f000f000f) |
    (ror(x, ror_distance(0, 3)) & 0xfff0fff0fff0fff0)
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_2_2(x: u64) -> u64 {
    (ror(x, ror_distance(2, 2)) & 0x00ff00ff00ff00ff) |
    (ror(x, ror_distance(1, 2)) & 0xff00ff00ff00ff00)
}
1423 | |
1424 | /// Low-level "hazmat" AES functions. |
1425 | /// |
1426 | /// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256` |
1427 | /// implementations in this crate, but instead provides raw access to |
1428 | /// the AES round function gated under the `hazmat` crate feature. |
1429 | #[cfg (feature = "hazmat" )] |
1430 | pub(crate) mod hazmat { |
1431 | use super::{ |
1432 | bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0, |
1433 | shift_rows_1, sub_bytes, sub_bytes_nots, State, |
1434 | }; |
1435 | use crate::{Block, Block8}; |
1436 | |
1437 | /// XOR the `src` block into the `dst` block in-place. |
1438 | fn xor_in_place(dst: &mut Block, src: &Block) { |
1439 | for (a, b) in dst.iter_mut().zip(src.as_slice()) { |
1440 | *a ^= *b; |
1441 | } |
1442 | } |
1443 | |
1444 | /// Perform a bitslice operation, loading a single block. |
1445 | fn bitslice_block(block: &Block) -> State { |
1446 | let mut state = State::default(); |
1447 | bitslice(&mut state, block, block, block, block); |
1448 | state |
1449 | } |
1450 | |
1451 | /// Perform an inverse bitslice operation, extracting a single block. |
1452 | fn inv_bitslice_block(block: &mut Block, state: &State) { |
1453 | block.copy_from_slice(&inv_bitslice(state)[0]); |
1454 | } |
1455 | |
1456 | /// AES cipher (encrypt) round function. |
1457 | #[inline ] |
1458 | pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) { |
1459 | let mut state = bitslice_block(block); |
1460 | sub_bytes(&mut state); |
1461 | sub_bytes_nots(&mut state); |
1462 | shift_rows_1(&mut state); |
1463 | mix_columns_0(&mut state); |
1464 | inv_bitslice_block(block, &state); |
1465 | xor_in_place(block, round_key); |
1466 | } |
1467 | |
1468 | /// AES cipher (encrypt) round function: parallel version. |
1469 | #[inline ] |
1470 | pub(crate) fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) { |
1471 | for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) { |
1472 | let mut state = State::default(); |
1473 | bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]); |
1474 | sub_bytes(&mut state); |
1475 | sub_bytes_nots(&mut state); |
1476 | shift_rows_1(&mut state); |
1477 | mix_columns_0(&mut state); |
1478 | let res = inv_bitslice(&state); |
1479 | |
1480 | for i in 0..4 { |
1481 | chunk[i] = res[i]; |
1482 | xor_in_place(&mut chunk[i], &keys[i]); |
1483 | } |
1484 | } |
1485 | } |
1486 | |
    /// AES equivalent inverse cipher (decrypt) round function.
1488 | #[inline ] |
1489 | pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { |
1490 | let mut state = State::default(); |
1491 | bitslice(&mut state, block, block, block, block); |
1492 | sub_bytes_nots(&mut state); |
1493 | inv_sub_bytes(&mut state); |
1494 | inv_shift_rows_1(&mut state); |
1495 | inv_mix_columns_0(&mut state); |
1496 | inv_bitslice_block(block, &state); |
1497 | xor_in_place(block, round_key); |
1498 | } |
1499 | |
    /// AES equivalent inverse cipher (decrypt) round function: parallel version.
1501 | #[inline ] |
1502 | pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) { |
1503 | for (chunk, keys) in blocks.chunks_exact_mut(4).zip(round_keys.chunks_exact(4)) { |
1504 | let mut state = State::default(); |
1505 | bitslice(&mut state, &chunk[0], &chunk[1], &chunk[2], &chunk[3]); |
1506 | sub_bytes_nots(&mut state); |
1507 | inv_sub_bytes(&mut state); |
1508 | inv_shift_rows_1(&mut state); |
1509 | inv_mix_columns_0(&mut state); |
1510 | let res = inv_bitslice(&state); |
1511 | |
1512 | for i in 0..4 { |
1513 | chunk[i] = res[i]; |
1514 | xor_in_place(&mut chunk[i], &keys[i]); |
1515 | } |
1516 | } |
1517 | } |
1518 | |
1519 | /// AES mix columns function. |
1520 | #[inline ] |
1521 | pub(crate) fn mix_columns(block: &mut Block) { |
1522 | let mut state = bitslice_block(block); |
1523 | mix_columns_0(&mut state); |
1524 | inv_bitslice_block(block, &state); |
1525 | } |
1526 | |
1527 | /// AES inverse mix columns function. |
1528 | #[inline ] |
1529 | pub(crate) fn inv_mix_columns(block: &mut Block) { |
1530 | let mut state = bitslice_block(block); |
1531 | inv_mix_columns_0(&mut state); |
1532 | inv_bitslice_block(block, &state); |
1533 | } |
1534 | } |
1535 | |