1########################################################################
2# Copyright (c) 2013, Intel Corporation
3#
4# This software is available to you under a choice of one of two
5# licenses. You may choose to be licensed under the terms of the GNU
6# General Public License (GPL) Version 2, available from the file
7# COPYING in the main directory of this source tree, or the
8# OpenIB.org BSD license below:
9#
10# Redistribution and use in source and binary forms, with or without
11# modification, are permitted provided that the following conditions are
12# met:
13#
14# * Redistributions of source code must retain the above copyright
15# notice, this list of conditions and the following disclaimer.
16#
17# * Redistributions in binary form must reproduce the above copyright
18# notice, this list of conditions and the following disclaimer in the
19# documentation and/or other materials provided with the
20# distribution.
21#
22# * Neither the name of the Intel Corporation nor the names of its
23# contributors may be used to endorse or promote products derived from
24# this software without specific prior written permission.
25#
26#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
28# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
31# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38########################################################################
39##
40## Authors:
41## Erdinc Ozturk <erdinc.ozturk@intel.com>
42## Vinodh Gopal <vinodh.gopal@intel.com>
43## James Guilford <james.guilford@intel.com>
44## Tim Chen <tim.c.chen@linux.intel.com>
45##
46## References:
## This code was derived and highly optimized from the code described in the paper:
##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##               on Intel Architecture Processors. August, 2010.
## The details of the implementation are explained in:
##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##               on Intel Architecture Processors. October, 2012.
53##
54## Assumptions:
55##
56##
57##
58## iv:
59## 0 1 2 3
60## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62## | Salt (From the SA) |
63## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64## | Initialization Vector |
65## | (This is the sequence number from IPSec header) |
66## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67## | 0x1 |
68## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
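##
## As a minimal C sketch of the layout above (illustrative only; the helper
## name and arguments are assumptions, not part of this file's interface,
## and it needs <string.h>):
##
##       void build_iv_block(unsigned char j0[16],
##                           const unsigned char salt[4],    /* from the SA  */
##                           const unsigned char seq_iv[8])  /* from the ESP */
##       {
##               memcpy(j0, salt, 4);        /* bytes  0..3  : salt          */
##               memcpy(j0 + 4, seq_iv, 8);  /* bytes  4..11 : IV            */
##               j0[12] = 0;                 /* bytes 12..15 : counter = 0x1 */
##               j0[13] = 0;
##               j0[14] = 0;
##               j0[15] = 1;
##       }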
69##
70##
71##
72## AAD:
##       AAD is padded to 128 bits with zeroes.
##       For example, assume AAD is a u32 vector:
##
##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
##       padded AAD in xmm register = {A1 A0 0 0}
79##
80## 0 1 2 3
81## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83## | SPI (A1) |
84## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85## | 32-bit Sequence Number (A0) |
86## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87## | 0x0 |
88## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89##
90## AAD Format with 32-bit Sequence Number
91##
##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
##       padded AAD in xmm register = {A2 A1 A0 0}
95##
96## 0 1 2 3
97## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99## | SPI (A2) |
100## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101## | 64-bit Extended Sequence Number {A1,A0} |
102## | |
103## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104## | 0x0 |
105## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106##
107## AAD Format with 64-bit Extended Sequence Number
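##
## A minimal C sketch of the zero-padding described above (illustrative
## only; pad_aad is a hypothetical helper, not part of this interface,
## and it needs <string.h>):
##
##       void pad_aad(unsigned char padded[16],
##                    const unsigned char *aad, unsigned int aad_len)
##       {
##               /* aad_len is 8, 12 or 16; see the aadLen note below */
##               memset(padded, 0, 16);
##               memcpy(padded, aad, aad_len);
##       }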
108##
109##
110## aadLen:
##       per the definition in the spec, aadLen can only be 8 or 12 bytes.
##       This code additionally supports an aadLen of 16 bytes.
113##
114## TLen:
##       per the definition in the spec, TLen can only be 8, 12 or 16 bytes.
116##
117## poly = x^128 + x^127 + x^126 + x^121 + 1
## Throughout the code, one-tab and two-tab indentation is used: one tab
## for the GHASH part, two tabs for the AES part.
120##
121
122#include <linux/linkage.h>
123
124# constants in mergeable sections, linker can reorder and merge
125.section .rodata.cst16.POLY, "aM", @progbits, 16
126.align 16
127POLY: .octa 0xC2000000000000000000000000000001
128
129.section .rodata.cst16.POLY2, "aM", @progbits, 16
130.align 16
131POLY2: .octa 0xC20000000000000000000001C2000000
132
133.section .rodata.cst16.TWOONE, "aM", @progbits, 16
134.align 16
135TWOONE: .octa 0x00000001000000000000000000000001
136
137.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
138.align 16
139SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
140
141.section .rodata.cst16.ONE, "aM", @progbits, 16
142.align 16
143ONE: .octa 0x00000000000000000000000000000001
144
145.section .rodata.cst16.ONEf, "aM", @progbits, 16
146.align 16
147ONEf: .octa 0x01000000000000000000000000000000
148
# The order of these constants must not change. More specifically,
# ALL_F must immediately follow SHIFT_MASK, and the zero block must follow ALL_F.
151.section .rodata, "a", @progbits
152.align 16
153SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
154ALL_F: .octa 0xffffffffffffffffffffffffffffffff
155 .octa 0x00000000000000000000000000000000
156
157.text
158
159
160#define AadHash 16*0
161#define AadLen 16*1
162#define InLen (16*1)+8
163#define PBlockEncKey 16*2
164#define OrigIV 16*3
165#define CurCount 16*4
166#define PBlockLen 16*5
167
168HashKey = 16*6 # store HashKey <<1 mod poly here
169HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
170HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
171HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
172HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
173HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
174HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
175HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
176HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
177HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
178HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
179HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
180HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
181HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
182HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
183HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
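
# The offsets above assume a context layout equivalent to the following C
# sketch (field names here are illustrative; the authoritative definition is
# the gcm_context_data structure used by the C glue code):
#
#	struct gcm_context_data {
#		u8  aad_hash[16];		/* AadHash,      offset 16*0     */
#		u64 aad_length;			/* AadLen,       offset 16*1     */
#		u64 in_length;			/* InLen,        offset 16*1 + 8 */
#		u8  partial_block_enc_key[16];	/* PBlockEncKey, offset 16*2     */
#		u8  orig_IV[16];		/* OrigIV,       offset 16*3     */
#		u8  current_counter[16];	/* CurCount,     offset 16*4     */
#		u64 partial_block_len;		/* PBlockLen,    offset 16*5     */
#		u64 unused;			/* pad to the next 16B boundary  */
#		u8  hash_keys[16 * 16];		/* HashKey .. HashKey_8_k        */
#	};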
184
185#define arg1 %rdi
186#define arg2 %rsi
187#define arg3 %rdx
188#define arg4 %rcx
189#define arg5 %r8
190#define arg6 %r9
191#define keysize 2*15*16(arg1)
192
193i = 0
194j = 0
195
196out_order = 0
197in_order = 1
198DEC = 0
199ENC = 1
200
201.macro define_reg r n
202reg_\r = %xmm\n
203.endm
204
205.macro setreg
206.altmacro
207define_reg i %i
208define_reg j %j
209.noaltmacro
210.endm
211
212TMP1 = 16*0 # Temporary storage for AAD
213TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
214TMP3 = 16*2 # Temporary storage for AES State 3
215TMP4 = 16*3 # Temporary storage for AES State 4
216TMP5 = 16*4 # Temporary storage for AES State 5
217TMP6 = 16*5 # Temporary storage for AES State 6
218TMP7 = 16*6 # Temporary storage for AES State 7
219TMP8 = 16*7 # Temporary storage for AES State 8
220
221VARIABLE_OFFSET = 16*8
222
223################################
224# Utility Macros
225################################
226
227.macro FUNC_SAVE
228 push %r12
229 push %r13
230 push %r15
231
232 push %rbp
233 mov %rsp, %rbp
234
235 sub $VARIABLE_OFFSET, %rsp
236 and $~63, %rsp # align rsp to 64 bytes
237.endm
238
239.macro FUNC_RESTORE
240 mov %rbp, %rsp
241 pop %rbp
242
243 pop %r15
244 pop %r13
245 pop %r12
246.endm
247
248# Encryption of a single block
249.macro ENCRYPT_SINGLE_BLOCK REP XMM0
250 vpxor (arg1), \XMM0, \XMM0
251 i = 1
252 setreg
253.rep \REP
254 vaesenc 16*i(arg1), \XMM0, \XMM0
255 i = (i+1)
256 setreg
257.endr
258 vaesenclast 16*i(arg1), \XMM0, \XMM0
259.endm
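
# The macro above is the standard AES-NI round structure: whitening with
# round key 0, REP middle rounds, then the final round with key REP+1.
# REP is 9, 11 or 13 for AES-128, AES-192 and AES-256 (see the keysize
# dispatch near the bottom of this file).  A minimal C intrinsics sketch of
# the same flow (illustrative only; it assumes <immintrin.h> and an expanded
# key schedule rk[], and is not this file's interface):
#
#	__m128i aes_encrypt_block(__m128i block, const __m128i *rk, int rep)
#	{
#		int i;
#
#		block = _mm_xor_si128(block, rk[0]);
#		for (i = 1; i <= rep; i++)
#			block = _mm_aesenc_si128(block, rk[i]);
#		return _mm_aesenclast_si128(block, rk[rep + 1]);
#	}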
260
261# combined for GCM encrypt and decrypt functions
262# clobbering all xmm registers
263# clobbering r10, r11, r12, r13, r15, rax
264.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
265 vmovdqu AadHash(arg2), %xmm8
266 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
267 add arg5, InLen(arg2)
268
269 # initialize the data pointer offset as zero
270 xor %r11d, %r11d
271
272 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
273 sub %r11, arg5
274
275 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
276 and $-16, %r13 # r13 = r13 - (r13 mod 16)
277
278 mov %r13, %r12
279 shr $4, %r12
280 and $7, %r12
281 jz .L_initial_num_blocks_is_0\@
282
283 cmp $7, %r12
284 je .L_initial_num_blocks_is_7\@
285 cmp $6, %r12
286 je .L_initial_num_blocks_is_6\@
287 cmp $5, %r12
288 je .L_initial_num_blocks_is_5\@
289 cmp $4, %r12
290 je .L_initial_num_blocks_is_4\@
291 cmp $3, %r12
292 je .L_initial_num_blocks_is_3\@
293 cmp $2, %r12
294 je .L_initial_num_blocks_is_2\@
295
296 jmp .L_initial_num_blocks_is_1\@
297
298.L_initial_num_blocks_is_7\@:
299 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
300 sub $16*7, %r13
301 jmp .L_initial_blocks_encrypted\@
302
303.L_initial_num_blocks_is_6\@:
304 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
305 sub $16*6, %r13
306 jmp .L_initial_blocks_encrypted\@
307
308.L_initial_num_blocks_is_5\@:
309 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
310 sub $16*5, %r13
311 jmp .L_initial_blocks_encrypted\@
312
313.L_initial_num_blocks_is_4\@:
314 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
315 sub $16*4, %r13
316 jmp .L_initial_blocks_encrypted\@
317
318.L_initial_num_blocks_is_3\@:
319 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
320 sub $16*3, %r13
321 jmp .L_initial_blocks_encrypted\@
322
323.L_initial_num_blocks_is_2\@:
324 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
325 sub $16*2, %r13
326 jmp .L_initial_blocks_encrypted\@
327
328.L_initial_num_blocks_is_1\@:
329 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
330 sub $16*1, %r13
331 jmp .L_initial_blocks_encrypted\@
332
333.L_initial_num_blocks_is_0\@:
334 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335
336
337.L_initial_blocks_encrypted\@:
338 test %r13, %r13
339 je .L_zero_cipher_left\@
340
341 sub $128, %r13
342 je .L_eight_cipher_left\@
343
344
345
346
347 vmovd %xmm9, %r15d
348 and $255, %r15d
349 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
350
351
352.L_encrypt_by_8_new\@:
353 cmp $(255-8), %r15d
354 jg .L_encrypt_by_8\@
355
356
357
358 add $8, %r15b
359 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
360 add $128, %r11
361 sub $128, %r13
362 jne .L_encrypt_by_8_new\@
363
364 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
365 jmp .L_eight_cipher_left\@
366
367.L_encrypt_by_8\@:
368 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
369 add $8, %r15b
370 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
371 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
372 add $128, %r11
373 sub $128, %r13
374 jne .L_encrypt_by_8_new\@
375
376 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
377
378
379
380
381.L_eight_cipher_left\@:
382 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
383
384
385.L_zero_cipher_left\@:
386 vmovdqu %xmm14, AadHash(arg2)
387 vmovdqu %xmm9, CurCount(arg2)
388
389 # check for 0 length
390 mov arg5, %r13
391 and $15, %r13 # r13 = (arg5 mod 16)
392
393 je .L_multiple_of_16_bytes\@
394
395 # handle the last <16 Byte block separately
396
397 mov %r13, PBlockLen(arg2)
398
399 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
400 vmovdqu %xmm9, CurCount(arg2)
401 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
402
403 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
404 vmovdqu %xmm9, PBlockEncKey(arg2)
405
406 cmp $16, arg5
407 jge .L_large_enough_update\@
408
409 lea (arg4,%r11,1), %r10
410 mov %r13, %r12
411
412 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
413
414 lea SHIFT_MASK+16(%rip), %r12
415 sub %r13, %r12 # adjust the shuffle mask pointer to be
416 # able to shift 16-r13 bytes (r13 is the
417 # number of bytes in plaintext mod 16)
418
419 jmp .L_final_ghash_mul\@
420
421.L_large_enough_update\@:
422 sub $16, %r11
423 add %r13, %r11
424
425 # receive the last <16 Byte block
426 vmovdqu (arg4, %r11, 1), %xmm1
427
428 sub %r13, %r11
429 add $16, %r11
430
431 lea SHIFT_MASK+16(%rip), %r12
432 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
433 # (r13 is the number of bytes in plaintext mod 16)
434 sub %r13, %r12
435 # get the appropriate shuffle mask
436 vmovdqu (%r12), %xmm2
437 # shift right 16-r13 bytes
438 vpshufb %xmm2, %xmm1, %xmm1
439
440.L_final_ghash_mul\@:
441 .if \ENC_DEC == DEC
442 vmovdqa %xmm1, %xmm2
443 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
444 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
445 # mask out top 16-r13 bytes of xmm9
446 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
447 vpand %xmm1, %xmm2, %xmm2
448 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
449 vpxor %xmm2, %xmm14, %xmm14
450
451 vmovdqu %xmm14, AadHash(arg2)
452 .else
453 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
454 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
455 # mask out top 16-r13 bytes of xmm9
456 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
457 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
458 vpxor %xmm9, %xmm14, %xmm14
459
460 vmovdqu %xmm14, AadHash(arg2)
461 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
462 .endif
463
464
465 #############################
466 # output r13 Bytes
467 vmovq %xmm9, %rax
468 cmp $8, %r13
469 jle .L_less_than_8_bytes_left\@
470
471 mov %rax, (arg3 , %r11)
472 add $8, %r11
473 vpsrldq $8, %xmm9, %xmm9
474 vmovq %xmm9, %rax
475 sub $8, %r13
476
477.L_less_than_8_bytes_left\@:
478 movb %al, (arg3 , %r11)
479 add $1, %r11
480 shr $8, %rax
481 sub $1, %r13
482 jne .L_less_than_8_bytes_left\@
483 #############################
484
485.L_multiple_of_16_bytes\@:
486.endm
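
# In C terms, the update flow implemented by GCM_ENC_DEC is roughly the
# following (a sketch only; the helper names are hypothetical and the real
# work is done by the macro parameters passed in above):
#
#	void gcm_update(struct gcm_context_data *ctx, u8 *out, const u8 *in,
#			u64 len)
#	{
#		u64 n = finish_partial_block(ctx, out, in, len); /* PARTIAL_BLOCK */
#
#		in += n; out += n; len -= n;
#		/* 0-7 initial blocks, then the 8-blocks-per-iteration main loop */
#		while (len >= 16) {
#			process_block(ctx, out, in);
#			in += 16; out += 16; len -= 16;
#		}
#		start_partial_block(ctx, out, in, len);	/* < 16 B left; no-op if 0 */
#	}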
487
488
# GCM_COMPLETE: finishes the tag computation, folding in any final partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
492.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
493 vmovdqu AadHash(arg2), %xmm14
494 vmovdqu HashKey(arg2), %xmm13
495
496 mov PBlockLen(arg2), %r12
497 test %r12, %r12
498 je .L_partial_done\@
499
500 #GHASH computation for the last <16 Byte block
501 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
502
503.L_partial_done\@:
504 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
505 shl $3, %r12 # convert into number of bits
506 vmovd %r12d, %xmm15 # len(A) in xmm15
507
508 mov InLen(arg2), %r12
        shl     $3, %r12                   # len(C) in bits (* 8)
510 vmovq %r12, %xmm1
511 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
512 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
513
514 vpxor %xmm15, %xmm14, %xmm14
515 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
516 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
517
518 vmovdqu OrigIV(arg2), %xmm9
519
520 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
521
522 vpxor %xmm14, %xmm9, %xmm9
523
524
525
526.L_return_T\@:
527 mov \AUTH_TAG, %r10 # r10 = authTag
528 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
529
530 cmp $16, %r11
531 je .L_T_16\@
532
533 cmp $8, %r11
534 jl .L_T_4\@
535
536.L_T_8\@:
537 vmovq %xmm9, %rax
538 mov %rax, (%r10)
539 add $8, %r10
540 sub $8, %r11
541 vpsrldq $8, %xmm9, %xmm9
542 test %r11, %r11
543 je .L_return_T_done\@
544.L_T_4\@:
545 vmovd %xmm9, %eax
546 mov %eax, (%r10)
547 add $4, %r10
548 sub $4, %r11
549 vpsrldq $4, %xmm9, %xmm9
550 test %r11, %r11
551 je .L_return_T_done\@
552.L_T_123\@:
553 vmovd %xmm9, %eax
554 cmp $2, %r11
555 jl .L_T_1\@
556 mov %ax, (%r10)
557 cmp $2, %r11
558 je .L_return_T_done\@
559 add $2, %r10
560 sar $16, %eax
561.L_T_1\@:
562 mov %al, (%r10)
563 jmp .L_return_T_done\@
564
565.L_T_16\@:
566 vmovdqu %xmm9, (%r10)
567
568.L_return_T_done\@:
569.endm
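
# Equivalently, the tag computation above is, in C-like pseudocode (a sketch
# of the math, not an interface):
#
#	S = GHASH_MUL(aad_hash ^ (len(A) || len(C)), hash_key)	/* lengths in bits */
#	T = AES_ENCRYPT(key, orig_IV) ^ byteswap(S)
#	memcpy(auth_tag, &T, auth_tag_len)			/* 16, 12 or 8 bytes */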
570
571.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
572
573 mov \AAD, %r10 # r10 = AAD
574 mov \AADLEN, %r12 # r12 = aadLen
575
576
577 mov %r12, %r11
578
579 vpxor \T8, \T8, \T8
580 vpxor \T7, \T7, \T7
581 cmp $16, %r11
582 jl .L_get_AAD_rest8\@
583.L_get_AAD_blocks\@:
584 vmovdqu (%r10), \T7
585 vpshufb SHUF_MASK(%rip), \T7, \T7
586 vpxor \T7, \T8, \T8
587 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
588 add $16, %r10
589 sub $16, %r12
590 sub $16, %r11
591 cmp $16, %r11
592 jge .L_get_AAD_blocks\@
593 vmovdqu \T8, \T7
594 test %r11, %r11
595 je .L_get_AAD_done\@
596
597 vpxor \T7, \T7, \T7
598
599 /* read the last <16B of AAD. since we have at least 4B of
600 data right after the AAD (the ICV, and maybe some CT), we can
601 read 4B/8B blocks safely, and then get rid of the extra stuff */
602.L_get_AAD_rest8\@:
603 cmp $4, %r11
604 jle .L_get_AAD_rest4\@
605 movq (%r10), \T1
606 add $8, %r10
607 sub $8, %r11
608 vpslldq $8, \T1, \T1
609 vpsrldq $8, \T7, \T7
610 vpxor \T1, \T7, \T7
611 jmp .L_get_AAD_rest8\@
612.L_get_AAD_rest4\@:
613 test %r11, %r11
614 jle .L_get_AAD_rest0\@
615 mov (%r10), %eax
616 movq %rax, \T1
617 add $4, %r10
618 sub $4, %r11
619 vpslldq $12, \T1, \T1
620 vpsrldq $4, \T7, \T7
621 vpxor \T1, \T7, \T7
622.L_get_AAD_rest0\@:
623 /* finalize: shift out the extra bytes we read, and align
624 left. since pslldq can only shift by an immediate, we use
625 vpshufb and a pair of shuffle masks */
626 leaq ALL_F(%rip), %r11
627 subq %r12, %r11
628 vmovdqu 16(%r11), \T1
629 andq $~3, %r11
630 vpshufb (%r11), \T7, \T7
631 vpand \T1, \T7, \T7
632.L_get_AAD_rest_final\@:
633 vpshufb SHUF_MASK(%rip), \T7, \T7
634 vpxor \T8, \T7, \T7
635 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
636
637.L_get_AAD_done\@:
638 vmovdqu \T7, AadHash(arg2)
639.endm
640
641.macro INIT GHASH_MUL PRECOMPUTE
642 mov arg6, %r11
643 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
644 xor %r11d, %r11d
645 mov %r11, InLen(arg2) # ctx_data.in_length = 0
646
647 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
648 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
649 mov arg3, %rax
650 movdqu (%rax), %xmm0
651 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
652
653 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
654 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
655
656 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
657
658 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
659 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
660 vmovdqa %xmm6, %xmm2
661 vpsllq $1, %xmm6, %xmm6
662 vpsrlq $63, %xmm2, %xmm2
663 vmovdqa %xmm2, %xmm1
664 vpslldq $8, %xmm2, %xmm2
665 vpsrldq $8, %xmm1, %xmm1
666 vpor %xmm2, %xmm6, %xmm6
667 #reduction
668 vpshufd $0b00100100, %xmm1, %xmm2
669 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
670 vpand POLY(%rip), %xmm2, %xmm2
671 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
672 #######################################################################
673 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
674
675 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
676
677 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
678.endm
679
680
681# Reads DLEN bytes starting at DPTR and stores in XMMDst
682# where 0 < DLEN < 16
683# Clobbers %rax, DLEN
684.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
685 vpxor \XMMDst, \XMMDst, \XMMDst
686
687 cmp $8, \DLEN
688 jl .L_read_lt8_\@
689 mov (\DPTR), %rax
690 vpinsrq $0, %rax, \XMMDst, \XMMDst
691 sub $8, \DLEN
692 jz .L_done_read_partial_block_\@
693 xor %eax, %eax
694.L_read_next_byte_\@:
695 shl $8, %rax
696 mov 7(\DPTR, \DLEN, 1), %al
697 dec \DLEN
698 jnz .L_read_next_byte_\@
699 vpinsrq $1, %rax, \XMMDst, \XMMDst
700 jmp .L_done_read_partial_block_\@
701.L_read_lt8_\@:
702 xor %eax, %eax
703.L_read_next_byte_lt8_\@:
704 shl $8, %rax
705 mov -1(\DPTR, \DLEN, 1), %al
706 dec \DLEN
707 jnz .L_read_next_byte_lt8_\@
708 vpinsrq $0, %rax, \XMMDst, \XMMDst
709.L_done_read_partial_block_\@:
710.endm
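
# The semantics of READ_PARTIAL_BLOCK in C (a sketch; the assembly above
# avoids a per-byte loop for the first 8 bytes by doing a single 8-byte load,
# and never reads past DPTR[DLEN - 1]):
#
#	void read_partial_block(u8 dst[16], const u8 *src, unsigned int len)
#	{
#		unsigned int i;
#
#		/* 0 < len < 16 */
#		memset(dst, 0, 16);
#		for (i = 0; i < len; i++)
#			dst[i] = src[i];
#	}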
711
# PARTIAL_BLOCK: handles encryption/decryption and the tag for partial blocks
# carried over between update calls.
# Requires the input data to be at least 1 byte long due to READ_PARTIAL_BLOCK.
# Outputs encrypted bytes and updates the hash and partial-block info in
# gcm_context_data.
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
717.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
718 AAD_HASH ENC_DEC
719 mov PBlockLen(arg2), %r13
720 test %r13, %r13
721 je .L_partial_block_done_\@ # Leave Macro if no partial blocks
722 # Read in input data without over reading
723 cmp $16, \PLAIN_CYPH_LEN
724 jl .L_fewer_than_16_bytes_\@
725 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
726 jmp .L_data_read_\@
727
728.L_fewer_than_16_bytes_\@:
729 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
730 mov \PLAIN_CYPH_LEN, %r12
731 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
732
733 mov PBlockLen(arg2), %r13
734
735.L_data_read_\@: # Finished reading in data
736
737 vmovdqu PBlockEncKey(arg2), %xmm9
738 vmovdqu HashKey(arg2), %xmm13
739
740 lea SHIFT_MASK(%rip), %r12
741
	# adjust the shuffle mask pointer to be able to shift r13 bytes
	# (r13 is the length of the partial block carried over from the
	# previous update call)
744 add %r13, %r12
745 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
746 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
747
748.if \ENC_DEC == DEC
749 vmovdqa %xmm1, %xmm3
	pxor %xmm1, %xmm9			# Ciphertext XOR E(K, Yn)
751
752 mov \PLAIN_CYPH_LEN, %r10
753 add %r13, %r10
754 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
755 sub $16, %r10
756 # Determine if partial block is not being filled and
757 # shift mask accordingly
758 jge .L_no_extra_mask_1_\@
759 sub %r10, %r12
760.L_no_extra_mask_1_\@:
761
762 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
763 # get the appropriate mask to mask out bottom r13 bytes of xmm9
764 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
765
766 vpand %xmm1, %xmm3, %xmm3
767 vmovdqa SHUF_MASK(%rip), %xmm10
768 vpshufb %xmm10, %xmm3, %xmm3
769 vpshufb %xmm2, %xmm3, %xmm3
770 vpxor %xmm3, \AAD_HASH, \AAD_HASH
771
772 test %r10, %r10
773 jl .L_partial_incomplete_1_\@
774
775 # GHASH computation for the last <16 Byte block
776 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
777 xor %eax,%eax
778
779 mov %rax, PBlockLen(arg2)
780 jmp .L_dec_done_\@
781.L_partial_incomplete_1_\@:
782 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
783.L_dec_done_\@:
784 vmovdqu \AAD_HASH, AadHash(arg2)
785.else
786 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
787
788 mov \PLAIN_CYPH_LEN, %r10
789 add %r13, %r10
790 # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
791 sub $16, %r10
792 # Determine if partial block is not being filled and
793 # shift mask accordingly
794 jge .L_no_extra_mask_2_\@
795 sub %r10, %r12
796.L_no_extra_mask_2_\@:
797
798 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
799 # get the appropriate mask to mask out bottom r13 bytes of xmm9
800 vpand %xmm1, %xmm9, %xmm9
801
802 vmovdqa SHUF_MASK(%rip), %xmm1
803 vpshufb %xmm1, %xmm9, %xmm9
804 vpshufb %xmm2, %xmm9, %xmm9
805 vpxor %xmm9, \AAD_HASH, \AAD_HASH
806
807 test %r10, %r10
808 jl .L_partial_incomplete_2_\@
809
810 # GHASH computation for the last <16 Byte block
811 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
812 xor %eax,%eax
813
814 mov %rax, PBlockLen(arg2)
815 jmp .L_encode_done_\@
816.L_partial_incomplete_2_\@:
817 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
818.L_encode_done_\@:
819 vmovdqu \AAD_HASH, AadHash(arg2)
820
821 vmovdqa SHUF_MASK(%rip), %xmm10
822 # shuffle xmm9 back to output as ciphertext
823 vpshufb %xmm10, %xmm9, %xmm9
824 vpshufb %xmm2, %xmm9, %xmm9
825.endif
826 # output encrypted Bytes
827 test %r10, %r10
828 jl .L_partial_fill_\@
829 mov %r13, %r12
830 mov $16, %r13
831 # Set r13 to be the number of bytes to write out
832 sub %r12, %r13
833 jmp .L_count_set_\@
834.L_partial_fill_\@:
835 mov \PLAIN_CYPH_LEN, %r13
836.L_count_set_\@:
837 vmovdqa %xmm9, %xmm0
838 vmovq %xmm0, %rax
839 cmp $8, %r13
840 jle .L_less_than_8_bytes_left_\@
841
842 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
843 add $8, \DATA_OFFSET
844 psrldq $8, %xmm0
845 vmovq %xmm0, %rax
846 sub $8, %r13
847.L_less_than_8_bytes_left_\@:
848 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
849 add $1, \DATA_OFFSET
850 shr $8, %rax
851 sub $1, %r13
852 jne .L_less_than_8_bytes_left_\@
853.L_partial_block_done_\@:
854.endm # PARTIAL_BLOCK
855
856###############################################################################
857# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
858# Input: A and B (128-bits each, bit-reflected)
859# Output: C = A*B*x mod poly, (i.e. >>1 )
860# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
861# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
862###############################################################################
863.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
864
865 vpshufd $0b01001110, \GH, \T2
866 vpshufd $0b01001110, \HK, \T3
867 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
868 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
869
870 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
871 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
872 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
873 vpxor \GH, \T2,\T2
874 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
875
876 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
877 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
878 vpxor \T3, \GH, \GH
879 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
880
881 #first phase of the reduction
        vpslld $31, \GH, \T2                  # packed left shift << 31
        vpslld $30, \GH, \T3                  # packed left shift << 30
        vpslld $25, \GH, \T4                  # packed left shift << 25
885
886 vpxor \T3, \T2, \T2 # xor the shifted versions
887 vpxor \T4, \T2, \T2
888
889 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
890
891 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
892 vpxor \T2, \GH, \GH # first phase of the reduction complete
893
894 #second phase of the reduction
895
        vpsrld $1, \GH, \T2                   # packed right shift >> 1
        vpsrld $2, \GH, \T3                   # packed right shift >> 2
        vpsrld $7, \GH, \T4                   # packed right shift >> 7
899 vpxor \T3, \T2, \T2 # xor the shifted versions
900 vpxor \T4, \T2, \T2
901
902 vpxor \T5, \T2, \T2
903 vpxor \T2, \GH, \GH
904 vpxor \T1, \GH, \GH # the result is in GH
905
906
907.endm
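
# The multiply above is one Karatsuba step over the 64-bit halves: three
# carry-less multiplies instead of four, followed by the two-phase shift
# reduction.  A C intrinsics sketch of just the Karatsuba step (reduction
# omitted; illustrative only, assuming <immintrin.h> and PCLMULQDQ support):
#
#	void clmul128_karatsuba(__m128i a, __m128i b, __m128i *hi, __m128i *lo)
#	{
#		__m128i h  = _mm_clmulepi64_si128(a, b, 0x11);	 /* a1*b1 */
#		__m128i l  = _mm_clmulepi64_si128(a, b, 0x00);	 /* a0*b0 */
#		__m128i ax = _mm_xor_si128(a, _mm_shuffle_epi32(a, 0x4e));
#		__m128i bx = _mm_xor_si128(b, _mm_shuffle_epi32(b, 0x4e));
#		__m128i m  = _mm_clmulepi64_si128(ax, bx, 0x00); /* (a1^a0)*(b1^b0) */
#
#		m   = _mm_xor_si128(m, _mm_xor_si128(h, l));	 /* a1*b0 ^ a0*b1 */
#		*hi = _mm_xor_si128(h, _mm_srli_si128(m, 8));
#		*lo = _mm_xor_si128(l, _mm_slli_si128(m, 8));
#	}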
908
909.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
910
        # HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
912 vmovdqa \HK, \T5
913
914 vpshufd $0b01001110, \T5, \T1
915 vpxor \T5, \T1, \T1
916 vmovdqu \T1, HashKey_k(arg2)
917
918 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
919 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
920 vpshufd $0b01001110, \T5, \T1
921 vpxor \T5, \T1, \T1
922 vmovdqu \T1, HashKey_2_k(arg2)
923
924 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
925 vmovdqu \T5, HashKey_3(arg2)
926 vpshufd $0b01001110, \T5, \T1
927 vpxor \T5, \T1, \T1
928 vmovdqu \T1, HashKey_3_k(arg2)
929
930 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
931 vmovdqu \T5, HashKey_4(arg2)
932 vpshufd $0b01001110, \T5, \T1
933 vpxor \T5, \T1, \T1
934 vmovdqu \T1, HashKey_4_k(arg2)
935
936 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
937 vmovdqu \T5, HashKey_5(arg2)
938 vpshufd $0b01001110, \T5, \T1
939 vpxor \T5, \T1, \T1
940 vmovdqu \T1, HashKey_5_k(arg2)
941
942 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
943 vmovdqu \T5, HashKey_6(arg2)
944 vpshufd $0b01001110, \T5, \T1
945 vpxor \T5, \T1, \T1
946 vmovdqu \T1, HashKey_6_k(arg2)
947
948 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
949 vmovdqu \T5, HashKey_7(arg2)
950 vpshufd $0b01001110, \T5, \T1
951 vpxor \T5, \T1, \T1
952 vmovdqu \T1, HashKey_7_k(arg2)
953
954 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
955 vmovdqu \T5, HashKey_8(arg2)
956 vpshufd $0b01001110, \T5, \T1
957 vpxor \T5, \T1, \T1
958 vmovdqu \T1, HashKey_8_k(arg2)
959
960.endm
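
# In other words, PRECOMPUTE_AVX stores HashKey^i << 1 mod poly for i = 1..8,
# plus the XORed halves consumed by the Karatsuba multiplies.  As a loop
# sketch (C-like pseudocode; ghash_mul stands for the GHASH_MUL_AVX operation):
#
#	hash_key[1] = H;			/* already HashKey << 1 mod poly */
#	for (i = 2; i <= 8; i++)
#		hash_key[i] = ghash_mul(hash_key[i - 1], H);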
961
## if a = number of total plaintext bytes
##    b = floor(a/16)
## then num_initial_blocks = b mod 8;
## encrypt the initial num_initial_blocks blocks and apply GHASH on the ciphertext
966## r10, r11, r12, rax are clobbered
967## arg1, arg2, arg3, arg4 are used as pointers only, not modified
968
969.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
970 i = (8-\num_initial_blocks)
971 setreg
972 vmovdqu AadHash(arg2), reg_i
973
974 # start AES for num_initial_blocks blocks
975 vmovdqu CurCount(arg2), \CTR
976
977 i = (9-\num_initial_blocks)
978 setreg
979.rep \num_initial_blocks
980 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
981 vmovdqa \CTR, reg_i
982 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
983 i = (i+1)
984 setreg
985.endr
986
987 vmovdqa (arg1), \T_key
988 i = (9-\num_initial_blocks)
989 setreg
990.rep \num_initial_blocks
991 vpxor \T_key, reg_i, reg_i
992 i = (i+1)
993 setreg
994.endr
995
996 j = 1
997 setreg
998.rep \REP
999 vmovdqa 16*j(arg1), \T_key
1000 i = (9-\num_initial_blocks)
1001 setreg
1002.rep \num_initial_blocks
1003 vaesenc \T_key, reg_i, reg_i
1004 i = (i+1)
1005 setreg
1006.endr
1007
1008 j = (j+1)
1009 setreg
1010.endr
1011
1012 vmovdqa 16*j(arg1), \T_key
1013 i = (9-\num_initial_blocks)
1014 setreg
1015.rep \num_initial_blocks
1016 vaesenclast \T_key, reg_i, reg_i
1017 i = (i+1)
1018 setreg
1019.endr
1020
1021 i = (9-\num_initial_blocks)
1022 setreg
1023.rep \num_initial_blocks
1024 vmovdqu (arg4, %r11), \T1
1025 vpxor \T1, reg_i, reg_i
1026 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1027 add $16, %r11
1028.if \ENC_DEC == DEC
1029 vmovdqa \T1, reg_i
1030.endif
1031 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1032 i = (i+1)
1033 setreg
1034.endr
1035
1036
1037 i = (8-\num_initial_blocks)
1038 j = (9-\num_initial_blocks)
1039 setreg
1040
1041.rep \num_initial_blocks
1042 vpxor reg_i, reg_j, reg_j
1043 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1044 i = (i+1)
1045 j = (j+1)
1046 setreg
1047.endr
1048 # XMM8 has the combined result here
1049
1050 vmovdqa \XMM8, TMP1(%rsp)
1051 vmovdqa \XMM8, \T3
1052
1053 cmp $128, %r13
1054 jl .L_initial_blocks_done\@ # no need for precomputed constants
1055
1056###############################################################################
# HashKey_i_k holds the XOR of the low and high 64-bit halves of HashKey_i
1058 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1059 vmovdqa \CTR, \XMM1
1060 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1061
1062 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1063 vmovdqa \CTR, \XMM2
1064 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1065
1066 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1067 vmovdqa \CTR, \XMM3
1068 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1069
1070 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1071 vmovdqa \CTR, \XMM4
1072 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1073
1074 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1075 vmovdqa \CTR, \XMM5
1076 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1077
1078 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1079 vmovdqa \CTR, \XMM6
1080 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1081
1082 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1083 vmovdqa \CTR, \XMM7
1084 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1085
1086 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1087 vmovdqa \CTR, \XMM8
1088 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1089
1090 vmovdqa (arg1), \T_key
1091 vpxor \T_key, \XMM1, \XMM1
1092 vpxor \T_key, \XMM2, \XMM2
1093 vpxor \T_key, \XMM3, \XMM3
1094 vpxor \T_key, \XMM4, \XMM4
1095 vpxor \T_key, \XMM5, \XMM5
1096 vpxor \T_key, \XMM6, \XMM6
1097 vpxor \T_key, \XMM7, \XMM7
1098 vpxor \T_key, \XMM8, \XMM8
1099
1100 i = 1
1101 setreg
1102.rep \REP # do REP rounds
1103 vmovdqa 16*i(arg1), \T_key
1104 vaesenc \T_key, \XMM1, \XMM1
1105 vaesenc \T_key, \XMM2, \XMM2
1106 vaesenc \T_key, \XMM3, \XMM3
1107 vaesenc \T_key, \XMM4, \XMM4
1108 vaesenc \T_key, \XMM5, \XMM5
1109 vaesenc \T_key, \XMM6, \XMM6
1110 vaesenc \T_key, \XMM7, \XMM7
1111 vaesenc \T_key, \XMM8, \XMM8
1112 i = (i+1)
1113 setreg
1114.endr
1115
1116 vmovdqa 16*i(arg1), \T_key
1117 vaesenclast \T_key, \XMM1, \XMM1
1118 vaesenclast \T_key, \XMM2, \XMM2
1119 vaesenclast \T_key, \XMM3, \XMM3
1120 vaesenclast \T_key, \XMM4, \XMM4
1121 vaesenclast \T_key, \XMM5, \XMM5
1122 vaesenclast \T_key, \XMM6, \XMM6
1123 vaesenclast \T_key, \XMM7, \XMM7
1124 vaesenclast \T_key, \XMM8, \XMM8
1125
1126 vmovdqu (arg4, %r11), \T1
1127 vpxor \T1, \XMM1, \XMM1
1128 vmovdqu \XMM1, (arg3 , %r11)
1129 .if \ENC_DEC == DEC
1130 vmovdqa \T1, \XMM1
1131 .endif
1132
1133 vmovdqu 16*1(arg4, %r11), \T1
1134 vpxor \T1, \XMM2, \XMM2
1135 vmovdqu \XMM2, 16*1(arg3 , %r11)
1136 .if \ENC_DEC == DEC
1137 vmovdqa \T1, \XMM2
1138 .endif
1139
1140 vmovdqu 16*2(arg4, %r11), \T1
1141 vpxor \T1, \XMM3, \XMM3
1142 vmovdqu \XMM3, 16*2(arg3 , %r11)
1143 .if \ENC_DEC == DEC
1144 vmovdqa \T1, \XMM3
1145 .endif
1146
1147 vmovdqu 16*3(arg4, %r11), \T1
1148 vpxor \T1, \XMM4, \XMM4
1149 vmovdqu \XMM4, 16*3(arg3 , %r11)
1150 .if \ENC_DEC == DEC
1151 vmovdqa \T1, \XMM4
1152 .endif
1153
1154 vmovdqu 16*4(arg4, %r11), \T1
1155 vpxor \T1, \XMM5, \XMM5
1156 vmovdqu \XMM5, 16*4(arg3 , %r11)
1157 .if \ENC_DEC == DEC
1158 vmovdqa \T1, \XMM5
1159 .endif
1160
1161 vmovdqu 16*5(arg4, %r11), \T1
1162 vpxor \T1, \XMM6, \XMM6
1163 vmovdqu \XMM6, 16*5(arg3 , %r11)
1164 .if \ENC_DEC == DEC
1165 vmovdqa \T1, \XMM6
1166 .endif
1167
1168 vmovdqu 16*6(arg4, %r11), \T1
1169 vpxor \T1, \XMM7, \XMM7
1170 vmovdqu \XMM7, 16*6(arg3 , %r11)
1171 .if \ENC_DEC == DEC
1172 vmovdqa \T1, \XMM7
1173 .endif
1174
1175 vmovdqu 16*7(arg4, %r11), \T1
1176 vpxor \T1, \XMM8, \XMM8
1177 vmovdqu \XMM8, 16*7(arg3 , %r11)
1178 .if \ENC_DEC == DEC
1179 vmovdqa \T1, \XMM8
1180 .endif
1181
1182 add $128, %r11
1183
1184 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1185 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1186 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1187 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1188 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1189 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1190 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1191 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1192 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1193
1194###############################################################################
1195
1196.L_initial_blocks_done\@:
1197
1198.endm
1199
1200# encrypt 8 blocks at a time
1201# ghash the 8 previously encrypted ciphertext blocks
1202# arg1, arg2, arg3, arg4 are used as pointers only, not modified
1203# r11 is the data offset value
1204.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1205
1206 vmovdqa \XMM1, \T2
1207 vmovdqa \XMM2, TMP2(%rsp)
1208 vmovdqa \XMM3, TMP3(%rsp)
1209 vmovdqa \XMM4, TMP4(%rsp)
1210 vmovdqa \XMM5, TMP5(%rsp)
1211 vmovdqa \XMM6, TMP6(%rsp)
1212 vmovdqa \XMM7, TMP7(%rsp)
1213 vmovdqa \XMM8, TMP8(%rsp)
1214
1215.if \loop_idx == in_order
1216 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1217 vpaddd ONE(%rip), \XMM1, \XMM2
1218 vpaddd ONE(%rip), \XMM2, \XMM3
1219 vpaddd ONE(%rip), \XMM3, \XMM4
1220 vpaddd ONE(%rip), \XMM4, \XMM5
1221 vpaddd ONE(%rip), \XMM5, \XMM6
1222 vpaddd ONE(%rip), \XMM6, \XMM7
1223 vpaddd ONE(%rip), \XMM7, \XMM8
1224 vmovdqa \XMM8, \CTR
1225
1226 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1227 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1228 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1229 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1230 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1231 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1232 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1233 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1234.else
1235 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1236 vpaddd ONEf(%rip), \XMM1, \XMM2
1237 vpaddd ONEf(%rip), \XMM2, \XMM3
1238 vpaddd ONEf(%rip), \XMM3, \XMM4
1239 vpaddd ONEf(%rip), \XMM4, \XMM5
1240 vpaddd ONEf(%rip), \XMM5, \XMM6
1241 vpaddd ONEf(%rip), \XMM6, \XMM7
1242 vpaddd ONEf(%rip), \XMM7, \XMM8
1243 vmovdqa \XMM8, \CTR
1244.endif
1245
1246
1247 #######################################################################
1248
1249 vmovdqu (arg1), \T1
1250 vpxor \T1, \XMM1, \XMM1
1251 vpxor \T1, \XMM2, \XMM2
1252 vpxor \T1, \XMM3, \XMM3
1253 vpxor \T1, \XMM4, \XMM4
1254 vpxor \T1, \XMM5, \XMM5
1255 vpxor \T1, \XMM6, \XMM6
1256 vpxor \T1, \XMM7, \XMM7
1257 vpxor \T1, \XMM8, \XMM8
1258
1259 #######################################################################
1260
1261
1262
1263
1264
1265 vmovdqu 16*1(arg1), \T1
1266 vaesenc \T1, \XMM1, \XMM1
1267 vaesenc \T1, \XMM2, \XMM2
1268 vaesenc \T1, \XMM3, \XMM3
1269 vaesenc \T1, \XMM4, \XMM4
1270 vaesenc \T1, \XMM5, \XMM5
1271 vaesenc \T1, \XMM6, \XMM6
1272 vaesenc \T1, \XMM7, \XMM7
1273 vaesenc \T1, \XMM8, \XMM8
1274
1275 vmovdqu 16*2(arg1), \T1
1276 vaesenc \T1, \XMM1, \XMM1
1277 vaesenc \T1, \XMM2, \XMM2
1278 vaesenc \T1, \XMM3, \XMM3
1279 vaesenc \T1, \XMM4, \XMM4
1280 vaesenc \T1, \XMM5, \XMM5
1281 vaesenc \T1, \XMM6, \XMM6
1282 vaesenc \T1, \XMM7, \XMM7
1283 vaesenc \T1, \XMM8, \XMM8
1284
1285
1286 #######################################################################
1287
1288 vmovdqu HashKey_8(arg2), \T5
1289 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1290 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1291
1292 vpshufd $0b01001110, \T2, \T6
1293 vpxor \T2, \T6, \T6
1294
1295 vmovdqu HashKey_8_k(arg2), \T5
1296 vpclmulqdq $0x00, \T5, \T6, \T6
1297
1298 vmovdqu 16*3(arg1), \T1
1299 vaesenc \T1, \XMM1, \XMM1
1300 vaesenc \T1, \XMM2, \XMM2
1301 vaesenc \T1, \XMM3, \XMM3
1302 vaesenc \T1, \XMM4, \XMM4
1303 vaesenc \T1, \XMM5, \XMM5
1304 vaesenc \T1, \XMM6, \XMM6
1305 vaesenc \T1, \XMM7, \XMM7
1306 vaesenc \T1, \XMM8, \XMM8
1307
1308 vmovdqa TMP2(%rsp), \T1
1309 vmovdqu HashKey_7(arg2), \T5
1310 vpclmulqdq $0x11, \T5, \T1, \T3
1311 vpxor \T3, \T4, \T4
1312 vpclmulqdq $0x00, \T5, \T1, \T3
1313 vpxor \T3, \T7, \T7
1314
1315 vpshufd $0b01001110, \T1, \T3
1316 vpxor \T1, \T3, \T3
1317 vmovdqu HashKey_7_k(arg2), \T5
1318 vpclmulqdq $0x10, \T5, \T3, \T3
1319 vpxor \T3, \T6, \T6
1320
1321 vmovdqu 16*4(arg1), \T1
1322 vaesenc \T1, \XMM1, \XMM1
1323 vaesenc \T1, \XMM2, \XMM2
1324 vaesenc \T1, \XMM3, \XMM3
1325 vaesenc \T1, \XMM4, \XMM4
1326 vaesenc \T1, \XMM5, \XMM5
1327 vaesenc \T1, \XMM6, \XMM6
1328 vaesenc \T1, \XMM7, \XMM7
1329 vaesenc \T1, \XMM8, \XMM8
1330
1331 #######################################################################
1332
1333 vmovdqa TMP3(%rsp), \T1
1334 vmovdqu HashKey_6(arg2), \T5
1335 vpclmulqdq $0x11, \T5, \T1, \T3
1336 vpxor \T3, \T4, \T4
1337 vpclmulqdq $0x00, \T5, \T1, \T3
1338 vpxor \T3, \T7, \T7
1339
1340 vpshufd $0b01001110, \T1, \T3
1341 vpxor \T1, \T3, \T3
1342 vmovdqu HashKey_6_k(arg2), \T5
1343 vpclmulqdq $0x10, \T5, \T3, \T3
1344 vpxor \T3, \T6, \T6
1345
1346 vmovdqu 16*5(arg1), \T1
1347 vaesenc \T1, \XMM1, \XMM1
1348 vaesenc \T1, \XMM2, \XMM2
1349 vaesenc \T1, \XMM3, \XMM3
1350 vaesenc \T1, \XMM4, \XMM4
1351 vaesenc \T1, \XMM5, \XMM5
1352 vaesenc \T1, \XMM6, \XMM6
1353 vaesenc \T1, \XMM7, \XMM7
1354 vaesenc \T1, \XMM8, \XMM8
1355
1356 vmovdqa TMP4(%rsp), \T1
1357 vmovdqu HashKey_5(arg2), \T5
1358 vpclmulqdq $0x11, \T5, \T1, \T3
1359 vpxor \T3, \T4, \T4
1360 vpclmulqdq $0x00, \T5, \T1, \T3
1361 vpxor \T3, \T7, \T7
1362
1363 vpshufd $0b01001110, \T1, \T3
1364 vpxor \T1, \T3, \T3
1365 vmovdqu HashKey_5_k(arg2), \T5
1366 vpclmulqdq $0x10, \T5, \T3, \T3
1367 vpxor \T3, \T6, \T6
1368
1369 vmovdqu 16*6(arg1), \T1
1370 vaesenc \T1, \XMM1, \XMM1
1371 vaesenc \T1, \XMM2, \XMM2
1372 vaesenc \T1, \XMM3, \XMM3
1373 vaesenc \T1, \XMM4, \XMM4
1374 vaesenc \T1, \XMM5, \XMM5
1375 vaesenc \T1, \XMM6, \XMM6
1376 vaesenc \T1, \XMM7, \XMM7
1377 vaesenc \T1, \XMM8, \XMM8
1378
1379
1380 vmovdqa TMP5(%rsp), \T1
1381 vmovdqu HashKey_4(arg2), \T5
1382 vpclmulqdq $0x11, \T5, \T1, \T3
1383 vpxor \T3, \T4, \T4
1384 vpclmulqdq $0x00, \T5, \T1, \T3
1385 vpxor \T3, \T7, \T7
1386
1387 vpshufd $0b01001110, \T1, \T3
1388 vpxor \T1, \T3, \T3
1389 vmovdqu HashKey_4_k(arg2), \T5
1390 vpclmulqdq $0x10, \T5, \T3, \T3
1391 vpxor \T3, \T6, \T6
1392
1393 vmovdqu 16*7(arg1), \T1
1394 vaesenc \T1, \XMM1, \XMM1
1395 vaesenc \T1, \XMM2, \XMM2
1396 vaesenc \T1, \XMM3, \XMM3
1397 vaesenc \T1, \XMM4, \XMM4
1398 vaesenc \T1, \XMM5, \XMM5
1399 vaesenc \T1, \XMM6, \XMM6
1400 vaesenc \T1, \XMM7, \XMM7
1401 vaesenc \T1, \XMM8, \XMM8
1402
1403 vmovdqa TMP6(%rsp), \T1
1404 vmovdqu HashKey_3(arg2), \T5
1405 vpclmulqdq $0x11, \T5, \T1, \T3
1406 vpxor \T3, \T4, \T4
1407 vpclmulqdq $0x00, \T5, \T1, \T3
1408 vpxor \T3, \T7, \T7
1409
1410 vpshufd $0b01001110, \T1, \T3
1411 vpxor \T1, \T3, \T3
1412 vmovdqu HashKey_3_k(arg2), \T5
1413 vpclmulqdq $0x10, \T5, \T3, \T3
1414 vpxor \T3, \T6, \T6
1415
1416
1417 vmovdqu 16*8(arg1), \T1
1418 vaesenc \T1, \XMM1, \XMM1
1419 vaesenc \T1, \XMM2, \XMM2
1420 vaesenc \T1, \XMM3, \XMM3
1421 vaesenc \T1, \XMM4, \XMM4
1422 vaesenc \T1, \XMM5, \XMM5
1423 vaesenc \T1, \XMM6, \XMM6
1424 vaesenc \T1, \XMM7, \XMM7
1425 vaesenc \T1, \XMM8, \XMM8
1426
1427 vmovdqa TMP7(%rsp), \T1
1428 vmovdqu HashKey_2(arg2), \T5
1429 vpclmulqdq $0x11, \T5, \T1, \T3
1430 vpxor \T3, \T4, \T4
1431 vpclmulqdq $0x00, \T5, \T1, \T3
1432 vpxor \T3, \T7, \T7
1433
1434 vpshufd $0b01001110, \T1, \T3
1435 vpxor \T1, \T3, \T3
1436 vmovdqu HashKey_2_k(arg2), \T5
1437 vpclmulqdq $0x10, \T5, \T3, \T3
1438 vpxor \T3, \T6, \T6
1439
1440 #######################################################################
1441
1442 vmovdqu 16*9(arg1), \T5
1443 vaesenc \T5, \XMM1, \XMM1
1444 vaesenc \T5, \XMM2, \XMM2
1445 vaesenc \T5, \XMM3, \XMM3
1446 vaesenc \T5, \XMM4, \XMM4
1447 vaesenc \T5, \XMM5, \XMM5
1448 vaesenc \T5, \XMM6, \XMM6
1449 vaesenc \T5, \XMM7, \XMM7
1450 vaesenc \T5, \XMM8, \XMM8
1451
1452 vmovdqa TMP8(%rsp), \T1
1453 vmovdqu HashKey(arg2), \T5
1454 vpclmulqdq $0x11, \T5, \T1, \T3
1455 vpxor \T3, \T4, \T4
1456 vpclmulqdq $0x00, \T5, \T1, \T3
1457 vpxor \T3, \T7, \T7
1458
1459 vpshufd $0b01001110, \T1, \T3
1460 vpxor \T1, \T3, \T3
1461 vmovdqu HashKey_k(arg2), \T5
1462 vpclmulqdq $0x10, \T5, \T3, \T3
1463 vpxor \T3, \T6, \T6
1464
1465 vpxor \T4, \T6, \T6
1466 vpxor \T7, \T6, \T6
1467
1468 vmovdqu 16*10(arg1), \T5
1469
1470 i = 11
1471 setreg
1472.rep (\REP-9)
1473
1474 vaesenc \T5, \XMM1, \XMM1
1475 vaesenc \T5, \XMM2, \XMM2
1476 vaesenc \T5, \XMM3, \XMM3
1477 vaesenc \T5, \XMM4, \XMM4
1478 vaesenc \T5, \XMM5, \XMM5
1479 vaesenc \T5, \XMM6, \XMM6
1480 vaesenc \T5, \XMM7, \XMM7
1481 vaesenc \T5, \XMM8, \XMM8
1482
1483 vmovdqu 16*i(arg1), \T5
1484 i = i + 1
1485 setreg
1486.endr
1487
1488 i = 0
1489 j = 1
1490 setreg
1491.rep 8
1492 vpxor 16*i(arg4, %r11), \T5, \T2
1493 .if \ENC_DEC == ENC
1494 vaesenclast \T2, reg_j, reg_j
1495 .else
1496 vaesenclast \T2, reg_j, \T3
1497 vmovdqu 16*i(arg4, %r11), reg_j
1498 vmovdqu \T3, 16*i(arg3, %r11)
1499 .endif
1500 i = (i+1)
1501 j = (j+1)
1502 setreg
1503.endr
1504 #######################################################################
1505
1506
	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
1509 vpxor \T3, \T7, \T7
1510 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1511
1512
1513
1514 #######################################################################
1515 #first phase of the reduction
1516 #######################################################################
	vpslld	$31, \T7, \T2				# packed left shift << 31
	vpslld	$30, \T7, \T3				# packed left shift << 30
	vpslld	$25, \T7, \T4				# packed left shift << 25
1520
1521 vpxor \T3, \T2, \T2 # xor the shifted versions
1522 vpxor \T4, \T2, \T2
1523
1524 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1525
1526 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1527 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1528 #######################################################################
1529 .if \ENC_DEC == ENC
1530 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1531 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1532 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1533 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1534 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1535 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1536 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1537 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1538 .endif
1539
1540 #######################################################################
1541 #second phase of the reduction
	vpsrld	$1, \T7, \T2				# packed right shift >> 1
	vpsrld	$2, \T7, \T3				# packed right shift >> 2
	vpsrld	$7, \T7, \T4				# packed right shift >> 7
1545 vpxor \T3, \T2, \T2 # xor the shifted versions
1546 vpxor \T4, \T2, \T2
1547
1548 vpxor \T1, \T2, \T2
1549 vpxor \T2, \T7, \T7
1550 vpxor \T7, \T6, \T6 # the result is in T6
1551 #######################################################################
1552
1553 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1554 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1555 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1556 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1557 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1558 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1559 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1560 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1561
1562
1563 vpxor \T6, \XMM1, \XMM1
1564
1565
1566
1567.endm
1568
1569
# GHASH the last 8 ciphertext blocks.
1571.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1572
1573 ## Karatsuba Method
1574
1575
1576 vpshufd $0b01001110, \XMM1, \T2
1577 vpxor \XMM1, \T2, \T2
1578 vmovdqu HashKey_8(arg2), \T5
1579 vpclmulqdq $0x11, \T5, \XMM1, \T6
1580 vpclmulqdq $0x00, \T5, \XMM1, \T7
1581
1582 vmovdqu HashKey_8_k(arg2), \T3
1583 vpclmulqdq $0x00, \T3, \T2, \XMM1
1584
1585 ######################
1586
1587 vpshufd $0b01001110, \XMM2, \T2
1588 vpxor \XMM2, \T2, \T2
1589 vmovdqu HashKey_7(arg2), \T5
1590 vpclmulqdq $0x11, \T5, \XMM2, \T4
1591 vpxor \T4, \T6, \T6
1592
1593 vpclmulqdq $0x00, \T5, \XMM2, \T4
1594 vpxor \T4, \T7, \T7
1595
1596 vmovdqu HashKey_7_k(arg2), \T3
1597 vpclmulqdq $0x00, \T3, \T2, \T2
1598 vpxor \T2, \XMM1, \XMM1
1599
1600 ######################
1601
1602 vpshufd $0b01001110, \XMM3, \T2
1603 vpxor \XMM3, \T2, \T2
1604 vmovdqu HashKey_6(arg2), \T5
1605 vpclmulqdq $0x11, \T5, \XMM3, \T4
1606 vpxor \T4, \T6, \T6
1607
1608 vpclmulqdq $0x00, \T5, \XMM3, \T4
1609 vpxor \T4, \T7, \T7
1610
1611 vmovdqu HashKey_6_k(arg2), \T3
1612 vpclmulqdq $0x00, \T3, \T2, \T2
1613 vpxor \T2, \XMM1, \XMM1
1614
1615 ######################
1616
1617 vpshufd $0b01001110, \XMM4, \T2
1618 vpxor \XMM4, \T2, \T2
1619 vmovdqu HashKey_5(arg2), \T5
1620 vpclmulqdq $0x11, \T5, \XMM4, \T4
1621 vpxor \T4, \T6, \T6
1622
1623 vpclmulqdq $0x00, \T5, \XMM4, \T4
1624 vpxor \T4, \T7, \T7
1625
1626 vmovdqu HashKey_5_k(arg2), \T3
1627 vpclmulqdq $0x00, \T3, \T2, \T2
1628 vpxor \T2, \XMM1, \XMM1
1629
1630 ######################
1631
1632 vpshufd $0b01001110, \XMM5, \T2
1633 vpxor \XMM5, \T2, \T2
1634 vmovdqu HashKey_4(arg2), \T5
1635 vpclmulqdq $0x11, \T5, \XMM5, \T4
1636 vpxor \T4, \T6, \T6
1637
1638 vpclmulqdq $0x00, \T5, \XMM5, \T4
1639 vpxor \T4, \T7, \T7
1640
1641 vmovdqu HashKey_4_k(arg2), \T3
1642 vpclmulqdq $0x00, \T3, \T2, \T2
1643 vpxor \T2, \XMM1, \XMM1
1644
1645 ######################
1646
1647 vpshufd $0b01001110, \XMM6, \T2
1648 vpxor \XMM6, \T2, \T2
1649 vmovdqu HashKey_3(arg2), \T5
1650 vpclmulqdq $0x11, \T5, \XMM6, \T4
1651 vpxor \T4, \T6, \T6
1652
1653 vpclmulqdq $0x00, \T5, \XMM6, \T4
1654 vpxor \T4, \T7, \T7
1655
1656 vmovdqu HashKey_3_k(arg2), \T3
1657 vpclmulqdq $0x00, \T3, \T2, \T2
1658 vpxor \T2, \XMM1, \XMM1
1659
1660 ######################
1661
1662 vpshufd $0b01001110, \XMM7, \T2
1663 vpxor \XMM7, \T2, \T2
1664 vmovdqu HashKey_2(arg2), \T5
1665 vpclmulqdq $0x11, \T5, \XMM7, \T4
1666 vpxor \T4, \T6, \T6
1667
1668 vpclmulqdq $0x00, \T5, \XMM7, \T4
1669 vpxor \T4, \T7, \T7
1670
1671 vmovdqu HashKey_2_k(arg2), \T3
1672 vpclmulqdq $0x00, \T3, \T2, \T2
1673 vpxor \T2, \XMM1, \XMM1
1674
1675 ######################
1676
1677 vpshufd $0b01001110, \XMM8, \T2
1678 vpxor \XMM8, \T2, \T2
1679 vmovdqu HashKey(arg2), \T5
1680 vpclmulqdq $0x11, \T5, \XMM8, \T4
1681 vpxor \T4, \T6, \T6
1682
1683 vpclmulqdq $0x00, \T5, \XMM8, \T4
1684 vpxor \T4, \T7, \T7
1685
1686 vmovdqu HashKey_k(arg2), \T3
1687 vpclmulqdq $0x00, \T3, \T2, \T2
1688
1689 vpxor \T2, \XMM1, \XMM1
1690 vpxor \T6, \XMM1, \XMM1
1691 vpxor \T7, \XMM1, \T2
1692
1693
1694
1695
1696 vpslldq $8, \T2, \T4
1697 vpsrldq $8, \T2, \T2
1698
1699 vpxor \T4, \T7, \T7
1700 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of
1701 # the accumulated carry-less multiplications
1702
1703 #######################################################################
1704 #first phase of the reduction
	vpslld	$31, \T7, \T2				# packed left shift << 31
	vpslld	$30, \T7, \T3				# packed left shift << 30
	vpslld	$25, \T7, \T4				# packed left shift << 25
1708
1709 vpxor \T3, \T2, \T2 # xor the shifted versions
1710 vpxor \T4, \T2, \T2
1711
1712 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1713
1714 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1715 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1716 #######################################################################
1717
1718
1719 #second phase of the reduction
	vpsrld	$1, \T7, \T2				# packed right shift >> 1
	vpsrld	$2, \T7, \T3				# packed right shift >> 2
	vpsrld	$7, \T7, \T4				# packed right shift >> 7
1723 vpxor \T3, \T2, \T2 # xor the shifted versions
1724 vpxor \T4, \T2, \T2
1725
1726 vpxor \T1, \T2, \T2
1727 vpxor \T2, \T7, \T7
1728 vpxor \T7, \T6, \T6 # the result is in T6
1729
1730.endm
1731
1732#############################################################
#void   aesni_gcm_init_avx_gen2
#        (gcm_data        *my_ctx_data,
#         gcm_context_data *data,
#         u8      *iv, /* Pre-counter block j0: 4 byte salt
#			(from Security Association) concatenated with 8 byte
#			Initialisation Vector (from IPSec ESP Payload)
#			concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const u8 *aad, /* Additional Authentication Data (AAD) */
#         u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1743#############################################################
1744SYM_FUNC_START(aesni_gcm_init_avx_gen2)
1745 FUNC_SAVE
1746 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1747 FUNC_RESTORE
1748 RET
1749SYM_FUNC_END(aesni_gcm_init_avx_gen2)
1750
1751###############################################################################
1752#void aesni_gcm_enc_update_avx_gen2(
1753# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1754# gcm_context_data *data,
1755# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
1756# const u8 *in, /* Plaintext input */
1757# u64 plaintext_len) /* Length of data in Bytes for encryption. */
1758###############################################################################
1759SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
1760 FUNC_SAVE
1761 mov keysize, %eax
1762 cmp $32, %eax
1763 je key_256_enc_update
1764 cmp $16, %eax
1765 je key_128_enc_update
1766 # must be 192
1767 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1768 FUNC_RESTORE
1769 RET
1770key_128_enc_update:
1771 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1772 FUNC_RESTORE
1773 RET
1774key_256_enc_update:
1775 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1776 FUNC_RESTORE
1777 RET
1778SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)
1779
1780###############################################################################
1781#void aesni_gcm_dec_update_avx_gen2(
1782# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1783# gcm_context_data *data,
1784# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
1785# const u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
1787###############################################################################
1788SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
1789 FUNC_SAVE
1790 mov keysize,%eax
1791 cmp $32, %eax
1792 je key_256_dec_update
1793 cmp $16, %eax
1794 je key_128_dec_update
1795 # must be 192
1796 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1797 FUNC_RESTORE
1798 RET
1799key_128_dec_update:
1800 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1801 FUNC_RESTORE
1802 RET
1803key_256_dec_update:
1804 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1805 FUNC_RESTORE
1806 RET
1807SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)
1808
1809###############################################################################
1810#void aesni_gcm_finalize_avx_gen2(
1811# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
1812# gcm_context_data *data,
#        u8      *auth_tag, /* Authentication Tag output. */
#        u64     auth_tag_len) /* Authentication Tag Length in bytes.
1815# Valid values are 16 (most likely), 12 or 8. */
1816###############################################################################
1817SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
1818 FUNC_SAVE
1819 mov keysize,%eax
1820 cmp $32, %eax
1821 je key_256_finalize
1822 cmp $16, %eax
1823 je key_128_finalize
1824 # must be 192
1825 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1826 FUNC_RESTORE
1827 RET
1828key_128_finalize:
1829 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1830 FUNC_RESTORE
1831 RET
1832key_256_finalize:
1833 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1834 FUNC_RESTORE
1835 RET
1836SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
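# The arg3/arg4 operands passed to GCM_COMPLETE above are the auth_tag
# pointer and auth_tag_len from the prototype comment; GCM_COMPLETE (defined
# earlier in this file) is expected to GHASH the final length block, encrypt
# the result with the saved initial counter block, and store the first 8, 12
# or 16 bytes of the tag as selected by auth_tag_len.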
1837
1838###############################################################################
1839# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
1840# Input: A and B (128-bits each, bit-reflected)
1841# Output: C = A*B*x mod poly, (i.e. >>1 )
1842# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1843# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1844###############################################################################
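# (Informal reading of the identity above, not from the original sources: in
# the bit-reflected representation a one-bit left shift corresponds to
# division by x, so feeding HK = HashKey<<1 pre-divides by x and the extra
# factor of x produced by the multiply/reduction below cancels it, leaving
# exactly GH*HashKey mod poly.)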
1845.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1846
1847 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1848 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1849 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1850 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1851 vpxor \T3, \GH, \GH
1852
1853
1854 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1855 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1856
1857 vpxor \T3, \T1, \T1
1858 vpxor \T2, \GH, \GH
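	# <T1:GH> now holds the full 256-bit carry-less product a*b
	# (T1 = upper 128 bits, GH = lower 128 bits), ready for reduction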
1859
1860 #######################################################################
1861 #first phase of the reduction
1862 vmovdqa POLY2(%rip), \T3
1863
1864 vpclmulqdq $0x01, \GH, \T3, \T2
1865 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1866
1867 vpxor \T2, \GH, \GH # first phase of the reduction complete
1868 #######################################################################
1869 #second phase of the reduction
1870 vpclmulqdq $0x00, \GH, \T3, \T2
1871 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1872
1873 vpclmulqdq $0x10, \GH, \T3, \GH
1874 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1875
1876 vpxor \T2, \GH, \GH # second phase of the reduction complete
1877 #######################################################################
1878 vpxor \T1, \GH, \GH # the result is in GH
1879
1880
1881.endm
1882
1883.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1884
1885 # Precompute HashKey^2 through HashKey^8 (each <<1 mod poly) for the 8-way parallel GHASH
1886 vmovdqa \HK, \T5
1887 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1888 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1889
1890 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1891 vmovdqu \T5, HashKey_3(arg2)
1892
1893 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1894 vmovdqu \T5, HashKey_4(arg2)
1895
1896 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1897 vmovdqu \T5, HashKey_5(arg2)
1898
1899 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1900 vmovdqu \T5, HashKey_6(arg2)
1901
1902 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1903 vmovdqu \T5, HashKey_7(arg2)
1904
1905 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1906 vmovdqu \T5, HashKey_8(arg2)
1907
1908.endm
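# Illustrative pseudocode for the macro above (names as in this file):
#
#	T5 = HK                           # HashKey<<1 mod poly, set up by INIT
#	for i = 2 .. 8:
#		T5 = GHASH_MUL_AVX2(T5, HK)   # T5 = HashKey^i <<1 mod poly
#		HashKey_i(arg2) = T5
#
# These powers of the hash key let the 8-way loop below fold 8 ciphertext
# blocks into the GHASH accumulator per pass.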
1909
1910## if a = number of total plaintext bytes
1911## b = floor(a/16)
1912## num_initial_blocks = b mod 8
1913## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
1914## r10, r11, r12, rax are clobbered
1915## arg1, arg2, arg3, arg4 are used as pointers only, not modified
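## worked example: a = 200 bytes -> b = floor(200/16) = 12 full blocks ->
## num_initial_blocks = 12 mod 8 = 4; those 4 blocks are encrypted and
## GHASHed here, and the remaining full blocks are then handled 8 at a time.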
1916
1917.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1918 i = (8-\num_initial_blocks)
1919 setreg
1920 vmovdqu AadHash(arg2), reg_i
1921
1922 # start AES for num_initial_blocks blocks
1923 vmovdqu CurCount(arg2), \CTR
1924
1925 i = (9-\num_initial_blocks)
1926 setreg
1927.rep \num_initial_blocks
1928 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1929 vmovdqa \CTR, reg_i
1930 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1931 i = (i+1)
1932 setreg
1933.endr
1934
1935 vmovdqa (arg1), \T_key
1936 i = (9-\num_initial_blocks)
1937 setreg
1938.rep \num_initial_blocks
1939 vpxor \T_key, reg_i, reg_i
1940 i = (i+1)
1941 setreg
1942.endr
1943
1944 j = 1
1945 setreg
1946.rep \REP
1947 vmovdqa 16*j(arg1), \T_key
1948 i = (9-\num_initial_blocks)
1949 setreg
1950.rep \num_initial_blocks
1951 vaesenc \T_key, reg_i, reg_i
1952 i = (i+1)
1953 setreg
1954.endr
1955
1956 j = (j+1)
1957 setreg
1958.endr
1959
1960
1961 vmovdqa 16*j(arg1), \T_key
1962 i = (9-\num_initial_blocks)
1963 setreg
1964.rep \num_initial_blocks
1965 vaesenclast \T_key, reg_i, reg_i
1966 i = (i+1)
1967 setreg
1968.endr
1969
1970 i = (9-\num_initial_blocks)
1971 setreg
1972.rep \num_initial_blocks
1973 vmovdqu (arg4, %r11), \T1
1974 vpxor \T1, reg_i, reg_i
1975 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
1976 # num_initial_blocks blocks
1977 add $16, %r11
1978.if \ENC_DEC == DEC
1979 vmovdqa \T1, reg_i
1980.endif
1981 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1982 i = (i+1)
1983 setreg
1984.endr
1985
1986
1987 i = (8-\num_initial_blocks)
1988 j = (9-\num_initial_blocks)
1989 setreg
1990
1991.rep \num_initial_blocks
1992 vpxor reg_i, reg_j, reg_j
1993 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1994 i = (i+1)
1995 j = (j+1)
1996 setreg
1997.endr
1998 # XMM8 has the combined result here
1999
2000 vmovdqa \XMM8, TMP1(%rsp)
2001 vmovdqa \XMM8, \T3
2002
2003 cmp $128, %r13
2004 jl .L_initial_blocks_done\@ # no need for precomputed constants
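	# (the jump above skips the block below, which encrypts the next 8
	# blocks ahead of time; with fewer than 128 bytes left in %r13 the
	# 8-way main loop will not run, so there is nothing to prime)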
2005
2006###############################################################################
2007# Encrypt/decrypt the next 8 data blocks up front; their GHASH is deferred
2007# to the first iteration of the main encrypt/GHASH loop
2008 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2009 vmovdqa \CTR, \XMM1
2010 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2011
2012 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2013 vmovdqa \CTR, \XMM2
2014 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2015
2016 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2017 vmovdqa \CTR, \XMM3
2018 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2019
2020 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2021 vmovdqa \CTR, \XMM4
2022 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2023
2024 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2025 vmovdqa \CTR, \XMM5
2026 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2027
2028 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2029 vmovdqa \CTR, \XMM6
2030 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2031
2032 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2033 vmovdqa \CTR, \XMM7
2034 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2035
2036 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2037 vmovdqa \CTR, \XMM8
2038 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2039
2040 vmovdqa (arg1), \T_key
2041 vpxor \T_key, \XMM1, \XMM1
2042 vpxor \T_key, \XMM2, \XMM2
2043 vpxor \T_key, \XMM3, \XMM3
2044 vpxor \T_key, \XMM4, \XMM4
2045 vpxor \T_key, \XMM5, \XMM5
2046 vpxor \T_key, \XMM6, \XMM6
2047 vpxor \T_key, \XMM7, \XMM7
2048 vpxor \T_key, \XMM8, \XMM8
2049
2050 i = 1
2051 setreg
2052.rep \REP # do REP rounds
2053 vmovdqa 16*i(arg1), \T_key
2054 vaesenc \T_key, \XMM1, \XMM1
2055 vaesenc \T_key, \XMM2, \XMM2
2056 vaesenc \T_key, \XMM3, \XMM3
2057 vaesenc \T_key, \XMM4, \XMM4
2058 vaesenc \T_key, \XMM5, \XMM5
2059 vaesenc \T_key, \XMM6, \XMM6
2060 vaesenc \T_key, \XMM7, \XMM7
2061 vaesenc \T_key, \XMM8, \XMM8
2062 i = (i+1)
2063 setreg
2064.endr
2065
2066
2067 vmovdqa 16*i(arg1), \T_key
2068 vaesenclast \T_key, \XMM1, \XMM1
2069 vaesenclast \T_key, \XMM2, \XMM2
2070 vaesenclast \T_key, \XMM3, \XMM3
2071 vaesenclast \T_key, \XMM4, \XMM4
2072 vaesenclast \T_key, \XMM5, \XMM5
2073 vaesenclast \T_key, \XMM6, \XMM6
2074 vaesenclast \T_key, \XMM7, \XMM7
2075 vaesenclast \T_key, \XMM8, \XMM8
2076
2077 vmovdqu (arg4, %r11), \T1
2078 vpxor \T1, \XMM1, \XMM1
2079 vmovdqu \XMM1, (arg3 , %r11)
2080 .if \ENC_DEC == DEC
2081 vmovdqa \T1, \XMM1
2082 .endif
2083
2084 vmovdqu 16*1(arg4, %r11), \T1
2085 vpxor \T1, \XMM2, \XMM2
2086 vmovdqu \XMM2, 16*1(arg3 , %r11)
2087 .if \ENC_DEC == DEC
2088 vmovdqa \T1, \XMM2
2089 .endif
2090
2091 vmovdqu 16*2(arg4, %r11), \T1
2092 vpxor \T1, \XMM3, \XMM3
2093 vmovdqu \XMM3, 16*2(arg3 , %r11)
2094 .if \ENC_DEC == DEC
2095 vmovdqa \T1, \XMM3
2096 .endif
2097
2098 vmovdqu 16*3(arg4, %r11), \T1
2099 vpxor \T1, \XMM4, \XMM4
2100 vmovdqu \XMM4, 16*3(arg3 , %r11)
2101 .if \ENC_DEC == DEC
2102 vmovdqa \T1, \XMM4
2103 .endif
2104
2105 vmovdqu 16*4(arg4, %r11), \T1
2106 vpxor \T1, \XMM5, \XMM5
2107 vmovdqu \XMM5, 16*4(arg3 , %r11)
2108 .if \ENC_DEC == DEC
2109 vmovdqa \T1, \XMM5
2110 .endif
2111
2112 vmovdqu 16*5(arg4, %r11), \T1
2113 vpxor \T1, \XMM6, \XMM6
2114 vmovdqu \XMM6, 16*5(arg3 , %r11)
2115 .if \ENC_DEC == DEC
2116 vmovdqa \T1, \XMM6
2117 .endif
2118
2119 vmovdqu 16*6(arg4, %r11), \T1
2120 vpxor \T1, \XMM7, \XMM7
2121 vmovdqu \XMM7, 16*6(arg3 , %r11)
2122 .if \ENC_DEC == DEC
2123 vmovdqa \T1, \XMM7
2124 .endif
2125
2126 vmovdqu 16*7(arg4, %r11), \T1
2127 vpxor \T1, \XMM8, \XMM8
2128 vmovdqu \XMM8, 16*7(arg3 , %r11)
2129 .if \ENC_DEC == DEC
2130 vmovdqa \T1, \XMM8
2131 .endif
2132
2133 add $128, %r11
2134
2135 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2136 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with
2137 # the corresponding ciphertext
2138 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2139 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2140 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2141 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2142 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2143 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2144 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2145
2146###############################################################################
2147
2148.L_initial_blocks_done\@:
2149
2150
2151.endm
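# Illustrative summary (not from the original): when at least 128 bytes were
# left, INITIAL_BLOCKS_AVX2 exits with %r11 advanced past everything written
# so far, and XMM1..XMM8 holding the byte-swapped ciphertext of the next 8
# blocks (XMM1 additionally XORed with the GHASH accumulated so far); their
# GHASH is carried out by the first GHASH_8_ENCRYPT_8_PARALLEL_AVX2 call.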
2152
2153
2154
2155# encrypt 8 blocks at a time
2156# ghash the 8 previously encrypted ciphertext blocks
2157# arg1, arg2, arg3, arg4 are used as pointers only, not modified
2158# r11 is the data offset value
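# Each call below overlaps the two independent dependency chains of GCM: the
# 8 new counter blocks advance through the AES rounds (vaesenc) while the 8
# ciphertext blocks from the previous call, parked in \T2 and TMP2..TMP8,
# are multiplied by HashKey_8..HashKey and folded into the running GHASH
# (vpclmulqdq), so the multiplier latency hides behind the cipher rounds.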
2159.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2160
2161 vmovdqa \XMM1, \T2
2162 vmovdqa \XMM2, TMP2(%rsp)
2163 vmovdqa \XMM3, TMP3(%rsp)
2164 vmovdqa \XMM4, TMP4(%rsp)
2165 vmovdqa \XMM5, TMP5(%rsp)
2166 vmovdqa \XMM6, TMP6(%rsp)
2167 vmovdqa \XMM7, TMP7(%rsp)
2168 vmovdqa \XMM8, TMP8(%rsp)
2169
2170.if \loop_idx == in_order
2171 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2172 vpaddd ONE(%rip), \XMM1, \XMM2
2173 vpaddd ONE(%rip), \XMM2, \XMM3
2174 vpaddd ONE(%rip), \XMM3, \XMM4
2175 vpaddd ONE(%rip), \XMM4, \XMM5
2176 vpaddd ONE(%rip), \XMM5, \XMM6
2177 vpaddd ONE(%rip), \XMM6, \XMM7
2178 vpaddd ONE(%rip), \XMM7, \XMM8
2179 vmovdqa \XMM8, \CTR
2180
2181 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2182 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2183 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2184 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2185 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2186 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2187 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2188 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2189.else
2190 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2191 vpaddd ONEf(%rip), \XMM1, \XMM2
2192 vpaddd ONEf(%rip), \XMM2, \XMM3
2193 vpaddd ONEf(%rip), \XMM3, \XMM4
2194 vpaddd ONEf(%rip), \XMM4, \XMM5
2195 vpaddd ONEf(%rip), \XMM5, \XMM6
2196 vpaddd ONEf(%rip), \XMM6, \XMM7
2197 vpaddd ONEf(%rip), \XMM7, \XMM8
2198 vmovdqa \XMM8, \CTR
2199.endif
2200
2201
2202 #######################################################################
2203
2204 vmovdqu (arg1), \T1
2205 vpxor \T1, \XMM1, \XMM1
2206 vpxor \T1, \XMM2, \XMM2
2207 vpxor \T1, \XMM3, \XMM3
2208 vpxor \T1, \XMM4, \XMM4
2209 vpxor \T1, \XMM5, \XMM5
2210 vpxor \T1, \XMM6, \XMM6
2211 vpxor \T1, \XMM7, \XMM7
2212 vpxor \T1, \XMM8, \XMM8
2213
2214 #######################################################################
2215
2216
2217
2218
2219
2220 vmovdqu 16*1(arg1), \T1
2221 vaesenc \T1, \XMM1, \XMM1
2222 vaesenc \T1, \XMM2, \XMM2
2223 vaesenc \T1, \XMM3, \XMM3
2224 vaesenc \T1, \XMM4, \XMM4
2225 vaesenc \T1, \XMM5, \XMM5
2226 vaesenc \T1, \XMM6, \XMM6
2227 vaesenc \T1, \XMM7, \XMM7
2228 vaesenc \T1, \XMM8, \XMM8
2229
2230 vmovdqu 16*2(arg1), \T1
2231 vaesenc \T1, \XMM1, \XMM1
2232 vaesenc \T1, \XMM2, \XMM2
2233 vaesenc \T1, \XMM3, \XMM3
2234 vaesenc \T1, \XMM4, \XMM4
2235 vaesenc \T1, \XMM5, \XMM5
2236 vaesenc \T1, \XMM6, \XMM6
2237 vaesenc \T1, \XMM7, \XMM7
2238 vaesenc \T1, \XMM8, \XMM8
2239
2240
2241 #######################################################################
2242
2243 vmovdqu HashKey_8(arg2), \T5
2244 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2245 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2246 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2247 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2248 vpxor \T5, \T6, \T6
2249
2250 vmovdqu 16*3(arg1), \T1
2251 vaesenc \T1, \XMM1, \XMM1
2252 vaesenc \T1, \XMM2, \XMM2
2253 vaesenc \T1, \XMM3, \XMM3
2254 vaesenc \T1, \XMM4, \XMM4
2255 vaesenc \T1, \XMM5, \XMM5
2256 vaesenc \T1, \XMM6, \XMM6
2257 vaesenc \T1, \XMM7, \XMM7
2258 vaesenc \T1, \XMM8, \XMM8
2259
2260 vmovdqa TMP2(%rsp), \T1
2261 vmovdqu HashKey_7(arg2), \T5
2262 vpclmulqdq $0x11, \T5, \T1, \T3
2263 vpxor \T3, \T4, \T4
2264
2265 vpclmulqdq $0x00, \T5, \T1, \T3
2266 vpxor \T3, \T7, \T7
2267
2268 vpclmulqdq $0x01, \T5, \T1, \T3
2269 vpxor \T3, \T6, \T6
2270
2271 vpclmulqdq $0x10, \T5, \T1, \T3
2272 vpxor \T3, \T6, \T6
2273
2274 vmovdqu 16*4(arg1), \T1
2275 vaesenc \T1, \XMM1, \XMM1
2276 vaesenc \T1, \XMM2, \XMM2
2277 vaesenc \T1, \XMM3, \XMM3
2278 vaesenc \T1, \XMM4, \XMM4
2279 vaesenc \T1, \XMM5, \XMM5
2280 vaesenc \T1, \XMM6, \XMM6
2281 vaesenc \T1, \XMM7, \XMM7
2282 vaesenc \T1, \XMM8, \XMM8
2283
2284 #######################################################################
2285
2286 vmovdqa TMP3(%rsp), \T1
2287 vmovdqu HashKey_6(arg2), \T5
2288 vpclmulqdq $0x11, \T5, \T1, \T3
2289 vpxor \T3, \T4, \T4
2290
2291 vpclmulqdq $0x00, \T5, \T1, \T3
2292 vpxor \T3, \T7, \T7
2293
2294 vpclmulqdq $0x01, \T5, \T1, \T3
2295 vpxor \T3, \T6, \T6
2296
2297 vpclmulqdq $0x10, \T5, \T1, \T3
2298 vpxor \T3, \T6, \T6
2299
2300 vmovdqu 16*5(arg1), \T1
2301 vaesenc \T1, \XMM1, \XMM1
2302 vaesenc \T1, \XMM2, \XMM2
2303 vaesenc \T1, \XMM3, \XMM3
2304 vaesenc \T1, \XMM4, \XMM4
2305 vaesenc \T1, \XMM5, \XMM5
2306 vaesenc \T1, \XMM6, \XMM6
2307 vaesenc \T1, \XMM7, \XMM7
2308 vaesenc \T1, \XMM8, \XMM8
2309
2310 vmovdqa TMP4(%rsp), \T1
2311 vmovdqu HashKey_5(arg2), \T5
2312 vpclmulqdq $0x11, \T5, \T1, \T3
2313 vpxor \T3, \T4, \T4
2314
2315 vpclmulqdq $0x00, \T5, \T1, \T3
2316 vpxor \T3, \T7, \T7
2317
2318 vpclmulqdq $0x01, \T5, \T1, \T3
2319 vpxor \T3, \T6, \T6
2320
2321 vpclmulqdq $0x10, \T5, \T1, \T3
2322 vpxor \T3, \T6, \T6
2323
2324 vmovdqu 16*6(arg1), \T1
2325 vaesenc \T1, \XMM1, \XMM1
2326 vaesenc \T1, \XMM2, \XMM2
2327 vaesenc \T1, \XMM3, \XMM3
2328 vaesenc \T1, \XMM4, \XMM4
2329 vaesenc \T1, \XMM5, \XMM5
2330 vaesenc \T1, \XMM6, \XMM6
2331 vaesenc \T1, \XMM7, \XMM7
2332 vaesenc \T1, \XMM8, \XMM8
2333
2334
2335 vmovdqa TMP5(%rsp), \T1
2336 vmovdqu HashKey_4(arg2), \T5
2337 vpclmulqdq $0x11, \T5, \T1, \T3
2338 vpxor \T3, \T4, \T4
2339
2340 vpclmulqdq $0x00, \T5, \T1, \T3
2341 vpxor \T3, \T7, \T7
2342
2343 vpclmulqdq $0x01, \T5, \T1, \T3
2344 vpxor \T3, \T6, \T6
2345
2346 vpclmulqdq $0x10, \T5, \T1, \T3
2347 vpxor \T3, \T6, \T6
2348
2349 vmovdqu 16*7(arg1), \T1
2350 vaesenc \T1, \XMM1, \XMM1
2351 vaesenc \T1, \XMM2, \XMM2
2352 vaesenc \T1, \XMM3, \XMM3
2353 vaesenc \T1, \XMM4, \XMM4
2354 vaesenc \T1, \XMM5, \XMM5
2355 vaesenc \T1, \XMM6, \XMM6
2356 vaesenc \T1, \XMM7, \XMM7
2357 vaesenc \T1, \XMM8, \XMM8
2358
2359 vmovdqa TMP6(%rsp), \T1
2360 vmovdqu HashKey_3(arg2), \T5
2361 vpclmulqdq $0x11, \T5, \T1, \T3
2362 vpxor \T3, \T4, \T4
2363
2364 vpclmulqdq $0x00, \T5, \T1, \T3
2365 vpxor \T3, \T7, \T7
2366
2367 vpclmulqdq $0x01, \T5, \T1, \T3
2368 vpxor \T3, \T6, \T6
2369
2370 vpclmulqdq $0x10, \T5, \T1, \T3
2371 vpxor \T3, \T6, \T6
2372
2373 vmovdqu 16*8(arg1), \T1
2374 vaesenc \T1, \XMM1, \XMM1
2375 vaesenc \T1, \XMM2, \XMM2
2376 vaesenc \T1, \XMM3, \XMM3
2377 vaesenc \T1, \XMM4, \XMM4
2378 vaesenc \T1, \XMM5, \XMM5
2379 vaesenc \T1, \XMM6, \XMM6
2380 vaesenc \T1, \XMM7, \XMM7
2381 vaesenc \T1, \XMM8, \XMM8
2382
2383 vmovdqa TMP7(%rsp), \T1
2384 vmovdqu HashKey_2(arg2), \T5
2385 vpclmulqdq $0x11, \T5, \T1, \T3
2386 vpxor \T3, \T4, \T4
2387
2388 vpclmulqdq $0x00, \T5, \T1, \T3
2389 vpxor \T3, \T7, \T7
2390
2391 vpclmulqdq $0x01, \T5, \T1, \T3
2392 vpxor \T3, \T6, \T6
2393
2394 vpclmulqdq $0x10, \T5, \T1, \T3
2395 vpxor \T3, \T6, \T6
2396
2397
2398 #######################################################################
2399
2400 vmovdqu 16*9(arg1), \T5
2401 vaesenc \T5, \XMM1, \XMM1
2402 vaesenc \T5, \XMM2, \XMM2
2403 vaesenc \T5, \XMM3, \XMM3
2404 vaesenc \T5, \XMM4, \XMM4
2405 vaesenc \T5, \XMM5, \XMM5
2406 vaesenc \T5, \XMM6, \XMM6
2407 vaesenc \T5, \XMM7, \XMM7
2408 vaesenc \T5, \XMM8, \XMM8
2409
2410 vmovdqa TMP8(%rsp), \T1
2411 vmovdqu HashKey(arg2), \T5
2412
2413 vpclmulqdq $0x00, \T5, \T1, \T3
2414 vpxor \T3, \T7, \T7
2415
2416 vpclmulqdq $0x01, \T5, \T1, \T3
2417 vpxor \T3, \T6, \T6
2418
2419 vpclmulqdq $0x10, \T5, \T1, \T3
2420 vpxor \T3, \T6, \T6
2421
2422 vpclmulqdq $0x11, \T5, \T1, \T3
2423 vpxor \T3, \T4, \T1
2424
2425
2426 vmovdqu 16*10(arg1), \T5
2427
2428 i = 11
2429 setreg
2430.rep (\REP-9)
2431 vaesenc \T5, \XMM1, \XMM1
2432 vaesenc \T5, \XMM2, \XMM2
2433 vaesenc \T5, \XMM3, \XMM3
2434 vaesenc \T5, \XMM4, \XMM4
2435 vaesenc \T5, \XMM5, \XMM5
2436 vaesenc \T5, \XMM6, \XMM6
2437 vaesenc \T5, \XMM7, \XMM7
2438 vaesenc \T5, \XMM8, \XMM8
2439
2440 vmovdqu 16*i(arg1), \T5
2441 i = i + 1
2442 setreg
2443.endr
2444
2445 i = 0
2446 j = 1
2447 setreg
2448.rep 8
2449 vpxor 16*i(arg4, %r11), \T5, \T2
2450 .if \ENC_DEC == ENC
2451 vaesenclast \T2, reg_j, reg_j
2452 .else
2453 vaesenclast \T2, reg_j, \T3
2454 vmovdqu 16*i(arg4, %r11), reg_j
2455 vmovdqu \T3, 16*i(arg3, %r11)
2456 .endif
2457 i = (i+1)
2458 j = (j+1)
2459 setreg
2460.endr
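	# For ENC the vaesenclast output is the ciphertext and is kept in
	# reg_j so the next call can GHASH it; for DEC the plaintext is
	# written out from \T3 and the original ciphertext is reloaded into
	# reg_j, since GHASH is always computed over the ciphertext.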
2461 #######################################################################
2462
2463
2464 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
2465 vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
2466 vpxor \T3, \T7, \T7
2467 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2468
2469
2470
2471 #######################################################################
2472 #first phase of the reduction
2473 vmovdqa POLY2(%rip), \T3
2474
2475 vpclmulqdq $0x01, \T7, \T3, \T2
2476 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2477
2478 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2479 #######################################################################
2480 .if \ENC_DEC == ENC
2481 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
2482 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
2483 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
2484 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
2485 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
2486 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
2487 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
2488 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
2489 .endif
2490
2491 #######################################################################
2492 #second phase of the reduction
2493 vpclmulqdq $0x00, \T7, \T3, \T2
2494 vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2495
2496 vpclmulqdq $0x10, \T7, \T3, \T4
2497 vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts)
2498
2499 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2500 #######################################################################
2501 vpxor \T4, \T1, \T1 # the result is in T1
2502
2503 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2504 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2505 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2506 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2507 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2508 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2509 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2510 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2511
2512
2513 vpxor \T1, \XMM1, \XMM1
2514
2515
2516
2517.endm
2518
2519
2520# GHASH the last 8 ciphertext blocks.
2521.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2522
2523 ## Karatsuba Method
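	## Informal sketch of the identity used below, for one block
	## (all additions are XORs in GF(2)[x]):
	##   a = a1*x^64 + a0,  b = b1*x^64 + b0
	##   a*b = a1*b1*x^128
	##       + [ (a1+a0)*(b1+b0) + a1*b1 + a0*b0 ]*x^64
	##       + a0*b0
	## T6 accumulates the a1*b1 terms, T7 the a0*b0 terms and XMM1 the
	## (a1+a0)*(b1+b0) terms over all 8 blocks; the two correction XORs
	## and the x^64 alignment are applied once at the end, before the
	## POLY2 reduction.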
2524
2525 vmovdqu HashKey_8(arg2), \T5
2526
2527 vpshufd $0b01001110, \XMM1, \T2
2528 vpshufd $0b01001110, \T5, \T3
2529 vpxor \XMM1, \T2, \T2
2530 vpxor \T5, \T3, \T3
2531
2532 vpclmulqdq $0x11, \T5, \XMM1, \T6
2533 vpclmulqdq $0x00, \T5, \XMM1, \T7
2534
2535 vpclmulqdq $0x00, \T3, \T2, \XMM1
2536
2537 ######################
2538
2539 vmovdqu HashKey_7(arg2), \T5
2540 vpshufd $0b01001110, \XMM2, \T2
2541 vpshufd $0b01001110, \T5, \T3
2542 vpxor \XMM2, \T2, \T2
2543 vpxor \T5, \T3, \T3
2544
2545 vpclmulqdq $0x11, \T5, \XMM2, \T4
2546 vpxor \T4, \T6, \T6
2547
2548 vpclmulqdq $0x00, \T5, \XMM2, \T4
2549 vpxor \T4, \T7, \T7
2550
2551 vpclmulqdq $0x00, \T3, \T2, \T2
2552
2553 vpxor \T2, \XMM1, \XMM1
2554
2555 ######################
2556
2557 vmovdqu HashKey_6(arg2), \T5
2558 vpshufd $0b01001110, \XMM3, \T2
2559 vpshufd $0b01001110, \T5, \T3
2560 vpxor \XMM3, \T2, \T2
2561 vpxor \T5, \T3, \T3
2562
2563 vpclmulqdq $0x11, \T5, \XMM3, \T4
2564 vpxor \T4, \T6, \T6
2565
2566 vpclmulqdq $0x00, \T5, \XMM3, \T4
2567 vpxor \T4, \T7, \T7
2568
2569 vpclmulqdq $0x00, \T3, \T2, \T2
2570
2571 vpxor \T2, \XMM1, \XMM1
2572
2573 ######################
2574
2575 vmovdqu HashKey_5(arg2), \T5
2576 vpshufd $0b01001110, \XMM4, \T2
2577 vpshufd $0b01001110, \T5, \T3
2578 vpxor \XMM4, \T2, \T2
2579 vpxor \T5, \T3, \T3
2580
2581 vpclmulqdq $0x11, \T5, \XMM4, \T4
2582 vpxor \T4, \T6, \T6
2583
2584 vpclmulqdq $0x00, \T5, \XMM4, \T4
2585 vpxor \T4, \T7, \T7
2586
2587 vpclmulqdq $0x00, \T3, \T2, \T2
2588
2589 vpxor \T2, \XMM1, \XMM1
2590
2591 ######################
2592
2593 vmovdqu HashKey_4(arg2), \T5
2594 vpshufd $0b01001110, \XMM5, \T2
2595 vpshufd $0b01001110, \T5, \T3
2596 vpxor \XMM5, \T2, \T2
2597 vpxor \T5, \T3, \T3
2598
2599 vpclmulqdq $0x11, \T5, \XMM5, \T4
2600 vpxor \T4, \T6, \T6
2601
2602 vpclmulqdq $0x00, \T5, \XMM5, \T4
2603 vpxor \T4, \T7, \T7
2604
2605 vpclmulqdq $0x00, \T3, \T2, \T2
2606
2607 vpxor \T2, \XMM1, \XMM1
2608
2609 ######################
2610
2611 vmovdqu HashKey_3(arg2), \T5
2612 vpshufd $0b01001110, \XMM6, \T2
2613 vpshufd $0b01001110, \T5, \T3
2614 vpxor \XMM6, \T2, \T2
2615 vpxor \T5, \T3, \T3
2616
2617 vpclmulqdq $0x11, \T5, \XMM6, \T4
2618 vpxor \T4, \T6, \T6
2619
2620 vpclmulqdq $0x00, \T5, \XMM6, \T4
2621 vpxor \T4, \T7, \T7
2622
2623 vpclmulqdq $0x00, \T3, \T2, \T2
2624
2625 vpxor \T2, \XMM1, \XMM1
2626
2627 ######################
2628
2629 vmovdqu HashKey_2(arg2), \T5
2630 vpshufd $0b01001110, \XMM7, \T2
2631 vpshufd $0b01001110, \T5, \T3
2632 vpxor \XMM7, \T2, \T2
2633 vpxor \T5, \T3, \T3
2634
2635 vpclmulqdq $0x11, \T5, \XMM7, \T4
2636 vpxor \T4, \T6, \T6
2637
2638 vpclmulqdq $0x00, \T5, \XMM7, \T4
2639 vpxor \T4, \T7, \T7
2640
2641 vpclmulqdq $0x00, \T3, \T2, \T2
2642
2643 vpxor \T2, \XMM1, \XMM1
2644
2645 ######################
2646
2647 vmovdqu HashKey(arg2), \T5
2648 vpshufd $0b01001110, \XMM8, \T2
2649 vpshufd $0b01001110, \T5, \T3
2650 vpxor \XMM8, \T2, \T2
2651 vpxor \T5, \T3, \T3
2652
2653 vpclmulqdq $0x11, \T5, \XMM8, \T4
2654 vpxor \T4, \T6, \T6
2655
2656 vpclmulqdq $0x00, \T5, \XMM8, \T4
2657 vpxor \T4, \T7, \T7
2658
2659 vpclmulqdq $0x00, \T3, \T2, \T2
2660
2661 vpxor \T2, \XMM1, \XMM1
2662 vpxor \T6, \XMM1, \XMM1
2663 vpxor \T7, \XMM1, \T2
2664
2665
2666
2667
2668 vpslldq $8, \T2, \T4
2669 vpsrldq $8, \T2, \T2
2670
2671 vpxor \T4, \T7, \T7
2672 vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
2673 # accumulated carry-less multiplications
2674
2675 #######################################################################
2676 #first phase of the reduction
2677 vmovdqa POLY2(%rip), \T3
2678
2679 vpclmulqdq $0x01, \T7, \T3, \T2
2680 vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
2681
2682 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2683 #######################################################################
2684
2685
2686 #second phase of the reduction
2687 vpclmulqdq $0x00, \T7, \T3, \T2
2688 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2689
2690 vpclmulqdq $0x10, \T7, \T3, \T4
2691 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2692
2693 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2694 #######################################################################
2695 vpxor \T4, \T6, \T6 # the result is in T6
2696.endm
2697
2698
2699
2700#############################################################
2701#void aesni_gcm_init_avx_gen4
2702# (gcm_data *my_ctx_data,
2703# gcm_context_data *data,
2704# u8 *iv, /* Pre-counter block j0: 4 byte salt
2705# (from Security Association) concatenated with 8 byte
2706# Initialisation Vector (from IPSec ESP Payload)
2707# concatenated with 0x00000001. 16-byte aligned pointer. */
2708# u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
2709# const u8 *aad, /* Additional Authentication Data (AAD)*/
2710# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2711#############################################################
2712SYM_FUNC_START(aesni_gcm_init_avx_gen4)
2713 FUNC_SAVE
2714 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2715 FUNC_RESTORE
2716 RET
2717SYM_FUNC_END(aesni_gcm_init_avx_gen4)
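# Apart from the helper macros they instantiate, the _avx_gen4 entry points
# below mirror the _avx_gen2 ones above one for one: GCM_ENC_DEC and
# GCM_COMPLETE are expanded with the AVX2 variants (INITIAL_BLOCKS_AVX2,
# GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2) and the
# same 9/11/13 round-count dispatch on keysize.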
2718
2719###############################################################################
2720#void aesni_gcm_enc_update_avx_gen4(
2721# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2722# gcm_context_data *data,
2723# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
2724# const u8 *in, /* Plaintext input */
2725# u64 plaintext_len) /* Length of data in Bytes for encryption. */
2726###############################################################################
2727SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
2728 FUNC_SAVE
2729 mov keysize,%eax
2730 cmp $32, %eax
2731 je key_256_enc_update4
2732 cmp $16, %eax
2733 je key_128_enc_update4
2734 # must be 192
2735 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2736 FUNC_RESTORE
2737 RET
2738key_128_enc_update4:
2739 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2740 FUNC_RESTORE
2741 RET
2742key_256_enc_update4:
2743 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2744 FUNC_RESTORE
2745 RET
2746SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)
2747
2748###############################################################################
2749#void aesni_gcm_dec_update_avx_gen4(
2750# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2751# gcm_context_data *data,
2752# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
2753# const u8 *in, /* Ciphertext input */
2754# u64 plaintext_len) /* Length of data in Bytes for decryption. */
2755###############################################################################
2756SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
2757 FUNC_SAVE
2758 mov keysize,%eax
2759 cmp $32, %eax
2760 je key_256_dec_update4
2761 cmp $16, %eax
2762 je key_128_dec_update4
2763 # must be 192
2764 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2765 FUNC_RESTORE
2766 RET
2767key_128_dec_update4:
2768 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2769 FUNC_RESTORE
2770 RET
2771key_256_dec_update4:
2772 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2773 FUNC_RESTORE
2774 RET
2775SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
2776
2777###############################################################################
2778#void aesni_gcm_finalize_avx_gen4(
2779# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
2780# gcm_context_data *data,
2781# u8 *auth_tag, /* Authenticated Tag output. */
2782# u64 auth_tag_len) /* Authenticated Tag Length in bytes.
2783# Valid values are 16 (most likely), 12 or 8. */
2784###############################################################################
2785SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
2786 FUNC_SAVE
2787 mov keysize,%eax
2788 cmp $32, %eax
2789 je key_256_finalize4
2790 cmp $16, %eax
2791 je key_128_finalize4
2792 # must be 192
2793 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2794 FUNC_RESTORE
2795 RET
2796key_128_finalize4:
2797 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2798 FUNC_RESTORE
2799 RET
2800key_256_finalize4:
2801 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2802 FUNC_RESTORE
2803 RET
2804SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)
2805

# Source: linux/arch/x86/crypto/aesni-intel_avx-x86_64.S