########################################################################
# Implement fast SHA-256 with SSSE3 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
# James Guilford <james.guilford@intel.com>
# Kirk Yap <kirk.s.yap@intel.com>
# Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses. You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or
# without modification, are permitted provided that the following
# conditions are met:
#
# - Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer.
#
# - Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################

#include <linux/linkage.h>
#include <linux/cfi_types.h>

## assume buffers not aligned
#define MOVDQ movdqu

################################ Define Macros

# addm [mem], reg
# Add reg to mem using reg-mem add and store
.macro addm p1 p2
	add \p1, \p2
	mov \p2, \p1
.endm
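# For example, "addm (4*0)(CTX), a" (as used at the end of the transform
# below) adds register a into state[0] in memory and leaves the sum in a.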

################################

# COPY_XMM_AND_BSWAP xmm, [mem], byte_flip_mask
# Load xmm with mem and byte swap each dword
.macro COPY_XMM_AND_BSWAP p1 p2 p3
	MOVDQ \p2, \p1
	pshufb \p3, \p1
.endm
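# For example, "COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK" (used per
# block below) loads 16 message bytes and byte-swaps each dword from
# big-endian message order to native order.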

################################

X0 = %xmm4
X1 = %xmm5
X2 = %xmm6
X3 = %xmm7

XTMP0 = %xmm0
XTMP1 = %xmm1
XTMP2 = %xmm2
XTMP3 = %xmm3
XTMP4 = %xmm8
XFER = %xmm9

SHUF_00BA = %xmm10 # shuffle xBxA -> 00BA
SHUF_DC00 = %xmm11 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %xmm12

NUM_BLKS = %rdx # 3rd arg
INP = %rsi # 2nd arg
CTX = %rdi # 1st arg

SRND = %rsi # clobbers INP
c = %ecx
d = %r8d
e = %edx
TBL = %r12
a = %eax
b = %ebx

f = %r9d
g = %r10d
h = %r11d

y0 = %r13d
y1 = %r14d
y2 = %r15d



_INP_END_SIZE = 8
_INP_SIZE = 8
_XFER_SIZE = 16
_XMM_SAVE_SIZE = 0

_INP_END = 0
_INP = _INP_END + _INP_END_SIZE
_XFER = _INP + _INP_SIZE
_XMM_SAVE = _XFER + _XFER_SIZE
STACK_SIZE = _XMM_SAVE + _XMM_SAVE_SIZE
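
# Frame layout relative to the 16-byte-aligned %rsp set up below:
#   _INP_END (8 bytes)  : pointer just past the last input block
#   _INP     (8 bytes)  : saved input pointer for the current block
#   _XFER   (16 bytes)  : K[t..t+3] + W[t..t+3] for the next four rounds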

# rotate_Xs
# Rotate values of symbols X0...X3
.macro rotate_Xs
X_ = X0
X0 = X1
X1 = X2
X2 = X3
X3 = X_
.endm

# ROTATE_ARGS
# Rotate values of symbols a...h
.macro ROTATE_ARGS
TMP_ = h
h = g
g = f
f = e
e = d
d = c
c = b
b = a
a = TMP_
.endm
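# Renaming the assembler symbols a..h (and X0..X3) instead of moving data
# keeps the per-round rotation of working variables free at run time.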

.macro FOUR_ROUNDS_AND_SCHED
	## compute s0 four at a time and s1 two at a time
	## compute W[-16] + W[-7] 4 at a time
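	##
	## For reference (FIPS 180-4), this macro extends the message schedule
	## four words per invocation:
	##   W[t]  = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16]
	##   s0(x) = (x ror 7)  ^ (x ror 18) ^ (x >> 3)
	##   s1(x) = (x ror 17) ^ (x ror 19) ^ (x >> 10)
	## interleaved with four rounds of the compression function.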
	movdqa X3, XTMP0
	mov e, y0 # y0 = e
	ror $(25-11), y0 # y0 = e >> (25-11)
	mov a, y1 # y1 = a
	palignr $4, X2, XTMP0 # XTMP0 = W[-7]
	ror $(22-13), y1 # y1 = a >> (22-13)
	xor e, y0 # y0 = e ^ (e >> (25-11))
	mov f, y2 # y2 = f
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	movdqa X1, XTMP1
	xor a, y1 # y1 = a ^ (a >> (22-13))
	xor g, y2 # y2 = f^g
	paddd X0, XTMP0 # XTMP0 = W[-7] + W[-16]
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2 # y2 = (f^g)&e
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	## compute s0
	palignr $4, X0, XTMP1 # XTMP1 = W[-15]
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	movdqa XTMP1, XTMP2 # XTMP2 = W[-15]
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y0, y2 # y2 = S1 + CH
	add _XFER(%rsp), y2 # y2 = k + w + S1 + CH
	movdqa XTMP1, XTMP3 # XTMP3 = W[-15]
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	pslld $(32-7), XTMP1 #
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	psrld $7, XTMP2 #
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	por XTMP2, XTMP1 # XTMP1 = W[-15] ror 7
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
	#
	ROTATE_ARGS #
	movdqa XTMP3, XTMP2 # XTMP2 = W[-15]
	mov e, y0 # y0 = e
	mov a, y1 # y1 = a
	movdqa XTMP3, XTMP4 # XTMP4 = W[-15]
	ror $(25-11), y0 # y0 = e >> (25-11)
	xor e, y0 # y0 = e ^ (e >> (25-11))
	mov f, y2 # y2 = f
	ror $(22-13), y1 # y1 = a >> (22-13)
	pslld $(32-18), XTMP3 #
	xor a, y1 # y1 = a ^ (a >> (22-13))
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor g, y2 # y2 = f^g
	psrld $18, XTMP2 #
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2 # y2 = (f^g)&e
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor XTMP3, XTMP1
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	psrld $3, XTMP4 # XTMP4 = W[-15] >> 3
	add y0, y2 # y2 = S1 + CH
	add (1*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	pxor XTMP2, XTMP1 # XTMP1 = W[-15] ror 7 ^ W[-15] ror 18
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	pxor XTMP4, XTMP1 # XTMP1 = s0
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	## compute low s1
	pshufd $0b11111010, X3, XTMP2 # XTMP2 = W[-2] {BBAA}
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	paddd XTMP1, XTMP0 # XTMP0 = W[-16] + W[-7] + s0
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {BBAA}
	mov e, y0 # y0 = e
	mov a, y1 # y1 = a
	ror $(25-11), y0 # y0 = e >> (25-11)
	movdqa XTMP2, XTMP4 # XTMP4 = W[-2] {BBAA}
	xor e, y0 # y0 = e ^ (e >> (25-11))
	ror $(22-13), y1 # y1 = a >> (22-13)
	mov f, y2 # y2 = f
	xor a, y1 # y1 = a ^ (a >> (22-13))
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xBxA}
	xor g, y2 # y2 = f^g
	psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xBxA}
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2 # y2 = (f^g)&e
	psrld $10, XTMP4 # XTMP4 = W[-2] >> 10 {BBAA}
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	pxor XTMP3, XTMP2
	add y0, y2 # y2 = S1 + CH
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add (2*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor XTMP2, XTMP4 # XTMP4 = s1 {xBxA}
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	pshufb SHUF_00BA, XTMP4 # XTMP4 = s1 {00BA}
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	paddd XTMP4, XTMP0 # XTMP0 = {..., ..., W[1], W[0]}
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	## compute high s1
	pshufd $0b01010000, XTMP0, XTMP2 # XTMP2 = W[-2] {DDCC}
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
	#
	ROTATE_ARGS #
	movdqa XTMP2, XTMP3 # XTMP3 = W[-2] {DDCC}
	mov e, y0 # y0 = e
	ror $(25-11), y0 # y0 = e >> (25-11)
	mov a, y1 # y1 = a
	movdqa XTMP2, X0 # X0 = W[-2] {DDCC}
	ror $(22-13), y1 # y1 = a >> (22-13)
	xor e, y0 # y0 = e ^ (e >> (25-11))
	mov f, y2 # y2 = f
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	psrlq $17, XTMP2 # XTMP2 = W[-2] ror 17 {xDxC}
	xor a, y1 # y1 = a ^ (a >> (22-13))
	xor g, y2 # y2 = f^g
	psrlq $19, XTMP3 # XTMP3 = W[-2] ror 19 {xDxC}
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	and e, y2 # y2 = (f^g)&e
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	psrld $10, X0 # X0 = W[-2] >> 10 {DDCC}
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	pxor XTMP3, XTMP2 #
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	add y0, y2 # y2 = S1 + CH
	add (3*4 + _XFER)(%rsp), y2 # y2 = k + w + S1 + CH
	pxor XTMP2, X0 # X0 = s1 {xDxC}
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	pshufb SHUF_DC00, X0 # X0 = s1 {DC00}
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	paddd XTMP0, X0 # X0 = {W[3], W[2], W[1], W[0]}
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ

	ROTATE_ARGS
	rotate_Xs
.endm

## input is [rsp + _XFER + \round * 4]
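## Round computation (FIPS 180-4), for reference:
##   T1 = h + S1(e) + Ch(e,f,g) + K[t] + W[t]
##   T2 = S0(a) + Maj(a,b,c)
##   S1(e) = (e ror 6) ^ (e ror 11) ^ (e ror 25)
##   S0(a) = (a ror 2) ^ (a ror 13) ^ (a ror 22)
## The code adds T1 into d and leaves T1 + T2 in h before ROTATE_ARGS
## renames the working variables.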
.macro DO_ROUND round
	mov e, y0 # y0 = e
	ror $(25-11), y0 # y0 = e >> (25-11)
	mov a, y1 # y1 = a
	xor e, y0 # y0 = e ^ (e >> (25-11))
	ror $(22-13), y1 # y1 = a >> (22-13)
	mov f, y2 # y2 = f
	xor a, y1 # y1 = a ^ (a >> (22-13))
	ror $(11-6), y0 # y0 = (e >> (11-6)) ^ (e >> (25-6))
	xor g, y2 # y2 = f^g
	xor e, y0 # y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
	ror $(13-2), y1 # y1 = (a >> (13-2)) ^ (a >> (22-2))
	and e, y2 # y2 = (f^g)&e
	xor a, y1 # y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
	ror $6, y0 # y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
	xor g, y2 # y2 = CH = ((f^g)&e)^g
	add y0, y2 # y2 = S1 + CH
	ror $2, y1 # y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
	offset = \round * 4 + _XFER
	add offset(%rsp), y2 # y2 = k + w + S1 + CH
	mov a, y0 # y0 = a
	add y2, h # h = h + S1 + CH + k + w
	mov a, y2 # y2 = a
	or c, y0 # y0 = a|c
	add h, d # d = d + h + S1 + CH + k + w
	and c, y2 # y2 = a&c
	and b, y0 # y0 = (a|c)&b
	add y1, h # h = h + S1 + CH + k + w + S0
	or y2, y0 # y0 = MAJ = ((a|c)&b)|(a&c)
	add y0, h # h = h + S1 + CH + k + w + S0 + MAJ
	ROTATE_ARGS
.endm

########################################################################
## void sha256_transform_ssse3(struct sha256_state *state, const u8 *data,
##			       int blocks);
## arg 1 : pointer to state
## (struct sha256_state is assumed to begin with u32 state[8])
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
.text
SYM_TYPED_FUNC_START(sha256_transform_ssse3)
	pushq %rbx
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
	pushq %rbp
	mov %rsp, %rbp

	subq $STACK_SIZE, %rsp
	and $~15, %rsp
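	# %rsp is now 16-byte aligned, which the aligned movdqa accesses to
	# _XFER(%rsp) below rely on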

	shl $6, NUM_BLKS # convert to bytes
	jz .Ldone_hash
	add INP, NUM_BLKS
	mov NUM_BLKS, _INP_END(%rsp) # pointer to end of data

	## load initial digest
	mov 4*0(CTX), a
	mov 4*1(CTX), b
	mov 4*2(CTX), c
	mov 4*3(CTX), d
	mov 4*4(CTX), e
	mov 4*5(CTX), f
	mov 4*6(CTX), g
	mov 4*7(CTX), h

	movdqa PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	movdqa _SHUF_00BA(%rip), SHUF_00BA
	movdqa _SHUF_DC00(%rip), SHUF_DC00

.Lloop0:
	lea K256(%rip), TBL

	## byte swap first 16 dwords
	COPY_XMM_AND_BSWAP X0, 0*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X1, 1*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X2, 2*16(INP), BYTE_FLIP_MASK
	COPY_XMM_AND_BSWAP X3, 3*16(INP), BYTE_FLIP_MASK

	mov INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	mov $3, SRND
.align 16
.Lloop1:
	movdqa (TBL), XFER
	paddd X0, XFER
	movdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa 1*16(TBL), XFER
	paddd X0, XFER
	movdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa 2*16(TBL), XFER
	paddd X0, XFER
	movdqa XFER, _XFER(%rsp)
	FOUR_ROUNDS_AND_SCHED

	movdqa 3*16(TBL), XFER
	paddd X0, XFER
	movdqa XFER, _XFER(%rsp)
	add $4*16, TBL
	FOUR_ROUNDS_AND_SCHED

	sub $1, SRND
	jne .Lloop1

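	## Final 16 rounds: W[48..63] already sit in X0..X3, so only the round
	## constants K[t] still need to be added before each group of rounds.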
	mov $2, SRND
.Lloop2:
	paddd (TBL), X0
	movdqa X0, _XFER(%rsp)
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3
	paddd 1*16(TBL), X1
	movdqa X1, _XFER(%rsp)
	add $2*16, TBL
	DO_ROUND 0
	DO_ROUND 1
	DO_ROUND 2
	DO_ROUND 3

	movdqa X2, X0
	movdqa X3, X1

	sub $1, SRND
	jne .Lloop2

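	## feed-forward: add this block's working variables back into the digest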
	addm (4*0)(CTX),a
	addm (4*1)(CTX),b
	addm (4*2)(CTX),c
	addm (4*3)(CTX),d
	addm (4*4)(CTX),e
	addm (4*5)(CTX),f
	addm (4*6)(CTX),g
	addm (4*7)(CTX),h

	mov _INP(%rsp), INP
	add $64, INP
	cmp _INP_END(%rsp), INP
	jne .Lloop0

.Ldone_hash:

	mov %rbp, %rsp
	popq %rbp
	popq %r15
	popq %r14
	popq %r13
	popq %r12
	popq %rbx

	RET
SYM_FUNC_END(sha256_transform_ssse3)

.section .rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section .rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
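# pshufb mask that reverses the byte order within each 32-bit dword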
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203

.section .rodata.cst16._SHUF_00BA, "aM", @progbits, 16
.align 16
# shuffle xBxA -> 00BA
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100

.section .rodata.cst16._SHUF_DC00, "aM", @progbits, 16
.align 16
# shuffle xDxC -> DC00
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF

