1#define __ARM_ARCH__ __LINUX_ARM_ARCH__
2@ SPDX-License-Identifier: GPL-2.0
3
4@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
5@ has relicensed it under the GPLv2. Therefore this program is free software;
6@ you can redistribute it and/or modify it under the terms of the GNU General
7@ Public License version 2 as published by the Free Software Foundation.
8@
9@ The original headers, including the original license headers, are
10@ included below for completeness.
11
12@ ====================================================================
13@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
14@ project. The module is, however, dual licensed under OpenSSL and
15@ CRYPTOGAMS licenses depending on where you obtain it. For further
16@ details see https://www.openssl.org/~appro/cryptogams/.
17@ ====================================================================
18
19@ sha1_block procedure for ARMv4.
20@
21@ January 2007.
22
23@ Size/performance trade-off
24@ ====================================================================
25@ impl size in bytes comp cycles[*] measured performance
26@ ====================================================================
27@ thumb 304 3212 4420
28@ armv4-small 392/+29% 1958/+64% 2250/+96%
29@ armv4-compact 740/+89% 1552/+26% 1840/+22%
30@ armv4-large 1420/+92% 1307/+19% 1370/+34%[***]
31@ full unroll ~5100/+260% ~1260/+4% ~1300/+5%
32@ ====================================================================
33@ thumb = same as 'small' but in Thumb instructions[**] and
34@ with recurring code in two private functions;
35@ small = detached Xload/update, loops are folded;
36@ compact = detached Xload/update, 5x unroll;
37@ large = interleaved Xload/update, 5x unroll;
38@ full unroll = interleaved Xload/update, full unroll, estimated[!];
39@
40@ [*] Manually counted instructions in "grand" loop body. Measured
41@ performance is affected by prologue and epilogue overhead,
42@ i-cache availability, branch penalties, etc.
43@ [**] While each Thumb instruction is half the size, they are not as
44@ diverse as ARM ones: e.g., there are only two arithmetic
45@ instructions with 3 arguments, no [fixed] rotate, and addressing
46@ modes are limited. As a result it takes more instructions to do
47@ the same job in Thumb, so the code is never half the
48@ size and is always slower.
49@ [***] which is also ~35% better than compiler generated code. Dual-
50@ issue Cortex A8 core was measured to process input block in
51@ ~990 cycles.
52
53@ August 2010.
54@
55@ Rescheduling for dual-issue pipeline resulted in 13% improvement on
56@ Cortex A8 core and in absolute terms ~870 cycles per input block
57@ [or 13.6 cycles per byte].
58
59@ February 2011.
60@
61@ Profiler-assisted and platform-specific optimization resulted in 10%
62@ improvement on Cortex A8 core and 12.2 cycles per byte.
63
64#include <linux/linkage.h>
65
66.text
67
68.align 2
@ sha1_block_data_order: SHA-1 compression function, one 64-byte block per
@ pass of .Lloop.
@
@ Inputs (as used by the code below):
@   r0 = pointer to five 32-bit state words, loaded as A,B,C,D,E and
@        updated in place after every block
@   r1 = pointer to the input data, advanced 4 bytes per loaded word
@   r2 = number of 64-byte blocks; converted to an end pointer on entry
@ The loop condition is tested at the bottom, so at least one block is
@ always processed; a block count of zero is not handled.
@
@ Register roles inside the loop:
@   r3-r7   working variables; the A..E roles move by one register per
@           round, so five consecutive rounds bring them back to
@           r3=A, r4=B, r5=C, r6=D, r7=E
@   r8      round constant K for the current group of rounds
@   r9      message word X[i]
@   r10-r12 scratch (F function, byte assembly, schedule operands)
@   r14     pointer to the most recently stored X[] word; X[] lives below
@           the stack pointer saved here and grows downwards
@   sp      lowered in steps and used as the end marker of each round loop
@
@ The values in the C, D and E roles are held rotated right by 30 bits.
@ Reading such a register with ror#2 yields the true value, and a former B
@ read with ror#2 yields ROL(B,30), so B never needs a separate rotate.
@
@ NOTE(review): the bracketed numbers in trailing comments appear to be the
@ manually counted instructions mentioned in the file header - confirm.
69ENTRY(sha1_block_data_order)
70 stmdb sp!,{r4-r12,lr}
71 add r2,r1,r2,lsl#6 @ r2 to point at the end of r1
72 ldmia r0,{r3,r4,r5,r6,r7}
@ ---- start of one 64-byte block ----
73.Lloop:
74 ldr r8,.LK_00_19
@ r14 starts at the current sp; sp is lowered by 15 words so that the
@ .L_00_15 loop ends once 15 words of X[] have been stored.
75 mov r14,sp
76 sub sp,sp,#15*4
@ Put C, D and E into the rotated-by-30 form expected by the rounds.
77 mov r5,r5,ror#30
78 mov r6,r6,ror#30
79 mov r7,r7,ror#30 @ [6]
@ Rounds 0..14, five rounds per iteration. Each round loads one input
@ word as big-endian, stores it to X[] and computes
@   E += K + ROL(A,5) + X[i] + F_00_19(B,C,D)
@ with F_00_19(B,C,D) = ((C ^ D) & B) ^ D. ROR(A,27) equals ROL(A,5).
80.L_00_15:
81#if __ARM_ARCH__<7
@ Before ARMv7: build the big-endian word in r9 from four byte loads;
@ the last ldrb also advances r1 by 4. The same pattern repeats in every
@ round of this loop.
82 ldrb r10,[r1,#2]
83 ldrb r9,[r1,#3]
84 ldrb r11,[r1,#1]
85 add r7,r8,r7,ror#2 @ E+=K_00_19
86 ldrb r12,[r1],#4
87 orr r9,r9,r10,lsl#8
88 eor r10,r5,r6 @ F_xx_xx
89 orr r9,r9,r11,lsl#16
90 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
91 orr r9,r9,r12,lsl#24
92#else
@ ARMv7 and later: one word load with post-increment, byte-reversed only
@ on little-endian builds.
93 ldr r9,[r1],#4 @ handles unaligned
94 add r7,r8,r7,ror#2 @ E+=K_00_19
95 eor r10,r5,r6 @ F_xx_xx
96 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
97#ifdef __ARMEL__
98 rev r9,r9 @ byte swap
99#endif
100#endif
101 and r10,r4,r10,ror#2
102 add r7,r7,r9 @ E+=X[i]
103 eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
@ Push X[i]; r14 is pre-decremented, so later words sit at lower addresses.
104 str r9,[r14,#-4]!
105 add r7,r7,r10 @ E+=F_00_19(B,C,D)
@ Next round: same pattern with the register roles moved by one (E is r6).
106#if __ARM_ARCH__<7
107 ldrb r10,[r1,#2]
108 ldrb r9,[r1,#3]
109 ldrb r11,[r1,#1]
110 add r6,r8,r6,ror#2 @ E+=K_00_19
111 ldrb r12,[r1],#4
112 orr r9,r9,r10,lsl#8
113 eor r10,r4,r5 @ F_xx_xx
114 orr r9,r9,r11,lsl#16
115 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
116 orr r9,r9,r12,lsl#24
117#else
118 ldr r9,[r1],#4 @ handles unaligned
119 add r6,r8,r6,ror#2 @ E+=K_00_19
120 eor r10,r4,r5 @ F_xx_xx
121 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
122#ifdef __ARMEL__
123 rev r9,r9 @ byte swap
124#endif
125#endif
126 and r10,r3,r10,ror#2
127 add r6,r6,r9 @ E+=X[i]
128 eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
129 str r9,[r14,#-4]!
130 add r6,r6,r10 @ E+=F_00_19(B,C,D)
@ Next round (E is r5).
131#if __ARM_ARCH__<7
132 ldrb r10,[r1,#2]
133 ldrb r9,[r1,#3]
134 ldrb r11,[r1,#1]
135 add r5,r8,r5,ror#2 @ E+=K_00_19
136 ldrb r12,[r1],#4
137 orr r9,r9,r10,lsl#8
138 eor r10,r3,r4 @ F_xx_xx
139 orr r9,r9,r11,lsl#16
140 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
141 orr r9,r9,r12,lsl#24
142#else
143 ldr r9,[r1],#4 @ handles unaligned
144 add r5,r8,r5,ror#2 @ E+=K_00_19
145 eor r10,r3,r4 @ F_xx_xx
146 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
147#ifdef __ARMEL__
148 rev r9,r9 @ byte swap
149#endif
150#endif
151 and r10,r7,r10,ror#2
152 add r5,r5,r9 @ E+=X[i]
153 eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
154 str r9,[r14,#-4]!
155 add r5,r5,r10 @ E+=F_00_19(B,C,D)
@ Next round (E is r4).
156#if __ARM_ARCH__<7
157 ldrb r10,[r1,#2]
158 ldrb r9,[r1,#3]
159 ldrb r11,[r1,#1]
160 add r4,r8,r4,ror#2 @ E+=K_00_19
161 ldrb r12,[r1],#4
162 orr r9,r9,r10,lsl#8
163 eor r10,r7,r3 @ F_xx_xx
164 orr r9,r9,r11,lsl#16
165 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
166 orr r9,r9,r12,lsl#24
167#else
168 ldr r9,[r1],#4 @ handles unaligned
169 add r4,r8,r4,ror#2 @ E+=K_00_19
170 eor r10,r7,r3 @ F_xx_xx
171 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
172#ifdef __ARMEL__
173 rev r9,r9 @ byte swap
174#endif
175#endif
176 and r10,r6,r10,ror#2
177 add r4,r4,r9 @ E+=X[i]
178 eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
179 str r9,[r14,#-4]!
180 add r4,r4,r10 @ E+=F_00_19(B,C,D)
@ Next round (E is r3); after it the roles are back to r3=A ... r7=E.
181#if __ARM_ARCH__<7
182 ldrb r10,[r1,#2]
183 ldrb r9,[r1,#3]
184 ldrb r11,[r1,#1]
185 add r3,r8,r3,ror#2 @ E+=K_00_19
186 ldrb r12,[r1],#4
187 orr r9,r9,r10,lsl#8
188 eor r10,r6,r7 @ F_xx_xx
189 orr r9,r9,r11,lsl#16
190 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
191 orr r9,r9,r12,lsl#24
192#else
193 ldr r9,[r1],#4 @ handles unaligned
194 add r3,r8,r3,ror#2 @ E+=K_00_19
195 eor r10,r6,r7 @ F_xx_xx
196 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
197#ifdef __ARMEL__
198 rev r9,r9 @ byte swap
199#endif
200#endif
201 and r10,r5,r10,ror#2
202 add r3,r3,r9 @ E+=X[i]
203 eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
204 str r9,[r14,#-4]!
205 add r3,r3,r10 @ E+=F_00_19(B,C,D)
@ Repeat until 15 words have been stored (r14 has reached sp).
206 cmp r14,sp
207 bne .L_00_15 @ [((11+4)*5+2)*3]
@ Lower sp by another 25 words (40 in total). Rounds 15..19 follow
@ inline, and the shared loop below then runs until r14 reaches this sp,
@ which is after 40 words of X[] in total.
208 sub sp,sp,#25*4
@ Round 15: the last round that takes its word from the input.
209#if __ARM_ARCH__<7
210 ldrb r10,[r1,#2]
211 ldrb r9,[r1,#3]
212 ldrb r11,[r1,#1]
213 add r7,r8,r7,ror#2 @ E+=K_00_19
214 ldrb r12,[r1],#4
215 orr r9,r9,r10,lsl#8
216 eor r10,r5,r6 @ F_xx_xx
217 orr r9,r9,r11,lsl#16
218 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
219 orr r9,r9,r12,lsl#24
220#else
221 ldr r9,[r1],#4 @ handles unaligned
222 add r7,r8,r7,ror#2 @ E+=K_00_19
223 eor r10,r5,r6 @ F_xx_xx
224 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
225#ifdef __ARMEL__
226 rev r9,r9 @ byte swap
227#endif
228#endif
229 and r10,r4,r10,ror#2
230 add r7,r7,r9 @ E+=X[i]
231 eor r10,r10,r6,ror#2 @ F_00_19(B,C,D)
232 str r9,[r14,#-4]!
233 add r7,r7,r10 @ E+=F_00_19(B,C,D)
@ Rounds 16..19: still F_00_19 and K_00_19, but X[i] now comes from the
@ message schedule instead of the input:
@   X[i] = ROL(X[i-16] ^ X[i-14] ^ X[i-8] ^ X[i-3], 1)
@ r14 points at X[i-1], so offsets 15, 13, 7 and 2 words select those four
@ entries. The two ror#31 operations together rotate the XOR left by 1.
234 ldr r9,[r14,#15*4]
235 ldr r10,[r14,#13*4]
236 ldr r11,[r14,#7*4]
237 add r6,r8,r6,ror#2 @ E+=K_xx_xx
238 ldr r12,[r14,#2*4]
239 eor r9,r9,r10
240 eor r11,r11,r12 @ 1 cycle stall
241 eor r10,r4,r5 @ F_xx_xx
242 mov r9,r9,ror#31
243 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
244 eor r9,r9,r11,ror#31
245 str r9,[r14,#-4]!
246 and r10,r3,r10,ror#2 @ F_xx_xx
247 @ F_xx_xx
248 add r6,r6,r9 @ E+=X[i]
249 eor r10,r10,r5,ror#2 @ F_00_19(B,C,D)
250 add r6,r6,r10 @ E+=F_00_19(B,C,D)
251 ldr r9,[r14,#15*4]
252 ldr r10,[r14,#13*4]
253 ldr r11,[r14,#7*4]
254 add r5,r8,r5,ror#2 @ E+=K_xx_xx
255 ldr r12,[r14,#2*4]
256 eor r9,r9,r10
257 eor r11,r11,r12 @ 1 cycle stall
258 eor r10,r3,r4 @ F_xx_xx
259 mov r9,r9,ror#31
260 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
261 eor r9,r9,r11,ror#31
262 str r9,[r14,#-4]!
263 and r10,r7,r10,ror#2 @ F_xx_xx
264 @ F_xx_xx
265 add r5,r5,r9 @ E+=X[i]
266 eor r10,r10,r4,ror#2 @ F_00_19(B,C,D)
267 add r5,r5,r10 @ E+=F_00_19(B,C,D)
268 ldr r9,[r14,#15*4]
269 ldr r10,[r14,#13*4]
270 ldr r11,[r14,#7*4]
271 add r4,r8,r4,ror#2 @ E+=K_xx_xx
272 ldr r12,[r14,#2*4]
273 eor r9,r9,r10
274 eor r11,r11,r12 @ 1 cycle stall
275 eor r10,r7,r3 @ F_xx_xx
276 mov r9,r9,ror#31
277 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
278 eor r9,r9,r11,ror#31
279 str r9,[r14,#-4]!
280 and r10,r6,r10,ror#2 @ F_xx_xx
281 @ F_xx_xx
282 add r4,r4,r9 @ E+=X[i]
283 eor r10,r10,r3,ror#2 @ F_00_19(B,C,D)
284 add r4,r4,r10 @ E+=F_00_19(B,C,D)
285 ldr r9,[r14,#15*4]
286 ldr r10,[r14,#13*4]
287 ldr r11,[r14,#7*4]
288 add r3,r8,r3,ror#2 @ E+=K_xx_xx
289 ldr r12,[r14,#2*4]
290 eor r9,r9,r10
291 eor r11,r11,r12 @ 1 cycle stall
292 eor r10,r6,r7 @ F_xx_xx
293 mov r9,r9,ror#31
294 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
295 eor r9,r9,r11,ror#31
296 str r9,[r14,#-4]!
297 and r10,r5,r10,ror#2 @ F_xx_xx
298 @ F_xx_xx
299 add r3,r3,r9 @ E+=X[i]
300 eor r10,r10,r7,ror#2 @ F_00_19(B,C,D)
301 add r3,r3,r10 @ E+=F_00_19(B,C,D)
302
@ Rounds 20..39 and 60..79 share one loop body because both use
@ F(B,C,D) = B ^ C ^ D; only the constant in r8 differs. The carry flag
@ records which of the two passes is running: clear for 20..39, set for
@ 60..79.
303 ldr r8,.LK_20_39 @ [+15+16*4]
304 cmn sp,#0 @ [+3], clear carry to denote 20_39
305.L_20_39_or_60_79:
306 ldr r9,[r14,#15*4]
307 ldr r10,[r14,#13*4]
308 ldr r11,[r14,#7*4]
309 add r7,r8,r7,ror#2 @ E+=K_xx_xx
310 ldr r12,[r14,#2*4]
311 eor r9,r9,r10
312 eor r11,r11,r12 @ 1 cycle stall
313 eor r10,r5,r6 @ F_xx_xx
314 mov r9,r9,ror#31
315 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
316 eor r9,r9,r11,ror#31
317 str r9,[r14,#-4]!
318 eor r10,r4,r10,ror#2 @ F_xx_xx
319 @ F_xx_xx
320 add r7,r7,r9 @ E+=X[i]
321 add r7,r7,r10 @ E+=F_20_39(B,C,D)
322 ldr r9,[r14,#15*4]
323 ldr r10,[r14,#13*4]
324 ldr r11,[r14,#7*4]
325 add r6,r8,r6,ror#2 @ E+=K_xx_xx
326 ldr r12,[r14,#2*4]
327 eor r9,r9,r10
328 eor r11,r11,r12 @ 1 cycle stall
329 eor r10,r4,r5 @ F_xx_xx
330 mov r9,r9,ror#31
331 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
332 eor r9,r9,r11,ror#31
333 str r9,[r14,#-4]!
334 eor r10,r3,r10,ror#2 @ F_xx_xx
335 @ F_xx_xx
336 add r6,r6,r9 @ E+=X[i]
337 add r6,r6,r10 @ E+=F_20_39(B,C,D)
338 ldr r9,[r14,#15*4]
339 ldr r10,[r14,#13*4]
340 ldr r11,[r14,#7*4]
341 add r5,r8,r5,ror#2 @ E+=K_xx_xx
342 ldr r12,[r14,#2*4]
343 eor r9,r9,r10
344 eor r11,r11,r12 @ 1 cycle stall
345 eor r10,r3,r4 @ F_xx_xx
346 mov r9,r9,ror#31
347 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
348 eor r9,r9,r11,ror#31
349 str r9,[r14,#-4]!
350 eor r10,r7,r10,ror#2 @ F_xx_xx
351 @ F_xx_xx
352 add r5,r5,r9 @ E+=X[i]
353 add r5,r5,r10 @ E+=F_20_39(B,C,D)
354 ldr r9,[r14,#15*4]
355 ldr r10,[r14,#13*4]
356 ldr r11,[r14,#7*4]
357 add r4,r8,r4,ror#2 @ E+=K_xx_xx
358 ldr r12,[r14,#2*4]
359 eor r9,r9,r10
360 eor r11,r11,r12 @ 1 cycle stall
361 eor r10,r7,r3 @ F_xx_xx
362 mov r9,r9,ror#31
363 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
364 eor r9,r9,r11,ror#31
365 str r9,[r14,#-4]!
366 eor r10,r6,r10,ror#2 @ F_xx_xx
367 @ F_xx_xx
368 add r4,r4,r9 @ E+=X[i]
369 add r4,r4,r10 @ E+=F_20_39(B,C,D)
370 ldr r9,[r14,#15*4]
371 ldr r10,[r14,#13*4]
372 ldr r11,[r14,#7*4]
373 add r3,r8,r3,ror#2 @ E+=K_xx_xx
374 ldr r12,[r14,#2*4]
375 eor r9,r9,r10
376 eor r11,r11,r12 @ 1 cycle stall
377 eor r10,r6,r7 @ F_xx_xx
378 mov r9,r9,ror#31
379 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
380 eor r9,r9,r11,ror#31
381 str r9,[r14,#-4]!
382 eor r10,r5,r10,ror#2 @ F_xx_xx
383 @ F_xx_xx
384 add r3,r3,r9 @ E+=X[i]
385 add r3,r3,r10 @ E+=F_20_39(B,C,D)
@ Loop-end test: teq is used instead of cmp so that the carry flag set
@ before entering the loop survives until the bcs below.
@ NOTE(review): the ARM and THUMB wrapper macros are defined outside this
@ file; presumably only one variant is assembled, with the Thumb one
@ copying sp to r11 because sp is not usable as a teq operand - confirm.
386 ARM( teq r14,sp ) @ preserve carry
387 THUMB( mov r11,sp )
388 THUMB( teq r14,r11 ) @ preserve carry
389 bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4]
@ Carry set means this was the 60..79 pass, so the block is finished.
390 bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes
391
@ Rounds 40..59: sp is lowered by 20 more words (60 in total) as the end
@ marker. F_40_59(B,C,D) is accumulated as two separate addends,
@   (B & (C ^ D)) + (C & D)
@ The two terms never share a set bit, so their sum is the bitwise
@ majority of B, C and D.
392 ldr r8,.LK_40_59
393 sub sp,sp,#20*4 @ [+2]
394.L_40_59:
395 ldr r9,[r14,#15*4]
396 ldr r10,[r14,#13*4]
397 ldr r11,[r14,#7*4]
398 add r7,r8,r7,ror#2 @ E+=K_xx_xx
399 ldr r12,[r14,#2*4]
400 eor r9,r9,r10
401 eor r11,r11,r12 @ 1 cycle stall
402 eor r10,r5,r6 @ F_xx_xx
403 mov r9,r9,ror#31
404 add r7,r7,r3,ror#27 @ E+=ROR(A,27)
405 eor r9,r9,r11,ror#31
406 str r9,[r14,#-4]!
407 and r10,r4,r10,ror#2 @ F_xx_xx
408 and r11,r5,r6 @ F_xx_xx
409 add r7,r7,r9 @ E+=X[i]
410 add r7,r7,r10 @ E+=F_40_59(B,C,D)
411 add r7,r7,r11,ror#2
412 ldr r9,[r14,#15*4]
413 ldr r10,[r14,#13*4]
414 ldr r11,[r14,#7*4]
415 add r6,r8,r6,ror#2 @ E+=K_xx_xx
416 ldr r12,[r14,#2*4]
417 eor r9,r9,r10
418 eor r11,r11,r12 @ 1 cycle stall
419 eor r10,r4,r5 @ F_xx_xx
420 mov r9,r9,ror#31
421 add r6,r6,r7,ror#27 @ E+=ROR(A,27)
422 eor r9,r9,r11,ror#31
423 str r9,[r14,#-4]!
424 and r10,r3,r10,ror#2 @ F_xx_xx
425 and r11,r4,r5 @ F_xx_xx
426 add r6,r6,r9 @ E+=X[i]
427 add r6,r6,r10 @ E+=F_40_59(B,C,D)
428 add r6,r6,r11,ror#2
429 ldr r9,[r14,#15*4]
430 ldr r10,[r14,#13*4]
431 ldr r11,[r14,#7*4]
432 add r5,r8,r5,ror#2 @ E+=K_xx_xx
433 ldr r12,[r14,#2*4]
434 eor r9,r9,r10
435 eor r11,r11,r12 @ 1 cycle stall
436 eor r10,r3,r4 @ F_xx_xx
437 mov r9,r9,ror#31
438 add r5,r5,r6,ror#27 @ E+=ROR(A,27)
439 eor r9,r9,r11,ror#31
440 str r9,[r14,#-4]!
441 and r10,r7,r10,ror#2 @ F_xx_xx
442 and r11,r3,r4 @ F_xx_xx
443 add r5,r5,r9 @ E+=X[i]
444 add r5,r5,r10 @ E+=F_40_59(B,C,D)
445 add r5,r5,r11,ror#2
446 ldr r9,[r14,#15*4]
447 ldr r10,[r14,#13*4]
448 ldr r11,[r14,#7*4]
449 add r4,r8,r4,ror#2 @ E+=K_xx_xx
450 ldr r12,[r14,#2*4]
451 eor r9,r9,r10
452 eor r11,r11,r12 @ 1 cycle stall
453 eor r10,r7,r3 @ F_xx_xx
454 mov r9,r9,ror#31
455 add r4,r4,r5,ror#27 @ E+=ROR(A,27)
456 eor r9,r9,r11,ror#31
457 str r9,[r14,#-4]!
458 and r10,r6,r10,ror#2 @ F_xx_xx
459 and r11,r7,r3 @ F_xx_xx
460 add r4,r4,r9 @ E+=X[i]
461 add r4,r4,r10 @ E+=F_40_59(B,C,D)
462 add r4,r4,r11,ror#2
463 ldr r9,[r14,#15*4]
464 ldr r10,[r14,#13*4]
465 ldr r11,[r14,#7*4]
466 add r3,r8,r3,ror#2 @ E+=K_xx_xx
467 ldr r12,[r14,#2*4]
468 eor r9,r9,r10
469 eor r11,r11,r12 @ 1 cycle stall
470 eor r10,r6,r7 @ F_xx_xx
471 mov r9,r9,ror#31
472 add r3,r3,r4,ror#27 @ E+=ROR(A,27)
473 eor r9,r9,r11,ror#31
474 str r9,[r14,#-4]!
475 and r10,r5,r10,ror#2 @ F_xx_xx
476 and r11,r6,r7 @ F_xx_xx
477 add r3,r3,r9 @ E+=X[i]
478 add r3,r3,r10 @ E+=F_40_59(B,C,D)
479 add r3,r3,r11,ror#2
480 cmp r14,sp
481 bne .L_40_59 @ [+((12+5)*5+2)*4]
482
@ Rounds 60..79: re-enter the shared loop with the last constant, sp
@ lowered to 80 words in total and the carry flag set.
483 ldr r8,.LK_60_79
484 sub sp,sp,#20*4
485 cmp sp,#0 @ set carry to denote 60_79
486 b .L_20_39_or_60_79 @ [+4], spare 300 bytes
@ Block finished: release the 80-word X[] area, add the working variables
@ to the state at r0 (undoing the rotation of C, D and E with ror#2) and
@ write the state back.
487.L_done:
488 add sp,sp,#80*4 @ "deallocate" stack frame
489 ldmia r0,{r8,r9,r10,r11,r12}
490 add r3,r8,r3
491 add r4,r9,r4
492 add r5,r10,r5,ror#2
493 add r6,r11,r6,ror#2
494 add r7,r12,r7,ror#2
495 stmia r0,{r3,r4,r5,r6,r7}
@ Continue while the input pointer has not reached the end pointer.
496 teq r1,r2
497 bne .Lloop @ [+18], total 1307
498
@ Restore the registers saved on entry and return by loading pc.
499 ldmia sp!,{r4-r12,pc}
@ Round constants, one per group of 20 rounds, loaded into r8 by label.
500.align 2
501.LK_00_19: .word 0x5a827999
502.LK_20_39: .word 0x6ed9eba1
503.LK_40_59: .word 0x8f1bbcdc
504.LK_60_79: .word 0xca62c1d6
505ENDPROC(sha1_block_data_order)
@ Identification string emitted after the function; no instruction above
@ refers to it.
506.asciz "SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
507.align 2
508

@ source code of linux/arch/arm/crypto/sha1-armv4-large.S