@ SPDX-License-Identifier: GPL-2.0
#define __ARM_ARCH__ __LINUX_ARM_ARCH__

@ This code is taken from the OpenSSL project but the author (Andy Polyakov)
@ has relicensed it under the GPLv2. Therefore this program is free software;
@ you can redistribute it and/or modify it under the terms of the GNU General
@ Public License version 2 as published by the Free Software Foundation.
@
@ The original headers, including the original license headers, are
@ included below for completeness.

@ ====================================================================
@ Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see https://www.openssl.org/~appro/cryptogams/.
@ ====================================================================
@ sha1_block procedure for ARMv4.
@
@ January 2007.

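@ Interface (derived from the code below): r0 = pointer to the
@ five-word SHA-1 state, r1 = pointer to the input data, r2 = number
@ of 64-byte blocks to process; in C terms, approximately:
@
@	void sha1_block_data_order(u32 *state, const u8 *data, int blocks);
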
@ Size/performance trade-off
@ ====================================================================
@ impl		size in bytes	comp cycles[*]	measured performance
@ ====================================================================
@ thumb		304		3212		4420
@ armv4-small	392/+29%	1958/+64%	2250/+96%
@ armv4-compact	740/+89%	1552/+26%	1840/+22%
@ armv4-large	1420/+92%	1307/+19%	1370/+34%[***]
@ full unroll	~5100/+260%	~1260/+4%	~1300/+5%
@ ====================================================================
@ thumb		= same as 'small' but in Thumb instructions[**] and
@		  with recurring code in two private functions;
@ small		= detached Xload/update, loops are folded;
@ compact	= detached Xload/update, 5x unroll;
@ large		= interleaved Xload/update, 5x unroll;
@ full unroll	= interleaved Xload/update, full unroll, estimated[!];
@
@ [*]	Manually counted instructions in "grand" loop body. Measured
@	performance is affected by prologue and epilogue overhead,
@	i-cache availability, branch penalties, etc.
@ [**]	While each Thumb instruction is half the size of its ARM
@	counterpart, the Thumb set is not as diverse: e.g., there are
@	only two arithmetic instructions with 3 arguments, no [fixed]
@	rotate, and addressing modes are limited. As a result it takes
@	more instructions to do the same job in Thumb, so the code is
@	never half the size and is always slower.
@ [***]	which is also ~35% better than compiler-generated code. A dual-
@	issue Cortex A8 core was measured to process one input block in
@	~990 cycles.

@ August 2010.
@
@ Rescheduling for the dual-issue pipeline resulted in a 13%
@ improvement on the Cortex A8 core, or in absolute terms ~870 cycles
@ per input block [13.6 cycles per byte].

@ February 2011.
@
@ Profiler-assisted and platform-specific optimization resulted in a
@ further 10% improvement on the Cortex A8 core, down to 12.2 cycles
@ per byte.
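
@ For reference, this file implements the standard SHA-1 compression
@ function; a minimal C sketch of one block (the names X, ROL, a..e
@ are illustrative only, not part of this module) looks like:
@
@	#define ROL(x,n) (((x) << (n)) | ((x) >> (32 - (n))))
@	/* X[0..15] = the sixteen big-endian words of the input block */
@	for (i = 0; i < 80; i++) {
@		if (i >= 16)
@			X[i & 15] = ROL(X[(i-3) & 15] ^ X[(i-8) & 15] ^
@					X[(i-14) & 15] ^ X[(i-16) & 15], 1);
@		if (i < 20)	 { f = (b & c) | (~b & d);		k = 0x5a827999; }
@		else if (i < 40) { f = b ^ c ^ d;			k = 0x6ed9eba1; }
@		else if (i < 60) { f = (b & c) | (b & d) | (c & d);	k = 0x8f1bbcdc; }
@		else		 { f = b ^ c ^ d;			k = 0xca62c1d6; }
@		t = ROL(a,5) + f + e + k + X[i & 15];
@		e = d; d = c; c = ROL(b,30); b = a; a = t;
@	}
@	/* a..e are then added back into the five state words */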

#include <linux/linkage.h>

.text

.align	2
ENTRY(sha1_block_data_order)
	stmdb	sp!,{r4-r12,lr}
	add	r2,r1,r2,lsl#6		@ r2 = end of input, i.e. r1 + 64*blocks
	ldmia	r0,{r3,r4,r5,r6,r7}
.Lloop:
	ldr	r8,.LK_00_19
	mov	r14,sp
	sub	sp,sp,#15*4
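	@ The X[] schedule lives on the stack and grows downward: 15 words
	@ here, 25 more after the first 16 rounds, and 20 more before each
	@ of the two final 20-round passes, 80 words in total; .L_done
	@ releases them in one go (add sp,sp,#80*4).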
	mov	r5,r5,ror#30
	mov	r6,r6,ror#30
	mov	r7,r7,ror#30		@ [6]
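	@ C, D and E are kept pre-rotated (ror#30), and every use below
	@ compensates with a ror#2 operand shift; this lets SHA-1's
	@ per-round ROL(B,30) ride along on ARM's free shifter instead of
	@ costing a separate rotate instruction.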
.L_00_15:
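	@ Each of the 16 rounds below fetches one big-endian input word:
	@ pre-ARMv7 assembles it from four ldrb loads (which also copes
	@ with unaligned input), while ARMv7+ uses a single word load,
	@ byte-swapped with rev on little-endian kernels.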
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r4,r5			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r6,r8,r6,ror#2			@ E+=K_00_19
	eor	r10,r4,r5			@ F_xx_xx
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r3,r10,ror#2
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r3,r4			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r5,r8,r5,ror#2			@ E+=K_00_19
	eor	r10,r3,r4			@ F_xx_xx
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r7,r10,ror#2
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r7,r3			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r4,r8,r4,ror#2			@ E+=K_00_19
	eor	r10,r7,r3			@ F_xx_xx
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r6,r10,ror#2
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r6,r7			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r3,r8,r3,ror#2			@ E+=K_00_19
	eor	r10,r6,r7			@ F_xx_xx
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r5,r10,ror#2
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)
	cmp	r14,sp
	bne	.L_00_15			@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
#if __ARM_ARCH__<7
	ldrb	r10,[r1,#2]
	ldrb	r9,[r1,#3]
	ldrb	r11,[r1,#1]
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	ldrb	r12,[r1],#4
	orr	r9,r9,r10,lsl#8
	eor	r10,r5,r6			@ F_xx_xx
	orr	r9,r9,r11,lsl#16
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	orr	r9,r9,r12,lsl#24
#else
	ldr	r9,[r1],#4			@ handles unaligned
	add	r7,r8,r7,ror#2			@ E+=K_00_19
	eor	r10,r5,r6			@ F_xx_xx
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	r9,r9				@ byte swap
#endif
#endif
	and	r10,r4,r10,ror#2
	add	r7,r7,r9			@ E+=X[i]
	eor	r10,r10,r6,ror#2		@ F_00_19(B,C,D)
	str	r9,[r14,#-4]!
	add	r7,r7,r10			@ E+=F_00_19(B,C,D)
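	@ From here on each round also computes one new schedule word:
	@ X[i] = ROL(X[i-3] ^ X[i-8] ^ X[i-14] ^ X[i-16], 1). r14 slides
	@ down one word per round, so those four terms sit 2, 7, 13 and 15
	@ words above it, and the ror#31 pair performs the rotate by 1.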
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	eor	r10,r10,r5,ror#2		@ F_00_19(B,C,D)
	add	r6,r6,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	eor	r10,r10,r4,ror#2		@ F_00_19(B,C,D)
	add	r5,r5,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	eor	r10,r10,r3,ror#2		@ F_00_19(B,C,D)
	add	r4,r4,r10			@ E+=F_00_19(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	eor	r10,r10,r7,ror#2		@ F_00_19(B,C,D)
	add	r3,r3,r10			@ E+=F_00_19(B,C,D)

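	@ Rounds 20-39 and 60-79 share F(B,C,D) = B^C^D and differ only in
	@ the K constant, so they reuse the loop below. The carry flag
	@ tells the passes apart: cmn sp,#0 clears it for 20_39, cmp sp,#0
	@ sets it for 60_79, and bcs exits only after the second pass.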
	ldr	r8,.LK_20_39			@ [+15+16*4]
	cmn	sp,#0				@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r4,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r3,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r7,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r6,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_20_39(B,C,D)
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	eor	r10,r5,r10,ror#2		@ F_xx_xx
						@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_20_39(B,C,D)
 ARM(	teq	r14,sp		)		@ preserve carry
 THUMB(	mov	r11,sp		)
 THUMB(	teq	r14,r11		)		@ preserve carry
	bne	.L_20_39_or_60_79		@ [+((12+3)*5+2)*4]
	bcs	.L_done				@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	r8,.LK_40_59
	sub	sp,sp,#20*4			@ [+2]
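	@ Rounds 40-59 use F(B,C,D) = MAJ(B,C,D), computed as
	@ (B & (C ^ D)) + (C & D); the two terms have no set bits in
	@ common, so adding them into E one at a time is equivalent to
	@ OR-ing them first.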
.L_40_59:
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r7,r8,r7,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r5,r6			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r7,r7,r3,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r4,r10,ror#2		@ F_xx_xx
	and	r11,r5,r6			@ F_xx_xx
	add	r7,r7,r9			@ E+=X[i]
	add	r7,r7,r10			@ E+=F_40_59(B,C,D)
	add	r7,r7,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r6,r8,r6,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r4,r5			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r6,r6,r7,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r3,r10,ror#2		@ F_xx_xx
	and	r11,r4,r5			@ F_xx_xx
	add	r6,r6,r9			@ E+=X[i]
	add	r6,r6,r10			@ E+=F_40_59(B,C,D)
	add	r6,r6,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r5,r8,r5,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r3,r4			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r5,r5,r6,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r7,r10,ror#2		@ F_xx_xx
	and	r11,r3,r4			@ F_xx_xx
	add	r5,r5,r9			@ E+=X[i]
	add	r5,r5,r10			@ E+=F_40_59(B,C,D)
	add	r5,r5,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r4,r8,r4,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r7,r3			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r4,r4,r5,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r6,r10,ror#2		@ F_xx_xx
	and	r11,r7,r3			@ F_xx_xx
	add	r4,r4,r9			@ E+=X[i]
	add	r4,r4,r10			@ E+=F_40_59(B,C,D)
	add	r4,r4,r11,ror#2
	ldr	r9,[r14,#15*4]
	ldr	r10,[r14,#13*4]
	ldr	r11,[r14,#7*4]
	add	r3,r8,r3,ror#2			@ E+=K_xx_xx
	ldr	r12,[r14,#2*4]
	eor	r9,r9,r10
	eor	r11,r11,r12			@ 1 cycle stall
	eor	r10,r6,r7			@ F_xx_xx
	mov	r9,r9,ror#31
	add	r3,r3,r4,ror#27			@ E+=ROR(A,27)
	eor	r9,r9,r11,ror#31
	str	r9,[r14,#-4]!
	and	r10,r5,r10,ror#2		@ F_xx_xx
	and	r11,r6,r7			@ F_xx_xx
	add	r3,r3,r9			@ E+=X[i]
	add	r3,r3,r10			@ E+=F_40_59(B,C,D)
	add	r3,r3,r11,ror#2
	cmp	r14,sp
	bne	.L_40_59			@ [+((12+5)*5+2)*4]

	ldr	r8,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0				@ set carry to denote 60_79
	b	.L_20_39_or_60_79		@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4			@ "deallocate" stack frame
	ldmia	r0,{r8,r9,r10,r11,r12}
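	@ Fold the previous state back in; C, D and E (r5-r7) are still in
	@ the pre-rotated representation, so the ror#2 on their operands
	@ undoes that rotation as part of the addition.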
	add	r3,r8,r3
	add	r4,r9,r4
	add	r5,r10,r5,ror#2
	add	r6,r11,r6,ror#2
	add	r7,r12,r7,ror#2
	stmia	r0,{r3,r4,r5,r6,r7}
	teq	r1,r2
	bne	.Lloop				@ [+18], total 1307

	ldmia	sp!,{r4-r12,pc}
.align	2
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
ENDPROC(sha1_block_data_order)
.asciz	"SHA1 block transform for ARMv4, CRYPTOGAMS by <appro@openssl.org>"
.align	2
