/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction. The alternatives are, more or less:
 *
 * (a) vshl.u32 + vsri.u32		(needs temporary register)
 * (b) vshl.u32 + vshr.u32 + vorr	(needs temporary register)
 * (c) vrev32.16			(16-bit rotations only)
 * (d) vtbl.8 + vtbl.8			(rotations by multiples of 8 bits only,
 *					 needs index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri. Also, it
 * parallelizes better when temporary registers are scarce.
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */
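
/*
 * For reference, method (a) realizes rotl32(x, n) with two instructions,
 * shown here for n = 12 (qd/qs are placeholder destination/source
 * registers, not fixed choices):
 *
 *	vshl.u32	qd, qs, #12	// qd  = x << 12
 *	vsri.u32	qd, qs, #20	// qd |= x >> 20
 *
 * Method (d) rotates by permuting bytes instead: a vtbl.8 lookup with the
 * index vector .Lrol8_table defined below moves each byte of a
 * little-endian 32-bit word up one position, i.e. rotl32(x, 8).
 */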

#include <linux/linkage.h>
#include <asm/cache.h>

	.text
	.fpu	neon
	.align	5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
chacha_permute:

	adr	ip, .Lrol8_table
	vld1.8	{d10}, [ip, :64]
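	// d10 now holds the rol8 index vector; the vtbl.8 pairs below use
	// it to rotate each 32-bit word left by 8 bits.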

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor	q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor	q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor	q3, q3, q0
	vtbl.8	d6, {d6}, d10
	vtbl.8	d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor	q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8	q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8	q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8	q3, q3, q3, #12
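
	// Rows 1-3 are now rotated left by one, two and three words
	// respectively, so the quarter-rounds below operate on the
	// diagonals of the original matrix without further shuffling.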

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor	q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor	q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor	q3, q3, q0
	vtbl.8	d6, {d6}, d10
	vtbl.8	d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor	q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8	q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8	q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8	q3, q3, q3, #4
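
	// The inverse rotations restore the column layout; one double
	// round (a column round plus a diagonal round) is complete.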

	subs	r3, r3, #2
	bne	.Ldoubleround

	bx	lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
	push	{lr}

	// x0..3 = s0..3
	add	ip, r0, #0x20
	vld1.32	{q0-q1}, [r0]
	vld1.32	{q2-q3}, [ip]

	vmov	q8, q0
	vmov	q9, q1
	vmov	q10, q2
	vmov	q11, q3
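
	// q8-q11 hold a copy of the initial state for the final addition
	// of s0..3 below.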

	bl	chacha_permute

	add	ip, r2, #0x20
	vld1.8	{q4-q5}, [r2]
	vld1.8	{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor	q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor	q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor	q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor	q3, q3, q7

	add	ip, r1, #0x20
	vst1.8	{q0-q1}, [r1]
	vst1.8	{q2-q3}, [ip]

	pop	{pc}
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
	push	{lr}

	vld1.32	{q0-q1}, [r0]!
	vld1.32	{q2-q3}, [r0]

	mov	r3, r2
	bl	chacha_permute

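	// HChaCha returns rows 0 and 3 of the permuted state directly;
	// unlike the full block function, the initial state is not added
	// back in.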
	vst1.32	{q0}, [r1]!
	vst1.32	{q3}, [r1]

	pop	{pc}
ENDPROC(hchacha_block_neon)

	.align	4
.Lctrinc:	.word	0, 1, 2, 3
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
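
// Each index in .Lrol8_table selects the source byte one position lower
// (mod 4) within its 32-bit word, so a vtbl.8 lookup with this vector
// rotates every little-endian 32-bit word left by 8 bits.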

	.align	5
ENTRY(chacha_4block_xor_neon)
	push	{r4, lr}
	mov	r4, sp			// preserve the stack pointer
	sub	ip, sp, #0x20		// allocate a 32-byte buffer
	bic	ip, ip, #0x1f		// aligned to 32 bytes
	mov	sp, ip
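
	// The 32-byte buffer is used to spill x8-x9 (q8-q9): all sixteen
	// q registers are occupied by the four-way state, so two of them
	// are cycled through the stack to free up temporaries.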

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	// r3: nrounds

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. The words are
	// re-interleaved before the final addition of the original state
	// and the XORing step.
	//

	// x0..15[0-3] = s0..15[0-3]
	add	ip, r0, #0x20
	vld1.32	{q0-q1}, [r0]
	vld1.32	{q2-q3}, [ip]

	adr	lr, .Lctrinc
	vdup.32	q15, d7[1]
	vdup.32	q14, d7[0]
	vld1.32	{q4}, [lr, :128]
	vdup.32	q13, d6[1]
	vdup.32	q12, d6[0]
	vdup.32	q11, d5[1]
	vdup.32	q10, d5[0]
	vadd.u32	q12, q12, q4	// x12 += counter values 0-3
	vdup.32	q9, d4[1]
	vdup.32	q8, d4[0]
	vdup.32	q7, d3[1]
	vdup.32	q6, d3[0]
	vdup.32	q5, d2[1]
	vdup.32	q4, d2[0]
	vdup.32	q3, d1[1]
	vdup.32	q2, d1[0]
	vdup.32	q1, d0[1]
	vdup.32	q0, d0[0]
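
	// q0-q15 now hold words 0-15 of the state, each broadcast across
	// the four blocks; only the counter word x12 (q12) differs between
	// blocks, thanks to the increments 0-3 added above.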

	adr	ip, .Lrol8_table
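	// Enter the loop past the reload of q8-q9: on the first iteration
	// they still hold x8-x9.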
	b	1f

.Ldoubleround4:
	vld1.32	{q8-q9}, [sp, :256]
1:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor	q12, q12, q0
	veor	q13, q13, q1
	veor	q14, q14, q2
	veor	q15, q15, q3

	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14
	vrev32.16	q15, q15

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32	{q8-q9}, [sp, :256]
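	// x8-x9 are parked on the stack; q8-q9 are now free to serve as
	// scratch registers for the rotations.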

	veor	q8, q4, q8
	veor	q9, q5, q9
	vshl.u32	q4, q8, #12
	vshl.u32	q5, q9, #12
	vsri.u32	q4, q8, #20
	vsri.u32	q5, q9, #20

	veor	q8, q6, q10
	veor	q9, q7, q11
	vshl.u32	q6, q8, #12
	vshl.u32	q7, q9, #12
	vsri.u32	q6, q8, #20
	vsri.u32	q7, q9, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8	{d16}, [ip, :64]
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor	q12, q12, q0
	veor	q13, q13, q1
	veor	q14, q14, q2
	veor	q15, q15, q3

	vtbl.8	d24, {d24}, d16
	vtbl.8	d25, {d25}, d16
	vtbl.8	d26, {d26}, d16
	vtbl.8	d27, {d27}, d16
	vtbl.8	d28, {d28}, d16
	vtbl.8	d29, {d29}, d16
	vtbl.8	d30, {d30}, d16
	vtbl.8	d31, {d31}, d16

	vld1.32	{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32	{q8-q9}, [sp, :256]

	veor	q8, q4, q8
	veor	q9, q5, q9
	vshl.u32	q4, q8, #7
	vshl.u32	q5, q9, #7
	vsri.u32	q4, q8, #25
	vsri.u32	q5, q9, #25

	veor	q8, q6, q10
	veor	q9, q7, q11
	vshl.u32	q6, q8, #7
	vshl.u32	q7, q9, #7
	vsri.u32	q6, q8, #25
	vsri.u32	q7, q9, #25

	vld1.32	{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor	q15, q15, q0
	veor	q12, q12, q1
	veor	q13, q13, q2
	veor	q14, q14, q3

	vrev32.16	q15, q15
	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32	{q8-q9}, [sp, :256]

	veor	q8, q7, q8
	veor	q9, q4, q9
	vshl.u32	q7, q8, #12
	vshl.u32	q4, q9, #12
	vsri.u32	q7, q8, #20
	vsri.u32	q4, q9, #20

	veor	q8, q5, q10
	veor	q9, q6, q11
	vshl.u32	q5, q8, #12
	vshl.u32	q6, q9, #12
	vsri.u32	q5, q8, #20
	vsri.u32	q6, q9, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8	{d16}, [ip, :64]
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor	q15, q15, q0
	veor	q12, q12, q1
	veor	q13, q13, q2
	veor	q14, q14, q3

	vtbl.8	d30, {d30}, d16
	vtbl.8	d31, {d31}, d16
	vtbl.8	d24, {d24}, d16
	vtbl.8	d25, {d25}, d16
	vtbl.8	d26, {d26}, d16
	vtbl.8	d27, {d27}, d16
	vtbl.8	d28, {d28}, d16
	vtbl.8	d29, {d29}, d16

	vld1.32	{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32	{q8-q9}, [sp, :256]

	veor	q8, q7, q8
	veor	q9, q4, q9
	vshl.u32	q7, q8, #7
	vshl.u32	q4, q9, #7
	vsri.u32	q7, q8, #25
	vsri.u32	q4, q9, #25

	veor	q8, q5, q10
	veor	q9, q6, q11
	vshl.u32	q5, q8, #7
	vshl.u32	q6, q9, #7
	vsri.u32	q5, q8, #25
	vsri.u32	q6, q9, #25

	subs	r3, r3, #2
	bne	.Ldoubleround4

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	vld1.32	{q8}, [lr, :128]	// load counter values 0-3
	vzip.32	q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32	q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32	q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32	q6, q7			// => (6 7 6 7) (6 7 6 7)
	vadd.u32	q12, q8		// x12 += counter values 0-3
	vswp	d1, d4
	vswp	d3, d6
	vld1.32	{q8-q9}, [r0]!		// load s0..7
	vswp	d9, d12
	vswp	d11, d14
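
	// Together the vzip.32/vswp pairs transpose each 4x4 matrix of
	// words, turning the word-sliced layout back into rows of four
	// consecutive blocks.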

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.
	vswp	q1, q4

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
	vadd.u32	q0, q0, q8
	vadd.u32	q2, q2, q8
	vadd.u32	q4, q4, q8
	vadd.u32	q3, q3, q8

	// x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
	vadd.u32	q1, q1, q9
	vadd.u32	q6, q6, q9
	vadd.u32	q5, q5, q9
	vadd.u32	q7, q7, q9

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8	{q8-q9}, [r2]!
	veor	q8, q8, q0
	veor	q9, q9, q1
	vst1.8	{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32	{q8-q9}, [sp, :256]
	mov	sp, r4			// restore original stack pointer
	ldr	r4, [r4, #8]		// load number of bytes
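	// The fifth argument (nbytes) was passed on the stack; #8 skips
	// over the {r4, lr} pair pushed on entry.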
	vzip.32	q12, q13		// => (12 13 12 13) (12 13 12 13)
	vzip.32	q14, q15		// => (14 15 14 15) (14 15 14 15)
	vzip.32	q8, q9			// => (8 9 8 9) (8 9 8 9)
	vzip.32	q10, q11		// => (10 11 10 11) (10 11 10 11)
	vld1.32	{q0-q1}, [r0]		// load s8..15
	vswp	d25, d28
	vswp	d27, d30
	vswp	d17, d20
	vswp	d19, d22

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
	vadd.u32	q8, q8, q0
	vadd.u32	q10, q10, q0
	vadd.u32	q9, q9, q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream
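	//
	// r4 counts the bytes that remain. The full 256-byte case falls
	// straight through the load/XOR/store groups below; when fewer
	// bytes remain, the matching conditional branch exits early with
	// the next 32 bytes of keystream in q4-q5 and the just-computed
	// output block still pending in q0-q1.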

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #96
	veor	q0, q0, q8
	veor	q1, q1, q12
	ble	.Lle96
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q2
	veor	q1, q1, q6
	ble	.Lle128
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q10
	veor	q1, q1, q14
	ble	.Lle160
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q4
	veor	q1, q1, q5
	ble	.Lle192
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q9
	veor	q1, q1, q13
	ble	.Lle224
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]!
	subs	r4, r4, #32
	veor	q0, q0, q3
	veor	q1, q1, q7
	blt	.Llt256
.Lout:
	vst1.8	{q0-q1}, [r1]!

	vld1.8	{q0-q1}, [r2]
	veor	q0, q0, q11
	veor	q1, q1, q15
	vst1.8	{q0-q1}, [r1]

	pop	{r4, pc}

.Lle192:
	vmov	q4, q9
	vmov	q5, q13

.Lle160:
	// nothing to do

.Lfinalblock:
	// Process the final block if processing fewer than 4 full blocks.
	// Entered with 32 bytes of ChaCha keystream in q4-q5, and the
	// previous 32-byte output block, which still needs to be written
	// at [r1], in q0-q1.
	beq	.Lfullblock

.Lpartialblock:
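	// Entered with r4 equal to minus the number of unused bytes in the
	// 32 bytes of keystream held in q4-q5. Rotate the keystream with
	// vtbl, using a window into the .Lpermute table, so that it lines
	// up with the last 32 bytes of the input, then XOR and store with
	// accesses that overlap the preceding block; the q0-q1 store below
	// rewrites the bytes clobbered by the overlap.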
	adr	lr, .Lpermute + 32
	add	r2, r2, r4
	add	lr, lr, r4
	add	r4, r4, r1

	vld1.8	{q2-q3}, [lr]
	vld1.8	{q6-q7}, [r2]

	add	r4, r4, #32

	vtbl.8	d4, {q4-q5}, d4
	vtbl.8	d5, {q4-q5}, d5
	vtbl.8	d6, {q4-q5}, d6
	vtbl.8	d7, {q4-q5}, d7

	veor	q6, q6, q2
	veor	q7, q7, q3

	vst1.8	{q6-q7}, [r4]		// overlapping stores
	vst1.8	{q0-q1}, [r1]
	pop	{r4, pc}

.Lfullblock:
	vmov	q11, q4
	vmov	q15, q5
	b	.Lout
.Lle96:
	vmov	q4, q2
	vmov	q5, q6
	b	.Lfinalblock
.Lle128:
	vmov	q4, q10
	vmov	q5, q14
	b	.Lfinalblock
.Lle224:
	vmov	q4, q3
	vmov	q5, q7
	b	.Lfinalblock
.Llt256:
	vmov	q4, q11
	vmov	q5, q15
	b	.Lpartialblock
ENDPROC(chacha_4block_xor_neon)

	.align	L1_CACHE_SHIFT
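// Two consecutive copies of the byte indices 0-31. A 32-byte window
// loaded at offset n (0 < n < 32) holds the indices n..31 followed by
// 0..n-1, so a vtbl.8 lookup with it rotates a 32-byte vector left by
// n bytes; .Lpartialblock uses this to align the keystream with the
// final partial block.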
.Lpermute:
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
	.byte	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f