1 | /* SPDX-License-Identifier: GPL-2.0-only */ |
2 | /* |
3 | * Copyright 2012 Xyratex Technology Limited |
4 | * |
5 | * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 |
6 | * calculation. |
7 | * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) |
8 | * PCLMULQDQ is a carry-less multiplication instruction (enumerated by its own CPUID feature flag, separate from SSE4.2); the reference can be found |
9 | * at: |
10 | * http://www.intel.com/products/processor/manuals/ |
11 | * Intel(R) 64 and IA-32 Architectures Software Developer's Manual |
12 | * Volume 2B: Instruction Set Reference, N-Z |
13 | * |
14 | * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com> |
15 | * Alexander Boyko <Alexander_Boyko@xyratex.com> |
16 | */ |
17 | |
18 | #include <linux/linkage.h> |
19 | |
20 | |
21 | .section .rodata |
22 | .align 16 |
23 | /* |
24 | * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 |
25 | * #define CONSTANT_R1 0x154442bd4LL (low qword of .Lconstant_R2R1) |
26 | * |
27 | * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 |
28 | * #define CONSTANT_R2 0x1c6e41596LL (high qword of .Lconstant_R2R1) |
29 | */ |
30 | .Lconstant_R2R1: |
31 | .octa 0x00000001c6e415960000000154442bd4 |
32 | /* |
33 | * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 |
34 | * #define CONSTANT_R3 0x1751997d0LL (low qword of .Lconstant_R4R3) |
35 | * |
36 | * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e |
37 | * #define CONSTANT_R4 0x0ccaa009eLL (high qword of .Lconstant_R4R3) |
38 | */ |
39 | .Lconstant_R4R3: |
40 | .octa 0x00000000ccaa009e00000001751997d0 |
41 | /* |
42 | * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 |
43 | * #define CONSTANT_R5 0x163cd6124LL (low qword of .Lconstant_R5, high qword is zero) |
44 | */ |
45 | .Lconstant_R5: |
46 | .octa 0x00000000000000000000000163cd6124 |
47 | .Lconstant_mask32: |
48 | .octa 0x000000000000000000000000FFFFFFFF /* keeps only the low 32 bits when applied with pand */ |
49 | /* |
50 | * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL (low qword of .Lconstant_RUpoly) |
51 | * |
52 | * Barrett Reduction constant (u64') = u' = (x**64 / P(x))' = 0x1F7011641LL |
53 | * #define CONSTANT_RU 0x1F7011641LL (high qword of .Lconstant_RUpoly) |
54 | */ |
55 | .Lconstant_RUpoly: |
56 | .octa 0x00000001F701164100000001DB710641 |
57 | |
58 | #define CONSTANT %xmm0 /* scratch: seeded with the initial CRC, then reloaded with each constant pair */ |
59 | |
60 | #ifdef __x86_64__ /* 64-bit build: arguments are read from %rdi, %rsi, %edx */ |
61 | #define BUF %rdi |
62 | #define LEN %rsi |
63 | #define CRC %edx |
64 | #else /* 32-bit build: %eax, %edx, %ecx (presumably the kernel regparm(3) convention - TODO confirm) */ |
65 | #define BUF %eax |
66 | #define LEN %edx |
67 | #define CRC %ecx |
68 | #endif |
69 | |
70 | |
71 | |
72 | .text |
73 | /** |
74 | * Calculate crc32 by folding the buffer with PCLMULQDQ, then Barrett-reducing to 32 bits |
75 | * BUF - buffer (16 bytes aligned; read with movdqa / aligned pxor memory operands) |
76 | * LEN - sizeof buffer (16 bytes aligned, unsigned), LEN should be greater than 63: the first 64 bytes are loaded unconditionally |
77 | * CRC - initial crc32 |
78 | * return %eax crc32; clobbers BUF, LEN, flags and %xmm0-%xmm7 (plus %xmm8 on x86_64) |
79 | * uint crc32_pclmul_le_16(unsigned char const *buffer, |
80 | * size_t len, uint crc32) |
81 | */ |
82 | |
83 | SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ |
84 | movdqa (BUF), %xmm1 |
85 | movdqa 0x10(BUF), %xmm2 |
86 | movdqa 0x20(BUF), %xmm3 |
87 | movdqa 0x30(BUF), %xmm4 |
88 | movd CRC, CONSTANT |
89 | pxor CONSTANT, %xmm1 /* xor the initial CRC into the low 32 bits of the first block */ |
90 | sub $0x40, LEN |
91 | add $0x40, BUF |
92 | cmp $0x40, LEN |
93 | jb .Lless_64 /* fewer than 64 bytes remain: skip the 4-lane loop */ |
94 | |
95 | #ifdef __x86_64__ |
96 | movdqa .Lconstant_R2R1(%rip), CONSTANT |
97 | #else |
98 | movdqa .Lconstant_R2R1, CONSTANT |
99 | #endif |
100 | |
101 | .Lloop_64:/* 64 bytes Full cache line folding */ |
102 | prefetchnta 0x40(BUF) |
103 | movdqa %xmm1, %xmm5 |
104 | movdqa %xmm2, %xmm6 |
105 | movdqa %xmm3, %xmm7 |
106 | #ifdef __x86_64__ |
107 | movdqa %xmm4, %xmm8 |
108 | #endif |
109 | pclmulqdq $0x00, CONSTANT, %xmm1 |
110 | pclmulqdq $0x00, CONSTANT, %xmm2 |
111 | pclmulqdq $0x00, CONSTANT, %xmm3 |
112 | #ifdef __x86_64__ |
113 | pclmulqdq $0x00, CONSTANT, %xmm4 |
114 | #endif |
115 | pclmulqdq $0x11, CONSTANT, %xmm5 |
116 | pclmulqdq $0x11, CONSTANT, %xmm6 |
117 | pclmulqdq $0x11, CONSTANT, %xmm7 |
118 | #ifdef __x86_64__ |
119 | pclmulqdq $0x11, CONSTANT, %xmm8 |
120 | #endif |
121 | pxor %xmm5, %xmm1 |
122 | pxor %xmm6, %xmm2 |
123 | pxor %xmm7, %xmm3 |
124 | #ifdef __x86_64__ |
125 | pxor %xmm8, %xmm4 |
126 | #else |
127 | /* xmm8 is unavailable in 32-bit mode: fold the fourth lane via xmm5, which is free again here */ |
128 | movdqa %xmm4, %xmm5 |
129 | pclmulqdq $0x00, CONSTANT, %xmm4 |
130 | pclmulqdq $0x11, CONSTANT, %xmm5 |
131 | pxor %xmm5, %xmm4 |
132 | #endif |
133 | |
134 | pxor (BUF), %xmm1 /* xor the next 64 input bytes into the four folded lanes */ |
135 | pxor 0x10(BUF), %xmm2 |
136 | pxor 0x20(BUF), %xmm3 |
137 | pxor 0x30(BUF), %xmm4 |
138 | |
139 | sub $0x40, LEN |
140 | add $0x40, BUF |
141 | cmp $0x40, LEN |
142 | jae .Lloop_64 /* LEN is an unsigned size_t: use the unsigned condition, like the jb at entry */ |
143 | .Lless_64:/* Folding cache line into 128bit */ |
144 | #ifdef __x86_64__ |
145 | movdqa .Lconstant_R4R3(%rip), CONSTANT |
146 | #else |
147 | movdqa .Lconstant_R4R3, CONSTANT |
148 | #endif |
149 | prefetchnta (BUF) |
150 | |
151 | movdqa %xmm1, %xmm5 |
152 | pclmulqdq $0x00, CONSTANT, %xmm1 |
153 | pclmulqdq $0x11, CONSTANT, %xmm5 |
154 | pxor %xmm5, %xmm1 |
155 | pxor %xmm2, %xmm1 /* xmm1 = fold(xmm1) ^ xmm2 */ |
156 | |
157 | movdqa %xmm1, %xmm5 |
158 | pclmulqdq $0x00, CONSTANT, %xmm1 |
159 | pclmulqdq $0x11, CONSTANT, %xmm5 |
160 | pxor %xmm5, %xmm1 |
161 | pxor %xmm3, %xmm1 /* xmm1 = fold(xmm1) ^ xmm3 */ |
162 | |
163 | movdqa %xmm1, %xmm5 |
164 | pclmulqdq $0x00, CONSTANT, %xmm1 |
165 | pclmulqdq $0x11, CONSTANT, %xmm5 |
166 | pxor %xmm5, %xmm1 |
167 | pxor %xmm4, %xmm1 /* xmm1 = fold(xmm1) ^ xmm4: all four lanes are now in xmm1 */ |
168 | |
169 | cmp $0x10, LEN |
170 | jb .Lfold_64 /* no whole 16-byte block left */ |
171 | .Lloop_16:/* Folding rest buffer into 128bit */ |
172 | movdqa %xmm1, %xmm5 |
173 | pclmulqdq $0x00, CONSTANT, %xmm1 |
174 | pclmulqdq $0x11, CONSTANT, %xmm5 |
175 | pxor %xmm5, %xmm1 |
176 | pxor (BUF), %xmm1 |
177 | sub $0x10, LEN |
178 | add $0x10, BUF |
179 | cmp $0x10, LEN |
180 | jae .Lloop_16 /* unsigned compare, consistent with the jb guard before the loop */ |
181 | |
182 | .Lfold_64: |
183 | /* perform the last 64 bit fold, also adds 32 zeroes |
184 | * to the input stream */ |
185 | pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ |
186 | psrldq $0x08, %xmm1 |
187 | pxor CONSTANT, %xmm1 |
188 | |
189 | /* final 32-bit fold */ |
190 | movdqa %xmm1, %xmm2 |
191 | #ifdef __x86_64__ |
192 | movdqa .Lconstant_R5(%rip), CONSTANT |
193 | movdqa .Lconstant_mask32(%rip), %xmm3 |
194 | #else |
195 | movdqa .Lconstant_R5, CONSTANT |
196 | movdqa .Lconstant_mask32, %xmm3 |
197 | #endif |
198 | psrldq $0x04, %xmm2 |
199 | pand %xmm3, %xmm1 /* keep only the low 32 bits for the multiply by R5 */ |
200 | pclmulqdq $0x00, CONSTANT, %xmm1 |
201 | pxor %xmm2, %xmm1 |
202 | |
203 | /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ |
204 | #ifdef __x86_64__ |
205 | movdqa .Lconstant_RUpoly(%rip), CONSTANT |
206 | #else |
207 | movdqa .Lconstant_RUpoly, CONSTANT |
208 | #endif |
209 | movdqa %xmm1, %xmm2 |
210 | pand %xmm3, %xmm1 |
211 | pclmulqdq $0x10, CONSTANT, %xmm1 /* multiply by the high qword of CONSTANT (RU) */ |
212 | pand %xmm3, %xmm1 |
213 | pclmulqdq $0x00, CONSTANT, %xmm1 /* multiply by the low qword of CONSTANT (polynomial) */ |
214 | pxor %xmm2, %xmm1 |
215 | pextrd $0x01, %xmm1, %eax /* the result is dword 1 of xmm1 */ |
216 | |
217 | RET |
218 | SYM_FUNC_END(crc32_pclmul_le_16) |
219 | |