/* SPDX-License-Identifier: GPL-2.0-or-later */
/* Copyright 2025 Google LLC */

/*
 * This file is a "template" that generates a CRC function optimized using the
 * RISC-V Zbc (scalar carryless multiplication) extension.  The includer of
 * this file must define the following parameters to specify the type of CRC:
 *
 *	crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
 *	LSB_CRC: 0 for an msb (most-significant-bit) first CRC, i.e. natural
 *		 mapping between bits and polynomial coefficients
 *		 1 for an lsb (least-significant-bit) first CRC, i.e. reflected
 *		 mapping between bits and polynomial coefficients
 */
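
/*
 * For example (an illustrative sketch only, not one of the real includers of
 * this file), a 32-bit lsb-first CRC could instantiate this template as:
 *
 *	#define crc_t	u32
 *	#define LSB_CRC	1
 *	#include "crc-clmul-template.h"
 */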
#include <asm/byteorder.h>
#include <linux/minmax.h>

#define CRC_BITS	(8 * sizeof(crc_t)) /* a.k.a. 'n' */

static inline unsigned long clmul(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmul %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulh(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulh %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}

static inline unsigned long clmulr(unsigned long a, unsigned long b)
{
	unsigned long res;

	asm(".option push\n"
	    ".option arch,+zbc\n"
	    "clmulr %0, %1, %2\n"
	    ".option pop\n"
	    : "=r" (res) : "r" (a), "r" (b));
	return res;
}
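
/*
 * An illustration of the three flavors, using a hypothetical 8-bit register
 * width for brevity (the real XLEN is 32 or 64): for a = 0x80 (x^7) and
 * b = 0x03 (x + 1), the full 16-bit carryless product is 0x0180 (x^8 + x^7),
 * so:
 *
 *	clmul(a, b)  = 0x80	(product bits 7..0)
 *	clmulh(a, b) = 0x01	(product bits 15..8)
 *	clmulr(a, b) = 0x03	(product bits 14..7)
 */
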
56 | |
57 | /* |
58 | * crc_load_long() loads one "unsigned long" of aligned data bytes, producing a |
59 | * polynomial whose bit order matches the CRC's bit order. |
60 | */ |
61 | #ifdef CONFIG_64BIT |
62 | # if LSB_CRC |
63 | # define crc_load_long(x) le64_to_cpup(x) |
64 | # else |
65 | # define crc_load_long(x) be64_to_cpup(x) |
66 | # endif |
67 | #else |
68 | # if LSB_CRC |
69 | # define crc_load_long(x) le32_to_cpup(x) |
70 | # else |
71 | # define crc_load_long(x) be32_to_cpup(x) |
72 | # endif |
73 | #endif |
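
/*
 * For example, with an lsb-first CRC on a little-endian kernel,
 * crc_load_long() is just a native load, since le64_to_cpup() and
 * le32_to_cpup() are no-ops there.
 */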

/* XOR @crc into the end of @msgpoly that represents the high-order terms. */
static inline unsigned long
crc_clmul_prep(crc_t crc, unsigned long msgpoly)
{
#if LSB_CRC
	return msgpoly ^ crc;
#else
	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
#endif
}
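
/*
 * For example, with a 32-bit msb-first CRC on a 64-bit kernel, the high-order
 * terms of the message polynomial live in the physically high bits, so
 * crc_clmul_prep() XORs @crc into bits 63..32 of @msgpoly.
 */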

/*
 * Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
 * modulo the generator polynomial G.  This gives the CRC of @msgpoly.
 */
static inline crc_t
crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
{
	unsigned long tmp;

	/*
	 * First step of Barrett reduction with integrated multiplication by
	 * x^n: calculate floor((msgpoly * x^n) / G).  This is the value by
	 * which G needs to be multiplied to cancel out the x^n and higher
	 * terms of msgpoly * x^n.  Do it using the following formula:
	 *
	 * msb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
	 * lsb-first:
	 *    floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
	 *
	 * barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
	 * which fits a long exactly.  Using any lower power of x there would
	 * not carry enough precision through the calculation, while using any
	 * higher power of x would require extra instructions to handle a wider
	 * multiplication.  In the msb-first case, using this power of x
	 * results in needing a floored division by x^(BITS_PER_LONG-1), which
	 * matches what clmulr produces.  In the lsb-first case, a factor of x
	 * gets implicitly introduced by each carryless multiplication (shown
	 * as '* x' above), and the floored division instead needs to be by
	 * x^BITS_PER_LONG which matches what clmul produces.
	 */
#if LSB_CRC
	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
#else
	tmp = clmulr(msgpoly, consts->barrett_reduction_const_1);
#endif

	/*
	 * Second step of Barrett reduction:
	 *
	 *    crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
	 *
	 * This reduces (msgpoly * x^n) modulo G by adding the appropriate
	 * multiple of G to it.  The result uses only the x^0..x^(n-1) terms.
	 * HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
	 * terms in the first place, it is more efficient to do the equivalent:
	 *
	 *    crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
	 *
	 * In the lsb-first case, further modify this to the following, which
	 * avoids a shift since the CRC ends up in the physically low n bits of
	 * the clmulr result:
	 *
	 *    product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
	 *    crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
	 *
	 * barrett_reduction_const_2 contains the constant multiplier (G - x^n)
	 * or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above.  The
	 * cast of the result to crc_t is essential, as it applies the mod x^n!
	 */
#if LSB_CRC
	return clmulr(tmp, consts->barrett_reduction_const_2);
#else
	return clmul(tmp, consts->barrett_reduction_const_2);
#endif
}
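
/*
 * For reference, in the msb-first case crc_clmul_long() computes the same
 * value as this portable bit-at-a-time sketch would (G_POLY is a hypothetical
 * macro standing for the generator polynomial with its x^n term dropped; this
 * template does not define it):
 *
 *	crc_t crc = 0;
 *	int i;
 *
 *	for (i = BITS_PER_LONG - 1; i >= 0; i--) {
 *		int bit = (crc >> (CRC_BITS - 1)) ^ ((msgpoly >> i) & 1);
 *
 *		crc <<= 1;
 *		if (bit)
 *			crc ^= G_POLY;
 *	}
 */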

/* Update @crc with the data from @msgpoly. */
static inline crc_t
crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
		      const struct crc_clmul_consts *consts)
{
	return crc_clmul_long(crc_clmul_prep(crc, msgpoly), consts);
}

/* Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. */
static inline crc_t
crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
			 const struct crc_clmul_consts *consts)
{
	unsigned long msgpoly;
	size_t i;

#if LSB_CRC
	msgpoly = (unsigned long)p[0] << (BITS_PER_LONG - 8);
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly >> 8) ^ ((unsigned long)p[i] << (BITS_PER_LONG - 8));
#else
	msgpoly = p[0];
	for (i = 1; i < len; i++)
		msgpoly = (msgpoly << 8) ^ p[i];
#endif

	if (len >= sizeof(crc_t)) {
#if LSB_CRC
		msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
#else
		msgpoly ^= (unsigned long)crc << (8*len - CRC_BITS);
#endif
		return crc_clmul_long(msgpoly, consts);
	}
#if LSB_CRC
	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc >> (8*len));
#else
	msgpoly ^= crc >> (CRC_BITS - 8*len);
	return crc_clmul_long(msgpoly, consts) ^ (crc << (8*len));
#endif
}
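
/*
 * For example, for an msb-first CRC with len == 3, the loop above assembles
 * msgpoly = (p[0] << 16) ^ (p[1] << 8) ^ p[2], i.e. the three data bytes in
 * polynomial order with p[0] contributing the highest-order terms.
 */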

static inline crc_t
crc_clmul(crc_t crc, const void *p, size_t len,
	  const struct crc_clmul_consts *consts)
{
	size_t align;

	/* This implementation assumes that the CRC fits in an unsigned long. */
	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));

	/* If the buffer is not long-aligned, align it. */
	align = (unsigned long)p % sizeof(unsigned long);
	if (align && len) {
		align = min(sizeof(unsigned long) - align, len);
		crc = crc_clmul_update_partial(crc, p, align, consts);
		p += align;
		len -= align;
	}

	if (len >= 4 * sizeof(unsigned long)) {
		unsigned long m0, m1;

		m0 = crc_clmul_prep(crc, crc_load_long(p));
		m1 = crc_load_long(p + sizeof(unsigned long));
		p += 2 * sizeof(unsigned long);
		len -= 2 * sizeof(unsigned long);
		/*
		 * Main loop.  Each iteration starts with a message polynomial
		 * (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
		 * more longs of data to form x^(3*BITS_PER_LONG)*m0 +
		 * x^(2*BITS_PER_LONG)*m1 + x^BITS_PER_LONG*m2 + m3, then
		 * "folds" that back into a congruent (modulo G) value that
		 * uses just m0 and m1 again.  This is done by multiplying m0
		 * by the precomputed constant (x^(3*BITS_PER_LONG) mod G) and
		 * m1 by the precomputed constant (x^(2*BITS_PER_LONG) mod G),
		 * then adding the results to m2 and m3 as appropriate.  Each
		 * such multiplication produces a double-long product, which
		 * RISC-V computes as two halves using the clmul and clmulh
		 * instructions.
		 *
		 * This could be changed to fold across more than 2 longs at a
		 * time if there is a CPU that can take advantage of it.
		 */
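		/*
		 * In other words, with L = BITS_PER_LONG the fold relies on
		 * the congruence (everything mod G):
		 *
		 *	x^(3L)*m0 + x^(2L)*m1 + x^L*m2 + m3
		 *	    == (x^(3L) mod G)*m0 + (x^(2L) mod G)*m1
		 *	       + x^L*m2 + m3
		 *
		 * where each product on the right spans at most two longs and
		 * so can be XORed directly into the (m2, m3) pair.
		 */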
		do {
			unsigned long p0, p1, p2, p3;

			p0 = clmulh(m0, consts->fold_across_2_longs_const_hi);
			p1 = clmul(m0, consts->fold_across_2_longs_const_hi);
			p2 = clmulh(m1, consts->fold_across_2_longs_const_lo);
			p3 = clmul(m1, consts->fold_across_2_longs_const_lo);
			m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
			m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
			     crc_load_long(p + sizeof(unsigned long));

			p += 2 * sizeof(unsigned long);
			len -= 2 * sizeof(unsigned long);
		} while (len >= 2 * sizeof(unsigned long));

		crc = crc_clmul_long(m0, consts);
		crc = crc_clmul_update_long(crc, m1, consts);
	}

	while (len >= sizeof(unsigned long)) {
		crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
		p += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}

	if (len)
		crc = crc_clmul_update_partial(crc, p, len, consts);

	return crc;
}
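
/*
 * Example usage (a minimal sketch; the consts table and the wrapper shown
 * here are hypothetical, not defined by this template):
 *
 *	static const struct crc_clmul_consts crc32_lsb_consts = { ... };
 *
 *	u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
 *	{
 *		return crc_clmul(crc, p, len, &crc32_lsb_consts);
 *	}
 */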