crc-clmul-template.h source code [linux/arch/riscv/lib/crc-clmul-template.h]

1	/ SPDX-License-Identifier: GPL-2.0-or-later /
2	/ Copyright 2025 Google LLC /
3
4	/*
5	* This file is a "template" that generates a CRC function optimized using the
6	* RISC-V Zbc (scalar carryless multiplication) extension. The includer of this
7	* file must define the following parameters to specify the type of CRC:
8	*
9	* crc_t: the data type of the CRC, e.g. u32 for a 32-bit CRC
10	* LSB_CRC: 0 for a msb (most-significant-bit) first CRC, i.e. natural
11	* mapping between bits and polynomial coefficients
12	* 1 for a lsb (least-significant-bit) first CRC, i.e. reflected
13	* mapping between bits and polynomial coefficients
14	*/
15
16	#include <asm/byteorder.h>
17	#include <linux/minmax.h>
18
19	#define CRC_BITS (8 * sizeof(crc_t)) /* a.k.a. 'n' */
20
21	static inline unsigned long clmul(unsigned long a, unsigned long b)
22	{
23	unsigned long res;
24
25	asm(".option push\n"
26	".option arch,+zbc\n"
27	"clmul %0, %1, %2\n"
28	".option pop\n"
29	: "=r" (res) : "r" (a), "r" (b));
30	return res;
31	}
32
33	static inline unsigned long clmulh(unsigned long a, unsigned long b)
34	{
35	unsigned long res;
36
37	asm(".option push\n"
38	".option arch,+zbc\n"
39	"clmulh %0, %1, %2\n"
40	".option pop\n"
41	: "=r" (res) : "r" (a), "r" (b));
42	return res;
43	}
44
45	static inline unsigned long clmulr(unsigned long a, unsigned long b)
46	{
47	unsigned long res;
48
49	asm(".option push\n"
50	".option arch,+zbc\n"
51	"clmulr %0, %1, %2\n"
52	".option pop\n"
53	: "=r" (res) : "r" (a), "r" (b));
54	return res;
55	}
56
57	/*
58	* crc_load_long() loads one "unsigned long" of aligned data bytes, producing a
59	* polynomial whose bit order matches the CRC's bit order.
60	*/
61	#ifdef CONFIG_64BIT
62	# if LSB_CRC
63	# define crc_load_long(x) le64_to_cpup(x)
64	# else
65	# define crc_load_long(x) be64_to_cpup(x)
66	# endif
67	#else
68	# if LSB_CRC
69	# define crc_load_long(x) le32_to_cpup(x)
70	# else
71	# define crc_load_long(x) be32_to_cpup(x)
72	# endif
73	#endif
74
75	/ XOR @crc into the end of @msgpoly that represents the high-order terms. /
76	static inline unsigned long
77	crc_clmul_prep(crc_t crc, unsigned long msgpoly)
78	{
79	#if LSB_CRC
80	return msgpoly ^ crc;
81	#else
82	return msgpoly ^ ((unsigned long)crc << (BITS_PER_LONG - CRC_BITS));
83	#endif
84	}
85
86	/*
87	* Multiply the long-sized @msgpoly by x^n (a.k.a. x^CRC_BITS) and reduce it
88	* modulo the generator polynomial G. This gives the CRC of @msgpoly.
89	*/
90	static inline crc_t
91	crc_clmul_long(unsigned long msgpoly, const struct crc_clmul_consts *consts)
92	{
93	unsigned long tmp;
94
95	/*
96	* First step of Barrett reduction with integrated multiplication by
97	* x^n: calculate floor((msgpoly * x^n) / G). This is the value by
98	* which G needs to be multiplied to cancel out the x^n and higher terms
99	* of msgpoly * x^n. Do it using the following formula:
100	*
101	* msb-first:
102	* floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G)) / x^(BITS_PER_LONG-1))
103	* lsb-first:
104	* floor((msgpoly * floor(x^(BITS_PER_LONG-1+n) / G) * x) / x^BITS_PER_LONG)
105	*
106	* barrett_reduction_const_1 contains floor(x^(BITS_PER_LONG-1+n) / G),
107	* which fits a long exactly. Using any lower power of x there would
108	* not carry enough precision through the calculation, while using any
109	* higher power of x would require extra instructions to handle a wider
110	* multiplication. In the msb-first case, using this power of x results
111	* in needing a floored division by x^(BITS_PER_LONG-1), which matches
112	* what clmulr produces. In the lsb-first case, a factor of x gets
113	* implicitly introduced by each carryless multiplication (shown as
114	* '* x' above), and the floored division instead needs to be by
115	* x^BITS_PER_LONG which matches what clmul produces.
116	*/
117	#if LSB_CRC
118	tmp = clmul(msgpoly, consts->barrett_reduction_const_1);
119	#else
120	tmp = clmulr(a: msgpoly, b: consts->barrett_reduction_const_1);
121	#endif
122
123	/*
124	* Second step of Barrett reduction:
125	*
126	* crc := (msgpoly * x^n) + (G * floor((msgpoly * x^n) / G))
127	*
128	* This reduces (msgpoly * x^n) modulo G by adding the appropriate
129	* multiple of G to it. The result uses only the x^0..x^(n-1) terms.
130	* HOWEVER, since the unreduced value (msgpoly * x^n) is zero in those
131	* terms in the first place, it is more efficient to do the equivalent:
132	*
133	* crc := ((G - x^n) * floor((msgpoly * x^n) / G)) mod x^n
134	*
135	* In the lsb-first case further modify it to the following which avoids
136	* a shift, as the crc ends up in the physically low n bits from clmulr:
137	*
138	* product := ((G - x^n) * x^(BITS_PER_LONG - n)) * floor((msgpoly * x^n) / G) * x
139	* crc := floor(product / x^(BITS_PER_LONG + 1 - n)) mod x^n
140	*
141	* barrett_reduction_const_2 contains the constant multiplier (G - x^n)
142	* or (G - x^n) * x^(BITS_PER_LONG - n) from the formulas above. The
143	* cast of the result to crc_t is essential, as it applies the mod x^n!
144	*/
145	#if LSB_CRC
146	return clmulr(tmp, consts->barrett_reduction_const_2);
147	#else
148	return clmul(a: tmp, b: consts->barrett_reduction_const_2);
149	#endif
150	}
151
152	/ Update @crc with the data from @msgpoly. /
153	static inline crc_t
154	crc_clmul_update_long(crc_t crc, unsigned long msgpoly,
155	const struct crc_clmul_consts *consts)
156	{
157	return crc_clmul_long(msgpoly: crc_clmul_prep(crc, msgpoly), consts);
158	}
159
160	/ Update @crc with 1 <= @len < sizeof(unsigned long) bytes of data. /
161	static inline crc_t
162	crc_clmul_update_partial(crc_t crc, const u8 *p, size_t len,
163	const struct crc_clmul_consts *consts)
164	{
165	unsigned long msgpoly;
166	size_t i;
167
168	#if LSB_CRC
169	msgpoly = (unsigned long)p[`0`] << (BITS_PER_LONG - `8`);
170	for (i = `1`; i < len; i++)
171	msgpoly = (msgpoly >> `8`) ^ ((unsigned long)p[i] << (BITS_PER_LONG - `8`));
172	#else
173	msgpoly = p[`0`];
174	for (i = `1`; i < len; i++)
175	msgpoly = (msgpoly << `8`) ^ p[i];
176	#endif
177
178	if (len >= sizeof(crc_t)) {
179	#if LSB_CRC
180	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - `8`*len);
181	#else
182	msgpoly ^= (unsigned long)crc << (`8`*len - CRC_BITS);
183	#endif
184	return crc_clmul_long(msgpoly, consts);
185	}
186	#if LSB_CRC
187	msgpoly ^= (unsigned long)crc << (BITS_PER_LONG - `8`*len);
188	return crc_clmul_long(msgpoly, consts) ^ (crc >> (`8`*len));
189	#else
190	msgpoly ^= crc >> (CRC_BITS - `8`*len);
191	return crc_clmul_long(msgpoly, consts) ^ (crc << (`8`*len));
192	#endif
193	}
194
195	static inline crc_t
196	crc_clmul(crc_t crc, const void *p, size_t len,
197	const struct crc_clmul_consts *consts)
198	{
199	size_t align;
200
201	/ This implementation assumes that the CRC fits in an unsigned long. /
202	BUILD_BUG_ON(sizeof(crc_t) > sizeof(unsigned long));
203
204	/ If the buffer is not long-aligned, align it. /
205	align = (unsigned long)p % sizeof(unsigned long);
206	if (align && len) {
207	align = min(sizeof(unsigned long) - align, len);
208	crc = crc_clmul_update_partial(crc, p, len: align, consts);
209	p += align;
210	len -= align;
211	}
212
213	if (len >= `4` * sizeof(unsigned long)) {
214	unsigned long m0, m1;
215
216	m0 = crc_clmul_prep(crc, crc_load_long(p));
217	m1 = crc_load_long(p + sizeof(unsigned long));
218	p += `2` * sizeof(unsigned long);
219	len -= `2` * sizeof(unsigned long);
220	/*
221	* Main loop. Each iteration starts with a message polynomial
222	* (x^BITS_PER_LONG)*m0 + m1, then logically extends it by two
223	* more longs of data to form x^(3BITS_PER_LONG)m0 +
224	* x^(2BITS_PER_LONG)m1 + x^BITS_PER_LONG*m2 + m3, then
225	* "folds" that back into a congruent (modulo G) value that uses
226	* just m0 and m1 again. This is done by multiplying m0 by the
227	* precomputed constant (x^(3*BITS_PER_LONG) mod G) and m1 by
228	* the precomputed constant (x^(2*BITS_PER_LONG) mod G), then
229	* adding the results to m2 and m3 as appropriate. Each such
230	* multiplication produces a result twice the length of a long,
231	* which in RISC-V is two instructions clmul and clmulh.
232	*
233	* This could be changed to fold across more than 2 longs at a
234	* time if there is a CPU that can take advantage of it.
235	*/
236	do {
237	unsigned long p0, p1, p2, p3;
238
239	p0 = clmulh(a: m0, b: consts->fold_across_2_longs_const_hi);
240	p1 = clmul(a: m0, b: consts->fold_across_2_longs_const_hi);
241	p2 = clmulh(a: m1, b: consts->fold_across_2_longs_const_lo);
242	p3 = clmul(a: m1, b: consts->fold_across_2_longs_const_lo);
243	m0 = (LSB_CRC ? p1 ^ p3 : p0 ^ p2) ^ crc_load_long(p);
244	m1 = (LSB_CRC ? p0 ^ p2 : p1 ^ p3) ^
245	crc_load_long(p + sizeof(unsigned long));
246
247	p += `2` * sizeof(unsigned long);
248	len -= `2` * sizeof(unsigned long);
249	} while (len >= `2` * sizeof(unsigned long));
250
251	crc = crc_clmul_long(msgpoly: m0, consts);
252	crc = crc_clmul_update_long(crc, msgpoly: m1, consts);
253	}
254
255	while (len >= sizeof(unsigned long)) {
256	crc = crc_clmul_update_long(crc, crc_load_long(p), consts);
257	p += sizeof(unsigned long);
258	len -= sizeof(unsigned long);
259	}
260
261	if (len)
262	crc = crc_clmul_update_partial(crc, p, len, consts);
263
264	return crc;
265	}
266

source code of linux/arch/riscv/lib/crc-clmul-template.h