crc32le-vx.c source code [linux/arch/s390/crypto/crc32le-vx.c]

/* SPDX-License-Identifier: GPL-2.0 */
2	/*
3	* Hardware-accelerated CRC-32 variants for Linux on z Systems
4	*
5	* Use the z/Architecture Vector Extension Facility to accelerate the
6	* computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
7	* and Castagnoli.
8	*
9	* This CRC-32 implementation algorithm is bitreflected and processes
10	* the least-significant bit first (Little-Endian).
11	*
12	* Copyright IBM Corp. 2015
13	* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
14	*/
15
16	#include <linux/types.h>
17	#include <asm/fpu.h>
18	#include "crc32-vx.h"
19
/*
 * Vector register numbers that hold the CRC-32 constants.
 *
 * crc32_le_vgfm_generic() fills the contiguous range
 * CONST_PERM_LE2BE..CONST_CRC_POLY (V9..V14) from the constant pool with
 * one multiple-load, so these numbers must stay consecutive and in the same
 * order as the entries of the constant tables.
 */
#define CONST_PERM_LE2BE	9
#define CONST_R2R1		10
#define CONST_R4R3		11
#define CONST_R5		12
#define CONST_RU_POLY		13
#define CONST_CRC_POLY		14
27
/*
 * The CRC-32 constant block contains reduction constants to fold and
 * process particular chunks of the input data stream in parallel.
 *
 * For the CRC-32 variants, the constants are precomputed according to
 * these definitions:
 *
 *	R1 = [(x^(4*128+32) mod P'(x) << 32)]' << 1
 *	R2 = [(x^(4*128-32) mod P'(x) << 32)]' << 1
 *	R3 = [(x^(128+32) mod P'(x) << 32)]' << 1
 *	R4 = [(x^(128-32) mod P'(x) << 32)]' << 1
 *	R5 = [(x^64 mod P'(x) << 32)]' << 1
 *	R6 = [(x^32 mod P'(x) << 32)]' << 1
 *
 *	The bitreflected Barrett reduction constant, u', is defined as
 *	the bit reversal of floor(x^64 / P(x)),
 *
 *	where P(x) is the polynomial in the normal domain and P'(x) is the
 *	polynomial in the reversed (bitreflected) domain.
 *
 * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
 *
 *	P(x)  = 0x04C11DB7
 *	P'(x) = 0xEDB88320
 *
 * CRC-32C (Castagnoli) polynomials:
 *
 *	P(x)  = 0x1EDC6F41
 *	P'(x) = 0x82F63B78
 */
58
/*
 * Constant pool for the CRC-32 (IEEE 802.3) polynomial.
 *
 * Twelve doublewords, i.e. six 128-bit vectors, in the order in which
 * crc32_le_vgfm_generic() loads them into CONST_PERM_LE2BE..CONST_CRC_POLY.
 * Each row is one vector: first the leftmost, then the rightmost doubleword.
 */
static unsigned long constants_CRC_32_LE[] = {
	0x0f0e0d0c0b0a0908, 0x0706050403020100,	/* BE->LE mask */
	0x1c6e41596, 0x154442bd4,		/* R2, R1 */
	0x0ccaa009e, 0x1751997d0,		/* R4, R3 */
	0x0, 0x163cd6124,			/* R5 */
	0x0, 0x1f7011641,			/* u' */
	0x0, 0x1db710641			/* P'(x) << 1 */
};
67
/*
 * Constant pool for the CRC-32C (Castagnoli) polynomial.
 *
 * Twelve doublewords, i.e. six 128-bit vectors, in the order in which
 * crc32_le_vgfm_generic() loads them into CONST_PERM_LE2BE..CONST_CRC_POLY.
 * Each row is one vector: first the leftmost, then the rightmost doubleword.
 */
static unsigned long constants_CRC_32C_LE[] = {
	0x0f0e0d0c0b0a0908, 0x0706050403020100,	/* BE->LE mask */
	0x09e4addf8, 0x740eef02,		/* R2, R1 */
	0x14cd00bd6, 0xf20c0dfe,		/* R4, R3 */
	0x0, 0x0dd45aab8,			/* R5 */
	0x0, 0x0dea713f1,			/* u' */
	0x0, 0x105ec76f0			/* P'(x) << 1 */
};
76
77	/**
78	* crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers
79	* @crc: Initial CRC value, typically ~0.
80	* @buf: Input buffer pointer, performance might be improved if the
81	* buffer is on a doubleword boundary.
82	* @size: Size of the buffer, must be 64 bytes or greater.
83	* @constants: CRC-32 constant pool base pointer.
84	*
85	* Register usage:
86	* V0: Initial CRC value and intermediate constants and results.
87	* V1..V4: Data for CRC computation.
88	* V5..V8: Next data chunks that are fetched from the input buffer.
89	* V9: Constant for BE->LE conversion and shift operations
90	* V10..V14: CRC-32 constants.
91	*/
92	static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const buf, size_t size, unsigned* long *constants)
93	{
94	/ Load CRC-32 constants /
95	fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants);
96
97	/*
98	* Load the initial CRC value.
99	*
100	* The CRC value is loaded into the rightmost word of the
101	* vector register and is later XORed with the LSB portion
102	* of the loaded input data.
103	*/
104	fpu_vzero(`0`); / Clear V0 /
105	fpu_vlvgf(`0`, crc, `3`); / Load CRC into rightmost word /
106
107	/ Load a 64-byte data chunk and XOR with CRC /
108	fpu_vlm(`1`, `4`, buf);
109	fpu_vperm(`1`, `1`, `1`, CONST_PERM_LE2BE);
110	fpu_vperm(`2`, `2`, `2`, CONST_PERM_LE2BE);
111	fpu_vperm(`3`, `3`, `3`, CONST_PERM_LE2BE);
112	fpu_vperm(`4`, `4`, `4`, CONST_PERM_LE2BE);
113
114	fpu_vx(`1`, `0`, `1`); / V1 ^= CRC /
115	buf += `64`;
116	size -= `64`;
117
118	while (size >= `64`) {
119	fpu_vlm(`5`, `8`, buf);
120	fpu_vperm(`5`, `5`, `5`, CONST_PERM_LE2BE);
121	fpu_vperm(`6`, `6`, `6`, CONST_PERM_LE2BE);
122	fpu_vperm(`7`, `7`, `7`, CONST_PERM_LE2BE);
123	fpu_vperm(`8`, `8`, `8`, CONST_PERM_LE2BE);
124	/*
125	* Perform a GF(2) multiplication of the doublewords in V1 with
126	* the R1 and R2 reduction constants in V0. The intermediate
127	* result is then folded (accumulated) with the next data chunk
128	* in V5 and stored in V1. Repeat this step for the register
129	* contents in V2, V3, and V4 respectively.
130	*/
131	fpu_vgfmag(`1`, CONST_R2R1, `1`, `5`);
132	fpu_vgfmag(`2`, CONST_R2R1, `2`, `6`);
133	fpu_vgfmag(`3`, CONST_R2R1, `3`, `7`);
134	fpu_vgfmag(`4`, CONST_R2R1, `4`, `8`);
135	buf += `64`;
136	size -= `64`;
137	}
138
139	/*
140	* Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
141	* and R4 and accumulating the next 128-bit chunk until a single 128-bit
142	* value remains.
143	*/
144	fpu_vgfmag(`1`, CONST_R4R3, `1`, `2`);
145	fpu_vgfmag(`1`, CONST_R4R3, `1`, `3`);
146	fpu_vgfmag(`1`, CONST_R4R3, `1`, `4`);
147
148	while (size >= `16`) {
149	fpu_vl(`2`, buf);
150	fpu_vperm(`2`, `2`, `2`, CONST_PERM_LE2BE);
151	fpu_vgfmag(`1`, CONST_R4R3, `1`, `2`);
152	buf += `16`;
153	size -= `16`;
154	}
155
156	/*
157	* Set up a vector register for byte shifts. The shift value must
158	* be loaded in bits 1-4 in byte element 7 of a vector register.
159	* Shift by 8 bytes: 0x40
160	* Shift by 4 bytes: 0x20
161	*/
162	fpu_vleib(`9`, `0x40`, `7`);
163
164	/*
165	* Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
166	* to move R4 into the rightmost doubleword and set the leftmost
167	* doubleword to 0x1.
168	*/
169	fpu_vsrlb(`0`, CONST_R4R3, `9`);
170	fpu_vleig(`0`, `1`, `0`);
171
172	/*
173	* Compute GF(2) product of V1 and V0. The rightmost doubleword
174	* of V1 is multiplied with R4. The leftmost doubleword of V1 is
175	* multiplied by 0x1 and is then XORed with rightmost product.
176	* Implicitly, the intermediate leftmost product becomes padded
177	*/
178	fpu_vgfmg(`1`, `0`, `1`);
179
180	/*
181	* Now do the final 32-bit fold by multiplying the rightmost word
182	* in V1 with R5 and XOR the result with the remaining bits in V1.
183	*
184	* To achieve this by a single VGFMAG, right shift V1 by a word
185	* and store the result in V2 which is then accumulated. Use the
186	* vector unpack instruction to load the rightmost half of the
187	* doubleword into the rightmost doubleword element of V1; the other
188	* half is loaded in the leftmost doubleword.
189	* The vector register with CONST_R5 contains the R5 constant in the
190	* rightmost doubleword and the leftmost doubleword is zero to ignore
191	* the leftmost product of V1.
192	*/
193	fpu_vleib(`9`, `0x20`, `7`); / Shift by words /
194	fpu_vsrlb(`2`, `1`, `9`); / Store remaining bits in V2 /
195	fpu_vupllf(`1`, `1`); / Split rightmost doubleword /
196	fpu_vgfmag(`1`, CONST_R5, `1`, `2`); / V1 = (V1 * R5) XOR V2 /
197
198	/*
199	* Apply a Barret reduction to compute the final 32-bit CRC value.
200	*
201	* The input values to the Barret reduction are the degree-63 polynomial
202	* in V1 (R(x)), degree-32 generator polynomial, and the reduction
203	* constant u. The Barret reduction result is the CRC value of R(x) mod
204	* P(x).
205	*
206	* The Barret reduction algorithm is defined as:
207	*
208	* 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
209	* 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
210	* 3. C(x) = R(x) XOR T2(x) mod x^32
211	*
212	* Note: The leftmost doubleword of vector register containing
213	* CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
214	* is zero and does not contribute to the final result.
215	*/
216
217	/ T1(x) = floor( R(x) / x^32 ) GF2MUL u /
218	fpu_vupllf(`2`, `1`);
219	fpu_vgfmg(`2`, CONST_RU_POLY, `2`);
220
221	/*
222	* Compute the GF(2) product of the CRC polynomial with T1(x) in
223	* V2 and XOR the intermediate result, T2(x), with the value in V1.
224	* The final result is stored in word element 2 of V2.
225	*/
226	fpu_vupllf(`2`, `2`);
227	fpu_vgfmag(`2`, CONST_CRC_POLY, `2`, `1`);
228
229	return fpu_vlgvf(`2`, `2`);
230	}
231
232	u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
233	{
234	return crc32_le_vgfm_generic(crc, buf, size, constants: &constants_CRC_32_LE[`0`]);
235	}
236
237	u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size)
238	{
239	return crc32_le_vgfm_generic(crc, buf, size, constants: &constants_CRC_32C_LE[`0`]);
240	}
241

source code of linux/arch/s390/crypto/crc32le-vx.c