crc32.h source code [linux/lib/crc/x86/crc32.h]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* x86-optimized CRC32 functions
4	*
5	* Copyright (C) 2008 Intel Corporation
6	* Copyright 2012 Xyratex Technology Limited
7	* Copyright 2024 Google LLC
8	*/
9
10	#include "crc-pclmul-template.h"
11
12	static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
13	static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
14	static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);
15
16	DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
17
18	static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
19	{
20	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
21	have_pclmulqdq);
22	return crc32_le_base(crc, p, len);
23	}
24
25	#ifdef CONFIG_X86_64
26	#define CRC32_INST "crc32q %1, %q0"
27	#else
28	#define CRC32_INST "crc32l %1, %0"
29	#endif
30
31	/*
32	* Use carryless multiply version of crc32c when buffer size is >= 512 to
33	* account for FPU state save/restore overhead.
34	*/
35	#define CRC32C_PCLMUL_BREAKEVEN 512
36
37	asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
38
39	static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
40	{
41	size_t num_longs;
42
43	if (!static_branch_likely(&have_crc32))
44	return crc32c_base(crc, p, len);
45
46	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
47	static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) {
48	/*
49	* Long length, the vector registers are usable, and the CPU is
50	* 64-bit and supports both CRC32 and PCLMULQDQ instructions.
51	* It is worthwhile to divide the data into multiple streams,
52	* CRC them independently, and combine them using PCLMULQDQ.
53	* crc32c_x86_3way() does this using 3 streams, which is the
54	* most that x86_64 CPUs have traditionally been capable of.
55	*
56	* However, due to improved VPCLMULQDQ performance on newer
57	* CPUs, use crc32_lsb_vpclmul_avx512() instead of
58	* crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
59	* "good" implementation of AVX-512.
60	*
61	* Future work: the optimal strategy on Zen 3--5 is actually to
62	* use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
63	* different numbers of streams and vector lengths are optimal
64	* on each CPU microarchitecture, making it challenging to take
65	* advantage of this. (Zen 5 even supports 7 parallel crc32q, a
66	* major upgrade.) For now, just choose between
67	* crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
68	* is needed anyway for crc32_le(), so we just reuse it here.
69	*/
70	kernel_fpu_begin();
71	if (static_branch_likely(&have_vpclmul_avx512))
72	crc = crc32_lsb_vpclmul_avx512(crc, p, len,
73	consts_ptr: crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
74	else
75	crc = crc32c_x86_3way(crc, buffer: p, len);
76	kernel_fpu_end();
77	return crc;
78	}
79
80	/*
81	* Short length, XMM registers unusable, or the CPU is 32-bit; but the
82	* CPU supports CRC32 instructions. Just issue a single stream of CRC32
83	* instructions inline. While this doesn't use the CPU's CRC32
84	* throughput very well, it avoids the need to combine streams. Stream
85	* combination would be inefficient here.
86	*/
87
88	for (num_longs = len / sizeof(unsigned long);
89	num_longs != `0`; num_longs--, p += sizeof(unsigned long))
90	asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM ((unsigned* long *)p));
91
92	if (sizeof(unsigned long) > `4` && (len & `4`)) {
93	asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM ((u32 )p));
94	p += `4`;
95	}
96	if (len & `2`) {
97	asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM ((u16 )p));
98	p += `2`;
99	}
100	if (len & `1`)
101	asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
102
103	return crc;
104	}
105
/*
 * No x86-specific crc32_be implementation is provided here (not implemented
 * on this arch), so the arch hook is simply an alias for crc32_be_base.
 */
#define crc32_be_arch crc32_be_base
107
108	#define crc32_mod_init_arch crc32_mod_init_arch
109	static void crc32_mod_init_arch(void)
110	{
111	if (boot_cpu_has(X86_FEATURE_XMM4_2))
112	static_branch_enable(&have_crc32);
113	if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) {
114	static_branch_enable(&have_pclmulqdq);
115	if (have_vpclmul()) {
116	if (have_avx512()) {
117	static_call_update(crc32_lsb_pclmul,
118	crc32_lsb_vpclmul_avx512);
119	static_branch_enable(&have_vpclmul_avx512);
120	} else {
121	static_call_update(crc32_lsb_pclmul,
122	crc32_lsb_vpclmul_avx2);
123	}
124	}
125	}
126	}
127
128	static inline u32 crc32_optimizations_arch(void)
129	{
130	u32 optimizations = `0`;
131
132	if (static_key_enabled(&have_crc32))
133	optimizations \|= CRC32C_OPTIMIZATION;
134	if (static_key_enabled(&have_pclmulqdq))
135	optimizations \|= CRC32_LE_OPTIMIZATION;
136	return optimizations;
137	}
138

source code of linux/lib/crc/x86/crc32.h