// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::arch::asm;
use core::{intrinsics, mem};
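
// The "ermsb" check above is purely compile-time. For illustration, the sketch
// below (compiled out, not used by this module) shows how ERMSB could instead be
// detected at runtime via CPUID leaf 7, sub-leaf 0. The bit position used here
// (EBX bit 9, "ERMS") is taken from the Intel SDM's CPUID documentation; treat it
// as an assumption to verify against the manual before relying on it.
#[cfg(any())] // never compiled; kept only as an example
fn has_ermsb_at_runtime() -> bool {
    // SAFETY: CPUID is available on every x86_64 processor.
    let leaf7 = unsafe { core::arch::x86_64::__cpuid_count(7, 0) };
    (leaf7.ebx >> 9) & 1 != 0
}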

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count:r}, %rcx",
        "rep movsq",
        "test {pre_byte_count:e}, {pre_byte_count:e}",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify flags, but we restore them afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
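    // Broadcast the fill byte into all eight bytes of a qword (e.g. 0xAB becomes
    // 0xABAB_ABAB_ABAB_ABAB) so that "rep stosq" can store eight copies at a time.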
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
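    // Build a chain of comparators: `c16` scans 16-byte chunks and, on the first
    // mismatching chunk (or for the leftover tail), hands the remaining work to
    // `c8`, which hands it to `c4`, then `c2`, and finally the byte-wise `c1`,
    // which computes the actual ordering.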
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}
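
// A minimal usage sketch (compiled out, illustration only): like C `memcmp`, the
// sign of the result reflects the first differing byte, and equal buffers compare
// as zero.
#[cfg(any())] // never compiled; kept only as an example
fn compare_bytes_example() {
    let a = *b"abcXdef";
    let b = *b"abcYdef";
    // 'X' (0x58) differs from 'Y' (0x59) at index 3, so the result is negative.
    assert!(unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), a.len()) } < 0);
    assert_eq!(unsafe { compare_bytes(a.as_ptr(), a.as_ptr(), a.len()) }, 0);
}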

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n-byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.
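//
// As a concrete illustration (assuming the usual 4096-byte x86_64 page size): a
// 16-byte aligned address satisfies `addr % 4096 <= 4080`, so all 16 bytes at
// `addr..addr + 16` lie within a single page, even when some of them are past
// the end of the string's allocation.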

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups, but these
    // operations aren't cheap by themselves. Thus, strings that may be short
    // are first handled in a simple scalar loop.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16-byte
    // boundary. The shaved-off bits are used to correct the first iteration.
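    //
    // For example, if `s` ends in 0b1011 (align = 11), the aligned load below also
    // covers the 11 bytes just before `s`; shifting the 16-bit comparison mask
    // right by 11 drops those bytes, so bit 0 of the mask corresponds to `*s` again.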

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    let x = {
        let r;
        asm!(
            "movdqa ({addr:r}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    n += 16 - align;
    s = s.add(1);

    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr:r}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}

// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until either a zero byte is discovered
    // or the pointer is aligned to an eight-byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero
    // byte is discovered.

    let mut s = s as *const u64;

    loop {
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
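        //
        // The idea: subtracting 0x01 from a zero byte borrows and turns it into
        // 0xFF, whose high bit survives the `& !cs`; for any non-zero byte, either
        // the subtraction leaves the high bit clear or `!cs` clears it. (Bytes
        // above a zero byte can also light up via borrow propagation, but by then
        // a zero byte is present anyway.) So the masked result is non-zero exactly
        // when `cs` contains a zero byte.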
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}
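
// A minimal usage sketch (compiled out, illustration only): like C `strlen`,
// `c_string_length` counts the bytes before the NUL terminator.
#[cfg(any())] // never compiled; kept only as an example
fn c_string_length_example() {
    let buf = *b"hello, world\0";
    let len = unsafe { c_string_length(buf.as_ptr() as *const core::ffi::c_char) };
    assert_eq!(len, 12);
}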

/// Determine optimal parameters for a `rep` instruction.
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count: usize = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count: usize = count >> 3;
    let byte_count: usize = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
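
// A minimal sketch (compiled out, illustration only) of how `rep_param` splits a
// request, using a hypothetical destination address whose low bits are 0b101 and
// `count == 20`: three head bytes reach an 8-byte boundary, and the remaining 17
// bytes become two qwords plus one trailing byte.
#[cfg(any())] // never compiled; kept only as an example
fn rep_param_example() {
    let dest = 0x1005 as *mut u8; // hypothetical address; never dereferenced
    assert_eq!(rep_param(dest, 20), (3, 2, 1));
}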