// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
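//
// With "ermsb" the whole region is handled by a single "rep movsb"/"rep stosb".
// Without it, the fallback paths below first copy/store single bytes until the
// destination is 8-byte aligned, then run the qword variant, and finish with a
// byte tail; see `rep_param` at the bottom of this file for the exact split.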

use core::arch::asm;
use core::intrinsics;
use core::mem;

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    // Copy single bytes until `dest` is 8-byte aligned.
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    // Copy the bulk of the data as qwords.
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    // Copy the remaining tail bytes.
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld
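    //
    // With DF=1 the string instructions walk downwards, so rsi/rdi start at
    // the last byte of each buffer and the three phases run in reverse order:
    // the byte tail first, then the qwords (the pointers are adjusted by 7 so
    // that movsq addresses whole qwords), and finally the unaligned prefix.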
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count}, %rcx",
        "rep movsq",
        "test {pre_byte_count:e}, {pre_byte_count:e}",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify flags, but we restore them afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
    // Broadcast `c` into all eight bytes of a qword for the `rep stosq` phase.
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    // Store single bytes until `dest` is 8-byte aligned.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    // Store the bulk of the data as qwords.
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    // Store the remaining tail bytes.
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}

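// Compare in the widest chunks first (16 bytes at a time), then, on the first
// mismatching chunk or for the remainder, narrow down through 8-, 4-, 2- and
// 1-byte comparisons until the differing byte pair is found and its ordering
// can be returned.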
#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n-byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.
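//
// For example, with 4 KiB pages a 16-byte load from a 16-byte-aligned address
// p touches bytes p..p+15; since 16 divides 4096, those bytes all lie in the
// same page, so the load cannot fault even if the string terminator appears
// before p+15.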

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups, but the
    // intrinsics aren't cheap by themselves. Thus, possibly short strings
    // are handled in simple loops.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16-byte
    // boundary. The shaved-off bits are used to correct the first iteration.

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    let x = {
        let r;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    n += 16 - align;
    s = s.add(1);

    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}

// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until either a zero byte is discovered or
    // the pointer is aligned to an eight-byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero byte is discovered.

    let mut s = s as *const u64;

    loop {
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
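        //
        // `cs - 0x01..01` borrows out of (and sets the top bit of) every byte
        // that was zero, while `!cs & 0x80..80` keeps only bytes whose top bit
        // was clear in `cs`; the AND of the two is therefore nonzero exactly
        // when `cs` contains at least one zero byte.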
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}

/// Determine optimal parameters for a `rep` instruction.
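///
/// Returns `(pre_byte_count, qword_count, byte_count)`. For example, with
/// `dest = 0x1003` and `count = 100` the split is `(5, 11, 7)`: 5 bytes bring
/// the destination up to 8-byte alignment, 11 qwords cover the next 88 bytes,
/// and 7 trailing bytes remain.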
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count: usize = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count: usize = count >> 3;
    let byte_count: usize = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
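
// Illustrative sanity check for `rep_param`: a sketch rather than part of the
// original module. It assumes this file can be compiled with a test harness,
// which a `no_std` build of the crate may not provide.
#[cfg(test)]
mod tests {
    use super::rep_param;

    #[test]
    fn rep_param_covers_every_byte_exactly_once() {
        for addr in 0usize..16 {
            for count in 0usize..64 {
                let (pre, qwords, tail) = rep_param(addr as *mut u8, count);
                // The three phases together must account for the whole region.
                assert_eq!(pre + qwords * 8 + tail, count);
                // Once anything is left after the prefix, the destination of
                // the qword phase is 8-byte aligned.
                if qwords + tail > 0 {
                    assert_eq!((addr + pre) % 8, 0);
                }
            }
        }
    }
}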