// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile time (e.g. when building with
// "-C target-feature=+ermsb"). We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::arch::asm;
use core::intrinsics;
use core::mem;

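// For reference only, not used by the implementation below: a minimal sketch
// of the "simple qword loop" that the enhanced "rep movsq" is measured against
// in the comment at the top of this file. It assumes a forward, non-overlapping
// copy; the helper name is illustrative and not part of the original module.
#[allow(dead_code)]
unsafe fn copy_forward_qword_loop(dest: *mut u8, src: *const u8, count: usize) {
    let qwords = count / 8;
    for i in 0..qwords {
        // One 8-byte move per iteration: this is the loop that the enhanced
        // "rep movsq" microcode outperforms on most modern CPUs.
        (dest as *mut u64)
            .add(i)
            .write_unaligned((src as *const u64).add(i).read_unaligned());
    }
    // Copy the sub-qword tail one byte at a time.
    for i in qwords * 8..count {
        *dest.add(i) = *src.add(i);
    }
}
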
#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld
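    // Going backwards, the pieces are copied in reverse order: first the
    // `byte_count` tail bytes, then the qwords, then the `pre_byte_count`
    // head bytes. The `sub $7`/`add $7` adjustments move rsi/rdi between the
    // highest byte not yet copied and the first byte of the highest remaining
    // qword, which is where a backwards (DF=1) "rep movsq" expects them to
    // point.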
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count}, %rcx",
        "rep movsq",
        "test {pre_byte_count:e}, {pre_byte_count:e}",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify flags, but we restore them afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}

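// Compare in progressively narrower chunks: scan 16 bytes at a time and, when
// a chunk differs (or for the sub-chunk tail), re-examine it in 8-, 4-, 2- and
// finally 1-byte steps until the first differing byte is found, returning the
// sign of its difference as memcmp does.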
#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n-byte read from an n-byte-aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that crosses a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read to be undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.
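//
// For example, with 4 KiB pages, a 16-byte load from a 16-byte-aligned address
// `page_base + 16 * k` (k in 0..256) ends at `page_base + 16 * k + 15`, which
// is at most `page_base + 4095` and therefore still inside the same page. An
// unaligned 16-byte load starting at, say, `page_base + 4090` would instead
// touch two pages and could fault if the second page is unmapped.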

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups,
    // but these instructions aren't cheap by themselves. Thus, possibly short
    // strings are handled in simple loops.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16-byte
    // boundary. The shaved-off bits are used to correct the first iteration.

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    let x = {
        let r;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    n += 16 - align;
    s = s.add(1);

    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}

// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until
    // either a zero byte is discovered or
    // the pointer is aligned to an eight-byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero
    // byte is discovered.

    let mut s = s as *const u64;

    loop {
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
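        // Per byte: `b - 0x01` sets the high bit only when `b` was zero or
        // greater than 0x80; masking with `!cs & 0x8080...` keeps only the
        // bytes whose own high bit was clear, so a non-zero result means at
        // least one byte of `cs` is zero. The low-to-high scan below checks
        // the lowest byte first, which is the lowest address because x86_64
        // is little-endian.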
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}

/// Determine optimal parameters for a `rep` instruction.
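///
/// Worked example (illustrative numbers, not from the original source): with
/// `dest % 8 == 5` and `count == 100`, this returns `(3, 12, 1)`: 3 bytes to
/// reach 8-byte alignment, then 12 qwords (96 bytes), then 1 trailing byte,
/// and 3 + 12 * 8 + 1 == 100.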
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count: usize = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count: usize = count >> 3;
    let byte_count: usize = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
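
// A minimal usage sketch, not part of the original module: it assumes this
// file is compiled in a crate where a test harness is available (the routines
// above are normally used in a no_std setting) and simply exercises the
// helpers on small stack buffers.
#[cfg(test)]
mod usage_sketch {
    #[test]
    fn copy_set_compare() {
        let src = *b"0123456789abcdef";
        let mut dst = [0u8; 16];
        unsafe {
            // Forward copy, then verify the bytes arrived intact.
            super::copy_forward(dst.as_mut_ptr(), src.as_ptr(), src.len());
            assert_eq!(dst, src);

            // Fill the buffer with a single byte value.
            super::set_bytes(dst.as_mut_ptr(), 0xAA, dst.len());
            assert!(dst.iter().all(|&b| b == 0xAA));

            // memcmp-style comparison: equal buffers, then a differing one.
            assert_eq!(super::compare_bytes(src.as_ptr(), src.as_ptr(), src.len()), 0);
            assert!(super::compare_bytes(src.as_ptr(), dst.as_ptr(), src.len()) < 0);
        }
    }

    #[test]
    fn c_string_length_counts_up_to_nul() {
        let s = b"abcdefgh\0rest";
        unsafe { assert_eq!(super::c_string_length(s.as_ptr().cast()), 8) };
    }
}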