// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
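//
// For example (illustrative note; the feature name matches the cfg checks
// below), building with `-C target-feature=+ermsb`, or with a `target-cpu`
// that implies it, selects the byte-based "rep movsb"/"rep stosb" paths.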

use core::arch::asm;
use core::intrinsics;
use core::mem;

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
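    // Copy `pre_byte_count` bytes to align the destination, then whole qwords,
    // then the remaining tail bytes (see `rep_param` below).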
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld
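    // With DF set, the string instructions walk downwards through memory:
    // first the trailing `byte_count` bytes, then the 8-byte-aligned qwords,
    // then the leading `pre_byte_count` bytes. The `sub`/`add $7` pairs move
    // RSI/RDI between byte-granular and qword-granular addressing.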
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count}, %rcx",
        "rep movsq",
        "test {pre_byte_count:e}, {pre_byte_count:e}",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify flags, but we restore them afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
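    // Broadcast the byte into all eight lanes of RAX so that "rep stosq" can
    // store it a qword at a time.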
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
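    // Compare in 16-byte chunks; on a mismatching chunk (or for the tail),
    // recurse into progressively smaller 8-, 4-, 2- and finally 1-byte
    // comparisons to locate the differing byte.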
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.
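//
// For example, a 16-byte `movdqa` load from a 16-byte-aligned address always
// stays within a single (4 KiB or larger) page, even if only a few of those
// bytes belong to the string.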

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups,
    // but they aren't cheap by themselves. Thus, possibly small strings
    // are handled in simple loops.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16-byte
    // boundary. The shaved-off bits are used to correct the first iteration.

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

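    // The aligned load below may read up to `align` bytes that precede the
    // original `s`; the matching mask bits are discarded by the `>> align`
    // shift, so only bytes belonging to the string are considered.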
    let x = {
        let r;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    n += 16 - align;
    s = s.add(1);

    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}

// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until either a zero byte is discovered
    // or the pointer is aligned to an eight-byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero
    // byte is discovered.

    let mut s = s as *const u64;

    loop {
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
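        // Roughly: a zero byte wraps to 0xff after the subtraction, setting
        // its 0x80 bit, and `& !cs` filters out bytes whose high bit was
        // already set in the original word.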
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}

/// Determine optimal parameters for a `rep` instruction.
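///
/// Returns `(pre_byte_count, qword_count, byte_count)`. As a worked example
/// (illustrative, not from the original source): with `dest % 8 == 5` and
/// `count == 100`, the split is `(3, 12, 1)`, i.e. 3 + 8 * 12 + 1 == 100 bytes.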
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count: usize = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count: usize = count >> 3;
    let byte_count: usize = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
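
// A minimal sanity check of the parameter split. Illustrative sketch only, not
// part of the original file; it assumes this module is compiled with a test
// harness available (e.g. `cargo test` on a host target with `std`).
#[cfg(test)]
mod rep_param_tests {
    use super::rep_param;

    #[test]
    fn split_covers_count() {
        // Deliberately misaligned destination address; `rep_param` only
        // inspects the address value and never dereferences it.
        let dest = 5usize as *mut u8;
        let (pre, qwords, bytes) = rep_param(dest, 100);
        assert_eq!((pre, qwords, bytes), (3, 12, 1));
        assert_eq!(pre + 8 * qwords + bytes, 100);
    }
}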