// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".

use core::arch::asm;
use core::{intrinsics, mem};
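
// The "ermsb" check above is purely compile-time. For illustration, the sketch
// below (compiled out, not used by this module) shows how ERMSB could instead be
// detected at runtime via CPUID leaf 7, sub-leaf 0. The bit position used here
// (EBX bit 9, "ERMS") is taken from the Intel SDM's CPUID documentation; treat it
// as an assumption to verify against the manual before relying on it.
#[cfg(any())] // never compiled; kept only as an example
fn has_ermsb_at_runtime() -> bool {
    // SAFETY: CPUID is available on every x86_64 processor.
    let leaf7 = unsafe { core::arch::x86_64::__cpuid_count(7, 0) };
    (leaf7.ebx >> 9) & 1 != 0
}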

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count:r}, %rcx",
        "rep movsq",
        "test {pre_byte_count:e}, {pre_byte_count:e}",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify flags, but we restore them afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
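    // Broadcast the fill byte into all eight bytes of a qword (e.g. 0xAB becomes
    // 0xABAB_ABAB_ABAB_ABAB) so that "rep stosq" can store eight copies at a time.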
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
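    // Build a chain of comparators: `c16` scans 16-byte chunks and, on the first
    // mismatching chunk (or for the leftover tail), hands the remaining work to
    // `c8`, which hands it to `c4`, then `c2`, and finally the byte-wise `c1`,
    // which computes the actual ordering.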
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}
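
// A minimal usage sketch (compiled out, illustration only): like C `memcmp`, the
// sign of the result reflects the first differing byte, and equal buffers compare
// as zero.
#[cfg(any())] // never compiled; kept only as an example
fn compare_bytes_example() {
    let a = *b"abcXdef";
    let b = *b"abcYdef";
    // 'X' (0x58) differs from 'Y' (0x59) at index 3, so the result is negative.
    assert!(unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), a.len()) } < 0);
    assert_eq!(unsafe { compare_bytes(a.as_ptr(), a.as_ptr(), a.len()) }, 0);
}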

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n-byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.
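//
// As a concrete illustration (assuming the usual 4096-byte x86_64 page size): a
// 16-byte aligned address satisfies `addr % 4096 <= 4080`, so all 16 bytes at
// `addr..addr + 16` lie within a single page, even when some of them are past
// the end of the string's allocation.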

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups, but these
    // operations aren't cheap by themselves. Thus, strings that may be short
    // are first handled in a simple scalar loop.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16-byte
    // boundary. The shaved-off bits are used to correct the first iteration.
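    //
    // For example, if `s` ends in 0b1011 (align = 11), the aligned load below also
    // covers the 11 bytes just before `s`; shifting the 16-bit comparison mask
    // right by 11 drops those bytes, so bit 0 of the mask corresponds to `*s` again.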

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    let x = {
        let r;
        asm!(
            "movdqa ({addr:r}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    n += 16 - align;
    s = s.add(1);

    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr:r}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}

// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until either a zero byte is discovered
    // or the pointer is aligned to an eight-byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero
    // byte is discovered.

    let mut s = s as *const u64;

    loop {
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
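        //
        // The idea: subtracting 0x01 from a zero byte borrows and turns it into
        // 0xFF, whose high bit survives the `& !cs`; for any non-zero byte, either
        // the subtraction leaves the high bit clear or `!cs` clears it. (Bytes
        // above a zero byte can also light up via borrow propagation, but by then
        // a zero byte is present anyway.) So the masked result is non-zero exactly
        // when `cs` contains a zero byte.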
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}
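
// A minimal usage sketch (compiled out, illustration only): like C `strlen`,
// `c_string_length` counts the bytes before the NUL terminator.
#[cfg(any())] // never compiled; kept only as an example
fn c_string_length_example() {
    let buf = *b"hello, world\0";
    let len = unsafe { c_string_length(buf.as_ptr() as *const core::ffi::c_char) };
    assert_eq!(len, 12);
}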

/// Determine optimal parameters for a `rep` instruction.
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count: usize = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count: usize = count >> 3;
    let byte_count: usize = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
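
// A minimal sketch (compiled out, illustration only) of how `rep_param` splits a
// request, using a hypothetical destination address whose low bits are 0b101 and
// `count == 20`: three head bytes reach an 8-byte boundary, and the remaining 17
// bytes become two qwords plus one trailing byte.
#[cfg(any())] // never compiled; kept only as an example
fn rep_param_example() {
    let dest = 0x1005 as *mut u8; // hypothetical address; never dereferenced
    assert_eq!(rep_param(dest, 20), (3, 2, 1));
}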