// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
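//
// With "ermsb" the whole region is handled by a single "rep movsb"/"rep stosb".
// Without it, the fallback paths below first copy/store single bytes until the
// destination is 8-byte aligned, then run the qword variant, and finish with a
// byte tail; see `rep_param` at the bottom of this file for the exact split.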

use core::arch::asm;
use core::intrinsics;
use core::mem;

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    // Copy single bytes until `dest` is 8-byte aligned.
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    // Copy the bulk of the data as qwords.
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    // Copy the remaining tail bytes.
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld
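    //
    // With DF=1 the string instructions walk downwards, so rsi/rdi start at
    // the last byte of each buffer and the three phases run in reverse order:
    // the byte tail first, then the qwords (the pointers are adjusted by 7 so
    // that movsq addresses whole qwords), and finally the unaligned prefix.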
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count}, %rcx",
        "rep movsq",
        "test {pre_byte_count:e}, {pre_byte_count:e}",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify flags, but we restore them afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
    // Broadcast `c` into all eight bytes of a qword for the `rep stosq` phase.
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    // Store single bytes until `dest` is 8-byte aligned.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    // Store the bulk of the data as qwords.
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    // Store the remaining tail bytes.
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}

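// Compare in the widest chunks first (16 bytes at a time), then, on the first
// mismatching chunk or for the remainder, narrow down through 8-, 4-, 2- and
// 1-byte comparisons until the differing byte pair is found and its ordering
// can be returned.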
#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n-byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.
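//
// For example, with 4 KiB pages a 16-byte load from a 16-byte-aligned address
// p touches bytes p..p+15; since 16 divides 4096, those bytes all lie in the
// same page, so the load cannot fault even if the string terminator appears
// before p+15.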

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups, but the
    // intrinsics aren't cheap by themselves. Thus, possibly short strings
    // are handled in simple loops.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16-byte
    // boundary. The shaved-off bits are used to correct the first iteration.

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    let x = {
        let r;
        asm!(
            "movdqa ({addr}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    n += 16 - align;
    s = s.add(1);

    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}

// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until either a zero byte is discovered or
    // the pointer is aligned to an eight-byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero byte is discovered.

    let mut s = s as *const u64;

    loop {
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
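        //
        // `cs - 0x01..01` borrows out of (and sets the top bit of) every byte
        // that was zero, while `!cs & 0x80..80` keeps only bytes whose top bit
        // was clear in `cs`; the AND of the two is therefore nonzero exactly
        // when `cs` contains at least one zero byte.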
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}

/// Determine optimal parameters for a `rep` instruction.
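///
/// Returns `(pre_byte_count, qword_count, byte_count)`. For example, with
/// `dest = 0x1003` and `count = 100` the split is `(5, 11, 7)`: 5 bytes bring
/// the destination up to 8-byte alignment, 11 qwords cover the next 88 bytes,
/// and 7 trailing bytes remain.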
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count: usize = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count: usize = count >> 3;
    let byte_count: usize = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
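
// Illustrative sanity check for `rep_param`: a sketch rather than part of the
// original module. It assumes this file can be compiled with a test harness,
// which a `no_std` build of the crate may not provide.
#[cfg(test)]
mod tests {
    use super::rep_param;

    #[test]
    fn rep_param_covers_every_byte_exactly_once() {
        for addr in 0usize..16 {
            for count in 0usize..64 {
                let (pre, qwords, tail) = rep_param(addr as *mut u8, count);
                // The three phases together must account for the whole region.
                assert_eq!(pre + qwords * 8 + tail, count);
                // Once anything is left after the prefix, the destination of
                // the qword phase is 8-byte aligned.
                if qwords + tail > 0 {
                    assert_eq!((addr + pre) % 8, 0);
                }
            }
        }
    }
}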