// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
//  - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
//  - FSRM - Fast Short REP MOV (Ice Lake and later)
//  - Fast Zero-Length MOVSB (on no current hardware)
//  - Fast Short STOSB (on no current hardware)
//
// To simplify things, we switch to using the byte-based variants if the "ermsb"
// feature is present at compile-time. We don't bother detecting other features.
// Note that ERMSB does not enhance the backwards (DF=1) "rep movsb".
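//
// The switch happens at compile time; for example, building with
// `RUSTFLAGS="-C target-feature=+ermsb"` (an illustrative invocation; any
// mechanism that enables the `ermsb` target feature works) selects the
// byte-based variants below.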

use core::arch::asm;
use core::{intrinsics, mem};

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn copy_forward(dest: *mut u8, src: *const u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe movsb (%rsi), (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn copy_forward(mut dest: *mut u8, mut src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep movsb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        inout("rsi") src => src,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep movsb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        inout("rsi") src => _,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn copy_backward(dest: *mut u8, src: *const u8, count: usize) {
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // We can't separate this block due to std/cld
    asm!(
        "std",
        "rep movsb",
        "sub $7, %rsi",
        "sub $7, %rdi",
        "mov {qword_count:r}, %rcx",
        "rep movsq",
        "test {pre_byte_count:e}, {pre_byte_count:e}",
        "add $7, %rsi",
        "add $7, %rdi",
        "mov {pre_byte_count:e}, %ecx",
        "rep movsb",
        "cld",
        pre_byte_count = in(reg) pre_byte_count,
        qword_count = in(reg) qword_count,
        inout("ecx") byte_count => _,
        inout("rdi") dest.add(count - 1) => _,
        inout("rsi") src.add(count - 1) => _,
        // We modify flags, but we restore them afterwards
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
#[cfg(target_feature = "ermsb")]
pub unsafe fn set_bytes(dest: *mut u8, c: u8, count: usize) {
    // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
    core::arch::asm!(
        "repe stosb %al, (%rdi)",
        inout("rcx") count => _,
        inout("rdi") dest => _,
        inout("al") c => _,
        options(att_syntax, nostack, preserves_flags)
    )
}

#[inline(always)]
#[cfg(not(target_feature = "ermsb"))]
pub unsafe fn set_bytes(mut dest: *mut u8, c: u8, count: usize) {
    let c = c as u64 * 0x0101_0101_0101_0101;
    let (pre_byte_count, qword_count, byte_count) = rep_param(dest, count);
    // Separating the blocks gives the compiler more freedom to reorder instructions.
    asm!(
        "rep stosb",
        inout("ecx") pre_byte_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosq",
        inout("rcx") qword_count => _,
        inout("rdi") dest => dest,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
    asm!(
        "rep stosb",
        inout("ecx") byte_count => _,
        inout("rdi") dest => _,
        in("rax") c,
        options(att_syntax, nostack, preserves_flags)
    );
}

#[inline(always)]
pub unsafe fn compare_bytes(a: *const u8, b: *const u8, n: usize) -> i32 {
    #[inline(always)]
    unsafe fn cmp<T, U, F>(mut a: *const T, mut b: *const T, n: usize, f: F) -> i32
    where
        T: Clone + Copy + Eq,
        U: Clone + Copy + Eq,
        F: FnOnce(*const U, *const U, usize) -> i32,
    {
        // Ensure T is not a ZST.
        const { assert!(mem::size_of::<T>() != 0) };

        let end = a.add(intrinsics::unchecked_div(n, mem::size_of::<T>()));
        while a != end {
            if a.read_unaligned() != b.read_unaligned() {
                return f(a.cast(), b.cast(), mem::size_of::<T>());
            }
            a = a.add(1);
            b = b.add(1);
        }
        f(
            a.cast(),
            b.cast(),
            intrinsics::unchecked_rem(n, mem::size_of::<T>()),
        )
    }
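    // Chain the granularities: start with 16-byte chunks; on a mismatch (or
    // for the remainder) fall back to 8-, 4-, 2-, and finally 1-byte
    // comparisons to pin down the first differing byte.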
    let c1 = |mut a: *const u8, mut b: *const u8, n| {
        for _ in 0..n {
            if a.read() != b.read() {
                return i32::from(a.read()) - i32::from(b.read());
            }
            a = a.add(1);
            b = b.add(1);
        }
        0
    };
    let c2 = |a: *const u16, b, n| cmp(a, b, n, c1);
    let c4 = |a: *const u32, b, n| cmp(a, b, n, c2);
    let c8 = |a: *const u64, b, n| cmp(a, b, n, c4);
    let c16 = |a: *const u128, b, n| cmp(a, b, n, c8);
    c16(a.cast(), b.cast(), n)
}
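
// A minimal sanity sketch (not part of the original file): `compare_bytes`
// follows the C `memcmp` contract, returning the sign of the first byte
// difference.
#[cfg(test)]
#[test]
fn compare_bytes_matches_memcmp_contract() {
    let a = *b"abcdefgh_abcdefgh";
    let mut b = a;
    assert_eq!(unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), a.len()) }, 0);
    b[12] = b'z'; // 'd' < 'z', so `a` compares less than `b`
    assert!(unsafe { compare_bytes(a.as_ptr(), b.as_ptr(), a.len()) } < 0);
}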

// In order to process more than one byte simultaneously when executing strlen,
// two things must be considered:
// * An n-byte read with an n-byte aligned address will never cross
//   a page boundary and will always succeed. Any smaller alignment
//   may result in a read that will cross a page boundary, which may
//   trigger an access violation.
// * Surface Rust considers any kind of out-of-bounds read as undefined
//   behaviour. To dodge this, memory access operations are written
//   using inline assembly.
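//
// For example, a 16-byte read at a 16-byte aligned address lies entirely
// within a single 4 KiB page (page sizes are multiples of 16), so it cannot
// fault even when it extends past the NUL terminator.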

#[cfg(target_feature = "sse2")]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    use core::arch::x86_64::{__m128i, _mm_cmpeq_epi8, _mm_movemask_epi8, _mm_set1_epi8};

    let mut n = 0;

    // The use of _mm_movemask_epi8 and company allows for speedups,
    // but they aren't cheap by themselves. Thus, possibly small strings
    // are handled in simple loops.

    for _ in 0..4 {
        if *s == 0 {
            return n;
        }

        n += 1;
        s = s.add(1);
    }

    // Shave off the least significant bits to align the address to a 16-byte
    // boundary. The shaved-off bits are used to correct the first iteration.

    let align = s as usize & 15;
    let mut s = ((s as usize) - align) as *const __m128i;
    let zero = _mm_set1_epi8(0);

    let x = {
        let r;
        asm!(
            "movdqa ({addr:r}), {dest}",
            addr = in(reg) s,
            dest = out(xmm_reg) r,
            options(att_syntax, nostack),
        );
        r
    };
    let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) >> align;

    if cmp != 0 {
        return n + cmp.trailing_zeros() as usize;
    }

    n += 16 - align;
    s = s.add(1);

    loop {
        let x = {
            let r;
            asm!(
                "movdqa ({addr:r}), {dest}",
                addr = in(reg) s,
                dest = out(xmm_reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        let cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(x, zero)) as u32;
        if cmp == 0 {
            n += 16;
            s = s.add(1);
        } else {
            return n + cmp.trailing_zeros() as usize;
        }
    }
}

// Provided for scenarios like kernel development, where SSE might not
// be available.
#[cfg(not(target_feature = "sse2"))]
#[inline(always)]
pub unsafe fn c_string_length(mut s: *const core::ffi::c_char) -> usize {
    let mut n = 0;

    // Check bytes in steps of one until either a zero byte is discovered
    // or the pointer is aligned to an eight-byte boundary.

    while s as usize & 7 != 0 {
        if *s == 0 {
            return n;
        }
        n += 1;
        s = s.add(1);
    }

    // Check bytes in steps of eight until a zero byte is discovered.

    let mut s = s as *const u64;

    loop {
        let mut cs = {
            let r: u64;
            asm!(
                "mov ({addr}), {dest}",
                addr = in(reg) s,
                dest = out(reg) r,
                options(att_syntax, nostack),
            );
            r
        };
        // Detect if a word has a zero byte, taken from
        // https://graphics.stanford.edu/~seander/bithacks.html
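        // Loosely: subtracting 1 from a zero byte borrows into its high bit,
        // and `& !cs` rules out bytes whose high bit was already set, so the
        // expression is nonzero iff some byte of `cs` is zero.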
        if (cs.wrapping_sub(0x0101010101010101) & !cs & 0x8080808080808080) != 0 {
            loop {
                if cs & 255 == 0 {
                    return n;
                } else {
                    cs >>= 8;
                    n += 1;
                }
            }
        } else {
            n += 8;
            s = s.add(1);
        }
    }
}
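
// Illustrative check (not part of the original file): `c_string_length`
// agrees with a naive scan. Note the SSE2 variant deliberately over-reads
// within an aligned 16-byte chunk, as justified in the comment above.
#[cfg(test)]
#[test]
fn c_string_length_finds_nul() {
    let buf = *b"hello, rep movsb\0";
    assert_eq!(unsafe { c_string_length(buf.as_ptr().cast()) }, 16);
}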

/// Determine optimal parameters for a `rep` instruction.
fn rep_param(dest: *mut u8, mut count: usize) -> (usize, usize, usize) {
    // Unaligned writes are still slow on modern processors, so align the destination address.
    let pre_byte_count: usize = ((8 - (dest as usize & 0b111)) & 0b111).min(count);
    count -= pre_byte_count;
    let qword_count: usize = count >> 3;
    let byte_count: usize = count & 0b111;
    (pre_byte_count, qword_count, byte_count)
}
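
// Worked example (illustrative, not part of the original file): with
// `dest % 8 == 5` and `count == 22`, `rep_param` returns `(3, 2, 3)`:
// 3 bytes reach the next 8-byte boundary, 2 qwords cover 16 bytes, and
// 3 tail bytes remain, so 3 + 8 * 2 + 3 == 22 == count.
#[cfg(test)]
#[test]
fn rep_param_splits_count() {
    let dest = 0x1005 as *mut u8; // only the address is inspected, never dereferenced
    let (pre, qwords, bytes) = rep_param(dest, 22);
    assert_eq!((pre, qwords, bytes), (3, 2, 3));
    assert_eq!(pre + 8 * qwords + bytes, 22);
}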
|---|
| 314 |  | 
|---|