1 | //! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`) |
2 | |
3 | use crate::core_arch::{simd::*, x86::*}; |
4 | |
5 | #[cfg (test)] |
6 | use stdarch_test::assert_instr; |
7 | |
8 | #[allow (improper_ctypes)] |
9 | extern "C" { |
10 | #[link_name = "llvm.x86.sse4a.extrq" ] |
11 | fn extrq(x: i64x2, y: i8x16) -> i64x2; |
12 | #[link_name = "llvm.x86.sse4a.insertq" ] |
13 | fn insertq(x: i64x2, y: i64x2) -> i64x2; |
14 | #[link_name = "llvm.x86.sse4a.movnt.sd" ] |
15 | fn movntsd(x: *mut f64, y: __m128d); |
16 | #[link_name = "llvm.x86.sse4a.movnt.ss" ] |
17 | fn movntss(x: *mut f32, y: __m128); |
18 | } |
19 | |
20 | // FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ |
21 | // FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ |
22 | |
23 | /// Extracts the bit range specified by `y` from the lower 64 bits of `x`. |
24 | /// |
25 | /// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The |
26 | /// `[5:0]` bits of `y` specify the length of the bit-range to extract. All |
27 | /// other bits are ignored. |
28 | /// |
29 | /// If the length is zero, it is interpreted as `64`. If the length and index |
30 | /// are zero, the lower 64 bits of `x` are extracted. |
31 | /// |
32 | /// If `length == 0 && index > 0` or `length + index > 64` the result is |
33 | /// undefined. |
34 | #[inline ] |
35 | #[target_feature (enable = "sse4a" )] |
36 | #[cfg_attr (test, assert_instr(extrq))] |
37 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
38 | pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { |
39 | transmute(src:extrq(x:x.as_i64x2(), y:y.as_i8x16())) |
40 | } |
41 | |
42 | /// Inserts the `[length:0]` bits of `y` into `x` at `index`. |
43 | /// |
44 | /// The bits of `y`: |
45 | /// |
46 | /// - `[69:64]` specify the `length`, |
47 | /// - `[77:72]` specify the index. |
48 | /// |
49 | /// If the `length` is zero it is interpreted as `64`. If `index + length > 64` |
50 | /// or `index > 0 && length == 0` the result is undefined. |
51 | #[inline ] |
52 | #[target_feature (enable = "sse4a" )] |
53 | #[cfg_attr (test, assert_instr(insertq))] |
54 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
55 | pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { |
56 | transmute(src:insertq(x:x.as_i64x2(), y:y.as_i64x2())) |
57 | } |
58 | |
59 | /// Non-temporal store of `a.0` into `p`. |
60 | /// |
61 | /// Writes 64-bit data to a memory location without polluting the caches. |
62 | /// |
63 | /// # Safety of non-temporal stores |
64 | /// |
65 | /// After using this intrinsic, but before any other access to the memory that this intrinsic |
66 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In |
67 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they |
68 | /// return. |
69 | /// |
70 | /// See [`_mm_sfence`] for details. |
71 | #[inline ] |
72 | #[target_feature (enable = "sse4a" )] |
73 | #[cfg_attr (test, assert_instr(movntsd))] |
74 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
75 | pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { |
76 | movntsd(x:p, y:a); |
77 | } |
78 | |
79 | /// Non-temporal store of `a.0` into `p`. |
80 | /// |
81 | /// Writes 32-bit data to a memory location without polluting the caches. |
82 | /// |
83 | /// # Safety of non-temporal stores |
84 | /// |
85 | /// After using this intrinsic, but before any other access to the memory that this intrinsic |
86 | /// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In |
87 | /// particular, functions that call this intrinsic should generally call `_mm_sfence` before they |
88 | /// return. |
89 | /// |
90 | /// See [`_mm_sfence`] for details. |
91 | #[inline ] |
92 | #[target_feature (enable = "sse4a" )] |
93 | #[cfg_attr (test, assert_instr(movntss))] |
94 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
95 | pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { |
96 | movntss(x:p, y:a); |
97 | } |
98 | |
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    // Extracts a 4-bit range starting at bit 8 and checks that it lands in the
    // low bits of the result.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extract_si64() {
        let b = 0b0110_0000_0000_i64;
        //        ^^^^ bit range extracted
        let x = _mm_setr_epi64x(b, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(v, 0);
        let e = _mm_setr_epi64x(0b0110_i64, 0);
        let r = _mm_extract_si64(x, y);
        assert_eq_m128i(r, e);
    }

    // Replaces the 4-bit range starting at bit 8 of `z` with the low 4 bits of
    // `i`; the index/length control word travels in the upper half of `y`.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_insert_si64() {
        let i = 0b0110_i64;
        //        ^^^^ bit range inserted
        let z = 0b1010_1010_1010i64;
        //        ^^^^ bit range replaced
        let e = 0b0110_1010_1010i64;
        //        ^^^^ replaced 1010 with 0110
        let x = _mm_setr_epi64x(z, 0);
        let expected = _mm_setr_epi64x(e, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(i, v);
        let r = _mm_insert_si64(x, y);
        assert_eq_m128i(r, expected);
    }

    // 16-byte-aligned scratch buffer used as the target of `_mm_stream_sd`.
    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_pd(3.0, 4.0);

            _mm_stream_sd(d, x);
            // The documented contract of `_mm_stream_sd` requires an
            // `_mm_sfence` before any other access to the written memory.
            _mm_sfence();
        }
        // Only the first element is overwritten; the second stays untouched.
        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    // 16-byte-aligned scratch buffer used as the target of `_mm_stream_ss`.
    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    #[simd_test(enable = "sse4a")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    // (non-temporal store)
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        {
            let vals = &mut mem.data;
            let d = vals.as_mut_ptr();

            let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

            _mm_stream_ss(d, x);
            // The documented contract of `_mm_stream_ss` requires an
            // `_mm_sfence` before any other access to the written memory.
            _mm_sfence();
        }
        // Only the first element is overwritten; the rest stay untouched.
        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
186 | |