//! `i686`'s Streaming SIMD Extensions 4a (`SSE4a`)
2 | |
3 | use crate::{ |
4 | core_arch::{simd::*, x86::*}, |
5 | mem::transmute, |
6 | }; |
7 | |
8 | #[cfg (test)] |
9 | use stdarch_test::assert_instr; |
10 | |
11 | #[allow (improper_ctypes)] |
12 | extern "C" { |
13 | #[link_name = "llvm.x86.sse4a.extrq" ] |
14 | fn extrq(x: i64x2, y: i8x16) -> i64x2; |
15 | #[link_name = "llvm.x86.sse4a.insertq" ] |
16 | fn insertq(x: i64x2, y: i64x2) -> i64x2; |
17 | #[link_name = "llvm.x86.sse4a.movnt.sd" ] |
18 | fn movntsd(x: *mut f64, y: __m128d); |
19 | #[link_name = "llvm.x86.sse4a.movnt.ss" ] |
20 | fn movntss(x: *mut f32, y: __m128); |
21 | } |
22 | |
// FIXME(blocked on #248): _mm_extracti_si64(x, len, idx) // EXTRQ
// FIXME(blocked on #248): _mm_inserti_si64(x, y, len, idx) // INSERTQ
25 | |
26 | /// Extracts the bit range specified by `y` from the lower 64 bits of `x`. |
27 | /// |
28 | /// The `[13:8]` bits of `y` specify the index of the bit-range to extract. The |
29 | /// `[5:0]` bits of `y` specify the length of the bit-range to extract. All |
30 | /// other bits are ignored. |
31 | /// |
32 | /// If the length is zero, it is interpreted as `64`. If the length and index |
33 | /// are zero, the lower 64 bits of `x` are extracted. |
34 | /// |
35 | /// If `length == 0 && index > 0` or `length + index > 64` the result is |
36 | /// undefined. |
37 | #[inline ] |
38 | #[target_feature (enable = "sse4a" )] |
39 | #[cfg_attr (test, assert_instr(extrq))] |
40 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
41 | pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { |
42 | transmute(src:extrq(x:x.as_i64x2(), y:y.as_i8x16())) |
43 | } |
44 | |
45 | /// Inserts the `[length:0]` bits of `y` into `x` at `index`. |
46 | /// |
47 | /// The bits of `y`: |
48 | /// |
49 | /// - `[69:64]` specify the `length`, |
50 | /// - `[77:72]` specify the index. |
51 | /// |
52 | /// If the `length` is zero it is interpreted as `64`. If `index + length > 64` |
53 | /// or `index > 0 && length == 0` the result is undefined. |
54 | #[inline ] |
55 | #[target_feature (enable = "sse4a" )] |
56 | #[cfg_attr (test, assert_instr(insertq))] |
57 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
58 | pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { |
59 | transmute(src:insertq(x:x.as_i64x2(), y:y.as_i64x2())) |
60 | } |
61 | |
62 | /// Non-temporal store of `a.0` into `p`. |
63 | /// |
64 | /// Writes 64-bit data to a memory location without polluting the caches. |
65 | #[inline ] |
66 | #[target_feature (enable = "sse4a" )] |
67 | #[cfg_attr (test, assert_instr(movntsd))] |
68 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
69 | pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { |
70 | movntsd(x:p, y:a); |
71 | } |
72 | |
73 | /// Non-temporal store of `a.0` into `p`. |
74 | /// |
75 | /// Writes 32-bit data to a memory location without polluting the caches. |
76 | #[inline ] |
77 | #[target_feature (enable = "sse4a" )] |
78 | #[cfg_attr (test, assert_instr(movntss))] |
79 | #[stable (feature = "simd_x86" , since = "1.27.0" )] |
80 | pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { |
81 | movntss(x:p, y:a); |
82 | } |
83 | |
#[cfg(test)]
mod tests {
    use crate::core_arch::x86::*;
    use stdarch_test::simd_test;

    // Extracts a 4-bit field starting at bit 8 and expects it in the low bits
    // of the result.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_extract_si64() {
        let b = 0b0110_0000_0000_i64;
        //        ^^^^ bit range extracted
        let x = _mm_setr_epi64x(b, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        let y = _mm_setr_epi64x(v, 0);
        let e = _mm_setr_epi64x(0b0110_i64, 0);
        let r = _mm_extract_si64(x, y);
        assert_eq_m128i(r, e);
    }

    // Overwrites the 4-bit field starting at bit 8 of `z` with the low four
    // bits of `i`.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_insert_si64() {
        let i = 0b0110_i64;
        //        ^^^^ bit range inserted
        let z = 0b1010_1010_1010i64;
        //        ^^^^ bit range replaced
        let e = 0b0110_1010_1010i64;
        //        ^^^^ replaced 1010 with 0110
        let x = _mm_setr_epi64x(z, 0);
        let expected = _mm_setr_epi64x(e, 0);
        let v = 0b001000___00___000100_i64;
        //        ^idx: 2^3 = 8 ^length = 2^2 = 4
        // The control word goes in the upper 64 bits of `y`, the data to
        // insert in the lower 64 bits.
        let y = _mm_setr_epi64x(i, v);
        let r = _mm_insert_si64(x, y);
        assert_eq_m128i(r, expected);
    }

    // 16-byte aligned backing store for the `f64` stream test.
    #[repr(align(16))]
    struct MemoryF64 {
        data: [f64; 2],
    }

    // Only the first element may change: the store writes a single `f64`.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_stream_sd() {
        let mut mem = MemoryF64 {
            data: [1.0_f64, 2.0],
        };
        let x = _mm_setr_pd(3.0, 4.0);

        _mm_stream_sd(mem.data.as_mut_ptr(), x);

        assert_eq!(mem.data[0], 3.0);
        assert_eq!(mem.data[1], 2.0);
    }

    // 16-byte aligned backing store for the `f32` stream test.
    #[repr(align(16))]
    struct MemoryF32 {
        data: [f32; 4],
    }

    // Only the first element may change: the store writes a single `f32`.
    #[simd_test(enable = "sse4a")]
    unsafe fn test_mm_stream_ss() {
        let mut mem = MemoryF32 {
            data: [1.0_f32, 2.0, 3.0, 4.0],
        };
        let x = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);

        _mm_stream_ss(mem.data.as_mut_ptr(), x);

        assert_eq!(mem.data[0], 5.0);
        assert_eq!(mem.data[1], 2.0);
        assert_eq!(mem.data[2], 3.0);
        assert_eq!(mem.data[3], 4.0);
    }
}
165 | |