// pathfinder/simd/src/x86.rs
//
// Copyright © 2019 The Pathfinder Project Developers.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use std::cmp::PartialEq;
use std::fmt::{self, Debug, Formatter};
use std::mem;
use std::ops::{Add, BitAnd, BitOr, BitXor, Div, Index, IndexMut, Mul, Not, Shr, Sub};

#[cfg(target_pointer_width = "32")]
use std::arch::x86::{__m128, __m128i};
#[cfg(target_pointer_width = "32")]
use std::arch::x86;
#[cfg(target_pointer_width = "64")]
use std::arch::x86_64::{__m128, __m128i};
#[cfg(target_pointer_width = "64")]
use std::arch::x86_64 as x86;

mod swizzle_f32x4;
mod swizzle_i32x4;

// Two 32-bit floats
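// The x lane lives in the low 32 bits of the `u64` and the y lane in the
// high 32 bits (see `F32x2::new` below).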

#[derive(Clone, Copy)]
pub struct F32x2(pub u64);

impl F32x2 {
    // Constructors

    #[inline]
    pub fn new(a: f32, b: f32) -> F32x2 {
        unsafe {
            let a = mem::transmute::<*const f32, *const u32>(&a);
            let b = mem::transmute::<*const f32, *const u32>(&b);
            F32x2((*a as u64) | ((*b as u64) << 32))
        }
    }

    #[inline]
    pub fn splat(x: f32) -> F32x2 {
        F32x2::new(x, x)
    }

    // Basic operations

    #[inline]
    pub fn approx_recip(self) -> F32x2 {
        self.to_f32x4().approx_recip().xy()
    }

    #[inline]
    pub fn min(self, other: F32x2) -> F32x2 {
        self.to_f32x4().min(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn max(self, other: F32x2) -> F32x2 {
        self.to_f32x4().max(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn clamp(self, min: F32x2, max: F32x2) -> F32x2 {
        self.to_f32x4().clamp(min.to_f32x4(), max.to_f32x4()).xy()
    }

    #[inline]
    pub fn abs(self) -> F32x2 {
        self.to_f32x4().abs().xy()
    }

    #[inline]
    pub fn floor(self) -> F32x2 {
        self.to_f32x4().floor().xy()
    }

    #[inline]
    pub fn ceil(self) -> F32x2 {
        self.to_f32x4().ceil().xy()
    }

    #[inline]
    pub fn sqrt(self) -> F32x2 {
        self.to_f32x4().sqrt().xy()
    }

    // Packed comparisons

    #[inline]
    pub fn packed_eq(self, other: F32x2) -> U32x2 {
        self.to_f32x4().packed_eq(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn packed_gt(self, other: F32x2) -> U32x2 {
        self.to_f32x4().packed_gt(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn packed_lt(self, other: F32x2) -> U32x2 {
        self.to_f32x4().packed_lt(other.to_f32x4()).xy()
    }

    #[inline]
    pub fn packed_le(self, other: F32x2) -> U32x2 {
        self.to_f32x4().packed_le(other.to_f32x4()).xy()
    }

    // Conversions

    #[inline]
    pub fn to_f32x4(self) -> F32x4 {
        unsafe {
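            // Start from an all-zero vector, then overwrite its low 64 bits
            // (the x and y lanes) with the packed pair; z and w remain 0.0.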
            let mut result = F32x4::default();
            *mem::transmute::<&mut __m128, &mut u64>(&mut result.0) = self.0;
            result
        }
    }

    #[inline]
    pub fn to_i32x2(self) -> I32x2 {
        self.to_i32x4().xy()
    }

    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        self.to_f32x4().to_i32x4()
    }

    // Swizzle

    #[inline]
    pub fn yx(self) -> F32x2 {
        self.to_f32x4().yx()
    }

    // Concatenations

    #[inline]
    pub fn concat_xy_xy(self, other: F32x2) -> F32x4 {
        self.to_f32x4().concat_xy_xy(other.to_f32x4())
    }
}

impl Default for F32x2 {
    #[inline]
    fn default() -> F32x2 {
        F32x2(0)
    }
}

impl Index<usize> for F32x2 {
    type Output = f32;
    #[inline]
    fn index(&self, index: usize) -> &f32 {
        unsafe { &mem::transmute::<&u64, &[f32; 2]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for F32x2 {
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut f32 {
        unsafe { &mut mem::transmute::<&mut u64, &mut [f32; 2]>(&mut self.0)[index] }
    }
}

impl Debug for F32x2 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}>", self[0], self[1])
    }
}

impl PartialEq for F32x2 {
    #[inline]
    fn eq(&self, other: &F32x2) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Add<F32x2> for F32x2 {
    type Output = F32x2;
    #[inline]
    fn add(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() + other.to_f32x4()).xy()
    }
}

impl Div<F32x2> for F32x2 {
    type Output = F32x2;
    #[inline]
    fn div(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() / other.to_f32x4()).xy()
    }
}

impl Mul<F32x2> for F32x2 {
    type Output = F32x2;
    #[inline]
    fn mul(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() * other.to_f32x4()).xy()
    }
}

impl Sub<F32x2> for F32x2 {
    type Output = F32x2;
    #[inline]
    fn sub(self, other: F32x2) -> F32x2 {
        (self.to_f32x4() - other.to_f32x4()).xy()
    }
}

// Four 32-bit floats

#[derive(Clone, Copy)]
pub struct F32x4(pub __m128);

impl F32x4 {
    // Constructors

    #[inline]
    pub fn new(a: f32, b: f32, c: f32, d: f32) -> F32x4 {
        unsafe {
            let vector = [a, b, c, d];
            F32x4(x86::_mm_loadu_ps(vector.as_ptr()))
        }
    }

    #[inline]
    pub fn splat(x: f32) -> F32x4 {
        unsafe { F32x4(x86::_mm_set1_ps(x)) }
    }

    // Basic operations

    #[inline]
    pub fn approx_recip(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_rcp_ps(self.0)) }
    }

    #[inline]
    pub fn min(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_min_ps(self.0, other.0)) }
    }

    #[inline]
    pub fn max(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_max_ps(self.0, other.0)) }
    }

    #[inline]
    pub fn clamp(self, min: F32x4, max: F32x4) -> F32x4 {
        self.max(min).min(max)
    }

    #[inline]
    pub fn abs(self) -> F32x4 {
        unsafe {
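            // Shifting an all-ones lane right by one clears just the sign
            // bit, leaving the mask 0x7fff_ffff in every lane; ANDing with it
            // takes the absolute value of each float.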
            let tmp = x86::_mm_srli_epi32(I32x4::splat(-1).0, 1);
            F32x4(x86::_mm_and_ps(x86::_mm_castsi128_ps(tmp), self.0))
        }
    }

    #[inline]
    pub fn floor(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_floor_ps(self.0)) }
    }

    #[inline]
    pub fn ceil(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_ceil_ps(self.0)) }
    }

    #[inline]
    pub fn sqrt(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_sqrt_ps(self.0)) }
    }

    // Packed comparisons

    #[inline]
    pub fn packed_eq(self, other: F32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_castps_si128(x86::_mm_cmpeq_ps(self.0, other.0))) }
    }

    #[inline]
    pub fn packed_gt(self, other: F32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_castps_si128(x86::_mm_cmpgt_ps(self.0, other.0))) }
    }

    #[inline]
    pub fn packed_lt(self, other: F32x4) -> U32x4 {
        other.packed_gt(self)
    }

    #[inline]
    pub fn packed_le(self, other: F32x4) -> U32x4 {
        !self.packed_gt(other)
    }

    // Conversions

    /// Converts these packed floats to integers via rounding.
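    ///
    /// (`_mm_cvtps_epi32` rounds according to the current MXCSR rounding
    /// mode, which is round-to-nearest-even by default.)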
    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        unsafe { I32x4(x86::_mm_cvtps_epi32(self.0)) }
    }

    // Extraction

    #[inline]
    pub fn xy(self) -> F32x2 {
        unsafe {
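            // The x and y lanes occupy the low 64 bits of the vector, so
            // reinterpreting the `__m128` as a `u64` extracts exactly those
            // two lanes.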
            let swizzled = self.0;
            F32x2(*mem::transmute::<&__m128, &u64>(&swizzled))
        }
    }

    #[inline]
    pub fn xw(self) -> F32x2 {
        self.xwyz().xy()
    }

    #[inline]
    pub fn yx(self) -> F32x2 {
        self.yxwz().xy()
    }

    #[inline]
    pub fn zy(self) -> F32x2 {
        self.zyxw().xy()
    }

    #[inline]
    pub fn zw(self) -> F32x2 {
        self.zwxy().xy()
    }

    // Concatenations

    #[inline]
    pub fn concat_xy_xy(self, other: F32x4) -> F32x4 {
        unsafe {
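            // Viewing each vector as two f64 lanes lets `unpacklo_pd` move
            // the xy halves of both vectors as single 64-bit units.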
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
            let result = x86::_mm_unpacklo_pd(this, other);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    #[inline]
    pub fn concat_xy_zw(self, other: F32x4) -> F32x4 {
        unsafe {
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
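            // In the shuffle immediate 0b10, bit 0 = 0 picks the low f64 lane
            // (xy) of `this`, and bit 1 = 1 picks the high f64 lane (zw) of
            // `other`.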
            let result = x86::_mm_shuffle_pd(this, other, 0b10);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    #[inline]
    pub fn concat_zw_zw(self, other: F32x4) -> F32x4 {
        unsafe {
            let this = x86::_mm_castps_pd(self.0);
            let other = x86::_mm_castps_pd(other.0);
            let result = x86::_mm_unpackhi_pd(this, other);
            F32x4(x86::_mm_castpd_ps(result))
        }
    }

    #[inline]
    pub fn concat_wz_yx(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_shuffle_ps(self.0, other.0, 0b0001_1011)) }
    }
}

impl Default for F32x4 {
    #[inline]
    fn default() -> F32x4 {
        unsafe { F32x4(x86::_mm_setzero_ps()) }
    }
}

impl Index<usize> for F32x4 {
    type Output = f32;
    #[inline]
    fn index(&self, index: usize) -> &f32 {
        unsafe { &mem::transmute::<&__m128, &[f32; 4]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for F32x4 {
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut f32 {
        unsafe { &mut mem::transmute::<&mut __m128, &mut [f32; 4]>(&mut self.0)[index] }
    }
}

impl Debug for F32x4 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl PartialEq for F32x4 {
    #[inline]
    fn eq(&self, other: &F32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Add<F32x4> for F32x4 {
    type Output = F32x4;
    #[inline]
    fn add(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_add_ps(self.0, other.0)) }
    }
}

impl Div<F32x4> for F32x4 {
    type Output = F32x4;
    #[inline]
    fn div(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_div_ps(self.0, other.0)) }
    }
}

impl Mul<F32x4> for F32x4 {
    type Output = F32x4;
    #[inline]
    fn mul(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_mul_ps(self.0, other.0)) }
    }
}

impl Sub<F32x4> for F32x4 {
    type Output = F32x4;
    #[inline]
    fn sub(self, other: F32x4) -> F32x4 {
        unsafe { F32x4(x86::_mm_sub_ps(self.0, other.0)) }
    }
}

// Two 32-bit signed integers

#[derive(Clone, Copy)]
pub struct I32x2(pub u64);

impl I32x2 {
    // Constructors

    #[inline]
    pub fn new(a: i32, b: i32) -> I32x2 {
        unsafe {
            let a = mem::transmute::<*const i32, *const u32>(&a);
            let b = mem::transmute::<*const i32, *const u32>(&b);
            I32x2((*a as u64) | ((*b as u64) << 32))
        }
    }

    #[inline]
    pub fn splat(x: i32) -> I32x2 {
        I32x2::new(x, x)
    }

    // Accessors

    #[inline]
    pub fn x(self) -> i32 {
        self[0]
    }

    #[inline]
    pub fn y(self) -> i32 {
        self[1]
    }

    // Concatenations

    #[inline]
    pub fn concat_xy_xy(self, other: I32x2) -> I32x4 {
        self.to_i32x4().concat_xy_xy(other.to_i32x4())
    }

    // Conversions

    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        unsafe {
            let mut result = I32x4::default();
            *mem::transmute::<&mut __m128i, &mut u64>(&mut result.0) = self.0;
            result
        }
    }

    #[inline]
    pub fn to_f32x4(self) -> F32x4 {
        self.to_i32x4().to_f32x4()
    }

    /// Converts these packed integers to floats.
    #[inline]
    pub fn to_f32x2(self) -> F32x2 {
        self.to_f32x4().xy()
    }

    // Basic operations

    #[inline]
    pub fn max(self, other: I32x2) -> I32x2 {
        self.to_i32x4().max(other.to_i32x4()).xy()
    }

    #[inline]
    pub fn min(self, other: I32x2) -> I32x2 {
        self.to_i32x4().min(other.to_i32x4()).xy()
    }

    // Comparisons

    // TODO(pcwalton): Use the `U32x2` type!
    #[inline]
    pub fn packed_eq(self, other: I32x2) -> U32x4 {
        self.to_i32x4().packed_eq(other.to_i32x4())
    }

    #[inline]
    pub fn packed_gt(self, other: I32x2) -> U32x4 {
        self.to_i32x4().packed_gt(other.to_i32x4())
    }

    #[inline]
    pub fn packed_le(self, other: I32x2) -> U32x4 {
        self.to_i32x4().packed_le(other.to_i32x4())
    }
}

impl Default for I32x2 {
    #[inline]
    fn default() -> I32x2 {
        I32x2(0)
    }
}

impl Index<usize> for I32x2 {
    type Output = i32;
    #[inline]
    fn index(&self, index: usize) -> &i32 {
        unsafe { &mem::transmute::<&u64, &[i32; 2]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for I32x2 {
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut i32 {
        unsafe { &mut mem::transmute::<&mut u64, &mut [i32; 2]>(&mut self.0)[index] }
    }
}

impl Add<I32x2> for I32x2 {
    type Output = I32x2;
    #[inline]
    fn add(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() + other.to_i32x4()).xy()
    }
}

impl Sub<I32x2> for I32x2 {
    type Output = I32x2;
    #[inline]
    fn sub(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() - other.to_i32x4()).xy()
    }
}

impl Mul<I32x2> for I32x2 {
    type Output = I32x2;
    #[inline]
    fn mul(self, other: I32x2) -> I32x2 {
        (self.to_i32x4() * other.to_i32x4()).xy()
    }
}

impl Debug for I32x2 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}>", self[0], self[1])
    }
}

impl PartialEq for I32x2 {
    #[inline]
    fn eq(&self, other: &I32x2) -> bool {
        self.packed_eq(*other).all_true()
    }
}

// Four 32-bit signed integers

#[derive(Clone, Copy)]
pub struct I32x4(pub __m128i);

impl I32x4 {
    // Constructors

    #[inline]
    pub fn new(a: i32, b: i32, c: i32, d: i32) -> I32x4 {
        unsafe {
            let vector = [a, b, c, d];
            I32x4(x86::_mm_loadu_si128(vector.as_ptr() as *const __m128i))
        }
    }

    #[inline]
    pub fn splat(x: i32) -> I32x4 {
        unsafe { I32x4(x86::_mm_set1_epi32(x)) }
    }

    // Extraction

    #[inline]
    pub fn xy(self) -> I32x2 {
        unsafe {
            let swizzled = self.0;
            I32x2(*mem::transmute::<&__m128i, &u64>(&swizzled))
        }
    }

    #[inline]
    pub fn xw(self) -> I32x2 {
        self.xwyz().xy()
    }

    #[inline]
    pub fn yx(self) -> I32x2 {
        self.yxwz().xy()
    }

    #[inline]
    pub fn zy(self) -> I32x2 {
        self.zyxw().xy()
    }

    #[inline]
    pub fn zw(self) -> I32x2 {
        self.zwxy().xy()
    }

    // Concatenations

    #[inline]
    pub fn concat_xy_xy(self, other: I32x4) -> I32x4 {
        unsafe {
            let this = x86::_mm_castsi128_pd(self.0);
            let other = x86::_mm_castsi128_pd(other.0);
            let result = x86::_mm_unpacklo_pd(this, other);
            I32x4(x86::_mm_castpd_si128(result))
        }
    }

    #[inline]
    pub fn concat_zw_zw(self, other: I32x4) -> I32x4 {
        unsafe {
            let this = x86::_mm_castsi128_pd(self.0);
            let other = x86::_mm_castsi128_pd(other.0);
            let result = x86::_mm_unpackhi_pd(this, other);
            I32x4(x86::_mm_castpd_si128(result))
        }
    }

    // Conversions

    /// Converts these packed integers to floats.
    #[inline]
    pub fn to_f32x4(self) -> F32x4 {
        unsafe { F32x4(x86::_mm_cvtepi32_ps(self.0)) }
    }

    /// Converts these packed signed integers to unsigned integers.
    ///
    /// Overflowing values will wrap around.
    #[inline]
    pub fn to_u32x4(self) -> U32x4 {
        U32x4(self.0)
    }

    // Basic operations

    #[inline]
    pub fn max(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_max_epi32(self.0, other.0)) }
    }

    #[inline]
    pub fn min(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_min_epi32(self.0, other.0)) }
    }

    // Packed comparisons

    #[inline]
    pub fn packed_eq(self, other: I32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpeq_epi32(self.0, other.0)) }
    }

    #[inline]
    pub fn packed_gt(self, other: I32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpgt_epi32(self.0, other.0)) }
    }

    #[inline]
    pub fn packed_lt(self, other: I32x4) -> U32x4 {
        other.packed_gt(self)
    }

    #[inline]
    pub fn packed_le(self, other: I32x4) -> U32x4 {
        !self.packed_gt(other)
    }
}

impl Default for I32x4 {
    #[inline]
    fn default() -> I32x4 {
        unsafe { I32x4(x86::_mm_setzero_si128()) }
    }
}

impl Index<usize> for I32x4 {
    type Output = i32;
    #[inline]
    fn index(&self, index: usize) -> &i32 {
        unsafe { &mem::transmute::<&__m128i, &[i32; 4]>(&self.0)[index] }
    }
}

impl IndexMut<usize> for I32x4 {
    #[inline]
    fn index_mut(&mut self, index: usize) -> &mut i32 {
        unsafe { &mut mem::transmute::<&mut __m128i, &mut [i32; 4]>(&mut self.0)[index] }
    }
}

impl Add<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn add(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_add_epi32(self.0, other.0)) }
    }
}

impl Sub<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn sub(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_sub_epi32(self.0, other.0)) }
    }
}

impl Mul<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn mul(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_mullo_epi32(self.0, other.0)) }
    }
}

impl BitAnd<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn bitand(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_and_si128(self.0, other.0)) }
    }
}

impl BitOr<I32x4> for I32x4 {
    type Output = I32x4;
    #[inline]
    fn bitor(self, other: I32x4) -> I32x4 {
        unsafe { I32x4(x86::_mm_or_si128(self.0, other.0)) }
    }
}

impl Debug for I32x4 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl PartialEq for I32x4 {
    #[inline]
    fn eq(&self, other: &I32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}

// Two 32-bit unsigned integers

#[derive(Clone, Copy)]
pub struct U32x2(pub u64);

impl U32x2 {
    #[inline]
    pub fn new(x: u32, y: u32) -> U32x2 {
        U32x2(x as u64 | ((y as u64) << 32))
    }

    #[inline]
    pub fn splat(x: u32) -> U32x2 {
        U32x2::new(x, x)
    }

    /// Returns true if both booleans in this vector are true.
    ///
    /// The result is *undefined* if both values in this vector are not booleans. A boolean is a
    /// value with all bits set or all bits clear (i.e. !0 or 0).
    #[inline]
    pub fn all_true(self) -> bool {
        self.0 == !0
    }

    /// Returns true if both booleans in this vector are false.
    ///
    /// The result is *undefined* if both values in this vector are not booleans. A boolean is a
    /// value with all bits set or all bits clear (i.e. !0 or 0).
    #[inline]
    pub fn all_false(self) -> bool {
        self.0 == 0
    }

    #[inline]
    pub fn to_i32x2(self) -> I32x2 {
        I32x2(self.0)
    }
}

impl Not for U32x2 {
    type Output = U32x2;
    #[inline]
    fn not(self) -> U32x2 {
        U32x2(!self.0)
    }
}

impl BitAnd<U32x2> for U32x2 {
    type Output = U32x2;
    #[inline]
    fn bitand(self, other: U32x2) -> U32x2 {
        U32x2(self.0 & other.0)
    }
}

impl BitOr<U32x2> for U32x2 {
    type Output = U32x2;
    #[inline]
    fn bitor(self, other: U32x2) -> U32x2 {
        U32x2(self.0 | other.0)
    }
}

// Four 32-bit unsigned integers

#[derive(Clone, Copy)]
pub struct U32x4(pub __m128i);

impl U32x4 {
    // Constructors

    #[inline]
    pub fn new(a: u32, b: u32, c: u32, d: u32) -> U32x4 {
        unsafe {
            let vector = [a, b, c, d];
            U32x4(x86::_mm_loadu_si128(vector.as_ptr() as *const __m128i))
        }
    }

    #[inline]
    pub fn splat(x: u32) -> U32x4 {
        unsafe { U32x4(x86::_mm_set1_epi32(x as i32)) }
    }

    // Conversions

    /// Converts these packed unsigned integers to signed integers.
    ///
    /// Overflowing values will wrap around.
    #[inline]
    pub fn to_i32x4(self) -> I32x4 {
        I32x4(self.0)
    }

    // Basic operations

    /// Returns true if all four booleans in this vector are true.
    ///
    /// The result is *undefined* if all four values in this vector are not booleans. A boolean is
    /// a value with all bits set or all bits clear (i.e. !0 or 0).
    #[inline]
    pub fn all_true(self) -> bool {
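        // `_mm_movemask_ps` collects the sign (top) bit of each 32-bit lane
        // into the low four bits of the result, so 0x0f means every lane was
        // all-ones.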
        unsafe { x86::_mm_movemask_ps(x86::_mm_castsi128_ps(self.0)) == 0x0f }
    }

    /// Returns true if all four booleans in this vector are false.
    ///
    /// The result is *undefined* if all four values in this vector are not booleans. A boolean is
    /// a value with all bits set or all bits clear (i.e. !0 or 0).
    #[inline]
    pub fn all_false(self) -> bool {
        unsafe { x86::_mm_movemask_ps(x86::_mm_castsi128_ps(self.0)) == 0x00 }
    }

    // Extraction

    #[inline]
    pub fn xy(self) -> U32x2 {
        unsafe {
            let swizzled = self.0;
            U32x2(*mem::transmute::<&__m128i, &u64>(&swizzled))
        }
    }

    // Packed comparisons

    #[inline]
    pub fn packed_eq(self, other: U32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_cmpeq_epi32(self.0, other.0)) }
    }
}

impl Debug for U32x4 {
    #[inline]
    fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
        write!(f, "<{}, {}, {}, {}>", self[0], self[1], self[2], self[3])
    }
}

impl Index<usize> for U32x4 {
    type Output = u32;
    #[inline]
    fn index(&self, index: usize) -> &u32 {
        unsafe { &mem::transmute::<&__m128i, &[u32; 4]>(&self.0)[index] }
    }
}

impl PartialEq for U32x4 {
    #[inline]
    fn eq(&self, other: &U32x4) -> bool {
        self.packed_eq(*other).all_true()
    }
}

impl Not for U32x4 {
    type Output = U32x4;
    #[inline]
    fn not(self) -> U32x4 {
        self ^ U32x4::splat(!0)
    }
}

impl BitXor<U32x4> for U32x4 {
    type Output = U32x4;
    #[inline]
    fn bitxor(self, other: U32x4) -> U32x4 {
        unsafe { U32x4(x86::_mm_xor_si128(self.0, other.0)) }
    }
}

impl Shr<u32> for U32x4 {
    type Output = U32x4;
    #[inline]
    fn shr(self, amount: u32) -> U32x4 {
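        // `_mm_srl_epi32` takes its shift count from the low 64 bits of a
        // vector operand, so the scalar `amount` is placed in the x lane.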
        unsafe { U32x4(x86::_mm_srl_epi32(self.0, U32x4::new(amount, 0, 0, 0).0)) }
    }
}
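
// A small usage sketch (not from the original file): sanity checks for a few
// of the operations above. The lane values and test names are illustrative
// assumptions; the expected results follow from the lane semantics documented
// in the comments above.
#[cfg(test)]
mod tests {
    use super::{F32x4, I32x4};

    #[test]
    fn f32x4_arithmetic_and_concat() {
        let a = F32x4::new(1.0, 2.0, 3.0, 4.0);
        let b = F32x4::splat(2.0);
        // Lane-wise arithmetic.
        assert_eq!(a + b, F32x4::new(3.0, 4.0, 5.0, 6.0));
        assert_eq!(a * b, F32x4::new(2.0, 4.0, 6.0, 8.0));
        // `concat_xy_xy` joins the xy halves of both vectors.
        assert_eq!(a.concat_xy_xy(b), F32x4::new(1.0, 2.0, 2.0, 2.0));
    }

    #[test]
    fn i32x4_min_max() {
        let a = I32x4::new(1, -2, 3, -4);
        let zero = I32x4::splat(0);
        assert_eq!(a.min(zero), I32x4::new(0, -2, 0, -4));
        assert_eq!(a.max(zero), I32x4::new(1, 0, 3, 0));
    }
}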