/*
 * Distributed under the Boost Software License, Version 1.0.
 * (See accompanying file LICENSE_1_0.txt or copy at
 * http://www.boost.org/LICENSE_1_0.txt)
 *
 * Copyright (c) 2020 Andrey Semashev
 */
/*!
 * \file find_address_sse41.cpp
 *
 * This file contains the SSE4.1 implementation of the \c find_address algorithm
 */
13
14#include <boost/predef/architecture/x86.h>
15#include <boost/atomic/detail/int_sizes.hpp>
16
17#if BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8)
18
19#include <cstddef>
20#include <smmintrin.h>
21
22#include <boost/cstdint.hpp>
23#include <boost/atomic/detail/config.hpp>
24#include <boost/atomic/detail/intptr.hpp>
25#include "find_address.hpp"
26#include "x86_vector_tools.hpp"
27#include "bit_operation_tools.hpp"
28
29#include <boost/atomic/detail/header.hpp>
30
31namespace boost {
32namespace atomics {
33namespace detail {
34
35//! SSE4.1 implementation of the \c find_address algorithm
36std::size_t find_address_sse41(const volatile void* addr, const volatile void* const* addrs, std::size_t size)
37{
38 if (size < 12u)
39 return find_address_generic(addr, addrs, size);
40
41 const __m128i mm_addr = mm_set1_epiptr(ptr: (uintptr_t)addr);
42 std::size_t pos = 0u;
43 const std::size_t n = (size + 1u) & ~static_cast< std::size_t >(1u);
44 for (std::size_t m = n & ~static_cast< std::size_t >(15u); pos < m; pos += 16u)
45 {
46 __m128i mm1 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos));
47 __m128i mm2 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 2u));
48 __m128i mm3 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 4u));
49 __m128i mm4 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 6u));
50 __m128i mm5 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 8u));
51 __m128i mm6 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 10u));
52 __m128i mm7 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 12u));
53 __m128i mm8 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 14u));
54
55 mm1 = _mm_cmpeq_epi64(V1: mm1, V2: mm_addr);
56 mm2 = _mm_cmpeq_epi64(V1: mm2, V2: mm_addr);
57 mm3 = _mm_cmpeq_epi64(V1: mm3, V2: mm_addr);
58 mm4 = _mm_cmpeq_epi64(V1: mm4, V2: mm_addr);
59 mm5 = _mm_cmpeq_epi64(V1: mm5, V2: mm_addr);
60 mm6 = _mm_cmpeq_epi64(V1: mm6, V2: mm_addr);
61 mm7 = _mm_cmpeq_epi64(V1: mm7, V2: mm_addr);
62 mm8 = _mm_cmpeq_epi64(V1: mm8, V2: mm_addr);
63
64 mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
65 mm3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(2, 0, 2, 0)));
66 mm5 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm5), _mm_castsi128_ps(mm6), _MM_SHUFFLE(2, 0, 2, 0)));
67 mm7 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm7), _mm_castsi128_ps(mm8), _MM_SHUFFLE(2, 0, 2, 0)));
68
69 mm1 = _mm_packs_epi32(a: mm1, b: mm3);
70 mm5 = _mm_packs_epi32(a: mm5, b: mm7);
71
72 mm1 = _mm_packs_epi16(a: mm1, b: mm5);
73
74 uint32_t mask = _mm_movemask_epi8(a: mm1);
75 if (mask)
76 {
77 pos += atomics::detail::count_trailing_zeros(x: mask);
78 goto done;
79 }
80 }
81
82 if ((n - pos) >= 8u)
83 {
84 __m128i mm1 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos));
85 __m128i mm2 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 2u));
86 __m128i mm3 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 4u));
87 __m128i mm4 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 6u));
88
89 mm1 = _mm_cmpeq_epi64(V1: mm1, V2: mm_addr);
90 mm2 = _mm_cmpeq_epi64(V1: mm2, V2: mm_addr);
91 mm3 = _mm_cmpeq_epi64(V1: mm3, V2: mm_addr);
92 mm4 = _mm_cmpeq_epi64(V1: mm4, V2: mm_addr);
93
94 mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
95 mm3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(2, 0, 2, 0)));
96
97 mm1 = _mm_packs_epi32(a: mm1, b: mm3);
98
99 uint32_t mask = _mm_movemask_epi8(a: mm1);
100 if (mask)
101 {
102 pos += atomics::detail::count_trailing_zeros(x: mask) / 2u;
103 goto done;
104 }
105
106 pos += 8u;
107 }
108
109 if ((n - pos) >= 4u)
110 {
111 __m128i mm1 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos));
112 __m128i mm2 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 2u));
113
114 mm1 = _mm_cmpeq_epi64(V1: mm1, V2: mm_addr);
115 mm2 = _mm_cmpeq_epi64(V1: mm2, V2: mm_addr);
116
117 mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0)));
118
119 uint32_t mask = _mm_movemask_ps(a: _mm_castsi128_ps(a: mm1));
120 if (mask)
121 {
122 pos += atomics::detail::count_trailing_zeros(x: mask);
123 goto done;
124 }
125
126 pos += 4u;
127 }
128
129 if (pos < n)
130 {
131 __m128i mm1 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos));
132
133 mm1 = _mm_cmpeq_epi64(V1: mm1, V2: mm_addr);
134 uint32_t mask = _mm_movemask_pd(a: _mm_castsi128_pd(a: mm1));
135 if (mask)
136 {
137 pos += atomics::detail::count_trailing_zeros(x: mask);
138 goto done;
139 }
140
141 pos += 2u;
142 }
143
144done:
145 return pos;
146}
147
148} // namespace detail
149} // namespace atomics
150} // namespace boost
151
152#include <boost/atomic/detail/footer.hpp>
153
154#endif // BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8)
155

// Source: boost/libs/atomic/src/find_address_sse41.cpp