| 1 | /* |
| 2 | * Distributed under the Boost Software License, Version 1.0. |
| 3 | * (See accompanying file LICENSE_1_0.txt or copy at |
| 4 | * http://www.boost.org/LICENSE_1_0.txt) |
| 5 | * |
| 6 | * Copyright (c) 2020 Andrey Semashev |
| 7 | */ |
| 8 | /*! |
| 9 | * \file find_address_sse41.cpp |
| 10 | * |
| 11 | * This file contains SSE4.1 implementation of the \c find_address algorithm |
| 12 | */ |
| 13 | |
| 14 | #include <boost/predef/architecture/x86.h> |
| 15 | #include <boost/atomic/detail/int_sizes.hpp> |
| 16 | |
| 17 | #if BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8) |
| 18 | |
| 19 | #include <cstddef> |
| 20 | #include <smmintrin.h> |
| 21 | |
| 22 | #include <boost/cstdint.hpp> |
| 23 | #include <boost/atomic/detail/config.hpp> |
| 24 | #include <boost/atomic/detail/intptr.hpp> |
| 25 | #include "find_address.hpp" |
| 26 | #include "x86_vector_tools.hpp" |
| 27 | #include "bit_operation_tools.hpp" |
| 28 | |
| 29 | #include <boost/atomic/detail/header.hpp> |
| 30 | |
| 31 | namespace boost { |
| 32 | namespace atomics { |
| 33 | namespace detail { |
| 34 | |
| 35 | //! SSE4.1 implementation of the \c find_address algorithm |
| 36 | std::size_t find_address_sse41(const volatile void* addr, const volatile void* const* addrs, std::size_t size) |
| 37 | { |
| 38 | if (size < 12u) |
| 39 | return find_address_generic(addr, addrs, size); |
| 40 | |
| 41 | const __m128i mm_addr = mm_set1_epiptr(ptr: (uintptr_t)addr); |
| 42 | std::size_t pos = 0u; |
| 43 | const std::size_t n = (size + 1u) & ~static_cast< std::size_t >(1u); |
| 44 | for (std::size_t m = n & ~static_cast< std::size_t >(15u); pos < m; pos += 16u) |
| 45 | { |
| 46 | __m128i mm1 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos)); |
| 47 | __m128i mm2 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 2u)); |
| 48 | __m128i mm3 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 4u)); |
| 49 | __m128i mm4 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 6u)); |
| 50 | __m128i mm5 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 8u)); |
| 51 | __m128i mm6 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 10u)); |
| 52 | __m128i mm7 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 12u)); |
| 53 | __m128i mm8 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 14u)); |
| 54 | |
| 55 | mm1 = _mm_cmpeq_epi64(V1: mm1, V2: mm_addr); |
| 56 | mm2 = _mm_cmpeq_epi64(V1: mm2, V2: mm_addr); |
| 57 | mm3 = _mm_cmpeq_epi64(V1: mm3, V2: mm_addr); |
| 58 | mm4 = _mm_cmpeq_epi64(V1: mm4, V2: mm_addr); |
| 59 | mm5 = _mm_cmpeq_epi64(V1: mm5, V2: mm_addr); |
| 60 | mm6 = _mm_cmpeq_epi64(V1: mm6, V2: mm_addr); |
| 61 | mm7 = _mm_cmpeq_epi64(V1: mm7, V2: mm_addr); |
| 62 | mm8 = _mm_cmpeq_epi64(V1: mm8, V2: mm_addr); |
| 63 | |
| 64 | mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0))); |
| 65 | mm3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(2, 0, 2, 0))); |
| 66 | mm5 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm5), _mm_castsi128_ps(mm6), _MM_SHUFFLE(2, 0, 2, 0))); |
| 67 | mm7 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm7), _mm_castsi128_ps(mm8), _MM_SHUFFLE(2, 0, 2, 0))); |
| 68 | |
| 69 | mm1 = _mm_packs_epi32(a: mm1, b: mm3); |
| 70 | mm5 = _mm_packs_epi32(a: mm5, b: mm7); |
| 71 | |
| 72 | mm1 = _mm_packs_epi16(a: mm1, b: mm5); |
| 73 | |
| 74 | uint32_t mask = _mm_movemask_epi8(a: mm1); |
| 75 | if (mask) |
| 76 | { |
| 77 | pos += atomics::detail::count_trailing_zeros(x: mask); |
| 78 | goto done; |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | if ((n - pos) >= 8u) |
| 83 | { |
| 84 | __m128i mm1 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos)); |
| 85 | __m128i mm2 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 2u)); |
| 86 | __m128i mm3 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 4u)); |
| 87 | __m128i mm4 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 6u)); |
| 88 | |
| 89 | mm1 = _mm_cmpeq_epi64(V1: mm1, V2: mm_addr); |
| 90 | mm2 = _mm_cmpeq_epi64(V1: mm2, V2: mm_addr); |
| 91 | mm3 = _mm_cmpeq_epi64(V1: mm3, V2: mm_addr); |
| 92 | mm4 = _mm_cmpeq_epi64(V1: mm4, V2: mm_addr); |
| 93 | |
| 94 | mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0))); |
| 95 | mm3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm3), _mm_castsi128_ps(mm4), _MM_SHUFFLE(2, 0, 2, 0))); |
| 96 | |
| 97 | mm1 = _mm_packs_epi32(a: mm1, b: mm3); |
| 98 | |
| 99 | uint32_t mask = _mm_movemask_epi8(a: mm1); |
| 100 | if (mask) |
| 101 | { |
| 102 | pos += atomics::detail::count_trailing_zeros(x: mask) / 2u; |
| 103 | goto done; |
| 104 | } |
| 105 | |
| 106 | pos += 8u; |
| 107 | } |
| 108 | |
| 109 | if ((n - pos) >= 4u) |
| 110 | { |
| 111 | __m128i mm1 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos)); |
| 112 | __m128i mm2 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos + 2u)); |
| 113 | |
| 114 | mm1 = _mm_cmpeq_epi64(V1: mm1, V2: mm_addr); |
| 115 | mm2 = _mm_cmpeq_epi64(V1: mm2, V2: mm_addr); |
| 116 | |
| 117 | mm1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(mm1), _mm_castsi128_ps(mm2), _MM_SHUFFLE(2, 0, 2, 0))); |
| 118 | |
| 119 | uint32_t mask = _mm_movemask_ps(a: _mm_castsi128_ps(a: mm1)); |
| 120 | if (mask) |
| 121 | { |
| 122 | pos += atomics::detail::count_trailing_zeros(x: mask); |
| 123 | goto done; |
| 124 | } |
| 125 | |
| 126 | pos += 4u; |
| 127 | } |
| 128 | |
| 129 | if (pos < n) |
| 130 | { |
| 131 | __m128i mm1 = _mm_load_si128(p: reinterpret_cast< const __m128i* >(addrs + pos)); |
| 132 | |
| 133 | mm1 = _mm_cmpeq_epi64(V1: mm1, V2: mm_addr); |
| 134 | uint32_t mask = _mm_movemask_pd(a: _mm_castsi128_pd(a: mm1)); |
| 135 | if (mask) |
| 136 | { |
| 137 | pos += atomics::detail::count_trailing_zeros(x: mask); |
| 138 | goto done; |
| 139 | } |
| 140 | |
| 141 | pos += 2u; |
| 142 | } |
| 143 | |
| 144 | done: |
| 145 | return pos; |
| 146 | } |
| 147 | |
| 148 | } // namespace detail |
| 149 | } // namespace atomics |
| 150 | } // namespace boost |
| 151 | |
| 152 | #include <boost/atomic/detail/footer.hpp> |
| 153 | |
| 154 | #endif // BOOST_ARCH_X86 && defined(BOOST_ATOMIC_DETAIL_SIZEOF_POINTER) && (BOOST_ATOMIC_DETAIL_SIZEOF_POINTER == 8) |
| 155 | |