| 1 | /* |
| 2 | * Copyright Andrey Semashev 2007 - 2021. |
| 3 | * Distributed under the Boost Software License, Version 1.0. |
| 4 | * (See accompanying file LICENSE_1_0.txt or copy at |
| 5 | * http://www.boost.org/LICENSE_1_0.txt) |
| 6 | */ |
| 7 | /*! |
| 8 | * \file dump_avx2.cpp |
| 9 | * \author Andrey Semashev |
| 10 | * \date 05.05.2013 |
| 11 | * |
| 12 | * \brief This header is the Boost.Log library implementation, see the library documentation |
| 13 | * at http://www.boost.org/doc/libs/release/libs/log/doc/html/index.html. |
| 14 | */ |
| 15 | |
| 16 | // NOTE: You should generally avoid including headers as much as possible here, because this file |
| 17 | // is compiled with special compiler options, and any included header may result in generation of |
| 18 | // unintended code with these options and violation of ODR. |
| 19 | #include <boost/log/detail/config.hpp> |
| 20 | #include <ostream> |
| 21 | #include <immintrin.h> |
| 22 | #include <boost/cstdint.hpp> |
| 23 | #include <boost/log/detail/header.hpp> |
| 24 | |
| 25 | #if defined(__x86_64) || defined(__x86_64__) || \ |
| 26 | defined(__amd64__) || defined(__amd64) || \ |
| 27 | defined(_M_X64) |
| 28 | #define BOOST_LOG_AUX_X86_64 |
| 29 | #endif |
| 30 | |
| 31 | namespace boost { |
| 32 | |
| 33 | BOOST_LOG_OPEN_NAMESPACE |
| 34 | |
| 35 | namespace aux { |
| 36 | |
| 37 | extern const char g_hex_char_table[2][16]; |
| 38 | |
| 39 | template< typename CharT > |
| 40 | extern void dump_data_generic(const void* data, std::size_t size, std::basic_ostream< CharT >& strm); |
| 41 | |
| 42 | BOOST_LOG_ANONYMOUS_NAMESPACE { |
| 43 | |
//! Buffering parameters of the AVX2 dump routine
enum
{
    //! Number of 32-byte input packs that are formatted between two consecutive writes to the stream
    packs_per_stride = 32,
    //! Number of input bytes consumed by one stride (every pack is 32 bytes long)
    stride = 32 * packs_per_stride
};
| 49 | |
| 50 | template< typename CharT > |
| 51 | BOOST_FORCEINLINE void store_characters(__m128i mm_chars, CharT* buf) |
| 52 | { |
| 53 | switch (sizeof(CharT)) |
| 54 | { |
| 55 | case 1: |
| 56 | _mm_store_si128(p: reinterpret_cast< __m128i* >(buf), b: mm_chars); |
| 57 | break; |
| 58 | |
| 59 | case 2: |
| 60 | _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf), a: _mm256_cvtepu8_epi16(V: mm_chars)); |
| 61 | break; |
| 62 | |
| 63 | case 4: |
| 64 | { |
| 65 | __m128i mm = _mm_unpackhi_epi64(a: mm_chars, b: mm_chars); |
| 66 | _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf), a: _mm256_cvtepu8_epi32(V: mm_chars)); |
| 67 | _mm256_store_si256(p: reinterpret_cast< __m256i* >(buf) + 1, a: _mm256_cvtepu8_epi32(V: mm)); |
| 68 | } |
| 69 | break; |
| 70 | } |
| 71 | } |
| 72 | |
| 73 | template< typename CharT > |
| 74 | BOOST_FORCEINLINE void store_characters_x3(__m256i mm_chars1, __m256i mm_chars2, __m256i mm_chars3, CharT* buf) |
| 75 | { |
| 76 | store_characters(_mm256_castsi256_si128(a: mm_chars1), buf); |
| 77 | store_characters(_mm256_castsi256_si128(a: mm_chars2), buf + 16); |
| 78 | store_characters(_mm256_castsi256_si128(a: mm_chars3), buf + 32); |
| 79 | store_characters(_mm256_extracti128_si256(mm_chars1, 1), buf + 48); |
| 80 | store_characters(_mm256_extracti128_si256(mm_chars2, 1), buf + 64); |
| 81 | store_characters(_mm256_extracti128_si256(mm_chars3, 1), buf + 80); |
| 82 | } |
| 83 | |
| 84 | union ymm_constant |
| 85 | { |
| 86 | uint8_t as_bytes[32]; |
| 87 | __m256i as_mm; |
| 88 | |
| 89 | BOOST_FORCEINLINE operator __m256i () const { return as_mm; } |
| 90 | BOOST_FORCEINLINE operator __m128i () const { return _mm256_castsi256_si128(a: as_mm); } |
| 91 | }; |
| 92 | |
| 93 | static const ymm_constant mm_shuffle_pattern1 = {.as_bytes: { 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80 }}; |
| 94 | static const ymm_constant mm_shuffle_pattern2 = {.as_bytes: { 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10 }}; |
| 95 | static const ymm_constant mm_shuffle_pattern3 = {.as_bytes: { 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }}; |
| 96 | static const ymm_constant mm_shuffle_pattern13 = {.as_bytes: { 0x80, 0, 1, 0x80, 2, 3, 0x80, 4, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 5, 0x80, 6, 7, 0x80, 8, 9, 0x80, 10, 11, 0x80, 12, 13, 0x80, 14, 15 }}; |
| 97 | |
| 98 | #if defined(BOOST_LOG_AUX_X86_64) |
| 99 | |
| 100 | // x86-64 architecture has more registers which we can utilize to pass constants |
| 101 | #define BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL __m256i mm_15, __m256i mm_char_space, |
| 102 | #define BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_15, mm_char_space, |
| 103 | #define BOOST_LOG_AUX_MM256_CONSTANTS \ |
| 104 | const __m256i mm_15 = _mm256_set1_epi32(0x0F0F0F0F);\ |
| 105 | const __m256i mm_char_space = _mm256_set1_epi32(0x20202020); |
| 106 | #define BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL __m128i mm_15, __m128i mm_char_space, |
| 107 | #define BOOST_LOG_AUX_MM128_CONSTANT_ARGS mm_15, mm_char_space, |
| 108 | #define BOOST_LOG_AUX_MM128_CONSTANTS \ |
| 109 | const __m128i mm_15 = _mm_set1_epi32(0x0F0F0F0F);\ |
| 110 | const __m128i mm_char_space = _mm_set1_epi32(0x20202020); |
| 111 | |
| 112 | #else |
| 113 | |
| 114 | // MSVC in 32-bit mode is not able to pass all constants to dump_pack, and is also not able to align them on the stack, so we have to fetch them from global constants |
| 115 | static const ymm_constant mm_15 = {{ 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F }}; |
| 116 | static const ymm_constant mm_char_space = {{ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20 }}; |
| 117 | #define BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL |
| 118 | #define BOOST_LOG_AUX_MM256_CONSTANT_ARGS |
| 119 | #define BOOST_LOG_AUX_MM256_CONSTANTS |
| 120 | #define BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL |
| 121 | #define BOOST_LOG_AUX_MM128_CONSTANT_ARGS |
| 122 | #define BOOST_LOG_AUX_MM128_CONSTANTS |
| 123 | |
| 124 | #endif |
| 125 | |
| 126 | /*! |
| 127 | * \brief Dumps a pack of input data into a string of 8 bit ASCII characters. |
| 128 | * |
| 129 | * The composed string is placed as follows (in Intel notation): mm_output1[127:0], mm_output2[127:0], mm_output3[127:0], mm_output1[255:128], mm_output2[255:128], mm_output3[255:128]. |
| 130 | */ |
| 131 | static BOOST_FORCEINLINE void dump_pack |
| 132 | ( |
| 133 | BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL |
| 134 | __m256i mm_char_table, __m256i mm_input, |
| 135 | __m256i& mm_output1, __m256i& mm_output2, __m256i& mm_output3 |
| 136 | ) |
| 137 | { |
| 138 | // Split half-bytes |
| 139 | __m256i mm_input_hi = _mm256_and_si256(a: _mm256_srli_epi16(a: mm_input, count: 4), b: mm_15); |
| 140 | __m256i mm_input_lo = _mm256_and_si256(a: mm_input, b: mm_15); |
| 141 | |
| 142 | // Stringize each of the halves |
| 143 | mm_input_hi = _mm256_shuffle_epi8(a: mm_char_table, b: mm_input_hi); |
| 144 | mm_input_lo = _mm256_shuffle_epi8(a: mm_char_table, b: mm_input_lo); |
| 145 | |
| 146 | // Join them back together |
| 147 | __m256i mm_1 = _mm256_unpacklo_epi8(a: mm_input_hi, b: mm_input_lo); |
| 148 | __m256i mm_2 = _mm256_unpackhi_epi8(a: mm_input_hi, b: mm_input_lo); |
| 149 | |
| 150 | // Insert spaces between stringized bytes: |
| 151 | // |0123456789abcdef|0123456789abcdef| |
| 152 | // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef| |
| 153 | __m256i mm_out1 = _mm256_shuffle_epi8(a: mm_1, b: mm_shuffle_pattern1.as_mm); |
| 154 | __m256i mm_out3 = _mm256_shuffle_epi8(a: mm_2, b: mm_shuffle_pattern3.as_mm); |
| 155 | __m256i mm_out2 = _mm256_shuffle_epi8(_mm256_alignr_epi8(mm_2, mm_1, 10), b: mm_shuffle_pattern2.as_mm); |
| 156 | |
| 157 | mm_output1 = _mm256_max_epu8(a: mm_out1, b: mm_char_space); |
| 158 | mm_output2 = _mm256_max_epu8(a: mm_out2, b: mm_char_space); |
| 159 | mm_output3 = _mm256_max_epu8(a: mm_out3, b: mm_char_space); |
| 160 | } |
| 161 | |
| 162 | //! Dumps a pack of input data into a string of 8 bit ASCII characters |
| 163 | static BOOST_FORCEINLINE void dump_pack |
| 164 | ( |
| 165 | BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL |
| 166 | __m128i mm_char_table, __m128i mm_input, |
| 167 | __m128i& mm_output1, __m128i& mm_output2, __m128i& mm_output3 |
| 168 | ) |
| 169 | { |
| 170 | // Split half-bytes |
| 171 | __m128i mm_input_hi = _mm_and_si128(a: _mm_srli_epi16(a: mm_input, count: 4), b: mm_15); |
| 172 | __m128i mm_input_lo = _mm_and_si128(a: mm_input, b: mm_15); |
| 173 | |
| 174 | // Stringize each of the halves |
| 175 | mm_input_hi = _mm_shuffle_epi8(a: mm_char_table, b: mm_input_hi); |
| 176 | mm_input_lo = _mm_shuffle_epi8(a: mm_char_table, b: mm_input_lo); |
| 177 | |
| 178 | // Join them back together |
| 179 | __m128i mm_1 = _mm_unpacklo_epi8(a: mm_input_hi, b: mm_input_lo); |
| 180 | __m128i mm_2 = _mm_unpackhi_epi8(a: mm_input_hi, b: mm_input_lo); |
| 181 | |
| 182 | // Insert spaces between stringized bytes: |
| 183 | // |0123456789abcdef|0123456789abcdef| |
| 184 | // | 01 23 45 67 89 |ab cd ef 01 23 4|5 67 89 ab cd ef| |
| 185 | mm_output1 = _mm_shuffle_epi8(a: mm_1, b: mm_shuffle_pattern1); |
| 186 | mm_output2 = _mm_shuffle_epi8(_mm_alignr_epi8(mm_2, mm_1, 10), b: mm_shuffle_pattern2); |
| 187 | mm_output3 = _mm_shuffle_epi8(a: mm_2, b: mm_shuffle_pattern3); |
| 188 | |
| 189 | mm_output1 = _mm_max_epu8(a: mm_output1, b: mm_char_space); |
| 190 | mm_output2 = _mm_max_epu8(a: mm_output2, b: mm_char_space); |
| 191 | mm_output3 = _mm_max_epu8(a: mm_output3, b: mm_char_space); |
| 192 | } |
| 193 | |
| 194 | template< typename CharT > |
| 195 | BOOST_FORCEINLINE void dump_data_avx2(const void* data, std::size_t size, std::basic_ostream< CharT >& strm) |
| 196 | { |
| 197 | typedef CharT char_type; |
| 198 | |
| 199 | char_type buf_storage[stride * 3u + 32u]; |
| 200 | // Align the temporary buffer at 32 bytes |
| 201 | char_type* const buf = reinterpret_cast< char_type* >((uint8_t*)buf_storage + (32u - (((uintptr_t)(char_type*)buf_storage) & 31u))); |
| 202 | char_type* buf_begin = buf + 1u; // skip the first space of the first chunk |
| 203 | char_type* buf_end = buf + stride * 3u; |
| 204 | |
| 205 | const char* const char_table = g_hex_char_table[(strm.flags() & std::ios_base::uppercase) != 0]; |
| 206 | #if defined(__GNUC__) && ((defined(BOOST_GCC) && BOOST_GCC < 40900) || (defined(BOOST_CLANG) && BOOST_CLANG_VERSION < 40000)) |
| 207 | // gcc 4.7 is missing _mm256_broadcastsi128_si256 declaration in immintrin.h. |
| 208 | // gcc 4.8 generates vmovdqu+vinserti128 instead of a single vbroadcasti128. |
| 209 | // clang up until 4.0 generates vmovdqu+vinserti128 or worse. |
| 210 | __m256i mm_char_table; |
| 211 | __asm__("vbroadcasti128 %1, %0" : "=x" (mm_char_table) : "m" (*reinterpret_cast< const __m128i* >(char_table))); |
| 212 | #else |
| 213 | const __m256i mm_char_table = _mm256_broadcastsi128_si256(X: _mm_loadu_si128(p: reinterpret_cast< const __m128i* >(char_table))); |
| 214 | #endif |
| 215 | |
| 216 | // First, check the input alignment. Also, if we can dump the whole data in one go, do it right away. It turns out to be faster than splitting |
| 217 | // the work between prealign and tail part. It is also a fairly common case since on most platforms memory is not aligned to 32 bytes (i.e. prealign is often needed). |
| 218 | const uint8_t* p = static_cast< const uint8_t* >(data); |
| 219 | const std::size_t prealign_size = size == 32u ? static_cast< std::size_t >(32u) : static_cast< std::size_t >((32u - ((uintptr_t)p & 31u)) & 31u); |
| 220 | if (prealign_size) |
| 221 | { |
| 222 | __m256i mm_input = _mm256_loadu_si256(p: reinterpret_cast< const __m256i* >(p)); |
| 223 | BOOST_LOG_AUX_MM256_CONSTANTS |
| 224 | |
| 225 | __m256i mm_output1, mm_output2, mm_output3; |
| 226 | dump_pack(BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_char_table, mm_input, mm_output1, mm_output2, mm_output3); |
| 227 | |
| 228 | store_characters_x3(mm_output1, mm_output2, mm_output3, buf); |
| 229 | |
| 230 | _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call |
| 231 | strm.write(buf_begin, prealign_size * 3u - 1u); |
| 232 | |
| 233 | buf_begin = buf; |
| 234 | size -= prealign_size; |
| 235 | p += prealign_size; |
| 236 | } |
| 237 | |
| 238 | const std::size_t stride_count = size / stride; |
| 239 | std::size_t tail_size = size % stride; |
| 240 | for (std::size_t i = 0; i < stride_count; ++i) |
| 241 | { |
| 242 | char_type* b = buf; |
| 243 | BOOST_LOG_AUX_MM256_CONSTANTS |
| 244 | |
| 245 | for (unsigned int j = 0; j < packs_per_stride; ++j, b += 3u * 32u, p += 32u) |
| 246 | { |
| 247 | __m256i mm_input = _mm256_load_si256(p: reinterpret_cast< const __m256i* >(p)); |
| 248 | __m256i mm_output1, mm_output2, mm_output3; |
| 249 | dump_pack(BOOST_LOG_AUX_MM256_CONSTANT_ARGS mm_char_table, mm_input, mm_output1, mm_output2, mm_output3); |
| 250 | |
| 251 | store_characters_x3(mm_output1, mm_output2, mm_output3, b); |
| 252 | } |
| 253 | |
| 254 | _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call |
| 255 | strm.write(buf_begin, buf_end - buf_begin); |
| 256 | buf_begin = buf; |
| 257 | } |
| 258 | |
| 259 | if (BOOST_UNLIKELY(tail_size > 0)) |
| 260 | { |
| 261 | char_type* b = buf; |
| 262 | while (tail_size >= 16u) |
| 263 | { |
| 264 | __m128i mm_input = _mm_load_si128(p: reinterpret_cast< const __m128i* >(p)); |
| 265 | BOOST_LOG_AUX_MM128_CONSTANTS |
| 266 | |
| 267 | __m128i mm_output1, mm_output2, mm_output3; |
| 268 | dump_pack(BOOST_LOG_AUX_MM128_CONSTANT_ARGS mm_char_table: _mm256_castsi256_si128(a: mm_char_table), mm_input, mm_output1, mm_output2, mm_output3); |
| 269 | |
| 270 | store_characters(mm_output1, b); |
| 271 | store_characters(mm_output2, b + 16u); |
| 272 | store_characters(mm_output3, b + 32u); |
| 273 | |
| 274 | b += 3u * 16u; |
| 275 | p += 16u; |
| 276 | tail_size -= 16u; |
| 277 | } |
| 278 | |
| 279 | _mm256_zeroall(); // need to zero all ymm registers to avoid register spills/restores the compler generates around the function call |
| 280 | for (unsigned int i = 0; i < tail_size; ++i, ++p, b += 3u) |
| 281 | { |
| 282 | uint32_t n = *p; |
| 283 | b[0] = static_cast< char_type >(' '); |
| 284 | b[1] = static_cast< char_type >(char_table[n >> 4]); |
| 285 | b[2] = static_cast< char_type >(char_table[n & 0x0F]); |
| 286 | } |
| 287 | |
| 288 | strm.write(buf_begin, b - buf_begin); |
| 289 | } |
| 290 | } |
| 291 | |
// Remove the helper macros, they are not needed past this point
#undef BOOST_LOG_AUX_MM256_CONSTANT_ARGS_DECL
#undef BOOST_LOG_AUX_MM256_CONSTANT_ARGS
#undef BOOST_LOG_AUX_MM256_CONSTANTS
#undef BOOST_LOG_AUX_MM128_CONSTANT_ARGS_DECL
#undef BOOST_LOG_AUX_MM128_CONSTANT_ARGS
#undef BOOST_LOG_AUX_MM128_CONSTANTS
| 297 | |
| 298 | } // namespace |
| 299 | |
| 300 | void dump_data_char_avx2(const void* data, std::size_t size, std::basic_ostream< char >& strm) |
| 301 | { |
| 302 | if (size >= 32) |
| 303 | { |
| 304 | dump_data_avx2(data, size, strm); |
| 305 | } |
| 306 | else |
| 307 | { |
| 308 | dump_data_generic(data, size, strm); |
| 309 | } |
| 310 | } |
| 311 | |
| 312 | void dump_data_wchar_avx2(const void* data, std::size_t size, std::basic_ostream< wchar_t >& strm) |
| 313 | { |
| 314 | if (size >= 32) |
| 315 | { |
| 316 | dump_data_avx2(data, size, strm); |
| 317 | } |
| 318 | else |
| 319 | { |
| 320 | dump_data_generic(data, size, strm); |
| 321 | } |
| 322 | } |
| 323 | |
| 324 | #if !defined(BOOST_NO_CXX11_CHAR16_T) |
| 325 | void dump_data_char16_avx2(const void* data, std::size_t size, std::basic_ostream< char16_t >& strm) |
| 326 | { |
| 327 | if (size >= 32) |
| 328 | { |
| 329 | dump_data_avx2(data, size, strm); |
| 330 | } |
| 331 | else |
| 332 | { |
| 333 | dump_data_generic(data, size, strm); |
| 334 | } |
| 335 | } |
| 336 | #endif |
| 337 | |
| 338 | #if !defined(BOOST_NO_CXX11_CHAR32_T) |
| 339 | void dump_data_char32_avx2(const void* data, std::size_t size, std::basic_ostream< char32_t >& strm) |
| 340 | { |
| 341 | if (size >= 32) |
| 342 | { |
| 343 | dump_data_avx2(data, size, strm); |
| 344 | } |
| 345 | else |
| 346 | { |
| 347 | dump_data_generic(data, size, strm); |
| 348 | } |
| 349 | } |
| 350 | #endif |
| 351 | |
| 352 | } // namespace aux |
| 353 | |
| 354 | BOOST_LOG_CLOSE_NAMESPACE // namespace log |
| 355 | |
| 356 | } // namespace boost |
| 357 | |
| 358 | #include <boost/log/detail/footer.hpp> |
| 359 | |