| 1 | /* Function cbrt vectorized with AVX-512. |
| 2 | Copyright (C) 2021-2024 Free Software Foundation, Inc. |
| 3 | This file is part of the GNU C Library. |
| 4 | |
| 5 | The GNU C Library is free software; you can redistribute it and/or |
| 6 | modify it under the terms of the GNU Lesser General Public |
| 7 | License as published by the Free Software Foundation; either |
| 8 | version 2.1 of the License, or (at your option) any later version. |
| 9 | |
| 10 | The GNU C Library is distributed in the hope that it will be useful, |
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | Lesser General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU Lesser General Public |
| 16 | License along with the GNU C Library; if not, see |
| 17 | https://www.gnu.org/licenses/. */ |
| 18 | |
| 19 | /* |
| 20 | * ALGORITHM DESCRIPTION: |
| 21 | * |
| 22 | * x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52 |
| 23 | * Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5], |
| 24 | * where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in double precision |
| 25 | * cbrt(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5] |
| 26 | * (T stores the high 53 bits, D stores the low order bits) |
| 27 | * Result=2^k*T+(2^k*T*r)*P+2^k*D |
| 28 | * where P=p1+p2*r+..+p8*r^7 |
| 29 | * |
| 30 | */ |
| 31 | |
/* Offsets (in bytes) into the data table __svml_dcbrt_data_internal_avx512.
   Each field is one 64-byte ZMM vector (8 doubles), except cbrt_tbl_H,
   which is two (16 doubles, indexed by the top mantissa bits).  */
#define etbl_H 0	/* 2^(j/3) high parts, j = exponent%3 (and negated copies) */
#define etbl_L 64	/* 2^(j/3) low-order correction parts */
#define cbrt_tbl_H 128	/* cbrt(1.b1..b5 1) high parts, 16 entries */
#define BiasL 256	/* 0x4338... shifter to extract the integer exponent */
#define SZero 320	/* sign-bit mask (0x8000000000000000) */
#define OneThird 384	/* ~1/3, for exponent/3 */
#define Bias3 448	/* bias compensation for the exponent/3 computation */
#define Three 512	/* 3.0, to recover exponent%3 */
#define One 576	/* 1.0, for the reduced argument R = DblRcp*Mantissa - 1 */
#define poly_coeff10 640	/* polynomial coefficients, highest degree first */
#define poly_coeff9 704
#define poly_coeff8 768
#define poly_coeff7 832
#define poly_coeff6 896
#define poly_coeff5 960
#define poly_coeff4 1024
#define poly_coeff3 1088
#define poly_coeff2 1152
#define poly_coeff1 1216
| 53 | |
| 54 | #include <sysdep.h> |
| 55 | |
	/* double cbrt(double) vectorized for 8 lanes (zmm), AVX-512/SKX.
	   ABI: _ZGVeN8v — SysV vector ABI, input in %zmm0, result in %zmm0.
	   Branch-free: special cases (0, Inf, NaN, denormals) fall out of the
	   vgetmant/vgetexp/vscalef arithmetic without a fallback path.
	   Clobbers: %zmm0-%zmm15, flags untouched.  */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_cbrt_skx)
	/* Mantissa(x) normalized to [1, 2) ($0 = norm interval, sign off).  */
	vgetmantpd $0, {sae}, %zmm0, %zmm14

	/* GetExp(x): unbiased exponent of x as a double.  */
	vgetexppd {sae}, %zmm0, %zmm7
	vmovups BiasL+__svml_dcbrt_data_internal_avx512(%rip), %zmm8

	/* exponent/3 */
	vmovups OneThird+__svml_dcbrt_data_internal_avx512(%rip), %zmm9
	vmovups Bias3+__svml_dcbrt_data_internal_avx512(%rip), %zmm10

	/* Reduced argument: R = DblRcp*Mantissa - 1 */
	vmovups One+__svml_dcbrt_data_internal_avx512(%rip), %zmm2

	/* exponent%3 (to be used as index) */
	vmovups Three+__svml_dcbrt_data_internal_avx512(%rip), %zmm11

	/* DblRcp ~ 1/Mantissa (14-bit approximation is enough: it is
	   rounded to 3 fractional bits below).  */
	vrcp14pd %zmm14, %zmm13

	/* zmm12 = exponent + BiasL (shifter puts the integer exponent in
	   the low mantissa bits of the 0x433... constant).  */
	vaddpd {rn-sae}, %zmm8, %zmm7, %zmm12

	/* zmm6 = sign bit of x, re-applied to the result at the end
	   (cbrt is odd: cbrt(-x) = -cbrt(x)).  */
	vandpd SZero+__svml_dcbrt_data_internal_avx512(%rip), %zmm0, %zmm6

	/* round DblRcp to 3 fractional bits (RN mode, no Precision exception) */
	vrndscalepd $72, {sae}, %zmm13, %zmm15

	/* zmm10 = exponent/3 (still biased): zmm12*OneThird - Bias3.  */
	vfmsub231pd {rn-sae}, %zmm12, %zmm9, %zmm10

	/* polynomial: load coefficients while FMAs above are in flight.  */
	vmovups poly_coeff10+__svml_dcbrt_data_internal_avx512(%rip), %zmm0
	vmovups poly_coeff8+__svml_dcbrt_data_internal_avx512(%rip), %zmm7
	vmovups poly_coeff7+__svml_dcbrt_data_internal_avx512(%rip), %zmm9

	/* R = DblRcp*Mantissa - 1 (exact by construction of DblRcp).  */
	vfmsub231pd {rn-sae}, %zmm15, %zmm14, %zmm2

	/* zmm5 = floor(exponent/3) (imm $9: round toward -Inf,
	   Precision exception suppressed, scale 0).  */
	vrndscalepd $9, {sae}, %zmm10, %zmm5

	/* Table lookup: Sh = cbrt_tbl_H[top mantissa bits of DblRcp].  */
	vmovups cbrt_tbl_H+__svml_dcbrt_data_internal_avx512(%rip), %zmm10
	vmovups poly_coeff6+__svml_dcbrt_data_internal_avx512(%rip), %zmm8
	vmovups poly_coeff3+__svml_dcbrt_data_internal_avx512(%rip), %zmm13
	vfmadd231pd {rn-sae}, %zmm2, %zmm7, %zmm9	/* c8*R + c7 */
	/* zmm12 = exponent - 3*floor(exponent/3) = exponent%3 in {0,1,2},
	   encoded in the shifter's low bits — used as a permute index.  */
	vfnmadd231pd {rn-sae}, %zmm5, %zmm11, %zmm12
	vmovups poly_coeff5+__svml_dcbrt_data_internal_avx512(%rip), %zmm11
	vmovups poly_coeff1+__svml_dcbrt_data_internal_avx512(%rip), %zmm14

	/* Prepare table index: bits 51:49 of the rounded DblRcp select one
	   of the 16 cbrt_tbl_H entries (via the 2-register vpermt2pd).  */
	vpsrlq $49, %zmm15, %zmm1

	/* Table lookup: 2^(exponent%3 / 3), high (zmm4) and low (zmm3)
	   parts; the index also carries the reciprocal's implicit-one/sign
	   cases via the etbl's negated second half.  */
	vpermpd __svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm4
	vpermpd etbl_L+__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm3
	vpermt2pd cbrt_tbl_H+64+__svml_dcbrt_data_internal_avx512(%rip), %zmm1, %zmm10
	vmovups poly_coeff9+__svml_dcbrt_data_internal_avx512(%rip), %zmm1
	vfmadd231pd {rn-sae}, %zmm2, %zmm8, %zmm11	/* c6*R + c5 */
	vmovups poly_coeff2+__svml_dcbrt_data_internal_avx512(%rip), %zmm12
	/* zmm15 = Th scaled by 2^floor(exponent/3).  */
	vscalefpd {rn-sae}, %zmm5, %zmm10, %zmm15
	vfmadd231pd {rn-sae}, %zmm2, %zmm0, %zmm1	/* c10*R + c9 */
	vmovups poly_coeff4+__svml_dcbrt_data_internal_avx512(%rip), %zmm5
	vfmadd231pd {rn-sae}, %zmm2, %zmm12, %zmm14	/* c2*R + c1 */
	vmulpd {rn-sae}, %zmm2, %zmm2, %zmm0	/* R^2, for Estrin pairing */
	vfmadd231pd {rn-sae}, %zmm2, %zmm5, %zmm13	/* c4*R + c3 */

	/* Sh*R */
	vmulpd {rn-sae}, %zmm2, %zmm4, %zmm2

	/* Collapse the coefficient pairs with R^2 (Horner over pairs):
	   Poly = (((c10*R+c9)*R^2 + (c8*R+c7))*R^2 + ...)*R^2 + (c2*R+c1).  */
	vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1

	/* Sl + (Sh*R)*Poly */
	vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm2

	/*
	 * branch-free
	 * scaled_Th*(Sh+Sl+Sh*R*Poly)
	 */
	vaddpd {rn-sae}, %zmm4, %zmm2, %zmm3
	vmulpd {rn-sae}, %zmm15, %zmm3, %zmm4
	/* Reapply the input sign (odd function) and return in %zmm0.  */
	vorpd %zmm6, %zmm4, %zmm0
	ret

END(_ZGVeN8v_cbrt_skx)
| 136 | |
	.section .rodata, "a"
	.align 64

/* The typedef below is never assembled — it documents the table layout
   for C-side readers; enable with __svml_dcbrt_data_internal_avx512_typedef.  */
#ifdef __svml_dcbrt_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 etbl_H[8][2];
	__declspec(align(64)) VUINT32 etbl_L[8][2];
	__declspec(align(64)) VUINT32 cbrt_tbl_H[16][2];
	__declspec(align(64)) VUINT32 BiasL[8][2];
	__declspec(align(64)) VUINT32 SZero[8][2];
	__declspec(align(64)) VUINT32 OneThird[8][2];
	__declspec(align(64)) VUINT32 Bias3[8][2];
	__declspec(align(64)) VUINT32 Three[8][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 poly_coeff10[8][2];
	__declspec(align(64)) VUINT32 poly_coeff9[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
} __svml_dcbrt_data_internal_avx512;
#endif
__svml_dcbrt_data_internal_avx512:
	/* etbl_H: 2^(j/3), j=0..2, high parts; entries 4-6 are the negated
	   copies (entries 3 and 7 are fillers).  */
	.quad 0x3ff0000000000000	/* 1.0 = 2^(0/3) */
	.quad 0x3ff428a2f98d728b	/* 2^(1/3) */
	.quad 0x3ff965fea53d6e3d	/* 2^(2/3) */
	.quad 0x0000000000000000
	.quad 0xbff0000000000000
	.quad 0xbff428a2f98d728b
	.quad 0xbff965fea53d6e3d
	.quad 0x0000000000000000
	/* etbl_L: low-order bits of the etbl_H entries above.  */
	.align 64
	.quad 0x0000000000000000
	.quad 0xbc7ddc22548ea41e
	.quad 0xbc9f53e999952f09
	.quad 0x0000000000000000
	.quad 0x0000000000000000
	.quad 0x3c7ddc22548ea41e
	.quad 0x3c9f53e999952f09
	.quad 0x0000000000000000
	/* cbrt_tbl_H: cbrt(1.b1..b5 1) high parts, indexed by the top
	   bits of the rounded reciprocal; trailing entries are padding.  */
	.align 64
	.quad 0x3ff428a2f98d728b
	.quad 0x3ff361f35ca116ff
	.quad 0x3ff2b6b5edf6b54a
	.quad 0x3ff220e6dd675180
	.quad 0x3ff19c3b38e975a8
	.quad 0x3ff12589c21fb842
	.quad 0x3ff0ba6ee5f9aad4
	.quad 0x3ff059123d3a9848
	.quad 0x3ff0000000000000
	.quad 0x0000000000000000
	.quad 0x0000000000000000
	.quad 0x0000000000000000
	.quad 0x0000000000000000
	.quad 0x0000000000000000
	.quad 0x0000000000000000
	.quad 0x0000000000000000
	/* BiasL: 1.5*2^52, exponent-extraction shifter.  */
	.align 64
	.quad 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000
	/* SZero: sign-bit mask (-0.0), used to save/restore the sign.  */
	.align 64
	.quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
	/* OneThird: nearest double above 1/3 (rounds the biased exponent
	   division the right way).  */
	.align 64
	.quad 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556
	/* Bias3: compensates BiasL/3 in the exponent/3 fmsub.  */
	.align 64
	.quad 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000
	/* Three: 3.0, recovers exponent%3 = exponent - 3*floor(exponent/3).  */
	.align 64
	.quad 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000
	/* One: 1.0, subtracted to form the reduced argument R.  */
	.align 64
	.quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* poly_coeff10 .. poly_coeff1: minimax coefficients for
	   P(R) ~ cbrt(1+R)/1 - 1, highest degree first.  */
	.align 64
	.quad 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62
	/* poly_coeff9 */
	.align 64
	.quad 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875
	/* poly_coeff8 */
	.align 64
	.quad 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f
	/* poly_coeff7 */
	.align 64
	.quad 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914
	/* poly_coeff6 */
	.align 64
	.quad 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e
	/* poly_coeff5 */
	.align 64
	.quad 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569
	/* poly_coeff4 */
	.align 64
	.quad 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e
	/* poly_coeff3 */
	.align 64
	.quad 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31
	/* poly_coeff2: ~ -2/9 */
	.align 64
	.quad 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741
	/* poly_coeff1: ~ 1/3 */
	.align 64
	.quad 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557
	.align 64
	.type __svml_dcbrt_data_internal_avx512, @object
	.size __svml_dcbrt_data_internal_avx512, .-__svml_dcbrt_data_internal_avx512
| 253 | |