/* Function sin vectorized with AVX-512, KNL and SKX versions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_trig_data.h"
#include "svml_d_wrapper_impl.h"

        .section .text.evex512, "ax", @progbits
ENTRY (_ZGVeN8v_sin_knl)
/*
   ALGORITHM DESCRIPTION:

     ( low accuracy ( < 4ulp ) or enhanced performance
       ( half of correct mantissa ) implementation )

   Argument representation:
   arg = N*Pi + R

   Result calculation:
   sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
   sin(R) is approximated by corresponding polynomial
 */
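
/* For reference, a scalar C sketch of the fast path below.  Illustrative
   only: the constant names mirror the __svml_d_trig_data fields (their
   values are not repeated here), and it uses <math.h> fabs/signbit.

     double sin_sketch (double x)
     {
       double ax = fabs (x);               // X' = |X|
       double y  = ax * InvPI + RShifter;  // Y : right shifter add
       double n  = y - RShifter;           // N = round (X'/Pi)
       double r  = ax - n * PI1;           // R, via 3-part Pi
       r -= n * PI2;
       r -= n * PI3;
       if ((long long) n & 1)              // SignRes = (-1)^N
         r = -r;
       double r2 = r * r;                  // R2 = R*R
       double p  = C7;
       p = p * r2 + C6;  p = p * r2 + C5;  p = p * r2 + C4;
       p = p * r2 + C3;  p = p * r2 + C2;  p = p * r2 + C1;
       double res = p * r2 * r + r;        // Poly = Poly*R + R
       return signbit (x) ? -res : res;    // Res = Poly ^ SignX
     }

   Lanes with |X| above __dRangeVal are instead sent to the scalar
   sin () fallback after the fast path.  */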
        pushq   %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq    %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq    $-64, %rsp
        subq    $1280, %rsp
        movq    __svml_d_trig_data@GOTPCREL(%rip), %rax
        movq    $-1, %rdx
        vmovups __dAbsMask(%rax), %zmm6
        vmovups __dInvPI(%rax), %zmm1

/*
   ARGUMENT RANGE REDUCTION:
   X' = |X|
 */
        vpandq  %zmm6, %zmm0, %zmm12
        vmovups __dPI1_FMA(%rax), %zmm2
        vmovups __dC7_sin(%rax), %zmm7

/* SignX - sign bit of X */
        vpandnq %zmm0, %zmm6, %zmm11

/* R = X' - N*Pi1 */
        vmovaps %zmm12, %zmm3

/* Y = X'*InvPi + RS : right shifter add */
        vfmadd213pd __dRShifter(%rax), %zmm12, %zmm1
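
/* Flag special arguments: predicate 22 (NLE_UQ) sets k1 for lanes with
   |X| > __dRangeVal or NaN; -1 is then broadcast into exactly those
   lanes of zmm13, zeroing the rest.  */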
        vcmppd  $22, __dRangeVal(%rax), %zmm12, %k1
        vpbroadcastq %rdx, %zmm13{%k1}{z}

/* N = Y - RS : right shifter sub */
        vsubpd  __dRShifter(%rax), %zmm1, %zmm4
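
/* The right shifter works by double rounding: RShifter is large enough
   (1.5*2^52 in this style of code) that adding it to X'*InvPi discards
   the fraction, so Y holds round(X'/Pi) in its low mantissa bits and
   Y - RShifter recovers N exactly.  E.g. X'*InvPi = 3.7 gives
   Y = RShifter + 4, hence N = 4.  */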

/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq  $63, %zmm1, %zmm5
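
/* Collapse the special-lane vector into a scalar bitmask: vptestmq sets
   k0 for each nonzero lane; kmovw/movzbl leave an 8-bit mask in ecx.  */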
        vptestmq %zmm13, %zmm13, %k0
        vfnmadd231pd %zmm4, %zmm2, %zmm3
        kmovw   %k0, %ecx
        movzbl  %cl, %ecx

/* R = R - N*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm4, %zmm3

/* R = R - N*Pi3 */
        vfnmadd132pd __dPI3_FMA(%rax), %zmm3, %zmm4

/*
   POLYNOMIAL APPROXIMATION:
   R2 = R*R
 */
        vmulpd  %zmm4, %zmm4, %zmm8

/* R = R^SignRes : update sign of reduced argument */
        vpxorq  %zmm5, %zmm4, %zmm9
        vfmadd213pd __dC6_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC5_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC4_sin(%rax), %zmm8, %zmm7

/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
        vfmadd213pd __dC3_sin(%rax), %zmm8, %zmm7

/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
        vfmadd213pd __dC2_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC1_sin(%rax), %zmm8, %zmm7
        vmulpd  %zmm8, %zmm7, %zmm10

/* Poly = Poly*R + R */
        vfmadd213pd %zmm9, %zmm9, %zmm10
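
/* Altogether: sin(R) ~ R + R*R2*(C1 + R2*(C2 + ... + R2*C7)),
   an odd polynomial in the reduced argument R.  */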

/*
   RECONSTRUCTION:
   Final sign setting: Res = Poly^SignX
 */
        vpxorq  %zmm11, %zmm10, %zmm1
        testl   %ecx, %ecx
        jne     .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovaps %zmm1, %zmm0
        movq    %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq    %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
        vmovups %zmm0, 1152(%rsp)
        vmovups %zmm1, 1216(%rsp)
        je      .LBL_1_2

        xorb    %dl, %dl
        kmovw   %k4, 1048(%rsp)
        xorl    %eax, %eax
        kmovw   %k5, 1040(%rsp)
        kmovw   %k6, 1032(%rsp)
        kmovw   %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq    %rsi, 1064(%rsp)
        movq    %rdi, 1056(%rsp)
        movq    %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb    %dl, %r12b
        movq    %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl    %ecx, %r13d
        movq    %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl    %eax, %r14d
        movq    %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state
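
/* Scalar fallback: r13d holds the special-lane bitmask, r14d the lane
   index (two lanes per iteration), r12b the iteration counter.  Inputs
   were spilled at 1152(%rsp), vector results at 1216(%rsp).  Mask bits
   above the 8 live lanes are zero, so their iterations fall through.  */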
.LBL_1_6:
        btl     %r14d, %r13d
        jc      .LBL_1_12

.LBL_1_7:
        lea     1(%r14), %esi
        btl     %esi, %r13d
        jc      .LBL_1_10

.LBL_1_8:
        addb    $1, %r12b
        addl    $2, %r14d
        cmpb    $16, %r12b
        jb      .LBL_1_6

        kmovw   1048(%rsp), %k4
        movq    1064(%rsp), %rsi
        kmovw   1040(%rsp), %k5
        movq    1056(%rsp), %rdi
        kmovw   1032(%rsp), %k6
        movq    1096(%rsp), %r12
        cfi_restore (%r12)
        movq    1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw   1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        movq    1080(%rsp), %r14
        cfi_restore (%r14)
        movq    1072(%rsp), %r15
        cfi_restore (%r15)
        vmovups 1216(%rsp), %zmm1
        jmp     .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl  %r12b, %r15d
        shlq    $4, %r15
        vmovsd  1160(%rsp,%r15), %xmm0
        call    JUMPTARGET(sin)
        vmovsd  %xmm0, 1224(%rsp,%r15)
        jmp     .LBL_1_8

.LBL_1_12:
        movzbl  %r12b, %r15d
        shlq    $4, %r15
        vmovsd  1152(%rsp,%r15), %xmm0
        call    JUMPTARGET(sin)
        vmovsd  %xmm0, 1216(%rsp,%r15)
        jmp     .LBL_1_7
END (_ZGVeN8v_sin_knl)

ENTRY (_ZGVeN8v_sin_skx)
/*
   ALGORITHM DESCRIPTION:

     ( low accuracy ( < 4ulp ) or enhanced performance
       ( half of correct mantissa ) implementation )

   Argument representation:
   arg = N*Pi + R

   Result calculation:
   sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
   sin(R) is approximated by corresponding polynomial
 */
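
/* Same algorithm as the KNL version above; the SKX variant differs
   mainly in instruction selection (vandpd/vxorpd for the FP logic) and
   in how special lanes are detected: a vpternlogd/vpandnq/vcmppd
   sequence instead of a GPR broadcast, plus vzeroupper before the
   scalar calls.  */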
        pushq   %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq    %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq    $-64, %rsp
        subq    $1280, %rsp
        movq    __svml_d_trig_data@GOTPCREL(%rip), %rax
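
/* zmm14 := all-ones; ternary logic with imm8 0xff sets every bit.  */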
        vpternlogd $0xff, %zmm1, %zmm1, %zmm14
        vmovups __dAbsMask(%rax), %zmm7
        vmovups __dInvPI(%rax), %zmm2
        vmovups __dRShifter(%rax), %zmm1
        vmovups __dPI1_FMA(%rax), %zmm3
        vmovups __dC7_sin(%rax), %zmm8

/*
   ARGUMENT RANGE REDUCTION:
   X' = |X|
 */
        vandpd  %zmm7, %zmm0, %zmm13

/* SignX - sign bit of X */
        vandnpd %zmm0, %zmm7, %zmm12

/* Y = X'*InvPi + RS : right shifter add */
        vfmadd213pd %zmm1, %zmm13, %zmm2
        vcmppd  $18, __dRangeVal(%rax), %zmm13, %k1

/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq  $63, %zmm2, %zmm6

/* N = Y - RS : right shifter sub */
        vsubpd  %zmm1, %zmm2, %zmm5

/* R = X' - N*Pi1 */
        vmovaps %zmm13, %zmm4
        vfnmadd231pd %zmm5, %zmm3, %zmm4

/* R = R - N*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm5, %zmm4

/* R = R - N*Pi3 */
        vfnmadd132pd __dPI3_FMA(%rax), %zmm4, %zmm5

/*
   POLYNOMIAL APPROXIMATION:
   R2 = R*R
 */
        vmulpd  %zmm5, %zmm5, %zmm9

/* R = R^SignRes : update sign of reduced argument */
        vxorpd  %zmm6, %zmm5, %zmm10
        vfmadd213pd __dC6_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC5_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC4_sin(%rax), %zmm9, %zmm8

/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
        vfmadd213pd __dC3_sin(%rax), %zmm9, %zmm8

/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
        vfmadd213pd __dC2_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC1_sin(%rax), %zmm9, %zmm8
        vmulpd  %zmm9, %zmm8, %zmm11

/* Poly = Poly*R + R */
        vfmadd213pd %zmm10, %zmm10, %zmm11

/*
   RECONSTRUCTION:
   Final sign setting: Res = Poly^SignX
 */
        vxorpd  %zmm12, %zmm11, %zmm1
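
/* Build the special-lane mask: zmm14 starts as all-ones; lanes with
   |X| <= __dRangeVal (k1, predicate 18 = LE_OS) are cleared via
   vpandnq (x & ~x == 0), leaving an all-ones (NaN) pattern only in the
   out-of-range lanes; the unordered compare (predicate 3) then sets k0
   exactly for those lanes.  */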
        vpandnq %zmm13, %zmm13, %zmm14{%k1}
        vcmppd  $3, %zmm14, %zmm14, %k0
        kmovw   %k0, %ecx
        testl   %ecx, %ecx
        jne     .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovaps %zmm1, %zmm0
        movq    %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq    %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
        vmovups %zmm0, 1152(%rsp)
        vmovups %zmm1, 1216(%rsp)
        je      .LBL_2_2

        xorb    %dl, %dl
        xorl    %eax, %eax
        kmovw   %k4, 1048(%rsp)
        kmovw   %k5, 1040(%rsp)
        kmovw   %k6, 1032(%rsp)
        kmovw   %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq    %rsi, 1064(%rsp)
        movq    %rdi, 1056(%rsp)
        movq    %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb    %dl, %r12b
        movq    %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl    %ecx, %r13d
        movq    %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl    %eax, %r14d
        movq    %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state
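
/* Scalar fallback loop; register roles as in the KNL version above
   (r13d = lane mask, r14d = lane index, r12b = iteration counter).  */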
.LBL_2_6:
        btl     %r14d, %r13d
        jc      .LBL_2_12

.LBL_2_7:
        lea     1(%r14), %esi
        btl     %esi, %r13d
        jc      .LBL_2_10

.LBL_2_8:
        incb    %r12b
        addl    $2, %r14d
        cmpb    $16, %r12b
        jb      .LBL_2_6

        kmovw   1048(%rsp), %k4
        kmovw   1040(%rsp), %k5
        kmovw   1032(%rsp), %k6
        kmovw   1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        vmovups 1216(%rsp), %zmm1
        movq    1064(%rsp), %rsi
        movq    1056(%rsp), %rdi
        movq    1096(%rsp), %r12
        cfi_restore (%r12)
        movq    1088(%rsp), %r13
        cfi_restore (%r13)
        movq    1080(%rsp), %r14
        cfi_restore (%r14)
        movq    1072(%rsp), %r15
        cfi_restore (%r15)
        jmp     .LBL_2_2

.LBL_2_10:
        cfi_restore_state
        movzbl  %r12b, %r15d
        shlq    $4, %r15
        vzeroupper
        vmovsd  1160(%rsp,%r15), %xmm0

        call    JUMPTARGET(sin)

        vmovsd  %xmm0, 1224(%rsp,%r15)
        jmp     .LBL_2_8

.LBL_2_12:
        movzbl  %r12b, %r15d
        shlq    $4, %r15
        vzeroupper
        vmovsd  1152(%rsp,%r15), %xmm0

        call    JUMPTARGET(sin)

        vmovsd  %xmm0, 1216(%rsp,%r15)
        jmp     .LBL_2_7
END (_ZGVeN8v_sin_skx)