/* Function sincos vectorized with AVX-512.  KNL and SKX versions.
   Copyright (C) 2014-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_trig_data.h"
#include "svml_d_wrapper_impl.h"
/* ALGORITHM DESCRIPTION:

   (low accuracy (< 4 ulp) or enhanced-performance
   (half of correct mantissa) implementation)

   Argument representation:
   arg = N*Pi + R

   Result calculation:
   sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
   arg + Pi/2 = N'*Pi + R'
   cos(arg) = sin(arg + Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
   sin(R) and sin(R') are approximated by the corresponding polynomial.  */
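
/* For intuition, a scalar C sketch of the same scheme.  This is an
   illustrative model only (hypothetical function names, a toy Taylor
   polynomial instead of the tuned __dC1..__dC7 coefficients, a single
   M_PI instead of the Pi1/Pi2/Pi3 split, and no special-case handling);
   it assumes the default round-to-nearest mode:

       #include <math.h>

       static double
       poly_sin (double r)
       {
         double r2 = r * r;
         // r - r^3/6 + r^5/120 - r^7/5040
         return r + r * r2 * (-1.0 / 6.0
                              + r2 * (1.0 / 120.0 + r2 * (-1.0 / 5040.0)));
       }

       static void
       sincos_sketch (double x, double *s, double *c)
       {
         const double rs = 0x1.8p52;      // right shifter: ulp == 1
         double n = (x / M_PI + rs) - rs; // N = round (x/Pi)
         double r = x - n * M_PI;         // |R| <= Pi/2
         *s = (fmod (n, 2.0) ? -1.0 : 1.0) * poly_sin (r);
         // cos (x) = sin (x + Pi/2): same reduction with N' and R'.
         double nc = (x / M_PI + 0.5 + rs) - rs;
         double rc = (x + M_PI / 2) - nc * M_PI;
         *c = (fmod (nc, 2.0) ? -1.0 : 1.0) * poly_sin (rc);
       }
*/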

.section .text.evex512, "ax", @progbits
ENTRY (_ZGVeN8vl8l8_sincos_knl)
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $1344, %rsp
        movq __svml_d_trig_data@GOTPCREL(%rip), %rax
        vmovaps %zmm0, %zmm4
        movq $-1, %rdx
        vmovups __dSignMask(%rax), %zmm12
        vmovups __dInvPI(%rax), %zmm5

/* ARGUMENT RANGE REDUCTION:
   Absolute argument: X' = |X| */
        vpandnq %zmm4, %zmm12, %zmm3
        vmovups __dPI1_FMA(%rax), %zmm7
        vmovups __dPI3_FMA(%rax), %zmm9

/* SinR = X' - SinN*Pi1 */
        vmovaps %zmm3, %zmm8

/* CosR = X' - CosN*Pi1 */
        vmovaps %zmm3, %zmm10

/* SinY = X'*InvPi + RS : right shifter add */
        vfmadd213pd __dRShifter(%rax), %zmm3, %zmm5
        vmovups __dC6(%rax), %zmm13

/* SinN = Y - RS : right shifter sub */
        vsubpd __dRShifter(%rax), %zmm5, %zmm1
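
/* The right shifter works because __dRShifter is an integer-valued
   double whose ulp is 1 (a 1.5*2^52-style constant): the FMA above
   rounds X'*InvPi into the low mantissa bits of Y, so Y's LSB is the
   parity of N (used for the result sign below) and Y - RS recovers
   SinN = round (X'/Pi) exactly.  Assumes round-to-nearest mode.  */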
        vmovaps %zmm13, %zmm14

/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq $63, %zmm5, %zmm2
        vcmppd $22, __dRangeVal(%rax), %zmm3, %k1

/* Update CosRSign and CosSignRes signs */
        vmovaps %zmm12, %zmm5
        vfnmadd231pd %zmm1, %zmm7, %zmm8

/* SinR = SinR - SinN*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm1, %zmm8

/* Sine result sign: SinRSign = SignMask & SinR */
        vpandq %zmm8, %zmm12, %zmm11

/* OR SinRSign into 0.5, giving +-0.5 for the CosN adjustment */
        vporq __dOneHalf(%rax), %zmm11, %zmm6
        vpternlogq $150, %zmm2, %zmm11, %zmm5
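/* (Imm 150 = 0x96 in the VPTERNLOGQ above is a three-way XOR:
   CosSignRes = SignMask ^ SinRSign ^ SinSignRes.)  */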

/* Update sign SinSignRes */
        vpternlogq $120, %zmm4, %zmm12, %zmm2
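/* (Imm 120 = 0x78 above computes dst ^ (src1 & src2):
   SinSignRes ^= SignMask & X, folding in the sign of the input.)  */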

/* Polynomial approximation */
        vmovups __dC7(%rax), %zmm11

/* CosN = SinN +(-)0.5 */
        vaddpd %zmm6, %zmm1, %zmm0

/* SinR = SinR - SinN*Pi3 */
        vfnmadd213pd %zmm8, %zmm9, %zmm1
        vfnmadd231pd %zmm0, %zmm7, %zmm10

/* SinR2 = SinR^2 */
        vmulpd %zmm1, %zmm1, %zmm15

/* CosR = CosR - CosN*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm0, %zmm10
        vfmadd231pd __dC7(%rax), %zmm15, %zmm14

/* CosR = CosR - CosN*Pi3 */
        vfnmadd213pd %zmm10, %zmm9, %zmm0
        vfmadd213pd __dC5(%rax), %zmm15, %zmm14

/* CosR2 = CosR^2 */
        vmulpd %zmm0, %zmm0, %zmm12
        vfmadd213pd __dC4(%rax), %zmm15, %zmm14
        vfmadd213pd %zmm13, %zmm12, %zmm11

/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
        vfmadd213pd __dC3(%rax), %zmm15, %zmm14
        vfmadd213pd __dC5(%rax), %zmm12, %zmm11

/* SinPoly = C2 + SinR2*SinPoly */
        vfmadd213pd __dC2(%rax), %zmm15, %zmm14
        vfmadd213pd __dC4(%rax), %zmm12, %zmm11

/* SinPoly = C1 + SinR2*SinPoly */
        vfmadd213pd __dC1(%rax), %zmm15, %zmm14

/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
        vfmadd213pd __dC3(%rax), %zmm12, %zmm11

/* SinPoly = SinR2*SinPoly */
        vmulpd %zmm15, %zmm14, %zmm13

/* CosPoly = C2 + CosR2*CosPoly */
        vfmadd213pd __dC2(%rax), %zmm12, %zmm11

/* SinPoly = SinR + SinR*SinPoly */
        vfmadd213pd %zmm1, %zmm1, %zmm13
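
/* Special-argument detection: k1 was set by the NLE_UQ ($22) range
   compare above, marking lanes with |X| > RangeVal or X NaN; write -1
   into exactly those lanes and extract the lane mask to %ecx for the
   scalar fallback loop.  */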
        vpbroadcastq %rdx, %zmm1{%k1}{z}

/* CosPoly = C1 + CosR2*CosPoly */
        vfmadd213pd __dC1(%rax), %zmm12, %zmm11
        vptestmq %zmm1, %zmm1, %k0
        kmovw %k0, %ecx

/* CosPoly = CosR2*CosPoly */
        vmulpd %zmm12, %zmm11, %zmm14
        movzbl %cl, %ecx

/* CosPoly = CosR + CosR*CosPoly */
        vfmadd213pd %zmm0, %zmm0, %zmm14

/* Final reconstruction.
   Update Sin result's sign */
        vpxorq %zmm2, %zmm13, %zmm0

/* Update Cos result's sign */
        vpxorq %zmm5, %zmm14, %zmm2
        testl %ecx, %ecx
        jne .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovups %zmm0, (%rdi)
        vmovups %zmm2, (%rsi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
        vmovups %zmm4, 1152(%rsp)
        vmovups %zmm0, 1216(%rsp)
        vmovups %zmm2, 1280(%rsp)
        je .LBL_1_2

        xorb %dl, %dl
        kmovw %k4, 1048(%rsp)
        xorl %eax, %eax
        kmovw %k5, 1040(%rsp)
        kmovw %k6, 1032(%rsp)
        kmovw %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq %rsi, 1056(%rsp)
        movq %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb %dl, %r12b
        movq %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl %eax, %r13d
        movq %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl %ecx, %r14d
        movq %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq %rbx, 1064(%rsp)
        movq %rdi, %rbx
        cfi_remember_state

.LBL_1_6:
        btl %r13d, %r14d
        jc .LBL_1_13

.LBL_1_7:
        lea 1(%r13), %esi
        btl %esi, %r14d
        jc .LBL_1_10

.LBL_1_8:
        addb $1, %r12b
        addl $2, %r13d
        cmpb $16, %r12b
        jb .LBL_1_6

        movq %rbx, %rdi
        kmovw 1048(%rsp), %k4
        movq 1056(%rsp), %rsi
        kmovw 1040(%rsp), %k5
        movq 1096(%rsp), %r12
        cfi_restore (%r12)
        kmovw 1032(%rsp), %k6
        movq 1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw 1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        movq 1080(%rsp), %r14
        cfi_restore (%r14)
        movq 1072(%rsp), %r15
        cfi_restore (%r15)
        movq 1064(%rsp), %rbx
        vmovups 1216(%rsp), %zmm0
        vmovups 1280(%rsp), %zmm2
        jmp .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1224(%rsp,%r15)
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1288(%rsp,%r15)
        jmp .LBL_1_8

.LBL_1_13:
        movzbl %r12b, %r15d
        shlq $4, %r15
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1216(%rsp,%r15)
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1280(%rsp,%r15)
        jmp .LBL_1_7

END (_ZGVeN8vl8l8_sincos_knl)
libmvec_hidden_def(_ZGVeN8vl8l8_sincos_knl)

ENTRY (_ZGVeN8vl8l8_sincos_skx)
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $1344, %rsp
        movq __svml_d_trig_data@GOTPCREL(%rip), %rax
        vmovaps %zmm0, %zmm8
        vmovups __dSignMask(%rax), %zmm4
        vmovups __dInvPI(%rax), %zmm9
        vmovups __dRShifter(%rax), %zmm10
        vmovups __dPI1_FMA(%rax), %zmm13
        vmovups __dPI2_FMA(%rax), %zmm14
        vmovups __dOneHalf(%rax), %zmm11
        vmovups __dPI3_FMA(%rax), %zmm2

/* ARGUMENT RANGE REDUCTION:
   Absolute argument: X' = |X| */
        vandnpd %zmm8, %zmm4, %zmm7

/* SinY = X'*InvPi + RS : right shifter add */
        vfmadd213pd %zmm10, %zmm7, %zmm9
        vcmppd $18, __dRangeVal(%rax), %zmm7, %k1

/* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq $63, %zmm9, %zmm6

/* SinN = Y - RS : right shifter sub */
        vsubpd %zmm10, %zmm9, %zmm5
        vmovups __dC5(%rax), %zmm9
        vmovups __dC4(%rax), %zmm10

/* SinR = X' - SinN*Pi1 */
        vmovaps %zmm7, %zmm15
        vfnmadd231pd %zmm5, %zmm13, %zmm15

/* SinR = SinR - SinN*Pi2 */
        vfnmadd231pd %zmm5, %zmm14, %zmm15

/* Sine result sign: SinRSign = SignMask & SinR */
        vandpd %zmm15, %zmm4, %zmm1

/* OR SinRSign into 0.5, giving +-0.5 for the CosN adjustment */
        vorpd %zmm1, %zmm11, %zmm12
        vmovups __dC3(%rax), %zmm11

/* CosN = SinN +(-)0.5 */
        vaddpd %zmm12, %zmm5, %zmm3

/* SinR = SinR - SinN*Pi3 */
        vfnmadd213pd %zmm15, %zmm2, %zmm5
        vmovups __dC2(%rax), %zmm12

/* SinR2 = SinR^2 */
        vmulpd %zmm5, %zmm5, %zmm15

/* CosR = X' - CosN*Pi1 */
        vmovaps %zmm7, %zmm0
        vfnmadd231pd %zmm3, %zmm13, %zmm0
        vmovups __dC1(%rax), %zmm13

/* CosR = CosR - CosN*Pi2 */
        vfnmadd231pd %zmm3, %zmm14, %zmm0

/* CosR = CosR - CosN*Pi3 */
        vfnmadd213pd %zmm0, %zmm2, %zmm3

/* Polynomial approximation */
        vmovups __dC7(%rax), %zmm0

/* Update CosRSign and CosSignRes signs */
        vmovaps %zmm4, %zmm2
        vpternlogq $150, %zmm6, %zmm1, %zmm2
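/* (Imm 150 = 0x96 in the VPTERNLOGQ above is a three-way XOR:
   CosSignRes = SignMask ^ SinRSign ^ SinSignRes.)  */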

/* Update sign SinSignRes */
        vpternlogq $120, %zmm8, %zmm4, %zmm6
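/* (Imm 120 = 0x78 above computes dst ^ (src1 & src2):
   SinSignRes ^= SignMask & X, folding in the sign of the input.)  */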

/* CosR2 = CosR^2 */
        vmulpd %zmm3, %zmm3, %zmm1
        vmovups __dC6(%rax), %zmm4
        vmovaps %zmm0, %zmm14
        vfmadd213pd %zmm4, %zmm1, %zmm0
        vfmadd213pd %zmm4, %zmm15, %zmm14
        vfmadd213pd %zmm9, %zmm1, %zmm0
        vfmadd213pd %zmm9, %zmm15, %zmm14
        vfmadd213pd %zmm10, %zmm1, %zmm0
        vfmadd213pd %zmm10, %zmm15, %zmm14

/* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
        vfmadd213pd %zmm11, %zmm1, %zmm0

/* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
        vfmadd213pd %zmm11, %zmm15, %zmm14

/* CosPoly = C2 + CosR2*CosPoly */
        vfmadd213pd %zmm12, %zmm1, %zmm0

/* SinPoly = C2 + SinR2*SinPoly */
        vfmadd213pd %zmm12, %zmm15, %zmm14

/* CosPoly = C1 + CosR2*CosPoly */
        vfmadd213pd %zmm13, %zmm1, %zmm0

/* SinPoly = C1 + SinR2*SinPoly */
        vfmadd213pd %zmm13, %zmm15, %zmm14

/* CosPoly = CosR2*CosPoly */
        vmulpd %zmm1, %zmm0, %zmm1

/* SinPoly = SinR2*SinPoly */
        vmulpd %zmm15, %zmm14, %zmm4

/* CosPoly = CosR + CosR*CosPoly */
        vfmadd213pd %zmm3, %zmm3, %zmm1

/* SinPoly = SinR + SinR*SinPoly */
        vfmadd213pd %zmm5, %zmm5, %zmm4
        vpternlogd $0xff, %zmm3, %zmm3, %zmm3
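/* (Imm 0xff above sets %zmm3 to all-ones, an integer pattern that
   compares as a NaN below.)  */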

/* Update Cos result's sign */
        vxorpd %zmm2, %zmm1, %zmm1

/* Final reconstruction.
   Update Sin result's sign */
        vxorpd %zmm6, %zmm4, %zmm0
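
/* Special-argument detection: k1 (LE_OQ ($18) range compare above)
   marks the in-range lanes; the masked VPANDNQ clears those lanes of
   the all-ones pattern, and the UNORD compare then flags exactly the
   remaining lanes (|X| > RangeVal or X NaN) for the scalar fallback.  */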
        vpandnq %zmm7, %zmm7, %zmm3{%k1}
        vcmppd $3, %zmm3, %zmm3, %k0
        kmovw %k0, %ecx
        testl %ecx, %ecx
        jne .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovups %zmm0, (%rdi)
        vmovups %zmm1, (%rsi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
        vmovups %zmm8, 1152(%rsp)
        vmovups %zmm0, 1216(%rsp)
        vmovups %zmm1, 1280(%rsp)
        je .LBL_2_2

        xorb %dl, %dl
        xorl %eax, %eax
        kmovw %k4, 1048(%rsp)
        kmovw %k5, 1040(%rsp)
        kmovw %k6, 1032(%rsp)
        kmovw %k7, 1024(%rsp)
        vmovups %zmm16, 960(%rsp)
        vmovups %zmm17, 896(%rsp)
        vmovups %zmm18, 832(%rsp)
        vmovups %zmm19, 768(%rsp)
        vmovups %zmm20, 704(%rsp)
        vmovups %zmm21, 640(%rsp)
        vmovups %zmm22, 576(%rsp)
        vmovups %zmm23, 512(%rsp)
        vmovups %zmm24, 448(%rsp)
        vmovups %zmm25, 384(%rsp)
        vmovups %zmm26, 320(%rsp)
        vmovups %zmm27, 256(%rsp)
        vmovups %zmm28, 192(%rsp)
        vmovups %zmm29, 128(%rsp)
        vmovups %zmm30, 64(%rsp)
        vmovups %zmm31, (%rsp)
        movq %rsi, 1056(%rsp)
        movq %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb %dl, %r12b
        movq %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl %eax, %r13d
        movq %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl %ecx, %r14d
        movq %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq %rbx, 1064(%rsp)
        movq %rdi, %rbx
        cfi_remember_state

.LBL_2_6:
        btl %r13d, %r14d
        jc .LBL_2_13

.LBL_2_7:
        lea 1(%r13), %esi
        btl %esi, %r14d
        jc .LBL_2_10

.LBL_2_8:
        incb %r12b
        addl $2, %r13d
        cmpb $16, %r12b
        jb .LBL_2_6

        kmovw 1048(%rsp), %k4
        movq %rbx, %rdi
        kmovw 1040(%rsp), %k5
        kmovw 1032(%rsp), %k6
        kmovw 1024(%rsp), %k7
        vmovups 960(%rsp), %zmm16
        vmovups 896(%rsp), %zmm17
        vmovups 832(%rsp), %zmm18
        vmovups 768(%rsp), %zmm19
        vmovups 704(%rsp), %zmm20
        vmovups 640(%rsp), %zmm21
        vmovups 576(%rsp), %zmm22
        vmovups 512(%rsp), %zmm23
        vmovups 448(%rsp), %zmm24
        vmovups 384(%rsp), %zmm25
        vmovups 320(%rsp), %zmm26
        vmovups 256(%rsp), %zmm27
        vmovups 192(%rsp), %zmm28
        vmovups 128(%rsp), %zmm29
        vmovups 64(%rsp), %zmm30
        vmovups (%rsp), %zmm31
        vmovups 1216(%rsp), %zmm0
        vmovups 1280(%rsp), %zmm1
        movq 1056(%rsp), %rsi
        movq 1096(%rsp), %r12
        cfi_restore (%r12)
        movq 1088(%rsp), %r13
        cfi_restore (%r13)
        movq 1080(%rsp), %r14
        cfi_restore (%r14)
        movq 1072(%rsp), %r15
        cfi_restore (%r15)
        movq 1064(%rsp), %rbx
        jmp .LBL_2_2

.LBL_2_10:
        cfi_restore_state
        movzbl %r12b, %r15d
        shlq $4, %r15
        vzeroupper
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1224(%rsp,%r15)
        vmovsd 1160(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1288(%rsp,%r15)
        jmp .LBL_2_8

.LBL_2_13:
        movzbl %r12b, %r15d
        shlq $4, %r15
        vzeroupper
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(sin)

        vmovsd %xmm0, 1216(%rsp,%r15)
        vmovsd 1152(%rsp,%r15), %xmm0

        call JUMPTARGET(cos)

        vmovsd %xmm0, 1280(%rsp,%r15)
        jmp .LBL_2_7

END (_ZGVeN8vl8l8_sincos_skx)
libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)

/* Wrapper between the vvv variant (the eight sine and eight cosine
   result pointers arrive as vectors in %zmm1 and %zmm2) and the vl8l8
   variant (two pointers in %rdi/%rsi to arrays of 8 doubles).  */
.macro WRAPPER_AVX512_vvv_vl8l8 callee
#ifndef __ILP32__
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $256, %rsp
        vmovups %zmm1, 128(%rsp)
        lea (%rsp), %rdi
        vmovups %zmm2, 192(%rdi)
        lea 64(%rsp), %rsi
        call HIDDEN_JUMPTARGET(\callee)
        movq 128(%rsp), %rdx
        movq 136(%rsp), %rsi
        movq 144(%rsp), %r8
        movq 152(%rsp), %r10
        movq (%rsp), %rax
        movq 8(%rsp), %rcx
        movq 16(%rsp), %rdi
        movq 24(%rsp), %r9
        movq %rax, (%rdx)
        movq %rcx, (%rsi)
        movq 160(%rsp), %rax
        movq 168(%rsp), %rcx
        movq %rdi, (%r8)
        movq %r9, (%r10)
        movq 176(%rsp), %rdi
        movq 184(%rsp), %r9
        movq 32(%rsp), %r11
        movq 40(%rsp), %rdx
        movq 48(%rsp), %rsi
        movq 56(%rsp), %r8
        movq %r11, (%rax)
        movq %rdx, (%rcx)
        movq 192(%rsp), %r11
        movq 200(%rsp), %rdx
        movq %rsi, (%rdi)
        movq %r8, (%r9)
        movq 208(%rsp), %rsi
        movq 216(%rsp), %r8
        movq 64(%rsp), %r10
        movq 72(%rsp), %rax
        movq 80(%rsp), %rcx
        movq 88(%rsp), %rdi
        movq %r10, (%r11)
        movq %rax, (%rdx)
        movq 224(%rsp), %r10
        movq 232(%rsp), %rax
        movq %rcx, (%rsi)
        movq %rdi, (%r8)
        movq 240(%rsp), %rcx
        movq 248(%rsp), %rdi
        movq 96(%rsp), %r9
        movq 104(%rsp), %r11
        movq 112(%rsp), %rdx
        movq 120(%rsp), %rsi
        movq %r9, (%r10)
        movq %r11, (%rax)
        movq %rdx, (%rcx)
        movq %rsi, (%rdi)
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
#else
        leal 8(%rsp), %r10d
        .cfi_def_cfa 10, 0
        andl $-64, %esp
        pushq -8(%r10d)
        pushq %rbp
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movl %esp, %ebp
        pushq %r10
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        leal -112(%rbp), %esi
        leal -176(%rbp), %edi
        subl $232, %esp
        vmovdqa %ymm1, -208(%ebp)
        vmovdqa %ymm2, -240(%ebp)
        call HIDDEN_JUMPTARGET(\callee)
        vmovdqa -208(%ebp), %xmm0
        vmovq %xmm0, %rax
        vmovsd -176(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -168(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -200(%ebp), %rax
        vmovsd -160(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -152(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -192(%ebp), %rax
        vmovsd -144(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -136(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -184(%ebp), %rax
        vmovsd -128(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -120(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        vmovdqa -240(%ebp), %xmm0
        vmovq %xmm0, %rax
        vmovsd -112(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -104(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -232(%ebp), %rax
        vmovsd -96(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -88(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -224(%ebp), %rax
        vmovsd -80(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -72(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        movq -216(%ebp), %rax
        vmovsd -64(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        shrq $32, %rax
        vmovsd -56(%ebp), %xmm0
        vmovsd %xmm0, (%eax)
        addl $232, %esp
        popq %r10
        .cfi_def_cfa 10, 0
        popq %rbp
        leal -8(%r10), %esp
        .cfi_def_cfa 7, 8
        ret
#endif
.endm
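
/* In C terms, the non-ILP32 path above behaves roughly like this
   (a hedged sketch: vl8l8_sincos and vvv_wrapper are hypothetical
   names standing for \callee and the wrapper, the input vector stays
   in %zmm0 and so does not appear in the signatures, and struct ptrs
   is illustrative):

       extern void vl8l8_sincos (double *sinp, double *cosp);

       struct ptrs { double *p[8]; };   // contents of %zmm1 / %zmm2

       static void
       vvv_wrapper (struct ptrs sin_ptrs, struct ptrs cos_ptrs)
       {
         double sbuf[8] __attribute__ ((aligned (64)));
         double cbuf[8] __attribute__ ((aligned (64)));
         vl8l8_sincos (sbuf, cbuf);     // results land in the buffers
         for (int i = 0; i < 8; i++)
           {
             *sin_ptrs.p[i] = sbuf[i];  // scatter sines
             *cos_ptrs.p[i] = cbuf[i];  // scatter cosines
           }
       }
*/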

ENTRY (_ZGVeN8vvv_sincos_knl)
        WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_knl
END (_ZGVeN8vvv_sincos_knl)

ENTRY (_ZGVeN8vvv_sincos_skx)
        WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
END (_ZGVeN8vvv_sincos_skx)