svml_d_hypot4_core_avx2.S source code [glibc/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S]

1	/ Function hypot vectorized with AVX2.*
2	Copyright (C) 2021-2024 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	https://www.gnu.org/licenses/. /*
18
19	/*
20	* ALGORITHM DESCRIPTION:
21	*
22	* HIGH LEVEL OVERVIEW
23	*
24	* Calculate z = (xx+yy)
25	* Calculate reciplicle sqrt (z)
26	* Calculate error = z(rsqrt(z)rsqrt(z)) - 1
27	* Calculate fixing part p with polynom
28	* Fix answer with sqrt(z) = z * rsqrt(z) + error * p * z
29	*
30	* ALGORITHM DETAILS
31	*
32	* Multiprecision branch for _HA_ only
33	* Remove sigm from both arguments
34	* Find maximum (_x) and minimum (_y) (by abs value) between arguments
35	* Split _x int _a and _b for multiprecision
36	* If _x >> _y we will we will not split _y for multiprecision
37	* all _y will be put into lower part (_d) and higher part (_c = 0)
38	* Fixing _hilo_mask for the case _x >> _y
39	* Split _y into _c and _d for multiprecision with fixed mask
40	*
41	* compute Hi and Lo parts of _z = _x_x + _y_y
42	*
43	* _zHi = _a_a + _c_c
44	* _zLo = (_x + _a)_b + _d_y + _d*_c
45	* _z = _zHi + _zLo
46	*
47	* No multiprecision branch for _LA_ and _EP_
48	* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
49	*
50	* Check _z exponent to be within borders [3BC ; 441] else goto Callout
51	*
52	* _s ~ 1.0/sqrt(_z)
53	* _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z = (1.0/_z + O)
54	* _e[rror] = (1.0/_z + O) * _z - 1.0
55	* calculate fixing part _p
56	* _p = (((_POLY_C5_e + _POLY_C4)_e +_POLY_C3)_e +_POLY_C2)_e + _POLY_C1
57	* some parts of polynom are skipped for lower flav
58	*
59	* result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z
60	*
61	*
62	*/
63
64	/ Offsets for data table __svml_dhypot_data_internal*
65	*/
66	#define _dHiLoMask 0
67	#define _dAbsMask 32
68	#define _dOne 64
69	#define _POLY_C5 96
70	#define _POLY_C4 128
71	#define _POLY_C3 160
72	#define _POLY_C2 192
73	#define _POLY_C1 224
74	#define _LowBoundary 256
75	#define _HighBoundary 288
76
77	#include <sysdep.h>
78
79	.section .text.avx2, "ax", @progbits
80	ENTRY(_ZGVdN4vv_hypot_avx2)
81	pushq %rbp
82	cfi_def_cfa_offset(`16`)
83	movq %rsp, %rbp
84	cfi_def_cfa(`6`, `16`)
85	cfi_offset(`6`, -`16`)
86	andq $-`32`, %rsp
87	subq $`128`, %rsp
88	vmovapd %ymm1, %ymm2
89	vmovapd %ymm0, %ymm1
90
91	/*
92	* Defines
93	* Implementation
94	* Multiprecision branch for _HA_ only
95	* _z = _VARG1 * _VARG1 + _VARG2 * _VARG2
96	*/
97	vmulpd %ymm1, %ymm1, %ymm0
98
99	/*
100	* calculate fixing part _p
101	* _p = (((_POLY_C5_e + _POLY_C4)_e +_POLY_C3)_e +_POLY_C2)_e + _POLY_C1
102	* some parts of polynom are skipped for lower flav
103	*/
104	vmovupd _POLY_C4+__svml_dhypot_data_internal(%rip), %ymm15
105	vmovups _LowBoundary+__svml_dhypot_data_internal(%rip), %xmm4
106	vfmadd231pd %ymm2, %ymm2, %ymm0
107
108	/*
109	* _s ~ 1.0/sqrt(_z)
110	* _s2 ~ 1.0/(sqrt(_z)*sqrt(_z)) ~ 1.0/_z
111	*/
112	vcvtpd2ps %ymm0, %xmm12
113
114	/ Check _z exponent to be within borders [3BC ; 441] else goto Callout /
115	vextractf128 $`1`, %ymm0, %xmm3
116	vrsqrtps %xmm12, %xmm13
117	vshufps $`221`, %xmm3, %xmm0, %xmm5
118	vcvtps2pd %xmm13, %ymm3
119	vpcmpgtd %xmm5, %xmm4, %xmm6
120	vpcmpgtd _HighBoundary+__svml_dhypot_data_internal(%rip), %xmm5, %xmm7
121	vpor %xmm7, %xmm6, %xmm9
122	vpshufd $`80`, %xmm9, %xmm8
123	vmulpd %ymm3, %ymm3, %ymm14
124	vpshufd $`250`, %xmm9, %xmm10
125
126	/ _e[rror] ~ (1.0/_z + O) * _z - 1.0 /
127	vfmsub213pd _dOne+__svml_dhypot_data_internal(%rip), %ymm0, %ymm14
128	vfmadd213pd _POLY_C3+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
129	vfmadd213pd _POLY_C2+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
130	vfmadd213pd _POLY_C1+__svml_dhypot_data_internal(%rip), %ymm14, %ymm15
131
132	/ result = _z * (1.0/sqrt(_z) + O) + _p * _e[rror] * _z /
133	vmulpd %ymm15, %ymm14, %ymm14
134	vmulpd %ymm14, %ymm3, %ymm15
135	vmulpd %ymm15, %ymm0, %ymm4
136	vfmadd213pd %ymm4, %ymm3, %ymm0
137	vinsertf128 $`1`, %xmm10, %ymm8, %ymm11
138	vmovmskpd %ymm11, %edx
139
140	/ The end of implementation /
141	testl %edx, %edx
142
143	/ Go to special inputs processing branch /
144	jne L(SPECIAL_VALUES_BRANCH)
145	# LOE rbx r12 r13 r14 r15 edx ymm0 ymm1 ymm2
146
147	/ Restore registers*
148	* and exit the function
149	*/
150
151	L(EXIT):
152	movq %rbp, %rsp
153	popq %rbp
154	cfi_def_cfa(`7`, `8`)
155	cfi_restore(`6`)
156	ret
157	cfi_def_cfa(`6`, `16`)
158	cfi_offset(`6`, -`16`)
159
160	/ Branch to process*
161	* special inputs
162	*/
163
164	L(SPECIAL_VALUES_BRANCH):
165	vmovupd %ymm1, `32`(%rsp)
166	vmovupd %ymm2, `64`(%rsp)
167	vmovupd %ymm0, `96`(%rsp)
168	# LOE rbx r12 r13 r14 r15 edx ymm0
169
170	xorl %eax, %eax
171	# LOE rbx r12 r13 r14 r15 eax edx
172
173	vzeroupper
174	movq %r12, `16`(%rsp)
175	/ DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus) /
176	.cfi_escape `0x10`, `0x0c`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x90`, `0xff`, `0xff`, `0xff`, `0x22`
177	movl %eax, %r12d
178	movq %r13, `8`(%rsp)
179	/ DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) /
180	.cfi_escape `0x10`, `0x0d`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x88`, `0xff`, `0xff`, `0xff`, `0x22`
181	movl %edx, %r13d
182	movq %r14, (%rsp)
183	/ DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) /
184	.cfi_escape `0x10`, `0x0e`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x80`, `0xff`, `0xff`, `0xff`, `0x22`
185	# LOE rbx r15 r12d r13d
186
187	/ Range mask*
188	* bits check
189	*/
190
191	L(RANGEMASK_CHECK):
192	btl %r12d, %r13d
193
194	/ Call scalar math function /
195	jc L(SCALAR_MATH_CALL)
196	# LOE rbx r15 r12d r13d
197
198	/ Special inputs*
199	* processing loop
200	*/
201
202	L(SPECIAL_VALUES_LOOP):
203	incl %r12d
204	cmpl $`4`, %r12d
205
206	/ Check bits in range mask /
207	jl L(RANGEMASK_CHECK)
208	# LOE rbx r15 r12d r13d
209
210	movq `16`(%rsp), %r12
211	cfi_restore(`12`)
212	movq `8`(%rsp), %r13
213	cfi_restore(`13`)
214	movq (%rsp), %r14
215	cfi_restore(`14`)
216	vmovupd `96`(%rsp), %ymm0
217
218	/ Go to exit /
219	jmp L(EXIT)
220	/ DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -112; DW_OP_plus) /
221	.cfi_escape `0x10`, `0x0c`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x90`, `0xff`, `0xff`, `0xff`, `0x22`
222	/ DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -120; DW_OP_plus) /
223	.cfi_escape `0x10`, `0x0d`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x88`, `0xff`, `0xff`, `0xff`, `0x22`
224	/ DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -128; DW_OP_plus) /
225	.cfi_escape `0x10`, `0x0e`, `0x0e`, `0x38`, `0x1c`, `0x0d`, `0xe0`, `0xff`, `0xff`, `0xff`, `0x1a`, `0x0d`, `0x80`, `0xff`, `0xff`, `0xff`, `0x22`
226	# LOE rbx r12 r13 r14 r15 ymm0
227
228	/ Scalar math function call*
229	* to process special input
230	*/
231
232	L(SCALAR_MATH_CALL):
233	movl %r12d, %r14d
234	vmovsd `32`(%rsp, %r14, `8`), %xmm0
235	vmovsd `64`(%rsp, %r14, `8`), %xmm1
236	call hypot@PLT
237	# LOE rbx r14 r15 r12d r13d xmm0
238
239	vmovsd %xmm0, `96`(%rsp, %r14, `8`)
240
241	/ Process special inputs in loop /
242	jmp L(SPECIAL_VALUES_LOOP)
243	# LOE rbx r15 r12d r13d
244	END(_ZGVdN4vv_hypot_avx2)
245
246	.section .rodata, "a"
247	.align `32`
248
249	#ifdef __svml_dhypot_data_internal_typedef
250	typedef unsigned int VUINT32;
251	typedef struct {
252	__declspec(align(`32`)) VUINT32 _dHiLoMask[`4`][`2`];
253	__declspec(align(`32`)) VUINT32 _dAbsMask[`4`][`2`];
254	__declspec(align(`32`)) VUINT32 _dOne[`4`][`2`];
255	__declspec(align(`32`)) VUINT32 _POLY_C5[`4`][`2`];
256	__declspec(align(`32`)) VUINT32 _POLY_C4[`4`][`2`];
257	__declspec(align(`32`)) VUINT32 _POLY_C3[`4`][`2`];
258	__declspec(align(`32`)) VUINT32 _POLY_C2[`4`][`2`];
259	__declspec(align(`32`)) VUINT32 _POLY_C1[`4`][`2`];
260	__declspec(align(`32`)) VUINT32 _LowBoundary[`8`][`1`];
261	__declspec(align(`32`)) VUINT32 _HighBoundary[`8`][`1`];
262	} __svml_dhypot_data_internal;
263	#endif
264	__svml_dhypot_data_internal:
265	/ legacy algorithm /
266	.quad `0xffffc00000000000`, `0xffffc00000000000`, `0xffffc00000000000`, `0xffffc00000000000` / _dHiLoMask /
267	.align `32`
268	.quad `0x7fffffffffffffff`, `0x7fffffffffffffff`, `0x7fffffffffffffff`, `0x7fffffffffffffff` / _dAbsMask /
269	.align `32`
270	.quad `0x3FF0000000000000`, `0x3FF0000000000000`, `0x3FF0000000000000`, `0x3FF0000000000000` / _dOne /
271	.align `32`
272	.quad `0xBFCF800000000000`, `0xBFCF800000000000`, `0xBFCF800000000000`, `0xBFCF800000000000` / _POLY_C5 /
273	.align `32`
274	.quad `0x3FD1800000000000`, `0x3FD1800000000000`, `0x3FD1800000000000`, `0x3FD1800000000000` / _POLY_C4 /
275	.align `32`
276	.quad `0xBFD4000000000000`, `0xBFD4000000000000`, `0xBFD4000000000000`, `0xBFD4000000000000` / _POLY_C3 /
277	.align `32`
278	.quad `0x3FD8000000000000`, `0x3FD8000000000000`, `0x3FD8000000000000`, `0x3FD8000000000000` / _POLY_C2 /
279	.align `32`
280	.quad `0xBFE0000000000000`, `0xBFE0000000000000`, `0xBFE0000000000000`, `0xBFE0000000000000` / _POLY_C1 /
281	.align `32`
282	.long `0x3BC00000`, `0x3BC00000`, `0x3BC00000`, `0x3BC00000`, `0x3BC00000`, `0x3BC00000`, `0x3BC00000`, `0x3BC00000` / _LowBoundary /
283	.align `32`
284	.long `0x44100000`, `0x44100000`, `0x44100000`, `0x44100000`, `0x44100000`, `0x44100000`, `0x44100000`, `0x44100000` / _HighBoundary /
285	.align `32`
286	.type __svml_dhypot_data_internal, @object
287	.size __svml_dhypot_data_internal, .-__svml_dhypot_data_internal
288

source code of glibc/sysdeps/x86_64/fpu/multiarch/svml_d_hypot4_core_avx2.S