/* memset optimized with AVX512 for KNL hardware.
   Copyright (C) 2015-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include <isa-level.h>

#if ISA_SHOULD_BUILD (4)


#include "asm-syntax.h"
#ifndef MEMSET
# define MEMSET __memset_avx512_no_vzeroupper
# define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
#endif

        .section .text.avx512,"ax",@progbits
#if defined PIC
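/* The __memset_chk entry: RCX holds the size of the destination
   object; branch to __chk_fail if it is smaller than the length in
   RDX.  */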
ENTRY (MEMSET_CHK)
        cmp     %RDX_LP, %RCX_LP
        jb      HIDDEN_JUMPTARGET (__chk_fail)
END (MEMSET_CHK)
#endif

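/* memset (dst = RDI, byte = ESI, len = RDX).  The fill byte is
   splatted across XMM0 (and, for sizes of 16 bytes and up, across
   ZMM2); RSI is pointed at the end of the buffer so tails can be
   stored backwards from it, and RAX keeps the original destination
   as the return value.  */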
ENTRY (MEMSET)
# ifdef __ILP32__
        /* Clear the upper 32 bits.  */
        mov     %edx, %edx
# endif
        vpxor   %xmm0, %xmm0, %xmm0
        vmovd   %esi, %xmm1
        lea     (%rdi, %rdx), %rsi
        mov     %rdi, %rax
        vpshufb %xmm0, %xmm1, %xmm0
        cmp     $16, %rdx
        jb      L(less_16bytes)
        cmp     $512, %rdx
        vbroadcastss %xmm0, %zmm2
        ja      L(512bytesormore)
        cmp     $256, %rdx
        jb      L(less_256bytes)
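        /* 256..512 bytes: four 64-byte stores from the front and four
           ending flush with the tail; the two groups may overlap in
           the middle.  */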
        vmovups %zmm2, (%rdi)
        vmovups %zmm2, 0x40(%rdi)
        vmovups %zmm2, 0x80(%rdi)
        vmovups %zmm2, 0xC0(%rdi)
        vmovups %zmm2, -0x100(%rsi)
        vmovups %zmm2, -0xC0(%rsi)
        vmovups %zmm2, -0x80(%rsi)
        vmovups %zmm2, -0x40(%rsi)
        ret

L(less_256bytes):
        cmp     $128, %dl
        jb      L(less_128bytes)
        vmovups %zmm2, (%rdi)
        vmovups %zmm2, 0x40(%rdi)
        vmovups %zmm2, -0x80(%rsi)
        vmovups %zmm2, -0x40(%rsi)
        ret

L(less_128bytes):
        cmp     $64, %dl
        jb      L(less_64bytes)
        vmovups %zmm2, (%rdi)
        vmovups %zmm2, -0x40(%rsi)
        ret

L(less_64bytes):
        cmp     $32, %dl
        jb      L(less_32bytes)
        vmovdqu %ymm2, (%rdi)
        vmovdqu %ymm2, -0x20(%rsi)
        ret

L(less_32bytes):
        vmovdqu %xmm0, (%rdi)
        vmovdqu %xmm0, -0x10(%rsi)
        ret

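/* Below 16 bytes ZMM2 was never loaded; use the XMM0 pattern with
   progressively narrower stores.  */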
L(less_16bytes):
        cmp     $8, %dl
        jb      L(less_8bytes)
        vmovq   %xmm0, (%rdi)
        vmovq   %xmm0, -0x08(%rsi)
        ret

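/* 1..7 bytes: move the pattern into ECX and finish with integer
   stores.  */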
L(less_8bytes):
        vmovd   %xmm0, %ecx
        cmp     $4, %dl
        jb      L(less_4bytes)
        mov     %ecx, (%rdi)
        mov     %ecx, -0x04(%rsi)
        ret

L(less_4bytes):
        cmp     $2, %dl
        jb      L(less_2bytes)
        mov     %cx, (%rdi)
        mov     %cx, -0x02(%rsi)
        ret

L(less_2bytes):
        cmp     $1, %dl
        jb      L(less_1bytes)
        mov     %cl, (%rdi)
L(less_1bytes):
        ret

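/* More than 512 bytes: buffers up to half the shared cache size use
   regular stores; larger ones take the non-temporal path so the fill
   does not evict the rest of the cache.  */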
L(512bytesormore):
        mov     __x86_shared_cache_size_half(%rip), %rcx
        cmp     %rcx, %rdx
        ja      L(preloop_large)
        cmp     $1024, %rdx
        ja      L(1024bytesormore)

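        /* 512..1024 bytes: eight 64-byte stores from the front and
           eight ending flush with the tail, overlapping in the
           middle.  */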
        vmovups %zmm2, (%rdi)
        vmovups %zmm2, 0x40(%rdi)
        vmovups %zmm2, 0x80(%rdi)
        vmovups %zmm2, 0xC0(%rdi)
        vmovups %zmm2, 0x100(%rdi)
        vmovups %zmm2, 0x140(%rdi)
        vmovups %zmm2, 0x180(%rdi)
        vmovups %zmm2, 0x1C0(%rdi)
        vmovups %zmm2, -0x200(%rsi)
        vmovups %zmm2, -0x1C0(%rsi)
        vmovups %zmm2, -0x180(%rsi)
        vmovups %zmm2, -0x140(%rsi)
        vmovups %zmm2, -0x100(%rsi)
        vmovups %zmm2, -0xC0(%rsi)
        vmovups %zmm2, -0x80(%rsi)
        vmovups %zmm2, -0x40(%rsi)
        ret

/* Align the destination to 64 bytes and loop with aligned stores.
   The first unaligned 64-byte store covers the head; RSI is pulled
   back 256 bytes so a final unaligned block finishes the tail.  */
L(1024bytesormore):
        sub     $0x100, %rsi
        vmovups %zmm2, (%rax)
        and     $-0x40, %rdi
        add     $0x40, %rdi

L(gobble_256bytes_loop):
        vmovaps %zmm2, (%rdi)
        vmovaps %zmm2, 0x40(%rdi)
        vmovaps %zmm2, 0x80(%rdi)
        vmovaps %zmm2, 0xC0(%rdi)
        add     $0x100, %rdi
        cmp     %rsi, %rdi
        jb      L(gobble_256bytes_loop)
        /* Last 256 bytes, unaligned; may overlap the loop's final
           iteration.  */
        vmovups %zmm2, (%rsi)
        vmovups %zmm2, 0x40(%rsi)
        vmovups %zmm2, 0x80(%rsi)
        vmovups %zmm2, 0xC0(%rsi)
        ret

/* Align the destination to 128 bytes and loop with non-temporal
   stores.  The two unaligned stores through RAX cover the head; RSI
   is pulled back 512 bytes for the tail block.  */
L(preloop_large):
        and     $-0x80, %rdi
        add     $0x80, %rdi
        vmovups %zmm2, (%rax)
        vmovups %zmm2, 0x40(%rax)
        sub     $0x200, %rsi

L(gobble_512bytes_nt_loop):
        vmovntdq %zmm2, (%rdi)
        vmovntdq %zmm2, 0x40(%rdi)
        vmovntdq %zmm2, 0x80(%rdi)
        vmovntdq %zmm2, 0xC0(%rdi)
        vmovntdq %zmm2, 0x100(%rdi)
        vmovntdq %zmm2, 0x140(%rdi)
        vmovntdq %zmm2, 0x180(%rdi)
        vmovntdq %zmm2, 0x1C0(%rdi)
        add     $0x200, %rdi
        cmp     %rsi, %rdi
        jb      L(gobble_512bytes_nt_loop)
        /* Order the weakly-ordered non-temporal stores before the
           ordinary stores below.  */
        sfence
        vmovups %zmm2, (%rsi)
        vmovups %zmm2, 0x40(%rsi)
        vmovups %zmm2, 0x80(%rsi)
        vmovups %zmm2, 0xC0(%rsi)
        vmovups %zmm2, 0x100(%rsi)
        vmovups %zmm2, 0x140(%rsi)
        vmovups %zmm2, 0x180(%rsi)
        vmovups %zmm2, 0x1C0(%rsi)
        ret
END (MEMSET)
#endif

