/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on the glibc cortex-strings work originally authored by
 * Linaro, which can be found at:
 *
 * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
 * files/head:/src/aarch64/
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * Fill in the buffer with character c (alignment handled by the hardware)
 *
 * Parameters:
 *	x0 - buf
 *	x1 - c
 *	x2 - n
 * Returns:
 *	x0 - buf
 */
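
/*
 * Roughly equivalent C prototype, for reference:
 *
 *	void *memset(void *buf, int c, size_t n);
 */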

dstin		.req	x0
val		.req	w1
count		.req	x2
tmp1		.req	x3
tmp1w		.req	w3
tmp2		.req	x4
tmp2w		.req	w4
zva_len_x	.req	x5
zva_len		.req	w5
zva_bits_x	.req	x6

A_l		.req	x7
A_lw		.req	w7
dst		.req	x8
tmp3w		.req	w9
tmp3		.req	x9

SYM_FUNC_START(__pi_memset)
	mov	dst, dstin	/* Preserve return value. */
	and	A_lw, val, #255
	orr	A_lw, A_lw, A_lw, lsl #8
	orr	A_lw, A_lw, A_lw, lsl #16
	orr	A_l, A_l, A_l, lsl #32
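	/*
	 * A_l now holds the fill byte replicated across all 64 bits
	 * (e.g. c = 0xab gives 0xabababababababab); writes to A_lw (w7)
	 * zero-extend into A_l (x7), so the final orr duplicates the low
	 * 32-bit pattern into the high half.
	 */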

	cmp	count, #15
	b.hi	.Lover16_proc
	/* All stores below may be unaligned. */
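	/*
	 * count <= 15: each set bit of count selects one store below,
	 * bit 3 an 8-byte store, bit 2 a 4-byte store, bit 1 a 2-byte
	 * store and bit 0 a single byte.
	 */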
	tbz	count, #3, 1f
	str	A_l, [dst], #8
1:
	tbz	count, #2, 2f
	str	A_lw, [dst], #4
2:
	tbz	count, #1, 3f
	strh	A_lw, [dst], #2
3:
	tbz	count, #0, 4f
	strb	A_lw, [dst]
4:
	ret

.Lover16_proc:
	/* Check whether the start address is 16-byte aligned. */
	neg	tmp2, dst
	ands	tmp2, tmp2, #15
	b.eq	.Laligned
	/*
	 * count is at least 16, so we can use an stp to store the first
	 * 16 bytes, then advance dst to the next 16-byte boundary so that
	 * subsequent stores are aligned.
	 */
	stp	A_l, A_l, [dst]		/* Unaligned store. */
	/* Make dst 16-byte aligned. */
	sub	count, count, tmp2
	add	dst, dst, tmp2
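	/*
	 * For example, with dst = 0x1003: the stp fills 0x1003..0x1012,
	 * tmp2 = 13, so dst becomes 0x1010 and count drops by 13; the 3
	 * bytes from 0x1010..0x1012 are simply written again later.
	 */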

.Laligned:
	cbz	A_l, .Lzero_mem		/* Zeroing: try the DC ZVA path. */

.Ltail_maybe_long:
	cmp	count, #64
	b.ge	.Lnot_short
.Ltail63:
	ands	tmp1, count, #0x30
	b.eq	3f
	cmp	tmp1w, #0x20
	b.eq	1f
	b.lt	2f
	stp	A_l, A_l, [dst], #16
1:
	stp	A_l, A_l, [dst], #16
2:
	stp	A_l, A_l, [dst], #16
	/*
	 * The remaining length is less than 16; use an stp to write the
	 * last 16 bytes. This writes some bytes twice and the access may
	 * be unaligned.
	 */
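	/*
	 * For example, with 7 bytes left: dst is advanced by 7 and the
	 * stp at [dst, #-16] rewrites the preceding 9 bytes together with
	 * the final 7.
	 */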
3:
	ands	count, count, #15
	cbz	count, 4f
	add	dst, dst, count
	stp	A_l, A_l, [dst, #-16]	/* Repeat some/all of last store. */
4:
	ret

	/*
	 * Critical loop. Start at a new cache line boundary. Assuming
	 * 64 bytes per line, this ensures the entire loop is in one line.
	 */
	.p2align	L1_CACHE_SHIFT
.Lnot_short:
	sub	dst, dst, #16		/* Pre-bias. */
	sub	count, count, #64
1:
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	stp	A_l, A_l, [dst, #48]
	stp	A_l, A_l, [dst, #64]!
	subs	count, count, #64
	b.ge	1b
	tst	count, #0x3f
	add	dst, dst, #16
	b.ne	.Ltail63
.Lexitfunc:
	ret

/*
 * For zeroing memory, check to see if we can use the ZVA feature to
 * zero entire 'cache' lines.
 */
.Lzero_mem:
	cmp	count, #63
	b.le	.Ltail63
	/*
	 * For zeroing small amounts of memory, it's not worth setting up
	 * the line-clear code.
	 */
	cmp	count, #128
	b.lt	.Lnot_short	/* From here on, count is at least 128 bytes. */

	mrs	tmp1, dczid_el0
	tbnz	tmp1, #4, .Lnot_short
	mov	tmp3w, #4
	and	zva_len, tmp1w, #15	/* Safety: other bits reserved. */
	lsl	zva_len, tmp3w, zva_len
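	/*
	 * DCZID_EL0 decode: bit 4 (DZP) set means DC ZVA is prohibited,
	 * bits [3:0] (BS) give log2 of the block size in 4-byte words,
	 * so the block size in bytes is 4 << BS. Roughly, in C:
	 *
	 *	dczid = read_sysreg(dczid_el0);
	 *	if (dczid & (1 << 4))
	 *		goto not_short;
	 *	zva_len = 4 << (dczid & 0xf);
	 */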

	ands	tmp3w, zva_len, #63
	/*
	 * Ensure zva_len is at least 64: using DC ZVA is not worthwhile
	 * if the block size is less than 64 bytes.
	 */
	b.ne	.Lnot_short
.Lzero_by_line:
	/*
	 * Compute how far we need to go to become suitably aligned. We're
	 * already at quad-word alignment.
	 */
	cmp	count, zva_len_x
	b.lt	.Lnot_short		/* Not enough to reach alignment. */
	sub	zva_bits_x, zva_len_x, #1
	neg	tmp2, dst
	ands	tmp2, tmp2, zva_bits_x
	b.eq	2f			/* Already aligned. */
	/* Not aligned; check that there's enough to copy after alignment. */
	sub	tmp1, count, tmp2
	/*
	 * Guarantee that the length left to zero after alignment is at
	 * least 64 bytes and at least one ZVA block, so that the DC ZVA
	 * loop at 2f cannot run past the end of the buffer.
	 */
	cmp	tmp1, #64
	ccmp	tmp1, zva_len_x, #8, ge	/* NZCV=0b1000 */
	b.lt	.Lnot_short
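	/*
	 * The ccmp above compares tmp1 with zva_len_x only when tmp1 >= 64;
	 * otherwise it forces NZCV = 0b1000 (N set), so the b.lt also bails
	 * out to .Lnot_short when tmp1 < 64.
	 */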
	/*
	 * We know that there's at least 64 bytes to zero and that it's safe
	 * to overrun by 64 bytes.
	 */
	mov	count, tmp1
1:
	stp	A_l, A_l, [dst]
	stp	A_l, A_l, [dst, #16]
	stp	A_l, A_l, [dst, #32]
	subs	tmp2, tmp2, #64
	stp	A_l, A_l, [dst, #48]
	add	dst, dst, #64
	b.ge	1b
	/* We've overrun a bit, so adjust dst downwards. */
	add	dst, dst, tmp2
2:
	sub	count, count, zva_len_x
3:
	dc	zva, dst
	add	dst, dst, zva_len_x
	subs	count, count, zva_len_x
	b.ge	3b
	ands	count, count, zva_bits_x
	b.ne	.Ltail_maybe_long
	ret
SYM_FUNC_END(__pi_memset)

SYM_FUNC_ALIAS(__memset, __pi_memset)
EXPORT_SYMBOL(__memset)

SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
EXPORT_SYMBOL(memset)