/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2011, The Linux Foundation. All rights reserved.
 */


/* HEXAGON assembly optimized memset */
/* Replaces the standard library function memset */


/*
 * HEXAGON_OPT_FUNC_BEGIN name
 *
 * Opens a global function called \name: selects the .text section, aligns
 * the entry point to 2^4 = 16 bytes, exports the symbol, marks it as a
 * function and emits the entry label.  Close the function with
 * HEXAGON_OPT_FUNC_FINISH using the same name.
 */
	.macro HEXAGON_OPT_FUNC_BEGIN name
	.text
	.p2align 4
	.globl \name
	.type  \name, @function
\name:
	.endm

/*
 * HEXAGON_OPT_FUNC_FINISH name
 *
 * Closes a function opened with HEXAGON_OPT_FUNC_BEGIN by recording the
 * symbol size as the distance from the \name label to the current location.
 */
	.macro HEXAGON_OPT_FUNC_FINISH name
	.size  \name, . - \name
	.endm

/*
 * FUNCTION: memset (v2 version)
 *
 * void *memset(void *s, int c, size_t n)
 *
 * In:   r0 = s, r1 = c, r2 = n
 * Out:  r0 = s (r0 is never written; r8 is the working pointer)
 * Uses: r2-r10, p0, p1, hardware loop 0
 *
 * Strategy:
 *   - n == 0: return at once.  n <= 7: plain byte loop.
 *   - Otherwise store a byte, a halfword and a word as selected by bits
 *     0, 1 and 2 of r9 = 8 - (s & 7), which leaves r8 doubleword aligned;
 *     store doublewords while more than 7 bytes remain; then finish with a
 *     word, a halfword and a byte as selected by the remaining count.
 *
 * The remaining count is kept in r3:2 (r3 = 0) and decremented by r7:6
 * (r7 = 0, r6 = size of the store issued in the same packet).
 *
 * The conditional jumps use plain "p0"/"p1", not ".new": each one tests
 * the predicate written by an EARLIER packet, while the compare sitting in
 * the same packet prepares the predicate for the NEXT test.  Do not reorder
 * packets without keeping that pipeline intact.
 */
#if __HEXAGON_ARCH__ < 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r6 = #8
		r7 = extractu(r0, #3 , #0)	/* r7 = s & 7 */
		p0 = cmp.eq(r2, #0)		/* p0: n == 0 */
		p1 = cmp.gtu(r2, #7)		/* p1: n >= 8 */
	}
	{
		r4 = vsplatb(r1)  /* fill byte replicated into all 4 bytes */
		r8 = r0           /* leave r0 intact for return val  */
		r9 = sub(r6, r7)  /* bytes until double alignment  */
		if p0 jumpr r31   /* count == 0, so return  */
	}
	{
		r3 = #0		/* high word of the 64-bit count r3:2 */
		r7 = #0		/* high word of the 64-bit step r7:6 */
		p0 = tstbit(r9, #0)	/* consumed at 2: (initial byte store?) */
		if p1 jump 2f /* skip byte loop */
	}

/* less than 8 bytes to set, so just set a byte at a time and return  */

		loop0(1f, r2) /* byte loop */
	.falign
1: /* byte loop */
	{
		memb(r8++#1) = r4
	}:endloop0
		jumpr r31
	.falign
2: /* skip byte loop */
	/* n >= 8 from here on.  p0 still holds tstbit(r9, #0). */
	{
		r6 = #1
		p0 = tstbit(r9, #1)	/* consumed at 3: (initial half store?) */
		p1 = cmp.eq(r2, #1)
		if !p0 jump 3f /* skip initial byte store */
	}
	{
		memb(r8++#1) = r4
		r3:2 = sub(r3:2, r7:6)	/* count -= 1 */
		if p1 jumpr r31
	}
	.falign
3: /* skip initial byte store */
	{
		r6 = #2
		p0 = tstbit(r9, #2)	/* consumed at 4: (initial word store?) */
		p1 = cmp.eq(r2, #2)
		if !p0 jump 4f /* skip initial half store */
	}
	{
		memh(r8++#2) = r4
		r3:2 = sub(r3:2, r7:6)	/* count -= 2 */
		if p1 jumpr r31
	}
	.falign
4: /* skip initial half store */
	{
		r6 = #4
		p0 = cmp.gtu(r2, #7)	/* consumed at 5: if no word is stored */
		p1 = cmp.eq(r2, #4)
		if !p0 jump 5f /* skip initial word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)	/* count -= 4 */
		/* r2 is read before the subtract above, so this is (count - 4) > 7 */
		p0 = cmp.gtu(r2, #11)
		if p1 jumpr r31
	}
	.falign
5: /* skip initial word store */
	/* r8 is doubleword aligned; p0: more than 7 bytes remain. */
	{
		r10 = lsr(r2, #3)	/* number of whole doublewords left */
		p1 = cmp.eq(r3, #1)	/* r3 is 0, so this clears p1 ("not done") */
		if !p0 jump 7f /* skip double loop */
	}
	{
		r5 = r4			/* r5:4 = 8 copies of the fill byte */
		r6 = #8
		loop0(6f, r10) /* double loop */
	}

/* set bytes a double word at a time  */

	.falign
6: /* double loop */
	{
		memd(r8++#8) = r5:4
		r3:2 = sub(r3:2, r7:6)	/* count -= 8 */
		p1 = cmp.eq(r2, #8)	/* p1: this store consumed the last 8 bytes */
	}:endloop0
	.falign
7: /* skip double loop */
	/* 0..7 bytes remain; p1 is set only when nothing remains. */
	{
		p0 = tstbit(r2, #2)	/* consumed in the next packet (word?) */
		if p1 jumpr r31
	}
	{
		r6 = #4
		p0 = tstbit(r2, #1)	/* consumed at 8: (final half store?) */
		p1 = cmp.eq(r2, #4)	/* p1: the word store finishes the job */
		if !p0 jump 8f /* skip final word store */
	}
	{
		memw(r8++#4) = r4
		r3:2 = sub(r3:2, r7:6)	/* count -= 4; bit 1 of the count is unchanged */
		if p1 jumpr r31
	}
	.falign
8: /* skip final word store */
	/* 1..3 bytes remain. */
	{
		p1 = cmp.eq(r2, #2)	/* p1: the half store finishes the job */
		if !p0 jump 9f /* skip final half store */
	}
	{
		memh(r8++#2) = r4
		if p1 jumpr r31
	}
	.falign
9: /* skip final half store */
	/* Exactly one byte is left on every path that reaches this label. */
	{
		memb(r8++#1) = r4
		jumpr r31
	}
HEXAGON_OPT_FUNC_FINISH memset
#endif


/*
 * FUNCTION: memset (v3 and higher version)
 *
 * void *memset(void *s, int c, size_t n)
 *
 * In:   r0 = s, r1 = c, r2 = n
 * Out:  r0 = s (r0 is never written)
 * Uses: r2-r8, p0, p1, hardware loop 0
 *
 * r6 is the working pointer, r7 holds the fill byte replicated into all
 * four bytes and r5:4 holds the same pattern as a doubleword.  r2 counts
 * the bytes still to be written.
 *
 * Strategy:
 *   - n == 0: return.  n <= 8: plain byte loop.
 *   - Align r6 to 8 bytes with a byte, a halfword and a word store as
 *     selected by address bits 0, 1 and 2.
 *   - If more than 127 bytes remain: store doublewords until r6 is 32-byte
 *     aligned (at most three), then handle 32 bytes per iteration.  When
 *     r1 == 0 a dczeroa alone is issued per 32 bytes (the code relies on it
 *     clearing the whole 32-byte block at r6); otherwise dczeroa is
 *     followed by four doubleword stores of the pattern.
 *   - Store doublewords while more than 7 bytes remain, then finish with a
 *     word, a halfword and a byte as selected by the remaining count.
 */
#if __HEXAGON_ARCH__ >= 3
HEXAGON_OPT_FUNC_BEGIN memset
	{
		r7=vsplatb(r1)		/* fill byte replicated into all 4 bytes */
		r6 = r0			/* working pointer; r0 stays as return value */
		if (r2==#0) jump:nt .L1	/* n == 0: nothing to do */
	}
	{
		r5:4=combine(r7,r7)	/* 8-byte fill pattern */
		p0 = cmp.gtu(r2,#8)
		if (p0.new) jump:nt .L3	/* n > 8: take the aligned path */
	}
	/* 1 <= n <= 8: one byte per iteration */
	{
		r3 = r0
		loop0(.L47,r2)
	}
	.falign
.L47:
	{
		memb(r3++#1) = r1
	}:endloop0 /* start=.L47 */
		jumpr r31
.L3:
	/* Odd address: store one byte to reach halfword alignment. */
	{
		p0 = tstbit(r0,#0)
		if (!p0.new) jump:nt .L8
		p1 = cmp.eq(r2, #1)
	}
	{
		r6 = add(r0, #1)
		r2 = add(r2,#-1)
		memb(r0) = r1
		if (p1) jump .L1
	}
.L8:
	/* Address bit 1 set: store a halfword to reach word alignment. */
	{
		p0 = tstbit(r6,#1)
		if (!p0.new) jump:nt .L10
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
		p0 = cmp.eq(r2, #2)	/* compares r2 as it was before this packet */
		if (p0.new) jump:nt .L1
	}
.L10:
	/* Address bit 2 set: store a word to reach doubleword alignment. */
	{
		p0 = tstbit(r6,#2)
		if (!p0.new) jump:nt .L12
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
		p0 = cmp.eq(r2, #4)	/* compares r2 as it was before this packet */
		if (p0.new) jump:nt .L1
	}
.L12:
	/* r6 is doubleword aligned.  Use 32-byte chunks only above 127 bytes. */
	{
		p0 = cmp.gtu(r2,#127)
		if (!p0.new) jump:nt .L14
	}
	/* Up to three doubleword stores bring r6 to a 32-byte boundary. */
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
		r3 = and(r6,#31)
		if (r3==#0) jump:nt .L17
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}
.L17:
	/*
	 * r6 is 32-byte aligned.  The test is on the whole of r1, so a value
	 * whose low byte is zero but whose upper bits are not takes the store
	 * path at .L18, which writes the same bytes.
	 */
	{
		r3 = lsr(r2,#5)		/* number of whole 32-byte chunks */
		if (r1!=#0) jump:nt .L18
	}
	/*
	 * Zero fill.  loop0 takes the chunk count that r3 held on entry to
	 * this packet; the values written to r8 and r3 here are not read
	 * again on this path.
	 */
	{
		r8 = r3
		r3 = r6
		loop0(.L46,r3)
	}
	.falign
.L46:
	{
		dczeroa(r6)
		r6 = add(r6,#32)
		r2 = add(r2,#-32)
	}:endloop0 /* start=.L46 */
.L14:
	/* Fewer than 32 bytes remain on the chunked paths: whole doublewords. */
	{
		p0 = cmp.gtu(r2,#7)
		if (!p0.new) jump:nt .L28
		r8 = lsr(r2,#3)		/* number of whole doublewords left */
	}
		loop0(.L44,r8)
	.falign
.L44:
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-8)
	}:endloop0 /* start=.L44 */
.L28:
	/* 0..7 bytes remain: word, halfword, byte as selected by r2's bits. */
	{
		p0 = tstbit(r2,#2)
		if (!p0.new) jump:nt .L33
	}
	{
		r2 = add(r2,#-4)
		memw(r6++#4) = r7
	}
.L33:
	{
		p0 = tstbit(r2,#1)
		if (!p0.new) jump:nt .L35
	}
	{
		r2 = add(r2,#-2)
		memh(r6++#2) = r7
	}
.L35:
		p0 = cmp.eq(r2,#1)
		if (p0) memb(r6) = r1
.L1:
		jumpr r31
.L18:
	/*
	 * Non-zero fill, 32 bytes per iteration: dczeroa on the block at r6,
	 * then four doubleword stores overwrite it with the pattern.
	 */
		loop0(.L45,r3)
	.falign
.L45:
		dczeroa(r6)
	{
		memd(r6++#8) = r5:4
		r2 = add(r2,#-32)
	}
		memd(r6++#8) = r5:4
		memd(r6++#8) = r5:4
	{
		memd(r6++#8) = r5:4
	}:endloop0 /* start=.L45  */
		jump .L14
HEXAGON_OPT_FUNC_FINISH memset
#endif


/* Source: linux/arch/hexagon/lib/memset.S */