1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * arch/alpha/lib/ev6-memset.S |
4 | * |
5 | * This is an efficient (and relatively small) implementation of the C library |
6 | * "memset()" function for the 21264 implementation of Alpha. |
7 | * |
8 | * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com> |
9 | * |
10 | * Much of the information about 21264 scheduling/coding comes from: |
11 | * Compiler Writer's Guide for the Alpha 21264 |
12 | * abbreviated as 'CWG' in other comments here |
13 | * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html |
14 | * Scheduling notation: |
15 | * E - either cluster |
16 | * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1 |
17 | * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1 |
18 | * The algorithm for the leading and trailing quadwords remains the same, |
19 | * however the loop has been unrolled to enable better memory throughput, |
20 | * and the code has been replicated for each of the entry points: __memset |
21 | * and __memset16 to permit better scheduling to eliminate the stalling |
22 | * encountered during the mask replication. |
23 | * A future enhancement might be to put in a byte store loop for really |
24 | * small (say < 32 bytes) memset()s. Whether or not that change would be |
25 | * a win in the kernel would depend upon the contextual usage. |
26 | * WARNING: Maintaining this is going to be more work than the above version, |
27 | * as fixes will need to be made in multiple places. The performance gain |
28 | * is worth it. |
29 | */ |
30 | #include <linux/export.h> |
31 | .set noat |
32 | .set noreorder |
33 | .text |
34 | .globl memset |
35 | .globl __memset |
36 | .globl ___memset |
37 | .globl __memset16 |
38 | .globl __constant_c_memset |
39 | |
40 | .ent ___memset |
41 | .align 5 |
42 | ___memset: |
43 | .frame $30,0,$26,0 |
44 | .prologue 0 |
45 | |
46 | /* |
47 | * Byte-fill entry point. In: $16 = destination, $17 = fill value (only the low byte is used), |
48 | * $18 = byte count. Out: $0 = original destination. A count <= 0 returns without storing. |
49 | * Serious stalling happens during the constant materialization below. The only way to mitigate |
50 | * this is a major re-write that interleaves it with other parts of the fall-through code. This is |
51 | * important, even though it makes maintenance tougher. Do this later. |
52 | */ |
53 | and $17,255,$1 # E : 00000000000000ch (isolate the fill byte) |
54 | insbl $17,1,$2 # U : 000000000000ch00 |
55 | bis $16,$16,$0 # E : return value = original destination |
56 | ble $18,end_b # U : zero or negative length requested? |
57 | |
58 | addq $18,$16,$6 # E : end address = dest + count (first byte not written) |
59 | bis $1,$2,$17 # E : 000000000000chch |
60 | insbl $1,2,$3 # U : 0000000000ch0000 |
61 | insbl $1,3,$4 # U : 00000000ch000000 |
62 | |
63 | or $3,$4,$3 # E : 00000000chch0000 |
64 | inswl $17,4,$5 # U : 0000chch00000000 |
65 | xor $16,$6,$1 # E : will complete write be within one quadword? |
66 | inswl $17,6,$2 # U : chch000000000000 |
67 | |
68 | or $17,$3,$17 # E : 00000000chchchch |
69 | or $2,$5,$2 # E : chchchch00000000 |
70 | bic $1,7,$1 # E : zero if start and end address share one quadword |
71 | and $16,7,$3 # E : Target addr misalignment |
72 | |
73 | or $17,$2,$17 # E : chchchchchchchch (full quadword fill pattern) |
74 | beq $1,within_quad_b # U : whole range lies inside a single quadword |
75 | nop # E : |
76 | beq $3,aligned_b # U : target is 0mod8 |
77 | |
78 | /* |
79 | * Target address is misaligned, and won't fit within a quadword: merge the fill pattern into the first partial quad. |
80 | */ |
81 | ldq_u $4,0($16) # L : Fetch first partial |
82 | bis $16,$16,$5 # E : Save the address |
83 | insql $17,$16,$2 # U : Insert new bytes |
84 | subq $3,8,$3 # E : Invert: $3 = misalignment - 8 (negative head byte count) |
85 | |
86 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) |
87 | mskql $4,$16,$4 # U : clear relevant parts of the quad |
88 | subq $16,$3,$16 # E : $16 is new aligned destination |
89 | bis $2,$4,$1 # E : Final bytes |
90 | |
91 | nop |
92 | stq_u $1,0($5) # L : Store result |
93 | nop |
94 | nop |
95 | |
96 | .align 4 |
97 | aligned_b: |
98 | /* |
99 | * $16 is now quad aligned and $18 holds the number of bytes still to write |
100 | * (this may be zero when the head store above ended exactly on a quad boundary). |
101 | */ |
102 | |
103 | sra $18,3,$3 # U : Number of remaining quads to write |
104 | and $18,7,$18 # E : Number of trailing bytes to write |
105 | bis $16,$16,$5 # E : Save dest address |
106 | beq $3,no_quad_b # U : tail stuff only |
107 | |
108 | /* |
109 | * It is worth the effort to unroll this and use wh64 if possible. |
110 | * Lifted a bunch of code from clear_user.S |
111 | * At this point, entry values are: |
112 | * $16 Current destination address |
113 | * $5 A copy of $16 |
114 | * $6 End address (original destination + count); $17 quadword fill pattern |
115 | * $18 Number of trailing bytes (0..7) |
116 | * $3 Number of whole quads to write (non-zero here) |
117 | */ |
118 | |
119 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) |
120 | subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads) |
121 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) |
122 | blt $4, loop_b # U : fewer than 16 quads, use the simple loop |
123 | |
124 | /* |
125 | * We know we've got at least 16 quads, minimum of one trip |
126 | * through unrolled loop. Do a quad at a time to get us 0mod64 |
127 | * aligned. NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in -64..-8 here, so the beq below looks never taken and a destination that is already 0mod64 still stores 8 quads in the alignment loop - confirm. |
128 | */ |
129 | |
130 | nop # E : |
131 | nop # E : |
132 | nop # E : |
133 | beq $1, $bigalign_b # U : skip the alignment loop when the bias counter is zero |
134 | |
135 | $alignmod64_b: |
136 | stq $17, 0($5) # L : |
137 | subq $3, 1, $3 # E : For consistency later |
138 | addq $1, 8, $1 # E : Increment towards zero for alignment |
139 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) |
140 | |
141 | nop |
142 | nop |
143 | addq $5, 8, $5 # E : Inc address |
144 | blt $1, $alignmod64_b # U : loop until the bias counter reaches zero |
145 | |
146 | $bigalign_b: |
147 | /* |
148 | * $3 - number of quads left to go |
149 | * $5 - target address (aligned 0mod64); $4 - address for the first wh64 (left by the alignment loop) |
150 | * $17 - quadword pattern to store |
151 | * Scratch registers available: $7, $2, $1 ($4 carries the wh64 address from trip to trip) |
152 | * We know that we'll be taking a minimum of one trip through the unrolled loop. |
153 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle |
154 | * Assumes the wh64 needs to be issued 2 trips through the loop ahead of the stores. |
155 | * The wh64 is issued for the starting destination address of trip +2 |
156 | * through the loop, and if there are fewer than two further trips left, the target |
157 | * address will be for the current trip (the one issuing the wh64). |
158 | */ |
159 | |
160 | $do_wh64_b: |
161 | wh64 ($4) # L1 : memory subsystem write hint |
162 | subq $3, 24, $2 # E : negative if fewer than 24 quads are left (no trip +2) |
163 | stq $17, 0($5) # L : |
164 | nop # E : |
165 | |
166 | addq $5, 128, $4 # E : speculative target of next wh64 |
167 | stq $17, 8($5) # L : |
168 | stq $17, 16($5) # L : |
169 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) |
170 | |
171 | stq $17, 24($5) # L : |
172 | stq $17, 32($5) # L : |
173 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle; use the fallback when $2 < 0 |
174 | nop |
175 | |
176 | stq $17, 40($5) # L : |
177 | stq $17, 48($5) # L : |
178 | subq $3, 16, $2 # E : Repeat the loop at least once more? (>= 0 when 16 or more quads were left) |
179 | nop |
180 | |
181 | stq $17, 56($5) # L : |
182 | addq $5, 64, $5 # E : advance past the 8 quads just stored |
183 | subq $3, 8, $3 # E : 8 fewer quads to go |
184 | bge $2, $do_wh64_b # U : another full 64-byte trip remains |
185 | |
186 | nop |
187 | nop |
188 | nop |
189 | beq $3, no_quad_b # U : Might have finished already |
190 | |
191 | .align 4 |
192 | /* |
193 | * Simple loop for trailing quadwords, or for small amounts |
194 | * of data (where we can't use an unrolled loop and wh64). Entered with $3 non-zero. |
195 | */ |
196 | loop_b: |
197 | stq $17,0($5) # L : |
198 | subq $3,1,$3 # E : Decrement number quads left |
199 | addq $5,8,$5 # E : Inc address |
200 | bne $3,loop_b # U : more? |
201 | |
202 | no_quad_b: |
203 | /* |
204 | * Write 0..7 trailing bytes ($18 of them) by merging the pattern into the final quad at $5. |
205 | */ |
206 | nop # E : |
207 | beq $18,end_b # U : All done? |
208 | ldq $7,0($5) # L : fetch the final quad |
209 | mskqh $7,$6,$2 # U : Mask final quad (byte offset taken from the end address) |
210 | |
211 | insqh $17,$6,$4 # U : New bits |
212 | bis $2,$4,$1 # E : Put it all together |
213 | stq $1,0($5) # L : And back to memory |
214 | ret $31,($26),1 # L0 : |
215 | |
216 | within_quad_b: |
217 | ldq_u $1,0($16) # L : fetch the quad that contains the whole range |
218 | insql $17,$16,$2 # U : New bits |
219 | mskql $1,$16,$4 # U : Clear old |
220 | bis $2,$4,$2 # E : New result |
221 | |
222 | mskql $2,$6,$4 # U : trim the new result at the end address |
223 | mskqh $1,$6,$2 # U : original bytes from the end address upward |
224 | bis $2,$4,$1 # E : merged quad |
225 | stq_u $1,0($16) # L : write it back |
226 | |
227 | end_b: |
228 | nop |
229 | nop |
230 | nop |
231 | ret $31,($26),1 # L0 : |
232 | .end ___memset |
233 | EXPORT_SYMBOL(___memset) |
234 | |
235 | /* |
236 | * This is the original body of code, prior to replication and |
237 | * rescheduling. Leave it here, as there may be calls to this |
238 | * entry point. |
239 | */ |
240 | .align 4 |
241 | .ent __constant_c_memset |
242 | __constant_c_memset: # In: $16 = destination, $17 = quadword fill pattern (stored as-is, no replication is done here), $18 = byte count; Out: $0 = destination |
243 | .frame $30,0,$26,0 |
244 | .prologue 0 |
245 | |
246 | addq $18,$16,$6 # E : end address = dest + count (first byte not written) |
247 | bis $16,$16,$0 # E : return value = original destination |
248 | xor $16,$6,$1 # E : will complete write be within one quadword? |
249 | ble $18,end # U : zero or negative length requested? |
250 | |
251 | bic $1,7,$1 # E : zero if start and end address share one quadword |
252 | beq $1,within_one_quad # U : whole range lies inside a single quadword |
253 | and $16,7,$3 # E : Target addr misalignment |
254 | beq $3,aligned # U : target is 0mod8 |
255 | |
256 | /* |
257 | * Target address is misaligned, and won't fit within a quadword: merge the fill pattern into the first partial quad. |
258 | */ |
259 | ldq_u $4,0($16) # L : Fetch first partial |
260 | bis $16,$16,$5 # E : Save the address |
261 | insql $17,$16,$2 # U : Insert new bytes |
262 | subq $3,8,$3 # E : Invert: $3 = misalignment - 8 (negative head byte count) |
263 | |
264 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) |
265 | mskql $4,$16,$4 # U : clear relevant parts of the quad |
266 | subq $16,$3,$16 # E : $16 is new aligned destination |
267 | bis $2,$4,$1 # E : Final bytes |
268 | |
269 | nop |
270 | stq_u $1,0($5) # L : Store result |
271 | nop |
272 | nop |
273 | |
274 | .align 4 |
275 | aligned: |
276 | /* |
277 | * $16 is now quad aligned and $18 holds the number of bytes still to write |
278 | * (this may be zero when the head store above ended exactly on a quad boundary). |
279 | */ |
280 | |
281 | sra $18,3,$3 # U : Number of remaining quads to write |
282 | and $18,7,$18 # E : Number of trailing bytes to write |
283 | bis $16,$16,$5 # E : Save dest address |
284 | beq $3,no_quad # U : tail stuff only |
285 | |
286 | /* |
287 | * It is worth the effort to unroll this and use wh64 if possible. |
288 | * Lifted a bunch of code from clear_user.S |
289 | * At this point, entry values are: |
290 | * $16 Current destination address |
291 | * $5 A copy of $16 |
292 | * $6 End address (original destination + count); $17 quadword fill pattern |
293 | * $18 Number of trailing bytes (0..7) |
294 | * $3 Number of whole quads to write (non-zero here) |
295 | */ |
296 | |
297 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) |
298 | subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads) |
299 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) |
300 | blt $4, loop # U : fewer than 16 quads, use the simple loop |
301 | |
302 | /* |
303 | * We know we've got at least 16 quads, minimum of one trip |
304 | * through unrolled loop. Do a quad at a time to get us 0mod64 |
305 | * aligned. NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in -64..-8 here, so the beq below looks never taken and a destination that is already 0mod64 still stores 8 quads in the alignment loop - confirm. |
306 | */ |
307 | |
308 | nop # E : |
309 | nop # E : |
310 | nop # E : |
311 | beq $1, $bigalign # U : skip the alignment loop when the bias counter is zero |
312 | |
313 | $alignmod64: |
314 | stq $17, 0($5) # L : |
315 | subq $3, 1, $3 # E : For consistency later |
316 | addq $1, 8, $1 # E : Increment towards zero for alignment |
317 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) |
318 | |
319 | nop |
320 | nop |
321 | addq $5, 8, $5 # E : Inc address |
322 | blt $1, $alignmod64 # U : loop until the bias counter reaches zero |
323 | |
324 | $bigalign: |
325 | /* |
326 | * $3 - number of quads left to go |
327 | * $5 - target address (aligned 0mod64); $4 - address for the first wh64 (left by the alignment loop) |
328 | * $17 - quadword pattern to store |
329 | * Scratch registers available: $7, $2, $1 ($4 carries the wh64 address from trip to trip) |
330 | * We know that we'll be taking a minimum of one trip through the unrolled loop. |
331 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle |
332 | * Assumes the wh64 needs to be issued 2 trips through the loop ahead of the stores. |
333 | * The wh64 is issued for the starting destination address of trip +2 |
334 | * through the loop, and if there are fewer than two further trips left, the target |
335 | * address will be for the current trip (the one issuing the wh64). |
336 | */ |
337 | |
338 | $do_wh64: |
339 | wh64 ($4) # L1 : memory subsystem write hint |
340 | subq $3, 24, $2 # E : negative if fewer than 24 quads are left (no trip +2) |
341 | stq $17, 0($5) # L : |
342 | nop # E : |
343 | |
344 | addq $5, 128, $4 # E : speculative target of next wh64 |
345 | stq $17, 8($5) # L : |
346 | stq $17, 16($5) # L : |
347 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) |
348 | |
349 | stq $17, 24($5) # L : |
350 | stq $17, 32($5) # L : |
351 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle; use the fallback when $2 < 0 |
352 | nop |
353 | |
354 | stq $17, 40($5) # L : |
355 | stq $17, 48($5) # L : |
356 | subq $3, 16, $2 # E : Repeat the loop at least once more? (>= 0 when 16 or more quads were left) |
357 | nop |
358 | |
359 | stq $17, 56($5) # L : |
360 | addq $5, 64, $5 # E : advance past the 8 quads just stored |
361 | subq $3, 8, $3 # E : 8 fewer quads to go |
362 | bge $2, $do_wh64 # U : another full 64-byte trip remains |
363 | |
364 | nop |
365 | nop |
366 | nop |
367 | beq $3, no_quad # U : Might have finished already |
368 | |
369 | .align 4 |
370 | /* |
371 | * Simple loop for trailing quadwords, or for small amounts |
372 | * of data (where we can't use an unrolled loop and wh64). Entered with $3 non-zero. |
373 | */ |
374 | loop: |
375 | stq $17,0($5) # L : |
376 | subq $3,1,$3 # E : Decrement number quads left |
377 | addq $5,8,$5 # E : Inc address |
378 | bne $3,loop # U : more? |
379 | |
380 | no_quad: |
381 | /* |
382 | * Write 0..7 trailing bytes ($18 of them) by merging the pattern into the final quad at $5. |
383 | */ |
384 | nop # E : |
385 | beq $18,end # U : All done? |
386 | ldq $7,0($5) # L : fetch the final quad |
387 | mskqh $7,$6,$2 # U : Mask final quad (byte offset taken from the end address) |
388 | |
389 | insqh $17,$6,$4 # U : New bits |
390 | bis $2,$4,$1 # E : Put it all together |
391 | stq $1,0($5) # L : And back to memory |
392 | ret $31,($26),1 # L0 : |
393 | |
394 | within_one_quad: |
395 | ldq_u $1,0($16) # L : fetch the quad that contains the whole range |
396 | insql $17,$16,$2 # U : New bits |
397 | mskql $1,$16,$4 # U : Clear old |
398 | bis $2,$4,$2 # E : New result |
399 | |
400 | mskql $2,$6,$4 # U : trim the new result at the end address |
401 | mskqh $1,$6,$2 # U : original bytes from the end address upward |
402 | bis $2,$4,$1 # E : merged quad |
403 | stq_u $1,0($16) # L : write it back |
404 | |
405 | end: |
406 | nop |
407 | nop |
408 | nop |
409 | ret $31,($26),1 # L0 : |
410 | .end __constant_c_memset |
411 | EXPORT_SYMBOL(__constant_c_memset) |
412 | |
413 | /* |
414 | * This is a replica of the __constant_c_memset code, rescheduled |
415 | * to mask stalls. Note that the internal label names also had to change (they carry a _w suffix). |
416 | */ |
417 | .align 5 |
418 | .ent __memset16 |
419 | |
420 | __memset16: # In: $16 = destination, $17 = fill value (only the low 16 bits are used), $18 = byte count; Out: $0 = destination |
421 | .frame $30,0,$26,0 |
422 | .prologue 0 |
423 | |
424 | inswl $17,0,$5 # U : 000000000000c1c2 |
425 | inswl $17,2,$2 # U : 00000000c1c20000 |
426 | bis $16,$16,$0 # E : return value = original destination |
427 | addq $18,$16,$6 # E : end address = dest + count (first byte not written) |
428 | |
429 | ble $18, end_w # U : zero or negative length requested? |
430 | inswl $17,4,$3 # U : 0000c1c200000000 |
431 | inswl $17,6,$4 # U : c1c2000000000000 |
432 | xor $16,$6,$1 # E : will complete write be within one quadword? |
433 | |
434 | or $2,$5,$2 # E : 00000000c1c2c1c2 |
435 | or $3,$4,$17 # E : c1c2c1c200000000 |
436 | bic $1,7,$1 # E : zero if start and end address share one quadword |
437 | and $16,7,$3 # E : Target addr misalignment |
438 | |
439 | or $17,$2,$17 # E : c1c2c1c2c1c2c1c2 (full quadword fill pattern) |
440 | beq $1,within_quad_w # U : whole range lies inside a single quadword |
441 | nop |
442 | beq $3,aligned_w # U : target is 0mod8 |
443 | |
444 | /* |
445 | * Target address is misaligned, and won't fit within a quadword: merge the fill pattern into the first partial quad. |
446 | */ |
447 | ldq_u $4,0($16) # L : Fetch first partial |
448 | bis $16,$16,$5 # E : Save the address |
449 | insql $17,$16,$2 # U : Insert new bytes |
450 | subq $3,8,$3 # E : Invert: $3 = misalignment - 8 (negative head byte count) |
451 | |
452 | addq $18,$3,$18 # E : $18 is new count ($3 is negative) |
453 | mskql $4,$16,$4 # U : clear relevant parts of the quad |
454 | subq $16,$3,$16 # E : $16 is new aligned destination |
455 | bis $2,$4,$1 # E : Final bytes |
456 | |
457 | nop |
458 | stq_u $1,0($5) # L : Store result |
459 | nop |
460 | nop |
461 | |
462 | .align 4 |
463 | aligned_w: |
464 | /* |
465 | * $16 is now quad aligned and $18 holds the number of bytes still to write |
466 | * (this may be zero when the head store above ended exactly on a quad boundary). |
467 | */ |
468 | |
469 | sra $18,3,$3 # U : Number of remaining quads to write |
470 | and $18,7,$18 # E : Number of trailing bytes to write |
471 | bis $16,$16,$5 # E : Save dest address |
472 | beq $3,no_quad_w # U : tail stuff only |
473 | |
474 | /* |
475 | * It is worth the effort to unroll this and use wh64 if possible. |
476 | * Lifted a bunch of code from clear_user.S |
477 | * At this point, entry values are: |
478 | * $16 Current destination address |
479 | * $5 A copy of $16 |
480 | * $6 End address (original destination + count); $17 quadword fill pattern |
481 | * $18 Number of trailing bytes (0..7) |
482 | * $3 Number of whole quads to write (non-zero here) |
483 | */ |
484 | |
485 | and $16, 0x3f, $2 # E : Forward work (only useful for unrolled loop) |
486 | subq $3, 16, $4 # E : Only try to unroll if >= 128 bytes (16 quads) |
487 | subq $2, 0x40, $1 # E : bias counter (aligning stuff 0mod64) |
488 | blt $4, loop_w # U : fewer than 16 quads, use the simple loop |
489 | |
490 | /* |
491 | * We know we've got at least 16 quads, minimum of one trip |
492 | * through unrolled loop. Do a quad at a time to get us 0mod64 |
493 | * aligned. NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in -64..-8 here, so the beq below looks never taken and a destination that is already 0mod64 still stores 8 quads in the alignment loop - confirm. |
494 | */ |
495 | |
496 | nop # E : |
497 | nop # E : |
498 | nop # E : |
499 | beq $1, $bigalign_w # U : skip the alignment loop when the bias counter is zero |
500 | |
501 | $alignmod64_w: |
502 | stq $17, 0($5) # L : |
503 | subq $3, 1, $3 # E : For consistency later |
504 | addq $1, 8, $1 # E : Increment towards zero for alignment |
505 | addq $5, 8, $4 # E : Initial wh64 address (filler instruction) |
506 | |
507 | nop |
508 | nop |
509 | addq $5, 8, $5 # E : Inc address |
510 | blt $1, $alignmod64_w # U : loop until the bias counter reaches zero |
511 | |
512 | $bigalign_w: |
513 | /* |
514 | * $3 - number of quads left to go |
515 | * $5 - target address (aligned 0mod64); $4 - address for the first wh64 (left by the alignment loop) |
516 | * $17 - quadword pattern to store |
517 | * Scratch registers available: $7, $2, $1 ($4 carries the wh64 address from trip to trip) |
518 | * We know that we'll be taking a minimum of one trip through the unrolled loop. |
519 | * CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle |
520 | * Assumes the wh64 needs to be issued 2 trips through the loop ahead of the stores. |
521 | * The wh64 is issued for the starting destination address of trip +2 |
522 | * through the loop, and if there are fewer than two further trips left, the target |
523 | * address will be for the current trip (the one issuing the wh64). |
524 | */ |
525 | |
526 | $do_wh64_w: |
527 | wh64 ($4) # L1 : memory subsystem write hint |
528 | subq $3, 24, $2 # E : negative if fewer than 24 quads are left (no trip +2) |
529 | stq $17, 0($5) # L : |
530 | nop # E : |
531 | |
532 | addq $5, 128, $4 # E : speculative target of next wh64 |
533 | stq $17, 8($5) # L : |
534 | stq $17, 16($5) # L : |
535 | addq $5, 64, $7 # E : Fallback address for wh64 (== next trip addr) |
536 | |
537 | stq $17, 24($5) # L : |
538 | stq $17, 32($5) # L : |
539 | cmovlt $2, $7, $4 # E : Latency 2, extra mapping cycle; use the fallback when $2 < 0 |
540 | nop |
541 | |
542 | stq $17, 40($5) # L : |
543 | stq $17, 48($5) # L : |
544 | subq $3, 16, $2 # E : Repeat the loop at least once more? (>= 0 when 16 or more quads were left) |
545 | nop |
546 | |
547 | stq $17, 56($5) # L : |
548 | addq $5, 64, $5 # E : advance past the 8 quads just stored |
549 | subq $3, 8, $3 # E : 8 fewer quads to go |
550 | bge $2, $do_wh64_w # U : another full 64-byte trip remains |
551 | |
552 | nop |
553 | nop |
554 | nop |
555 | beq $3, no_quad_w # U : Might have finished already |
556 | |
557 | .align 4 |
558 | /* |
559 | * Simple loop for trailing quadwords, or for small amounts |
560 | * of data (where we can't use an unrolled loop and wh64). Entered with $3 non-zero. |
561 | */ |
562 | loop_w: |
563 | stq $17,0($5) # L : |
564 | subq $3,1,$3 # E : Decrement number quads left |
565 | addq $5,8,$5 # E : Inc address |
566 | bne $3,loop_w # U : more? |
567 | |
568 | no_quad_w: |
569 | /* |
570 | * Write 0..7 trailing bytes ($18 of them) by merging the pattern into the final quad at $5. |
571 | */ |
572 | nop # E : |
573 | beq $18,end_w # U : All done? |
574 | ldq $7,0($5) # L : fetch the final quad |
575 | mskqh $7,$6,$2 # U : Mask final quad (byte offset taken from the end address) |
576 | |
577 | insqh $17,$6,$4 # U : New bits |
578 | bis $2,$4,$1 # E : Put it all together |
579 | stq $1,0($5) # L : And back to memory |
580 | ret $31,($26),1 # L0 : |
581 | |
582 | within_quad_w: |
583 | ldq_u $1,0($16) # L : fetch the quad that contains the whole range |
584 | insql $17,$16,$2 # U : New bits |
585 | mskql $1,$16,$4 # U : Clear old |
586 | bis $2,$4,$2 # E : New result |
587 | |
588 | mskql $2,$6,$4 # U : trim the new result at the end address |
589 | mskqh $1,$6,$2 # U : original bytes from the end address upward |
590 | bis $2,$4,$1 # E : merged quad |
591 | stq_u $1,0($16) # L : write it back |
592 | |
593 | end_w: |
594 | nop |
595 | nop |
596 | nop |
597 | ret $31,($26),1 # L0 : |
598 | |
599 | .end __memset16 |
600 | EXPORT_SYMBOL(__memset16) |
601 | |
602 | memset = ___memset # memset is defined as an alias for the ___memset entry point |
603 | __memset = ___memset # __memset is defined as an alias for the same entry point |
604 | EXPORT_SYMBOL(memset) |
605 | EXPORT_SYMBOL(__memset) |
606 | |