ev6-memset.S source code [linux/arch/alpha/lib/ev6-memset.S]

1	/* SPDX-License-Identifier: GPL-2.0 */
2	/*
3	* arch/alpha/lib/ev6-memset.S
4	*
5	* This is an efficient (and relatively small) implementation of the C library
6	* "memset()" function for the 21264 implementation of Alpha.
7	*
8	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
9	*
10	* Much of the information about 21264 scheduling/coding comes from:
11	* Compiler Writer's Guide for the Alpha 21264
12	* abbreviated as 'CWG' in other comments here
13	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
14	* Scheduling notation:
15	* E - either cluster
16	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
17	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
18	* The algorithm for the leading and trailing quadwords remains the same,
19	* however the loop has been unrolled to enable better memory throughput,
20	* and the code has been replicated for each of the entry points: __memset
21	* and __memset16 to permit better scheduling to eliminate the stalling
22	* encountered during the mask replication.
23	* A future enhancement might be to put in a byte store loop for really
24	* small (say < 32 bytes) memset()s. Whether or not that change would be
25	* a win in the kernel would depend upon the contextual usage.
26	* WARNING: Maintaining this is going to be more work than the above version,
27	* as fixes will need to be made in multiple places. The performance gain
28	* is worth it.
29	*/
30	#include <linux/export.h>
31	.set noat # GNU as (Alpha): macro expansions may not use the assembler temporary $at ($28)
32	.set noreorder # GNU as (Alpha): keep instructions in the order written; the code below is hand-scheduled
33	.text
34	.globl memset # alias of ___memset (assigned at the end of this file)
35	.globl __memset # alias of ___memset (assigned at the end of this file)
36	.globl ___memset # byte-fill entry point
37	.globl __memset16 # 16-bit pattern fill entry point
38	.globl __constant_c_memset # entry point that stores $17 as given, without replicating it
39
40	.ent ___memset
41	.align `5`
42	___memset:
43	.frame $`30`,`0`,$`26`,`0`
44	.prologue `0`
45
46	/*
	* ___memset: fill a byte range with one repeated byte value.
	*
	* Register usage, as seen in the code below:
	* $16 - destination address on entry (copied to $0 as the return value)
	* $17 - fill value; only its low byte is used, and that byte is
	*       replicated into all eight bytes of $17 before any store
	* $18 - byte count; a count <= 0 returns without storing anything
	* $6  - $16 + $18, i.e. the address one past the last byte to write
	* $26 - return address
	* Scratch: $1, $2, $3, $4, $5, $7
	*
47	* Serious stalling happens. The only way to mitigate this is to
48	* undertake a major re-write to interleave the constant materialization
49	* with other parts of the fall-through code. This is important, even
50	* though it makes maintenance tougher.
51	* Do this later.
52	*/
53	and $`17`,`255`,$`1` # E : `00000000000000ch`
54	insbl $`17`,`1`,$`2` # U : `000000000000ch00`
55	bis $`16`,$`16`,$`0` # E : return value
56	ble $`18`,end_b # U : zero (or negative) length requested?
57
58	addq $`18`,$`16`,$`6` # E : max address to write to
59	bis $`1`,$`2`,$`17` # E : `000000000000chch`
60	insbl $`1`,`2`,$`3` # U : `0000000000ch0000`
61	insbl $`1`,`3`,$`4` # U : `00000000ch000000`
62
63	or $`3`,$`4`,$`3` # E : `00000000chch0000`
64	inswl $`17`,`4`,$`5` # U : `0000chch00000000`
65	xor $`16`,$`6`,$`1` # E : will complete write be within one quadword?
66	inswl $`17`,`6`,$`2` # U : chch000000000000
67
68	or $`17`,$`3`,$`17` # E : `00000000chchchch`
69	or $`2`,$`5`,$`2` # E : chchchch00000000
70	bic $`1`,`7`,$`1` # E : fit within a single quadword?
71	and $`16`,`7`,$`3` # E : Target addr misalignment
72
73	or $`17`,$`2`,$`17` # E : chchchchchchchch
74	beq $`1`,within_quad_b # U : $16 and $6 lie in the same aligned quadword
75	nop # E :
76	beq $`3`,aligned_b # U : target is `0mod8`
77
78	/*
79	* Target address is misaligned, and won't fit within a quadword
	* Merge the new bytes into the first (partial) quadword, then step
	* $16 up to the next quadword boundary and shrink $18 to match.
80	*/
81	ldq_u $`4`,`0`($`16`) # L : Fetch first partial
82	bis $`16`,$`16`,$`5` # E : Save the address
83	insql $`17`,$`16`,$`2` # U : Insert new bytes
84	subq $`3`,`8`,$`3` # E : Invert (for addressing uses)
85
86	addq $`18`,$`3`,$`18` # E : $`18` is new count ($`3` is negative)
87	mskql $`4`,$`16`,$`4` # U : clear relevant parts of the quad
88	subq $`16`,$`3`,$`16` # E : $`16` is new aligned destination
89	bis $`2`,$`4`,$`1` # E : Final bytes
90
91	nop
92	stq_u $`1`,`0`($`5`) # L : Store result
93	nop
94	nop
95
96	.align `4`
97	aligned_b:
98	/*
99	* We are now guaranteed to be quad aligned, with at least
100	* one partial quad to write.
101	*/
102
103	sra $`18`,`3`,$`3` # U : Number of remaining quads to write
104	and $`18`,`7`,$`18` # E : Number of trailing bytes to write
105	bis $`16`,$`16`,$`5` # E : Save dest address
106	beq $`3`,no_quad_b # U : tail stuff only
107
108	/*
109	* it's worth the effort to unroll this and use wh64 if possible
110	* Lifted a bunch of code from clear_user.S
111	* At this point, entry values are:
112	* $16 Current destination address
113	* $5 A copy of $16
114	* $6 The max quadword address to write to
115	* $18 Number trailer bytes
116	* $3 Number quads to write
117	*/
118
119	and $`16`, `0x3f`, $`2` # E : Forward work (only useful for unrolled loop)
120	subq $`3`, `16`, $`4` # E : Only try to unroll if >= 128 bytes (16 quads)
121	subq $`2`, `0x40`, $`1` # E : bias counter (aligning stuff `0mod64`)
122	blt $`4`, loop_b # U : fewer than 16 quads, use the simple loop
123
124	/*
125	* We know we've got at least 16 quads, minimum of one trip
126	* through unrolled loop. Do a quad at a time to get us 0mod64
127	* aligned.
	*
	* NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-64, -1], so the
	* beq below looks like it can never be taken; a destination that is
	* already 0mod64 instead takes eight trips through $alignmod64_b.
	* That loop is also the only place $4 is given a wh64 address
	* (until then $4 holds $3 - 16). Confirm before changing either.
128	*/
129
130	nop # E :
131	nop # E :
132	nop # E :
133	beq $`1`, $bigalign_b # U : skip alignment if the bias counter is zero
134
135	$alignmod64_b:
136	stq $`17`, `0`($`5`) # L : store one quad while aligning
137	subq $`3`, `1`, $`3` # E : For consistency later
138	addq $`1`, `8`, $`1` # E : Increment towards zero for alignment
139	addq $`5`, `8`, $`4` # E : Initial wh64 address (filler instruction)
140
141	nop
142	nop
143	addq $`5`, `8`, $`5` # E : Inc address
144	blt $`1`, $alignmod64_b # U : repeat until the bias counter reaches zero
145
146	$bigalign_b:
147	/*
148	* $3 - number quads left to go
149	* $5 - target address (aligned 0mod64)
150	* $17 - mask of stuff to store
151	* Scratch registers available: $7, $2, $4, $1
152	* we know that we'll be taking a minimum of one trip through the loop below.
153	* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
154	* Assumes the wh64 needs to be for 2 trips through the loop in the future
155	* The wh64 is issued for the starting destination address for trip +2
156	* through the loop, and if there are less than two trips left, the target
157	* address will be for the current trip.
158	*/
159
160	$do_wh64_b:
161	wh64 ($`4`) # L1 : memory subsystem write hint
162	subq $`3`, `24`, $`2` # E : For determining future wh64 addresses
163	stq $`17`, `0`($`5`) # L :
164	nop # E :
165
166	addq $`5`, `128`, $`4` # E : speculative target of next wh64
167	stq $`17`, `8`($`5`) # L :
168	stq $`17`, `16`($`5`) # L :
169	addq $`5`, `64`, $`7` # E : Fallback address for wh64 (== next trip addr)
170
171	stq $`17`, `24`($`5`) # L :
172	stq $`17`, `32`($`5`) # L :
173	cmovlt $`2`, $`7`, $`4` # E : Latency `2`, extra mapping cycle
174	nop
175
176	stq $`17`, `40`($`5`) # L :
177	stq $`17`, `48`($`5`) # L :
178	subq $`3`, `16`, $`2` # E : Repeat the loop at least once more?
179	nop
180
181	stq $`17`, `56`($`5`) # L :
182	addq $`5`, `64`, $`5` # E : advance to the next 64-byte block
183	subq $`3`, `8`, $`3` # E : eight quads were written this trip
184	bge $`2`, $do_wh64_b # U : at least 8 quads still left, go again
185
186	nop
187	nop
188	nop
189	beq $`3`, no_quad_b # U : Might have finished already
190
191	.align `4`
192	/*
193	* Simple loop for trailing quadwords, or for small amounts
194	* of data (where we can't use an unrolled loop and wh64)
195	*/
196	loop_b:
197	stq $`17`,`0`($`5`) # L : store one quad
198	subq $`3`,`1`,$`3` # E : Decrement number quads left
199	addq $`5`,`8`,$`5` # E : Inc address
200	bne $`3`,loop_b # U : more?
201
202	no_quad_b:
203	/*
204	* Write 0..7 trailing bytes.
	* $5 is the quad-aligned address of the final quad and $6 is the end
	* address, so the low three bits of $6 give the trailing byte count.
205	*/
206	nop # E :
207	beq $`18`,end_b # U : All done?
208	ldq $`7`,`0`($`5`) # L : fetch the final quad
209	mskqh $`7`,$`6`,$`2` # U : Mask final quad
210
211	insqh $`17`,$`6`,$`4` # U : New bits
212	bis $`2`,$`4`,$`1` # E : Put it all together
213	stq $`1`,`0`($`5`) # L : And back to memory
214	ret $`31`,($`26`),`1` # L0 :
215
216	within_quad_b:
	/*
	* The whole range lies inside one quadword: merge the new bytes into
	* the existing quad between the start ($16) and end ($6) offsets.
	*/
217	ldq_u $`1`,`0`($`16`) # L : fetch the quad containing the range
218	insql $`17`,$`16`,$`2` # U : New bits
219	mskql $`1`,$`16`,$`4` # U : Clear old
220	bis $`2`,$`4`,$`2` # E : New result
221
222	mskql $`2`,$`6`,$`4` # U : drop everything from the end offset up
223	mskqh $`1`,$`6`,$`2` # U : original bytes from the end offset up
224	bis $`2`,$`4`,$`1` # E : merge the two halves
225	stq_u $`1`,`0`($`16`) # L : store the merged quad
226
227	end_b:
228	nop
229	nop
230	nop
231	ret $`31`,($`26`),`1` # L0 :
232	.end ___memset
233	EXPORT_SYMBOL(___memset)
234
235	/*
236	* This is the original body of code, prior to replication and
237	* rescheduling. Leave it here, as there may be calls to this
238	* entry point.
	*
	* Register usage, as seen in the code below:
	* $16 - destination address on entry (copied to $0 as the return value)
	* $17 - quadword pattern; it is stored unchanged, so it presumably has
	*       to arrive with the fill constant already replicated across all
	*       eight bytes (confirm against callers)
	* $18 - byte count; a count <= 0 returns without storing anything
	* $6  - $16 + $18, i.e. the address one past the last byte to write
	* $26 - return address
	* Scratch: $1, $2, $3, $4, $5, $7
239	*/
240	.align `4`
241	.ent __constant_c_memset
242	__constant_c_memset:
243	.frame $`30`,`0`,$`26`,`0`
244	.prologue `0`
245
246	addq $`18`,$`16`,$`6` # E : max address to write to
247	bis $`16`,$`16`,$`0` # E : return value
248	xor $`16`,$`6`,$`1` # E : will complete write be within one quadword?
249	ble $`18`,end # U : zero (or negative) length requested?
250
251	bic $`1`,`7`,$`1` # E : fit within a single quadword
252	beq $`1`,within_one_quad # U : $16 and $6 lie in the same aligned quadword
253	and $`16`,`7`,$`3` # E : Target addr misalignment
254	beq $`3`,aligned # U : target is `0mod8`
255
256	/*
257	* Target address is misaligned, and won't fit within a quadword
	* Merge the new bytes into the first (partial) quadword, then step
	* $16 up to the next quadword boundary and shrink $18 to match.
258	*/
259	ldq_u $`4`,`0`($`16`) # L : Fetch first partial
260	bis $`16`,$`16`,$`5` # E : Save the address
261	insql $`17`,$`16`,$`2` # U : Insert new bytes
262	subq $`3`,`8`,$`3` # E : Invert (for addressing uses)
263
264	addq $`18`,$`3`,$`18` # E : $`18` is new count ($`3` is negative)
265	mskql $`4`,$`16`,$`4` # U : clear relevant parts of the quad
266	subq $`16`,$`3`,$`16` # E : $`16` is new aligned destination
267	bis $`2`,$`4`,$`1` # E : Final bytes
268
269	nop
270	stq_u $`1`,`0`($`5`) # L : Store result
271	nop
272	nop
273
274	.align `4`
275	aligned:
276	/*
277	* We are now guaranteed to be quad aligned, with at least
278	* one partial quad to write.
279	*/
280
281	sra $`18`,`3`,$`3` # U : Number of remaining quads to write
282	and $`18`,`7`,$`18` # E : Number of trailing bytes to write
283	bis $`16`,$`16`,$`5` # E : Save dest address
284	beq $`3`,no_quad # U : tail stuff only
285
286	/*
287	* it's worth the effort to unroll this and use wh64 if possible
288	* Lifted a bunch of code from clear_user.S
289	* At this point, entry values are:
290	* $16 Current destination address
291	* $5 A copy of $16
292	* $6 The max quadword address to write to
293	* $18 Number trailer bytes
294	* $3 Number quads to write
295	*/
296
297	and $`16`, `0x3f`, $`2` # E : Forward work (only useful for unrolled loop)
298	subq $`3`, `16`, $`4` # E : Only try to unroll if >= 128 bytes (16 quads)
299	subq $`2`, `0x40`, $`1` # E : bias counter (aligning stuff `0mod64`)
300	blt $`4`, loop # U : fewer than 16 quads, use the simple loop
301
302	/*
303	* We know we've got at least 16 quads, minimum of one trip
304	* through unrolled loop. Do a quad at a time to get us 0mod64
305	* aligned.
	*
	* NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-64, -1], so the
	* beq below looks like it can never be taken; a destination that is
	* already 0mod64 instead takes eight trips through $alignmod64.
	* That loop is also the only place $4 is given a wh64 address
	* (until then $4 holds $3 - 16). Confirm before changing either.
306	*/
307
308	nop # E :
309	nop # E :
310	nop # E :
311	beq $`1`, $bigalign # U : skip alignment if the bias counter is zero
312
313	$alignmod64:
314	stq $`17`, `0`($`5`) # L : store one quad while aligning
315	subq $`3`, `1`, $`3` # E : For consistency later
316	addq $`1`, `8`, $`1` # E : Increment towards zero for alignment
317	addq $`5`, `8`, $`4` # E : Initial wh64 address (filler instruction)
318
319	nop
320	nop
321	addq $`5`, `8`, $`5` # E : Inc address
322	blt $`1`, $alignmod64 # U : repeat until the bias counter reaches zero
323
324	$bigalign:
325	/*
326	* $3 - number quads left to go
327	* $5 - target address (aligned 0mod64)
328	* $17 - mask of stuff to store
329	* Scratch registers available: $7, $2, $4, $1
330	* we know that we'll be taking a minimum of one trip through the loop below.
331	* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
332	* Assumes the wh64 needs to be for 2 trips through the loop in the future
333	* The wh64 is issued for the starting destination address for trip +2
334	* through the loop, and if there are less than two trips left, the target
335	* address will be for the current trip.
336	*/
337
338	$do_wh64:
339	wh64 ($`4`) # L1 : memory subsystem write hint
340	subq $`3`, `24`, $`2` # E : For determining future wh64 addresses
341	stq $`17`, `0`($`5`) # L :
342	nop # E :
343
344	addq $`5`, `128`, $`4` # E : speculative target of next wh64
345	stq $`17`, `8`($`5`) # L :
346	stq $`17`, `16`($`5`) # L :
347	addq $`5`, `64`, $`7` # E : Fallback address for wh64 (== next trip addr)
348
349	stq $`17`, `24`($`5`) # L :
350	stq $`17`, `32`($`5`) # L :
351	cmovlt $`2`, $`7`, $`4` # E : Latency `2`, extra mapping cycle
352	nop
353
354	stq $`17`, `40`($`5`) # L :
355	stq $`17`, `48`($`5`) # L :
356	subq $`3`, `16`, $`2` # E : Repeat the loop at least once more?
357	nop
358
359	stq $`17`, `56`($`5`) # L :
360	addq $`5`, `64`, $`5` # E : advance to the next 64-byte block
361	subq $`3`, `8`, $`3` # E : eight quads were written this trip
362	bge $`2`, $do_wh64 # U : at least 8 quads still left, go again
363
364	nop
365	nop
366	nop
367	beq $`3`, no_quad # U : Might have finished already
368
369	.align `4`
370	/*
371	* Simple loop for trailing quadwords, or for small amounts
372	* of data (where we can't use an unrolled loop and wh64)
373	*/
374	loop:
375	stq $`17`,`0`($`5`) # L : store one quad
376	subq $`3`,`1`,$`3` # E : Decrement number quads left
377	addq $`5`,`8`,$`5` # E : Inc address
378	bne $`3`,loop # U : more?
379
380	no_quad:
381	/*
382	* Write 0..7 trailing bytes.
	* $5 is the quad-aligned address of the final quad and $6 is the end
	* address, so the low three bits of $6 give the trailing byte count.
383	*/
384	nop # E :
385	beq $`18`,end # U : All done?
386	ldq $`7`,`0`($`5`) # L : fetch the final quad
387	mskqh $`7`,$`6`,$`2` # U : Mask final quad
388
389	insqh $`17`,$`6`,$`4` # U : New bits
390	bis $`2`,$`4`,$`1` # E : Put it all together
391	stq $`1`,`0`($`5`) # L : And back to memory
392	ret $`31`,($`26`),`1` # L0 :
393
394	within_one_quad:
	/*
	* The whole range lies inside one quadword: merge the new bytes into
	* the existing quad between the start ($16) and end ($6) offsets.
	*/
395	ldq_u $`1`,`0`($`16`) # L : fetch the quad containing the range
396	insql $`17`,$`16`,$`2` # U : New bits
397	mskql $`1`,$`16`,$`4` # U : Clear old
398	bis $`2`,$`4`,$`2` # E : New result
399
400	mskql $`2`,$`6`,$`4` # U : drop everything from the end offset up
401	mskqh $`1`,$`6`,$`2` # U : original bytes from the end offset up
402	bis $`2`,$`4`,$`1` # E : merge the two halves
403	stq_u $`1`,`0`($`16`) # L : store the merged quad
404
405	end:
406	nop
407	nop
408	nop
409	ret $`31`,($`26`),`1` # L0 :
410	.end __constant_c_memset
411	EXPORT_SYMBOL(__constant_c_memset)
412
413	/*
414	* This is a replica of the __constant_c_memset code, rescheduled
415	* to mask stalls. Note that entry point names also had to change
	*
	* Register usage, as seen in the code below:
	* $16 - destination address on entry (copied to $0 as the return value)
	* $17 - fill value; its low 16 bits (c1c2 in the comments below) are
	*       replicated into all four 16-bit lanes of $17 before any store
	* $18 - count in bytes (it is added to $16 to form the end address and
	*       shifted right by 3 to get quadwords); a count <= 0 returns
	*       without storing anything
	* $6  - $16 + $18, i.e. the address one past the last byte to write
	* $26 - return address
	* Scratch: $1, $2, $3, $4, $5, $7
416	*/
417	.align `5`
418	.ent __memset16
419
420	__memset16:
421	.frame $`30`,`0`,$`26`,`0`
422	.prologue `0`
423
424	inswl $`17`,`0`,$`5` # U : `000000000000c1c2`
425	inswl $`17`,`2`,$`2` # U : `00000000c1c20000`
426	bis $`16`,$`16`,$`0` # E : return value
427	addq $`18`,$`16`,$`6` # E : max address to write to
428
429	ble $`18`, end_w # U : zero (or negative) length requested?
430	inswl $`17`,`4`,$`3` # U : `0000c1c200000000`
431	inswl $`17`,`6`,$`4` # U : c1c2000000000000
432	xor $`16`,$`6`,$`1` # E : will complete write be within one quadword?
433
434	or $`2`,$`5`,$`2` # E : `00000000c1c2c1c2`
435	or $`3`,$`4`,$`17` # E : c1c2c1c200000000
436	bic $`1`,`7`,$`1` # E : fit within a single quadword
437	and $`16`,`7`,$`3` # E : Target addr misalignment
438
439	or $`17`,$`2`,$`17` # E : c1c2c1c2c1c2c1c2
440	beq $`1`,within_quad_w # U : $16 and $6 lie in the same aligned quadword
441	nop
442	beq $`3`,aligned_w # U : target is `0mod8`
443
444	/*
445	* Target address is misaligned, and won't fit within a quadword
	* Merge the new bytes into the first (partial) quadword, then step
	* $16 up to the next quadword boundary and shrink $18 to match.
446	*/
447	ldq_u $`4`,`0`($`16`) # L : Fetch first partial
448	bis $`16`,$`16`,$`5` # E : Save the address
449	insql $`17`,$`16`,$`2` # U : Insert new bytes
450	subq $`3`,`8`,$`3` # E : Invert (for addressing uses)
451
452	addq $`18`,$`3`,$`18` # E : $`18` is new count ($`3` is negative)
453	mskql $`4`,$`16`,$`4` # U : clear relevant parts of the quad
454	subq $`16`,$`3`,$`16` # E : $`16` is new aligned destination
455	bis $`2`,$`4`,$`1` # E : Final bytes
456
457	nop
458	stq_u $`1`,`0`($`5`) # L : Store result
459	nop
460	nop
461
462	.align `4`
463	aligned_w:
464	/*
465	* We are now guaranteed to be quad aligned, with at least
466	* one partial quad to write.
467	*/
468
469	sra $`18`,`3`,$`3` # U : Number of remaining quads to write
470	and $`18`,`7`,$`18` # E : Number of trailing bytes to write
471	bis $`16`,$`16`,$`5` # E : Save dest address
472	beq $`3`,no_quad_w # U : tail stuff only
473
474	/*
475	* it's worth the effort to unroll this and use wh64 if possible
476	* Lifted a bunch of code from clear_user.S
477	* At this point, entry values are:
478	* $16 Current destination address
479	* $5 A copy of $16
480	* $6 The max quadword address to write to
481	* $18 Number trailer bytes
482	* $3 Number quads to write
483	*/
484
485	and $`16`, `0x3f`, $`2` # E : Forward work (only useful for unrolled loop)
486	subq $`3`, `16`, $`4` # E : Only try to unroll if >= 128 bytes (16 quads)
487	subq $`2`, `0x40`, $`1` # E : bias counter (aligning stuff `0mod64`)
488	blt $`4`, loop_w # U : fewer than 16 quads, use the simple loop
489
490	/*
491	* We know we've got at least 16 quads, minimum of one trip
492	* through unrolled loop. Do a quad at a time to get us 0mod64
493	* aligned.
	*
	* NOTE(review): $1 = ($16 & 0x3f) - 0x40 lies in [-64, -1], so the
	* beq below looks like it can never be taken; a destination that is
	* already 0mod64 instead takes eight trips through $alignmod64_w.
	* That loop is also the only place $4 is given a wh64 address
	* (until then $4 holds $3 - 16). Confirm before changing either.
494	*/
495
496	nop # E :
497	nop # E :
498	nop # E :
499	beq $`1`, $bigalign_w # U : skip alignment if the bias counter is zero
500
501	$alignmod64_w:
502	stq $`17`, `0`($`5`) # L : store one quad while aligning
503	subq $`3`, `1`, $`3` # E : For consistency later
504	addq $`1`, `8`, $`1` # E : Increment towards zero for alignment
505	addq $`5`, `8`, $`4` # E : Initial wh64 address (filler instruction)
506
507	nop
508	nop
509	addq $`5`, `8`, $`5` # E : Inc address
510	blt $`1`, $alignmod64_w # U : repeat until the bias counter reaches zero
511
512	$bigalign_w:
513	/*
514	* $3 - number quads left to go
515	* $5 - target address (aligned 0mod64)
516	* $17 - mask of stuff to store
517	* Scratch registers available: $7, $2, $4, $1
518	* we know that we'll be taking a minimum of one trip through the loop below.
519	* CWG Section 3.7.6: do not expect a sustained store rate of > 1/cycle
520	* Assumes the wh64 needs to be for 2 trips through the loop in the future
521	* The wh64 is issued for the starting destination address for trip +2
522	* through the loop, and if there are less than two trips left, the target
523	* address will be for the current trip.
524	*/
525
526	$do_wh64_w:
527	wh64 ($`4`) # L1 : memory subsystem write hint
528	subq $`3`, `24`, $`2` # E : For determining future wh64 addresses
529	stq $`17`, `0`($`5`) # L :
530	nop # E :
531
532	addq $`5`, `128`, $`4` # E : speculative target of next wh64
533	stq $`17`, `8`($`5`) # L :
534	stq $`17`, `16`($`5`) # L :
535	addq $`5`, `64`, $`7` # E : Fallback address for wh64 (== next trip addr)
536
537	stq $`17`, `24`($`5`) # L :
538	stq $`17`, `32`($`5`) # L :
539	cmovlt $`2`, $`7`, $`4` # E : Latency `2`, extra mapping cycle
540	nop
541
542	stq $`17`, `40`($`5`) # L :
543	stq $`17`, `48`($`5`) # L :
544	subq $`3`, `16`, $`2` # E : Repeat the loop at least once more?
545	nop
546
547	stq $`17`, `56`($`5`) # L :
548	addq $`5`, `64`, $`5` # E : advance to the next 64-byte block
549	subq $`3`, `8`, $`3` # E : eight quads were written this trip
550	bge $`2`, $do_wh64_w # U : at least 8 quads still left, go again
551
552	nop
553	nop
554	nop
555	beq $`3`, no_quad_w # U : Might have finished already
556
557	.align `4`
558	/*
559	* Simple loop for trailing quadwords, or for small amounts
560	* of data (where we can't use an unrolled loop and wh64)
561	*/
562	loop_w:
563	stq $`17`,`0`($`5`) # L : store one quad
564	subq $`3`,`1`,$`3` # E : Decrement number quads left
565	addq $`5`,`8`,$`5` # E : Inc address
566	bne $`3`,loop_w # U : more?
567
568	no_quad_w:
569	/*
570	* Write 0..7 trailing bytes.
	* $5 is the quad-aligned address of the final quad and $6 is the end
	* address, so the low three bits of $6 give the trailing byte count.
571	*/
572	nop # E :
573	beq $`18`,end_w # U : All done?
574	ldq $`7`,`0`($`5`) # L : fetch the final quad
575	mskqh $`7`,$`6`,$`2` # U : Mask final quad
576
577	insqh $`17`,$`6`,$`4` # U : New bits
578	bis $`2`,$`4`,$`1` # E : Put it all together
579	stq $`1`,`0`($`5`) # L : And back to memory
580	ret $`31`,($`26`),`1` # L0 :
581
582	within_quad_w:
	/*
	* The whole range lies inside one quadword: merge the new bytes into
	* the existing quad between the start ($16) and end ($6) offsets.
	*/
583	ldq_u $`1`,`0`($`16`) # L : fetch the quad containing the range
584	insql $`17`,$`16`,$`2` # U : New bits
585	mskql $`1`,$`16`,$`4` # U : Clear old
586	bis $`2`,$`4`,$`2` # E : New result
587
588	mskql $`2`,$`6`,$`4` # U : drop everything from the end offset up
589	mskqh $`1`,$`6`,$`2` # U : original bytes from the end offset up
590	bis $`2`,$`4`,$`1` # E : merge the two halves
591	stq_u $`1`,`0`($`16`) # L : store the merged quad
592
593	end_w:
594	nop
595	nop
596	nop
597	ret $`31`,($`26`),`1` # L0 :
598
599	.end __memset16
600	EXPORT_SYMBOL(__memset16)
601
602	memset = ___memset # memset is a symbol alias for the ___memset entry point
603	__memset = ___memset # __memset is a symbol alias for the same entry point
604	EXPORT_SYMBOL(memset)
605	EXPORT_SYMBOL(__memset)
606

source code of linux/arch/alpha/lib/ev6-memset.S