memcpy-sh4.S source code [linux/arch/sh/lib/memcpy-sh4.S]

1	/* SPDX-License-Identifier: GPL-2.0 */
2	/*
3	* "memcpy" implementation of SuperH
4	*
5	* Copyright (C) 1999 Niibe Yutaka
6	* Copyright (c) 2002 STMicroelectronics Ltd
7	* Modified from memcpy.S and micro-optimised for SH4
8	* Stuart Menefy (stuart.menefy@st.com)
9	*
10	*/
11	#include <linux/linkage.h>
12
13	/*
14	* void *memcpy(void *dst, const void *src, size_t n);
15	*
16	* It is assumed that there is no overlap between src and dst.
17	* If there is an overlap, then the results are undefined.
18	*/
19
20	!
21	! GHIJ KLMN OPQR --> ...G HIJK LMNO PQR.
22	!
23
24	! Size is `16` or greater, and may have trailing bytes
25
26	.balign `32`
27	.Lcase1:
	! Bulk copy for the case where (src - dst) is 1 modulo 4; reached from
	! the "bit 0 set" dispatch in memcpy. As set up there: r0 = current dst
	! pointer (long word aligned, moved downwards by the @-r0 stores),
	! r4 = dst start, r5 = src - dst.
	! Uses r1-r3, r6, r7 as scratch and modifies r5.
28	! Read a long word and write a long word at once
29	! At the start of each iteration, r7 contains last long load
30	add #-`1`,r5 ! `79` EX ! bias r5 by -1 so that r0+r5 is long word aligned
31	mov r4,r2 ! `5` MT (`0` cycles latency)
32
33	mov.l @(r0,r5),r7 ! `21` LS (`2` cycles latency) ! prime r7 with the topmost source long
34	add #-`4`,r5 ! `50` EX ! r0+r5 now addresses the next lower source long
35
36	add #`7`,r2 ! `79` EX ! r2 = dst + 7, lower bound tested by the long loop
37	!
38	#ifdef CONFIG_CPU_LITTLE_ENDIAN
39	! `6` cycles, `4` bytes per iteration
40	`3`: mov.l @(r0,r5),r1 ! `21` LS (latency=`2`) ! NMLK
41	mov r7, r3 ! `5` MT (latency=`0`) ! RQPO
42
43	cmp/hi r2,r0 ! `57` MT ! T = (r0 > dst + 7), evaluated before the store below
44	shll16 r3 ! `103` EX
45
46	mov r1,r6 ! `5` MT (latency=`0`)
47	shll8 r3 ! `102` EX ! Oxxx
48
49	shlr8 r6 ! `106` EX ! xNML
50	mov r1, r7 ! `5` MT (latency=`0`) ! keep this load for the next iteration
51
52	or r6,r3 ! `82` EX ! ONML
53	bt/s `3b` ! `109` BR ! delayed branch: the store below runs either way
54
55	mov.l r3,@-r0 ! `30` LS ! delay slot: write the merged long, r0 -= 4
56	#else
57	`3`: mov.l @(r0,r5),r1 ! `21` LS (latency=`2`) ! KLMN
58	mov r7,r3 ! `5` MT (latency=`0`) ! OPQR
59
60	cmp/hi r2,r0 ! `57` MT ! T = (r0 > dst + 7), evaluated before the store below
61	shlr16 r3 ! `107` EX
62
63	shlr8 r3 ! `106` EX ! xxxO
64	mov r1,r6 ! `5` MT (latency=`0`)
65
66	shll8 r6 ! `102` EX ! LMNx
67	mov r1,r7 ! `5` MT (latency=`0`) ! keep this load for the next iteration
68
69	or r6,r3 ! `82` EX ! LMNO
70	bt/s `3b` ! `109` BR ! delayed branch: the store below runs either way
71
72	mov.l r3,@-r0 ! `30` LS ! delay slot: write the merged long, r0 -= 4
73	#endif
74	! Finally, copy a byte at once, if necessary
75
76	add #`4`,r5 ! `50` EX ! r5 = (src - dst) - 1, the bias used by the byte loop
77	cmp/eq r4,r0 ! `54` MT ! T = nothing left to copy
78
79	add #-`6`,r2 ! `50` EX ! r2 = dst + 1
80	bt `9f` ! `109` BR
81
82	`8`: cmp/hi r2,r0 ! `57` MT ! T = more than one byte still to copy
83	mov.b @(r0,r5),r1 ! `20` LS (latency=`2`)
84
85	bt/s `8b` ! `109` BR
86
87	mov.b r1,@-r0 ! `29` LS ! delay slot: store the byte, r0 -= 1
88
89	`9`: rts
90	nop
91
92
93	!
94	! GHIJ KLMN OPQR --> .GHI JKLM NOPQ R...
95	!
96
97	! Size is `16` or greater, and may have trailing bytes
98
99	.balign `32`
100	.Lcase3:
	! Bulk copy for the case where (src - dst) is 3 modulo 4; reached from
	! the "bit 0 set" dispatch in memcpy when bit 1 of r5 is also set.
	! As set up there: r0 = current dst pointer (long word aligned, moved
	! downwards by the @-r0 stores), r4 = dst start, r5 = src - dst.
	! Uses r1-r3, r6, r7 as scratch and modifies r5.
101	! Read a long word and write a long word at once
102	! At the start of each iteration, r7 contains last long load
103	add #-`3`,r5 ! `79` EX ! bias r5 by -3 so that r0+r5 is long word aligned
104	mov r4,r2 ! `5` MT (`0` cycles latency)
105
106	mov.l @(r0,r5),r7 ! `21` LS (`2` cycles latency) ! prime r7 with the topmost source long
107	add #-`4`,r5 ! `50` EX ! r0+r5 now addresses the next lower source long
108
109	add #`7`,r2 ! `79` EX ! r2 = dst + 7, lower bound tested by the long loop
110	!
111	#ifdef CONFIG_CPU_LITTLE_ENDIAN
112	! `6` cycles, `4` bytes per iteration
113	`3`: mov.l @(r0,r5),r1 ! `21` LS (latency=`2`) ! NMLK
114	mov r7, r3 ! `5` MT (latency=`0`) ! RQPO
115
116	cmp/hi r2,r0 ! `57` MT ! T = (r0 > dst + 7), evaluated before the store below
117	shll8 r3 ! `102` EX ! QPOx
118
119	mov r1,r6 ! `5` MT (latency=`0`)
120	shlr16 r6 ! `107` EX
121
122	shlr8 r6 ! `106` EX ! xxxN
123	mov r1, r7 ! `5` MT (latency=`0`) ! keep this load for the next iteration
124
125	or r6,r3 ! `82` EX ! QPON
126	bt/s `3b` ! `109` BR ! delayed branch: the store below runs either way
127
128	mov.l r3,@-r0 ! `30` LS ! delay slot: write the merged long, r0 -= 4
129	#else
	! Big-endian variant: here the new load goes straight into r7 and the
	! piece of the previous load is shifted into r3 before r7 is overwritten.
130	`3`: mov r7,r3 ! OPQR
131	shlr8 r3 ! xOPQ
132	mov.l @(r0,r5),r7 ! KLMN
133	mov r7,r6
134	shll16 r6
135	shll8 r6 ! Nxxx
136	or r6,r3 ! NOPQ
137	cmp/hi r2,r0 ! T = (r0 > dst + 7), evaluated before the store below
138	bt/s `3b`
139	mov.l r3,@-r0 ! delay slot: write the merged long, r0 -= 4
140	#endif
141
142	! Finally, copy a byte at once, if necessary
143
144	add #`6`,r5 ! `50` EX ! -3 - 4 + 6: r5 = (src - dst) - 1, the byte loop bias
145	cmp/eq r4,r0 ! `54` MT ! T = nothing left to copy
146
147	add #-`6`,r2 ! `50` EX ! r2 = dst + 1
148	bt `9f` ! `109` BR
149
150	`8`: cmp/hi r2,r0 ! `57` MT ! T = more than one byte still to copy
151	mov.b @(r0,r5),r1 ! `20` LS (latency=`2`)
152
153	bt/s `8b` ! `109` BR
154
155	mov.b r1,@-r0 ! `29` LS ! delay slot: store the byte, r0 -= 1
156
157	`9`: rts
158	nop
159
160	ENTRY(memcpy)
	! Entry point. Register use as read from the code below:
	!   r4 = dst, r5 = src, r6 = byte count.
	! The copy runs from the END of the buffers towards the start: r0 is
	! set to dst + n and every store uses pre-decrement (@-r0), while r5 is
	! turned into the offset (src - dst) so that @(r0,r5) addresses the
	! source byte that belongs to destination address r0.
	! The zero-length exit returns dst in r0 explicitly; the other exits
	! return with r0 as left by the backward copy (presumably == dst;
	! confirm against callers that use the return value).
161
162	! Calculate the invariants which will be used in the remainder
163	! of the code:
164	!
165	! r4 --> [ ... ] DST [ ... ] SRC
166	! [ ... ] [ ... ]
167	! : :
168	! r0 --> [ ... ] r0+r5 --> [ ... ]
169	!
170	!
171
172	! Short circuit the common case of src, dst and len being `32` bit aligned
173	! and test for zero length move
174
175	mov r6, r0 ! `5` MT (`0` cycle latency)
176	or r4, r0 ! `82` EX
177
178	or r5, r0 ! `82` EX ! r0 = n | dst | src, low bits reveal any misalignment
179	tst r6, r6 ! `86` MT ! T = (n == 0)
180
181	bt/s `99f` ! `111` BR (zero len)
182	tst #`3`, r0 ! `87` MT ! delay slot: T = dst, src and n are all multiples of 4
183
184	mov r4, r0 ! `5` MT (`0` cycle latency)
185	add r6, r0 ! `49` EX ! r0 = dst + n (one past the last destination byte)
186
187	mov #`16`, r1 ! `6` EX ! threshold for the "small copy" test below
188	bt/s .Lcase00 ! `111` BR (aligned)
189
190	sub r4, r5 ! `75` EX ! delay slot: r5 = src - dst (the invariant offset)
191
192	! Arguments are not nicely long word aligned or zero len.
193	! Check for small copies, and if so do a simple byte at a time copy.
194	!
195	! Deciding on an exact value of `'small'` is not easy, as the point at which
196	! using the optimised routines become worthwhile varies (these are the
197	! cycle counts for different sizes using byte-at-a-time vs. optimised):
198	! size byte-at-time long word byte
199	! `16` `42` `39`-`40` `46`-`50` `50`-`55`
200	! `24` `58` `43`-`44` `54`-`58` `62`-`67`
201	! `36` `82` `49`-`50` `66`-`70` `80`-`85`
202	! However the penalty for getting it `'wrong'` is much higher for long word
203	! aligned data (and this is more common), so use a value of `16.`
204
205	cmp/gt r6,r1 ! `56` MT ! T = (16 > n), i.e. the copy is "small"
206
207	add #-`1`,r5 ! `50` EX ! r5 = (src - dst) - 1: @(r0,r5) is the byte just below r0
208	bf/s `6f` ! `108` BR (not small)
209
210	mov r5, r3 ! `5` MT (latency=`0`) ! delay slot
	! Small copy: two bytes per iteration. r6 becomes the pair count and
	! the bit shifted out tells whether there is an odd byte to do first.
211	shlr r6 ! `104` EX ! r6 = n / 2, T = (n was odd)
212
213	mov.b @(r0,r5),r1 ! `20` LS (latency=`2`) ! last source byte
214	bf/s `4f` ! `111` BR ! even count: enter the loop at its second load
215
216	add #-`1`,r3 ! `50` EX ! delay slot: r3 addresses the byte one below r5's
217	tst r6, r6 ! `86` MT ! T = no pairs at all (n == 1)
218
219	bt/s `98f` ! `110` BR
220	mov.b r1,@-r0 ! `29` LS ! delay slot: store the odd byte
221
222	! `4` cycles, `2` bytes per iteration
223	`3`: mov.b @(r0,r5),r1 ! `20` LS (latency=`2`)
224
225	`4`: mov.b @(r0,r3),r2 ! `20` LS (latency=`2`)
226	dt r6 ! `67` EX ! r6 -= 1, T = (r6 == 0)
227
228	mov.b r1,@-r0 ! `29` LS
229	bf/s `3b` ! `111` BR
230
231	mov.b r2,@-r0 ! `29` LS ! delay slot
232	`98`:
233	rts
234	nop
235
236	`99`: rts
237	mov r4, r0 ! delay slot: zero-length copy returns dst
238
239	! Size is not small, so it's worthwhile looking for optimisations.
240	! First align destination to a long word boundary.
241	!
242	! r5 = normal value -`1`
243
244	`6`: tst #`3`, r0 ! `87` MT ! T = r0 (dst end) is already long word aligned
245	mov #`3`, r3 ! `6` EX
246
247	bt/s `2f` ! `111` BR
248	and r0,r3 ! `78` EX ! delay slot: r3 = r0 & 3 = bytes to copy to align r0
249
250	! `3` cycles, `1` byte per iteration
251	`1`: dt r3 ! `67` EX
252	mov.b @(r0,r5),r1 ! `19` LS (latency=`2`)
253
254	add #-`1`, r6 ! `79` EX ! keep the remaining length in r6 up to date
255	bf/s `1b` ! `109` BR
256
257	mov.b r1,@-r0 ! `28` LS ! delay slot
258
259	`2`: add #`1`, r5 ! `79` EX ! undo the -1 bias: r5 = src - dst again
260
261	! Now select the appropriate bulk transfer code based on relative
262	! alignment of src and dst.
	! The low two bits of r5 (src - dst) choose .Lcase0/1/2/3. For the
	! even cases, a remaining length of 64 or more selects the "b" (big)
	! variants. r0 is borrowed for the immediate tests, so the dst
	! pointer is parked in r3 and restored in the branch delay slots.
263
264	mov r0, r3 ! `5` MT (latency=`0`)
265
266	mov r5, r0 ! `5` MT (latency=`0`)
267	tst #`1`, r0 ! `87` MT ! T = bit 0 of (src - dst) is clear
268
269	bf/s `1f` ! `111` BR
270	mov #`64`, r7 ! `6` EX ! delay slot: size threshold for the big variants
271
272	! bit `0` clear
273
274	cmp/ge r7, r6 ! `55` MT ! T = (remaining length >= 64)
275
276	bt/s `2f` ! `111` BR
277	tst #`2`, r0 ! `87` MT ! delay slot: T = bit 1 of (src - dst) is clear
278
279	! small
280	bt/s .Lcase0
281	mov r3, r0 ! delay slot: restore the dst pointer (also runs if not taken)
282
283	bra .Lcase2
284	nop
285
286	! big
287	`2`: bt/s .Lcase0b
288	mov r3, r0 ! delay slot: restore the dst pointer (also runs if not taken)
289
290	bra .Lcase2b
291	nop
292
293	! bit `0` set
294	`1`: tst #`2`, r0 ! `87` MT ! T = bit 1 of (src - dst) is clear
295
296	bt/s .Lcase1
297	mov r3, r0 ! delay slot: restore the dst pointer (also runs if not taken)
298
299	bra .Lcase3
300	nop
301
302
303	!
304	! GHIJ KLMN OPQR --> GHIJ KLMN OPQR
305	!
306
307	! src, dst and size are all long word aligned
308	! size is non-zero
309
310	.balign `32`
311	.Lcase00:
	! Fast path taken straight from the memcpy prologue when dst, src and
	! n are all multiples of 4 and n is non-zero.
	! On entry: r0 = dst + n, r5 = src - dst, r6 = n.
	! Copies below 64 bytes are done here as pairs of long words; larger
	! ones are handed to .Lcase00b.
312	mov #`64`, r1 ! `6` EX
313	mov r5, r3 ! `5` MT (latency=`0`)
314
315	cmp/gt r6, r1 ! `56` MT ! T = (64 > n)
316	add #-`4`, r5 ! `50` EX ! @(r0,r5) now addresses the long just below r0
317
318	bf .Lcase00b ! `108` BR (big loop)
319	shlr2 r6 ! `105` EX ! r6 = number of long words
320
321	shlr r6 ! `104` EX ! r6 = number of long word pairs, T = odd long count
322	mov.l @(r0, r5), r1 ! `21` LS (latency=`2`)
323
324	bf/s `4f` ! `111` BR ! even count: enter the loop at its second load
325	add #-`8`, r3 ! `50` EX ! delay slot: r3 = (src - dst) - 8, one long below r5
326
327	tst r6, r6 ! `86` MT ! T = no pairs, only the single odd long
328	bt/s `5f` ! `110` BR
329
330	mov.l r1,@-r0 ! `30` LS ! delay slot: store the odd long
331
332	! `4` cycles, `2` long words per iteration
333	`3`: mov.l @(r0, r5), r1 ! `21` LS (latency=`2`)
334
335	`4`: mov.l @(r0, r3), r2 ! `21` LS (latency=`2`)
336	dt r6 ! `67` EX ! r6 -= 1, T = (r6 == 0)
337
338	mov.l r1, @-r0 ! `30` LS
339	bf/s `3b` ! `109` BR
340
341	mov.l r2, @-r0 ! `30` LS ! delay slot
342
343	`5`: rts
344	nop
345
346
347	! Size is `16` or greater and less than `64`, but may have trailing bytes
348
349	.balign `32`
350	.Lcase0:
	! Reached from the dispatch in memcpy when (src - dst) is a multiple
	! of 4 and fewer than 64 bytes remain.
	! On entry: r0 = current dst pointer (long word aligned), r4 = dst
	! start, r5 = src - dst, r6 = remaining length.
	! Copies pairs of long words, then finishes with a byte loop.
351	add #-`4`, r5 ! `50` EX ! @(r0,r5) now addresses the long just below r0
352	mov r4, r7 ! `5` MT (latency=`0`)
353
354	mov.l @(r0, r5), r1 ! `21` LS (latency=`2`)
355	mov #`4`, r2 ! `6` EX
356
357	add #`11`, r7 ! `50` EX ! r7 = dst + 11, lower bound tested by the pair loop
358	tst r2, r6 ! `86` MT ! T = bit 2 of the length is clear
359
360	mov r5, r3 ! `5` MT (latency=`0`)
361	bt/s `4f` ! `111` BR ! bit clear: enter the loop at its second load
362
363	add #-`4`, r3 ! `50` EX ! delay slot: r3 addresses one long below r5's
364	mov.l r1,@-r0 ! `30` LS ! bit set: store a single long before the pairs
365
366	! `4` cycles, `2` long words per iteration
367	`3`: mov.l @(r0, r5), r1 ! `21` LS (latency=`2`)
368
369	`4`: mov.l @(r0, r3), r2 ! `21` LS (latency=`2`)
370	cmp/hi r7, r0 ! T = (r0 > dst + 11), evaluated before the two stores
371
372	mov.l r1, @-r0 ! `30` LS
373	bt/s `3b` ! `109` BR
374
375	mov.l r2, @-r0 ! `30` LS ! delay slot
376
377	! Copy the final `0`-`3` bytes
378
379	add #`3`,r5 ! `50` EX ! r5 = (src - dst) - 1, the byte loop bias
380
381	cmp/eq r0, r4 ! `54` MT ! T = nothing left to copy
382	add #-`10`, r7 ! `50` EX ! r7 = dst + 1
383
384	bt `9f` ! `110` BR
385
386	! `3` cycles, `1` byte per iteration
387	`1`: mov.b @(r0,r5),r1 ! `19` LS
388	cmp/hi r7,r0 ! `57` MT ! T = more than one byte still to copy
389
390	bt/s `1b` ! `111` BR
391	mov.b r1,@-r0 ! `28` LS ! delay slot
392
393	`9`: rts
394	nop
395
396	! Size is at least `64` bytes, so will be going round the big loop at least once.
397	!
398	! r2 = rounded up r4
399	! r3 = rounded down r0
400
401	.balign `32`
402	.Lcase0b:
	! Large copy (64 bytes or more remaining) with (src - dst) a multiple
	! of 4. Two entry points: .Lcase0b from the dispatch in memcpy (r5 still
	! equals src - dst, so it is biased by -4 here) and .Lcase00b from
	! .Lcase00, which has already applied that bias.
	! Phases: long copies until r0 is 32-byte aligned, 32-byte blocks using
	! movca.l for the first store of each block, then the tail in longs
	! and bytes. r8-r11 are saved on the r15 stack around the block loop.
403	add #-`4`, r5 ! `50` EX
404
405	.Lcase00b:
406	mov r0, r3 ! `5` MT (latency=`0`)
407	mov #(~`0x1f`), r1 ! `6` EX ! mask that clears the low five address bits
408
409	and r1, r3 ! `78` EX ! r3 = r0 rounded down to a 32-byte boundary
410	mov r4, r2 ! `5` MT (latency=`0`)
411
412	cmp/eq r3, r0 ! `54` MT ! T = r0 is already 32-byte aligned
413	add #`0x1f`, r2 ! `50` EX
414
415	bt/s `1f` ! `110` BR
416	and r1, r2 ! `78` EX ! delay slot: r2 = dst rounded up to a 32-byte boundary
417
418	! copy initial words until cache line aligned
419
420	mov.l @(r0, r5), r1 ! `21` LS (latency=`2`)
421	tst #`4`, r0 ! `87` MT ! T = an even number of longs separates r0 from r3
422
423	mov r5, r6 ! `5` MT (latency=`0`)
424	add #-`4`, r6 ! `50` EX ! r6 addresses one long below r5's
425
426	bt/s `4f` ! `111` BR ! even: enter the pair loop at its second load
427	add #`8`, r3 ! `50` EX ! delay slot: the pair loop stops when r0 == boundary + 8
428
429	tst #`0x18`, r0 ! `87` MT ! T = only this one long is needed to reach the boundary
430
431	bt/s `1f` ! `109` BR
432	mov.l r1,@-r0 ! `30` LS ! delay slot: store the single odd long
433
434	! `4` cycles, `2` long words per iteration
435	`3`: mov.l @(r0, r5), r1 ! `21` LS (latency=`2`)
436
437	`4`: mov.l @(r0, r6), r7 ! `21` LS (latency=`2`)
438	cmp/eq r3, r0 ! `54` MT ! tested before the two stores that finish the alignment
439
440	mov.l r1, @-r0 ! `30` LS
441	bf/s `3b` ! `109` BR
442
443	mov.l r7, @-r0 ! `30` LS ! delay slot
444
445	! Copy the cache line aligned blocks
446	!
447	! In use: r0, r2, r4, r5
448	! Scratch: r1, r3, r6, r7
449	!
450	! We could do this with the four scratch registers, but if src
451	! and dest hit the same cache line, this will thrash, so make
452	! use of additional registers.
453	!
454	! We also need r0 as a temporary (for movca), so `'undo'` the invariant:
455	! r5: src (was r0+r5)
456	! r1: dest (was r0)
457	! this can be reversed at the end, so we don't need to save any extra
458	! state.
459	!
460	`1`: mov.l r8, @-r15 ! `30` LS ! push r8-r11 (popped again after the loop)
461	add r0, r5 ! `49` EX ! r5 becomes an absolute source address
462
463	mov.l r9, @-r15 ! `30` LS
464	mov r0, r1 ! `5` MT (latency=`0`)
465
466	mov.l r10, @-r15 ! `30` LS
467	add #-`0x1c`, r5 ! `50` EX ! with the -4 bias: r5 = start of the 32-byte source block
468
469	mov.l r11, @-r15 ! `30` LS
470
471	! `16` cycles, `32` bytes per iteration
472	`2`: mov.l @(`0x00`,r5),r0 ! `18` LS (latency=`2`)
473	add #-`0x20`, r1 ! `50` EX ! r1 = base of the destination block being written
474	mov.l @(`0x04`,r5),r3 ! `18` LS (latency=`2`)
475	mov.l @(`0x08`,r5),r6 ! `18` LS (latency=`2`)
476	mov.l @(`0x0c`,r5),r7 ! `18` LS (latency=`2`)
477	mov.l @(`0x10`,r5),r8 ! `18` LS (latency=`2`)
478	mov.l @(`0x14`,r5),r9 ! `18` LS (latency=`2`)
479	mov.l @(`0x18`,r5),r10 ! `18` LS (latency=`2`)
480	mov.l @(`0x1c`,r5),r11 ! `18` LS (latency=`2`)
481	movca.l r0,@r1 ! `40` LS (latency=`3`-`7`)
482	mov.l r3,@(`0x04`,r1) ! `33` LS
483	mov.l r6,@(`0x08`,r1) ! `33` LS
484	mov.l r7,@(`0x0c`,r1) ! `33` LS
485
486	mov.l r8,@(`0x10`,r1) ! `33` LS
487	add #-`0x20`, r5 ! `50` EX ! step the source down to the next block
488
489	mov.l r9,@(`0x14`,r1) ! `33` LS
490	cmp/eq r2,r1 ! `54` MT ! T = reached the rounded-up dst boundary
491
492	mov.l r10,@(`0x18`,r1) ! `33` LS
493	bf/s `2b` ! `109` BR
494
495	mov.l r11,@(`0x1c`,r1) ! `33` LS ! delay slot
496
497	mov r1, r0 ! `5` MT (latency=`0`) ! restore r0 as the dst pointer
498
499	mov.l @r15+, r11 ! `15` LS
500	sub r1, r5 ! `75` EX ! r5 becomes an offset relative to r0 again
501
502	mov.l @r15+, r10 ! `15` LS
503	cmp/eq r4, r0 ! `54` MT ! T = the block loop finished exactly at dst
504
505	bf/s `1f` ! `109` BR
506	mov.l @r15+, r9 ! `15` LS ! delay slot
507
	! Label 1 sits on the instruction in the rts delay slot, so r8 is
	! popped both when returning here and when the bf/s above jumps to 1.
508	rts
509	`1`: mov.l @r15+, r8 ! `15` LS
510	sub r4, r1 ! `75` EX (len remaining)
511
512	! number of trailing bytes is non-zero
513	!
514	! invariants restored (r5 already decremented by `4`)
515	! also r1=num bytes remaining
516
517	mov #`4`, r2 ! `6` EX
518	mov r4, r7 ! `5` MT (latency=`0`)
519
520	add #`0x1c`, r5 ! `50` EX (back to -`4`)
521	cmp/hs r2, r1 ! `58` MT ! T = (remaining >= 4)
522
523	bf/s `5f` ! `108` BR ! fewer than 4 bytes: go straight to the byte loop
524	add #`11`, r7 ! `50` EX ! delay slot: r7 = dst + 11, pair loop lower bound
525
526	mov.l @(r0, r5), r6 ! `21` LS (latency=`2`)
527	tst r2, r1 ! `86` MT ! T = bit 2 of the remaining length is clear
528
529	mov r5, r3 ! `5` MT (latency=`0`)
530	bt/s `4f` ! `111` BR ! bit clear: enter the pair loop at its second load
531
532	add #-`4`, r3 ! `50` EX ! delay slot: r3 addresses one long below r5's
	! NOTE(review): r1 >= 4 was already established by the cmp/hs above and
	! r1/r2 are unchanged since, so this compare looks always-true and the
	! branch to 5 always taken; after the single long below, the rest is
	! then copied by the byte loop. Confirm whether that is intended.
533	cmp/hs r2, r1 ! `58` MT
534
535	bt/s `5f` ! `111` BR
536	mov.l r6,@-r0 ! `30` LS ! delay slot: store the single long
537
538	! `4` cycles, `2` long words per iteration
539	`3`: mov.l @(r0, r5), r6 ! `21` LS (latency=`2`)
540
541	`4`: mov.l @(r0, r3), r2 ! `21` LS (latency=`2`)
542	cmp/hi r7, r0 ! T = (r0 > dst + 11), evaluated before the two stores
543
544	mov.l r6, @-r0 ! `30` LS
545	bt/s `3b` ! `109` BR
546
547	mov.l r2, @-r0 ! `30` LS ! delay slot
548
549	! Copy the final `0`-`3` bytes
550
551	`5`: cmp/eq r0, r4 ! `54` MT ! T = nothing left to copy
552	add #-`10`, r7 ! `50` EX ! r7 = dst + 1
553
554	bt `9f` ! `110` BR
555	add #`3`,r5 ! `50` EX ! r5 = (src - dst) - 1, the byte loop bias
556
557	! `3` cycles, `1` byte per iteration
558	`1`: mov.b @(r0,r5),r1 ! `19` LS
559	cmp/hi r7,r0 ! `57` MT ! T = more than one byte still to copy
560
561	bt/s `1b` ! `111` BR
562	mov.b r1,@-r0 ! `28` LS ! delay slot
563
564	`9`: rts
565	nop
566
567	!
568	! GHIJ KLMN OPQR --> ..GH IJKL MNOP QR..
569	!
570
571	.balign `32`
572	.Lcase2:
	! Reached from the dispatch in memcpy when (src - dst) is 2 modulo 4
	! and fewer than 64 bytes remain.
	! On entry: r0 = current dst pointer (long word aligned), r4 = dst
	! start, r5 = src - dst.
	! Copies four bytes per iteration as two 16-bit moves, then branches
	! to the shared word/byte tail at label 10 further down the file.
573	! Size is `16` or greater and less than `64`, but may have trailing bytes
574
575	`2`: mov r5, r6 ! `5` MT (latency=`0`)
576	add #-`2`,r5 ! `50` EX ! @(r0,r5) addresses the 16-bit word just below r0
577
578	mov r4,r2 ! `5` MT (latency=`0`)
579	add #-`4`,r6 ! `50` EX ! @(r0,r6) addresses the 16-bit word below that one
580
581	add #`7`,r2 ! `50` EX ! r2 = dst + 7, lower bound tested by the loop
582	`3`: mov.w @(r0,r5),r1 ! `20` LS (latency=`2`)
583
584	mov.w @(r0,r6),r3 ! `20` LS (latency=`2`)
585	cmp/hi r2,r0 ! `57` MT ! T = (r0 > dst + 7), evaluated before the two stores
586
587	mov.w r1,@-r0 ! `29` LS
588	bt/s `3b` ! `111` BR
589
590	mov.w r3,@-r0 ! `29` LS ! delay slot
591
592	bra `10f` ! the tail at 10 expects r5 = (src - dst) - 2, which holds here
593	nop
594
595
596	.balign `32`
597	.Lcase2b:
	! Large copy (64 bytes or more remaining) with (src - dst) equal to 2
	! modulo 4; reached from the dispatch in memcpy.
	! On entry: r0 = current dst pointer (long word aligned), r4 = dst
	! start, r5 = src - dst.
	! Phases: 16-bit copies until r0 is 32-byte aligned, 32-byte blocks
	! built from source data with xtrct and written with movca.l plus
	! long stores, then the word/byte tail at label 10 (shared with
	! .Lcase2). r8-r12 are saved on the r15 stack around the block loop.
598	! Size is at least `64` bytes, so will be going round the big loop at least once.
599	!
600	! r2 = rounded up r4
601	! r3 = rounded down r0
602
603	mov r0, r3 ! `5` MT (latency=`0`)
604	mov #(~`0x1f`), r1 ! `6` EX ! mask that clears the low five address bits
605
606	and r1, r3 ! `78` EX
607	mov r4, r2 ! `5` MT (latency=`0`)
608
609	cmp/eq r3, r0 ! `54` MT ! T = r0 is already 32-byte aligned
610	add #`0x1f`, r2 ! `50` EX
611
612	add #-`2`, r5 ! `50` EX ! @(r0,r5) addresses the 16-bit word just below r0
613	bt/s `1f` ! `110` BR
614	and r1, r2 ! `78` EX ! delay slot
615
616	! Copy a short word one at a time until we are cache line aligned
617	! Normal values: r0, r2, r3, r4
618	! Unused: r1, r6, r7
619	! Mod: r5 (=r5-`2`)
620	!
621	add #`2`, r3 ! `50` EX ! loop ends when r0 == boundary + 2 before the last store
622
623	`2`: mov.w @(r0,r5),r1 ! `20` LS (latency=`2`)
624	cmp/eq r3,r0 ! `54` MT
625
626	bf/s `2b` ! `111` BR
627
628	mov.w r1,@-r0 ! `29` LS ! delay slot
629
630	! Copy the cache line aligned blocks
631	!
632	! In use: r0, r2, r4, r5 (=r5-`2`)
633	! Scratch: r1, r3, r6, r7
634	!
635	! We could do this with the four scratch registers, but if src
636	! and dest hit the same cache line, this will thrash, so make
637	! use of additional registers.
638	!
639	! We also need r0 as a temporary (for movca), so `'undo'` the invariant:
640	! r5: src (was r0+r5)
641	! r1: dest (was r0)
642	! this can be reversed at the end, so we don't need to save any extra
643	! state.
644	!
645	`1`: mov.l r8, @-r15 ! `30` LS ! push r8-r12 (popped again after the loop)
646	add r0, r5 ! `49` EX ! r5 becomes an absolute source address
647
648	mov.l r9, @-r15 ! `30` LS
649	mov r0, r1 ! `5` MT (latency=`0`)
650
651	mov.l r10, @-r15 ! `30` LS
652	add #-`0x1e`, r5 ! `50` EX ! with the -2 bias: r5 = start of the 32-byte source block
653
654	mov.l r11, @-r15 ! `30` LS
655
656	mov.l r12, @-r15 ! `30` LS
657
658	! `17` cycles, `32` bytes per iteration
659	#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! Little-endian: read 2 + 7*4 + 2 bytes upwards with post-increment,
	! then xtrct pairs of neighbouring registers into the output longs.
660	`2`: mov.w @r5+, r0 ! `14` LS (latency=`2`) ..JI
661	add #-`0x20`, r1 ! `50` EX ! r1 = base of the destination block being written
662
663	mov.l @r5+, r3 ! `15` LS (latency=`2`) NMLK
664
665	mov.l @r5+, r6 ! `15` LS (latency=`2`) RQPO
666	shll16 r0 ! `103` EX JI..
667
668	mov.l @r5+, r7 ! `15` LS (latency=`2`)
669	xtrct r3, r0 ! `48` EX LKJI
670
671	mov.l @r5+, r8 ! `15` LS (latency=`2`)
672	xtrct r6, r3 ! `48` EX PONM
673
674	mov.l @r5+, r9 ! `15` LS (latency=`2`)
675	xtrct r7, r6 ! `48` EX
676
677	mov.l @r5+, r10 ! `15` LS (latency=`2`)
678	xtrct r8, r7 ! `48` EX
679
680	mov.l @r5+, r11 ! `15` LS (latency=`2`)
681	xtrct r9, r8 ! `48` EX
682
683	mov.w @r5+, r12 ! `15` LS (latency=`2`)
684	xtrct r10, r9 ! `48` EX
685
686	movca.l r0,@r1 ! `40` LS (latency=`3`-`7`)
687	xtrct r11, r10 ! `48` EX
688
689	mov.l r3, @(`0x04`,r1) ! `33` LS
690	xtrct r12, r11 ! `48` EX
691
692	mov.l r6, @(`0x08`,r1) ! `33` LS
693
694	mov.l r7, @(`0x0c`,r1) ! `33` LS
695
696	mov.l r8, @(`0x10`,r1) ! `33` LS
697	add #-`0x40`, r5 ! `50` EX ! undo the 0x20 of post-increments and step down one block
698
699	mov.l r9, @(`0x14`,r1) ! `33` LS
700	cmp/eq r2,r1 ! `54` MT ! T = reached the rounded-up dst boundary
701
702	mov.l r10, @(`0x18`,r1) ! `33` LS
703	bf/s `2b` ! `109` BR
704
705	mov.l r11, @(`0x1c`,r1) ! `33` LS ! delay slot
706	#else
	! Big-endian: read the block from its top downwards using displacement
	! addressing; the movca.l goes to the top long of the destination block
	! and r1 is then lowered to the block base for the remaining stores.
707	`2`: mov.w @(`0x1e`,r5), r0 ! `17` LS (latency=`2`)
708	add #-`2`, r5 ! `50` EX
709
710	mov.l @(`0x1c`,r5), r3 ! `18` LS (latency=`2`)
711	add #-`4`, r1 ! `50` EX
712
713	mov.l @(`0x18`,r5), r6 ! `18` LS (latency=`2`)
714	shll16 r0 ! `103` EX
715
716	mov.l @(`0x14`,r5), r7 ! `18` LS (latency=`2`)
717	xtrct r3, r0 ! `48` EX
718
719	mov.l @(`0x10`,r5), r8 ! `18` LS (latency=`2`)
720	xtrct r6, r3 ! `48` EX
721
722	mov.l @(`0x0c`,r5), r9 ! `18` LS (latency=`2`)
723	xtrct r7, r6 ! `48` EX
724
725	mov.l @(`0x08`,r5), r10 ! `18` LS (latency=`2`)
726	xtrct r8, r7 ! `48` EX
727
728	mov.l @(`0x04`,r5), r11 ! `18` LS (latency=`2`)
729	xtrct r9, r8 ! `48` EX
730
731	mov.l @(`0x00`,r5), r12 ! `18` LS (latency=`2`)
732	xtrct r10, r9 ! `48` EX
733
734	movca.l r0,@r1 ! `40` LS (latency=`3`-`7`)
735	add #-`0x1c`, r1 ! `50` EX ! r1 = base of the destination block
736
737	mov.l r3, @(`0x18`,r1) ! `33` LS
738	xtrct r11, r10 ! `48` EX
739
740	mov.l r6, @(`0x14`,r1) ! `33` LS
741	xtrct r12, r11 ! `48` EX
742
743	mov.l r7, @(`0x10`,r1) ! `33` LS
744
745	mov.l r8, @(`0x0c`,r1) ! `33` LS
746	add #-`0x1e`, r5 ! `50` EX ! together with the -2 above: step down one block
747
748	mov.l r9, @(`0x08`,r1) ! `33` LS
749	cmp/eq r2,r1 ! `54` MT ! T = reached the rounded-up dst boundary
750
751	mov.l r10, @(`0x04`,r1) ! `33` LS
752	bf/s `2b` ! `109` BR
753
754	mov.l r11, @(`0x00`,r1) ! `33` LS ! delay slot
755	#endif
756
757	mov.l @r15+, r12
758	mov r1, r0 ! `5` MT (latency=`0`) ! restore r0 as the dst pointer
759
760	mov.l @r15+, r11 ! `15` LS
761	sub r1, r5 ! `75` EX ! r5 becomes an offset relative to r0 again
762
763	mov.l @r15+, r10 ! `15` LS
764	cmp/eq r4, r0 ! `54` MT ! T = the block loop finished exactly at dst
765
766	bf/s `1f` ! `109` BR
767	mov.l @r15+, r9 ! `15` LS ! delay slot
768
	! Label 1 sits on the instruction in the rts delay slot, so r8 is
	! popped both when returning here and when the bf/s above jumps to 1.
769	rts
770	`1`: mov.l @r15+, r8 ! `15` LS
771
772	add #`0x1e`, r5 ! `50` EX ! back to the (src - dst) - 2 bias
773
774	! Finish off a short word at a time
775	! r5 must be invariant - `2`
776	`10`: mov r4,r2 ! `5` MT (latency=`0`)
777	add #`1`,r2 ! `50` EX ! r2 = dst + 1
778
779	cmp/hi r2, r0 ! `57` MT ! T = at least two bytes remain
780	bf/s `1f` ! `109` BR
781
782	add #`2`, r2 ! `50` EX ! delay slot: r2 = dst + 3, word loop lower bound
783
784	`3`: mov.w @(r0,r5),r1 ! `20` LS
785	cmp/hi r2,r0 ! `57` MT ! T = another full word remains after this one
786
787	bt/s `3b` ! `109` BR
788
789	mov.w r1,@-r0 ! `29` LS ! delay slot
790	`1`:
791
792	!
793	! Finally, copy the last byte if necessary
794	cmp/eq r4,r0 ! `54` MT ! T = nothing left to copy
795	bt/s `9b` ! return through the rts at the preceding label 9
796	add #`1`,r5 ! delay slot: r5 = (src - dst) - 1, the single-byte bias
797	mov.b @(r0,r5),r1
798	rts
799	mov.b r1,@-r0 ! delay slot: store the final byte
800
801
800
801

source code of linux/arch/sh/lib/memcpy-sh4.S