/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

   It would be nice if EV6 handled every resource overflow efficiently,
   but for some, such as store queue overflows, it doesn't: the
   overflow causes a trap and a restart of the pipe.

   To get around this we sometimes use (to borrow a term from a VSSAD
   researcher) "aeration".  The idea is to slow the rate at which the
   processor receives valid instructions by inserting nops in the fetch
   path.  In doing so, you can prevent the overflow and actually make
   the code run faster.  You can, of course, take advantage of the fact
   that the processor can fetch at most 4 aligned instructions per cycle.
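
   As a sketch of the idea (compare the unop-padded fetch blocks in
   the loop below), a block like

	stq	$0,0($16)	# one useful store this cycle
	unop			# remaining fetch slots deliberately
	unop			#   wasted so the store queue can drain
	unop

   delivers only one store per fetch cycle instead of four.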

   I inserted enough nops to force it to take 10 cycles to fetch the
   loop code.  In theory, EV6 should be able to execute this loop in
   9 cycles but I was not able to get it to run that fast -- the initial
   conditions were such that I could not reach this optimum rate on
   (chaotic) EV6.  I wrote the code such that everything would issue
   in order.
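
   (For reference: as written below, the loop is 44 instructions --
   eleven aligned 4-instruction fetch blocks -- so at 4 instructions
   per cycle it takes 11 cycles to fetch: the 10 described here plus
   the extra cycle added for the read prefetch discussed under the
   third problem.)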

   Second Problem: Dcache index matches.
   -------------------------------------

   If you are going to use this routine on random aligned pages, there
   is a 25% chance that the pages will be at the same dcache indices.
   Without care, this results in many nasty memory traps.

   The solution is to schedule the prefetches to avoid the memory
   conflicts.  I schedule the wh64 prefetches farther ahead of the
   read prefetches to avoid this problem.
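
   (The 25% figure presumably reflects EV6's 64KB two-way
   set-associative dcache: with 64-byte lines each way spans 32KB,
   leaving two index bits above the 8KB page offset, and two
   independently placed pages match in both bits one time in four.)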

   Third Problem: Needs more prefetching.
   --------------------------------------

   In order to improve the code I added deeper prefetching to take the
   most advantage of EV6's bandwidth.

   I also prefetched the read stream.  Note that adding the read prefetch
   forced me to add another cycle to the inner-most kernel - up to 11
   from the original 8 cycles per iteration.  We could improve performance
   further by unrolling the loop and doing multiple prefetches per cycle.
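
   (At 11 cycles per 64-byte iteration the main loop moves roughly
   5.8 bytes per cycle, or about 1300 cycles for the 118 iterations
   that cover the bulk of an 8KB page.)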

   I think that the code below will be very robust and fast code for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case. */

#include <linux/export.h>
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0

	/* Prefetch 5 read cachelines; write-hint 10 cache lines. */
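	/* Note: per the Alpha calling convention $16 is the
	   destination page and $17 the source.  A load into $31
	   (the zero register) is an architected prefetch hint on
	   EV6, and wh64 hints that an entire aligned 64-byte block
	   will be written, so its old contents need not be read. */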
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118
	lda	$1,3*64($16)
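	/* ($18 = 118 iterations: an 8KB page is 128 64-byte lines,
	   less the 10 handled by the cleanup loop at 2: below.) */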

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)
	nop

	/* Main prefetching/write-hinting loop. */
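	/* Each iteration copies one 64-byte line: eight quadword
	   loads and stores, one read prefetch, one write hint, and
	   the loop bookkeeping, aerated with unops into eleven
	   4-instruction fetch blocks. */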
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

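	/* Read-prefetch 5 lines (320 bytes) ahead of the loads
	   above. */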
	ldl	$31,320($17)
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum. */
	unop
	unop
	unop
	unop

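	/* The write hint runs 10 lines (640 bytes) ahead of the
	   stores: $19 starts at 10*64($16) and advances in step
	   with $16, keeping it well clear of the read prefetches. */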
	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream. */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines. */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
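	/* Unreachable; presumably padding out the final aligned
	   4-instruction fetch block. */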
	nop
	unop
	nop

	.end copy_page
	EXPORT_SYMBOL(copy_page)