1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* |
3 | * arch/alpha/lib/ev6-copy_page.S |
4 | * |
5 | * Copy an entire page. |
6 | */ |
7 | |
8 | /* The following comparison of this routine vs the normal copy_page.S |
9 | was written by an unnamed ev6 hardware designer and forwarded to me |
10 | via Steven Hobbs <hobbs@steven.zko.dec.com>. |
11 | |
12 | First Problem: STQ overflows. |
13 | ----------------------------- |
14 | |
15 | It would be nice if EV6 handled every resource overflow efficiently, |
16 | but for some it doesn't, including store queue overflows. Such an |
17 | overflow causes a trap and a restart of the pipe. |
18 | |
19 | To get around this we sometimes use (to borrow a term from a VSSAD |
20 | researcher) "aeration". The idea is to slow the rate at which the |
21 | processor receives valid instructions by inserting nops in the fetch |
22 | path. In doing so, you can prevent the overflow and actually make |
23 | the code run faster. You can, of course, take advantage of the fact |
24 | that the processor can fetch at most 4 aligned instructions per cycle. |
25 | |
26 | I inserted enough nops to force it to take 10 cycles to fetch the |
27 | loop code. In theory, EV6 should be able to execute this loop in |
28 | 9 cycles but I was not able to get it to run that fast -- the initial |
29 | conditions were such that I could not reach this optimum rate on |
30 | (chaotic) EV6. I wrote the code such that everything would issue |
31 | in order. |
32 | |
33 | Second Problem: Dcache index matches. |
34 | ------------------------------------- |
35 | |
36 | If you are going to use this routine on random aligned pages, there |
37 | is a 25% chance that the pages will be at the same dcache indices. |
38 | This results in many nasty memory traps without care. |
39 | |
40 | The solution is to schedule the prefetches to avoid the memory |
41 | conflicts. I schedule the wh64 prefetches farther ahead of the |
42 | read prefetches to avoid this problem. |
43 | |
44 | Third Problem: Needs more prefetching. |
45 | -------------------------------------- |
46 | |
47 | In order to improve the code I added deeper prefetching to take the |
48 | most advantage of EV6's bandwidth. |
49 | |
50 | I also prefetched the read stream. Note that adding the read prefetch |
51 | forced me to add another cycle to the inner-most kernel - up to 11 |
52 | from the original 8 cycles per iteration. We could improve performance |
53 | further by unrolling the loop and doing multiple prefetches per cycle. |
54 | |
55 | I think that the code below will be very robust and fast code for the |
56 | purposes of copying aligned pages. It is slower when both source and |
57 | destination pages are in the dcache, but it is my guess that this is |
58 | less important than the dcache miss case. */ |
59 | |
60 | #include <linux/export.h> |
61 | .text |
62 | .align 4 /* NOTE(review): presumably 16-byte (2^4) alignment so each 4-instruction group starts a fetch block -- confirm for this assembler */ |
63 | .global copy_page |
64 | .ent copy_page |
65 | copy_page: |
66 | .prologue 0 |
67 | /* copy_page: copy one page from the address in $17 to the address in $16.
     In:      $16 = destination (every stq and wh64 goes through it)
              $17 = source (every ldq and prefetching ldl goes through it)
     Size:    118 main-loop passes + 10 cleanup passes, 64 bytes each,
              i.e. 128 cache lines = 8192 bytes.
     Writes:  $0-$7 (the eight quadwords of the line being copied; $1/$2
              also hold wh64 addresses during setup), $18 (pass counter),
              $19 (write-hint pointer, kept 10 lines ahead of $16).
              $16 and $17 are each 8192 bytes past their entry values at ret.
     Layout:  instructions are written in groups of four separated by blank
              lines; the unop/nop fillers are the "aeration" described in the
              comment at the top of this file.  Do not reorder or drop them
              without re-measuring.  */ |
68 | /* Prefetch 5 read cachelines; write-hint 10 cache lines. The ldl-into-$31 loads are the read prefetches (source lines 0-4, offsets 0..256); the wh64s cover destination lines 0-9. */ |
69 | wh64 ($16) |
70 | ldl $31,0($17) |
71 | ldl $31,64($17) |
72 | lda $1,1*64($16) |
73 | |
74 | wh64 ($1) |
75 | ldl $31,128($17) |
76 | ldl $31,192($17) |
77 | lda $1,2*64($16) |
78 | |
79 | wh64 ($1) |
80 | ldl $31,256($17) |
81 | lda $18,118 /* $18 = main-loop pass count; the other 10 of the 128 lines are left to the cleanup loop */ |
82 | lda $1,3*64($16) |
83 | |
84 | wh64 ($1) |
85 | nop /* filler: keeps this group at four instructions */ |
86 | lda $1,4*64($16) |
87 | lda $2,5*64($16) |
88 | |
89 | wh64 ($1) |
90 | wh64 ($2) |
91 | lda $1,6*64($16) |
92 | lda $2,7*64($16) |
93 | |
94 | wh64 ($1) |
95 | wh64 ($2) |
96 | lda $1,8*64($16) |
97 | lda $2,9*64($16) |
98 | |
99 | wh64 ($1) |
100 | wh64 ($2) |
101 | lda $19,10*64($16) /* $19 = next destination line to write-hint, 10 lines ahead of $16 */ |
102 | nop /* filler: keeps this group at four instructions */ |
103 | |
104 | /* Main prefetching/write-hinting loop. Each pass loads one 64-byte line into $0-$7, prefetches the source line 5 ahead, write-hints the destination line at $19, stores the eight quadwords, and advances $16, $17 and $19 by 64. */ |
105 | 1: ldq $0,0($17) |
106 | ldq $1,8($17) |
107 | unop |
108 | unop |
109 | |
110 | unop |
111 | unop |
112 | ldq $2,16($17) |
113 | ldq $3,24($17) |
114 | |
115 | ldq $4,32($17) |
116 | ldq $5,40($17) |
117 | unop |
118 | unop |
119 | |
120 | unop |
121 | unop |
122 | ldq $6,48($17) |
123 | ldq $7,56($17) |
124 | |
125 | ldl $31,320($17) /* read prefetch 5 lines ahead (320 = 5*64) */ |
126 | unop |
127 | unop |
128 | unop |
129 | |
130 | /* This gives the extra cycle of aeration above the minimum. */ |
131 | unop |
132 | unop |
133 | unop |
134 | unop |
135 | |
136 | wh64 ($19) /* write-hint the destination line 10 ahead of $16 */ |
137 | unop |
138 | unop |
139 | unop |
140 | |
141 | stq $0,0($16) |
142 | subq $18,1,$18 /* one fewer main-loop pass remaining */ |
143 | stq $1,8($16) |
144 | unop |
145 | |
146 | unop |
147 | stq $2,16($16) |
148 | addq $17,64,$17 /* source pointer to next line; all eight loads for this pass are above */ |
149 | stq $3,24($16) |
150 | |
151 | stq $4,32($16) |
152 | stq $5,40($16) |
153 | addq $19,64,$19 /* keep the write-hint pointer 10 lines ahead */ |
154 | unop |
155 | |
156 | stq $6,48($16) |
157 | stq $7,56($16) |
158 | addq $16,64,$16 /* destination pointer to next line, after the last store of this pass */ |
159 | bne $18, 1b |
160 | |
161 | /* Prefetch the final 5 cache lines of the read stream. $17 is now 118 lines into the source, so offsets 320..576 are lines 123..127; the loop's own prefetch stopped at line 122. */ |
162 | lda $18,10 /* $18 = cleanup-loop pass count */ |
163 | ldl $31,320($17) |
164 | ldl $31,384($17) |
165 | ldl $31,448($17) |
166 | |
167 | ldl $31,512($17) |
168 | ldl $31,576($17) |
169 | nop |
170 | nop |
171 | |
172 | /* Non-prefetching, non-write-hinting cleanup loop for the |
173 | final 10 cache lines. Their write hints and read prefetches have all been issued by the code above, so only the plain load/store copy remains. */ |
174 | 2: ldq $0,0($17) |
175 | ldq $1,8($17) |
176 | ldq $2,16($17) |
177 | ldq $3,24($17) |
178 | |
179 | ldq $4,32($17) |
180 | ldq $5,40($17) |
181 | ldq $6,48($17) |
182 | ldq $7,56($17) |
183 | |
184 | stq $0,0($16) |
185 | subq $18,1,$18 |
186 | stq $1,8($16) |
187 | addq $17,64,$17 |
188 | |
189 | stq $2,16($16) |
190 | stq $3,24($16) |
191 | stq $4,32($16) |
192 | stq $5,40($16) |
193 | |
194 | stq $6,48($16) |
195 | stq $7,56($16) |
196 | addq $16,64,$16 |
197 | bne $18, 2b |
198 | |
199 | ret |
200 | nop /* ret plus the three fillers below complete the final group of four instructions */ |
201 | unop |
202 | nop |
203 | |
204 | .end copy_page |
205 | EXPORT_SYMBOL(copy_page) |
206 | |