1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* NGmemcpy.S: Niagara optimized memcpy. |
3 | * |
4 | * Copyright (C) 2006, 2007 David S. Miller (davem@davemloft.net) |
5 | */ |
6 | |
#ifdef __KERNEL__
#include <linux/linkage.h>
#include <asm/asi.h>
#include <asm/thread_info.h>
/* Spare global scratch register.  The kernel build may scribble on
 * %g7; userland builds use %g5 instead (%g7 is presumably reserved
 * for TLS in the userland ABI -- confirm against the target libc).
 */
#define GLOBAL_SPARE %g7
/* Reset %asi after the block-init stores are finished: the kernel
 * build restores ASI_AIUS ("as if user, secondary"), the userland
 * build the primary no-fault ASI.  TMP is unused by both variants.
 */
#define RESTORE_ASI(TMP) \
	wr %g0, ASI_AIUS, %asi
#else
#define GLOBAL_SPARE %g5
#define RESTORE_ASI(TMP) \
	wr %g0, ASI_PNF, %asi
#endif

/* Register-window save area: full 64-bit V9 frame vs. smaller frame. */
#ifdef __sparc_v9__
#define SAVE_AMOUNT 128
#else
#define SAVE_AMOUNT 64
#endif

/* ASI used for the cache-line-initializing stores in the block loops. */
#ifndef STORE_ASI
#define STORE_ASI ASI_BLK_INIT_QUAD_LDD_P
#endif

/* EX_LD/EX_ST wrap faultable loads/stores; 'y' names the fixup stub
 * to run if the access faults.  The plain-memcpy build discards 'y';
 * builds that reuse this file as a template are expected to override
 * these to emit exception-table entries.
 */
#ifndef EX_LD
#define EX_LD(x,y) x
#endif

#ifndef EX_ST
#define EX_ST(x,y) x
#endif

/* Basic load; the debug variant forces an explicit-ASI (0x80) form. */
#ifndef LOAD
#ifndef MEMCPY_DEBUG
#define LOAD(type,addr,dest) type [addr], dest
#else
#define LOAD(type,addr,dest) type##a [addr] 0x80, dest
#endif
#endif

/* 16-byte "twin" load into the register pair starting at dest0
 * (dest1 is implied by the ldda destination pair).
 */
#ifndef LOAD_TWIN
#define LOAD_TWIN(addr_reg,dest0,dest1) \
	ldda [addr_reg] ASI_BLK_INIT_QUAD_LDD_P, dest0
#endif

/* Basic store. */
#ifndef STORE
#define STORE(type,src,addr) type src, [addr]
#endif

/* Cache-line-initializing store through %asi (see STORE_ASI); the
 * simulation fallback degrades to a plain stx.
 */
#ifndef STORE_INIT
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_INIT(src,addr) stxa src, [addr] %asi
#else
#define STORE_INIT(src,addr) stx src, [addr + 0x00]
#endif
#endif

/* May be overridden when this file is reused as a template. */
#ifndef FUNC_NAME
#define FUNC_NAME NGmemcpy
#endif

/* Hook for per-build entry code (empty for plain memcpy). */
#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif
74 | |
75 | .register %g2,#scratch |
76 | .register %g3,#scratch |
77 | |
78 | .text |
79 | #ifndef EX_RETVAL |
80 | #define EX_RETVAL(x) x |
81 | __restore_asi: |
82 | ret |
83 | wr %g0, ASI_AIUS, %asi |
84 | restore |
/* Fault during the destination-alignment byte loop (label 1: after
 * FUNC_NAME entry).  There, "subcc %i4, 1, %i4" has already been
 * decremented past the faulting byte and %i2 was pre-reduced by the
 * whole alignment count, so bytes remaining = %i2 + %i4 + 1 -- which
 * is exactly what this stub's name promises.  The previous body added
 * %i5, which is not yet initialized on that path, and dropped the +1.
 */
ENTRY(NG_ret_i2_plus_i4_plus_1)
	add	%i4, 1, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_plus_1)
/* Fault-fixup targets reached through the exception tables built by
 * the EX_LD()/EX_ST() annotations in the copy loops.  Each stub's
 * name encodes the expression that reconstructs the number of bytes
 * NOT yet copied at its use sites; the stub evaluates it into %i0
 * and branches to the common __restore_asi tail, which returns that
 * count to the caller.
 */
ENTRY(NG_ret_i2_plus_g1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1)
/* %g1 counts block-loop bytes; the minus_N variants account for the
 * N bytes of the current 64-byte iteration already stored safely.
 */
ENTRY(NG_ret_i2_plus_g1_minus_8)
	sub	%g1, 8, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_8)
ENTRY(NG_ret_i2_plus_g1_minus_16)
	sub	%g1, 16, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_16)
ENTRY(NG_ret_i2_plus_g1_minus_24)
	sub	%g1, 24, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_24)
ENTRY(NG_ret_i2_plus_g1_minus_32)
	sub	%g1, 32, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_32)
ENTRY(NG_ret_i2_plus_g1_minus_40)
	sub	%g1, 40, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_40)
ENTRY(NG_ret_i2_plus_g1_minus_48)
	sub	%g1, 48, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_48)
ENTRY(NG_ret_i2_plus_g1_minus_56)
	sub	%g1, 56, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_minus_56)
/* %i4 counts bytes in the medium-copy 16-bytes-per-iteration loop. */
ENTRY(NG_ret_i2_plus_i4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4)
ENTRY(NG_ret_i2_plus_i4_minus_8)
	sub	%i4, 8, %i4
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_plus_i4_minus_8)
ENTRY(NG_ret_i2_plus_8)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 8, %i0
ENDPROC(NG_ret_i2_plus_8)
ENTRY(NG_ret_i2_plus_4)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 4, %i0
ENDPROC(NG_ret_i2_plus_4)
ENTRY(NG_ret_i2_plus_1)
	ba,pt	%xcc, __restore_asi
	 add	%i2, 1, %i0
ENDPROC(NG_ret_i2_plus_1)
ENTRY(NG_ret_i2_plus_g1_plus_1)
	add	%g1, 1, %g1
	ba,pt	%xcc, __restore_asi
	 add	%i2, %g1, %i0
ENDPROC(NG_ret_i2_plus_g1_plus_1)
ENTRY(NG_ret_i2)
	ba,pt	%xcc, __restore_asi
	 mov	%i2, %i0
ENDPROC(NG_ret_i2)
/* Used by the shift-merge loop, where %i2's low 3 bits hold the tail
 * byte count and %i4 the remaining whole words.
 */
ENTRY(NG_ret_i2_and_7_plus_i4)
	and	%i2, 7, %i2
	ba,pt	%xcc, __restore_asi
	 add	%i2, %i4, %i0
ENDPROC(NG_ret_i2_and_7_plus_i4)
#endif
164 | |
	.align		64

	/* FUNC_NAME: Niagara-optimized memcpy.
	 *
	 * In:   %i0 = dst, %i1 = src, %i2 = len
	 * Out:  caller's %o0 = EX_RETVAL(%i0) (for plain memcpy the
	 *       original dst, delivered by the "restore" in the return
	 *       delay slots at 85:/90:).
	 *
	 * Copies of >= 128 bytes align dst to a 64-byte boundary and
	 * then run one of three 64-bytes-per-iteration block loops of
	 * twin loads + cache-line-initializing stores: 50: for a
	 * 16-byte-aligned src, 10: for an 8-byte-aligned src, and
	 * 8:/9: (shift-and-mask) for anything else.  Medium copies go
	 * through 70:, tiny ones through 80:/90:.
	 */
	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %i0=dst, %i1=src, %i2=len */
	PREAMBLE
	save		%sp, -SAVE_AMOUNT, %sp
	srlx		%i2, 31, %g2		! sanity check: conditionally
	cmp		%g2, 0			! trap (software trap 5) if any
	tne		%xcc, 5			! length bits at/above bit 31 set
	mov		%i0, %o0		! %o0 = dst cursor; %i0 = retval
	cmp		%i2, 0
	be,pn		%XCC, 85f		! len == 0: just return
	or		%o0, %i1, %i3		! %i3 = dst | src (alignment probe)
	cmp		%i2, 16
	blu,a,pn	%XCC, 80f		! len < 16: tiny copy.  ",a" ->
	or		%i3, %i2, %i3		!  delay slot runs only when taken

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with. We need to ensure that we'll iterate at least
	 * once in the block copy loop. At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 */
	cmp		%i2, (2 * 64)
	blu,pt		%XCC, 70f		! len < 128: medium copy path
	andcc		%i3, 0x7, %g0		! cc (8-byte co-aligned?) read at 70:

	/* %o0: dst
	 * %i1: src
	 * %i2: len (known to be >= 128)
	 *
	 * The block copy loops will use %i4/%i5,%g2/%g3 as
	 * temporaries while copying the data.
	 */

	LOAD(prefetch, %i1, #one_read)
	wr		%g0, STORE_ASI, %asi	! block-init ASI for STORE_INIT

	/* Align destination on 64-byte boundary. */
	andcc		%o0, (64 - 1), %i4
	be,pt		%XCC, 2f
	sub		%i4, 64, %i4
	sub		%g0, %i4, %i4	! bytes to align dst
	sub		%i2, %i4, %i2	! charge alignment bytes up front
1:	subcc		%i4, 1, %i4
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_i4_plus_1)
	EX_ST(STORE(stb, %g1, %o0), NG_ret_i2_plus_i4_plus_1)
	add		%i1, 1, %i1
	bne,pt		%XCC, 1b
	add		%o0, 1, %o0

	/* If the source is on a 16-byte boundary we can do
	 * the direct block copy loop. If it is 8-byte aligned
	 * we can do the 16-byte loads offset by -8 bytes and the
	 * init stores offset by one register.
	 *
	 * If the source is not even 8-byte aligned, we need to do
	 * shifting and masking (basically integer faligndata).
	 *
	 * The careful bit with init stores is that if we store
	 * to any part of the cache line we have to store the whole
	 * cacheline else we can end up with corrupt L2 cache line
	 * contents. Since the loop works on 64-bytes of 64-byte
	 * aligned store data at a time, this is easy to ensure.
	 */
2:
	andcc		%i1, (16 - 1), %i4	! %i4 = src mod 16
	andn		%i2, (64 - 1), %g1	! block copy loop iterator
	be,pt		%XCC, 50f		! src 16-byte aligned: direct loop
	sub		%i2, %g1, %i2		! final sub-block copy bytes

	cmp		%i4, 8
	be,pt		%XCC, 10f		! src exactly 8-byte aligned
	sub		%i1, %i4, %i1		! round src down to 16-byte boundary

	/* Neither 8-byte nor 16-byte aligned, shift and mask. */
	and		%i4, 0x7, GLOBAL_SPARE	! byte misalignment within a word,
	sll		GLOBAL_SPARE, 3, GLOBAL_SPARE	!  converted to a bit count
	mov		64, %i5
	EX_LD(LOAD_TWIN(%i1, %g2, %g3), NG_ret_i2_plus_g1)	! prime %g2/%g3
	sub		%i5, GLOBAL_SPARE, %i5	! complementary shift = 64 - bits
	mov		16, %o4			! fixed twin-load offsets used by
	mov		32, %o5			!  the unrolled loops below
	mov		48, %o7
	mov		64, %i3

	bg,pn		%XCC, 9f		! cc still live from "cmp %i4, 8":
	nop					!  (src mod 16) > 8 -> 9: variant

/* Merge three source words into two output words (an integer
 * faligndata): WORD1 and WORD2 are shifted up by POST_SHIFT and each
 * takes its missing low bits from the top of the following word.
 */
#define MIX_THREE_WORDS(WORD1, WORD2, WORD3, PRE_SHIFT, POST_SHIFT, TMP) \
	sllx		WORD1, POST_SHIFT, WORD1; \
	srlx		WORD2, PRE_SHIFT, TMP; \
	sllx		WORD2, POST_SHIFT, WORD2; \
	or		WORD1, TMP, WORD1; \
	srlx		WORD3, PRE_SHIFT, TMP; \
	or		WORD2, TMP, WORD2;

	/* Unaligned-src block loop for (src mod 16) < 8: twin-load 16
	 * bytes ahead, merge, and emit 64 bytes of init-stores per
	 * iteration.  %g1 = bytes remaining in the block phase.
	 */
8:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g2, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%g3, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g2, %g3, %o2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g2, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%g3, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o2, %o3, %g2, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 8b
	add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	add		%i1, %i4, %i1		! undo the earlier src round-down

	/* As 8: above, but for (src mod 16) > 8: the data starts one
	 * word later, so the word rotation through the registers differs.
	 */
9:	EX_LD(LOAD_TWIN(%i1 + %o4, %o2, %o3), NG_ret_i2_plus_g1)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)
	LOAD(prefetch, %i1 + %i3, #one_read)

	EX_ST(STORE_INIT(%g3, %o0 + 0x00), NG_ret_i2_plus_g1)
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)

	EX_LD(LOAD_TWIN(%i1 + %o5, %g2, %g3), NG_ret_i2_plus_g1_minus_16)
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%g2, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)

	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	MIX_THREE_WORDS(%g3, %o2, %o3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%g3, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)

	EX_LD(LOAD_TWIN(%i1 + %i3, %g2, %g3), NG_ret_i2_plus_g1_minus_48)
	add		%i1, 64, %i1
	MIX_THREE_WORDS(%o3, %g2, %g3, %i5, GLOBAL_SPARE, %o1)

	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%g2, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)

	subcc		%g1, 64, %g1
	bne,pt		%XCC, 9b
	add		%o0, 64, %o0

	ba,pt		%XCC, 60f
	add		%i1, %i4, %i1		! undo the earlier src round-down

10:	/* Destination is 64-byte aligned, source was only 8-byte
	 * aligned but it has been subtracted by 8 and we perform
	 * one twin load ahead, then add 8 back into source when
	 * we finish the loop.
	 */
	EX_LD(LOAD_TWIN(%i1, %o4, %o5), NG_ret_i2_plus_g1)	! %o5 = first word
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o5, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o2, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o4, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o2, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_LD(LOAD_TWIN(%i1 + %o1, %o4, %o5), NG_ret_i2_plus_g1_minus_48)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o3, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o4, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	add	%o0, 64, %o0

	ba,pt	%XCC, 60f
	add	%i1, 0x8, %i1		! re-advance src past the -8 bias

50:	/* Destination is 64-byte aligned, and source is 16-byte
	 * aligned.
	 */
	mov	16, %o7
	mov	32, %g2
	mov	48, %g3
	mov	64, %o1
1:	EX_LD(LOAD_TWIN(%i1 + %g0, %o4, %o5), NG_ret_i2_plus_g1)
	EX_LD(LOAD_TWIN(%i1 + %o7, %o2, %o3), NG_ret_i2_plus_g1)
	LOAD(prefetch, %i1 + %o1, #one_read)
	EX_ST(STORE_INIT(%o4, %o0 + 0x00), NG_ret_i2_plus_g1)	! initializes cache line
	EX_ST(STORE_INIT(%o5, %o0 + 0x08), NG_ret_i2_plus_g1_minus_8)
	EX_LD(LOAD_TWIN(%i1 + %g2, %o4, %o5), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o2, %o0 + 0x10), NG_ret_i2_plus_g1_minus_16)
	EX_ST(STORE_INIT(%o3, %o0 + 0x18), NG_ret_i2_plus_g1_minus_24)
	EX_LD(LOAD_TWIN(%i1 + %g3, %o2, %o3), NG_ret_i2_plus_g1_minus_32)
	add	%i1, 64, %i1
	EX_ST(STORE_INIT(%o4, %o0 + 0x20), NG_ret_i2_plus_g1_minus_32)
	EX_ST(STORE_INIT(%o5, %o0 + 0x28), NG_ret_i2_plus_g1_minus_40)
	EX_ST(STORE_INIT(%o2, %o0 + 0x30), NG_ret_i2_plus_g1_minus_48)
	EX_ST(STORE_INIT(%o3, %o0 + 0x38), NG_ret_i2_plus_g1_minus_56)
	subcc	%g1, 64, %g1
	bne,pt	%XCC, 1b
	add	%o0, 64, %o0
	/* fall through */

60:
	membar		#Sync		! complete init-stores before going on

	/* %i2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	RESTORE_ASI(%i3)
	brz,pt		%i2, 85f	! no tail: return
	sub		%o0, %i1, %i3	! %i3 = dst - src for the 90: loop
	ba,a,pt		%XCC, 90f
	nop

	.align		64
70:	/* 16 <= len < 128 */
	bne,pn		%XCC, 75f	! cc from "andcc %i3, 0x7" at entry:
	sub		%o0, %i1, %i3	!  dst/src not mutually 8-byte aligned

72:	/* dst/src 8-byte co-aligned: copy 16 bytes per iteration. */
	andn		%i2, 0xf, %i4	! %i4 = bytes done in 16-byte chunks
	and		%i2, 0xf, %i2	! %i2 = tail (< 16)
1:	subcc		%i4, 0x10, %i4
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_i4)
	add		%i1, 0x08, %i1
	EX_LD(LOAD(ldx, %i1, %g1), NG_ret_i2_plus_i4)
	sub		%i1, 0x08, %i1
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_i4)
	add		%i1, 0x8, %i1
	EX_ST(STORE(stx, %g1, %i1 + %i3), NG_ret_i2_plus_i4_minus_8)
	bgu,pt		%XCC, 1b
	add		%i1, 0x8, %i1
73:	andcc		%i2, 0x8, %g0	! 8-byte piece of the tail?
	be,pt		%XCC, 1f
	nop
	sub		%i2, 0x8, %i2
	EX_LD(LOAD(ldx, %i1, %o4), NG_ret_i2_plus_8)
	EX_ST(STORE(stx, %o4, %i1 + %i3), NG_ret_i2_plus_8)
	add		%i1, 0x8, %i1
1:	andcc		%i2, 0x4, %g0	! 4-byte piece of the tail?
	be,pt		%XCC, 1f
	nop
	sub		%i2, 0x4, %i2
	EX_LD(LOAD(lduw, %i1, %i5), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %i5, %i1 + %i3), NG_ret_i2_plus_4)
	add		%i1, 0x4, %i1
1:	cmp		%i2, 0
	be,pt		%XCC, 85f	! nothing left: return
	nop
	ba,pt		%xcc, 90f	! 1-3 trailing bytes
	nop

75:	/* Medium copy, dst/src not co-aligned: byte-copy until dst is
	 * 8-byte aligned, then see whether src lined up as well.
	 */
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f	! dst already aligned
	sub		%g0, %g1, %g1	! %g1 = 8 - (dst & 7) bytes to copy
	sub		%i2, %g1, %i2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %i1, %i5), NG_ret_i2_plus_g1_plus_1)
	EX_ST(STORE(stb, %i5, %i1 + %i3), NG_ret_i2_plus_g1_plus_1)
	bgu,pt		%icc, 1b
	add		%i1, 1, %i1

2:	add		%i1, %i3, %o0	! recompute dst cursor
	andcc		%i1, 0x7, %g1	! is src now 8-byte aligned too?
	bne,pt		%icc, 8f
	sll		%g1, 3, %g1	! (src & 7) in bits, for the shifts

	cmp		%i2, 16		! co-aligned after all: reuse the
	bgeu,pt		%icc, 72b	!  16-byte loop, or just its tail
	nop				!  handling when < 16 bytes remain
	ba,a,pt		%xcc, 73b

8:	/* src still unaligned: read aligned 8-byte words and funnel-
	 * shift adjacent pairs into aligned 8-byte stores.
	 */
	mov		64, %i3
	andn		%i1, 0x7, %i1	! round src down to 8-byte boundary
	EX_LD(LOAD(ldx, %i1, %g2), NG_ret_i2)
	sub		%i3, %g1, %i3	! complementary shift = 64 - bits
	andn		%i2, 0x7, %i4	! bytes to move as whole words
	sllx		%g2, %g1, %g2	! pre-shift first word's high part
1:	add		%i1, 0x8, %i1
	EX_LD(LOAD(ldx, %i1, %g3), NG_ret_i2_and_7_plus_i4)
	subcc		%i4, 0x8, %i4
	srlx		%g3, %i3, %i5	! low part from the next word
	or		%i5, %g2, %i5	! combine with held high part
	EX_ST(STORE(stx, %i5, %o0), NG_ret_i2_and_7_plus_i4)
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	sllx		%g3, %g1, %g2	! hold next word's high part

	srl		%g1, 3, %g1	! shift count back to bytes
	andcc		%i2, 0x7, %i2	! 1-7 trailing bytes?
	be,pn		%icc, 85f
	add		%i1, %g1, %i1	! restore true (unaligned) src position
	ba,pt		%xcc, 90f
	sub		%o0, %i1, %i3

	.align		64
80:	/* 0 < len < 16 */
	andcc		%i3, 0x3, %g0	! dst|src|len all multiples of 4?
	bne,pn		%XCC, 90f	! no: byte loop
	sub		%o0, %i1, %i3

1:	/* word-at-a-time copy */
	subcc		%i2, 4, %i2
	EX_LD(LOAD(lduw, %i1, %g1), NG_ret_i2_plus_4)
	EX_ST(STORE(stw, %g1, %i1 + %i3), NG_ret_i2_plus_4)
	bgu,pt		%XCC, 1b
	add		%i1, 4, %i1

85:	ret
	restore		EX_RETVAL(%i0), %g0, %o0	! return dst (delay slot)

	.align		32
90:	/* byte-at-a-time copy; %i3 = dst - src */
	subcc		%i2, 1, %i2
	EX_LD(LOAD(ldub, %i1, %g1), NG_ret_i2_plus_1)
	EX_ST(STORE(stb, %g1, %i1 + %i3), NG_ret_i2_plus_1)
	bgu,pt		%XCC, 90b
	add		%i1, 1, %i1
	ret
	restore		EX_RETVAL(%i0), %g0, %o0

	.size		FUNC_NAME, .-FUNC_NAME
510 | |