/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * SPARC T7/M7 Flow :
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY		(SHORTCOPY=3)
 *	copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *	copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *	copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK	(SHORTCHECK=14)
 *	copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *	copy words; branch to finish_up
 *   if dst now on half word boundary
 *	load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *	load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *	load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *	copy bytes; exit with dst addr
 * } else {				More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:				src/dst aligned on 8 bytes
 *	 copy with ldx/stx in 8-way unrolled loop;
 *	 copy final 0-63 bytes; exit with dst addr
 *     } else {				src/dst aligned; count > MED_MAX
 *	 align dst on 64 byte boundary; for main data movement:
 *	 prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *	 Use BIS (block initializing store) to avoid copying store cache
 *	 lines from memory. But pre-store first element of each cache line
 *	 ST_CHUNK lines in advance of the rest of that cache line. That
 *	 gives time for replacement cache lines to be written back without
 *	 excess STQ and Miss Buffer filling. Repeat until near the end,
 *	 then finish up storing before going to finish_long.
 *     }
 *   } else {				src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *	 move words in 8-way unrolled loop
 *	 move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *	 use alignaddr/faligndata combined with ldd/std in 8-way
 *	 unrolled loop to move data.
 *	 go to unalign_done
 *     else
 *	 setup alignaddr for faligndata instructions
 *	 align dst on 64 byte boundary; prefetch src data to L1 cache
 *	 loadx8, falign, block-store, prefetch loop
 *	 (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *	 move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */
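
/*
 * For reference, the size dispatch at the top of FUNC_NAME below can be
 * summarized by the following rough C sketch.  This is only an
 * illustration of the selection logic, not the implementation itself;
 * copy_bytes/small_copy/medium_copy/big_copy are hypothetical helpers
 * standing in for the assembly paths named in the // comments.
 *
 *	void *m7_memcpy_sketch(void *dst, const void *src, size_t n)
 *	{
 *		if (n == 0)
 *			return dst;				// .Lsmallx
 *		if (n <= 3)
 *			return copy_bytes(dst, src, n);		// .Ltiny_cp
 *		if (n <= 19)
 *			return small_copy(dst, src, n);		// .Lsmall_cp
 *		if (n < SMALL_MAX)
 *			return medium_copy(dst, src, n);	// .Lmedium_cp
 *		return big_copy(dst, src, n);			// .Lmedium and
 *								// large paths
 *	}
 */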

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)	x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)	x
#endif

#ifndef EX_ST
#define EX_ST(x,y)	x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)	type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI	ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI	0x80		/* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)	stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define BLOCK_SIZE	64
#define SHORTCOPY	3
#define SHORTCHECK	14
#define SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 64 */
#define SMALL_MAX	128
#define MED_UMAX	1024	/* max copy for medium un-aligned case */
#define MED_WMAX	1024	/* max copy for medium word-aligned case */
#define MED_MAX		1024	/* max copy for medium longword-aligned case */
#define ST_CHUNK	24	/* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE	24	/* distance for aligned prefetch loop */

	.register	%g2,#scratch

	.section	".text"
	.global		FUNC_NAME
	.type		FUNC_NAME, #function
	.align		16
FUNC_NAME:
	srlx	%o2, 31, %g2
	cmp	%g2, 0
	tne	%xcc, 5
	PREAMBLE
	mov	%o0, %g1	! save %o0
	brz,pn	%o2, .Lsmallx
	cmp	%o2, 3
	ble,pn	%icc, .Ltiny_cp
	cmp	%o2, 19
	ble,pn	%icc, .Lsmall_cp
	or	%o0, %o1, %g2
	cmp	%o2, SMALL_MAX
	bl,pn	%icc, .Lmedium_cp
	nop

.Lmedium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	brz,pt	%o5, .Ldst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1	! %o1 gets the difference
7:				! dst aligning loop
	add	%o1, %o0, %o4
	EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)	! load one byte
	subcc	%o5, 1, %o5
	EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
	bgu,pt	%xcc, 7b
	add	%o0, 1, %o0	! advance dst
	add	%o1, %o0, %o1	! restore %o1
.Ldst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .Lsrc_dst_unaligned_on_8
	nop

.Lsrc_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	set	MED_MAX, %o3
	cmp	%o2, %o3	! limit to store buffer size
	bgu,pn	%xcc, .Llarge_align8_copy
	nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
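
/*
 * Illustrative C sketch of the 8-way unrolled loop that follows (not the
 * implementation itself; it only shows the shape of .Lmedl64, assuming
 * both pointers are 8-byte aligned and u64 is an 8-byte type):
 *
 *	u64 *d = dst;
 *	const u64 *s = src;
 *	while (n >= 64) {				// .Lmedl64
 *		d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
 *		d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
 *		d += 8; s += 8; n -= 64;
 *	}
 *	// .Lmedl63/.Lmedl31/.Lmedl15 then mop up 32-, 16- and 8-byte
 *	// pieces; the last few bytes fall through to .Lmedw7/.Lsmallleft3.
 */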
.Lmedlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pn	%xcc, .Lmedl63		! skip big loop if less than 64 bytes
	nop
.Lmedl64:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)	! load
	subcc	%o2, 64, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56)	! a block of 64
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
	EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
	EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)	! load and store
	EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
	EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)	! a block of 64
	add	%o1, 64, %o1		! increase src ptr by 64
	EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
	add	%o0, 64, %o0		! increase dst ptr by 64
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
	bgu,pt	%xcc, .Lmedl64		! repeat if at least 64 bytes left
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
.Lmedl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl31		! to skip if 31 or fewer bytes left
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)	! load
	sub	%o2, 32, %o2		! decrement length count
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)	! and store
	EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24)	! a block of 32
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
	EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%xcc, .Lmedl15		! skip if 15 or fewer bytes left
	nop				!
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
	sub	%o2, 16, %o2		! decrease count by 16
	EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pt	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)	! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pn	%xcc, .Lmedw7
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)	! and store 8
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0

	.align	16
.Lsrc_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%xcc, .Lunalignsetup	! branch to skip if not word aligned
	nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for .Lmedium
 * to short data moves.
 */
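
/*
 * Each 8-byte step of the loop below pairs two 4-byte loads into a
 * single 8-byte store.  A rough C equivalent of one step (illustration
 * only; SPARC is big-endian, so the first word becomes the high half):
 *
 *	const u32 *s = src;		// 4-byte aligned
 *	u64 *d = dst;			// 8-byte aligned
 *
 *	*d = ((u64)s[0] << 32) | s[1];	// the sllx/or/stx sequence below
 */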
	set	MED_WMAX, %o3
	cmp	%o2, %o3		! limit to store buffer size
	bge,pt	%xcc, .Lunalignrejoin	! otherwise rejoin main loop
	nop

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%xcc, .Lmedw31		! skip big loop if fewer than 32 bytes left
.Lmedw32:
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)	! move a block of 32
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
	subcc	%o2, 32, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
	add	%o1, 32, %o1		! increase src ptr by 32
	EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
	add	%o0, 32, %o0		! increase dst ptr by 32
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
	or	%o4, %o5, %o5
	bgu,pt	%xcc, .Lmedw32		! repeat if at least 32 bytes left
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%xcc, .Lsmallx		! exit if finished
	nop
	cmp	%o2, 16
	blt,pt	%xcc, .Lmedw15
	nop
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! move a block of 16 bytes
	sllx	%o4, 32, %o5
	subcc	%o2, 16, %o2		! decrement length count
	EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increase src ptr by 16
	EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increase dst ptr by 16
	sllx	%o4, 32, %o5
	EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
	or	%o4, %o5, %o5
	EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
	bz,pt	%xcc, .Lsmallx		! exit if finished
	cmp	%o2, 8
	blt,pn	%xcc, .Lmedw7		! skip if 7 or fewer bytes left
	tst	%o2
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)	! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)	! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)	! and store 4 bytes
	bz,pt	%xcc, .Lsmallx		! exit if finished
.Lmedw7:				! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pn	%xcc, .Lsmallleft3	! skip if 3 or fewer bytes left
	nop				!
	EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)	! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.Lsmallleft3
	EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)	! and store 4 bytes
	retl
	mov	EX_RETVAL(%g1), %o0

	.align	16
.Llarge_align8_copy:			! Src and dst share 8 byte alignment
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .Laligned_to_64
	andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .Laligned_to_16
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .Laligned_to_32
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 16, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
	add	%o1, 16, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 16, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .Laligned_to_64
	nop
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
	sub	%o2, 32, %o2
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
	EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
	add	%o1, 32, %o1		! increment src ptr
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
	add	%o0, 32, %o0		! increment dst ptr
	EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_64:
!
! Using block init store (BIS) instructions to avoid fetching cache
! lines from memory. Use ST_CHUNK stores to first element of each cache
! line (similar to prefetching) to avoid overfilling STQ or miss buffers.
! Gives existing cache lines time to be moved out of L1/L2/L3 cache.
! Initial stores using MRU version of BIS to keep cache line in
! cache until we are ready to store final element of cache line.
! Then store last element using the LRU version of BIS.
!
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
!
! We use STORE_MRU_ASI for the first seven stores to each cache line
! followed by STORE_ASI (mark as LRU) for the last store. That
! mixed approach reduces the probability that the cache line is removed
! before we finish setting it, while minimizing the effects on
! other cached values during a large memcpy.
!
! ST_CHUNK batches up initial BIS operations for several cache lines
! so that multiple requests are not blocked by overflowing the
! store miss buffer. Then the matching stores for all those
! BIS operations are executed.
!
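
/*
 * Rough C sketch of the chunked BIS scheme below (illustration only;
 * bis_store_mru()/bis_store_lru() are hypothetical helpers standing in
 * for the STORE_INIT_MRU/STORE_INIT stxa macros, and d/s are u64
 * pointers to the 64-byte-aligned destination and source):
 *
 *	while (blocks >= ST_CHUNK) {
 *		// pass 1 (.Lalign_loop_start): touch the first word of
 *		// ST_CHUNK lines, allocating each line without reading it
 *		for (i = 0; i < ST_CHUNK; i++)
 *			bis_store_mru(&d[8 * i], s[8 * i]);
 *		// pass 2 (.Lalign_loop_rest): fill in the other words,
 *		// marking each line LRU on its final store
 *		for (i = 0; i < ST_CHUNK; i++) {
 *			for (j = 1; j < 7; j++)
 *				bis_store_mru(&d[8 * i + j], s[8 * i + j]);
 *			bis_store_lru(&d[8 * i + 7], s[8 * i + 7]);
 *		}
 *		d += 8 * ST_CHUNK; s += 8 * ST_CHUNK;
 *		blocks -= ST_CHUNK;
 *	}
 *	// any remaining full blocks are copied with plain stx stores
 *	// (.Lalign_loop_fin)
 */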

	sub	%o0, 8, %o0		! adjust %o0 for ASI alignment
.Lalign_loop:
	cmp	%o5, ST_CHUNK*64
	blu,pt	%xcc, .Lalign_loop_fin
	mov	ST_CHUNK, %o3
.Lalign_loop_start:
	prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	bgu	%xcc, .Lalign_loop_start
	add	%o0, 56, %o0

	mov	ST_CHUNK, %o3
	sllx	%o3, 6, %o4		! ST_CHUNK*64
	sub	%o1, %o4, %o1		! reset %o1
	sub	%o0, %o4, %o0		! reset %o0

.Lalign_loop_rest:
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 16, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o3, 1, %o3
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 64, %o1
	add	%o0, 8, %o0
	EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
	add	%o0, 8, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
	sub	%o5, 64, %o5
	bgu	%xcc, .Lalign_loop_rest
	! mark cache line as LRU
	EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

	cmp	%o5, ST_CHUNK*64
	bgu,pt	%xcc, .Lalign_loop_start
	mov	ST_CHUNK, %o3

	cmp	%o5, 0
	beq	.Lalign_done
	nop
.Lalign_loop_fin:
	EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
	EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
	EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
	EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
	add	%o1, 64, %o1
	EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
	add	%o0, 64, %o0
	EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
	bgu	%xcc, .Lalign_loop_fin
	EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
	add	%o0, 8, %o0		! restore %o0 from ASI alignment
	membar	#StoreStore
	sub	%o2, 63, %o2		! adjust length to allow cc test
	ba	.Lmedl63		! in .Lmedl63
	nop

	.align	16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
	mov	%g1, %o3		! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
	VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
	VISEntryHalf
#endif
	mov	%o3, %g1		! restore %g1

	set	MED_UMAX, %o3
	cmp	%o2, %o3		! check for .Lmedium unaligned limit
	bge,pt	%xcc, .Lunalign_large
	prefetch [%o1 + (4 * BLOCK_SIZE)], 20
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! ensure we do not load beyond
	bgt	.Lunalign_adjust	! end of source buffer
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.Lunalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	faligndata %f0, %f2, %f16
	EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
	subcc	%o5, BLOCK_SIZE, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
	faligndata %f2, %f4, %f18
	EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
	EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f4, %f6, %f20
	EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
	EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f6, %f8, %f22
	EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f8, %f10, %f24
	EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
	faligndata %f10, %f12, %f26
	EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
	add	%o4, BLOCK_SIZE, %o4
	EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
	faligndata %f12, %f14, %f28
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
	EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
	faligndata %f14, %f0, %f30
	EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, BLOCK_SIZE, %o0
	bgu,pt	%xcc, .Lunalign_loop
	prefetch [%o4 + (5 * BLOCK_SIZE)], 20
	ba	.Lunalign_done
	nop

.Lunalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%xcc, .Lunalignsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%xcc, .Lunalignbyte	! check for byte alignment
	nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%xcc, .Lunalignhalf
	nop
	! Src is word aligned
.Lunalignword:
	EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)	! and store 4
	subcc	%o3, 8, %o3		! decrease count by 8
	EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)	! load 4
	add	%o0, 8, %o0		! increase dst ptr by 8
	bnz	%xcc, .Lunalignword
	EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
	ba	.Lunalignsrc
	nop

	! Src is half-word aligned
.Lunalignhalf:
	EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)	! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignhalf
	add	%o0, 8, %o0
	ba	.Lunalignsrc
	nop

	! Src is Byte aligned
.Lunalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.Lunalignbyte_loop:
	EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 56, %o5
	EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
	or	%o4, %o5, %o5
	add	%o0, %o1, %o0
	EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
	sub	%o0, %o1, %o0
	subcc	%o3, 8, %o3
	bnz	%xcc, .Lunalignbyte_loop
	add	%o1, 8, %o1
	add	%o0, %o1, %o0		! restore pointer

	! Destination is now block (64 byte) aligned
.Lunalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! ensure we do not load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks

	EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
.Lunalign_sloop:
	EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
	faligndata %f14, %f16, %f0
	EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
	faligndata %f16, %f18, %f2
	EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
	faligndata %f18, %f20, %f4
	EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
	subcc	%o5, 64, %o5
	EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
	faligndata %f20, %f22, %f6
	EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
	EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
	faligndata %f22, %f24, %f8
	EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
	EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f24, %f26, %f10
	EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
	EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f26, %f28, %f12
	EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
	add	%o4, 64, %o4
	EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
	faligndata %f28, %f30, %f14
	EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
	EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
	add	%o0, 64, %o0
	EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
	fsrc2	%f30, %f14
	bgu,pt	%xcc, .Lunalign_sloop
	prefetch [%o4 + (8 * BLOCK_SIZE)], 20

.Lunalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%xcc, .Lunalign_short

	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we do not load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)	! fetch partial word
.Lunalign_by8:
	EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
	add	%o4, 8, %o4
	faligndata %f0, %f2, %f16
	subcc	%o5, 8, %o5
	EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
	fsrc2	%f2, %f0
	bgu,pt	%xcc, .Lunalign_by8
	add	%o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
	VISExitHalfFast
#else
	VISExitHalf
#endif
	ba	.Lsmallrest
	nop

/*
 * This is a special case of nested memcpy. This can happen when the kernel
 * calls unaligned memcpy back to back without saving FP registers. We need
 * traps (context switch) to save/restore FP registers. If the kernel calls
 * memcpy without this trap sequence we will hit FP corruption. Let's use
 * the normal integer load/store method in this case.
 */

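/*
 * Rough C sketch of the integer realignment used by .Lmedium_unaligned_cp
 * below (illustration only; big-endian, dst is already 8-byte aligned and
 * the aligned-src case is handled separately, so off is non-zero here):
 *
 *	unsigned int off = (unsigned long)src & 7;
 *	const u64 *s = (const u64 *)((unsigned long)src & ~7UL);
 *	u64 *d = dst;
 *	unsigned int lsh = off * 8, rsh = 64 - lsh;
 *	u64 hold = *s++ << lsh;
 *
 *	while (n >= 8) {
 *		u64 next = *s++;
 *		*d++ = hold | (next >> rsh);	// srlx/or/stx below
 *		hold = next << lsh;		// sllx in the delay slot
 *		n -= 8;
 *	}
 *	// the remaining 0-7 bytes are copied one at a time
 */
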
#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
	or	%o0, %o1, %g2
#endif
.Lmedium_cp:
	LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
	andcc	%g2, 0x7, %g0
	bne,pn	%xcc, .Lmedium_unaligned_cp
	nop

.Lmedium_noprefetch_cp:
	andncc	%o2, 0x20 - 1, %o5
	be,pn	%xcc, 2f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
	EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
	add	%o1, 0x20, %o1
	subcc	%o5, 0x20, %o5
	EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
	EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
	EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
	bne,pt	%xcc, 1b
	add	%o0, 0x20, %o0
2:	andcc	%o2, 0x18, %o5
	be,pt	%xcc, 3f
	sub	%o2, %o5, %o2
1:	EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	add	%o0, 0x08, %o0
	subcc	%o5, 0x08, %o5
	bne,pt	%xcc, 1b
	EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:	brz,pt	%o2, .Lexit_cp
	cmp	%o2, 0x04
	bl,pn	%xcc, .Ltiny_cp
	nop
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 0x04, %o1
	add	%o0, 0x04, %o0
	subcc	%o2, 0x04, %o2
	bne,pn	%xcc, .Ltiny_cp
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
	ba,a,pt	%xcc, .Lexit_cp

.Lmedium_unaligned_cp:
	/* First get dest 8 byte aligned. */
	sub	%g0, %o0, %o3
	and	%o3, 0x7, %o3
	brz,pt	%o3, 2f
	sub	%o2, %o3, %o2

1:	EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
	add	%o1, 1, %o1
	subcc	%o3, 1, %o3
	add	%o0, 1, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
	and	%o1, 0x7, %o3
	brz,pn	%o3, .Lmedium_noprefetch_cp
	sll	%o3, 3, %o3
	mov	64, %g2
	sub	%g2, %o3, %g2
	andn	%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
	sllx	%o4, %o3, %o4
	andn	%o2, 0x08 - 1, %o5
	sub	%o2, %o5, %o2

1:	EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x08, %o1
	subcc	%o5, 0x08, %o5
	srlx	%g3, %g2, %g7
	or	%g7, %o4, %g7
	EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
	add	%o0, 0x08, %o0
	bne,pt	%xcc, 1b
	sllx	%g3, %o3, %o4
	srl	%o3, 3, %o3
	add	%o1, %o3, %o1
	brz,pn	%o2, .Lexit_cp
	nop
	ba,pt	%xcc, .Lsmall_unaligned_cp

.Ltiny_cp:
	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
	subcc	%o2, 1, %o2
	be,pn	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
	EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
	ba,pt	%xcc, .Lexit_cp
	EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
	andcc	%g2, 0x3, %g0
	bne,pn	%xcc, .Lsmall_unaligned_cp
	andn	%o2, 0x4 - 1, %o5
	sub	%o2, %o5, %o2
1:
	EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
	add	%o1, 0x04, %o1
	subcc	%o5, 0x04, %o5
	add	%o0, 0x04, %o0
	bne,pt	%xcc, 1b
	EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
	brz,pt	%o2, .Lexit_cp
	nop
	ba,a,pt	%xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:	EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
	add	%o1, 1, %o1
	add	%o0, 1, %o0
	subcc	%o2, 1, %o2
	bne,pt	%xcc, 1b
	EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
	ba,a,pt	%xcc, .Lexit_cp

.Lsmallrest:
	tst	%o2
	bz,pt	%xcc, .Lsmallx
	cmp	%o2, 4
	blt,pn	%xcc, .Lsmallleft3
	nop
	sub	%o2, 3, %o2
.Lsmallnotalign4:
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)	! write byte & repeat
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)	! for total of 4
	add	%o1, 4, %o1		! advance SRC by 4
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
	EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
	add	%o0, 4, %o0		! advance DST by 4
	EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
	EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
	bgu,pt	%xcc, .Lsmallnotalign4	! loop til 3 or fewer bytes remain
	EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%xcc, .Lsmallx
.Lsmallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)	! load one byte
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)	! store one byte
	EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)	! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%xcc, .Lsmallx
	EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)	! store second byte
	EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)	! load third byte
	EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)	! store third byte
.Lsmallx:
	retl
	mov	EX_RETVAL(%g1), %o0
.Lsmallfin:
	tst	%o2
	bnz,pn	%xcc, .Lsmallleft3
	nop
	retl
	mov	EX_RETVAL(%g1), %o0	! restore %o0
.Lexit_cp:
	retl
	mov	EX_RETVAL(%g1), %o0
	.size	FUNC_NAME, .-FUNC_NAME
