1 | /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb |
2 | Copyright (C) 2016-2022 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* memmove/memcpy/mempcpy is implemented as: |
20 | 1. Use overlapping load and store to avoid branch. |
21 | 2. Load all sources into registers and store them together to avoid |
22 | possible address overlap between source and destination. |
23 | 3. If size is 8 * VEC_SIZE or less, load all sources into registers |
24 | and store them together. |
25 | 4. If address of destination > address of source, backward copy |
26 | 4 * VEC_SIZE at a time with unaligned load and aligned store. |
27 | Load the first 4 * VEC and last VEC before the loop and store |
28 | them after the loop to support overlapping addresses. |
29 | 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned |
30 | load and aligned store. Load the last 4 * VEC and first VEC |
31 | before the loop and store them after the loop to support |
32 | overlapping addresses. |
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */
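
/* Illustrative C sketch of points 1-3 above (not the actual
   implementation): both the first and the last VEC are loaded before
   any store, so overlapping src/dst cannot be corrupted.  Assumes
   VEC_SIZE == 32 purely for illustration; vec_t and copy_vec_to_2x_vec
   are hypothetical names.

     typedef struct { unsigned char b[32]; } vec_t;   // one VEC

     // Copy n bytes, VEC_SIZE <= n <= 2 * VEC_SIZE, memmove-safe.
     static void
     copy_vec_to_2x_vec (unsigned char *dst, const unsigned char *src,
                         unsigned long n)
     {
       vec_t head, tail;
       __builtin_memcpy (&head, src, sizeof (vec_t));              // first VEC
       __builtin_memcpy (&tail, src + n - sizeof (vec_t),          // last VEC
                         sizeof (vec_t));
       // Both sources are in registers before either store.
       __builtin_memcpy (dst, &head, sizeof (vec_t));
       __builtin_memcpy (dst + n - sizeof (vec_t), &tail, sizeof (vec_t));
     }  */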
48 | |
49 | #include <sysdep.h> |
50 | |
51 | #ifndef MEMCPY_SYMBOL |
52 | # define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
53 | #endif |
54 | |
55 | #ifndef MEMPCPY_SYMBOL |
56 | # define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
57 | #endif |
58 | |
59 | #ifndef MEMMOVE_CHK_SYMBOL |
60 | # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
61 | #endif |
62 | |
63 | #ifndef XMM0 |
64 | # define XMM0 xmm0 |
65 | #endif |
66 | |
67 | #ifndef YMM0 |
68 | # define YMM0 ymm0 |
69 | #endif |
70 | |
71 | #ifndef VZEROUPPER |
72 | # if VEC_SIZE > 16 |
73 | # define VZEROUPPER vzeroupper |
74 | # else |
75 | # define VZEROUPPER |
76 | # endif |
77 | #endif |
78 | |
/* Whether to align before movsb.  Ultimately we want 64-byte alignment,
   and it is not worth loading 4x VEC to get there when VEC_SIZE == 16.  */
81 | #define ALIGN_MOVSB (VEC_SIZE > 16) |
82 | /* Number of bytes to align movsb to. */ |
83 | #define MOVSB_ALIGN_TO 64 |
84 | |
85 | #define SMALL_MOV_SIZE (MOV_SIZE <= 4) |
86 | #define LARGE_MOV_SIZE (MOV_SIZE > 4) |
87 | |
88 | #if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1 |
89 | # error MOV_SIZE Unknown |
90 | #endif |
91 | |
92 | #if LARGE_MOV_SIZE |
93 | # define SMALL_SIZE_OFFSET (4) |
94 | #else |
95 | # define SMALL_SIZE_OFFSET (0) |
96 | #endif |
97 | |
98 | #ifndef PAGE_SIZE |
99 | # define PAGE_SIZE 4096 |
100 | #endif |
101 | |
102 | #if PAGE_SIZE != 4096 |
103 | # error Unsupported PAGE_SIZE |
104 | #endif |
105 | |
106 | #ifndef LOG_PAGE_SIZE |
107 | # define LOG_PAGE_SIZE 12 |
108 | #endif |
109 | |
110 | #if PAGE_SIZE != (1 << LOG_PAGE_SIZE) |
111 | # error Invalid LOG_PAGE_SIZE |
112 | #endif |
113 | |
/* Bytes per page for the large_memcpy inner loop.  */
115 | #if VEC_SIZE == 64 |
116 | # define LARGE_LOAD_SIZE (VEC_SIZE * 2) |
117 | #else |
118 | # define LARGE_LOAD_SIZE (VEC_SIZE * 4) |
119 | #endif |
120 | |
/* Amount to shift __x86_shared_non_temporal_threshold by to get the
   bound for memcpy_large_4x.  This is essentially used to indicate
   that the copy is far beyond the scope of the L3 cache (assuming no
   user-configured x86_non_temporal_threshold) and that a more
   aggressively unrolled loop should be used.  NB: before increasing
   the value also update the initialization of
   x86_non_temporal_threshold.  */
128 | #ifndef LOG_4X_MEMCPY_THRESH |
129 | # define LOG_4X_MEMCPY_THRESH 4 |
130 | #endif |
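
/* For example (threshold value purely illustrative): with the default
   LOG_4X_MEMCPY_THRESH of 4 and a __x86_shared_non_temporal_threshold
   of 1 MiB, the 4x path is only taken once
   size >= (1 MiB << 4) == 16 MiB, unless src and dst page alias.  */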
131 | |
132 | /* Avoid short distance rep movsb only with non-SSE vector. */ |
133 | #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB |
134 | # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) |
135 | #else |
136 | # define AVOID_SHORT_DISTANCE_REP_MOVSB 0 |
137 | #endif |
138 | |
139 | #ifndef PREFETCH |
140 | # define PREFETCH(addr) prefetcht0 addr |
141 | #endif |
142 | |
143 | /* Assume 64-byte prefetch size. */ |
144 | #ifndef PREFETCH_SIZE |
145 | # define PREFETCH_SIZE 64 |
146 | #endif |
147 | |
148 | #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) |
149 | |
150 | #if PREFETCH_SIZE == 64 |
151 | # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE |
152 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
153 | PREFETCH ((offset)base) |
154 | # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE |
155 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
156 | PREFETCH ((offset)base); \ |
157 | PREFETCH ((offset + dir * PREFETCH_SIZE)base) |
158 | # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE |
159 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
160 | PREFETCH ((offset)base); \ |
161 | PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ |
162 | PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ |
163 | PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) |
164 | # else |
165 | # error Unsupported PREFETCHED_LOAD_SIZE! |
166 | # endif |
167 | #else |
168 | # error Unsupported PREFETCH_SIZE! |
169 | #endif |
170 | |
171 | #if LARGE_LOAD_SIZE == (VEC_SIZE * 2) |
172 | # define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ |
173 | VMOVU (offset)base, vec0; \ |
174 | VMOVU ((offset) + VEC_SIZE)base, vec1; |
175 | # define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ |
176 | VMOVNT vec0, (offset)base; \ |
177 | VMOVNT vec1, ((offset) + VEC_SIZE)base; |
178 | #elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) |
179 | # define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
180 | VMOVU (offset)base, vec0; \ |
181 | VMOVU ((offset) + VEC_SIZE)base, vec1; \ |
182 | VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ |
183 | VMOVU ((offset) + VEC_SIZE * 3)base, vec3; |
184 | # define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
185 | VMOVNT vec0, (offset)base; \ |
186 | VMOVNT vec1, ((offset) + VEC_SIZE)base; \ |
187 | VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ |
188 | VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; |
189 | #else |
190 | # error Invalid LARGE_LOAD_SIZE |
191 | #endif |
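
/* An illustrative AVX2-intrinsics equivalent of one LOAD_ONE_SET /
   STORE_ONE_SET pair (a sketch only, assuming VEC_SIZE == 32 so that
   LARGE_LOAD_SIZE == 128; copy_one_set_nt is a hypothetical name):

     #include <immintrin.h>

     static void
     copy_one_set_nt (char *dst, const char *src)
     {
       // Unaligned loads of four consecutive vectors (VMOVU).
       __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (src + 0));
       __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
       __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
       __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
       // Non-temporal stores (VMOVNT); dst must be 32-byte aligned.
       _mm256_stream_si256 ((__m256i *) (dst + 0), v0);
       _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
       _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
       _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
     }  */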
192 | |
193 | #ifndef SECTION |
194 | # error SECTION is not defined! |
195 | #endif |
196 | |
197 | .section SECTION(.text),"ax" ,@progbits |
198 | #if defined SHARED && IS_IN (libc) |
199 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
200 | cmp %RDX_LP, %RCX_LP |
201 | jb HIDDEN_JUMPTARGET (__chk_fail) |
202 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
203 | #endif |
204 | |
205 | ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
206 | mov %RDI_LP, %RAX_LP |
207 | add %RDX_LP, %RAX_LP |
208 | jmp L(start) |
209 | END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
210 | |
211 | #if defined SHARED && IS_IN (libc) |
212 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
213 | cmp %RDX_LP, %RCX_LP |
214 | jb HIDDEN_JUMPTARGET (__chk_fail) |
215 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
216 | #endif |
217 | |
218 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) |
219 | movq %rdi, %rax |
220 | L(start): |
221 | # ifdef __ILP32__ |
222 | /* Clear the upper 32 bits. */ |
223 | movl %edx, %edx |
224 | # endif |
225 | cmp $VEC_SIZE, %RDX_LP |
226 | jb L(less_vec) |
227 | /* Load regardless. */ |
228 | VMOVU (%rsi), %VEC(0) |
229 | cmp $(VEC_SIZE * 2), %RDX_LP |
230 | ja L(more_2x_vec) |
231 | /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ |
232 | VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) |
233 | VMOVU %VEC(0), (%rdi) |
234 | VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) |
235 | #if !(defined USE_MULTIARCH && IS_IN (libc)) |
236 | ZERO_UPPER_VEC_REGISTERS_RETURN |
237 | #else |
238 | VZEROUPPER_RETURN |
239 | #endif |
240 | #if defined USE_MULTIARCH && IS_IN (libc) |
241 | END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
242 | |
243 | # ifdef SHARED |
244 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
245 | cmp %RDX_LP, %RCX_LP |
246 | jb HIDDEN_JUMPTARGET (__chk_fail) |
247 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
248 | # endif |
249 | |
250 | ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
251 | mov %RDI_LP, %RAX_LP |
252 | add %RDX_LP, %RAX_LP |
253 | jmp L(start_erms) |
254 | END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
255 | |
256 | # ifdef SHARED |
257 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
258 | cmp %RDX_LP, %RCX_LP |
259 | jb HIDDEN_JUMPTARGET (__chk_fail) |
260 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
261 | # endif |
262 | |
263 | ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) |
264 | movq %rdi, %rax |
265 | L(start_erms): |
266 | # ifdef __ILP32__ |
267 | /* Clear the upper 32 bits. */ |
268 | movl %edx, %edx |
269 | # endif |
270 | cmp $VEC_SIZE, %RDX_LP |
271 | jb L(less_vec) |
272 | /* Load regardless. */ |
273 | VMOVU (%rsi), %VEC(0) |
274 | cmp $(VEC_SIZE * 2), %RDX_LP |
275 | ja L(movsb_more_2x_vec) |
276 | /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. |
277 | */ |
278 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1) |
279 | VMOVU %VEC(0), (%rdi) |
280 | VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx) |
281 | L(return): |
282 | # if VEC_SIZE > 16 |
283 | ZERO_UPPER_VEC_REGISTERS_RETURN |
284 | # else |
285 | ret |
286 | # endif |
287 | #endif |
288 | |
289 | #if LARGE_MOV_SIZE |
	/* If LARGE_MOV_SIZE, this fits in the aligning bytes between the
	   ENTRY block and L(less_vec).  */
292 | .p2align 4,, 8 |
293 | L(between_4_7): |
294 | /* From 4 to 7. No branch when size == 4. */ |
295 | movl (%rsi), %ecx |
296 | movl (%rsi, %rdx), %esi |
297 | movl %ecx, (%rdi) |
298 | movl %esi, (%rdi, %rdx) |
299 | ret |
300 | #endif |
301 | |
302 | .p2align 4 |
303 | L(less_vec): |
304 | /* Less than 1 VEC. */ |
305 | #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
306 | # error Unsupported VEC_SIZE! |
307 | #endif |
308 | #if VEC_SIZE > 32 |
309 | cmpl $32, %edx |
310 | jae L(between_32_63) |
311 | #endif |
312 | #if VEC_SIZE > 16 |
313 | cmpl $16, %edx |
314 | jae L(between_16_31) |
315 | #endif |
316 | cmpl $8, %edx |
317 | jae L(between_8_15) |
318 | #if SMALL_MOV_SIZE |
319 | cmpl $4, %edx |
320 | #else |
321 | subq $4, %rdx |
322 | #endif |
323 | jae L(between_4_7) |
324 | cmpl $(1 - SMALL_SIZE_OFFSET), %edx |
325 | jl L(copy_0) |
326 | movb (%rsi), %cl |
327 | je L(copy_1) |
328 | movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi |
329 | movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx) |
330 | L(copy_1): |
331 | movb %cl, (%rdi) |
332 | L(copy_0): |
333 | ret |
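
/* Rough C equivalent of the 1-3 byte tail above (a sketch only;
   copy_1_to_3 is a hypothetical name; the n == 0 case simply returns):

     static void
     copy_1_to_3 (unsigned char *dst, const unsigned char *src,
                  unsigned long n)   // 1 <= n <= 3
     {
       unsigned char head = src[0];
       unsigned short tail;
       if (n > 1)
         __builtin_memcpy (&tail, src + n - 2, 2);   // last two bytes
       dst[0] = head;
       if (n > 1)
         __builtin_memcpy (dst + n - 2, &tail, 2);
     }  */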
334 | |
335 | #if SMALL_MOV_SIZE |
336 | .p2align 4,, 8 |
337 | L(between_4_7): |
338 | /* From 4 to 7. No branch when size == 4. */ |
339 | movl -4(%rsi, %rdx), %ecx |
340 | movl (%rsi), %esi |
341 | movl %ecx, -4(%rdi, %rdx) |
342 | movl %esi, (%rdi) |
343 | ret |
344 | #endif |
345 | |
346 | #if VEC_SIZE > 16 |
347 | /* From 16 to 31. No branch when size == 16. */ |
348 | .p2align 4,, 8 |
349 | L(between_16_31): |
350 | vmovdqu (%rsi), %xmm0 |
351 | vmovdqu -16(%rsi, %rdx), %xmm1 |
352 | vmovdqu %xmm0, (%rdi) |
353 | vmovdqu %xmm1, -16(%rdi, %rdx) |
354 | /* No ymm registers have been touched. */ |
355 | ret |
356 | #endif |
357 | |
358 | #if VEC_SIZE > 32 |
359 | .p2align 4,, 10 |
360 | L(between_32_63): |
361 | /* From 32 to 63. No branch when size == 32. */ |
362 | VMOVU (%rsi), %YMM0 |
363 | VMOVU -32(%rsi, %rdx), %YMM1 |
364 | VMOVU %YMM0, (%rdi) |
365 | VMOVU %YMM1, -32(%rdi, %rdx) |
366 | VZEROUPPER_RETURN |
367 | #endif |
368 | |
369 | .p2align 4,, 10 |
370 | L(between_8_15): |
371 | /* From 8 to 15. No branch when size == 8. */ |
372 | movq -8(%rsi, %rdx), %rcx |
373 | movq (%rsi), %rsi |
374 | movq %rsi, (%rdi) |
375 | movq %rcx, -8(%rdi, %rdx) |
376 | ret |
377 | |
378 | .p2align 4,, 10 |
379 | L(last_4x_vec): |
380 | /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ |
381 | |
382 | /* VEC(0) and VEC(1) have already been loaded. */ |
383 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2) |
384 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3) |
385 | VMOVU %VEC(0), (%rdi) |
386 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
387 | VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx) |
388 | VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx) |
389 | VZEROUPPER_RETURN |
390 | |
391 | .p2align 4 |
392 | #if defined USE_MULTIARCH && IS_IN (libc) |
393 | L(movsb_more_2x_vec): |
394 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
395 | ja L(movsb) |
396 | #endif |
397 | L(more_2x_vec): |
398 | /* More than 2 * VEC and there may be overlap between |
399 | destination and source. */ |
400 | cmpq $(VEC_SIZE * 8), %rdx |
401 | ja L(more_8x_vec) |
402 | /* Load VEC(1) regardless. VEC(0) has already been loaded. */ |
403 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
404 | cmpq $(VEC_SIZE * 4), %rdx |
405 | jbe L(last_4x_vec) |
406 | /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ |
407 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
408 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
409 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4) |
410 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5) |
411 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6) |
412 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7) |
413 | VMOVU %VEC(0), (%rdi) |
414 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
415 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
416 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
417 | VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx) |
418 | VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx) |
419 | VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx) |
420 | VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx) |
421 | VZEROUPPER_RETURN |
422 | |
423 | .p2align 4,, 4 |
424 | L(more_8x_vec): |
425 | movq %rdi, %rcx |
426 | subq %rsi, %rcx |
	/* Go to the backward temporal copy if there is overlap, no matter
	   what, as backward REP MOVSB is slow and we don't want to use NT
	   stores when there is overlap.  */
430 | cmpq %rdx, %rcx |
431 | /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
432 | jb L(more_8x_vec_backward_check_nop) |
433 | /* Check if non-temporal move candidate. */ |
434 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
435 | /* Check non-temporal store threshold. */ |
436 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
437 | ja L(large_memcpy_2x) |
438 | #endif |
	/* To reach this point there cannot be overlap with dst > src.  So
	   check for overlap with src > dst, in which case correctness
	   requires a forward copy.  Otherwise decide between backward and
	   forward copy depending on address aliasing.  */
443 | |
444 | /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold |
445 | but less than __x86_shared_non_temporal_threshold. */ |
446 | L(more_8x_vec_check): |
447 | /* rcx contains dst - src. Add back length (rdx). */ |
448 | leaq (%rcx, %rdx), %r8 |
	/* If r8 has a different sign than rcx then there is overlap, so
	   we must do a forward copy.  */
451 | xorq %rcx, %r8 |
452 | /* Isolate just sign bit of r8. */ |
453 | shrq $63, %r8 |
454 | /* Get 4k difference dst - src. */ |
455 | andl $(PAGE_SIZE - 256), %ecx |
	/* If r8 is non-zero we must do a forward copy for correctness.
	   Otherwise, if ecx is non-zero there is 4k false aliasing, so do
	   the forward copy; only when both are zero do the backward
	   copy.  */
459 | addl %r8d, %ecx |
460 | jz L(more_8x_vec_backward) |
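
	/* A C sketch of the decision above, mirroring the register-level
	   logic (illustrative only; go_forward is a hypothetical name):

	     static int
	     go_forward (unsigned long diff, unsigned long len)
	     {
	       // diff is dst - src.  A sign change between diff and
	       // diff + len means src > dst with overlap: a forward copy
	       // is then required for correctness.
	       unsigned long sign_changed = (diff ^ (diff + len)) >> 63;
	       // Bits 8..11 of dst - src; zero means the buffers nearly
	       // 4k alias.
	       unsigned long alias_bits = diff & (PAGE_SIZE - 256);
	       return (sign_changed + alias_bits) != 0;   // 0 => backward
	     }  */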
461 | |
	/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
	   but there is overlap, or when coming from the short-distance
	   movsb check.  */
464 | L(more_8x_vec_forward): |
	/* Load the first VEC and the last 4 * VEC to support overlapping
	   addresses.  */
467 | |
468 | /* First vec was already loaded into VEC(0). */ |
469 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5) |
470 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6) |
	/* Save beginning of dst.  */
472 | movq %rdi, %rcx |
473 | /* Align dst to VEC_SIZE - 1. */ |
474 | orq $(VEC_SIZE - 1), %rdi |
475 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7) |
476 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8) |
477 | |
478 | /* Subtract dst from src. Add back after dst aligned. */ |
479 | subq %rcx, %rsi |
480 | /* Finish aligning dst. */ |
481 | incq %rdi |
482 | /* Restore src adjusted with new value for aligned dst. */ |
483 | addq %rdi, %rsi |
484 | /* Store end of buffer minus tail in rdx. */ |
485 | leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx |
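
	/* Pointer-adjustment sketch for the setup above (illustrative
	   only; dst_start, dst_al and stop are hypothetical names):

	     unsigned char *dst_start = dst;                       // rcx
	     // First aligned store address, strictly above dst_start.
	     unsigned char *dst_al = (unsigned char *)
	       (((unsigned long) dst | (VEC_SIZE - 1)) + 1);
	     src += dst_al - dst_start;       // keep src in step with dst
	     unsigned char *stop = dst_start + len - 4 * VEC_SIZE; // rdx
	     // The loop below runs while stop > dst_al; the head and tail
	     // are covered by VEC(0) and VEC(5)..VEC(8) outside the loop.  */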
486 | |
	/* Don't use multi-byte nop to align.  */
488 | .p2align 4,, 11 |
489 | L(loop_4x_vec_forward): |
	/* Copy 4 * VEC at a time forward.  */
491 | VMOVU (%rsi), %VEC(1) |
492 | VMOVU VEC_SIZE(%rsi), %VEC(2) |
493 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3) |
494 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4) |
495 | subq $-(VEC_SIZE * 4), %rsi |
496 | VMOVA %VEC(1), (%rdi) |
497 | VMOVA %VEC(2), VEC_SIZE(%rdi) |
498 | VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi) |
499 | VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi) |
500 | subq $-(VEC_SIZE * 4), %rdi |
501 | cmpq %rdi, %rdx |
502 | ja L(loop_4x_vec_forward) |
503 | /* Store the last 4 * VEC. */ |
504 | VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx) |
505 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx) |
506 | VMOVU %VEC(7), VEC_SIZE(%rdx) |
507 | VMOVU %VEC(8), (%rdx) |
508 | /* Store the first VEC. */ |
509 | VMOVU %VEC(0), (%rcx) |
510 | /* Keep L(nop_backward) target close to jmp for 2-byte encoding. |
511 | */ |
512 | L(nop_backward): |
513 | VZEROUPPER_RETURN |
514 | |
515 | .p2align 4,, 8 |
516 | L(more_8x_vec_backward_check_nop): |
517 | /* rcx contains dst - src. Test for dst == src to skip all of |
518 | memmove. */ |
519 | testq %rcx, %rcx |
520 | jz L(nop_backward) |
521 | L(more_8x_vec_backward): |
522 | /* Load the first 4 * VEC and last VEC to support overlapping |
523 | addresses. */ |
524 | |
525 | /* First vec was also loaded into VEC(0). */ |
526 | VMOVU VEC_SIZE(%rsi), %VEC(5) |
527 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6) |
	/* Beginning of the region for the 4x backward copy stored in rcx.  */
529 | leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx |
530 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7) |
531 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8) |
532 | /* Subtract dst from src. Add back after dst aligned. */ |
533 | subq %rdi, %rsi |
534 | /* Align dst. */ |
535 | andq $-(VEC_SIZE), %rcx |
536 | /* Restore src. */ |
537 | addq %rcx, %rsi |
538 | |
539 | /* Don't use multi-byte nop to align. */ |
540 | .p2align 4,, 11 |
541 | L(loop_4x_vec_backward): |
	/* Copy 4 * VEC at a time backward.  */
543 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1) |
544 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
545 | VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3) |
546 | VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4) |
547 | addq $(VEC_SIZE * -4), %rsi |
548 | VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx) |
549 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx) |
550 | VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx) |
551 | VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx) |
552 | addq $(VEC_SIZE * -4), %rcx |
553 | cmpq %rcx, %rdi |
554 | jb L(loop_4x_vec_backward) |
555 | /* Store the first 4 * VEC. */ |
556 | VMOVU %VEC(0), (%rdi) |
557 | VMOVU %VEC(5), VEC_SIZE(%rdi) |
558 | VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) |
559 | VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) |
560 | /* Store the last VEC. */ |
561 | VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi) |
562 | VZEROUPPER_RETURN |
563 | |
564 | #if defined USE_MULTIARCH && IS_IN (libc) |
565 | /* L(skip_short_movsb_check) is only used with ERMS. Not for |
566 | FSRM. */ |
567 | .p2align 5,, 16 |
568 | # if ALIGN_MOVSB |
569 | L(skip_short_movsb_check): |
570 | # if MOVSB_ALIGN_TO > VEC_SIZE |
571 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
572 | # endif |
573 | # if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
574 | # error Unsupported MOVSB_ALIGN_TO |
575 | # endif |
	/* If the CPU does not have FSRM there are two options for
	   aligning: align src if dst and src 4k alias, otherwise align
	   dst.  */
578 | testl $(PAGE_SIZE - 512), %ecx |
579 | jnz L(movsb_align_dst) |
	/* Fall through.  dst and src 4k alias.  It's better to align src
	   here because the bottleneck will be loads due to the false
	   dependency on dst.  */
583 | |
584 | /* rcx already has dst - src. */ |
585 | movq %rcx, %r9 |
586 | /* Add src to len. Subtract back after src aligned. -1 because |
587 | src is initially aligned to MOVSB_ALIGN_TO - 1. */ |
588 | leaq -1(%rsi, %rdx), %rcx |
589 | /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ |
590 | orq $(MOVSB_ALIGN_TO - 1), %rsi |
	/* Restore dst and len, adjusted with the new value for the
	   aligned src.  */
593 | leaq 1(%rsi, %r9), %rdi |
594 | subq %rsi, %rcx |
595 | /* Finish aligning src. */ |
596 | incq %rsi |
597 | |
598 | rep movsb |
599 | |
600 | VMOVU %VEC(0), (%r8) |
601 | # if MOVSB_ALIGN_TO > VEC_SIZE |
602 | VMOVU %VEC(1), VEC_SIZE(%r8) |
603 | # endif |
604 | VZEROUPPER_RETURN |
605 | # endif |
606 | |
607 | .p2align 4,, 12 |
608 | L(movsb): |
609 | movq %rdi, %rcx |
610 | subq %rsi, %rcx |
	/* Go to the backward temporal copy if there is overlap, no matter
	   what, as backward REP MOVSB is slow and we don't want to use NT
	   stores when there is overlap.  */
614 | cmpq %rdx, %rcx |
615 | /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
616 | jb L(more_8x_vec_backward_check_nop) |
617 | # if ALIGN_MOVSB |
618 | /* Save dest for storing aligning VECs later. */ |
619 | movq %rdi, %r8 |
620 | # endif |
	/* If above __x86_rep_movsb_stop_threshold this is most likely a
	   candidate for NT moves as well.  */
623 | cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP |
624 | jae L(large_memcpy_2x_check) |
625 | # if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB |
626 | /* Only avoid short movsb if CPU has FSRM. */ |
627 | testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) |
628 | jz L(skip_short_movsb_check) |
629 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
630 | /* Avoid "rep movsb" if RCX, the distance between source and |
631 | destination, is N*4GB + [1..63] with N >= 0. */ |
632 | |
	/* ecx contains dst - src.  The early check for the backward-copy
	   conditions means the only remaining slow-movsb case, src = dst
	   + [0, 63], corresponds to ecx in [-63, 0].  Use an unsigned
	   comparison with -64 to check for that case.  */
637 | cmpl $-64, %ecx |
638 | ja L(more_8x_vec_forward) |
639 | # endif |
640 | # endif |
641 | # if ALIGN_MOVSB |
642 | # if MOVSB_ALIGN_TO > VEC_SIZE |
643 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
644 | # endif |
645 | # if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
646 | # error Unsupported MOVSB_ALIGN_TO |
647 | # endif |
	/* Fall through means the CPU has FSRM.  In that case exclusively
	   align the destination.  */
650 | L(movsb_align_dst): |
651 | /* Subtract dst from src. Add back after dst aligned. */ |
652 | subq %rdi, %rsi |
653 | /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ |
654 | addq $(MOVSB_ALIGN_TO - 1), %rdi |
655 | /* Add dst to len. Subtract back after dst aligned. */ |
656 | leaq (%r8, %rdx), %rcx |
657 | /* Finish aligning dst. */ |
658 | andq $-(MOVSB_ALIGN_TO), %rdi |
659 | /* Restore src and len adjusted with new values for aligned dst. |
660 | */ |
661 | addq %rdi, %rsi |
662 | subq %rdi, %rcx |
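
	/* Sketch of the destination alignment above (illustrative only;
	   dst_start, dst_al and count are hypothetical names):

	     unsigned char *dst_start = dst;                       // r8
	     unsigned char *dst_al = (unsigned char *)
	       (((unsigned long) dst + MOVSB_ALIGN_TO - 1)
	        & ~((unsigned long) MOVSB_ALIGN_TO - 1));
	     src += dst_al - dst_start;       // keep src in step with dst
	     unsigned long count = (dst_start + len) - dst_al;     // rcx
	     // rep movsb copies count bytes to dst_al; the skipped head
	     // bytes are covered by the VEC(0)/VEC(1) stores below.  */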
663 | |
664 | rep movsb |
665 | |
666 | /* Store VECs loaded for aligning. */ |
667 | VMOVU %VEC(0), (%r8) |
668 | # if MOVSB_ALIGN_TO > VEC_SIZE |
669 | VMOVU %VEC(1), VEC_SIZE(%r8) |
670 | # endif |
671 | VZEROUPPER_RETURN |
672 | # else /* !ALIGN_MOVSB. */ |
673 | L(skip_short_movsb_check): |
674 | mov %RDX_LP, %RCX_LP |
675 | rep movsb |
676 | ret |
677 | # endif |
678 | #endif |
679 | |
680 | .p2align 4,, 10 |
681 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
682 | L(large_memcpy_2x_check): |
	/* Entry from L(large_memcpy_2x) has a redundant load of
	   __x86_shared_non_temporal_threshold(%rip).  L(large_memcpy_2x)
	   is only used for the non-erms memmove, which is generally less
	   common.  */
687 | L(large_memcpy_2x): |
688 | mov __x86_shared_non_temporal_threshold(%rip), %R11_LP |
689 | cmp %R11_LP, %RDX_LP |
690 | jb L(more_8x_vec_check) |
	/* To reach this point it is impossible to have both dst > src and
	   overlap.  What remains to check is src > dst with overlap.  rcx
	   already contains dst - src.  Negate rcx to get src - dst.  If
	   length > rcx then there is overlap and forward copy is best.  */
695 | negq %rcx |
696 | cmpq %rcx, %rdx |
697 | ja L(more_8x_vec_forward) |
698 | |
699 | /* Cache align destination. First store the first 64 bytes then |
700 | adjust alignments. */ |
701 | |
702 | /* First vec was also loaded into VEC(0). */ |
703 | # if VEC_SIZE < 64 |
704 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
705 | # if VEC_SIZE < 32 |
706 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
707 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
708 | # endif |
709 | # endif |
710 | VMOVU %VEC(0), (%rdi) |
711 | # if VEC_SIZE < 64 |
712 | VMOVU %VEC(1), VEC_SIZE(%rdi) |
713 | # if VEC_SIZE < 32 |
714 | VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi) |
715 | VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi) |
716 | # endif |
717 | # endif |
718 | |
719 | /* Adjust source, destination, and size. */ |
720 | movq %rdi, %r8 |
721 | andq $63, %r8 |
722 | /* Get the negative of offset for alignment. */ |
723 | subq $64, %r8 |
724 | /* Adjust source. */ |
725 | subq %r8, %rsi |
726 | /* Adjust destination which should be aligned now. */ |
727 | subq %r8, %rdi |
728 | /* Adjust length. */ |
729 | addq %r8, %rdx |
730 | |
	/* Test if the source and destination addresses will alias.  If
	   they do, the larger pipeline in large_memcpy_4x alleviates the
	   performance drop.  */
734 | |
	/* ecx contains -(dst - src).  NOT of ecx yields dst - src - 1,
	   which works for testing aliasing.  */
737 | notl %ecx |
738 | movq %rdx, %r10 |
739 | testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx |
740 | jz L(large_memcpy_4x) |
741 | |
742 | /* r11 has __x86_shared_non_temporal_threshold. Shift it left |
743 | by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold. |
744 | */ |
745 | shlq $LOG_4X_MEMCPY_THRESH, %r11 |
746 | cmp %r11, %rdx |
747 | jae L(large_memcpy_4x) |
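
	/* Decision sketch for the 2-page vs 4-page interleave
	   (illustrative only; use_4x_interleave is a hypothetical name):

	     static int
	     use_4x_interleave (unsigned long diff, unsigned long len,
	                        unsigned long nt_threshold)
	     {
	       // diff is dst - src; the NOT above left diff - 1 in ecx.
	       unsigned int diff_m1 = (unsigned int) diff - 1;
	       int page_alias = (diff_m1 & (PAGE_SIZE - VEC_SIZE * 8)) == 0;
	       int very_large = len >= (nt_threshold << LOG_4X_MEMCPY_THRESH);
	       return page_alias || very_large;   // 1 => 4-page loop
	     }  */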
748 | |
749 | /* edx will store remainder size for copying tail. */ |
750 | andl $(PAGE_SIZE * 2 - 1), %edx |
751 | /* r10 stores outer loop counter. */ |
752 | shrq $(LOG_PAGE_SIZE + 1), %r10 |
753 | /* Copy 4x VEC at a time from 2 pages. */ |
754 | .p2align 4 |
755 | L(loop_large_memcpy_2x_outer): |
756 | /* ecx stores inner loop counter. */ |
757 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
758 | L(loop_large_memcpy_2x_inner): |
759 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
760 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) |
761 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
762 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) |
763 | /* Load vectors from rsi. */ |
764 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
765 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
766 | subq $-LARGE_LOAD_SIZE, %rsi |
767 | /* Non-temporal store vectors to rdi. */ |
768 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
769 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
770 | subq $-LARGE_LOAD_SIZE, %rdi |
771 | decl %ecx |
772 | jnz L(loop_large_memcpy_2x_inner) |
773 | addq $PAGE_SIZE, %rdi |
774 | addq $PAGE_SIZE, %rsi |
775 | decq %r10 |
776 | jne L(loop_large_memcpy_2x_outer) |
777 | sfence |
778 | |
779 | /* Check if only last 4 loads are needed. */ |
780 | cmpl $(VEC_SIZE * 4), %edx |
781 | jbe L(large_memcpy_2x_end) |
782 | |
783 | /* Handle the last 2 * PAGE_SIZE bytes. */ |
784 | L(loop_large_memcpy_2x_tail): |
	/* Copy 4 * VEC at a time forward with aligned stores.  */
786 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
787 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
788 | VMOVU (%rsi), %VEC(0) |
789 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
790 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
791 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
792 | subq $-(VEC_SIZE * 4), %rsi |
793 | addl $-(VEC_SIZE * 4), %edx |
794 | VMOVA %VEC(0), (%rdi) |
795 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
796 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
797 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
798 | subq $-(VEC_SIZE * 4), %rdi |
799 | cmpl $(VEC_SIZE * 4), %edx |
800 | ja L(loop_large_memcpy_2x_tail) |
801 | |
802 | L(large_memcpy_2x_end): |
803 | /* Store the last 4 * VEC. */ |
804 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
805 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
806 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
807 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
808 | |
809 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
810 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
811 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
812 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
813 | VZEROUPPER_RETURN |
814 | |
815 | .p2align 4 |
816 | L(large_memcpy_4x): |
817 | /* edx will store remainder size for copying tail. */ |
818 | andl $(PAGE_SIZE * 4 - 1), %edx |
819 | /* r10 stores outer loop counter. */ |
820 | shrq $(LOG_PAGE_SIZE + 2), %r10 |
821 | /* Copy 4x VEC at a time from 4 pages. */ |
822 | .p2align 4 |
823 | L(loop_large_memcpy_4x_outer): |
824 | /* ecx stores inner loop counter. */ |
825 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
826 | L(loop_large_memcpy_4x_inner): |
	/* Only one prefetch set per page, as doing 4 pages gives more
	   time for the prefetcher to keep up.  */
829 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
830 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
831 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) |
832 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) |
833 | /* Load vectors from rsi. */ |
834 | LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
835 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
836 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
837 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
838 | subq $-LARGE_LOAD_SIZE, %rsi |
839 | /* Non-temporal store vectors to rdi. */ |
840 | STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) |
841 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) |
842 | STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) |
843 | STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) |
844 | subq $-LARGE_LOAD_SIZE, %rdi |
845 | decl %ecx |
846 | jnz L(loop_large_memcpy_4x_inner) |
847 | addq $(PAGE_SIZE * 3), %rdi |
848 | addq $(PAGE_SIZE * 3), %rsi |
849 | decq %r10 |
850 | jne L(loop_large_memcpy_4x_outer) |
851 | sfence |
852 | /* Check if only last 4 loads are needed. */ |
853 | cmpl $(VEC_SIZE * 4), %edx |
854 | jbe L(large_memcpy_4x_end) |
855 | |
856 | /* Handle the last 4 * PAGE_SIZE bytes. */ |
857 | L(loop_large_memcpy_4x_tail): |
	/* Copy 4 * VEC at a time forward with aligned stores.  */
859 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
860 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
861 | VMOVU (%rsi), %VEC(0) |
862 | VMOVU VEC_SIZE(%rsi), %VEC(1) |
863 | VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) |
864 | VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) |
865 | subq $-(VEC_SIZE * 4), %rsi |
866 | addl $-(VEC_SIZE * 4), %edx |
867 | VMOVA %VEC(0), (%rdi) |
868 | VMOVA %VEC(1), VEC_SIZE(%rdi) |
869 | VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) |
870 | VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) |
871 | subq $-(VEC_SIZE * 4), %rdi |
872 | cmpl $(VEC_SIZE * 4), %edx |
873 | ja L(loop_large_memcpy_4x_tail) |
874 | |
875 | L(large_memcpy_4x_end): |
876 | /* Store the last 4 * VEC. */ |
877 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) |
878 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) |
879 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) |
880 | VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) |
881 | |
882 | VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
883 | VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
884 | VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
885 | VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) |
886 | VZEROUPPER_RETURN |
887 | #endif |
888 | END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
889 | |
890 | #if IS_IN (libc) |
891 | # ifdef USE_MULTIARCH |
892 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), |
893 | MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) |
894 | # ifdef SHARED |
895 | strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), |
896 | MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) |
897 | # endif |
898 | # endif |
899 | # ifdef SHARED |
900 | strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), |
901 | MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) |
902 | # endif |
903 | #endif |
904 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), |
905 | MEMCPY_SYMBOL (__memcpy, unaligned)) |
906 | |