1 | /* memmove/memcpy/mempcpy with unaligned load/store and rep movsb |
2 | Copyright (C) 2016-2024 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <https://www.gnu.org/licenses/>. */ |
18 | |
19 | /* memmove/memcpy/mempcpy is implemented as: |
20 | 1. Use overlapping load and store to avoid branch. |
21 | 2. Load all sources into registers and store them together to avoid |
22 | possible address overlap between source and destination. |
23 | 3. If size is 8 * VEC_SIZE or less, load all sources into registers |
24 | and store them together. |
25 | 4. If address of destination > address of source, backward copy |
26 | 4 * VEC_SIZE at a time with unaligned load and aligned store. |
27 | Load the first 4 * VEC and last VEC before the loop and store |
28 | them after the loop to support overlapping addresses. |
29 | 5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned |
30 | load and aligned store. Load the last 4 * VEC and first VEC |
31 | before the loop and store them after the loop to support |
32 | overlapping addresses. |
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold, or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  (A rough C sketch of this dispatch follows
      below.)  */
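
/* For orientation, a rough C sketch of the dispatch described above.
   Illustrative only: the order of checks is simplified, the names below
   stand in for the glibc tunables and CPU feature bits, and some of the
   overlap handling of the real code is elided.

     #include <stdbool.h>
     #include <stddef.h>

     enum strategy { SMALL_2X, UP_TO_8X, REP_MOVSB_COPY, NT_COPY,
                     BACKWARD_4X, FORWARD_4X };

     static enum strategy
     pick_strategy (char *dst, const char *src, size_t n, size_t vec,
                    size_t movsb_lo, size_t movsb_hi, size_t nt_thresh,
                    bool have_erms)
     {
       bool overlap = (dst > src) ? (size_t) (dst - src) < n
                                  : (size_t) (src - dst) < n;
       if (n <= 2 * vec)
         return SMALL_2X;          // overlapping head/tail loads, no loop
       if (n <= 8 * vec)
         return UP_TO_8X;          // load everything, then store everything
       if (have_erms && n >= movsb_lo && n < movsb_hi)
         return REP_MOVSB_COPY;    // overlap handling elided here
       if (n >= nt_thresh && !overlap)
         return NT_COPY;           // 2- or 4-page non-temporal copy
       return (dst > src) ? BACKWARD_4X : FORWARD_4X;
     }
*/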
48 | |
49 | #include <sysdep.h> |
50 | |
51 | #ifndef MEMCPY_SYMBOL |
52 | # define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
53 | #endif |
54 | |
55 | #ifndef MEMPCPY_SYMBOL |
56 | # define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
57 | #endif |
58 | |
59 | #ifndef MEMMOVE_CHK_SYMBOL |
60 | # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) |
61 | #endif |
62 | |
63 | #ifndef VZEROUPPER |
64 | # if VEC_SIZE > 16 |
65 | # define VZEROUPPER vzeroupper |
66 | # else |
67 | # define VZEROUPPER |
68 | # endif |
69 | #endif |
70 | |
/* Whether to align before movsb.  Ultimately we want 64-byte
   alignment, and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
73 | #define ALIGN_MOVSB (VEC_SIZE > 16) |
74 | /* Number of bytes to align movsb to. */ |
75 | #define MOVSB_ALIGN_TO 64 |
76 | |
77 | #define SMALL_MOV_SIZE (MOV_SIZE <= 4) |
78 | #define LARGE_MOV_SIZE (MOV_SIZE > 4) |
79 | |
80 | #if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1 |
81 | # error MOV_SIZE Unknown |
82 | #endif |
83 | |
84 | #if LARGE_MOV_SIZE |
85 | # define SMALL_SIZE_OFFSET (4) |
86 | #else |
87 | # define SMALL_SIZE_OFFSET (0) |
88 | #endif |
89 | |
90 | #ifndef PAGE_SIZE |
91 | # define PAGE_SIZE 4096 |
92 | #endif |
93 | |
94 | #if PAGE_SIZE != 4096 |
95 | # error Unsupported PAGE_SIZE |
96 | #endif |
97 | |
98 | #ifndef LOG_PAGE_SIZE |
99 | # define LOG_PAGE_SIZE 12 |
100 | #endif |
101 | |
102 | #if PAGE_SIZE != (1 << LOG_PAGE_SIZE) |
103 | # error Invalid LOG_PAGE_SIZE |
104 | #endif |
105 | |
/* Bytes per page for the large_memcpy inner loop.  */
107 | #if VEC_SIZE == 64 |
108 | # define LARGE_LOAD_SIZE (VEC_SIZE * 2) |
109 | #else |
110 | # define LARGE_LOAD_SIZE (VEC_SIZE * 4) |
111 | #endif |
112 | |
/* Amount to shift __x86_shared_non_temporal_threshold by to get the
   bound for memcpy_large_4x.  This is essentially used to indicate
   that the copy is far beyond the scope of L3 (assuming no
   user-configured x86_non_temporal_threshold) and that a more
   aggressively unrolled loop should be used.  NB: before increasing
   the value also update the initialization of
   x86_non_temporal_threshold.  */
120 | #ifndef LOG_4X_MEMCPY_THRESH |
121 | # define LOG_4X_MEMCPY_THRESH 4 |
122 | #endif |
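
/* For orientation (sketch, not exact code): with the default
   LOG_4X_MEMCPY_THRESH of 4, the 4-page path is used roughly when

     size >= (__x86_shared_non_temporal_threshold << LOG_4X_MEMCPY_THRESH)

   or when dst and src page-alias (see L(large_memcpy_2x) below).  */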
123 | |
124 | /* Avoid short distance rep movsb only with non-SSE vector. */ |
125 | #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB |
126 | # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) |
127 | #else |
128 | # define AVOID_SHORT_DISTANCE_REP_MOVSB 0 |
129 | #endif |
130 | |
131 | #ifndef PREFETCH |
132 | # define PREFETCH(addr) prefetcht0 addr |
133 | #endif |
134 | |
135 | /* Assume 64-byte prefetch size. */ |
136 | #ifndef PREFETCH_SIZE |
137 | # define PREFETCH_SIZE 64 |
138 | #endif |
139 | |
140 | #define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4) |
141 | |
142 | #if PREFETCH_SIZE == 64 |
143 | # if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE |
144 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
145 | PREFETCH ((offset)base) |
146 | # elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE |
147 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
148 | PREFETCH ((offset)base); \ |
149 | PREFETCH ((offset + dir * PREFETCH_SIZE)base) |
150 | # elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE |
151 | # define PREFETCH_ONE_SET(dir, base, offset) \ |
152 | PREFETCH ((offset)base); \ |
153 | PREFETCH ((offset + dir * PREFETCH_SIZE)base); \ |
154 | PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \ |
155 | PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base) |
156 | # else |
157 | # error Unsupported PREFETCHED_LOAD_SIZE! |
158 | # endif |
159 | #else |
160 | # error Unsupported PREFETCH_SIZE! |
161 | #endif |
162 | |
163 | #if LARGE_LOAD_SIZE == (VEC_SIZE * 2) |
164 | # define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ |
165 | VMOVU (offset)base, vec0; \ |
166 | VMOVU ((offset) + VEC_SIZE)base, vec1; |
167 | # define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ |
168 | VMOVNT vec0, (offset)base; \ |
169 | VMOVNT vec1, ((offset) + VEC_SIZE)base; |
170 | #elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) |
171 | # define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
172 | VMOVU (offset)base, vec0; \ |
173 | VMOVU ((offset) + VEC_SIZE)base, vec1; \ |
174 | VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ |
175 | VMOVU ((offset) + VEC_SIZE * 3)base, vec3; |
176 | # define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ |
177 | VMOVNT vec0, (offset)base; \ |
178 | VMOVNT vec1, ((offset) + VEC_SIZE)base; \ |
179 | VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ |
180 | VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; |
181 | #else |
182 | # error Invalid LARGE_LOAD_SIZE |
183 | #endif |
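
/* A possible C intrinsics rendering of one LOAD_ONE_SET/STORE_ONE_SET
   pair for VEC_SIZE == 32 (LARGE_LOAD_SIZE == 128).  Illustrative only;
   the helper name is made up, and dst must be 32-byte aligned here (the
   large_memcpy paths cache-line align it first).

     #include <immintrin.h>

     static inline void
     copy_one_set_nt (char *dst, const char *src)
     {
       __m256i v0 = _mm256_loadu_si256 ((const __m256i *) (src + 0));
       __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
       __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
       __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
       _mm256_stream_si256 ((__m256i *) (dst + 0), v0);   // VMOVNT
       _mm256_stream_si256 ((__m256i *) (dst + 32), v1);
       _mm256_stream_si256 ((__m256i *) (dst + 64), v2);
       _mm256_stream_si256 ((__m256i *) (dst + 96), v3);
     }
*/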
184 | |
185 | #ifndef SECTION |
186 | # error SECTION is not defined! |
187 | #endif |
188 | |
189 | .section SECTION(.text),"ax" ,@progbits |
190 | #if defined SHARED && IS_IN (libc) |
191 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
192 | cmp %RDX_LP, %RCX_LP |
193 | jb HIDDEN_JUMPTARGET (__chk_fail) |
194 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) |
195 | #endif |
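
/* Semantics of the _chk entry above as a C sketch (the extra bound,
   dstlen, arrives in %rcx; __chk_fail is the glibc fortify failure
   hook; the function name is made up):

     #include <stddef.h>
     #include <string.h>

     extern void __chk_fail (void) __attribute__ ((noreturn));

     static void *
     mempcpy_chk_sketch (void *dst, const void *src, size_t n, size_t dstlen)
     {
       if (dstlen < n)
         __chk_fail ();             // cmp %RDX_LP, %RCX_LP; jb __chk_fail
       memcpy (dst, src, n);
       return (char *) dst + n;     // mempcpy returns dst + n
     }
*/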
196 | |
197 | ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
198 | mov %RDI_LP, %RAX_LP |
199 | add %RDX_LP, %RAX_LP |
200 | jmp L(start) |
201 | END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) |
202 | |
203 | #if defined SHARED && IS_IN (libc) |
204 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
205 | cmp %RDX_LP, %RCX_LP |
206 | jb HIDDEN_JUMPTARGET (__chk_fail) |
207 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) |
208 | #endif |
209 | |
210 | ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) |
211 | movq %rdi, %rax |
212 | L(start): |
213 | # ifdef __ILP32__ |
214 | /* Clear the upper 32 bits. */ |
215 | movl %edx, %edx |
216 | # endif |
217 | cmp $VEC_SIZE, %RDX_LP |
218 | jb L(less_vec) |
219 | /* Load regardless. */ |
220 | VMOVU (%rsi), %VMM(0) |
221 | cmp $(VEC_SIZE * 2), %RDX_LP |
222 | ja L(more_2x_vec) |
223 | /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ |
224 | VMOVU -VEC_SIZE(%rsi,%rdx), %VMM(1) |
225 | VMOVU %VMM(0), (%rdi) |
226 | VMOVU %VMM(1), -VEC_SIZE(%rdi,%rdx) |
227 | #if !(defined USE_MULTIARCH && IS_IN (libc)) |
228 | ZERO_UPPER_VEC_REGISTERS_RETURN |
229 | #else |
230 | VZEROUPPER_RETURN |
231 | #endif |
232 | #if defined USE_MULTIARCH && IS_IN (libc) |
233 | END (MEMMOVE_SYMBOL (__memmove, unaligned)) |
234 | |
235 | # ifdef SHARED |
236 | ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
237 | cmp %RDX_LP, %RCX_LP |
238 | jb HIDDEN_JUMPTARGET (__chk_fail) |
239 | END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) |
240 | # endif |
241 | |
242 | ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
243 | mov %RDI_LP, %RAX_LP |
244 | add %RDX_LP, %RAX_LP |
245 | jmp L(start_erms) |
246 | END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) |
247 | |
248 | # ifdef SHARED |
249 | ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
250 | cmp %RDX_LP, %RCX_LP |
251 | jb HIDDEN_JUMPTARGET (__chk_fail) |
252 | END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) |
253 | # endif |
254 | |
255 | ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6) |
256 | movq %rdi, %rax |
257 | L(start_erms): |
258 | # ifdef __ILP32__ |
259 | /* Clear the upper 32 bits. */ |
260 | movl %edx, %edx |
261 | # endif |
262 | cmp $VEC_SIZE, %RDX_LP |
263 | jb L(less_vec) |
264 | /* Load regardless. */ |
265 | VMOVU (%rsi), %VMM(0) |
266 | cmp $(VEC_SIZE * 2), %RDX_LP |
267 | ja L(movsb_more_2x_vec) |
268 | /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. |
269 | */ |
270 | VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(1) |
271 | VMOVU %VMM(0), (%rdi) |
272 | VMOVU %VMM(1), -VEC_SIZE(%rdi, %rdx) |
273 | L(return_vzeroupper): |
274 | # if VEC_SIZE > 16 |
275 | ZERO_UPPER_VEC_REGISTERS_RETURN |
276 | # else |
277 | ret |
278 | # endif |
279 | #endif |
280 | |
281 | #if LARGE_MOV_SIZE |
282 | /* If LARGE_MOV_SIZE this fits in the aligning bytes between the |
283 | ENTRY block and L(less_vec). */ |
284 | .p2align 4,, 8 |
285 | L(between_4_7): |
286 | /* From 4 to 7. No branch when size == 4. */ |
287 | movl (%rsi), %ecx |
288 | movl (%rsi, %rdx), %esi |
289 | movl %ecx, (%rdi) |
290 | movl %esi, (%rdi, %rdx) |
291 | ret |
292 | #endif |
293 | |
294 | .p2align 4 |
295 | L(less_vec): |
296 | /* Less than 1 VEC. */ |
297 | #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 |
298 | # error Unsupported VEC_SIZE! |
299 | #endif |
300 | #if VEC_SIZE > 32 |
301 | cmpl $32, %edx |
302 | jae L(between_32_63) |
303 | #endif |
304 | #if VEC_SIZE > 16 |
305 | cmpl $16, %edx |
306 | jae L(between_16_31) |
307 | #endif |
308 | cmpl $8, %edx |
309 | jae L(between_8_15) |
310 | #if SMALL_MOV_SIZE |
311 | cmpl $4, %edx |
312 | #else |
313 | subq $4, %rdx |
314 | #endif |
315 | jae L(between_4_7) |
316 | cmpl $(1 - SMALL_SIZE_OFFSET), %edx |
317 | jl L(copy_0) |
318 | movb (%rsi), %cl |
319 | je L(copy_1) |
320 | movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi |
321 | movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx) |
322 | L(copy_1): |
323 | movb %cl, (%rdi) |
324 | L(copy_0): |
325 | ret |
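
/* The 0-3 byte tail above in C (sketch; LARGE_MOV_SIZE reaches it with
   rdx already biased by -4, which SMALL_SIZE_OFFSET undoes):

     #include <stddef.h>
     #include <string.h>

     static void
     copy_0_to_3 (char *dst, const char *src, size_t n)
     {
       if (n == 0)
         return;
       unsigned char head = (unsigned char) src[0];   // movb (%rsi), %cl
       if (n > 1)
         {
           unsigned short tail;                       // movzwl/movw pair
           memcpy (&tail, src + n - 2, 2);
           memcpy (dst + n - 2, &tail, 2);
         }
       dst[0] = (char) head;                          // movb %cl, (%rdi)
     }
*/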
326 | |
327 | #if SMALL_MOV_SIZE |
328 | .p2align 4,, 8 |
329 | L(between_4_7): |
330 | /* From 4 to 7. No branch when size == 4. */ |
331 | movl -4(%rsi, %rdx), %ecx |
332 | movl (%rsi), %esi |
333 | movl %ecx, -4(%rdi, %rdx) |
334 | movl %esi, (%rdi) |
335 | ret |
336 | #endif |
337 | |
338 | #if VEC_SIZE > 16 |
339 | /* From 16 to 31. No branch when size == 16. */ |
340 | .p2align 4,, 8 |
341 | L(between_16_31): |
342 | vmovdqu (%rsi), %xmm0 |
343 | vmovdqu -16(%rsi, %rdx), %xmm1 |
344 | vmovdqu %xmm0, (%rdi) |
345 | vmovdqu %xmm1, -16(%rdi, %rdx) |
346 | /* No ymm registers have been touched. */ |
347 | ret |
348 | #endif |
349 | |
350 | #if VEC_SIZE > 32 |
351 | .p2align 4,, 10 |
352 | L(between_32_63): |
353 | /* From 32 to 63. No branch when size == 32. */ |
354 | VMOVU (%rsi), %VMM_256(0) |
355 | VMOVU -32(%rsi, %rdx), %VMM_256(1) |
356 | VMOVU %VMM_256(0), (%rdi) |
357 | VMOVU %VMM_256(1), -32(%rdi, %rdx) |
358 | VZEROUPPER_RETURN |
359 | #endif |
360 | |
361 | .p2align 4,, 10 |
362 | L(between_8_15): |
363 | /* From 8 to 15. No branch when size == 8. */ |
364 | movq -8(%rsi, %rdx), %rcx |
365 | movq (%rsi), %rsi |
366 | movq %rsi, (%rdi) |
367 | movq %rcx, -8(%rdi, %rdx) |
368 | ret |
369 | |
370 | .p2align 4,, 10 |
371 | L(last_4x_vec): |
372 | /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ |
373 | |
374 | /* VEC(0) and VEC(1) have already been loaded. */ |
375 | VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(2) |
376 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(3) |
377 | VMOVU %VMM(0), (%rdi) |
378 | VMOVU %VMM(1), VEC_SIZE(%rdi) |
379 | VMOVU %VMM(2), -VEC_SIZE(%rdi, %rdx) |
380 | VMOVU %VMM(3), -(VEC_SIZE * 2)(%rdi, %rdx) |
381 | VZEROUPPER_RETURN |
382 | |
383 | .p2align 4 |
384 | #if defined USE_MULTIARCH && IS_IN (libc) |
385 | L(movsb_more_2x_vec): |
386 | cmp __x86_rep_movsb_threshold(%rip), %RDX_LP |
387 | ja L(movsb) |
388 | #endif |
389 | L(more_2x_vec): |
390 | /* More than 2 * VEC and there may be overlap between |
391 | destination and source. */ |
392 | cmpq $(VEC_SIZE * 8), %rdx |
393 | ja L(more_8x_vec) |
394 | /* Load VEC(1) regardless. VEC(0) has already been loaded. */ |
395 | VMOVU VEC_SIZE(%rsi), %VMM(1) |
396 | cmpq $(VEC_SIZE * 4), %rdx |
397 | jbe L(last_4x_vec) |
398 | /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ |
399 | VMOVU (VEC_SIZE * 2)(%rsi), %VMM(2) |
400 | VMOVU (VEC_SIZE * 3)(%rsi), %VMM(3) |
401 | VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(4) |
402 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(5) |
403 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(6) |
404 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(7) |
405 | VMOVU %VMM(0), (%rdi) |
406 | VMOVU %VMM(1), VEC_SIZE(%rdi) |
407 | VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) |
408 | VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) |
409 | VMOVU %VMM(4), -VEC_SIZE(%rdi, %rdx) |
410 | VMOVU %VMM(5), -(VEC_SIZE * 2)(%rdi, %rdx) |
411 | VMOVU %VMM(6), -(VEC_SIZE * 3)(%rdi, %rdx) |
412 | VMOVU %VMM(7), -(VEC_SIZE * 4)(%rdi, %rdx) |
413 | VZEROUPPER_RETURN |
414 | |
415 | .p2align 4,, 4 |
416 | L(more_8x_vec): |
417 | movq %rdi, %rcx |
418 | subq %rsi, %rcx |
/* Go to the backward temporal copy whenever there is overlap, no
   matter what, as backward REP MOVSB is slow and we don't want to use
   NT stores if there is overlap.  */
422 | cmpq %rdx, %rcx |
423 | /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
424 | jb L(more_8x_vec_backward_check_nop) |
425 | /* Check if non-temporal move candidate. */ |
426 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
427 | /* Check non-temporal store threshold. */ |
428 | cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP |
429 | ja L(large_memcpy_2x) |
430 | #endif |
/* To reach this point there cannot be overlap with dst > src.  So
   check for overlap with src > dst, in which case correctness requires
   a forward copy.  Otherwise decide between backward/forward copy
   depending on address aliasing.  */
435 | |
436 | /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold |
437 | but less than __x86_shared_non_temporal_threshold. */ |
438 | L(more_8x_vec_check): |
439 | /* rcx contains dst - src. Add back length (rdx). */ |
440 | leaq (%rcx, %rdx), %r8 |
441 | /* If r8 has different sign than rcx then there is overlap so we |
442 | must do forward copy. */ |
443 | xorq %rcx, %r8 |
444 | /* Isolate just sign bit of r8. */ |
445 | shrq $63, %r8 |
446 | /* Get 4k difference dst - src. */ |
447 | andl $(PAGE_SIZE - 256), %ecx |
/* If r8 is non-zero we must do a forward copy for correctness.
   Otherwise the choice is based on 4k aliasing: if ecx is zero, dst
   and src 4k alias and we do the backward copy (the jz below); if ecx
   is non-zero there is no 4k aliasing and we fall through to the
   forward copy.  */
451 | addl %r8d, %ecx |
452 | jz L(more_8x_vec_backward) |
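
/* The decision above as a C sketch (illustrative; the helper name is
   made up and PAGE_SIZE is 4096):

     #include <stddef.h>
     #include <stdint.h>

     static int
     use_backward_copy (const char *dst, const char *src, size_t len)
     {
       long long diff = (long long) ((intptr_t) dst - (intptr_t) src); // rcx
       long long end = diff + (long long) len;          // rcx + rdx
       int forced_forward = (diff ^ end) < 0;           // sign change => overlap
       unsigned page_bits = (unsigned) diff & (4096 - 256);
       // Backward only when no overlap forces a forward copy and
       // dst/src 4k-alias (page_bits == 0).
       return !forced_forward && page_bits == 0;
     }
*/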
453 | |
/* Entry if rdx is greater than __x86_shared_non_temporal_threshold
   but there is overlap, or from the short-distance movsb check.  */
456 | L(more_8x_vec_forward): |
/* Load the first VEC and the last 4 * VEC to support overlapping
   addresses.  */
459 | |
460 | /* First vec was already loaded into VEC(0). */ |
461 | VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(5) |
462 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(6) |
463 | /* Save beginning of dst. */ |
464 | movq %rdi, %rcx |
465 | /* Align dst to VEC_SIZE - 1. */ |
466 | orq $(VEC_SIZE - 1), %rdi |
467 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(7) |
468 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(8) |
469 | |
470 | /* Subtract dst from src. Add back after dst aligned. */ |
471 | subq %rcx, %rsi |
472 | /* Finish aligning dst. */ |
473 | incq %rdi |
474 | /* Restore src adjusted with new value for aligned dst. */ |
475 | addq %rdi, %rsi |
476 | /* Store end of buffer minus tail in rdx. */ |
477 | leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx |
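
/* Pointer bookkeeping of the forward loop below as a C sketch
   (V = VEC_SIZE <= 64, n > 8 * V).  Loads of each 4*V block complete
   before its stores, and the first VEC plus the last 4 VECs are
   captured up front, which is what makes the overlapping src > dst
   case safe.  Illustrative only.

     #include <stdint.h>
     #include <string.h>

     static void
     forward_4x_sketch (char *dst, const char *src, size_t n, size_t V)
     {
       unsigned char head[64], tail[4 * 64], blk[4 * 64];
       memcpy (head, src, V);                     // VMM(0)
       memcpy (tail, src + n - 4 * V, 4 * V);     // VMM(5)..VMM(8)
       char *adst = (char *) ((((uintptr_t) dst) | (V - 1)) + 1);
       const char *asrc = src + (adst - dst);
       char *stop = dst + n - 4 * V;              // %rdx after the leaq
       do
         {
           memcpy (blk, asrc, 4 * V);             // 4 unaligned loads
           memcpy (adst, blk, 4 * V);             // 4 aligned stores
           asrc += 4 * V;
           adst += 4 * V;
         }
       while (adst < stop);
       memcpy (stop, tail, 4 * V);                // last 4 VECs
       memcpy (dst, head, V);                     // first VEC at saved %rcx
     }
*/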
478 | |
/* Don't use multi-byte nop to align.  */
480 | .p2align 4,, 11 |
481 | L(loop_4x_vec_forward): |
/* Copy 4 * VEC at a time forward.  */
483 | VMOVU (%rsi), %VMM(1) |
484 | VMOVU VEC_SIZE(%rsi), %VMM(2) |
485 | VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3) |
486 | VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4) |
487 | subq $-(VEC_SIZE * 4), %rsi |
488 | VMOVA %VMM(1), (%rdi) |
489 | VMOVA %VMM(2), VEC_SIZE(%rdi) |
490 | VMOVA %VMM(3), (VEC_SIZE * 2)(%rdi) |
491 | VMOVA %VMM(4), (VEC_SIZE * 3)(%rdi) |
492 | subq $-(VEC_SIZE * 4), %rdi |
493 | cmpq %rdi, %rdx |
494 | ja L(loop_4x_vec_forward) |
495 | /* Store the last 4 * VEC. */ |
496 | VMOVU %VMM(5), (VEC_SIZE * 3)(%rdx) |
497 | VMOVU %VMM(6), (VEC_SIZE * 2)(%rdx) |
498 | VMOVU %VMM(7), VEC_SIZE(%rdx) |
499 | VMOVU %VMM(8), (%rdx) |
500 | /* Store the first VEC. */ |
501 | VMOVU %VMM(0), (%rcx) |
502 | /* Keep L(nop_backward) target close to jmp for 2-byte encoding. |
503 | */ |
504 | L(nop_backward): |
505 | VZEROUPPER_RETURN |
506 | |
507 | .p2align 4,, 8 |
508 | L(more_8x_vec_backward_check_nop): |
509 | /* rcx contains dst - src. Test for dst == src to skip all of |
510 | memmove. */ |
511 | testq %rcx, %rcx |
512 | jz L(nop_backward) |
513 | L(more_8x_vec_backward): |
514 | /* Load the first 4 * VEC and last VEC to support overlapping |
515 | addresses. */ |
516 | |
517 | /* First vec was also loaded into VEC(0). */ |
518 | VMOVU VEC_SIZE(%rsi), %VMM(5) |
519 | VMOVU (VEC_SIZE * 2)(%rsi), %VMM(6) |
520 | /* Beginning of region for 4x backward copy stored in rcx. */ |
521 | leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx |
522 | VMOVU (VEC_SIZE * 3)(%rsi), %VMM(7) |
523 | VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(8) |
524 | /* Subtract dst from src. Add back after dst aligned. */ |
525 | subq %rdi, %rsi |
526 | /* Align dst. */ |
527 | andq $-(VEC_SIZE), %rcx |
528 | /* Restore src. */ |
529 | addq %rcx, %rsi |
530 | |
531 | /* Don't use multi-byte nop to align. */ |
532 | .p2align 4,, 11 |
533 | L(loop_4x_vec_backward): |
/* Copy 4 * VEC at a time backward.  */
535 | VMOVU (VEC_SIZE * 3)(%rsi), %VMM(1) |
536 | VMOVU (VEC_SIZE * 2)(%rsi), %VMM(2) |
537 | VMOVU (VEC_SIZE * 1)(%rsi), %VMM(3) |
538 | VMOVU (VEC_SIZE * 0)(%rsi), %VMM(4) |
539 | addq $(VEC_SIZE * -4), %rsi |
540 | VMOVA %VMM(1), (VEC_SIZE * 3)(%rcx) |
541 | VMOVA %VMM(2), (VEC_SIZE * 2)(%rcx) |
542 | VMOVA %VMM(3), (VEC_SIZE * 1)(%rcx) |
543 | VMOVA %VMM(4), (VEC_SIZE * 0)(%rcx) |
544 | addq $(VEC_SIZE * -4), %rcx |
545 | cmpq %rcx, %rdi |
546 | jb L(loop_4x_vec_backward) |
547 | /* Store the first 4 * VEC. */ |
548 | VMOVU %VMM(0), (%rdi) |
549 | VMOVU %VMM(5), VEC_SIZE(%rdi) |
550 | VMOVU %VMM(6), (VEC_SIZE * 2)(%rdi) |
551 | VMOVU %VMM(7), (VEC_SIZE * 3)(%rdi) |
552 | /* Store the last VEC. */ |
553 | VMOVU %VMM(8), -VEC_SIZE(%rdx, %rdi) |
554 | VZEROUPPER_RETURN |
555 | |
556 | #if defined USE_MULTIARCH && IS_IN (libc) |
557 | /* L(skip_short_movsb_check) is only used with ERMS. Not for |
558 | FSRM. */ |
559 | .p2align 5,, 16 |
560 | # if ALIGN_MOVSB |
561 | L(skip_short_movsb_check): |
562 | # if MOVSB_ALIGN_TO > VEC_SIZE |
563 | VMOVU VEC_SIZE(%rsi), %VMM(1) |
564 | # endif |
565 | # if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
566 | # error Unsupported MOVSB_ALIGN_TO |
567 | # endif |
/* If the CPU does not have FSRM there are two options for aligning:
   align src if dst and src 4k alias, otherwise align dst.  */
570 | testl $(PAGE_SIZE - 512), %ecx |
571 | jnz L(movsb_align_dst) |
/* Fall through.  dst and src 4k alias.  It's better to align src here
   because the bottleneck will be loads due to the false dependency on
   dst.  */
575 | |
576 | /* rcx already has dst - src. */ |
577 | movq %rcx, %r9 |
578 | /* Add src to len. Subtract back after src aligned. -1 because |
579 | src is initially aligned to MOVSB_ALIGN_TO - 1. */ |
580 | leaq -1(%rsi, %rdx), %rcx |
581 | /* Inclusively align src to MOVSB_ALIGN_TO - 1. */ |
582 | orq $(MOVSB_ALIGN_TO - 1), %rsi |
/* Restore dst and len adjusted with new values for aligned src.  */
585 | leaq 1(%rsi, %r9), %rdi |
586 | subq %rsi, %rcx |
587 | /* Finish aligning src. */ |
588 | incq %rsi |
589 | |
590 | rep movsb |
591 | |
592 | VMOVU %VMM(0), (%r8) |
593 | # if MOVSB_ALIGN_TO > VEC_SIZE |
594 | VMOVU %VMM(1), VEC_SIZE(%r8) |
595 | # endif |
596 | VZEROUPPER_RETURN |
597 | # endif |
598 | |
599 | .p2align 4,, 12 |
600 | L(movsb): |
601 | movq %rdi, %rcx |
602 | subq %rsi, %rcx |
/* Go to the backward temporal copy whenever there is overlap, no
   matter what, as backward REP MOVSB is slow and we don't want to use
   NT stores if there is overlap.  */
606 | cmpq %rdx, %rcx |
607 | /* L(more_8x_vec_backward_check_nop) checks for src == dst. */ |
608 | jb L(more_8x_vec_backward_check_nop) |
609 | # if ALIGN_MOVSB |
610 | /* Save dest for storing aligning VECs later. */ |
611 | movq %rdi, %r8 |
612 | # endif |
/* If above __x86_rep_movsb_stop_threshold this is most likely a
   candidate for NT moves as well.  */
615 | cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP |
616 | jae L(large_memcpy_2x_check) |
617 | # if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB |
618 | /* Only avoid short movsb if CPU has FSRM. */ |
619 | # if X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB < 256 |
620 | testb $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) |
621 | # else |
622 | testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip) |
623 | # endif |
624 | jz L(skip_short_movsb_check) |
625 | # if AVOID_SHORT_DISTANCE_REP_MOVSB |
626 | /* Avoid "rep movsb" if RCX, the distance between source and |
627 | destination, is N*4GB + [1..63] with N >= 0. */ |
628 | |
/* ecx contains dst - src.  The early check for the backward-copy
   conditions means the only remaining slow-movsb case, src = dst +
   [0, 63], has ecx in [-63, 0].  Use an unsigned comparison with -64
   to check for that case.  */
633 | cmpl $-64, %ecx |
634 | ja L(more_8x_vec_forward) |
635 | # endif |
636 | # endif |
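
/* The check above in C (sketch): with ecx holding the low 32 bits of
   dst - src, the values 0xFFFFFFC1..0xFFFFFFFF (src ahead of dst by
   1..63 modulo 4GB) are routed to the vector forward copy instead of
   rep movsb.  The helper name is made up.

     #include <stdint.h>

     static int
     avoid_short_distance_movsb (uint32_t dst_minus_src_lo)
     {
       return dst_minus_src_lo > 0xFFFFFFC0u;   // cmpl $-64, %ecx; ja
     }
*/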
637 | # if ALIGN_MOVSB |
638 | # if MOVSB_ALIGN_TO > VEC_SIZE |
639 | VMOVU VEC_SIZE(%rsi), %VMM(1) |
640 | # endif |
641 | # if MOVSB_ALIGN_TO > (VEC_SIZE * 2) |
642 | # error Unsupported MOVSB_ALIGN_TO |
643 | # endif |
/* Falling through means the CPU has FSRM.  In that case exclusively
   align the destination.  */
646 | L(movsb_align_dst): |
647 | /* Subtract dst from src. Add back after dst aligned. */ |
648 | subq %rdi, %rsi |
649 | /* Exclusively align dst to MOVSB_ALIGN_TO (64). */ |
650 | addq $(MOVSB_ALIGN_TO - 1), %rdi |
651 | /* Add dst to len. Subtract back after dst aligned. */ |
652 | leaq (%r8, %rdx), %rcx |
653 | /* Finish aligning dst. */ |
654 | andq $-(MOVSB_ALIGN_TO), %rdi |
655 | /* Restore src and len adjusted with new values for aligned dst. |
656 | */ |
657 | addq %rdi, %rsi |
658 | subq %rdi, %rcx |
659 | |
660 | rep movsb |
661 | |
662 | /* Store VECs loaded for aligning. */ |
663 | VMOVU %VMM(0), (%r8) |
664 | # if MOVSB_ALIGN_TO > VEC_SIZE |
665 | VMOVU %VMM(1), VEC_SIZE(%r8) |
666 | # endif |
667 | VZEROUPPER_RETURN |
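
/* Bookkeeping of the ALIGN_MOVSB path above as a C sketch (64-byte
   MOVSB_ALIGN_TO case; memmove stands in for rep movsb so the sketch
   stays well defined if the buffers overlap; the function name is made
   up):

     #include <stdint.h>
     #include <string.h>

     static void
     movsb_align_dst_sketch (char *dst, const char *src, size_t n)
     {
       unsigned char head[64];
       memcpy (head, src, 64);                    // VMM(0)/VMM(1)
       char *adst = (char *) (((uintptr_t) dst + 63) & ~(uintptr_t) 63);
       const char *asrc = src + (adst - dst);
       size_t len = n - (size_t) (adst - dst);
       memmove (adst, asrc, len);                 // rep movsb
       memcpy (dst, head, 64);                    // stores from saved %r8
     }
*/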
668 | # else /* !ALIGN_MOVSB. */ |
669 | L(skip_short_movsb_check): |
670 | mov %RDX_LP, %RCX_LP |
671 | rep movsb |
672 | ret |
673 | # endif |
674 | #endif |
675 | |
676 | .p2align 4,, 10 |
677 | #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) |
678 | L(large_memcpy_2x_check): |
/* Entry from L(large_memcpy_2x) has a redundant load of
   __x86_shared_non_temporal_threshold(%rip).  L(large_memcpy_2x) is
   only used for the non-ERMS memmove, which is generally less
   common.  */
683 | L(large_memcpy_2x): |
684 | mov __x86_shared_non_temporal_threshold(%rip), %R11_LP |
685 | cmp %R11_LP, %RDX_LP |
686 | jb L(more_8x_vec_check) |
/* To reach this point it is impossible to have both dst > src and
   overlap.  What remains to check is src > dst with overlap.  rcx
   already contains dst - src.  Negate rcx to get src - dst.  If
   length > rcx then there is overlap and a forward copy is
   required.  */
691 | negq %rcx |
692 | cmpq %rcx, %rdx |
693 | ja L(more_8x_vec_forward) |
694 | |
695 | /* Cache align destination. First store the first 64 bytes then |
696 | adjust alignments. */ |
697 | |
698 | /* First vec was also loaded into VEC(0). */ |
699 | # if VEC_SIZE < 64 |
700 | VMOVU VEC_SIZE(%rsi), %VMM(1) |
701 | # if VEC_SIZE < 32 |
702 | VMOVU (VEC_SIZE * 2)(%rsi), %VMM(2) |
703 | VMOVU (VEC_SIZE * 3)(%rsi), %VMM(3) |
704 | # endif |
705 | # endif |
706 | VMOVU %VMM(0), (%rdi) |
707 | # if VEC_SIZE < 64 |
708 | VMOVU %VMM(1), VEC_SIZE(%rdi) |
709 | # if VEC_SIZE < 32 |
710 | VMOVU %VMM(2), (VEC_SIZE * 2)(%rdi) |
711 | VMOVU %VMM(3), (VEC_SIZE * 3)(%rdi) |
712 | # endif |
713 | # endif |
714 | |
715 | /* Adjust source, destination, and size. */ |
716 | movq %rdi, %r8 |
717 | andq $63, %r8 |
718 | /* Get the negative of offset for alignment. */ |
719 | subq $64, %r8 |
720 | /* Adjust source. */ |
721 | subq %r8, %rsi |
722 | /* Adjust destination which should be aligned now. */ |
723 | subq %r8, %rdi |
724 | /* Adjust length. */ |
725 | addq %r8, %rdx |
726 | |
/* Test if source and destination addresses will alias.  If they do,
   the larger pipeline in large_memcpy_4x alleviates the performance
   drop.  */
730 | |
/* ecx contains -(dst - src).  NOT of ecx gives dst - src - 1, which
   works for testing aliasing.  */
733 | notl %ecx |
734 | movq %rdx, %r10 |
735 | testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx |
736 | jz L(large_memcpy_4x) |
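
/* The aliasing test above in C (sketch; the helper name is made up):

     #include <stdint.h>

     static int
     pages_alias (uintptr_t dst, uintptr_t src, uint32_t vec_size)
     {
       uint32_t diff = (uint32_t) (dst - src);
       // notl of -(dst - src) gives dst - src - 1; zero masked bits mean
       // the page offsets of dst and src differ by (roughly) less than
       // 8 * vec_size.
       return ((diff - 1) & (4096u - 8 * vec_size)) == 0;
     }
*/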
737 | |
738 | /* r11 has __x86_shared_non_temporal_threshold. Shift it left |
739 | by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold. |
740 | */ |
741 | shlq $LOG_4X_MEMCPY_THRESH, %r11 |
742 | cmp %r11, %rdx |
743 | jae L(large_memcpy_4x) |
744 | |
745 | /* edx will store remainder size for copying tail. */ |
746 | andl $(PAGE_SIZE * 2 - 1), %edx |
747 | /* r10 stores outer loop counter. */ |
748 | shrq $(LOG_PAGE_SIZE + 1), %r10 |
749 | /* Copy 4x VEC at a time from 2 pages. */ |
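
/* Shape of the 2-page loop below as a C sketch (reusing the
   copy_one_set_nt intrinsics sketch from above; outer_iters is the
   value computed into %r10, prefetching elided):

     #include <immintrin.h>

     static void
     copy_2x_pages (char *dst, const char *src, size_t outer_iters)
     {
       enum { PAGE = 4096, SET = 128 };  // LARGE_LOAD_SIZE for VEC_SIZE == 32
       for (size_t i = 0; i < outer_iters; i++)
         {
           // Walk one page while streaming the page after it in parallel.
           for (size_t off = 0; off < PAGE; off += SET)
             {
               copy_one_set_nt (dst + off, src + off);
               copy_one_set_nt (dst + off + PAGE, src + off + PAGE);
             }
           dst += 2 * PAGE;
           src += 2 * PAGE;
         }
       _mm_sfence ();   // order the non-temporal stores (sfence)
     }
*/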
750 | .p2align 4 |
751 | L(loop_large_memcpy_2x_outer): |
752 | /* ecx stores inner loop counter. */ |
753 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
754 | L(loop_large_memcpy_2x_inner): |
755 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
756 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) |
757 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
758 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) |
759 | /* Load vectors from rsi. */ |
760 | LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3)) |
761 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7)) |
762 | subq $-LARGE_LOAD_SIZE, %rsi |
763 | /* Non-temporal store vectors to rdi. */ |
764 | STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3)) |
765 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7)) |
766 | subq $-LARGE_LOAD_SIZE, %rdi |
767 | decl %ecx |
768 | jnz L(loop_large_memcpy_2x_inner) |
769 | addq $PAGE_SIZE, %rdi |
770 | addq $PAGE_SIZE, %rsi |
771 | decq %r10 |
772 | jne L(loop_large_memcpy_2x_outer) |
773 | sfence |
774 | |
775 | /* Check if only last 4 loads are needed. */ |
776 | cmpl $(VEC_SIZE * 4), %edx |
777 | jbe L(large_memcpy_2x_end) |
778 | |
779 | /* Handle the last 2 * PAGE_SIZE bytes. */ |
780 | L(loop_large_memcpy_2x_tail): |
/* Copy 4 * VEC at a time forward with regular (aligned) stores.  */
782 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
783 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
784 | VMOVU (%rsi), %VMM(0) |
785 | VMOVU VEC_SIZE(%rsi), %VMM(1) |
786 | VMOVU (VEC_SIZE * 2)(%rsi), %VMM(2) |
787 | VMOVU (VEC_SIZE * 3)(%rsi), %VMM(3) |
788 | subq $-(VEC_SIZE * 4), %rsi |
789 | addl $-(VEC_SIZE * 4), %edx |
790 | VMOVA %VMM(0), (%rdi) |
791 | VMOVA %VMM(1), VEC_SIZE(%rdi) |
792 | VMOVA %VMM(2), (VEC_SIZE * 2)(%rdi) |
793 | VMOVA %VMM(3), (VEC_SIZE * 3)(%rdi) |
794 | subq $-(VEC_SIZE * 4), %rdi |
795 | cmpl $(VEC_SIZE * 4), %edx |
796 | ja L(loop_large_memcpy_2x_tail) |
797 | |
798 | L(large_memcpy_2x_end): |
799 | /* Store the last 4 * VEC. */ |
800 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0) |
801 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1) |
802 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2) |
803 | VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(3) |
804 | |
805 | VMOVU %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
806 | VMOVU %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
807 | VMOVU %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
808 | VMOVU %VMM(3), -VEC_SIZE(%rdi, %rdx) |
809 | VZEROUPPER_RETURN |
810 | |
811 | .p2align 4 |
812 | L(large_memcpy_4x): |
813 | /* edx will store remainder size for copying tail. */ |
814 | andl $(PAGE_SIZE * 4 - 1), %edx |
815 | /* r10 stores outer loop counter. */ |
816 | shrq $(LOG_PAGE_SIZE + 2), %r10 |
817 | /* Copy 4x VEC at a time from 4 pages. */ |
818 | .p2align 4 |
819 | L(loop_large_memcpy_4x_outer): |
820 | /* ecx stores inner loop counter. */ |
821 | movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx |
822 | L(loop_large_memcpy_4x_inner): |
/* Only one prefetch set per page, as doing 4 pages gives the
   prefetcher more time to keep up.  */
825 | PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) |
826 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) |
827 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) |
828 | PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) |
829 | /* Load vectors from rsi. */ |
830 | LOAD_ONE_SET((%rsi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3)) |
831 | LOAD_ONE_SET((%rsi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7)) |
832 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11)) |
833 | LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15)) |
834 | subq $-LARGE_LOAD_SIZE, %rsi |
835 | /* Non-temporal store vectors to rdi. */ |
836 | STORE_ONE_SET((%rdi), 0, %VMM(0), %VMM(1), %VMM(2), %VMM(3)) |
837 | STORE_ONE_SET((%rdi), PAGE_SIZE, %VMM(4), %VMM(5), %VMM(6), %VMM(7)) |
838 | STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VMM(8), %VMM(9), %VMM(10), %VMM(11)) |
839 | STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VMM(12), %VMM(13), %VMM(14), %VMM(15)) |
840 | subq $-LARGE_LOAD_SIZE, %rdi |
841 | decl %ecx |
842 | jnz L(loop_large_memcpy_4x_inner) |
843 | addq $(PAGE_SIZE * 3), %rdi |
844 | addq $(PAGE_SIZE * 3), %rsi |
845 | decq %r10 |
846 | jne L(loop_large_memcpy_4x_outer) |
847 | sfence |
848 | /* Check if only last 4 loads are needed. */ |
849 | cmpl $(VEC_SIZE * 4), %edx |
850 | jbe L(large_memcpy_4x_end) |
851 | |
852 | /* Handle the last 4 * PAGE_SIZE bytes. */ |
853 | L(loop_large_memcpy_4x_tail): |
/* Copy 4 * VEC at a time forward with regular (aligned) stores.  */
855 | PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) |
856 | PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) |
857 | VMOVU (%rsi), %VMM(0) |
858 | VMOVU VEC_SIZE(%rsi), %VMM(1) |
859 | VMOVU (VEC_SIZE * 2)(%rsi), %VMM(2) |
860 | VMOVU (VEC_SIZE * 3)(%rsi), %VMM(3) |
861 | subq $-(VEC_SIZE * 4), %rsi |
862 | addl $-(VEC_SIZE * 4), %edx |
863 | VMOVA %VMM(0), (%rdi) |
864 | VMOVA %VMM(1), VEC_SIZE(%rdi) |
865 | VMOVA %VMM(2), (VEC_SIZE * 2)(%rdi) |
866 | VMOVA %VMM(3), (VEC_SIZE * 3)(%rdi) |
867 | subq $-(VEC_SIZE * 4), %rdi |
868 | cmpl $(VEC_SIZE * 4), %edx |
869 | ja L(loop_large_memcpy_4x_tail) |
870 | |
871 | L(large_memcpy_4x_end): |
872 | /* Store the last 4 * VEC. */ |
873 | VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(0) |
874 | VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(1) |
875 | VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2) |
876 | VMOVU -VEC_SIZE(%rsi, %rdx), %VMM(3) |
877 | |
878 | VMOVU %VMM(0), -(VEC_SIZE * 4)(%rdi, %rdx) |
879 | VMOVU %VMM(1), -(VEC_SIZE * 3)(%rdi, %rdx) |
880 | VMOVU %VMM(2), -(VEC_SIZE * 2)(%rdi, %rdx) |
881 | VMOVU %VMM(3), -VEC_SIZE(%rdi, %rdx) |
882 | VZEROUPPER_RETURN |
883 | #endif |
884 | END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) |
885 | |
886 | #if IS_IN (libc) |
887 | # ifdef USE_MULTIARCH |
888 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), |
889 | MEMMOVE_SYMBOL (__memcpy, unaligned_erms)) |
890 | # ifdef SHARED |
891 | strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), |
892 | MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms)) |
893 | # endif |
894 | # endif |
895 | # ifdef SHARED |
896 | strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), |
897 | MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned)) |
898 | # endif |
899 | #endif |
900 | strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), |
901 | MEMCPY_SYMBOL (__memcpy, unaligned)) |
902 | |