/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. On machines with the ERMS feature, if size is greater than or
      equal to __x86_rep_movsb_threshold and less than
      __x86_rep_movsb_stop_threshold, then REP MOVSB will be used.
   7. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores, copying from either 2 or 4 pages at
      once.
   8. For point 7), if size < 16 * __x86_shared_non_temporal_threshold
      and source and destination do not page alias, copy from 2 pages
      at once using non-temporal stores.  Page aliasing in this case is
      considered true if destination's page alignment - source's page
      alignment is less than 8 * VEC_SIZE.
   9. If size >= 16 * __x86_shared_non_temporal_threshold, or source
      and destination do page alias, copy from 4 pages at once using
      non-temporal stores.  */
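
/* Rough, illustrative sketch of the dispatch above in C-like pseudocode.
   It is not part of the implementation and simplifies the order of the
   checks; the threshold names are the runtime tunables referenced in this
   file, while the helper functions are purely hypothetical:

     void *memmove_sketch (void *dst, const void *src, size_t n)
     {
       if (n <= 2 * VEC_SIZE)
         return copy_two_overlapping_vecs (dst, src, n);   // points 1-2
       if (n <= 8 * VEC_SIZE)
         return copy_all_in_registers (dst, src, n);       // point 3
       if (n >= __x86_rep_movsb_threshold
           && n < __x86_rep_movsb_stop_threshold
           && !overlaps (dst, src, n))
         return rep_movsb_copy (dst, src, n);              // point 6
       if (n >= __x86_shared_non_temporal_threshold
           && !overlaps (dst, src, n))
         return non_temporal_copy (dst, src, n);           // points 7-9
       return vec_loop_copy (dst, src, n);                 // points 4-5
     }
   */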

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef XMM0
# define XMM0 xmm0
#endif

#ifndef YMM0
# define YMM0 ymm0
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
# else
# define VZEROUPPER
# endif
#endif

/* Whether to align before movsb.  Ultimately we want 64-byte alignment,
   and it is not worth loading 4x VEC for VEC_SIZE == 16.  */
#define ALIGN_MOVSB (VEC_SIZE > 16)
/* Number of bytes to align movsb to.  */
#define MOVSB_ALIGN_TO 64

#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
#define LARGE_MOV_SIZE (MOV_SIZE > 4)

#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
# error MOV_SIZE Unknown
#endif

#if LARGE_MOV_SIZE
# define SMALL_SIZE_OFFSET (4)
#else
# define SMALL_SIZE_OFFSET (0)
#endif

#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif

#if PAGE_SIZE != 4096
# error Unsupported PAGE_SIZE
#endif

#ifndef LOG_PAGE_SIZE
# define LOG_PAGE_SIZE 12
#endif

#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
# error Invalid LOG_PAGE_SIZE
#endif

/* Bytes per page for the large_memcpy inner loop.  */
#if VEC_SIZE == 64
# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
#else
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

/* Amount to shift __x86_shared_non_temporal_threshold by for the
   bound for memcpy_large_4x.  This is essentially used to
   indicate that the copy is far beyond the scope of L3
   (assuming no user config of x86_non_temporal_threshold) and to
   use a more aggressively unrolled loop.  NB: before
   increasing the value also update the initialization of
   x86_non_temporal_threshold.  */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif
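
/* With the default LOG_4X_MEMCPY_THRESH of 4, the 4x-page copy below is
   used from 16 * __x86_shared_non_temporal_threshold onwards, matching
   point 9 of the header comment.  */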

/* Avoid short distance rep movsb only with non-SSE vector.  */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
#else
# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
#endif

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
# define PREFETCH_ONE_SET(dir, base, offset) \
PREFETCH ((offset)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
# error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
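
/* For example, with VEC_SIZE == 32 (PREFETCHED_LOAD_SIZE == 128) each
   PREFETCH_ONE_SET expands to two prefetcht0 instructions covering the
   128 bytes at (offset)base; dir gives the direction to step in and is
   always 1 (forward) at the call sites in this file.  */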

#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
VMOVU (offset)base, vec0; \
VMOVU ((offset) + VEC_SIZE)base, vec1;
# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
VMOVNT vec0, (offset)base; \
VMOVNT vec1, ((offset) + VEC_SIZE)base;
#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
VMOVU (offset)base, vec0; \
VMOVU ((offset) + VEC_SIZE)base, vec1; \
VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
VMOVNT vec0, (offset)base; \
VMOVNT vec1, ((offset) + VEC_SIZE)base; \
VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
#else
# error Invalid LARGE_LOAD_SIZE
#endif
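
/* LOAD_ONE_SET/STORE_ONE_SET move one LARGE_LOAD_SIZE chunk using
   unaligned vector loads (VMOVU) paired with non-temporal vector stores
   (VMOVNT).  Because the stores are non-temporal, the loops that use
   them below are followed by an sfence before returning.  */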

#ifndef SECTION
# error SECTION is not defined!
#endif

.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
mov %RDI_LP, %RAX_LP
add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
movq %rdi, %rax
L(start):
# ifdef __ILP32__
/* Clear the upper 32 bits.  */
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
/* Load regardless.  */
VMOVU (%rsi), %VEC(0)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
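/* The two stores below overlap when size < 2 * VEC_SIZE: e.g. with
   VEC_SIZE == 32 and size == 33, VEC(0) covers dst[0..31] and VEC(1)
   covers dst[1..32].  When size == 2 * VEC_SIZE they are exactly
   adjacent, so the whole range is written without a branch either way.  */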

VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
#if !(defined USE_MULTIARCH && IS_IN (libc))
ZERO_UPPER_VEC_REGISTERS_RETURN
#else
VZEROUPPER_RETURN
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
mov %RDI_LP, %RAX_LP
add %RDX_LP, %RAX_LP
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
movq %rdi, %rax
L(start_erms):
# ifdef __ILP32__
/* Clear the upper 32 bits.  */
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
/* Load regardless.  */
VMOVU (%rsi), %VEC(0)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
L(return):
# if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
# else
ret
# endif
#endif

#if LARGE_MOV_SIZE
/* If LARGE_MOV_SIZE this fits in the aligning bytes between the
   ENTRY block and L(less_vec).  */
.p2align 4,, 8
L(between_4_7):
/* From 4 to 7.  No branch when size == 4.  */
movl (%rsi), %ecx
movl (%rsi, %rdx), %esi
movl %ecx, (%rdi)
movl %esi, (%rdi, %rdx)
ret
#endif

.p2align 4
L(less_vec):
/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
cmpl $32, %edx
jae L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpl $16, %edx
jae L(between_16_31)
#endif
cmpl $8, %edx
jae L(between_8_15)
#if SMALL_MOV_SIZE
cmpl $4, %edx
#else
subq $4, %rdx
#endif
jae L(between_4_7)
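/* At most 3 bytes remain here (for LARGE_MOV_SIZE the size in rdx was
   biased by -4 above, which SMALL_SIZE_OFFSET compensates for).  Size 0
   just returns, size 1 stores a single byte, and sizes 2-3 are done as
   one byte plus one possibly overlapping word copy at the end.  */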

cmpl $(1 - SMALL_SIZE_OFFSET), %edx
jl L(copy_0)
movb (%rsi), %cl
je L(copy_1)
movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
L(copy_1):
movb %cl, (%rdi)
L(copy_0):
ret

#if SMALL_MOV_SIZE
.p2align 4,, 8
L(between_4_7):
/* From 4 to 7.  No branch when size == 4.  */
movl -4(%rsi, %rdx), %ecx
movl (%rsi), %esi
movl %ecx, -4(%rdi, %rdx)
movl %esi, (%rdi)
ret
#endif

#if VEC_SIZE > 16
/* From 16 to 31.  No branch when size == 16.  */
.p2align 4,, 8
L(between_16_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi, %rdx), %xmm1
vmovdqu %xmm0, (%rdi)
vmovdqu %xmm1, -16(%rdi, %rdx)
/* No ymm registers have been touched.  */
ret
#endif

#if VEC_SIZE > 32
.p2align 4,, 10
L(between_32_63):
/* From 32 to 63.  No branch when size == 32.  */
VMOVU (%rsi), %YMM0
VMOVU -32(%rsi, %rdx), %YMM1
VMOVU %YMM0, (%rdi)
VMOVU %YMM1, -32(%rdi, %rdx)
VZEROUPPER_RETURN
#endif

.p2align 4,, 10
L(between_8_15):
/* From 8 to 15.  No branch when size == 8.  */
movq -8(%rsi, %rdx), %rcx
movq (%rsi), %rsi
movq %rsi, (%rdi)
movq %rcx, -8(%rdi, %rdx)
ret

.p2align 4,, 10
L(last_4x_vec):
/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively.  */

/* VEC(0) and VEC(1) have already been loaded.  */
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
VZEROUPPER_RETURN

.p2align 4
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
ja L(movsb)
#endif
L(more_2x_vec):
/* More than 2 * VEC and there may be overlap between
   destination and source.  */
cmpq $(VEC_SIZE * 8), %rdx
ja L(more_8x_vec)
/* Load VEC(1) regardless.  VEC(0) has already been loaded.  */
VMOVU VEC_SIZE(%rsi), %VEC(1)
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_4x_vec)
/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively.  */
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
VZEROUPPER_RETURN

.p2align 4,, 4
L(more_8x_vec):
movq %rdi, %rcx
subq %rsi, %rcx
/* Always take the backward temporal copy if there is overlap, as
   backward REP MOVSB is slow and we don't want to use NT stores if
   there is overlap.  */
cmpq %rdx, %rcx
/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
jb L(more_8x_vec_backward_check_nop)
/* Check if non-temporal move candidate.  */
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold.  */
cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_memcpy_2x)
#endif
/* To reach this point there cannot be both overlap and dst > src.
   So check for overlap with src > dst, in which case correctness
   requires a forward copy.  Otherwise decide between backward/forward
   copy depending on address aliasing.  */

/* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
   but less than __x86_shared_non_temporal_threshold.  */
L(more_8x_vec_check):
/* rcx contains dst - src.  Add back length (rdx).  */
leaq (%rcx, %rdx), %r8
/* If r8 has a different sign than rcx then there is overlap, so we
   must do a forward copy.  */
xorq %rcx, %r8
/* Isolate just the sign bit of r8.  */
shrq $63, %r8
/* Get 4k difference dst - src.  */
andl $(PAGE_SIZE - 256), %ecx
/* If r8 is non-zero we must copy forward for correctness.  Otherwise,
   if ecx is non-zero there is 4k false aliasing, so do a backward
   copy.  */
addl %r8d, %ecx
jz L(more_8x_vec_backward)

/* Reached if rdx is greater than __x86_shared_non_temporal_threshold
   but there is overlap, or from the short distance movsb check.  */
L(more_8x_vec_forward):
/* Load first and last 4 * VEC to support overlapping addresses.  */

/* First vec was already loaded into VEC(0).  */
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
/* Save beginning of dst.  */
movq %rdi, %rcx
/* Align dst to VEC_SIZE - 1.  */
orq $(VEC_SIZE - 1), %rdi
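/* The orq above plus the incq below round dst up to the next multiple
   of VEC_SIZE (even when it was already aligned).  The head bytes that
   the aligned loop skips are covered by the unaligned store of VEC(0)
   to the saved start (rcx) after the loop.  */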

VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)

/* Subtract dst from src.  Add back after dst aligned.  */
subq %rcx, %rsi
/* Finish aligning dst.  */
incq %rdi
/* Restore src adjusted with new value for aligned dst.  */
addq %rdi, %rsi
/* Store end of buffer minus tail in rdx.  */
leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx

/* Don't use multi-byte nop to align.  */
.p2align 4,, 11
L(loop_4x_vec_forward):
/* Copy 4 * VEC at a time forward.  */
VMOVU (%rsi), %VEC(1)
VMOVU VEC_SIZE(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
subq $-(VEC_SIZE * 4), %rsi
VMOVA %VEC(1), (%rdi)
VMOVA %VEC(2), VEC_SIZE(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpq %rdi, %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC.  */
VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
VMOVU %VEC(7), VEC_SIZE(%rdx)
VMOVU %VEC(8), (%rdx)
/* Store the first VEC.  */
VMOVU %VEC(0), (%rcx)
/* Keep L(nop_backward) target close to jmp for 2-byte encoding.  */
L(nop_backward):
VZEROUPPER_RETURN

.p2align 4,, 8
L(more_8x_vec_backward_check_nop):
/* rcx contains dst - src.  Test for dst == src to skip all of
   memmove.  */
testq %rcx, %rcx
jz L(nop_backward)
L(more_8x_vec_backward):
/* Load the first 4 * VEC and last VEC to support overlapping
   addresses.  */

/* First vec was also loaded into VEC(0).  */
VMOVU VEC_SIZE(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
/* Beginning of region for 4x backward copy stored in rcx.  */
leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
/* Subtract dst from src.  Add back after dst aligned.  */
subq %rdi, %rsi
/* Align dst.  */
andq $-(VEC_SIZE), %rcx
/* Restore src.  */
addq %rcx, %rsi

/* Don't use multi-byte nop to align.  */
.p2align 4,, 11
L(loop_4x_vec_backward):
/* Copy 4 * VEC at a time backward.  */
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)
VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)
addq $(VEC_SIZE * -4), %rsi
VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)
VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)
addq $(VEC_SIZE * -4), %rcx
cmpq %rcx, %rdi
jb L(loop_4x_vec_backward)
/* Store the first 4 * VEC.  */
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(5), VEC_SIZE(%rdi)
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
/* Store the last VEC.  */
VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)
VZEROUPPER_RETURN

#if defined USE_MULTIARCH && IS_IN (libc)
/* L(skip_short_movsb_check) is only used with ERMS.  Not for
   FSRM.  */
.p2align 5,, 16
# if ALIGN_MOVSB
L(skip_short_movsb_check):
# if MOVSB_ALIGN_TO > VEC_SIZE
VMOVU VEC_SIZE(%rsi), %VEC(1)
# endif
# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
# error Unsupported MOVSB_ALIGN_TO
# endif
/* If the CPU does not have FSRM there are two options for aligning:
   align src if dst and src 4k alias, otherwise align dst.  */
testl $(PAGE_SIZE - 512), %ecx
jnz L(movsb_align_dst)
/* Fall through.  dst and src 4k alias.  It's better to align src
   here because the bottleneck will be loads due to the false
   dependency on dst.  */

/* rcx already has dst - src.  */
movq %rcx, %r9
/* Add src to len.  Subtract back after src aligned.  -1 because
   src is initially aligned to MOVSB_ALIGN_TO - 1.  */
leaq -1(%rsi, %rdx), %rcx
/* Inclusively align src to MOVSB_ALIGN_TO - 1.  */
orq $(MOVSB_ALIGN_TO - 1), %rsi
/* Restore dst and len adjusted with new values for aligned src.  */
leaq 1(%rsi, %r9), %rdi
subq %rsi, %rcx
/* Finish aligning src.  */
incq %rsi

rep movsb

VMOVU %VEC(0), (%r8)
# if MOVSB_ALIGN_TO > VEC_SIZE
VMOVU %VEC(1), VEC_SIZE(%r8)
# endif
VZEROUPPER_RETURN
# endif

.p2align 4,, 12
L(movsb):
movq %rdi, %rcx
subq %rsi, %rcx
/* Always take the backward temporal copy if there is overlap, as
   backward REP MOVSB is slow and we don't want to use NT stores if
   there is overlap.  */
cmpq %rdx, %rcx
/* L(more_8x_vec_backward_check_nop) checks for src == dst.  */
jb L(more_8x_vec_backward_check_nop)
# if ALIGN_MOVSB
/* Save dest for storing aligning VECs later.  */
movq %rdi, %r8
# endif
/* If above __x86_rep_movsb_stop_threshold it is most likely a
   candidate for NT moves as well.  */
cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
jae L(large_memcpy_2x_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
/* Only avoid short movsb if CPU has FSRM.  */
testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
jz L(skip_short_movsb_check)
# if AVOID_SHORT_DISTANCE_REP_MOVSB
/* Avoid "rep movsb" if RCX, the distance between source and
   destination, is N*4GB + [1..63] with N >= 0.  */

/* ecx contains dst - src.  The earlier check for the backward copy
   conditions means the only remaining slow-movsb case, src = dst +
   [0, 63], has ecx in [-63, 0].  Use an unsigned comparison with -64
   to check for that case.  */
cmpl $-64, %ecx
ja L(more_8x_vec_forward)
# endif
# endif
# if ALIGN_MOVSB
# if MOVSB_ALIGN_TO > VEC_SIZE
VMOVU VEC_SIZE(%rsi), %VEC(1)
# endif
# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
# error Unsupported MOVSB_ALIGN_TO
# endif
/* Falling through means the CPU has FSRM.  In that case exclusively
   align the destination.  */
L(movsb_align_dst):
/* Subtract dst from src.  Add back after dst aligned.  */
subq %rdi, %rsi
/* Exclusively align dst to MOVSB_ALIGN_TO (64).  */
addq $(MOVSB_ALIGN_TO - 1), %rdi
/* Add dst to len.  Subtract back after dst aligned.  */
leaq (%r8, %rdx), %rcx
/* Finish aligning dst.  */
andq $-(MOVSB_ALIGN_TO), %rdi
/* Restore src and len adjusted with new values for aligned dst.  */
addq %rdi, %rsi
subq %rdi, %rcx

rep movsb

/* Store VECs loaded for aligning.  */
VMOVU %VEC(0), (%r8)
# if MOVSB_ALIGN_TO > VEC_SIZE
VMOVU %VEC(1), VEC_SIZE(%r8)
# endif
VZEROUPPER_RETURN
# else /* !ALIGN_MOVSB.  */
L(skip_short_movsb_check):
mov %RDX_LP, %RCX_LP
rep movsb
ret
# endif
#endif

.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
/* Entry from L(large_memcpy_2x) has a redundant load of
   __x86_shared_non_temporal_threshold(%rip).  L(large_memcpy_2x)
   is only used for the non-erms memmove, which is generally less
   common.  */
L(large_memcpy_2x):
mov __x86_shared_non_temporal_threshold(%rip), %R11_LP
cmp %R11_LP, %RDX_LP
jb L(more_8x_vec_check)
/* To reach this point it is impossible to have both dst > src and
   overlap.  What remains to check is src > dst with overlap.  rcx
   already contains dst - src; negate it to get src - dst.  If
   length > rcx then there is overlap and a forward copy is best.  */
negq %rcx
cmpq %rcx, %rdx
ja L(more_8x_vec_forward)

/* Cache-align the destination.  First store the first 64 bytes, then
   adjust alignments.  */

/* First vec was also loaded into VEC(0).  */
# if VEC_SIZE < 64
VMOVU VEC_SIZE(%rsi), %VEC(1)
# if VEC_SIZE < 32
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
# endif
# endif
VMOVU %VEC(0), (%rdi)
# if VEC_SIZE < 64
VMOVU %VEC(1), VEC_SIZE(%rdi)
# if VEC_SIZE < 32
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
# endif
# endif

/* Adjust source, destination, and size.  */
movq %rdi, %r8
andq $63, %r8
/* Get the negative of the offset for alignment.  */
subq $64, %r8
/* Adjust source.  */
subq %r8, %rsi
/* Adjust destination which should be aligned now.  */
subq %r8, %rdi
/* Adjust length.  */
addq %r8, %rdx

/* Test if source and destination addresses will alias.  If they do,
   the larger pipeline in large_memcpy_4x alleviates the performance
   drop.  */

/* ecx contains -(dst - src).  NOT of ecx gives dst - src - 1, which
   works for testing aliasing.  */
notl %ecx
movq %rdx, %r10
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
jz L(large_memcpy_4x)
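/* The test above is the page-aliasing check from point 8 of the header
   comment: when dst and src land at (roughly) the same offset within a
   page, to within 8 * VEC_SIZE, the 4-page loop below is used instead
   of the 2-page loop.  */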

/* r11 has __x86_shared_non_temporal_threshold.  Shift it left
   by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold.  */
shlq $LOG_4X_MEMCPY_THRESH, %r11
cmp %r11, %rdx
jae L(large_memcpy_4x)

/* edx will store remainder size for copying tail.  */
andl $(PAGE_SIZE * 2 - 1), %edx
/* r10 stores outer loop counter.  */
shrq $(LOG_PAGE_SIZE + 1), %r10
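/* The andl/shrq above leave r10 = len / (2 * PAGE_SIZE), the number of
   2-page iterations, and edx = len mod (2 * PAGE_SIZE), which is handled
   by the tail code below.  */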

/* Copy 4x VEC at a time from 2 pages.  */
.p2align 4
L(loop_large_memcpy_2x_outer):
/* ecx stores inner loop counter.  */
movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_2x_inner):
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
/* Load vectors from rsi.  */
LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
subq $-LARGE_LOAD_SIZE, %rsi
/* Non-temporal store vectors to rdi.  */
STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
subq $-LARGE_LOAD_SIZE, %rdi
decl %ecx
jnz L(loop_large_memcpy_2x_inner)
addq $PAGE_SIZE, %rdi
addq $PAGE_SIZE, %rsi
decq %r10
jne L(loop_large_memcpy_2x_outer)
sfence
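/* The non-temporal stores above are weakly ordered, so the sfence is
   needed to make them globally visible before any subsequent ordinary
   stores and before returning to the caller.  */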

/* Check if only last 4 loads are needed.  */
cmpl $(VEC_SIZE * 4), %edx
jbe L(large_memcpy_2x_end)

/* Handle the last 2 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_2x_tail):
/* Copy 4 * VEC at a time forward with aligned stores.  */
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
subq $-(VEC_SIZE * 4), %rsi
addl $-(VEC_SIZE * 4), %edx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpl $(VEC_SIZE * 4), %edx
ja L(loop_large_memcpy_2x_tail)

L(large_memcpy_2x_end):
/* Store the last 4 * VEC.  */
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)

VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN

.p2align 4
L(large_memcpy_4x):
/* edx will store remainder size for copying tail.  */
andl $(PAGE_SIZE * 4 - 1), %edx
/* r10 stores outer loop counter.  */
shrq $(LOG_PAGE_SIZE + 2), %r10
/* Copy 4x VEC at a time from 4 pages.  */
.p2align 4
L(loop_large_memcpy_4x_outer):
/* ecx stores inner loop counter.  */
movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
L(loop_large_memcpy_4x_inner):
/* Only one prefetch set per page, as doing 4 pages gives the
   prefetcher more time to keep up.  */
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
/* Load vectors from rsi.  */
LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
subq $-LARGE_LOAD_SIZE, %rsi
/* Non-temporal store vectors to rdi.  */
STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
subq $-LARGE_LOAD_SIZE, %rdi
decl %ecx
jnz L(loop_large_memcpy_4x_inner)
addq $(PAGE_SIZE * 3), %rdi
addq $(PAGE_SIZE * 3), %rsi
decq %r10
jne L(loop_large_memcpy_4x_outer)
sfence
/* Check if only last 4 loads are needed.  */
cmpl $(VEC_SIZE * 4), %edx
jbe L(large_memcpy_4x_end)

/* Handle the last 4 * PAGE_SIZE bytes.  */
L(loop_large_memcpy_4x_tail):
/* Copy 4 * VEC at a time forward with aligned stores.  */
PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
subq $-(VEC_SIZE * 4), %rsi
addl $-(VEC_SIZE * 4), %edx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
subq $-(VEC_SIZE * 4), %rdi
cmpl $(VEC_SIZE * 4), %edx
ja L(loop_large_memcpy_4x_tail)

L(large_memcpy_4x_end):
/* Store the last 4 * VEC.  */
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)

VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms), MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
# ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms), MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
# endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned), MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned), MEMCPY_SYMBOL (__memcpy, unaligned))