/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
#  define CHAR_SIZE	4
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
#  define CHAR_SIZE	1
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
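
/* Algorithm overview (editor's note, not part of the build): a rough
   C sketch of the plain byte version, using standard AVX2 intrinsics.
   The dedicated page-cross entry point below exists because the
   first, unaligned 32-byte load would be unsafe if it could touch an
   unmapped page; aligned 32-byte loads can never cross a page.

	#include <immintrin.h>
	#include <stddef.h>
	#include <stdint.h>

	size_t
	strlen_sketch (const char *s)
	{
	  const __m256i zero = _mm256_setzero_si256 ();
	  // Unaligned check of the first 32 bytes (the assembly
	  // handles the page-cross case separately).
	  __m256i v = _mm256_loadu_si256 ((const __m256i *) s);
	  unsigned int m
	    = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, zero));
	  if (m != 0)
	    return __builtin_ctz (m);
	  // Aligned 32-byte steps; may re-check a few bytes.
	  const char *p
	    = (const char *) (((uintptr_t) s + 32) & ~(uintptr_t) 31);
	  for (;;)
	    {
	      v = _mm256_load_si256 ((const __m256i *) p);
	      m = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (v, zero));
	      if (m != 0)
		return (size_t) (p - s) + __builtin_ctz (m);
	      p += 32;
	    }
	}

   The code below additionally unrolls the main loop 4x and derives
   the strnlen/wcslen/wcsnlen variants from the macros above.  */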

	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check for zero length.  */
#  ifdef __ILP32__
	/* Clear upper bits.  */
	and	%RSI_LP, %RSI_LP
#  else
	test	%RSI_LP, %RSI_LP
#  endif
	jz	L(zero)
	/* Store the max length in R8_LP before it is adjusted for
	   WCSLEN.  */
	mov	%RSI_LP, %R8_LP
# endif
	movl	%edi, %eax
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0
	/* Keep only the page-offset bits of the pointer (copied to
	   eax); they are all that matters for the page-cross check.  */
	andl	$(PAGE_SIZE - 1), %eax
	/* Check whether a single vector load may cross a page
	   boundary.  */
	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
	ja	L(cross_page_boundary)
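	/* Falling through: the page offset is at most PAGE_SIZE
	   - VEC_SIZE, so the unaligned VEC_SIZE-byte load below cannot
	   touch the next page.  */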

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If the length is <= CHAR_PER_VEC, handle it specially.  */
	cmpq	$CHAR_PER_VEC, %rsi
	jbe	L(first_vec_x0)
# endif
	/* If no NUL was found, continue to aligned_more.  Otherwise
	   return the bit position of the first match.  */
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl	%eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set the bit at the max-length position so that tzcnt will
	   return the minimum of the max length and the position of the
	   first match.  */
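	/* E.g. for strnlen (s, 5) with no NUL among the first 5 bytes:
	   bits 0-4 of eax are clear, bts sets bit 5, and tzcnt returns
	   5.  btsq/%rax are needed because the length can be as large
	   as VEC_SIZE, i.e. bit 32 (vpmovmskb above zero-extended eax
	   into rax).  */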

#  ifdef USE_AS_WCSLEN
	/* NB: Multiply the length by 4 to get the byte count.  */
	sall	$2, %esi
#  endif
	btsq	%rsi, %rax
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	/* Safe to use 32-bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   return value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 4 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	incl	%edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN
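
	/* Return-value math above (non-STRNLEN case): rdi still points
	   to the byte before this vector (it was aligned to VEC_SIZE -
	   1), so the length is (rdi - start) + 1 + (match offset).
	   first_vec_x2/x3/x4 below repeat the pattern with the offset
	   term grown by VEC_SIZE each time.  */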

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	/* Safe to use 32-bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   return value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 3 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl	%eax, %eax
	/* Safe to use 32-bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   return value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE * 2 + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 2 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl	%eax, %eax
	/* Safe to use 32-bit instructions as these are only called for
	   size = [1, 159].  */
# ifdef USE_AS_STRNLEN
	/* Use ecx, which was computed earlier, to compute the correct
	   return value.  */
#  ifdef USE_AS_WCSLEN
	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
#  else
	subl	$(VEC_SIZE + 1), %ecx
	addl	%ecx, %eax
#  endif
# else
	subl	%edx, %edi
	addl	$(VEC_SIZE * 3 + 1), %edi
	addl	%edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrl	$2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1.  This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes
	   of code on the x4 check.  */
	orq	$(VEC_SIZE - 1), %rdi
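	/* rdi now points to the last byte of the VEC_SIZE-aligned
	   block containing the start, so 1(%rdi) below addresses the
	   next aligned vector.  */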

L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since the data is only aligned to VEC_SIZE.  */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1.  + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less.  */
	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
# endif
	/* Load the first VEC regardless.  */
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust the length.  If near the end, handle it specially.  */
	subq	%rcx, %rsi
	jb	L(last_4x_vec_or_less)
# endif
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1.  */
# ifdef USE_AS_STRNLEN
	/* Before adjusting the length, check whether we are within the
	   last VEC_SIZE * 4 characters.  */
	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
	jbe	L(last_4x_vec_or_less_load)
	incq	%rdi
	movl	%edi, %ecx
	orq	$(VEC_SIZE * 4 - 1), %rdi
	andl	$(VEC_SIZE * 4 - 1), %ecx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	sarl	$2, %ecx
#  endif
	/* Readjust the length.  */
	addq	%rcx, %rsi
# else
	incq	%rdi
	orq	$(VEC_SIZE * 4 - 1), %rdi
# endif
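	/* rsi was reduced using the old rdi; since rdi just moved to a
	   VEC_SIZE * 4 boundary over bytes that the four probes above
	   already checked, adding the offset in ecx back keeps the
	   remaining-length accounting consistent with the new rdi.  */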

	/* Compare 4 * VEC at a time forward.  */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at the end of the length.  */
	subq	$(CHAR_PER_VEC * 4), %rsi
	jb	L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap.  */
	vmovdqa	1(%rdi), %ymm1
	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4

	VPMINU	%ymm2, %ymm4, %ymm5
	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb	%ymm5, %ecx
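	/* A NUL is the minimum unsigned value, so it survives every
	   VPMINU: ecx is nonzero iff any of the four vectors contained
	   a NUL.  The sub of -(VEC_SIZE * 4) below, rather than an add
	   of VEC_SIZE * 4, lets the immediate fit in a sign-extended
	   byte, saving 3 bytes of code.  */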


	subq	$-(VEC_SIZE * 4), %rdi
	testl	%ecx, %ecx
	jz	L(loop_4x_vec)


	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	subq	%rdx, %rdi
	testl	%eax, %eax
	jnz	L(last_vec_return_x0)

	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb	%ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_return_x1)

	/* Combine the last 2 VEC.  */
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb	%ymm3, %eax
	/* rcx has the combined result from all 4 VEC.  It is only used
	   if none of the first 3 VEC contain a match.  */
	salq	$32, %rcx
	orq	%rcx, %rax
	tzcntq	%rax, %rax
	subq	$(VEC_SIZE * 2 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN
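
	/* The salq/orq above put ymm3's match mask in the low 32 bits
	   and the combined mask in the high 32 bits.  If VEC 3 had a
	   match, tzcntq finds it in the low half; otherwise the match
	   must be in VEC 4 and its bit sits at position 32 + n, which
	   the single subq of (VEC_SIZE * 2 - 1) accounts for.  */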


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on the entry point, adjust rdi / prepare the first
	   VEC in ymm1.  */
	subq	$-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ	1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply the length by 4 to get the byte count.  */
	sall	$2, %esi
#  endif
	vpmovmskb	%ymm1, %eax
	/* Branch if the remaining length > VEC_SIZE * 2.  The test
	   works even though esi may be off by VEC_SIZE * 4, since that
	   offset does not affect this bit.  */
	testl	$(VEC_SIZE * 2), %esi
	jnz	L(last_4x_vec)

	/* The length may be off by up to VEC_SIZE * 4 in either
	   direction depending on where this was called from.  Masking
	   fixes that.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)

	subl	$VEC_SIZE, %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of the data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 4 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl	%eax, %eax
	subq	$(VEC_SIZE * 3 - 1), %rdi
	addq	%rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of the data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max):
	movq	%r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test the first 2x VEC normally.  */
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	/* Normalize the length.  */
	andl	$(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	subl	$(VEC_SIZE * 3), %esi
	jb	L(max)

	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	tzcntl	%eax, %eax
	/* Check the end of the data.  */
	cmpl	%eax, %esi
	jb	L(max)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 3 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1, but using 64-bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	incl	%eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2, but using 64-bit
	   instructions.  */
	tzcntl	%eax, %eax
	subq	%rdx, %rdi
	addl	$(VEC_SIZE + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl	%eax, %eax
	subl	$(VEC_SIZE * 2), %esi
	/* Check the end of the data.  */
	cmpl	%eax, %esi
	jb	L(max_end)
	subq	%rdx, %rdi
	addl	$(VEC_SIZE * 2 + 1), %eax
	addq	%rdi, %rax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrq	$2, %rax
#  endif
	VZEROUPPER_RETURN

L(max_end):
	movq	%r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing a page with the first load.  */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1.  */
	orq	$(VEC_SIZE - 1), %rdi
	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb	%ymm1, %eax
	/* Remove the leading bytes.  sarxl only uses bits [4:0] of the
	   count, so there is no need to manually mod rdx.  */
	sarxl	%edx, %eax, %eax
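	/* The aligned load above cannot cross a page; shifting its
	   match mask right by the start's offset within the 32-byte
	   block discards bytes before the string.  Any sign bits
	   shifted in correspond to a real match in the block's last
	   byte, so neither the zero test nor tzcnt is affected.  */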

# ifdef USE_AS_STRNLEN
	testl	%eax, %eax
	jnz	L(cross_page_less_vec)
	leaq	1(%rdi), %rcx
	subq	%rdx, %rcx
#  ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count.  */
	shrl	$2, %ecx
#  endif
	/* Check the length.  */
	cmpq	%rsi, %rcx
	jb	L(cross_page_continue)
	movq	%r8, %rax
# else
	testl	%eax, %eax
	jz	L(cross_page_continue)
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Divide the length by 4 to get the wchar_t count.  */
	shrl	$2, %eax
#  endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl	%eax, %eax
#  ifdef USE_AS_WCSLEN
	/* NB: Multiply the length by 4 to get the byte count.  */
	sall	$2, %esi
#  endif
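	/* eax is the byte index of the first NUL in the page-cross
	   vector; the cmp/cmov below clamp it to the byte-scaled max
	   length, i.e. compute the minimum of the two.  */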

	cmpq	%rax, %rsi
	cmovb	%esi, %eax
#  ifdef USE_AS_WCSLEN
	shrl	$2, %eax
#  endif
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif