/*
 * memchr - find a character in a memory zone
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Arguments and results. */
#define srcin		x0
#define chrin		w1
#define cntin		x2

#define result		x0

#define src		x3
#define tmp		x4
#define wtmp2		w5
#define synd		x6
#define soff		x9
#define cntrem		x10

#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_chr1	v3
#define vhas_chr2	v4
#define vrepmask	v5
#define vend		v6
/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. Within each two-bit pair, bit 0 is set if the relevant byte
 * matched the requested character and bit 1 is left unused (this is faster
 * than computing a 32-bit syndrome). Since the bits in the syndrome reflect
 * exactly the order of the bytes in the original string, counting trailing
 * zeros identifies exactly which byte matched.
 */
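
/*
 * Illustrative scalar sketch of the syndrome construction described above.
 * This is a hedged reference model, not part of the build: the function
 * name and the scalar loop are assumptions, but the bit layout (match for
 * byte i at bit 2*i, odd bits unused) follows the comment above.
 *
 *     #include <stdint.h>
 *
 *     static uint64_t chunk_syndrome (const unsigned char *chunk,
 *                                     unsigned char c)
 *     {
 *         uint64_t synd = 0;
 *         for (int i = 0; i < 32; i++)      // one 32-byte chunk
 *             if (chunk[i] == c)
 *                 synd |= 1ull << (2 * i);  // bit 0 of each 2-bit pair
 *         return synd;
 *     }
 *
 * The index of the first match is then ctz (synd) / 2.
 */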

ENTRY (__memchr_aarch64)
	/* Do not dereference srcin if no bytes to compare. */
	cbz	cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
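	/*
	 * Note (assumed reading of the constant): replicated across
	 * vrepmask.4s, 0x40100401 yields the repeating byte pattern
	 * { 0x01, 0x04, 0x10, 0x40 } in lane order. ANDed with the
	 * 0xFF/0x00 comparison results, every byte in a 4-byte group keeps
	 * a distinct even-numbered bit, so the two pairwise-add steps below
	 * can fold 32 comparison bytes into the 64-bit syndrome without
	 * matches colliding.
	 */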
	dup	vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks. */
	bic	src, srcin, #31
	dup	vrepmask.4s, wtmp2
	ands	soff, srcin, #31
	and	cntrem, cntin, #31
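	/* soff is the offset of srcin within its 32-byte chunk; cntrem keeps
	   cntin modulo 32 for the final-block masking in L(masklast). */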
	b.eq	L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask off the irrelevant part.
	 */

	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	sub	tmp, soff, #32
	adds	cntin, cntin, tmp
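	/* The adds leaves cntin = cntin + soff - 32, the bytes remaining
	   past this first aligned chunk, and sets the flags: LS holds iff
	   the original count ends within this chunk. The b.ls below relies
	   on the intervening vector ops not touching NZCV. */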
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Clear the soff*2 lower bits. */
	lsl	tmp, soff, #1
	lsr	synd, synd, tmp
	lsl	synd, synd, tmp
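	/* Worked example: with soff = 3, tmp = 6 and the shift pair clears
	   the six low syndrome bits, discarding the three bytes that sit
	   before srcin in the aligned chunk. */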
	/* The first block can also be the last. */
	b.ls	L(masklast)
	/* Have we found something already? */
	cbnz	synd, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	subs	cntin, cntin, #32
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data we finish regardless of the result. */
	b.ls	L(end)
	/* Use a fast check for the termination condition. */
	orr	vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend.2d, vend.2d, vend.2d
	mov	synd, vend.d[0]
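	/* synd is now nonzero iff any of the 32 bytes matched: orr merges
	   both comparison vectors and the 64-bit pairwise add folds the two
	   halves, which is cheaper than the full masked reduction done at
	   L(end). */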
	/* We're not out of data, loop if we haven't found the character. */
	cbz	synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value. */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend.16b, vhas_chr1.16b, vhas_chr2.16b		/* 256->128 */
	addp	vend.16b, vend.16b, vend.16b			/* 128->64 */
	mov	synd, vend.d[0]
	/* Only clear the upper bits for the last possible block; the flags
	   are still those of the subs in the loop, so HI means more data
	   remains and no masking is needed. */
	b.hi	L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits. */
	add	tmp, cntrem, soff
	and	tmp, tmp, #31
	sub	tmp, tmp, #32
	neg	tmp, tmp, lsl #1
	lsl	synd, synd, tmp
	lsr	synd, synd, tmp
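	/* Note: the sequence above computes
	   tmp = (32 - ((cntrem + soff) & 31)) * 2, the number of syndrome
	   bits that lie past the end of the buffer. When
	   (cntrem + soff) & 31 == 0 the last chunk is entirely valid and
	   tmp is 64, which the variable shifts treat as shift-by-0 (shift
	   amounts are taken modulo 64), so no bits are cleared. */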

L(tail):
	/* Count the trailing zeros using bit reversal (rbit + clz == ctz). */
	rbit	synd, synd
	/* Compensate for the last post-increment. */
	sub	src, src, #32
	/* Check that we have found a character. */
	cmp	synd, #0
	/* And count the leading zeros. */
	clz	synd, synd
	/* Compute the potential result: the match bit index divided by 2 is
	   the byte offset within the chunk. */
	add	result, src, synd, lsr #1
	/* Select result or NULL. */
	csel	result, xzr, result, eq
	ret

L(zero_length):
	mov	result, #0
	ret

END (__memchr_aarch64)
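
/*
 * Hedged usage sketch (assumption: the routine follows the standard memchr
 * signature, as declared in the library's headers; the harness below is
 * illustrative only and is compiled together with this .S file):
 *
 *     #include <stddef.h>
 *     #include <stdio.h>
 *
 *     void *__memchr_aarch64 (const void *s, int c, size_t n);
 *
 *     int main (void)
 *     {
 *         const char buf[] = "find the needle";
 *         const char *p = __memchr_aarch64 (buf, 'n', sizeof (buf) - 1);
 *         if (p)
 *             printf ("offset %td\n", p - buf);  // prints "offset 2"
 *         else
 *             printf ("not found\n");
 *         return 0;
 *     }
 */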