/*
 * memchr - find a character in a memory zone
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon available.
 */

#include "../asmdefs.h"

/* Arguments and results. */
#define srcin x0
#define chrin w1
#define cntin x2

#define result x0

#define src x3
#define tmp x4
#define wtmp2 w5
#define synd x6
#define soff x9
#define cntrem x10

#define vrepchr v0
#define vdata1 v1
#define vdata2 v2
#define vhas_chr1 v3
#define vhas_chr2 v4
#define vrepmask v5
#define vend v6
/*
 * Core algorithm:
 *
 * For each 32-byte chunk we calculate a 64-bit syndrome value, with two bits
 * per byte. For each tuple, bit 0 is set if the relevant byte matched the
 * requested character and bit 1 is not used (faster than using a 32-bit
 * syndrome). Since the bits in the syndrome reflect exactly the order in which
 * things occur in the original string, counting the trailing zeros allows us
 * to identify exactly which byte has matched.
 */
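
/*
 * Illustration only (not assembled): for one 32-byte chunk the scheme
 * corresponds to the C sketch below, where chunk, c and offset are
 * placeholder names and standard integer types are assumed. Each matching
 * byte sets bit 0 of its two-bit slot, so the trailing-zero count of the
 * syndrome, halved, gives the index of the first match.
 *
 *	size_t offset;
 *	uint64_t syndrome = 0;
 *	for (int i = 0; i < 32; i++)
 *		if (chunk[i] == c)
 *			syndrome |= 1ULL << (2 * i);
 *	if (syndrome != 0)
 *		offset = __builtin_ctzll (syndrome) >> 1;
 *
 * AArch64 has no count-trailing-zeros instruction, so the code below
 * computes the equivalent of __builtin_ctzll as clz(rbit(syndrome)).
 */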

ENTRY (__memchr_aarch64)
	/* Do not dereference srcin if there are no bytes to compare. */
	cbz cntin, L(zero_length)
	/*
	 * Magic constant 0x40100401 allows us to identify which lane matches
	 * the requested byte.
	 */
	mov wtmp2, #0x0401
	movk wtmp2, #0x4010, lsl #16
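	/*
	 * Replicated into vrepmask.4s, this constant gives the repeating
	 * per-byte masks 0x01, 0x04, 0x10, 0x40. After ANDing with the
	 * 0x00/0xff compare results, each byte owns a distinct pair of
	 * bits, so two rounds of pairwise addition can fold 32 bytes into
	 * a 64-bit syndrome without losing track of which byte matched.
	 */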
	dup vrepchr.16b, chrin
	/* Work with aligned 32-byte chunks. */
	bic src, srcin, #31
	dup vrepmask.4s, wtmp2
	ands soff, srcin, #31
	and cntrem, cntin, #31
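	/*
	 * cntrem = cntin % 32 is kept for L(masklast), where it is used
	 * to locate the end of the string within its final 32-byte block.
	 */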
	b.eq L(loop)

	/*
	 * Input string is not 32-byte aligned. We calculate the syndrome
	 * value for the aligned 32-byte block containing the first bytes
	 * and mask the irrelevant part.
	 */

	ld1 {vdata1.16b, vdata2.16b}, [src], #32
	sub tmp, soff, #32
	adds cntin, cntin, tmp
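	/*
	 * cntin is reduced by the 32 - soff bytes consumed from this first
	 * aligned block; the flags set here are consumed by b.ls below
	 * (none of the intervening instructions touch them).
	 */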
	cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
	and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
	addp vend.16b, vend.16b, vend.16b /* 128->64 */
	mov synd, vend.d[0]
	/* Clear the soff*2 lower bits. */
	lsl tmp, soff, #1
	lsr synd, synd, tmp
	lsl synd, synd, tmp
	/* The first block can also be the last. */
	b.ls L(masklast)
	/* Have we found something already? */
	cbnz synd, L(tail)

L(loop):
	ld1 {vdata1.16b, vdata2.16b}, [src], #32
	subs cntin, cntin, #32
	cmeq vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* If we're out of data, we finish regardless of the result. */
	b.ls L(end)
	/* Use a fast check for the termination condition. */
	orr vend.16b, vhas_chr1.16b, vhas_chr2.16b
	addp vend.2d, vend.2d, vend.2d
	mov synd, vend.d[0]
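	/*
	 * Any nonzero bit in synd means at least one byte of this chunk
	 * matched; the exact position is only recovered at L(end).
	 */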
	/* We're not out of data, so loop if we haven't found the character. */
	cbz synd, L(loop)

L(end):
	/* Termination condition found, let's calculate the syndrome value. */
	and vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp vend.16b, vhas_chr1.16b, vhas_chr2.16b /* 256->128 */
	addp vend.16b, vend.16b, vend.16b /* 128->64 */
	mov synd, vend.d[0]
	/* Only do the clear for the last possible block. */
	b.hi L(tail)

L(masklast):
	/* Clear the (32 - ((cntrem + soff) % 32)) * 2 upper bits. */
	add tmp, cntrem, soff
	and tmp, tmp, #31
	sub tmp, tmp, #32
	neg tmp, tmp, lsl #1
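	/*
	 * tmp held ((cntrem + soff) % 32) - 32, i.e. minus the count of
	 * bytes in the final block that lie past the end of the string;
	 * negating its doubled value gives the number of upper syndrome
	 * bits to shift out.
	 */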
	lsl synd, synd, tmp
	lsr synd, synd, tmp

L(tail):
	/* Count the trailing zeros using bit reversing. */
	rbit synd, synd
	/* Compensate the last post-increment. */
	sub src, src, #32
	/* Check that we have found a character. */
	cmp synd, #0
	/* And count the leading zeros. */
	clz synd, synd
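	/*
	 * After rbit + clz, synd equals the trailing-zero count of the
	 * original syndrome, i.e. twice the byte index of the first match,
	 * hence the lsr #1 in the address calculation below.
	 */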
	/* Compute the potential result. */
	add result, src, synd, lsr #1
	/* Select result or NULL. */
	csel result, xzr, result, eq
	ret

L(zero_length):
	mov result, #0
	ret

END (__memchr_aarch64)