/* File: libc/AOR_v20.02/string/aarch64/strchrnul.S */

/*
 * strchrnul - find a character or nul in a string
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */

/* Assumptions:
 *
 * ARMv8-a, AArch64
 * Neon Available.
 */

#include "../asmdefs.h"

/* Arguments and results.  */
#define srcin		x0
#define chrin		w1

#define result		x0

/* General-purpose scratch registers:
     src    - cursor over 32-byte aligned hunks (post-incremented by each load)
     tmp1   - misalignment / padding mask / syndrome
     wtmp2  - holds the magic constant while it is replicated into vrepmask
     tmp3   - all-ones constant, then the raw syndrome (unaligned head only)  */
#define src		x2
#define tmp1		x3
#define wtmp2		w4
#define tmp3		x5

/* Vector registers:
     vrepchr    - chrin replicated into all 16 byte lanes
     vdata1/2   - the two 16-byte halves of the current hunk
     vhas_nul*  - per-byte 0xff/0x00 result of comparing the data with 0
     vhas_chr*  - per-byte 0xff/0x00 result of comparing the data with chrin,
                  later merged with vhas_nul*
     vrepmask   - magic constant replicated into all four 32-bit lanes
     vend1      - reduction of the comparison results  */
#define vrepchr		v0
#define vdata1		v1
#define vdata2		v2
#define vhas_nul1	v3
#define vhas_nul2	v4
#define vhas_chr1	v5
#define vhas_chr2	v6
#define vrepmask	v7
#define vend1		v16

/* Core algorithm.

   For each 32-byte hunk we calculate a 64-bit syndrome value, with
   two bits per byte (LSB is always in bits 0 and 1, for both big
   and little-endian systems).  For each tuple, bit 0 is set iff
   the relevant byte matched the requested character or nul.  Since the
   bits in the syndrome reflect exactly the order in which things occur
   in the original string a count_trailing_zeros() operation will
   identify exactly which byte is causing the termination.

   How the syndrome is built: the 0xff/0x00 match bytes are ANDed with
   vrepmask, which leaves byte lanes 4k, 4k+1, 4k+2, 4k+3 holding at most
   the single bit 0x01, 0x04, 0x10, 0x40 respectively.  Two pairwise byte
   additions (addp) then fold every four adjacent bytes into one byte, so
   the low 64 bits of the result carry one even-numbered bit per input
   byte, in string order.  */

/* Locals and temporaries.  */

/* __strchrnul_aarch64

   In:       srcin (x0)  = address of the first byte to examine.
             chrin (w1)  = character to search for; only its low 8 bits
                           take part (it is replicated with dup ... .16b).
   Out:      result (x0) = address of the first byte that equals chrin or
                           is nul, whichever occurs first.
   Clobbers: x2-x5, v0-v7, v16 and the condition flags (ands).
   Stack:    not used.

   Memory is read in whole 32-byte hunks aligned to 32 bytes, so up to 31
   bytes before srcin, and the bytes after the terminating position inside
   the same hunk, are loaded as well.

   NOTE(review): presumably the implementation behind the C prototype
   char *strchrnul (const char *s, int c) - confirm against the caller.
   ENTRY, END and L() come from ../asmdefs.h and are not visible here.  */
ENTRY (__strchrnul_aarch64)
	/* Magic constant 0x40100401 to allow us to identify which lane
	   matches the termination condition.  */
	mov	wtmp2, #0x0401
	movk	wtmp2, #0x4010, lsl #16
	dup	vrepchr.16b, chrin
	bic	src, srcin, #31		/* Work with aligned 32-byte hunks.  */
	dup	vrepmask.4s, wtmp2
	ands	tmp1, srcin, #31	/* tmp1 = number of padding bytes.  */
	b.eq	L(loop)			/* Already aligned: no head to mask.  */

	/* Input string is not 32-byte aligned.  Rather than forcing
	   the padding bytes to a safe value, we calculate the syndrome
	   for all the bytes, but then mask off those bits of the
	   syndrome that are related to the padding.  */
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	neg	tmp1, tmp1
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	orr	vhas_chr1.16b, vhas_chr1.16b, vhas_nul1.16b
	orr	vhas_chr2.16b, vhas_chr2.16b, vhas_nul2.16b
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	lsl	tmp1, tmp1, #1		/* tmp1 = -2 * padding.  */
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	mov	tmp3, #~0
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64
	/* The register shift count is taken modulo 64, so this shifts the
	   all-ones value right by 64 - 2 * padding and leaves exactly the
	   low 2 * padding bits set: the syndrome bits of the padding.  */
	lsr	tmp1, tmp3, tmp1

	mov	tmp3, vend1.d[0]
	bic	tmp1, tmp3, tmp1	// Mask padding bits.
	cbnz	tmp1, L(tail)

L(loop):
	ld1	{vdata1.16b, vdata2.16b}, [src], #32
	cmeq	vhas_nul1.16b, vdata1.16b, #0
	cmeq	vhas_chr1.16b, vdata1.16b, vrepchr.16b
	cmeq	vhas_nul2.16b, vdata2.16b, #0
	cmeq	vhas_chr2.16b, vdata2.16b, vrepchr.16b
	/* Use a fast check for the termination condition.  Every byte is
	   0x00 or 0xff here, so the sum of the two 64-bit halves is zero
	   only if no byte of the hunk matched.  */
	orr	vhas_chr1.16b, vhas_nul1.16b, vhas_chr1.16b
	orr	vhas_chr2.16b, vhas_nul2.16b, vhas_chr2.16b
	orr	vend1.16b, vhas_chr1.16b, vhas_chr2.16b
	addp	vend1.2d, vend1.2d, vend1.2d
	mov	tmp1, vend1.d[0]
	cbz	tmp1, L(loop)

	/* Termination condition found.  Now need to establish exactly why
	   we terminated.  */
	and	vhas_chr1.16b, vhas_chr1.16b, vrepmask.16b
	and	vhas_chr2.16b, vhas_chr2.16b, vrepmask.16b
	addp	vend1.16b, vhas_chr1.16b, vhas_chr2.16b	// 256->128
	addp	vend1.16b, vend1.16b, vend1.16b		// 128->64

	mov	tmp1, vend1.d[0]
L(tail):
	/* Count the trailing zeros, by bit reversing...  */
	rbit	tmp1, tmp1
	/* Re-bias source: the load post-incremented src past this hunk.  */
	sub	src, src, #32
	clz	tmp1, tmp1	/* ... and counting the leading zeros.  */
	/* tmp1 is twice the offset into the fragment.  */
	add	result, src, tmp1, lsr #1
	ret

END (__strchrnul_aarch64)