/* Optimized strlen implementation for PowerPC64/POWER8 using a vectorized
   loop.
   Copyright (C) 2016-2024 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* int [r3] strlen (char *s [r3])  */

#ifndef STRLEN
# define STRLEN strlen
#endif
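
/* A rough, simplified C sketch of the overall strategy used below
   (illustrative only, assuming <stddef.h>/<stdint.h>; it mirrors the
   control flow but ignores the doubleword and vector tricks):

     size_t strlen (const char *s)
     {
       const char *p = s;
       // 1. Check the first (possibly misaligned) doubleword, then scan
       //    doublewords with GPRs until p reaches a 64-byte boundary.
       while (((uintptr_t) p & 63) != 0)
         {
           if (*p == '\0')
             return p - s;
           p++;
         }
       // 2. Scan 64-byte blocks with VMX until one contains a zero byte.
       // 3. Locate the zero byte within that block and return the length.
       while (*p != '\0')
         p++;
       return p - s;
     }  */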
	.machine power8
ENTRY_TOCLESS (STRLEN, 4)
	CALL_MCOUNT 1
	dcbt	0,r3
	clrrdi	r4,r3,3	      /* Align the address to doubleword boundary.  */
	rlwinm	r6,r3,3,26,28 /* Calculate padding.  */
	li	r0,0	      /* Doubleword with null chars to use
				 with cmpb.  */
	li	r5,-1	      /* MASK = 0xffffffffffffffff.  */
	ld	r12,0(r4)     /* Load doubleword from memory.  */
#ifdef __LITTLE_ENDIAN__
	sld	r5,r5,r6
#else
	srd	r5,r5,r6      /* MASK = MASK >> padding.  */
#endif
	orc	r9,r12,r5     /* Mask bits that are not part of the string.  */
	cmpb	r10,r9,r0     /* Check for null bytes in DWORD1.  */
	cmpdi	cr7,r10,0     /* If r10 == 0, no nulls have been found.  */
	bne	cr7,L(done)
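
	/* Illustrative C equivalent of the check above, assuming <stdint.h>
	   and showing the little-endian variant (names are only for the
	   sketch):

	     uint64_t dword = *(const uint64_t *) ((uintptr_t) s & ~7UL);
	     uint64_t mask  = (uint64_t) -1 << (8 * ((uintptr_t) s & 7));
	     dword |= ~mask;   // bytes before the string start become 0xff
	     // cmpb yields 0xff in every byte position where dword holds
	     // 0x00, so a nonzero result means the first doubleword already
	     // contains the terminating null.  */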

	/* For shorter strings (< 64 bytes), we will not use vector registers,
	   as the overhead isn't worth it.  So, let's use GPRs instead.  This
	   will be done the same way as we do in the POWER7 implementation.
	   Let's see if we are aligned to a quadword boundary.  If so, we can
	   jump to the first (non-vectorized) loop.  Otherwise, we have to
	   handle the next DWORD first.  */
	mtcrf	0x01,r4
	mr	r9,r4
	addi	r9,r9,8
	bt	28,L(align64)

	/* Handle the next 8 bytes so we are aligned to a quadword
	   boundary.  */
	ldu	r5,8(r4)
	cmpb	r10,r5,r0
	cmpdi	cr7,r10,0
	addi	r9,r9,8
	bne	cr7,L(done)

L(align64):
	/* Proceed to the old (POWER7) implementation, checking two doublewords
	   per iteration.  For the first 56 bytes, we will just check for null
	   characters.  After that, we will also check if we are 64-byte aligned
	   so we can jump to the vectorized implementation.  We will unroll
	   these loops to avoid excessive branching.  */
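	/* Roughly, each unrolled step below behaves like this C sketch
	   (illustrative; it assumes <stdint.h>, a doubleword-aligned pointer
	   p, and a hypothetical cmpb() helper that returns 0xff in every
	   byte position holding a zero byte):

	     uint64_t d1 = p[0], d2 = p[1];      // two 8-byte loads
	     if ((cmpb (d1, 0) | cmpb (d2, 0)) != 0)
	       goto dword_zero;                  // work out which one it is
	     p += 2;                             // advance 16 bytes  */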
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	/* Are we 64-byte aligned?  If so, jump to the vectorized loop.
	   Note: aligning to a 64-byte boundary will necessarily slow down
	   performance for strings around 64 bytes in length due to the extra
	   comparisons required to check alignment for the vectorized loop.
	   This is a necessary tradeoff we are willing to take in order to
	   speed up the calculation for larger strings.  */
	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16
	bne	cr7,L(dword_zero)

	andi.	r10,r9,63
	beq	cr0,L(preloop)
	ld	r6,8(r4)
	ldu	r5,16(r4)
	cmpb	r10,r6,r0
	cmpb	r11,r5,r0
	or	r5,r10,r11
	cmpdi	cr7,r5,0
	addi	r9,r9,16

	/* At this point, we are necessarily 64-byte aligned.  If no zeroes were
	   found, jump to the vectorized loop.  */
	beq	cr7,L(preloop)

L(dword_zero):
	/* OK, one (or both) of the doublewords contains a null byte.  Check
	   the first doubleword and decrement the address in case the first
	   doubleword really contains a null byte.  */

	cmpdi	cr6,r10,0
	addi	r4,r4,-8
	bne	cr6,L(done)

	/* The null byte must be in the second doubleword.  Adjust the address
	   again and move the result of cmpb to r10 so we can calculate the
	   length.  */

	mr	r10,r11
	addi	r4,r4,8

	/* If the null byte was found in the non-vectorized code, compute the
	   final length.  r10 has the output of the cmpb instruction, that is,
	   it contains 0xff in the same position as the null byte in the
	   original doubleword from the string.  Use that to calculate the
	   length.  */
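	/* Illustrative sketch of the computation below, assuming <stdint.h>
	   and using ctz/clz for count-trailing/leading-zeros (e.g. the GCC
	   builtins __builtin_ctzll/__builtin_clzll):

	     // r4 points to the doubleword holding the null byte and r10 is
	     // the cmpb result, with 0xff marking that byte.
	     bits = little_endian ? ctz (r10) : clz (r10);
	     return (r4 - r3) + bits / 8;   // bytes before + offset in dword

	   For example, on little-endian a null at byte offset 3 gives
	   r10 = 0x00000000ff000000, ctz = 24, and 24 / 8 = 3.  */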
L(done):
#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9		/* Count the bits in the mask.  */
#else
	cntlzd	r0,r10		/* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	srdi	r0,r0,3		/* Convert leading/trailing zeros to bytes.  */
	add	r3,r5,r0	/* Compute final length.  */
	blr

	/* Vectorized implementation starts here.  */
	.p2align 4
L(preloop):
	/* Set up for the loop.  */
	mr	r4,r9
	li	r7,16		/* Load required offsets.  */
	li	r8,32
	li	r9,48
	li	r12,8
	vxor	v0,v0,v0	/* VR with null chars to use with
				   vcmpequb.  */

	/* Main loop to look for the end of the string.  We will read in
	   64-byte chunks.  Align it to 32 bytes and unroll it 3 times to
	   leverage the icache performance.  */
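	/* Each unrolled iteration below is roughly equivalent to this sketch
	   (illustrative; any_zero_byte() stands in for the vminub merges of
	   the four quadwords followed by the vcmpequb. test against a zero
	   vector):

	     if (any_zero_byte (p, 64))   // one aligned 64-byte block
	       goto vmx_zero;             // then locate the null inside it
	     p += 64;

	   vminub keeps the per-byte minimum, so a zero byte in any of the
	   four quadwords survives into the merged vector.  */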
	.p2align 5
L(loop):
	lvx	v1,r4,r0	/* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0	/* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	bne	cr6,L(vmx_zero)

	lvx	v1,r4,r0	/* Load 4 quadwords.  */
	lvx	v2,r4,r7
	lvx	v3,r4,r8
	lvx	v4,r4,r9
	vminub	v5,v1,v2	/* Compare and merge into one VR for speed.  */
	vminub	v6,v3,v4
	vminub	v7,v5,v6
	vcmpequb.	v7,v7,v0	/* Check for NULLs.  */
	addi	r4,r4,64	/* Adjust address for the next iteration.  */
	beq	cr6,L(loop)

L(vmx_zero):
	/* OK, we found a null byte.  Let's look for it in the current 64-byte
	   block and mark it in its corresponding VR.  */
	vcmpequb	v1,v1,v0
	vcmpequb	v2,v2,v0
	vcmpequb	v3,v3,v0
	vcmpequb	v4,v4,v0

	/* We will now 'compress' the result into a single doubleword, so it
	   can be moved to a GPR for the final calculation.  First, we
	   generate an appropriate mask for vbpermq, so we can permute bits into
	   the first halfword.  */
	vspltisb	v10,3
	lvsl	v11,r0,r0
	vslb	v10,v11,v10

	/* Permute the first bit of each byte into bits 48-63.  */
	vbpermq	v1,v1,v10
	vbpermq	v2,v2,v10
	vbpermq	v3,v3,v10
	vbpermq	v4,v4,v10
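
	/* Each VR now holds a 16-bit mask in which bit i is set when byte i
	   of the corresponding quadword was null.  The code below rotates
	   the four masks into distinct 16-bit slots, ORs them together and
	   moves the result to a GPR; conceptually, for little-endian
	   (m1..m4 stand for the four masks):

	     uint64_t combined = m1 | ((uint64_t) m2 << 16)
	                         | ((uint64_t) m3 << 32)
	                         | ((uint64_t) m4 << 48);

	   The combined value has one bit per byte of the 64-byte block, so a
	   trailing-zero count (leading-zero on big-endian) gives the byte
	   offset of the first null directly, with no division by 8.  */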

	/* Shift each component into its correct position for merging.  */
#ifdef __LITTLE_ENDIAN__
	vsldoi	v2,v2,v2,2
	vsldoi	v3,v3,v3,4
	vsldoi	v4,v4,v4,6
#else
	vsldoi	v1,v1,v1,6
	vsldoi	v2,v2,v2,4
	vsldoi	v3,v3,v3,2
#endif

	/* Merge the results and move to a GPR.  */
	vor	v1,v2,v1
	vor	v2,v3,v4
	vor	v4,v1,v2
	mfvrd	r10,v4

	/* Adjust address to the beginning of the current 64-byte block.  */
	addi	r4,r4,-64

#ifdef __LITTLE_ENDIAN__
	addi	r9,r10,-1	/* Form a mask from trailing zeros.  */
	andc	r9,r9,r10
	popcntd	r0,r9		/* Count the bits in the mask.  */
#else
	cntlzd	r0,r10		/* Count leading zeros before the match.  */
#endif
	subf	r5,r3,r4
	add	r3,r5,r0	/* Compute final length.  */
	blr

END (STRLEN)
libc_hidden_builtin_def (strlen)