ev6-memchr.S source code [linux/arch/alpha/lib/ev6-memchr.S]

1	/* SPDX-License-Identifier: GPL-2.0 */
2	/*
3	* arch/alpha/lib/ev6-memchr.S
4	*
5	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
6	*
7	* Finds characters in a memory area. Optimized for the Alpha:
8	*
9	* - memory accessed as aligned quadwords only
10	* - uses cmpbge to compare 8 bytes in parallel
11	* - does binary search to find 0 byte in last
12	* quadword (HAKMEM needed 12 instructions to
13	* do this instead of the 9 instructions that
14	* binary search needs).
15	*
16	* For correctness consider that:
17	*
18	* - only minimum number of quadwords may be accessed
19	* - the third argument is an unsigned long
20	*
21	* Much of the information about 21264 scheduling/coding comes from:
22	* Compiler Writer's Guide for the Alpha 21264
23	* abbreviated as 'CWG' in other comments here
24	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
25	* Scheduling notation:
26	* E - either cluster
27	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
28	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
29	* Try not to change the actual algorithm if possible for consistency.
30	*/
31	#include <linux/export.h>
32	.set noreorder
33	.set noat
34
	/*
	 * memchr -- find the first occurrence of a byte in a memory area.
	 *
	 * Register usage, as read from the code below:
	 *   $16  in : start address of the area
	 *   $17  in : byte to look for; only the low 8 bits are used and
	 *             they are replicated into all eight bytes of $17
	 *   $18  in : length in bytes; reused later as a remaining byte
	 *             count and as the address of the last quadword
	 *   $0   out: address of the first matching byte, or 0 if none
	 *   $1-$7   : scratch ($3 = all ones, $5 = end address $16 + length)
	 *   $31     : used as the constant zero
	 *
	 * Control flow:
	 *   - length == 0 branches straight to $not_found.
	 *   - length <= 8 assembles the eight bytes starting at $16 from two
	 *     ldq_u loads and falls into $last_quad.
	 *   - length > 8 goes to $first_quad, which tests the first aligned
	 *     quadword with the bytes in front of $16 masked out, then scans
	 *     whole aligned quadwords in a loop unrolled by two, and finally
	 *     reaches $last_quad through $final for the last quadword.
	 *   - $found_it turns the cmpbge bit mask in $2 into a byte index
	 *     and adds it to $0.
	 */
35	.align `4`
36	.globl memchr
37	.ent memchr
38	memchr:
	# Frame size 0 with the return address in $26: no stack frame is used.
39	.frame $`30`,`0`,$`26`,`0`
40	.prologue `0`
41

42	# Hack -- if someone passes in (size_t)-1, hoping to just
43	# search until the end of the address space, we will overflow
44	# below when we find the address of the last byte. Given
45	# that we will never have a 56-bit address space, cropping
46	# the length is the easiest way to avoid trouble.
	# The cropped length goes to $5 only; $18 keeps the original value
	# for the zero test and the "<= 8 bytes" test below.
47	zap $`18`, `0x80`, $`5` # U : Bound length
48	beq $`18`, $not_found # U :
49	ldq_u $`1`, `0`($`16`) # L : load first quadword Latency=`3`
50	and $`17`, `0xff`, $`17` # E : L L U U : `00000000000000ch`

	# The search byte is replicated into all eight bytes of $17 over the
	# next three groups, interleaved with the length test and end address.
52	insbl $`17`, `1`, $`2` # U : `000000000000ch00`
53	cmpult $`18`, `9`, $`4` # E : $4 = 1 if at most 8 bytes are to be searched
54	or $`2`, $`17`, $`17` # E : `000000000000chch`
55	lda $`3`, -`1`($`31`) # E : U L L U

57	sll $`17`, `16`, $`2` # U : `00000000chch0000`
58	addq $`16`, $`5`, $`5` # E : Max search address
59	or $`2`, $`17`, $`17` # E : `00000000chchchch`
60	sll $`17`, `32`, $`2` # U : U L L U : chchchch00000000

62	or $`2`, $`17`, $`17` # E : chchchchchchchch
63	extql $`1`, $`16`, $`7` # U : $`7` is upper bits
64	beq $`4`, $first_quad # U :
	# Only reached when length <= 8: load the quadword holding the last
	# byte ($5 - 1) so the eight bytes starting at $16 can be assembled.
65	ldq_u $`6`, -`1`($`5`) # L : L U U L : eight or less bytes to search Latency=`3`

67	extqh $`6`, $`16`, $`6` # U : `2` cycle stall for $`6`
68	mov $`16`, $`0` # E :
69	nop # E :
70	or $`7`, $`6`, $`1` # E : L U L U $`1` = quadword starting at $`16`

72	# Deal with the case where at most 8 bytes remain to be searched
73	# in $1. E.g.:
74	# $18 = 6
75	# $1 = ????c6c5c4c3c2c1
	# On entry $0 is the address that byte 0 of $1 came from.
76	$last_quad:
	# srl only uses the low 6 bits of its count, so shifting the all-ones
	# value in $3 right by -$18 leaves exactly $18 low bits set.
77	negq $`18`, $`6` # E :
78	xor $`17`, $`1`, $`1` # E :
79	srl $`3`, $`6`, $`6` # U : $`6` = mask of $`18` bits set
80	cmpbge $`31`, $`1`, $`2` # E : L U L U

	# After the xor a matching byte is zero, so cmpbge against $31 sets one
	# bit per matching byte; the and drops matches beyond the length.
82	nop
83	nop
84	and $`2`, $`6`, $`2` # E :
85	beq $`2`, $not_found # U : U L U L

	# Entry conditions for $found_it: $2 is a nonzero cmpbge mask (bit i set
	# when byte i matched) and $0 is the address of byte 0 of that quadword.
	# The result is $0 plus the index of the lowest set bit of $2.
87	$found_it:
88	#ifdef CONFIG_ALPHA_EV67
89	/*
90	* Since we are guaranteed to have set one of the bits, we don't
91	* have to worry about coming back with a 0x40 out of cttz...
92	*/
93	cttz $`2`, $`3` # U0 :
94	addq $`0`, $`3`, $`0` # E : All done
95	nop # E :
96	ret # L0 : L U L U
97	#else
98	/*
99	* Slow and clunky. It can probably be improved.
100	* An exercise left for others.
101	*/
	# Isolate the lowest set bit of $2 ($2 & -$2), then binary search its
	# position: each step adds 4, 2 or 1 to $0 when the bit is not among
	# the positions selected by the 0x0f, 0x33 or 0x55 test mask.
102	negq $`2`, $`3` # E :
103	and $`2`, $`3`, $`2` # E :
104	and $`2`, `0x0f`, $`1` # E :
105	addq $`0`, `4`, $`3` # E :

107	cmoveq $`1`, $`3`, $`0` # E : Latency `2`, extra map cycle
108	nop # E : keep with cmov
109	and $`2`, `0x33`, $`1` # E :
110	addq $`0`, `2`, $`3` # E : U L U L : `2` cycle stall on $`0`

112	cmoveq $`1`, $`3`, $`0` # E : Latency `2`, extra map cycle
113	nop # E : keep with cmov
114	and $`2`, `0x55`, $`1` # E :
115	addq $`0`, `1`, $`3` # E : U L U L : `2` cycle stall on $`0`

117	cmoveq $`1`, $`3`, $`0` # E : Latency `2`, extra map cycle
118	nop
119	nop
120	ret # L0 : L U L U
121	#endif

123	# Deal with the case where $18 > 8 bytes remain to be
124	# searched. $16 may not be aligned.
125	.align `4`
126	$first_quad:
	# $0 = aligned address of the first quadword (already loaded in $1).
	# The or with $2 forces the bytes in front of $16 to 0xff after the
	# xor, so they can never be reported as a match by cmpbge.
127	andnot $`16`, `0x7`, $`0` # E :
128	insqh $`3`, $`16`, $`2` # U : $`2` = `0000ffffffffffff` ($`16`<`0`:`2`> ff)
129	xor $`1`, $`17`, $`1` # E :
130	or $`1`, $`2`, $`1` # E : U L U L $`1` = ====ffffffffffff

132	cmpbge $`31`, $`1`, $`2` # E :
133	bne $`2`, $found_it # U :
134	# At least one byte left to process.
	# Load the next aligned quadword and compute the address of the last byte.
135	ldq $`1`, `8`($`0`) # L :
136	subq $`5`, `1`, $`18` # E : U L U L

138	addq $`0`, `8`, $`0` # E :
139	# Make $18 point to last quad to be accessed (the
140	# last quad may or may not be partial).
141	andnot $`18`, `0x7`, $`18` # E :
142	cmpult $`0`, $`18`, $`2` # E :
	# If $0 already is the last quadword, it is in $1: go finish with it.
143	beq $`2`, $final # U : U L U L

145	# At least two quads remain to be accessed.

	# Bit 3 of the byte distance between $0 and the last quadword tells
	# whether that distance is an odd number of quadwords. The loop is
	# entered at the matching half so that it exits after $odd_quad_count
	# with $0 == $18 and the last quadword prefetched into $4.
147	subq $`18`, $`0`, $`4` # E : $`4` <- nr quads to be processed
148	and $`4`, `8`, $`4` # E : odd number of quads?
149	bne $`4`, $odd_quad_count # U :
150	# At least three quads remain to be accessed
151	mov $`1`, $`4` # E : L U L U : move prefetched value to correct reg

	# Loop unrolled by two. $4 and $1 alternate as "quadword under test"
	# and "prefetched next quadword"; $0 is the address of the quadword
	# under test and $18 the address of the last quadword.
153	.align `4`
154	$unrolled_loop:
	# First half: test $4 (the quadword at $0) while loading the next into $1.
155	ldq $`1`, `8`($`0`) # L : prefetch $`1`
156	xor $`17`, $`4`, $`2` # E :
157	cmpbge $`31`, $`2`, $`2` # E :
158	bne $`2`, $found_it # U : U L U L

160	addq $`0`, `8`, $`0` # E :
161	nop # E :
162	nop # E :
163	nop # E :

165	$odd_quad_count:
	# Second half: test $1 (the quadword at $0) while loading the next into $4.
	# $6 = $0 + 8 is compared with $18 to decide whether to loop again.
166	xor $`17`, $`1`, $`2` # E :
167	ldq $`4`, `8`($`0`) # L : prefetch $`4`
168	cmpbge $`31`, $`2`, $`2` # E :
169	addq $`0`, `8`, $`6` # E :

171	bne $`2`, $found_it # U :
172	cmpult $`6`, $`18`, $`6` # E :
173	addq $`0`, `8`, $`0` # E :
174	nop # E :

176	bne $`6`, $unrolled_loop # U :
177	mov $`4`, $`1` # E : move prefetched value into $`1`
178	nop # E :
179	nop # E :

	# $1 holds the last quadword and $0 its address; $5 - $0 is the number
	# of bytes of it that still belong to the search area.
181	$final: subq $`5`, $`0`, $`18` # E : $`18` <- number of bytes left to do
182	nop # E :
183	nop # E :
184	bne $`18`, $last_quad # U :

	# No match (or zero length): return 0 in $0.
186	$not_found:
187	mov $`31`, $`0` # E :
188	nop # E :
189	nop # E :
190	ret # L0 :

192	.end memchr
193	EXPORT_SYMBOL(memchr)
194

source code of linux/arch/alpha/lib/ev6-memchr.S