ev6-csum_ipv6_magic.S source code [linux/arch/alpha/lib/ev6-csum_ipv6_magic.S]

1	/* SPDX-License-Identifier: GPL-2.0 */
2	/*
3	* arch/alpha/lib/ev6-csum_ipv6_magic.S
4	* 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
5	*
6	* unsigned short csum_ipv6_magic(struct in6_addr *saddr,
7	* struct in6_addr *daddr,
8	* __u32 len,
9	* unsigned short proto,
10	* unsigned int csum);
11	*
12	* Much of the information about 21264 scheduling/coding comes from:
13	* Compiler Writer's Guide for the Alpha 21264
14	* abbreviated as 'CWG' in other comments here
15	* ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
16	* Scheduling notation:
17	* E - either cluster
18	* U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
19	* L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
20	* Try not to change the actual algorithm if possible for consistency.
21	* Determining actual stalls (other than slotting) doesn't appear to be easy to do.
22	*
23	* unsigned short csum_ipv6_magic(struct in6_addr *saddr,
24	* struct in6_addr *daddr,
25	* __u32 len,
26	* unsigned short proto,
27	* unsigned int csum);
28	*
29	* Swap <proto> (takes form 0xaabb)
30	* Then shift it left by 48, so result is:
31	* 0xbbaa0000 00000000
32	* Then turn it back into a sign extended 32-bit item
33	* 0xbbaa0000
34	*
35	* Swap <len> (an unsigned int) using Mike Burrows' 7-instruction sequence
36	* (we can't hide the 3-cycle latency of the unpkbw in the 6-instruction sequence)
37	* Assume input takes form 0xAABBCCDD
38	*
39	* Finally, original 'folding' approach is to split the long into 4 unsigned shorts
40	* add 4 ushorts, resulting in ushort/carry
41	* add carry bits + ushort --> ushort
42	* add carry bits + ushort --> ushort (in case the carry results in an overflow)
43	* Truncate to a ushort. (took 13 instructions)
44	* From doing some testing, using the approach in checksum.c:from64to16()
45	* results in the same outcome:
46	* split into 2 uints, add those, generating a ulong
47	* add the 3 low ushorts together, generating a uint
48	* a final add of the 2 lower ushorts
49	* truncating the result.
50	*
51	* Misalignment handling added by Ivan Kokshaysky <ink@jurassic.park.msu.ru>
52	* The cost is 16 instructions (~8 cycles), including two extra loads which
53	* may cause additional delay in rare cases (load-load replay traps).
54	*/
55
56	#include <linux/export.h>
/*
 * csum_ipv6_magic - sum two 16-byte addresses, the byte-swapped length,
 * the repositioned protocol value and the incoming checksum, then fold
 * the total to 16 bits and return its one's complement.
 *
 * Register use, as established by the code below:
 *   $16 - saddr pointer (base of the first three ldq_u loads, any alignment)
 *   $17 - daddr pointer (base of the second three ldq_u loads, any alignment)
 *   $18 - len; overwritten with its byte-swapped form, later with a carry
 *   $19 - proto; overwritten with <sign bits>bbaa0000, later with a carry
 *   $20 - csum; zero-extended from 32 bits, then used as the 64-bit
 *         running sum
 *   $0  - result: the low 16 bits hold the complemented, folded sum
 *
 * Registers written: $0-$7, $18-$20, $22, $23.
 * The routine is straight-line code (no branches or calls), performs no
 * stores, and .frame declares a zero-size frame.
 *
 * Instructions are grouped four at a time; the "U L U L" style notes on
 * the last instruction of each group follow the scheduling notation
 * explained in the file header.  Keep the instruction order intact when
 * editing: registers are reused as soon as their previous value is dead.
 */
57	.globl csum_ipv6_magic
58	.align `4`
59	.ent csum_ipv6_magic
60	.frame $`30`,`0`,$`26`,`0`
61	csum_ipv6_magic:
62	.prologue `0`
63
/*
 * Issue all six unaligned loads early (offsets 0, 8 and 15 from each
 * pointer) and interleave the independent len/proto/csum preparation so
 * the load latency is overlapped with useful work.
 * $7 = proto << 8, used further down to build the swapped protocol value.
 */
64	ldq_u $`0`,`0`($`16`) # L : Latency: `3`
65	inslh $`18`,`7`,$`4` # U : `0000000000AABBCC`
66	ldq_u $`1`,`8`($`16`) # L : Latency: `3`
67	sll $`19`,`8`,$`7` # U : U L U L : `0x00000000` `00aabb00`
68
69	and $`16`,`7`,$`6` # E : src misalignment
70	ldq_u $`5`,`15`($`16`) # L : Latency: `3`
71	zapnot $`20`,`15`,$`20` # U : zero extend incoming csum
72	ldq_u $`2`,`0`($`17`) # L : U L U L : Latency: `3`
73
/*
 * Source word 1 = extql(low quad) | extqh(next quad), using the byte
 * offset in $6.  $22 holds the high-part contribution.
 */
74	extql $`0`,$`6`,$`0` # U :
75	extqh $`1`,$`6`,$`22` # U :
76	ldq_u $`3`,`8`($`17`) # L : Latency: `3`
77	sll $`19`,`24`,$`19` # U : U U L U : `0x000000aa` bb000000
78
/*
 * If the source offset is zero, discard the high-part contribution by
 * replacing $22 with $31 (which reads as zero), so that only the first
 * quadword forms source word 1.
 * addl combines (proto << 24) and (proto << 8) as a 32-bit value; its two
 * low bytes are cleared later by the zap with mask 0x3.
 */
79	cmoveq $`6`,$`31`,$`22` # E : src aligned?
80	ldq_u $`23`,`15`($`17`) # L : Latency: `3`
81	inswl $`18`,`3`,$`18` # U : `000000CCDD000000`
82	addl $`19`,$`7`,$`19` # E : U L U L : <sign bits>bbaabb00
83
/*
 * Source word 2 is assembled the same way from the quads loaded at
 * offsets 8 and 15.
 * NOTE(review): there is no cmoveq for this word; presumably in the
 * aligned case the load at offset 15 returns the same quadword as the
 * load at offset 8, so the OR is harmless - confirm against the extqh
 * definition.
 */
84	or $`0`,$`22`,$`0` # E : `1st` src word complete
85	extql $`1`,$`6`,$`1` # U :
86	or $`18`,$`4`,$`18` # E : `000000CCDDAABBCC`
87	extqh $`5`,$`6`,$`5` # U : L U L U
88
/*
 * $6 is now reused for the destination misalignment and $22 for the
 * destination high-part contribution; the source values they held are
 * already consumed.
 */
89	and $`17`,`7`,$`6` # E : dst misalignment
90	extql $`2`,$`6`,$`2` # U :
91	or $`1`,$`5`,$`1` # E : `2nd` src word complete
92	extqh $`3`,$`6`,$`22` # U : L U L U :
93
94	cmoveq $`6`,$`31`,$`22` # E : dst aligned?
95	extql $`3`,$`6`,$`3` # U :
96	addq $`20`,$`0`,$`20` # E : begin summing the words
97	extqh $`23`,$`6`,$`23` # U : L U L U :
98
/*
 * Finish the length byte swap: two copies of the partially rotated
 * value are masked to complementary byte lanes (0xa selects bytes 1 and
 * 3) and merged below into DDCCBBAA.
 */
99	srl $`18`,`16`,$`4` # U : `0000000000CCDDAA`
100	or $`2`,$`22`,$`2` # E : `1st` dst word complete
101	zap $`19`,`0x3`,$`19` # U : <sign bits>bbaa0000
102	or $`3`,$`23`,$`3` # E : U L U L : `2nd` dst word complete
103
/*
 * Carry tracking: after each "addq $20,x,$20" the following
 * "cmpult $20,x,x" overwrites x with 1 if the 64-bit add wrapped
 * (sum < addend, unsigned) and 0 otherwise.  The addend registers
 * therefore end up holding the individual carry bits.
 */
104	cmpult $`20`,$`0`,$`0` # E :
105	addq $`20`,$`1`,$`20` # E :
106	zapnot $`18`,`0xa`,$`18` # U : `00000000DD00BB00`
107	zap $`4`,`0xa`,$`4` # U : U U L L : `0000000000CC00AA`
108
109	or $`18`,$`4`,$`18` # E : `00000000DDCCBBAA`
110	nop # E :
111	cmpult $`20`,$`1`,$`1` # E :
112	addq $`20`,$`2`,$`20` # E : U L U L
113
114	cmpult $`20`,$`2`,$`2` # E :
115	addq $`20`,$`3`,$`20` # E :
116	cmpult $`20`,$`3`,$`3` # E : (`1` cycle stall on $`20`)
117	addq $`20`,$`18`,$`20` # E : U L U L (`1` cycle stall on $`20`)
118
/*
 * Last two addends are the swapped length ($18) and the protocol value
 * ($19).  Meanwhile the carries already collected in $0..$3 start being
 * added together pairwise.
 */
119	cmpult $`20`,$`18`,$`18` # E :
120	addq $`20`,$`19`,$`20` # E : (`1` cycle stall on $`20`)
121	addq $`0`,$`1`,$`0` # E : merge the carries back into the csum
122	addq $`2`,$`3`,$`2` # E :
123
/*
 * $18 = carry(len) + carry(proto) is added into the running sum $20,
 * while $0 accumulates the four address-word carries.
 */
124	cmpult $`20`,$`19`,$`19` # E :
125	addq $`18`,$`19`,$`18` # E : (`1` cycle stall on $`19`)
126	addq $`0`,$`2`,$`0` # E :
127	addq $`20`,$`18`,$`20` # E : U L U L :
128	/ (1 cycle stall on $18, 2 cycles on $20) /
129
/*
 * $0 = address-word carries + running sum: the complete 64-bit total.
 * Fold step 1: split it into its low 32 bits ($1) and high 32 bits ($0).
 */
130	addq $`0`,$`20`,$`0` # E :
131	zapnot $`0`,`15`,$`1` # U : Start folding output (`1` cycle stall on $`0`)
132	nop # E :
133	srl $`0`,`32`,$`0` # U : U L U L : (`1` cycle stall on $`0`)
134
/*
 * Fold step 2: add the two 32-bit halves, then pull out the three low
 * 16-bit fields of the result (byte offsets 0, 2 and 4).
 */
135	addq $`1`,$`0`,$`1` # E : Finished generating ulong
136	extwl $`1`,`2`,$`2` # U : ushort[`1`] (`1` cycle stall on $`1`)
137	zapnot $`1`,`3`,$`0` # U : ushort[`0`] (`1` cycle stall on $`1`)
138	extwl $`1`,`4`,$`1` # U : ushort[`2`] (`1` cycle stall on $`1`)
139
/*
 * Fold step 3: sum the three 16-bit fields into $3, then extract the
 * second 16-bit field of that sum for the final carry add.
 */
140	addq $`0`,$`2`,$`0` # E
141	addq $`0`,$`1`,$`3` # E : Finished generating uint
142	/ (1 cycle stall on $0) /
143	extwl $`3`,`2`,$`1` # U : ushort[`1`] (`1` cycle stall on $`3`)
144	nop # E : L U L U
145
/*
 * The final add uses all of $3, so bits above 15 of $0 are not
 * meaningful; after the complement, zapnot with mask 3 keeps only the
 * low two bytes, which form the 16-bit return value in $0.
 */
146	addq $`1`,$`3`,$`0` # E : Final carry
147	not $`0`,$`4` # E : complement (`1` cycle stall on $`0`)
148	zapnot $`4`,`3`,$`0` # U : clear upper garbage bits
149	/ (1 cycle stall on $4) /
150	ret # L0 : L U L U
151
152	.end csum_ipv6_magic
153	EXPORT_SYMBOL(csum_ipv6_magic)
154

source code of linux/arch/alpha/lib/ev6-csum_ipv6_magic.S