/* SPDX-License-Identifier: GPL-2.0 */
/* checksum.S: Sparc V9 optimized checksum code.
 *
 * Copyright(C) 1995 Linus Torvalds
 * Copyright(C) 1995 Miguel de Icaza
 * Copyright(C) 1996, 2000 David S. Miller
 * Copyright(C) 1997 Jakub Jelinek
 *
 * derived from:
 *	Linux/Alpha checksum c-code
 *	Linux/ix86 inline checksum assembly
 *	RFC1071 Computing the Internet Checksum (esp. Jacobson's m68k code)
 *	David Mosberger-Tang for optimized reference c-code
 *	BSD4.4 portable checksum routine
 */

#include <linux/export.h>
	.text

csum_partial_fix_alignment:
	/* We checked for zero length already, so there must be
	 * at least one byte.
	 */
	be,pt		%icc, 1f
	 nop
	ldub		[%o0 + 0x00], %o4
	add		%o0, 1, %o0
	sub		%o1, 1, %o1
1:	andcc		%o0, 0x2, %g0
	be,pn		%icc, csum_partial_post_align
	 cmp		%o1, 2
	blu,pn		%icc, csum_partial_end_cruft
	 nop
	lduh		[%o0 + 0x00], %o5
	add		%o0, 2, %o0
	sub		%o1, 2, %o1
	ba,pt		%xcc, csum_partial_post_align
	 add		%o5, %o4, %o4
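
	/* Note: the be,pt at the top of this helper tests the condition
	 * codes set by the "andcc %o0, 0x1, %g7" in csum_partial's
	 * delay slot.  Rough C sketch of the fix-up (illustrative only,
	 * not part of the build; buf/len/acc stand in for %o0/%o1/%o4):
	 *
	 *	if (buf & 1) { acc  = *buf++; len--; }	// leading odd byte
	 *	if (buf & 2) {				// leading halfword
	 *		if (len < 2)
	 *			goto csum_partial_end_cruft;
	 *		acc += *(u16 *)buf; buf += 2; len -= 2;
	 *	}
	 */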

	.align		32
	.globl		csum_partial
	.type		csum_partial,#function
	EXPORT_SYMBOL(csum_partial)
csum_partial:	/* %o0=buff, %o1=len, %o2=sum */
	prefetch	[%o0 + 0x000], #n_reads
	clr		%o4
	prefetch	[%o0 + 0x040], #n_reads
	brz,pn		%o1, csum_partial_finish
	 andcc		%o0, 0x3, %g0

	/* We remember in %g7 whether the lowest bit of the address
	 * was set, because if it was we have to swap the upper and
	 * lower 8-bit halves of the sum we calculate.
	 */
	bne,pn		%icc, csum_partial_fix_alignment
	 andcc		%o0, 0x1, %g7
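
	/* Starting one byte early rotates every big-endian 16-bit
	 * word of the buffer by 8 bits; the byte swap near the end
	 * of this routine undoes that rotation.
	 */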

csum_partial_post_align:
	prefetch	[%o0 + 0x080], #n_reads
	andncc		%o1, 0x3f, %o3

	prefetch	[%o0 + 0x0c0], #n_reads
	sub		%o1, %o3, %o1
	brz,pn		%o3, 2f
	 prefetch	[%o0 + 0x100], #n_reads

	/* So that we don't need to use the non-pairing
	 * add-with-carry instructions we accumulate 32-bit
	 * values into a 64-bit register.  At the end of the
	 * loop we fold it down to 32 bits, then to 16.
	 */
	prefetch	[%o0 + 0x140], #n_reads
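
	/* Rough C equivalent of the unrolled loop below (illustrative
	 * only; buf is %o0, acc the 64-bit accumulator %o4, and
	 * nblocks is %o3 / 0x40):
	 *
	 *	while (nblocks--) {
	 *		for (int i = 0; i < 16; i++)
	 *			acc += ((u32 *)buf)[i];	// carries pile up in bits 32+
	 *		buf += 0x40;
	 *	}
	 */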
1:	lduw		[%o0 + 0x00], %o5
	lduw		[%o0 + 0x04], %g1
	lduw		[%o0 + 0x08], %g2
	add		%o4, %o5, %o4
	lduw		[%o0 + 0x0c], %g3
	add		%o4, %g1, %o4
	lduw		[%o0 + 0x10], %o5
	add		%o4, %g2, %o4
	lduw		[%o0 + 0x14], %g1
	add		%o4, %g3, %o4
	lduw		[%o0 + 0x18], %g2
	add		%o4, %o5, %o4
	lduw		[%o0 + 0x1c], %g3
	add		%o4, %g1, %o4
	lduw		[%o0 + 0x20], %o5
	add		%o4, %g2, %o4
	lduw		[%o0 + 0x24], %g1
	add		%o4, %g3, %o4
	lduw		[%o0 + 0x28], %g2
	add		%o4, %o5, %o4
	lduw		[%o0 + 0x2c], %g3
	add		%o4, %g1, %o4
	lduw		[%o0 + 0x30], %o5
	add		%o4, %g2, %o4
	lduw		[%o0 + 0x34], %g1
	add		%o4, %g3, %o4
	lduw		[%o0 + 0x38], %g2
	add		%o4, %o5, %o4
	lduw		[%o0 + 0x3c], %g3
	add		%o4, %g1, %o4
	prefetch	[%o0 + 0x180], #n_reads
	add		%o4, %g2, %o4
	subcc		%o3, 0x40, %o3
	add		%o0, 0x40, %o0
	bne,pt		%icc, 1b
	 add		%o4, %g3, %o4

2:	and		%o1, 0x3c, %o3
	brz,pn		%o3, 2f
	 sub		%o1, %o3, %o1
1:	lduw		[%o0 + 0x00], %o5
	subcc		%o3, 0x4, %o3
	add		%o0, 0x4, %o0
	bne,pt		%icc, 1b
	 add		%o4, %o5, %o4

2:
	/* fold 64-->32 */
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4
	srlx		%o4, 32, %o5
	srl		%o4, 0, %o4
	add		%o4, %o5, %o4
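
	/* I.e. acc = (acc >> 32) + (u32)acc, performed twice because
	 * the first add can itself carry into bit 32.
	 */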

	/* fold 32-->16 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
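
	/* I.e. acc = (acc >> 16) + (acc & 0xffff), twice: one pass
	 * leaves at most 0x1fffe, so a second pass is enough to bring
	 * the sum back into 16 bits.
	 */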

csum_partial_end_cruft:
	/* %o4 has the 16-bit sum we have calculated so far. */
	cmp		%o1, 2
	blu,pt		%icc, 1f
	 nop
	lduh		[%o0 + 0x00], %o5
	sub		%o1, 2, %o1
	add		%o0, 2, %o0
	add		%o4, %o5, %o4
1:	brz,pt		%o1, 1f
	 nop
	ldub		[%o0 + 0x00], %o5
	sub		%o1, 1, %o1
	add		%o0, 1, %o0
	sllx		%o5, 8, %o5
	add		%o4, %o5, %o4
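
	/* The final odd byte is shifted into bits 15:8 above because
	 * it is the high (first) byte of a big-endian 16-bit word
	 * whose missing low byte is implicitly zero.
	 */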
1:
	/* fold 32-->16 */
	sethi		%hi(0xffff0000), %g1
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4
	srl		%o4, 16, %o5
	andn		%o4, %g1, %g2
	add		%o5, %g2, %o4

1:	brz,pt		%g7, 1f
	 nop

	/* We started with an odd byte, byte-swap the result. */
	srl		%o4, 8, %o5
	and		%o4, 0xff, %g1
	sll		%g1, 8, %g1
	or		%o5, %g1, %o4
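
	/* I.e. sum = ((sum & 0xff) << 8) | (sum >> 8).  The 16-bit
	 * ones-complement sum commutes with byte rotation, so this
	 * single swap corrects for the odd starting address.
	 */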

1:	addcc		%o2, %o4, %o2
	addc		%g0, %o2, %o2
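
	/* End-around carry: if adding the folded 16-bit sum into the
	 * incoming 32-bit sum carried out of bit 31, fold the carry
	 * back in at bit 0, as ones-complement arithmetic requires.
	 * The delay slot of the retl below zero-extends %o2 into the
	 * 32-bit return value.
	 */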

csum_partial_finish:
	retl
	 srl		%o2, 0, %o0