dec_mips_dsp_r2.c source code [qtimageformats/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c]

1	// Copyright 2014 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// MIPS version of dsp functions
11	//
12	// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
13	// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
14
15	#include "src/dsp/dsp.h"
16
17	#if defined(WEBP_USE_MIPS_DSP_R2)
18
19	#include "src/dsp/mips_macro.h"
20
// Fixed-point multipliers for the inverse transforms below. They are meant to
// be used with MUL(), which drops 16 fractional bits, so kC2 stands for
// 35468 / 65536 and kC1 (which carries an extra 1 << 16) for 1 + 20091 / 65536.
21	static const int kC1 = `20091` + (`1` << `16`);
22	static const int kC2 = `35468`;
23
// MUL(a, b): integer product of a and b, shifted right by 16 bits.
24	#define MUL(a, b) (((a) * (b)) >> 16)
25
// DC-only inverse transform: derives one rounded value from in[0] and hands
// it, together with the four pixel rows at dst, to the store macro.
//   in:  coefficient block; only in[0] is read by the visible instructions.
//   dst: pixel block whose rows 0..3 are addressed with BPS as the row pitch.
// NOTE(review): the LOAD_/CONVERT_/STORE_ macros come from mips_macro.h and are
// not visible here; judging by their names the result is dst + DC, saturated.
26	static void TransformDC(const int16_t* in, uint8_t* dst) {
27	int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
28
29	__asm__ volatile (
// Fetch rows 0..3 of dst (column offset 0 for each row).
30	LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
31	`0`, `0`, `0`, `0`,
32	`0`, `1`, `2`, `3`,
33	BPS)
// temp5 = in[0] + 4, duplicated into both halfwords, then shifted right
// arithmetically by 3 in each halfword, i.e. two copies of (in[0] + 4) >> 3.
34	"lh %[temp5], 0(%[in]) \n\t"
35	"addiu %[temp5], %[temp5], 4 \n\t"
36	"ins %[temp5], %[temp5], 16, 16 \n\t"
37	"shra.ph %[temp5], %[temp5], 3 \n\t"
38	CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
39	temp3, temp1, temp2, temp3, temp4)
// The same temp5 is passed as every addend, so all pixels get the same offset.
40	STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
41	temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
42	dst, `0`, `1`, `2`, `3`, BPS)
43
44	OUTPUT_EARLY_CLOBBER_REGS_10()
45	: [in]"r"(in), [dst]"r"(dst)
46	: "memory"
47	);
48	}
49
// Inverse transform for the case where only three coefficients are used:
// in[0], in[1] and in[4] are the only elements read. The result is combined
// with the pixel rows 0..3 of dst (row pitch BPS) by the store macro.
// NOTE(review): the helper macros live in mips_macro.h and are not visible
// here, so the per-pixel arithmetic inside them is not documented.
50	static void TransformAC3(const int16_t* in, uint8_t* dst) {
// Scalar pre-computation in C: a is in[0] with a bias of 4; c4/d4 and c1/d1
// are in[4] and in[1] scaled by kC2 and kC1 respectively.
51	const int a = in[`0`] + `4`;
// c4 is not const: the asm below rewrites it (it is a "+&r" operand).
52	int c4 = MUL(in[`4`], kC2);
53	const int d4 = MUL(in[`4`], kC1);
54	const int c1 = MUL(in[`1`], kC2);
55	const int d1 = MUL(in[`1`], kC1);
56	int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
57	int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
58
59	__asm__ volatile (
// Pack d4 into the upper halfword of c4; the lower halfword keeps c4.
60	"ins %[c4], %[d4], 16, 16 \n\t"
// Broadcast a and d1 into both halfwords of temp1 and temp4.
61	"replv.ph %[temp1], %[a] \n\t"
62	"replv.ph %[temp4], %[d1] \n\t"
63	ADD_SUB_HALVES(temp2, temp3, temp1, c4)
// Broadcast c1 into both halfwords of temp5.
64	"replv.ph %[temp5], %[c1] \n\t"
65	SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
66	temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
// Fetch rows 0..3 of dst.
67	LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
68	`0`, `0`, `0`, `0`,
69	`0`, `1`, `2`, `3`,
70	BPS)
71	CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
72	temp11, temp17, temp3, temp5, temp11, temp12)
73	PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
74	temp4, temp7, temp6, temp10, temp9)
// Write rows 0..3 of dst back.
75	STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
76	temp17, temp12, temp18, temp1, temp8, temp2, temp4,
77	temp7, temp6, dst, `0`, `1`, `2`, `3`, BPS)
78
79	OUTPUT_EARLY_CLOBBER_REGS_18(),
80	[c4]"+&r"(c4)
81	: [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
82	: "memory"
83	);
84	}
85
// Inverse transform of one coefficient block, combined with the pixel rows
// 0..3 of dst (row pitch BPS).
//   in:  read at byte offsets 0..31, i.e. sixteen int16_t coefficients.
//   dst: pixel rows are loaded, updated and stored by the macros at the end.
// kC1 and kC2 are handed to the asm as register inputs and "hi"/"lo" are
// declared clobbered, so the MUL_SHIFT_SUM macro presumably multiplies through
// the accumulator -- the macro bodies (mips_macro.h) are not visible here.
// The code before the "horizontal" marker is the first pass, the code after it
// the second pass.
86	static void TransformOne(const int16_t* in, uint8_t* dst) {
87	int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
88	int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
89
90	__asm__ volatile (
// First pass. Unaligned word loads pick up two adjacent coefficients at once.
91	"ulw %[temp1], 0(%[in]) \n\t"
92	"ulw %[temp2], 16(%[in]) \n\t"
93	LOAD_IN_X2(temp5, temp6, `24`, `26`)
94	ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
95	LOAD_IN_X2(temp1, temp2, `8`, `10`)
96	MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
97	temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
98	temp13, temp11, temp14, temp12)
99	INSERT_HALF_X2(temp8, temp7, temp10, temp9)
100	"ulw %[temp17], 4(%[in]) \n\t"
101	"ulw %[temp18], 20(%[in]) \n\t"
102	ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
103	ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
104	ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
105	LOAD_IN_X2(temp17, temp18, `12`, `14`)
106	LOAD_IN_X2(temp9, temp10, `28`, `30`)
107	MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
108	temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
109	temp15, temp4, temp16, temp17)
110	INSERT_HALF_X2(temp11, temp12, temp13, temp14)
111	ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
112	ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
113
114	// horizontal
115	SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
116	INSERT_HALF_X2(temp1, temp6, temp5, temp2)
117	SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
// temp2 = constant 4 in both halfwords; it is added to temp1 and temp6 below.
118	"repl.ph %[temp2], 0x4 \n\t"
119	INSERT_HALF_X2(temp3, temp8, temp17, temp4)
120	"addq.ph %[temp1], %[temp1], %[temp2] \n\t"
121	"addq.ph %[temp6], %[temp6], %[temp2] \n\t"
122	ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
123	ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
124	MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
125	temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
126	temp6, temp17, temp8, temp18)
127	MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
128	temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
129	temp18, temp12, temp17, temp16)
130	INSERT_HALF_X2(temp1, temp3, temp9, temp13)
131	INSERT_HALF_X2(temp6, temp8, temp11, temp15)
132	SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
133	temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
134	temp6)
135	PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
136	temp16, temp11, temp10, temp15, temp14)
// Fetch rows 0..3 of dst, widen them, and write the updated rows back.
137	LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
138	`0`, `0`, `0`, `0`,
139	`0`, `1`, `2`, `3`,
140	BPS)
141	CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
142	temp11, temp10, temp11, temp14, temp15)
143	STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
144	temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
145	dst, `0`, `1`, `2`, `3`, BPS)
146
147	OUTPUT_EARLY_CLOBBER_REGS_18()
148	: [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
149	: "memory", "hi", "lo"
150	);
151	}
152
// Runs TransformOne on the block at (in, dst) and, when do_two is non-zero,
// on a second block located 16 coefficients further in `in` and 4 bytes
// further in `dst`.
153	static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
154	TransformOne(in, dst);
155	if (do_two) {
156	TransformOne(in + `16`, dst + `4`);
157	}
158	}
159
// Edge filter that rewrites either 2 or 6 pixels per position.
//   p:          points at the first pixel after the edge. Each iteration reads
//               the eight pixels p[-4*hstride] .. p[3*hstride].
//   hstride:    distance between neighbouring taps (across the edge).
//   vstride:    distance added to p after every iteration (along the edge).
//   size:       iteration count. It is decremented at the top and tested at
//               the bottom, so the body always executes at least once.
//   thresh:     used as thresh2 = 2 * thresh + 1, the limit for
//               4 * |p0 - q0| + |p1 - q1|.
//   ithresh:    limit for each difference between adjacent taps.
//   hev_thresh: if |p1 - p0| or |q1 - q0| exceeds it, only p0/q0 are rewritten;
//               otherwise p2, p1, p0, q0, q1, q2 are rewritten.
// Final pixel values are fetched from VP8kclip1[] (declared elsewhere), which
// is indexed with possibly negative offsets.
// The code runs under ".set noreorder": the instruction following each branch
// (written with a leading space) sits in the delay slot and always executes.
160	static WEBP_INLINE void FilterLoop26(uint8_t* p,
161	int hstride, int vstride, int size,
162	int thresh, int ithresh, int hev_thresh) {
163	const int thresh2 = `2` * thresh + `1`;
164	int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
165	int temp10, temp11, temp12, temp13, temp14, temp15;
166
167	__asm__ volatile (
168	".set push \n\t"
169	".set noreorder \n\t"
170	"1: \n\t"
// Byte offsets: temp1 = -h, temp2 = 2h, temp3 = -2h, temp4 = 3h, temp5 = -3h,
// temp6 = -4h (h = hstride).
171	"negu %[temp1], %[hstride] \n\t"
172	"addiu %[size], %[size], -1 \n\t"
173	"sll %[temp2], %[hstride], 1 \n\t"
174	"sll %[temp3], %[temp1], 1 \n\t"
175	"addu %[temp4], %[temp2], %[hstride] \n\t"
176	"addu %[temp5], %[temp3], %[temp1] \n\t"
// Taps: temp7 = q0 = p[0], temp8 = p2 = p[-3h], temp9 = p1 = p[-2h],
// temp10 = p0 = p[-h], temp11 = p3 = p[-4h], temp12 = q1 = p[h],
// temp13 = q2 = p[2h], temp14 = q3 = p[3h].
177	"lbu %[temp7], 0(%[p]) \n\t"
178	"sll %[temp6], %[temp3], 1 \n\t"
179	"lbux %[temp8], %[temp5](%[p]) \n\t"
180	"lbux %[temp9], %[temp3](%[p]) \n\t"
181	"lbux %[temp10], %[temp1](%[p]) \n\t"
182	"lbux %[temp11], %[temp6](%[p]) \n\t"
183	"lbux %[temp12], %[hstride](%[p]) \n\t"
184	"lbux %[temp13], %[temp2](%[p]) \n\t"
185	"lbux %[temp14], %[temp4](%[p]) \n\t"
// Skip the position (label 3) if 4 * |p0 - q0| + |p1 - q1| > thresh2.
// temp1 becomes q0 - p0, temp2 keeps p1 - q1, temp6 = 2 * (q0 - p0).
186	"subu %[temp1], %[temp10], %[temp7] \n\t"
187	"subu %[temp2], %[temp9], %[temp12] \n\t"
188	"absq_s.w %[temp3], %[temp1] \n\t"
189	"absq_s.w %[temp4], %[temp2] \n\t"
190	"negu %[temp1], %[temp1] \n\t"
191	"sll %[temp3], %[temp3], 2 \n\t"
192	"addu %[temp15], %[temp3], %[temp4] \n\t"
193	"subu %[temp3], %[temp15], %[thresh2] \n\t"
194	"sll %[temp6], %[temp1], 1 \n\t"
195	"bgtz %[temp3], 3f \n\t"
// Skip if |p3 - p2| > ithresh.
196	" subu %[temp4], %[temp11], %[temp8] \n\t"
197	"absq_s.w %[temp4], %[temp4] \n\t"
// First half of clamping p1 - q1 to a signed byte (finished by the sra below).
198	"shll_s.w %[temp2], %[temp2], 24 \n\t"
199	"subu %[temp4], %[temp4], %[ithresh] \n\t"
200	"bgtz %[temp4], 3f \n\t"
// Skip if |p2 - p1| > ithresh.
201	" subu %[temp3], %[temp8], %[temp9] \n\t"
202	"absq_s.w %[temp3], %[temp3] \n\t"
203	"subu %[temp3], %[temp3], %[ithresh] \n\t"
204	"bgtz %[temp3], 3f \n\t"
// Skip if |p1 - p0| > ithresh; temp5 keeps |p1 - p0| for the hev test.
205	" subu %[temp5], %[temp9], %[temp10] \n\t"
206	"absq_s.w %[temp3], %[temp5] \n\t"
207	"absq_s.w %[temp5], %[temp5] \n\t"
208	"subu %[temp3], %[temp3], %[ithresh] \n\t"
209	"bgtz %[temp3], 3f \n\t"
// Skip if |q3 - q2| > ithresh; temp5 = (hev_thresh < |p1 - p0|).
210	" subu %[temp3], %[temp14], %[temp13] \n\t"
211	"absq_s.w %[temp3], %[temp3] \n\t"
212	"slt %[temp5], %[hev_thresh], %[temp5] \n\t"
213	"subu %[temp3], %[temp3], %[ithresh] \n\t"
214	"bgtz %[temp3], 3f \n\t"
// Skip if |q2 - q1| > ithresh; temp4 = p1 - q1 clamped to [-128, 127].
215	" subu %[temp3], %[temp13], %[temp12] \n\t"
216	"absq_s.w %[temp3], %[temp3] \n\t"
217	"sra %[temp4], %[temp2], 24 \n\t"
218	"subu %[temp3], %[temp3], %[ithresh] \n\t"
219	"bgtz %[temp3], 3f \n\t"
// Skip if |q1 - q0| > ithresh; temp15 = (hev_thresh < |q1 - q0|).
220	" subu %[temp15], %[temp12], %[temp7] \n\t"
221	"absq_s.w %[temp3], %[temp15] \n\t"
222	"absq_s.w %[temp15], %[temp15] \n\t"
223	"subu %[temp3], %[temp3], %[ithresh] \n\t"
224	"bgtz %[temp3], 3f \n\t"
225	" slt %[temp15], %[hev_thresh], %[temp15] \n\t"
// temp3 = 3 * (q0 - p0); temp2 = hev flag; temp5 = 3 * (q0 - p0) + clamp(p1 - q1).
226	"addu %[temp3], %[temp6], %[temp1] \n\t"
227	"or %[temp2], %[temp5], %[temp15] \n\t"
228	"addu %[temp5], %[temp4], %[temp3] \n\t"
// hev flag clear -> six-pixel path at label 4.
229	"beqz %[temp2], 4f \n\t"
// Two-pixel path: temp1 = (temp5 + 4) >> 3 (rounding shift),
// temp2 = (temp5 + 3) >> 3, both clamped to [-16, 15] by the shll_s/sra pair.
230	" shra_r.w %[temp1], %[temp5], 3 \n\t"
231	"addiu %[temp2], %[temp5], 3 \n\t"
232	"sra %[temp2], %[temp2], 3 \n\t"
233	"shll_s.w %[temp1], %[temp1], 27 \n\t"
234	"shll_s.w %[temp2], %[temp2], 27 \n\t"
235	"subu %[temp3], %[p], %[hstride] \n\t"
236	"sra %[temp1], %[temp1], 27 \n\t"
237	"sra %[temp2], %[temp2], 27 \n\t"
// p[-h] = VP8kclip1[p0 + temp2], p[0] = VP8kclip1[q0 - temp1].
238	"subu %[temp1], %[temp7], %[temp1] \n\t"
239	"addu %[temp2], %[temp10], %[temp2] \n\t"
240	"lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
241	"lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
242	"sb %[temp2], 0(%[temp3]) \n\t"
243	"j 3f \n\t"
244	" sb %[temp1], 0(%[p]) \n\t"
245	"4: \n\t"
// Six-pixel path. temp6 = a = temp5 clamped to [-128, 127];
// temp14 = p - h, temp11 = p - 2h, temp15 = p - 3h (store addresses).
246	"shll_s.w %[temp5], %[temp5], 24 \n\t"
247	"subu %[temp14], %[p], %[hstride] \n\t"
248	"subu %[temp11], %[temp14], %[hstride] \n\t"
249	"sra %[temp6], %[temp5], 24 \n\t"
// temp2 = (9a + 63) >> 7, temp3 = (18a + 63) >> 7, temp4 = (27a + 63) >> 7.
250	"sll %[temp1], %[temp6], 3 \n\t"
251	"subu %[temp15], %[temp11], %[hstride] \n\t"
252	"addu %[temp2], %[temp6], %[temp1] \n\t"
253	"sll %[temp3], %[temp2], 1 \n\t"
254	"addu %[temp4], %[temp3], %[temp2] \n\t"
255	"addiu %[temp2], %[temp2], 63 \n\t"
256	"addiu %[temp3], %[temp3], 63 \n\t"
257	"addiu %[temp4], %[temp4], 63 \n\t"
258	"sra %[temp2], %[temp2], 7 \n\t"
259	"sra %[temp3], %[temp3], 7 \n\t"
260	"sra %[temp4], %[temp4], 7 \n\t"
// The three corrections are added on the p side and subtracted on the q side:
// p2 + temp2, p1 + temp3, p0 + temp4, q0 - temp4, q1 - temp3, q2 - temp2.
// temp10 = p + h and temp12 = p + 2h are the remaining store addresses.
261	"addu %[temp1], %[temp8], %[temp2] \n\t"
262	"addu %[temp5], %[temp9], %[temp3] \n\t"
263	"addu %[temp6], %[temp10], %[temp4] \n\t"
264	"subu %[temp8], %[temp7], %[temp4] \n\t"
265	"subu %[temp7], %[temp12], %[temp3] \n\t"
266	"addu %[temp10], %[p], %[hstride] \n\t"
267	"subu %[temp9], %[temp13], %[temp2] \n\t"
268	"addu %[temp12], %[temp10], %[hstride] \n\t"
// Map every sum through VP8kclip1[] and store the six pixels.
269	"lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
270	"lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
271	"lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
272	"lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
273	"lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
274	"lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
275	"sb %[temp2], 0(%[temp15]) \n\t"
276	"sb %[temp3], 0(%[temp11]) \n\t"
277	"sb %[temp4], 0(%[temp14]) \n\t"
278	"sb %[temp5], 0(%[p]) \n\t"
279	"sb %[temp6], 0(%[temp10]) \n\t"
280	"sb %[temp8], 0(%[temp12]) \n\t"
281	"3: \n\t"
// Next position: loop while size > 0; p advances by vstride in the delay slot.
282	"bgtz %[size], 1b \n\t"
283	" addu %[p], %[p], %[vstride] \n\t"
284	".set pop \n\t"
285	: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
286	[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
287	[temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
288	[temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
289	[temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
290	[size]"+&r"(size), [p]"+&r"(p)
291	: [hstride]"r"(hstride), [thresh2]"r"(thresh2),
292	[ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
293	[VP8kclip1]"r"(VP8kclip1)
294	: "memory"
295	);
296	}
297
// Edge filter that rewrites either 2 or 4 pixels per position.
//   p:          points at the first pixel after the edge (q0); p[-4*hstride]
//               .. p[3*hstride] may be read.
//   hstride:    distance between neighbouring taps (across the edge).
//   vstride:    distance added to p after every iteration (along the edge).
//   size:       iteration count; the whole loop is skipped only when it is
//               negative (a value of 0 still runs the body once).
//   thresh:     used as thresh2 = 2 * thresh + 1, the limit for
//               4 * |p0 - q0| + |p1 - q1|.
//   ithresh:    limit for each difference between adjacent taps.
//   hev_thresh: if |p1 - p0| or |q1 - q0| exceeds it, only p0/q0 are rewritten
//               (label 1); otherwise p1, p0, q0, q1 are rewritten.
// Final pixel values are fetched from VP8kclip1[] (declared elsewhere).
// The code runs under ".set noreorder": the instruction following each branch
// (written with a leading space) sits in the delay slot and always executes.
298	static WEBP_INLINE void FilterLoop24(uint8_t* p,
299	int hstride, int vstride, int size,
300	int thresh, int ithresh, int hev_thresh) {
301	int p0, q0, p1, q1, p2, q2, p3, q3;
302	int step1, step2, temp1, temp2, temp3, temp4;
303	uint8_t* pTemp0;
304	uint8_t* pTemp1;
305	const int thresh2 = `2` * thresh + `1`;
306
307	__asm__ volatile (
308	".set push \n\t"
309	".set noreorder \n\t"
310	"bltz %[size], 3f \n\t"
311	" nop \n\t"
312	"2: \n\t"
// step1 walks backwards (-h, -2h, -3h, -4h) and step2 forwards (2h, 3h) while
// the taps are loaded, interleaved with the first threshold test.
313	"negu %[step1], %[hstride] \n\t"
314	"lbu %[q0], 0(%[p]) \n\t"
315	"lbux %[p0], %[step1](%[p]) \n\t"
316	"subu %[step1], %[step1], %[hstride] \n\t"
317	"lbux %[q1], %[hstride](%[p]) \n\t"
318	"subu %[temp1], %[p0], %[q0] \n\t"
319	"lbux %[p1], %[step1](%[p]) \n\t"
320	"addu %[step2], %[hstride], %[hstride] \n\t"
// Skip the position (label 0) if 4 * |p0 - q0| + |p1 - q1| > thresh2.
// temp3 keeps p1 - q1 for the two-pixel path.
321	"absq_s.w %[temp2], %[temp1] \n\t"
322	"subu %[temp3], %[p1], %[q1] \n\t"
323	"absq_s.w %[temp4], %[temp3] \n\t"
324	"sll %[temp2], %[temp2], 2 \n\t"
325	"addu %[temp2], %[temp2], %[temp4] \n\t"
326	"subu %[temp4], %[temp2], %[thresh2] \n\t"
327	"subu %[step1], %[step1], %[hstride] \n\t"
328	"bgtz %[temp4], 0f \n\t"
329	" lbux %[p2], %[step1](%[p]) \n\t"
330	"subu %[step1], %[step1], %[hstride] \n\t"
331	"lbux %[q2], %[step2](%[p]) \n\t"
332	"lbux %[p3], %[step1](%[p]) \n\t"
// Skip if |p2 - p1| > ithresh, then if |p3 - p2| > ithresh.
// temp1 is turned into q0 - p0 on the way.
333	"subu %[temp4], %[p2], %[p1] \n\t"
334	"addu %[step2], %[step2], %[hstride] \n\t"
335	"subu %[temp2], %[p3], %[p2] \n\t"
336	"absq_s.w %[temp4], %[temp4] \n\t"
337	"absq_s.w %[temp2], %[temp2] \n\t"
338	"lbux %[q3], %[step2](%[p]) \n\t"
339	"subu %[temp4], %[temp4], %[ithresh] \n\t"
340	"negu %[temp1], %[temp1] \n\t"
341	"bgtz %[temp4], 0f \n\t"
342	" subu %[temp2], %[temp2], %[ithresh] \n\t"
// From here on the p3 register is reused to hold |p1 - p0|.
343	"subu %[p3], %[p1], %[p0] \n\t"
344	"bgtz %[temp2], 0f \n\t"
345	" absq_s.w %[p3], %[p3] \n\t"
// Skip if |p1 - p0| > ithresh, then if |q3 - q2| > ithresh.
// pTemp0 = p - h (address of p0); step1 is reused for 2 * (q0 - p0).
346	"subu %[temp4], %[q3], %[q2] \n\t"
347	"subu %[pTemp0], %[p], %[hstride] \n\t"
348	"absq_s.w %[temp4], %[temp4] \n\t"
349	"subu %[temp2], %[p3], %[ithresh] \n\t"
350	"sll %[step1], %[temp1], 1 \n\t"
351	"bgtz %[temp2], 0f \n\t"
352	" subu %[temp4], %[temp4], %[ithresh] \n\t"
353	"subu %[temp2], %[q2], %[q1] \n\t"
354	"bgtz %[temp4], 0f \n\t"
355	" absq_s.w %[temp2], %[temp2] \n\t"
// Skip if |q2 - q1| > ithresh, then if |q1 - q0| > ithresh.
// The q3 register is reused for |q1 - q0|; temp1 becomes 3 * (q0 - p0).
356	"subu %[q3], %[q1], %[q0] \n\t"
357	"absq_s.w %[q3], %[q3] \n\t"
358	"subu %[temp2], %[temp2], %[ithresh] \n\t"
359	"addu %[temp1], %[temp1], %[step1] \n\t"
360	"bgtz %[temp2], 0f \n\t"
361	" subu %[temp4], %[q3], %[ithresh] \n\t"
// hev flag = (hev_thresh < |p1 - p0|) | (hev_thresh < |q1 - q0|).
362	"slt %[p3], %[hev_thresh], %[p3] \n\t"
363	"bgtz %[temp4], 0f \n\t"
364	" slt %[q3], %[hev_thresh], %[q3] \n\t"
365	"or %[q3], %[q3], %[p3] \n\t"
// hev flag set -> two-pixel path at label 1.
366	"bgtz %[q3], 1f \n\t"
// Four-pixel path, a = 3 * (q0 - p0):
// temp2 = (a + 4) >> 3 and temp1 = (a + 3) >> 3, both clamped to [-16, 15];
// step1 = (temp2 + 1) >> 1.
367	" shra_r.w %[temp2], %[temp1], 3 \n\t"
368	"addiu %[temp1], %[temp1], 3 \n\t"
369	"sra %[temp1], %[temp1], 3 \n\t"
370	"shll_s.w %[temp2], %[temp2], 27 \n\t"
371	"shll_s.w %[temp1], %[temp1], 27 \n\t"
372	"addu %[pTemp1], %[p], %[hstride] \n\t"
373	"sra %[temp2], %[temp2], 27 \n\t"
374	"sra %[temp1], %[temp1], 27 \n\t"
375	"addiu %[step1], %[temp2], 1 \n\t"
376	"sra %[step1], %[step1], 1 \n\t"
// p0 + temp1, p1 + step1, q0 - temp2, q1 - step1, each mapped through
// VP8kclip1[] and stored at p - h, p - 2h, p and p + h respectively.
377	"addu %[p0], %[p0], %[temp1] \n\t"
378	"addu %[p1], %[p1], %[step1] \n\t"
379	"subu %[q0], %[q0], %[temp2] \n\t"
380	"subu %[q1], %[q1], %[step1] \n\t"
381	"lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
382	"lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
383	"lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
384	"sb %[temp2], 0(%[pTemp0]) \n\t"
385	"lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
386	"subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
387	"sb %[temp3], 0(%[p]) \n\t"
388	"sb %[temp4], 0(%[pTemp1]) \n\t"
389	"j 0f \n\t"
390	" sb %[temp1], 0(%[pTemp0]) \n\t"
391	"1: \n\t"
// Two-pixel path: a = 3 * (q0 - p0) + (p1 - q1 clamped to [-128, 127]);
// temp2 = (a + 4) >> 3 and temp1 = (a + 3) >> 3, both clamped to [-16, 15].
392	"shll_s.w %[temp3], %[temp3], 24 \n\t"
393	"sra %[temp3], %[temp3], 24 \n\t"
394	"addu %[temp1], %[temp1], %[temp3] \n\t"
395	"shra_r.w %[temp2], %[temp1], 3 \n\t"
396	"addiu %[temp1], %[temp1], 3 \n\t"
397	"shll_s.w %[temp2], %[temp2], 27 \n\t"
398	"sra %[temp1], %[temp1], 3 \n\t"
399	"shll_s.w %[temp1], %[temp1], 27 \n\t"
400	"sra %[temp2], %[temp2], 27 \n\t"
401	"sra %[temp1], %[temp1], 27 \n\t"
// p[-h] = VP8kclip1[p0 + temp1], p[0] = VP8kclip1[q0 - temp2].
402	"addu %[p0], %[p0], %[temp1] \n\t"
403	"subu %[q0], %[q0], %[temp2] \n\t"
404	"lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
405	"lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
406	"sb %[temp2], 0(%[p]) \n\t"
407	"sb %[temp1], 0(%[pTemp0]) \n\t"
408	"0: \n\t"
// Next position: loop while size > 0; p advances by vstride in the delay slot.
409	"subu %[size], %[size], 1 \n\t"
410	"bgtz %[size], 2b \n\t"
411	" addu %[p], %[p], %[vstride] \n\t"
412	"3: \n\t"
413	".set pop \n\t"
414	: [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
415	[p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
416	[step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
417	[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
418	[pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
419	[size]"+&r"(size)
420	: [vstride]"r"(vstride), [ithresh]"r"(ithresh),
421	[hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
422	[VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
423	: "memory"
424	);
425	}
426
427	// on macroblock edges
// Runs FilterLoop26 over 16 positions that are 1 byte apart, with the filter
// taps `stride` bytes apart.
428	static void VFilter16(uint8_t* p, int stride,
429	int thresh, int ithresh, int hev_thresh) {
430	FilterLoop26(p, stride, `1`, `16`, thresh, ithresh, hev_thresh);
431	}
432
// Runs FilterLoop26 over 16 positions that are `stride` bytes apart, with the
// filter taps 1 byte apart.
433	static void HFilter16(uint8_t* p, int stride,
434	int thresh, int ithresh, int hev_thresh) {
435	FilterLoop26(p, `1`, stride, `16`, thresh, ithresh, hev_thresh);
436	}
437
438	// 8-pixels wide variant, for chroma filtering
// Applies FilterLoop26 to the u plane and then to the v plane: 8 positions
// each, 1 byte apart, with the filter taps `stride` bytes apart.
439	static void VFilter8(uint8_t* u, uint8_t* v, int stride,
440	int thresh, int ithresh, int hev_thresh) {
441	FilterLoop26(u, stride, `1`, `8`, thresh, ithresh, hev_thresh);
442	FilterLoop26(v, stride, `1`, `8`, thresh, ithresh, hev_thresh);
443	}
444
// Applies FilterLoop26 to the u plane and then to the v plane: 8 positions
// each, `stride` bytes apart, with the filter taps 1 byte apart.
445	static void HFilter8(uint8_t* u, uint8_t* v, int stride,
446	int thresh, int ithresh, int hev_thresh) {
447	FilterLoop26(u, `1`, stride, `8`, thresh, ithresh, hev_thresh);
448	FilterLoop26(v, `1`, stride, `8`, thresh, ithresh, hev_thresh);
449	}
450
451	// on three inner edges
// Calls FilterLoop24 three times, at p + 4 * stride, p + 8 * stride and
// p + 12 * stride; each call covers 16 positions 1 byte apart with the filter
// taps `stride` bytes apart.
452	static void VFilter16i(uint8_t* p, int stride,
453	int thresh, int ithresh, int hev_thresh) {
454	int k;
455	for (k = `3`; k > `0`; --k) {
// Advance to the next edge before filtering, so the edge at p itself is
// never touched here.
456	p += `4` * stride;
457	FilterLoop24(p, stride, `1`, `16`, thresh, ithresh, hev_thresh);
458	}
459	}
460
// Calls FilterLoop24 three times, at p + 4, p + 8 and p + 12; each call covers
// 16 positions `stride` bytes apart with the filter taps 1 byte apart.
461	static void HFilter16i(uint8_t* p, int stride,
462	int thresh, int ithresh, int hev_thresh) {
463	int k;
464	for (k = `3`; k > `0`; --k) {
// Advance to the next edge before filtering, so the edge at p itself is
// never touched here.
465	p += `4`;
466	FilterLoop24(p, `1`, stride, `16`, thresh, ithresh, hev_thresh);
467	}
468	}
469
// Applies FilterLoop24 once per chroma plane, starting 4 rows (4 * stride
// bytes) below u and v: 8 positions 1 byte apart, taps `stride` bytes apart.
470	static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
471	int thresh, int ithresh, int hev_thresh) {
472	FilterLoop24(u + `4` * stride, stride, `1`, `8`, thresh, ithresh, hev_thresh);
473	FilterLoop24(v + `4` * stride, stride, `1`, `8`, thresh, ithresh, hev_thresh);
474	}
475
// Applies FilterLoop24 once per chroma plane, starting 4 bytes to the right of
// u and v: 8 positions `stride` bytes apart, taps 1 byte apart.
476	static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
477	int thresh, int ithresh, int hev_thresh) {
478	FilterLoop24(u + `4`, `1`, stride, `8`, thresh, ithresh, hev_thresh);
479	FilterLoop24(v + `4`, `1`, stride, `8`, thresh, ithresh, hev_thresh);
480	}
481
// MUL() is only used by TransformAC3 in the code above; retire it here.
482	#undef MUL
483
484	//------------------------------------------------------------------------------
485	// Simple In-loop filtering (Paragraph 15.2)
486
// Simple two-pixel edge filter over 16 consecutive bytes.
// For each of the 16 positions the taps are p1 = p[-2 * stride],
// p0 = p[-stride], q0 = p[0] and q1 = p[stride]. If
// 4 * |p0 - q0| + |p1 - q1| <= 2 * thresh + 1, p0 and q0 are adjusted and
// written back through VP8kclip1[] (declared elsewhere); otherwise the
// position is left untouched.
// The code runs under ".set noreorder": the instruction after each branch
// sits in the delay slot and always executes.
487	static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
488	int i;
489	const int thresh2 = `2` * thresh + `1`;
490	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
// p1 tracks p - stride, the address at which the updated p0 tap is stored.
491	uint8_t* p1 = p - stride;
492	__asm__ volatile (
493	".set push \n\t"
494	".set noreorder \n\t"
495	"li %[i], 16 \n\t"
496	"0: \n\t"
// temp4 = -stride, temp5 = -2 * stride;
// temp2 = q0, temp3 = q1, temp1 = p0, temp0 = p1.
497	"negu %[temp4], %[stride] \n\t"
498	"sll %[temp5], %[temp4], 1 \n\t"
499	"lbu %[temp2], 0(%[p]) \n\t"
500	"lbux %[temp3], %[stride](%[p]) \n\t"
501	"lbux %[temp1], %[temp4](%[p]) \n\t"
502	"lbux %[temp0], %[temp5](%[p]) \n\t"
// temp7 = p0 - q0, temp6 = p1 - q1;
// temp5 = 4 * |p0 - q0| + |p1 - q1| - thresh2; temp8 = q0 - p0.
503	"subu %[temp7], %[temp1], %[temp2] \n\t"
504	"subu %[temp6], %[temp0], %[temp3] \n\t"
505	"absq_s.w %[temp4], %[temp7] \n\t"
506	"absq_s.w %[temp5], %[temp6] \n\t"
507	"sll %[temp4], %[temp4], 2 \n\t"
508	"subu %[temp5], %[temp5], %[thresh2] \n\t"
509	"addu %[temp5], %[temp4], %[temp5] \n\t"
510	"negu %[temp8], %[temp7] \n\t"
// Over the limit: skip this position. The counter is decremented in the delay
// slot, i.e. on both paths.
511	"bgtz %[temp5], 1f \n\t"
512	" addiu %[i], %[i], -1 \n\t"
// a = 3 * (q0 - p0) + (p1 - q1 clamped to [-128, 127]).
513	"sll %[temp4], %[temp8], 1 \n\t"
514	"shll_s.w %[temp5], %[temp6], 24 \n\t"
515	"addu %[temp3], %[temp4], %[temp8] \n\t"
516	"sra %[temp5], %[temp5], 24 \n\t"
517	"addu %[temp3], %[temp3], %[temp5] \n\t"
// temp0 = (a + 3) >> 3 and temp4 = (a + 4) >> 3, both clamped to [-16, 15].
518	"addiu %[temp7], %[temp3], 3 \n\t"
519	"sra %[temp7], %[temp7], 3 \n\t"
520	"shra_r.w %[temp8], %[temp3], 3 \n\t"
521	"shll_s.w %[temp0], %[temp7], 27 \n\t"
522	"shll_s.w %[temp4], %[temp8], 27 \n\t"
523	"sra %[temp0], %[temp0], 27 \n\t"
524	"sra %[temp4], %[temp4], 27 \n\t"
// Store VP8kclip1[p0 + temp0] at p - stride and VP8kclip1[q0 - temp4] at p.
525	"addu %[temp7], %[temp1], %[temp0] \n\t"
526	"subu %[temp2], %[temp2], %[temp4] \n\t"
527	"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
528	"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
529	"sb %[temp3], 0(%[p1]) \n\t"
530	"sb %[temp4], 0(%[p]) \n\t"
531	"1: \n\t"
// Step both pointers one byte to the right and loop while i > 0.
532	"addiu %[p1], %[p1], 1 \n\t"
533	"bgtz %[i], 0b \n\t"
534	" addiu %[p], %[p], 1 \n\t"
535	" .set pop \n\t"
536	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
537	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
538	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
539	[p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
540	: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
541	: "memory"
542	);
543	}
544
// Emits four "lbu" instructions; each loads one byte at a constant offset
// from SRC. XSTR and BPS are defined outside this section.
545	// TEMP0 = SRC[A + A1 * BPS]
546	// TEMP1 = SRC[B + B1 * BPS]
547	// TEMP2 = SRC[C + C1 * BPS]
548	// TEMP3 = SRC[D + D1 * BPS]
549	#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
550	A, A1, B, B1, C, C1, D, D1, SRC) \
551	"lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
552	"lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
553	"lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
554	"lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
555
// Simple two-pixel edge filter over 16 rows.
// For each of the 16 rows the taps are p1 = p[-2], p0 = p[-1], q0 = p[0] and
// q1 = p[1]. If 4 * |p0 - q0| + |p1 - q1| <= 2 * thresh + 1, p0 and q0 are
// adjusted and written back through VP8kclip1[] (declared elsewhere);
// otherwise the row is left untouched. p advances by `stride` per row.
// The code runs under ".set noreorder": the instruction after each branch
// sits in the delay slot and always executes.
556	static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
557	int i;
558	const int thresh2 = `2` * thresh + `1`;
559	int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
560	__asm__ volatile (
561	".set push \n\t"
562	".set noreorder \n\t"
563	"li %[i], 16 \n\t"
564	"0: \n\t"
// temp0 = p[-2], temp1 = p[-1], temp2 = p[0], temp3 = p[1].
565	LOAD_4_BYTES(temp0, temp1, temp2, temp3, -`2`, `0`, -`1`, `0`, `0`, `0`, `1`, `0`, p)
// temp7 = p0 - q0, temp6 = p1 - q1;
// temp5 = 4 * |p0 - q0| + |p1 - q1| - thresh2; temp8 = q0 - p0.
566	"subu %[temp7], %[temp1], %[temp2] \n\t"
567	"subu %[temp6], %[temp0], %[temp3] \n\t"
568	"absq_s.w %[temp4], %[temp7] \n\t"
569	"absq_s.w %[temp5], %[temp6] \n\t"
570	"sll %[temp4], %[temp4], 2 \n\t"
571	"addu %[temp5], %[temp4], %[temp5] \n\t"
572	"subu %[temp5], %[temp5], %[thresh2] \n\t"
573	"negu %[temp8], %[temp7] \n\t"
// Over the limit: skip this row. The counter is decremented in the delay
// slot, i.e. on both paths.
574	"bgtz %[temp5], 1f \n\t"
575	" addiu %[i], %[i], -1 \n\t"
// a = 3 * (q0 - p0) + (p1 - q1 clamped to [-128, 127]).
576	"sll %[temp4], %[temp8], 1 \n\t"
577	"shll_s.w %[temp5], %[temp6], 24 \n\t"
578	"addu %[temp3], %[temp4], %[temp8] \n\t"
579	"sra %[temp5], %[temp5], 24 \n\t"
580	"addu %[temp3], %[temp3], %[temp5] \n\t"
// temp0 = (a + 3) >> 3 and temp4 = (a + 4) >> 3, both clamped to [-16, 15].
581	"addiu %[temp7], %[temp3], 3 \n\t"
582	"sra %[temp7], %[temp7], 3 \n\t"
583	"shra_r.w %[temp8], %[temp3], 3 \n\t"
584	"shll_s.w %[temp0], %[temp7], 27 \n\t"
585	"shll_s.w %[temp4], %[temp8], 27 \n\t"
586	"sra %[temp0], %[temp0], 27 \n\t"
587	"sra %[temp4], %[temp4], 27 \n\t"
// p[-1] = VP8kclip1[p0 + temp0], p[0] = VP8kclip1[q0 - temp4].
588	"addu %[temp7], %[temp1], %[temp0] \n\t"
589	"subu %[temp2], %[temp2], %[temp4] \n\t"
590	"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
591	"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
592	"sb %[temp3], -1(%[p]) \n\t"
593	"sb %[temp4], 0(%[p]) \n\t"
594	"1: \n\t"
// Loop while i > 0; p moves to the next row in the delay slot.
595	"bgtz %[i], 0b \n\t"
596	" addu %[p], %[p], %[stride] \n\t"
597	".set pop \n\t"
598	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
599	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
600	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
601	[p]"+&r"(p), [i]"=&r"(i)
602	: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
603	: "memory"
604	);
605	}
606
// Calls SimpleVFilter16 three times, at p + 4 * stride, p + 8 * stride and
// p + 12 * stride. The edge at p itself is not filtered here.
607	static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
608	int k;
609	for (k = `3`; k > `0`; --k) {
610	p += `4` * stride;
611	SimpleVFilter16(p, stride, thresh);
612	}
613	}
614
// Calls SimpleHFilter16 three times, at p + 4, p + 8 and p + 12. The edge at
// p itself is not filtered here.
615	static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
616	int k;
617	for (k = `3`; k > `0`; --k) {
618	p += `4`;
619	SimpleHFilter16(p, stride, thresh);
620	}
621	}
622
// Emits two "usw" (unaligned 32-bit store) instructions at constant offsets
// from DST. XSTR and BPS are defined outside this section.
623	// DST[A * BPS] = TEMP0
624	// DST[B + C * BPS] = TEMP1
625	#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
626	"usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
627	"usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
628
// 4x4 vertical prediction: reads the six bytes top[-1] .. top[4] of the row
// above dst (top = dst - BPS), smooths them with a 3-tap kernel
// (a + 2 * b + c + 2) >> 2, and writes the resulting four bytes to each of the
// rows 0..3 of dst.
629	static void VE4(uint8_t* dst) { // vertical
630	const uint8_t* top = dst - BPS;
631	int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
632	__asm__ volatile (
// temp0 = 4 bytes starting at top[-1], temp1 = 2 bytes starting at top[3].
633	"ulw %[temp0], -1(%[top]) \n\t"
634	"ulh %[temp1], 3(%[top]) \n\t"
// Widen the bytes to halfwords (two per register).
635	"preceu.ph.qbr %[temp2], %[temp0] \n\t"
636	"preceu.ph.qbl %[temp3], %[temp0] \n\t"
637	"preceu.ph.qbr %[temp4], %[temp1] \n\t"
// temp5/temp6 hold the centre taps; they are doubled, then both neighbours
// are added.
638	"packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
639	"packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
640	"shll.ph %[temp5], %[temp5], 1 \n\t"
641	"shll.ph %[temp6], %[temp6], 1 \n\t"
642	"addq.ph %[temp2], %[temp5], %[temp2] \n\t"
643	"addq.ph %[temp6], %[temp6], %[temp4] \n\t"
644	"addq.ph %[temp2], %[temp2], %[temp3] \n\t"
645	"addq.ph %[temp6], %[temp6], %[temp3] \n\t"
// Rounding shift right by 2, then pack the four halfwords back into bytes.
646	"shra_r.ph %[temp2], %[temp2], 2 \n\t"
647	"shra_r.ph %[temp6], %[temp6], 2 \n\t"
648	"precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
// Same word for rows 0, 1, 2 and 3.
649	STORE_8_BYTES(temp4, temp4, `0`, `0`, `1`, dst)
650	STORE_8_BYTES(temp4, temp4, `2`, `0`, `3`, dst)
651	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
652	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
653	[temp6]"=&r"(temp6)
654	: [top]"r"(top), [dst]"r"(dst)
655	: "memory"
656	);
657	}
658
// 4x4 DC prediction: sums the four bytes of the row above dst (at dst - BPS)
// and the four bytes of the column to its left (dst[-1 + k * BPS], k = 0..3),
// computes (sum + 4) >> 3, and fills rows 0..3 of dst (4 bytes each) with that
// value.
659	static void DC4(uint8_t* dst) { // DC
660	int temp0, temp1, temp2, temp3, temp4;
661	__asm__ volatile (
// temp0 = the four top bytes; temp1..temp4 = the four left bytes.
662	"ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
663	LOAD_4_BYTES(temp1, temp2, temp3, temp4, -`1`, `0`, -`1`, `1`, -`1`, `2`, -`1`, `3`, dst)
// Pack the left bytes into one word so raddu.w.qb can sum them in one go.
664	"ins %[temp1], %[temp2], 8, 8 \n\t"
665	"ins %[temp1], %[temp3], 16, 8 \n\t"
666	"ins %[temp1], %[temp4], 24, 8 \n\t"
667	"raddu.w.qb %[temp0], %[temp0] \n\t"
668	"raddu.w.qb %[temp1], %[temp1] \n\t"
669	"addu %[temp0], %[temp0], %[temp1] \n\t"
// Rounding shift by 3, then replicate the byte into all four lanes.
670	"shra_r.w %[temp0], %[temp0], 3 \n\t"
671	"replv.qb %[temp0], %[temp0] \n\t"
672	STORE_8_BYTES(temp0, temp0, `0`, `0`, `1`, dst)
673	STORE_8_BYTES(temp0, temp0, `2`, `0`, `3`, dst)
674	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
675	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
676	: [dst]"r"(dst)
677	: "memory"
678	);
679	}
680
// 4x4 down-right prediction. Inputs are the left column dst[-1 + k * BPS]
// (k = 0..3), the four bytes starting at dst[-1 - BPS] and the byte at
// dst[3 - BPS]. Neighbouring samples are combined with double / add /
// rounding-shift-by-2 sequences on packed halfwords, packed to bytes and
// written as four 32-bit rows of dst.
681	static void RD4(uint8_t* dst) { // Down-right
682	int temp0, temp1, temp2, temp3, temp4;
683	int temp5, temp6, temp7, temp8;
684	__asm__ volatile (
// temp0..temp3 = left column, rows 0..3; temp7 = 4 bytes from dst[-1 - BPS].
685	LOAD_4_BYTES(temp0, temp1, temp2, temp3, -`1`, `0`, -`1`, `1`, -`1`, `2`, -`1`, `3`, dst)
686	"ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
// Build halfword pairs of adjacent left samples while widening the top bytes.
687	"ins %[temp1], %[temp0], 16, 16 \n\t"
688	"preceu.ph.qbr %[temp5], %[temp7] \n\t"
689	"ins %[temp2], %[temp1], 16, 16 \n\t"
690	"preceu.ph.qbl %[temp4], %[temp7] \n\t"
691	"ins %[temp3], %[temp2], 16, 16 \n\t"
692	"shll.ph %[temp2], %[temp2], 1 \n\t"
693	"addq.ph %[temp3], %[temp3], %[temp1] \n\t"
694	"packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
695	"addq.ph %[temp3], %[temp3], %[temp2] \n\t"
696	"addq.ph %[temp1], %[temp1], %[temp5] \n\t"
697	"shll.ph %[temp6], %[temp6], 1 \n\t"
698	"addq.ph %[temp1], %[temp1], %[temp6] \n\t"
699	"packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
700	"addq.ph %[temp8], %[temp5], %[temp4] \n\t"
701	"shra_r.ph %[temp3], %[temp3], 2 \n\t"
702	"shll.ph %[temp0], %[temp0], 1 \n\t"
703	"shra_r.ph %[temp1], %[temp1], 2 \n\t"
704	"addq.ph %[temp8], %[temp0], %[temp8] \n\t"
// Extra top byte dst[3 - BPS], needed for the last output sample.
705	"lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
706	"precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
707	"shra_r.ph %[temp8], %[temp8], 2 \n\t"
708	"ins %[temp7], %[temp5], 0, 8 \n\t"
709	"precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
710	"raddu.w.qb %[temp4], %[temp7] \n\t"
711	"precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
712	"shra_r.w %[temp4], %[temp4], 2 \n\t"
// Rows 3 and 1 first ...
713	STORE_8_BYTES(temp2, temp6, `3`, `0`, `1`, dst)
// ... then each word is shifted right by 8 bits with a new top byte taken
// from temp8 / temp4, giving rows 2 and 0.
714	"prepend %[temp2], %[temp8], 8 \n\t"
715	"prepend %[temp6], %[temp4], 8 \n\t"
716	STORE_8_BYTES(temp2, temp6, `2`, `0`, `0`, dst)
717	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
718	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
719	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
720	: [dst]"r"(dst)
721	: "memory"
722	);
723	}
724
// Emits two "ulw" (unaligned 32-bit load) instructions at constant offsets
// from SRC. XSTR and BPS are defined outside this section.
725	// TEMP0 = SRC[A * BPS]
726	// TEMP1 = SRC[B + C * BPS]
727	#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
728	"ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
729	"ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
730
// 4x4 down-left prediction. Inputs are the eight bytes of the row above dst
// (dst - BPS, byte offsets 0..7). Neighbouring samples are combined with
// double / add / rounding-shift-by-2 sequences on packed halfwords, packed to
// bytes and written as four 32-bit rows of dst.
731	static void LD4(uint8_t* dst) { // Down-Left
732	int temp0, temp1, temp2, temp3, temp4;
733	int temp5, temp6, temp7, temp8, temp9;
734	__asm__ volatile (
// temp0 = top bytes 0..3, temp1 = top bytes 4..7.
735	LOAD_8_BYTES(temp0, temp1, -`1`, `4`, -`1`, dst)
// Widen to halfwords and form the shifted pairs used as centre taps.
736	"preceu.ph.qbl %[temp2], %[temp0] \n\t"
737	"preceu.ph.qbr %[temp3], %[temp0] \n\t"
738	"preceu.ph.qbr %[temp4], %[temp1] \n\t"
739	"preceu.ph.qbl %[temp5], %[temp1] \n\t"
740	"packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
741	"packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
742	"packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
743	"shll.ph %[temp6], %[temp6], 1 \n\t"
744	"addq.ph %[temp9], %[temp2], %[temp6] \n\t"
745	"shll.ph %[temp7], %[temp7], 1 \n\t"
746	"addq.ph %[temp9], %[temp9], %[temp3] \n\t"
747	"shll.ph %[temp8], %[temp8], 1 \n\t"
748	"shra_r.ph %[temp9], %[temp9], 2 \n\t"
749	"addq.ph %[temp3], %[temp4], %[temp7] \n\t"
750	"addq.ph %[temp0], %[temp5], %[temp8] \n\t"
751	"addq.ph %[temp3], %[temp3], %[temp2] \n\t"
752	"addq.ph %[temp0], %[temp0], %[temp4] \n\t"
753	"shra_r.ph %[temp3], %[temp3], 2 \n\t"
754	"shra_r.ph %[temp0], %[temp0], 2 \n\t"
// Last sample: the highest byte of temp1 is counted twice more on top of the
// byte sum of temp5, then rounded and shifted right by 2.
755	"srl %[temp1], %[temp1], 24 \n\t"
756	"sll %[temp1], %[temp1], 1 \n\t"
757	"raddu.w.qb %[temp5], %[temp5] \n\t"
758	"precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
759	"precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
760	"addu %[temp1], %[temp1], %[temp5] \n\t"
761	"shra_r.w %[temp1], %[temp1], 2 \n\t"
// Rows 0 and 2 first ...
762	STORE_8_BYTES(temp9, temp3, `0`, `0`, `2`, dst)
// ... then each word is shifted right by 8 bits with a new top byte taken
// from temp0 / temp1, giving rows 1 and 3.
763	"prepend %[temp9], %[temp0], 8 \n\t"
764	"prepend %[temp3], %[temp1], 8 \n\t"
765	STORE_8_BYTES(temp9, temp3, `1`, `0`, `3`, dst)
766	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
767	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
768	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
769	[temp9]"=&r"(temp9)
770	: [dst]"r"(dst)
771	: "memory"
772	);
773	}
774
775	//------------------------------------------------------------------------------
776	// Chroma
777
// 8x8 DC prediction: sums the eight bytes of the row above dst (dst - BPS) and
// the eight bytes of the column to its left (dst[-1 + k * BPS], k = 0..7),
// computes (sum + 8) >> 4, and fills rows 0..7 of dst (8 bytes each) with that
// value.
778	static void DC8uv(uint8_t* dst) { // DC
779	int temp0, temp1, temp2, temp3, temp4;
780	int temp5, temp6, temp7, temp8, temp9;
781	__asm__ volatile (
// temp0/temp1 = top bytes 0..3 / 4..7; temp2..temp9 = left bytes, rows 0..7.
782	LOAD_8_BYTES(temp0, temp1, -`1`, `4`, -`1`, dst)
783	LOAD_4_BYTES(temp2, temp3, temp4, temp5, -`1`, `0`, -`1`, `1`, -`1`, `2`, -`1`, `3`, dst)
784	LOAD_4_BYTES(temp6, temp7, temp8, temp9, -`1`, `4`, -`1`, `5`, -`1`, `6`, -`1`, `7`, dst)
// Byte sums of the two top words, then a pairwise add tree into temp0.
785	"raddu.w.qb %[temp0], %[temp0] \n\t"
786	"raddu.w.qb %[temp1], %[temp1] \n\t"
787	"addu %[temp2], %[temp2], %[temp3] \n\t"
788	"addu %[temp4], %[temp4], %[temp5] \n\t"
789	"addu %[temp6], %[temp6], %[temp7] \n\t"
790	"addu %[temp8], %[temp8], %[temp9] \n\t"
791	"addu %[temp0], %[temp0], %[temp1] \n\t"
792	"addu %[temp2], %[temp2], %[temp4] \n\t"
793	"addu %[temp6], %[temp6], %[temp8] \n\t"
794	"addu %[temp0], %[temp0], %[temp2] \n\t"
795	"addu %[temp0], %[temp0], %[temp6] \n\t"
// Rounding shift by 4, then replicate the byte into all four lanes.
796	"shra_r.w %[temp0], %[temp0], 4 \n\t"
797	"replv.qb %[temp0], %[temp0] \n\t"
// Each STORE_8_BYTES writes bytes 0..3 and 4..7 of one row.
798	STORE_8_BYTES(temp0, temp0, `0`, `4`, `0`, dst)
799	STORE_8_BYTES(temp0, temp0, `1`, `4`, `1`, dst)
800	STORE_8_BYTES(temp0, temp0, `2`, `4`, `2`, dst)
801	STORE_8_BYTES(temp0, temp0, `3`, `4`, `3`, dst)
802	STORE_8_BYTES(temp0, temp0, `4`, `4`, `4`, dst)
803	STORE_8_BYTES(temp0, temp0, `5`, `4`, `5`, dst)
804	STORE_8_BYTES(temp0, temp0, `6`, `4`, `6`, dst)
805	STORE_8_BYTES(temp0, temp0, `7`, `4`, `7`, dst)
806	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
807	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
808	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
809	[temp9]"=&r"(temp9)
810	: [dst]"r"(dst)
811	: "memory"
812	);
813	}
814
// DC prediction for an 8-row chroma block when no left samples are available:
// only the two words fetched by LOAD_8_BYTES contribute (same arguments as the
// LOAD_8_BYTES call in DC8uv; presumably the 8 bytes of the row above dst --
// confirm against the macro definition, which is outside this view).
815	static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
816	int temp0, temp1;
817	__asm__ volatile (
818	LOAD_8_BYTES(temp0, temp1, -`1`, `4`, -`1`, dst)
// raddu.w.qb sums the four unsigned bytes of each word; addu combines the two
// sums into temp0.
819	"raddu.w.qb %[temp0], %[temp0] \n\t"
820	"raddu.w.qb %[temp1], %[temp1] \n\t"
821	"addu %[temp0], %[temp0], %[temp1] \n\t"
// Rounding shift right by 3 (divide the 8-byte sum by 8), then replicate the
// low byte into all four byte lanes.
822	"shra_r.w %[temp0], %[temp0], 3 \n\t"
823	"replv.qb %[temp0], %[temp0] \n\t"
// The same replicated word is passed twice per call, for rows 0 through 7.
824	STORE_8_BYTES(temp0, temp0, `0`, `4`, `0`, dst)
825	STORE_8_BYTES(temp0, temp0, `1`, `4`, `1`, dst)
826	STORE_8_BYTES(temp0, temp0, `2`, `4`, `2`, dst)
827	STORE_8_BYTES(temp0, temp0, `3`, `4`, `3`, dst)
828	STORE_8_BYTES(temp0, temp0, `4`, `4`, `4`, dst)
829	STORE_8_BYTES(temp0, temp0, `5`, `4`, `5`, dst)
830	STORE_8_BYTES(temp0, temp0, `6`, `4`, `6`, dst)
831	STORE_8_BYTES(temp0, temp0, `7`, `4`, `7`, dst)
// Both temps are early-clobber outputs; dst is the only input.
832	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
833	: [dst]"r"(dst)
834	: "memory"
835	);
836	}
837
// DC prediction for an 8-row chroma block when no top samples are available:
// only the eight values fetched by the two LOAD_4_BYTES calls contribute
// (same arguments as in DC8uv; presumably the 8 bytes of the column left of
// dst -- confirm against the macro definition, which is outside this view).
838	static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
839	int temp0, temp1, temp2, temp3, temp4;
840	int temp5, temp6, temp7, temp8;
841	__asm__ volatile (
842	LOAD_4_BYTES(temp2, temp3, temp4, temp5, -`1`, `0`, -`1`, `1`, -`1`, `2`, -`1`, `3`, dst)
// Note: the eighth value lands in temp1 here (there is no temp9 in this
// function); temp0 is first written by the final addu below.
843	LOAD_4_BYTES(temp6, temp7, temp8, temp1, -`1`, `4`, -`1`, `5`, -`1`, `6`, -`1`, `7`, dst)
// Pairwise reduction of the eight loaded values into temp0.
844	"addu %[temp2], %[temp2], %[temp3] \n\t"
845	"addu %[temp4], %[temp4], %[temp5] \n\t"
846	"addu %[temp6], %[temp6], %[temp7] \n\t"
847	"addu %[temp8], %[temp8], %[temp1] \n\t"
848	"addu %[temp2], %[temp2], %[temp4] \n\t"
849	"addu %[temp6], %[temp6], %[temp8] \n\t"
850	"addu %[temp0], %[temp6], %[temp2] \n\t"
// Rounding shift right by 3 (divide by 8), then replicate the low byte into
// all four byte lanes.
851	"shra_r.w %[temp0], %[temp0], 3 \n\t"
852	"replv.qb %[temp0], %[temp0] \n\t"
// The same replicated word is passed twice per call, for rows 0 through 7.
853	STORE_8_BYTES(temp0, temp0, `0`, `4`, `0`, dst)
854	STORE_8_BYTES(temp0, temp0, `1`, `4`, `1`, dst)
855	STORE_8_BYTES(temp0, temp0, `2`, `4`, `2`, dst)
856	STORE_8_BYTES(temp0, temp0, `3`, `4`, `3`, dst)
857	STORE_8_BYTES(temp0, temp0, `4`, `4`, `4`, dst)
858	STORE_8_BYTES(temp0, temp0, `5`, `4`, `5`, dst)
859	STORE_8_BYTES(temp0, temp0, `6`, `4`, `6`, dst)
860	STORE_8_BYTES(temp0, temp0, `7`, `4`, `7`, dst)
// All nine temps are early-clobber outputs; dst is the only input.
861	: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
862	[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
863	[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
864	: [dst]"r"(dst)
865	: "memory"
866	);
867	}
868
869	#undef LOAD_8_BYTES
870	#undef STORE_8_BYTES
871	#undef LOAD_4_BYTES
872
// Assembly fragment used inside the asm statement of CLIP_8B_TO_DST. It works
// on that statement's operands temp0..temp3 and dst_1:
//   1. preceu.ph.qbl / preceu.ph.qbr widen the bytes of temp0 into two
//      halfword pairs (temp2 takes the left pair, temp0 keeps the right pair);
//      with SIZE == 8 the same is done for temp1 using temp3.
//   2. addu.ph adds dst_1 to every halfword.
//   3. shll_s.ph by 7 followed by precrqu_s.qb.ph narrows the halfwords back
//      to bytes with saturation, leaving the packed result in temp0 (and in
//      temp1 when SIZE == 8).
// SIZE is pasted as text into assembler ".if" directives, so the 8-byte-only
// instructions are included or dropped when the assembly is processed.
873	#define CLIPPING(SIZE) \
874	"preceu.ph.qbl %[temp2], %[temp0] \n\t" \
875	"preceu.ph.qbr %[temp0], %[temp0] \n\t" \
876	".if " #SIZE " == 8 \n\t" \
877	"preceu.ph.qbl %[temp3], %[temp1] \n\t" \
878	"preceu.ph.qbr %[temp1], %[temp1] \n\t" \
879	".endif \n\t" \
880	"addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \
881	"addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \
882	".if " #SIZE " == 8 \n\t" \
883	"addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \
884	"addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \
885	".endif \n\t" \
886	"shll_s.ph %[temp2], %[temp2], 7 \n\t" \
887	"shll_s.ph %[temp0], %[temp0], 7 \n\t" \
888	".if " #SIZE " == 8 \n\t" \
889	"shll_s.ph %[temp3], %[temp3], 7 \n\t" \
890	"shll_s.ph %[temp1], %[temp1], 7 \n\t" \
891	".endif \n\t" \
892	"precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
893	".if " #SIZE " == 8 \n\t" \
894	"precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \
895	".endif \n\t"
896
897
// Writes one row of SIZE bytes at DST, computed from the SIZE bytes at TOP.
// dst_1 starts as DST[-1] replicated into both 16-bit halves; the asm
// subtracts top_1 from it per halfword (subu.ph), and CLIPPING then adds that
// difference to each TOP byte and saturates back to a byte. In effect each
// output byte is TOP[x] + DST[-1] - top_1, saturated.
// top_1 is NOT a macro parameter: a variable of that name must be in scope
// where the macro is expanded (CLIP_TO_DST declares one).
// SIZE is stringified into assembler ".if" directives:
//   SIZE < 8   -> one 4-byte word is loaded, processed and stored;
//   otherwise  -> two words (8 bytes), and when SIZE == 16 a second group of
//                 8 bytes at offsets 8 and 12, reusing the dst_1 difference.
// Loads and stores use ulw/usw (the unaligned word forms).
// dst_1 is a read-write early-clobber operand ("+&r"); temp0..temp3 are
// early-clobber outputs; "memory" is listed as clobbered.
898	#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \
899	int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \
900	int temp0, temp1, temp2, temp3; \
901	__asm__ volatile ( \
902	".if " #SIZE " < 8 \n\t" \
903	"ulw %[temp0], 0(%[top]) \n\t" \
904	"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
905	CLIPPING(4) \
906	"usw %[temp0], 0(%[dst]) \n\t" \
907	".else \n\t" \
908	"ulw %[temp0], 0(%[top]) \n\t" \
909	"ulw %[temp1], 4(%[top]) \n\t" \
910	"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
911	CLIPPING(8) \
912	"usw %[temp0], 0(%[dst]) \n\t" \
913	"usw %[temp1], 4(%[dst]) \n\t" \
914	".if " #SIZE " == 16 \n\t" \
915	"ulw %[temp0], 8(%[top]) \n\t" \
916	"ulw %[temp1], 12(%[top]) \n\t" \
917	CLIPPING(8) \
918	"usw %[temp0], 8(%[dst]) \n\t" \
919	"usw %[temp1], 12(%[dst]) \n\t" \
920	".endif \n\t" \
921	".endif \n\t" \
922	: [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
923	[temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
924	: [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \
925	: "memory" \
926	); \
927	} while (0)
928
// Runs CLIP_8B_TO_DST once per row for SIZE rows.
// top is taken once from the initial DST (BPS bytes before it) and top_1 is
// top[-1] replicated into both 16-bit halves; neither is updated inside the
// loop, so every row is computed from the same top row.
// DST is advanced by BPS after each row, so the argument must be a modifiable
// lvalue and ends up SIZE rows past its starting value.
// (SIZE) is forwarded with its parentheses and is stringified by
// CLIP_8B_TO_DST for the assembler, so its spelling must not be changed.
929	#define CLIP_TO_DST(DST, SIZE) do { \
930	int y; \
931	const uint8_t* top = (DST) - BPS; \
932	const int top_1 = ((int)top[-1] << 16) + top[-1]; \
933	for (y = 0; y < (SIZE); ++y) { \
934	CLIP_8B_TO_DST((DST), top, (SIZE)); \
935	(DST) += BPS; \
936	} \
937	} while (0)
938
// Generates `static void TrueMotion<SIZE>(uint8_t* DST)`, whose whole body is
// CLIP_TO_DST(DST, SIZE). The function name is built by token pasting, and
// SIZE also ends up stringified inside the assembler ".if" conditions of
// CLIP_8B_TO_DST.
939	#define TRUE_MOTION(DST, SIZE) \
940	static void TrueMotion##SIZE(uint8_t* (DST)) { \
941	CLIP_TO_DST((DST), (SIZE)); \
942	}
943
// Instantiations: TrueMotion4, TrueMotion8 and TrueMotion16, which the init
// function at the end of the file installs as predictors.
944	TRUE_MOTION(dst, `4`)
945	TRUE_MOTION(dst, `8`)
946	TRUE_MOTION(dst, `16`)
947
948	#undef TRUE_MOTION
949	#undef CLIP_TO_DST
950	#undef CLIP_8B_TO_DST
951	#undef CLIPPING
952
953	//------------------------------------------------------------------------------
954	// Entry point
955
956	extern void VP8DspInitMIPSdspR2(void);
957
// Installs the implementations from this file into the decoder's VP8* hooks.
// Only the slots listed below are assigned; every other hook and table entry
// is left with whatever value it already had.
// WEBP_TSAN_IGNORE_FUNCTION is defined outside this file; presumably it
// excludes the function from ThreadSanitizer instrumentation -- confirm in
// the dsp header.
958	WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
// Inverse transforms.
959	VP8TransformDC = TransformDC;
960	VP8TransformAC3 = TransformAC3;
961	VP8Transform = TransformTwo;
962
// Loop filters: regular and simple variants, 16-wide and 8-wide, with the
// "i" suffixed counterparts.
963	VP8VFilter16 = VFilter16;
964	VP8HFilter16 = HFilter16;
965	VP8VFilter8 = VFilter8;
966	VP8HFilter8 = HFilter8;
967	VP8VFilter16i = VFilter16i;
968	VP8HFilter16i = HFilter16i;
969	VP8VFilter8i = VFilter8i;
970	VP8HFilter8i = HFilter8i;
971	VP8SimpleVFilter16 = SimpleVFilter16;
972	VP8SimpleHFilter16 = SimpleHFilter16;
973	VP8SimpleVFilter16i = SimpleVFilter16i;
974	VP8SimpleHFilter16i = SimpleHFilter16i;
975
// 4x4 luma predictors: entries 0, 1, 2, 4 and 6 only.
976	VP8PredLuma4[`0`] = DC4;
977	VP8PredLuma4[`1`] = TrueMotion4;
978	VP8PredLuma4[`2`] = VE4;
979	VP8PredLuma4[`4`] = RD4;
980	VP8PredLuma4[`6`] = LD4;
981
// Chroma predictors: entries 0, 1, 4 and 5 only.
982	VP8PredChroma8[`0`] = DC8uv;
983	VP8PredChroma8[`1`] = TrueMotion8;
984	VP8PredChroma8[`4`] = DC8uvNoTop;
985	VP8PredChroma8[`5`] = DC8uvNoLeft;
986
// 16x16 luma predictors: only entry 1 is replaced.
987	VP8PredLuma16[`1`] = TrueMotion16;
988	}
989
990	#else // !WEBP_USE_MIPS_DSP_R2
991
// Fallback for builds without WEBP_USE_MIPS_DSP_R2. WEBP_DSP_INIT_STUB is
// defined outside this file; presumably it emits an empty
// VP8DspInitMIPSdspR2 so the symbol still exists -- confirm in the dsp header.
992	WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
993
994	#endif // WEBP_USE_MIPS_DSP_R2
995

source code of qtimageformats/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c