scalar.rs source code [crates/zune_jpeg/src/idct/scalar.rs]

1	/*
2	* Copyright (c) 2023.
3	*
4	* This software is free software;
5	*
6	* You can redistribute it or modify it under terms of the MIT, Apache License or Zlib license
7	*/
8
//! Platform-independent (scalar) IDCT algorithm.
//!
//! Not as fast as the AVX implementation.
12
/// Bias added to every even-part term of the row pass before the final
/// `>> 17`.
///
/// * `65536` is half of `1 << 17`, so the shift rounds to nearest instead of
///   truncating.
/// * `128 << 17` turns into `+128` after the shift, moving the signed result
///   into the `0..=255` sample range.
/// * NOTE(review): the additional `512` is not explained anywhere in this
///   file; it is kept as-is because removing it would change the decoded
///   output. Confirm its origin before touching it.
const SCALE_BITS: i32 = 512 + 65536 + (128 << 17);
14
15	#[allow(unused_assignments)]
16	#[allow(
17	clippy::too_many_lines,
18	clippy::op_ref,
19	clippy::cast_possible_truncation
20	)]
21	pub fn idct_int(in_vector: &mut [i32; `64`], out_vector: &mut [i16], stride: usize) {
22	// Temporary variables.
23
24	let mut pos = `0`;
25
26	let mut i = `0`;
27	// Don't check for zeroes inside loop, lift it and check outside
28	// we want to accelerate the case with 63 0 ac coeff
29	if &in_vector[`1`..] == &[`0_i32`; `63`] {
30	// okay then if you work, yay, let's write you really quick
31	let coeff = [(((in_vector[`0`] >> `3`) + `128`) as i16).clamp(`0`, `255`); `8`];
32
33	macro_rules! store {
34	($index:tt) => {
35	// position of the MCU
36	let mcu_stride: &mut [i16; `8`] = out_vector
37	.get_mut($index..$index + `8`)
38	.unwrap()
39	.try_into()
40	.unwrap();
41	// copy coefficients
42	mcu_stride.copy_from_slice(&coeff);
43	// increment index
44	$index += stride;
45	};
46	}
47	// write to four positions
48	store!(pos);
49	store!(pos);
50	store!(pos);
51	store!(pos);
52
53	store!(pos);
54	store!(pos);
55	store!(pos);
56	store!(pos);
57	} else {
58	// because the compiler fails to see that it can be auto_vectorised so i'll
59	// leave it here check out [idct_int_slow, and idct_int_1D to get what i mean ] https://godbolt.org/z/8hqW9z9j9
60	for ptr in `0`..`8` {
61	let p2 = in_vector[ptr + `16`];
62	let p3 = in_vector[ptr + `48`];
63
64	let p1 = (p2 + p3).wrapping_mul(`2217`);
65
66	let t2 = p1 + p3 * `-7567`;
67	let t3 = p1 + p2 * `3135`;
68
69	let p2 = in_vector[ptr];
70	let p3 = in_vector[`32` + ptr];
71	let t0 = fsh(p2 + p3);
72	let t1 = fsh(p2 - p3);
73
74	let x0 = t0 + t3 + `512`;
75	let x3 = t0 - t3 + `512`;
76	let x1 = t1 + t2 + `512`;
77	let x2 = t1 - t2 + `512`;
78
79	// odd part
80	let mut t0 = in_vector[ptr + `56`];
81	let mut t1 = in_vector[ptr + `40`];
82	let mut t2 = in_vector[ptr + `24`];
83	let mut t3 = in_vector[ptr + `8`];
84
85	let p3 = t0 + t2;
86	let p4 = t1 + t3;
87	let p1 = t0 + t3;
88	let p2 = t1 + t2;
89	let p5 = (p3 + p4) * `4816`;
90
91	t0 *= `1223`;
92	t1 *= `8410`;
93	t2 *= `12586`;
94	t3 *= `6149`;
95
96	let p1 = p5 + p1 * `-3685`;
97	let p2 = p5 + p2 * `-10497`;
98	let p3 = p3 * `-8034`;
99	let p4 = p4 * `-1597`;
100
101	t3 += p1 + p4;
102	t2 += p2 + p3;
103	t1 += p2 + p4;
104	t0 += p1 + p3;
105
106	// constants scaled things up by 1<<12; let's bring them back
107	// down, but keep 2 extra bits of precision
108	in_vector[ptr] = (x0 + t3) >> `10`;
109	in_vector[ptr + `8`] = (x1 + t2) >> `10`;
110	in_vector[ptr + `16`] = (x2 + t1) >> `10`;
111	in_vector[ptr + `24`] = (x3 + t0) >> `10`;
112	in_vector[ptr + `32`] = (x3 - t0) >> `10`;
113	in_vector[ptr + `40`] = (x2 - t1) >> `10`;
114	in_vector[ptr + `48`] = (x1 - t2) >> `10`;
115	in_vector[ptr + `56`] = (x0 - t3) >> `10`;
116	}
117
118	// This is vectorised in architectures supporting SSE 4.1
119	while i < `64` {
120	// We won't try to short circuit here because it rarely works
121
122	// Even part
123	let p2 = in_vector[i + `2`];
124	let p3 = in_vector[i + `6`];
125
126	let p1 = (p2 + p3) * `2217`;
127	let t2 = p1 + p3 * `-7567`;
128	let t3 = p1 + p2 * `3135`;
129
130	let p2 = in_vector[i];
131	let p3 = in_vector[i + `4`];
132
133	let t0 = fsh(p2 + p3);
134	let t1 = fsh(p2 - p3);
135	// constants scaled things up by 1<<12, plus we had 1<<2 from first
136	// loop, plus horizontal and vertical each scale by sqrt(8) so together
137	// we've got an extra 1<<3, so 1<<17 total we need to remove.
138	// so we want to round that, which means adding 0.5 1<<17,*
139	// aka 65536. Also, we'll end up with -128 to 127 that we want
140	// to encode as 0..255 by adding 128, so we'll add that before the shift
141	let x0 = t0 + t3 + SCALE_BITS;
142	let x3 = t0 - t3 + SCALE_BITS;
143	let x1 = t1 + t2 + SCALE_BITS;
144	let x2 = t1 - t2 + SCALE_BITS;
145	// odd part
146	let mut t0 = in_vector[i + `7`];
147	let mut t1 = in_vector[i + `5`];
148	let mut t2 = in_vector[i + `3`];
149	let mut t3 = in_vector[i + `1`];
150
151	let p3 = t0 + t2;
152	let p4 = t1 + t3;
153	let p1 = t0 + t3;
154	let p2 = t1 + t2;
155	let p5 = (p3 + p4) * f2f(`1.175875602`);
156
157	t0 = t0.wrapping_mul(`1223`);
158	t1 = t1.wrapping_mul(`8410`);
159	t2 = t2.wrapping_mul(`12586`);
160	t3 = t3.wrapping_mul(`6149`);
161
162	let p1 = p5 + p1 * `-3685`;
163	let p2 = p5 + p2 * `-10497`;
164	let p3 = p3 * `-8034`;
165	let p4 = p4 * `-1597`;
166
167	t3 += p1 + p4;
168	t2 += p2 + p3;
169	t1 += p2 + p4;
170	t0 += p1 + p3;
171
172	let out: &mut [i16; `8`] = out_vector
173	.get_mut(pos..pos + `8`)
174	.unwrap()
175	.try_into()
176	.unwrap();
177
178	out[`0`] = clamp((x0 + t3) >> `17`);
179	out[`1`] = clamp((x1 + t2) >> `17`);
180	out[`2`] = clamp((x2 + t1) >> `17`);
181	out[`3`] = clamp((x3 + t0) >> `17`);
182	out[`4`] = clamp((x3 - t0) >> `17`);
183	out[`5`] = clamp((x2 - t1) >> `17`);
184	out[`6`] = clamp((x1 - t2) >> `17`);
185	out[`7`] = clamp((x0 - t3) >> `17`);
186
187	i += `8`;
188
189	pos += stride;
190	}
191	}
192	}
193
/// Convert a floating-point constant to fixed point with 12 fractional bits.
///
/// The value is multiplied by 4096 (`1 << 12`) and `0.5` is added before the
/// truncating cast, which rounds non-negative inputs to the nearest integer.
/// Negative inputs are biased towards zero by the same `+ 0.5`.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn f2f(x: f32) -> i32 {
    (x * 4096.0 + 0.5) as i32
}
200
/// Multiply an integer by 4096 by shifting it left 12 bits.
///
/// Bits shifted past the top of the `i32` are discarded; the shift itself
/// never panics.
#[inline]
fn fsh(x: i32) -> i32 {
    x << 12
}
206
/// Clamp a value to the range `0..=255` and narrow it to `i16`.
///
/// The clamp runs on the full `i32` first, so the narrowing cast can never
/// truncate.
#[inline]
#[allow(clippy::cast_possible_truncation)]
fn clamp(a: i32) -> i16 {
    a.clamp(0, 255) as i16
}
213