| 1 | // Copyright 2018 Google Inc. |
| 2 | // Copyright 2020 Yevhenii Reizner |
| 3 | // |
| 4 | // Use of this source code is governed by a BSD-style license that can be |
| 5 | // found in the LICENSE file. |
| 6 | |
| 7 | /*! |
A low-precision raster pipeline implementation.
| 9 | |
| 10 | A lowp pipeline uses u16 instead of f32 for math. |
| 11 | Because of that, it doesn't implement stages that require high precision. |
| 12 | The pipeline compiler will automatically decide which one to use. |
| 13 | |
Skia uses u16x8 (128-bit) types for generic CPUs and u16x16 (256-bit) for modern x86 CPUs.
But instead of explicit SIMD instructions, it mainly relies on clang's vector extensions.
And since those are unavailable in Rust, we have to do everything manually.
| 17 | |
According to our benchmarks, a SIMD-accelerated u16x8 in Rust is almost 2x slower than in Skia,
and we are not sure why. For example, there is no div instruction for u16x8, so we have to use
a basic scalar version, which means unnecessary loads/stores. No idea what clang does in this case.
Surprisingly, a SIMD-accelerated u16x8 is even slower than a scalar one. Again, not sure why.
| 22 | |
Therefore we use scalar u16x16 by default and rely on rustc/LLVM auto-vectorization instead.
When targeting a generic CPU, we're just 5-10% slower than Skia, while u16x8 is 30-40% slower.
And while `-C target-cpu=haswell` boosts our performance by around 25%,
we are still 40-60% behind Skia built for Haswell.
| 27 | |
On ARM AArch64 the story is different: explicit SIMD makes our code up to 2-3x faster.
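
In the lowp pipeline all color channels are stored as 0..=255 integers in u16 lanes,
so a normalized multiply like `r * a / 255` becomes an integer multiply followed by
`div255`, which approximates the division as `(v + 255) >> 8` (see `div255` below).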
| 29 | */ |
| 30 | |
| 31 | use crate::PremultipliedColorU8; |
| 32 | |
| 33 | use crate::pixmap::SubPixmapMut; |
| 34 | use crate::wide::{f32x8, u16x16, f32x16}; |
| 35 | use crate::geom::ScreenIntRect; |
| 36 | |
| 37 | pub const STAGE_WIDTH: usize = 16; |
| 38 | |
| 39 | pub type StageFn = fn(p: &mut Pipeline); |
| 40 | |
| 41 | pub struct Pipeline<'a, 'b: 'a> { |
| 42 | index: usize, |
| 43 | functions: &'a [StageFn], |
| 44 | pixmap: &'a mut SubPixmapMut<'b>, |
| 45 | mask_ctx: super::MaskCtx<'a>, |
| 46 | aa_mask_ctx: super::AAMaskCtx, |
| 47 | ctx: &'a mut super::Context, |
| 48 | r: u16x16, |
| 49 | g: u16x16, |
| 50 | b: u16x16, |
| 51 | a: u16x16, |
| 52 | dr: u16x16, |
| 53 | dg: u16x16, |
| 54 | db: u16x16, |
| 55 | da: u16x16, |
| 56 | tail: usize, |
| 57 | dx: usize, |
| 58 | dy: usize, |
| 59 | } |
| 60 | |
| 61 | impl Pipeline<'_, '_> { |
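    /// Calls the next stage in the pipeline.
    ///
    /// Stages are written in continuation style: each stage finishes by calling
    /// `next_stage` itself, so a pipeline run is a chain of calls terminated by
    /// `just_return` (or by an early `return`, as in `mask_u8`).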
    #[inline(always)]
    fn next_stage(&mut self) {
        let next = self.functions[self.index];
| 65 | self.index += 1; |
| 66 | next(self); |
| 67 | } |
| 68 | } |
| 69 | |
| 70 | |
| 71 | // Must be in the same order as raster_pipeline::Stage |
| 72 | pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[ |
| 73 | move_source_to_destination, |
| 74 | move_destination_to_source, |
| 75 | null_fn, // Clamp0 |
| 76 | null_fn, // ClampA |
| 77 | premultiply, |
| 78 | uniform_color, |
| 79 | seed_shader, |
| 80 | load_dst, |
| 81 | store, |
| 82 | load_dst_u8, |
| 83 | store_u8, |
| 84 | null_fn, // Gather |
| 85 | load_mask_u8, |
| 86 | mask_u8, |
| 87 | scale_u8, |
| 88 | lerp_u8, |
| 89 | scale_1_float, |
| 90 | lerp_1_float, |
| 91 | destination_atop, |
| 92 | destination_in, |
| 93 | destination_out, |
| 94 | destination_over, |
| 95 | source_atop, |
| 96 | source_in, |
| 97 | source_out, |
| 98 | source_over, |
| 99 | clear, |
| 100 | modulate, |
| 101 | multiply, |
| 102 | plus, |
| 103 | screen, |
| 104 | xor, |
| 105 | null_fn, // ColorBurn |
| 106 | null_fn, // ColorDodge |
| 107 | darken, |
| 108 | difference, |
| 109 | exclusion, |
| 110 | hard_light, |
| 111 | lighten, |
| 112 | overlay, |
| 113 | null_fn, // SoftLight |
| 114 | null_fn, // Hue |
| 115 | null_fn, // Saturation |
| 116 | null_fn, // Color |
| 117 | null_fn, // Luminosity |
| 118 | source_over_rgba, |
| 119 | transform, |
| 120 | null_fn, // Reflect |
| 121 | null_fn, // Repeat |
| 122 | null_fn, // Bilinear |
| 123 | null_fn, // Bicubic |
| 124 | pad_x1, |
| 125 | reflect_x1, |
| 126 | repeat_x1, |
| 127 | gradient, |
| 128 | evenly_spaced_2_stop_gradient, |
| 129 | xy_to_radius, |
| 130 | null_fn, // XYTo2PtConicalFocalOnCircle |
| 131 | null_fn, // XYTo2PtConicalWellBehaved |
| 132 | null_fn, // XYTo2PtConicalGreater |
| 133 | null_fn, // Mask2PtConicalDegenerates |
| 134 | null_fn, // ApplyVectorMask |
| 135 | ]; |
| 136 | |
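// Helpers for comparing stage function pointers, e.g. to check whether a stage
// resolved to `null_fn` and therefore isn't supported by the lowp pipeline
// (an assumption about the call sites, which live outside this module).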
| 137 | pub fn fn_ptr(f: StageFn) -> *const () { |
| 138 | f as *const () |
| 139 | } |
| 140 | |
| 141 | pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool { |
    core::ptr::eq(f1 as *const (), f2 as *const ())
| 143 | } |
| 144 | |
#[inline(never)]
| 146 | pub fn start( |
| 147 | functions: &[StageFn], |
| 148 | functions_tail: &[StageFn], |
| 149 | rect: &ScreenIntRect, |
| 150 | aa_mask_ctx: super::AAMaskCtx, |
| 151 | mask_ctx: super::MaskCtx, |
| 152 | ctx: &mut super::Context, |
| 153 | pixmap: &mut SubPixmapMut, |
| 154 | ) { |
| 155 | let mut p = Pipeline { |
| 156 | index: 0, |
| 157 | functions: &[], |
| 158 | pixmap, |
| 159 | mask_ctx, |
| 160 | aa_mask_ctx, |
| 161 | ctx, |
| 162 | r: u16x16::default(), |
| 163 | g: u16x16::default(), |
| 164 | b: u16x16::default(), |
| 165 | a: u16x16::default(), |
| 166 | dr: u16x16::default(), |
| 167 | dg: u16x16::default(), |
| 168 | db: u16x16::default(), |
| 169 | da: u16x16::default(), |
| 170 | tail: 0, |
| 171 | dx: 0, |
| 172 | dy: 0, |
| 173 | }; |
| 174 | |
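    // Process the rect row by row: full STAGE_WIDTH chunks run the `functions`
    // pipeline, and the 1..STAGE_WIDTH leftover pixels of a row run the
    // `functions_tail` pipeline with `tail` set to the number of valid pixels.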
| 175 | for y in rect.y()..rect.bottom() { |
| 176 | let mut x = rect.x() as usize; |
| 177 | let end = rect.right() as usize; |
| 178 | |
| 179 | p.functions = functions; |
| 180 | while x + STAGE_WIDTH <= end { |
| 181 | p.index = 0; |
| 182 | p.dx = x; |
| 183 | p.dy = y as usize; |
| 184 | p.tail = STAGE_WIDTH; |
| 185 | p.next_stage(); |
| 186 | x += STAGE_WIDTH; |
| 187 | } |
| 188 | |
| 189 | if x != end { |
| 190 | p.index = 0; |
| 191 | p.functions = functions_tail; |
| 192 | p.dx = x; |
| 193 | p.dy = y as usize; |
| 194 | p.tail = end - x; |
| 195 | p.next_stage(); |
| 196 | } |
| 197 | } |
| 198 | } |
| 199 | |
| 200 | fn move_source_to_destination(p: &mut Pipeline) { |
| 201 | p.dr = p.r; |
| 202 | p.dg = p.g; |
| 203 | p.db = p.b; |
| 204 | p.da = p.a; |
| 205 | |
| 206 | p.next_stage(); |
| 207 | } |
| 208 | |
| 209 | fn move_destination_to_source(p: &mut Pipeline) { |
| 210 | p.r = p.dr; |
| 211 | p.g = p.dg; |
| 212 | p.b = p.db; |
| 213 | p.a = p.da; |
| 214 | |
| 215 | p.next_stage(); |
| 216 | } |
| 217 | |
| 218 | fn premultiply(p: &mut Pipeline) { |
| 219 | p.r = div255(p.r * p.a); |
| 220 | p.g = div255(p.g * p.a); |
| 221 | p.b = div255(p.b * p.a); |
| 222 | |
| 223 | p.next_stage(); |
| 224 | } |
| 225 | |
| 226 | fn uniform_color(p: &mut Pipeline) { |
    let ctx = p.ctx.uniform_color;
| 228 | p.r = u16x16::splat(ctx.rgba[0]); |
| 229 | p.g = u16x16::splat(ctx.rgba[1]); |
| 230 | p.b = u16x16::splat(ctx.rgba[2]); |
| 231 | p.a = u16x16::splat(ctx.rgba[3]); |
| 232 | |
| 233 | p.next_stage(); |
| 234 | } |
| 235 | |
| 236 | fn seed_shader(p: &mut Pipeline) { |
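    // Sample at pixel centers: X starts at dx+0.5 and increases by 1 per lane,
    // while Y is dy+0.5 for the whole batch.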
| 237 | let iota: f32x16 = f32x16( |
| 238 | f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), |
| 239 | f32x8::from([8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]), |
| 240 | ); |
| 241 | |
| 242 | let x: f32x16 = f32x16::splat(p.dx as f32) + iota; |
| 243 | let y: f32x16 = f32x16::splat(p.dy as f32 + 0.5); |
| 244 | split(&x, &mut p.r, &mut p.g); |
| 245 | split(&y, &mut p.b, &mut p.a); |
| 246 | |
| 247 | p.next_stage(); |
| 248 | } |
| 249 | |
| 250 | pub fn load_dst(p: &mut Pipeline) { |
    load_8888(p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
| 252 | p.next_stage(); |
| 253 | } |
| 254 | |
| 255 | pub fn load_dst_tail(p: &mut Pipeline) { |
    load_8888_tail(p.tail, p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
| 257 | p.next_stage(); |
| 258 | } |
| 259 | |
| 260 | pub fn store(p: &mut Pipeline) { |
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap.slice16_at_xy(p.dx, p.dy));
| 262 | p.next_stage(); |
| 263 | } |
| 264 | |
| 265 | pub fn store_tail(p: &mut Pipeline) { |
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap.slice_at_xy(p.dx, p.dy));
| 267 | p.next_stage(); |
| 268 | } |
| 269 | |
| 270 | pub fn load_dst_u8(p: &mut Pipeline) { |
    load_8(p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da);
| 272 | p.next_stage(); |
| 273 | } |
| 274 | |
| 275 | pub fn load_dst_u8_tail(p: &mut Pipeline) { |
    // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
    // This way we can reuse the `load_8` method and remove any branches.
| 278 | let data: &mut [u8] = p.pixmap.slice_mask_at_xy(p.dx, p.dy); |
| 279 | let mut tmp: [u8; 16] = [0u8; STAGE_WIDTH]; |
| 280 | tmp[0..p.tail].copy_from_slice(&data[0..p.tail]); |
| 281 | load_8(&tmp, &mut p.da); |
| 282 | |
| 283 | p.next_stage(); |
| 284 | } |
| 285 | |
| 286 | pub fn store_u8(p: &mut Pipeline) { |
| 287 | let data: &mut [u8; 16] = p.pixmap.slice16_mask_at_xy(p.dx, p.dy); |
| 288 | let a: &[u16; 16] = p.a.as_slice(); |
| 289 | |
| 290 | data[ 0] = a[ 0] as u8; |
| 291 | data[ 1] = a[ 1] as u8; |
| 292 | data[ 2] = a[ 2] as u8; |
| 293 | data[ 3] = a[ 3] as u8; |
| 294 | data[ 4] = a[ 4] as u8; |
| 295 | data[ 5] = a[ 5] as u8; |
| 296 | data[ 6] = a[ 6] as u8; |
| 297 | data[ 7] = a[ 7] as u8; |
| 298 | data[ 8] = a[ 8] as u8; |
| 299 | data[ 9] = a[ 9] as u8; |
| 300 | data[10] = a[10] as u8; |
| 301 | data[11] = a[11] as u8; |
| 302 | data[12] = a[12] as u8; |
| 303 | data[13] = a[13] as u8; |
| 304 | data[14] = a[14] as u8; |
| 305 | data[15] = a[15] as u8; |
| 306 | |
| 307 | p.next_stage(); |
| 308 | } |
| 309 | |
| 310 | pub fn store_u8_tail(p: &mut Pipeline) { |
| 311 | let data: &mut [u8] = p.pixmap.slice_mask_at_xy(p.dx, p.dy); |
| 312 | let a: &[u16; 16] = p.a.as_slice(); |
| 313 | |
    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have only 16 steps and slice accesses are guaranteed to be valid.
    // This removes bounds checking and a possible panic call.
    for i in 0..STAGE_WIDTH {
| 318 | data[i] = a[i] as u8; |
| 319 | |
| 320 | if i + 1 == p.tail { |
| 321 | break; |
| 322 | } |
| 323 | } |
| 324 | |
| 325 | p.next_stage(); |
| 326 | } |
| 327 | |
| 328 | // Similar to mask_u8, but only loads the mask values without actually masking the pipeline. |
| 329 | fn load_mask_u8(p: &mut Pipeline) { |
| 330 | let offset: usize = p.mask_ctx.offset(p.dx, p.dy); |
| 331 | |
| 332 | let mut c: u16x16 = u16x16::default(); |
    for i in 0..p.tail {
| 334 | c.0[i] = u16::from(p.mask_ctx.data[offset + i]); |
| 335 | } |
| 336 | |
| 337 | p.r = u16x16::splat(0); |
| 338 | p.g = u16x16::splat(0); |
| 339 | p.b = u16x16::splat(0); |
| 340 | p.a = c; |
| 341 | |
| 342 | p.next_stage(); |
| 343 | } |
| 344 | |
| 345 | fn mask_u8(p: &mut Pipeline) { |
| 346 | let offset: usize = p.mask_ctx.offset(p.dx, p.dy); |
| 347 | |
| 348 | let mut c: u16x16 = u16x16::default(); |
    for i in 0..p.tail {
| 350 | c.0[i] = u16::from(p.mask_ctx.data[offset + i]); |
| 351 | } |
| 352 | |
| 353 | if c == u16x16::default() { |
| 354 | return; |
| 355 | } |
| 356 | |
| 357 | p.r = div255(p.r * c); |
| 358 | p.g = div255(p.g * c); |
| 359 | p.b = div255(p.b * c); |
| 360 | p.a = div255(p.a * c); |
| 361 | |
| 362 | p.next_stage(); |
| 363 | } |
| 364 | |
| 365 | fn scale_u8(p: &mut Pipeline) { |
| 366 | // Load u8xTail and cast it to u16x16. |
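    // (Only the first two lanes are filled: the anti-aliased mask
    // covers at most two pixels per row.)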
| 367 | let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); |
    let c = u16x16([
        u16::from(data[0]), u16::from(data[1]),
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ]);
| 386 | |
| 387 | p.r = div255(p.r * c); |
| 388 | p.g = div255(p.g * c); |
| 389 | p.b = div255(p.b * c); |
| 390 | p.a = div255(p.a * c); |
| 391 | |
| 392 | p.next_stage(); |
| 393 | } |
| 394 | |
| 395 | fn lerp_u8(p: &mut Pipeline) { |
| 396 | // Load u8xTail and cast it to u16x16. |
| 397 | let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); |
    let c = u16x16([
        u16::from(data[0]), u16::from(data[1]),
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ]);
| 416 | |
| 417 | p.r = lerp(p.dr, p.r, c); |
| 418 | p.g = lerp(p.dg, p.g, c); |
| 419 | p.b = lerp(p.db, p.b, c); |
| 420 | p.a = lerp(p.da, p.a, c); |
| 421 | |
| 422 | p.next_stage(); |
| 423 | } |
| 424 | |
| 425 | fn scale_1_float(p: &mut Pipeline) { |
| 426 | let c: u16x16 = from_float(p.ctx.current_coverage); |
| 427 | p.r = div255(p.r * c); |
| 428 | p.g = div255(p.g * c); |
| 429 | p.b = div255(p.b * c); |
| 430 | p.a = div255(p.a * c); |
| 431 | |
| 432 | p.next_stage(); |
| 433 | } |
| 434 | |
| 435 | fn lerp_1_float(p: &mut Pipeline) { |
| 436 | let c: u16x16 = from_float(p.ctx.current_coverage); |
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);
| 441 | |
| 442 | p.next_stage(); |
| 443 | } |
| 444 | |
| 445 | macro_rules! blend_fn { |
| 446 | ($name:ident, $f:expr) => { |
| 447 | fn $name(p: &mut Pipeline) { |
| 448 | p.r = $f(p.r, p.dr, p.a, p.da); |
| 449 | p.g = $f(p.g, p.dg, p.a, p.da); |
| 450 | p.b = $f(p.b, p.db, p.a, p.da); |
| 451 | p.a = $f(p.a, p.da, p.a, p.da); |
| 452 | |
| 453 | p.next_stage(); |
| 454 | } |
| 455 | }; |
| 456 | } |
| 457 | |
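// For example, `blend_fn!(source_in, |s, _, _, da| div255(s * da))` expands to
// a stage that multiplies every source channel by the destination alpha:
//
//     fn source_in(p: &mut Pipeline) {
//         p.r = div255(p.r * p.da);
//         p.g = div255(p.g * p.da);
//         p.b = div255(p.b * p.da);
//         p.a = div255(p.a * p.da);
//         p.next_stage();
//     }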
| 458 | blend_fn!(clear, |_, _, _, _| u16x16::splat(0)); |
| 459 | blend_fn!(source_atop, |s, d, sa, da| div255(s * da + d * inv(sa))); |
| 460 | blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da))); |
| 461 | blend_fn!(source_in, |s, _, _, da| div255(s * da)); |
| 462 | blend_fn!(destination_in, |_, d, sa, _| div255(d * sa)); |
| 463 | blend_fn!(source_out, |s, _, _, da| div255(s * inv(da))); |
| 464 | blend_fn!(destination_out, |_, d, sa, _| div255(d * inv(sa))); |
| 465 | blend_fn!(source_over, |s, d, sa, _| s + div255(d * inv(sa))); |
| 466 | blend_fn!(destination_over, |s, d, _, da| d + div255(s * inv(da))); |
| 467 | blend_fn!(modulate, |s, d, _, _| div255(s * d)); |
| 468 | blend_fn!(multiply, |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d)); |
| 469 | blend_fn!(screen, |s, d, _, _| s + d - div255(s * d)); |
| 470 | blend_fn!(xor, |s, d, sa, da| div255(s * inv(da) + d * inv(sa))); |
| 471 | |
| 472 | // Wants a type for some reason. |
| 473 | blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255))); |
| 474 | |
| 475 | |
| 476 | macro_rules! blend_fn2 { |
| 477 | ($name:ident, $f:expr) => { |
| 478 | fn $name(p: &mut Pipeline) { |
            // The blend formula is applied to the color channels,
            // while alpha always uses source_over.
| 480 | p.r = $f(p.r, p.dr, p.a, p.da); |
| 481 | p.g = $f(p.g, p.dg, p.a, p.da); |
| 482 | p.b = $f(p.b, p.db, p.a, p.da); |
| 483 | p.a = p.a + div255(p.da * inv(p.a)); |
| 484 | |
| 485 | p.next_stage(); |
| 486 | } |
| 487 | }; |
| 488 | } |
| 489 | |
| 490 | blend_fn2!(darken, |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa)))); |
| 491 | blend_fn2!(lighten, |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa)))); |
| 492 | blend_fn2!(exclusion, |s: u16x16, d, _, _| s + d - u16x16::splat(2) * div255(s * d)); |
| 493 | |
| 494 | blend_fn2!(difference, |s: u16x16, d, sa, da| |
| 495 | s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa)))); |
| 496 | |
| 497 | blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| { |
| 498 | div255(s * inv(da) + d * inv(sa) |
| 499 | + (s+s).cmp_le(&sa).blend( |
| 500 | u16x16::splat(2) * s * d, |
| 501 | sa * da - u16x16::splat(2) * (sa-s)*(da-d) |
| 502 | ) |
| 503 | ) |
| 504 | }); |
| 505 | |
| 506 | blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| { |
| 507 | div255(s * inv(da) + d * inv(sa) |
| 508 | + (d+d).cmp_le(&da).blend( |
| 509 | u16x16::splat(2) * s * d, |
| 510 | sa * da - u16x16::splat(2) * (sa-s)*(da-d) |
| 511 | ) |
| 512 | ) |
| 513 | }); |
| 514 | |
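// A fused stage: loads the destination, applies source_over to all four
// channels, and stores the result back, avoiding separate load/blend/store stages.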
| 515 | pub fn source_over_rgba(p: &mut Pipeline) { |
| 516 | let pixels: &mut [PremultipliedColorU8; 16] = p.pixmap.slice16_at_xy(p.dx, p.dy); |
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
| 518 | p.r = p.r + div255(p.dr * inv(p.a)); |
| 519 | p.g = p.g + div255(p.dg * inv(p.a)); |
| 520 | p.b = p.b + div255(p.db * inv(p.a)); |
| 521 | p.a = p.a + div255(p.da * inv(p.a)); |
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);
| 523 | |
| 524 | p.next_stage(); |
| 525 | } |
| 526 | |
| 527 | pub fn source_over_rgba_tail(p: &mut Pipeline) { |
| 528 | let pixels: &mut [PremultipliedColorU8] = p.pixmap.slice_at_xy(p.dx, p.dy); |
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
| 530 | p.r = p.r + div255(p.dr * inv(p.a)); |
| 531 | p.g = p.g + div255(p.dg * inv(p.a)); |
| 532 | p.b = p.b + div255(p.db * inv(p.a)); |
| 533 | p.a = p.a + div255(p.da * inv(p.a)); |
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);
| 535 | |
| 536 | p.next_stage(); |
| 537 | } |
| 538 | |
| 539 | fn transform(p: &mut Pipeline) { |
    let ts = &p.ctx.transform;
| 541 | |
| 542 | let x: f32x16 = join(&p.r, &p.g); |
| 543 | let y: f32x16 = join(&p.b, &p.a); |
| 544 | |
    let nx: f32x16 = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx)));
    let ny: f32x16 = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty)));
| 547 | |
| 548 | split(&nx, &mut p.r, &mut p.g); |
| 549 | split(&ny, &mut p.b, &mut p.a); |
| 550 | |
| 551 | p.next_stage(); |
| 552 | } |
| 553 | |
| 554 | fn pad_x1(p: &mut Pipeline) { |
| 555 | let x: f32x16 = join(&p.r, &p.g); |
| 556 | let x: f32x16 = x.normalize(); |
| 557 | split(&x, &mut p.r, &mut p.g); |
| 558 | |
| 559 | p.next_stage(); |
| 560 | } |
| 561 | |
| 562 | fn reflect_x1(p: &mut Pipeline) { |
| 563 | let x: f32x16 = join(&p.r, &p.g); |
    let two = |x| x + x;
| 565 | let x: f32x16 = ( |
| 566 | (x - f32x16::splat(1.0)) |
| 567 | - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor()) |
| 568 | - f32x16::splat(1.0) |
| 569 | ).abs().normalize(); |
| 570 | split(&x, &mut p.r, &mut p.g); |
| 571 | |
| 572 | p.next_stage(); |
| 573 | } |
| 574 | |
| 575 | fn repeat_x1(p: &mut Pipeline) { |
| 576 | let x: f32x16 = join(&p.r, &p.g); |
| 577 | let x: f32x16 = (x - x.floor()).normalize(); |
| 578 | split(&x, &mut p.r, &mut p.g); |
| 579 | |
| 580 | p.next_stage(); |
| 581 | } |
| 582 | |
| 583 | fn gradient(p: &mut Pipeline) { |
| 584 | let ctx = &p.ctx.gradient; |
| 585 | |
| 586 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
| 587 | let t = join(&p.r, &p.g); |
| 588 | let mut idx = u16x16::splat(0); |
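    // For every lane, count how many stop positions `t` has already passed;
    // the resulting index selects the factor/bias pair for that gradient segment.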
| 589 | for i in 1..ctx.len { |
| 590 | let tt = ctx.t_values[i].get(); |
| 591 | let t0: [f32; 8] = t.0.into(); |
| 592 | let t1: [f32; 8] = t.1.into(); |
| 593 | idx.0[ 0] += (t0[0] >= tt) as u16; |
| 594 | idx.0[ 1] += (t0[1] >= tt) as u16; |
| 595 | idx.0[ 2] += (t0[2] >= tt) as u16; |
| 596 | idx.0[ 3] += (t0[3] >= tt) as u16; |
| 597 | idx.0[ 4] += (t0[4] >= tt) as u16; |
| 598 | idx.0[ 5] += (t0[5] >= tt) as u16; |
| 599 | idx.0[ 6] += (t0[6] >= tt) as u16; |
| 600 | idx.0[ 7] += (t0[7] >= tt) as u16; |
| 601 | idx.0[ 8] += (t1[0] >= tt) as u16; |
| 602 | idx.0[ 9] += (t1[1] >= tt) as u16; |
| 603 | idx.0[10] += (t1[2] >= tt) as u16; |
| 604 | idx.0[11] += (t1[3] >= tt) as u16; |
| 605 | idx.0[12] += (t1[4] >= tt) as u16; |
| 606 | idx.0[13] += (t1[5] >= tt) as u16; |
| 607 | idx.0[14] += (t1[6] >= tt) as u16; |
| 608 | idx.0[15] += (t1[7] >= tt) as u16; |
| 609 | } |
| 610 | gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a); |
| 611 | |
| 612 | p.next_stage(); |
| 613 | } |
| 614 | |
| 615 | fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) { |
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;
| 617 | |
| 618 | let t: f32x16 = join(&p.r, &p.g); |
    round_f32_to_u16(
        mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)),
        mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)),
        mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)),
        mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)),
| 624 | &mut p.r, &mut p.g, &mut p.b, &mut p.a, |
| 625 | ); |
| 626 | |
| 627 | p.next_stage(); |
| 628 | } |
| 629 | |
| 630 | fn xy_to_radius(p: &mut Pipeline) { |
| 631 | let x: f32x16 = join(&p.r, &p.g); |
| 632 | let y: f32x16 = join(&p.b, &p.a); |
| 633 | let x: f32x16 = (x*x + y*y).sqrt(); |
| 634 | split(&x, &mut p.r, &mut p.g); |
| 635 | split(&y, &mut p.b, &mut p.a); |
| 636 | |
| 637 | p.next_stage(); |
| 638 | } |
| 639 | |
// We are using u16 for the index, instead of u32 as in Skia, to simplify the code a bit.
// The gradient creation code will not allow that many stops anyway.
| 642 | fn gradient_lookup( |
| 643 | ctx: &super::GradientCtx, idx: &u16x16, t: f32x16, |
| 644 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
| 645 | ) { |
| 646 | macro_rules! gather { |
| 647 | ($d:expr, $c:ident) => { |
            // Surprisingly, bounds checking doesn't affect performance here.
            // And since `idx` can contain any number, we should leave it in place.
| 650 | f32x16( |
| 651 | f32x8::from([ |
| 652 | $d[idx.0[ 0] as usize].$c, |
| 653 | $d[idx.0[ 1] as usize].$c, |
| 654 | $d[idx.0[ 2] as usize].$c, |
| 655 | $d[idx.0[ 3] as usize].$c, |
| 656 | $d[idx.0[ 4] as usize].$c, |
| 657 | $d[idx.0[ 5] as usize].$c, |
| 658 | $d[idx.0[ 6] as usize].$c, |
| 659 | $d[idx.0[ 7] as usize].$c, |
| 660 | ]), |
| 661 | f32x8::from([ |
| 662 | $d[idx.0[ 8] as usize].$c, |
| 663 | $d[idx.0[ 9] as usize].$c, |
| 664 | $d[idx.0[10] as usize].$c, |
| 665 | $d[idx.0[11] as usize].$c, |
| 666 | $d[idx.0[12] as usize].$c, |
| 667 | $d[idx.0[13] as usize].$c, |
| 668 | $d[idx.0[14] as usize].$c, |
| 669 | $d[idx.0[15] as usize].$c, |
| 670 | ]), |
| 671 | ) |
| 672 | }; |
| 673 | } |
| 674 | |
| 675 | let fr = gather!(&ctx.factors, r); |
| 676 | let fg = gather!(&ctx.factors, g); |
| 677 | let fb = gather!(&ctx.factors, b); |
| 678 | let fa = gather!(&ctx.factors, a); |
| 679 | |
| 680 | let br = gather!(&ctx.biases, r); |
| 681 | let bg = gather!(&ctx.biases, g); |
| 682 | let bb = gather!(&ctx.biases, b); |
| 683 | let ba = gather!(&ctx.biases, a); |
| 684 | |
| 685 | round_f32_to_u16( |
| 686 | mad(t, fr, br), |
| 687 | mad(t, fg, bg), |
| 688 | mad(t, fb, bb), |
| 689 | mad(t, fa, ba), |
| 690 | r, g, b, a, |
| 691 | ); |
| 692 | } |
| 693 | |
#[inline(always)]
| 695 | fn round_f32_to_u16( |
| 696 | rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16, |
| 697 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
| 698 | ) { |
    // TODO: may produce a slightly different result than Skia;
    // affects the two_stops_linear_mirror test.
| 701 | |
| 702 | let rf: f32x16 = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
| 703 | let gf: f32x16 = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
| 704 | let bf: f32x16 = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
| 705 | let af: f32x16 = af * f32x16::splat(255.0) + f32x16::splat(0.5); |
| 706 | |
    rf.save_to_u16x16(r);
    gf.save_to_u16x16(g);
    bf.save_to_u16x16(b);
    af.save_to_u16x16(a);
| 711 | } |
| 712 | |
| 713 | pub fn just_return(_: &mut Pipeline) { |
    // Ends the pipeline by not calling `next_stage`.
| 715 | } |
| 716 | |
| 717 | pub fn null_fn(_: &mut Pipeline) { |
    // A placeholder for `STAGES` entries unsupported by the lowp pipeline.
| 719 | } |
| 720 | |
#[inline(always)]
| 722 | fn load_8888( |
| 723 | data: &[PremultipliedColorU8; STAGE_WIDTH], |
| 724 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
| 725 | ) { |
| 726 | *r = u16x16([ |
| 727 | data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16, |
| 728 | data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16, |
| 729 | data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16, |
| 730 | data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16, |
| 731 | ]); |
| 732 | |
| 733 | *g = u16x16([ |
| 734 | data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16, |
| 735 | data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16, |
| 736 | data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16, |
| 737 | data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16, |
| 738 | ]); |
| 739 | |
| 740 | *b = u16x16([ |
| 741 | data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16, |
| 742 | data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16, |
| 743 | data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16, |
| 744 | data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16, |
| 745 | ]); |
| 746 | |
| 747 | *a = u16x16([ |
| 748 | data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16, |
| 749 | data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16, |
| 750 | data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16, |
| 751 | data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16, |
| 752 | ]); |
| 753 | } |
| 754 | |
#[inline(always)]
| 756 | fn load_8888_tail( |
| 757 | tail: usize, data: &[PremultipliedColorU8], |
| 758 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
| 759 | ) { |
    // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
    // This way we can reuse the `load_8888` method and remove any branches.
| 762 | let mut tmp: [PremultipliedColorU8; 16] = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH]; |
| 763 | tmp[0..tail].copy_from_slice(&data[0..tail]); |
| 764 | load_8888(&tmp, r, g, b, a); |
| 765 | } |
| 766 | |
#[inline(always)]
| 768 | fn store_8888( |
| 769 | r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, |
| 770 | data: &mut [PremultipliedColorU8; STAGE_WIDTH], |
| 771 | ) { |
| 772 | let r: &[u16; 16] = r.as_slice(); |
| 773 | let g: &[u16; 16] = g.as_slice(); |
| 774 | let b: &[u16; 16] = b.as_slice(); |
| 775 | let a: &[u16; 16] = a.as_slice(); |
| 776 | |
| 777 | data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8); |
| 778 | data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8); |
| 779 | data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8); |
| 780 | data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8); |
| 781 | data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8); |
| 782 | data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8); |
| 783 | data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8); |
| 784 | data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8); |
| 785 | data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8); |
| 786 | data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8); |
| 787 | data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8); |
| 788 | data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8); |
| 789 | data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8); |
| 790 | data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8); |
| 791 | data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8); |
| 792 | data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8); |
| 793 | } |
| 794 | |
#[inline(always)]
| 796 | fn store_8888_tail( |
| 797 | r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, |
| 798 | tail: usize, data: &mut [PremultipliedColorU8], |
| 799 | ) { |
| 800 | let r: &[u16; 16] = r.as_slice(); |
| 801 | let g: &[u16; 16] = g.as_slice(); |
| 802 | let b: &[u16; 16] = b.as_slice(); |
| 803 | let a: &[u16; 16] = a.as_slice(); |
| 804 | |
    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have only 16 steps and slice accesses are guaranteed to be valid.
    // This removes bounds checking and a possible panic call.
    for i in 0..STAGE_WIDTH {
| 809 | data[i] = PremultipliedColorU8::from_rgba_unchecked( |
| 810 | r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8, |
| 811 | ); |
| 812 | |
| 813 | if i + 1 == tail { |
| 814 | break; |
| 815 | } |
| 816 | } |
| 817 | } |
| 818 | |
#[inline(always)]
| 820 | fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { |
| 821 | *a = u16x16([ |
| 822 | data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, |
| 823 | data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, |
| 824 | data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, |
| 825 | data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16, |
| 826 | ]); |
| 827 | } |
| 828 | |
#[inline(always)]
| 830 | fn div255(v: u16x16) -> u16x16 { |
    // Skia uses `vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8)` here when NEON is available,
    // but it doesn't affect performance much and breaks result reproducibility. Ignore it.
    // NOTE: the compiler does not replace the division with a shift.
    (v + u16x16::splat(255)) >> u16x16::splat(8) // / u16x16::splat(256)
| 835 | } |
| 836 | |
#[inline(always)]
| 838 | fn inv(v: u16x16) -> u16x16 { |
| 839 | u16x16::splat(255) - v |
| 840 | } |
| 841 | |
#[inline(always)]
| 843 | fn from_float(f: f32) -> u16x16 { |
| 844 | u16x16::splat((f * 255.0 + 0.5) as u16) |
| 845 | } |
| 846 | |
#[inline(always)]
| 848 | fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 { |
| 849 | div255(from * inv(t) + to * t) |
| 850 | } |
| 851 | |
#[inline(always)]
| 853 | fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) { |
    // We're splitting an f32x16 (512-bit) into two u16x16 (256-bit).
| 855 | let data: [u8; 64] = bytemuck::cast(*v); |
| 856 | let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0); |
| 857 | let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0); |
| 858 | |
| 859 | d0.copy_from_slice(&data[0..32]); |
| 860 | d1.copy_from_slice(&data[32..64]); |
| 861 | } |
| 862 | |
#[inline(always)]
| 864 | fn join(lo: &u16x16, hi: &u16x16) -> f32x16 { |
    // We're joining two u16x16 (256-bit) into an f32x16 (512-bit).
| 866 | |
| 867 | let d0: [u8; 32] = bytemuck::cast(lo.0); |
| 868 | let d1: [u8; 32] = bytemuck::cast(hi.0); |
| 869 | |
| 870 | let mut v: f32x16 = f32x16::default(); |
| 871 | let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v); |
| 872 | |
| 873 | data[0..32].copy_from_slice(&d0); |
| 874 | data[32..64].copy_from_slice(&d1); |
| 875 | |
| 876 | v |
| 877 | } |
| 878 | |
#[inline(always)]
| 880 | fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 { |
| 881 | // NEON vmlaq_f32 doesn't seem to affect performance in any way. Ignore it. |
| 882 | f * m + a |
| 883 | } |
| 884 | |