highp.rs source code [crates/tiny-skia-0.11.4/src/pipeline/highp.rs]

1	// Copyright 2018 Google Inc.
2	// Copyright 2020 Yevhenii Reizner
3	//
4	// Use of this source code is governed by a BSD-style license that can be
5	// found in the LICENSE file.
6
/*!
A high precision raster pipeline implementation.

Unlike lowp, this one implements all stages.

Just like Skia, this pipeline is implemented using f32x8.

For some reason, we are almost 2x slower. Maybe because Skia uses clang's vector extensions
and we're using a manual implementation.
*/
17
18	use crate::{PremultipliedColorU8, SpreadMode, PixmapRef};
19
20	use crate::geom::ScreenIntRect;
21	use crate::pixmap::SubPixmapMut;
22	use crate::wide::{f32x8, i32x8, u32x8};
23
/// Number of pixels handled by one pass through the stage list (one lane of `f32x8` per pixel).
pub const STAGE_WIDTH: usize = 8;
25
26	pub type StageFn = fn(p: &mut Pipeline);
27
28	pub struct Pipeline<'a, 'b: 'a> {
29	index: usize,
30	functions: &'a [StageFn],
31	pixmap_src: PixmapRef<'a>,
32	pixmap_dst: &'a mut SubPixmapMut<'b>,
33	ctx: &'a mut super::Context, // TODO: remove mut
34	mask_ctx: super::MaskCtx<'a>,
35	aa_mask_ctx: super::AAMaskCtx,
36	r: f32x8,
37	g: f32x8,
38	b: f32x8,
39	a: f32x8,
40	dr: f32x8,
41	dg: f32x8,
42	db: f32x8,
43	da: f32x8,
44	tail: usize,
45	dx: usize,
46	dy: usize,
47	}
48
49	impl Pipeline<'_, '_> {
50	#[inline(always)]
51	fn next_stage(&mut self) {
52	let next: fn(&mut Self) = self.functions[self.index];
53	self.index += `1`;
54	next(self);
55	}
56	}
57
58	// Must be in the same order as raster_pipeline::Stage
59	pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
60	move_source_to_destination,
61	move_destination_to_source,
62	clamp_0,
63	clamp_a,
64	premultiply,
65	uniform_color,
66	seed_shader,
67	load_dst,
68	store,
69	load_dst_u8,
70	store_u8,
71	gather,
72	load_mask_u8,
73	mask_u8,
74	scale_u8,
75	lerp_u8,
76	scale_1_float,
77	lerp_1_float,
78	destination_atop,
79	destination_in,
80	destination_out,
81	destination_over,
82	source_atop,
83	source_in,
84	source_out,
85	source_over,
86	clear,
87	modulate,
88	multiply,
89	plus,
90	screen,
91	xor,
92	color_burn,
93	color_dodge,
94	darken,
95	difference,
96	exclusion,
97	hard_light,
98	lighten,
99	overlay,
100	soft_light,
101	hue,
102	saturation,
103	color,
104	luminosity,
105	source_over_rgba,
106	transform,
107	reflect,
108	repeat,
109	bilinear,
110	bicubic,
111	pad_x1,
112	reflect_x1,
113	repeat_x1,
114	gradient,
115	evenly_spaced_2_stop_gradient,
116	xy_to_radius,
117	xy_to_2pt_conical_focal_on_circle,
118	xy_to_2pt_conical_well_behaved,
119	xy_to_2pt_conical_greater,
120	mask_2pt_conical_degenerates,
121	apply_vector_mask,
122	];
123
124	pub fn fn_ptr(f: StageFn) -> *const () {
125	f as *const ()
126	}
127
128	#[inline(never)]
129	pub fn start(
130	functions: &[StageFn],
131	functions_tail: &[StageFn],
132	rect: &ScreenIntRect,
133	aa_mask_ctx: super::AAMaskCtx,
134	mask_ctx: super::MaskCtx,
135	ctx: &mut super::Context,
136	pixmap_src: PixmapRef,
137	pixmap_dst: &mut SubPixmapMut,
138	) {
139	let mut p = Pipeline {
140	index: `0`,
141	functions: &[],
142	pixmap_src,
143	pixmap_dst,
144	mask_ctx,
145	aa_mask_ctx,
146	ctx,
147	r: f32x8::default(),
148	g: f32x8::default(),
149	b: f32x8::default(),
150	a: f32x8::default(),
151	dr: f32x8::default(),
152	dg: f32x8::default(),
153	db: f32x8::default(),
154	da: f32x8::default(),
155	tail: `0`,
156	dx: `0`,
157	dy: `0`,
158	};
159
160	for y in rect.y()..rect.bottom() {
161	let mut x = rect.x() as usize;
162	let end = rect.right() as usize;
163
164	p.functions = functions;
165	while x + STAGE_WIDTH <= end {
166	p.index = `0`;
167	p.dx = x;
168	p.dy = y as usize;
169	p.tail = STAGE_WIDTH;
170	p.next_stage();
171	x += STAGE_WIDTH;
172	}
173
174	if x != end {
175	p.index = `0`;
176	p.functions = functions_tail;
177	p.dx = x;
178	p.dy = y as usize;
179	p.tail = end - x;
180	p.next_stage();
181	}
182	}
183	}
184
185	fn move_source_to_destination(p: &mut Pipeline) {
186	p.dr = p.r;
187	p.dg = p.g;
188	p.db = p.b;
189	p.da = p.a;
190
191	p.next_stage();
192	}
193
194	fn premultiply(p: &mut Pipeline) {
195	p.r *= p.a;
196	p.g *= p.a;
197	p.b *= p.a;
198
199	p.next_stage();
200	}
201
202	fn move_destination_to_source(p: &mut Pipeline) {
203	p.r = p.dr;
204	p.g = p.dg;
205	p.b = p.db;
206	p.a = p.da;
207
208	p.next_stage();
209	}
210
211	fn clamp_0(p: &mut Pipeline) {
212	p.r = p.r.max(f32x8::default());
213	p.g = p.g.max(f32x8::default());
214	p.b = p.b.max(f32x8::default());
215	p.a = p.a.max(f32x8::default());
216
217	p.next_stage();
218	}
219
220	fn clamp_a(p: &mut Pipeline) {
221	p.r = p.r.min(f32x8::splat(`1.0`));
222	p.g = p.g.min(f32x8::splat(`1.0`));
223	p.b = p.b.min(f32x8::splat(`1.0`));
224	p.a = p.a.min(f32x8::splat(`1.0`));
225
226	p.next_stage();
227	}
228
229	fn uniform_color(p: &mut Pipeline) {
230	let ctx: &UniformColorCtx = &p.ctx.uniform_color;
231	p.r = f32x8::splat(ctx.r);
232	p.g = f32x8::splat(ctx.g);
233	p.b = f32x8::splat(ctx.b);
234	p.a = f32x8::splat(ctx.a);
235
236	p.next_stage();
237	}
238
239	fn seed_shader(p: &mut Pipeline) {
240	let iota: f32x8 = f32x8::from([`0.5`, `1.5`, `2.5`, `3.5`, `4.5`, `5.5`, `6.5`, `7.5`]);
241
242	p.r = f32x8::splat(p.dx as f32) + iota;
243	p.g = f32x8::splat(p.dy as f32 + `0.5`);
244	p.b = f32x8::splat(`1.0`);
245	p.a = f32x8::default();
246
247	p.dr = f32x8::default();
248	p.dg = f32x8::default();
249	p.db = f32x8::default();
250	p.da = f32x8::default();
251
252	p.next_stage();
253	}
254
255	pub fn load_dst(p: &mut Pipeline) {
256	load_8888(data:p.pixmap_dst.slice4_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
257	p.next_stage();
258	}
259
260	pub fn load_dst_tail(p: &mut Pipeline) {
261	load_8888_tail(p.tail, data:p.pixmap_dst.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
262	p.next_stage();
263	}
264
265	pub fn store(p: &mut Pipeline) {
266	store_8888(&p.r, &p.g, &p.b, &p.a, data:p.pixmap_dst.slice4_at_xy(p.dx, p.dy));
267	p.next_stage();
268	}
269
270	pub fn store_tail(p: &mut Pipeline) {
271	store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, data:p.pixmap_dst.slice_at_xy(p.dx, p.dy));
272	p.next_stage();
273	}
274
275	// Currently, all mask/A8 pixmaps are handled by lowp.
276	pub fn load_dst_u8(_: &mut Pipeline) {
277	// unreachable
278	}
279
280	pub fn load_dst_u8_tail(_: &mut Pipeline) {
281	// unreachable
282	}
283
284	pub fn store_u8(_: &mut Pipeline) {
285	// unreachable
286	}
287
288	pub fn store_u8_tail(_: &mut Pipeline) {
289	// unreachable
290	}
291
292	pub fn gather(p: &mut Pipeline) {
293	let ix: u32x8 = gather_ix(p.pixmap_src, x:p.r, y:p.g);
294	load_8888(&p.pixmap_src.gather(index:ix), &mut p.r, &mut p.g, &mut p.b, &mut p.a);
295
296	p.next_stage();
297	}
298
299	#[inline(always)]
300	fn gather_ix(pixmap: PixmapRef, mut x: f32x8, mut y: f32x8) -> u32x8 {
301	// Exclusive -> inclusive.
302	let w: f32 = ulp_sub(pixmap.width() as f32);
303	let h: f32 = ulp_sub(pixmap.height() as f32);
304	x = x.max(f32x8::default()).min(f32x8::splat(w));
305	y = y.max(f32x8::default()).min(f32x8::splat(h));
306
307	(y.trunc_int() * i32x8::splat(pixmap.width() as i32) + x.trunc_int()).to_u32x8_bitcast()
308	}
309
/// Returns the float whose bit pattern is one less than that of `v`.
///
/// For a positive, non-zero `v` this is the closest representable value below `v`.
#[inline(always)]
fn ulp_sub(v: f32) -> f32 {
    // Somewhat similar to v - f32::EPSILON
    let bits = v.to_bits();
    f32::from_bits(bits - 1)
}
315
316	fn load_mask_u8(_: &mut Pipeline) {
317	// unreachable
318	}
319
320	fn mask_u8(p: &mut Pipeline) {
321	let offset: usize = p.mask_ctx.offset(p.dx, p.dy);
322	let mut c: [f32; 8] = [`0.0`; `8`];
323	for i: usize in `0`..p.tail {
324	c[i] = p.mask_ctx.data[offset + i] as f32;
325	}
326	let c: f32x8 = f32x8::from(c) / f32x8::splat(`255.0`);
327
328	if c == f32x8::default() {
329	return;
330	}
331
332	p.r *= c;
333	p.g *= c;
334	p.b *= c;
335	p.a *= c;
336
337	p.next_stage();
338	}
339
340	fn scale_u8(p: &mut Pipeline) {
341	// Load u8xTail and cast it to f32x8.
342	let data: [u8; 2] = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
343	let c: f32x8 = f32x8::from([data[`0`] as f32, data[`1`] as f32, `0.0`, `0.0`, `0.0`, `0.0`, `0.0`, `0.0`]);
344	let c: f32x8 = c / f32x8::splat(`255.0`);
345
346	p.r *= c;
347	p.g *= c;
348	p.b *= c;
349	p.a *= c;
350
351	p.next_stage();
352	}
353
354	fn lerp_u8(p: &mut Pipeline) {
355	// Load u8xTail and cast it to f32x8.
356	let data: [u8; 2] = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
357	let c: f32x8 = f32x8::from([data[`0`] as f32, data[`1`] as f32, `0.0`, `0.0`, `0.0`, `0.0`, `0.0`, `0.0`]);
358	let c: f32x8 = c / f32x8::splat(`255.0`);
359
360	p.r = lerp(from:p.dr, to:p.r, t:c);
361	p.g = lerp(from:p.dg, to:p.g, t:c);
362	p.b = lerp(from:p.db, to:p.b, t:c);
363	p.a = lerp(from:p.da, to:p.a, t:c);
364
365	p.next_stage();
366	}
367
368	fn scale_1_float(p: &mut Pipeline) {
369	let c: f32x8 = f32x8::splat(p.ctx.current_coverage);
370	p.r *= c;
371	p.g *= c;
372	p.b *= c;
373	p.a *= c;
374
375	p.next_stage();
376	}
377
378	fn lerp_1_float(p: &mut Pipeline) {
379	let c: f32x8 = f32x8::splat(p.ctx.current_coverage);
380	p.r = lerp(from:p.dr, to:p.r, t:c);
381	p.g = lerp(from:p.dg, to:p.g, t:c);
382	p.b = lerp(from:p.db, to:p.b, t:c);
383	p.a = lerp(from:p.da, to:p.a, t:c);
384
385	p.next_stage();
386	}
387
// Defines a blend stage `$name` that applies `$f(s, d, sa, da)` to every channel,
// alpha included.
macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            // Every channel is blended with the alphas from before the blend.
            let src_alpha = p.a;
            let dst_alpha = p.da;

            p.r = $f(p.r, p.dr, src_alpha, dst_alpha);
            p.g = $f(p.g, p.dg, src_alpha, dst_alpha);
            p.b = $f(p.b, p.db, src_alpha, dst_alpha);
            p.a = $f(src_alpha, dst_alpha, src_alpha, dst_alpha);

            p.next_stage();
        }
    };
}
400
401	blend_fn!(clear, \|_, _, _, _\| f32x8::default());
402	blend_fn!(source_atop, \|s, d, sa, da\| s * da + d * inv(sa));
403	blend_fn!(destination_atop, \|s, d, sa, da\| d * sa + s * inv(da));
404	blend_fn!(source_in, \|s, _, _, da\| s * da);
405	blend_fn!(destination_in, \|_, d, sa, _\| d * sa);
406	blend_fn!(source_out, \|s, _, _, da\| s * inv(da));
407	blend_fn!(destination_out, \|_, d, sa, _\| d * inv(sa));
408	blend_fn!(source_over, \|s, d, sa, _\| mad(d, inv(sa), s));
409	blend_fn!(destination_over, \|s, d, _, da\| mad(s, inv(da), d));
410	blend_fn!(modulate, \|s, d, _, _\| s * d);
411	blend_fn!(multiply, \|s, d, sa, da\| s * inv(da) + d * inv(sa) + s * d);
412	blend_fn!(screen, \|s, d, _, _\| s + d - s * d);
413	blend_fn!(xor, \|s, d, sa, da\| s * inv(da) + d * inv(sa));
414
415	// Wants a type for some reason.
416	blend_fn!(plus, \|s: f32x8, d: f32x8, _, _\| (s + d).min(f32x8::splat(`1.0`)));
417
// Defines a blend stage `$name` that applies `$f(s, d, sa, da)` to the color channels
// and uses source-over for alpha.
macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            // Every channel is blended with the alphas from before the blend.
            let src_alpha = p.a;
            let dst_alpha = p.da;

            p.r = $f(p.r, p.dr, src_alpha, dst_alpha);
            p.g = $f(p.g, p.dg, src_alpha, dst_alpha);
            p.b = $f(p.b, p.db, src_alpha, dst_alpha);
            p.a = mad(dst_alpha, inv(src_alpha), src_alpha);

            p.next_stage();
        }
    };
}
431
432	blend_fn2!(darken, \|s: f32x8, d, sa, da: f32x8\| s + d - (s * da).max(d * sa));
433	blend_fn2!(lighten, \|s: f32x8, d, sa, da: f32x8\| s + d - (s * da).min(d * sa));
434	blend_fn2!(difference, \|s: f32x8, d, sa, da: f32x8\| s + d - two((s * da).min(d * sa)));
435	blend_fn2!(exclusion, \|s: f32x8, d, _, _\| s + d - two(s * d));
436
437	blend_fn2!(color_burn, \|s: f32x8, d: f32x8, sa: f32x8, da: f32x8\|
438	d.cmp_eq(da).blend(
439	d + s * inv(da),
440	s.cmp_eq(f32x8::default()).blend(
441	d * inv(sa),
442	sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
443	)
444	)
445	);
446
447	blend_fn2!(color_dodge, \|s: f32x8, d: f32x8, sa: f32x8, da: f32x8\|
448	d.cmp_eq(f32x8::default()).blend(
449	s * inv(da),
450	s.cmp_eq(sa).blend(
451	s + d * inv(sa),
452	sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
453	)
454	)
455	);
456
457	blend_fn2!(hard_light, \|s: f32x8, d: f32x8, sa, da\|
458	s * inv(da) + d * inv(sa) + two(s).cmp_le(sa).blend(
459	two(s * d),
460	sa * da - two((da - d) * (sa - s))
461	)
462	);
463
464	blend_fn2!(overlay, \|s: f32x8, d: f32x8, sa, da\|
465	s * inv(da) + d * inv(sa) + two(d).cmp_le(da).blend(
466	two(s * d),
467	sa * da - two((da - d) * (sa - s))
468	)
469	);
470
471	blend_fn2!(soft_light, \|s: f32x8, d: f32x8, sa: f32x8, da: f32x8\| {
472	let m = da.cmp_gt(f32x8::default()).blend(d / da, f32x8::default());
473	let s2 = two(s);
474	let m4 = two(two(m));
475
476	// The logic forks three ways:
477	// 1. dark src?
478	// 2. light src, dark dst?
479	// 3. light src, light dst?
480	let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(`1.0`) - m));
481	let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(`1.0`)) + f32x8::splat(`7.0`) * m;
482	let lite_dst = m.sqrt() - m;
483	let lite_src = d * sa + da * (s2 - sa)
484	* two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); // 2 or 3?
485
486	s * inv(da) + d * inv(sa) + s2.cmp_le(sa).blend(dark_src, lite_src) // 1 or (2 or 3)?
487	});
488
489	// We're basing our implementation of non-separable blend modes on
490	// https://www.w3.org/TR/compositing-1/#blendingnonseparable.
491	// and
492	// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
493	// They're equivalent, but ES' math has been better simplified.
494	//
495	// Anything extra we add beyond that is to make the math work with premul inputs.
496
// Defines a blend stage `$name` that hands all source and destination channels to `$f`
// at once and stores the returned (r, g, b, a) tuple as the new source.
macro_rules! blend_fn3 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            let blended = $f(p.r, p.g, p.b, p.a, p.dr, p.dg, p.db, p.da);
            p.r = blended.0;
            p.g = blended.1;
            p.b = blended.2;
            p.a = blended.3;

            p.next_stage();
        }
    };
}
510
511	blend_fn3!(hue, hue_k);
512
513	#[inline(always)]
514	fn hue_k(
515	r: f32x8, g: f32x8, b: f32x8, a: f32x8,
516	dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
517	) -> (f32x8, f32x8, f32x8, f32x8) {
518	let rr: &mut f32x8 = &mut (r * a);
519	let gg: &mut f32x8 = &mut (g * a);
520	let bb: &mut f32x8 = &mut (b * a);
521
522	set_sat(r:rr, g:gg, b:bb, s:sat(r:dr, g:dg, b:db) * a);
523	set_lum(r:rr, g:gg, b:bb, l:lum(r:dr, g:dg, b:db) * a);
524	clip_color(r:rr, g:gg, b:bb, a:a * da);
525
526	let r: f32x8 = r * inv(da) + dr * inv(a) + *rr;
527	let g: f32x8 = g * inv(da) + dg * inv(a) + *gg;
528	let b: f32x8 = b * inv(da) + db * inv(a) + *bb;
529	let a: f32x8 = a + da - a * da;
530
531	(r, g, b, a)
532	}
533
534	blend_fn3!(saturation, saturation_k);
535
536	#[inline(always)]
537	fn saturation_k(
538	r: f32x8, g: f32x8, b: f32x8, a: f32x8,
539	dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
540	) -> (f32x8, f32x8, f32x8, f32x8) {
541	let rr: &mut f32x8 = &mut (dr * a);
542	let gg: &mut f32x8 = &mut (dg * a);
543	let bb: &mut f32x8 = &mut (db * a);
544
545	set_sat(r:rr, g:gg, b:bb, s:sat(r, g, b) * da);
546	set_lum(r:rr, g:gg, b:bb, l:lum(r:dr, g:dg, b:db) * a); // (This is not redundant.)
547	clip_color(r:rr, g:gg, b:bb, a:a * da);
548
549	let r: f32x8 = r * inv(da) + dr * inv(a) + *rr;
550	let g: f32x8 = g * inv(da) + dg * inv(a) + *gg;
551	let b: f32x8 = b * inv(da) + db * inv(a) + *bb;
552	let a: f32x8 = a + da - a * da;
553
554	(r, g, b, a)
555	}
556
557	blend_fn3!(color, color_k);
558
559	#[inline(always)]
560	fn color_k(
561	r: f32x8, g: f32x8, b: f32x8, a: f32x8,
562	dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
563	) -> (f32x8, f32x8, f32x8, f32x8) {
564	let rr: &mut f32x8 = &mut (r * da);
565	let gg: &mut f32x8 = &mut (g * da);
566	let bb: &mut f32x8 = &mut (b * da);
567
568	set_lum(r:rr, g:gg, b:bb, l:lum(r:dr, g:dg, b:db) * a);
569	clip_color(r:rr, g:gg, b:bb, a:a * da);
570
571	let r: f32x8 = r * inv(da) + dr * inv(a) + *rr;
572	let g: f32x8 = g * inv(da) + dg * inv(a) + *gg;
573	let b: f32x8 = b * inv(da) + db * inv(a) + *bb;
574	let a: f32x8 = a + da - a * da;
575
576	(r, g, b, a)
577	}
578
579	blend_fn3!(luminosity, luminosity_k);
580
581	#[inline(always)]
582	fn luminosity_k(
583	r: f32x8, g: f32x8, b: f32x8, a: f32x8,
584	dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
585	) -> (f32x8, f32x8, f32x8, f32x8) {
586	let rr: &mut f32x8 = &mut (dr * a);
587	let gg: &mut f32x8 = &mut (dg * a);
588	let bb: &mut f32x8 = &mut (db * a);
589
590	set_lum(r:rr, g:gg, b:bb, l:lum(r, g, b) * da);
591	clip_color(r:rr, g:gg, b:bb, a:a * da);
592
593	let r: f32x8 = r * inv(da) + dr * inv(a) + *rr;
594	let g: f32x8 = g * inv(da) + dg * inv(a) + *gg;
595	let b: f32x8 = b * inv(da) + db * inv(a) + *bb;
596	let a: f32x8 = a + da - a * da;
597
598	(r, g, b, a)
599	}
600
601	#[inline(always)]
602	fn sat(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
603	r.max(g.max(b)) - r.min(g.min(b))
604	}
605
606	#[inline(always)]
607	fn lum(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
608	r * f32x8::splat(`0.30`) + g * f32x8::splat(`0.59`) + b * f32x8::splat(`0.11`)
609	}
610
611	#[inline(always)]
612	fn set_sat(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, s: f32x8) {
613	let mn: f32x8 = r.min(g.min(*b));
614	let mx: f32x8 = r.max(g.max(*b));
615	let sat: f32x8 = mx - mn;
616
617	// Map min channel to 0, max channel to s, and scale the middle proportionally.
618	let scale: impl Fn(f32x8) -> f32x8 = \|c: f32x8\| sat.cmp_eq(f32x8::default())
619	.blend(t:f32x8::default(), (c - mn) * s / sat);
620
621	r = scale(r);
622	g = scale(g);
623	b = scale(b);
624	}
625
626	#[inline(always)]
627	fn set_lum(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, l: f32x8) {
628	let diff: f32x8 = l - lum(r, g, *b);
629	*r += diff;
630	*g += diff;
631	*b += diff;
632	}
633
634	#[inline(always)]
635	fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) {
636	let mn: f32x8 = r.min(g.min(*b));
637	let mx: f32x8 = r.max(g.max(*b));
638	let l: f32x8 = lum(r, g, *b);
639
640	let clip: impl Fn(f32x8) -> f32x8 = \|mut c: f32x8\| {
641	c = mx.cmp_ge(f32x8::default()).blend(t:c, f:l + (c - l) * l / (l - mn));
642	c = mx.cmp_gt(a).blend(t:l + (c - l) * (a - l) / (mx - l), f:c);
643	c = c.max(f32x8::default()); // Sometimes without this we may dip just a little negative.
644	c
645	};
646
647	r = clip(r);
648	g = clip(g);
649	b = clip(b);
650	}
651
652	pub fn source_over_rgba(p: &mut Pipeline) {
653	let pixels: &mut [PremultipliedColorU8; 8] = p.pixmap_dst.slice4_at_xy(p.dx, p.dy);
654	load_8888(data:pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
655	p.r = mad(f:p.dr, m:inv(p.a), a:p.r);
656	p.g = mad(f:p.dg, m:inv(p.a), a:p.g);
657	p.b = mad(f:p.db, m:inv(p.a), a:p.b);
658	p.a = mad(f:p.da, m:inv(p.a), p.a);
659	store_8888(&p.r, &p.g, &p.b, &p.a, data:pixels);
660
661	p.next_stage();
662	}
663
664	pub fn source_over_rgba_tail(p: &mut Pipeline) {
665	let pixels: &mut [PremultipliedColorU8] = p.pixmap_dst.slice_at_xy(p.dx, p.dy);
666	load_8888_tail(p.tail, data:pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
667	p.r = mad(f:p.dr, m:inv(p.a), a:p.r);
668	p.g = mad(f:p.dg, m:inv(p.a), a:p.g);
669	p.b = mad(f:p.db, m:inv(p.a), a:p.b);
670	p.a = mad(f:p.da, m:inv(p.a), p.a);
671	store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, data:pixels);
672
673	p.next_stage();
674	}
675
676	fn transform(p: &mut Pipeline) {
677	let ts: &Transform = &p.ctx.transform;
678
679	let tr: f32x8 = mad(f:p.r, m:f32x8::splat(ts.sx), a:mad(f:p.g, m:f32x8::splat(ts.kx), a:f32x8::splat(ts.tx)));
680	let tg: f32x8 = mad(f:p.r, m:f32x8::splat(ts.ky), a:mad(f:p.g, m:f32x8::splat(ts.sy), a:f32x8::splat(ts.ty)));
681	p.r = tr;
682	p.g = tg;
683
684	p.next_stage();
685	}
686
687	// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
688	// The gather stages will hard clamp the output of these stages to [0,limit)...
689	// we just need to do the basic repeat or mirroring.
690
691	fn reflect(p: &mut Pipeline) {
692	let ctx: &TileCtx = &p.ctx.limit_x;
693	p.r = exclusive_reflect(v:p.r, limit:ctx.scale, inv_limit:ctx.inv_scale);
694
695	let ctx: &TileCtx = &p.ctx.limit_y;
696	p.g = exclusive_reflect(v:p.g, limit:ctx.scale, inv_limit:ctx.inv_scale);
697
698	p.next_stage();
699	}
700
701	#[inline(always)]
702	fn exclusive_reflect(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
703	let limit: f32x8 = f32x8::splat(limit);
704	let inv_limit: f32x8 = f32x8::splat(inv_limit);
705	((v - limit) - (limit + limit)
706	* ((v - limit) * (inv_limit * f32x8::splat(`0.5`))).floor() - limit).abs()
707	}
708
709	fn repeat(p: &mut Pipeline) {
710	let ctx: &TileCtx = &p.ctx.limit_x;
711	p.r = exclusive_repeat(v:p.r, limit:ctx.scale, inv_limit:ctx.inv_scale);
712
713	let ctx: &TileCtx = &p.ctx.limit_y;
714	p.g = exclusive_repeat(v:p.g, limit:ctx.scale, inv_limit:ctx.inv_scale);
715
716	p.next_stage();
717	}
718
719	#[inline(always)]
720	fn exclusive_repeat(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
721	v - (v * f32x8::splat(inv_limit)).floor() * f32x8::splat(limit)
722	}
723
724	fn bilinear(p: &mut Pipeline) {
725	let x: f32x8 = p.r;
726	let fx: f32x8 = (x + f32x8::splat(`0.5`)).fract();
727	let y: f32x8 = p.g;
728	let fy: f32x8 = (y + f32x8::splat(`0.5`)).fract();
729	let one: f32x8 = f32x8::splat(`1.0`);
730	let wx: [f32x8; 2] = [one - fx, fx];
731	let wy: [f32x8; 2] = [one - fy, fy];
732
733	sampler_2x2(p.pixmap_src, &p.ctx.sampler, cx:x, cy:y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
734
735	p.next_stage();
736	}
737
738	fn bicubic(p: &mut Pipeline) {
739	let x: f32x8 = p.r;
740	let fx: f32x8 = (x + f32x8::splat(`0.5`)).fract();
741	let y: f32x8 = p.g;
742	let fy: f32x8 = (y + f32x8::splat(`0.5`)).fract();
743	let one: f32x8 = f32x8::splat(`1.0`);
744	let wx: [f32x8; 4] = [bicubic_far(one - fx), bicubic_near(one - fx), bicubic_near(fx), bicubic_far(fx)];
745	let wy: [f32x8; 4] = [bicubic_far(one - fy), bicubic_near(one - fy), bicubic_near(fy), bicubic_far(fy)];
746
747	sampler_4x4(p.pixmap_src, &p.ctx.sampler, cx:x, cy:y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
748
749	p.next_stage();
750	}
751
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
//
// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
756
757	#[inline(always)]
758	fn bicubic_near(t: f32x8) -> f32x8 {
759	// 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
760	mad(
761	f:t,
762	m:mad(t,
763	mad(
764	f32x8::splat(`-21.0`/`18.0`),
765	t,
766	f32x8::splat(`27.0`/`18.0`),
767	),
768	f32x8::splat(`9.0`/`18.0`),
769	),
770	a:f32x8::splat(`1.0`/`18.0`),
771	)
772	}
773
774	#[inline(always)]
775	fn bicubic_far(t: f32x8) -> f32x8 {
776	// 0/18 + 0/18t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)*
777	(t * t) * mad(f:f32x8::splat(`7.0`/`18.0`), m:t, a:f32x8::splat(`-6.0`/`18.0`))
778	}
779
780	#[inline(always)]
781	fn sampler_2x2(
782	pixmap: PixmapRef,
783	ctx: &super::SamplerCtx,
784	cx: f32x8, cy: f32x8,
785	wx: &[f32x8; `2`], wy: &[f32x8; `2`],
786	r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
787	) {
788	*r = f32x8::default();
789	*g = f32x8::default();
790	*b = f32x8::default();
791	*a = f32x8::default();
792
793	let one = f32x8::splat(`1.0`);
794	let start = `-0.5`;
795	let mut y = cy + f32x8::splat(start);
796	for j in `0`..`2` {
797	let mut x = cx + f32x8::splat(start);
798	for i in `0`..`2` {
799	let mut rr = f32x8::default();
800	let mut gg = f32x8::default();
801	let mut bb = f32x8::default();
802	let mut aa = f32x8::default();
803	sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa);
804
805	let w = wx[i] * wy[j];
806	r = mad(w, rr, r);
807	g = mad(w, gg, g);
808	b = mad(w, bb, b);
809	a = mad(w, aa, a);
810
811	x += one;
812	}
813
814	y += one;
815	}
816	}
817
818	#[inline(always)]
819	fn sampler_4x4(
820	pixmap: PixmapRef,
821	ctx: &super::SamplerCtx,
822	cx: f32x8, cy: f32x8,
823	wx: &[f32x8; `4`], wy: &[f32x8; `4`],
824	r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
825	) {
826	*r = f32x8::default();
827	*g = f32x8::default();
828	*b = f32x8::default();
829	*a = f32x8::default();
830
831	let one = f32x8::splat(`1.0`);
832	let start = `-1.5`;
833	let mut y = cy + f32x8::splat(start);
834	for j in `0`..`4` {
835	let mut x = cx + f32x8::splat(start);
836	for i in `0`..`4` {
837	let mut rr = f32x8::default();
838	let mut gg = f32x8::default();
839	let mut bb = f32x8::default();
840	let mut aa = f32x8::default();
841	sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa);
842
843	let w = wx[i] * wy[j];
844	r = mad(w, rr, r);
845	g = mad(w, gg, g);
846	b = mad(w, bb, b);
847	a = mad(w, aa, a);
848
849	x += one;
850	}
851
852	y += one;
853	}
854	}
855
856	#[inline(always)]
857	fn sample(
858	pixmap: PixmapRef, ctx: &super::SamplerCtx, mut x: f32x8, mut y: f32x8,
859	r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
860	) {
861	x = tile(v:x, ctx.spread_mode, limit:pixmap.width() as f32, inv_limit:ctx.inv_width);
862	y = tile(v:y, ctx.spread_mode, limit:pixmap.height() as f32, inv_limit:ctx.inv_height);
863
864	let ix: u32x8 = gather_ix(pixmap, x, y);
865	load_8888(&pixmap.gather(index:ix), r, g, b, a);
866	}
867
868	#[inline(always)]
869	fn tile(v: f32x8, mode: SpreadMode, limit: f32, inv_limit: f32) -> f32x8 {
870	match mode {
871	SpreadMode::Pad => v,
872	SpreadMode::Repeat => exclusive_repeat(v, limit, inv_limit),
873	SpreadMode::Reflect => exclusive_reflect(v, limit, inv_limit),
874	}
875	}
876
877	fn pad_x1(p: &mut Pipeline) {
878	p.r = p.r.normalize();
879
880	p.next_stage();
881	}
882
883	fn reflect_x1(p: &mut Pipeline) {
884	p.r = (
885	(p.r - f32x8::splat(`1.0`))
886	- two(((p.r - f32x8::splat(`1.0`)) * f32x8::splat(`0.5`)).floor())
887	- f32x8::splat(`1.0`)
888	).abs().normalize();
889
890	p.next_stage();
891	}
892
893	fn repeat_x1(p: &mut Pipeline) {
894	p.r = (p.r - p.r.floor()).normalize();
895
896	p.next_stage();
897	}
898
899	fn gradient(p: &mut Pipeline) {
900	let ctx: &GradientCtx = &p.ctx.gradient;
901
902	// N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
903	let t: [f32; `8`] = p.r.into();
904	let mut idx: u32x8 = u32x8::default();
905	for i: usize in `1`..ctx.len {
906	let tt: f32 = ctx.t_values[i].get();
907	let n: u32x8 = bytemuck::cast([
908	(t[`0`] >= tt) as u32,
909	(t[`1`] >= tt) as u32,
910	(t[`2`] >= tt) as u32,
911	(t[`3`] >= tt) as u32,
912	(t[`4`] >= tt) as u32,
913	(t[`5`] >= tt) as u32,
914	(t[`6`] >= tt) as u32,
915	(t[`7`] >= tt) as u32,
916	]);
917	idx = idx + n;
918	}
919	gradient_lookup(ctx, &idx, t:p.r, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
920
921	p.next_stage();
922	}
923
924	fn gradient_lookup(
925	ctx: &super::GradientCtx, idx: &u32x8, t: f32x8,
926	r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
927	) {
928	let idx: [u32; `8`] = bytemuck::cast(*idx);
929
930	macro_rules! gather {
931	($d:expr, $c:ident) => {
932	// Surprisingly, but bound checking doesn't affect the performance.
933	// And since `idx` can contain any number, we should leave it in place.
934	f32x8::from([
935	$d[idx[`0`] as usize].$c,
936	$d[idx[`1`] as usize].$c,
937	$d[idx[`2`] as usize].$c,
938	$d[idx[`3`] as usize].$c,
939	$d[idx[`4`] as usize].$c,
940	$d[idx[`5`] as usize].$c,
941	$d[idx[`6`] as usize].$c,
942	$d[idx[`7`] as usize].$c,
943	])
944	};
945	}
946
947	let fr = gather!(&ctx.factors, r);
948	let fg = gather!(&ctx.factors, g);
949	let fb = gather!(&ctx.factors, b);
950	let fa = gather!(&ctx.factors, a);
951
952	let br = gather!(&ctx.biases, r);
953	let bg = gather!(&ctx.biases, g);
954	let bb = gather!(&ctx.biases, b);
955	let ba = gather!(&ctx.biases, a);
956
957	*r = mad(t, fr, br);
958	*g = mad(t, fg, bg);
959	*b = mad(t, fb, bb);
960	*a = mad(t, fa, ba);
961	}
962
963	fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
964	let ctx: &EvenlySpaced2StopGradientCtx = &p.ctx.evenly_spaced_2_stop_gradient;
965
966	let t: f32x8 = p.r;
967	p.r = mad(f:t, m:f32x8::splat(ctx.factor.r), a:f32x8::splat(ctx.bias.r));
968	p.g = mad(f:t, m:f32x8::splat(ctx.factor.g), a:f32x8::splat(ctx.bias.g));
969	p.b = mad(f:t, m:f32x8::splat(ctx.factor.b), a:f32x8::splat(ctx.bias.b));
970	p.a = mad(f:t, m:f32x8::splat(ctx.factor.a), a:f32x8::splat(ctx.bias.a));
971
972	p.next_stage();
973	}
974
975	fn xy_to_radius(p: &mut Pipeline) {
976	let x2: f32x8 = p.r * p.r;
977	let y2: f32x8 = p.g * p.g;
978	p.r = (x2 + y2).sqrt();
979
980	p.next_stage();
981	}
982
983	fn xy_to_2pt_conical_focal_on_circle(p: &mut Pipeline) {
984	let x: f32x8 = p.r;
985	let y: f32x8 = p.g;
986	p.r = x + y * y / x;
987
988	p.next_stage();
989	}
990
991	fn xy_to_2pt_conical_well_behaved(p: &mut Pipeline) {
992	let ctx: &TwoPointConicalGradientCtx = &p.ctx.two_point_conical_gradient;
993
994	let x: f32x8 = p.r;
995	let y: f32x8 = p.g;
996	p.r = (x * x + y * y).sqrt() - x * f32x8::splat(ctx.p0);
997
998	p.next_stage();
999	}
1000
1001	fn xy_to_2pt_conical_greater(p: &mut Pipeline) {
1002	let ctx: &TwoPointConicalGradientCtx = &p.ctx.two_point_conical_gradient;
1003
1004	let x: f32x8 = p.r;
1005	let y: f32x8 = p.g;
1006	p.r = (x * x - y * y).sqrt() - x * f32x8::splat(ctx.p0);
1007
1008	p.next_stage();
1009	}
1010
1011	fn mask_2pt_conical_degenerates(p: &mut Pipeline) {
1012	let ctx: &mut TwoPointConicalGradientCtx = &mut p.ctx.two_point_conical_gradient;
1013
1014	let t: f32x8 = p.r;
1015	let is_degenerate: f32x8 = t.cmp_le(f32x8::default()) \| t.cmp_ne(t);
1016	p.r = is_degenerate.blend(t:f32x8::default(), f:t);
1017
1018	let is_not_degenerate: u32x8 = !is_degenerate.to_u32x8_bitcast();
1019	let is_not_degenerate: [u32; `8`] = bytemuck::cast(is_not_degenerate);
1020	ctx.mask = bytemuck::cast([
1021	if is_not_degenerate[`0`] != `0` { !`0` } else { `0` },
1022	if is_not_degenerate[`1`] != `0` { !`0` } else { `0` },
1023	if is_not_degenerate[`2`] != `0` { !`0` } else { `0` },
1024	if is_not_degenerate[`3`] != `0` { !`0` } else { `0` },
1025	if is_not_degenerate[`4`] != `0` { !`0` } else { `0` },
1026	if is_not_degenerate[`5`] != `0` { !`0` } else { `0` },
1027	if is_not_degenerate[`6`] != `0` { !`0` } else { `0` },
1028	if is_not_degenerate[`7`] != `0` { !`0` } else { `0` },
1029	]);
1030
1031	p.next_stage();
1032	}
1033
1034	fn apply_vector_mask(p: &mut Pipeline) {
1035	let ctx: &TwoPointConicalGradientCtx = &p.ctx.two_point_conical_gradient;
1036
1037	p.r = (p.r.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1038	p.g = (p.g.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1039	p.b = (p.b.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1040	p.a = (p.a.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1041
1042	p.next_stage();
1043	}
1044
1045	pub fn just_return(_: &mut Pipeline) {
1046	// Ends the loop.
1047	}
1048
1049	#[inline(always)]
1050	fn load_8888(
1051	data: &[PremultipliedColorU8; STAGE_WIDTH],
1052	r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
1053	) {
1054	// Surprisingly, `f32 FACTOR` is way faster than `f32x8 * f32x8::splat(FACTOR)`.*
1055
1056	const FACTOR: f32 = `1.0` / `255.0`;
1057
1058	*r = f32x8::from([
1059	data[`0`].red() as f32 * FACTOR, data[`1`].red() as f32 * FACTOR,
1060	data[`2`].red() as f32 * FACTOR, data[`3`].red() as f32 * FACTOR,
1061	data[`4`].red() as f32 * FACTOR, data[`5`].red() as f32 * FACTOR,
1062	data[`6`].red() as f32 * FACTOR, data[`7`].red() as f32 * FACTOR,
1063	]);
1064
1065	*g = f32x8::from([
1066	data[`0`].green() as f32 * FACTOR, data[`1`].green() as f32 * FACTOR,
1067	data[`2`].green() as f32 * FACTOR, data[`3`].green() as f32 * FACTOR,
1068	data[`4`].green() as f32 * FACTOR, data[`5`].green() as f32 * FACTOR,
1069	data[`6`].green() as f32 * FACTOR, data[`7`].green() as f32 * FACTOR,
1070	]);
1071
1072	*b = f32x8::from([
1073	data[`0`].blue() as f32 * FACTOR, data[`1`].blue() as f32 * FACTOR,
1074	data[`2`].blue() as f32 * FACTOR, data[`3`].blue() as f32 * FACTOR,
1075	data[`4`].blue() as f32 * FACTOR, data[`5`].blue() as f32 * FACTOR,
1076	data[`6`].blue() as f32 * FACTOR, data[`7`].blue() as f32 * FACTOR,
1077	]);
1078
1079	*a = f32x8::from([
1080	data[`0`].alpha() as f32 * FACTOR, data[`1`].alpha() as f32 * FACTOR,
1081	data[`2`].alpha() as f32 * FACTOR, data[`3`].alpha() as f32 * FACTOR,
1082	data[`4`].alpha() as f32 * FACTOR, data[`5`].alpha() as f32 * FACTOR,
1083	data[`6`].alpha() as f32 * FACTOR, data[`7`].alpha() as f32 * FACTOR,
1084	]);
1085	}
1086
1087	#[inline(always)]
1088	fn load_8888_tail(
1089	tail: usize, data: &[PremultipliedColorU8],
1090	r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
1091	) {
1092	// Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
1093	// This way we can reuse the `load_8888_` method and remove any branches.
1094	let mut tmp: [PremultipliedColorU8; 8] = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
1095	tmp[`0`..tail].copy_from_slice(&data[`0`..tail]);
1096	load_8888(&tmp, r, g, b, a);
1097	}
1098
1099	#[inline(always)]
1100	fn store_8888(
1101	r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
1102	data: &mut [PremultipliedColorU8; STAGE_WIDTH],
1103	) {
1104	let r: [i32; `8`] = unnorm(r).into();
1105	let g: [i32; `8`] = unnorm(g).into();
1106	let b: [i32; `8`] = unnorm(b).into();
1107	let a: [i32; `8`] = unnorm(a).into();
1108
1109	let conv: impl Fn(i32, i32, i32, i32) -> … = \|rr: i32, gg: i32, bb: i32, aa: i32\|
1110	PremultipliedColorU8::from_rgba_unchecked(r:rr as u8, g:gg as u8, b:bb as u8, a:aa as u8);
1111
1112	data[`0`] = conv(rr:r[`0`], gg:g[`0`], bb:b[`0`], aa:a[`0`]);
1113	data[`1`] = conv(rr:r[`1`], gg:g[`1`], bb:b[`1`], aa:a[`1`]);
1114	data[`2`] = conv(rr:r[`2`], gg:g[`2`], bb:b[`2`], aa:a[`2`]);
1115	data[`3`] = conv(rr:r[`3`], gg:g[`3`], bb:b[`3`], aa:a[`3`]);
1116	data[`4`] = conv(rr:r[`4`], gg:g[`4`], bb:b[`4`], aa:a[`4`]);
1117	data[`5`] = conv(rr:r[`5`], gg:g[`5`], bb:b[`5`], aa:a[`5`]);
1118	data[`6`] = conv(rr:r[`6`], gg:g[`6`], bb:b[`6`], aa:a[`6`]);
1119	data[`7`] = conv(rr:r[`7`], gg:g[`7`], bb:b[`7`], aa:a[`7`]);
1120	}
1121
1122	#[inline(always)]
1123	fn store_8888_tail(
1124	r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
1125	tail: usize, data: &mut [PremultipliedColorU8],
1126	) {
1127	let r: [i32; `8`] = unnorm(r).into();
1128	let g: [i32; `8`] = unnorm(g).into();
1129	let b: [i32; `8`] = unnorm(b).into();
1130	let a: [i32; `8`] = unnorm(a).into();
1131
1132	// This is better than `for i in 0..tail`, because this way the compiler
1133	// knows that we have only 4 steps and slices access is guarantee to be valid.
1134	// This removes bounds checking and a possible panic call.
1135	for i: usize in `0`..STAGE_WIDTH {
1136	data[i] = PremultipliedColorU8::from_rgba_unchecked(
1137	r:r[i] as u8, g:g[i] as u8, b:b[i] as u8, a:a[i] as u8,
1138	);
1139
1140	if i + `1` == tail {
1141	break;
1142	}
1143	}
1144	}
1145
1146	#[inline(always)]
1147	fn unnorm(v: &f32x8) -> i32x8 {
1148	(v.max(f32x8::default()).min(f32x8::splat(`1.0`)) * f32x8::splat(`255.0`)).round_int()
1149	}
1150
1151	#[inline(always)]
1152	fn inv(v: f32x8) -> f32x8 {
1153	f32x8::splat(`1.0`) - v
1154	}
1155
1156	#[inline(always)]
1157	fn two(v: f32x8) -> f32x8 {
1158	v + v
1159	}
1160
1161	#[inline(always)]
1162	fn mad(f: f32x8, m: f32x8, a: f32x8) -> f32x8 {
1163	f * m + a
1164	}
1165
1166	#[inline(always)]
1167	fn lerp(from: f32x8, to: f32x8, t: f32x8) -> f32x8 {
1168	mad(f:to - from, m:t, a:from)
1169	}
1170