1// Copyright 2018 Google Inc.
2// Copyright 2020 Yevhenii Reizner
3//
4// Use of this source code is governed by a BSD-style license that can be
5// found in the LICENSE file.
6
7/*!
8A high precision raster pipeline implementation.
9
10Unlike lowp, this one implements all stages.
11
12Just like Skia, this pipeline is implemented using f32x8.
13
14For some reason, we are almost 2x slower. Maybe because Skia uses clang's vector extensions
15and we're using a manual implementation.
16*/
17
18use crate::{PremultipliedColorU8, SpreadMode, PixmapRef};
19
20use crate::geom::ScreenIntRect;
21use crate::pixmap::SubPixmapMut;
22use crate::wide::{f32x8, i32x8, u32x8};
23
24pub const STAGE_WIDTH: usize = 8;
25
26pub type StageFn = fn(p: &mut Pipeline);
27
28pub struct Pipeline<'a, 'b: 'a> {
29 index: usize,
30 functions: &'a [StageFn],
31 pixmap_src: PixmapRef<'a>,
32 pixmap_dst: &'a mut SubPixmapMut<'b>,
33 ctx: &'a mut super::Context, // TODO: remove mut
34 mask_ctx: super::MaskCtx<'a>,
35 aa_mask_ctx: super::AAMaskCtx,
36 r: f32x8,
37 g: f32x8,
38 b: f32x8,
39 a: f32x8,
40 dr: f32x8,
41 dg: f32x8,
42 db: f32x8,
43 da: f32x8,
44 tail: usize,
45 dx: usize,
46 dy: usize,
47}
48
49impl Pipeline<'_, '_> {
50 #[inline(always)]
51 fn next_stage(&mut self) {
52 let next: fn(&mut Self) = self.functions[self.index];
53 self.index += 1;
54 next(self);
55 }
56}
57
58// Must be in the same order as raster_pipeline::Stage
59pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
60 move_source_to_destination,
61 move_destination_to_source,
62 clamp_0,
63 clamp_a,
64 premultiply,
65 uniform_color,
66 seed_shader,
67 load_dst,
68 store,
69 load_dst_u8,
70 store_u8,
71 gather,
72 load_mask_u8,
73 mask_u8,
74 scale_u8,
75 lerp_u8,
76 scale_1_float,
77 lerp_1_float,
78 destination_atop,
79 destination_in,
80 destination_out,
81 destination_over,
82 source_atop,
83 source_in,
84 source_out,
85 source_over,
86 clear,
87 modulate,
88 multiply,
89 plus,
90 screen,
91 xor,
92 color_burn,
93 color_dodge,
94 darken,
95 difference,
96 exclusion,
97 hard_light,
98 lighten,
99 overlay,
100 soft_light,
101 hue,
102 saturation,
103 color,
104 luminosity,
105 source_over_rgba,
106 transform,
107 reflect,
108 repeat,
109 bilinear,
110 bicubic,
111 pad_x1,
112 reflect_x1,
113 repeat_x1,
114 gradient,
115 evenly_spaced_2_stop_gradient,
116 xy_to_radius,
117 xy_to_2pt_conical_focal_on_circle,
118 xy_to_2pt_conical_well_behaved,
119 xy_to_2pt_conical_greater,
120 mask_2pt_conical_degenerates,
121 apply_vector_mask,
122];
123
124pub fn fn_ptr(f: StageFn) -> *const () {
125 f as *const ()
126}
127
128#[inline(never)]
129pub fn start(
130 functions: &[StageFn],
131 functions_tail: &[StageFn],
132 rect: &ScreenIntRect,
133 aa_mask_ctx: super::AAMaskCtx,
134 mask_ctx: super::MaskCtx,
135 ctx: &mut super::Context,
136 pixmap_src: PixmapRef,
137 pixmap_dst: &mut SubPixmapMut,
138) {
139 let mut p = Pipeline {
140 index: 0,
141 functions: &[],
142 pixmap_src,
143 pixmap_dst,
144 mask_ctx,
145 aa_mask_ctx,
146 ctx,
147 r: f32x8::default(),
148 g: f32x8::default(),
149 b: f32x8::default(),
150 a: f32x8::default(),
151 dr: f32x8::default(),
152 dg: f32x8::default(),
153 db: f32x8::default(),
154 da: f32x8::default(),
155 tail: 0,
156 dx: 0,
157 dy: 0,
158 };
159
160 for y in rect.y()..rect.bottom() {
161 let mut x = rect.x() as usize;
162 let end = rect.right() as usize;
163
164 p.functions = functions;
165 while x + STAGE_WIDTH <= end {
166 p.index = 0;
167 p.dx = x;
168 p.dy = y as usize;
169 p.tail = STAGE_WIDTH;
170 p.next_stage();
171 x += STAGE_WIDTH;
172 }
173
174 if x != end {
175 p.index = 0;
176 p.functions = functions_tail;
177 p.dx = x;
178 p.dy = y as usize;
179 p.tail = end - x;
180 p.next_stage();
181 }
182 }
183}
184
185fn move_source_to_destination(p: &mut Pipeline) {
186 p.dr = p.r;
187 p.dg = p.g;
188 p.db = p.b;
189 p.da = p.a;
190
191 p.next_stage();
192}
193
194fn premultiply(p: &mut Pipeline) {
195 p.r *= p.a;
196 p.g *= p.a;
197 p.b *= p.a;
198
199 p.next_stage();
200}
201
202fn move_destination_to_source(p: &mut Pipeline) {
203 p.r = p.dr;
204 p.g = p.dg;
205 p.b = p.db;
206 p.a = p.da;
207
208 p.next_stage();
209}
210
211fn clamp_0(p: &mut Pipeline) {
212 p.r = p.r.max(f32x8::default());
213 p.g = p.g.max(f32x8::default());
214 p.b = p.b.max(f32x8::default());
215 p.a = p.a.max(f32x8::default());
216
217 p.next_stage();
218}
219
220fn clamp_a(p: &mut Pipeline) {
221 p.r = p.r.min(f32x8::splat(1.0));
222 p.g = p.g.min(f32x8::splat(1.0));
223 p.b = p.b.min(f32x8::splat(1.0));
224 p.a = p.a.min(f32x8::splat(1.0));
225
226 p.next_stage();
227}
228
229fn uniform_color(p: &mut Pipeline) {
230 let ctx: &UniformColorCtx = &p.ctx.uniform_color;
231 p.r = f32x8::splat(ctx.r);
232 p.g = f32x8::splat(ctx.g);
233 p.b = f32x8::splat(ctx.b);
234 p.a = f32x8::splat(ctx.a);
235
236 p.next_stage();
237}
238
239fn seed_shader(p: &mut Pipeline) {
240 let iota: f32x8 = f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]);
241
242 p.r = f32x8::splat(p.dx as f32) + iota;
243 p.g = f32x8::splat(p.dy as f32 + 0.5);
244 p.b = f32x8::splat(1.0);
245 p.a = f32x8::default();
246
247 p.dr = f32x8::default();
248 p.dg = f32x8::default();
249 p.db = f32x8::default();
250 p.da = f32x8::default();
251
252 p.next_stage();
253}
254
255pub fn load_dst(p: &mut Pipeline) {
256 load_8888(data:p.pixmap_dst.slice4_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
257 p.next_stage();
258}
259
260pub fn load_dst_tail(p: &mut Pipeline) {
261 load_8888_tail(p.tail, data:p.pixmap_dst.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
262 p.next_stage();
263}
264
265pub fn store(p: &mut Pipeline) {
266 store_8888(&p.r, &p.g, &p.b, &p.a, data:p.pixmap_dst.slice4_at_xy(p.dx, p.dy));
267 p.next_stage();
268}
269
270pub fn store_tail(p: &mut Pipeline) {
271 store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, data:p.pixmap_dst.slice_at_xy(p.dx, p.dy));
272 p.next_stage();
273}
274
275// Currently, all mask/A8 pixmaps are handled by lowp.
276pub fn load_dst_u8(_: &mut Pipeline) {
277 // unreachable
278}
279
280pub fn load_dst_u8_tail(_: &mut Pipeline) {
281 // unreachable
282}
283
284pub fn store_u8(_: &mut Pipeline) {
285 // unreachable
286}
287
288pub fn store_u8_tail(_: &mut Pipeline) {
289 // unreachable
290}
291
292pub fn gather(p: &mut Pipeline) {
293 let ix: u32x8 = gather_ix(p.pixmap_src, x:p.r, y:p.g);
294 load_8888(&p.pixmap_src.gather(index:ix), &mut p.r, &mut p.g, &mut p.b, &mut p.a);
295
296 p.next_stage();
297}
298
299#[inline(always)]
300fn gather_ix(pixmap: PixmapRef, mut x: f32x8, mut y: f32x8) -> u32x8 {
301 // Exclusive -> inclusive.
302 let w: f32 = ulp_sub(pixmap.width() as f32);
303 let h: f32 = ulp_sub(pixmap.height() as f32);
304 x = x.max(f32x8::default()).min(f32x8::splat(w));
305 y = y.max(f32x8::default()).min(f32x8::splat(h));
306
307 (y.trunc_int() * i32x8::splat(pixmap.width() as i32) + x.trunc_int()).to_u32x8_bitcast()
308}
309
/// Returns the largest float below `v` by decrementing its bit pattern by one.
///
/// Intended for positive, finite `v` (a pixmap dimension). A bit pattern of zero
/// (`+0.0`) is returned unchanged instead of wrapping around.
#[inline(always)]
fn ulp_sub(v: f32) -> f32 {
    // Somewhat similar to v - f32::EPSILON, but exactly one representable step.
    f32::from_bits(v.to_bits().saturating_sub(1))
}
315
316fn load_mask_u8(_: &mut Pipeline) {
317 // unreachable
318}
319
320fn mask_u8(p: &mut Pipeline) {
321 let offset: usize = p.mask_ctx.offset(p.dx, p.dy);
322 let mut c: [f32; 8] = [0.0; 8];
323 for i: usize in 0..p.tail {
324 c[i] = p.mask_ctx.data[offset + i] as f32;
325 }
326 let c: f32x8 = f32x8::from(c) / f32x8::splat(255.0);
327
328 if c == f32x8::default() {
329 return;
330 }
331
332 p.r *= c;
333 p.g *= c;
334 p.b *= c;
335 p.a *= c;
336
337 p.next_stage();
338}
339
340fn scale_u8(p: &mut Pipeline) {
341 // Load u8xTail and cast it to f32x8.
342 let data: [u8; 2] = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
343 let c: f32x8 = f32x8::from([data[0] as f32, data[1] as f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
344 let c: f32x8 = c / f32x8::splat(255.0);
345
346 p.r *= c;
347 p.g *= c;
348 p.b *= c;
349 p.a *= c;
350
351 p.next_stage();
352}
353
354fn lerp_u8(p: &mut Pipeline) {
355 // Load u8xTail and cast it to f32x8.
356 let data: [u8; 2] = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
357 let c: f32x8 = f32x8::from([data[0] as f32, data[1] as f32, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]);
358 let c: f32x8 = c / f32x8::splat(255.0);
359
360 p.r = lerp(from:p.dr, to:p.r, t:c);
361 p.g = lerp(from:p.dg, to:p.g, t:c);
362 p.b = lerp(from:p.db, to:p.b, t:c);
363 p.a = lerp(from:p.da, to:p.a, t:c);
364
365 p.next_stage();
366}
367
368fn scale_1_float(p: &mut Pipeline) {
369 let c: f32x8 = f32x8::splat(p.ctx.current_coverage);
370 p.r *= c;
371 p.g *= c;
372 p.b *= c;
373 p.a *= c;
374
375 p.next_stage();
376}
377
378fn lerp_1_float(p: &mut Pipeline) {
379 let c: f32x8 = f32x8::splat(p.ctx.current_coverage);
380 p.r = lerp(from:p.dr, to:p.r, t:c);
381 p.g = lerp(from:p.dg, to:p.g, t:c);
382 p.b = lerp(from:p.db, to:p.b, t:c);
383 p.a = lerp(from:p.da, to:p.a, t:c);
384
385 p.next_stage();
386}
387
388macro_rules! blend_fn {
389 ($name:ident, $f:expr) => {
390 fn $name(p: &mut Pipeline) {
391 p.r = $f(p.r, p.dr, p.a, p.da);
392 p.g = $f(p.g, p.dg, p.a, p.da);
393 p.b = $f(p.b, p.db, p.a, p.da);
394 p.a = $f(p.a, p.da, p.a, p.da);
395
396 p.next_stage();
397 }
398 };
399}
400
401blend_fn!(clear, |_, _, _, _| f32x8::default());
402blend_fn!(source_atop, |s, d, sa, da| s * da + d * inv(sa));
403blend_fn!(destination_atop, |s, d, sa, da| d * sa + s * inv(da));
404blend_fn!(source_in, |s, _, _, da| s * da);
405blend_fn!(destination_in, |_, d, sa, _| d * sa);
406blend_fn!(source_out, |s, _, _, da| s * inv(da));
407blend_fn!(destination_out, |_, d, sa, _| d * inv(sa));
408blend_fn!(source_over, |s, d, sa, _| mad(d, inv(sa), s));
409blend_fn!(destination_over, |s, d, _, da| mad(s, inv(da), d));
410blend_fn!(modulate, |s, d, _, _| s * d);
411blend_fn!(multiply, |s, d, sa, da| s * inv(da) + d * inv(sa) + s * d);
412blend_fn!(screen, |s, d, _, _| s + d - s * d);
413blend_fn!(xor, |s, d, sa, da| s * inv(da) + d * inv(sa));
414
415// Wants a type for some reason.
416blend_fn!(plus, |s: f32x8, d: f32x8, _, _| (s + d).min(f32x8::splat(1.0)));
417
418macro_rules! blend_fn2 {
419 ($name:ident, $f:expr) => {
420 fn $name(p: &mut Pipeline) {
421 // The same logic applied to color, and source_over for alpha.
422 p.r = $f(p.r, p.dr, p.a, p.da);
423 p.g = $f(p.g, p.dg, p.a, p.da);
424 p.b = $f(p.b, p.db, p.a, p.da);
425 p.a = mad(p.da, inv(p.a), p.a);
426
427 p.next_stage();
428 }
429 };
430}
431
432blend_fn2!(darken, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).max(d * sa));
433blend_fn2!(lighten, |s: f32x8, d, sa, da: f32x8| s + d - (s * da).min(d * sa));
434blend_fn2!(difference, |s: f32x8, d, sa, da: f32x8| s + d - two((s * da).min(d * sa)));
435blend_fn2!(exclusion, |s: f32x8, d, _, _| s + d - two(s * d));
436
437blend_fn2!(color_burn, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
438 d.cmp_eq(da).blend(
439 d + s * inv(da),
440 s.cmp_eq(f32x8::default()).blend(
441 d * inv(sa),
442 sa * (da - da.min((da - d) * sa * s.recip_fast())) + s * inv(da) + d * inv(sa)
443 )
444 )
445);
446
447blend_fn2!(color_dodge, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8|
448 d.cmp_eq(f32x8::default()).blend(
449 s * inv(da),
450 s.cmp_eq(sa).blend(
451 s + d * inv(sa),
452 sa * da.min((d * sa) * (sa - s).recip_fast()) + s * inv(da) + d * inv(sa)
453 )
454 )
455);
456
457blend_fn2!(hard_light, |s: f32x8, d: f32x8, sa, da|
458 s * inv(da) + d * inv(sa) + two(s).cmp_le(sa).blend(
459 two(s * d),
460 sa * da - two((da - d) * (sa - s))
461 )
462);
463
464blend_fn2!(overlay, |s: f32x8, d: f32x8, sa, da|
465 s * inv(da) + d * inv(sa) + two(d).cmp_le(da).blend(
466 two(s * d),
467 sa * da - two((da - d) * (sa - s))
468 )
469);
470
471blend_fn2!(soft_light, |s: f32x8, d: f32x8, sa: f32x8, da: f32x8| {
472 let m = da.cmp_gt(f32x8::default()).blend(d / da, f32x8::default());
473 let s2 = two(s);
474 let m4 = two(two(m));
475
476 // The logic forks three ways:
477 // 1. dark src?
478 // 2. light src, dark dst?
479 // 3. light src, light dst?
480 let dark_src = d * (sa + (s2 - sa) * (f32x8::splat(1.0) - m));
481 let dark_dst = (m4 * m4 + m4) * (m - f32x8::splat(1.0)) + f32x8::splat(7.0) * m;
482 let lite_dst = m.sqrt() - m;
483 let lite_src = d * sa + da * (s2 - sa)
484 * two(two(d)).cmp_le(da).blend(dark_dst, lite_dst); // 2 or 3?
485
486 s * inv(da) + d * inv(sa) + s2.cmp_le(sa).blend(dark_src, lite_src) // 1 or (2 or 3)?
487});
488
489// We're basing our implementation of non-separable blend modes on
490// https://www.w3.org/TR/compositing-1/#blendingnonseparable.
491// and
492// https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
493// They're equivalent, but ES' math has been better simplified.
494//
495// Anything extra we add beyond that is to make the math work with premul inputs.
496
// Generates a blend stage from a kernel that consumes all source and destination
// registers at once and returns the new `(r, g, b, a)` tuple.
macro_rules! blend_fn3 {
    ($name:ident, $kernel:expr) => {
        fn $name(p: &mut Pipeline) {
            let (new_r, new_g, new_b, new_a) =
                $kernel(p.r, p.g, p.b, p.a, p.dr, p.dg, p.db, p.da);

            p.r = new_r;
            p.g = new_g;
            p.b = new_b;
            p.a = new_a;

            p.next_stage();
        }
    };
}
510
511blend_fn3!(hue, hue_k);
512
513#[inline(always)]
514fn hue_k(
515 r: f32x8, g: f32x8, b: f32x8, a: f32x8,
516 dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
517) -> (f32x8, f32x8, f32x8, f32x8) {
518 let rr: &mut f32x8 = &mut (r * a);
519 let gg: &mut f32x8 = &mut (g * a);
520 let bb: &mut f32x8 = &mut (b * a);
521
522 set_sat(r:rr, g:gg, b:bb, s:sat(r:dr, g:dg, b:db) * a);
523 set_lum(r:rr, g:gg, b:bb, l:lum(r:dr, g:dg, b:db) * a);
524 clip_color(r:rr, g:gg, b:bb, a:a * da);
525
526 let r: f32x8 = r * inv(da) + dr * inv(a) + *rr;
527 let g: f32x8 = g * inv(da) + dg * inv(a) + *gg;
528 let b: f32x8 = b * inv(da) + db * inv(a) + *bb;
529 let a: f32x8 = a + da - a * da;
530
531 (r, g, b, a)
532}
533
534blend_fn3!(saturation, saturation_k);
535
536#[inline(always)]
537fn saturation_k(
538 r: f32x8, g: f32x8, b: f32x8, a: f32x8,
539 dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
540) -> (f32x8, f32x8, f32x8, f32x8) {
541 let rr: &mut f32x8 = &mut (dr * a);
542 let gg: &mut f32x8 = &mut (dg * a);
543 let bb: &mut f32x8 = &mut (db * a);
544
545 set_sat(r:rr, g:gg, b:bb, s:sat(r, g, b) * da);
546 set_lum(r:rr, g:gg, b:bb, l:lum(r:dr, g:dg, b:db) * a); // (This is not redundant.)
547 clip_color(r:rr, g:gg, b:bb, a:a * da);
548
549 let r: f32x8 = r * inv(da) + dr * inv(a) + *rr;
550 let g: f32x8 = g * inv(da) + dg * inv(a) + *gg;
551 let b: f32x8 = b * inv(da) + db * inv(a) + *bb;
552 let a: f32x8 = a + da - a * da;
553
554 (r, g, b, a)
555}
556
557blend_fn3!(color, color_k);
558
559#[inline(always)]
560fn color_k(
561 r: f32x8, g: f32x8, b: f32x8, a: f32x8,
562 dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
563) -> (f32x8, f32x8, f32x8, f32x8) {
564 let rr: &mut f32x8 = &mut (r * da);
565 let gg: &mut f32x8 = &mut (g * da);
566 let bb: &mut f32x8 = &mut (b * da);
567
568 set_lum(r:rr, g:gg, b:bb, l:lum(r:dr, g:dg, b:db) * a);
569 clip_color(r:rr, g:gg, b:bb, a:a * da);
570
571 let r: f32x8 = r * inv(da) + dr * inv(a) + *rr;
572 let g: f32x8 = g * inv(da) + dg * inv(a) + *gg;
573 let b: f32x8 = b * inv(da) + db * inv(a) + *bb;
574 let a: f32x8 = a + da - a * da;
575
576 (r, g, b, a)
577}
578
579blend_fn3!(luminosity, luminosity_k);
580
581#[inline(always)]
582fn luminosity_k(
583 r: f32x8, g: f32x8, b: f32x8, a: f32x8,
584 dr: f32x8, dg: f32x8, db: f32x8, da: f32x8,
585) -> (f32x8, f32x8, f32x8, f32x8) {
586 let rr: &mut f32x8 = &mut (dr * a);
587 let gg: &mut f32x8 = &mut (dg * a);
588 let bb: &mut f32x8 = &mut (db * a);
589
590 set_lum(r:rr, g:gg, b:bb, l:lum(r, g, b) * da);
591 clip_color(r:rr, g:gg, b:bb, a:a * da);
592
593 let r: f32x8 = r * inv(da) + dr * inv(a) + *rr;
594 let g: f32x8 = g * inv(da) + dg * inv(a) + *gg;
595 let b: f32x8 = b * inv(da) + db * inv(a) + *bb;
596 let a: f32x8 = a + da - a * da;
597
598 (r, g, b, a)
599}
600
601#[inline(always)]
602fn sat(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
603 r.max(g.max(b)) - r.min(g.min(b))
604}
605
606#[inline(always)]
607fn lum(r: f32x8, g: f32x8, b: f32x8) -> f32x8 {
608 r * f32x8::splat(0.30) + g * f32x8::splat(0.59) + b * f32x8::splat(0.11)
609}
610
611#[inline(always)]
612fn set_sat(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, s: f32x8) {
613 let mn: f32x8 = r.min(g.min(*b));
614 let mx: f32x8 = r.max(g.max(*b));
615 let sat: f32x8 = mx - mn;
616
617 // Map min channel to 0, max channel to s, and scale the middle proportionally.
618 let scale: impl Fn(f32x8) -> f32x8 = |c: f32x8| sat.cmp_eq(f32x8::default())
619 .blend(t:f32x8::default(), (c - mn) * s / sat);
620
621 *r = scale(*r);
622 *g = scale(*g);
623 *b = scale(*b);
624}
625
626#[inline(always)]
627fn set_lum(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, l: f32x8) {
628 let diff: f32x8 = l - lum(*r, *g, *b);
629 *r += diff;
630 *g += diff;
631 *b += diff;
632}
633
634#[inline(always)]
635fn clip_color(r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: f32x8) {
636 let mn: f32x8 = r.min(g.min(*b));
637 let mx: f32x8 = r.max(g.max(*b));
638 let l: f32x8 = lum(*r, *g, *b);
639
640 let clip: impl Fn(f32x8) -> f32x8 = |mut c: f32x8| {
641 c = mx.cmp_ge(f32x8::default()).blend(t:c, f:l + (c - l) * l / (l - mn));
642 c = mx.cmp_gt(a).blend(t:l + (c - l) * (a - l) / (mx - l), f:c);
643 c = c.max(f32x8::default()); // Sometimes without this we may dip just a little negative.
644 c
645 };
646
647 *r = clip(*r);
648 *g = clip(*g);
649 *b = clip(*b);
650}
651
652pub fn source_over_rgba(p: &mut Pipeline) {
653 let pixels: &mut [PremultipliedColorU8; 8] = p.pixmap_dst.slice4_at_xy(p.dx, p.dy);
654 load_8888(data:pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
655 p.r = mad(f:p.dr, m:inv(p.a), a:p.r);
656 p.g = mad(f:p.dg, m:inv(p.a), a:p.g);
657 p.b = mad(f:p.db, m:inv(p.a), a:p.b);
658 p.a = mad(f:p.da, m:inv(p.a), p.a);
659 store_8888(&p.r, &p.g, &p.b, &p.a, data:pixels);
660
661 p.next_stage();
662}
663
664pub fn source_over_rgba_tail(p: &mut Pipeline) {
665 let pixels: &mut [PremultipliedColorU8] = p.pixmap_dst.slice_at_xy(p.dx, p.dy);
666 load_8888_tail(p.tail, data:pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
667 p.r = mad(f:p.dr, m:inv(p.a), a:p.r);
668 p.g = mad(f:p.dg, m:inv(p.a), a:p.g);
669 p.b = mad(f:p.db, m:inv(p.a), a:p.b);
670 p.a = mad(f:p.da, m:inv(p.a), p.a);
671 store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, data:pixels);
672
673 p.next_stage();
674}
675
676fn transform(p: &mut Pipeline) {
677 let ts: &Transform = &p.ctx.transform;
678
679 let tr: f32x8 = mad(f:p.r, m:f32x8::splat(ts.sx), a:mad(f:p.g, m:f32x8::splat(ts.kx), a:f32x8::splat(ts.tx)));
680 let tg: f32x8 = mad(f:p.r, m:f32x8::splat(ts.ky), a:mad(f:p.g, m:f32x8::splat(ts.sy), a:f32x8::splat(ts.ty)));
681 p.r = tr;
682 p.g = tg;
683
684 p.next_stage();
685}
686
687// Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
688// The gather stages will hard clamp the output of these stages to [0,limit)...
689// we just need to do the basic repeat or mirroring.
690
691fn reflect(p: &mut Pipeline) {
692 let ctx: &TileCtx = &p.ctx.limit_x;
693 p.r = exclusive_reflect(v:p.r, limit:ctx.scale, inv_limit:ctx.inv_scale);
694
695 let ctx: &TileCtx = &p.ctx.limit_y;
696 p.g = exclusive_reflect(v:p.g, limit:ctx.scale, inv_limit:ctx.inv_scale);
697
698 p.next_stage();
699}
700
701#[inline(always)]
702fn exclusive_reflect(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
703 let limit: f32x8 = f32x8::splat(limit);
704 let inv_limit: f32x8 = f32x8::splat(inv_limit);
705 ((v - limit) - (limit + limit)
706 * ((v - limit) * (inv_limit * f32x8::splat(0.5))).floor() - limit).abs()
707}
708
709fn repeat(p: &mut Pipeline) {
710 let ctx: &TileCtx = &p.ctx.limit_x;
711 p.r = exclusive_repeat(v:p.r, limit:ctx.scale, inv_limit:ctx.inv_scale);
712
713 let ctx: &TileCtx = &p.ctx.limit_y;
714 p.g = exclusive_repeat(v:p.g, limit:ctx.scale, inv_limit:ctx.inv_scale);
715
716 p.next_stage();
717}
718
719#[inline(always)]
720fn exclusive_repeat(v: f32x8, limit: f32, inv_limit: f32) -> f32x8 {
721 v - (v * f32x8::splat(inv_limit)).floor() * f32x8::splat(limit)
722}
723
724fn bilinear(p: &mut Pipeline) {
725 let x: f32x8 = p.r;
726 let fx: f32x8 = (x + f32x8::splat(0.5)).fract();
727 let y: f32x8 = p.g;
728 let fy: f32x8 = (y + f32x8::splat(0.5)).fract();
729 let one: f32x8 = f32x8::splat(1.0);
730 let wx: [f32x8; 2] = [one - fx, fx];
731 let wy: [f32x8; 2] = [one - fy, fy];
732
733 sampler_2x2(p.pixmap_src, &p.ctx.sampler, cx:x, cy:y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
734
735 p.next_stage();
736}
737
738fn bicubic(p: &mut Pipeline) {
739 let x: f32x8 = p.r;
740 let fx: f32x8 = (x + f32x8::splat(0.5)).fract();
741 let y: f32x8 = p.g;
742 let fy: f32x8 = (y + f32x8::splat(0.5)).fract();
743 let one: f32x8 = f32x8::splat(1.0);
744 let wx: [f32x8; 4] = [bicubic_far(one - fx), bicubic_near(one - fx), bicubic_near(fx), bicubic_far(fx)];
745 let wy: [f32x8; 4] = [bicubic_far(one - fy), bicubic_near(one - fy), bicubic_near(fy), bicubic_far(fy)];
746
747 sampler_4x4(p.pixmap_src, &p.ctx.sampler, cx:x, cy:y, &wx, &wy, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
748
749 p.next_stage();
750}
751
// In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
// pixel center are combined with a non-uniform cubic filter, with higher values near the center.
754//
755// We break this function into two parts, one for near 0.5 offsets and one for far 1.5 offsets.
756
757#[inline(always)]
758fn bicubic_near(t: f32x8) -> f32x8 {
759 // 1/18 + 9/18t + 27/18t^2 - 21/18t^3 == t ( t ( -21/18t + 27/18) + 9/18) + 1/18
760 mad(
761 f:t,
762 m:mad(t,
763 mad(
764 f32x8::splat(-21.0/18.0),
765 t,
766 f32x8::splat(27.0/18.0),
767 ),
768 f32x8::splat(9.0/18.0),
769 ),
770 a:f32x8::splat(1.0/18.0),
771 )
772}
773
774#[inline(always)]
775fn bicubic_far(t: f32x8) -> f32x8 {
776 // 0/18 + 0/18*t - 6/18t^2 + 7/18t^3 == t^2 (7/18t - 6/18)
777 (t * t) * mad(f:f32x8::splat(7.0/18.0), m:t, a:f32x8::splat(-6.0/18.0))
778}
779
780#[inline(always)]
781fn sampler_2x2(
782 pixmap: PixmapRef,
783 ctx: &super::SamplerCtx,
784 cx: f32x8, cy: f32x8,
785 wx: &[f32x8; 2], wy: &[f32x8; 2],
786 r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
787) {
788 *r = f32x8::default();
789 *g = f32x8::default();
790 *b = f32x8::default();
791 *a = f32x8::default();
792
793 let one = f32x8::splat(1.0);
794 let start = -0.5;
795 let mut y = cy + f32x8::splat(start);
796 for j in 0..2 {
797 let mut x = cx + f32x8::splat(start);
798 for i in 0..2 {
799 let mut rr = f32x8::default();
800 let mut gg = f32x8::default();
801 let mut bb = f32x8::default();
802 let mut aa = f32x8::default();
803 sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa);
804
805 let w = wx[i] * wy[j];
806 *r = mad(w, rr, *r);
807 *g = mad(w, gg, *g);
808 *b = mad(w, bb, *b);
809 *a = mad(w, aa, *a);
810
811 x += one;
812 }
813
814 y += one;
815 }
816}
817
818#[inline(always)]
819fn sampler_4x4(
820 pixmap: PixmapRef,
821 ctx: &super::SamplerCtx,
822 cx: f32x8, cy: f32x8,
823 wx: &[f32x8; 4], wy: &[f32x8; 4],
824 r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
825) {
826 *r = f32x8::default();
827 *g = f32x8::default();
828 *b = f32x8::default();
829 *a = f32x8::default();
830
831 let one = f32x8::splat(1.0);
832 let start = -1.5;
833 let mut y = cy + f32x8::splat(start);
834 for j in 0..4 {
835 let mut x = cx + f32x8::splat(start);
836 for i in 0..4 {
837 let mut rr = f32x8::default();
838 let mut gg = f32x8::default();
839 let mut bb = f32x8::default();
840 let mut aa = f32x8::default();
841 sample(pixmap, ctx, x,y, &mut rr, &mut gg, &mut bb, &mut aa);
842
843 let w = wx[i] * wy[j];
844 *r = mad(w, rr, *r);
845 *g = mad(w, gg, *g);
846 *b = mad(w, bb, *b);
847 *a = mad(w, aa, *a);
848
849 x += one;
850 }
851
852 y += one;
853 }
854}
855
856#[inline(always)]
857fn sample(
858 pixmap: PixmapRef, ctx: &super::SamplerCtx, mut x: f32x8, mut y: f32x8,
859 r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
860) {
861 x = tile(v:x, ctx.spread_mode, limit:pixmap.width() as f32, inv_limit:ctx.inv_width);
862 y = tile(v:y, ctx.spread_mode, limit:pixmap.height() as f32, inv_limit:ctx.inv_height);
863
864 let ix: u32x8 = gather_ix(pixmap, x, y);
865 load_8888(&pixmap.gather(index:ix), r, g, b, a);
866}
867
868#[inline(always)]
869fn tile(v: f32x8, mode: SpreadMode, limit: f32, inv_limit: f32) -> f32x8 {
870 match mode {
871 SpreadMode::Pad => v,
872 SpreadMode::Repeat => exclusive_repeat(v, limit, inv_limit),
873 SpreadMode::Reflect => exclusive_reflect(v, limit, inv_limit),
874 }
875}
876
877fn pad_x1(p: &mut Pipeline) {
878 p.r = p.r.normalize();
879
880 p.next_stage();
881}
882
883fn reflect_x1(p: &mut Pipeline) {
884 p.r = (
885 (p.r - f32x8::splat(1.0))
886 - two(((p.r - f32x8::splat(1.0)) * f32x8::splat(0.5)).floor())
887 - f32x8::splat(1.0)
888 ).abs().normalize();
889
890 p.next_stage();
891}
892
893fn repeat_x1(p: &mut Pipeline) {
894 p.r = (p.r - p.r.floor()).normalize();
895
896 p.next_stage();
897}
898
899fn gradient(p: &mut Pipeline) {
900 let ctx: &GradientCtx = &p.ctx.gradient;
901
902 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
903 let t: [f32; 8] = p.r.into();
904 let mut idx: u32x8 = u32x8::default();
905 for i: usize in 1..ctx.len {
906 let tt: f32 = ctx.t_values[i].get();
907 let n: u32x8 = bytemuck::cast([
908 (t[0] >= tt) as u32,
909 (t[1] >= tt) as u32,
910 (t[2] >= tt) as u32,
911 (t[3] >= tt) as u32,
912 (t[4] >= tt) as u32,
913 (t[5] >= tt) as u32,
914 (t[6] >= tt) as u32,
915 (t[7] >= tt) as u32,
916 ]);
917 idx = idx + n;
918 }
919 gradient_lookup(ctx, &idx, t:p.r, &mut p.r, &mut p.g, &mut p.b, &mut p.a);
920
921 p.next_stage();
922}
923
924fn gradient_lookup(
925 ctx: &super::GradientCtx, idx: &u32x8, t: f32x8,
926 r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
927) {
928 let idx: [u32; 8] = bytemuck::cast(*idx);
929
930 macro_rules! gather {
931 ($d:expr, $c:ident) => {
932 // Surprisingly, but bound checking doesn't affect the performance.
933 // And since `idx` can contain any number, we should leave it in place.
934 f32x8::from([
935 $d[idx[0] as usize].$c,
936 $d[idx[1] as usize].$c,
937 $d[idx[2] as usize].$c,
938 $d[idx[3] as usize].$c,
939 $d[idx[4] as usize].$c,
940 $d[idx[5] as usize].$c,
941 $d[idx[6] as usize].$c,
942 $d[idx[7] as usize].$c,
943 ])
944 };
945 }
946
947 let fr = gather!(&ctx.factors, r);
948 let fg = gather!(&ctx.factors, g);
949 let fb = gather!(&ctx.factors, b);
950 let fa = gather!(&ctx.factors, a);
951
952 let br = gather!(&ctx.biases, r);
953 let bg = gather!(&ctx.biases, g);
954 let bb = gather!(&ctx.biases, b);
955 let ba = gather!(&ctx.biases, a);
956
957 *r = mad(t, fr, br);
958 *g = mad(t, fg, bg);
959 *b = mad(t, fb, bb);
960 *a = mad(t, fa, ba);
961}
962
963fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
964 let ctx: &EvenlySpaced2StopGradientCtx = &p.ctx.evenly_spaced_2_stop_gradient;
965
966 let t: f32x8 = p.r;
967 p.r = mad(f:t, m:f32x8::splat(ctx.factor.r), a:f32x8::splat(ctx.bias.r));
968 p.g = mad(f:t, m:f32x8::splat(ctx.factor.g), a:f32x8::splat(ctx.bias.g));
969 p.b = mad(f:t, m:f32x8::splat(ctx.factor.b), a:f32x8::splat(ctx.bias.b));
970 p.a = mad(f:t, m:f32x8::splat(ctx.factor.a), a:f32x8::splat(ctx.bias.a));
971
972 p.next_stage();
973}
974
975fn xy_to_radius(p: &mut Pipeline) {
976 let x2: f32x8 = p.r * p.r;
977 let y2: f32x8 = p.g * p.g;
978 p.r = (x2 + y2).sqrt();
979
980 p.next_stage();
981}
982
983fn xy_to_2pt_conical_focal_on_circle(p: &mut Pipeline) {
984 let x: f32x8 = p.r;
985 let y: f32x8 = p.g;
986 p.r = x + y * y / x;
987
988 p.next_stage();
989}
990
991fn xy_to_2pt_conical_well_behaved(p: &mut Pipeline) {
992 let ctx: &TwoPointConicalGradientCtx = &p.ctx.two_point_conical_gradient;
993
994 let x: f32x8 = p.r;
995 let y: f32x8 = p.g;
996 p.r = (x * x + y * y).sqrt() - x * f32x8::splat(ctx.p0);
997
998 p.next_stage();
999}
1000
1001fn xy_to_2pt_conical_greater(p: &mut Pipeline) {
1002 let ctx: &TwoPointConicalGradientCtx = &p.ctx.two_point_conical_gradient;
1003
1004 let x: f32x8 = p.r;
1005 let y: f32x8 = p.g;
1006 p.r = (x * x - y * y).sqrt() - x * f32x8::splat(ctx.p0);
1007
1008 p.next_stage();
1009}
1010
1011fn mask_2pt_conical_degenerates(p: &mut Pipeline) {
1012 let ctx: &mut TwoPointConicalGradientCtx = &mut p.ctx.two_point_conical_gradient;
1013
1014 let t: f32x8 = p.r;
1015 let is_degenerate: f32x8 = t.cmp_le(f32x8::default()) | t.cmp_ne(t);
1016 p.r = is_degenerate.blend(t:f32x8::default(), f:t);
1017
1018 let is_not_degenerate: u32x8 = !is_degenerate.to_u32x8_bitcast();
1019 let is_not_degenerate: [u32; 8] = bytemuck::cast(is_not_degenerate);
1020 ctx.mask = bytemuck::cast([
1021 if is_not_degenerate[0] != 0 { !0 } else { 0 },
1022 if is_not_degenerate[1] != 0 { !0 } else { 0 },
1023 if is_not_degenerate[2] != 0 { !0 } else { 0 },
1024 if is_not_degenerate[3] != 0 { !0 } else { 0 },
1025 if is_not_degenerate[4] != 0 { !0 } else { 0 },
1026 if is_not_degenerate[5] != 0 { !0 } else { 0 },
1027 if is_not_degenerate[6] != 0 { !0 } else { 0 },
1028 if is_not_degenerate[7] != 0 { !0 } else { 0 },
1029 ]);
1030
1031 p.next_stage();
1032}
1033
1034fn apply_vector_mask(p: &mut Pipeline) {
1035 let ctx: &TwoPointConicalGradientCtx = &p.ctx.two_point_conical_gradient;
1036
1037 p.r = (p.r.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1038 p.g = (p.g.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1039 p.b = (p.b.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1040 p.a = (p.a.to_u32x8_bitcast() & ctx.mask).to_f32x8_bitcast();
1041
1042 p.next_stage();
1043}
1044
1045pub fn just_return(_: &mut Pipeline) {
1046 // Ends the loop.
1047}
1048
1049#[inline(always)]
1050fn load_8888(
1051 data: &[PremultipliedColorU8; STAGE_WIDTH],
1052 r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
1053) {
1054 // Surprisingly, `f32 * FACTOR` is way faster than `f32x8 * f32x8::splat(FACTOR)`.
1055
1056 const FACTOR: f32 = 1.0 / 255.0;
1057
1058 *r = f32x8::from([
1059 data[0].red() as f32 * FACTOR, data[1].red() as f32 * FACTOR,
1060 data[2].red() as f32 * FACTOR, data[3].red() as f32 * FACTOR,
1061 data[4].red() as f32 * FACTOR, data[5].red() as f32 * FACTOR,
1062 data[6].red() as f32 * FACTOR, data[7].red() as f32 * FACTOR,
1063 ]);
1064
1065 *g = f32x8::from([
1066 data[0].green() as f32 * FACTOR, data[1].green() as f32 * FACTOR,
1067 data[2].green() as f32 * FACTOR, data[3].green() as f32 * FACTOR,
1068 data[4].green() as f32 * FACTOR, data[5].green() as f32 * FACTOR,
1069 data[6].green() as f32 * FACTOR, data[7].green() as f32 * FACTOR,
1070 ]);
1071
1072 *b = f32x8::from([
1073 data[0].blue() as f32 * FACTOR, data[1].blue() as f32 * FACTOR,
1074 data[2].blue() as f32 * FACTOR, data[3].blue() as f32 * FACTOR,
1075 data[4].blue() as f32 * FACTOR, data[5].blue() as f32 * FACTOR,
1076 data[6].blue() as f32 * FACTOR, data[7].blue() as f32 * FACTOR,
1077 ]);
1078
1079 *a = f32x8::from([
1080 data[0].alpha() as f32 * FACTOR, data[1].alpha() as f32 * FACTOR,
1081 data[2].alpha() as f32 * FACTOR, data[3].alpha() as f32 * FACTOR,
1082 data[4].alpha() as f32 * FACTOR, data[5].alpha() as f32 * FACTOR,
1083 data[6].alpha() as f32 * FACTOR, data[7].alpha() as f32 * FACTOR,
1084 ]);
1085}
1086
1087#[inline(always)]
1088fn load_8888_tail(
1089 tail: usize, data: &[PremultipliedColorU8],
1090 r: &mut f32x8, g: &mut f32x8, b: &mut f32x8, a: &mut f32x8,
1091) {
1092 // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
1093 // This way we can reuse the `load_8888_` method and remove any branches.
1094 let mut tmp: [PremultipliedColorU8; 8] = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
1095 tmp[0..tail].copy_from_slice(&data[0..tail]);
1096 load_8888(&tmp, r, g, b, a);
1097}
1098
1099#[inline(always)]
1100fn store_8888(
1101 r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
1102 data: &mut [PremultipliedColorU8; STAGE_WIDTH],
1103) {
1104 let r: [i32; 8] = unnorm(r).into();
1105 let g: [i32; 8] = unnorm(g).into();
1106 let b: [i32; 8] = unnorm(b).into();
1107 let a: [i32; 8] = unnorm(a).into();
1108
1109 let conv: impl Fn(i32, i32, i32, i32) -> … = |rr: i32, gg: i32, bb: i32, aa: i32|
1110 PremultipliedColorU8::from_rgba_unchecked(r:rr as u8, g:gg as u8, b:bb as u8, a:aa as u8);
1111
1112 data[0] = conv(rr:r[0], gg:g[0], bb:b[0], aa:a[0]);
1113 data[1] = conv(rr:r[1], gg:g[1], bb:b[1], aa:a[1]);
1114 data[2] = conv(rr:r[2], gg:g[2], bb:b[2], aa:a[2]);
1115 data[3] = conv(rr:r[3], gg:g[3], bb:b[3], aa:a[3]);
1116 data[4] = conv(rr:r[4], gg:g[4], bb:b[4], aa:a[4]);
1117 data[5] = conv(rr:r[5], gg:g[5], bb:b[5], aa:a[5]);
1118 data[6] = conv(rr:r[6], gg:g[6], bb:b[6], aa:a[6]);
1119 data[7] = conv(rr:r[7], gg:g[7], bb:b[7], aa:a[7]);
1120}
1121
1122#[inline(always)]
1123fn store_8888_tail(
1124 r: &f32x8, g: &f32x8, b: &f32x8, a: &f32x8,
1125 tail: usize, data: &mut [PremultipliedColorU8],
1126) {
1127 let r: [i32; 8] = unnorm(r).into();
1128 let g: [i32; 8] = unnorm(g).into();
1129 let b: [i32; 8] = unnorm(b).into();
1130 let a: [i32; 8] = unnorm(a).into();
1131
1132 // This is better than `for i in 0..tail`, because this way the compiler
1133 // knows that we have only 4 steps and slices access is guarantee to be valid.
1134 // This removes bounds checking and a possible panic call.
1135 for i: usize in 0..STAGE_WIDTH {
1136 data[i] = PremultipliedColorU8::from_rgba_unchecked(
1137 r:r[i] as u8, g:g[i] as u8, b:b[i] as u8, a:a[i] as u8,
1138 );
1139
1140 if i + 1 == tail {
1141 break;
1142 }
1143 }
1144}
1145
1146#[inline(always)]
1147fn unnorm(v: &f32x8) -> i32x8 {
1148 (v.max(f32x8::default()).min(f32x8::splat(1.0)) * f32x8::splat(255.0)).round_int()
1149}
1150
1151#[inline(always)]
1152fn inv(v: f32x8) -> f32x8 {
1153 f32x8::splat(1.0) - v
1154}
1155
1156#[inline(always)]
1157fn two(v: f32x8) -> f32x8 {
1158 v + v
1159}
1160
1161#[inline(always)]
1162fn mad(f: f32x8, m: f32x8, a: f32x8) -> f32x8 {
1163 f * m + a
1164}
1165
1166#[inline(always)]
1167fn lerp(from: f32x8, to: f32x8, t: f32x8) -> f32x8 {
1168 mad(f:to - from, m:t, a:from)
1169}
1170