// Copyright 2018 Google Inc.
// Copyright 2020 Yevhenii Reizner
//
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

/*!
A low precision raster pipeline implementation.

A lowp pipeline uses u16 instead of f32 for math.
Because of that, it doesn't implement stages that require high precision.
The pipeline compiler will automatically decide which one to use.

Skia uses u16x8 (128-bit) types for a generic CPU and u16x16 (256-bit) for modern x86 CPUs.
But instead of explicit SIMD instructions, it mainly relies on clang's vector extensions.
Since they are unavailable in Rust, we have to do everything manually.

According to our benchmarks, a SIMD-accelerated u16x8 in Rust is almost 2x slower than in Skia,
and we are not sure why. For example, there is no div instruction for u16x8, so we have to use
a basic scalar version, which means unnecessary loads and stores. No idea what clang does in this case.
Surprisingly, a SIMD-accelerated u16x8 is even slower than a scalar one. Again, not sure why.

Therefore we are using a scalar u16x16 by default and rely on rustc/LLVM auto-vectorization instead.
When targeting a generic CPU, we are just 5-10% slower than Skia, while u16x8 is 30-40% slower.
And while `-C target-cpu=haswell` boosts our performance by around 25%,
we are still 40-60% behind Skia built for Haswell.

On ARM AArch64 the story is different and explicit SIMD makes our code up to 2-3x faster.
*/

use crate::PremultipliedColorU8;

use crate::pixmap::SubPixmapMut;
use crate::wide::{f32x8, u16x16, f32x16};
use crate::geom::ScreenIntRect;

pub const STAGE_WIDTH: usize = 16;

pub type StageFn = fn(p: &mut Pipeline);

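// The pipeline processes up to 16 pixels (STAGE_WIDTH) at a time.
// `r`, `g`, `b`, `a` hold the current source color per lane, while `dr`, `dg`,
// `db`, `da` hold the destination color loaded by the `load_dst*` stages.
// `tail` is the number of active lanes in the last, possibly partial, batch,
// and `dx`/`dy` are the coordinates of the first pixel in the batch.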
pub struct Pipeline<'a, 'b: 'a> {
    index: usize,
    functions: &'a [StageFn],
    pixmap: &'a mut SubPixmapMut<'b>,
    mask_ctx: super::MaskCtx<'a>,
    aa_mask_ctx: super::AAMaskCtx,
    ctx: &'a mut super::Context,
    r: u16x16,
    g: u16x16,
    b: u16x16,
    a: u16x16,
    dr: u16x16,
    dg: u16x16,
    db: u16x16,
    da: u16x16,
    tail: usize,
    dx: usize,
    dy: usize,
}

impl Pipeline<'_, '_> {
    #[inline(always)]
    fn next_stage(&mut self) {
        let next = self.functions[self.index];
        self.index += 1;
        next(self);
    }
}


// Must be in the same order as raster_pipeline::Stage
pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[
    move_source_to_destination,
    move_destination_to_source,
    null_fn, // Clamp0
    null_fn, // ClampA
    premultiply,
    uniform_color,
    seed_shader,
    load_dst,
    store,
    load_dst_u8,
    store_u8,
    null_fn, // Gather
    load_mask_u8,
    mask_u8,
    scale_u8,
    lerp_u8,
    scale_1_float,
    lerp_1_float,
    destination_atop,
    destination_in,
    destination_out,
    destination_over,
    source_atop,
    source_in,
    source_out,
    source_over,
    clear,
    modulate,
    multiply,
    plus,
    screen,
    xor,
    null_fn, // ColorBurn
    null_fn, // ColorDodge
    darken,
    difference,
    exclusion,
    hard_light,
    lighten,
    overlay,
    null_fn, // SoftLight
    null_fn, // Hue
    null_fn, // Saturation
    null_fn, // Color
    null_fn, // Luminosity
    source_over_rgba,
    transform,
    null_fn, // Reflect
    null_fn, // Repeat
    null_fn, // Bilinear
    null_fn, // Bicubic
    pad_x1,
    reflect_x1,
    repeat_x1,
    gradient,
    evenly_spaced_2_stop_gradient,
    xy_to_radius,
    null_fn, // XYTo2PtConicalFocalOnCircle
    null_fn, // XYTo2PtConicalWellBehaved
    null_fn, // XYTo2PtConicalGreater
    null_fn, // Mask2PtConicalDegenerates
    null_fn, // ApplyVectorMask
];

pub fn fn_ptr(f: StageFn) -> *const () {
    f as *const ()
}

pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool {
    core::ptr::eq(f1 as *const (), f2 as *const ())
}

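// `start` drives the pipeline over a rectangle: full batches of STAGE_WIDTH
// pixels run the `functions` program, and the remaining `end - x` pixels of
// each row run the `functions_tail` program with `tail` set accordingly.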
#[inline(never)]
pub fn start(
    functions: &[StageFn],
    functions_tail: &[StageFn],
    rect: &ScreenIntRect,
    aa_mask_ctx: super::AAMaskCtx,
    mask_ctx: super::MaskCtx,
    ctx: &mut super::Context,
    pixmap: &mut SubPixmapMut,
) {
    let mut p = Pipeline {
        index: 0,
        functions: &[],
        pixmap,
        mask_ctx,
        aa_mask_ctx,
        ctx,
        r: u16x16::default(),
        g: u16x16::default(),
        b: u16x16::default(),
        a: u16x16::default(),
        dr: u16x16::default(),
        dg: u16x16::default(),
        db: u16x16::default(),
        da: u16x16::default(),
        tail: 0,
        dx: 0,
        dy: 0,
    };

    for y in rect.y()..rect.bottom() {
        let mut x = rect.x() as usize;
        let end = rect.right() as usize;

        p.functions = functions;
        while x + STAGE_WIDTH <= end {
            p.index = 0;
            p.dx = x;
            p.dy = y as usize;
            p.tail = STAGE_WIDTH;
            p.next_stage();
            x += STAGE_WIDTH;
        }

        if x != end {
            p.index = 0;
            p.functions = functions_tail;
            p.dx = x;
            p.dy = y as usize;
            p.tail = end - x;
            p.next_stage();
        }
    }
}

fn move_source_to_destination(p: &mut Pipeline) {
    p.dr = p.r;
    p.dg = p.g;
    p.db = p.b;
    p.da = p.a;

    p.next_stage();
}

fn move_destination_to_source(p: &mut Pipeline) {
    p.r = p.dr;
    p.g = p.dg;
    p.b = p.db;
    p.a = p.da;

    p.next_stage();
}

fn premultiply(p: &mut Pipeline) {
    p.r = div255(p.r * p.a);
    p.g = div255(p.g * p.a);
    p.b = div255(p.b * p.a);

    p.next_stage();
}

fn uniform_color(p: &mut Pipeline) {
    let ctx = p.ctx.uniform_color;
    p.r = u16x16::splat(ctx.rgba[0]);
    p.g = u16x16::splat(ctx.rgba[1]);
    p.b = u16x16::splat(ctx.rgba[2]);
    p.a = u16x16::splat(ctx.rgba[3]);

    p.next_stage();
}

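// Seeds the pipeline with pixel coordinates: `r`/`g` receive the x coordinates
// of the 16 pixels in the batch (offset by 0.5 to hit pixel centers) and
// `b`/`a` receive the y coordinate, all carried as raw f32 bits via `split`.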
fn seed_shader(p: &mut Pipeline) {
    let iota = f32x16(
        f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]),
        f32x8::from([8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]),
    );

    let x = f32x16::splat(p.dx as f32) + iota;
    let y = f32x16::splat(p.dy as f32 + 0.5);
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}

pub fn load_dst(p: &mut Pipeline) {
    load_8888(p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn load_dst_tail(p: &mut Pipeline) {
    load_8888_tail(p.tail, p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.next_stage();
}

pub fn store(p: &mut Pipeline) {
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap.slice16_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn store_tail(p: &mut Pipeline) {
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap.slice_at_xy(p.dx, p.dy));
    p.next_stage();
}

pub fn load_dst_u8(p: &mut Pipeline) {
    load_8(p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da);
    p.next_stage();
}

pub fn load_dst_u8_tail(p: &mut Pipeline) {
    // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
    // This way we can reuse the `load_8` method and remove any branches.
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let mut tmp = [0u8; STAGE_WIDTH];
    tmp[0..p.tail].copy_from_slice(&data[0..p.tail]);
    load_8(&tmp, &mut p.da);

    p.next_stage();
}


pub fn store_u8(p: &mut Pipeline) {
    let data = p.pixmap.slice16_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    data[ 0] = a[ 0] as u8;
    data[ 1] = a[ 1] as u8;
    data[ 2] = a[ 2] as u8;
    data[ 3] = a[ 3] as u8;
    data[ 4] = a[ 4] as u8;
    data[ 5] = a[ 5] as u8;
    data[ 6] = a[ 6] as u8;
    data[ 7] = a[ 7] as u8;
    data[ 8] = a[ 8] as u8;
    data[ 9] = a[ 9] as u8;
    data[10] = a[10] as u8;
    data[11] = a[11] as u8;
    data[12] = a[12] as u8;
    data[13] = a[13] as u8;
    data[14] = a[14] as u8;
    data[15] = a[15] as u8;

    p.next_stage();
}

pub fn store_u8_tail(p: &mut Pipeline) {
    let data = p.pixmap.slice_mask_at_xy(p.dx, p.dy);
    let a = p.a.as_slice();

    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have only 16 steps and slice access is guaranteed to be valid.
    // This removes bounds checking and a possible panic call.
    for i in 0..STAGE_WIDTH {
        data[i] = a[i] as u8;

        if i + 1 == p.tail {
            break;
        }
    }

    p.next_stage();
}

// Similar to mask_u8, but only loads the mask values without actually masking the pipeline.
fn load_mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    p.r = u16x16::splat(0);
    p.g = u16x16::splat(0);
    p.b = u16x16::splat(0);
    p.a = c;

    p.next_stage();
}

fn mask_u8(p: &mut Pipeline) {
    let offset = p.mask_ctx.offset(p.dx, p.dy);

    let mut c = u16x16::default();
    for i in 0..p.tail {
        c.0[i] = u16::from(p.mask_ctx.data[offset + i]);
    }

    if c == u16x16::default() {
        return;
    }

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn scale_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to u16x16.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]),
        u16::from(data[1]),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]);

    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_u8(p: &mut Pipeline) {
    // Load u8xTail and cast it to u16x16.
    let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail);
    let c = u16x16([
        u16::from(data[0]),
        u16::from(data[1]),
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
    ]);

    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

fn scale_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = div255(p.r * c);
    p.g = div255(p.g * c);
    p.b = div255(p.b * c);
    p.a = div255(p.a * c);

    p.next_stage();
}

fn lerp_1_float(p: &mut Pipeline) {
    let c = from_float(p.ctx.current_coverage);
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);

    p.next_stage();
}

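// The blend stages below operate on premultiplied 8-bit values held in u16 lanes,
// so every product of two channels has to be renormalized with `div255`.
// For example, source-over is `s + d * (255 - sa) / 255`, which becomes
// `s + div255(d * inv(sa))` in the code below.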
macro_rules! blend_fn {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = $f(p.a, p.da, p.a, p.da);

            p.next_stage();
        }
    };
}

blend_fn!(clear, |_, _, _, _| u16x16::splat(0));
blend_fn!(source_atop, |s, d, sa, da| div255(s * da + d * inv(sa)));
blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da)));
blend_fn!(source_in, |s, _, _, da| div255(s * da));
blend_fn!(destination_in, |_, d, sa, _| div255(d * sa));
blend_fn!(source_out, |s, _, _, da| div255(s * inv(da)));
blend_fn!(destination_out, |_, d, sa, _| div255(d * inv(sa)));
blend_fn!(source_over, |s, d, sa, _| s + div255(d * inv(sa)));
blend_fn!(destination_over, |s, d, _, da| d + div255(s * inv(da)));
blend_fn!(modulate, |s, d, _, _| div255(s * d));
blend_fn!(multiply, |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d));
blend_fn!(screen, |s, d, _, _| s + d - div255(s * d));
blend_fn!(xor, |s, d, sa, da| div255(s * inv(da) + d * inv(sa)));

// Wants a type for some reason.
blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255)));


macro_rules! blend_fn2 {
    ($name:ident, $f:expr) => {
        fn $name(p: &mut Pipeline) {
            // The same logic is applied to the color channels, while alpha is always source-over.
            p.r = $f(p.r, p.dr, p.a, p.da);
            p.g = $f(p.g, p.dg, p.a, p.da);
            p.b = $f(p.b, p.db, p.a, p.da);
            p.a = p.a + div255(p.da * inv(p.a));

            p.next_stage();
        }
    };
}

blend_fn2!(darken, |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa))));
blend_fn2!(lighten, |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa))));
blend_fn2!(exclusion, |s: u16x16, d, _, _| s + d - u16x16::splat(2) * div255(s * d));

blend_fn2!(difference, |s: u16x16, d, sa, da|
    s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa))));

blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (s+s).cmp_le(&sa).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});

blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| {
    div255(s * inv(da) + d * inv(sa)
        + (d+d).cmp_le(&da).blend(
            u16x16::splat(2) * s * d,
            sa * da - u16x16::splat(2) * (sa-s)*(da-d)
        )
    )
});

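// A fused stage for the most common case: loads the destination, does
// source-over on all four channels and stores the result back, without
// separate load/store stages.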
pub fn source_over_rgba(p: &mut Pipeline) {
    let pixels = p.pixmap.slice16_at_xy(p.dx, p.dy);
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);

    p.next_stage();
}

pub fn source_over_rgba_tail(p: &mut Pipeline) {
    let pixels = p.pixmap.slice_at_xy(p.dx, p.dy);
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
    p.r = p.r + div255(p.dr * inv(p.a));
    p.g = p.g + div255(p.dg * inv(p.a));
    p.b = p.b + div255(p.db * inv(p.a));
    p.a = p.a + div255(p.da * inv(p.a));
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);

    p.next_stage();
}

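// Applies the affine transform from the context to the current coordinates.
// The x coordinates travel through the pipeline packed into `r`/`g` and the
// y coordinates into `b`/`a` (see `split`/`join` below).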
fn transform(p: &mut Pipeline) {
    let ts = &p.ctx.transform;

    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);

    let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx)));
    let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty)));

    split(&nx, &mut p.r, &mut p.g);
    split(&ny, &mut p.b, &mut p.a);

    p.next_stage();
}

fn pad_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = x.normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

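// Mirror tiling: folds `x` into the [0, 1] range so the pattern repeats back
// and forth, i.e. x' = |(x - 1) - 2 * floor((x - 1) / 2) - 1|.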
fn reflect_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let two = |x| x + x;
    let x = (
        (x - f32x16::splat(1.0))
        - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor())
        - f32x16::splat(1.0)
    ).abs().normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

fn repeat_x1(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let x = (x - x.floor()).normalize();
    split(&x, &mut p.r, &mut p.g);

    p.next_stage();
}

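// For every lane, counts how many gradient stops have a position <= t and uses
// that count as the stop index, then `gradient_lookup` interpolates between the
// factor/bias pairs of the selected stop.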
fn gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.gradient;

    // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
    let t = join(&p.r, &p.g);
    let mut idx = u16x16::splat(0);
    for i in 1..ctx.len {
        let tt = ctx.t_values[i].get();
        let t0: [f32; 8] = t.0.into();
        let t1: [f32; 8] = t.1.into();
        idx.0[ 0] += (t0[0] >= tt) as u16;
        idx.0[ 1] += (t0[1] >= tt) as u16;
        idx.0[ 2] += (t0[2] >= tt) as u16;
        idx.0[ 3] += (t0[3] >= tt) as u16;
        idx.0[ 4] += (t0[4] >= tt) as u16;
        idx.0[ 5] += (t0[5] >= tt) as u16;
        idx.0[ 6] += (t0[6] >= tt) as u16;
        idx.0[ 7] += (t0[7] >= tt) as u16;
        idx.0[ 8] += (t1[0] >= tt) as u16;
        idx.0[ 9] += (t1[1] >= tt) as u16;
        idx.0[10] += (t1[2] >= tt) as u16;
        idx.0[11] += (t1[3] >= tt) as u16;
        idx.0[12] += (t1[4] >= tt) as u16;
        idx.0[13] += (t1[5] >= tt) as u16;
        idx.0[14] += (t1[6] >= tt) as u16;
        idx.0[15] += (t1[7] >= tt) as u16;
    }
    gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a);

    p.next_stage();
}

fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) {
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;

    let t = join(&p.r, &p.g);
    round_f32_to_u16(
        mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)),
        mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)),
        mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)),
        mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)),
        &mut p.r, &mut p.g, &mut p.b, &mut p.a,
    );

    p.next_stage();
}

fn xy_to_radius(p: &mut Pipeline) {
    let x = join(&p.r, &p.g);
    let y = join(&p.b, &p.a);
    let x = (x*x + y*y).sqrt();
    split(&x, &mut p.r, &mut p.g);
    split(&y, &mut p.b, &mut p.a);

    p.next_stage();
}

// We are using u16 for the index, not u32 like Skia, to simplify the code a bit.
// The gradient creation code will not allow that many stops anyway.
fn gradient_lookup(
    ctx: &super::GradientCtx, idx: &u16x16, t: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    macro_rules! gather {
        ($d:expr, $c:ident) => {
            // Surprisingly, bounds checking doesn't affect performance here.
            // And since `idx` can contain any number, we should leave it in place.
            f32x16(
                f32x8::from([
                    $d[idx.0[ 0] as usize].$c,
                    $d[idx.0[ 1] as usize].$c,
                    $d[idx.0[ 2] as usize].$c,
                    $d[idx.0[ 3] as usize].$c,
                    $d[idx.0[ 4] as usize].$c,
                    $d[idx.0[ 5] as usize].$c,
                    $d[idx.0[ 6] as usize].$c,
                    $d[idx.0[ 7] as usize].$c,
                ]),
                f32x8::from([
                    $d[idx.0[ 8] as usize].$c,
                    $d[idx.0[ 9] as usize].$c,
                    $d[idx.0[10] as usize].$c,
                    $d[idx.0[11] as usize].$c,
                    $d[idx.0[12] as usize].$c,
                    $d[idx.0[13] as usize].$c,
                    $d[idx.0[14] as usize].$c,
                    $d[idx.0[15] as usize].$c,
                ]),
            )
        };
    }

    let fr = gather!(&ctx.factors, r);
    let fg = gather!(&ctx.factors, g);
    let fb = gather!(&ctx.factors, b);
    let fa = gather!(&ctx.factors, a);

    let br = gather!(&ctx.biases, r);
    let bg = gather!(&ctx.biases, g);
    let bb = gather!(&ctx.biases, b);
    let ba = gather!(&ctx.biases, a);

    round_f32_to_u16(
        mad(t, fr, br),
        mad(t, fg, bg),
        mad(t, fb, bb),
        mad(t, fa, ba),
        r, g, b, a,
    );
}

#[inline(always)]
fn round_f32_to_u16(
    rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16,
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // TODO: may produce a slightly different result than Skia,
    //       which affects the two_stops_linear_mirror test.

    let rf = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let gf = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let bf = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5);
    let af = af * f32x16::splat(255.0) + f32x16::splat(0.5);

    rf.save_to_u16x16(r);
    gf.save_to_u16x16(g);
    bf.save_to_u16x16(b);
    af.save_to_u16x16(a);
}

pub fn just_return(_: &mut Pipeline) {
    // Ends the loop.
}

pub fn null_fn(_: &mut Pipeline) {
    // Just a placeholder for unsupported stages in STAGES.
}

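// `load_8888` deinterleaves 16 RGBA pixels into the per-channel u16 registers,
// and `store_8888` below interleaves them back. The manual unrolling keeps the
// code loop-free, presumably helping the rustc/LLVM auto-vectorization mentioned
// in the module docs.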
#[inline(always)]
fn load_8888(
    data: &[PremultipliedColorU8; STAGE_WIDTH],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    *r = u16x16([
        data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16,
        data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16,
        data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16,
        data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16,
    ]);

    *g = u16x16([
        data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16,
        data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16,
        data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16,
        data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16,
    ]);

    *b = u16x16([
        data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16,
        data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16,
        data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16,
        data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16,
    ]);

    *a = u16x16([
        data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16,
        data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16,
        data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16,
        data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16,
    ]);
}

#[inline(always)]
fn load_8888_tail(
    tail: usize, data: &[PremultipliedColorU8],
    r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16,
) {
    // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
    // This way we can reuse the `load_8888` method and remove any branches.
    let mut tmp = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH];
    tmp[0..tail].copy_from_slice(&data[0..tail]);
    load_8888(&tmp, r, g, b, a);
}

#[inline(always)]
fn store_8888(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    data: &mut [PremultipliedColorU8; STAGE_WIDTH],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8);
    data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8);
    data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8);
    data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8);
    data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8);
    data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8);
    data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8);
    data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8);
    data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8);
    data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8);
    data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8);
    data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8);
    data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8);
    data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8);
    data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8);
    data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8);
}

#[inline(always)]
fn store_8888_tail(
    r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16,
    tail: usize, data: &mut [PremultipliedColorU8],
) {
    let r = r.as_slice();
    let g = g.as_slice();
    let b = b.as_slice();
    let a = a.as_slice();

    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have only 16 steps and slice access is guaranteed to be valid.
    // This removes bounds checking and a possible panic call.
    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );

        if i + 1 == tail {
            break;
        }
    }
}

#[inline(always)]
fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) {
    *a = u16x16([
        data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16,
        data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16,
        data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16,
        data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16,
    ]);
}

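// `div255` approximates `v / 255` with `(v + 255) >> 8`, which is exact for
// multiples of 255: e.g. div255(128 * 255) = (32640 + 255) >> 8 = 128.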
#[inline(always)]
fn div255(v: u16x16) -> u16x16 {
    // Skia uses `vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8)` here when NEON is available,
    // but it doesn't affect performance much and breaks reproducible results. Ignore it.
    // NOTE: the compiler does not replace the division with a shift.
    (v + u16x16::splat(255)) >> u16x16::splat(8) // / u16x16::splat(256)
}

#[inline(always)]
fn inv(v: u16x16) -> u16x16 {
    u16x16::splat(255) - v
}

#[inline(always)]
fn from_float(f: f32) -> u16x16 {
    u16x16::splat((f * 255.0 + 0.5) as u16)
}

#[inline(always)]
fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 {
    div255(from * inv(t) + to * t)
}

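// `split` and `join` are bit-level reinterpretations (via bytemuck), not numeric
// conversions: a f32x16 (64 bytes) is carried through the pipeline as two u16x16
// registers (32 bytes each), so stages like `seed_shader` and `transform` can pass
// f32 coordinate data through the u16-only register set.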
#[inline(always)]
fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) {
    // We're splitting f32x16 (512 bit) into two u16x16 (256 bit).
    let data: [u8; 64] = bytemuck::cast(*v);
    let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0);
    let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0);

    d0.copy_from_slice(&data[0..32]);
    d1.copy_from_slice(&data[32..64]);
}

#[inline(always)]
fn join(lo: &u16x16, hi: &u16x16) -> f32x16 {
    // We're joining two u16x16 (256 bit) into f32x16 (512 bit).

    let d0: [u8; 32] = bytemuck::cast(lo.0);
    let d1: [u8; 32] = bytemuck::cast(hi.0);

    let mut v = f32x16::default();
    let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v);

    data[0..32].copy_from_slice(&d0);
    data[32..64].copy_from_slice(&d1);

    v
}

#[inline(always)]
fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 {
    // NEON vmlaq_f32 doesn't seem to affect performance in any way. Ignore it.
    f * m + a
}

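// A minimal sanity-check sketch for the u16 math helpers above. It only assumes
// that `u16x16` implements `PartialEq` (which the comparison in `mask_u8` already
// relies on); the expected values follow directly from the formulas above.
#[cfg(test)]
mod lowp_math_sanity {
    use super::*;

    #[test]
    fn div255_is_exact_at_the_extremes() {
        // 255 * 255 maps back to 255 and 0 stays 0.
        assert!(div255(u16x16::splat(255) * u16x16::splat(255)) == u16x16::splat(255));
        assert!(div255(u16x16::splat(0)) == u16x16::splat(0));
    }

    #[test]
    fn lerp_hits_both_endpoints() {
        // t == 0 keeps `from`, t == 255 selects `to`.
        let from = u16x16::splat(10);
        let to = u16x16::splat(200);
        assert!(lerp(from, to, u16x16::splat(0)) == from);
        assert!(lerp(from, to, u16x16::splat(255)) == to);
    }

    #[test]
    fn from_float_maps_coverage_to_0_255() {
        // Full coverage becomes 255, zero coverage becomes 0.
        assert!(from_float(1.0) == u16x16::splat(255));
        assert!(from_float(0.0) == u16x16::splat(0));
    }
}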