1 | // Copyright 2018 Google Inc. |
2 | // Copyright 2020 Yevhenii Reizner |
3 | // |
4 | // Use of this source code is governed by a BSD-style license that can be |
5 | // found in the LICENSE file. |
6 | |
7 | /*! |
8 | A low precision raster pipeline implementation. |
9 | |
10 | A lowp pipeline uses u16 instead of f32 for math. |
11 | Because of that, it doesn't implement stages that require high precision. |
12 | The pipeline compiler will automatically decide which one to use. |
13 | |
Skia uses u16x8 (128-bit) types for a generic CPU and u16x16 (256-bit) for modern x86 CPUs.
But instead of explicit SIMD instructions, it mainly relies on clang's vector extensions.
And since those are unavailable in Rust, we have to do everything manually.

According to our benchmarks, a SIMD-accelerated u16x8 in Rust is almost 2x slower than in Skia.
Not sure why. For example, there is no div instruction for u16x8, so we have to fall back to
a scalar version, which means unnecessary loads/stores. No idea what clang does in this case.
Surprisingly, a SIMD-accelerated u16x8 is even slower than a scalar one. Again, not sure why.

Therefore we are using scalar u16x16 by default and relying on rustc/LLVM auto-vectorization instead.
When targeting a generic CPU, we are just 5-10% slower than Skia, while u16x8 is 30-40% slower.
And while `-C target-cpu=haswell` boosts our performance by around 25%,
we are still 40-60% behind Skia built for Haswell.

On ARM AArch64 the story is different and explicit SIMD makes our code up to 2-3x faster.
29 | */ |
30 | |
31 | use crate::PremultipliedColorU8; |
32 | |
33 | use crate::pixmap::SubPixmapMut; |
34 | use crate::wide::{f32x8, u16x16, f32x16}; |
35 | use crate::geom::ScreenIntRect; |
36 | |
37 | pub const STAGE_WIDTH: usize = 16; |
38 | |
39 | pub type StageFn = fn(p: &mut Pipeline); |
40 | |
41 | pub struct Pipeline<'a, 'b: 'a> { |
42 | index: usize, |
43 | functions: &'a [StageFn], |
44 | pixmap: &'a mut SubPixmapMut<'b>, |
45 | mask_ctx: super::MaskCtx<'a>, |
46 | aa_mask_ctx: super::AAMaskCtx, |
47 | ctx: &'a mut super::Context, |
48 | r: u16x16, |
49 | g: u16x16, |
50 | b: u16x16, |
51 | a: u16x16, |
52 | dr: u16x16, |
53 | dg: u16x16, |
54 | db: u16x16, |
55 | da: u16x16, |
56 | tail: usize, |
57 | dx: usize, |
58 | dy: usize, |
59 | } |
60 | |
61 | impl Pipeline<'_, '_> { |
    #[inline(always)]
63 | fn next_stage(&mut self) { |
        let next = self.functions[self.index];
65 | self.index += 1; |
66 | next(self); |
67 | } |
68 | } |
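
// Every stage follows the same convention: it reads/updates the pipeline's color
// registers and then calls `next_stage()` to hand control down the chain.
// An illustrative stage (hypothetical, not registered in `STAGES`) would look like:
//
//     fn swap_red_blue(p: &mut Pipeline) {
//         core::mem::swap(&mut p.r, &mut p.b);
//         p.next_stage();
//     }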
69 | |
70 | |
71 | // Must be in the same order as raster_pipeline::Stage |
72 | pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[ |
73 | move_source_to_destination, |
74 | move_destination_to_source, |
75 | null_fn, // Clamp0 |
76 | null_fn, // ClampA |
77 | premultiply, |
78 | uniform_color, |
79 | seed_shader, |
80 | load_dst, |
81 | store, |
82 | load_dst_u8, |
83 | store_u8, |
84 | null_fn, // Gather |
85 | load_mask_u8, |
86 | mask_u8, |
87 | scale_u8, |
88 | lerp_u8, |
89 | scale_1_float, |
90 | lerp_1_float, |
91 | destination_atop, |
92 | destination_in, |
93 | destination_out, |
94 | destination_over, |
95 | source_atop, |
96 | source_in, |
97 | source_out, |
98 | source_over, |
99 | clear, |
100 | modulate, |
101 | multiply, |
102 | plus, |
103 | screen, |
104 | xor, |
105 | null_fn, // ColorBurn |
106 | null_fn, // ColorDodge |
107 | darken, |
108 | difference, |
109 | exclusion, |
110 | hard_light, |
111 | lighten, |
112 | overlay, |
113 | null_fn, // SoftLight |
114 | null_fn, // Hue |
115 | null_fn, // Saturation |
116 | null_fn, // Color |
117 | null_fn, // Luminosity |
118 | source_over_rgba, |
119 | transform, |
120 | null_fn, // Reflect |
121 | null_fn, // Repeat |
122 | null_fn, // Bilinear |
123 | null_fn, // Bicubic |
124 | pad_x1, |
125 | reflect_x1, |
126 | repeat_x1, |
127 | gradient, |
128 | evenly_spaced_2_stop_gradient, |
129 | xy_to_radius, |
130 | null_fn, // XYTo2PtConicalFocalOnCircle |
131 | null_fn, // XYTo2PtConicalWellBehaved |
132 | null_fn, // XYTo2PtConicalGreater |
133 | null_fn, // Mask2PtConicalDegenerates |
134 | null_fn, // ApplyVectorMask |
135 | ]; |
136 | |
137 | pub fn fn_ptr(f: StageFn) -> *const () { |
138 | f as *const () |
139 | } |
140 | |
141 | pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool { |
    core::ptr::eq(f1 as *const (), f2 as *const ())
143 | } |
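
// A small sanity check for the pointer-based stage comparison above. This is an
// illustrative sketch only: it assumes distinct stage functions from this module
// keep distinct addresses, which holds for how this crate is compiled.
#[cfg(test)]
mod fn_ptr_eq_tests {
    use super::*;

    #[test]
    fn compares_stage_functions_by_address() {
        assert!(fn_ptr_eq(just_return, just_return));
        assert!(!fn_ptr_eq(premultiply, seed_shader));
    }
}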
144 | |
#[inline(never)]
146 | pub fn start( |
147 | functions: &[StageFn], |
148 | functions_tail: &[StageFn], |
149 | rect: &ScreenIntRect, |
150 | aa_mask_ctx: super::AAMaskCtx, |
151 | mask_ctx: super::MaskCtx, |
152 | ctx: &mut super::Context, |
153 | pixmap: &mut SubPixmapMut, |
154 | ) { |
155 | let mut p = Pipeline { |
156 | index: 0, |
157 | functions: &[], |
158 | pixmap, |
159 | mask_ctx, |
160 | aa_mask_ctx, |
161 | ctx, |
162 | r: u16x16::default(), |
163 | g: u16x16::default(), |
164 | b: u16x16::default(), |
165 | a: u16x16::default(), |
166 | dr: u16x16::default(), |
167 | dg: u16x16::default(), |
168 | db: u16x16::default(), |
169 | da: u16x16::default(), |
170 | tail: 0, |
171 | dx: 0, |
172 | dy: 0, |
173 | }; |
174 | |
175 | for y in rect.y()..rect.bottom() { |
176 | let mut x = rect.x() as usize; |
177 | let end = rect.right() as usize; |
178 | |
179 | p.functions = functions; |
180 | while x + STAGE_WIDTH <= end { |
181 | p.index = 0; |
182 | p.dx = x; |
183 | p.dy = y as usize; |
184 | p.tail = STAGE_WIDTH; |
185 | p.next_stage(); |
186 | x += STAGE_WIDTH; |
187 | } |
188 | |
189 | if x != end { |
190 | p.index = 0; |
191 | p.functions = functions_tail; |
192 | p.dx = x; |
193 | p.dy = y as usize; |
194 | p.tail = end - x; |
195 | p.next_stage(); |
196 | } |
197 | } |
198 | } |
199 | |
200 | fn move_source_to_destination(p: &mut Pipeline) { |
201 | p.dr = p.r; |
202 | p.dg = p.g; |
203 | p.db = p.b; |
204 | p.da = p.a; |
205 | |
206 | p.next_stage(); |
207 | } |
208 | |
209 | fn move_destination_to_source(p: &mut Pipeline) { |
210 | p.r = p.dr; |
211 | p.g = p.dg; |
212 | p.b = p.db; |
213 | p.a = p.da; |
214 | |
215 | p.next_stage(); |
216 | } |
217 | |
218 | fn premultiply(p: &mut Pipeline) { |
219 | p.r = div255(p.r * p.a); |
220 | p.g = div255(p.g * p.a); |
221 | p.b = div255(p.b * p.a); |
222 | |
223 | p.next_stage(); |
224 | } |
225 | |
226 | fn uniform_color(p: &mut Pipeline) { |
    let ctx = p.ctx.uniform_color;
228 | p.r = u16x16::splat(ctx.rgba[0]); |
229 | p.g = u16x16::splat(ctx.rgba[1]); |
230 | p.b = u16x16::splat(ctx.rgba[2]); |
231 | p.a = u16x16::splat(ctx.rgba[3]); |
232 | |
233 | p.next_stage(); |
234 | } |
235 | |
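// Seeds the pipeline registers with pixel-center coordinates: r/g receive the 16
// x coordinates (dx + 0.5 .. dx + 15.5) and b/a receive y + 0.5, stored as raw
// f32x16 bits via `split`.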
236 | fn seed_shader(p: &mut Pipeline) { |
237 | let iota: f32x16 = f32x16( |
238 | f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), |
239 | f32x8::from([8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]), |
240 | ); |
241 | |
242 | let x: f32x16 = f32x16::splat(p.dx as f32) + iota; |
243 | let y: f32x16 = f32x16::splat(p.dy as f32 + 0.5); |
244 | split(&x, &mut p.r, &mut p.g); |
245 | split(&y, &mut p.b, &mut p.a); |
246 | |
247 | p.next_stage(); |
248 | } |
249 | |
250 | pub fn load_dst(p: &mut Pipeline) { |
    load_8888(p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
252 | p.next_stage(); |
253 | } |
254 | |
255 | pub fn load_dst_tail(p: &mut Pipeline) { |
    load_8888_tail(p.tail, p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
257 | p.next_stage(); |
258 | } |
259 | |
260 | pub fn store(p: &mut Pipeline) { |
    store_8888(&p.r, &p.g, &p.b, &p.a, p.pixmap.slice16_at_xy(p.dx, p.dy));
262 | p.next_stage(); |
263 | } |
264 | |
265 | pub fn store_tail(p: &mut Pipeline) { |
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, p.pixmap.slice_at_xy(p.dx, p.dy));
267 | p.next_stage(); |
268 | } |
269 | |
270 | pub fn load_dst_u8(p: &mut Pipeline) { |
    load_8(p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da);
272 | p.next_stage(); |
273 | } |
274 | |
275 | pub fn load_dst_u8_tail(p: &mut Pipeline) { |
    // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
    // This way we can reuse the `load_8` method and remove any branches.
278 | let data: &mut [u8] = p.pixmap.slice_mask_at_xy(p.dx, p.dy); |
279 | let mut tmp: [u8; 16] = [0u8; STAGE_WIDTH]; |
280 | tmp[0..p.tail].copy_from_slice(&data[0..p.tail]); |
281 | load_8(&tmp, &mut p.da); |
282 | |
283 | p.next_stage(); |
284 | } |
285 | |
286 | pub fn store_u8(p: &mut Pipeline) { |
287 | let data: &mut [u8; 16] = p.pixmap.slice16_mask_at_xy(p.dx, p.dy); |
288 | let a: &[u16; 16] = p.a.as_slice(); |
289 | |
290 | data[ 0] = a[ 0] as u8; |
291 | data[ 1] = a[ 1] as u8; |
292 | data[ 2] = a[ 2] as u8; |
293 | data[ 3] = a[ 3] as u8; |
294 | data[ 4] = a[ 4] as u8; |
295 | data[ 5] = a[ 5] as u8; |
296 | data[ 6] = a[ 6] as u8; |
297 | data[ 7] = a[ 7] as u8; |
298 | data[ 8] = a[ 8] as u8; |
299 | data[ 9] = a[ 9] as u8; |
300 | data[10] = a[10] as u8; |
301 | data[11] = a[11] as u8; |
302 | data[12] = a[12] as u8; |
303 | data[13] = a[13] as u8; |
304 | data[14] = a[14] as u8; |
305 | data[15] = a[15] as u8; |
306 | |
307 | p.next_stage(); |
308 | } |
309 | |
310 | pub fn store_u8_tail(p: &mut Pipeline) { |
311 | let data: &mut [u8] = p.pixmap.slice_mask_at_xy(p.dx, p.dy); |
312 | let a: &[u16; 16] = p.a.as_slice(); |
313 | |
    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have only 16 steps and slice access is guaranteed to be valid.
    // This removes bounds checking and a possible panic call.
    for i in 0..STAGE_WIDTH {
318 | data[i] = a[i] as u8; |
319 | |
320 | if i + 1 == p.tail { |
321 | break; |
322 | } |
323 | } |
324 | |
325 | p.next_stage(); |
326 | } |
327 | |
328 | // Similar to mask_u8, but only loads the mask values without actually masking the pipeline. |
329 | fn load_mask_u8(p: &mut Pipeline) { |
330 | let offset: usize = p.mask_ctx.offset(p.dx, p.dy); |
331 | |
332 | let mut c: u16x16 = u16x16::default(); |
    for i in 0..p.tail {
334 | c.0[i] = u16::from(p.mask_ctx.data[offset + i]); |
335 | } |
336 | |
337 | p.r = u16x16::splat(0); |
338 | p.g = u16x16::splat(0); |
339 | p.b = u16x16::splat(0); |
340 | p.a = c; |
341 | |
342 | p.next_stage(); |
343 | } |
344 | |
345 | fn mask_u8(p: &mut Pipeline) { |
346 | let offset: usize = p.mask_ctx.offset(p.dx, p.dy); |
347 | |
348 | let mut c: u16x16 = u16x16::default(); |
    for i in 0..p.tail {
350 | c.0[i] = u16::from(p.mask_ctx.data[offset + i]); |
351 | } |
352 | |
353 | if c == u16x16::default() { |
354 | return; |
355 | } |
356 | |
357 | p.r = div255(p.r * c); |
358 | p.g = div255(p.g * c); |
359 | p.b = div255(p.b * c); |
360 | p.a = div255(p.a * c); |
361 | |
362 | p.next_stage(); |
363 | } |
364 | |
365 | fn scale_u8(p: &mut Pipeline) { |
366 | // Load u8xTail and cast it to u16x16. |
367 | let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); |
368 | let c = u16x16([ |
369 | u16::from(data[0]), |
370 | u16::from(data[1]), |
371 | 0, |
372 | 0, |
373 | 0, |
374 | 0, |
375 | 0, |
376 | 0, |
377 | 0, |
378 | 0, |
379 | 0, |
380 | 0, |
381 | 0, |
382 | 0, |
383 | 0, |
384 | 0, |
385 | ]); |
386 | |
387 | p.r = div255(p.r * c); |
388 | p.g = div255(p.g * c); |
389 | p.b = div255(p.b * c); |
390 | p.a = div255(p.a * c); |
391 | |
392 | p.next_stage(); |
393 | } |
394 | |
395 | fn lerp_u8(p: &mut Pipeline) { |
396 | // Load u8xTail and cast it to u16x16. |
397 | let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); |
398 | let c = u16x16([ |
399 | u16::from(data[0]), |
400 | u16::from(data[1]), |
401 | 0, |
402 | 0, |
403 | 0, |
404 | 0, |
405 | 0, |
406 | 0, |
407 | 0, |
408 | 0, |
409 | 0, |
410 | 0, |
411 | 0, |
412 | 0, |
413 | 0, |
414 | 0, |
415 | ]); |
416 | |
417 | p.r = lerp(p.dr, p.r, c); |
418 | p.g = lerp(p.dg, p.g, c); |
419 | p.b = lerp(p.db, p.b, c); |
420 | p.a = lerp(p.da, p.a, c); |
421 | |
422 | p.next_stage(); |
423 | } |
424 | |
425 | fn scale_1_float(p: &mut Pipeline) { |
426 | let c: u16x16 = from_float(p.ctx.current_coverage); |
427 | p.r = div255(p.r * c); |
428 | p.g = div255(p.g * c); |
429 | p.b = div255(p.b * c); |
430 | p.a = div255(p.a * c); |
431 | |
432 | p.next_stage(); |
433 | } |
434 | |
435 | fn lerp_1_float(p: &mut Pipeline) { |
436 | let c: u16x16 = from_float(p.ctx.current_coverage); |
    p.r = lerp(p.dr, p.r, c);
    p.g = lerp(p.dg, p.g, c);
    p.b = lerp(p.db, p.b, c);
    p.a = lerp(p.da, p.a, c);
441 | |
442 | p.next_stage(); |
443 | } |
444 | |
445 | macro_rules! blend_fn { |
446 | ($name:ident, $f:expr) => { |
447 | fn $name(p: &mut Pipeline) { |
448 | p.r = $f(p.r, p.dr, p.a, p.da); |
449 | p.g = $f(p.g, p.dg, p.a, p.da); |
450 | p.b = $f(p.b, p.db, p.a, p.da); |
451 | p.a = $f(p.a, p.da, p.a, p.da); |
452 | |
453 | p.next_stage(); |
454 | } |
455 | }; |
456 | } |
457 | |
458 | blend_fn!(clear, |_, _, _, _| u16x16::splat(0)); |
459 | blend_fn!(source_atop, |s, d, sa, da| div255(s * da + d * inv(sa))); |
460 | blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da))); |
461 | blend_fn!(source_in, |s, _, _, da| div255(s * da)); |
462 | blend_fn!(destination_in, |_, d, sa, _| div255(d * sa)); |
463 | blend_fn!(source_out, |s, _, _, da| div255(s * inv(da))); |
464 | blend_fn!(destination_out, |_, d, sa, _| div255(d * inv(sa))); |
465 | blend_fn!(source_over, |s, d, sa, _| s + div255(d * inv(sa))); |
466 | blend_fn!(destination_over, |s, d, _, da| d + div255(s * inv(da))); |
467 | blend_fn!(modulate, |s, d, _, _| div255(s * d)); |
468 | blend_fn!(multiply, |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d)); |
469 | blend_fn!(screen, |s, d, _, _| s + d - div255(s * d)); |
470 | blend_fn!(xor, |s, d, sa, da| div255(s * inv(da) + d * inv(sa))); |
471 | |
472 | // Wants a type for some reason. |
473 | blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255))); |
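
// For example, `source_over` above is the u16 form of the Porter-Duff equation
// `src + dst * (1 - src_alpha)`, with alpha in 0..=255 and the division by 255
// approximated by `div255`.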
474 | |
475 | |
476 | macro_rules! blend_fn2 { |
477 | ($name:ident, $f:expr) => { |
478 | fn $name(p: &mut Pipeline) { |
            // The same logic is applied to color, while alpha is always composited with source_over.
480 | p.r = $f(p.r, p.dr, p.a, p.da); |
481 | p.g = $f(p.g, p.dg, p.a, p.da); |
482 | p.b = $f(p.b, p.db, p.a, p.da); |
483 | p.a = p.a + div255(p.da * inv(p.a)); |
484 | |
485 | p.next_stage(); |
486 | } |
487 | }; |
488 | } |
489 | |
490 | blend_fn2!(darken, |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa)))); |
491 | blend_fn2!(lighten, |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa)))); |
492 | blend_fn2!(exclusion, |s: u16x16, d, _, _| s + d - u16x16::splat(2) * div255(s * d)); |
493 | |
494 | blend_fn2!(difference, |s: u16x16, d, sa, da| |
495 | s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa)))); |
496 | |
497 | blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| { |
498 | div255(s * inv(da) + d * inv(sa) |
499 | + (s+s).cmp_le(&sa).blend( |
500 | u16x16::splat(2) * s * d, |
501 | sa * da - u16x16::splat(2) * (sa-s)*(da-d) |
502 | ) |
503 | ) |
504 | }); |
505 | |
506 | blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| { |
507 | div255(s * inv(da) + d * inv(sa) |
508 | + (d+d).cmp_le(&da).blend( |
509 | u16x16::splat(2) * s * d, |
510 | sa * da - u16x16::splat(2) * (sa-s)*(da-d) |
511 | ) |
512 | ) |
513 | }); |
514 | |
515 | pub fn source_over_rgba(p: &mut Pipeline) { |
516 | let pixels: &mut [PremultipliedColorU8; 16] = p.pixmap.slice16_at_xy(p.dx, p.dy); |
    load_8888(pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
518 | p.r = p.r + div255(p.dr * inv(p.a)); |
519 | p.g = p.g + div255(p.dg * inv(p.a)); |
520 | p.b = p.b + div255(p.db * inv(p.a)); |
521 | p.a = p.a + div255(p.da * inv(p.a)); |
    store_8888(&p.r, &p.g, &p.b, &p.a, pixels);
523 | |
524 | p.next_stage(); |
525 | } |
526 | |
527 | pub fn source_over_rgba_tail(p: &mut Pipeline) { |
528 | let pixels: &mut [PremultipliedColorU8] = p.pixmap.slice_at_xy(p.dx, p.dy); |
    load_8888_tail(p.tail, pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da);
530 | p.r = p.r + div255(p.dr * inv(p.a)); |
531 | p.g = p.g + div255(p.dg * inv(p.a)); |
532 | p.b = p.b + div255(p.db * inv(p.a)); |
533 | p.a = p.a + div255(p.da * inv(p.a)); |
    store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, pixels);
535 | |
536 | p.next_stage(); |
537 | } |
538 | |
539 | fn transform(p: &mut Pipeline) { |
    let ts = &p.ctx.transform;
541 | |
542 | let x: f32x16 = join(&p.r, &p.g); |
543 | let y: f32x16 = join(&p.b, &p.a); |
544 | |
    let nx = mad(x, f32x16::splat(ts.sx), mad(y, f32x16::splat(ts.kx), f32x16::splat(ts.tx)));
    let ny = mad(x, f32x16::splat(ts.ky), mad(y, f32x16::splat(ts.sy), f32x16::splat(ts.ty)));
547 | |
548 | split(&nx, &mut p.r, &mut p.g); |
549 | split(&ny, &mut p.b, &mut p.a); |
550 | |
551 | p.next_stage(); |
552 | } |
553 | |
554 | fn pad_x1(p: &mut Pipeline) { |
555 | let x: f32x16 = join(&p.r, &p.g); |
556 | let x: f32x16 = x.normalize(); |
557 | split(&x, &mut p.r, &mut p.g); |
558 | |
559 | p.next_stage(); |
560 | } |
561 | |
562 | fn reflect_x1(p: &mut Pipeline) { |
563 | let x: f32x16 = join(&p.r, &p.g); |
    let two = |x: f32x16| x + x;
565 | let x: f32x16 = ( |
566 | (x - f32x16::splat(1.0)) |
567 | - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor()) |
568 | - f32x16::splat(1.0) |
569 | ).abs().normalize(); |
570 | split(&x, &mut p.r, &mut p.g); |
571 | |
572 | p.next_stage(); |
573 | } |
574 | |
575 | fn repeat_x1(p: &mut Pipeline) { |
576 | let x: f32x16 = join(&p.r, &p.g); |
577 | let x: f32x16 = (x - x.floor()).normalize(); |
578 | split(&x, &mut p.r, &mut p.g); |
579 | |
580 | p.next_stage(); |
581 | } |
582 | |
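// For each of the 16 lanes, counts how many gradient stops the interpolation
// parameter `t` has passed; that per-lane count is then used by `gradient_lookup`
// to select the matching factor/bias pair.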
583 | fn gradient(p: &mut Pipeline) { |
584 | let ctx = &p.ctx.gradient; |
585 | |
586 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
587 | let t = join(&p.r, &p.g); |
588 | let mut idx = u16x16::splat(0); |
589 | for i in 1..ctx.len { |
590 | let tt = ctx.t_values[i].get(); |
591 | let t0: [f32; 8] = t.0.into(); |
592 | let t1: [f32; 8] = t.1.into(); |
593 | idx.0[ 0] += (t0[0] >= tt) as u16; |
594 | idx.0[ 1] += (t0[1] >= tt) as u16; |
595 | idx.0[ 2] += (t0[2] >= tt) as u16; |
596 | idx.0[ 3] += (t0[3] >= tt) as u16; |
597 | idx.0[ 4] += (t0[4] >= tt) as u16; |
598 | idx.0[ 5] += (t0[5] >= tt) as u16; |
599 | idx.0[ 6] += (t0[6] >= tt) as u16; |
600 | idx.0[ 7] += (t0[7] >= tt) as u16; |
601 | idx.0[ 8] += (t1[0] >= tt) as u16; |
602 | idx.0[ 9] += (t1[1] >= tt) as u16; |
603 | idx.0[10] += (t1[2] >= tt) as u16; |
604 | idx.0[11] += (t1[3] >= tt) as u16; |
605 | idx.0[12] += (t1[4] >= tt) as u16; |
606 | idx.0[13] += (t1[5] >= tt) as u16; |
607 | idx.0[14] += (t1[6] >= tt) as u16; |
608 | idx.0[15] += (t1[7] >= tt) as u16; |
609 | } |
610 | gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a); |
611 | |
612 | p.next_stage(); |
613 | } |
614 | |
615 | fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) { |
    let ctx = &p.ctx.evenly_spaced_2_stop_gradient;
617 | |
618 | let t: f32x16 = join(&p.r, &p.g); |
    round_f32_to_u16(
        mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)),
        mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)),
        mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)),
        mad(t, f32x16::splat(ctx.factor.a), f32x16::splat(ctx.bias.a)),
        &mut p.r, &mut p.g, &mut p.b, &mut p.a,
    );
626 | |
627 | p.next_stage(); |
628 | } |
629 | |
630 | fn xy_to_radius(p: &mut Pipeline) { |
631 | let x: f32x16 = join(&p.r, &p.g); |
632 | let y: f32x16 = join(&p.b, &p.a); |
633 | let x: f32x16 = (x*x + y*y).sqrt(); |
634 | split(&x, &mut p.r, &mut p.g); |
635 | split(&y, &mut p.b, &mut p.a); |
636 | |
637 | p.next_stage(); |
638 | } |
639 | |
// We are using u16 for the index, not u32 like Skia, to simplify the code a bit.
// The gradient creation code will not allow that many stops anyway.
642 | fn gradient_lookup( |
643 | ctx: &super::GradientCtx, idx: &u16x16, t: f32x16, |
644 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
645 | ) { |
646 | macro_rules! gather { |
647 | ($d:expr, $c:ident) => { |
            // Surprisingly, bounds checking doesn't affect the performance.
            // And since `idx` can contain any number, we should leave it in place.
650 | f32x16( |
651 | f32x8::from([ |
652 | $d[idx.0[ 0] as usize].$c, |
653 | $d[idx.0[ 1] as usize].$c, |
654 | $d[idx.0[ 2] as usize].$c, |
655 | $d[idx.0[ 3] as usize].$c, |
656 | $d[idx.0[ 4] as usize].$c, |
657 | $d[idx.0[ 5] as usize].$c, |
658 | $d[idx.0[ 6] as usize].$c, |
659 | $d[idx.0[ 7] as usize].$c, |
660 | ]), |
661 | f32x8::from([ |
662 | $d[idx.0[ 8] as usize].$c, |
663 | $d[idx.0[ 9] as usize].$c, |
664 | $d[idx.0[10] as usize].$c, |
665 | $d[idx.0[11] as usize].$c, |
666 | $d[idx.0[12] as usize].$c, |
667 | $d[idx.0[13] as usize].$c, |
668 | $d[idx.0[14] as usize].$c, |
669 | $d[idx.0[15] as usize].$c, |
670 | ]), |
671 | ) |
672 | }; |
673 | } |
674 | |
675 | let fr = gather!(&ctx.factors, r); |
676 | let fg = gather!(&ctx.factors, g); |
677 | let fb = gather!(&ctx.factors, b); |
678 | let fa = gather!(&ctx.factors, a); |
679 | |
680 | let br = gather!(&ctx.biases, r); |
681 | let bg = gather!(&ctx.biases, g); |
682 | let bb = gather!(&ctx.biases, b); |
683 | let ba = gather!(&ctx.biases, a); |
684 | |
685 | round_f32_to_u16( |
686 | mad(t, fr, br), |
687 | mad(t, fg, bg), |
688 | mad(t, fb, bb), |
689 | mad(t, fa, ba), |
690 | r, g, b, a, |
691 | ); |
692 | } |
693 | |
#[inline(always)]
695 | fn round_f32_to_u16( |
696 | rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16, |
697 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
698 | ) { |
    // TODO: may produce a slightly different result than Skia;
    // affects the two_stops_linear_mirror test.
701 | |
702 | let rf: f32x16 = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
703 | let gf: f32x16 = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
704 | let bf: f32x16 = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
705 | let af: f32x16 = af * f32x16::splat(255.0) + f32x16::splat(0.5); |
706 | |
    rf.save_to_u16x16(r);
    gf.save_to_u16x16(g);
    bf.save_to_u16x16(b);
    af.save_to_u16x16(a);
711 | } |
712 | |
713 | pub fn just_return(_: &mut Pipeline) { |
714 | // Ends the loop. |
715 | } |
716 | |
717 | pub fn null_fn(_: &mut Pipeline) { |
718 | // Just for unsupported functions in STAGES. |
719 | } |
720 | |
#[inline(always)]
722 | fn load_8888( |
723 | data: &[PremultipliedColorU8; STAGE_WIDTH], |
724 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
725 | ) { |
726 | *r = u16x16([ |
727 | data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16, |
728 | data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16, |
729 | data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16, |
730 | data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16, |
731 | ]); |
732 | |
733 | *g = u16x16([ |
734 | data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16, |
735 | data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16, |
736 | data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16, |
737 | data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16, |
738 | ]); |
739 | |
740 | *b = u16x16([ |
741 | data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16, |
742 | data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16, |
743 | data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16, |
744 | data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16, |
745 | ]); |
746 | |
747 | *a = u16x16([ |
748 | data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16, |
749 | data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16, |
750 | data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16, |
751 | data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16, |
752 | ]); |
753 | } |
754 | |
#[inline(always)]
756 | fn load_8888_tail( |
757 | tail: usize, data: &[PremultipliedColorU8], |
758 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
759 | ) { |
    // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range.
    // This way we can reuse the `load_8888` method and remove any branches.
762 | let mut tmp: [PremultipliedColorU8; 16] = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH]; |
763 | tmp[0..tail].copy_from_slice(&data[0..tail]); |
764 | load_8888(&tmp, r, g, b, a); |
765 | } |
766 | |
#[inline(always)]
768 | fn store_8888( |
769 | r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, |
770 | data: &mut [PremultipliedColorU8; STAGE_WIDTH], |
771 | ) { |
772 | let r: &[u16; 16] = r.as_slice(); |
773 | let g: &[u16; 16] = g.as_slice(); |
774 | let b: &[u16; 16] = b.as_slice(); |
775 | let a: &[u16; 16] = a.as_slice(); |
776 | |
    data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8);
    data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8);
    data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8);
    data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8);
    data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8);
    data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8);
    data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8);
    data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8);
    data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8);
    data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8);
    data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8);
    data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8);
    data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8);
    data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8);
    data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8);
    data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8);
793 | } |
794 | |
#[inline(always)]
796 | fn store_8888_tail( |
797 | r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, |
798 | tail: usize, data: &mut [PremultipliedColorU8], |
799 | ) { |
800 | let r: &[u16; 16] = r.as_slice(); |
801 | let g: &[u16; 16] = g.as_slice(); |
802 | let b: &[u16; 16] = b.as_slice(); |
803 | let a: &[u16; 16] = a.as_slice(); |
804 | |
    // This is better than `for i in 0..tail`, because this way the compiler
    // knows that we have only 16 steps and slice access is guaranteed to be valid.
    // This removes bounds checking and a possible panic call.
    for i in 0..STAGE_WIDTH {
        data[i] = PremultipliedColorU8::from_rgba_unchecked(
            r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8,
        );
812 | |
813 | if i + 1 == tail { |
814 | break; |
815 | } |
816 | } |
817 | } |
818 | |
#[inline(always)]
820 | fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { |
821 | *a = u16x16([ |
822 | data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, |
823 | data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, |
824 | data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, |
825 | data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16, |
826 | ]); |
827 | } |
828 | |
#[inline(always)]
fn div255(v: u16x16) -> u16x16 {
    // Skia uses `vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8)` here when NEON is available,
    // but it doesn't affect performance much and breaks reproducible results. Ignore it.
    // NOTE: the compiler does not replace the division with a shift.
    (v + u16x16::splat(255)) >> u16x16::splat(8) // / u16x16::splat(256)
835 | } |
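
// A scalar sanity check for the approximation above (illustrative sketch only):
// for any product of two 8-bit values, `(x + 255) >> 8` differs from the exactly
// rounded `x / 255` by at most 1.
#[cfg(test)]
mod div255_approx_tests {
    #[test]
    fn approximation_is_close_to_exact() {
        for a in 0u32..=255 {
            for b in 0u32..=255 {
                let x = a * b;
                let approx = (x + 255) >> 8;
                let exact = (x + 127) / 255; // rounded to nearest
                assert!((approx as i64 - exact as i64).abs() <= 1);
            }
        }
    }
}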
836 | |
#[inline(always)]
838 | fn inv(v: u16x16) -> u16x16 { |
839 | u16x16::splat(255) - v |
840 | } |
841 | |
#[inline(always)]
843 | fn from_float(f: f32) -> u16x16 { |
844 | u16x16::splat((f * 255.0 + 0.5) as u16) |
845 | } |
846 | |
#[inline(always)]
848 | fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 { |
849 | div255(from * inv(t) + to * t) |
850 | } |
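
// A quick illustration of `lerp` at the endpoints (sketch, reusing the `u16x16`
// ops from above): t = 0 keeps `from`, t = 255 selects `to`.
#[cfg(test)]
mod lerp_endpoint_tests {
    use super::*;

    #[test]
    fn lerp_endpoints() {
        let from = u16x16::splat(10);
        let to = u16x16::splat(200);
        assert!(lerp(from, to, u16x16::splat(0)) == u16x16::splat(10));
        assert!(lerp(from, to, u16x16::splat(255)) == u16x16::splat(200));
    }
}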
851 | |
#[inline(always)]
fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) {
    // We're splitting an f32x16 (512 bits) into two u16x16 (256 bits each).
855 | let data: [u8; 64] = bytemuck::cast(*v); |
856 | let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0); |
857 | let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0); |
858 | |
859 | d0.copy_from_slice(&data[0..32]); |
860 | d1.copy_from_slice(&data[32..64]); |
861 | } |
862 | |
#[inline(always)]
fn join(lo: &u16x16, hi: &u16x16) -> f32x16 {
    // We're joining two u16x16 (256 bits each) into an f32x16 (512 bits).
866 | |
867 | let d0: [u8; 32] = bytemuck::cast(lo.0); |
868 | let d1: [u8; 32] = bytemuck::cast(hi.0); |
869 | |
870 | let mut v: f32x16 = f32x16::default(); |
871 | let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v); |
872 | |
873 | data[0..32].copy_from_slice(&d0); |
874 | data[32..64].copy_from_slice(&d1); |
875 | |
876 | v |
877 | } |
878 | |
#[inline(always)]
880 | fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 { |
881 | // NEON vmlaq_f32 doesn't seem to affect performance in any way. Ignore it. |
882 | f * m + a |
883 | } |
884 | |