| 1 | // Copyright 2018 Google Inc. |
| 2 | // Copyright 2020 Yevhenii Reizner |
| 3 | // |
| 4 | // Use of this source code is governed by a BSD-style license that can be |
| 5 | // found in the LICENSE file. |
| 6 | |
| 7 | /*! |
| 8 | A low precision raster pipeline implementation. |
| 9 | |
| 10 | A lowp pipeline uses u16 instead of f32 for math. |
| 11 | Because of that, it doesn't implement stages that require high precision. |
| 12 | The pipeline compiler will automatically decide which one to use. |
| 13 | |
| 14 | Skia uses u16x8 (128bit) types for a generic CPU and u16x16 (256bit) for modern x86 CPUs. |
| 15 | But instead of explicit SIMD instructions, it mainly relies on clang's vector extensions. |
| 16 | And since they are unavailable in Rust, we have to do everything manually. |
| 17 | |
| 18 | According to our benchmarks, a SIMD-accelerated u16x8 in Rust is almost 2x slower than in Skia. |
| 19 | Not sure why. For example, there are no div instruction for u16x8, so we have to use |
| 20 | a basic scalar version. Which means unnecessary load/store. No idea what clang does in this case. |
| 21 | Surprisingly, a SIMD-accelerated u16x8 is even slower than a scalar one. Again, not sure why. |
| 22 | |
| 23 | Therefore we are using scalar u16x16 by default and relying on rustc/llvm auto vectorization instead. |
| 24 | When targeting a generic CPU, we're just 5-10% slower than Skia. While u16x8 is 30-40% slower. |
| 25 | And while `-C target-cpu=haswell` boosts our performance by around 25%, |
| 26 | we are still 40-60% behind Skia built for Haswell. |
| 27 | |
| 28 | On ARM AArch64 the story is different and explicit SIMD make our code up to 2-3x faster. |
| 29 | */ |
| 30 | |
| 31 | use crate::PremultipliedColorU8; |
| 32 | |
| 33 | use crate::pixmap::SubPixmapMut; |
| 34 | use crate::wide::{f32x8, u16x16, f32x16}; |
| 35 | use crate::geom::ScreenIntRect; |
| 36 | |
| 37 | pub const STAGE_WIDTH: usize = 16; |
| 38 | |
| 39 | pub type StageFn = fn(p: &mut Pipeline); |
| 40 | |
| 41 | pub struct Pipeline<'a, 'b: 'a> { |
| 42 | index: usize, |
| 43 | functions: &'a [StageFn], |
| 44 | pixmap: &'a mut SubPixmapMut<'b>, |
| 45 | mask_ctx: super::MaskCtx<'a>, |
| 46 | aa_mask_ctx: super::AAMaskCtx, |
| 47 | ctx: &'a mut super::Context, |
| 48 | r: u16x16, |
| 49 | g: u16x16, |
| 50 | b: u16x16, |
| 51 | a: u16x16, |
| 52 | dr: u16x16, |
| 53 | dg: u16x16, |
| 54 | db: u16x16, |
| 55 | da: u16x16, |
| 56 | tail: usize, |
| 57 | dx: usize, |
| 58 | dy: usize, |
| 59 | } |
| 60 | |
| 61 | impl Pipeline<'_, '_> { |
| 62 | #[inline (always)] |
| 63 | fn next_stage(&mut self) { |
| 64 | let next: fn(&mut Self) = self.functions[self.index]; |
| 65 | self.index += 1; |
| 66 | next(self); |
| 67 | } |
| 68 | } |
| 69 | |
| 70 | |
| 71 | // Must be in the same order as raster_pipeline::Stage |
| 72 | pub const STAGES: &[StageFn; super::STAGES_COUNT] = &[ |
| 73 | move_source_to_destination, |
| 74 | move_destination_to_source, |
| 75 | null_fn, // Clamp0 |
| 76 | null_fn, // ClampA |
| 77 | premultiply, |
| 78 | uniform_color, |
| 79 | seed_shader, |
| 80 | load_dst, |
| 81 | store, |
| 82 | load_dst_u8, |
| 83 | store_u8, |
| 84 | null_fn, // Gather |
| 85 | load_mask_u8, |
| 86 | mask_u8, |
| 87 | scale_u8, |
| 88 | lerp_u8, |
| 89 | scale_1_float, |
| 90 | lerp_1_float, |
| 91 | destination_atop, |
| 92 | destination_in, |
| 93 | destination_out, |
| 94 | destination_over, |
| 95 | source_atop, |
| 96 | source_in, |
| 97 | source_out, |
| 98 | source_over, |
| 99 | clear, |
| 100 | modulate, |
| 101 | multiply, |
| 102 | plus, |
| 103 | screen, |
| 104 | xor, |
| 105 | null_fn, // ColorBurn |
| 106 | null_fn, // ColorDodge |
| 107 | darken, |
| 108 | difference, |
| 109 | exclusion, |
| 110 | hard_light, |
| 111 | lighten, |
| 112 | overlay, |
| 113 | null_fn, // SoftLight |
| 114 | null_fn, // Hue |
| 115 | null_fn, // Saturation |
| 116 | null_fn, // Color |
| 117 | null_fn, // Luminosity |
| 118 | source_over_rgba, |
| 119 | transform, |
| 120 | null_fn, // Reflect |
| 121 | null_fn, // Repeat |
| 122 | null_fn, // Bilinear |
| 123 | null_fn, // Bicubic |
| 124 | pad_x1, |
| 125 | reflect_x1, |
| 126 | repeat_x1, |
| 127 | gradient, |
| 128 | evenly_spaced_2_stop_gradient, |
| 129 | xy_to_radius, |
| 130 | null_fn, // XYTo2PtConicalFocalOnCircle |
| 131 | null_fn, // XYTo2PtConicalWellBehaved |
| 132 | null_fn, // XYTo2PtConicalGreater |
| 133 | null_fn, // Mask2PtConicalDegenerates |
| 134 | null_fn, // ApplyVectorMask |
| 135 | ]; |
| 136 | |
| 137 | pub fn fn_ptr(f: StageFn) -> *const () { |
| 138 | f as *const () |
| 139 | } |
| 140 | |
| 141 | pub fn fn_ptr_eq(f1: StageFn, f2: StageFn) -> bool { |
| 142 | core::ptr::eq(a:f1 as *const (), b:f2 as *const ()) |
| 143 | } |
| 144 | |
| 145 | #[inline (never)] |
| 146 | pub fn start( |
| 147 | functions: &[StageFn], |
| 148 | functions_tail: &[StageFn], |
| 149 | rect: &ScreenIntRect, |
| 150 | aa_mask_ctx: super::AAMaskCtx, |
| 151 | mask_ctx: super::MaskCtx, |
| 152 | ctx: &mut super::Context, |
| 153 | pixmap: &mut SubPixmapMut, |
| 154 | ) { |
| 155 | let mut p = Pipeline { |
| 156 | index: 0, |
| 157 | functions: &[], |
| 158 | pixmap, |
| 159 | mask_ctx, |
| 160 | aa_mask_ctx, |
| 161 | ctx, |
| 162 | r: u16x16::default(), |
| 163 | g: u16x16::default(), |
| 164 | b: u16x16::default(), |
| 165 | a: u16x16::default(), |
| 166 | dr: u16x16::default(), |
| 167 | dg: u16x16::default(), |
| 168 | db: u16x16::default(), |
| 169 | da: u16x16::default(), |
| 170 | tail: 0, |
| 171 | dx: 0, |
| 172 | dy: 0, |
| 173 | }; |
| 174 | |
| 175 | for y in rect.y()..rect.bottom() { |
| 176 | let mut x = rect.x() as usize; |
| 177 | let end = rect.right() as usize; |
| 178 | |
| 179 | p.functions = functions; |
| 180 | while x + STAGE_WIDTH <= end { |
| 181 | p.index = 0; |
| 182 | p.dx = x; |
| 183 | p.dy = y as usize; |
| 184 | p.tail = STAGE_WIDTH; |
| 185 | p.next_stage(); |
| 186 | x += STAGE_WIDTH; |
| 187 | } |
| 188 | |
| 189 | if x != end { |
| 190 | p.index = 0; |
| 191 | p.functions = functions_tail; |
| 192 | p.dx = x; |
| 193 | p.dy = y as usize; |
| 194 | p.tail = end - x; |
| 195 | p.next_stage(); |
| 196 | } |
| 197 | } |
| 198 | } |
| 199 | |
| 200 | fn move_source_to_destination(p: &mut Pipeline) { |
| 201 | p.dr = p.r; |
| 202 | p.dg = p.g; |
| 203 | p.db = p.b; |
| 204 | p.da = p.a; |
| 205 | |
| 206 | p.next_stage(); |
| 207 | } |
| 208 | |
| 209 | fn move_destination_to_source(p: &mut Pipeline) { |
| 210 | p.r = p.dr; |
| 211 | p.g = p.dg; |
| 212 | p.b = p.db; |
| 213 | p.a = p.da; |
| 214 | |
| 215 | p.next_stage(); |
| 216 | } |
| 217 | |
| 218 | fn premultiply(p: &mut Pipeline) { |
| 219 | p.r = div255(p.r * p.a); |
| 220 | p.g = div255(p.g * p.a); |
| 221 | p.b = div255(p.b * p.a); |
| 222 | |
| 223 | p.next_stage(); |
| 224 | } |
| 225 | |
| 226 | fn uniform_color(p: &mut Pipeline) { |
| 227 | let ctx: UniformColorCtx = p.ctx.uniform_color; |
| 228 | p.r = u16x16::splat(ctx.rgba[0]); |
| 229 | p.g = u16x16::splat(ctx.rgba[1]); |
| 230 | p.b = u16x16::splat(ctx.rgba[2]); |
| 231 | p.a = u16x16::splat(ctx.rgba[3]); |
| 232 | |
| 233 | p.next_stage(); |
| 234 | } |
| 235 | |
| 236 | fn seed_shader(p: &mut Pipeline) { |
| 237 | let iota: f32x16 = f32x16( |
| 238 | f32x8::from([0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5]), |
| 239 | f32x8::from([8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5]), |
| 240 | ); |
| 241 | |
| 242 | let x: f32x16 = f32x16::splat(p.dx as f32) + iota; |
| 243 | let y: f32x16 = f32x16::splat(p.dy as f32 + 0.5); |
| 244 | split(&x, &mut p.r, &mut p.g); |
| 245 | split(&y, &mut p.b, &mut p.a); |
| 246 | |
| 247 | p.next_stage(); |
| 248 | } |
| 249 | |
| 250 | pub fn load_dst(p: &mut Pipeline) { |
| 251 | load_8888(data:p.pixmap.slice16_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da); |
| 252 | p.next_stage(); |
| 253 | } |
| 254 | |
| 255 | pub fn load_dst_tail(p: &mut Pipeline) { |
| 256 | load_8888_tail(p.tail, data:p.pixmap.slice_at_xy(p.dx, p.dy), &mut p.dr, &mut p.dg, &mut p.db, &mut p.da); |
| 257 | p.next_stage(); |
| 258 | } |
| 259 | |
| 260 | pub fn store(p: &mut Pipeline) { |
| 261 | store_8888(&p.r, &p.g, &p.b, &p.a, data:p.pixmap.slice16_at_xy(p.dx, p.dy)); |
| 262 | p.next_stage(); |
| 263 | } |
| 264 | |
| 265 | pub fn store_tail(p: &mut Pipeline) { |
| 266 | store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, data:p.pixmap.slice_at_xy(p.dx, p.dy)); |
| 267 | p.next_stage(); |
| 268 | } |
| 269 | |
| 270 | pub fn load_dst_u8(p: &mut Pipeline) { |
| 271 | load_8(data:p.pixmap.slice16_mask_at_xy(p.dx, p.dy), &mut p.da); |
| 272 | p.next_stage(); |
| 273 | } |
| 274 | |
| 275 | pub fn load_dst_u8_tail(p: &mut Pipeline) { |
| 276 | // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range. |
| 277 | // This way we can reuse the `load_8888__` method and remove any branches. |
| 278 | let data: &mut [u8] = p.pixmap.slice_mask_at_xy(p.dx, p.dy); |
| 279 | let mut tmp: [u8; 16] = [0u8; STAGE_WIDTH]; |
| 280 | tmp[0..p.tail].copy_from_slice(&data[0..p.tail]); |
| 281 | load_8(&tmp, &mut p.da); |
| 282 | |
| 283 | p.next_stage(); |
| 284 | } |
| 285 | |
| 286 | pub fn store_u8(p: &mut Pipeline) { |
| 287 | let data: &mut [u8; 16] = p.pixmap.slice16_mask_at_xy(p.dx, p.dy); |
| 288 | let a: &[u16; 16] = p.a.as_slice(); |
| 289 | |
| 290 | data[ 0] = a[ 0] as u8; |
| 291 | data[ 1] = a[ 1] as u8; |
| 292 | data[ 2] = a[ 2] as u8; |
| 293 | data[ 3] = a[ 3] as u8; |
| 294 | data[ 4] = a[ 4] as u8; |
| 295 | data[ 5] = a[ 5] as u8; |
| 296 | data[ 6] = a[ 6] as u8; |
| 297 | data[ 7] = a[ 7] as u8; |
| 298 | data[ 8] = a[ 8] as u8; |
| 299 | data[ 9] = a[ 9] as u8; |
| 300 | data[10] = a[10] as u8; |
| 301 | data[11] = a[11] as u8; |
| 302 | data[12] = a[12] as u8; |
| 303 | data[13] = a[13] as u8; |
| 304 | data[14] = a[14] as u8; |
| 305 | data[15] = a[15] as u8; |
| 306 | |
| 307 | p.next_stage(); |
| 308 | } |
| 309 | |
| 310 | pub fn store_u8_tail(p: &mut Pipeline) { |
| 311 | let data: &mut [u8] = p.pixmap.slice_mask_at_xy(p.dx, p.dy); |
| 312 | let a: &[u16; 16] = p.a.as_slice(); |
| 313 | |
| 314 | // This is better than `for i in 0..tail`, because this way the compiler |
| 315 | // knows that we have only 16 steps and slices access is guarantee to be valid. |
| 316 | // This removes bounds checking and a possible panic call. |
| 317 | for i: usize in 0..STAGE_WIDTH { |
| 318 | data[i] = a[i] as u8; |
| 319 | |
| 320 | if i + 1 == p.tail { |
| 321 | break; |
| 322 | } |
| 323 | } |
| 324 | |
| 325 | p.next_stage(); |
| 326 | } |
| 327 | |
| 328 | // Similar to mask_u8, but only loads the mask values without actually masking the pipeline. |
| 329 | fn load_mask_u8(p: &mut Pipeline) { |
| 330 | let offset: usize = p.mask_ctx.offset(p.dx, p.dy); |
| 331 | |
| 332 | let mut c: u16x16 = u16x16::default(); |
| 333 | for i: usize in 0..p.tail { |
| 334 | c.0[i] = u16::from(p.mask_ctx.data[offset + i]); |
| 335 | } |
| 336 | |
| 337 | p.r = u16x16::splat(0); |
| 338 | p.g = u16x16::splat(0); |
| 339 | p.b = u16x16::splat(0); |
| 340 | p.a = c; |
| 341 | |
| 342 | p.next_stage(); |
| 343 | } |
| 344 | |
| 345 | fn mask_u8(p: &mut Pipeline) { |
| 346 | let offset: usize = p.mask_ctx.offset(p.dx, p.dy); |
| 347 | |
| 348 | let mut c: u16x16 = u16x16::default(); |
| 349 | for i: usize in 0..p.tail { |
| 350 | c.0[i] = u16::from(p.mask_ctx.data[offset + i]); |
| 351 | } |
| 352 | |
| 353 | if c == u16x16::default() { |
| 354 | return; |
| 355 | } |
| 356 | |
| 357 | p.r = div255(p.r * c); |
| 358 | p.g = div255(p.g * c); |
| 359 | p.b = div255(p.b * c); |
| 360 | p.a = div255(p.a * c); |
| 361 | |
| 362 | p.next_stage(); |
| 363 | } |
| 364 | |
| 365 | fn scale_u8(p: &mut Pipeline) { |
| 366 | // Load u8xTail and cast it to u16x16. |
| 367 | let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); |
| 368 | let c = u16x16([ |
| 369 | u16::from(data[0]), |
| 370 | u16::from(data[1]), |
| 371 | 0, |
| 372 | 0, |
| 373 | 0, |
| 374 | 0, |
| 375 | 0, |
| 376 | 0, |
| 377 | 0, |
| 378 | 0, |
| 379 | 0, |
| 380 | 0, |
| 381 | 0, |
| 382 | 0, |
| 383 | 0, |
| 384 | 0, |
| 385 | ]); |
| 386 | |
| 387 | p.r = div255(p.r * c); |
| 388 | p.g = div255(p.g * c); |
| 389 | p.b = div255(p.b * c); |
| 390 | p.a = div255(p.a * c); |
| 391 | |
| 392 | p.next_stage(); |
| 393 | } |
| 394 | |
| 395 | fn lerp_u8(p: &mut Pipeline) { |
| 396 | // Load u8xTail and cast it to u16x16. |
| 397 | let data = p.aa_mask_ctx.copy_at_xy(p.dx, p.dy, p.tail); |
| 398 | let c = u16x16([ |
| 399 | u16::from(data[0]), |
| 400 | u16::from(data[1]), |
| 401 | 0, |
| 402 | 0, |
| 403 | 0, |
| 404 | 0, |
| 405 | 0, |
| 406 | 0, |
| 407 | 0, |
| 408 | 0, |
| 409 | 0, |
| 410 | 0, |
| 411 | 0, |
| 412 | 0, |
| 413 | 0, |
| 414 | 0, |
| 415 | ]); |
| 416 | |
| 417 | p.r = lerp(p.dr, p.r, c); |
| 418 | p.g = lerp(p.dg, p.g, c); |
| 419 | p.b = lerp(p.db, p.b, c); |
| 420 | p.a = lerp(p.da, p.a, c); |
| 421 | |
| 422 | p.next_stage(); |
| 423 | } |
| 424 | |
| 425 | fn scale_1_float(p: &mut Pipeline) { |
| 426 | let c: u16x16 = from_float(p.ctx.current_coverage); |
| 427 | p.r = div255(p.r * c); |
| 428 | p.g = div255(p.g * c); |
| 429 | p.b = div255(p.b * c); |
| 430 | p.a = div255(p.a * c); |
| 431 | |
| 432 | p.next_stage(); |
| 433 | } |
| 434 | |
| 435 | fn lerp_1_float(p: &mut Pipeline) { |
| 436 | let c: u16x16 = from_float(p.ctx.current_coverage); |
| 437 | p.r = lerp(from:p.dr, to:p.r, t:c); |
| 438 | p.g = lerp(from:p.dg, to:p.g, t:c); |
| 439 | p.b = lerp(from:p.db, to:p.b, t:c); |
| 440 | p.a = lerp(from:p.da, to:p.a, t:c); |
| 441 | |
| 442 | p.next_stage(); |
| 443 | } |
| 444 | |
| 445 | macro_rules! blend_fn { |
| 446 | ($name:ident, $f:expr) => { |
| 447 | fn $name(p: &mut Pipeline) { |
| 448 | p.r = $f(p.r, p.dr, p.a, p.da); |
| 449 | p.g = $f(p.g, p.dg, p.a, p.da); |
| 450 | p.b = $f(p.b, p.db, p.a, p.da); |
| 451 | p.a = $f(p.a, p.da, p.a, p.da); |
| 452 | |
| 453 | p.next_stage(); |
| 454 | } |
| 455 | }; |
| 456 | } |
| 457 | |
| 458 | blend_fn!(clear, |_, _, _, _| u16x16::splat(0)); |
| 459 | blend_fn!(source_atop, |s, d, sa, da| div255(s * da + d * inv(sa))); |
| 460 | blend_fn!(destination_atop, |s, d, sa, da| div255(d * sa + s * inv(da))); |
| 461 | blend_fn!(source_in, |s, _, _, da| div255(s * da)); |
| 462 | blend_fn!(destination_in, |_, d, sa, _| div255(d * sa)); |
| 463 | blend_fn!(source_out, |s, _, _, da| div255(s * inv(da))); |
| 464 | blend_fn!(destination_out, |_, d, sa, _| div255(d * inv(sa))); |
| 465 | blend_fn!(source_over, |s, d, sa, _| s + div255(d * inv(sa))); |
| 466 | blend_fn!(destination_over, |s, d, _, da| d + div255(s * inv(da))); |
| 467 | blend_fn!(modulate, |s, d, _, _| div255(s * d)); |
| 468 | blend_fn!(multiply, |s, d, sa, da| div255(s * inv(da) + d * inv(sa) + s * d)); |
| 469 | blend_fn!(screen, |s, d, _, _| s + d - div255(s * d)); |
| 470 | blend_fn!(xor, |s, d, sa, da| div255(s * inv(da) + d * inv(sa))); |
| 471 | |
| 472 | // Wants a type for some reason. |
| 473 | blend_fn!(plus, |s: u16x16, d, _, _| (s + d).min(&u16x16::splat(255))); |
| 474 | |
| 475 | |
| 476 | macro_rules! blend_fn2 { |
| 477 | ($name:ident, $f:expr) => { |
| 478 | fn $name(p: &mut Pipeline) { |
| 479 | // The same logic applied to color, and source_over for alpha. |
| 480 | p.r = $f(p.r, p.dr, p.a, p.da); |
| 481 | p.g = $f(p.g, p.dg, p.a, p.da); |
| 482 | p.b = $f(p.b, p.db, p.a, p.da); |
| 483 | p.a = p.a + div255(p.da * inv(p.a)); |
| 484 | |
| 485 | p.next_stage(); |
| 486 | } |
| 487 | }; |
| 488 | } |
| 489 | |
| 490 | blend_fn2!(darken, |s: u16x16, d, sa, da| s + d - div255((s * da).max(&(d * sa)))); |
| 491 | blend_fn2!(lighten, |s: u16x16, d, sa, da| s + d - div255((s * da).min(&(d * sa)))); |
| 492 | blend_fn2!(exclusion, |s: u16x16, d, _, _| s + d - u16x16::splat(2) * div255(s * d)); |
| 493 | |
| 494 | blend_fn2!(difference, |s: u16x16, d, sa, da| |
| 495 | s + d - u16x16::splat(2) * div255((s * da).min(&(d * sa)))); |
| 496 | |
| 497 | blend_fn2!(hard_light, |s: u16x16, d: u16x16, sa, da| { |
| 498 | div255(s * inv(da) + d * inv(sa) |
| 499 | + (s+s).cmp_le(&sa).blend( |
| 500 | u16x16::splat(2) * s * d, |
| 501 | sa * da - u16x16::splat(2) * (sa-s)*(da-d) |
| 502 | ) |
| 503 | ) |
| 504 | }); |
| 505 | |
| 506 | blend_fn2!(overlay, |s: u16x16, d: u16x16, sa, da| { |
| 507 | div255(s * inv(da) + d * inv(sa) |
| 508 | + (d+d).cmp_le(&da).blend( |
| 509 | u16x16::splat(2) * s * d, |
| 510 | sa * da - u16x16::splat(2) * (sa-s)*(da-d) |
| 511 | ) |
| 512 | ) |
| 513 | }); |
| 514 | |
| 515 | pub fn source_over_rgba(p: &mut Pipeline) { |
| 516 | let pixels: &mut [PremultipliedColorU8; 16] = p.pixmap.slice16_at_xy(p.dx, p.dy); |
| 517 | load_8888(data:pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da); |
| 518 | p.r = p.r + div255(p.dr * inv(p.a)); |
| 519 | p.g = p.g + div255(p.dg * inv(p.a)); |
| 520 | p.b = p.b + div255(p.db * inv(p.a)); |
| 521 | p.a = p.a + div255(p.da * inv(p.a)); |
| 522 | store_8888(&p.r, &p.g, &p.b, &p.a, data:pixels); |
| 523 | |
| 524 | p.next_stage(); |
| 525 | } |
| 526 | |
| 527 | pub fn source_over_rgba_tail(p: &mut Pipeline) { |
| 528 | let pixels: &mut [PremultipliedColorU8] = p.pixmap.slice_at_xy(p.dx, p.dy); |
| 529 | load_8888_tail(p.tail, data:pixels, &mut p.dr, &mut p.dg, &mut p.db, &mut p.da); |
| 530 | p.r = p.r + div255(p.dr * inv(p.a)); |
| 531 | p.g = p.g + div255(p.dg * inv(p.a)); |
| 532 | p.b = p.b + div255(p.db * inv(p.a)); |
| 533 | p.a = p.a + div255(p.da * inv(p.a)); |
| 534 | store_8888_tail(&p.r, &p.g, &p.b, &p.a, p.tail, data:pixels); |
| 535 | |
| 536 | p.next_stage(); |
| 537 | } |
| 538 | |
| 539 | fn transform(p: &mut Pipeline) { |
| 540 | let ts: &Transform = &p.ctx.transform; |
| 541 | |
| 542 | let x: f32x16 = join(&p.r, &p.g); |
| 543 | let y: f32x16 = join(&p.b, &p.a); |
| 544 | |
| 545 | let nx: f32x16 = mad(f:x, m:f32x16::splat(ts.sx), a:mad(f:y, m:f32x16::splat(ts.kx), a:f32x16::splat(ts.tx))); |
| 546 | let ny: f32x16 = mad(f:x, m:f32x16::splat(ts.ky), a:mad(f:y, m:f32x16::splat(ts.sy), a:f32x16::splat(ts.ty))); |
| 547 | |
| 548 | split(&nx, &mut p.r, &mut p.g); |
| 549 | split(&ny, &mut p.b, &mut p.a); |
| 550 | |
| 551 | p.next_stage(); |
| 552 | } |
| 553 | |
| 554 | fn pad_x1(p: &mut Pipeline) { |
| 555 | let x: f32x16 = join(&p.r, &p.g); |
| 556 | let x: f32x16 = x.normalize(); |
| 557 | split(&x, &mut p.r, &mut p.g); |
| 558 | |
| 559 | p.next_stage(); |
| 560 | } |
| 561 | |
| 562 | fn reflect_x1(p: &mut Pipeline) { |
| 563 | let x: f32x16 = join(&p.r, &p.g); |
| 564 | let two: impl Fn(f32x16) -> f32x16 = |x: f32x16| x + x; |
| 565 | let x: f32x16 = ( |
| 566 | (x - f32x16::splat(1.0)) |
| 567 | - two(((x - f32x16::splat(1.0)) * f32x16::splat(0.5)).floor()) |
| 568 | - f32x16::splat(1.0) |
| 569 | ).abs().normalize(); |
| 570 | split(&x, &mut p.r, &mut p.g); |
| 571 | |
| 572 | p.next_stage(); |
| 573 | } |
| 574 | |
| 575 | fn repeat_x1(p: &mut Pipeline) { |
| 576 | let x: f32x16 = join(&p.r, &p.g); |
| 577 | let x: f32x16 = (x - x.floor()).normalize(); |
| 578 | split(&x, &mut p.r, &mut p.g); |
| 579 | |
| 580 | p.next_stage(); |
| 581 | } |
| 582 | |
| 583 | fn gradient(p: &mut Pipeline) { |
| 584 | let ctx = &p.ctx.gradient; |
| 585 | |
| 586 | // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop. |
| 587 | let t = join(&p.r, &p.g); |
| 588 | let mut idx = u16x16::splat(0); |
| 589 | for i in 1..ctx.len { |
| 590 | let tt = ctx.t_values[i].get(); |
| 591 | let t0: [f32; 8] = t.0.into(); |
| 592 | let t1: [f32; 8] = t.1.into(); |
| 593 | idx.0[ 0] += (t0[0] >= tt) as u16; |
| 594 | idx.0[ 1] += (t0[1] >= tt) as u16; |
| 595 | idx.0[ 2] += (t0[2] >= tt) as u16; |
| 596 | idx.0[ 3] += (t0[3] >= tt) as u16; |
| 597 | idx.0[ 4] += (t0[4] >= tt) as u16; |
| 598 | idx.0[ 5] += (t0[5] >= tt) as u16; |
| 599 | idx.0[ 6] += (t0[6] >= tt) as u16; |
| 600 | idx.0[ 7] += (t0[7] >= tt) as u16; |
| 601 | idx.0[ 8] += (t1[0] >= tt) as u16; |
| 602 | idx.0[ 9] += (t1[1] >= tt) as u16; |
| 603 | idx.0[10] += (t1[2] >= tt) as u16; |
| 604 | idx.0[11] += (t1[3] >= tt) as u16; |
| 605 | idx.0[12] += (t1[4] >= tt) as u16; |
| 606 | idx.0[13] += (t1[5] >= tt) as u16; |
| 607 | idx.0[14] += (t1[6] >= tt) as u16; |
| 608 | idx.0[15] += (t1[7] >= tt) as u16; |
| 609 | } |
| 610 | gradient_lookup(ctx, &idx, t, &mut p.r, &mut p.g, &mut p.b, &mut p.a); |
| 611 | |
| 612 | p.next_stage(); |
| 613 | } |
| 614 | |
| 615 | fn evenly_spaced_2_stop_gradient(p: &mut Pipeline) { |
| 616 | let ctx: &EvenlySpaced2StopGradientCtx = &p.ctx.evenly_spaced_2_stop_gradient; |
| 617 | |
| 618 | let t: f32x16 = join(&p.r, &p.g); |
| 619 | round_f32_to_u16( |
| 620 | rf:mad(t, f32x16::splat(ctx.factor.r), f32x16::splat(ctx.bias.r)), |
| 621 | gf:mad(t, f32x16::splat(ctx.factor.g), f32x16::splat(ctx.bias.g)), |
| 622 | bf:mad(t, f32x16::splat(ctx.factor.b), f32x16::splat(ctx.bias.b)), |
| 623 | af:mad(f:t, m:f32x16::splat(ctx.factor.a), a:f32x16::splat(ctx.bias.a)), |
| 624 | &mut p.r, &mut p.g, &mut p.b, &mut p.a, |
| 625 | ); |
| 626 | |
| 627 | p.next_stage(); |
| 628 | } |
| 629 | |
| 630 | fn xy_to_radius(p: &mut Pipeline) { |
| 631 | let x: f32x16 = join(&p.r, &p.g); |
| 632 | let y: f32x16 = join(&p.b, &p.a); |
| 633 | let x: f32x16 = (x*x + y*y).sqrt(); |
| 634 | split(&x, &mut p.r, &mut p.g); |
| 635 | split(&y, &mut p.b, &mut p.a); |
| 636 | |
| 637 | p.next_stage(); |
| 638 | } |
| 639 | |
| 640 | // We are using u16 for index, not u32 as Skia, to simplify the code a bit. |
| 641 | // The gradient creation code will not allow that many stops anyway. |
| 642 | fn gradient_lookup( |
| 643 | ctx: &super::GradientCtx, idx: &u16x16, t: f32x16, |
| 644 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
| 645 | ) { |
| 646 | macro_rules! gather { |
| 647 | ($d:expr, $c:ident) => { |
| 648 | // Surprisingly, but bound checking doesn't affect the performance. |
| 649 | // And since `idx` can contain any number, we should leave it in place. |
| 650 | f32x16( |
| 651 | f32x8::from([ |
| 652 | $d[idx.0[ 0] as usize].$c, |
| 653 | $d[idx.0[ 1] as usize].$c, |
| 654 | $d[idx.0[ 2] as usize].$c, |
| 655 | $d[idx.0[ 3] as usize].$c, |
| 656 | $d[idx.0[ 4] as usize].$c, |
| 657 | $d[idx.0[ 5] as usize].$c, |
| 658 | $d[idx.0[ 6] as usize].$c, |
| 659 | $d[idx.0[ 7] as usize].$c, |
| 660 | ]), |
| 661 | f32x8::from([ |
| 662 | $d[idx.0[ 8] as usize].$c, |
| 663 | $d[idx.0[ 9] as usize].$c, |
| 664 | $d[idx.0[10] as usize].$c, |
| 665 | $d[idx.0[11] as usize].$c, |
| 666 | $d[idx.0[12] as usize].$c, |
| 667 | $d[idx.0[13] as usize].$c, |
| 668 | $d[idx.0[14] as usize].$c, |
| 669 | $d[idx.0[15] as usize].$c, |
| 670 | ]), |
| 671 | ) |
| 672 | }; |
| 673 | } |
| 674 | |
| 675 | let fr = gather!(&ctx.factors, r); |
| 676 | let fg = gather!(&ctx.factors, g); |
| 677 | let fb = gather!(&ctx.factors, b); |
| 678 | let fa = gather!(&ctx.factors, a); |
| 679 | |
| 680 | let br = gather!(&ctx.biases, r); |
| 681 | let bg = gather!(&ctx.biases, g); |
| 682 | let bb = gather!(&ctx.biases, b); |
| 683 | let ba = gather!(&ctx.biases, a); |
| 684 | |
| 685 | round_f32_to_u16( |
| 686 | mad(t, fr, br), |
| 687 | mad(t, fg, bg), |
| 688 | mad(t, fb, bb), |
| 689 | mad(t, fa, ba), |
| 690 | r, g, b, a, |
| 691 | ); |
| 692 | } |
| 693 | |
| 694 | #[inline (always)] |
| 695 | fn round_f32_to_u16( |
| 696 | rf: f32x16, gf: f32x16, bf: f32x16, af: f32x16, |
| 697 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
| 698 | ) { |
| 699 | // TODO: may produce a slightly different result to Skia |
| 700 | // affects the two_stops_linear_mirror test |
| 701 | |
| 702 | let rf: f32x16 = rf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
| 703 | let gf: f32x16 = gf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
| 704 | let bf: f32x16 = bf.normalize() * f32x16::splat(255.0) + f32x16::splat(0.5); |
| 705 | let af: f32x16 = af * f32x16::splat(255.0) + f32x16::splat(0.5); |
| 706 | |
| 707 | rf.save_to_u16x16(dst:r); |
| 708 | gf.save_to_u16x16(dst:g); |
| 709 | bf.save_to_u16x16(dst:b); |
| 710 | af.save_to_u16x16(dst:a); |
| 711 | } |
| 712 | |
| 713 | pub fn just_return(_: &mut Pipeline) { |
| 714 | // Ends the loop. |
| 715 | } |
| 716 | |
| 717 | pub fn null_fn(_: &mut Pipeline) { |
| 718 | // Just for unsupported functions in STAGES. |
| 719 | } |
| 720 | |
| 721 | #[inline (always)] |
| 722 | fn load_8888( |
| 723 | data: &[PremultipliedColorU8; STAGE_WIDTH], |
| 724 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
| 725 | ) { |
| 726 | *r = u16x16([ |
| 727 | data[ 0].red() as u16, data[ 1].red() as u16, data[ 2].red() as u16, data[ 3].red() as u16, |
| 728 | data[ 4].red() as u16, data[ 5].red() as u16, data[ 6].red() as u16, data[ 7].red() as u16, |
| 729 | data[ 8].red() as u16, data[ 9].red() as u16, data[10].red() as u16, data[11].red() as u16, |
| 730 | data[12].red() as u16, data[13].red() as u16, data[14].red() as u16, data[15].red() as u16, |
| 731 | ]); |
| 732 | |
| 733 | *g = u16x16([ |
| 734 | data[ 0].green() as u16, data[ 1].green() as u16, data[ 2].green() as u16, data[ 3].green() as u16, |
| 735 | data[ 4].green() as u16, data[ 5].green() as u16, data[ 6].green() as u16, data[ 7].green() as u16, |
| 736 | data[ 8].green() as u16, data[ 9].green() as u16, data[10].green() as u16, data[11].green() as u16, |
| 737 | data[12].green() as u16, data[13].green() as u16, data[14].green() as u16, data[15].green() as u16, |
| 738 | ]); |
| 739 | |
| 740 | *b = u16x16([ |
| 741 | data[ 0].blue() as u16, data[ 1].blue() as u16, data[ 2].blue() as u16, data[ 3].blue() as u16, |
| 742 | data[ 4].blue() as u16, data[ 5].blue() as u16, data[ 6].blue() as u16, data[ 7].blue() as u16, |
| 743 | data[ 8].blue() as u16, data[ 9].blue() as u16, data[10].blue() as u16, data[11].blue() as u16, |
| 744 | data[12].blue() as u16, data[13].blue() as u16, data[14].blue() as u16, data[15].blue() as u16, |
| 745 | ]); |
| 746 | |
| 747 | *a = u16x16([ |
| 748 | data[ 0].alpha() as u16, data[ 1].alpha() as u16, data[ 2].alpha() as u16, data[ 3].alpha() as u16, |
| 749 | data[ 4].alpha() as u16, data[ 5].alpha() as u16, data[ 6].alpha() as u16, data[ 7].alpha() as u16, |
| 750 | data[ 8].alpha() as u16, data[ 9].alpha() as u16, data[10].alpha() as u16, data[11].alpha() as u16, |
| 751 | data[12].alpha() as u16, data[13].alpha() as u16, data[14].alpha() as u16, data[15].alpha() as u16, |
| 752 | ]); |
| 753 | } |
| 754 | |
| 755 | #[inline (always)] |
| 756 | fn load_8888_tail( |
| 757 | tail: usize, data: &[PremultipliedColorU8], |
| 758 | r: &mut u16x16, g: &mut u16x16, b: &mut u16x16, a: &mut u16x16, |
| 759 | ) { |
| 760 | // Fill a dummy array with `tail` values. `tail` is always in a 1..STAGE_WIDTH-1 range. |
| 761 | // This way we can reuse the `load_8888__` method and remove any branches. |
| 762 | let mut tmp: [PremultipliedColorU8; 16] = [PremultipliedColorU8::TRANSPARENT; STAGE_WIDTH]; |
| 763 | tmp[0..tail].copy_from_slice(&data[0..tail]); |
| 764 | load_8888(&tmp, r, g, b, a); |
| 765 | } |
| 766 | |
| 767 | #[inline (always)] |
| 768 | fn store_8888( |
| 769 | r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, |
| 770 | data: &mut [PremultipliedColorU8; STAGE_WIDTH], |
| 771 | ) { |
| 772 | let r: &[u16; 16] = r.as_slice(); |
| 773 | let g: &[u16; 16] = g.as_slice(); |
| 774 | let b: &[u16; 16] = b.as_slice(); |
| 775 | let a: &[u16; 16] = a.as_slice(); |
| 776 | |
| 777 | data[ 0] = PremultipliedColorU8::from_rgba_unchecked(r[ 0] as u8, g[ 0] as u8, b[ 0] as u8, a[ 0] as u8); |
| 778 | data[ 1] = PremultipliedColorU8::from_rgba_unchecked(r[ 1] as u8, g[ 1] as u8, b[ 1] as u8, a[ 1] as u8); |
| 779 | data[ 2] = PremultipliedColorU8::from_rgba_unchecked(r[ 2] as u8, g[ 2] as u8, b[ 2] as u8, a[ 2] as u8); |
| 780 | data[ 3] = PremultipliedColorU8::from_rgba_unchecked(r[ 3] as u8, g[ 3] as u8, b[ 3] as u8, a[ 3] as u8); |
| 781 | data[ 4] = PremultipliedColorU8::from_rgba_unchecked(r[ 4] as u8, g[ 4] as u8, b[ 4] as u8, a[ 4] as u8); |
| 782 | data[ 5] = PremultipliedColorU8::from_rgba_unchecked(r[ 5] as u8, g[ 5] as u8, b[ 5] as u8, a[ 5] as u8); |
| 783 | data[ 6] = PremultipliedColorU8::from_rgba_unchecked(r[ 6] as u8, g[ 6] as u8, b[ 6] as u8, a[ 6] as u8); |
| 784 | data[ 7] = PremultipliedColorU8::from_rgba_unchecked(r[ 7] as u8, g[ 7] as u8, b[ 7] as u8, a[ 7] as u8); |
| 785 | data[ 8] = PremultipliedColorU8::from_rgba_unchecked(r[ 8] as u8, g[ 8] as u8, b[ 8] as u8, a[ 8] as u8); |
| 786 | data[ 9] = PremultipliedColorU8::from_rgba_unchecked(r[ 9] as u8, g[ 9] as u8, b[ 9] as u8, a[ 9] as u8); |
| 787 | data[10] = PremultipliedColorU8::from_rgba_unchecked(r[10] as u8, g[10] as u8, b[10] as u8, a[10] as u8); |
| 788 | data[11] = PremultipliedColorU8::from_rgba_unchecked(r[11] as u8, g[11] as u8, b[11] as u8, a[11] as u8); |
| 789 | data[12] = PremultipliedColorU8::from_rgba_unchecked(r[12] as u8, g[12] as u8, b[12] as u8, a[12] as u8); |
| 790 | data[13] = PremultipliedColorU8::from_rgba_unchecked(r[13] as u8, g[13] as u8, b[13] as u8, a[13] as u8); |
| 791 | data[14] = PremultipliedColorU8::from_rgba_unchecked(r[14] as u8, g[14] as u8, b[14] as u8, a[14] as u8); |
| 792 | data[15] = PremultipliedColorU8::from_rgba_unchecked(r[15] as u8, g[15] as u8, b[15] as u8, a[15] as u8); |
| 793 | } |
| 794 | |
| 795 | #[inline (always)] |
| 796 | fn store_8888_tail( |
| 797 | r: &u16x16, g: &u16x16, b: &u16x16, a: &u16x16, |
| 798 | tail: usize, data: &mut [PremultipliedColorU8], |
| 799 | ) { |
| 800 | let r: &[u16; 16] = r.as_slice(); |
| 801 | let g: &[u16; 16] = g.as_slice(); |
| 802 | let b: &[u16; 16] = b.as_slice(); |
| 803 | let a: &[u16; 16] = a.as_slice(); |
| 804 | |
| 805 | // This is better than `for i in 0..tail`, because this way the compiler |
| 806 | // knows that we have only 16 steps and slices access is guarantee to be valid. |
| 807 | // This removes bounds checking and a possible panic call. |
| 808 | for i: usize in 0..STAGE_WIDTH { |
| 809 | data[i] = PremultipliedColorU8::from_rgba_unchecked( |
| 810 | r[i] as u8, g[i] as u8, b[i] as u8, a[i] as u8, |
| 811 | ); |
| 812 | |
| 813 | if i + 1 == tail { |
| 814 | break; |
| 815 | } |
| 816 | } |
| 817 | } |
| 818 | |
| 819 | #[inline (always)] |
| 820 | fn load_8(data: &[u8; STAGE_WIDTH], a: &mut u16x16) { |
| 821 | *a = u16x16([ |
| 822 | data[ 0] as u16, data[ 1] as u16, data[ 2] as u16, data[ 3] as u16, |
| 823 | data[ 4] as u16, data[ 5] as u16, data[ 6] as u16, data[ 7] as u16, |
| 824 | data[ 8] as u16, data[ 9] as u16, data[10] as u16, data[11] as u16, |
| 825 | data[12] as u16, data[13] as u16, data[14] as u16, data[15] as u16, |
| 826 | ]); |
| 827 | } |
| 828 | |
| 829 | #[inline (always)] |
| 830 | fn div255(v: u16x16) -> u16x16 { |
| 831 | // Skia uses `vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8)` here when NEON is available, |
| 832 | // but it doesn't affect performance much and breaks reproducible result. Ignore it. |
| 833 | // NOTE: the compiler does not replace the devision with a shift. |
| 834 | (v + u16x16::splat(255)) >> u16x16::splat(8) // / u16x16::splat(256) |
| 835 | } |
| 836 | |
| 837 | #[inline (always)] |
| 838 | fn inv(v: u16x16) -> u16x16 { |
| 839 | u16x16::splat(255) - v |
| 840 | } |
| 841 | |
| 842 | #[inline (always)] |
| 843 | fn from_float(f: f32) -> u16x16 { |
| 844 | u16x16::splat((f * 255.0 + 0.5) as u16) |
| 845 | } |
| 846 | |
| 847 | #[inline (always)] |
| 848 | fn lerp(from: u16x16, to: u16x16, t: u16x16) -> u16x16 { |
| 849 | div255(from * inv(t) + to * t) |
| 850 | } |
| 851 | |
| 852 | #[inline (always)] |
| 853 | fn split(v: &f32x16, lo: &mut u16x16, hi: &mut u16x16) { |
| 854 | // We're splitting f32x16 (512bit) into two u16x16 (256 bit). |
| 855 | let data: [u8; 64] = bytemuck::cast(*v); |
| 856 | let d0: &mut [u8; 32] = bytemuck::cast_mut(&mut lo.0); |
| 857 | let d1: &mut [u8; 32] = bytemuck::cast_mut(&mut hi.0); |
| 858 | |
| 859 | d0.copy_from_slice(&data[0..32]); |
| 860 | d1.copy_from_slice(&data[32..64]); |
| 861 | } |
| 862 | |
| 863 | #[inline (always)] |
| 864 | fn join(lo: &u16x16, hi: &u16x16) -> f32x16 { |
| 865 | // We're joining two u16x16 (256 bit) into f32x16 (512bit). |
| 866 | |
| 867 | let d0: [u8; 32] = bytemuck::cast(lo.0); |
| 868 | let d1: [u8; 32] = bytemuck::cast(hi.0); |
| 869 | |
| 870 | let mut v: f32x16 = f32x16::default(); |
| 871 | let data: &mut [u8; 64] = bytemuck::cast_mut(&mut v); |
| 872 | |
| 873 | data[0..32].copy_from_slice(&d0); |
| 874 | data[32..64].copy_from_slice(&d1); |
| 875 | |
| 876 | v |
| 877 | } |
| 878 | |
| 879 | #[inline (always)] |
| 880 | fn mad(f: f32x16, m: f32x16, a: f32x16) -> f32x16 { |
| 881 | // NEON vmlaq_f32 doesn't seem to affect performance in any way. Ignore it. |
| 882 | f * m + a |
| 883 | } |
| 884 | |