| 1 | use core::fmt::Debug; |
| 2 | |
| 3 | use alloc::{ |
| 4 | boxed::Box, collections::BTreeMap, format, sync::Arc, vec, vec::Vec, |
| 5 | }; |
| 6 | |
| 7 | use crate::{ |
| 8 | packed::{ |
| 9 | ext::Pointer, |
| 10 | pattern::Patterns, |
| 11 | vector::{FatVector, Vector}, |
| 12 | }, |
| 13 | util::int::U32, |
| 14 | PatternID, |
| 15 | }; |
| 16 | |
| 17 | /// A match type specialized to the Teddy implementations below. |
| 18 | /// |
| 19 | /// Essentially, instead of representing a match at byte offsets, we use |
| 20 | /// raw pointers. This is because the implementations below operate on raw |
| 21 | /// pointers, and so this is a more natural return type based on how the |
| 22 | /// implementation works. |
| 23 | /// |
| 24 | /// Also, the `PatternID` used here is a `u16`. |
| 25 | #[derive (Clone, Copy, Debug)] |
| 26 | pub(crate) struct Match { |
| 27 | pid: PatternID, |
| 28 | start: *const u8, |
| 29 | end: *const u8, |
| 30 | } |
| 31 | |
| 32 | impl Match { |
| 33 | /// Returns the ID of the pattern that matched. |
| 34 | pub(crate) fn pattern(&self) -> PatternID { |
| 35 | self.pid |
| 36 | } |
| 37 | |
| 38 | /// Returns a pointer into the haystack at which the match starts. |
| 39 | pub(crate) fn start(&self) -> *const u8 { |
| 40 | self.start |
| 41 | } |
| 42 | |
| 43 | /// Returns a pointer into the haystack at which the match ends. |
| 44 | pub(crate) fn end(&self) -> *const u8 { |
| 45 | self.end |
| 46 | } |
| 47 | } |
| 48 | |
| 49 | /// A "slim" Teddy implementation that is generic over both the vector type |
| 50 | /// and the minimum length of the patterns being searched for. |
| 51 | /// |
| 52 | /// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. |
| 53 | #[derive (Clone, Debug)] |
| 54 | pub(crate) struct Slim<V, const BYTES: usize> { |
| 55 | /// A generic data structure for doing "slim" Teddy verification. |
| 56 | teddy: Teddy<8>, |
| 57 | /// The masks used as inputs to the shuffle operation to generate |
| 58 | /// candidates (which are fed into the verification routines). |
| 59 | masks: [Mask<V>; BYTES], |
| 60 | } |
| 61 | |
| 62 | impl<V: Vector, const BYTES: usize> Slim<V, BYTES> { |
| 63 | /// Create a new "slim" Teddy searcher for the given patterns. |
| 64 | /// |
| 65 | /// # Panics |
| 66 | /// |
| 67 | /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. |
| 68 | /// |
| 69 | /// # Safety |
| 70 | /// |
| 71 | /// Callers must ensure that this is okay to call in the current target for |
| 72 | /// the current CPU. |
| 73 | #[inline (always)] |
| 74 | pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Slim<V, BYTES> { |
| 75 | assert!( |
| 76 | 1 <= BYTES && BYTES <= 4, |
| 77 | "only 1, 2, 3 or 4 bytes are supported" |
| 78 | ); |
| 79 | let teddy = Teddy::new(patterns); |
| 80 | let masks = SlimMaskBuilder::from_teddy(&teddy); |
| 81 | Slim { teddy, masks } |
| 82 | } |
| 83 | |
| 84 | /// Returns the approximate total amount of heap used by this type, in |
| 85 | /// units of bytes. |
| 86 | #[inline (always)] |
| 87 | pub(crate) fn memory_usage(&self) -> usize { |
| 88 | self.teddy.memory_usage() |
| 89 | } |
| 90 | |
| 91 | /// Returns the minimum length, in bytes, that a haystack must be in order |
| 92 | /// to use it with this searcher. |
| 93 | #[inline (always)] |
| 94 | pub(crate) fn minimum_len(&self) -> usize { |
| 95 | V::BYTES + (BYTES - 1) |
| 96 | } |
| 97 | } |
| 98 | |
| 99 | impl<V: Vector> Slim<V, 1> { |
| 100 | /// Look for an occurrences of the patterns in this finder in the haystack |
| 101 | /// given by the `start` and `end` pointers. |
| 102 | /// |
| 103 | /// If no match could be found, then `None` is returned. |
| 104 | /// |
| 105 | /// # Safety |
| 106 | /// |
| 107 | /// The given pointers representing the haystack must be valid to read |
| 108 | /// from. They must also point to a region of memory that is at least the |
| 109 | /// minimum length required by this searcher. |
| 110 | /// |
| 111 | /// Callers must ensure that this is okay to call in the current target for |
| 112 | /// the current CPU. |
| 113 | #[inline (always)] |
| 114 | pub(crate) unsafe fn find( |
| 115 | &self, |
| 116 | start: *const u8, |
| 117 | end: *const u8, |
| 118 | ) -> Option<Match> { |
| 119 | let len = end.distance(start); |
| 120 | debug_assert!(len >= self.minimum_len()); |
| 121 | let mut cur = start; |
| 122 | while cur <= end.sub(V::BYTES) { |
| 123 | if let Some(m) = self.find_one(cur, end) { |
| 124 | return Some(m); |
| 125 | } |
| 126 | cur = cur.add(V::BYTES); |
| 127 | } |
| 128 | if cur < end { |
| 129 | cur = end.sub(V::BYTES); |
| 130 | if let Some(m) = self.find_one(cur, end) { |
| 131 | return Some(m); |
| 132 | } |
| 133 | } |
| 134 | None |
| 135 | } |
| 136 | |
| 137 | /// Look for a match starting at the `V::BYTES` at and after `cur`. If |
| 138 | /// there isn't one, then `None` is returned. |
| 139 | /// |
| 140 | /// # Safety |
| 141 | /// |
| 142 | /// The given pointers representing the haystack must be valid to read |
| 143 | /// from. They must also point to a region of memory that is at least the |
| 144 | /// minimum length required by this searcher. |
| 145 | /// |
| 146 | /// Callers must ensure that this is okay to call in the current target for |
| 147 | /// the current CPU. |
| 148 | #[inline (always)] |
| 149 | unsafe fn find_one( |
| 150 | &self, |
| 151 | cur: *const u8, |
| 152 | end: *const u8, |
| 153 | ) -> Option<Match> { |
| 154 | let c = self.candidate(cur); |
| 155 | if !c.is_zero() { |
| 156 | if let Some(m) = self.teddy.verify(cur, end, c) { |
| 157 | return Some(m); |
| 158 | } |
| 159 | } |
| 160 | None |
| 161 | } |
| 162 | |
| 163 | /// Look for a candidate match (represented as a vector) starting at the |
| 164 | /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with |
| 165 | /// all bits set to zero is returned. |
| 166 | /// |
| 167 | /// # Safety |
| 168 | /// |
| 169 | /// The given pointer representing the haystack must be valid to read |
| 170 | /// from. |
| 171 | /// |
| 172 | /// Callers must ensure that this is okay to call in the current target for |
| 173 | /// the current CPU. |
| 174 | #[inline (always)] |
| 175 | unsafe fn candidate(&self, cur: *const u8) -> V { |
| 176 | let chunk = V::load_unaligned(cur); |
| 177 | Mask::members1(chunk, self.masks) |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | impl<V: Vector> Slim<V, 2> { |
| 182 | /// See Slim<V, 1>::find. |
| 183 | #[inline (always)] |
| 184 | pub(crate) unsafe fn find( |
| 185 | &self, |
| 186 | start: *const u8, |
| 187 | end: *const u8, |
| 188 | ) -> Option<Match> { |
| 189 | let len = end.distance(start); |
| 190 | debug_assert!(len >= self.minimum_len()); |
| 191 | let mut cur = start.add(1); |
| 192 | let mut prev0 = V::splat(0xFF); |
| 193 | while cur <= end.sub(V::BYTES) { |
| 194 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
| 195 | return Some(m); |
| 196 | } |
| 197 | cur = cur.add(V::BYTES); |
| 198 | } |
| 199 | if cur < end { |
| 200 | cur = end.sub(V::BYTES); |
| 201 | prev0 = V::splat(0xFF); |
| 202 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
| 203 | return Some(m); |
| 204 | } |
| 205 | } |
| 206 | None |
| 207 | } |
| 208 | |
| 209 | /// See Slim<V, 1>::find_one. |
| 210 | #[inline (always)] |
| 211 | unsafe fn find_one( |
| 212 | &self, |
| 213 | cur: *const u8, |
| 214 | end: *const u8, |
| 215 | prev0: &mut V, |
| 216 | ) -> Option<Match> { |
| 217 | let c = self.candidate(cur, prev0); |
| 218 | if !c.is_zero() { |
| 219 | if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { |
| 220 | return Some(m); |
| 221 | } |
| 222 | } |
| 223 | None |
| 224 | } |
| 225 | |
| 226 | /// See Slim<V, 1>::candidate. |
| 227 | #[inline (always)] |
| 228 | unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { |
| 229 | let chunk = V::load_unaligned(cur); |
| 230 | let (res0, res1) = Mask::members2(chunk, self.masks); |
| 231 | let res0prev0 = res0.shift_in_one_byte(*prev0); |
| 232 | let res = res0prev0.and(res1); |
| 233 | *prev0 = res0; |
| 234 | res |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | impl<V: Vector> Slim<V, 3> { |
| 239 | /// See Slim<V, 1>::find. |
| 240 | #[inline (always)] |
| 241 | pub(crate) unsafe fn find( |
| 242 | &self, |
| 243 | start: *const u8, |
| 244 | end: *const u8, |
| 245 | ) -> Option<Match> { |
| 246 | let len = end.distance(start); |
| 247 | debug_assert!(len >= self.minimum_len()); |
| 248 | let mut cur = start.add(2); |
| 249 | let mut prev0 = V::splat(0xFF); |
| 250 | let mut prev1 = V::splat(0xFF); |
| 251 | while cur <= end.sub(V::BYTES) { |
| 252 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
| 253 | return Some(m); |
| 254 | } |
| 255 | cur = cur.add(V::BYTES); |
| 256 | } |
| 257 | if cur < end { |
| 258 | cur = end.sub(V::BYTES); |
| 259 | prev0 = V::splat(0xFF); |
| 260 | prev1 = V::splat(0xFF); |
| 261 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
| 262 | return Some(m); |
| 263 | } |
| 264 | } |
| 265 | None |
| 266 | } |
| 267 | |
| 268 | /// See Slim<V, 1>::find_one. |
| 269 | #[inline (always)] |
| 270 | unsafe fn find_one( |
| 271 | &self, |
| 272 | cur: *const u8, |
| 273 | end: *const u8, |
| 274 | prev0: &mut V, |
| 275 | prev1: &mut V, |
| 276 | ) -> Option<Match> { |
| 277 | let c = self.candidate(cur, prev0, prev1); |
| 278 | if !c.is_zero() { |
| 279 | if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { |
| 280 | return Some(m); |
| 281 | } |
| 282 | } |
| 283 | None |
| 284 | } |
| 285 | |
| 286 | /// See Slim<V, 1>::candidate. |
| 287 | #[inline (always)] |
| 288 | unsafe fn candidate( |
| 289 | &self, |
| 290 | cur: *const u8, |
| 291 | prev0: &mut V, |
| 292 | prev1: &mut V, |
| 293 | ) -> V { |
| 294 | let chunk = V::load_unaligned(cur); |
| 295 | let (res0, res1, res2) = Mask::members3(chunk, self.masks); |
| 296 | let res0prev0 = res0.shift_in_two_bytes(*prev0); |
| 297 | let res1prev1 = res1.shift_in_one_byte(*prev1); |
| 298 | let res = res0prev0.and(res1prev1).and(res2); |
| 299 | *prev0 = res0; |
| 300 | *prev1 = res1; |
| 301 | res |
| 302 | } |
| 303 | } |
| 304 | |
| 305 | impl<V: Vector> Slim<V, 4> { |
| 306 | /// See Slim<V, 1>::find. |
| 307 | #[inline (always)] |
| 308 | pub(crate) unsafe fn find( |
| 309 | &self, |
| 310 | start: *const u8, |
| 311 | end: *const u8, |
| 312 | ) -> Option<Match> { |
| 313 | let len = end.distance(start); |
| 314 | debug_assert!(len >= self.minimum_len()); |
| 315 | let mut cur = start.add(3); |
| 316 | let mut prev0 = V::splat(0xFF); |
| 317 | let mut prev1 = V::splat(0xFF); |
| 318 | let mut prev2 = V::splat(0xFF); |
| 319 | while cur <= end.sub(V::BYTES) { |
| 320 | if let Some(m) = |
| 321 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
| 322 | { |
| 323 | return Some(m); |
| 324 | } |
| 325 | cur = cur.add(V::BYTES); |
| 326 | } |
| 327 | if cur < end { |
| 328 | cur = end.sub(V::BYTES); |
| 329 | prev0 = V::splat(0xFF); |
| 330 | prev1 = V::splat(0xFF); |
| 331 | prev2 = V::splat(0xFF); |
| 332 | if let Some(m) = |
| 333 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
| 334 | { |
| 335 | return Some(m); |
| 336 | } |
| 337 | } |
| 338 | None |
| 339 | } |
| 340 | |
| 341 | /// See Slim<V, 1>::find_one. |
| 342 | #[inline (always)] |
| 343 | unsafe fn find_one( |
| 344 | &self, |
| 345 | cur: *const u8, |
| 346 | end: *const u8, |
| 347 | prev0: &mut V, |
| 348 | prev1: &mut V, |
| 349 | prev2: &mut V, |
| 350 | ) -> Option<Match> { |
| 351 | let c = self.candidate(cur, prev0, prev1, prev2); |
| 352 | if !c.is_zero() { |
| 353 | if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { |
| 354 | return Some(m); |
| 355 | } |
| 356 | } |
| 357 | None |
| 358 | } |
| 359 | |
| 360 | /// See Slim<V, 1>::candidate. |
| 361 | #[inline (always)] |
| 362 | unsafe fn candidate( |
| 363 | &self, |
| 364 | cur: *const u8, |
| 365 | prev0: &mut V, |
| 366 | prev1: &mut V, |
| 367 | prev2: &mut V, |
| 368 | ) -> V { |
| 369 | let chunk = V::load_unaligned(cur); |
| 370 | let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); |
| 371 | let res0prev0 = res0.shift_in_three_bytes(*prev0); |
| 372 | let res1prev1 = res1.shift_in_two_bytes(*prev1); |
| 373 | let res2prev2 = res2.shift_in_one_byte(*prev2); |
| 374 | let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); |
| 375 | *prev0 = res0; |
| 376 | *prev1 = res1; |
| 377 | *prev2 = res2; |
| 378 | res |
| 379 | } |
| 380 | } |
| 381 | |
| 382 | /// A "fat" Teddy implementation that is generic over both the vector type |
| 383 | /// and the minimum length of the patterns being searched for. |
| 384 | /// |
| 385 | /// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. |
| 386 | #[derive (Clone, Debug)] |
| 387 | pub(crate) struct Fat<V, const BYTES: usize> { |
| 388 | /// A generic data structure for doing "fat" Teddy verification. |
| 389 | teddy: Teddy<16>, |
| 390 | /// The masks used as inputs to the shuffle operation to generate |
| 391 | /// candidates (which are fed into the verification routines). |
| 392 | masks: [Mask<V>; BYTES], |
| 393 | } |
| 394 | |
| 395 | impl<V: FatVector, const BYTES: usize> Fat<V, BYTES> { |
| 396 | /// Create a new "fat" Teddy searcher for the given patterns. |
| 397 | /// |
| 398 | /// # Panics |
| 399 | /// |
| 400 | /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. |
| 401 | /// |
| 402 | /// # Safety |
| 403 | /// |
| 404 | /// Callers must ensure that this is okay to call in the current target for |
| 405 | /// the current CPU. |
| 406 | #[inline (always)] |
| 407 | pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Fat<V, BYTES> { |
| 408 | assert!( |
| 409 | 1 <= BYTES && BYTES <= 4, |
| 410 | "only 1, 2, 3 or 4 bytes are supported" |
| 411 | ); |
| 412 | let teddy = Teddy::new(patterns); |
| 413 | let masks = FatMaskBuilder::from_teddy(&teddy); |
| 414 | Fat { teddy, masks } |
| 415 | } |
| 416 | |
| 417 | /// Returns the approximate total amount of heap used by this type, in |
| 418 | /// units of bytes. |
| 419 | #[inline (always)] |
| 420 | pub(crate) fn memory_usage(&self) -> usize { |
| 421 | self.teddy.memory_usage() |
| 422 | } |
| 423 | |
| 424 | /// Returns the minimum length, in bytes, that a haystack must be in order |
| 425 | /// to use it with this searcher. |
| 426 | #[inline (always)] |
| 427 | pub(crate) fn minimum_len(&self) -> usize { |
| 428 | V::Half::BYTES + (BYTES - 1) |
| 429 | } |
| 430 | } |
| 431 | |
| 432 | impl<V: FatVector> Fat<V, 1> { |
| 433 | /// Look for an occurrences of the patterns in this finder in the haystack |
| 434 | /// given by the `start` and `end` pointers. |
| 435 | /// |
| 436 | /// If no match could be found, then `None` is returned. |
| 437 | /// |
| 438 | /// # Safety |
| 439 | /// |
| 440 | /// The given pointers representing the haystack must be valid to read |
| 441 | /// from. They must also point to a region of memory that is at least the |
| 442 | /// minimum length required by this searcher. |
| 443 | /// |
| 444 | /// Callers must ensure that this is okay to call in the current target for |
| 445 | /// the current CPU. |
| 446 | #[inline (always)] |
| 447 | pub(crate) unsafe fn find( |
| 448 | &self, |
| 449 | start: *const u8, |
| 450 | end: *const u8, |
| 451 | ) -> Option<Match> { |
| 452 | let len = end.distance(start); |
| 453 | debug_assert!(len >= self.minimum_len()); |
| 454 | let mut cur = start; |
| 455 | while cur <= end.sub(V::Half::BYTES) { |
| 456 | if let Some(m) = self.find_one(cur, end) { |
| 457 | return Some(m); |
| 458 | } |
| 459 | cur = cur.add(V::Half::BYTES); |
| 460 | } |
| 461 | if cur < end { |
| 462 | cur = end.sub(V::Half::BYTES); |
| 463 | if let Some(m) = self.find_one(cur, end) { |
| 464 | return Some(m); |
| 465 | } |
| 466 | } |
| 467 | None |
| 468 | } |
| 469 | |
| 470 | /// Look for a match starting at the `V::BYTES` at and after `cur`. If |
| 471 | /// there isn't one, then `None` is returned. |
| 472 | /// |
| 473 | /// # Safety |
| 474 | /// |
| 475 | /// The given pointers representing the haystack must be valid to read |
| 476 | /// from. They must also point to a region of memory that is at least the |
| 477 | /// minimum length required by this searcher. |
| 478 | /// |
| 479 | /// Callers must ensure that this is okay to call in the current target for |
| 480 | /// the current CPU. |
| 481 | #[inline (always)] |
| 482 | unsafe fn find_one( |
| 483 | &self, |
| 484 | cur: *const u8, |
| 485 | end: *const u8, |
| 486 | ) -> Option<Match> { |
| 487 | let c = self.candidate(cur); |
| 488 | if !c.is_zero() { |
| 489 | if let Some(m) = self.teddy.verify(cur, end, c) { |
| 490 | return Some(m); |
| 491 | } |
| 492 | } |
| 493 | None |
| 494 | } |
| 495 | |
| 496 | /// Look for a candidate match (represented as a vector) starting at the |
| 497 | /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with |
| 498 | /// all bits set to zero is returned. |
| 499 | /// |
| 500 | /// # Safety |
| 501 | /// |
| 502 | /// The given pointer representing the haystack must be valid to read |
| 503 | /// from. |
| 504 | /// |
| 505 | /// Callers must ensure that this is okay to call in the current target for |
| 506 | /// the current CPU. |
| 507 | #[inline (always)] |
| 508 | unsafe fn candidate(&self, cur: *const u8) -> V { |
| 509 | let chunk = V::load_half_unaligned(cur); |
| 510 | Mask::members1(chunk, self.masks) |
| 511 | } |
| 512 | } |
| 513 | |
| 514 | impl<V: FatVector> Fat<V, 2> { |
| 515 | /// See `Fat<V, 1>::find`. |
| 516 | #[inline (always)] |
| 517 | pub(crate) unsafe fn find( |
| 518 | &self, |
| 519 | start: *const u8, |
| 520 | end: *const u8, |
| 521 | ) -> Option<Match> { |
| 522 | let len = end.distance(start); |
| 523 | debug_assert!(len >= self.minimum_len()); |
| 524 | let mut cur = start.add(1); |
| 525 | let mut prev0 = V::splat(0xFF); |
| 526 | while cur <= end.sub(V::Half::BYTES) { |
| 527 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
| 528 | return Some(m); |
| 529 | } |
| 530 | cur = cur.add(V::Half::BYTES); |
| 531 | } |
| 532 | if cur < end { |
| 533 | cur = end.sub(V::Half::BYTES); |
| 534 | prev0 = V::splat(0xFF); |
| 535 | if let Some(m) = self.find_one(cur, end, &mut prev0) { |
| 536 | return Some(m); |
| 537 | } |
| 538 | } |
| 539 | None |
| 540 | } |
| 541 | |
| 542 | /// See `Fat<V, 1>::find_one`. |
| 543 | #[inline (always)] |
| 544 | unsafe fn find_one( |
| 545 | &self, |
| 546 | cur: *const u8, |
| 547 | end: *const u8, |
| 548 | prev0: &mut V, |
| 549 | ) -> Option<Match> { |
| 550 | let c = self.candidate(cur, prev0); |
| 551 | if !c.is_zero() { |
| 552 | if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { |
| 553 | return Some(m); |
| 554 | } |
| 555 | } |
| 556 | None |
| 557 | } |
| 558 | |
| 559 | /// See `Fat<V, 1>::candidate`. |
| 560 | #[inline (always)] |
| 561 | unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { |
| 562 | let chunk = V::load_half_unaligned(cur); |
| 563 | let (res0, res1) = Mask::members2(chunk, self.masks); |
| 564 | let res0prev0 = res0.half_shift_in_one_byte(*prev0); |
| 565 | let res = res0prev0.and(res1); |
| 566 | *prev0 = res0; |
| 567 | res |
| 568 | } |
| 569 | } |
| 570 | |
| 571 | impl<V: FatVector> Fat<V, 3> { |
| 572 | /// See `Fat<V, 1>::find`. |
| 573 | #[inline (always)] |
| 574 | pub(crate) unsafe fn find( |
| 575 | &self, |
| 576 | start: *const u8, |
| 577 | end: *const u8, |
| 578 | ) -> Option<Match> { |
| 579 | let len = end.distance(start); |
| 580 | debug_assert!(len >= self.minimum_len()); |
| 581 | let mut cur = start.add(2); |
| 582 | let mut prev0 = V::splat(0xFF); |
| 583 | let mut prev1 = V::splat(0xFF); |
| 584 | while cur <= end.sub(V::Half::BYTES) { |
| 585 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
| 586 | return Some(m); |
| 587 | } |
| 588 | cur = cur.add(V::Half::BYTES); |
| 589 | } |
| 590 | if cur < end { |
| 591 | cur = end.sub(V::Half::BYTES); |
| 592 | prev0 = V::splat(0xFF); |
| 593 | prev1 = V::splat(0xFF); |
| 594 | if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { |
| 595 | return Some(m); |
| 596 | } |
| 597 | } |
| 598 | None |
| 599 | } |
| 600 | |
| 601 | /// See `Fat<V, 1>::find_one`. |
| 602 | #[inline (always)] |
| 603 | unsafe fn find_one( |
| 604 | &self, |
| 605 | cur: *const u8, |
| 606 | end: *const u8, |
| 607 | prev0: &mut V, |
| 608 | prev1: &mut V, |
| 609 | ) -> Option<Match> { |
| 610 | let c = self.candidate(cur, prev0, prev1); |
| 611 | if !c.is_zero() { |
| 612 | if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { |
| 613 | return Some(m); |
| 614 | } |
| 615 | } |
| 616 | None |
| 617 | } |
| 618 | |
| 619 | /// See `Fat<V, 1>::candidate`. |
| 620 | #[inline (always)] |
| 621 | unsafe fn candidate( |
| 622 | &self, |
| 623 | cur: *const u8, |
| 624 | prev0: &mut V, |
| 625 | prev1: &mut V, |
| 626 | ) -> V { |
| 627 | let chunk = V::load_half_unaligned(cur); |
| 628 | let (res0, res1, res2) = Mask::members3(chunk, self.masks); |
| 629 | let res0prev0 = res0.half_shift_in_two_bytes(*prev0); |
| 630 | let res1prev1 = res1.half_shift_in_one_byte(*prev1); |
| 631 | let res = res0prev0.and(res1prev1).and(res2); |
| 632 | *prev0 = res0; |
| 633 | *prev1 = res1; |
| 634 | res |
| 635 | } |
| 636 | } |
| 637 | |
| 638 | impl<V: FatVector> Fat<V, 4> { |
| 639 | /// See `Fat<V, 1>::find`. |
| 640 | #[inline (always)] |
| 641 | pub(crate) unsafe fn find( |
| 642 | &self, |
| 643 | start: *const u8, |
| 644 | end: *const u8, |
| 645 | ) -> Option<Match> { |
| 646 | let len = end.distance(start); |
| 647 | debug_assert!(len >= self.minimum_len()); |
| 648 | let mut cur = start.add(3); |
| 649 | let mut prev0 = V::splat(0xFF); |
| 650 | let mut prev1 = V::splat(0xFF); |
| 651 | let mut prev2 = V::splat(0xFF); |
| 652 | while cur <= end.sub(V::Half::BYTES) { |
| 653 | if let Some(m) = |
| 654 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
| 655 | { |
| 656 | return Some(m); |
| 657 | } |
| 658 | cur = cur.add(V::Half::BYTES); |
| 659 | } |
| 660 | if cur < end { |
| 661 | cur = end.sub(V::Half::BYTES); |
| 662 | prev0 = V::splat(0xFF); |
| 663 | prev1 = V::splat(0xFF); |
| 664 | prev2 = V::splat(0xFF); |
| 665 | if let Some(m) = |
| 666 | self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) |
| 667 | { |
| 668 | return Some(m); |
| 669 | } |
| 670 | } |
| 671 | None |
| 672 | } |
| 673 | |
| 674 | /// See `Fat<V, 1>::find_one`. |
| 675 | #[inline (always)] |
| 676 | unsafe fn find_one( |
| 677 | &self, |
| 678 | cur: *const u8, |
| 679 | end: *const u8, |
| 680 | prev0: &mut V, |
| 681 | prev1: &mut V, |
| 682 | prev2: &mut V, |
| 683 | ) -> Option<Match> { |
| 684 | let c = self.candidate(cur, prev0, prev1, prev2); |
| 685 | if !c.is_zero() { |
| 686 | if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { |
| 687 | return Some(m); |
| 688 | } |
| 689 | } |
| 690 | None |
| 691 | } |
| 692 | |
| 693 | /// See `Fat<V, 1>::candidate`. |
| 694 | #[inline (always)] |
| 695 | unsafe fn candidate( |
| 696 | &self, |
| 697 | cur: *const u8, |
| 698 | prev0: &mut V, |
| 699 | prev1: &mut V, |
| 700 | prev2: &mut V, |
| 701 | ) -> V { |
| 702 | let chunk = V::load_half_unaligned(cur); |
| 703 | let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); |
| 704 | let res0prev0 = res0.half_shift_in_three_bytes(*prev0); |
| 705 | let res1prev1 = res1.half_shift_in_two_bytes(*prev1); |
| 706 | let res2prev2 = res2.half_shift_in_one_byte(*prev2); |
| 707 | let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); |
| 708 | *prev0 = res0; |
| 709 | *prev1 = res1; |
| 710 | *prev2 = res2; |
| 711 | res |
| 712 | } |
| 713 | } |
| 714 | |
| 715 | /// The common elements of all "slim" and "fat" Teddy search implementations. |
| 716 | /// |
| 717 | /// Essentially, this contains the patterns and the buckets. Namely, it |
| 718 | /// contains enough to implement the verification step after candidates are |
| 719 | /// identified via the shuffle masks. |
| 720 | /// |
| 721 | /// It is generic over the number of buckets used. In general, the number of |
| 722 | /// buckets is either 8 (for "slim" Teddy) or 16 (for "fat" Teddy). The generic |
| 723 | /// parameter isn't really meant to be instantiated for any value other than |
| 724 | /// 8 or 16, although it is technically possible. The main hiccup is that there |
| 725 | /// is some bit-shifting done in the critical part of verification that could |
| 726 | /// be quite expensive if `N` is not a multiple of 2. |
| 727 | #[derive (Clone, Debug)] |
| 728 | struct Teddy<const BUCKETS: usize> { |
| 729 | /// The patterns we are searching for. |
| 730 | /// |
| 731 | /// A pattern string can be found by its `PatternID`. |
| 732 | patterns: Arc<Patterns>, |
| 733 | /// The allocation of patterns in buckets. This only contains the IDs of |
| 734 | /// patterns. In order to do full verification, callers must provide the |
| 735 | /// actual patterns when using Teddy. |
| 736 | buckets: [Vec<PatternID>; BUCKETS], |
| 737 | // N.B. The above representation is very simple, but it definitely results |
| 738 | // in ping-ponging between different allocations during verification. I've |
| 739 | // tried experimenting with other representations that flatten the pattern |
| 740 | // strings into a single allocation, but it doesn't seem to help much. |
| 741 | // Probably everything is small enough to fit into cache anyway, and so the |
| 742 | // pointer chasing isn't a big deal? |
| 743 | // |
| 744 | // One other avenue I haven't explored is some kind of hashing trick |
| 745 | // that let's us do another high-confidence check before launching into |
| 746 | // `memcmp`. |
| 747 | } |
| 748 | |
| 749 | impl<const BUCKETS: usize> Teddy<BUCKETS> { |
| 750 | /// Create a new generic data structure for Teddy verification. |
| 751 | fn new(patterns: Arc<Patterns>) -> Teddy<BUCKETS> { |
| 752 | assert_ne!(0, patterns.len(), "Teddy requires at least one pattern" ); |
| 753 | assert_ne!( |
| 754 | 0, |
| 755 | patterns.minimum_len(), |
| 756 | "Teddy does not support zero-length patterns" |
| 757 | ); |
| 758 | assert!( |
| 759 | BUCKETS == 8 || BUCKETS == 16, |
| 760 | "Teddy only supports 8 or 16 buckets" |
| 761 | ); |
| 762 | // MSRV(1.63): Use core::array::from_fn below instead of allocating a |
| 763 | // superfluous outer Vec. Not a big deal (especially given the BTreeMap |
| 764 | // allocation below), but nice to not do it. |
| 765 | let buckets = |
| 766 | <[Vec<PatternID>; BUCKETS]>::try_from(vec![vec![]; BUCKETS]) |
| 767 | .unwrap(); |
| 768 | let mut t = Teddy { patterns, buckets }; |
| 769 | |
| 770 | let mut map: BTreeMap<Box<[u8]>, usize> = BTreeMap::new(); |
| 771 | for (id, pattern) in t.patterns.iter() { |
| 772 | // We try to be slightly clever in how we assign patterns into |
| 773 | // buckets. Generally speaking, we want patterns with the same |
| 774 | // prefix to be in the same bucket, since it minimizes the amount |
| 775 | // of time we spend churning through buckets in the verification |
| 776 | // step. |
| 777 | // |
| 778 | // So we could assign patterns with the same N-prefix (where N is |
| 779 | // the size of the mask, which is one of {1, 2, 3}) to the same |
| 780 | // bucket. However, case insensitive searches are fairly common, so |
| 781 | // we'd for example, ideally want to treat `abc` and `ABC` as if |
| 782 | // they shared the same prefix. ASCII has the nice property that |
| 783 | // the lower 4 bits of A and a are the same, so we therefore group |
| 784 | // patterns with the same low-nybble-N-prefix into the same bucket. |
| 785 | // |
| 786 | // MOREOVER, this is actually necessary for correctness! In |
| 787 | // particular, by grouping patterns with the same prefix into the |
| 788 | // same bucket, we ensure that we preserve correct leftmost-first |
| 789 | // and leftmost-longest match semantics. In addition to the fact |
| 790 | // that `patterns.iter()` iterates in the correct order, this |
| 791 | // guarantees that all possible ambiguous matches will occur in |
| 792 | // the same bucket. The verification routine could be adjusted to |
| 793 | // support correct leftmost match semantics regardless of bucket |
| 794 | // allocation, but that results in a performance hit. It's much |
| 795 | // nicer to be able to just stop as soon as a match is found. |
| 796 | let lonybs = pattern.low_nybbles(t.mask_len()); |
| 797 | if let Some(&bucket) = map.get(&lonybs) { |
| 798 | t.buckets[bucket].push(id); |
| 799 | } else { |
| 800 | // N.B. We assign buckets in reverse because it shouldn't have |
| 801 | // any influence on performance, but it does make it harder to |
| 802 | // get leftmost match semantics accidentally correct. |
| 803 | let bucket = (BUCKETS - 1) - (id.as_usize() % BUCKETS); |
| 804 | t.buckets[bucket].push(id); |
| 805 | map.insert(lonybs, bucket); |
| 806 | } |
| 807 | } |
| 808 | t |
| 809 | } |
| 810 | |
| 811 | /// Verify whether there are any matches starting at or after `cur` in the |
| 812 | /// haystack. The candidate chunk given should correspond to 8-bit bitsets |
| 813 | /// for N buckets. |
| 814 | /// |
| 815 | /// # Safety |
| 816 | /// |
| 817 | /// The given pointers representing the haystack must be valid to read |
| 818 | /// from. |
| 819 | #[inline (always)] |
| 820 | unsafe fn verify64( |
| 821 | &self, |
| 822 | cur: *const u8, |
| 823 | end: *const u8, |
| 824 | mut candidate_chunk: u64, |
| 825 | ) -> Option<Match> { |
| 826 | while candidate_chunk != 0 { |
| 827 | let bit = candidate_chunk.trailing_zeros().as_usize(); |
| 828 | candidate_chunk &= !(1 << bit); |
| 829 | |
| 830 | let cur = cur.add(bit / BUCKETS); |
| 831 | let bucket = bit % BUCKETS; |
| 832 | if let Some(m) = self.verify_bucket(cur, end, bucket) { |
| 833 | return Some(m); |
| 834 | } |
| 835 | } |
| 836 | None |
| 837 | } |
| 838 | |
| 839 | /// Verify whether there are any matches starting at `at` in the given |
| 840 | /// `haystack` corresponding only to patterns in the given bucket. |
| 841 | /// |
| 842 | /// # Safety |
| 843 | /// |
| 844 | /// The given pointers representing the haystack must be valid to read |
| 845 | /// from. |
| 846 | /// |
| 847 | /// The bucket index must be less than or equal to `self.buckets.len()`. |
| 848 | #[inline (always)] |
| 849 | unsafe fn verify_bucket( |
| 850 | &self, |
| 851 | cur: *const u8, |
| 852 | end: *const u8, |
| 853 | bucket: usize, |
| 854 | ) -> Option<Match> { |
| 855 | debug_assert!(bucket < self.buckets.len()); |
| 856 | // SAFETY: The caller must ensure that the bucket index is correct. |
| 857 | for pid in self.buckets.get_unchecked(bucket).iter().copied() { |
| 858 | // SAFETY: This is safe because we are guaranteed that every |
| 859 | // index in a Teddy bucket is a valid index into `pats`, by |
| 860 | // construction. |
| 861 | debug_assert!(pid.as_usize() < self.patterns.len()); |
| 862 | let pat = self.patterns.get_unchecked(pid); |
| 863 | if pat.is_prefix_raw(cur, end) { |
| 864 | let start = cur; |
| 865 | let end = start.add(pat.len()); |
| 866 | return Some(Match { pid, start, end }); |
| 867 | } |
| 868 | } |
| 869 | None |
| 870 | } |
| 871 | |
| 872 | /// Returns the total number of masks required by the patterns in this |
| 873 | /// Teddy searcher. |
| 874 | /// |
| 875 | /// Basically, the mask length corresponds to the type of Teddy searcher |
| 876 | /// to use: a 1-byte, 2-byte, 3-byte or 4-byte searcher. The bigger the |
| 877 | /// better, typically, since searching for longer substrings usually |
| 878 | /// decreases the rate of false positives. Therefore, the number of masks |
| 879 | /// needed is the length of the shortest pattern in this searcher. If the |
| 880 | /// length of the shortest pattern (in bytes) is bigger than 4, then the |
| 881 | /// mask length is 4 since there are no Teddy searchers for more than 4 |
| 882 | /// bytes. |
| 883 | fn mask_len(&self) -> usize { |
| 884 | core::cmp::min(4, self.patterns.minimum_len()) |
| 885 | } |
| 886 | |
| 887 | /// Returns the approximate total amount of heap used by this type, in |
| 888 | /// units of bytes. |
| 889 | fn memory_usage(&self) -> usize { |
| 890 | // This is an upper bound rather than a precise accounting. No |
| 891 | // particular reason, other than it's probably very close to actual |
| 892 | // memory usage in practice. |
| 893 | self.patterns.len() * core::mem::size_of::<PatternID>() |
| 894 | } |
| 895 | } |
| 896 | |
| 897 | impl Teddy<8> { |
| 898 | /// Runs the verification routine for "slim" Teddy. |
| 899 | /// |
| 900 | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
| 901 | /// per lane), where the ith bit is set in the jth lane if and only if the |
| 902 | /// byte occurring at `at + j` in `cur` is in the bucket `i`. |
| 903 | /// |
| 904 | /// # Safety |
| 905 | /// |
| 906 | /// Callers must ensure that this is okay to call in the current target for |
| 907 | /// the current CPU. |
| 908 | /// |
| 909 | /// The given pointers must be valid to read from. |
| 910 | #[inline (always)] |
| 911 | unsafe fn verify<V: Vector>( |
| 912 | &self, |
| 913 | mut cur: *const u8, |
| 914 | end: *const u8, |
| 915 | candidate: V, |
| 916 | ) -> Option<Match> { |
| 917 | debug_assert!(!candidate.is_zero()); |
| 918 | // Convert the candidate into 64-bit chunks, and then verify each of |
| 919 | // those chunks. |
| 920 | candidate.for_each_64bit_lane( |
| 921 | #[inline (always)] |
| 922 | |_, chunk| { |
| 923 | let result = self.verify64(cur, end, chunk); |
| 924 | cur = cur.add(8); |
| 925 | result |
| 926 | }, |
| 927 | ) |
| 928 | } |
| 929 | } |
| 930 | |
| 931 | impl Teddy<16> { |
| 932 | /// Runs the verification routine for "fat" Teddy. |
| 933 | /// |
| 934 | /// The candidate given should be a collection of 8-bit bitsets (one bitset |
| 935 | /// per lane), where the ith bit is set in the jth lane if and only if the |
| 936 | /// byte occurring at `at + (j < 16 ? j : j - 16)` in `cur` is in the |
| 937 | /// bucket `j < 16 ? i : i + 8`. |
| 938 | /// |
| 939 | /// # Safety |
| 940 | /// |
| 941 | /// Callers must ensure that this is okay to call in the current target for |
| 942 | /// the current CPU. |
| 943 | /// |
| 944 | /// The given pointers must be valid to read from. |
| 945 | #[inline (always)] |
| 946 | unsafe fn verify<V: FatVector>( |
| 947 | &self, |
| 948 | mut cur: *const u8, |
| 949 | end: *const u8, |
| 950 | candidate: V, |
| 951 | ) -> Option<Match> { |
| 952 | // This is a bit tricky, but we basically want to convert our |
| 953 | // candidate, which looks like this (assuming a 256-bit vector): |
| 954 | // |
| 955 | // a31 a30 ... a17 a16 a15 a14 ... a01 a00 |
| 956 | // |
| 957 | // where each a(i) is an 8-bit bitset corresponding to the activated |
| 958 | // buckets, to this |
| 959 | // |
| 960 | // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00 |
| 961 | // |
| 962 | // Namely, for Fat Teddy, the high 128-bits of the candidate correspond |
| 963 | // to the same bytes in the haystack in the low 128-bits (so we only |
| 964 | // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7. |
| 965 | // |
| 966 | // The verification routine wants to look at all potentially matching |
| 967 | // buckets before moving on to the next lane. So for example, both |
| 968 | // a16 and a00 both correspond to the first byte in our window; a00 |
| 969 | // contains buckets 0-7 and a16 contains buckets 8-15. Specifically, |
| 970 | // a16 should be checked before a01. So the transformation shown above |
| 971 | // allows us to use our normal verification procedure with one small |
| 972 | // change: we treat each bitset as 16 bits instead of 8 bits. |
| 973 | debug_assert!(!candidate.is_zero()); |
| 974 | |
| 975 | // Swap the 128-bit lanes in the candidate vector. |
| 976 | let swapped = candidate.swap_halves(); |
| 977 | // Interleave the bytes from the low 128-bit lanes, starting with |
| 978 | // cand first. |
| 979 | let r1 = candidate.interleave_low_8bit_lanes(swapped); |
| 980 | // Interleave the bytes from the high 128-bit lanes, starting with |
| 981 | // cand first. |
| 982 | let r2 = candidate.interleave_high_8bit_lanes(swapped); |
| 983 | // Now just take the 2 low 64-bit integers from both r1 and r2. We |
| 984 | // can drop the high 64-bit integers because they are a mirror image |
| 985 | // of the low 64-bit integers. All we care about are the low 128-bit |
| 986 | // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets |
| 987 | // laid out in the desired order, as described above. |
| 988 | r1.for_each_low_64bit_lane( |
| 989 | r2, |
| 990 | #[inline (always)] |
| 991 | |_, chunk| { |
| 992 | let result = self.verify64(cur, end, chunk); |
| 993 | cur = cur.add(4); |
| 994 | result |
| 995 | }, |
| 996 | ) |
| 997 | } |
| 998 | } |
| 999 | |
| 1000 | /// A vector generic mask for the low and high nybbles in a set of patterns. |
| 1001 | /// Each 8-bit lane `j` in a vector corresponds to a bitset where the `i`th bit |
| 1002 | /// is set if and only if the nybble `j` is in the bucket `i` at a particular |
| 1003 | /// position. |
| 1004 | /// |
| 1005 | /// This is slightly tweaked dependending on whether Slim or Fat Teddy is being |
| 1006 | /// used. For Slim Teddy, the bitsets in the lower half are the same as the |
| 1007 | /// bitsets in the higher half, so that we can search `V::BYTES` bytes at a |
| 1008 | /// time. (Remember, the nybbles in the haystack are used as indices into these |
| 1009 | /// masks, and 256-bit shuffles only operate on 128-bit lanes.) |
| 1010 | /// |
| 1011 | /// For Fat Teddy, the bitsets are not repeated, but instead, the high half |
| 1012 | /// bits correspond to an addition 8 buckets. So that a bitset `00100010` has |
| 1013 | /// buckets 1 and 5 set if it's in the lower half, but has buckets 9 and 13 set |
| 1014 | /// if it's in the higher half. |
| 1015 | #[derive (Clone, Copy, Debug)] |
| 1016 | struct Mask<V> { |
| 1017 | lo: V, |
| 1018 | hi: V, |
| 1019 | } |
| 1020 | |
| 1021 | impl<V: Vector> Mask<V> { |
| 1022 | /// Return a candidate for Teddy (fat or slim) that is searching for 1-byte |
| 1023 | /// candidates. |
| 1024 | /// |
| 1025 | /// If a candidate is returned, it will be a collection of 8-bit bitsets |
| 1026 | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
| 1027 | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
| 1028 | /// `i`. If no candidate is found, then the vector returned will have all |
| 1029 | /// lanes set to zero. |
| 1030 | /// |
| 1031 | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
| 1032 | /// the least significant byte corresponds to the start of the window). For |
| 1033 | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
| 1034 | /// the window repeated in each half of the vector. |
| 1035 | /// |
| 1036 | /// `mask1` should correspond to a low/high mask for the first byte of all |
| 1037 | /// patterns that are being searched. |
| 1038 | #[inline (always)] |
| 1039 | unsafe fn members1(chunk: V, masks: [Mask<V>; 1]) -> V { |
| 1040 | let lomask = V::splat(0xF); |
| 1041 | let hlo = chunk.and(lomask); |
| 1042 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
| 1043 | let locand = masks[0].lo.shuffle_bytes(hlo); |
| 1044 | let hicand = masks[0].hi.shuffle_bytes(hhi); |
| 1045 | locand.and(hicand) |
| 1046 | } |
| 1047 | |
| 1048 | /// Return a candidate for Teddy (fat or slim) that is searching for 2-byte |
| 1049 | /// candidates. |
| 1050 | /// |
| 1051 | /// If candidates are returned, each will be a collection of 8-bit bitsets |
| 1052 | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
| 1053 | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
| 1054 | /// `i`. Each candidate returned corresponds to the first and second bytes |
| 1055 | /// of the patterns being searched. If no candidate is found, then all of |
| 1056 | /// the lanes will be set to zero in at least one of the vectors returned. |
| 1057 | /// |
| 1058 | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
| 1059 | /// the least significant byte corresponds to the start of the window). For |
| 1060 | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
| 1061 | /// the window repeated in each half of the vector. |
| 1062 | /// |
| 1063 | /// The masks should correspond to the masks computed for the first and |
| 1064 | /// second bytes of all patterns that are being searched. |
| 1065 | #[inline (always)] |
| 1066 | unsafe fn members2(chunk: V, masks: [Mask<V>; 2]) -> (V, V) { |
| 1067 | let lomask = V::splat(0xF); |
| 1068 | let hlo = chunk.and(lomask); |
| 1069 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
| 1070 | |
| 1071 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
| 1072 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
| 1073 | let cand1 = locand1.and(hicand1); |
| 1074 | |
| 1075 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
| 1076 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
| 1077 | let cand2 = locand2.and(hicand2); |
| 1078 | |
| 1079 | (cand1, cand2) |
| 1080 | } |
| 1081 | |
| 1082 | /// Return a candidate for Teddy (fat or slim) that is searching for 3-byte |
| 1083 | /// candidates. |
| 1084 | /// |
| 1085 | /// If candidates are returned, each will be a collection of 8-bit bitsets |
| 1086 | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
| 1087 | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
| 1088 | /// `i`. Each candidate returned corresponds to the first, second and third |
| 1089 | /// bytes of the patterns being searched. If no candidate is found, then |
| 1090 | /// all of the lanes will be set to zero in at least one of the vectors |
| 1091 | /// returned. |
| 1092 | /// |
| 1093 | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
| 1094 | /// the least significant byte corresponds to the start of the window). For |
| 1095 | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
| 1096 | /// the window repeated in each half of the vector. |
| 1097 | /// |
| 1098 | /// The masks should correspond to the masks computed for the first, second |
| 1099 | /// and third bytes of all patterns that are being searched. |
| 1100 | #[inline (always)] |
| 1101 | unsafe fn members3(chunk: V, masks: [Mask<V>; 3]) -> (V, V, V) { |
| 1102 | let lomask = V::splat(0xF); |
| 1103 | let hlo = chunk.and(lomask); |
| 1104 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
| 1105 | |
| 1106 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
| 1107 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
| 1108 | let cand1 = locand1.and(hicand1); |
| 1109 | |
| 1110 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
| 1111 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
| 1112 | let cand2 = locand2.and(hicand2); |
| 1113 | |
| 1114 | let locand3 = masks[2].lo.shuffle_bytes(hlo); |
| 1115 | let hicand3 = masks[2].hi.shuffle_bytes(hhi); |
| 1116 | let cand3 = locand3.and(hicand3); |
| 1117 | |
| 1118 | (cand1, cand2, cand3) |
| 1119 | } |
| 1120 | |
| 1121 | /// Return a candidate for Teddy (fat or slim) that is searching for 4-byte |
| 1122 | /// candidates. |
| 1123 | /// |
| 1124 | /// If candidates are returned, each will be a collection of 8-bit bitsets |
| 1125 | /// (one bitset per lane), where the ith bit is set in the jth lane if and |
| 1126 | /// only if the byte occurring at the jth lane in `chunk` is in the bucket |
| 1127 | /// `i`. Each candidate returned corresponds to the first, second, third |
| 1128 | /// and fourth bytes of the patterns being searched. If no candidate is |
| 1129 | /// found, then all of the lanes will be set to zero in at least one of the |
| 1130 | /// vectors returned. |
| 1131 | /// |
| 1132 | /// `chunk` should correspond to a `V::BYTES` window of the haystack (where |
| 1133 | /// the least significant byte corresponds to the start of the window). For |
| 1134 | /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with |
| 1135 | /// the window repeated in each half of the vector. |
| 1136 | /// |
| 1137 | /// The masks should correspond to the masks computed for the first, |
| 1138 | /// second, third and fourth bytes of all patterns that are being searched. |
| 1139 | #[inline (always)] |
| 1140 | unsafe fn members4(chunk: V, masks: [Mask<V>; 4]) -> (V, V, V, V) { |
| 1141 | let lomask = V::splat(0xF); |
| 1142 | let hlo = chunk.and(lomask); |
| 1143 | let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); |
| 1144 | |
| 1145 | let locand1 = masks[0].lo.shuffle_bytes(hlo); |
| 1146 | let hicand1 = masks[0].hi.shuffle_bytes(hhi); |
| 1147 | let cand1 = locand1.and(hicand1); |
| 1148 | |
| 1149 | let locand2 = masks[1].lo.shuffle_bytes(hlo); |
| 1150 | let hicand2 = masks[1].hi.shuffle_bytes(hhi); |
| 1151 | let cand2 = locand2.and(hicand2); |
| 1152 | |
| 1153 | let locand3 = masks[2].lo.shuffle_bytes(hlo); |
| 1154 | let hicand3 = masks[2].hi.shuffle_bytes(hhi); |
| 1155 | let cand3 = locand3.and(hicand3); |
| 1156 | |
| 1157 | let locand4 = masks[3].lo.shuffle_bytes(hlo); |
| 1158 | let hicand4 = masks[3].hi.shuffle_bytes(hhi); |
| 1159 | let cand4 = locand4.and(hicand4); |
| 1160 | |
| 1161 | (cand1, cand2, cand3, cand4) |
| 1162 | } |
| 1163 | } |
| 1164 | |
| 1165 | /// Represents the low and high nybble masks that will be used during |
| 1166 | /// search. Each mask is 32 bytes wide, although only the first 16 bytes are |
| 1167 | /// used for 128-bit vectors. |
| 1168 | /// |
| 1169 | /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set |
| 1170 | /// if and only if the corresponding nybble is in the ith bucket. The index of |
| 1171 | /// the byte (0-15, inclusive) corresponds to the nybble. |
| 1172 | /// |
| 1173 | /// Each mask is used as the target of a shuffle, where the indices for the |
| 1174 | /// shuffle are taken from the haystack. AND'ing the shuffles for both the |
| 1175 | /// low and high masks together also results in 8-bit bitsets, but where bit |
| 1176 | /// `i` is set if and only if the correspond *byte* is in the ith bucket. |
| 1177 | #[derive (Clone, Default)] |
| 1178 | struct SlimMaskBuilder { |
| 1179 | lo: [u8; 32], |
| 1180 | hi: [u8; 32], |
| 1181 | } |
| 1182 | |
| 1183 | impl SlimMaskBuilder { |
| 1184 | /// Update this mask by adding the given byte to the given bucket. The |
| 1185 | /// given bucket must be in the range 0-7. |
| 1186 | /// |
| 1187 | /// # Panics |
| 1188 | /// |
| 1189 | /// When `bucket >= 8`. |
| 1190 | fn add(&mut self, bucket: usize, byte: u8) { |
| 1191 | assert!(bucket < 8); |
| 1192 | |
| 1193 | let bucket = u8::try_from(bucket).unwrap(); |
| 1194 | let byte_lo = usize::from(byte & 0xF); |
| 1195 | let byte_hi = usize::from((byte >> 4) & 0xF); |
| 1196 | // When using 256-bit vectors, we need to set this bucket assignment in |
| 1197 | // the low and high 128-bit portions of the mask. This allows us to |
| 1198 | // process 32 bytes at a time. Namely, AVX2 shuffles operate on each |
| 1199 | // of the 128-bit lanes, rather than the full 256-bit vector at once. |
| 1200 | self.lo[byte_lo] |= 1 << bucket; |
| 1201 | self.lo[byte_lo + 16] |= 1 << bucket; |
| 1202 | self.hi[byte_hi] |= 1 << bucket; |
| 1203 | self.hi[byte_hi + 16] |= 1 << bucket; |
| 1204 | } |
| 1205 | |
| 1206 | /// Turn this builder into a vector mask. |
| 1207 | /// |
| 1208 | /// # Panics |
| 1209 | /// |
| 1210 | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
| 1211 | /// |
| 1212 | /// # Safety |
| 1213 | /// |
| 1214 | /// Callers must ensure that this is okay to call in the current target for |
| 1215 | /// the current CPU. |
| 1216 | #[inline (always)] |
| 1217 | unsafe fn build<V: Vector>(&self) -> Mask<V> { |
| 1218 | assert!(V::BYTES <= self.lo.len()); |
| 1219 | assert!(V::BYTES <= self.hi.len()); |
| 1220 | Mask { |
| 1221 | lo: V::load_unaligned(self.lo[..].as_ptr()), |
| 1222 | hi: V::load_unaligned(self.hi[..].as_ptr()), |
| 1223 | } |
| 1224 | } |
| 1225 | |
| 1226 | /// A convenience function for building `N` vector masks from a slim |
| 1227 | /// `Teddy` value. |
| 1228 | /// |
| 1229 | /// # Panics |
| 1230 | /// |
| 1231 | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
| 1232 | /// |
| 1233 | /// # Safety |
| 1234 | /// |
| 1235 | /// Callers must ensure that this is okay to call in the current target for |
| 1236 | /// the current CPU. |
| 1237 | #[inline (always)] |
| 1238 | unsafe fn from_teddy<const BYTES: usize, V: Vector>( |
| 1239 | teddy: &Teddy<8>, |
| 1240 | ) -> [Mask<V>; BYTES] { |
| 1241 | // MSRV(1.63): Use core::array::from_fn to just build the array here |
| 1242 | // instead of creating a vector and turning it into an array. |
| 1243 | let mut mask_builders = vec![SlimMaskBuilder::default(); BYTES]; |
| 1244 | for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { |
| 1245 | for pid in bucket.iter().copied() { |
| 1246 | let pat = teddy.patterns.get(pid); |
| 1247 | for (i, builder) in mask_builders.iter_mut().enumerate() { |
| 1248 | builder.add(bucket_index, pat.bytes()[i]); |
| 1249 | } |
| 1250 | } |
| 1251 | } |
| 1252 | let array = |
| 1253 | <[SlimMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); |
| 1254 | array.map(|builder| builder.build()) |
| 1255 | } |
| 1256 | } |
| 1257 | |
| 1258 | impl Debug for SlimMaskBuilder { |
| 1259 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| 1260 | let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]); |
| 1261 | for i: usize in 0..32 { |
| 1262 | parts_lo.push(format!(" {:02}: {:08b}" , i, self.lo[i])); |
| 1263 | parts_hi.push(format!(" {:02}: {:08b}" , i, self.hi[i])); |
| 1264 | } |
| 1265 | f&mut DebugStruct<'_, '_>.debug_struct("SlimMaskBuilder" ) |
| 1266 | .field("lo" , &parts_lo) |
| 1267 | .field(name:"hi" , &parts_hi) |
| 1268 | .finish() |
| 1269 | } |
| 1270 | } |
| 1271 | |
| 1272 | /// Represents the low and high nybble masks that will be used during "fat" |
| 1273 | /// Teddy search. |
| 1274 | /// |
| 1275 | /// Each mask is 32 bytes wide, and at the time of writing, only 256-bit vectors |
| 1276 | /// support fat Teddy. |
| 1277 | /// |
| 1278 | /// A fat Teddy mask is like a slim Teddy mask, except that instead of |
| 1279 | /// repeating the bitsets in the high and low 128-bits in 256-bit vectors, the |
| 1280 | /// high and low 128-bit halves each represent distinct buckets. (Bringing the |
| 1281 | /// total to 16 instead of 8.) This permits spreading the patterns out a bit |
| 1282 | /// more and thus putting less pressure on verification to be fast. |
| 1283 | /// |
| 1284 | /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set |
| 1285 | /// if and only if the corresponding nybble is in the ith bucket. The index of |
| 1286 | /// the byte (0-15, inclusive) corresponds to the nybble. |
| 1287 | #[derive (Clone, Copy, Default)] |
| 1288 | struct FatMaskBuilder { |
| 1289 | lo: [u8; 32], |
| 1290 | hi: [u8; 32], |
| 1291 | } |
| 1292 | |
| 1293 | impl FatMaskBuilder { |
| 1294 | /// Update this mask by adding the given byte to the given bucket. The |
| 1295 | /// given bucket must be in the range 0-15. |
| 1296 | /// |
| 1297 | /// # Panics |
| 1298 | /// |
| 1299 | /// When `bucket >= 16`. |
| 1300 | fn add(&mut self, bucket: usize, byte: u8) { |
| 1301 | assert!(bucket < 16); |
| 1302 | |
| 1303 | let bucket = u8::try_from(bucket).unwrap(); |
| 1304 | let byte_lo = usize::from(byte & 0xF); |
| 1305 | let byte_hi = usize::from((byte >> 4) & 0xF); |
| 1306 | // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, |
| 1307 | // the high 128 bits of our mask correspond to buckets 8-15, while the |
| 1308 | // low 128 bits correspond to buckets 0-7. |
| 1309 | if bucket < 8 { |
| 1310 | self.lo[byte_lo] |= 1 << bucket; |
| 1311 | self.hi[byte_hi] |= 1 << bucket; |
| 1312 | } else { |
| 1313 | self.lo[byte_lo + 16] |= 1 << (bucket % 8); |
| 1314 | self.hi[byte_hi + 16] |= 1 << (bucket % 8); |
| 1315 | } |
| 1316 | } |
| 1317 | |
| 1318 | /// Turn this builder into a vector mask. |
| 1319 | /// |
| 1320 | /// # Panics |
| 1321 | /// |
| 1322 | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
| 1323 | /// |
| 1324 | /// # Safety |
| 1325 | /// |
| 1326 | /// Callers must ensure that this is okay to call in the current target for |
| 1327 | /// the current CPU. |
| 1328 | #[inline (always)] |
| 1329 | unsafe fn build<V: Vector>(&self) -> Mask<V> { |
| 1330 | assert!(V::BYTES <= self.lo.len()); |
| 1331 | assert!(V::BYTES <= self.hi.len()); |
| 1332 | Mask { |
| 1333 | lo: V::load_unaligned(self.lo[..].as_ptr()), |
| 1334 | hi: V::load_unaligned(self.hi[..].as_ptr()), |
| 1335 | } |
| 1336 | } |
| 1337 | |
| 1338 | /// A convenience function for building `N` vector masks from a fat |
| 1339 | /// `Teddy` value. |
| 1340 | /// |
| 1341 | /// # Panics |
| 1342 | /// |
| 1343 | /// When `V` represents a vector bigger than what `MaskBytes` can contain. |
| 1344 | /// |
| 1345 | /// # Safety |
| 1346 | /// |
| 1347 | /// Callers must ensure that this is okay to call in the current target for |
| 1348 | /// the current CPU. |
| 1349 | #[inline (always)] |
| 1350 | unsafe fn from_teddy<const BYTES: usize, V: Vector>( |
| 1351 | teddy: &Teddy<16>, |
| 1352 | ) -> [Mask<V>; BYTES] { |
| 1353 | // MSRV(1.63): Use core::array::from_fn to just build the array here |
| 1354 | // instead of creating a vector and turning it into an array. |
| 1355 | let mut mask_builders = vec![FatMaskBuilder::default(); BYTES]; |
| 1356 | for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { |
| 1357 | for pid in bucket.iter().copied() { |
| 1358 | let pat = teddy.patterns.get(pid); |
| 1359 | for (i, builder) in mask_builders.iter_mut().enumerate() { |
| 1360 | builder.add(bucket_index, pat.bytes()[i]); |
| 1361 | } |
| 1362 | } |
| 1363 | } |
| 1364 | let array = |
| 1365 | <[FatMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); |
| 1366 | array.map(|builder| builder.build()) |
| 1367 | } |
| 1368 | } |
| 1369 | |
| 1370 | impl Debug for FatMaskBuilder { |
| 1371 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| 1372 | let (mut parts_lo: Vec, mut parts_hi: Vec) = (vec![], vec![]); |
| 1373 | for i: usize in 0..32 { |
| 1374 | parts_lo.push(format!(" {:02}: {:08b}" , i, self.lo[i])); |
| 1375 | parts_hi.push(format!(" {:02}: {:08b}" , i, self.hi[i])); |
| 1376 | } |
| 1377 | f&mut DebugStruct<'_, '_>.debug_struct("FatMaskBuilder" ) |
| 1378 | .field("lo" , &parts_lo) |
| 1379 | .field(name:"hi" , &parts_hi) |
| 1380 | .finish() |
| 1381 | } |
| 1382 | } |
| 1383 | |