/*!
Types and routines for working with look-around assertions.

This module principally defines three types:

* [`Look`] enumerates all of the assertions supported by this crate.
* [`LookSet`] provides a way to efficiently store a set of [`Look`] values.
* [`LookMatcher`] provides routines for checking whether a `Look` or a
`LookSet` matches at a particular position in a haystack.
*/
| 11 | |
| 12 | // LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` were basically |
| 13 | // copied verbatim from the regex-syntax crate. I would have no problems using |
| 14 | // the regex-syntax types and defining the matching routines (only found |
| 15 | // in this crate) as free functions, except the `Look` and `LookSet` types |
| 16 | // are used in lots of places. Including in places we expect to work when |
| 17 | // regex-syntax is *not* enabled, such as in the definition of the NFA itself. |
| 18 | // |
| 19 | // Thankfully the code we copy is pretty simple and there isn't much of it. |
| 20 | // Otherwise, the rest of this module deals with *matching* the assertions, |
| 21 | // which is not something that regex-syntax handles. |
| 22 | |
| 23 | use crate::util::{escape::DebugByte, utf8}; |
| 24 | |
/// A look-around assertion.
///
/// An assertion matches at a position between characters in a haystack. It
/// is zero-width: unlike most parts of a regular expression, it never
/// "consumes" any input. Instead, it states a property that must be true at
/// a particular point during matching.
///
/// For example, `(?m)^[a-z]+$` is a pattern that:
///
/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That
///   occurs at either the beginning of the haystack, or immediately following
///   a `\n` character.
/// * Looks for one or more occurrences of `[a-z]`.
/// * Once `[a-z]+` has matched as much as it can, an overall match is only
///   reported when `[a-z]+` stops at the end of the haystack or just before a
///   `\n`.
///
/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.
///
/// Assertions are also called "look-around," "look-behind" and "look-ahead."
/// Specifically, some assertions are look-behind (like `^`), other assertions
/// are look-ahead (like `$`) and yet other assertions are both look-ahead and
/// look-behind (like `\b`).
///
/// # Representation
///
/// Every variant has a discriminant with exactly one bit set, and no two
/// variants share a bit. This is what allows a [`LookSet`] to store any
/// combination of assertions in a single integer.
///
/// # Assertions in an NFA
///
/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be
/// thought of as a conditional epsilon transition. That is, a matching engine
/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits
/// moving through conditional epsilon transitions when their condition
/// is satisfied at whatever position the `PikeVM` is currently at in the
/// haystack.
///
/// How assertions are handled in a `DFA` is trickier, since a DFA does not
/// have epsilon transitions at all. In this case, they are compiled into the
/// automaton itself, at the expense of more states than what would be required
/// without an assertion.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Look {
    /// Match the beginning of text. Specifically, this matches at the starting
    /// position of the input.
    Start = 0x00001,
    /// Match the end of text. Specifically, this matches at the ending
    /// position of the input.
    End = 0x00002,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following a `\n` character.
    StartLF = 0x00004,
    /// Match the end of a line or the end of text. Specifically, this matches
    /// at the end position of the input, or at the position immediately
    /// preceding a `\n` character.
    EndLF = 0x00008,
    /// Match the beginning of a line or the beginning of text, treating both
    /// `\r` and `\n` as line terminators. Specifically, this matches at the
    /// starting position of the input, or at the position immediately
    /// following either a `\r` or `\n` character, but never after a `\r` when
    /// a `\n` follows.
    StartCRLF = 0x00010,
    /// Match the end of a line or the end of text, treating both `\r` and
    /// `\n` as line terminators. Specifically, this matches at the end
    /// position of the input, or at the position immediately preceding a `\r`
    /// or `\n` character, but never before a `\n` when a `\r` precedes it.
    EndCRLF = 0x00020,
    /// Match an ASCII-only word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    WordAscii = 0x00040,
    /// Match an ASCII-only negation of a word boundary.
    WordAsciiNegate = 0x00080,
    /// Match a Unicode-aware word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    WordUnicode = 0x00100,
    /// Match a Unicode-aware negation of a word boundary.
    WordUnicodeNegate = 0x00200,
    /// Match the start of an ASCII-only word boundary. That is, this matches a
    /// position at either the beginning of the haystack or where the previous
    /// character is not a word character and the following character is a word
    /// character.
    WordStartAscii = 0x00400,
    /// Match the end of an ASCII-only word boundary. That is, this matches
    /// a position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a word
    /// character.
    WordEndAscii = 0x00800,
    /// Match the start of a Unicode word boundary. That is, this matches a
    /// position at either the beginning of the haystack or where the previous
    /// character is not a word character and the following character is a word
    /// character.
    WordStartUnicode = 0x01000,
    /// Match the end of a Unicode word boundary. That is, this matches a
    /// position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a word
    /// character.
    WordEndUnicode = 0x02000,
    /// Match the start half of an ASCII-only word boundary. That is, this
    /// matches a position at either the beginning of the haystack or where the
    /// previous character is not a word character.
    WordStartHalfAscii = 0x04000,
    /// Match the end half of an ASCII-only word boundary. That is, this
    /// matches a position at either the end of the haystack or where the
    /// following character is not a word character.
    WordEndHalfAscii = 0x08000,
    /// Match the start half of a Unicode word boundary. That is, this matches
    /// a position at either the beginning of the haystack or where the
    /// previous character is not a word character.
    WordStartHalfUnicode = 0x10000,
    /// Match the end half of a Unicode word boundary. That is, this matches
    /// a position at either the end of the haystack or where the following
    /// character is not a word character.
    WordEndHalfUnicode = 0x20000,
}
| 136 | |
| 137 | impl Look { |
| 138 | /// Flip the look-around assertion to its equivalent for reverse searches. |
| 139 | /// For example, `StartLF` gets translated to `EndLF`. |
| 140 | /// |
| 141 | /// Some assertions, such as `WordUnicode`, remain the same since they |
| 142 | /// match the same positions regardless of the direction of the search. |
| 143 | #[inline ] |
| 144 | pub const fn reversed(self) -> Look { |
| 145 | match self { |
| 146 | Look::Start => Look::End, |
| 147 | Look::End => Look::Start, |
| 148 | Look::StartLF => Look::EndLF, |
| 149 | Look::EndLF => Look::StartLF, |
| 150 | Look::StartCRLF => Look::EndCRLF, |
| 151 | Look::EndCRLF => Look::StartCRLF, |
| 152 | Look::WordAscii => Look::WordAscii, |
| 153 | Look::WordAsciiNegate => Look::WordAsciiNegate, |
| 154 | Look::WordUnicode => Look::WordUnicode, |
| 155 | Look::WordUnicodeNegate => Look::WordUnicodeNegate, |
| 156 | Look::WordStartAscii => Look::WordEndAscii, |
| 157 | Look::WordEndAscii => Look::WordStartAscii, |
| 158 | Look::WordStartUnicode => Look::WordEndUnicode, |
| 159 | Look::WordEndUnicode => Look::WordStartUnicode, |
| 160 | Look::WordStartHalfAscii => Look::WordEndHalfAscii, |
| 161 | Look::WordEndHalfAscii => Look::WordStartHalfAscii, |
| 162 | Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, |
| 163 | Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, |
| 164 | } |
| 165 | } |
| 166 | |
| 167 | /// Return the underlying representation of this look-around enumeration |
| 168 | /// as an integer. Giving the return value to the [`Look::from_repr`] |
| 169 | /// constructor is guaranteed to return the same look-around variant that |
| 170 | /// one started with within a semver compatible release of this crate. |
| 171 | #[inline ] |
| 172 | pub const fn as_repr(self) -> u32 { |
| 173 | // AFAIK, 'as' is the only way to zero-cost convert an int enum to an |
| 174 | // actual int. |
| 175 | self as u32 |
| 176 | } |
| 177 | |
| 178 | /// Given the underlying representation of a `Look` value, return the |
| 179 | /// corresponding `Look` value if the representation is valid. Otherwise |
| 180 | /// `None` is returned. |
| 181 | #[inline ] |
| 182 | pub const fn from_repr(repr: u32) -> Option<Look> { |
| 183 | match repr { |
| 184 | 0b00_0000_0000_0000_0001 => Some(Look::Start), |
| 185 | 0b00_0000_0000_0000_0010 => Some(Look::End), |
| 186 | 0b00_0000_0000_0000_0100 => Some(Look::StartLF), |
| 187 | 0b00_0000_0000_0000_1000 => Some(Look::EndLF), |
| 188 | 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), |
| 189 | 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), |
| 190 | 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), |
| 191 | 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), |
| 192 | 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), |
| 193 | 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), |
| 194 | 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), |
| 195 | 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), |
| 196 | 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), |
| 197 | 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), |
| 198 | 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), |
| 199 | 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), |
| 200 | 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), |
| 201 | 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), |
| 202 | _ => None, |
| 203 | } |
| 204 | } |
| 205 | |
| 206 | /// Returns a convenient single codepoint representation of this |
| 207 | /// look-around assertion. Each assertion is guaranteed to be represented |
| 208 | /// by a distinct character. |
| 209 | /// |
| 210 | /// This is useful for succinctly representing a look-around assertion in |
| 211 | /// human friendly but succinct output intended for a programmer working on |
| 212 | /// regex internals. |
| 213 | #[inline ] |
| 214 | pub const fn as_char(self) -> char { |
| 215 | match self { |
| 216 | Look::Start => 'A' , |
| 217 | Look::End => 'z' , |
| 218 | Look::StartLF => '^' , |
| 219 | Look::EndLF => '$' , |
| 220 | Look::StartCRLF => 'r' , |
| 221 | Look::EndCRLF => 'R' , |
| 222 | Look::WordAscii => 'b' , |
| 223 | Look::WordAsciiNegate => 'B' , |
| 224 | Look::WordUnicode => '𝛃' , |
| 225 | Look::WordUnicodeNegate => '𝚩' , |
| 226 | Look::WordStartAscii => '<' , |
| 227 | Look::WordEndAscii => '>' , |
| 228 | Look::WordStartUnicode => '〈' , |
| 229 | Look::WordEndUnicode => '〉' , |
| 230 | Look::WordStartHalfAscii => '◁' , |
| 231 | Look::WordEndHalfAscii => '▷' , |
| 232 | Look::WordStartHalfUnicode => '◀' , |
| 233 | Look::WordEndHalfUnicode => '▶' , |
| 234 | } |
| 235 | } |
| 236 | } |
| 237 | |
| 238 | /// LookSet is a memory-efficient set of look-around assertions. |
| 239 | /// |
| 240 | /// This is useful for efficiently tracking look-around assertions. For |
| 241 | /// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties |
| 242 | /// that return `LookSet`s. |
| 243 | #[derive (Clone, Copy, Default, Eq, PartialEq)] |
| 244 | pub struct LookSet { |
| 245 | /// The underlying representation this set is exposed to make it possible |
| 246 | /// to store it somewhere efficiently. The representation is that |
| 247 | /// of a bitset, where each assertion occupies bit `i` where |
| 248 | /// `i = Look::as_repr()`. |
| 249 | /// |
| 250 | /// Note that users of this internal representation must permit the full |
| 251 | /// range of `u16` values to be represented. For example, even if the |
| 252 | /// current implementation only makes use of the 10 least significant bits, |
| 253 | /// it may use more bits in a future semver compatible release. |
| 254 | pub bits: u32, |
| 255 | } |
| 256 | |
| 257 | impl LookSet { |
| 258 | /// Create an empty set of look-around assertions. |
| 259 | #[inline ] |
| 260 | pub fn empty() -> LookSet { |
| 261 | LookSet { bits: 0 } |
| 262 | } |
| 263 | |
| 264 | /// Create a full set of look-around assertions. |
| 265 | /// |
| 266 | /// This set contains all possible look-around assertions. |
| 267 | #[inline ] |
| 268 | pub fn full() -> LookSet { |
| 269 | LookSet { bits: !0 } |
| 270 | } |
| 271 | |
| 272 | /// Create a look-around set containing the look-around assertion given. |
| 273 | /// |
| 274 | /// This is a convenience routine for creating an empty set and inserting |
| 275 | /// one look-around assertions. |
| 276 | #[inline ] |
| 277 | pub fn singleton(look: Look) -> LookSet { |
| 278 | LookSet::empty().insert(look) |
| 279 | } |
| 280 | |
| 281 | /// Returns the total number of look-around assertions in this set. |
| 282 | #[inline ] |
| 283 | pub fn len(self) -> usize { |
| 284 | // OK because max value always fits in a u8, which in turn always |
| 285 | // fits in a usize, regardless of target. |
| 286 | usize::try_from(self.bits.count_ones()).unwrap() |
| 287 | } |
| 288 | |
| 289 | /// Returns true if and only if this set is empty. |
| 290 | #[inline ] |
| 291 | pub fn is_empty(self) -> bool { |
| 292 | self.len() == 0 |
| 293 | } |
| 294 | |
| 295 | /// Returns true if and only if the given look-around assertion is in this |
| 296 | /// set. |
| 297 | #[inline ] |
| 298 | pub fn contains(self, look: Look) -> bool { |
| 299 | self.bits & look.as_repr() != 0 |
| 300 | } |
| 301 | |
| 302 | /// Returns true if and only if this set contains any anchor assertions. |
| 303 | /// This includes both "start/end of haystack" and "start/end of line." |
| 304 | #[inline ] |
| 305 | pub fn contains_anchor(&self) -> bool { |
| 306 | self.contains_anchor_haystack() || self.contains_anchor_line() |
| 307 | } |
| 308 | |
| 309 | /// Returns true if and only if this set contains any "start/end of |
| 310 | /// haystack" anchors. This doesn't include "start/end of line" anchors. |
| 311 | #[inline ] |
| 312 | pub fn contains_anchor_haystack(&self) -> bool { |
| 313 | self.contains(Look::Start) || self.contains(Look::End) |
| 314 | } |
| 315 | |
| 316 | /// Returns true if and only if this set contains any "start/end of line" |
| 317 | /// anchors. This doesn't include "start/end of haystack" anchors. This |
| 318 | /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. |
| 319 | #[inline ] |
| 320 | pub fn contains_anchor_line(&self) -> bool { |
| 321 | self.contains(Look::StartLF) |
| 322 | || self.contains(Look::EndLF) |
| 323 | || self.contains(Look::StartCRLF) |
| 324 | || self.contains(Look::EndCRLF) |
| 325 | } |
| 326 | |
| 327 | /// Returns true if and only if this set contains any "start/end of line" |
| 328 | /// anchors that only treat `\n` as line terminators. This does not include |
| 329 | /// haystack anchors or CRLF aware line anchors. |
| 330 | #[inline ] |
| 331 | pub fn contains_anchor_lf(&self) -> bool { |
| 332 | self.contains(Look::StartLF) || self.contains(Look::EndLF) |
| 333 | } |
| 334 | |
| 335 | /// Returns true if and only if this set contains any "start/end of line" |
| 336 | /// anchors that are CRLF-aware. This doesn't include "start/end of |
| 337 | /// haystack" or "start/end of line-feed" anchors. |
| 338 | #[inline ] |
| 339 | pub fn contains_anchor_crlf(&self) -> bool { |
| 340 | self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) |
| 341 | } |
| 342 | |
| 343 | /// Returns true if and only if this set contains any word boundary or |
| 344 | /// negated word boundary assertions. This include both Unicode and ASCII |
| 345 | /// word boundaries. |
| 346 | #[inline ] |
| 347 | pub fn contains_word(self) -> bool { |
| 348 | self.contains_word_unicode() || self.contains_word_ascii() |
| 349 | } |
| 350 | |
| 351 | /// Returns true if and only if this set contains any Unicode word boundary |
| 352 | /// or negated Unicode word boundary assertions. |
| 353 | #[inline ] |
| 354 | pub fn contains_word_unicode(self) -> bool { |
| 355 | self.contains(Look::WordUnicode) |
| 356 | || self.contains(Look::WordUnicodeNegate) |
| 357 | || self.contains(Look::WordStartUnicode) |
| 358 | || self.contains(Look::WordEndUnicode) |
| 359 | || self.contains(Look::WordStartHalfUnicode) |
| 360 | || self.contains(Look::WordEndHalfUnicode) |
| 361 | } |
| 362 | |
| 363 | /// Returns true if and only if this set contains any ASCII word boundary |
| 364 | /// or negated ASCII word boundary assertions. |
| 365 | #[inline ] |
| 366 | pub fn contains_word_ascii(self) -> bool { |
| 367 | self.contains(Look::WordAscii) |
| 368 | || self.contains(Look::WordAsciiNegate) |
| 369 | || self.contains(Look::WordStartAscii) |
| 370 | || self.contains(Look::WordEndAscii) |
| 371 | || self.contains(Look::WordStartHalfAscii) |
| 372 | || self.contains(Look::WordEndHalfAscii) |
| 373 | } |
| 374 | |
| 375 | /// Returns an iterator over all of the look-around assertions in this set. |
| 376 | #[inline ] |
| 377 | pub fn iter(self) -> LookSetIter { |
| 378 | LookSetIter { set: self } |
| 379 | } |
| 380 | |
| 381 | /// Return a new set that is equivalent to the original, but with the given |
| 382 | /// assertion added to it. If the assertion is already in the set, then the |
| 383 | /// returned set is equivalent to the original. |
| 384 | #[inline ] |
| 385 | pub fn insert(self, look: Look) -> LookSet { |
| 386 | LookSet { bits: self.bits | look.as_repr() } |
| 387 | } |
| 388 | |
| 389 | /// Updates this set in place with the result of inserting the given |
| 390 | /// assertion into this set. |
| 391 | #[inline ] |
| 392 | pub fn set_insert(&mut self, look: Look) { |
| 393 | *self = self.insert(look); |
| 394 | } |
| 395 | |
| 396 | /// Return a new set that is equivalent to the original, but with the given |
| 397 | /// assertion removed from it. If the assertion is not in the set, then the |
| 398 | /// returned set is equivalent to the original. |
| 399 | #[inline ] |
| 400 | pub fn remove(self, look: Look) -> LookSet { |
| 401 | LookSet { bits: self.bits & !look.as_repr() } |
| 402 | } |
| 403 | |
| 404 | /// Updates this set in place with the result of removing the given |
| 405 | /// assertion from this set. |
| 406 | #[inline ] |
| 407 | pub fn set_remove(&mut self, look: Look) { |
| 408 | *self = self.remove(look); |
| 409 | } |
| 410 | |
| 411 | /// Returns a new set that is the result of subtracting the given set from |
| 412 | /// this set. |
| 413 | #[inline ] |
| 414 | pub fn subtract(self, other: LookSet) -> LookSet { |
| 415 | LookSet { bits: self.bits & !other.bits } |
| 416 | } |
| 417 | |
| 418 | /// Updates this set in place with the result of subtracting the given set |
| 419 | /// from this set. |
| 420 | #[inline ] |
| 421 | pub fn set_subtract(&mut self, other: LookSet) { |
| 422 | *self = self.subtract(other); |
| 423 | } |
| 424 | |
| 425 | /// Returns a new set that is the union of this and the one given. |
| 426 | #[inline ] |
| 427 | pub fn union(self, other: LookSet) -> LookSet { |
| 428 | LookSet { bits: self.bits | other.bits } |
| 429 | } |
| 430 | |
| 431 | /// Updates this set in place with the result of unioning it with the one |
| 432 | /// given. |
| 433 | #[inline ] |
| 434 | pub fn set_union(&mut self, other: LookSet) { |
| 435 | *self = self.union(other); |
| 436 | } |
| 437 | |
| 438 | /// Returns a new set that is the intersection of this and the one given. |
| 439 | #[inline ] |
| 440 | pub fn intersect(self, other: LookSet) -> LookSet { |
| 441 | LookSet { bits: self.bits & other.bits } |
| 442 | } |
| 443 | |
| 444 | /// Updates this set in place with the result of intersecting it with the |
| 445 | /// one given. |
| 446 | #[inline ] |
| 447 | pub fn set_intersect(&mut self, other: LookSet) { |
| 448 | *self = self.intersect(other); |
| 449 | } |
| 450 | |
| 451 | /// Return a `LookSet` from the slice given as a native endian 32-bit |
| 452 | /// integer. |
| 453 | /// |
| 454 | /// # Panics |
| 455 | /// |
| 456 | /// This panics if `slice.len() < 4`. |
| 457 | #[inline ] |
| 458 | pub fn read_repr(slice: &[u8]) -> LookSet { |
| 459 | let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap()); |
| 460 | LookSet { bits } |
| 461 | } |
| 462 | |
| 463 | /// Write a `LookSet` as a native endian 32-bit integer to the beginning |
| 464 | /// of the slice given. |
| 465 | /// |
| 466 | /// # Panics |
| 467 | /// |
| 468 | /// This panics if `slice.len() < 4`. |
| 469 | #[inline ] |
| 470 | pub fn write_repr(self, slice: &mut [u8]) { |
| 471 | let raw = self.bits.to_ne_bytes(); |
| 472 | slice[0] = raw[0]; |
| 473 | slice[1] = raw[1]; |
| 474 | slice[2] = raw[2]; |
| 475 | slice[3] = raw[3]; |
| 476 | } |
| 477 | |
| 478 | /// Checks that all assertions in this set can be matched. |
| 479 | /// |
| 480 | /// Some assertions, such as Unicode word boundaries, require optional (but |
| 481 | /// enabled by default) tables that may not be available. If there are |
| 482 | /// assertions in this set that require tables that are not available, then |
| 483 | /// this will return an error. |
| 484 | /// |
| 485 | /// Specifically, this returns an error when the the |
| 486 | /// `unicode-word-boundary` feature is _not_ enabled _and_ this set |
| 487 | /// contains a Unicode word boundary assertion. |
| 488 | /// |
| 489 | /// It can be useful to use this on the result of |
| 490 | /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any) |
| 491 | /// when building a matcher engine to ensure methods like |
| 492 | /// [`LookMatcher::matches_set`] do not panic at search time. |
| 493 | pub fn available(self) -> Result<(), UnicodeWordBoundaryError> { |
| 494 | if self.contains_word_unicode() { |
| 495 | UnicodeWordBoundaryError::check()?; |
| 496 | } |
| 497 | Ok(()) |
| 498 | } |
| 499 | } |
| 500 | |
| 501 | impl core::fmt::Debug for LookSet { |
| 502 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
| 503 | if self.is_empty() { |
| 504 | return write!(f, "∅" ); |
| 505 | } |
| 506 | for look: Look in self.iter() { |
| 507 | write!(f, " {}" , look.as_char())?; |
| 508 | } |
| 509 | Ok(()) |
| 510 | } |
| 511 | } |
| 512 | |
| 513 | /// An iterator over all look-around assertions in a [`LookSet`]. |
| 514 | /// |
| 515 | /// This iterator is created by [`LookSet::iter`]. |
| 516 | #[derive (Clone, Debug)] |
| 517 | pub struct LookSetIter { |
| 518 | set: LookSet, |
| 519 | } |
| 520 | |
| 521 | impl Iterator for LookSetIter { |
| 522 | type Item = Look; |
| 523 | |
| 524 | #[inline ] |
| 525 | fn next(&mut self) -> Option<Look> { |
| 526 | if self.set.is_empty() { |
| 527 | return None; |
| 528 | } |
| 529 | // We'll never have more than u8::MAX distinct look-around assertions, |
| 530 | // so 'bit' will always fit into a u16. |
| 531 | let bit: u16 = u16::try_from(self.set.bits.trailing_zeros()).unwrap(); |
| 532 | let look: Look = Look::from_repr(1 << bit)?; |
| 533 | self.set = self.set.remove(look); |
| 534 | Some(look) |
| 535 | } |
| 536 | } |
| 537 | |
/// A matcher for look-around assertions.
///
/// This matcher permits configuring aspects of how look-around assertions are
/// matched.
///
/// # Example
///
/// A `LookMatcher` can change the line terminator used for matching multi-line
/// anchors such as `(?m:^)` and `(?m:$)`.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{self, pikevm::PikeVM},
///     util::look::LookMatcher,
///     Match, Input,
/// };
///
/// let mut lookm = LookMatcher::new();
/// lookm.set_line_terminator(b'\x00');
///
/// let re = PikeVM::builder()
///     .thompson(thompson::Config::new().look_matcher(lookm))
///     .build(r"(?m)^[a-z]+$")?;
/// let mut cache = re.create_cache();
///
/// // Multi-line assertions now use NUL as a terminator.
/// assert_eq!(
///     Some(Match::must(0, 1..4)),
///     re.find(&mut cache, b"\x00abc\x00"),
/// );
/// // ... and \n is no longer recognized as a terminator.
/// assert_eq!(
///     None,
///     re.find(&mut cache, b"\nabc\n"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct LookMatcher {
    /// The configured line terminator byte. It is `\n` unless changed via
    /// `set_line_terminator`.
    lineterm: DebugByte,
}
| 580 | |
| 581 | impl LookMatcher { |
| 582 | /// Creates a new default matcher for look-around assertions. |
| 583 | pub fn new() -> LookMatcher { |
| 584 | LookMatcher { lineterm: DebugByte(b' \n' ) } |
| 585 | } |
| 586 | |
/// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.
///
/// Namely, instead of `^` matching after `\n` and `$` matching immediately
/// before a `\n`, this will cause it to match after and before the byte
/// given.
///
/// It can occasionally be useful to use this to configure the line
/// terminator to the NUL byte when searching binary data.
///
/// Note that this does not apply to CRLF-aware line anchors such as
/// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to
/// use `\r` and `\n`.
///
/// The matcher itself is returned so that calls can be chained.
pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher {
    self.lineterm.0 = byte;
    self
}
| 603 | |
/// Returns the line terminator that was configured for this matcher.
///
/// If no line terminator was configured, then this returns `\n`.
///
/// Note that the line terminator should only be used for matching `(?m:^)`
/// and `(?m:$)` assertions. It specifically should _not_ be used for
/// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`.
pub fn get_line_terminator(&self) -> u8 {
    // `lineterm` wraps the raw byte; hand back the byte itself.
    self.lineterm.0
}
| 614 | |
/// Returns true when the position `at` in `haystack` satisfies the given
/// look-around assertion.
///
/// # Panics
///
/// This panics when `look` is a Unicode word boundary assertion and the
/// Unicode word data is not available. Specifically, this only occurs when
/// the `unicode-word-boundary` feature is not enabled.
///
/// Since it's generally expected that this routine is called inside of
/// a matching engine, callers should check the error condition when
/// building the matching engine. If there is a Unicode word boundary
/// in the matcher and the data isn't available, then the matcher should
/// fail to build.
///
/// Callers can check the error condition with [`LookSet::available`].
///
/// This also may panic when `at > haystack.len()`. Note that `at ==
/// haystack.len()` is legal and guaranteed not to panic.
#[inline]
pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool {
    // The crate-internal variant does the actual work.
    self.matches_inline(look, haystack, at)
}
| 638 | |
| 639 | /// Like `matches`, but forcefully inlined. |
| 640 | /// |
| 641 | /// # Panics |
| 642 | /// |
| 643 | /// This panics when testing any Unicode word boundary assertion in this |
| 644 | /// set and when the Unicode word data is not available. Specifically, this |
| 645 | /// only occurs when the `unicode-word-boundary` feature is not enabled. |
| 646 | /// |
| 647 | /// Since it's generally expected that this routine is called inside of |
| 648 | /// a matching engine, callers should check the error condition when |
| 649 | /// building the matching engine. If there is a Unicode word boundary |
| 650 | /// in the matcher and the data isn't available, then the matcher should |
| 651 | /// fail to build. |
| 652 | /// |
| 653 | /// Callers can check the error condition with [`LookSet::available`]. |
| 654 | /// |
| 655 | /// This also may panic when `at > haystack.len()`. Note that `at == |
| 656 | /// haystack.len()` is legal and guaranteed not to panic. |
| 657 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 658 | pub(crate) fn matches_inline( |
| 659 | &self, |
| 660 | look: Look, |
| 661 | haystack: &[u8], |
| 662 | at: usize, |
| 663 | ) -> bool { |
| 664 | match look { |
| 665 | Look::Start => self.is_start(haystack, at), |
| 666 | Look::End => self.is_end(haystack, at), |
| 667 | Look::StartLF => self.is_start_lf(haystack, at), |
| 668 | Look::EndLF => self.is_end_lf(haystack, at), |
| 669 | Look::StartCRLF => self.is_start_crlf(haystack, at), |
| 670 | Look::EndCRLF => self.is_end_crlf(haystack, at), |
| 671 | Look::WordAscii => self.is_word_ascii(haystack, at), |
| 672 | Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at), |
| 673 | Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(), |
| 674 | Look::WordUnicodeNegate => { |
| 675 | self.is_word_unicode_negate(haystack, at).unwrap() |
| 676 | } |
| 677 | Look::WordStartAscii => self.is_word_start_ascii(haystack, at), |
| 678 | Look::WordEndAscii => self.is_word_end_ascii(haystack, at), |
| 679 | Look::WordStartUnicode => { |
| 680 | self.is_word_start_unicode(haystack, at).unwrap() |
| 681 | } |
| 682 | Look::WordEndUnicode => { |
| 683 | self.is_word_end_unicode(haystack, at).unwrap() |
| 684 | } |
| 685 | Look::WordStartHalfAscii => { |
| 686 | self.is_word_start_half_ascii(haystack, at) |
| 687 | } |
| 688 | Look::WordEndHalfAscii => { |
| 689 | self.is_word_end_half_ascii(haystack, at) |
| 690 | } |
| 691 | Look::WordStartHalfUnicode => { |
| 692 | self.is_word_start_half_unicode(haystack, at).unwrap() |
| 693 | } |
| 694 | Look::WordEndHalfUnicode => { |
| 695 | self.is_word_end_half_unicode(haystack, at).unwrap() |
| 696 | } |
| 697 | } |
| 698 | } |
| 699 | |
/// Returns true when _all_ of the assertions in the given set match at the
/// given position in the haystack.
///
/// # Panics
///
/// This panics when testing any Unicode word boundary assertion in this
/// set and when the Unicode word data is not available. Specifically, this
/// only occurs when the `unicode-word-boundary` feature is not enabled.
///
/// Since it's generally expected that this routine is called inside of
/// a matching engine, callers should check the error condition when
/// building the matching engine. If there is a Unicode word boundary
/// in the matcher and the data isn't available, then the matcher should
/// fail to build.
///
/// Callers can check the error condition with [`LookSet::available`].
///
/// This also may panic when `at > haystack.len()`. Note that `at ==
/// haystack.len()` is legal and guaranteed not to panic.
#[inline]
pub fn matches_set(
    &self,
    set: LookSet,
    haystack: &[u8],
    at: usize,
) -> bool {
    // The crate-internal variant does the actual work.
    self.matches_set_inline(set, haystack, at)
}
| 728 | |
| 729 | /// Like `LookSet::matches`, but forcefully inlined for perf. |
| 730 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 731 | pub(crate) fn matches_set_inline( |
| 732 | &self, |
| 733 | set: LookSet, |
| 734 | haystack: &[u8], |
| 735 | at: usize, |
| 736 | ) -> bool { |
| 737 | // This used to luse LookSet::iter with Look::matches on each element, |
| 738 | // but that proved to be quite diastrous for perf. The manual "if |
| 739 | // the set has this assertion, check it" turns out to be quite a bit |
| 740 | // faster. |
| 741 | if set.contains(Look::Start) { |
| 742 | if !self.is_start(haystack, at) { |
| 743 | return false; |
| 744 | } |
| 745 | } |
| 746 | if set.contains(Look::End) { |
| 747 | if !self.is_end(haystack, at) { |
| 748 | return false; |
| 749 | } |
| 750 | } |
| 751 | if set.contains(Look::StartLF) { |
| 752 | if !self.is_start_lf(haystack, at) { |
| 753 | return false; |
| 754 | } |
| 755 | } |
| 756 | if set.contains(Look::EndLF) { |
| 757 | if !self.is_end_lf(haystack, at) { |
| 758 | return false; |
| 759 | } |
| 760 | } |
| 761 | if set.contains(Look::StartCRLF) { |
| 762 | if !self.is_start_crlf(haystack, at) { |
| 763 | return false; |
| 764 | } |
| 765 | } |
| 766 | if set.contains(Look::EndCRLF) { |
| 767 | if !self.is_end_crlf(haystack, at) { |
| 768 | return false; |
| 769 | } |
| 770 | } |
| 771 | if set.contains(Look::WordAscii) { |
| 772 | if !self.is_word_ascii(haystack, at) { |
| 773 | return false; |
| 774 | } |
| 775 | } |
| 776 | if set.contains(Look::WordAsciiNegate) { |
| 777 | if !self.is_word_ascii_negate(haystack, at) { |
| 778 | return false; |
| 779 | } |
| 780 | } |
| 781 | if set.contains(Look::WordUnicode) { |
| 782 | if !self.is_word_unicode(haystack, at).unwrap() { |
| 783 | return false; |
| 784 | } |
| 785 | } |
| 786 | if set.contains(Look::WordUnicodeNegate) { |
| 787 | if !self.is_word_unicode_negate(haystack, at).unwrap() { |
| 788 | return false; |
| 789 | } |
| 790 | } |
| 791 | if set.contains(Look::WordStartAscii) { |
| 792 | if !self.is_word_start_ascii(haystack, at) { |
| 793 | return false; |
| 794 | } |
| 795 | } |
| 796 | if set.contains(Look::WordEndAscii) { |
| 797 | if !self.is_word_end_ascii(haystack, at) { |
| 798 | return false; |
| 799 | } |
| 800 | } |
| 801 | if set.contains(Look::WordStartUnicode) { |
| 802 | if !self.is_word_start_unicode(haystack, at).unwrap() { |
| 803 | return false; |
| 804 | } |
| 805 | } |
| 806 | if set.contains(Look::WordEndUnicode) { |
| 807 | if !self.is_word_end_unicode(haystack, at).unwrap() { |
| 808 | return false; |
| 809 | } |
| 810 | } |
| 811 | if set.contains(Look::WordStartHalfAscii) { |
| 812 | if !self.is_word_start_half_ascii(haystack, at) { |
| 813 | return false; |
| 814 | } |
| 815 | } |
| 816 | if set.contains(Look::WordEndHalfAscii) { |
| 817 | if !self.is_word_end_half_ascii(haystack, at) { |
| 818 | return false; |
| 819 | } |
| 820 | } |
| 821 | if set.contains(Look::WordStartHalfUnicode) { |
| 822 | if !self.is_word_start_half_unicode(haystack, at).unwrap() { |
| 823 | return false; |
| 824 | } |
| 825 | } |
| 826 | if set.contains(Look::WordEndHalfUnicode) { |
| 827 | if !self.is_word_end_half_unicode(haystack, at).unwrap() { |
| 828 | return false; |
| 829 | } |
| 830 | } |
| 831 | true |
| 832 | } |
| 833 | |
/// Refines the given byte class set so that the equivalence classes it
/// produces are consistent with the given look-around assertion.
#[cfg(feature = "alloc")]
pub(crate) fn add_to_byteset(
    &self,
    look: Look,
    set: &mut crate::util::alphabet::ByteClassSet,
) {
    match look {
        // Nothing is added to the set for these two assertions.
        Look::Start | Look::End => {}
        Look::StartLF | Look::EndLF => {
            // The configured line terminator becomes its own range.
            let terminator = self.lineterm.0;
            set.set_range(terminator, terminator);
        }
        Look::StartCRLF | Look::EndCRLF => {
            set.set_range(b'\r', b'\r');
            set.set_range(b'\n', b'\n');
        }
        Look::WordAscii
        | Look::WordAsciiNegate
        | Look::WordUnicode
        | Look::WordUnicodeNegate
        | Look::WordStartAscii
        | Look::WordEndAscii
        | Look::WordStartUnicode
        | Look::WordEndUnicode
        | Look::WordStartHalfAscii
        | Look::WordEndHalfAscii
        | Look::WordStartHalfUnicode
        | Look::WordEndHalfUnicode => {
            // We need to mark all ranges of bytes whose pairs result in
            // evaluating \b differently. This isn't technically correct
            // for Unicode word boundaries, but DFAs can't handle those
            // anyway, and thus, the byte classes don't need to either
            // since they are themselves only used in DFAs.
            //
            // FIXME: It seems like the calls to 'set_range' here are
            // completely invariant, which means we could just hard-code
            // them here without needing to write a loop. And we only need
            // to do this dance at most once per regex.
            //
            // FIXME: Is this correct for \B?
            let is_word = utf8::is_word_byte;
            // Walk the byte values 0..=255 as maximal runs in which every
            // byte agrees with the first one on being a word byte. Each
            // run is emitted as one range, in increasing order.
            let mut start: u8 = 0;
            loop {
                let mut end = start;
                // Checking 'end < u8::MAX' first guarantees that
                // 'end + 1' cannot overflow.
                while end < u8::MAX && is_word(start) == is_word(end + 1) {
                    end += 1;
                }
                set.set_range(start, end);
                if end == u8::MAX {
                    break;
                }
                start = end + 1;
            }
        }
    }
}
| 898 | |
| 899 | /// Returns true when [`Look::Start`] is satisfied `at` the given position |
| 900 | /// in `haystack`. |
| 901 | /// |
| 902 | /// # Panics |
| 903 | /// |
| 904 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 905 | /// haystack.len()` is legal and guaranteed not to panic. |
| 906 | #[inline ] |
| 907 | pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool { |
| 908 | at == 0 |
| 909 | } |
| 910 | |
| 911 | /// Returns true when [`Look::End`] is satisfied `at` the given position in |
| 912 | /// `haystack`. |
| 913 | /// |
| 914 | /// # Panics |
| 915 | /// |
| 916 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 917 | /// haystack.len()` is legal and guaranteed not to panic. |
| 918 | #[inline ] |
| 919 | pub fn is_end(&self, haystack: &[u8], at: usize) -> bool { |
| 920 | at == haystack.len() |
| 921 | } |
| 922 | |
| 923 | /// Returns true when [`Look::StartLF`] is satisfied `at` the given |
| 924 | /// position in `haystack`. |
| 925 | /// |
| 926 | /// # Panics |
| 927 | /// |
| 928 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 929 | /// haystack.len()` is legal and guaranteed not to panic. |
| 930 | #[inline ] |
| 931 | pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool { |
| 932 | self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0 |
| 933 | } |
| 934 | |
| 935 | /// Returns true when [`Look::EndLF`] is satisfied `at` the given position |
| 936 | /// in `haystack`. |
| 937 | /// |
| 938 | /// # Panics |
| 939 | /// |
| 940 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 941 | /// haystack.len()` is legal and guaranteed not to panic. |
| 942 | #[inline ] |
| 943 | pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool { |
| 944 | self.is_end(haystack, at) || haystack[at] == self.lineterm.0 |
| 945 | } |
| 946 | |
| 947 | /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given |
| 948 | /// position in `haystack`. |
| 949 | /// |
| 950 | /// # Panics |
| 951 | /// |
| 952 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 953 | /// haystack.len()` is legal and guaranteed not to panic. |
| 954 | #[inline ] |
| 955 | pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool { |
| 956 | self.is_start(haystack, at) |
| 957 | || haystack[at - 1] == b' \n' |
| 958 | || (haystack[at - 1] == b' \r' |
| 959 | && (at >= haystack.len() || haystack[at] != b' \n' )) |
| 960 | } |
| 961 | |
| 962 | /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given |
| 963 | /// position in `haystack`. |
| 964 | /// |
| 965 | /// # Panics |
| 966 | /// |
| 967 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 968 | /// haystack.len()` is legal and guaranteed not to panic. |
| 969 | #[inline ] |
| 970 | pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool { |
| 971 | self.is_end(haystack, at) |
| 972 | || haystack[at] == b' \r' |
| 973 | || (haystack[at] == b' \n' |
| 974 | && (at == 0 || haystack[at - 1] != b' \r' )) |
| 975 | } |
| 976 | |
| 977 | /// Returns true when [`Look::WordAscii`] is satisfied `at` the given |
| 978 | /// position in `haystack`. |
| 979 | /// |
| 980 | /// # Panics |
| 981 | /// |
| 982 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 983 | /// haystack.len()` is legal and guaranteed not to panic. |
| 984 | #[inline ] |
| 985 | pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool { |
| 986 | let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 987 | let word_after = |
| 988 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 989 | word_before != word_after |
| 990 | } |
| 991 | |
| 992 | /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given |
| 993 | /// position in `haystack`. |
| 994 | /// |
| 995 | /// # Panics |
| 996 | /// |
| 997 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 998 | /// haystack.len()` is legal and guaranteed not to panic. |
| 999 | #[inline ] |
| 1000 | pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool { |
| 1001 | !self.is_word_ascii(haystack, at) |
| 1002 | } |
| 1003 | |
| 1004 | /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given |
| 1005 | /// position in `haystack`. |
| 1006 | /// |
| 1007 | /// # Panics |
| 1008 | /// |
| 1009 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1010 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1011 | /// |
| 1012 | /// # Errors |
| 1013 | /// |
| 1014 | /// This returns an error when Unicode word boundary tables |
| 1015 | /// are not available. Specifically, this only occurs when the |
| 1016 | /// `unicode-word-boundary` feature is not enabled. |
| 1017 | #[inline ] |
| 1018 | pub fn is_word_unicode( |
| 1019 | &self, |
| 1020 | haystack: &[u8], |
| 1021 | at: usize, |
| 1022 | ) -> Result<bool, UnicodeWordBoundaryError> { |
| 1023 | let word_before = is_word_char::rev(haystack, at)?; |
| 1024 | let word_after = is_word_char::fwd(haystack, at)?; |
| 1025 | Ok(word_before != word_after) |
| 1026 | } |
| 1027 | |
| 1028 | /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the |
| 1029 | /// given position in `haystack`. |
| 1030 | /// |
| 1031 | /// # Panics |
| 1032 | /// |
| 1033 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1034 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1035 | /// |
| 1036 | /// # Errors |
| 1037 | /// |
| 1038 | /// This returns an error when Unicode word boundary tables |
| 1039 | /// are not available. Specifically, this only occurs when the |
| 1040 | /// `unicode-word-boundary` feature is not enabled. |
| 1041 | #[inline ] |
| 1042 | pub fn is_word_unicode_negate( |
| 1043 | &self, |
| 1044 | haystack: &[u8], |
| 1045 | at: usize, |
| 1046 | ) -> Result<bool, UnicodeWordBoundaryError> { |
| 1047 | // This is pretty subtle. Why do we need to do UTF-8 decoding here? |
| 1048 | // Well... at time of writing, the is_word_char_{fwd,rev} routines will |
| 1049 | // only return true if there is a valid UTF-8 encoding of a "word" |
| 1050 | // codepoint, and false in every other case (including invalid UTF-8). |
| 1051 | // This means that in regions of invalid UTF-8 (which might be a |
| 1052 | // subset of valid UTF-8!), it would result in \B matching. While this |
| 1053 | // would be questionable in the context of truly invalid UTF-8, it is |
| 1054 | // *certainly* wrong to report match boundaries that split the encoding |
| 1055 | // of a codepoint. So to work around this, we ensure that we can decode |
| 1056 | // a codepoint on either side of `at`. If either direction fails, then |
| 1057 | // we don't permit \B to match at all. |
| 1058 | // |
| 1059 | // Now, this isn't exactly optimal from a perf perspective. We could |
| 1060 | // try and detect this in is_word_char::{fwd,rev}, but it's not clear |
| 1061 | // if it's worth it. \B is, after all, rarely used. Even worse, |
| 1062 | // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this |
| 1063 | // will wind up doing UTF-8 decoding twice. Owch. We could fix this |
| 1064 | // with more code complexity, but it just doesn't feel worth it for \B. |
| 1065 | // |
| 1066 | // And in particular, we do *not* have to do this with \b, because \b |
| 1067 | // *requires* that at least one side of `at` be a "word" codepoint, |
| 1068 | // which in turn implies one side of `at` must be valid UTF-8. This in |
| 1069 | // turn implies that \b can never split a valid UTF-8 encoding of a |
| 1070 | // codepoint. In the case where one side of `at` is truly invalid UTF-8 |
| 1071 | // and the other side IS a word codepoint, then we want \b to match |
| 1072 | // since it represents a valid UTF-8 boundary. It also makes sense. For |
| 1073 | // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. |
| 1074 | // |
| 1075 | // Note also that this is not just '!is_word_unicode(..)' like it is |
| 1076 | // for the ASCII case. For example, neither \b nor \B is satisfied |
| 1077 | // within invalid UTF-8 sequences. |
| 1078 | let word_before = at > 0 |
| 1079 | && match utf8::decode_last(&haystack[..at]) { |
| 1080 | None | Some(Err(_)) => return Ok(false), |
| 1081 | Some(Ok(_)) => is_word_char::rev(haystack, at)?, |
| 1082 | }; |
| 1083 | let word_after = at < haystack.len() |
| 1084 | && match utf8::decode(&haystack[at..]) { |
| 1085 | None | Some(Err(_)) => return Ok(false), |
| 1086 | Some(Ok(_)) => is_word_char::fwd(haystack, at)?, |
| 1087 | }; |
| 1088 | Ok(word_before == word_after) |
| 1089 | } |
| 1090 | |
| 1091 | /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given |
| 1092 | /// position in `haystack`. |
| 1093 | /// |
| 1094 | /// # Panics |
| 1095 | /// |
| 1096 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1097 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1098 | #[inline ] |
| 1099 | pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool { |
| 1100 | let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 1101 | let word_after = |
| 1102 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 1103 | !word_before && word_after |
| 1104 | } |
| 1105 | |
| 1106 | /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given |
| 1107 | /// position in `haystack`. |
| 1108 | /// |
| 1109 | /// # Panics |
| 1110 | /// |
| 1111 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1112 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1113 | #[inline ] |
| 1114 | pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool { |
| 1115 | let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 1116 | let word_after = |
| 1117 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 1118 | word_before && !word_after |
| 1119 | } |
| 1120 | |
| 1121 | /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the |
| 1122 | /// given position in `haystack`. |
| 1123 | /// |
| 1124 | /// # Panics |
| 1125 | /// |
| 1126 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1127 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1128 | /// |
| 1129 | /// # Errors |
| 1130 | /// |
| 1131 | /// This returns an error when Unicode word boundary tables |
| 1132 | /// are not available. Specifically, this only occurs when the |
| 1133 | /// `unicode-word-boundary` feature is not enabled. |
| 1134 | #[inline ] |
| 1135 | pub fn is_word_start_unicode( |
| 1136 | &self, |
| 1137 | haystack: &[u8], |
| 1138 | at: usize, |
| 1139 | ) -> Result<bool, UnicodeWordBoundaryError> { |
| 1140 | let word_before = is_word_char::rev(haystack, at)?; |
| 1141 | let word_after = is_word_char::fwd(haystack, at)?; |
| 1142 | Ok(!word_before && word_after) |
| 1143 | } |
| 1144 | |
| 1145 | /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the |
| 1146 | /// given position in `haystack`. |
| 1147 | /// |
| 1148 | /// # Panics |
| 1149 | /// |
| 1150 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1151 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1152 | /// |
| 1153 | /// # Errors |
| 1154 | /// |
| 1155 | /// This returns an error when Unicode word boundary tables |
| 1156 | /// are not available. Specifically, this only occurs when the |
| 1157 | /// `unicode-word-boundary` feature is not enabled. |
| 1158 | #[inline ] |
| 1159 | pub fn is_word_end_unicode( |
| 1160 | &self, |
| 1161 | haystack: &[u8], |
| 1162 | at: usize, |
| 1163 | ) -> Result<bool, UnicodeWordBoundaryError> { |
| 1164 | let word_before = is_word_char::rev(haystack, at)?; |
| 1165 | let word_after = is_word_char::fwd(haystack, at)?; |
| 1166 | Ok(word_before && !word_after) |
| 1167 | } |
| 1168 | |
| 1169 | /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the |
| 1170 | /// given position in `haystack`. |
| 1171 | /// |
| 1172 | /// # Panics |
| 1173 | /// |
| 1174 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1175 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1176 | #[inline ] |
| 1177 | pub fn is_word_start_half_ascii( |
| 1178 | &self, |
| 1179 | haystack: &[u8], |
| 1180 | at: usize, |
| 1181 | ) -> bool { |
| 1182 | let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); |
| 1183 | !word_before |
| 1184 | } |
| 1185 | |
| 1186 | /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the |
| 1187 | /// given position in `haystack`. |
| 1188 | /// |
| 1189 | /// # Panics |
| 1190 | /// |
| 1191 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1192 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1193 | #[inline ] |
| 1194 | pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool { |
| 1195 | let word_after = |
| 1196 | at < haystack.len() && utf8::is_word_byte(haystack[at]); |
| 1197 | !word_after |
| 1198 | } |
| 1199 | |
| 1200 | /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the |
| 1201 | /// given position in `haystack`. |
| 1202 | /// |
| 1203 | /// # Panics |
| 1204 | /// |
| 1205 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1206 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1207 | /// |
| 1208 | /// # Errors |
| 1209 | /// |
| 1210 | /// This returns an error when Unicode word boundary tables |
| 1211 | /// are not available. Specifically, this only occurs when the |
| 1212 | /// `unicode-word-boundary` feature is not enabled. |
| 1213 | #[inline ] |
| 1214 | pub fn is_word_start_half_unicode( |
| 1215 | &self, |
| 1216 | haystack: &[u8], |
| 1217 | at: usize, |
| 1218 | ) -> Result<bool, UnicodeWordBoundaryError> { |
| 1219 | // See `is_word_unicode_negate` for why we need to do this. We don't |
| 1220 | // need to do it for `is_word_start_unicode` because that guarantees |
| 1221 | // that the position matched falls on a valid UTF-8 boundary given |
| 1222 | // that the right side must be in \w. |
| 1223 | let word_before = at > 0 |
| 1224 | && match utf8::decode_last(&haystack[..at]) { |
| 1225 | None | Some(Err(_)) => return Ok(false), |
| 1226 | Some(Ok(_)) => is_word_char::rev(haystack, at)?, |
| 1227 | }; |
| 1228 | Ok(!word_before) |
| 1229 | } |
| 1230 | |
| 1231 | /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the |
| 1232 | /// given position in `haystack`. |
| 1233 | /// |
| 1234 | /// # Panics |
| 1235 | /// |
| 1236 | /// This may panic when `at > haystack.len()`. Note that `at == |
| 1237 | /// haystack.len()` is legal and guaranteed not to panic. |
| 1238 | /// |
| 1239 | /// # Errors |
| 1240 | /// |
| 1241 | /// This returns an error when Unicode word boundary tables |
| 1242 | /// are not available. Specifically, this only occurs when the |
| 1243 | /// `unicode-word-boundary` feature is not enabled. |
| 1244 | #[inline ] |
| 1245 | pub fn is_word_end_half_unicode( |
| 1246 | &self, |
| 1247 | haystack: &[u8], |
| 1248 | at: usize, |
| 1249 | ) -> Result<bool, UnicodeWordBoundaryError> { |
| 1250 | // See `is_word_unicode_negate` for why we need to do this. We don't |
| 1251 | // need to do it for `is_word_end_unicode` because that guarantees |
| 1252 | // that the position matched falls on a valid UTF-8 boundary given |
| 1253 | // that the left side must be in \w. |
| 1254 | let word_after = at < haystack.len() |
| 1255 | && match utf8::decode(&haystack[at..]) { |
| 1256 | None | Some(Err(_)) => return Ok(false), |
| 1257 | Some(Ok(_)) => is_word_char::fwd(haystack, at)?, |
| 1258 | }; |
| 1259 | Ok(!word_after) |
| 1260 | } |
| 1261 | } |
| 1262 | |
| 1263 | impl Default for LookMatcher { |
| 1264 | fn default() -> LookMatcher { |
| 1265 | LookMatcher::new() |
| 1266 | } |
| 1267 | } |
| 1268 | |
/// The error reported when the Unicode-aware `\w` class is unavailable.
///
/// Deciding whether a Unicode-aware `\b` (or `\B`) matches at some
/// position requires knowing whether a codepoint is a word character,
/// which is determined by the Unicode-aware Perl character class `\w`.
/// This error occurs when the data tables needed for that class are
/// unavailable.
///
/// This error can only occur when the `unicode-word-boundary` feature is
/// disabled.
#[derive(Clone, Debug)]
pub struct UnicodeWordBoundaryError(());
| 1281 | |
| 1282 | impl UnicodeWordBoundaryError { |
| 1283 | #[cfg (not(feature = "unicode-word-boundary" ))] |
| 1284 | pub(crate) fn new() -> UnicodeWordBoundaryError { |
| 1285 | UnicodeWordBoundaryError(()) |
| 1286 | } |
| 1287 | |
| 1288 | /// Returns an error if and only if Unicode word boundary data is |
| 1289 | /// unavailable. |
| 1290 | pub fn check() -> Result<(), UnicodeWordBoundaryError> { |
| 1291 | is_word_char::check() |
| 1292 | } |
| 1293 | } |
| 1294 | |
// The `std::error::Error` impl needs `std`, so it is gated on the `std`
// feature. The trait's default methods are used as-is.
#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordBoundaryError {}
| 1297 | |
| 1298 | impl core::fmt::Display for UnicodeWordBoundaryError { |
| 1299 | fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { |
| 1300 | write!( |
| 1301 | f, |
| 1302 | "Unicode-aware \\b and \\B are unavailable because the \ |
| 1303 | requisite data tables are missing, please enable the \ |
| 1304 | unicode-word-boundary feature" |
| 1305 | ) |
| 1306 | } |
| 1307 | } |
| 1308 | |
| 1309 | // Below are FOUR different ways for checking whether a "word" |
| 1310 | // codepoint exists at a particular position in the haystack. The four |
| 1311 | // different approaches are, in order of preference: |
| 1312 | // |
| 1313 | // 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the |
| 1314 | // first call, and then use that DFA for all subsequent calls. |
| 1315 | // 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available. |
| 1316 | // 3. Do UTF-8 decoding and use our own 'perl_word' table. |
| 1317 | // 4. Return an error. |
| 1318 | // |
| 1319 | // The reason for all of these approaches is a combination of perf and |
| 1320 | // permitting one to build regex-automata without the Unicode data necessary |
| 1321 | // for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would |
| 1322 | // still work.) |
| 1323 | // |
| 1324 | // The DFA approach is the fastest, but it requires the regex parser, the |
| 1325 | // NFA compiler, the DFA builder and the DFA search runtime. That's a lot to |
| 1326 | // bring in, but if it's available, it's (probably) the best we can do. |
| 1327 | // |
| 1328 | // Approaches (2) and (3) are effectively equivalent, but (2) reuses the |
| 1329 | // data in regex-syntax and avoids duplicating it in regex-automata. |
| 1330 | // |
| 1331 | // Finally, (4) unconditionally returns an error since the requisite data isn't |
| 1332 | // available anywhere. |
| 1333 | // |
| 1334 | // There are actually more approaches possible that we didn't implement. For |
| 1335 | // example, if the DFA builder is available but the syntax parser is not, we |
| 1336 | // could technically hand construct our own NFA from the 'perl_word' data |
| 1337 | // table. But to avoid some pretty hairy code duplication, we would in turn |
| 1338 | // need to pull the UTF-8 compiler out of the NFA compiler. Yikes. |
| 1339 | // |
| 1340 | // A possibly more sensible alternative is to use a lazy DFA when the full |
| 1341 | // DFA builder isn't available... |
| 1342 | // |
| 1343 | // Yet another choice would be to build the full DFA and then embed it into the |
| 1344 | // source. Then we'd only need to bring in the DFA search runtime, which is |
| 1345 | // considerably smaller than the DFA builder code. The problem here is that the |
| 1346 | // Debian people have spooked me[1] into avoiding cyclic dependencies. Namely, |
| 1347 | // we'd need to build regex-cli, which depends on regex-automata in order to |
| 1348 | // build some part of regex-automata. But to be honest, something like this has |
| 1349 | // to be allowed somehow? I just don't know what the right process is. |
| 1350 | // |
| 1351 | // There are perhaps other choices as well. Why did I stop at these 4? Because |
| 1352 | // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA |
| 1353 | // approach eventually, as the benefits of the DFA approach are somewhat |
| 1354 | // compelling. The 'boundary-words-holmes' benchmark tests this. (Note that |
| 1355 | // the commands below no longer work. If necessary, we should re-capitulate |
| 1356 | // the benchmark from whole cloth in rebar.) |
| 1357 | // |
| 1358 | // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv |
| 1359 | // |
| 1360 | // Then I changed the code below so that the util/unicode_data/perl_word table |
| 1361 | // was used and re-ran the benchmark: |
| 1362 | // |
| 1363 | // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv |
| 1364 | // |
| 1365 | // And compared them: |
| 1366 | // |
| 1367 | // $ regex-cli bench diff dfa.csv table.csv |
| 1368 | // benchmark engine dfa table |
| 1369 | // --------- ------ --- ----- |
| 1370 | // internal/count/boundary-words-holmes regex/automata/pikevm 18.6 MB/s 12.9 MB/s |
| 1371 | // |
| 1372 | // Which is a nice improvement. |
| 1373 | // |
| 1374 | // UPDATE: It turns out that it takes approximately 22ms to build the reverse |
| 1375 | // DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in |
| 1376 | // the grand scheme of things, but that is a significant latency cost. So I'm not |
| 1377 | // sure that's a good idea. I then tried using a lazy DFA instead, and that |
| 1378 | // eliminated the overhead, but since the lazy DFA requires mutable working |
| 1379 | // memory, that requires introducing a 'Cache' for every simultaneous call. |
| 1380 | // |
| 1381 | // I ended up deciding for now to just keep the "UTF-8 decode and check the |
| 1382 | // table." The DFA and lazy DFA approaches are still below, but commented out. |
| 1383 | // |
| 1384 | // [1]: https://github.com/BurntSushi/ucd-generate/issues/11 |
| 1385 | |
| 1386 | /* |
| 1387 | /// A module that looks for word codepoints using lazy DFAs. |
| 1388 | #[cfg(all( |
| 1389 | feature = "unicode-word-boundary", |
| 1390 | feature = "syntax", |
| 1391 | feature = "unicode-perl", |
| 1392 | feature = "hybrid" |
| 1393 | ))] |
| 1394 | mod is_word_char { |
| 1395 | use alloc::vec::Vec; |
| 1396 | |
| 1397 | use crate::{ |
| 1398 | hybrid::dfa::{Cache, DFA}, |
| 1399 | nfa::thompson::NFA, |
| 1400 | util::{lazy::Lazy, pool::Pool, primitives::StateID}, |
| 1401 | Anchored, Input, |
| 1402 | }; |
| 1403 | |
| 1404 | pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { |
| 1405 | Ok(()) |
| 1406 | } |
| 1407 | |
| 1408 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 1409 | pub(super) fn fwd( |
| 1410 | haystack: &[u8], |
| 1411 | mut at: usize, |
| 1412 | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
| 1413 | static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap()); |
| 1414 | static CACHE: Lazy<Pool<Cache>> = |
| 1415 | Lazy::new(|| Pool::new(|| WORD.create_cache())); |
| 1416 | let dfa = Lazy::get(&WORD); |
| 1417 | let mut cache = Lazy::get(&CACHE).get(); |
| 1418 | let mut sid = dfa |
| 1419 | .start_state_forward( |
| 1420 | &mut cache, |
| 1421 | &Input::new("").anchored(Anchored::Yes), |
| 1422 | ) |
| 1423 | .unwrap(); |
| 1424 | while at < haystack.len() { |
| 1425 | let byte = haystack[at]; |
| 1426 | sid = dfa.next_state(&mut cache, sid, byte).unwrap(); |
| 1427 | at += 1; |
| 1428 | if sid.is_tagged() { |
| 1429 | if sid.is_match() { |
| 1430 | return Ok(true); |
| 1431 | } else if sid.is_dead() { |
| 1432 | return Ok(false); |
| 1433 | } |
| 1434 | } |
| 1435 | } |
| 1436 | Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) |
| 1437 | } |
| 1438 | |
| 1439 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 1440 | pub(super) fn rev( |
| 1441 | haystack: &[u8], |
| 1442 | mut at: usize, |
| 1443 | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
| 1444 | static WORD: Lazy<DFA> = Lazy::new(|| { |
| 1445 | DFA::builder() |
| 1446 | .thompson(NFA::config().reverse(true)) |
| 1447 | .build(r"\w") |
| 1448 | .unwrap() |
| 1449 | }); |
| 1450 | static CACHE: Lazy<Pool<Cache>> = |
| 1451 | Lazy::new(|| Pool::new(|| WORD.create_cache())); |
| 1452 | let dfa = Lazy::get(&WORD); |
| 1453 | let mut cache = Lazy::get(&CACHE).get(); |
| 1454 | let mut sid = dfa |
| 1455 | .start_state_reverse( |
| 1456 | &mut cache, |
| 1457 | &Input::new("").anchored(Anchored::Yes), |
| 1458 | ) |
| 1459 | .unwrap(); |
| 1460 | while at > 0 { |
| 1461 | at -= 1; |
| 1462 | let byte = haystack[at]; |
| 1463 | sid = dfa.next_state(&mut cache, sid, byte).unwrap(); |
| 1464 | if sid.is_tagged() { |
| 1465 | if sid.is_match() { |
| 1466 | return Ok(true); |
| 1467 | } else if sid.is_dead() { |
| 1468 | return Ok(false); |
| 1469 | } |
| 1470 | } |
| 1471 | } |
| 1472 | Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) |
| 1473 | } |
| 1474 | } |
| 1475 | */ |
| 1476 | |
| 1477 | /* |
| 1478 | /// A module that looks for word codepoints using fully compiled DFAs. |
| 1479 | #[cfg(all( |
| 1480 | feature = "unicode-word-boundary", |
| 1481 | feature = "syntax", |
| 1482 | feature = "unicode-perl", |
| 1483 | feature = "dfa-build" |
| 1484 | ))] |
| 1485 | mod is_word_char { |
| 1486 | use alloc::vec::Vec; |
| 1487 | |
| 1488 | use crate::{ |
| 1489 | dfa::{dense::DFA, Automaton, StartKind}, |
| 1490 | nfa::thompson::NFA, |
| 1491 | util::{lazy::Lazy, primitives::StateID}, |
| 1492 | Anchored, Input, |
| 1493 | }; |
| 1494 | |
| 1495 | pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { |
| 1496 | Ok(()) |
| 1497 | } |
| 1498 | |
| 1499 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 1500 | pub(super) fn fwd( |
| 1501 | haystack: &[u8], |
| 1502 | mut at: usize, |
| 1503 | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
| 1504 | static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { |
| 1505 | let dfa = DFA::builder() |
| 1506 | .configure(DFA::config().start_kind(StartKind::Anchored)) |
| 1507 | .build(r"\w") |
| 1508 | .unwrap(); |
| 1509 | // OK because our regex has no look-around. |
| 1510 | let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); |
| 1511 | (dfa, start_id) |
| 1512 | }); |
| 1513 | let &(ref dfa, mut sid) = Lazy::get(&WORD); |
| 1514 | while at < haystack.len() { |
| 1515 | let byte = haystack[at]; |
| 1516 | sid = dfa.next_state(sid, byte); |
| 1517 | at += 1; |
| 1518 | if dfa.is_special_state(sid) { |
| 1519 | if dfa.is_match_state(sid) { |
| 1520 | return Ok(true); |
| 1521 | } else if dfa.is_dead_state(sid) { |
| 1522 | return Ok(false); |
| 1523 | } |
| 1524 | } |
| 1525 | } |
| 1526 | Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) |
| 1527 | } |
| 1528 | |
| 1529 | #[cfg_attr(feature = "perf-inline", inline(always))] |
| 1530 | pub(super) fn rev( |
| 1531 | haystack: &[u8], |
| 1532 | mut at: usize, |
| 1533 | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
| 1534 | static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| { |
| 1535 | let dfa = DFA::builder() |
| 1536 | .configure(DFA::config().start_kind(StartKind::Anchored)) |
| 1537 | // From ad hoc measurements, it looks like setting |
| 1538 | // shrink==false is slightly faster than shrink==true. I kind |
| 1539 | // of feel like this indicates that shrinking is probably a |
| 1540 | // failure, although it can help in some cases. Sigh. |
| 1541 | .thompson(NFA::config().reverse(true).shrink(false)) |
| 1542 | .build(r"\w") |
| 1543 | .unwrap(); |
| 1544 | // OK because our regex has no look-around. |
| 1545 | let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); |
| 1546 | (dfa, start_id) |
| 1547 | }); |
| 1548 | let &(ref dfa, mut sid) = Lazy::get(&WORD); |
| 1549 | while at > 0 { |
| 1550 | at -= 1; |
| 1551 | let byte = haystack[at]; |
| 1552 | sid = dfa.next_state(sid, byte); |
| 1553 | if dfa.is_special_state(sid) { |
| 1554 | if dfa.is_match_state(sid) { |
| 1555 | return Ok(true); |
| 1556 | } else if dfa.is_dead_state(sid) { |
| 1557 | return Ok(false); |
| 1558 | } |
| 1559 | } |
| 1560 | } |
| 1561 | Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) |
| 1562 | } |
| 1563 | } |
| 1564 | */ |
| 1565 | |
/// Word-codepoint detection backed by the data tables that ship with
/// regex-syntax.
///
/// This variant is compiled when the `unicode-word-boundary`, `syntax` and
/// `unicode-perl` features are all enabled.
#[cfg(all(
    feature = "unicode-word-boundary",
    feature = "syntax",
    feature = "unicode-perl",
))]
mod is_word_char {
    use regex_syntax::try_is_word_character;

    use crate::util::utf8;

    /// Reports whether Unicode word boundary checks are usable in this
    /// build. In this configuration that is unconditionally the case.
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
        Ok(())
    }

    /// Returns `Ok(true)` when `utf8::decode` yields a codepoint from the
    /// bytes starting at `at` and that codepoint is a word character.
    ///
    /// When the decoder yields nothing or reports an error, the answer is
    /// `Ok(false)`. This variant never returns `Err`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn fwd(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        if let Some(Ok(ch)) = utf8::decode(&haystack[at..]) {
            return Ok(is_word(ch));
        }
        Ok(false)
    }

    /// Returns `Ok(true)` when `utf8::decode_last` yields a codepoint from
    /// the bytes ending at `at` and that codepoint is a word character.
    ///
    /// When the decoder yields nothing or reports an error, the answer is
    /// `Ok(false)`. This variant never returns `Err`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn rev(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        if let Some(Ok(ch)) = utf8::decode_last(&haystack[..at]) {
            return Ok(is_word(ch));
        }
        Ok(false)
    }

    /// Asks regex-syntax whether `ch` is a word character.
    ///
    /// The lookup is fallible in general, but it is treated as infallible
    /// here because the features gating this module are all enabled; a
    /// failure panics with the message below.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_word(ch: char) -> bool {
        try_is_word_character(ch).expect(
            "since unicode-word-boundary, syntax and unicode-perl \
             are all enabled, it is expected that \
             try_is_word_character succeeds",
        )
    }
}
| 1611 | |
/// Word-codepoint detection backed by regex-automata's own data tables
/// (which are only compiled when regex-syntax's tables aren't available).
///
/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for
/// perl_word.
#[cfg(all(
    feature = "unicode-word-boundary",
    not(all(feature = "syntax", feature = "unicode-perl")),
))]
mod is_word_char {
    use crate::util::utf8;

    /// Reports whether Unicode word boundary checks are usable in this
    /// build. In this configuration that is unconditionally the case.
    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
        Ok(())
    }

    /// Returns `Ok(true)` when `utf8::decode` yields a codepoint from the
    /// bytes starting at `at` and that codepoint is a word character.
    ///
    /// When the decoder yields nothing or reports an error, the answer is
    /// `Ok(false)`. This variant never returns `Err`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn fwd(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        if let Some(Ok(ch)) = utf8::decode(&haystack[at..]) {
            return Ok(is_word_character(ch));
        }
        Ok(false)
    }

    /// Returns `Ok(true)` when `utf8::decode_last` yields a codepoint from
    /// the bytes ending at `at` and that codepoint is a word character.
    ///
    /// When the decoder yields nothing or reports an error, the answer is
    /// `Ok(false)`. This variant never returns `Err`.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn rev(
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        if let Some(Ok(ch)) = utf8::decode_last(&haystack[..at]) {
            return Ok(is_word_character(ch));
        }
        Ok(false)
    }

    /// Returns true if `c` is a word character according to the `PERL_WORD`
    /// range table.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_word_character(c: char) -> bool {
        use core::cmp::Ordering;

        use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};

        // Fast path: codepoints that fit in a single byte are answered by
        // the byte-level predicate without touching the range table.
        if let Ok(byte) = u8::try_from(c) {
            if utf8::is_word_byte(byte) {
                return true;
            }
        }
        // Otherwise look for a (start, end) range that contains `c`. The
        // comparator reports where the probed range sits relative to `c`.
        PERL_WORD
            .binary_search_by(|&(start, end)| {
                if c < start {
                    Ordering::Greater
                } else if end < c {
                    Ordering::Less
                } else {
                    Ordering::Equal
                }
            })
            .is_ok()
    }
}
| 1672 | |
| 1673 | /// A module that always returns an error if Unicode word boundaries are |
| 1674 | /// disabled. When this feature is disabled, then regex-automata will not |
| 1675 | /// include its own data tables even if regex-syntax is disabled. |
| 1676 | #[cfg (not(feature = "unicode-word-boundary" ))] |
| 1677 | mod is_word_char { |
| 1678 | pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { |
| 1679 | Err(super::UnicodeWordBoundaryError::new()) |
| 1680 | } |
| 1681 | |
| 1682 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1683 | pub(super) fn fwd( |
| 1684 | _bytes: &[u8], |
| 1685 | _at: usize, |
| 1686 | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
| 1687 | Err(super::UnicodeWordBoundaryError::new()) |
| 1688 | } |
| 1689 | |
| 1690 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 1691 | pub(super) fn rev( |
| 1692 | _bytes: &[u8], |
| 1693 | _at: usize, |
| 1694 | ) -> Result<bool, super::UnicodeWordBoundaryError> { |
| 1695 | Err(super::UnicodeWordBoundaryError::new()) |
| 1696 | } |
| 1697 | } |
| 1698 | |
| 1699 | #[cfg (test)] |
| 1700 | mod tests { |
| 1701 | use super::*; |
| 1702 | |
// Evaluates to the result of asking a default-configured `LookMatcher`
// whether `$assertion` holds at byte offset `$offset` of `$text`. `$text`
// may be anything that provides `as_bytes()`.
macro_rules! testlook {
    ($assertion:expr, $text:expr, $offset:expr) => {
        LookMatcher::default().matches($assertion, $text.as_bytes(), $offset)
    };
}
| 1708 | |
/// Checks `Look::StartLF`: it is expected to hold at offset 0 and at any
/// offset immediately preceded by `\n`, and nowhere else.
#[test]
fn look_matches_start_line() {
    let look = Look::StartLF;

    // NOTE: the haystacks must not contain a space before `\n`. With a
    // stray space, offset 1 of the two-byte haystack would follow a space
    // rather than a line terminator and the positive cases below would fail.

    // Start of the haystack, or directly after a line feed.
    assert!(testlook!(look, "", 0));
    assert!(testlook!(look, "\n", 0));
    assert!(testlook!(look, "\n", 1));
    assert!(testlook!(look, "a", 0));
    assert!(testlook!(look, "\na", 1));

    // Preceded by something other than a line feed.
    assert!(!testlook!(look, "a", 1));
    assert!(!testlook!(look, "a\na", 1));
}
| 1722 | |
/// Checks `Look::EndLF`: it is expected to hold at the end of the haystack
/// and at any offset immediately followed by `\n`, and nowhere else.
#[test]
fn look_matches_end_line() {
    let look = Look::EndLF;

    // NOTE: the haystacks must not contain a space before `\n`. With a
    // stray space the offsets below would no longer sit at the end of the
    // haystack or directly in front of the line feed.

    // End of the haystack, or directly before a line feed.
    assert!(testlook!(look, "", 0));
    assert!(testlook!(look, "\n", 1));
    assert!(testlook!(look, "\na", 0));
    assert!(testlook!(look, "\na", 2));
    assert!(testlook!(look, "a\na", 1));

    // Followed by something other than a line feed.
    assert!(!testlook!(look, "a", 0));
    assert!(!testlook!(look, "\na", 1));
    assert!(!testlook!(look, "a\na", 0));
    assert!(!testlook!(look, "a\na", 2));
}
| 1738 | |
/// Checks `Look::Start`: it is expected to hold only at offset 0, even when
/// the offset directly follows a line feed.
#[test]
fn look_matches_start_text() {
    let look = Look::Start;

    // NOTE: the haystacks must not contain a space before `\n`. Otherwise
    // the negative cases at offset 1 would follow a space instead of a line
    // feed and would no longer show that a preceding `\n` is not enough.

    // Only the very beginning of the haystack matches.
    assert!(testlook!(look, "", 0));
    assert!(testlook!(look, "\n", 0));
    assert!(testlook!(look, "a", 0));

    // Any later offset fails, including those right after a line feed.
    assert!(!testlook!(look, "\n", 1));
    assert!(!testlook!(look, "\na", 1));
    assert!(!testlook!(look, "a", 1));
    assert!(!testlook!(look, "a\na", 1));
}
| 1752 | |
/// Checks `Look::End`: it is expected to hold only at the end of the
/// haystack, even when the offset directly precedes a line feed.
#[test]
fn look_matches_end_text() {
    let look = Look::End;

    // NOTE: the haystacks must not contain a space before `\n`. With a
    // stray space, offset 1 of "\n" and offset 2 of "\na" would no longer be
    // the end of the haystack and the positive cases below would fail.

    // Only the very end of the haystack matches.
    assert!(testlook!(look, "", 0));
    assert!(testlook!(look, "\n", 1));
    assert!(testlook!(look, "\na", 2));

    // Any earlier offset fails, including those right before a line feed.
    assert!(!testlook!(look, "\na", 0));
    assert!(!testlook!(look, "a\na", 1));
    assert!(!testlook!(look, "a", 0));
    assert!(!testlook!(look, "\na", 1));
    assert!(!testlook!(look, "a\na", 0));
    assert!(!testlook!(look, "a\na", 2));
}
| 1768 | |
/// Table-driven check of `Look::WordUnicode` against ASCII and non-ASCII
/// haystacks.
#[test]
#[cfg(all(not(miri), feature = "unicode-word-boundary"))]
fn look_matches_word_unicode() {
    let look = Look::WordUnicode;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries.
        ("a", 0, true),
        ("a", 1, true),
        ("a ", 1, true),
        (" a ", 1, true),
        (" a ", 2, true),
        // Unicode word boundaries with a non-ASCII codepoint.
        ("𝛃", 0, true),
        ("𝛃", 4, true),
        ("𝛃 ", 4, true),
        (" 𝛃 ", 1, true),
        (" 𝛃 ", 5, true),
        // Unicode word boundaries between non-ASCII codepoints.
        ("𝛃𐆀", 0, true),
        ("𝛃𐆀", 4, true),
        // Non word boundaries for ASCII.
        ("", 0, false),
        ("ab", 1, false),
        ("a ", 2, false),
        (" a ", 0, false),
        (" a ", 3, false),
        // Non word boundaries with a non-ASCII codepoint. Offsets 1..=3 of
        // "𝛃" fall inside its four-byte encoding.
        ("𝛃b", 4, false),
        ("𝛃 ", 5, false),
        (" 𝛃 ", 0, false),
        (" 𝛃 ", 6, false),
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        ("𝛃𐆀", 8, false),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 1820 | |
/// Table-driven check of `Look::WordAscii` against ASCII and non-ASCII
/// haystacks.
#[test]
fn look_matches_word_ascii() {
    let look = Look::WordAscii;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries.
        ("a", 0, true),
        ("a", 1, true),
        ("a ", 1, true),
        (" a ", 1, true),
        (" a ", 2, true),
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
        // an ASCII word boundary, none of these match.
        ("𝛃", 0, false),
        ("𝛃", 4, false),
        ("𝛃 ", 4, false),
        (" 𝛃 ", 1, false),
        (" 𝛃 ", 5, false),
        // Unicode word boundaries between non-ASCII codepoints. Again, since
        // this is an ASCII word boundary, none of these match.
        ("𝛃𐆀", 0, false),
        ("𝛃𐆀", 4, false),
        // Non word boundaries for ASCII.
        ("", 0, false),
        ("ab", 1, false),
        ("a ", 2, false),
        (" a ", 0, false),
        (" a ", 3, false),
        // Offsets involving a non-ASCII codepoint. The first one does match
        // here: it sits directly in front of the ASCII word byte `b`.
        ("𝛃b", 4, true),
        ("𝛃 ", 5, false),
        (" 𝛃 ", 0, false),
        (" 𝛃 ", 6, false),
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        ("𝛃𐆀", 8, false),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 1873 | |
/// Table-driven check of `Look::WordUnicodeNegate` against ASCII and
/// non-ASCII haystacks.
#[test]
#[cfg(all(not(miri), feature = "unicode-word-boundary"))]
fn look_matches_word_unicode_negate() {
    let look = Look::WordUnicodeNegate;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries.
        ("a", 0, false),
        ("a", 1, false),
        ("a ", 1, false),
        (" a ", 1, false),
        (" a ", 2, false),
        // Unicode word boundaries with a non-ASCII codepoint.
        ("𝛃", 0, false),
        ("𝛃", 4, false),
        ("𝛃 ", 4, false),
        (" 𝛃 ", 1, false),
        (" 𝛃 ", 5, false),
        // Unicode word boundaries between non-ASCII codepoints.
        ("𝛃𐆀", 0, false),
        ("𝛃𐆀", 4, false),
        // Non word boundaries for ASCII.
        ("", 0, true),
        ("ab", 1, true),
        ("a ", 2, true),
        (" a ", 0, true),
        (" a ", 3, true),
        // Non word boundaries with a non-ASCII codepoint.
        ("𝛃b", 4, true),
        ("𝛃 ", 5, true),
        (" 𝛃 ", 0, true),
        (" 𝛃 ", 6, true),
        // These don't match because they could otherwise return an offset
        // that splits the UTF-8 encoding of a codepoint.
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints. These also don't
        // match because they could otherwise return an offset that splits
        // the UTF-8 encoding of a codepoint.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        // But this one does, since 𐆀 isn't a word codepoint, and 8 is the
        // end of the haystack. So the "end" of the haystack isn't a word and
        // 𐆀 isn't a word, thus, \B matches.
        ("𝛃𐆀", 8, true),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 1932 | |
/// Table-driven check of `Look::WordAsciiNegate` against ASCII and
/// non-ASCII haystacks.
#[test]
fn look_matches_word_ascii_negate() {
    let look = Look::WordAsciiNegate;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries. The negated assertion fails here.
        ("a", 0, false),
        ("a", 1, false),
        ("a ", 1, false),
        (" a ", 1, false),
        (" a ", 2, false),
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
        // an ASCII assertion, none of these are word boundaries, and so the
        // negated assertion matches at all of them.
        ("𝛃", 0, true),
        ("𝛃", 4, true),
        ("𝛃 ", 4, true),
        (" 𝛃 ", 1, true),
        (" 𝛃 ", 5, true),
        // Unicode word boundaries between non-ASCII codepoints. Again, these
        // are not ASCII word boundaries, so the negated assertion matches.
        ("𝛃𐆀", 0, true),
        ("𝛃𐆀", 4, true),
        // Non word boundaries for ASCII.
        ("", 0, true),
        ("ab", 1, true),
        ("a ", 2, true),
        (" a ", 0, true),
        (" a ", 3, true),
        // Offsets involving a non-ASCII codepoint. The first one is the
        // exception: it sits directly in front of the ASCII word byte `b`,
        // so the negated assertion fails there.
        ("𝛃b", 4, false),
        ("𝛃 ", 5, true),
        (" 𝛃 ", 0, true),
        (" 𝛃 ", 6, true),
        ("𝛃", 1, true),
        ("𝛃", 2, true),
        ("𝛃", 3, true),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, true),
        ("𝛃𐆀", 2, true),
        ("𝛃𐆀", 3, true),
        ("𝛃𐆀", 5, true),
        ("𝛃𐆀", 6, true),
        ("𝛃𐆀", 7, true),
        ("𝛃𐆀", 8, true),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 1985 | |
/// Table-driven check of `Look::WordStartAscii` against ASCII and non-ASCII
/// haystacks.
#[test]
fn look_matches_word_start_ascii() {
    let look = Look::WordStartAscii;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries. Only the offsets directly in front
        // of `a` match.
        ("a", 0, true),
        ("a", 1, false),
        ("a ", 1, false),
        (" a ", 1, true),
        (" a ", 2, false),
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
        // an ASCII word boundary, none of these match.
        ("𝛃", 0, false),
        ("𝛃", 4, false),
        ("𝛃 ", 4, false),
        (" 𝛃 ", 1, false),
        (" 𝛃 ", 5, false),
        // Unicode word boundaries between non-ASCII codepoints. Again, since
        // this is an ASCII word boundary, none of these match.
        ("𝛃𐆀", 0, false),
        ("𝛃𐆀", 4, false),
        // Non word boundaries for ASCII.
        ("", 0, false),
        ("ab", 1, false),
        ("a ", 2, false),
        (" a ", 0, false),
        (" a ", 3, false),
        // Offsets involving a non-ASCII codepoint. The first one does match:
        // it sits directly in front of the ASCII word byte `b`.
        ("𝛃b", 4, true),
        ("b𝛃", 1, false),
        ("𝛃 ", 5, false),
        (" 𝛃 ", 0, false),
        (" 𝛃 ", 6, false),
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        ("𝛃𐆀", 8, false),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 2039 | |
/// Table-driven check of `Look::WordEndAscii` against ASCII and non-ASCII
/// haystacks.
#[test]
fn look_matches_word_end_ascii() {
    let look = Look::WordEndAscii;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries. Only the offsets directly after `a`
        // match.
        ("a", 0, false),
        ("a", 1, true),
        ("a ", 1, true),
        (" a ", 1, false),
        (" a ", 2, true),
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
        // an ASCII word boundary, none of these match.
        ("𝛃", 0, false),
        ("𝛃", 4, false),
        ("𝛃 ", 4, false),
        (" 𝛃 ", 1, false),
        (" 𝛃 ", 5, false),
        // Unicode word boundaries between non-ASCII codepoints. Again, since
        // this is an ASCII word boundary, none of these match.
        ("𝛃𐆀", 0, false),
        ("𝛃𐆀", 4, false),
        // Non word boundaries for ASCII.
        ("", 0, false),
        ("ab", 1, false),
        ("a ", 2, false),
        (" a ", 0, false),
        (" a ", 3, false),
        // Offsets involving a non-ASCII codepoint. The second one does
        // match: it sits directly after the ASCII word byte `b`.
        ("𝛃b", 4, false),
        ("b𝛃", 1, true),
        ("𝛃 ", 5, false),
        (" 𝛃 ", 0, false),
        (" 𝛃 ", 6, false),
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        ("𝛃𐆀", 8, false),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 2093 | |
/// Table-driven check of `Look::WordStartUnicode` against ASCII and
/// non-ASCII haystacks.
#[test]
#[cfg(all(not(miri), feature = "unicode-word-boundary"))]
fn look_matches_word_start_unicode() {
    let look = Look::WordStartUnicode;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries. Only the offsets directly in front
        // of `a` match.
        ("a", 0, true),
        ("a", 1, false),
        ("a ", 1, false),
        (" a ", 1, true),
        (" a ", 2, false),
        // Unicode word boundaries with a non-ASCII codepoint. Only the
        // offsets directly in front of 𝛃 match.
        ("𝛃", 0, true),
        ("𝛃", 4, false),
        ("𝛃 ", 4, false),
        (" 𝛃 ", 1, true),
        (" 𝛃 ", 5, false),
        // Unicode word boundaries between non-ASCII codepoints.
        ("𝛃𐆀", 0, true),
        ("𝛃𐆀", 4, false),
        // Non word boundaries for ASCII.
        ("", 0, false),
        ("ab", 1, false),
        ("a ", 2, false),
        (" a ", 0, false),
        (" a ", 3, false),
        // Non word boundaries with a non-ASCII codepoint.
        ("𝛃b", 4, false),
        ("b𝛃", 1, false),
        ("𝛃 ", 5, false),
        (" 𝛃 ", 0, false),
        (" 𝛃 ", 6, false),
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        ("𝛃𐆀", 8, false),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 2146 | |
/// Table-driven check of `Look::WordEndUnicode` against ASCII and non-ASCII
/// haystacks.
#[test]
#[cfg(all(not(miri), feature = "unicode-word-boundary"))]
fn look_matches_word_end_unicode() {
    let look = Look::WordEndUnicode;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries. Only the offsets directly after `a`
        // match.
        ("a", 0, false),
        ("a", 1, true),
        ("a ", 1, true),
        (" a ", 1, false),
        (" a ", 2, true),
        // Unicode word boundaries with a non-ASCII codepoint. Only the
        // offsets directly after 𝛃 match.
        ("𝛃", 0, false),
        ("𝛃", 4, true),
        ("𝛃 ", 4, true),
        (" 𝛃 ", 1, false),
        (" 𝛃 ", 5, true),
        // Unicode word boundaries between non-ASCII codepoints.
        ("𝛃𐆀", 0, false),
        ("𝛃𐆀", 4, true),
        // Non word boundaries for ASCII.
        ("", 0, false),
        ("ab", 1, false),
        ("a ", 2, false),
        (" a ", 0, false),
        (" a ", 3, false),
        // Non word boundaries with a non-ASCII codepoint.
        ("𝛃b", 4, false),
        ("b𝛃", 1, false),
        ("𝛃 ", 5, false),
        (" 𝛃 ", 0, false),
        (" 𝛃 ", 6, false),
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        ("𝛃𐆀", 8, false),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 2199 | |
/// Table-driven check of `Look::WordStartHalfAscii` against ASCII and
/// non-ASCII haystacks.
///
/// Judging from the cases below, this assertion only fails at offsets that
/// are immediately preceded by an ASCII word byte.
#[test]
fn look_matches_word_start_half_ascii() {
    let look = Look::WordStartHalfAscii;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is (haystack, byte offset, whether `look` should match).
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries. The offsets directly after `a` fail.
        ("a", 0, true),
        ("a", 1, false),
        ("a ", 1, false),
        (" a ", 1, true),
        (" a ", 2, false),
        // Unicode word boundaries with a non-ASCII codepoint. Since this is
        // an ASCII assertion and no ASCII word byte precedes any of these
        // offsets, all of them match.
        ("𝛃", 0, true),
        ("𝛃", 4, true),
        ("𝛃 ", 4, true),
        (" 𝛃 ", 1, true),
        (" 𝛃 ", 5, true),
        // Unicode word boundaries between non-ASCII codepoints. Again, no
        // ASCII word byte precedes these offsets, so both match.
        ("𝛃𐆀", 0, true),
        ("𝛃𐆀", 4, true),
        // Non word boundaries for ASCII. Only the offset between `a` and `b`
        // fails.
        ("", 0, true),
        ("ab", 1, false),
        ("a ", 2, true),
        (" a ", 0, true),
        (" a ", 3, true),
        // Offsets involving a non-ASCII codepoint. Only the one directly
        // after the ASCII word byte `b` fails.
        ("𝛃b", 4, true),
        ("b𝛃", 1, false),
        ("𝛃 ", 5, true),
        (" 𝛃 ", 0, true),
        (" 𝛃 ", 6, true),
        ("𝛃", 1, true),
        ("𝛃", 2, true),
        ("𝛃", 3, true),
        // Offsets within and around non-ASCII codepoints. All match.
        ("𝛃𐆀", 1, true),
        ("𝛃𐆀", 2, true),
        ("𝛃𐆀", 3, true),
        ("𝛃𐆀", 5, true),
        ("𝛃𐆀", 6, true),
        ("𝛃𐆀", 7, true),
        ("𝛃𐆀", 8, true),
    ];
    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "haystack: {:?}, offset: {}",
            haystack,
            at,
        );
    }
}
| 2253 | |
#[test]
fn look_matches_word_end_half_ascii() {
    let look = Look::WordEndHalfAscii;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is `(haystack, byte offset, expected result)`. The
    // expectations are consistent with the assertion matching whenever the
    // offset is NOT immediately followed by an ASCII word byte.
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries.
        ("a", 0, false),
        ("a", 1, true),
        ("a ", 1, true),
        (" a ", 1, false),
        (" a ", 2, true),
        // Unicode word boundaries with a non-ASCII codepoint. The bytes of a
        // non-ASCII codepoint are not treated as word bytes here, so every
        // one of these positions is expected to match.
        ("𝛃", 0, true),
        ("𝛃", 4, true),
        ("𝛃 ", 4, true),
        (" 𝛃 ", 1, true),
        (" 𝛃 ", 5, true),
        // Unicode word boundaries between non-ASCII codepoints. Again, all
        // of these positions are expected to match.
        ("𝛃𐆀", 0, true),
        ("𝛃𐆀", 4, true),
        // Non word boundaries for ASCII.
        ("", 0, true),
        ("ab", 1, false),
        ("a ", 2, true),
        (" a ", 0, true),
        (" a ", 3, true),
        // Non word boundaries with a non-ASCII codepoint.
        ("𝛃b", 4, false),
        ("b𝛃", 1, true),
        ("𝛃 ", 5, true),
        (" 𝛃 ", 0, true),
        (" 𝛃 ", 6, true),
        ("𝛃", 1, true),
        ("𝛃", 2, true),
        ("𝛃", 3, true),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, true),
        ("𝛃𐆀", 2, true),
        ("𝛃𐆀", 3, true),
        ("𝛃𐆀", 5, true),
        ("𝛃𐆀", 6, true),
        ("𝛃𐆀", 7, true),
        ("𝛃𐆀", 8, true),
    ];

    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "unexpected result for haystack {:?} at offset {}",
            haystack,
            at,
        );
    }
}
| 2307 | |
#[test]
#[cfg(all(not(miri), feature = "unicode-word-boundary"))]
fn look_matches_word_start_half_unicode() {
    let look = Look::WordStartHalfUnicode;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is `(haystack, byte offset, expected result)`. The
    // expectations are consistent with the assertion matching whenever the
    // offset is NOT immediately preceded by a Unicode word codepoint, and
    // never matching at an offset that falls inside a multi-byte codepoint.
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries.
        ("a", 0, true),
        ("a", 1, false),
        ("a ", 1, false),
        (" a ", 1, true),
        (" a ", 2, false),
        // Unicode word boundaries with a non-ASCII codepoint.
        ("𝛃", 0, true),
        ("𝛃", 4, false),
        ("𝛃 ", 4, false),
        (" 𝛃 ", 1, true),
        (" 𝛃 ", 5, false),
        // Unicode word boundaries between non-ASCII codepoints.
        ("𝛃𐆀", 0, true),
        ("𝛃𐆀", 4, false),
        // Non word boundaries for ASCII.
        ("", 0, true),
        ("ab", 1, false),
        ("a ", 2, true),
        (" a ", 0, true),
        (" a ", 3, true),
        // Non word boundaries with a non-ASCII codepoint.
        ("𝛃b", 4, false),
        ("b𝛃", 1, false),
        ("𝛃 ", 5, true),
        (" 𝛃 ", 0, true),
        (" 𝛃 ", 6, true),
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        ("𝛃𐆀", 8, true),
    ];

    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "unexpected result for haystack {:?} at offset {}",
            haystack,
            at,
        );
    }
}
| 2360 | |
#[test]
#[cfg(all(not(miri), feature = "unicode-word-boundary"))]
fn look_matches_word_end_half_unicode() {
    let look = Look::WordEndHalfUnicode;

    // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
    // \xF0\x90\x86\x80 = 𐆀 (not in \w)

    // Each entry is `(haystack, byte offset, expected result)`. The
    // expectations are consistent with the assertion matching whenever the
    // offset is NOT immediately followed by a Unicode word codepoint, and
    // never matching at an offset that falls inside a multi-byte codepoint.
    let cases: &[(&str, usize, bool)] = &[
        // Simple ASCII word boundaries.
        ("a", 0, false),
        ("a", 1, true),
        ("a ", 1, true),
        (" a ", 1, false),
        (" a ", 2, true),
        // Unicode word boundaries with a non-ASCII codepoint.
        ("𝛃", 0, false),
        ("𝛃", 4, true),
        ("𝛃 ", 4, true),
        (" 𝛃 ", 1, false),
        (" 𝛃 ", 5, true),
        // Unicode word boundaries between non-ASCII codepoints.
        ("𝛃𐆀", 0, false),
        ("𝛃𐆀", 4, true),
        // Non word boundaries for ASCII.
        ("", 0, true),
        ("ab", 1, false),
        ("a ", 2, true),
        (" a ", 0, true),
        (" a ", 3, true),
        // Non word boundaries with a non-ASCII codepoint.
        ("𝛃b", 4, false),
        ("b𝛃", 1, false),
        ("𝛃 ", 5, true),
        (" 𝛃 ", 0, true),
        (" 𝛃 ", 6, true),
        ("𝛃", 1, false),
        ("𝛃", 2, false),
        ("𝛃", 3, false),
        // Non word boundaries with non-ASCII codepoints.
        ("𝛃𐆀", 1, false),
        ("𝛃𐆀", 2, false),
        ("𝛃𐆀", 3, false),
        ("𝛃𐆀", 5, false),
        ("𝛃𐆀", 6, false),
        ("𝛃𐆀", 7, false),
        ("𝛃𐆀", 8, true),
    ];

    for &(haystack, at, expected) in cases {
        assert_eq!(
            expected,
            testlook!(look, haystack, at),
            "unexpected result for haystack {:?} at offset {}",
            haystack,
            at,
        );
    }
}
| 2413 | |
#[test]
fn look_set() {
    let mut set = LookSet::default();

    // A default set must not report any of these assertions as present.
    let initially_absent = [
        Look::Start,
        Look::End,
        Look::StartLF,
        Look::EndLF,
        Look::WordUnicode,
        Look::WordUnicodeNegate,
        Look::WordAscii,
        Look::WordAsciiNegate,
    ];
    for &look in &initially_absent {
        assert!(!set.contains(look));
    }

    // Round trip every assertion through the set, one at a time: after an
    // insert the assertion must be reported as present, and after the
    // matching remove it must be reported as absent again.
    let round_trip = [
        Look::Start,
        Look::End,
        Look::StartLF,
        Look::EndLF,
        Look::StartCRLF,
        Look::EndCRLF,
        Look::WordUnicode,
        Look::WordUnicodeNegate,
        Look::WordAscii,
        Look::WordAsciiNegate,
        Look::WordStartAscii,
        Look::WordEndAscii,
        Look::WordStartUnicode,
        Look::WordEndUnicode,
        Look::WordStartHalfAscii,
        Look::WordEndHalfAscii,
        Look::WordStartHalfUnicode,
        Look::WordEndHalfUnicode,
    ];
    for &look in &round_trip {
        set = set.insert(look);
        assert!(set.contains(look));
        set = set.remove(look);
        assert!(!set.contains(look));
    }
}
| 2516 | |
#[test]
fn look_set_iter() {
    // The empty set yields nothing, and the full set yields 18 assertions.
    assert_eq!(0, LookSet::empty().iter().count());
    assert_eq!(18, LookSet::full().iter().count());

    // A set built from two distinct assertions yields exactly two items.
    let pair = LookSet::empty()
        .insert(Look::StartLF)
        .insert(Look::WordUnicode);
    assert_eq!(2, pair.iter().count());

    // Singleton sets yield exactly one item.
    assert_eq!(1, LookSet::empty().insert(Look::StartLF).iter().count());
    assert_eq!(
        1,
        LookSet::empty().insert(Look::WordAsciiNegate).iter().count(),
    );
    assert_eq!(
        1,
        LookSet::empty().insert(Look::WordEndHalfUnicode).iter().count(),
    );
}
| 2538 | |
#[test]
#[cfg(feature = "alloc")]
fn look_set_debug() {
    // The empty set has a dedicated Debug rendering.
    assert_eq!("∅", alloc::format!("{:?}", LookSet::empty()));

    // The full set's Debug rendering must equal this exact string.
    assert_eq!(
        "Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶",
        alloc::format!("{:?}", LookSet::full()),
    );
}
| 2547 | } |
| 2548 | |