1/*!
2Types and routines for working with look-around assertions.
3
4This module principally defines two types:
5
6* [`Look`] enumerates all of the assertions supported by this crate.
7* [`LookSet`] provides a way to efficiently store a set of [`Look`] values.
8* [`LookMatcher`] provides routines for checking whether a `Look` or a
9`LookSet` matches at a particular position in a haystack.
10*/
11
12// LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` were basically
13// copied verbatim from the regex-syntax crate. I would have no problems using
14// the regex-syntax types and defining the matching routines (only found
15// in this crate) as free functions, except the `Look` and `LookSet` types
16// are used in lots of places. Including in places we expect to work when
17// regex-syntax is *not* enabled, such as in the definition of the NFA itself.
18//
19// Thankfully the code we copy is pretty simple and there isn't much of it.
20// Otherwise, the rest of this module deals with *matching* the assertions,
21// which is not something that regex-syntax handles.
22
23use crate::util::{escape::DebugByte, utf8};
24
25/// A look-around assertion.
26///
27/// An assertion matches at a position between characters in a haystack.
28/// Namely, it does not actually "consume" any input as most parts of a regular
29/// expression do. Assertions are a way of stating that some property must be
30/// true at a particular point during matching.
31///
32/// For example, `(?m)^[a-z]+$` is a pattern that:
33///
34/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That
35/// occurs at either the beginning of the haystack, or immediately following
36/// a `\n` character.
37/// * Looks for one or more occurrences of `[a-z]`.
38/// * Once `[a-z]+` has matched as much as it can, an overall match is only
39/// reported when `[a-z]+` stops just before a `\n`.
40///
41/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.
42///
43/// Assertions are also called "look-around," "look-behind" and "look-ahead."
44/// Specifically, some assertions are look-behind (like `^`), other assertions
45/// are look-ahead (like `$`) and yet other assertions are both look-ahead and
46/// look-behind (like `\b`).
47///
48/// # Assertions in an NFA
49///
50/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be
51/// thought of as a conditional epsilon transition. That is, a matching engine
52/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits
53/// moving through conditional epsilon transitions when their condition
54/// is satisfied at whatever position the `PikeVM` is currently at in the
55/// haystack.
56///
57/// How assertions are handled in a `DFA` is trickier, since a DFA does not
58/// have epsilon transitions at all. In this case, they are compiled into the
59/// automaton itself, at the expense of more states than what would be required
60/// without an assertion.
61#[derive(Clone, Copy, Debug, Eq, PartialEq)]
62pub enum Look {
63 /// Match the beginning of text. Specifically, this matches at the starting
64 /// position of the input.
65 Start = 1 << 0,
66 /// Match the end of text. Specifically, this matches at the ending
67 /// position of the input.
68 End = 1 << 1,
69 /// Match the beginning of a line or the beginning of text. Specifically,
70 /// this matches at the starting position of the input, or at the position
71 /// immediately following a `\n` character.
72 StartLF = 1 << 2,
73 /// Match the end of a line or the end of text. Specifically, this matches
74 /// at the end position of the input, or at the position immediately
75 /// preceding a `\n` character.
76 EndLF = 1 << 3,
77 /// Match the beginning of a line or the beginning of text. Specifically,
78 /// this matches at the starting position of the input, or at the position
79 /// immediately following either a `\r` or `\n` character, but never after
80 /// a `\r` when a `\n` follows.
81 StartCRLF = 1 << 4,
82 /// Match the end of a line or the end of text. Specifically, this matches
83 /// at the end position of the input, or at the position immediately
84 /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
85 /// precedes it.
86 EndCRLF = 1 << 5,
87 /// Match an ASCII-only word boundary. That is, this matches a position
88 /// where the left adjacent character and right adjacent character
89 /// correspond to a word and non-word or a non-word and word character.
90 WordAscii = 1 << 6,
91 /// Match an ASCII-only negation of a word boundary.
92 WordAsciiNegate = 1 << 7,
93 /// Match a Unicode-aware word boundary. That is, this matches a position
94 /// where the left adjacent character and right adjacent character
95 /// correspond to a word and non-word or a non-word and word character.
96 WordUnicode = 1 << 8,
97 /// Match a Unicode-aware negation of a word boundary.
98 WordUnicodeNegate = 1 << 9,
99 /// Match the start of an ASCII-only word boundary. That is, this matches a
100 /// position at either the beginning of the haystack or where the previous
101 /// character is not a word character and the following character is a word
102 /// character.
103 WordStartAscii = 1 << 10,
104 /// Match the end of an ASCII-only word boundary. That is, this matches
105 /// a position at either the end of the haystack or where the previous
106 /// character is a word character and the following character is not a word
107 /// character.
108 WordEndAscii = 1 << 11,
109 /// Match the start of a Unicode word boundary. That is, this matches a
110 /// position at either the beginning of the haystack or where the previous
111 /// character is not a word character and the following character is a word
112 /// character.
113 WordStartUnicode = 1 << 12,
114 /// Match the end of a Unicode word boundary. That is, this matches a
115 /// position at either the end of the haystack or where the previous
116 /// character is a word character and the following character is not a word
117 /// character.
118 WordEndUnicode = 1 << 13,
119 /// Match the start half of an ASCII-only word boundary. That is, this
120 /// matches a position at either the beginning of the haystack or where the
121 /// previous character is not a word character.
122 WordStartHalfAscii = 1 << 14,
123 /// Match the end half of an ASCII-only word boundary. That is, this
124 /// matches a position at either the end of the haystack or where the
125 /// following character is not a word character.
126 WordEndHalfAscii = 1 << 15,
127 /// Match the start half of a Unicode word boundary. That is, this matches
128 /// a position at either the beginning of the haystack or where the
129 /// previous character is not a word character.
130 WordStartHalfUnicode = 1 << 16,
131 /// Match the end half of a Unicode word boundary. That is, this matches
132 /// a position at either the end of the haystack or where the following
133 /// character is not a word character.
134 WordEndHalfUnicode = 1 << 17,
135}
136
137impl Look {
138 /// Flip the look-around assertion to its equivalent for reverse searches.
139 /// For example, `StartLF` gets translated to `EndLF`.
140 ///
141 /// Some assertions, such as `WordUnicode`, remain the same since they
142 /// match the same positions regardless of the direction of the search.
143 #[inline]
144 pub const fn reversed(self) -> Look {
145 match self {
146 Look::Start => Look::End,
147 Look::End => Look::Start,
148 Look::StartLF => Look::EndLF,
149 Look::EndLF => Look::StartLF,
150 Look::StartCRLF => Look::EndCRLF,
151 Look::EndCRLF => Look::StartCRLF,
152 Look::WordAscii => Look::WordAscii,
153 Look::WordAsciiNegate => Look::WordAsciiNegate,
154 Look::WordUnicode => Look::WordUnicode,
155 Look::WordUnicodeNegate => Look::WordUnicodeNegate,
156 Look::WordStartAscii => Look::WordEndAscii,
157 Look::WordEndAscii => Look::WordStartAscii,
158 Look::WordStartUnicode => Look::WordEndUnicode,
159 Look::WordEndUnicode => Look::WordStartUnicode,
160 Look::WordStartHalfAscii => Look::WordEndHalfAscii,
161 Look::WordEndHalfAscii => Look::WordStartHalfAscii,
162 Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
163 Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
164 }
165 }
166
167 /// Return the underlying representation of this look-around enumeration
168 /// as an integer. Giving the return value to the [`Look::from_repr`]
169 /// constructor is guaranteed to return the same look-around variant that
170 /// one started with within a semver compatible release of this crate.
171 #[inline]
172 pub const fn as_repr(self) -> u32 {
173 // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
174 // actual int.
175 self as u32
176 }
177
178 /// Given the underlying representation of a `Look` value, return the
179 /// corresponding `Look` value if the representation is valid. Otherwise
180 /// `None` is returned.
181 #[inline]
182 pub const fn from_repr(repr: u32) -> Option<Look> {
183 match repr {
184 0b00_0000_0000_0000_0001 => Some(Look::Start),
185 0b00_0000_0000_0000_0010 => Some(Look::End),
186 0b00_0000_0000_0000_0100 => Some(Look::StartLF),
187 0b00_0000_0000_0000_1000 => Some(Look::EndLF),
188 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
189 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
190 0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
191 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
192 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
193 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
194 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
195 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
196 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
197 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
198 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
199 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
200 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
201 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
202 _ => None,
203 }
204 }
205
206 /// Returns a convenient single codepoint representation of this
207 /// look-around assertion. Each assertion is guaranteed to be represented
208 /// by a distinct character.
209 ///
210 /// This is useful for succinctly representing a look-around assertion in
211 /// human friendly but succinct output intended for a programmer working on
212 /// regex internals.
213 #[inline]
214 pub const fn as_char(self) -> char {
215 match self {
216 Look::Start => 'A',
217 Look::End => 'z',
218 Look::StartLF => '^',
219 Look::EndLF => '$',
220 Look::StartCRLF => 'r',
221 Look::EndCRLF => 'R',
222 Look::WordAscii => 'b',
223 Look::WordAsciiNegate => 'B',
224 Look::WordUnicode => '𝛃',
225 Look::WordUnicodeNegate => '𝚩',
226 Look::WordStartAscii => '<',
227 Look::WordEndAscii => '>',
228 Look::WordStartUnicode => '〈',
229 Look::WordEndUnicode => '〉',
230 Look::WordStartHalfAscii => '◁',
231 Look::WordEndHalfAscii => '▷',
232 Look::WordStartHalfUnicode => '◀',
233 Look::WordEndHalfUnicode => '▶',
234 }
235 }
236}
237
238/// LookSet is a memory-efficient set of look-around assertions.
239///
240/// This is useful for efficiently tracking look-around assertions. For
241/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties
242/// that return `LookSet`s.
243#[derive(Clone, Copy, Default, Eq, PartialEq)]
244pub struct LookSet {
245 /// The underlying representation this set is exposed to make it possible
246 /// to store it somewhere efficiently. The representation is that
247 /// of a bitset, where each assertion occupies bit `i` where
248 /// `i = Look::as_repr()`.
249 ///
250 /// Note that users of this internal representation must permit the full
251 /// range of `u16` values to be represented. For example, even if the
252 /// current implementation only makes use of the 10 least significant bits,
253 /// it may use more bits in a future semver compatible release.
254 pub bits: u32,
255}
256
257impl LookSet {
258 /// Create an empty set of look-around assertions.
259 #[inline]
260 pub fn empty() -> LookSet {
261 LookSet { bits: 0 }
262 }
263
264 /// Create a full set of look-around assertions.
265 ///
266 /// This set contains all possible look-around assertions.
267 #[inline]
268 pub fn full() -> LookSet {
269 LookSet { bits: !0 }
270 }
271
272 /// Create a look-around set containing the look-around assertion given.
273 ///
274 /// This is a convenience routine for creating an empty set and inserting
275 /// one look-around assertions.
276 #[inline]
277 pub fn singleton(look: Look) -> LookSet {
278 LookSet::empty().insert(look)
279 }
280
281 /// Returns the total number of look-around assertions in this set.
282 #[inline]
283 pub fn len(self) -> usize {
284 // OK because max value always fits in a u8, which in turn always
285 // fits in a usize, regardless of target.
286 usize::try_from(self.bits.count_ones()).unwrap()
287 }
288
289 /// Returns true if and only if this set is empty.
290 #[inline]
291 pub fn is_empty(self) -> bool {
292 self.len() == 0
293 }
294
295 /// Returns true if and only if the given look-around assertion is in this
296 /// set.
297 #[inline]
298 pub fn contains(self, look: Look) -> bool {
299 self.bits & look.as_repr() != 0
300 }
301
302 /// Returns true if and only if this set contains any anchor assertions.
303 /// This includes both "start/end of haystack" and "start/end of line."
304 #[inline]
305 pub fn contains_anchor(&self) -> bool {
306 self.contains_anchor_haystack() || self.contains_anchor_line()
307 }
308
309 /// Returns true if and only if this set contains any "start/end of
310 /// haystack" anchors. This doesn't include "start/end of line" anchors.
311 #[inline]
312 pub fn contains_anchor_haystack(&self) -> bool {
313 self.contains(Look::Start) || self.contains(Look::End)
314 }
315
316 /// Returns true if and only if this set contains any "start/end of line"
317 /// anchors. This doesn't include "start/end of haystack" anchors. This
318 /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
319 #[inline]
320 pub fn contains_anchor_line(&self) -> bool {
321 self.contains(Look::StartLF)
322 || self.contains(Look::EndLF)
323 || self.contains(Look::StartCRLF)
324 || self.contains(Look::EndCRLF)
325 }
326
327 /// Returns true if and only if this set contains any "start/end of line"
328 /// anchors that only treat `\n` as line terminators. This does not include
329 /// haystack anchors or CRLF aware line anchors.
330 #[inline]
331 pub fn contains_anchor_lf(&self) -> bool {
332 self.contains(Look::StartLF) || self.contains(Look::EndLF)
333 }
334
335 /// Returns true if and only if this set contains any "start/end of line"
336 /// anchors that are CRLF-aware. This doesn't include "start/end of
337 /// haystack" or "start/end of line-feed" anchors.
338 #[inline]
339 pub fn contains_anchor_crlf(&self) -> bool {
340 self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
341 }
342
343 /// Returns true if and only if this set contains any word boundary or
344 /// negated word boundary assertions. This include both Unicode and ASCII
345 /// word boundaries.
346 #[inline]
347 pub fn contains_word(self) -> bool {
348 self.contains_word_unicode() || self.contains_word_ascii()
349 }
350
351 /// Returns true if and only if this set contains any Unicode word boundary
352 /// or negated Unicode word boundary assertions.
353 #[inline]
354 pub fn contains_word_unicode(self) -> bool {
355 self.contains(Look::WordUnicode)
356 || self.contains(Look::WordUnicodeNegate)
357 || self.contains(Look::WordStartUnicode)
358 || self.contains(Look::WordEndUnicode)
359 || self.contains(Look::WordStartHalfUnicode)
360 || self.contains(Look::WordEndHalfUnicode)
361 }
362
363 /// Returns true if and only if this set contains any ASCII word boundary
364 /// or negated ASCII word boundary assertions.
365 #[inline]
366 pub fn contains_word_ascii(self) -> bool {
367 self.contains(Look::WordAscii)
368 || self.contains(Look::WordAsciiNegate)
369 || self.contains(Look::WordStartAscii)
370 || self.contains(Look::WordEndAscii)
371 || self.contains(Look::WordStartHalfAscii)
372 || self.contains(Look::WordEndHalfAscii)
373 }
374
375 /// Returns an iterator over all of the look-around assertions in this set.
376 #[inline]
377 pub fn iter(self) -> LookSetIter {
378 LookSetIter { set: self }
379 }
380
381 /// Return a new set that is equivalent to the original, but with the given
382 /// assertion added to it. If the assertion is already in the set, then the
383 /// returned set is equivalent to the original.
384 #[inline]
385 pub fn insert(self, look: Look) -> LookSet {
386 LookSet { bits: self.bits | look.as_repr() }
387 }
388
389 /// Updates this set in place with the result of inserting the given
390 /// assertion into this set.
391 #[inline]
392 pub fn set_insert(&mut self, look: Look) {
393 *self = self.insert(look);
394 }
395
396 /// Return a new set that is equivalent to the original, but with the given
397 /// assertion removed from it. If the assertion is not in the set, then the
398 /// returned set is equivalent to the original.
399 #[inline]
400 pub fn remove(self, look: Look) -> LookSet {
401 LookSet { bits: self.bits & !look.as_repr() }
402 }
403
404 /// Updates this set in place with the result of removing the given
405 /// assertion from this set.
406 #[inline]
407 pub fn set_remove(&mut self, look: Look) {
408 *self = self.remove(look);
409 }
410
411 /// Returns a new set that is the result of subtracting the given set from
412 /// this set.
413 #[inline]
414 pub fn subtract(self, other: LookSet) -> LookSet {
415 LookSet { bits: self.bits & !other.bits }
416 }
417
418 /// Updates this set in place with the result of subtracting the given set
419 /// from this set.
420 #[inline]
421 pub fn set_subtract(&mut self, other: LookSet) {
422 *self = self.subtract(other);
423 }
424
425 /// Returns a new set that is the union of this and the one given.
426 #[inline]
427 pub fn union(self, other: LookSet) -> LookSet {
428 LookSet { bits: self.bits | other.bits }
429 }
430
431 /// Updates this set in place with the result of unioning it with the one
432 /// given.
433 #[inline]
434 pub fn set_union(&mut self, other: LookSet) {
435 *self = self.union(other);
436 }
437
438 /// Returns a new set that is the intersection of this and the one given.
439 #[inline]
440 pub fn intersect(self, other: LookSet) -> LookSet {
441 LookSet { bits: self.bits & other.bits }
442 }
443
444 /// Updates this set in place with the result of intersecting it with the
445 /// one given.
446 #[inline]
447 pub fn set_intersect(&mut self, other: LookSet) {
448 *self = self.intersect(other);
449 }
450
451 /// Return a `LookSet` from the slice given as a native endian 32-bit
452 /// integer.
453 ///
454 /// # Panics
455 ///
456 /// This panics if `slice.len() < 4`.
457 #[inline]
458 pub fn read_repr(slice: &[u8]) -> LookSet {
459 let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
460 LookSet { bits }
461 }
462
463 /// Write a `LookSet` as a native endian 32-bit integer to the beginning
464 /// of the slice given.
465 ///
466 /// # Panics
467 ///
468 /// This panics if `slice.len() < 4`.
469 #[inline]
470 pub fn write_repr(self, slice: &mut [u8]) {
471 let raw = self.bits.to_ne_bytes();
472 slice[0] = raw[0];
473 slice[1] = raw[1];
474 slice[2] = raw[2];
475 slice[3] = raw[3];
476 }
477
478 /// Checks that all assertions in this set can be matched.
479 ///
480 /// Some assertions, such as Unicode word boundaries, require optional (but
481 /// enabled by default) tables that may not be available. If there are
482 /// assertions in this set that require tables that are not available, then
483 /// this will return an error.
484 ///
485 /// Specifically, this returns an error when the the
486 /// `unicode-word-boundary` feature is _not_ enabled _and_ this set
487 /// contains a Unicode word boundary assertion.
488 ///
489 /// It can be useful to use this on the result of
490 /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)
491 /// when building a matcher engine to ensure methods like
492 /// [`LookMatcher::matches_set`] do not panic at search time.
493 pub fn available(self) -> Result<(), UnicodeWordBoundaryError> {
494 if self.contains_word_unicode() {
495 UnicodeWordBoundaryError::check()?;
496 }
497 Ok(())
498 }
499}
500
501impl core::fmt::Debug for LookSet {
502 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
503 if self.is_empty() {
504 return write!(f, "∅");
505 }
506 for look in self.iter() {
507 write!(f, "{}", look.as_char())?;
508 }
509 Ok(())
510 }
511}
512
513/// An iterator over all look-around assertions in a [`LookSet`].
514///
515/// This iterator is created by [`LookSet::iter`].
516#[derive(Clone, Debug)]
517pub struct LookSetIter {
518 set: LookSet,
519}
520
521impl Iterator for LookSetIter {
522 type Item = Look;
523
524 #[inline]
525 fn next(&mut self) -> Option<Look> {
526 if self.set.is_empty() {
527 return None;
528 }
529 // We'll never have more than u8::MAX distinct look-around assertions,
530 // so 'bit' will always fit into a u16.
531 let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
532 let look = Look::from_repr(1 << bit)?;
533 self.set = self.set.remove(look);
534 Some(look)
535 }
536}
537
538/// A matcher for look-around assertions.
539///
540/// This matcher permits configuring aspects of how look-around assertions are
541/// matched.
542///
543/// # Example
544///
545/// A `LookMatcher` can change the line terminator used for matching multi-line
546/// anchors such as `(?m:^)` and `(?m:$)`.
547///
548/// ```
549/// use regex_automata::{
550/// nfa::thompson::{self, pikevm::PikeVM},
551/// util::look::LookMatcher,
552/// Match, Input,
553/// };
554///
555/// let mut lookm = LookMatcher::new();
556/// lookm.set_line_terminator(b'\x00');
557///
558/// let re = PikeVM::builder()
559/// .thompson(thompson::Config::new().look_matcher(lookm))
560/// .build(r"(?m)^[a-z]+$")?;
561/// let mut cache = re.create_cache();
562///
563/// // Multi-line assertions now use NUL as a terminator.
564/// assert_eq!(
565/// Some(Match::must(0, 1..4)),
566/// re.find(&mut cache, b"\x00abc\x00"),
567/// );
568/// // ... and \n is no longer recognized as a terminator.
569/// assert_eq!(
570/// None,
571/// re.find(&mut cache, b"\nabc\n"),
572/// );
573///
574/// # Ok::<(), Box<dyn std::error::Error>>(())
575/// ```
576#[derive(Clone, Debug)]
577pub struct LookMatcher {
578 lineterm: DebugByte,
579}
580
581impl LookMatcher {
582 /// Creates a new default matcher for look-around assertions.
583 pub fn new() -> LookMatcher {
584 LookMatcher { lineterm: DebugByte(b'\n') }
585 }
586
587 /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.
588 ///
589 /// Namely, instead of `^` matching after `\n` and `$` matching immediately
590 /// before a `\n`, this will cause it to match after and before the byte
591 /// given.
592 ///
593 /// It can occasionally be useful to use this to configure the line
594 /// terminator to the NUL byte when searching binary data.
595 ///
596 /// Note that this does not apply to CRLF-aware line anchors such as
597 /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to
598 /// use `\r` and `\n`.
599 pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher {
600 self.lineterm.0 = byte;
601 self
602 }
603
604 /// Returns the line terminator that was configured for this matcher.
605 ///
606 /// If no line terminator was configured, then this returns `\n`.
607 ///
608 /// Note that the line terminator should only be used for matching `(?m:^)`
609 /// and `(?m:$)` assertions. It specifically should _not_ be used for
610 /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`.
611 pub fn get_line_terminator(&self) -> u8 {
612 self.lineterm.0
613 }
614
615 /// Returns true when the position `at` in `haystack` satisfies the given
616 /// look-around assertion.
617 ///
618 /// # Panics
619 ///
620 /// This panics when testing any Unicode word boundary assertion in this
621 /// set and when the Unicode word data is not available. Specifically, this
622 /// only occurs when the `unicode-word-boundary` feature is not enabled.
623 ///
624 /// Since it's generally expected that this routine is called inside of
625 /// a matching engine, callers should check the error condition when
626 /// building the matching engine. If there is a Unicode word boundary
627 /// in the matcher and the data isn't available, then the matcher should
628 /// fail to build.
629 ///
630 /// Callers can check the error condition with [`LookSet::available`].
631 ///
632 /// This also may panic when `at > haystack.len()`. Note that `at ==
633 /// haystack.len()` is legal and guaranteed not to panic.
634 #[inline]
635 pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool {
636 self.matches_inline(look, haystack, at)
637 }
638
639 /// Like `matches`, but forcefully inlined.
640 ///
641 /// # Panics
642 ///
643 /// This panics when testing any Unicode word boundary assertion in this
644 /// set and when the Unicode word data is not available. Specifically, this
645 /// only occurs when the `unicode-word-boundary` feature is not enabled.
646 ///
647 /// Since it's generally expected that this routine is called inside of
648 /// a matching engine, callers should check the error condition when
649 /// building the matching engine. If there is a Unicode word boundary
650 /// in the matcher and the data isn't available, then the matcher should
651 /// fail to build.
652 ///
653 /// Callers can check the error condition with [`LookSet::available`].
654 ///
655 /// This also may panic when `at > haystack.len()`. Note that `at ==
656 /// haystack.len()` is legal and guaranteed not to panic.
657 #[cfg_attr(feature = "perf-inline", inline(always))]
658 pub(crate) fn matches_inline(
659 &self,
660 look: Look,
661 haystack: &[u8],
662 at: usize,
663 ) -> bool {
664 match look {
665 Look::Start => self.is_start(haystack, at),
666 Look::End => self.is_end(haystack, at),
667 Look::StartLF => self.is_start_lf(haystack, at),
668 Look::EndLF => self.is_end_lf(haystack, at),
669 Look::StartCRLF => self.is_start_crlf(haystack, at),
670 Look::EndCRLF => self.is_end_crlf(haystack, at),
671 Look::WordAscii => self.is_word_ascii(haystack, at),
672 Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at),
673 Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(),
674 Look::WordUnicodeNegate => {
675 self.is_word_unicode_negate(haystack, at).unwrap()
676 }
677 Look::WordStartAscii => self.is_word_start_ascii(haystack, at),
678 Look::WordEndAscii => self.is_word_end_ascii(haystack, at),
679 Look::WordStartUnicode => {
680 self.is_word_start_unicode(haystack, at).unwrap()
681 }
682 Look::WordEndUnicode => {
683 self.is_word_end_unicode(haystack, at).unwrap()
684 }
685 Look::WordStartHalfAscii => {
686 self.is_word_start_half_ascii(haystack, at)
687 }
688 Look::WordEndHalfAscii => {
689 self.is_word_end_half_ascii(haystack, at)
690 }
691 Look::WordStartHalfUnicode => {
692 self.is_word_start_half_unicode(haystack, at).unwrap()
693 }
694 Look::WordEndHalfUnicode => {
695 self.is_word_end_half_unicode(haystack, at).unwrap()
696 }
697 }
698 }
699
700 /// Returns true when _all_ of the assertions in the given set match at the
701 /// given position in the haystack.
702 ///
703 /// # Panics
704 ///
705 /// This panics when testing any Unicode word boundary assertion in this
706 /// set and when the Unicode word data is not available. Specifically, this
707 /// only occurs when the `unicode-word-boundary` feature is not enabled.
708 ///
709 /// Since it's generally expected that this routine is called inside of
710 /// a matching engine, callers should check the error condition when
711 /// building the matching engine. If there is a Unicode word boundary
712 /// in the matcher and the data isn't available, then the matcher should
713 /// fail to build.
714 ///
715 /// Callers can check the error condition with [`LookSet::available`].
716 ///
717 /// This also may panic when `at > haystack.len()`. Note that `at ==
718 /// haystack.len()` is legal and guaranteed not to panic.
719 #[inline]
720 pub fn matches_set(
721 &self,
722 set: LookSet,
723 haystack: &[u8],
724 at: usize,
725 ) -> bool {
726 self.matches_set_inline(set, haystack, at)
727 }
728
729 /// Like `LookSet::matches`, but forcefully inlined for perf.
730 #[cfg_attr(feature = "perf-inline", inline(always))]
731 pub(crate) fn matches_set_inline(
732 &self,
733 set: LookSet,
734 haystack: &[u8],
735 at: usize,
736 ) -> bool {
737 // This used to luse LookSet::iter with Look::matches on each element,
738 // but that proved to be quite diastrous for perf. The manual "if
739 // the set has this assertion, check it" turns out to be quite a bit
740 // faster.
741 if set.contains(Look::Start) {
742 if !self.is_start(haystack, at) {
743 return false;
744 }
745 }
746 if set.contains(Look::End) {
747 if !self.is_end(haystack, at) {
748 return false;
749 }
750 }
751 if set.contains(Look::StartLF) {
752 if !self.is_start_lf(haystack, at) {
753 return false;
754 }
755 }
756 if set.contains(Look::EndLF) {
757 if !self.is_end_lf(haystack, at) {
758 return false;
759 }
760 }
761 if set.contains(Look::StartCRLF) {
762 if !self.is_start_crlf(haystack, at) {
763 return false;
764 }
765 }
766 if set.contains(Look::EndCRLF) {
767 if !self.is_end_crlf(haystack, at) {
768 return false;
769 }
770 }
771 if set.contains(Look::WordAscii) {
772 if !self.is_word_ascii(haystack, at) {
773 return false;
774 }
775 }
776 if set.contains(Look::WordAsciiNegate) {
777 if !self.is_word_ascii_negate(haystack, at) {
778 return false;
779 }
780 }
781 if set.contains(Look::WordUnicode) {
782 if !self.is_word_unicode(haystack, at).unwrap() {
783 return false;
784 }
785 }
786 if set.contains(Look::WordUnicodeNegate) {
787 if !self.is_word_unicode_negate(haystack, at).unwrap() {
788 return false;
789 }
790 }
791 if set.contains(Look::WordStartAscii) {
792 if !self.is_word_start_ascii(haystack, at) {
793 return false;
794 }
795 }
796 if set.contains(Look::WordEndAscii) {
797 if !self.is_word_end_ascii(haystack, at) {
798 return false;
799 }
800 }
801 if set.contains(Look::WordStartUnicode) {
802 if !self.is_word_start_unicode(haystack, at).unwrap() {
803 return false;
804 }
805 }
806 if set.contains(Look::WordEndUnicode) {
807 if !self.is_word_end_unicode(haystack, at).unwrap() {
808 return false;
809 }
810 }
811 if set.contains(Look::WordStartHalfAscii) {
812 if !self.is_word_start_half_ascii(haystack, at) {
813 return false;
814 }
815 }
816 if set.contains(Look::WordEndHalfAscii) {
817 if !self.is_word_end_half_ascii(haystack, at) {
818 return false;
819 }
820 }
821 if set.contains(Look::WordStartHalfUnicode) {
822 if !self.is_word_start_half_unicode(haystack, at).unwrap() {
823 return false;
824 }
825 }
826 if set.contains(Look::WordEndHalfUnicode) {
827 if !self.is_word_end_half_unicode(haystack, at).unwrap() {
828 return false;
829 }
830 }
831 true
832 }
833
834 /// Split up the given byte classes into equivalence classes in a way that
835 /// is consistent with this look-around assertion.
836 #[cfg(feature = "alloc")]
837 pub(crate) fn add_to_byteset(
838 &self,
839 look: Look,
840 set: &mut crate::util::alphabet::ByteClassSet,
841 ) {
842 match look {
843 Look::Start | Look::End => {}
844 Look::StartLF | Look::EndLF => {
845 set.set_range(self.lineterm.0, self.lineterm.0);
846 }
847 Look::StartCRLF | Look::EndCRLF => {
848 set.set_range(b'\r', b'\r');
849 set.set_range(b'\n', b'\n');
850 }
851 Look::WordAscii
852 | Look::WordAsciiNegate
853 | Look::WordUnicode
854 | Look::WordUnicodeNegate
855 | Look::WordStartAscii
856 | Look::WordEndAscii
857 | Look::WordStartUnicode
858 | Look::WordEndUnicode
859 | Look::WordStartHalfAscii
860 | Look::WordEndHalfAscii
861 | Look::WordStartHalfUnicode
862 | Look::WordEndHalfUnicode => {
863 // We need to mark all ranges of bytes whose pairs result in
864 // evaluating \b differently. This isn't technically correct
865 // for Unicode word boundaries, but DFAs can't handle those
866 // anyway, and thus, the byte classes don't need to either
867 // since they are themselves only used in DFAs.
868 //
869 // FIXME: It seems like the calls to 'set_range' here are
870 // completely invariant, which means we could just hard-code
871 // them here without needing to write a loop. And we only need
872 // to do this dance at most once per regex.
873 //
874 // FIXME: Is this correct for \B?
875 let iswb = utf8::is_word_byte;
876 // This unwrap is OK because we guard every use of 'asu8' with
877 // a check that the input is <= 255.
878 let asu8 = |b: u16| u8::try_from(b).unwrap();
879 let mut b1: u16 = 0;
880 let mut b2: u16;
881 while b1 <= 255 {
882 b2 = b1 + 1;
883 while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) {
884 b2 += 1;
885 }
886 // The guards above guarantee that b2 can never get any
887 // bigger.
888 assert!(b2 <= 256);
889 // Subtracting 1 from b2 is always OK because it is always
890 // at least 1 greater than b1, and the assert above
891 // guarantees that the asu8 conversion will succeed.
892 set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap()));
893 b1 = b2;
894 }
895 }
896 }
897 }
898
899 /// Returns true when [`Look::Start`] is satisfied `at` the given position
900 /// in `haystack`.
901 ///
902 /// # Panics
903 ///
904 /// This may panic when `at > haystack.len()`. Note that `at ==
905 /// haystack.len()` is legal and guaranteed not to panic.
906 #[inline]
907 pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool {
908 at == 0
909 }
910
911 /// Returns true when [`Look::End`] is satisfied `at` the given position in
912 /// `haystack`.
913 ///
914 /// # Panics
915 ///
916 /// This may panic when `at > haystack.len()`. Note that `at ==
917 /// haystack.len()` is legal and guaranteed not to panic.
918 #[inline]
919 pub fn is_end(&self, haystack: &[u8], at: usize) -> bool {
920 at == haystack.len()
921 }
922
923 /// Returns true when [`Look::StartLF`] is satisfied `at` the given
924 /// position in `haystack`.
925 ///
926 /// # Panics
927 ///
928 /// This may panic when `at > haystack.len()`. Note that `at ==
929 /// haystack.len()` is legal and guaranteed not to panic.
930 #[inline]
931 pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool {
932 self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0
933 }
934
935 /// Returns true when [`Look::EndLF`] is satisfied `at` the given position
936 /// in `haystack`.
937 ///
938 /// # Panics
939 ///
940 /// This may panic when `at > haystack.len()`. Note that `at ==
941 /// haystack.len()` is legal and guaranteed not to panic.
942 #[inline]
943 pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool {
944 self.is_end(haystack, at) || haystack[at] == self.lineterm.0
945 }
946
947 /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given
948 /// position in `haystack`.
949 ///
950 /// # Panics
951 ///
952 /// This may panic when `at > haystack.len()`. Note that `at ==
953 /// haystack.len()` is legal and guaranteed not to panic.
954 #[inline]
955 pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool {
956 self.is_start(haystack, at)
957 || haystack[at - 1] == b'\n'
958 || (haystack[at - 1] == b'\r'
959 && (at >= haystack.len() || haystack[at] != b'\n'))
960 }
961
962 /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given
963 /// position in `haystack`.
964 ///
965 /// # Panics
966 ///
967 /// This may panic when `at > haystack.len()`. Note that `at ==
968 /// haystack.len()` is legal and guaranteed not to panic.
969 #[inline]
970 pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool {
971 self.is_end(haystack, at)
972 || haystack[at] == b'\r'
973 || (haystack[at] == b'\n'
974 && (at == 0 || haystack[at - 1] != b'\r'))
975 }
976
977 /// Returns true when [`Look::WordAscii`] is satisfied `at` the given
978 /// position in `haystack`.
979 ///
980 /// # Panics
981 ///
982 /// This may panic when `at > haystack.len()`. Note that `at ==
983 /// haystack.len()` is legal and guaranteed not to panic.
984 #[inline]
985 pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool {
986 let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
987 let word_after =
988 at < haystack.len() && utf8::is_word_byte(haystack[at]);
989 word_before != word_after
990 }
991
992 /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given
993 /// position in `haystack`.
994 ///
995 /// # Panics
996 ///
997 /// This may panic when `at > haystack.len()`. Note that `at ==
998 /// haystack.len()` is legal and guaranteed not to panic.
999 #[inline]
1000 pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool {
1001 !self.is_word_ascii(haystack, at)
1002 }
1003
1004 /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given
1005 /// position in `haystack`.
1006 ///
1007 /// # Panics
1008 ///
1009 /// This may panic when `at > haystack.len()`. Note that `at ==
1010 /// haystack.len()` is legal and guaranteed not to panic.
1011 ///
1012 /// # Errors
1013 ///
1014 /// This returns an error when Unicode word boundary tables
1015 /// are not available. Specifically, this only occurs when the
1016 /// `unicode-word-boundary` feature is not enabled.
1017 #[inline]
1018 pub fn is_word_unicode(
1019 &self,
1020 haystack: &[u8],
1021 at: usize,
1022 ) -> Result<bool, UnicodeWordBoundaryError> {
1023 let word_before = is_word_char::rev(haystack, at)?;
1024 let word_after = is_word_char::fwd(haystack, at)?;
1025 Ok(word_before != word_after)
1026 }
1027
1028 /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the
1029 /// given position in `haystack`.
1030 ///
1031 /// # Panics
1032 ///
1033 /// This may panic when `at > haystack.len()`. Note that `at ==
1034 /// haystack.len()` is legal and guaranteed not to panic.
1035 ///
1036 /// # Errors
1037 ///
1038 /// This returns an error when Unicode word boundary tables
1039 /// are not available. Specifically, this only occurs when the
1040 /// `unicode-word-boundary` feature is not enabled.
1041 #[inline]
1042 pub fn is_word_unicode_negate(
1043 &self,
1044 haystack: &[u8],
1045 at: usize,
1046 ) -> Result<bool, UnicodeWordBoundaryError> {
1047 // This is pretty subtle. Why do we need to do UTF-8 decoding here?
1048 // Well... at time of writing, the is_word_char_{fwd,rev} routines will
1049 // only return true if there is a valid UTF-8 encoding of a "word"
1050 // codepoint, and false in every other case (including invalid UTF-8).
1051 // This means that in regions of invalid UTF-8 (which might be a
1052 // subset of valid UTF-8!), it would result in \B matching. While this
1053 // would be questionable in the context of truly invalid UTF-8, it is
1054 // *certainly* wrong to report match boundaries that split the encoding
1055 // of a codepoint. So to work around this, we ensure that we can decode
1056 // a codepoint on either side of `at`. If either direction fails, then
1057 // we don't permit \B to match at all.
1058 //
1059 // Now, this isn't exactly optimal from a perf perspective. We could
1060 // try and detect this in is_word_char::{fwd,rev}, but it's not clear
1061 // if it's worth it. \B is, after all, rarely used. Even worse,
1062 // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this
1063 // will wind up doing UTF-8 decoding twice. Owch. We could fix this
1064 // with more code complexity, but it just doesn't feel worth it for \B.
1065 //
1066 // And in particular, we do *not* have to do this with \b, because \b
1067 // *requires* that at least one side of `at` be a "word" codepoint,
1068 // which in turn implies one side of `at` must be valid UTF-8. This in
1069 // turn implies that \b can never split a valid UTF-8 encoding of a
1070 // codepoint. In the case where one side of `at` is truly invalid UTF-8
1071 // and the other side IS a word codepoint, then we want \b to match
1072 // since it represents a valid UTF-8 boundary. It also makes sense. For
1073 // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
1074 //
1075 // Note also that this is not just '!is_word_unicode(..)' like it is
1076 // for the ASCII case. For example, neither \b nor \B is satisfied
1077 // within invalid UTF-8 sequences.
1078 let word_before = at > 0
1079 && match utf8::decode_last(&haystack[..at]) {
1080 None | Some(Err(_)) => return Ok(false),
1081 Some(Ok(_)) => is_word_char::rev(haystack, at)?,
1082 };
1083 let word_after = at < haystack.len()
1084 && match utf8::decode(&haystack[at..]) {
1085 None | Some(Err(_)) => return Ok(false),
1086 Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
1087 };
1088 Ok(word_before == word_after)
1089 }
1090
1091 /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
1092 /// position in `haystack`.
1093 ///
1094 /// # Panics
1095 ///
1096 /// This may panic when `at > haystack.len()`. Note that `at ==
1097 /// haystack.len()` is legal and guaranteed not to panic.
1098 #[inline]
1099 pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
1100 let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1101 let word_after =
1102 at < haystack.len() && utf8::is_word_byte(haystack[at]);
1103 !word_before && word_after
1104 }
1105
1106 /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
1107 /// position in `haystack`.
1108 ///
1109 /// # Panics
1110 ///
1111 /// This may panic when `at > haystack.len()`. Note that `at ==
1112 /// haystack.len()` is legal and guaranteed not to panic.
1113 #[inline]
1114 pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
1115 let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1116 let word_after =
1117 at < haystack.len() && utf8::is_word_byte(haystack[at]);
1118 word_before && !word_after
1119 }
1120
1121 /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
1122 /// given position in `haystack`.
1123 ///
1124 /// # Panics
1125 ///
1126 /// This may panic when `at > haystack.len()`. Note that `at ==
1127 /// haystack.len()` is legal and guaranteed not to panic.
1128 ///
1129 /// # Errors
1130 ///
1131 /// This returns an error when Unicode word boundary tables
1132 /// are not available. Specifically, this only occurs when the
1133 /// `unicode-word-boundary` feature is not enabled.
1134 #[inline]
1135 pub fn is_word_start_unicode(
1136 &self,
1137 haystack: &[u8],
1138 at: usize,
1139 ) -> Result<bool, UnicodeWordBoundaryError> {
1140 let word_before = is_word_char::rev(haystack, at)?;
1141 let word_after = is_word_char::fwd(haystack, at)?;
1142 Ok(!word_before && word_after)
1143 }
1144
1145 /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
1146 /// given position in `haystack`.
1147 ///
1148 /// # Panics
1149 ///
1150 /// This may panic when `at > haystack.len()`. Note that `at ==
1151 /// haystack.len()` is legal and guaranteed not to panic.
1152 ///
1153 /// # Errors
1154 ///
1155 /// This returns an error when Unicode word boundary tables
1156 /// are not available. Specifically, this only occurs when the
1157 /// `unicode-word-boundary` feature is not enabled.
1158 #[inline]
1159 pub fn is_word_end_unicode(
1160 &self,
1161 haystack: &[u8],
1162 at: usize,
1163 ) -> Result<bool, UnicodeWordBoundaryError> {
1164 let word_before = is_word_char::rev(haystack, at)?;
1165 let word_after = is_word_char::fwd(haystack, at)?;
1166 Ok(word_before && !word_after)
1167 }
1168
1169 /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
1170 /// given position in `haystack`.
1171 ///
1172 /// # Panics
1173 ///
1174 /// This may panic when `at > haystack.len()`. Note that `at ==
1175 /// haystack.len()` is legal and guaranteed not to panic.
1176 #[inline]
1177 pub fn is_word_start_half_ascii(
1178 &self,
1179 haystack: &[u8],
1180 at: usize,
1181 ) -> bool {
1182 let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
1183 !word_before
1184 }
1185
1186 /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
1187 /// given position in `haystack`.
1188 ///
1189 /// # Panics
1190 ///
1191 /// This may panic when `at > haystack.len()`. Note that `at ==
1192 /// haystack.len()` is legal and guaranteed not to panic.
1193 #[inline]
1194 pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
1195 let word_after =
1196 at < haystack.len() && utf8::is_word_byte(haystack[at]);
1197 !word_after
1198 }
1199
1200 /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
1201 /// given position in `haystack`.
1202 ///
1203 /// # Panics
1204 ///
1205 /// This may panic when `at > haystack.len()`. Note that `at ==
1206 /// haystack.len()` is legal and guaranteed not to panic.
1207 ///
1208 /// # Errors
1209 ///
1210 /// This returns an error when Unicode word boundary tables
1211 /// are not available. Specifically, this only occurs when the
1212 /// `unicode-word-boundary` feature is not enabled.
1213 #[inline]
1214 pub fn is_word_start_half_unicode(
1215 &self,
1216 haystack: &[u8],
1217 at: usize,
1218 ) -> Result<bool, UnicodeWordBoundaryError> {
1219 // See `is_word_unicode_negate` for why we need to do this. We don't
1220 // need to do it for `is_word_start_unicode` because that guarantees
1221 // that the position matched falls on a valid UTF-8 boundary given
1222 // that the right side must be in \w.
1223 let word_before = at > 0
1224 && match utf8::decode_last(&haystack[..at]) {
1225 None | Some(Err(_)) => return Ok(false),
1226 Some(Ok(_)) => is_word_char::rev(haystack, at)?,
1227 };
1228 Ok(!word_before)
1229 }
1230
1231 /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
1232 /// given position in `haystack`.
1233 ///
1234 /// # Panics
1235 ///
1236 /// This may panic when `at > haystack.len()`. Note that `at ==
1237 /// haystack.len()` is legal and guaranteed not to panic.
1238 ///
1239 /// # Errors
1240 ///
1241 /// This returns an error when Unicode word boundary tables
1242 /// are not available. Specifically, this only occurs when the
1243 /// `unicode-word-boundary` feature is not enabled.
1244 #[inline]
1245 pub fn is_word_end_half_unicode(
1246 &self,
1247 haystack: &[u8],
1248 at: usize,
1249 ) -> Result<bool, UnicodeWordBoundaryError> {
1250 // See `is_word_unicode_negate` for why we need to do this. We don't
1251 // need to do it for `is_word_end_unicode` because that guarantees
1252 // that the position matched falls on a valid UTF-8 boundary given
1253 // that the left side must be in \w.
1254 let word_after = at < haystack.len()
1255 && match utf8::decode(&haystack[at..]) {
1256 None | Some(Err(_)) => return Ok(false),
1257 Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
1258 };
1259 Ok(!word_after)
1260 }
1261}
1262
1263impl Default for LookMatcher {
1264 fn default() -> LookMatcher {
1265 LookMatcher::new()
1266 }
1267}
1268
1269/// An error that occurs when the Unicode-aware `\w` class is unavailable.
1270///
1271/// This error can occur when the data tables necessary for the Unicode aware
1272/// Perl character class `\w` are unavailable. The `\w` class is used to
1273/// determine whether a codepoint is considered a word character or not when
1274/// determining whether a Unicode aware `\b` (or `\B`) matches at a particular
1275/// position.
1276///
1277/// This error can only occur when the `unicode-word-boundary` feature is
1278/// disabled.
1279#[derive(Clone, Debug)]
1280pub struct UnicodeWordBoundaryError(());
1281
1282impl UnicodeWordBoundaryError {
1283 #[cfg(not(feature = "unicode-word-boundary"))]
1284 pub(crate) fn new() -> UnicodeWordBoundaryError {
1285 UnicodeWordBoundaryError(())
1286 }
1287
1288 /// Returns an error if and only if Unicode word boundary data is
1289 /// unavailable.
1290 pub fn check() -> Result<(), UnicodeWordBoundaryError> {
1291 is_word_char::check()
1292 }
1293}
1294
1295#[cfg(feature = "std")]
1296impl std::error::Error for UnicodeWordBoundaryError {}
1297
1298impl core::fmt::Display for UnicodeWordBoundaryError {
1299 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
1300 write!(
1301 f,
1302 "Unicode-aware \\b and \\B are unavailable because the \
1303 requisite data tables are missing, please enable the \
1304 unicode-word-boundary feature"
1305 )
1306 }
1307}
1308
1309// Below are FOUR different ways for checking whether whether a "word"
1310// codepoint exists at a particular position in the haystack. The four
1311// different approaches are, in order of preference:
1312//
1313// 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the
1314// first call, and then use that DFA for all subsequent calls.
1315// 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available.
1316// 3. Do UTF-8 decoding and use our own 'perl_word' table.
1317// 4. Return an error.
1318//
1319// The reason for all of these approaches is a combination of perf and
1320// permitting one to build regex-automata without the Unicode data necessary
1321// for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would
1322// still work.)
1323//
1324// The DFA approach is the fastest, but it requires the regex parser, the
1325// NFA compiler, the DFA builder and the DFA search runtime. That's a lot to
1326// bring in, but if it's available, it's (probably) the best we can do.
1327//
1328// Approaches (2) and (3) are effectively equivalent, but (2) reuses the
1329// data in regex-syntax and avoids duplicating it in regex-automata.
1330//
1331// Finally, (4) unconditionally returns an error since the requisite data isn't
1332// available anywhere.
1333//
1334// There are actually more approaches possible that we didn't implement. For
1335// example, if the DFA builder is available but the syntax parser is not, we
1336// could technically hand construct our own NFA from the 'perl_word' data
1337// table. But to avoid some pretty hairy code duplication, we would in turn
1338// need to pull the UTF-8 compiler out of the NFA compiler. Yikes.
1339//
1340// A possibly more sensible alternative is to use a lazy DFA when the full
1341// DFA builder isn't available...
1342//
1343// Yet another choice would be to build the full DFA and then embed it into the
1344// source. Then we'd only need to bring in the DFA search runtime, which is
1345// considerably smaller than the DFA builder code. The problem here is that the
1346// Debian people have spooked me[1] into avoiding cyclic dependencies. Namely,
1347// we'd need to build regex-cli, which depends on regex-automata in order to
1348// build some part of regex-automata. But to be honest, something like this has
1349// to be allowed somehow? I just don't know what the right process is.
1350//
1351// There are perhaps other choices as well. Why did I stop at these 4? Because
1352// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
1353// approach eventually, as the benefits of the DFA approach are somewhat
1354// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
1355// the commands below no longer work. If necessary, we should re-capitulate
1356// the benchmark from whole cloth in rebar.)
1357//
1358// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
1359//
1360// Then I changed the code below so that the util/unicode_data/perl_word table
1361// was used and re-ran the benchmark:
1362//
1363// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv
1364//
1365// And compared them:
1366//
1367// $ regex-cli bench diff dfa.csv table.csv
1368// benchmark engine dfa table
1369// --------- ------ --- -----
1370// internal/count/boundary-words-holmes regex/automata/pikevm 18.6 MB/s 12.9 MB/s
1371//
1372// Which is a nice improvement.
1373//
1374// UPDATE: It turns out that it takes approximately 22ms to build the reverse
1375// DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in
1376// the grand scheme things, but that is a significant latency cost. So I'm not
1377// sure that's a good idea. I then tried using a lazy DFA instead, and that
1378// eliminated the overhead, but since the lazy DFA requires mutable working
1379// memory, that requires introducing a 'Cache' for every simultaneous call.
1380//
1381// I ended up deciding for now to just keep the "UTF-8 decode and check the
1382// table." The DFA and lazy DFA approaches are still below, but commented out.
1383//
1384// [1]: https://github.com/BurntSushi/ucd-generate/issues/11
1385
1386/*
1387/// A module that looks for word codepoints using lazy DFAs.
1388#[cfg(all(
1389 feature = "unicode-word-boundary",
1390 feature = "syntax",
1391 feature = "unicode-perl",
1392 feature = "hybrid"
1393))]
1394mod is_word_char {
1395 use alloc::vec::Vec;
1396
1397 use crate::{
1398 hybrid::dfa::{Cache, DFA},
1399 nfa::thompson::NFA,
1400 util::{lazy::Lazy, pool::Pool, primitives::StateID},
1401 Anchored, Input,
1402 };
1403
1404 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1405 Ok(())
1406 }
1407
1408 #[cfg_attr(feature = "perf-inline", inline(always))]
1409 pub(super) fn fwd(
1410 haystack: &[u8],
1411 mut at: usize,
1412 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1413 static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap());
1414 static CACHE: Lazy<Pool<Cache>> =
1415 Lazy::new(|| Pool::new(|| WORD.create_cache()));
1416 let dfa = Lazy::get(&WORD);
1417 let mut cache = Lazy::get(&CACHE).get();
1418 let mut sid = dfa
1419 .start_state_forward(
1420 &mut cache,
1421 &Input::new("").anchored(Anchored::Yes),
1422 )
1423 .unwrap();
1424 while at < haystack.len() {
1425 let byte = haystack[at];
1426 sid = dfa.next_state(&mut cache, sid, byte).unwrap();
1427 at += 1;
1428 if sid.is_tagged() {
1429 if sid.is_match() {
1430 return Ok(true);
1431 } else if sid.is_dead() {
1432 return Ok(false);
1433 }
1434 }
1435 }
1436 Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
1437 }
1438
1439 #[cfg_attr(feature = "perf-inline", inline(always))]
1440 pub(super) fn rev(
1441 haystack: &[u8],
1442 mut at: usize,
1443 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1444 static WORD: Lazy<DFA> = Lazy::new(|| {
1445 DFA::builder()
1446 .thompson(NFA::config().reverse(true))
1447 .build(r"\w")
1448 .unwrap()
1449 });
1450 static CACHE: Lazy<Pool<Cache>> =
1451 Lazy::new(|| Pool::new(|| WORD.create_cache()));
1452 let dfa = Lazy::get(&WORD);
1453 let mut cache = Lazy::get(&CACHE).get();
1454 let mut sid = dfa
1455 .start_state_reverse(
1456 &mut cache,
1457 &Input::new("").anchored(Anchored::Yes),
1458 )
1459 .unwrap();
1460 while at > 0 {
1461 at -= 1;
1462 let byte = haystack[at];
1463 sid = dfa.next_state(&mut cache, sid, byte).unwrap();
1464 if sid.is_tagged() {
1465 if sid.is_match() {
1466 return Ok(true);
1467 } else if sid.is_dead() {
1468 return Ok(false);
1469 }
1470 }
1471 }
1472 Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
1473 }
1474}
1475*/
1476
1477/*
1478/// A module that looks for word codepoints using fully compiled DFAs.
1479#[cfg(all(
1480 feature = "unicode-word-boundary",
1481 feature = "syntax",
1482 feature = "unicode-perl",
1483 feature = "dfa-build"
1484))]
1485mod is_word_char {
1486 use alloc::vec::Vec;
1487
1488 use crate::{
1489 dfa::{dense::DFA, Automaton, StartKind},
1490 nfa::thompson::NFA,
1491 util::{lazy::Lazy, primitives::StateID},
1492 Anchored, Input,
1493 };
1494
1495 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1496 Ok(())
1497 }
1498
1499 #[cfg_attr(feature = "perf-inline", inline(always))]
1500 pub(super) fn fwd(
1501 haystack: &[u8],
1502 mut at: usize,
1503 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1504 static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1505 let dfa = DFA::builder()
1506 .configure(DFA::config().start_kind(StartKind::Anchored))
1507 .build(r"\w")
1508 .unwrap();
1509 // OK because our regex has no look-around.
1510 let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1511 (dfa, start_id)
1512 });
1513 let &(ref dfa, mut sid) = Lazy::get(&WORD);
1514 while at < haystack.len() {
1515 let byte = haystack[at];
1516 sid = dfa.next_state(sid, byte);
1517 at += 1;
1518 if dfa.is_special_state(sid) {
1519 if dfa.is_match_state(sid) {
1520 return Ok(true);
1521 } else if dfa.is_dead_state(sid) {
1522 return Ok(false);
1523 }
1524 }
1525 }
1526 Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1527 }
1528
1529 #[cfg_attr(feature = "perf-inline", inline(always))]
1530 pub(super) fn rev(
1531 haystack: &[u8],
1532 mut at: usize,
1533 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1534 static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1535 let dfa = DFA::builder()
1536 .configure(DFA::config().start_kind(StartKind::Anchored))
1537 // From ad hoc measurements, it looks like setting
1538 // shrink==false is slightly faster than shrink==true. I kind
1539 // of feel like this indicates that shrinking is probably a
1540 // failure, although it can help in some cases. Sigh.
1541 .thompson(NFA::config().reverse(true).shrink(false))
1542 .build(r"\w")
1543 .unwrap();
1544 // OK because our regex has no look-around.
1545 let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1546 (dfa, start_id)
1547 });
1548 let &(ref dfa, mut sid) = Lazy::get(&WORD);
1549 while at > 0 {
1550 at -= 1;
1551 let byte = haystack[at];
1552 sid = dfa.next_state(sid, byte);
1553 if dfa.is_special_state(sid) {
1554 if dfa.is_match_state(sid) {
1555 return Ok(true);
1556 } else if dfa.is_dead_state(sid) {
1557 return Ok(false);
1558 }
1559 }
1560 }
1561 Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1562 }
1563}
1564*/
1565
1566/// A module that looks for word codepoints using regex-syntax's data tables.
1567#[cfg(all(
1568 feature = "unicode-word-boundary",
1569 feature = "syntax",
1570 feature = "unicode-perl",
1571))]
1572mod is_word_char {
1573 use regex_syntax::try_is_word_character;
1574
1575 use crate::util::utf8;
1576
1577 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1578 Ok(())
1579 }
1580
1581 #[cfg_attr(feature = "perf-inline", inline(always))]
1582 pub(super) fn fwd(
1583 haystack: &[u8],
1584 at: usize,
1585 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1586 Ok(match utf8::decode(&haystack[at..]) {
1587 None | Some(Err(_)) => false,
1588 Some(Ok(ch)) => try_is_word_character(ch).expect(
1589 "since unicode-word-boundary, syntax and unicode-perl \
1590 are all enabled, it is expected that \
1591 try_is_word_character succeeds",
1592 ),
1593 })
1594 }
1595
1596 #[cfg_attr(feature = "perf-inline", inline(always))]
1597 pub(super) fn rev(
1598 haystack: &[u8],
1599 at: usize,
1600 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1601 Ok(match utf8::decode_last(&haystack[..at]) {
1602 None | Some(Err(_)) => false,
1603 Some(Ok(ch)) => try_is_word_character(ch).expect(
1604 "since unicode-word-boundary, syntax and unicode-perl \
1605 are all enabled, it is expected that \
1606 try_is_word_character succeeds",
1607 ),
1608 })
1609 }
1610}
1611
1612/// A module that looks for word codepoints using regex-automata's data tables
1613/// (which are only compiled when regex-syntax's tables aren't available).
1614///
1615/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for
1616/// perl_word.
1617#[cfg(all(
1618 feature = "unicode-word-boundary",
1619 not(all(feature = "syntax", feature = "unicode-perl")),
1620))]
1621mod is_word_char {
1622 use crate::util::utf8;
1623
1624 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1625 Ok(())
1626 }
1627
1628 #[cfg_attr(feature = "perf-inline", inline(always))]
1629 pub(super) fn fwd(
1630 haystack: &[u8],
1631 at: usize,
1632 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1633 Ok(match utf8::decode(&haystack[at..]) {
1634 None | Some(Err(_)) => false,
1635 Some(Ok(ch)) => is_word_character(ch),
1636 })
1637 }
1638
1639 #[cfg_attr(feature = "perf-inline", inline(always))]
1640 pub(super) fn rev(
1641 haystack: &[u8],
1642 at: usize,
1643 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1644 Ok(match utf8::decode_last(&haystack[..at]) {
1645 None | Some(Err(_)) => false,
1646 Some(Ok(ch)) => is_word_character(ch),
1647 })
1648 }
1649
1650 #[cfg_attr(feature = "perf-inline", inline(always))]
1651 fn is_word_character(c: char) -> bool {
1652 use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
1653
1654 if u8::try_from(c).map_or(false, utf8::is_word_byte) {
1655 return true;
1656 }
1657 PERL_WORD
1658 .binary_search_by(|&(start, end)| {
1659 use core::cmp::Ordering;
1660
1661 if start <= c && c <= end {
1662 Ordering::Equal
1663 } else if start > c {
1664 Ordering::Greater
1665 } else {
1666 Ordering::Less
1667 }
1668 })
1669 .is_ok()
1670 }
1671}
1672
1673/// A module that always returns an error if Unicode word boundaries are
1674/// disabled. When this feature is disabled, then regex-automata will not
1675/// include its own data tables even if regex-syntax is disabled.
1676#[cfg(not(feature = "unicode-word-boundary"))]
1677mod is_word_char {
1678 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1679 Err(super::UnicodeWordBoundaryError::new())
1680 }
1681
1682 #[cfg_attr(feature = "perf-inline", inline(always))]
1683 pub(super) fn fwd(
1684 _bytes: &[u8],
1685 _at: usize,
1686 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1687 Err(super::UnicodeWordBoundaryError::new())
1688 }
1689
1690 #[cfg_attr(feature = "perf-inline", inline(always))]
1691 pub(super) fn rev(
1692 _bytes: &[u8],
1693 _at: usize,
1694 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1695 Err(super::UnicodeWordBoundaryError::new())
1696 }
1697}
1698
1699#[cfg(test)]
1700mod tests {
1701 use super::*;
1702
1703 macro_rules! testlook {
1704 ($look:expr, $haystack:expr, $at:expr) => {
1705 LookMatcher::default().matches($look, $haystack.as_bytes(), $at)
1706 };
1707 }
1708
1709 #[test]
1710 fn look_matches_start_line() {
1711 let look = Look::StartLF;
1712
1713 assert!(testlook!(look, "", 0));
1714 assert!(testlook!(look, "\n", 0));
1715 assert!(testlook!(look, "\n", 1));
1716 assert!(testlook!(look, "a", 0));
1717 assert!(testlook!(look, "\na", 1));
1718
1719 assert!(!testlook!(look, "a", 1));
1720 assert!(!testlook!(look, "a\na", 1));
1721 }
1722
1723 #[test]
1724 fn look_matches_end_line() {
1725 let look = Look::EndLF;
1726
1727 assert!(testlook!(look, "", 0));
1728 assert!(testlook!(look, "\n", 1));
1729 assert!(testlook!(look, "\na", 0));
1730 assert!(testlook!(look, "\na", 2));
1731 assert!(testlook!(look, "a\na", 1));
1732
1733 assert!(!testlook!(look, "a", 0));
1734 assert!(!testlook!(look, "\na", 1));
1735 assert!(!testlook!(look, "a\na", 0));
1736 assert!(!testlook!(look, "a\na", 2));
1737 }
1738
1739 #[test]
1740 fn look_matches_start_text() {
1741 let look = Look::Start;
1742
1743 assert!(testlook!(look, "", 0));
1744 assert!(testlook!(look, "\n", 0));
1745 assert!(testlook!(look, "a", 0));
1746
1747 assert!(!testlook!(look, "\n", 1));
1748 assert!(!testlook!(look, "\na", 1));
1749 assert!(!testlook!(look, "a", 1));
1750 assert!(!testlook!(look, "a\na", 1));
1751 }
1752
1753 #[test]
1754 fn look_matches_end_text() {
1755 let look = Look::End;
1756
1757 assert!(testlook!(look, "", 0));
1758 assert!(testlook!(look, "\n", 1));
1759 assert!(testlook!(look, "\na", 2));
1760
1761 assert!(!testlook!(look, "\na", 0));
1762 assert!(!testlook!(look, "a\na", 1));
1763 assert!(!testlook!(look, "a", 0));
1764 assert!(!testlook!(look, "\na", 1));
1765 assert!(!testlook!(look, "a\na", 0));
1766 assert!(!testlook!(look, "a\na", 2));
1767 }
1768
1769 #[test]
1770 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1771 fn look_matches_word_unicode() {
1772 let look = Look::WordUnicode;
1773
1774 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1775 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1776
1777 // Simple ASCII word boundaries.
1778 assert!(testlook!(look, "a", 0));
1779 assert!(testlook!(look, "a", 1));
1780 assert!(testlook!(look, "a ", 1));
1781 assert!(testlook!(look, " a ", 1));
1782 assert!(testlook!(look, " a ", 2));
1783
1784 // Unicode word boundaries with a non-ASCII codepoint.
1785 assert!(testlook!(look, "𝛃", 0));
1786 assert!(testlook!(look, "𝛃", 4));
1787 assert!(testlook!(look, "𝛃 ", 4));
1788 assert!(testlook!(look, " 𝛃 ", 1));
1789 assert!(testlook!(look, " 𝛃 ", 5));
1790
1791 // Unicode word boundaries between non-ASCII codepoints.
1792 assert!(testlook!(look, "𝛃𐆀", 0));
1793 assert!(testlook!(look, "𝛃𐆀", 4));
1794
1795 // Non word boundaries for ASCII.
1796 assert!(!testlook!(look, "", 0));
1797 assert!(!testlook!(look, "ab", 1));
1798 assert!(!testlook!(look, "a ", 2));
1799 assert!(!testlook!(look, " a ", 0));
1800 assert!(!testlook!(look, " a ", 3));
1801
1802 // Non word boundaries with a non-ASCII codepoint.
1803 assert!(!testlook!(look, "𝛃b", 4));
1804 assert!(!testlook!(look, "𝛃 ", 5));
1805 assert!(!testlook!(look, " 𝛃 ", 0));
1806 assert!(!testlook!(look, " 𝛃 ", 6));
1807 assert!(!testlook!(look, "𝛃", 1));
1808 assert!(!testlook!(look, "𝛃", 2));
1809 assert!(!testlook!(look, "𝛃", 3));
1810
1811 // Non word boundaries with non-ASCII codepoints.
1812 assert!(!testlook!(look, "𝛃𐆀", 1));
1813 assert!(!testlook!(look, "𝛃𐆀", 2));
1814 assert!(!testlook!(look, "𝛃𐆀", 3));
1815 assert!(!testlook!(look, "𝛃𐆀", 5));
1816 assert!(!testlook!(look, "𝛃𐆀", 6));
1817 assert!(!testlook!(look, "𝛃𐆀", 7));
1818 assert!(!testlook!(look, "𝛃𐆀", 8));
1819 }
1820
1821 #[test]
1822 fn look_matches_word_ascii() {
1823 let look = Look::WordAscii;
1824
1825 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1826 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1827
1828 // Simple ASCII word boundaries.
1829 assert!(testlook!(look, "a", 0));
1830 assert!(testlook!(look, "a", 1));
1831 assert!(testlook!(look, "a ", 1));
1832 assert!(testlook!(look, " a ", 1));
1833 assert!(testlook!(look, " a ", 2));
1834
1835 // Unicode word boundaries with a non-ASCII codepoint. Since this is
1836 // an ASCII word boundary, none of these match.
1837 assert!(!testlook!(look, "𝛃", 0));
1838 assert!(!testlook!(look, "𝛃", 4));
1839 assert!(!testlook!(look, "𝛃 ", 4));
1840 assert!(!testlook!(look, " 𝛃 ", 1));
1841 assert!(!testlook!(look, " 𝛃 ", 5));
1842
1843 // Unicode word boundaries between non-ASCII codepoints. Again, since
1844 // this is an ASCII word boundary, none of these match.
1845 assert!(!testlook!(look, "𝛃𐆀", 0));
1846 assert!(!testlook!(look, "𝛃𐆀", 4));
1847
1848 // Non word boundaries for ASCII.
1849 assert!(!testlook!(look, "", 0));
1850 assert!(!testlook!(look, "ab", 1));
1851 assert!(!testlook!(look, "a ", 2));
1852 assert!(!testlook!(look, " a ", 0));
1853 assert!(!testlook!(look, " a ", 3));
1854
1855 // Non word boundaries with a non-ASCII codepoint.
1856 assert!(testlook!(look, "𝛃b", 4));
1857 assert!(!testlook!(look, "𝛃 ", 5));
1858 assert!(!testlook!(look, " 𝛃 ", 0));
1859 assert!(!testlook!(look, " 𝛃 ", 6));
1860 assert!(!testlook!(look, "𝛃", 1));
1861 assert!(!testlook!(look, "𝛃", 2));
1862 assert!(!testlook!(look, "𝛃", 3));
1863
1864 // Non word boundaries with non-ASCII codepoints.
1865 assert!(!testlook!(look, "𝛃𐆀", 1));
1866 assert!(!testlook!(look, "𝛃𐆀", 2));
1867 assert!(!testlook!(look, "𝛃𐆀", 3));
1868 assert!(!testlook!(look, "𝛃𐆀", 5));
1869 assert!(!testlook!(look, "𝛃𐆀", 6));
1870 assert!(!testlook!(look, "𝛃𐆀", 7));
1871 assert!(!testlook!(look, "𝛃𐆀", 8));
1872 }
1873
1874 #[test]
1875 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1876 fn look_matches_word_unicode_negate() {
1877 let look = Look::WordUnicodeNegate;
1878
1879 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1880 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1881
1882 // Simple ASCII word boundaries.
1883 assert!(!testlook!(look, "a", 0));
1884 assert!(!testlook!(look, "a", 1));
1885 assert!(!testlook!(look, "a ", 1));
1886 assert!(!testlook!(look, " a ", 1));
1887 assert!(!testlook!(look, " a ", 2));
1888
1889 // Unicode word boundaries with a non-ASCII codepoint.
1890 assert!(!testlook!(look, "𝛃", 0));
1891 assert!(!testlook!(look, "𝛃", 4));
1892 assert!(!testlook!(look, "𝛃 ", 4));
1893 assert!(!testlook!(look, " 𝛃 ", 1));
1894 assert!(!testlook!(look, " 𝛃 ", 5));
1895
1896 // Unicode word boundaries between non-ASCII codepoints.
1897 assert!(!testlook!(look, "𝛃𐆀", 0));
1898 assert!(!testlook!(look, "𝛃𐆀", 4));
1899
1900 // Non word boundaries for ASCII.
1901 assert!(testlook!(look, "", 0));
1902 assert!(testlook!(look, "ab", 1));
1903 assert!(testlook!(look, "a ", 2));
1904 assert!(testlook!(look, " a ", 0));
1905 assert!(testlook!(look, " a ", 3));
1906
1907 // Non word boundaries with a non-ASCII codepoint.
1908 assert!(testlook!(look, "𝛃b", 4));
1909 assert!(testlook!(look, "𝛃 ", 5));
1910 assert!(testlook!(look, " 𝛃 ", 0));
1911 assert!(testlook!(look, " 𝛃 ", 6));
1912 // These don't match because they could otherwise return an offset that
1913 // splits the UTF-8 encoding of a codepoint.
1914 assert!(!testlook!(look, "𝛃", 1));
1915 assert!(!testlook!(look, "𝛃", 2));
1916 assert!(!testlook!(look, "𝛃", 3));
1917
1918 // Non word boundaries with non-ASCII codepoints. These also don't
1919 // match because they could otherwise return an offset that splits the
1920 // UTF-8 encoding of a codepoint.
1921 assert!(!testlook!(look, "𝛃𐆀", 1));
1922 assert!(!testlook!(look, "𝛃𐆀", 2));
1923 assert!(!testlook!(look, "𝛃𐆀", 3));
1924 assert!(!testlook!(look, "𝛃𐆀", 5));
1925 assert!(!testlook!(look, "𝛃𐆀", 6));
1926 assert!(!testlook!(look, "𝛃𐆀", 7));
1927 // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
1928 // of the haystack. So the "end" of the haystack isn't a word and 𐆀
1929 // isn't a word, thus, \B matches.
1930 assert!(testlook!(look, "𝛃𐆀", 8));
1931 }
1932
1933 #[test]
1934 fn look_matches_word_ascii_negate() {
1935 let look = Look::WordAsciiNegate;
1936
1937 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1938 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1939
1940 // Simple ASCII word boundaries.
1941 assert!(!testlook!(look, "a", 0));
1942 assert!(!testlook!(look, "a", 1));
1943 assert!(!testlook!(look, "a ", 1));
1944 assert!(!testlook!(look, " a ", 1));
1945 assert!(!testlook!(look, " a ", 2));
1946
1947 // Unicode word boundaries with a non-ASCII codepoint. Since this is
1948 // an ASCII word boundary, none of these match.
1949 assert!(testlook!(look, "𝛃", 0));
1950 assert!(testlook!(look, "𝛃", 4));
1951 assert!(testlook!(look, "𝛃 ", 4));
1952 assert!(testlook!(look, " 𝛃 ", 1));
1953 assert!(testlook!(look, " 𝛃 ", 5));
1954
1955 // Unicode word boundaries between non-ASCII codepoints. Again, since
1956 // this is an ASCII word boundary, none of these match.
1957 assert!(testlook!(look, "𝛃𐆀", 0));
1958 assert!(testlook!(look, "𝛃𐆀", 4));
1959
1960 // Non word boundaries for ASCII.
1961 assert!(testlook!(look, "", 0));
1962 assert!(testlook!(look, "ab", 1));
1963 assert!(testlook!(look, "a ", 2));
1964 assert!(testlook!(look, " a ", 0));
1965 assert!(testlook!(look, " a ", 3));
1966
1967 // Non word boundaries with a non-ASCII codepoint.
1968 assert!(!testlook!(look, "𝛃b", 4));
1969 assert!(testlook!(look, "𝛃 ", 5));
1970 assert!(testlook!(look, " 𝛃 ", 0));
1971 assert!(testlook!(look, " 𝛃 ", 6));
1972 assert!(testlook!(look, "𝛃", 1));
1973 assert!(testlook!(look, "𝛃", 2));
1974 assert!(testlook!(look, "𝛃", 3));
1975
1976 // Non word boundaries with non-ASCII codepoints.
1977 assert!(testlook!(look, "𝛃𐆀", 1));
1978 assert!(testlook!(look, "𝛃𐆀", 2));
1979 assert!(testlook!(look, "𝛃𐆀", 3));
1980 assert!(testlook!(look, "𝛃𐆀", 5));
1981 assert!(testlook!(look, "𝛃𐆀", 6));
1982 assert!(testlook!(look, "𝛃𐆀", 7));
1983 assert!(testlook!(look, "𝛃𐆀", 8));
1984 }
1985
1986 #[test]
1987 fn look_matches_word_start_ascii() {
1988 let look = Look::WordStartAscii;
1989
1990 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1991 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1992
1993 // Simple ASCII word boundaries.
1994 assert!(testlook!(look, "a", 0));
1995 assert!(!testlook!(look, "a", 1));
1996 assert!(!testlook!(look, "a ", 1));
1997 assert!(testlook!(look, " a ", 1));
1998 assert!(!testlook!(look, " a ", 2));
1999
2000 // Unicode word boundaries with a non-ASCII codepoint. Since this is
2001 // an ASCII word boundary, none of these match.
2002 assert!(!testlook!(look, "𝛃", 0));
2003 assert!(!testlook!(look, "𝛃", 4));
2004 assert!(!testlook!(look, "𝛃 ", 4));
2005 assert!(!testlook!(look, " 𝛃 ", 1));
2006 assert!(!testlook!(look, " 𝛃 ", 5));
2007
2008 // Unicode word boundaries between non-ASCII codepoints. Again, since
2009 // this is an ASCII word boundary, none of these match.
2010 assert!(!testlook!(look, "𝛃𐆀", 0));
2011 assert!(!testlook!(look, "𝛃𐆀", 4));
2012
2013 // Non word boundaries for ASCII.
2014 assert!(!testlook!(look, "", 0));
2015 assert!(!testlook!(look, "ab", 1));
2016 assert!(!testlook!(look, "a ", 2));
2017 assert!(!testlook!(look, " a ", 0));
2018 assert!(!testlook!(look, " a ", 3));
2019
2020 // Non word boundaries with a non-ASCII codepoint.
2021 assert!(testlook!(look, "𝛃b", 4));
2022 assert!(!testlook!(look, "b𝛃", 1));
2023 assert!(!testlook!(look, "𝛃 ", 5));
2024 assert!(!testlook!(look, " 𝛃 ", 0));
2025 assert!(!testlook!(look, " 𝛃 ", 6));
2026 assert!(!testlook!(look, "𝛃", 1));
2027 assert!(!testlook!(look, "𝛃", 2));
2028 assert!(!testlook!(look, "𝛃", 3));
2029
2030 // Non word boundaries with non-ASCII codepoints.
2031 assert!(!testlook!(look, "𝛃𐆀", 1));
2032 assert!(!testlook!(look, "𝛃𐆀", 2));
2033 assert!(!testlook!(look, "𝛃𐆀", 3));
2034 assert!(!testlook!(look, "𝛃𐆀", 5));
2035 assert!(!testlook!(look, "𝛃𐆀", 6));
2036 assert!(!testlook!(look, "𝛃𐆀", 7));
2037 assert!(!testlook!(look, "𝛃𐆀", 8));
2038 }
2039
2040 #[test]
2041 fn look_matches_word_end_ascii() {
2042 let look = Look::WordEndAscii;
2043
2044 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2045 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2046
2047 // Simple ASCII word boundaries.
2048 assert!(!testlook!(look, "a", 0));
2049 assert!(testlook!(look, "a", 1));
2050 assert!(testlook!(look, "a ", 1));
2051 assert!(!testlook!(look, " a ", 1));
2052 assert!(testlook!(look, " a ", 2));
2053
2054 // Unicode word boundaries with a non-ASCII codepoint. Since this is
2055 // an ASCII word boundary, none of these match.
2056 assert!(!testlook!(look, "𝛃", 0));
2057 assert!(!testlook!(look, "𝛃", 4));
2058 assert!(!testlook!(look, "𝛃 ", 4));
2059 assert!(!testlook!(look, " 𝛃 ", 1));
2060 assert!(!testlook!(look, " 𝛃 ", 5));
2061
2062 // Unicode word boundaries between non-ASCII codepoints. Again, since
2063 // this is an ASCII word boundary, none of these match.
2064 assert!(!testlook!(look, "𝛃𐆀", 0));
2065 assert!(!testlook!(look, "𝛃𐆀", 4));
2066
2067 // Non word boundaries for ASCII.
2068 assert!(!testlook!(look, "", 0));
2069 assert!(!testlook!(look, "ab", 1));
2070 assert!(!testlook!(look, "a ", 2));
2071 assert!(!testlook!(look, " a ", 0));
2072 assert!(!testlook!(look, " a ", 3));
2073
2074 // Non word boundaries with a non-ASCII codepoint.
2075 assert!(!testlook!(look, "𝛃b", 4));
2076 assert!(testlook!(look, "b𝛃", 1));
2077 assert!(!testlook!(look, "𝛃 ", 5));
2078 assert!(!testlook!(look, " 𝛃 ", 0));
2079 assert!(!testlook!(look, " 𝛃 ", 6));
2080 assert!(!testlook!(look, "𝛃", 1));
2081 assert!(!testlook!(look, "𝛃", 2));
2082 assert!(!testlook!(look, "𝛃", 3));
2083
2084 // Non word boundaries with non-ASCII codepoints.
2085 assert!(!testlook!(look, "𝛃𐆀", 1));
2086 assert!(!testlook!(look, "𝛃𐆀", 2));
2087 assert!(!testlook!(look, "𝛃𐆀", 3));
2088 assert!(!testlook!(look, "𝛃𐆀", 5));
2089 assert!(!testlook!(look, "𝛃𐆀", 6));
2090 assert!(!testlook!(look, "𝛃𐆀", 7));
2091 assert!(!testlook!(look, "𝛃𐆀", 8));
2092 }
2093
2094 #[test]
2095 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2096 fn look_matches_word_start_unicode() {
2097 let look = Look::WordStartUnicode;
2098
2099 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2100 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2101
2102 // Simple ASCII word boundaries.
2103 assert!(testlook!(look, "a", 0));
2104 assert!(!testlook!(look, "a", 1));
2105 assert!(!testlook!(look, "a ", 1));
2106 assert!(testlook!(look, " a ", 1));
2107 assert!(!testlook!(look, " a ", 2));
2108
2109 // Unicode word boundaries with a non-ASCII codepoint.
2110 assert!(testlook!(look, "𝛃", 0));
2111 assert!(!testlook!(look, "𝛃", 4));
2112 assert!(!testlook!(look, "𝛃 ", 4));
2113 assert!(testlook!(look, " 𝛃 ", 1));
2114 assert!(!testlook!(look, " 𝛃 ", 5));
2115
2116 // Unicode word boundaries between non-ASCII codepoints.
2117 assert!(testlook!(look, "𝛃𐆀", 0));
2118 assert!(!testlook!(look, "𝛃𐆀", 4));
2119
2120 // Non word boundaries for ASCII.
2121 assert!(!testlook!(look, "", 0));
2122 assert!(!testlook!(look, "ab", 1));
2123 assert!(!testlook!(look, "a ", 2));
2124 assert!(!testlook!(look, " a ", 0));
2125 assert!(!testlook!(look, " a ", 3));
2126
2127 // Non word boundaries with a non-ASCII codepoint.
2128 assert!(!testlook!(look, "𝛃b", 4));
2129 assert!(!testlook!(look, "b𝛃", 1));
2130 assert!(!testlook!(look, "𝛃 ", 5));
2131 assert!(!testlook!(look, " 𝛃 ", 0));
2132 assert!(!testlook!(look, " 𝛃 ", 6));
2133 assert!(!testlook!(look, "𝛃", 1));
2134 assert!(!testlook!(look, "𝛃", 2));
2135 assert!(!testlook!(look, "𝛃", 3));
2136
2137 // Non word boundaries with non-ASCII codepoints.
2138 assert!(!testlook!(look, "𝛃𐆀", 1));
2139 assert!(!testlook!(look, "𝛃𐆀", 2));
2140 assert!(!testlook!(look, "𝛃𐆀", 3));
2141 assert!(!testlook!(look, "𝛃𐆀", 5));
2142 assert!(!testlook!(look, "𝛃𐆀", 6));
2143 assert!(!testlook!(look, "𝛃𐆀", 7));
2144 assert!(!testlook!(look, "𝛃𐆀", 8));
2145 }
2146
2147 #[test]
2148 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2149 fn look_matches_word_end_unicode() {
2150 let look = Look::WordEndUnicode;
2151
2152 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2153 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2154
2155 // Simple ASCII word boundaries.
2156 assert!(!testlook!(look, "a", 0));
2157 assert!(testlook!(look, "a", 1));
2158 assert!(testlook!(look, "a ", 1));
2159 assert!(!testlook!(look, " a ", 1));
2160 assert!(testlook!(look, " a ", 2));
2161
2162 // Unicode word boundaries with a non-ASCII codepoint.
2163 assert!(!testlook!(look, "𝛃", 0));
2164 assert!(testlook!(look, "𝛃", 4));
2165 assert!(testlook!(look, "𝛃 ", 4));
2166 assert!(!testlook!(look, " 𝛃 ", 1));
2167 assert!(testlook!(look, " 𝛃 ", 5));
2168
2169 // Unicode word boundaries between non-ASCII codepoints.
2170 assert!(!testlook!(look, "𝛃𐆀", 0));
2171 assert!(testlook!(look, "𝛃𐆀", 4));
2172
2173 // Non word boundaries for ASCII.
2174 assert!(!testlook!(look, "", 0));
2175 assert!(!testlook!(look, "ab", 1));
2176 assert!(!testlook!(look, "a ", 2));
2177 assert!(!testlook!(look, " a ", 0));
2178 assert!(!testlook!(look, " a ", 3));
2179
2180 // Non word boundaries with a non-ASCII codepoint.
2181 assert!(!testlook!(look, "𝛃b", 4));
2182 assert!(!testlook!(look, "b𝛃", 1));
2183 assert!(!testlook!(look, "𝛃 ", 5));
2184 assert!(!testlook!(look, " 𝛃 ", 0));
2185 assert!(!testlook!(look, " 𝛃 ", 6));
2186 assert!(!testlook!(look, "𝛃", 1));
2187 assert!(!testlook!(look, "𝛃", 2));
2188 assert!(!testlook!(look, "𝛃", 3));
2189
2190 // Non word boundaries with non-ASCII codepoints.
2191 assert!(!testlook!(look, "𝛃𐆀", 1));
2192 assert!(!testlook!(look, "𝛃𐆀", 2));
2193 assert!(!testlook!(look, "𝛃𐆀", 3));
2194 assert!(!testlook!(look, "𝛃𐆀", 5));
2195 assert!(!testlook!(look, "𝛃𐆀", 6));
2196 assert!(!testlook!(look, "𝛃𐆀", 7));
2197 assert!(!testlook!(look, "𝛃𐆀", 8));
2198 }
2199
2200 #[test]
2201 fn look_matches_word_start_half_ascii() {
2202 let look = Look::WordStartHalfAscii;
2203
2204 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2205 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2206
2207 // Simple ASCII word boundaries.
2208 assert!(testlook!(look, "a", 0));
2209 assert!(!testlook!(look, "a", 1));
2210 assert!(!testlook!(look, "a ", 1));
2211 assert!(testlook!(look, " a ", 1));
2212 assert!(!testlook!(look, " a ", 2));
2213
2214 // Unicode word boundaries with a non-ASCII codepoint. Since this is
2215 // an ASCII word boundary, none of these match.
2216 assert!(testlook!(look, "𝛃", 0));
2217 assert!(testlook!(look, "𝛃", 4));
2218 assert!(testlook!(look, "𝛃 ", 4));
2219 assert!(testlook!(look, " 𝛃 ", 1));
2220 assert!(testlook!(look, " 𝛃 ", 5));
2221
2222 // Unicode word boundaries between non-ASCII codepoints. Again, since
2223 // this is an ASCII word boundary, none of these match.
2224 assert!(testlook!(look, "𝛃𐆀", 0));
2225 assert!(testlook!(look, "𝛃𐆀", 4));
2226
2227 // Non word boundaries for ASCII.
2228 assert!(testlook!(look, "", 0));
2229 assert!(!testlook!(look, "ab", 1));
2230 assert!(testlook!(look, "a ", 2));
2231 assert!(testlook!(look, " a ", 0));
2232 assert!(testlook!(look, " a ", 3));
2233
2234 // Non word boundaries with a non-ASCII codepoint.
2235 assert!(testlook!(look, "𝛃b", 4));
2236 assert!(!testlook!(look, "b𝛃", 1));
2237 assert!(testlook!(look, "𝛃 ", 5));
2238 assert!(testlook!(look, " 𝛃 ", 0));
2239 assert!(testlook!(look, " 𝛃 ", 6));
2240 assert!(testlook!(look, "𝛃", 1));
2241 assert!(testlook!(look, "𝛃", 2));
2242 assert!(testlook!(look, "𝛃", 3));
2243
2244 // Non word boundaries with non-ASCII codepoints.
2245 assert!(testlook!(look, "𝛃𐆀", 1));
2246 assert!(testlook!(look, "𝛃𐆀", 2));
2247 assert!(testlook!(look, "𝛃𐆀", 3));
2248 assert!(testlook!(look, "𝛃𐆀", 5));
2249 assert!(testlook!(look, "𝛃𐆀", 6));
2250 assert!(testlook!(look, "𝛃𐆀", 7));
2251 assert!(testlook!(look, "𝛃𐆀", 8));
2252 }
2253
2254 #[test]
2255 fn look_matches_word_end_half_ascii() {
2256 let look = Look::WordEndHalfAscii;
2257
2258 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2259 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2260
2261 // Simple ASCII word boundaries.
2262 assert!(!testlook!(look, "a", 0));
2263 assert!(testlook!(look, "a", 1));
2264 assert!(testlook!(look, "a ", 1));
2265 assert!(!testlook!(look, " a ", 1));
2266 assert!(testlook!(look, " a ", 2));
2267
2268 // Unicode word boundaries with a non-ASCII codepoint. Since this is
2269 // an ASCII word boundary, none of these match.
2270 assert!(testlook!(look, "𝛃", 0));
2271 assert!(testlook!(look, "𝛃", 4));
2272 assert!(testlook!(look, "𝛃 ", 4));
2273 assert!(testlook!(look, " 𝛃 ", 1));
2274 assert!(testlook!(look, " 𝛃 ", 5));
2275
2276 // Unicode word boundaries between non-ASCII codepoints. Again, since
2277 // this is an ASCII word boundary, none of these match.
2278 assert!(testlook!(look, "𝛃𐆀", 0));
2279 assert!(testlook!(look, "𝛃𐆀", 4));
2280
2281 // Non word boundaries for ASCII.
2282 assert!(testlook!(look, "", 0));
2283 assert!(!testlook!(look, "ab", 1));
2284 assert!(testlook!(look, "a ", 2));
2285 assert!(testlook!(look, " a ", 0));
2286 assert!(testlook!(look, " a ", 3));
2287
2288 // Non word boundaries with a non-ASCII codepoint.
2289 assert!(!testlook!(look, "𝛃b", 4));
2290 assert!(testlook!(look, "b𝛃", 1));
2291 assert!(testlook!(look, "𝛃 ", 5));
2292 assert!(testlook!(look, " 𝛃 ", 0));
2293 assert!(testlook!(look, " 𝛃 ", 6));
2294 assert!(testlook!(look, "𝛃", 1));
2295 assert!(testlook!(look, "𝛃", 2));
2296 assert!(testlook!(look, "𝛃", 3));
2297
2298 // Non word boundaries with non-ASCII codepoints.
2299 assert!(testlook!(look, "𝛃𐆀", 1));
2300 assert!(testlook!(look, "𝛃𐆀", 2));
2301 assert!(testlook!(look, "𝛃𐆀", 3));
2302 assert!(testlook!(look, "𝛃𐆀", 5));
2303 assert!(testlook!(look, "𝛃𐆀", 6));
2304 assert!(testlook!(look, "𝛃𐆀", 7));
2305 assert!(testlook!(look, "𝛃𐆀", 8));
2306 }
2307
2308 #[test]
2309 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2310 fn look_matches_word_start_half_unicode() {
2311 let look = Look::WordStartHalfUnicode;
2312
2313 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2314 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2315
2316 // Simple ASCII word boundaries.
2317 assert!(testlook!(look, "a", 0));
2318 assert!(!testlook!(look, "a", 1));
2319 assert!(!testlook!(look, "a ", 1));
2320 assert!(testlook!(look, " a ", 1));
2321 assert!(!testlook!(look, " a ", 2));
2322
2323 // Unicode word boundaries with a non-ASCII codepoint.
2324 assert!(testlook!(look, "𝛃", 0));
2325 assert!(!testlook!(look, "𝛃", 4));
2326 assert!(!testlook!(look, "𝛃 ", 4));
2327 assert!(testlook!(look, " 𝛃 ", 1));
2328 assert!(!testlook!(look, " 𝛃 ", 5));
2329
2330 // Unicode word boundaries between non-ASCII codepoints.
2331 assert!(testlook!(look, "𝛃𐆀", 0));
2332 assert!(!testlook!(look, "𝛃𐆀", 4));
2333
2334 // Non word boundaries for ASCII.
2335 assert!(testlook!(look, "", 0));
2336 assert!(!testlook!(look, "ab", 1));
2337 assert!(testlook!(look, "a ", 2));
2338 assert!(testlook!(look, " a ", 0));
2339 assert!(testlook!(look, " a ", 3));
2340
2341 // Non word boundaries with a non-ASCII codepoint.
2342 assert!(!testlook!(look, "𝛃b", 4));
2343 assert!(!testlook!(look, "b𝛃", 1));
2344 assert!(testlook!(look, "𝛃 ", 5));
2345 assert!(testlook!(look, " 𝛃 ", 0));
2346 assert!(testlook!(look, " 𝛃 ", 6));
2347 assert!(!testlook!(look, "𝛃", 1));
2348 assert!(!testlook!(look, "𝛃", 2));
2349 assert!(!testlook!(look, "𝛃", 3));
2350
2351 // Non word boundaries with non-ASCII codepoints.
2352 assert!(!testlook!(look, "𝛃𐆀", 1));
2353 assert!(!testlook!(look, "𝛃𐆀", 2));
2354 assert!(!testlook!(look, "𝛃𐆀", 3));
2355 assert!(!testlook!(look, "𝛃𐆀", 5));
2356 assert!(!testlook!(look, "𝛃𐆀", 6));
2357 assert!(!testlook!(look, "𝛃𐆀", 7));
2358 assert!(testlook!(look, "𝛃𐆀", 8));
2359 }
2360
2361 #[test]
2362 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2363 fn look_matches_word_end_half_unicode() {
2364 let look = Look::WordEndHalfUnicode;
2365
2366 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2367 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2368
2369 // Simple ASCII word boundaries.
2370 assert!(!testlook!(look, "a", 0));
2371 assert!(testlook!(look, "a", 1));
2372 assert!(testlook!(look, "a ", 1));
2373 assert!(!testlook!(look, " a ", 1));
2374 assert!(testlook!(look, " a ", 2));
2375
2376 // Unicode word boundaries with a non-ASCII codepoint.
2377 assert!(!testlook!(look, "𝛃", 0));
2378 assert!(testlook!(look, "𝛃", 4));
2379 assert!(testlook!(look, "𝛃 ", 4));
2380 assert!(!testlook!(look, " 𝛃 ", 1));
2381 assert!(testlook!(look, " 𝛃 ", 5));
2382
2383 // Unicode word boundaries between non-ASCII codepoints.
2384 assert!(!testlook!(look, "𝛃𐆀", 0));
2385 assert!(testlook!(look, "𝛃𐆀", 4));
2386
2387 // Non word boundaries for ASCII.
2388 assert!(testlook!(look, "", 0));
2389 assert!(!testlook!(look, "ab", 1));
2390 assert!(testlook!(look, "a ", 2));
2391 assert!(testlook!(look, " a ", 0));
2392 assert!(testlook!(look, " a ", 3));
2393
2394 // Non word boundaries with a non-ASCII codepoint.
2395 assert!(!testlook!(look, "𝛃b", 4));
2396 assert!(!testlook!(look, "b𝛃", 1));
2397 assert!(testlook!(look, "𝛃 ", 5));
2398 assert!(testlook!(look, " 𝛃 ", 0));
2399 assert!(testlook!(look, " 𝛃 ", 6));
2400 assert!(!testlook!(look, "𝛃", 1));
2401 assert!(!testlook!(look, "𝛃", 2));
2402 assert!(!testlook!(look, "𝛃", 3));
2403
2404 // Non word boundaries with non-ASCII codepoints.
2405 assert!(!testlook!(look, "𝛃𐆀", 1));
2406 assert!(!testlook!(look, "𝛃𐆀", 2));
2407 assert!(!testlook!(look, "𝛃𐆀", 3));
2408 assert!(!testlook!(look, "𝛃𐆀", 5));
2409 assert!(!testlook!(look, "𝛃𐆀", 6));
2410 assert!(!testlook!(look, "𝛃𐆀", 7));
2411 assert!(testlook!(look, "𝛃𐆀", 8));
2412 }
2413
2414 #[test]
2415 fn look_set() {
2416 let mut f = LookSet::default();
2417 assert!(!f.contains(Look::Start));
2418 assert!(!f.contains(Look::End));
2419 assert!(!f.contains(Look::StartLF));
2420 assert!(!f.contains(Look::EndLF));
2421 assert!(!f.contains(Look::WordUnicode));
2422 assert!(!f.contains(Look::WordUnicodeNegate));
2423 assert!(!f.contains(Look::WordAscii));
2424 assert!(!f.contains(Look::WordAsciiNegate));
2425
2426 f = f.insert(Look::Start);
2427 assert!(f.contains(Look::Start));
2428 f = f.remove(Look::Start);
2429 assert!(!f.contains(Look::Start));
2430
2431 f = f.insert(Look::End);
2432 assert!(f.contains(Look::End));
2433 f = f.remove(Look::End);
2434 assert!(!f.contains(Look::End));
2435
2436 f = f.insert(Look::StartLF);
2437 assert!(f.contains(Look::StartLF));
2438 f = f.remove(Look::StartLF);
2439 assert!(!f.contains(Look::StartLF));
2440
2441 f = f.insert(Look::EndLF);
2442 assert!(f.contains(Look::EndLF));
2443 f = f.remove(Look::EndLF);
2444 assert!(!f.contains(Look::EndLF));
2445
2446 f = f.insert(Look::StartCRLF);
2447 assert!(f.contains(Look::StartCRLF));
2448 f = f.remove(Look::StartCRLF);
2449 assert!(!f.contains(Look::StartCRLF));
2450
2451 f = f.insert(Look::EndCRLF);
2452 assert!(f.contains(Look::EndCRLF));
2453 f = f.remove(Look::EndCRLF);
2454 assert!(!f.contains(Look::EndCRLF));
2455
2456 f = f.insert(Look::WordUnicode);
2457 assert!(f.contains(Look::WordUnicode));
2458 f = f.remove(Look::WordUnicode);
2459 assert!(!f.contains(Look::WordUnicode));
2460
2461 f = f.insert(Look::WordUnicodeNegate);
2462 assert!(f.contains(Look::WordUnicodeNegate));
2463 f = f.remove(Look::WordUnicodeNegate);
2464 assert!(!f.contains(Look::WordUnicodeNegate));
2465
2466 f = f.insert(Look::WordAscii);
2467 assert!(f.contains(Look::WordAscii));
2468 f = f.remove(Look::WordAscii);
2469 assert!(!f.contains(Look::WordAscii));
2470
2471 f = f.insert(Look::WordAsciiNegate);
2472 assert!(f.contains(Look::WordAsciiNegate));
2473 f = f.remove(Look::WordAsciiNegate);
2474 assert!(!f.contains(Look::WordAsciiNegate));
2475
2476 f = f.insert(Look::WordStartAscii);
2477 assert!(f.contains(Look::WordStartAscii));
2478 f = f.remove(Look::WordStartAscii);
2479 assert!(!f.contains(Look::WordStartAscii));
2480
2481 f = f.insert(Look::WordEndAscii);
2482 assert!(f.contains(Look::WordEndAscii));
2483 f = f.remove(Look::WordEndAscii);
2484 assert!(!f.contains(Look::WordEndAscii));
2485
2486 f = f.insert(Look::WordStartUnicode);
2487 assert!(f.contains(Look::WordStartUnicode));
2488 f = f.remove(Look::WordStartUnicode);
2489 assert!(!f.contains(Look::WordStartUnicode));
2490
2491 f = f.insert(Look::WordEndUnicode);
2492 assert!(f.contains(Look::WordEndUnicode));
2493 f = f.remove(Look::WordEndUnicode);
2494 assert!(!f.contains(Look::WordEndUnicode));
2495
2496 f = f.insert(Look::WordStartHalfAscii);
2497 assert!(f.contains(Look::WordStartHalfAscii));
2498 f = f.remove(Look::WordStartHalfAscii);
2499 assert!(!f.contains(Look::WordStartHalfAscii));
2500
2501 f = f.insert(Look::WordEndHalfAscii);
2502 assert!(f.contains(Look::WordEndHalfAscii));
2503 f = f.remove(Look::WordEndHalfAscii);
2504 assert!(!f.contains(Look::WordEndHalfAscii));
2505
2506 f = f.insert(Look::WordStartHalfUnicode);
2507 assert!(f.contains(Look::WordStartHalfUnicode));
2508 f = f.remove(Look::WordStartHalfUnicode);
2509 assert!(!f.contains(Look::WordStartHalfUnicode));
2510
2511 f = f.insert(Look::WordEndHalfUnicode);
2512 assert!(f.contains(Look::WordEndHalfUnicode));
2513 f = f.remove(Look::WordEndHalfUnicode);
2514 assert!(!f.contains(Look::WordEndHalfUnicode));
2515 }
2516
2517 #[test]
2518 fn look_set_iter() {
2519 let set = LookSet::empty();
2520 assert_eq!(0, set.iter().count());
2521
2522 let set = LookSet::full();
2523 assert_eq!(18, set.iter().count());
2524
2525 let set =
2526 LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
2527 assert_eq!(2, set.iter().count());
2528
2529 let set = LookSet::empty().insert(Look::StartLF);
2530 assert_eq!(1, set.iter().count());
2531
2532 let set = LookSet::empty().insert(Look::WordAsciiNegate);
2533 assert_eq!(1, set.iter().count());
2534
2535 let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
2536 assert_eq!(1, set.iter().count());
2537 }
2538
2539 #[test]
2540 #[cfg(feature = "alloc")]
2541 fn look_set_debug() {
2542 let res = alloc::format!("{:?}", LookSet::empty());
2543 assert_eq!("∅", res);
2544 let res = alloc::format!("{:?}", LookSet::full());
2545 assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
2546 }
2547}
2548