/*!
Types and routines for working with look-around assertions.

This module principally defines three types:

* [`Look`] enumerates all of the assertions supported by this crate.
* [`LookSet`] provides a way to efficiently store a set of [`Look`] values.
* [`LookMatcher`] provides routines for checking whether a `Look` or a
`LookSet` matches at a particular position in a haystack.
*/

// LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` was basically
// copied verbatim from the regex-syntax crate. I would have no problems using
// the regex-syntax types and defining the matching routines (only found
// in this crate) as free functions, except the `Look` and `LookSet` types
// are used in lots of places. Including in places we expect to work when
// regex-syntax is *not* enabled, such as in the definition of the NFA itself.
//
// Thankfully the code we copy is pretty simple and there isn't much of it.
// Otherwise, the rest of this module deals with *matching* the assertions,
// which is not something that regex-syntax handles.

use crate::util::{escape::DebugByte, utf8};

/// A look-around assertion.
///
/// An assertion matches at a position between characters in a haystack.
/// Namely, it does not actually "consume" any input, unlike most other parts
/// of a regular expression. Assertions are a way of stating that some
/// property must be true at a particular point during matching.
///
/// For example, `(?m)^[a-z]+$` is a pattern that:
///
/// * Scans the haystack for a position at which `(?m:^)` is satisfied. That
/// occurs at either the beginning of the haystack, or immediately following
/// a `\n` character.
/// * Looks for one or more occurrences of `[a-z]`.
/// * Once `[a-z]+` has matched as much as it can, an overall match is only
/// reported when `[a-z]+` stops just before a `\n`.
///
/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.
///
/// Assertions are also called "look-around," "look-behind" and "look-ahead."
/// Specifically, some assertions are look-behind (like `^`), other assertions
/// are look-ahead (like `$`) and yet other assertions are both look-ahead and
/// look-behind (like `\b`).
///
/// # Assertions in an NFA
///
/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be
/// thought of as a conditional epsilon transition. That is, a matching engine
/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits
/// moving through conditional epsilon transitions when their condition
/// is satisfied at whatever position the `PikeVM` is currently at in the
/// haystack.
///
/// How assertions are handled in a `DFA` is trickier, since a DFA does not
/// have epsilon transitions at all. In this case, they are compiled into the
/// automaton itself, at the expense of more states than what would be required
/// without an assertion.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Look {
    /// Match the beginning of text. Specifically, this matches at the starting
    /// position of the input.
    Start = 1 << 0,
    /// Match the end of text. Specifically, this matches at the ending
    /// position of the input.
    End = 1 << 1,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following a `\n` character.
    StartLF = 1 << 2,
    /// Match the end of a line or the end of text. Specifically, this matches
    /// at the end position of the input, or at the position immediately
    /// preceding a `\n` character.
    EndLF = 1 << 3,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following either a `\r` or `\n` character, but never after
    /// a `\r` when a `\n` follows.
    StartCRLF = 1 << 4,
    /// Match the end of a line or the end of text. Specifically, this matches
    /// at the end position of the input, or at the position immediately
    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
    /// precedes it.
    EndCRLF = 1 << 5,
    /// Match an ASCII-only word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    WordAscii = 1 << 6,
    /// Match an ASCII-only negation of a word boundary.
    WordAsciiNegate = 1 << 7,
    /// Match a Unicode-aware word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    WordUnicode = 1 << 8,
    /// Match a Unicode-aware negation of a word boundary.
    WordUnicodeNegate = 1 << 9,
    /// Match the start of an ASCII-only word boundary. That is, this matches a
    /// position at either the beginning of the haystack or where the previous
    /// character is not a word character and the following character is a word
    /// character.
    WordStartAscii = 1 << 10,
    /// Match the end of an ASCII-only word boundary. That is, this matches
    /// a position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a word
    /// character.
    WordEndAscii = 1 << 11,
    /// Match the start of a Unicode word boundary. That is, this matches a
    /// position at either the beginning of the haystack or where the previous
    /// character is not a word character and the following character is a word
    /// character.
    WordStartUnicode = 1 << 12,
    /// Match the end of a Unicode word boundary. That is, this matches a
    /// position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a word
    /// character.
    WordEndUnicode = 1 << 13,
    /// Match the start half of an ASCII-only word boundary. That is, this
    /// matches a position at either the beginning of the haystack or where the
    /// previous character is not a word character.
    WordStartHalfAscii = 1 << 14,
    /// Match the end half of an ASCII-only word boundary. That is, this
    /// matches a position at either the end of the haystack or where the
    /// following character is not a word character.
    WordEndHalfAscii = 1 << 15,
    /// Match the start half of a Unicode word boundary. That is, this matches
    /// a position at either the beginning of the haystack or where the
    /// previous character is not a word character.
    WordStartHalfUnicode = 1 << 16,
    /// Match the end half of a Unicode word boundary. That is, this matches
    /// a position at either the end of the haystack or where the following
    /// character is not a word character.
    WordEndHalfUnicode = 1 << 17,
}

impl Look {
    /// Flip the look-around assertion to its equivalent for reverse searches.
    /// For example, `StartLF` gets translated to `EndLF`.
    ///
    /// Some assertions, such as `WordUnicode`, remain the same since they
    /// match the same positions regardless of the direction of the search.
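    ///
    /// # Example
    ///
    /// A small illustrative example, added here and not taken from the
    /// crate's original documentation:
    ///
    /// ```
    /// use regex_automata::util::look::Look;
    ///
    /// assert_eq!(Look::EndLF, Look::StartLF.reversed());
    /// assert_eq!(Look::WordUnicode, Look::WordUnicode.reversed());
    /// assert_eq!(Look::Start, Look::Start.reversed().reversed());
    /// ```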
    #[inline]
    pub const fn reversed(self) -> Look {
        match self {
            Look::Start => Look::End,
            Look::End => Look::Start,
            Look::StartLF => Look::EndLF,
            Look::EndLF => Look::StartLF,
            Look::StartCRLF => Look::EndCRLF,
            Look::EndCRLF => Look::StartCRLF,
            Look::WordAscii => Look::WordAscii,
            Look::WordAsciiNegate => Look::WordAsciiNegate,
            Look::WordUnicode => Look::WordUnicode,
            Look::WordUnicodeNegate => Look::WordUnicodeNegate,
            Look::WordStartAscii => Look::WordEndAscii,
            Look::WordEndAscii => Look::WordStartAscii,
            Look::WordStartUnicode => Look::WordEndUnicode,
            Look::WordEndUnicode => Look::WordStartUnicode,
            Look::WordStartHalfAscii => Look::WordEndHalfAscii,
            Look::WordEndHalfAscii => Look::WordStartHalfAscii,
            Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
            Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
        }
    }

    /// Return the underlying representation of this look-around enumeration
    /// as an integer. Giving the return value to the [`Look::from_repr`]
    /// constructor is guaranteed to return the same look-around variant that
    /// one started with within a semver compatible release of this crate.
    #[inline]
    pub const fn as_repr(self) -> u32 {
        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
        // actual int.
        self as u32
    }

    /// Given the underlying representation of a `Look` value, return the
    /// corresponding `Look` value if the representation is valid. Otherwise
    /// `None` is returned.
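    ///
    /// # Example
    ///
    /// An illustrative round trip, added here and not taken from the
    /// original documentation:
    ///
    /// ```
    /// use regex_automata::util::look::Look;
    ///
    /// let repr = Look::WordAscii.as_repr();
    /// assert_eq!(Some(Look::WordAscii), Look::from_repr(repr));
    /// // Only representations with exactly one recognized bit set are valid.
    /// assert_eq!(None, Look::from_repr(0));
    /// assert_eq!(None, Look::from_repr(3));
    /// ```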
    #[inline]
    pub const fn from_repr(repr: u32) -> Option<Look> {
        match repr {
            0b00_0000_0000_0000_0001 => Some(Look::Start),
            0b00_0000_0000_0000_0010 => Some(Look::End),
            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
            _ => None,
        }
    }

    /// Returns a convenient single codepoint representation of this
    /// look-around assertion. Each assertion is guaranteed to be represented
    /// by a distinct character.
    ///
    /// This is useful for succinctly representing a look-around assertion in
    /// human friendly output intended for a programmer working on regex
    /// internals.
    #[inline]
    pub const fn as_char(self) -> char {
        match self {
            Look::Start => 'A',
            Look::End => 'z',
            Look::StartLF => '^',
            Look::EndLF => '$',
            Look::StartCRLF => 'r',
            Look::EndCRLF => 'R',
            Look::WordAscii => 'b',
            Look::WordAsciiNegate => 'B',
            Look::WordUnicode => '𝛃',
            Look::WordUnicodeNegate => '𝚩',
            Look::WordStartAscii => '<',
            Look::WordEndAscii => '>',
            Look::WordStartUnicode => '〈',
            Look::WordEndUnicode => '〉',
            Look::WordStartHalfAscii => '◁',
            Look::WordEndHalfAscii => '▷',
            Look::WordStartHalfUnicode => '◀',
            Look::WordEndHalfUnicode => '▶',
        }
    }
}

/// LookSet is a memory-efficient set of look-around assertions.
///
/// This is useful for efficiently tracking look-around assertions. For
/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties
/// that return `LookSet`s.
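///
/// # Example
///
/// An illustrative example of basic set operations, added here and not taken
/// from the original documentation:
///
/// ```
/// use regex_automata::util::look::{Look, LookSet};
///
/// let set = LookSet::empty().insert(Look::StartLF).insert(Look::EndLF);
/// assert_eq!(2, set.len());
/// assert!(set.contains(Look::StartLF));
/// assert!(!set.contains(Look::WordUnicode));
/// assert!(set.contains_anchor_lf());
/// ```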
#[derive(Clone, Copy, Default, Eq, PartialEq)]
pub struct LookSet {
    /// The underlying representation of this set is exposed to make it
    /// possible to store it somewhere efficiently. The representation is that
    /// of a bitset, where each assertion occupies bit `i` where
    /// `i = Look::as_repr()`.
    ///
    /// Note that users of this internal representation must permit the full
    /// range of `u32` values to be represented. For example, even if the
    /// current implementation only makes use of the 18 least significant bits,
    /// it may use more bits in a future semver compatible release.
    pub bits: u32,
}

impl LookSet {
    /// Create an empty set of look-around assertions.
    #[inline]
    pub fn empty() -> LookSet {
        LookSet { bits: 0 }
    }

    /// Create a full set of look-around assertions.
    ///
    /// This set contains all possible look-around assertions.
    #[inline]
    pub fn full() -> LookSet {
        LookSet { bits: !0 }
    }

    /// Create a look-around set containing the look-around assertion given.
    ///
    /// This is a convenience routine for creating an empty set and inserting
    /// one look-around assertion.
    #[inline]
    pub fn singleton(look: Look) -> LookSet {
        LookSet::empty().insert(look)
    }

    /// Returns the total number of look-around assertions in this set.
    #[inline]
    pub fn len(self) -> usize {
        // OK because max value always fits in a u8, which in turn always
        // fits in a usize, regardless of target.
        usize::try_from(self.bits.count_ones()).unwrap()
    }

    /// Returns true if and only if this set is empty.
    #[inline]
    pub fn is_empty(self) -> bool {
        self.len() == 0
    }

    /// Returns true if and only if the given look-around assertion is in this
    /// set.
    #[inline]
    pub fn contains(self, look: Look) -> bool {
        self.bits & look.as_repr() != 0
    }

    /// Returns true if and only if this set contains any anchor assertions.
    /// This includes both "start/end of haystack" and "start/end of line."
    #[inline]
    pub fn contains_anchor(&self) -> bool {
        self.contains_anchor_haystack() || self.contains_anchor_line()
    }

    /// Returns true if and only if this set contains any "start/end of
    /// haystack" anchors. This doesn't include "start/end of line" anchors.
    #[inline]
    pub fn contains_anchor_haystack(&self) -> bool {
        self.contains(Look::Start) || self.contains(Look::End)
    }

    /// Returns true if and only if this set contains any "start/end of line"
    /// anchors. This doesn't include "start/end of haystack" anchors. This
    /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
    #[inline]
    pub fn contains_anchor_line(&self) -> bool {
        self.contains(Look::StartLF)
            || self.contains(Look::EndLF)
            || self.contains(Look::StartCRLF)
            || self.contains(Look::EndCRLF)
    }

    /// Returns true if and only if this set contains any "start/end of line"
    /// anchors that only treat `\n` as line terminators. This does not include
    /// haystack anchors or CRLF aware line anchors.
    #[inline]
    pub fn contains_anchor_lf(&self) -> bool {
        self.contains(Look::StartLF) || self.contains(Look::EndLF)
    }

    /// Returns true if and only if this set contains any "start/end of line"
    /// anchors that are CRLF-aware. This doesn't include "start/end of
    /// haystack" or "start/end of line-feed" anchors.
    #[inline]
    pub fn contains_anchor_crlf(&self) -> bool {
        self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
    }

    /// Returns true if and only if this set contains any word boundary or
    /// negated word boundary assertions. This includes both Unicode and ASCII
    /// word boundaries.
    #[inline]
    pub fn contains_word(self) -> bool {
        self.contains_word_unicode() || self.contains_word_ascii()
    }

    /// Returns true if and only if this set contains any Unicode word boundary
    /// or negated Unicode word boundary assertions.
    #[inline]
    pub fn contains_word_unicode(self) -> bool {
        self.contains(Look::WordUnicode)
            || self.contains(Look::WordUnicodeNegate)
            || self.contains(Look::WordStartUnicode)
            || self.contains(Look::WordEndUnicode)
            || self.contains(Look::WordStartHalfUnicode)
            || self.contains(Look::WordEndHalfUnicode)
    }

    /// Returns true if and only if this set contains any ASCII word boundary
    /// or negated ASCII word boundary assertions.
    #[inline]
    pub fn contains_word_ascii(self) -> bool {
        self.contains(Look::WordAscii)
            || self.contains(Look::WordAsciiNegate)
            || self.contains(Look::WordStartAscii)
            || self.contains(Look::WordEndAscii)
            || self.contains(Look::WordStartHalfAscii)
            || self.contains(Look::WordEndHalfAscii)
    }

    /// Returns an iterator over all of the look-around assertions in this set.
    #[inline]
    pub fn iter(self) -> LookSetIter {
        LookSetIter { set: self }
    }

    /// Return a new set that is equivalent to the original, but with the given
    /// assertion added to it. If the assertion is already in the set, then the
    /// returned set is equivalent to the original.
    #[inline]
    pub fn insert(self, look: Look) -> LookSet {
        LookSet { bits: self.bits | look.as_repr() }
    }

    /// Updates this set in place with the result of inserting the given
    /// assertion into this set.
    #[inline]
    pub fn set_insert(&mut self, look: Look) {
        *self = self.insert(look);
    }

    /// Return a new set that is equivalent to the original, but with the given
    /// assertion removed from it. If the assertion is not in the set, then the
    /// returned set is equivalent to the original.
    #[inline]
    pub fn remove(self, look: Look) -> LookSet {
        LookSet { bits: self.bits & !look.as_repr() }
    }

    /// Updates this set in place with the result of removing the given
    /// assertion from this set.
    #[inline]
    pub fn set_remove(&mut self, look: Look) {
        *self = self.remove(look);
    }

    /// Returns a new set that is the result of subtracting the given set from
    /// this set.
    #[inline]
    pub fn subtract(self, other: LookSet) -> LookSet {
        LookSet { bits: self.bits & !other.bits }
    }

    /// Updates this set in place with the result of subtracting the given set
    /// from this set.
    #[inline]
    pub fn set_subtract(&mut self, other: LookSet) {
        *self = self.subtract(other);
    }

    /// Returns a new set that is the union of this and the one given.
    #[inline]
    pub fn union(self, other: LookSet) -> LookSet {
        LookSet { bits: self.bits | other.bits }
    }

    /// Updates this set in place with the result of unioning it with the one
    /// given.
    #[inline]
    pub fn set_union(&mut self, other: LookSet) {
        *self = self.union(other);
    }

    /// Returns a new set that is the intersection of this and the one given.
    #[inline]
    pub fn intersect(self, other: LookSet) -> LookSet {
        LookSet { bits: self.bits & other.bits }
    }

    /// Updates this set in place with the result of intersecting it with the
    /// one given.
    #[inline]
    pub fn set_intersect(&mut self, other: LookSet) {
        *self = self.intersect(other);
    }

    /// Return a `LookSet` from the slice given as a native endian 32-bit
    /// integer.
    ///
    /// # Panics
    ///
    /// This panics if `slice.len() < 4`.
    #[inline]
    pub fn read_repr(slice: &[u8]) -> LookSet {
        let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
        LookSet { bits }
    }

    /// Write a `LookSet` as a native endian 32-bit integer to the beginning
    /// of the slice given.
    ///
    /// # Panics
    ///
    /// This panics if `slice.len() < 4`.
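    ///
    /// # Example
    ///
    /// An illustrative round trip through the byte representation, added
    /// here and not taken from the original documentation:
    ///
    /// ```
    /// use regex_automata::util::look::{Look, LookSet};
    ///
    /// let set = LookSet::singleton(Look::WordAscii).insert(Look::End);
    /// let mut buf = [0u8; 4];
    /// set.write_repr(&mut buf);
    /// assert_eq!(set, LookSet::read_repr(&buf));
    /// ```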
    #[inline]
    pub fn write_repr(self, slice: &mut [u8]) {
        let raw = self.bits.to_ne_bytes();
        slice[0] = raw[0];
        slice[1] = raw[1];
        slice[2] = raw[2];
        slice[3] = raw[3];
    }

    /// Checks that all assertions in this set can be matched.
    ///
    /// Some assertions, such as Unicode word boundaries, require optional (but
    /// enabled by default) tables that may not be available. If there are
    /// assertions in this set that require tables that are not available, then
    /// this will return an error.
    ///
    /// Specifically, this returns an error when the
    /// `unicode-word-boundary` feature is _not_ enabled _and_ this set
    /// contains a Unicode word boundary assertion.
    ///
    /// It can be useful to use this on the result of
    /// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)
    /// when building a matcher engine to ensure methods like
    /// [`LookMatcher::matches_set`] do not panic at search time.
    pub fn available(self) -> Result<(), UnicodeWordBoundaryError> {
        if self.contains_word_unicode() {
            UnicodeWordBoundaryError::check()?;
        }
        Ok(())
    }
}

impl core::fmt::Debug for LookSet {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        if self.is_empty() {
            return write!(f, "∅");
        }
        for look in self.iter() {
            write!(f, "{}", look.as_char())?;
        }
        Ok(())
    }
}

/// An iterator over all look-around assertions in a [`LookSet`].
///
/// This iterator is created by [`LookSet::iter`].
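///
/// # Example
///
/// An illustrative example, added here and not taken from the original
/// documentation. Assertions are yielded from the lowest bit to the highest:
///
/// ```
/// use regex_automata::util::look::{Look, LookSet};
///
/// let set = LookSet::empty().insert(Look::Start).insert(Look::WordUnicode);
/// let looks: Vec<Look> = set.iter().collect();
/// assert_eq!(looks, vec![Look::Start, Look::WordUnicode]);
/// ```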
#[derive(Clone, Debug)]
pub struct LookSetIter {
    set: LookSet,
}

impl Iterator for LookSetIter {
    type Item = Look;

    #[inline]
    fn next(&mut self) -> Option<Look> {
        if self.set.is_empty() {
            return None;
        }
        // We'll never have more than u8::MAX distinct look-around assertions,
        // so 'bit' will always fit into a u16.
        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
        let look = Look::from_repr(1 << bit)?;
        self.set = self.set.remove(look);
        Some(look)
    }
}

/// A matcher for look-around assertions.
///
/// This matcher permits configuring aspects of how look-around assertions are
/// matched.
///
/// # Example
///
/// A `LookMatcher` can change the line terminator used for matching multi-line
/// anchors such as `(?m:^)` and `(?m:$)`.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{self, pikevm::PikeVM},
///     util::look::LookMatcher,
///     Match, Input,
/// };
///
/// let mut lookm = LookMatcher::new();
/// lookm.set_line_terminator(b'\x00');
///
/// let re = PikeVM::builder()
///     .thompson(thompson::Config::new().look_matcher(lookm))
///     .build(r"(?m)^[a-z]+$")?;
/// let mut cache = re.create_cache();
///
/// // Multi-line assertions now use NUL as a terminator.
/// assert_eq!(
///     Some(Match::must(0, 1..4)),
///     re.find(&mut cache, b"\x00abc\x00"),
/// );
/// // ... and \n is no longer recognized as a terminator.
/// assert_eq!(
///     None,
///     re.find(&mut cache, b"\nabc\n"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct LookMatcher {
    lineterm: DebugByte,
}

impl LookMatcher {
    /// Creates a new default matcher for look-around assertions.
    pub fn new() -> LookMatcher {
        LookMatcher { lineterm: DebugByte(b'\n') }
    }

    /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.
    ///
    /// Namely, instead of `^` matching after `\n` and `$` matching immediately
    /// before a `\n`, this will cause it to match after and before the byte
    /// given.
    ///
    /// It can occasionally be useful to use this to configure the line
    /// terminator to the NUL byte when searching binary data.
    ///
    /// Note that this does not apply to CRLF-aware line anchors such as
    /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to
    /// use `\r` and `\n`.
    pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher {
        self.lineterm.0 = byte;
        self
    }

    /// Returns the line terminator that was configured for this matcher.
    ///
    /// If no line terminator was configured, then this returns `\n`.
    ///
    /// Note that the line terminator should only be used for matching `(?m:^)`
    /// and `(?m:$)` assertions. It specifically should _not_ be used for
    /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`.
    pub fn get_line_terminator(&self) -> u8 {
        self.lineterm.0
    }

    /// Returns true when the position `at` in `haystack` satisfies the given
    /// look-around assertion.
    ///
    /// # Panics
    ///
    /// This panics when testing any Unicode word boundary assertion in this
    /// set and when the Unicode word data is not available. Specifically, this
    /// only occurs when the `unicode-word-boundary` feature is not enabled.
    ///
    /// Since it's generally expected that this routine is called inside of
    /// a matching engine, callers should check the error condition when
    /// building the matching engine. If there is a Unicode word boundary
    /// in the matcher and the data isn't available, then the matcher should
    /// fail to build.
    ///
    /// Callers can check the error condition with [`LookSet::available`].
    ///
    /// This also may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
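    ///
    /// # Example
    ///
    /// An illustrative example using the default matcher, added here and not
    /// taken from the original documentation:
    ///
    /// ```
    /// use regex_automata::util::look::{Look, LookMatcher};
    ///
    /// let lookm = LookMatcher::new();
    /// assert!(lookm.matches(Look::Start, b"abc", 0));
    /// assert!(!lookm.matches(Look::Start, b"abc", 1));
    /// assert!(lookm.matches(Look::WordAscii, b"abc", 3));
    /// ```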
    #[inline]
    pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool {
        self.matches_inline(look, haystack, at)
    }

    /// Like `matches`, but forcefully inlined.
    ///
    /// # Panics
    ///
    /// This panics when testing any Unicode word boundary assertion in this
    /// set and when the Unicode word data is not available. Specifically, this
    /// only occurs when the `unicode-word-boundary` feature is not enabled.
    ///
    /// Since it's generally expected that this routine is called inside of
    /// a matching engine, callers should check the error condition when
    /// building the matching engine. If there is a Unicode word boundary
    /// in the matcher and the data isn't available, then the matcher should
    /// fail to build.
    ///
    /// Callers can check the error condition with [`LookSet::available`].
    ///
    /// This also may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn matches_inline(
        &self,
        look: Look,
        haystack: &[u8],
        at: usize,
    ) -> bool {
        match look {
            Look::Start => self.is_start(haystack, at),
            Look::End => self.is_end(haystack, at),
            Look::StartLF => self.is_start_lf(haystack, at),
            Look::EndLF => self.is_end_lf(haystack, at),
            Look::StartCRLF => self.is_start_crlf(haystack, at),
            Look::EndCRLF => self.is_end_crlf(haystack, at),
            Look::WordAscii => self.is_word_ascii(haystack, at),
            Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at),
            Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(),
            Look::WordUnicodeNegate => {
                self.is_word_unicode_negate(haystack, at).unwrap()
            }
            Look::WordStartAscii => self.is_word_start_ascii(haystack, at),
            Look::WordEndAscii => self.is_word_end_ascii(haystack, at),
            Look::WordStartUnicode => {
                self.is_word_start_unicode(haystack, at).unwrap()
            }
            Look::WordEndUnicode => {
                self.is_word_end_unicode(haystack, at).unwrap()
            }
            Look::WordStartHalfAscii => {
                self.is_word_start_half_ascii(haystack, at)
            }
            Look::WordEndHalfAscii => {
                self.is_word_end_half_ascii(haystack, at)
            }
            Look::WordStartHalfUnicode => {
                self.is_word_start_half_unicode(haystack, at).unwrap()
            }
            Look::WordEndHalfUnicode => {
                self.is_word_end_half_unicode(haystack, at).unwrap()
            }
        }
    }

    /// Returns true when _all_ of the assertions in the given set match at the
    /// given position in the haystack.
    ///
    /// # Panics
    ///
    /// This panics when testing any Unicode word boundary assertion in this
    /// set and when the Unicode word data is not available. Specifically, this
    /// only occurs when the `unicode-word-boundary` feature is not enabled.
    ///
    /// Since it's generally expected that this routine is called inside of
    /// a matching engine, callers should check the error condition when
    /// building the matching engine. If there is a Unicode word boundary
    /// in the matcher and the data isn't available, then the matcher should
    /// fail to build.
    ///
    /// Callers can check the error condition with [`LookSet::available`].
    ///
    /// This also may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
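    ///
    /// # Example
    ///
    /// An illustrative example, added here and not taken from the original
    /// documentation:
    ///
    /// ```
    /// use regex_automata::util::look::{Look, LookMatcher, LookSet};
    ///
    /// let lookm = LookMatcher::new();
    /// let set = LookSet::empty().insert(Look::Start).insert(Look::StartLF);
    /// // Both assertions hold at the beginning of the haystack...
    /// assert!(lookm.matches_set(set, b"abc", 0));
    /// // ... but not at a position in the middle of the haystack.
    /// assert!(!lookm.matches_set(set, b"abc", 1));
    /// ```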
    #[inline]
    pub fn matches_set(
        &self,
        set: LookSet,
        haystack: &[u8],
        at: usize,
    ) -> bool {
        self.matches_set_inline(set, haystack, at)
    }

    /// Like `matches_set`, but forcefully inlined for perf.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn matches_set_inline(
        &self,
        set: LookSet,
        haystack: &[u8],
        at: usize,
    ) -> bool {
        // This used to use LookSet::iter with Look::matches on each element,
        // but that proved to be quite disastrous for perf. The manual "if
        // the set has this assertion, check it" turns out to be quite a bit
        // faster.
        if set.contains(Look::Start) {
            if !self.is_start(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::End) {
            if !self.is_end(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::StartLF) {
            if !self.is_start_lf(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::EndLF) {
            if !self.is_end_lf(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::StartCRLF) {
            if !self.is_start_crlf(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::EndCRLF) {
            if !self.is_end_crlf(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::WordAscii) {
            if !self.is_word_ascii(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::WordAsciiNegate) {
            if !self.is_word_ascii_negate(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::WordUnicode) {
            if !self.is_word_unicode(haystack, at).unwrap() {
                return false;
            }
        }
        if set.contains(Look::WordUnicodeNegate) {
            if !self.is_word_unicode_negate(haystack, at).unwrap() {
                return false;
            }
        }
        if set.contains(Look::WordStartAscii) {
            if !self.is_word_start_ascii(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::WordEndAscii) {
            if !self.is_word_end_ascii(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::WordStartUnicode) {
            if !self.is_word_start_unicode(haystack, at).unwrap() {
                return false;
            }
        }
        if set.contains(Look::WordEndUnicode) {
            if !self.is_word_end_unicode(haystack, at).unwrap() {
                return false;
            }
        }
        if set.contains(Look::WordStartHalfAscii) {
            if !self.is_word_start_half_ascii(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::WordEndHalfAscii) {
            if !self.is_word_end_half_ascii(haystack, at) {
                return false;
            }
        }
        if set.contains(Look::WordStartHalfUnicode) {
            if !self.is_word_start_half_unicode(haystack, at).unwrap() {
                return false;
            }
        }
        if set.contains(Look::WordEndHalfUnicode) {
            if !self.is_word_end_half_unicode(haystack, at).unwrap() {
                return false;
            }
        }
        true
    }

    /// Split up the given byte classes into equivalence classes in a way that
    /// is consistent with this look-around assertion.
    #[cfg(feature = "alloc")]
    pub(crate) fn add_to_byteset(
        &self,
        look: Look,
        set: &mut crate::util::alphabet::ByteClassSet,
    ) {
        match look {
            Look::Start | Look::End => {}
            Look::StartLF | Look::EndLF => {
                set.set_range(self.lineterm.0, self.lineterm.0);
            }
            Look::StartCRLF | Look::EndCRLF => {
                set.set_range(b'\r', b'\r');
                set.set_range(b'\n', b'\n');
            }
            Look::WordAscii
            | Look::WordAsciiNegate
            | Look::WordUnicode
            | Look::WordUnicodeNegate
            | Look::WordStartAscii
            | Look::WordEndAscii
            | Look::WordStartUnicode
            | Look::WordEndUnicode
            | Look::WordStartHalfAscii
            | Look::WordEndHalfAscii
            | Look::WordStartHalfUnicode
            | Look::WordEndHalfUnicode => {
                // We need to mark all ranges of bytes whose pairs result in
                // evaluating \b differently. This isn't technically correct
                // for Unicode word boundaries, but DFAs can't handle those
                // anyway, and thus, the byte classes don't need to either
                // since they are themselves only used in DFAs.
                //
                // FIXME: It seems like the calls to 'set_range' here are
                // completely invariant, which means we could just hard-code
                // them here without needing to write a loop. And we only need
                // to do this dance at most once per regex.
                //
                // FIXME: Is this correct for \B?
                let iswb = utf8::is_word_byte;
                // This unwrap is OK because we guard every use of 'asu8' with
                // a check that the input is <= 255.
                let asu8 = |b: u16| u8::try_from(b).unwrap();
                let mut b1: u16 = 0;
                let mut b2: u16;
                while b1 <= 255 {
                    b2 = b1 + 1;
                    while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) {
                        b2 += 1;
                    }
                    // The guards above guarantee that b2 can never get any
                    // bigger.
                    assert!(b2 <= 256);
                    // Subtracting 1 from b2 is always OK because it is always
                    // at least 1 greater than b1, and the assert above
                    // guarantees that the asu8 conversion will succeed.
                    set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap()));
                    b1 = b2;
                }
            }
        }
    }

    /// Returns true when [`Look::Start`] is satisfied `at` the given position
    /// in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool {
        at == 0
    }

    /// Returns true when [`Look::End`] is satisfied `at` the given position in
    /// `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_end(&self, haystack: &[u8], at: usize) -> bool {
        at == haystack.len()
    }

    /// Returns true when [`Look::StartLF`] is satisfied `at` the given
    /// position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool {
        self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0
    }

    /// Returns true when [`Look::EndLF`] is satisfied `at` the given position
    /// in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool {
        self.is_end(haystack, at) || haystack[at] == self.lineterm.0
    }

    /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given
    /// position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
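    ///
    /// # Example
    ///
    /// An illustrative example, added here and not taken from the original
    /// documentation:
    ///
    /// ```
    /// use regex_automata::util::look::LookMatcher;
    ///
    /// let lookm = LookMatcher::new();
    /// // Satisfied immediately after a \r\n pair...
    /// assert!(lookm.is_start_crlf(b"a\r\nb", 3));
    /// // ... but not between the \r and the \n.
    /// assert!(!lookm.is_start_crlf(b"a\r\nb", 2));
    /// ```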
    #[inline]
    pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool {
        self.is_start(haystack, at)
            || haystack[at - 1] == b'\n'
            || (haystack[at - 1] == b'\r'
                && (at >= haystack.len() || haystack[at] != b'\n'))
    }

    /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given
    /// position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool {
        self.is_end(haystack, at)
            || haystack[at] == b'\r'
            || (haystack[at] == b'\n'
                && (at == 0 || haystack[at - 1] != b'\r'))
    }

    /// Returns true when [`Look::WordAscii`] is satisfied `at` the given
    /// position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
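    ///
    /// # Example
    ///
    /// An illustrative example, added here and not taken from the original
    /// documentation:
    ///
    /// ```
    /// use regex_automata::util::look::LookMatcher;
    ///
    /// let lookm = LookMatcher::new();
    /// // There is a word boundary between 'b' and the space...
    /// assert!(lookm.is_word_ascii(b"ab cd", 2));
    /// // ... but not between 'a' and 'b'.
    /// assert!(!lookm.is_word_ascii(b"ab", 1));
    /// ```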
    #[inline]
    pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool {
        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
        let word_after =
            at < haystack.len() && utf8::is_word_byte(haystack[at]);
        word_before != word_after
    }

    /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given
    /// position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool {
        !self.is_word_ascii(haystack, at)
    }

    /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given
    /// position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    ///
    /// # Errors
    ///
    /// This returns an error when Unicode word boundary tables
    /// are not available. Specifically, this only occurs when the
    /// `unicode-word-boundary` feature is not enabled.
    #[inline]
    pub fn is_word_unicode(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, UnicodeWordBoundaryError> {
        let word_before = is_word_char::rev(haystack, at)?;
        let word_after = is_word_char::fwd(haystack, at)?;
        Ok(word_before != word_after)
    }

    /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the
    /// given position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    ///
    /// # Errors
    ///
    /// This returns an error when Unicode word boundary tables
    /// are not available. Specifically, this only occurs when the
    /// `unicode-word-boundary` feature is not enabled.
    #[inline]
    pub fn is_word_unicode_negate(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, UnicodeWordBoundaryError> {
        // This is pretty subtle. Why do we need to do UTF-8 decoding here?
        // Well... at time of writing, the is_word_char_{fwd,rev} routines will
        // only return true if there is a valid UTF-8 encoding of a "word"
        // codepoint, and false in every other case (including invalid UTF-8).
        // This means that in regions of invalid UTF-8 (which might be a
        // subset of valid UTF-8!), it would result in \B matching. While this
        // would be questionable in the context of truly invalid UTF-8, it is
        // *certainly* wrong to report match boundaries that split the encoding
        // of a codepoint. So to work around this, we ensure that we can decode
        // a codepoint on either side of `at`. If either direction fails, then
        // we don't permit \B to match at all.
        //
        // Now, this isn't exactly optimal from a perf perspective. We could
        // try and detect this in is_word_char::{fwd,rev}, but it's not clear
        // if it's worth it. \B is, after all, rarely used. Even worse,
        // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this
        // will wind up doing UTF-8 decoding twice. Owch. We could fix this
        // with more code complexity, but it just doesn't feel worth it for \B.
        //
        // And in particular, we do *not* have to do this with \b, because \b
        // *requires* that at least one side of `at` be a "word" codepoint,
        // which in turn implies one side of `at` must be valid UTF-8. This in
        // turn implies that \b can never split a valid UTF-8 encoding of a
        // codepoint. In the case where one side of `at` is truly invalid UTF-8
        // and the other side IS a word codepoint, then we want \b to match
        // since it represents a valid UTF-8 boundary. It also makes sense. For
        // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'.
        //
        // Note also that this is not just '!is_word_unicode(..)' like it is
        // for the ASCII case. For example, neither \b nor \B is satisfied
        // within invalid UTF-8 sequences.
        let word_before = at > 0
            && match utf8::decode_last(&haystack[..at]) {
                None | Some(Err(_)) => return Ok(false),
                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
            };
        let word_after = at < haystack.len()
            && match utf8::decode(&haystack[at..]) {
                None | Some(Err(_)) => return Ok(false),
                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
            };
        Ok(word_before == word_after)
    }

    /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given
    /// position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool {
        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
        let word_after =
            at < haystack.len() && utf8::is_word_byte(haystack[at]);
        !word_before && word_after
    }

    /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given
    /// position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool {
        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
        let word_after =
            at < haystack.len() && utf8::is_word_byte(haystack[at]);
        word_before && !word_after
    }

    /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the
    /// given position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    ///
    /// # Errors
    ///
    /// This returns an error when Unicode word boundary tables
    /// are not available. Specifically, this only occurs when the
    /// `unicode-word-boundary` feature is not enabled.
    #[inline]
    pub fn is_word_start_unicode(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, UnicodeWordBoundaryError> {
        let word_before = is_word_char::rev(haystack, at)?;
        let word_after = is_word_char::fwd(haystack, at)?;
        Ok(!word_before && word_after)
    }

    /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the
    /// given position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    ///
    /// # Errors
    ///
    /// This returns an error when Unicode word boundary tables
    /// are not available. Specifically, this only occurs when the
    /// `unicode-word-boundary` feature is not enabled.
    #[inline]
    pub fn is_word_end_unicode(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, UnicodeWordBoundaryError> {
        let word_before = is_word_char::rev(haystack, at)?;
        let word_after = is_word_char::fwd(haystack, at)?;
        Ok(word_before && !word_after)
    }

    /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the
    /// given position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_word_start_half_ascii(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> bool {
        let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]);
        !word_before
    }

    /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the
    /// given position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    #[inline]
    pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool {
        let word_after =
            at < haystack.len() && utf8::is_word_byte(haystack[at]);
        !word_after
    }

    /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the
    /// given position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    ///
    /// # Errors
    ///
    /// This returns an error when Unicode word boundary tables
    /// are not available. Specifically, this only occurs when the
    /// `unicode-word-boundary` feature is not enabled.
    #[inline]
    pub fn is_word_start_half_unicode(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, UnicodeWordBoundaryError> {
        // See `is_word_unicode_negate` for why we need to do this. We don't
        // need to do it for `is_word_start_unicode` because that guarantees
        // that the position matched falls on a valid UTF-8 boundary given
        // that the right side must be in \w.
        let word_before = at > 0
            && match utf8::decode_last(&haystack[..at]) {
                None | Some(Err(_)) => return Ok(false),
                Some(Ok(_)) => is_word_char::rev(haystack, at)?,
            };
        Ok(!word_before)
    }

    /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the
    /// given position in `haystack`.
    ///
    /// # Panics
    ///
    /// This may panic when `at > haystack.len()`. Note that `at ==
    /// haystack.len()` is legal and guaranteed not to panic.
    ///
    /// # Errors
    ///
    /// This returns an error when Unicode word boundary tables
    /// are not available. Specifically, this only occurs when the
    /// `unicode-word-boundary` feature is not enabled.
    #[inline]
    pub fn is_word_end_half_unicode(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<bool, UnicodeWordBoundaryError> {
        // See `is_word_unicode_negate` for why we need to do this. We don't
        // need to do it for `is_word_end_unicode` because that guarantees
        // that the position matched falls on a valid UTF-8 boundary given
        // that the left side must be in \w.
        let word_after = at < haystack.len()
            && match utf8::decode(&haystack[at..]) {
                None | Some(Err(_)) => return Ok(false),
                Some(Ok(_)) => is_word_char::fwd(haystack, at)?,
            };
        Ok(!word_after)
    }
}

impl Default for LookMatcher {
    fn default() -> LookMatcher {
        LookMatcher::new()
    }
}

/// An error that occurs when the Unicode-aware `\w` class is unavailable.
///
/// This error can occur when the data tables necessary for the Unicode aware
/// Perl character class `\w` are unavailable. The `\w` class is used to
/// determine whether a codepoint is considered a word character or not when
/// determining whether a Unicode aware `\b` (or `\B`) matches at a particular
/// position.
///
/// This error can only occur when the `unicode-word-boundary` feature is
/// disabled.
#[derive(Clone, Debug)]
pub struct UnicodeWordBoundaryError(());

impl UnicodeWordBoundaryError {
    #[cfg(not(feature = "unicode-word-boundary"))]
    pub(crate) fn new() -> UnicodeWordBoundaryError {
        UnicodeWordBoundaryError(())
    }

    /// Returns an error if and only if Unicode word boundary data is
    /// unavailable.
    pub fn check() -> Result<(), UnicodeWordBoundaryError> {
        is_word_char::check()
    }
}

#[cfg(feature = "std")]
impl std::error::Error for UnicodeWordBoundaryError {}

impl core::fmt::Display for UnicodeWordBoundaryError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(
            f,
            "Unicode-aware \\b and \\B are unavailable because the \
             requisite data tables are missing, please enable the \
             unicode-word-boundary feature"
        )
    }
}

// Below are FOUR different ways for checking whether a "word" codepoint
// exists at a particular position in the haystack. The four different
// approaches are, in order of preference:
//
// 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the
// first call, and then use that DFA for all subsequent calls.
// 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available.
// 3. Do UTF-8 decoding and use our own 'perl_word' table.
// 4. Return an error.
//
// The reason for all of these approaches is a combination of perf and
// permitting one to build regex-automata without the Unicode data necessary
// for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would
// still work.)
//
// The DFA approach is the fastest, but it requires the regex parser, the
// NFA compiler, the DFA builder and the DFA search runtime. That's a lot to
// bring in, but if it's available, it's (probably) the best we can do.
//
// Approaches (2) and (3) are effectively equivalent, but (2) reuses the
// data in regex-syntax and avoids duplicating it in regex-automata.
//
// Finally, (4) unconditionally returns an error since the requisite data isn't
// available anywhere.
//
// There are actually more approaches possible that we didn't implement. For
// example, if the DFA builder is available but the syntax parser is not, we
// could technically hand construct our own NFA from the 'perl_word' data
// table. But to avoid some pretty hairy code duplication, we would in turn
// need to pull the UTF-8 compiler out of the NFA compiler. Yikes.
//
// A possibly more sensible alternative is to use a lazy DFA when the full
// DFA builder isn't available...
//
// Yet another choice would be to build the full DFA and then embed it into the
// source. Then we'd only need to bring in the DFA search runtime, which is
// considerably smaller than the DFA builder code. The problem here is that the
// Debian people have spooked me[1] into avoiding cyclic dependencies. Namely,
// we'd need to build regex-cli, which depends on regex-automata in order to
// build some part of regex-automata. But to be honest, something like this has
// to be allowed somehow? I just don't know what the right process is.
//
// There are perhaps other choices as well. Why did I stop at these 4? Because
// I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA
// approach eventually, as the benefits of the DFA approach are somewhat
// compelling. The 'boundary-words-holmes' benchmark tests this. (Note that
// the commands below no longer work. If necessary, we should re-capitulate
// the benchmark from whole cloth in rebar.)
//
// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv
//
// Then I changed the code below so that the util/unicode_data/perl_word table
// was used and re-ran the benchmark:
//
// $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv
//
// And compared them:
//
// $ regex-cli bench diff dfa.csv table.csv
// benchmark                             engine                 dfa        table
// ---------                             ------                 ---        -----
// internal/count/boundary-words-holmes  regex/automata/pikevm  18.6 MB/s  12.9 MB/s
//
// Which is a nice improvement.
//
// UPDATE: It turns out that it takes approximately 22ms to build the reverse
// DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in
// the grand scheme of things, but that is a significant latency cost. So I'm
// not sure that's a good idea. I then tried using a lazy DFA instead, and that
// eliminated the overhead, but since the lazy DFA requires mutable working
// memory, that requires introducing a 'Cache' for every simultaneous call.
//
// I ended up deciding for now to just keep the "UTF-8 decode and check the
// table." The DFA and lazy DFA approaches are still below, but commented out.
//
// [1]: https://github.com/BurntSushi/ucd-generate/issues/11

/*
/// A module that looks for word codepoints using lazy DFAs.
#[cfg(all(
    feature = "unicode-word-boundary",
    feature = "syntax",
    feature = "unicode-perl",
    feature = "hybrid"
))]
mod is_word_char {
    use alloc::vec::Vec;

    use crate::{
        hybrid::dfa::{Cache, DFA},
        nfa::thompson::NFA,
        util::{lazy::Lazy, pool::Pool, primitives::StateID},
        Anchored, Input,
    };

    pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
        Ok(())
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn fwd(
        haystack: &[u8],
        mut at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        static WORD: Lazy<DFA> = Lazy::new(|| DFA::new(r"\w").unwrap());
        static CACHE: Lazy<Pool<Cache>> =
            Lazy::new(|| Pool::new(|| WORD.create_cache()));
        let dfa = Lazy::get(&WORD);
        let mut cache = Lazy::get(&CACHE).get();
        let mut sid = dfa
            .start_state_forward(
                &mut cache,
                &Input::new("").anchored(Anchored::Yes),
            )
            .unwrap();
        while at < haystack.len() {
            let byte = haystack[at];
            sid = dfa.next_state(&mut cache, sid, byte).unwrap();
            at += 1;
            if sid.is_tagged() {
                if sid.is_match() {
                    return Ok(true);
                } else if sid.is_dead() {
                    return Ok(false);
                }
            }
        }
        Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(super) fn rev(
        haystack: &[u8],
        mut at: usize,
    ) -> Result<bool, super::UnicodeWordBoundaryError> {
        static WORD: Lazy<DFA> = Lazy::new(|| {
            DFA::builder()
                .thompson(NFA::config().reverse(true))
                .build(r"\w")
                .unwrap()
        });
        static CACHE: Lazy<Pool<Cache>> =
            Lazy::new(|| Pool::new(|| WORD.create_cache()));
        let dfa = Lazy::get(&WORD);
        let mut cache = Lazy::get(&CACHE).get();
        let mut sid = dfa
            .start_state_reverse(
                &mut cache,
                &Input::new("").anchored(Anchored::Yes),
            )
            .unwrap();
        while at > 0 {
            at -= 1;
            let byte = haystack[at];
            sid = dfa.next_state(&mut cache, sid, byte).unwrap();
            if sid.is_tagged() {
                if sid.is_match() {
                    return Ok(true);
                } else if sid.is_dead() {
                    return Ok(false);
                }
            }
        }
        Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match())
    }
}
*/
1476
1477/*
1478/// A module that looks for word codepoints using fully compiled DFAs.
1479#[cfg(all(
1480 feature = "unicode-word-boundary",
1481 feature = "syntax",
1482 feature = "unicode-perl",
1483 feature = "dfa-build"
1484))]
1485mod is_word_char {
1486 use alloc::vec::Vec;
1487
1488 use crate::{
1489 dfa::{dense::DFA, Automaton, StartKind},
1490 nfa::thompson::NFA,
1491 util::{lazy::Lazy, primitives::StateID},
1492 Anchored, Input,
1493 };
1494
1495 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1496 Ok(())
1497 }
1498
1499 #[cfg_attr(feature = "perf-inline", inline(always))]
1500 pub(super) fn fwd(
1501 haystack: &[u8],
1502 mut at: usize,
1503 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1504 static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1505 let dfa = DFA::builder()
1506 .configure(DFA::config().start_kind(StartKind::Anchored))
1507 .build(r"\w")
1508 .unwrap();
1509 // OK because our regex has no look-around.
1510 let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1511 (dfa, start_id)
1512 });
1513 let &(ref dfa, mut sid) = Lazy::get(&WORD);
1514 while at < haystack.len() {
1515 let byte = haystack[at];
1516 sid = dfa.next_state(sid, byte);
1517 at += 1;
1518 if dfa.is_special_state(sid) {
1519 if dfa.is_match_state(sid) {
1520 return Ok(true);
1521 } else if dfa.is_dead_state(sid) {
1522 return Ok(false);
1523 }
1524 }
1525 }
1526 Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1527 }
1528
1529 #[cfg_attr(feature = "perf-inline", inline(always))]
1530 pub(super) fn rev(
1531 haystack: &[u8],
1532 mut at: usize,
1533 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1534 static WORD: Lazy<(DFA<Vec<u32>>, StateID)> = Lazy::new(|| {
1535 let dfa = DFA::builder()
1536 .configure(DFA::config().start_kind(StartKind::Anchored))
1537                 // From ad hoc measurements, setting shrink==false appears
1538                 // to be slightly faster than shrink==true here, which
1539                 // suggests shrinking doesn't pay for itself in this case,
1540                 // even though it can help in others.
1541 .thompson(NFA::config().reverse(true).shrink(false))
1542 .build(r"\w")
1543 .unwrap();
1544 // OK because our regex has no look-around.
1545 let start_id = dfa.universal_start_state(Anchored::Yes).unwrap();
1546 (dfa, start_id)
1547 });
1548 let &(ref dfa, mut sid) = Lazy::get(&WORD);
1549 while at > 0 {
1550 at -= 1;
1551 let byte = haystack[at];
1552 sid = dfa.next_state(sid, byte);
1553 if dfa.is_special_state(sid) {
1554 if dfa.is_match_state(sid) {
1555 return Ok(true);
1556 } else if dfa.is_dead_state(sid) {
1557 return Ok(false);
1558 }
1559 }
1560 }
1561 Ok(dfa.is_match_state(dfa.next_eoi_state(sid)))
1562 }
1563}
1564*/
1565
1566/// A module that looks for word codepoints using regex-syntax's data tables.
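///
/// `fwd(haystack, at)` reports whether the codepoint whose UTF-8 encoding
/// begins at `at` is a word codepoint, while `rev(haystack, at)` reports
/// whether the codepoint whose encoding ends at `at` is. For example (an
/// illustrative sketch rather than a doctest, since this module is private):
///
/// ```ignore
/// // "a𝛃" is 'a' (1 byte) followed by '𝛃' (4 bytes), both in \w.
/// let haystack = "a𝛃".as_bytes();
/// assert!(fwd(haystack, 1).unwrap()); // decodes '𝛃'
/// assert!(rev(haystack, 1).unwrap()); // decodes 'a'
/// ```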
1567#[cfg(all(
1568 feature = "unicode-word-boundary",
1569 feature = "syntax",
1570 feature = "unicode-perl",
1571))]
1572mod is_word_char {
1573 use regex_syntax::try_is_word_character;
1574
1575 use crate::util::utf8;
1576
1577 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1578 Ok(())
1579 }
1580
1581 #[cfg_attr(feature = "perf-inline", inline(always))]
1582 pub(super) fn fwd(
1583 haystack: &[u8],
1584 at: usize,
1585 ) -> Result<bool, super::UnicodeWordBoundaryError> {
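        // Decode the codepoint starting at `at`. If we're at the end of the
        // haystack or the bytes aren't valid UTF-8, then there is no word
        // codepoint here.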
1586 Ok(match utf8::decode(&haystack[at..]) {
1587 None | Some(Err(_)) => false,
1588 Some(Ok(ch)) => try_is_word_character(ch).expect(
1589 "since unicode-word-boundary, syntax and unicode-perl \
1590 are all enabled, it is expected that \
1591 try_is_word_character succeeds",
1592 ),
1593 })
1594 }
1595
1596 #[cfg_attr(feature = "perf-inline", inline(always))]
1597 pub(super) fn rev(
1598 haystack: &[u8],
1599 at: usize,
1600 ) -> Result<bool, super::UnicodeWordBoundaryError> {
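        // Decode the codepoint ending at `at`. If we're at the start of the
        // haystack or the bytes aren't valid UTF-8, then there is no word
        // codepoint here.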
1601 Ok(match utf8::decode_last(&haystack[..at]) {
1602 None | Some(Err(_)) => false,
1603 Some(Ok(ch)) => try_is_word_character(ch).expect(
1604 "since unicode-word-boundary, syntax and unicode-perl \
1605 are all enabled, it is expected that \
1606 try_is_word_character succeeds",
1607 ),
1608 })
1609 }
1610}
1611
1612/// A module that looks for word codepoints using regex-automata's data tables
1613/// (which are only compiled when regex-syntax's tables aren't available).
1614///
1615/// Note that the cfg should match the one in src/util/unicode_data/mod.rs for
1616/// perl_word.
1617#[cfg(all(
1618 feature = "unicode-word-boundary",
1619 not(all(feature = "syntax", feature = "unicode-perl")),
1620))]
1621mod is_word_char {
1622 use crate::util::utf8;
1623
1624 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1625 Ok(())
1626 }
1627
1628 #[cfg_attr(feature = "perf-inline", inline(always))]
1629 pub(super) fn fwd(
1630 haystack: &[u8],
1631 at: usize,
1632 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1633 Ok(match utf8::decode(&haystack[at..]) {
1634 None | Some(Err(_)) => false,
1635 Some(Ok(ch)) => is_word_character(ch),
1636 })
1637 }
1638
1639 #[cfg_attr(feature = "perf-inline", inline(always))]
1640 pub(super) fn rev(
1641 haystack: &[u8],
1642 at: usize,
1643 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1644 Ok(match utf8::decode_last(&haystack[..at]) {
1645 None | Some(Err(_)) => false,
1646 Some(Ok(ch)) => is_word_character(ch),
1647 })
1648 }
1649
1650 #[cfg_attr(feature = "perf-inline", inline(always))]
1651 fn is_word_character(c: char) -> bool {
1652 use crate::util::{unicode_data::perl_word::PERL_WORD, utf8};
1653
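        // ASCII word characters are by far the most common case, so check
        // them with a cheap byte test before falling back to the Unicode
        // table.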
1654 if u8::try_from(c).map_or(false, utf8::is_word_byte) {
1655 return true;
1656 }
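        // Otherwise, binary search the sorted ranges of word codepoints in
        // the bundled `perl_word` table.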
1657 PERL_WORD
1658 .binary_search_by(|&(start, end)| {
1659 use core::cmp::Ordering;
1660
1661 if start <= c && c <= end {
1662 Ordering::Equal
1663 } else if start > c {
1664 Ordering::Greater
1665 } else {
1666 Ordering::Less
1667 }
1668 })
1669 .is_ok()
1670 }
1671}
1672
1673 /// A module that always returns an error when the `unicode-word-boundary`
1674 /// feature is disabled. With that feature disabled, regex-automata does not
1675 /// include its own data tables even when regex-syntax is unavailable.
1676#[cfg(not(feature = "unicode-word-boundary"))]
1677mod is_word_char {
1678 pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> {
1679 Err(super::UnicodeWordBoundaryError::new())
1680 }
1681
1682 #[cfg_attr(feature = "perf-inline", inline(always))]
1683 pub(super) fn fwd(
1684 _bytes: &[u8],
1685 _at: usize,
1686 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1687 Err(super::UnicodeWordBoundaryError::new())
1688 }
1689
1690 #[cfg_attr(feature = "perf-inline", inline(always))]
1691 pub(super) fn rev(
1692 _bytes: &[u8],
1693 _at: usize,
1694 ) -> Result<bool, super::UnicodeWordBoundaryError> {
1695 Err(super::UnicodeWordBoundaryError::new())
1696 }
1697}
1698
1699#[cfg(test)]
1700mod tests {
1701 use super::*;
1702
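    // A convenience macro for checking whether a single assertion matches in
    // `$haystack` at byte offset `$at`, using a `LookMatcher` with its default
    // configuration.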
1703 macro_rules! testlook {
1704 ($look:expr, $haystack:expr, $at:expr) => {
1705 LookMatcher::default().matches($look, $haystack.as_bytes(), $at)
1706 };
1707 }
1708
1709 #[test]
1710 fn look_matches_start_line() {
1711 let look = Look::StartLF;
1712
1713 assert!(testlook!(look, "", 0));
1714 assert!(testlook!(look, "\n", 0));
1715 assert!(testlook!(look, "\n", 1));
1716 assert!(testlook!(look, "a", 0));
1717 assert!(testlook!(look, "\na", 1));
1718
1719 assert!(!testlook!(look, "a", 1));
1720 assert!(!testlook!(look, "a\na", 1));
1721 }
1722
1723 #[test]
1724 fn look_matches_end_line() {
1725 let look = Look::EndLF;
1726
1727 assert!(testlook!(look, "", 0));
1728 assert!(testlook!(look, "\n", 1));
1729 assert!(testlook!(look, "\na", 0));
1730 assert!(testlook!(look, "\na", 2));
1731 assert!(testlook!(look, "a\na", 1));
1732
1733 assert!(!testlook!(look, "a", 0));
1734 assert!(!testlook!(look, "\na", 1));
1735 assert!(!testlook!(look, "a\na", 0));
1736 assert!(!testlook!(look, "a\na", 2));
1737 }
1738
1739 #[test]
1740 fn look_matches_start_text() {
1741 let look = Look::Start;
1742
1743 assert!(testlook!(look, "", 0));
1744 assert!(testlook!(look, "\n", 0));
1745 assert!(testlook!(look, "a", 0));
1746
1747 assert!(!testlook!(look, "\n", 1));
1748 assert!(!testlook!(look, "\na", 1));
1749 assert!(!testlook!(look, "a", 1));
1750 assert!(!testlook!(look, "a\na", 1));
1751 }
1752
1753 #[test]
1754 fn look_matches_end_text() {
1755 let look = Look::End;
1756
1757 assert!(testlook!(look, "", 0));
1758 assert!(testlook!(look, "\n", 1));
1759 assert!(testlook!(look, "\na", 2));
1760
1761 assert!(!testlook!(look, "\na", 0));
1762 assert!(!testlook!(look, "a\na", 1));
1763 assert!(!testlook!(look, "a", 0));
1764 assert!(!testlook!(look, "\na", 1));
1765 assert!(!testlook!(look, "a\na", 0));
1766 assert!(!testlook!(look, "a\na", 2));
1767 }
1768
1769 #[test]
1770 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1771 fn look_matches_word_unicode() {
1772 let look = Look::WordUnicode;
1773
1774 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1775 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1776
1777 // Simple ASCII word boundaries.
1778 assert!(testlook!(look, "a", 0));
1779 assert!(testlook!(look, "a", 1));
1780 assert!(testlook!(look, "a ", 1));
1781 assert!(testlook!(look, " a ", 1));
1782 assert!(testlook!(look, " a ", 2));
1783
1784 // Unicode word boundaries with a non-ASCII codepoint.
1785 assert!(testlook!(look, "𝛃", 0));
1786 assert!(testlook!(look, "𝛃", 4));
1787 assert!(testlook!(look, "𝛃 ", 4));
1788 assert!(testlook!(look, " 𝛃 ", 1));
1789 assert!(testlook!(look, " 𝛃 ", 5));
1790
1791 // Unicode word boundaries between non-ASCII codepoints.
1792 assert!(testlook!(look, "𝛃𐆀", 0));
1793 assert!(testlook!(look, "𝛃𐆀", 4));
1794
1795 // Non word boundaries for ASCII.
1796 assert!(!testlook!(look, "", 0));
1797 assert!(!testlook!(look, "ab", 1));
1798 assert!(!testlook!(look, "a ", 2));
1799 assert!(!testlook!(look, " a ", 0));
1800 assert!(!testlook!(look, " a ", 3));
1801
1802 // Non word boundaries with a non-ASCII codepoint.
1803 assert!(!testlook!(look, "𝛃b", 4));
1804 assert!(!testlook!(look, "𝛃 ", 5));
1805 assert!(!testlook!(look, " 𝛃 ", 0));
1806 assert!(!testlook!(look, " 𝛃 ", 6));
1807 assert!(!testlook!(look, "𝛃", 1));
1808 assert!(!testlook!(look, "𝛃", 2));
1809 assert!(!testlook!(look, "𝛃", 3));
1810
1811 // Non word boundaries with non-ASCII codepoints.
1812 assert!(!testlook!(look, "𝛃𐆀", 1));
1813 assert!(!testlook!(look, "𝛃𐆀", 2));
1814 assert!(!testlook!(look, "𝛃𐆀", 3));
1815 assert!(!testlook!(look, "𝛃𐆀", 5));
1816 assert!(!testlook!(look, "𝛃𐆀", 6));
1817 assert!(!testlook!(look, "𝛃𐆀", 7));
1818 assert!(!testlook!(look, "𝛃𐆀", 8));
1819 }
1820
1821 #[test]
1822 fn look_matches_word_ascii() {
1823 let look = Look::WordAscii;
1824
1825 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1826 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1827
1828 // Simple ASCII word boundaries.
1829 assert!(testlook!(look, "a", 0));
1830 assert!(testlook!(look, "a", 1));
1831 assert!(testlook!(look, "a ", 1));
1832 assert!(testlook!(look, " a ", 1));
1833 assert!(testlook!(look, " a ", 2));
1834
1835 // Unicode word boundaries with a non-ASCII codepoint. Since this is
1836 // an ASCII word boundary, none of these match.
1837 assert!(!testlook!(look, "𝛃", 0));
1838 assert!(!testlook!(look, "𝛃", 4));
1839 assert!(!testlook!(look, "𝛃 ", 4));
1840 assert!(!testlook!(look, " 𝛃 ", 1));
1841 assert!(!testlook!(look, " 𝛃 ", 5));
1842
1843 // Unicode word boundaries between non-ASCII codepoints. Again, since
1844 // this is an ASCII word boundary, none of these match.
1845 assert!(!testlook!(look, "𝛃𐆀", 0));
1846 assert!(!testlook!(look, "𝛃𐆀", 4));
1847
1848 // Non word boundaries for ASCII.
1849 assert!(!testlook!(look, "", 0));
1850 assert!(!testlook!(look, "ab", 1));
1851 assert!(!testlook!(look, "a ", 2));
1852 assert!(!testlook!(look, " a ", 0));
1853 assert!(!testlook!(look, " a ", 3));
1854
1855 // Non word boundaries with a non-ASCII codepoint.
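        // (This first case does match: at offset 4, the ASCII word char 'b'
        // follows a non-word byte, which is an ASCII word boundary.)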
1856 assert!(testlook!(look, "𝛃b", 4));
1857 assert!(!testlook!(look, "𝛃 ", 5));
1858 assert!(!testlook!(look, " 𝛃 ", 0));
1859 assert!(!testlook!(look, " 𝛃 ", 6));
1860 assert!(!testlook!(look, "𝛃", 1));
1861 assert!(!testlook!(look, "𝛃", 2));
1862 assert!(!testlook!(look, "𝛃", 3));
1863
1864 // Non word boundaries with non-ASCII codepoints.
1865 assert!(!testlook!(look, "𝛃𐆀", 1));
1866 assert!(!testlook!(look, "𝛃𐆀", 2));
1867 assert!(!testlook!(look, "𝛃𐆀", 3));
1868 assert!(!testlook!(look, "𝛃𐆀", 5));
1869 assert!(!testlook!(look, "𝛃𐆀", 6));
1870 assert!(!testlook!(look, "𝛃𐆀", 7));
1871 assert!(!testlook!(look, "𝛃𐆀", 8));
1872 }
1873
1874 #[test]
1875 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
1876 fn look_matches_word_unicode_negate() {
1877 let look = Look::WordUnicodeNegate;
1878
1879 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1880 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1881
1882 // Simple ASCII word boundaries.
1883 assert!(!testlook!(look, "a", 0));
1884 assert!(!testlook!(look, "a", 1));
1885 assert!(!testlook!(look, "a ", 1));
1886 assert!(!testlook!(look, " a ", 1));
1887 assert!(!testlook!(look, " a ", 2));
1888
1889 // Unicode word boundaries with a non-ASCII codepoint.
1890 assert!(!testlook!(look, "𝛃", 0));
1891 assert!(!testlook!(look, "𝛃", 4));
1892 assert!(!testlook!(look, "𝛃 ", 4));
1893 assert!(!testlook!(look, " 𝛃 ", 1));
1894 assert!(!testlook!(look, " 𝛃 ", 5));
1895
1896 // Unicode word boundaries between non-ASCII codepoints.
1897 assert!(!testlook!(look, "𝛃𐆀", 0));
1898 assert!(!testlook!(look, "𝛃𐆀", 4));
1899
1900 // Non word boundaries for ASCII.
1901 assert!(testlook!(look, "", 0));
1902 assert!(testlook!(look, "ab", 1));
1903 assert!(testlook!(look, "a ", 2));
1904 assert!(testlook!(look, " a ", 0));
1905 assert!(testlook!(look, " a ", 3));
1906
1907 // Non word boundaries with a non-ASCII codepoint.
1908 assert!(testlook!(look, "𝛃b", 4));
1909 assert!(testlook!(look, "𝛃 ", 5));
1910 assert!(testlook!(look, " 𝛃 ", 0));
1911 assert!(testlook!(look, " 𝛃 ", 6));
1912 // These don't match because they could otherwise return an offset that
1913 // splits the UTF-8 encoding of a codepoint.
1914 assert!(!testlook!(look, "𝛃", 1));
1915 assert!(!testlook!(look, "𝛃", 2));
1916 assert!(!testlook!(look, "𝛃", 3));
1917
1918 // Non word boundaries with non-ASCII codepoints. These also don't
1919 // match because they could otherwise return an offset that splits the
1920 // UTF-8 encoding of a codepoint.
1921 assert!(!testlook!(look, "𝛃𐆀", 1));
1922 assert!(!testlook!(look, "𝛃𐆀", 2));
1923 assert!(!testlook!(look, "𝛃𐆀", 3));
1924 assert!(!testlook!(look, "𝛃𐆀", 5));
1925 assert!(!testlook!(look, "𝛃𐆀", 6));
1926 assert!(!testlook!(look, "𝛃𐆀", 7));
1927 // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end
1928 // of the haystack. So the "end" of the haystack isn't a word and 𐆀
1929 // isn't a word, thus, \B matches.
1930 assert!(testlook!(look, "𝛃𐆀", 8));
1931 }
1932
1933 #[test]
1934 fn look_matches_word_ascii_negate() {
1935 let look = Look::WordAsciiNegate;
1936
1937 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1938 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1939
1940 // Simple ASCII word boundaries.
1941 assert!(!testlook!(look, "a", 0));
1942 assert!(!testlook!(look, "a", 1));
1943 assert!(!testlook!(look, "a ", 1));
1944 assert!(!testlook!(look, " a ", 1));
1945 assert!(!testlook!(look, " a ", 2));
1946
1947         // Unicode word boundaries with a non-ASCII codepoint. None of these are
1948         // ASCII word boundaries, so the negated assertion matches at all of them.
1949 assert!(testlook!(look, "𝛃", 0));
1950 assert!(testlook!(look, "𝛃", 4));
1951 assert!(testlook!(look, "𝛃 ", 4));
1952 assert!(testlook!(look, " 𝛃 ", 1));
1953 assert!(testlook!(look, " 𝛃 ", 5));
1954
1955         // Unicode word boundaries between non-ASCII codepoints. Again, neither is
1956         // an ASCII word boundary, so the negated assertion matches at both.
1957 assert!(testlook!(look, "𝛃𐆀", 0));
1958 assert!(testlook!(look, "𝛃𐆀", 4));
1959
1960 // Non word boundaries for ASCII.
1961 assert!(testlook!(look, "", 0));
1962 assert!(testlook!(look, "ab", 1));
1963 assert!(testlook!(look, "a ", 2));
1964 assert!(testlook!(look, " a ", 0));
1965 assert!(testlook!(look, " a ", 3));
1966
1967 // Non word boundaries with a non-ASCII codepoint.
1968 assert!(!testlook!(look, "𝛃b", 4));
1969 assert!(testlook!(look, "𝛃 ", 5));
1970 assert!(testlook!(look, " 𝛃 ", 0));
1971 assert!(testlook!(look, " 𝛃 ", 6));
1972 assert!(testlook!(look, "𝛃", 1));
1973 assert!(testlook!(look, "𝛃", 2));
1974 assert!(testlook!(look, "𝛃", 3));
1975
1976 // Non word boundaries with non-ASCII codepoints.
1977 assert!(testlook!(look, "𝛃𐆀", 1));
1978 assert!(testlook!(look, "𝛃𐆀", 2));
1979 assert!(testlook!(look, "𝛃𐆀", 3));
1980 assert!(testlook!(look, "𝛃𐆀", 5));
1981 assert!(testlook!(look, "𝛃𐆀", 6));
1982 assert!(testlook!(look, "𝛃𐆀", 7));
1983 assert!(testlook!(look, "𝛃𐆀", 8));
1984 }
1985
1986 #[test]
1987 fn look_matches_word_start_ascii() {
1988 let look = Look::WordStartAscii;
1989
1990 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
1991 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
1992
1993 // Simple ASCII word boundaries.
1994 assert!(testlook!(look, "a", 0));
1995 assert!(!testlook!(look, "a", 1));
1996 assert!(!testlook!(look, "a ", 1));
1997 assert!(testlook!(look, " a ", 1));
1998 assert!(!testlook!(look, " a ", 2));
1999
2000 // Unicode word boundaries with a non-ASCII codepoint. Since this is
2001 // an ASCII word boundary, none of these match.
2002 assert!(!testlook!(look, "𝛃", 0));
2003 assert!(!testlook!(look, "𝛃", 4));
2004 assert!(!testlook!(look, "𝛃 ", 4));
2005 assert!(!testlook!(look, " 𝛃 ", 1));
2006 assert!(!testlook!(look, " 𝛃 ", 5));
2007
2008 // Unicode word boundaries between non-ASCII codepoints. Again, since
2009 // this is an ASCII word boundary, none of these match.
2010 assert!(!testlook!(look, "𝛃𐆀", 0));
2011 assert!(!testlook!(look, "𝛃𐆀", 4));
2012
2013 // Non word boundaries for ASCII.
2014 assert!(!testlook!(look, "", 0));
2015 assert!(!testlook!(look, "ab", 1));
2016 assert!(!testlook!(look, "a ", 2));
2017 assert!(!testlook!(look, " a ", 0));
2018 assert!(!testlook!(look, " a ", 3));
2019
2020 // Non word boundaries with a non-ASCII codepoint.
2021 assert!(testlook!(look, "𝛃b", 4));
2022 assert!(!testlook!(look, "b𝛃", 1));
2023 assert!(!testlook!(look, "𝛃 ", 5));
2024 assert!(!testlook!(look, " 𝛃 ", 0));
2025 assert!(!testlook!(look, " 𝛃 ", 6));
2026 assert!(!testlook!(look, "𝛃", 1));
2027 assert!(!testlook!(look, "𝛃", 2));
2028 assert!(!testlook!(look, "𝛃", 3));
2029
2030 // Non word boundaries with non-ASCII codepoints.
2031 assert!(!testlook!(look, "𝛃𐆀", 1));
2032 assert!(!testlook!(look, "𝛃𐆀", 2));
2033 assert!(!testlook!(look, "𝛃𐆀", 3));
2034 assert!(!testlook!(look, "𝛃𐆀", 5));
2035 assert!(!testlook!(look, "𝛃𐆀", 6));
2036 assert!(!testlook!(look, "𝛃𐆀", 7));
2037 assert!(!testlook!(look, "𝛃𐆀", 8));
2038 }
2039
2040 #[test]
2041 fn look_matches_word_end_ascii() {
2042 let look = Look::WordEndAscii;
2043
2044 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2045 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2046
2047 // Simple ASCII word boundaries.
2048 assert!(!testlook!(look, "a", 0));
2049 assert!(testlook!(look, "a", 1));
2050 assert!(testlook!(look, "a ", 1));
2051 assert!(!testlook!(look, " a ", 1));
2052 assert!(testlook!(look, " a ", 2));
2053
2054 // Unicode word boundaries with a non-ASCII codepoint. Since this is
2055 // an ASCII word boundary, none of these match.
2056 assert!(!testlook!(look, "𝛃", 0));
2057 assert!(!testlook!(look, "𝛃", 4));
2058 assert!(!testlook!(look, "𝛃 ", 4));
2059 assert!(!testlook!(look, " 𝛃 ", 1));
2060 assert!(!testlook!(look, " 𝛃 ", 5));
2061
2062 // Unicode word boundaries between non-ASCII codepoints. Again, since
2063 // this is an ASCII word boundary, none of these match.
2064 assert!(!testlook!(look, "𝛃𐆀", 0));
2065 assert!(!testlook!(look, "𝛃𐆀", 4));
2066
2067 // Non word boundaries for ASCII.
2068 assert!(!testlook!(look, "", 0));
2069 assert!(!testlook!(look, "ab", 1));
2070 assert!(!testlook!(look, "a ", 2));
2071 assert!(!testlook!(look, " a ", 0));
2072 assert!(!testlook!(look, " a ", 3));
2073
2074 // Non word boundaries with a non-ASCII codepoint.
2075 assert!(!testlook!(look, "𝛃b", 4));
2076 assert!(testlook!(look, "b𝛃", 1));
2077 assert!(!testlook!(look, "𝛃 ", 5));
2078 assert!(!testlook!(look, " 𝛃 ", 0));
2079 assert!(!testlook!(look, " 𝛃 ", 6));
2080 assert!(!testlook!(look, "𝛃", 1));
2081 assert!(!testlook!(look, "𝛃", 2));
2082 assert!(!testlook!(look, "𝛃", 3));
2083
2084 // Non word boundaries with non-ASCII codepoints.
2085 assert!(!testlook!(look, "𝛃𐆀", 1));
2086 assert!(!testlook!(look, "𝛃𐆀", 2));
2087 assert!(!testlook!(look, "𝛃𐆀", 3));
2088 assert!(!testlook!(look, "𝛃𐆀", 5));
2089 assert!(!testlook!(look, "𝛃𐆀", 6));
2090 assert!(!testlook!(look, "𝛃𐆀", 7));
2091 assert!(!testlook!(look, "𝛃𐆀", 8));
2092 }
2093
2094 #[test]
2095 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2096 fn look_matches_word_start_unicode() {
2097 let look = Look::WordStartUnicode;
2098
2099 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2100 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2101
2102 // Simple ASCII word boundaries.
2103 assert!(testlook!(look, "a", 0));
2104 assert!(!testlook!(look, "a", 1));
2105 assert!(!testlook!(look, "a ", 1));
2106 assert!(testlook!(look, " a ", 1));
2107 assert!(!testlook!(look, " a ", 2));
2108
2109 // Unicode word boundaries with a non-ASCII codepoint.
2110 assert!(testlook!(look, "𝛃", 0));
2111 assert!(!testlook!(look, "𝛃", 4));
2112 assert!(!testlook!(look, "𝛃 ", 4));
2113 assert!(testlook!(look, " 𝛃 ", 1));
2114 assert!(!testlook!(look, " 𝛃 ", 5));
2115
2116 // Unicode word boundaries between non-ASCII codepoints.
2117 assert!(testlook!(look, "𝛃𐆀", 0));
2118 assert!(!testlook!(look, "𝛃𐆀", 4));
2119
2120 // Non word boundaries for ASCII.
2121 assert!(!testlook!(look, "", 0));
2122 assert!(!testlook!(look, "ab", 1));
2123 assert!(!testlook!(look, "a ", 2));
2124 assert!(!testlook!(look, " a ", 0));
2125 assert!(!testlook!(look, " a ", 3));
2126
2127 // Non word boundaries with a non-ASCII codepoint.
2128 assert!(!testlook!(look, "𝛃b", 4));
2129 assert!(!testlook!(look, "b𝛃", 1));
2130 assert!(!testlook!(look, "𝛃 ", 5));
2131 assert!(!testlook!(look, " 𝛃 ", 0));
2132 assert!(!testlook!(look, " 𝛃 ", 6));
2133 assert!(!testlook!(look, "𝛃", 1));
2134 assert!(!testlook!(look, "𝛃", 2));
2135 assert!(!testlook!(look, "𝛃", 3));
2136
2137 // Non word boundaries with non-ASCII codepoints.
2138 assert!(!testlook!(look, "𝛃𐆀", 1));
2139 assert!(!testlook!(look, "𝛃𐆀", 2));
2140 assert!(!testlook!(look, "𝛃𐆀", 3));
2141 assert!(!testlook!(look, "𝛃𐆀", 5));
2142 assert!(!testlook!(look, "𝛃𐆀", 6));
2143 assert!(!testlook!(look, "𝛃𐆀", 7));
2144 assert!(!testlook!(look, "𝛃𐆀", 8));
2145 }
2146
2147 #[test]
2148 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2149 fn look_matches_word_end_unicode() {
2150 let look = Look::WordEndUnicode;
2151
2152 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2153 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2154
2155 // Simple ASCII word boundaries.
2156 assert!(!testlook!(look, "a", 0));
2157 assert!(testlook!(look, "a", 1));
2158 assert!(testlook!(look, "a ", 1));
2159 assert!(!testlook!(look, " a ", 1));
2160 assert!(testlook!(look, " a ", 2));
2161
2162 // Unicode word boundaries with a non-ASCII codepoint.
2163 assert!(!testlook!(look, "𝛃", 0));
2164 assert!(testlook!(look, "𝛃", 4));
2165 assert!(testlook!(look, "𝛃 ", 4));
2166 assert!(!testlook!(look, " 𝛃 ", 1));
2167 assert!(testlook!(look, " 𝛃 ", 5));
2168
2169 // Unicode word boundaries between non-ASCII codepoints.
2170 assert!(!testlook!(look, "𝛃𐆀", 0));
2171 assert!(testlook!(look, "𝛃𐆀", 4));
2172
2173 // Non word boundaries for ASCII.
2174 assert!(!testlook!(look, "", 0));
2175 assert!(!testlook!(look, "ab", 1));
2176 assert!(!testlook!(look, "a ", 2));
2177 assert!(!testlook!(look, " a ", 0));
2178 assert!(!testlook!(look, " a ", 3));
2179
2180 // Non word boundaries with a non-ASCII codepoint.
2181 assert!(!testlook!(look, "𝛃b", 4));
2182 assert!(!testlook!(look, "b𝛃", 1));
2183 assert!(!testlook!(look, "𝛃 ", 5));
2184 assert!(!testlook!(look, " 𝛃 ", 0));
2185 assert!(!testlook!(look, " 𝛃 ", 6));
2186 assert!(!testlook!(look, "𝛃", 1));
2187 assert!(!testlook!(look, "𝛃", 2));
2188 assert!(!testlook!(look, "𝛃", 3));
2189
2190 // Non word boundaries with non-ASCII codepoints.
2191 assert!(!testlook!(look, "𝛃𐆀", 1));
2192 assert!(!testlook!(look, "𝛃𐆀", 2));
2193 assert!(!testlook!(look, "𝛃𐆀", 3));
2194 assert!(!testlook!(look, "𝛃𐆀", 5));
2195 assert!(!testlook!(look, "𝛃𐆀", 6));
2196 assert!(!testlook!(look, "𝛃𐆀", 7));
2197 assert!(!testlook!(look, "𝛃𐆀", 8));
2198 }
2199
2200 #[test]
2201 fn look_matches_word_start_half_ascii() {
2202 let look = Look::WordStartHalfAscii;
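        // A "half" word-start only inspects the character before the
        // position: it matches whenever that character is not an ASCII word
        // character, or when the position is at the start of the haystack.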
2203
2204 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2205 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2206
2207 // Simple ASCII word boundaries.
2208 assert!(testlook!(look, "a", 0));
2209 assert!(!testlook!(look, "a", 1));
2210 assert!(!testlook!(look, "a ", 1));
2211 assert!(testlook!(look, " a ", 1));
2212 assert!(!testlook!(look, " a ", 2));
2213
2214         // Unicode word boundaries with a non-ASCII codepoint. A half word-start
2215         // needs no ASCII word character immediately before it, so all of these match.
2216 assert!(testlook!(look, "𝛃", 0));
2217 assert!(testlook!(look, "𝛃", 4));
2218 assert!(testlook!(look, "𝛃 ", 4));
2219 assert!(testlook!(look, " 𝛃 ", 1));
2220 assert!(testlook!(look, " 𝛃 ", 5));
2221
2222         // Unicode word boundaries between non-ASCII codepoints. Again, there is
2223         // no ASCII word character immediately before them, so both of these match.
2224 assert!(testlook!(look, "𝛃𐆀", 0));
2225 assert!(testlook!(look, "𝛃𐆀", 4));
2226
2227 // Non word boundaries for ASCII.
2228 assert!(testlook!(look, "", 0));
2229 assert!(!testlook!(look, "ab", 1));
2230 assert!(testlook!(look, "a ", 2));
2231 assert!(testlook!(look, " a ", 0));
2232 assert!(testlook!(look, " a ", 3));
2233
2234 // Non word boundaries with a non-ASCII codepoint.
2235 assert!(testlook!(look, "𝛃b", 4));
2236 assert!(!testlook!(look, "b𝛃", 1));
2237 assert!(testlook!(look, "𝛃 ", 5));
2238 assert!(testlook!(look, " 𝛃 ", 0));
2239 assert!(testlook!(look, " 𝛃 ", 6));
2240 assert!(testlook!(look, "𝛃", 1));
2241 assert!(testlook!(look, "𝛃", 2));
2242 assert!(testlook!(look, "𝛃", 3));
2243
2244 // Non word boundaries with non-ASCII codepoints.
2245 assert!(testlook!(look, "𝛃𐆀", 1));
2246 assert!(testlook!(look, "𝛃𐆀", 2));
2247 assert!(testlook!(look, "𝛃𐆀", 3));
2248 assert!(testlook!(look, "𝛃𐆀", 5));
2249 assert!(testlook!(look, "𝛃𐆀", 6));
2250 assert!(testlook!(look, "𝛃𐆀", 7));
2251 assert!(testlook!(look, "𝛃𐆀", 8));
2252 }
2253
2254 #[test]
2255 fn look_matches_word_end_half_ascii() {
2256 let look = Look::WordEndHalfAscii;
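        // A "half" word-end only inspects the character after the position:
        // it matches whenever that character is not an ASCII word character,
        // or when the position is at the end of the haystack.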
2257
2258 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2259 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2260
2261 // Simple ASCII word boundaries.
2262 assert!(!testlook!(look, "a", 0));
2263 assert!(testlook!(look, "a", 1));
2264 assert!(testlook!(look, "a ", 1));
2265 assert!(!testlook!(look, " a ", 1));
2266 assert!(testlook!(look, " a ", 2));
2267
2268         // Unicode word boundaries with a non-ASCII codepoint. A half word-end
2269         // needs no ASCII word character immediately after it, so all of these match.
2270 assert!(testlook!(look, "𝛃", 0));
2271 assert!(testlook!(look, "𝛃", 4));
2272 assert!(testlook!(look, "𝛃 ", 4));
2273 assert!(testlook!(look, " 𝛃 ", 1));
2274 assert!(testlook!(look, " 𝛃 ", 5));
2275
2276         // Unicode word boundaries between non-ASCII codepoints. Again, there is
2277         // no ASCII word character immediately after them, so both of these match.
2278 assert!(testlook!(look, "𝛃𐆀", 0));
2279 assert!(testlook!(look, "𝛃𐆀", 4));
2280
2281 // Non word boundaries for ASCII.
2282 assert!(testlook!(look, "", 0));
2283 assert!(!testlook!(look, "ab", 1));
2284 assert!(testlook!(look, "a ", 2));
2285 assert!(testlook!(look, " a ", 0));
2286 assert!(testlook!(look, " a ", 3));
2287
2288 // Non word boundaries with a non-ASCII codepoint.
2289 assert!(!testlook!(look, "𝛃b", 4));
2290 assert!(testlook!(look, "b𝛃", 1));
2291 assert!(testlook!(look, "𝛃 ", 5));
2292 assert!(testlook!(look, " 𝛃 ", 0));
2293 assert!(testlook!(look, " 𝛃 ", 6));
2294 assert!(testlook!(look, "𝛃", 1));
2295 assert!(testlook!(look, "𝛃", 2));
2296 assert!(testlook!(look, "𝛃", 3));
2297
2298 // Non word boundaries with non-ASCII codepoints.
2299 assert!(testlook!(look, "𝛃𐆀", 1));
2300 assert!(testlook!(look, "𝛃𐆀", 2));
2301 assert!(testlook!(look, "𝛃𐆀", 3));
2302 assert!(testlook!(look, "𝛃𐆀", 5));
2303 assert!(testlook!(look, "𝛃𐆀", 6));
2304 assert!(testlook!(look, "𝛃𐆀", 7));
2305 assert!(testlook!(look, "𝛃𐆀", 8));
2306 }
2307
2308 #[test]
2309 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2310 fn look_matches_word_start_half_unicode() {
2311 let look = Look::WordStartHalfUnicode;
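        // Same as the half ASCII word-start, except "word character" is
        // determined by Unicode's \w. Positions that split the UTF-8 encoding
        // of a codepoint never match.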
2312
2313 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2314 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2315
2316 // Simple ASCII word boundaries.
2317 assert!(testlook!(look, "a", 0));
2318 assert!(!testlook!(look, "a", 1));
2319 assert!(!testlook!(look, "a ", 1));
2320 assert!(testlook!(look, " a ", 1));
2321 assert!(!testlook!(look, " a ", 2));
2322
2323 // Unicode word boundaries with a non-ASCII codepoint.
2324 assert!(testlook!(look, "𝛃", 0));
2325 assert!(!testlook!(look, "𝛃", 4));
2326 assert!(!testlook!(look, "𝛃 ", 4));
2327 assert!(testlook!(look, " 𝛃 ", 1));
2328 assert!(!testlook!(look, " 𝛃 ", 5));
2329
2330 // Unicode word boundaries between non-ASCII codepoints.
2331 assert!(testlook!(look, "𝛃𐆀", 0));
2332 assert!(!testlook!(look, "𝛃𐆀", 4));
2333
2334 // Non word boundaries for ASCII.
2335 assert!(testlook!(look, "", 0));
2336 assert!(!testlook!(look, "ab", 1));
2337 assert!(testlook!(look, "a ", 2));
2338 assert!(testlook!(look, " a ", 0));
2339 assert!(testlook!(look, " a ", 3));
2340
2341 // Non word boundaries with a non-ASCII codepoint.
2342 assert!(!testlook!(look, "𝛃b", 4));
2343 assert!(!testlook!(look, "b𝛃", 1));
2344 assert!(testlook!(look, "𝛃 ", 5));
2345 assert!(testlook!(look, " 𝛃 ", 0));
2346 assert!(testlook!(look, " 𝛃 ", 6));
2347 assert!(!testlook!(look, "𝛃", 1));
2348 assert!(!testlook!(look, "𝛃", 2));
2349 assert!(!testlook!(look, "𝛃", 3));
2350
2351 // Non word boundaries with non-ASCII codepoints.
2352 assert!(!testlook!(look, "𝛃𐆀", 1));
2353 assert!(!testlook!(look, "𝛃𐆀", 2));
2354 assert!(!testlook!(look, "𝛃𐆀", 3));
2355 assert!(!testlook!(look, "𝛃𐆀", 5));
2356 assert!(!testlook!(look, "𝛃𐆀", 6));
2357 assert!(!testlook!(look, "𝛃𐆀", 7));
2358 assert!(testlook!(look, "𝛃𐆀", 8));
2359 }
2360
2361 #[test]
2362 #[cfg(all(not(miri), feature = "unicode-word-boundary"))]
2363 fn look_matches_word_end_half_unicode() {
2364 let look = Look::WordEndHalfUnicode;
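        // Same as the half ASCII word-end, except "word character" is
        // determined by Unicode's \w. Positions that split the UTF-8 encoding
        // of a codepoint never match.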
2365
2366 // \xF0\x9D\x9B\x83 = 𝛃 (in \w)
2367 // \xF0\x90\x86\x80 = 𐆀 (not in \w)
2368
2369 // Simple ASCII word boundaries.
2370 assert!(!testlook!(look, "a", 0));
2371 assert!(testlook!(look, "a", 1));
2372 assert!(testlook!(look, "a ", 1));
2373 assert!(!testlook!(look, " a ", 1));
2374 assert!(testlook!(look, " a ", 2));
2375
2376 // Unicode word boundaries with a non-ASCII codepoint.
2377 assert!(!testlook!(look, "𝛃", 0));
2378 assert!(testlook!(look, "𝛃", 4));
2379 assert!(testlook!(look, "𝛃 ", 4));
2380 assert!(!testlook!(look, " 𝛃 ", 1));
2381 assert!(testlook!(look, " 𝛃 ", 5));
2382
2383 // Unicode word boundaries between non-ASCII codepoints.
2384 assert!(!testlook!(look, "𝛃𐆀", 0));
2385 assert!(testlook!(look, "𝛃𐆀", 4));
2386
2387 // Non word boundaries for ASCII.
2388 assert!(testlook!(look, "", 0));
2389 assert!(!testlook!(look, "ab", 1));
2390 assert!(testlook!(look, "a ", 2));
2391 assert!(testlook!(look, " a ", 0));
2392 assert!(testlook!(look, " a ", 3));
2393
2394 // Non word boundaries with a non-ASCII codepoint.
2395 assert!(!testlook!(look, "𝛃b", 4));
2396 assert!(!testlook!(look, "b𝛃", 1));
2397 assert!(testlook!(look, "𝛃 ", 5));
2398 assert!(testlook!(look, " 𝛃 ", 0));
2399 assert!(testlook!(look, " 𝛃 ", 6));
2400 assert!(!testlook!(look, "𝛃", 1));
2401 assert!(!testlook!(look, "𝛃", 2));
2402 assert!(!testlook!(look, "𝛃", 3));
2403
2404 // Non word boundaries with non-ASCII codepoints.
2405 assert!(!testlook!(look, "𝛃𐆀", 1));
2406 assert!(!testlook!(look, "𝛃𐆀", 2));
2407 assert!(!testlook!(look, "𝛃𐆀", 3));
2408 assert!(!testlook!(look, "𝛃𐆀", 5));
2409 assert!(!testlook!(look, "𝛃𐆀", 6));
2410 assert!(!testlook!(look, "𝛃𐆀", 7));
2411 assert!(testlook!(look, "𝛃𐆀", 8));
2412 }
2413
2414 #[test]
2415 fn look_set() {
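        // LookSet's insert/remove return a new set instead of mutating in
        // place, which is why `f` is reassigned at each step below.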
2416 let mut f = LookSet::default();
2417 assert!(!f.contains(Look::Start));
2418 assert!(!f.contains(Look::End));
2419 assert!(!f.contains(Look::StartLF));
2420 assert!(!f.contains(Look::EndLF));
2421 assert!(!f.contains(Look::WordUnicode));
2422 assert!(!f.contains(Look::WordUnicodeNegate));
2423 assert!(!f.contains(Look::WordAscii));
2424 assert!(!f.contains(Look::WordAsciiNegate));
2425
2426 f = f.insert(Look::Start);
2427 assert!(f.contains(Look::Start));
2428 f = f.remove(Look::Start);
2429 assert!(!f.contains(Look::Start));
2430
2431 f = f.insert(Look::End);
2432 assert!(f.contains(Look::End));
2433 f = f.remove(Look::End);
2434 assert!(!f.contains(Look::End));
2435
2436 f = f.insert(Look::StartLF);
2437 assert!(f.contains(Look::StartLF));
2438 f = f.remove(Look::StartLF);
2439 assert!(!f.contains(Look::StartLF));
2440
2441 f = f.insert(Look::EndLF);
2442 assert!(f.contains(Look::EndLF));
2443 f = f.remove(Look::EndLF);
2444 assert!(!f.contains(Look::EndLF));
2445
2446 f = f.insert(Look::StartCRLF);
2447 assert!(f.contains(Look::StartCRLF));
2448 f = f.remove(Look::StartCRLF);
2449 assert!(!f.contains(Look::StartCRLF));
2450
2451 f = f.insert(Look::EndCRLF);
2452 assert!(f.contains(Look::EndCRLF));
2453 f = f.remove(Look::EndCRLF);
2454 assert!(!f.contains(Look::EndCRLF));
2455
2456 f = f.insert(Look::WordUnicode);
2457 assert!(f.contains(Look::WordUnicode));
2458 f = f.remove(Look::WordUnicode);
2459 assert!(!f.contains(Look::WordUnicode));
2460
2461 f = f.insert(Look::WordUnicodeNegate);
2462 assert!(f.contains(Look::WordUnicodeNegate));
2463 f = f.remove(Look::WordUnicodeNegate);
2464 assert!(!f.contains(Look::WordUnicodeNegate));
2465
2466 f = f.insert(Look::WordAscii);
2467 assert!(f.contains(Look::WordAscii));
2468 f = f.remove(Look::WordAscii);
2469 assert!(!f.contains(Look::WordAscii));
2470
2471 f = f.insert(Look::WordAsciiNegate);
2472 assert!(f.contains(Look::WordAsciiNegate));
2473 f = f.remove(Look::WordAsciiNegate);
2474 assert!(!f.contains(Look::WordAsciiNegate));
2475
2476 f = f.insert(Look::WordStartAscii);
2477 assert!(f.contains(Look::WordStartAscii));
2478 f = f.remove(Look::WordStartAscii);
2479 assert!(!f.contains(Look::WordStartAscii));
2480
2481 f = f.insert(Look::WordEndAscii);
2482 assert!(f.contains(Look::WordEndAscii));
2483 f = f.remove(Look::WordEndAscii);
2484 assert!(!f.contains(Look::WordEndAscii));
2485
2486 f = f.insert(Look::WordStartUnicode);
2487 assert!(f.contains(Look::WordStartUnicode));
2488 f = f.remove(Look::WordStartUnicode);
2489 assert!(!f.contains(Look::WordStartUnicode));
2490
2491 f = f.insert(Look::WordEndUnicode);
2492 assert!(f.contains(Look::WordEndUnicode));
2493 f = f.remove(Look::WordEndUnicode);
2494 assert!(!f.contains(Look::WordEndUnicode));
2495
2496 f = f.insert(Look::WordStartHalfAscii);
2497 assert!(f.contains(Look::WordStartHalfAscii));
2498 f = f.remove(Look::WordStartHalfAscii);
2499 assert!(!f.contains(Look::WordStartHalfAscii));
2500
2501 f = f.insert(Look::WordEndHalfAscii);
2502 assert!(f.contains(Look::WordEndHalfAscii));
2503 f = f.remove(Look::WordEndHalfAscii);
2504 assert!(!f.contains(Look::WordEndHalfAscii));
2505
2506 f = f.insert(Look::WordStartHalfUnicode);
2507 assert!(f.contains(Look::WordStartHalfUnicode));
2508 f = f.remove(Look::WordStartHalfUnicode);
2509 assert!(!f.contains(Look::WordStartHalfUnicode));
2510
2511 f = f.insert(Look::WordEndHalfUnicode);
2512 assert!(f.contains(Look::WordEndHalfUnicode));
2513 f = f.remove(Look::WordEndHalfUnicode);
2514 assert!(!f.contains(Look::WordEndHalfUnicode));
2515 }
2516
2517 #[test]
2518 fn look_set_iter() {
2519 let set = LookSet::empty();
2520 assert_eq!(0, set.iter().count());
2521
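        // The full set contains every assertion defined by `Look`, of which
        // there are currently 18.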
2522 let set = LookSet::full();
2523 assert_eq!(18, set.iter().count());
2524
2525 let set =
2526 LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode);
2527 assert_eq!(2, set.iter().count());
2528
2529 let set = LookSet::empty().insert(Look::StartLF);
2530 assert_eq!(1, set.iter().count());
2531
2532 let set = LookSet::empty().insert(Look::WordAsciiNegate);
2533 assert_eq!(1, set.iter().count());
2534
2535 let set = LookSet::empty().insert(Look::WordEndHalfUnicode);
2536 assert_eq!(1, set.iter().count());
2537 }
2538
2539 #[test]
2540 #[cfg(feature = "alloc")]
2541 fn look_set_debug() {
2542 let res = alloc::format!("{:?}", LookSet::empty());
2543 assert_eq!("∅", res);
2544 let res = alloc::format!("{:?}", LookSet::full());
2545 assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res);
2546 }
2547}
2548