1/*!
2TODO
3*/
4
5use core::{ascii, fmt, str};
6
7#[cfg(feature = "alloc")]
8use alloc::vec::Vec;
9
10pub mod alphabet;
11pub(crate) mod bytes;
12#[cfg(feature = "alloc")]
13pub(crate) mod determinize;
14pub mod id;
15#[cfg(feature = "alloc")]
16pub(crate) mod lazy;
17pub(crate) mod matchtypes;
18pub mod prefilter;
19#[cfg(feature = "alloc")]
20pub(crate) mod sparse_set;
21pub(crate) mod start;
22#[cfg(feature = "alloc")]
23pub(crate) mod syntax;
24
25/// The offset, in bytes, that a match is delayed by in the DFAs generated by
26/// this crate. (This includes lazy DFAs.)
27///
28/// The purpose of this delay is to support look-ahead such as \b (ASCII-only)
29/// and $. In particular, both of these operators may require the
30/// identification of the end of input in order to confirm a match. Not only
31/// does this mean that all matches must therefore be delayed by a single byte,
32/// but that a special EOI value is added to the alphabet of all DFAs. (Which
33/// means that even though the alphabet of a DFA is typically all byte values,
34/// the actual maximum alphabet size is 257 due to the extra EOI value.)
35///
36/// Since we delay matches by only 1 byte, this can't fully support a
37/// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed,
38/// DFAs in this crate do not support it. (It's not as simple as just
39/// increasing the match offset to do it---otherwise we would---but building
40/// the full Unicode-aware word boundary detection into an automaton is quite
41/// tricky.)
42pub(crate) const MATCH_OFFSET: usize = 1;
43
44/// A type that wraps a single byte with a convenient fmt::Debug impl that
45/// escapes the byte.
46pub(crate) struct DebugByte(pub u8);
47
48impl fmt::Debug for DebugByte {
49 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
50 // 10 bytes is enough to cover any output from ascii::escape_default.
51 let mut bytes: [u8; 10] = [0u8; 10];
52 let mut len: usize = 0;
53 for (i: usize, mut b: u8) in ascii::escape_default(self.0).enumerate() {
54 // capitalize \xab to \xAB
55 if i >= 2 && b'a' <= b && b <= b'f' {
56 b -= 32;
57 }
58 bytes[len] = b;
59 len += 1;
60 }
61 write!(f, "{}", str::from_utf8(&bytes[..len]).unwrap())
62 }
63}
64
65/// Returns the smallest possible index of the next valid UTF-8 sequence
66/// starting after `i`.
67///
68/// For all inputs, including invalid UTF-8 and any value of `i`, the return
69/// value is guaranteed to be greater than `i`.
70///
71/// Generally speaking, this should only be called on `text` when it is
72/// permitted to assume that it is valid UTF-8 and where either `i >=
73/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
74#[inline(always)]
75pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize {
76 let b: u8 = match text.get(index:i) {
77 None => return i.checked_add(1).unwrap(),
78 Some(&b: u8) => b,
79 };
80 // For cases where we see an invalid UTF-8 byte, there isn't much we can do
81 // other than just start at the next byte.
82 let inc: usize = utf8_len(b).unwrap_or(default:1);
83 i.checked_add(inc).unwrap()
84}
85
86/// Returns true if and only if the given byte is considered a word character.
87/// This only applies to ASCII.
88///
89/// This was copied from regex-syntax so that we can use it to determine the
90/// starting DFA state while searching without depending on regex-syntax. The
91/// definition is never going to change, so there's no maintenance/bit-rot
92/// hazard here.
93#[inline(always)]
94pub(crate) fn is_word_byte(b: u8) -> bool {
95 match b {
96 b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true,
97 _ => false,
98 }
99}
100
101/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
102///
103/// If no valid encoding of a codepoint exists at the beginning of the given
104/// byte slice, then the first byte is returned instead.
105///
106/// This returns `None` if and only if `bytes` is empty.
107#[inline(always)]
108pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
109 if bytes.is_empty() {
110 return None;
111 }
112 let len: usize = match utf8_len(byte:bytes[0]) {
113 None => return Some(Err(bytes[0])),
114 Some(len: usize) if len > bytes.len() => return Some(Err(bytes[0])),
115 Some(1) => return Some(Ok(bytes[0] as char)),
116 Some(len: usize) => len,
117 };
118 match str::from_utf8(&bytes[..len]) {
119 Ok(s: &str) => Some(Ok(s.chars().next().unwrap())),
120 Err(_) => Some(Err(bytes[0])),
121 }
122}
123
124/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
125///
126/// If no valid encoding of a codepoint exists at the end of the given byte
127/// slice, then the last byte is returned instead.
128///
129/// This returns `None` if and only if `bytes` is empty.
130#[inline(always)]
131pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> {
132 if bytes.is_empty() {
133 return None;
134 }
135 let mut start: usize = bytes.len() - 1;
136 let limit: usize = bytes.len().saturating_sub(4);
137 while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) {
138 start -= 1;
139 }
140 match decode_utf8(&bytes[start..]) {
141 None => None,
142 Some(Ok(ch: char)) => Some(Ok(ch)),
143 Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
144 }
145}
146
147/// Given a UTF-8 leading byte, this returns the total number of code units
148/// in the following encoded codepoint.
149///
150/// If the given byte is not a valid UTF-8 leading byte, then this returns
151/// `None`.
152#[inline(always)]
153fn utf8_len(byte: u8) -> Option<usize> {
154 if byte <= 0x7F {
155 return Some(1);
156 } else if byte & 0b1100_0000 == 0b1000_0000 {
157 return None;
158 } else if byte <= 0b1101_1111 {
159 Some(2)
160 } else if byte <= 0b1110_1111 {
161 Some(3)
162 } else if byte <= 0b1111_0111 {
163 Some(4)
164 } else {
165 None
166 }
167}
168
169/// Returns true if and only if the given byte is either a valid leading UTF-8
170/// byte, or is otherwise an invalid byte that can never appear anywhere in a
171/// valid UTF-8 sequence.
172#[inline(always)]
173fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
174 // In the ASCII case, the most significant bit is never set. The leading
175 // byte of a 2/3/4-byte sequence always has the top two most significant
176 // bits set. For bytes that can never appear anywhere in valid UTF-8, this
177 // also returns true, since every such byte has its two most significant
178 // bits set:
179 //
180 // \xC0 :: 11000000
181 // \xC1 :: 11000001
182 // \xF5 :: 11110101
183 // \xF6 :: 11110110
184 // \xF7 :: 11110111
185 // \xF8 :: 11111000
186 // \xF9 :: 11111001
187 // \xFA :: 11111010
188 // \xFB :: 11111011
189 // \xFC :: 11111100
190 // \xFD :: 11111101
191 // \xFE :: 11111110
192 // \xFF :: 11111111
193 (b & 0b1100_0000) != 0b1000_0000
194}
195
196#[cfg(feature = "alloc")]
197#[inline(always)]
198pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool {
199 use core::{ptr, sync::atomic::AtomicPtr};
200
201 use crate::{
202 dfa::{
203 dense::{self, DFA},
204 Automaton,
205 },
206 util::lazy,
207 };
208
209 static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
210
211 let dfa = lazy::get_or_init(&WORD, || {
212 // TODO: Should we use a lazy DFA here instead? It does complicate
213 // things somewhat, since we then need a mutable cache, which probably
214 // means a thread local.
215 dense::Builder::new()
216 .configure(dense::Config::new().anchored(true))
217 .build(r"\w")
218 .unwrap()
219 });
220 // This is OK since '\w' contains no look-around.
221 let mut sid = dfa.universal_start_state();
222 while at < bytes.len() {
223 let byte = bytes[at];
224 sid = dfa.next_state(sid, byte);
225 at += 1;
226 if dfa.is_special_state(sid) {
227 if dfa.is_match_state(sid) {
228 return true;
229 } else if dfa.is_dead_state(sid) {
230 return false;
231 }
232 }
233 }
234 dfa.is_match_state(dfa.next_eoi_state(sid))
235}
236
237#[cfg(feature = "alloc")]
238#[inline(always)]
239pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool {
240 use core::{ptr, sync::atomic::AtomicPtr};
241
242 use crate::{
243 dfa::{
244 dense::{self, DFA},
245 Automaton,
246 },
247 nfa::thompson::NFA,
248 };
249
250 static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut());
251
252 let dfa = lazy::get_or_init(&WORD, || {
253 dense::Builder::new()
254 .configure(dense::Config::new().anchored(true))
255 .thompson(NFA::config().reverse(true).shrink(true))
256 .build(r"\w")
257 .unwrap()
258 });
259
260 // This is OK since '\w' contains no look-around.
261 let mut sid = dfa.universal_start_state();
262 while at > 0 {
263 at -= 1;
264 let byte = bytes[at];
265 sid = dfa.next_state(sid, byte);
266 if dfa.is_special_state(sid) {
267 if dfa.is_match_state(sid) {
268 return true;
269 } else if dfa.is_dead_state(sid) {
270 return false;
271 }
272 }
273 }
274 dfa.is_match_state(dfa.next_eoi_state(sid))
275}
276