| 1 | /*! |
| 2 | TODO |
| 3 | */ |
| 4 | |
| 5 | use core::{ascii, fmt, str}; |
| 6 | |
| 7 | #[cfg (feature = "alloc" )] |
| 8 | use alloc::vec::Vec; |
| 9 | |
| 10 | pub mod alphabet; |
| 11 | pub(crate) mod bytes; |
| 12 | #[cfg (feature = "alloc" )] |
| 13 | pub(crate) mod determinize; |
| 14 | pub mod id; |
| 15 | #[cfg (feature = "alloc" )] |
| 16 | pub(crate) mod lazy; |
| 17 | pub(crate) mod matchtypes; |
| 18 | pub mod prefilter; |
| 19 | #[cfg (feature = "alloc" )] |
| 20 | pub(crate) mod sparse_set; |
| 21 | pub(crate) mod start; |
| 22 | #[cfg (feature = "alloc" )] |
| 23 | pub(crate) mod syntax; |
| 24 | |
| 25 | /// The offset, in bytes, that a match is delayed by in the DFAs generated by |
| 26 | /// this crate. (This includes lazy DFAs.) |
| 27 | /// |
| 28 | /// The purpose of this delay is to support look-ahead such as \b (ASCII-only) |
| 29 | /// and $. In particular, both of these operators may require the |
| 30 | /// identification of the end of input in order to confirm a match. Not only |
| 31 | /// does this mean that all matches must therefore be delayed by a single byte, |
| 32 | /// but that a special EOI value is added to the alphabet of all DFAs. (Which |
| 33 | /// means that even though the alphabet of a DFA is typically all byte values, |
| 34 | /// the actual maximum alphabet size is 257 due to the extra EOI value.) |
| 35 | /// |
| 36 | /// Since we delay matches by only 1 byte, this can't fully support a |
| 37 | /// Unicode-aware \b operator, which requires multi-byte look-ahead. Indeed, |
| 38 | /// DFAs in this crate do not support it. (It's not as simple as just |
| 39 | /// increasing the match offset to do it---otherwise we would---but building |
| 40 | /// the full Unicode-aware word boundary detection into an automaton is quite |
| 41 | /// tricky.) |
| 42 | pub(crate) const MATCH_OFFSET: usize = 1; |
| 43 | |
| 44 | /// A type that wraps a single byte with a convenient fmt::Debug impl that |
| 45 | /// escapes the byte. |
| 46 | pub(crate) struct DebugByte(pub u8); |
| 47 | |
| 48 | impl fmt::Debug for DebugByte { |
| 49 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 50 | // 10 bytes is enough to cover any output from ascii::escape_default. |
| 51 | let mut bytes: [u8; 10] = [0u8; 10]; |
| 52 | let mut len: usize = 0; |
| 53 | for (i: usize, mut b: u8) in ascii::escape_default(self.0).enumerate() { |
| 54 | // capitalize \xab to \xAB |
| 55 | if i >= 2 && b'a' <= b && b <= b'f' { |
| 56 | b -= 32; |
| 57 | } |
| 58 | bytes[len] = b; |
| 59 | len += 1; |
| 60 | } |
| 61 | write!(f, " {}" , str::from_utf8(&bytes[..len]).unwrap()) |
| 62 | } |
| 63 | } |
| 64 | |
| 65 | /// Returns the smallest possible index of the next valid UTF-8 sequence |
| 66 | /// starting after `i`. |
| 67 | /// |
| 68 | /// For all inputs, including invalid UTF-8 and any value of `i`, the return |
| 69 | /// value is guaranteed to be greater than `i`. |
| 70 | /// |
| 71 | /// Generally speaking, this should only be called on `text` when it is |
| 72 | /// permitted to assume that it is valid UTF-8 and where either `i >= |
| 73 | /// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. |
| 74 | #[inline (always)] |
| 75 | pub(crate) fn next_utf8(text: &[u8], i: usize) -> usize { |
| 76 | let b: u8 = match text.get(index:i) { |
| 77 | None => return i.checked_add(1).unwrap(), |
| 78 | Some(&b: u8) => b, |
| 79 | }; |
| 80 | // For cases where we see an invalid UTF-8 byte, there isn't much we can do |
| 81 | // other than just start at the next byte. |
| 82 | let inc: usize = utf8_len(b).unwrap_or(default:1); |
| 83 | i.checked_add(inc).unwrap() |
| 84 | } |
| 85 | |
| 86 | /// Returns true if and only if the given byte is considered a word character. |
| 87 | /// This only applies to ASCII. |
| 88 | /// |
| 89 | /// This was copied from regex-syntax so that we can use it to determine the |
| 90 | /// starting DFA state while searching without depending on regex-syntax. The |
| 91 | /// definition is never going to change, so there's no maintenance/bit-rot |
| 92 | /// hazard here. |
| 93 | #[inline (always)] |
| 94 | pub(crate) fn is_word_byte(b: u8) -> bool { |
| 95 | match b { |
| 96 | b'_' | b'0' ..=b'9' | b'a' ..=b'z' | b'A' ..=b'Z' => true, |
| 97 | _ => false, |
| 98 | } |
| 99 | } |
| 100 | |
| 101 | /// Decodes the next UTF-8 encoded codepoint from the given byte slice. |
| 102 | /// |
| 103 | /// If no valid encoding of a codepoint exists at the beginning of the given |
| 104 | /// byte slice, then the first byte is returned instead. |
| 105 | /// |
| 106 | /// This returns `None` if and only if `bytes` is empty. |
| 107 | #[inline (always)] |
| 108 | pub(crate) fn decode_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { |
| 109 | if bytes.is_empty() { |
| 110 | return None; |
| 111 | } |
| 112 | let len: usize = match utf8_len(byte:bytes[0]) { |
| 113 | None => return Some(Err(bytes[0])), |
| 114 | Some(len: usize) if len > bytes.len() => return Some(Err(bytes[0])), |
| 115 | Some(1) => return Some(Ok(bytes[0] as char)), |
| 116 | Some(len: usize) => len, |
| 117 | }; |
| 118 | match str::from_utf8(&bytes[..len]) { |
| 119 | Ok(s: &str) => Some(Ok(s.chars().next().unwrap())), |
| 120 | Err(_) => Some(Err(bytes[0])), |
| 121 | } |
| 122 | } |
| 123 | |
| 124 | /// Decodes the last UTF-8 encoded codepoint from the given byte slice. |
| 125 | /// |
| 126 | /// If no valid encoding of a codepoint exists at the end of the given byte |
| 127 | /// slice, then the last byte is returned instead. |
| 128 | /// |
| 129 | /// This returns `None` if and only if `bytes` is empty. |
| 130 | #[inline (always)] |
| 131 | pub(crate) fn decode_last_utf8(bytes: &[u8]) -> Option<Result<char, u8>> { |
| 132 | if bytes.is_empty() { |
| 133 | return None; |
| 134 | } |
| 135 | let mut start: usize = bytes.len() - 1; |
| 136 | let limit: usize = bytes.len().saturating_sub(4); |
| 137 | while start > limit && !is_leading_or_invalid_utf8_byte(bytes[start]) { |
| 138 | start -= 1; |
| 139 | } |
| 140 | match decode_utf8(&bytes[start..]) { |
| 141 | None => None, |
| 142 | Some(Ok(ch: char)) => Some(Ok(ch)), |
| 143 | Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), |
| 144 | } |
| 145 | } |
| 146 | |
| 147 | /// Given a UTF-8 leading byte, this returns the total number of code units |
| 148 | /// in the following encoded codepoint. |
| 149 | /// |
| 150 | /// If the given byte is not a valid UTF-8 leading byte, then this returns |
| 151 | /// `None`. |
| 152 | #[inline (always)] |
| 153 | fn utf8_len(byte: u8) -> Option<usize> { |
| 154 | if byte <= 0x7F { |
| 155 | return Some(1); |
| 156 | } else if byte & 0b1100_0000 == 0b1000_0000 { |
| 157 | return None; |
| 158 | } else if byte <= 0b1101_1111 { |
| 159 | Some(2) |
| 160 | } else if byte <= 0b1110_1111 { |
| 161 | Some(3) |
| 162 | } else if byte <= 0b1111_0111 { |
| 163 | Some(4) |
| 164 | } else { |
| 165 | None |
| 166 | } |
| 167 | } |
| 168 | |
| 169 | /// Returns true if and only if the given byte is either a valid leading UTF-8 |
| 170 | /// byte, or is otherwise an invalid byte that can never appear anywhere in a |
| 171 | /// valid UTF-8 sequence. |
| 172 | #[inline (always)] |
| 173 | fn is_leading_or_invalid_utf8_byte(b: u8) -> bool { |
| 174 | // In the ASCII case, the most significant bit is never set. The leading |
| 175 | // byte of a 2/3/4-byte sequence always has the top two most significant |
| 176 | // bits set. For bytes that can never appear anywhere in valid UTF-8, this |
| 177 | // also returns true, since every such byte has its two most significant |
| 178 | // bits set: |
| 179 | // |
| 180 | // \xC0 :: 11000000 |
| 181 | // \xC1 :: 11000001 |
| 182 | // \xF5 :: 11110101 |
| 183 | // \xF6 :: 11110110 |
| 184 | // \xF7 :: 11110111 |
| 185 | // \xF8 :: 11111000 |
| 186 | // \xF9 :: 11111001 |
| 187 | // \xFA :: 11111010 |
| 188 | // \xFB :: 11111011 |
| 189 | // \xFC :: 11111100 |
| 190 | // \xFD :: 11111101 |
| 191 | // \xFE :: 11111110 |
| 192 | // \xFF :: 11111111 |
| 193 | (b & 0b1100_0000) != 0b1000_0000 |
| 194 | } |
| 195 | |
| 196 | #[cfg (feature = "alloc" )] |
| 197 | #[inline (always)] |
| 198 | pub(crate) fn is_word_char_fwd(bytes: &[u8], mut at: usize) -> bool { |
| 199 | use core::{ptr, sync::atomic::AtomicPtr}; |
| 200 | |
| 201 | use crate::{ |
| 202 | dfa::{ |
| 203 | dense::{self, DFA}, |
| 204 | Automaton, |
| 205 | }, |
| 206 | util::lazy, |
| 207 | }; |
| 208 | |
| 209 | static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); |
| 210 | |
| 211 | let dfa = lazy::get_or_init(&WORD, || { |
| 212 | // TODO: Should we use a lazy DFA here instead? It does complicate |
| 213 | // things somewhat, since we then need a mutable cache, which probably |
| 214 | // means a thread local. |
| 215 | dense::Builder::new() |
| 216 | .configure(dense::Config::new().anchored(true)) |
| 217 | .build(r"\w" ) |
| 218 | .unwrap() |
| 219 | }); |
| 220 | // This is OK since '\w' contains no look-around. |
| 221 | let mut sid = dfa.universal_start_state(); |
| 222 | while at < bytes.len() { |
| 223 | let byte = bytes[at]; |
| 224 | sid = dfa.next_state(sid, byte); |
| 225 | at += 1; |
| 226 | if dfa.is_special_state(sid) { |
| 227 | if dfa.is_match_state(sid) { |
| 228 | return true; |
| 229 | } else if dfa.is_dead_state(sid) { |
| 230 | return false; |
| 231 | } |
| 232 | } |
| 233 | } |
| 234 | dfa.is_match_state(dfa.next_eoi_state(sid)) |
| 235 | } |
| 236 | |
| 237 | #[cfg (feature = "alloc" )] |
| 238 | #[inline (always)] |
| 239 | pub(crate) fn is_word_char_rev(bytes: &[u8], mut at: usize) -> bool { |
| 240 | use core::{ptr, sync::atomic::AtomicPtr}; |
| 241 | |
| 242 | use crate::{ |
| 243 | dfa::{ |
| 244 | dense::{self, DFA}, |
| 245 | Automaton, |
| 246 | }, |
| 247 | nfa::thompson::NFA, |
| 248 | }; |
| 249 | |
| 250 | static WORD: AtomicPtr<DFA<Vec<u32>>> = AtomicPtr::new(ptr::null_mut()); |
| 251 | |
| 252 | let dfa = lazy::get_or_init(&WORD, || { |
| 253 | dense::Builder::new() |
| 254 | .configure(dense::Config::new().anchored(true)) |
| 255 | .thompson(NFA::config().reverse(true).shrink(true)) |
| 256 | .build(r"\w" ) |
| 257 | .unwrap() |
| 258 | }); |
| 259 | |
| 260 | // This is OK since '\w' contains no look-around. |
| 261 | let mut sid = dfa.universal_start_state(); |
| 262 | while at > 0 { |
| 263 | at -= 1; |
| 264 | let byte = bytes[at]; |
| 265 | sid = dfa.next_state(sid, byte); |
| 266 | if dfa.is_special_state(sid) { |
| 267 | if dfa.is_match_state(sid) { |
| 268 | return true; |
| 269 | } else if dfa.is_dead_state(sid) { |
| 270 | return false; |
| 271 | } |
| 272 | } |
| 273 | } |
| 274 | dfa.is_match_state(dfa.next_eoi_state(sid)) |
| 275 | } |
| 276 | |