1 | use std::char; |
2 | use std::cmp::Ordering; |
3 | use std::fmt; |
4 | use std::ops; |
5 | use std::u32; |
6 | |
7 | use crate::literal::LiteralSearcher; |
8 | use crate::prog::InstEmptyLook; |
9 | use crate::utf8::{decode_last_utf8, decode_utf8}; |
10 | |
11 | /// Represents a location in the input. |
12 | #[derive (Clone, Copy, Debug)] |
13 | pub struct InputAt { |
14 | pos: usize, |
15 | c: Char, |
16 | byte: Option<u8>, |
17 | len: usize, |
18 | } |
19 | |
20 | impl InputAt { |
21 | /// Returns true iff this position is at the beginning of the input. |
22 | pub fn is_start(&self) -> bool { |
23 | self.pos == 0 |
24 | } |
25 | |
26 | /// Returns true iff this position is past the end of the input. |
27 | pub fn is_end(&self) -> bool { |
28 | self.c.is_none() && self.byte.is_none() |
29 | } |
30 | |
31 | /// Returns the character at this position. |
32 | /// |
33 | /// If this position is just before or after the input, then an absent |
34 | /// character is returned. |
35 | pub fn char(&self) -> Char { |
36 | self.c |
37 | } |
38 | |
39 | /// Returns the byte at this position. |
40 | pub fn byte(&self) -> Option<u8> { |
41 | self.byte |
42 | } |
43 | |
44 | /// Returns the UTF-8 width of the character at this position. |
45 | pub fn len(&self) -> usize { |
46 | self.len |
47 | } |
48 | |
49 | /// Returns whether the UTF-8 width of the character at this position |
50 | /// is zero. |
51 | pub fn is_empty(&self) -> bool { |
52 | self.len == 0 |
53 | } |
54 | |
55 | /// Returns the byte offset of this position. |
56 | pub fn pos(&self) -> usize { |
57 | self.pos |
58 | } |
59 | |
60 | /// Returns the byte offset of the next position in the input. |
61 | pub fn next_pos(&self) -> usize { |
62 | self.pos + self.len |
63 | } |
64 | } |
65 | |
66 | /// An abstraction over input used in the matching engines. |
67 | pub trait Input: fmt::Debug { |
68 | /// Return an encoding of the position at byte offset `i`. |
69 | fn at(&self, i: usize) -> InputAt; |
70 | |
71 | /// Return the Unicode character occurring next to `at`. |
72 | /// |
73 | /// If no such character could be decoded, then `Char` is absent. |
74 | fn next_char(&self, at: InputAt) -> Char; |
75 | |
76 | /// Return the Unicode character occurring previous to `at`. |
77 | /// |
78 | /// If no such character could be decoded, then `Char` is absent. |
79 | fn previous_char(&self, at: InputAt) -> Char; |
80 | |
81 | /// Return true if the given empty width instruction matches at the |
82 | /// input position given. |
83 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; |
84 | |
85 | /// Scan the input for a matching prefix. |
86 | fn prefix_at( |
87 | &self, |
88 | prefixes: &LiteralSearcher, |
89 | at: InputAt, |
90 | ) -> Option<InputAt>; |
91 | |
92 | /// The number of bytes in the input. |
93 | fn len(&self) -> usize; |
94 | |
95 | /// Whether the input is empty. |
96 | fn is_empty(&self) -> bool { |
97 | self.len() == 0 |
98 | } |
99 | |
100 | /// Return the given input as a sequence of bytes. |
101 | fn as_bytes(&self) -> &[u8]; |
102 | } |
103 | |
104 | impl<'a, T: Input> Input for &'a T { |
105 | fn at(&self, i: usize) -> InputAt { |
106 | (**self).at(i) |
107 | } |
108 | |
109 | fn next_char(&self, at: InputAt) -> Char { |
110 | (**self).next_char(at) |
111 | } |
112 | |
113 | fn previous_char(&self, at: InputAt) -> Char { |
114 | (**self).previous_char(at) |
115 | } |
116 | |
117 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
118 | (**self).is_empty_match(at, empty) |
119 | } |
120 | |
121 | fn prefix_at( |
122 | &self, |
123 | prefixes: &LiteralSearcher, |
124 | at: InputAt, |
125 | ) -> Option<InputAt> { |
126 | (**self).prefix_at(prefixes, at) |
127 | } |
128 | |
129 | fn len(&self) -> usize { |
130 | (**self).len() |
131 | } |
132 | |
133 | fn as_bytes(&self) -> &[u8] { |
134 | (**self).as_bytes() |
135 | } |
136 | } |
137 | |
138 | /// An input reader over characters. |
139 | #[derive (Clone, Copy, Debug)] |
140 | pub struct CharInput<'t>(&'t [u8]); |
141 | |
142 | impl<'t> CharInput<'t> { |
143 | /// Return a new character input reader for the given string. |
144 | pub fn new(s: &'t [u8]) -> CharInput<'t> { |
145 | CharInput(s) |
146 | } |
147 | } |
148 | |
149 | impl<'t> ops::Deref for CharInput<'t> { |
150 | type Target = [u8]; |
151 | |
152 | fn deref(&self) -> &[u8] { |
153 | self.0 |
154 | } |
155 | } |
156 | |
157 | impl<'t> Input for CharInput<'t> { |
158 | fn at(&self, i: usize) -> InputAt { |
159 | if i >= self.len() { |
160 | InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } |
161 | } else { |
162 | let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); |
163 | InputAt { pos: i, c, byte: None, len: c.len_utf8() } |
164 | } |
165 | } |
166 | |
167 | fn next_char(&self, at: InputAt) -> Char { |
168 | at.char() |
169 | } |
170 | |
171 | fn previous_char(&self, at: InputAt) -> Char { |
172 | decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() |
173 | } |
174 | |
175 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
176 | use crate::prog::EmptyLook::*; |
177 | match empty.look { |
178 | StartLine => { |
179 | let c = self.previous_char(at); |
180 | at.pos() == 0 || c == ' \n' |
181 | } |
182 | EndLine => { |
183 | let c = self.next_char(at); |
184 | at.pos() == self.len() || c == ' \n' |
185 | } |
186 | StartText => at.pos() == 0, |
187 | EndText => at.pos() == self.len(), |
188 | WordBoundary => { |
189 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
190 | c1.is_word_char() != c2.is_word_char() |
191 | } |
192 | NotWordBoundary => { |
193 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
194 | c1.is_word_char() == c2.is_word_char() |
195 | } |
196 | WordBoundaryAscii => { |
197 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
198 | c1.is_word_byte() != c2.is_word_byte() |
199 | } |
200 | NotWordBoundaryAscii => { |
201 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
202 | c1.is_word_byte() == c2.is_word_byte() |
203 | } |
204 | } |
205 | } |
206 | |
207 | fn prefix_at( |
208 | &self, |
209 | prefixes: &LiteralSearcher, |
210 | at: InputAt, |
211 | ) -> Option<InputAt> { |
212 | prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) |
213 | } |
214 | |
215 | fn len(&self) -> usize { |
216 | self.0.len() |
217 | } |
218 | |
219 | fn as_bytes(&self) -> &[u8] { |
220 | self.0 |
221 | } |
222 | } |
223 | |
224 | /// An input reader over bytes. |
225 | #[derive (Clone, Copy, Debug)] |
226 | pub struct ByteInput<'t> { |
227 | text: &'t [u8], |
228 | only_utf8: bool, |
229 | } |
230 | |
231 | impl<'t> ByteInput<'t> { |
232 | /// Return a new byte-based input reader for the given string. |
233 | pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { |
234 | ByteInput { text, only_utf8 } |
235 | } |
236 | } |
237 | |
238 | impl<'t> ops::Deref for ByteInput<'t> { |
239 | type Target = [u8]; |
240 | |
241 | fn deref(&self) -> &[u8] { |
242 | self.text |
243 | } |
244 | } |
245 | |
246 | impl<'t> Input for ByteInput<'t> { |
247 | fn at(&self, i: usize) -> InputAt { |
248 | if i >= self.len() { |
249 | InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } |
250 | } else { |
251 | InputAt { |
252 | pos: i, |
253 | c: None.into(), |
254 | byte: self.get(i).cloned(), |
255 | len: 1, |
256 | } |
257 | } |
258 | } |
259 | |
260 | fn next_char(&self, at: InputAt) -> Char { |
261 | decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() |
262 | } |
263 | |
264 | fn previous_char(&self, at: InputAt) -> Char { |
265 | decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() |
266 | } |
267 | |
268 | fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { |
269 | use crate::prog::EmptyLook::*; |
270 | match empty.look { |
271 | StartLine => { |
272 | let c = self.previous_char(at); |
273 | at.pos() == 0 || c == ' \n' |
274 | } |
275 | EndLine => { |
276 | let c = self.next_char(at); |
277 | at.pos() == self.len() || c == ' \n' |
278 | } |
279 | StartText => at.pos() == 0, |
280 | EndText => at.pos() == self.len(), |
281 | WordBoundary => { |
282 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
283 | c1.is_word_char() != c2.is_word_char() |
284 | } |
285 | NotWordBoundary => { |
286 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
287 | c1.is_word_char() == c2.is_word_char() |
288 | } |
289 | WordBoundaryAscii => { |
290 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
291 | if self.only_utf8 { |
292 | // If we must match UTF-8, then we can't match word |
293 | // boundaries at invalid UTF-8. |
294 | if c1.is_none() && !at.is_start() { |
295 | return false; |
296 | } |
297 | if c2.is_none() && !at.is_end() { |
298 | return false; |
299 | } |
300 | } |
301 | c1.is_word_byte() != c2.is_word_byte() |
302 | } |
303 | NotWordBoundaryAscii => { |
304 | let (c1, c2) = (self.previous_char(at), self.next_char(at)); |
305 | if self.only_utf8 { |
306 | // If we must match UTF-8, then we can't match word |
307 | // boundaries at invalid UTF-8. |
308 | if c1.is_none() && !at.is_start() { |
309 | return false; |
310 | } |
311 | if c2.is_none() && !at.is_end() { |
312 | return false; |
313 | } |
314 | } |
315 | c1.is_word_byte() == c2.is_word_byte() |
316 | } |
317 | } |
318 | } |
319 | |
320 | fn prefix_at( |
321 | &self, |
322 | prefixes: &LiteralSearcher, |
323 | at: InputAt, |
324 | ) -> Option<InputAt> { |
325 | prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) |
326 | } |
327 | |
328 | fn len(&self) -> usize { |
329 | self.text.len() |
330 | } |
331 | |
332 | fn as_bytes(&self) -> &[u8] { |
333 | self.text |
334 | } |
335 | } |
336 | |
337 | /// An inline representation of `Option<char>`. |
338 | /// |
339 | /// This eliminates the need to do case analysis on `Option<char>` to determine |
340 | /// ordinality with other characters. |
341 | /// |
342 | /// (The `Option<char>` is not related to encoding. Instead, it is used in the |
343 | /// matching engines to represent the beginning and ending boundaries of the |
344 | /// search text.) |
345 | #[derive (Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] |
346 | pub struct Char(u32); |
347 | |
348 | impl fmt::Debug for Char { |
349 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
350 | match char::from_u32(self.0) { |
351 | None => write!(f, "Empty" ), |
352 | Some(c: char) => write!(f, " {:?}" , c), |
353 | } |
354 | } |
355 | } |
356 | |
357 | impl Char { |
358 | /// Returns true iff the character is absent. |
359 | #[inline ] |
360 | pub fn is_none(self) -> bool { |
361 | self.0 == u32::MAX |
362 | } |
363 | |
364 | /// Returns the length of the character's UTF-8 encoding. |
365 | /// |
366 | /// If the character is absent, then `1` is returned. |
367 | #[inline ] |
368 | pub fn len_utf8(self) -> usize { |
369 | char::from_u32(self.0).map_or(1, |c| c.len_utf8()) |
370 | } |
371 | |
372 | /// Returns true iff the character is a word character. |
373 | /// |
374 | /// If the character is absent, then false is returned. |
375 | pub fn is_word_char(self) -> bool { |
376 | // is_word_character can panic if the Unicode data for \w isn't |
377 | // available. However, our compiler ensures that if a Unicode word |
378 | // boundary is used, then the data must also be available. If it isn't, |
379 | // then the compiler returns an error. |
380 | char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) |
381 | } |
382 | |
383 | /// Returns true iff the byte is a word byte. |
384 | /// |
385 | /// If the byte is absent, then false is returned. |
386 | pub fn is_word_byte(self) -> bool { |
387 | match char::from_u32(self.0) { |
388 | Some(c) if c <= ' \u{7F}' => regex_syntax::is_word_byte(c as u8), |
389 | None | Some(_) => false, |
390 | } |
391 | } |
392 | } |
393 | |
394 | impl From<char> for Char { |
395 | fn from(c: char) -> Char { |
396 | Char(c as u32) |
397 | } |
398 | } |
399 | |
400 | impl From<Option<char>> for Char { |
401 | fn from(c: Option<char>) -> Char { |
402 | c.map_or(default:Char(u32::MAX), |c: char| c.into()) |
403 | } |
404 | } |
405 | |
406 | impl PartialEq<char> for Char { |
407 | #[inline ] |
408 | fn eq(&self, other: &char) -> bool { |
409 | self.0 == *other as u32 |
410 | } |
411 | } |
412 | |
413 | impl PartialEq<Char> for char { |
414 | #[inline ] |
415 | fn eq(&self, other: &Char) -> bool { |
416 | *self as u32 == other.0 |
417 | } |
418 | } |
419 | |
420 | impl PartialOrd<char> for Char { |
421 | #[inline ] |
422 | fn partial_cmp(&self, other: &char) -> Option<Ordering> { |
423 | self.0.partial_cmp(&(*other as u32)) |
424 | } |
425 | } |
426 | |
427 | impl PartialOrd<Char> for char { |
428 | #[inline ] |
429 | fn partial_cmp(&self, other: &Char) -> Option<Ordering> { |
430 | (*self as u32).partial_cmp(&other.0) |
431 | } |
432 | } |
433 | |