1use std::char;
2use std::cmp::Ordering;
3use std::fmt;
4use std::ops;
5use std::u32;
6
7use crate::literal::LiteralSearcher;
8use crate::prog::InstEmptyLook;
9use crate::utf8::{decode_last_utf8, decode_utf8};
10
11/// Represents a location in the input.
12#[derive(Clone, Copy, Debug)]
13pub struct InputAt {
14 pos: usize,
15 c: Char,
16 byte: Option<u8>,
17 len: usize,
18}
19
20impl InputAt {
21 /// Returns true iff this position is at the beginning of the input.
22 pub fn is_start(&self) -> bool {
23 self.pos == 0
24 }
25
26 /// Returns true iff this position is past the end of the input.
27 pub fn is_end(&self) -> bool {
28 self.c.is_none() && self.byte.is_none()
29 }
30
31 /// Returns the character at this position.
32 ///
33 /// If this position is just before or after the input, then an absent
34 /// character is returned.
35 pub fn char(&self) -> Char {
36 self.c
37 }
38
39 /// Returns the byte at this position.
40 pub fn byte(&self) -> Option<u8> {
41 self.byte
42 }
43
44 /// Returns the UTF-8 width of the character at this position.
45 pub fn len(&self) -> usize {
46 self.len
47 }
48
49 /// Returns whether the UTF-8 width of the character at this position
50 /// is zero.
51 pub fn is_empty(&self) -> bool {
52 self.len == 0
53 }
54
55 /// Returns the byte offset of this position.
56 pub fn pos(&self) -> usize {
57 self.pos
58 }
59
60 /// Returns the byte offset of the next position in the input.
61 pub fn next_pos(&self) -> usize {
62 self.pos + self.len
63 }
64}
65
66/// An abstraction over input used in the matching engines.
67pub trait Input: fmt::Debug {
68 /// Return an encoding of the position at byte offset `i`.
69 fn at(&self, i: usize) -> InputAt;
70
71 /// Return the Unicode character occurring next to `at`.
72 ///
73 /// If no such character could be decoded, then `Char` is absent.
74 fn next_char(&self, at: InputAt) -> Char;
75
76 /// Return the Unicode character occurring previous to `at`.
77 ///
78 /// If no such character could be decoded, then `Char` is absent.
79 fn previous_char(&self, at: InputAt) -> Char;
80
81 /// Return true if the given empty width instruction matches at the
82 /// input position given.
83 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
84
85 /// Scan the input for a matching prefix.
86 fn prefix_at(
87 &self,
88 prefixes: &LiteralSearcher,
89 at: InputAt,
90 ) -> Option<InputAt>;
91
92 /// The number of bytes in the input.
93 fn len(&self) -> usize;
94
95 /// Whether the input is empty.
96 fn is_empty(&self) -> bool {
97 self.len() == 0
98 }
99
100 /// Return the given input as a sequence of bytes.
101 fn as_bytes(&self) -> &[u8];
102}
103
104impl<'a, T: Input> Input for &'a T {
105 fn at(&self, i: usize) -> InputAt {
106 (**self).at(i)
107 }
108
109 fn next_char(&self, at: InputAt) -> Char {
110 (**self).next_char(at)
111 }
112
113 fn previous_char(&self, at: InputAt) -> Char {
114 (**self).previous_char(at)
115 }
116
117 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
118 (**self).is_empty_match(at, empty)
119 }
120
121 fn prefix_at(
122 &self,
123 prefixes: &LiteralSearcher,
124 at: InputAt,
125 ) -> Option<InputAt> {
126 (**self).prefix_at(prefixes, at)
127 }
128
129 fn len(&self) -> usize {
130 (**self).len()
131 }
132
133 fn as_bytes(&self) -> &[u8] {
134 (**self).as_bytes()
135 }
136}
137
138/// An input reader over characters.
139#[derive(Clone, Copy, Debug)]
140pub struct CharInput<'t>(&'t [u8]);
141
142impl<'t> CharInput<'t> {
143 /// Return a new character input reader for the given string.
144 pub fn new(s: &'t [u8]) -> CharInput<'t> {
145 CharInput(s)
146 }
147}
148
149impl<'t> ops::Deref for CharInput<'t> {
150 type Target = [u8];
151
152 fn deref(&self) -> &[u8] {
153 self.0
154 }
155}
156
157impl<'t> Input for CharInput<'t> {
158 fn at(&self, i: usize) -> InputAt {
159 if i >= self.len() {
160 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
161 } else {
162 let c = decode_utf8(&self[i..]).map(|(c, _)| c).into();
163 InputAt { pos: i, c, byte: None, len: c.len_utf8() }
164 }
165 }
166
167 fn next_char(&self, at: InputAt) -> Char {
168 at.char()
169 }
170
171 fn previous_char(&self, at: InputAt) -> Char {
172 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
173 }
174
175 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
176 use crate::prog::EmptyLook::*;
177 match empty.look {
178 StartLine => {
179 let c = self.previous_char(at);
180 at.pos() == 0 || c == '\n'
181 }
182 EndLine => {
183 let c = self.next_char(at);
184 at.pos() == self.len() || c == '\n'
185 }
186 StartText => at.pos() == 0,
187 EndText => at.pos() == self.len(),
188 WordBoundary => {
189 let (c1, c2) = (self.previous_char(at), self.next_char(at));
190 c1.is_word_char() != c2.is_word_char()
191 }
192 NotWordBoundary => {
193 let (c1, c2) = (self.previous_char(at), self.next_char(at));
194 c1.is_word_char() == c2.is_word_char()
195 }
196 WordBoundaryAscii => {
197 let (c1, c2) = (self.previous_char(at), self.next_char(at));
198 c1.is_word_byte() != c2.is_word_byte()
199 }
200 NotWordBoundaryAscii => {
201 let (c1, c2) = (self.previous_char(at), self.next_char(at));
202 c1.is_word_byte() == c2.is_word_byte()
203 }
204 }
205 }
206
207 fn prefix_at(
208 &self,
209 prefixes: &LiteralSearcher,
210 at: InputAt,
211 ) -> Option<InputAt> {
212 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
213 }
214
215 fn len(&self) -> usize {
216 self.0.len()
217 }
218
219 fn as_bytes(&self) -> &[u8] {
220 self.0
221 }
222}
223
224/// An input reader over bytes.
225#[derive(Clone, Copy, Debug)]
226pub struct ByteInput<'t> {
227 text: &'t [u8],
228 only_utf8: bool,
229}
230
231impl<'t> ByteInput<'t> {
232 /// Return a new byte-based input reader for the given string.
233 pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
234 ByteInput { text, only_utf8 }
235 }
236}
237
238impl<'t> ops::Deref for ByteInput<'t> {
239 type Target = [u8];
240
241 fn deref(&self) -> &[u8] {
242 self.text
243 }
244}
245
246impl<'t> Input for ByteInput<'t> {
247 fn at(&self, i: usize) -> InputAt {
248 if i >= self.len() {
249 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 }
250 } else {
251 InputAt {
252 pos: i,
253 c: None.into(),
254 byte: self.get(i).cloned(),
255 len: 1,
256 }
257 }
258 }
259
260 fn next_char(&self, at: InputAt) -> Char {
261 decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into()
262 }
263
264 fn previous_char(&self, at: InputAt) -> Char {
265 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into()
266 }
267
268 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
269 use crate::prog::EmptyLook::*;
270 match empty.look {
271 StartLine => {
272 let c = self.previous_char(at);
273 at.pos() == 0 || c == '\n'
274 }
275 EndLine => {
276 let c = self.next_char(at);
277 at.pos() == self.len() || c == '\n'
278 }
279 StartText => at.pos() == 0,
280 EndText => at.pos() == self.len(),
281 WordBoundary => {
282 let (c1, c2) = (self.previous_char(at), self.next_char(at));
283 c1.is_word_char() != c2.is_word_char()
284 }
285 NotWordBoundary => {
286 let (c1, c2) = (self.previous_char(at), self.next_char(at));
287 c1.is_word_char() == c2.is_word_char()
288 }
289 WordBoundaryAscii => {
290 let (c1, c2) = (self.previous_char(at), self.next_char(at));
291 if self.only_utf8 {
292 // If we must match UTF-8, then we can't match word
293 // boundaries at invalid UTF-8.
294 if c1.is_none() && !at.is_start() {
295 return false;
296 }
297 if c2.is_none() && !at.is_end() {
298 return false;
299 }
300 }
301 c1.is_word_byte() != c2.is_word_byte()
302 }
303 NotWordBoundaryAscii => {
304 let (c1, c2) = (self.previous_char(at), self.next_char(at));
305 if self.only_utf8 {
306 // If we must match UTF-8, then we can't match word
307 // boundaries at invalid UTF-8.
308 if c1.is_none() && !at.is_start() {
309 return false;
310 }
311 if c2.is_none() && !at.is_end() {
312 return false;
313 }
314 }
315 c1.is_word_byte() == c2.is_word_byte()
316 }
317 }
318 }
319
320 fn prefix_at(
321 &self,
322 prefixes: &LiteralSearcher,
323 at: InputAt,
324 ) -> Option<InputAt> {
325 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s))
326 }
327
328 fn len(&self) -> usize {
329 self.text.len()
330 }
331
332 fn as_bytes(&self) -> &[u8] {
333 self.text
334 }
335}
336
337/// An inline representation of `Option<char>`.
338///
339/// This eliminates the need to do case analysis on `Option<char>` to determine
340/// ordinality with other characters.
341///
342/// (The `Option<char>` is not related to encoding. Instead, it is used in the
343/// matching engines to represent the beginning and ending boundaries of the
344/// search text.)
345#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
346pub struct Char(u32);
347
348impl fmt::Debug for Char {
349 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
350 match char::from_u32(self.0) {
351 None => write!(f, "Empty"),
352 Some(c: char) => write!(f, "{:?}", c),
353 }
354 }
355}
356
357impl Char {
358 /// Returns true iff the character is absent.
359 #[inline]
360 pub fn is_none(self) -> bool {
361 self.0 == u32::MAX
362 }
363
364 /// Returns the length of the character's UTF-8 encoding.
365 ///
366 /// If the character is absent, then `1` is returned.
367 #[inline]
368 pub fn len_utf8(self) -> usize {
369 char::from_u32(self.0).map_or(1, |c| c.len_utf8())
370 }
371
372 /// Returns true iff the character is a word character.
373 ///
374 /// If the character is absent, then false is returned.
375 pub fn is_word_char(self) -> bool {
376 // is_word_character can panic if the Unicode data for \w isn't
377 // available. However, our compiler ensures that if a Unicode word
378 // boundary is used, then the data must also be available. If it isn't,
379 // then the compiler returns an error.
380 char::from_u32(self.0).map_or(false, regex_syntax::is_word_character)
381 }
382
383 /// Returns true iff the byte is a word byte.
384 ///
385 /// If the byte is absent, then false is returned.
386 pub fn is_word_byte(self) -> bool {
387 match char::from_u32(self.0) {
388 Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8),
389 None | Some(_) => false,
390 }
391 }
392}
393
394impl From<char> for Char {
395 fn from(c: char) -> Char {
396 Char(c as u32)
397 }
398}
399
400impl From<Option<char>> for Char {
401 fn from(c: Option<char>) -> Char {
402 c.map_or(default:Char(u32::MAX), |c: char| c.into())
403 }
404}
405
406impl PartialEq<char> for Char {
407 #[inline]
408 fn eq(&self, other: &char) -> bool {
409 self.0 == *other as u32
410 }
411}
412
413impl PartialEq<Char> for char {
414 #[inline]
415 fn eq(&self, other: &Char) -> bool {
416 *self as u32 == other.0
417 }
418}
419
420impl PartialOrd<char> for Char {
421 #[inline]
422 fn partial_cmp(&self, other: &char) -> Option<Ordering> {
423 self.0.partial_cmp(&(*other as u32))
424 }
425}
426
427impl PartialOrd<Char> for char {
428 #[inline]
429 fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
430 (*self as u32).partial_cmp(&other.0)
431 }
432}
433