input.rs source code [crates/regex-1.8.4/src/input.rs]

1	use std::char;
2	use std::cmp::Ordering;
3	use std::fmt;
4	use std::ops;
5	use std::u32;
6
7	use crate::literal::LiteralSearcher;
8	use crate::prog::InstEmptyLook;
9	use crate::utf8::{decode_last_utf8, decode_utf8};
10
11	/// Represents a location in the input.
12	#[derive(Clone, Copy, Debug)]
13	pub struct InputAt {
14	pos: usize,
15	c: Char,
16	byte: Option<u8>,
17	len: usize,
18	}
19
20	impl InputAt {
21	/// Returns true iff this position is at the beginning of the input.
22	pub fn is_start(&self) -> bool {
23	self.pos == `0`
24	}
25
26	/// Returns true iff this position is past the end of the input.
27	pub fn is_end(&self) -> bool {
28	self.c.is_none() && self.byte.is_none()
29	}
30
31	/// Returns the character at this position.
32	///
33	/// If this position is just before or after the input, then an absent
34	/// character is returned.
35	pub fn char(&self) -> Char {
36	self.c
37	}
38
39	/// Returns the byte at this position.
40	pub fn byte(&self) -> Option<u8> {
41	self.byte
42	}
43
44	/// Returns the UTF-8 width of the character at this position.
45	pub fn len(&self) -> usize {
46	self.len
47	}
48
49	/// Returns whether the UTF-8 width of the character at this position
50	/// is zero.
51	pub fn is_empty(&self) -> bool {
52	self.len == `0`
53	}
54
55	/// Returns the byte offset of this position.
56	pub fn pos(&self) -> usize {
57	self.pos
58	}
59
60	/// Returns the byte offset of the next position in the input.
61	pub fn next_pos(&self) -> usize {
62	self.pos + self.len
63	}
64	}
65
66	/// An abstraction over input used in the matching engines.
67	pub trait Input: fmt::Debug {
68	/// Return an encoding of the position at byte offset `i`.
69	fn at(&self, i: usize) -> InputAt;
70
71	/// Return the Unicode character occurring next to `at`.
72	///
73	/// If no such character could be decoded, then `Char` is absent.
74	fn next_char(&self, at: InputAt) -> Char;
75
76	/// Return the Unicode character occurring previous to `at`.
77	///
78	/// If no such character could be decoded, then `Char` is absent.
79	fn previous_char(&self, at: InputAt) -> Char;
80
81	/// Return true if the given empty width instruction matches at the
82	/// input position given.
83	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool;
84
85	/// Scan the input for a matching prefix.
86	fn prefix_at(
87	&self,
88	prefixes: &LiteralSearcher,
89	at: InputAt,
90	) -> Option<InputAt>;
91
92	/// The number of bytes in the input.
93	fn len(&self) -> usize;
94
95	/// Whether the input is empty.
96	fn is_empty(&self) -> bool {
97	self.len() == `0`
98	}
99
100	/// Return the given input as a sequence of bytes.
101	fn as_bytes(&self) -> &[u8];
102	}
103
104	impl<'a, T: Input> Input for &'a T {
105	fn at(&self, i: usize) -> InputAt {
106	(**self).at(i)
107	}
108
109	fn next_char(&self, at: InputAt) -> Char {
110	(**self).next_char(at)
111	}
112
113	fn previous_char(&self, at: InputAt) -> Char {
114	(**self).previous_char(at)
115	}
116
117	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
118	(**self).is_empty_match(at, empty)
119	}
120
121	fn prefix_at(
122	&self,
123	prefixes: &LiteralSearcher,
124	at: InputAt,
125	) -> Option<InputAt> {
126	(**self).prefix_at(prefixes, at)
127	}
128
129	fn len(&self) -> usize {
130	(**self).len()
131	}
132
133	fn as_bytes(&self) -> &[u8] {
134	(**self).as_bytes()
135	}
136	}
137
138	/// An input reader over characters.
139	#[derive(Clone, Copy, Debug)]
140	pub struct CharInput<'t>(&'t [u8]);
141
142	impl<'t> CharInput<'t> {
143	/// Return a new character input reader for the given string.
144	pub fn new(s: &'t [u8]) -> CharInput<'t> {
145	CharInput(s)
146	}
147	}
148
149	impl<'t> ops::Deref for CharInput<'t> {
150	type Target = [u8];
151
152	fn deref(&self) -> &[u8] {
153	self.0
154	}
155	}
156
157	impl<'t> Input for CharInput<'t> {
158	fn at(&self, i: usize) -> InputAt {
159	if i >= self.len() {
160	InputAt { pos: self.len(), c: None.into(), byte: None, len: `0` }
161	} else {
162	let c = decode_utf8(&self[i..]).map(\|(c, _)\| c).into();
163	InputAt { pos: i, c, byte: None, len: c.len_utf8() }
164	}
165	}
166
167	fn next_char(&self, at: InputAt) -> Char {
168	at.char()
169	}
170
171	fn previous_char(&self, at: InputAt) -> Char {
172	decode_last_utf8(&self[..at.pos()]).map(\|(c, _)\| c).into()
173	}
174
175	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
176	use crate::prog::EmptyLook::*;
177	match empty.look {
178	StartLine => {
179	let c = self.previous_char(at);
180	at.pos() == `0` \|\| c == '`\n`'
181	}
182	EndLine => {
183	let c = self.next_char(at);
184	at.pos() == self.len() \|\| c == '`\n`'
185	}
186	StartText => at.pos() == `0`,
187	EndText => at.pos() == self.len(),
188	WordBoundary => {
189	let (c1, c2) = (self.previous_char(at), self.next_char(at));
190	c1.is_word_char() != c2.is_word_char()
191	}
192	NotWordBoundary => {
193	let (c1, c2) = (self.previous_char(at), self.next_char(at));
194	c1.is_word_char() == c2.is_word_char()
195	}
196	WordBoundaryAscii => {
197	let (c1, c2) = (self.previous_char(at), self.next_char(at));
198	c1.is_word_byte() != c2.is_word_byte()
199	}
200	NotWordBoundaryAscii => {
201	let (c1, c2) = (self.previous_char(at), self.next_char(at));
202	c1.is_word_byte() == c2.is_word_byte()
203	}
204	}
205	}
206
207	fn prefix_at(
208	&self,
209	prefixes: &LiteralSearcher,
210	at: InputAt,
211	) -> Option<InputAt> {
212	prefixes.find(&self[at.pos()..]).map(\|(s, _)\| self.at(at.pos() + s))
213	}
214
215	fn len(&self) -> usize {
216	self.0.len()
217	}
218
219	fn as_bytes(&self) -> &[u8] {
220	self.0
221	}
222	}
223
224	/// An input reader over bytes.
225	#[derive(Clone, Copy, Debug)]
226	pub struct ByteInput<'t> {
227	text: &'t [u8],
228	only_utf8: bool,
229	}
230
231	impl<'t> ByteInput<'t> {
232	/// Return a new byte-based input reader for the given string.
233	pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> {
234	ByteInput { text, only_utf8 }
235	}
236	}
237
238	impl<'t> ops::Deref for ByteInput<'t> {
239	type Target = [u8];
240
241	fn deref(&self) -> &[u8] {
242	self.text
243	}
244	}
245
246	impl<'t> Input for ByteInput<'t> {
247	fn at(&self, i: usize) -> InputAt {
248	if i >= self.len() {
249	InputAt { pos: self.len(), c: None.into(), byte: None, len: `0` }
250	} else {
251	InputAt {
252	pos: i,
253	c: None.into(),
254	byte: self.get(i).cloned(),
255	len: `1`,
256	}
257	}
258	}
259
260	fn next_char(&self, at: InputAt) -> Char {
261	decode_utf8(&self[at.pos()..]).map(\|(c, _)\| c).into()
262	}
263
264	fn previous_char(&self, at: InputAt) -> Char {
265	decode_last_utf8(&self[..at.pos()]).map(\|(c, _)\| c).into()
266	}
267
268	fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool {
269	use crate::prog::EmptyLook::*;
270	match empty.look {
271	StartLine => {
272	let c = self.previous_char(at);
273	at.pos() == `0` \|\| c == '`\n`'
274	}
275	EndLine => {
276	let c = self.next_char(at);
277	at.pos() == self.len() \|\| c == '`\n`'
278	}
279	StartText => at.pos() == `0`,
280	EndText => at.pos() == self.len(),
281	WordBoundary => {
282	let (c1, c2) = (self.previous_char(at), self.next_char(at));
283	c1.is_word_char() != c2.is_word_char()
284	}
285	NotWordBoundary => {
286	let (c1, c2) = (self.previous_char(at), self.next_char(at));
287	c1.is_word_char() == c2.is_word_char()
288	}
289	WordBoundaryAscii => {
290	let (c1, c2) = (self.previous_char(at), self.next_char(at));
291	if self.only_utf8 {
292	// If we must match UTF-8, then we can't match word
293	// boundaries at invalid UTF-8.
294	if c1.is_none() && !at.is_start() {
295	return `false`;
296	}
297	if c2.is_none() && !at.is_end() {
298	return `false`;
299	}
300	}
301	c1.is_word_byte() != c2.is_word_byte()
302	}
303	NotWordBoundaryAscii => {
304	let (c1, c2) = (self.previous_char(at), self.next_char(at));
305	if self.only_utf8 {
306	// If we must match UTF-8, then we can't match word
307	// boundaries at invalid UTF-8.
308	if c1.is_none() && !at.is_start() {
309	return `false`;
310	}
311	if c2.is_none() && !at.is_end() {
312	return `false`;
313	}
314	}
315	c1.is_word_byte() == c2.is_word_byte()
316	}
317	}
318	}
319
320	fn prefix_at(
321	&self,
322	prefixes: &LiteralSearcher,
323	at: InputAt,
324	) -> Option<InputAt> {
325	prefixes.find(&self[at.pos()..]).map(\|(s, _)\| self.at(at.pos() + s))
326	}
327
328	fn len(&self) -> usize {
329	self.text.len()
330	}
331
332	fn as_bytes(&self) -> &[u8] {
333	self.text
334	}
335	}
336
337	/// An inline representation of `Option<char>`.
338	///
339	/// This eliminates the need to do case analysis on `Option<char>` to determine
340	/// ordinality with other characters.
341	///
342	/// (The `Option<char>` is not related to encoding. Instead, it is used in the
343	/// matching engines to represent the beginning and ending boundaries of the
344	/// search text.)
345	#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
346	pub struct Char(u32);
347
348	impl fmt::Debug for Char {
349	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
350	match char::from_u32(self.0) {
351	None => write!(f, "Empty"),
352	Some(c: char) => write!(f, "{:?}", c),
353	}
354	}
355	}
356
357	impl Char {
358	/// Returns true iff the character is absent.
359	#[inline]
360	pub fn is_none(self) -> bool {
361	self.0 == u32::MAX
362	}
363
364	/// Returns the length of the character's UTF-8 encoding.
365	///
366	/// If the character is absent, then `1` is returned.
367	#[inline]
368	pub fn len_utf8(self) -> usize {
369	char::from_u32(self.0).map_or(`1`, \|c\| c.len_utf8())
370	}
371
372	/// Returns true iff the character is a word character.
373	///
374	/// If the character is absent, then false is returned.
375	pub fn is_word_char(self) -> bool {
376	// is_word_character can panic if the Unicode data for \w isn't
377	// available. However, our compiler ensures that if a Unicode word
378	// boundary is used, then the data must also be available. If it isn't,
379	// then the compiler returns an error.
380	char::from_u32(self.0).map_or(`false`, regex_syntax::is_word_character)
381	}
382
383	/// Returns true iff the byte is a word byte.
384	///
385	/// If the byte is absent, then false is returned.
386	pub fn is_word_byte(self) -> bool {
387	match char::from_u32(self.0) {
388	Some(c) if c <= '`\u{7F}`' => regex_syntax::is_word_byte(c as u8),
389	None \| Some(_) => `false`,
390	}
391	}
392	}
393
394	impl From<char> for Char {
395	fn from(c: char) -> Char {
396	Char(c as u32)
397	}
398	}
399
400	impl From<Option<char>> for Char {
401	fn from(c: Option<char>) -> Char {
402	c.map_or(default:Char(u32::MAX), \|c: char\| c.into())
403	}
404	}
405
406	impl PartialEq<char> for Char {
407	#[inline]
408	fn eq(&self, other: &char) -> bool {
409	self.0 == other as u32*
410	}
411	}
412
413	impl PartialEq<Char> for char {
414	#[inline]
415	fn eq(&self, other: &Char) -> bool {
416	self as u32* == other.0
417	}
418	}
419
420	impl PartialOrd<char> for Char {
421	#[inline]
422	fn partial_cmp(&self, other: &char) -> Option<Ordering> {
423	self.0.partial_cmp(&(other as u32*))
424	}
425	}
426
427	impl PartialOrd<Char> for char {
428	#[inline]
429	fn partial_cmp(&self, other: &Char) -> Option<Ordering> {
430	(self as u32*).partial_cmp(&other.0)
431	}
432	}
433