1 | /*! |
2 | Type definitions for identifier types. |
3 | |
4 | A [`StateID`] represents the possible set of identifiers used in regex engine |
5 | implementations in this crate. For example, they are used to identify both NFA |
6 | and DFA states. |
7 | |
8 | A [`PatternID`] represents the possible set of identifiers for patterns. All |
9 | regex engine implementations in this crate support searching for multiple |
10 | patterns simultaneously. A `PatternID` is how each pattern is uniquely |
11 | identified for a particular instance of a regex engine. Namely, a pattern is |
12 | assigned an auto-incrementing integer, starting at `0`, based on the order of |
13 | patterns supplied during the construction of the regex engine. |
14 | |
15 | These identifier types represent a way for this crate to make correctness |
16 | guarantees around the possible set of values that a `StateID` or a `PatternID` |
17 | might represent. Similarly, they also provide a way of constraining the size of |
18 | these identifiers to reduce space usage while still guaranteeing that all such |
19 | identifiers are repsentable by a `usize` for the current target. |
20 | |
21 | Moreover, the identifier types clamp the range of permissible values to a range |
22 | that is typically smaller than its internal representation. (With the maximum |
23 | value being, e.g., `StateID::MAX`.) Users of these types may not rely this |
24 | clamping for the purpose of memory safety. Users may, however, rely on these |
25 | invariants to avoid panics or other types of logic bugs. |
26 | */ |
27 | |
28 | // Continuing from the above comment about correctness guarantees, an example |
29 | // of a way in which we use the guarantees on these types is delta encoding. |
30 | // Namely, we require that IDs can be at most 2^31 - 2, which means the |
31 | // difference between any two IDs is always representable as an i32. |
32 | |
33 | use core::{ |
34 | convert::{Infallible, TryFrom}, |
35 | mem, ops, |
36 | }; |
37 | |
38 | #[cfg (feature = "alloc" )] |
39 | use alloc::vec::Vec; |
40 | |
41 | /// An identifier for a regex pattern. |
42 | /// |
43 | /// The identifier for a pattern corresponds to its relative position among |
44 | /// other patterns in a single finite state machine. Namely, when building |
45 | /// a multi-pattern regex engine, one must supply a sequence of patterns to |
46 | /// match. The position (starting at 0) of each pattern in that sequence |
47 | /// represents its identifier. This identifier is in turn used to identify and |
48 | /// report matches of that pattern in various APIs. |
49 | /// |
50 | /// A pattern ID is guaranteed to be representable by a `usize`. Similarly, |
51 | /// the number of patterns in any regex engine in this crate is guaranteed to |
52 | /// be representable by a `usize`. This applies to regex engines that have |
53 | /// been deserialized; a deserialization error will be returned if it contains |
54 | /// pattern IDs that violate these requirements in your current environment. |
55 | /// |
56 | /// For extra convenience in some cases, this type also guarantees that all |
57 | /// IDs can fit into an `i32` and an `isize` without overflowing. |
58 | /// |
59 | /// # Representation |
60 | /// |
61 | /// This type is always represented internally by a `u32` and is marked as |
62 | /// `repr(transparent)`. Thus, this type always has the same representation as |
63 | /// a `u32`. |
64 | /// |
65 | /// # Indexing |
66 | /// |
67 | /// For convenience, callers may use a `PatternID` to index slices. |
68 | /// |
69 | /// # Safety |
70 | /// |
71 | /// While a `PatternID` is meant to guarantee that its value fits into `usize` |
72 | /// (while using a possibly smaller representation than `usize` on some |
73 | /// targets), callers must not rely on this property for safety. Callers may |
74 | /// choose to rely on this property for correctness however. |
75 | #[repr (transparent)] |
76 | #[derive ( |
77 | Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, |
78 | )] |
79 | pub struct PatternID(u32); |
80 | |
81 | impl PatternID { |
82 | /// The maximum pattern ID value, represented as a `usize`. |
83 | #[cfg (any(target_pointer_width = "32" , target_pointer_width = "64" ))] |
84 | pub const MAX: PatternID = |
85 | PatternID::new_unchecked(core::i32::MAX as usize - 1); |
86 | |
87 | /// The maximum pattern ID value, represented as a `usize`. |
88 | #[cfg (target_pointer_width = "16" )] |
89 | pub const MAX: PatternID = PatternID::new_unchecked(core::isize::MAX - 1); |
90 | |
91 | /// The total number of patterns that are allowed in any single regex |
92 | /// engine. |
93 | pub const LIMIT: usize = PatternID::MAX.as_usize() + 1; |
94 | |
95 | /// The zero pattern ID value. |
96 | pub const ZERO: PatternID = PatternID::new_unchecked(0); |
97 | |
98 | /// The number of bytes that a single `PatternID` uses in memory. |
99 | pub const SIZE: usize = core::mem::size_of::<PatternID>(); |
100 | |
101 | /// Create a new pattern ID. |
102 | /// |
103 | /// If the given identifier exceeds [`PatternID::MAX`], then this returns |
104 | /// an error. |
105 | #[inline ] |
106 | pub fn new(id: usize) -> Result<PatternID, PatternIDError> { |
107 | PatternID::try_from(id) |
108 | } |
109 | |
110 | /// Create a new pattern ID without checking whether the given value |
111 | /// exceeds [`PatternID::MAX`]. |
112 | /// |
113 | /// While this is unchecked, providing an incorrect value must never |
114 | /// sacrifice memory safety, as documented above. |
115 | #[inline ] |
116 | pub const fn new_unchecked(id: usize) -> PatternID { |
117 | PatternID(id as u32) |
118 | } |
119 | |
120 | /// Like [`PatternID::new`], but panics if the given ID is not valid. |
121 | #[inline ] |
122 | pub fn must(id: usize) -> PatternID { |
123 | PatternID::new(id).unwrap() |
124 | } |
125 | |
126 | /// Return this pattern ID as a `usize`. |
127 | #[inline ] |
128 | pub const fn as_usize(&self) -> usize { |
129 | self.0 as usize |
130 | } |
131 | |
132 | /// Return the internal u32 of this pattern ID. |
133 | #[inline ] |
134 | pub const fn as_u32(&self) -> u32 { |
135 | self.0 |
136 | } |
137 | |
138 | /// Return the internal u32 of this pattern ID represented as an i32. |
139 | /// |
140 | /// This is guaranteed to never overflow an `i32`. |
141 | #[inline ] |
142 | pub const fn as_i32(&self) -> i32 { |
143 | self.0 as i32 |
144 | } |
145 | |
146 | /// Returns one more than this pattern ID as a usize. |
147 | /// |
148 | /// Since a pattern ID has constraints on its maximum value, adding `1` to |
149 | /// it will always fit in a `usize` (and a `u32`). |
150 | #[inline ] |
151 | pub fn one_more(&self) -> usize { |
152 | self.as_usize().checked_add(1).unwrap() |
153 | } |
154 | |
155 | /// Decode this pattern ID from the bytes given using the native endian |
156 | /// byte order for the current target. |
157 | /// |
158 | /// If the decoded integer is not representable as a pattern ID for the |
159 | /// current target, then this returns an error. |
160 | #[inline ] |
161 | pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> { |
162 | let id = u32::from_ne_bytes(bytes); |
163 | if id > PatternID::MAX.as_u32() { |
164 | return Err(PatternIDError { attempted: id as u64 }); |
165 | } |
166 | Ok(PatternID::new_unchecked(id as usize)) |
167 | } |
168 | |
169 | /// Decode this pattern ID from the bytes given using the native endian |
170 | /// byte order for the current target. |
171 | /// |
172 | /// This is analogous to [`PatternID::new_unchecked`] in that is does not |
173 | /// check whether the decoded integer is representable as a pattern ID. |
174 | #[inline ] |
175 | pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID { |
176 | PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize) |
177 | } |
178 | |
179 | /// Return the underlying pattern ID integer as raw bytes in native endian |
180 | /// format. |
181 | #[inline ] |
182 | pub fn to_ne_bytes(&self) -> [u8; 4] { |
183 | self.0.to_ne_bytes() |
184 | } |
185 | |
186 | /// Returns an iterator over all pattern IDs from 0 up to and not including |
187 | /// the given length. |
188 | /// |
189 | /// If the given length exceeds [`PatternID::LIMIT`], then this panics. |
190 | #[cfg (feature = "alloc" )] |
191 | pub(crate) fn iter(len: usize) -> PatternIDIter { |
192 | PatternIDIter::new(len) |
193 | } |
194 | } |
195 | |
196 | /// This error occurs when a pattern ID could not be constructed. |
197 | /// |
198 | /// This occurs when given an integer exceeding the maximum pattern ID value. |
199 | /// |
200 | /// When the `std` feature is enabled, this implements the `Error` trait. |
201 | #[derive (Clone, Debug, Eq, PartialEq)] |
202 | pub struct PatternIDError { |
203 | attempted: u64, |
204 | } |
205 | |
206 | impl PatternIDError { |
207 | /// Returns the value that failed to constructed a pattern ID. |
208 | pub fn attempted(&self) -> u64 { |
209 | self.attempted |
210 | } |
211 | } |
212 | |
213 | #[cfg (feature = "std" )] |
214 | impl std::error::Error for PatternIDError {} |
215 | |
216 | impl core::fmt::Display for PatternIDError { |
217 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
218 | write!( |
219 | f, |
220 | "failed to create PatternID from {:?}, which exceeds {:?}" , |
221 | self.attempted(), |
222 | PatternID::MAX, |
223 | ) |
224 | } |
225 | } |
226 | |
227 | /// An identifier for a state in a regex engine. |
228 | /// |
229 | /// A state ID is guaranteed to be representable by a `usize`. Similarly, the |
230 | /// number of states in any regex engine in this crate is guaranteed to be |
231 | /// representable by a `usize`. This applies to regex engines that have been |
232 | /// deserialized; a deserialization error will be returned if it contains state |
233 | /// IDs that violate these requirements in your current environment. |
234 | /// |
235 | /// For extra convenience in some cases, this type also guarantees that all |
236 | /// IDs can fit into an `i32` and an `isize` without overflowing. |
237 | /// |
238 | /// # Representation |
239 | /// |
240 | /// This type is always represented internally by a `u32` and is marked as |
241 | /// `repr(transparent)`. Thus, this type always has the same representation as |
242 | /// a `u32`. |
243 | /// |
244 | /// # Indexing |
245 | /// |
246 | /// For convenience, callers may use a `StateID` to index slices. |
247 | /// |
248 | /// # Safety |
249 | /// |
250 | /// While a `StateID` is meant to guarantee that its value fits into `usize` |
251 | /// (while using a possibly smaller representation than `usize` on some |
252 | /// targets), callers must not rely on this property for safety. Callers may |
253 | /// choose to rely on this property for correctness however. |
254 | #[repr (transparent)] |
255 | #[derive ( |
256 | Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, |
257 | )] |
258 | pub struct StateID(u32); |
259 | |
260 | impl StateID { |
261 | /// The maximum state ID value. |
262 | #[cfg (any(target_pointer_width = "32" , target_pointer_width = "64" ))] |
263 | pub const MAX: StateID = |
264 | StateID::new_unchecked(core::i32::MAX as usize - 1); |
265 | |
266 | /// The maximum state ID value. |
267 | #[cfg (target_pointer_width = "16" )] |
268 | pub const MAX: StateID = StateID::new_unchecked(core::isize::MAX - 1); |
269 | |
270 | /// The total number of states that are allowed in any single regex |
271 | /// engine, represented as a `usize`. |
272 | pub const LIMIT: usize = StateID::MAX.as_usize() + 1; |
273 | |
274 | /// The zero state ID value. |
275 | pub const ZERO: StateID = StateID::new_unchecked(0); |
276 | |
277 | /// The number of bytes that a single `StateID` uses in memory. |
278 | pub const SIZE: usize = core::mem::size_of::<StateID>(); |
279 | |
280 | /// Create a new state ID. |
281 | /// |
282 | /// If the given identifier exceeds [`StateID::MAX`], then this returns |
283 | /// an error. |
284 | #[inline ] |
285 | pub fn new(id: usize) -> Result<StateID, StateIDError> { |
286 | StateID::try_from(id) |
287 | } |
288 | |
289 | /// Create a new state ID without checking whether the given value |
290 | /// exceeds [`StateID::MAX`]. |
291 | /// |
292 | /// While this is unchecked, providing an incorrect value must never |
293 | /// sacrifice memory safety, as documented above. |
294 | #[inline ] |
295 | pub const fn new_unchecked(id: usize) -> StateID { |
296 | StateID(id as u32) |
297 | } |
298 | |
299 | /// Like [`StateID::new`], but panics if the given ID is not valid. |
300 | #[inline ] |
301 | pub fn must(id: usize) -> StateID { |
302 | StateID::new(id).unwrap() |
303 | } |
304 | |
305 | /// Return this state ID as a `usize`. |
306 | #[inline ] |
307 | pub const fn as_usize(&self) -> usize { |
308 | self.0 as usize |
309 | } |
310 | |
311 | /// Return the internal u32 of this state ID. |
312 | #[inline ] |
313 | pub const fn as_u32(&self) -> u32 { |
314 | self.0 |
315 | } |
316 | |
317 | /// Return the internal u32 of this pattern ID represented as an i32. |
318 | /// |
319 | /// This is guaranteed to never overflow an `i32`. |
320 | #[inline ] |
321 | pub const fn as_i32(&self) -> i32 { |
322 | self.0 as i32 |
323 | } |
324 | |
325 | /// Returns one more than this state ID as a usize. |
326 | /// |
327 | /// Since a state ID has constraints on its maximum value, adding `1` to |
328 | /// it will always fit in a `usize` (and a `u32`). |
329 | #[inline ] |
330 | pub fn one_more(&self) -> usize { |
331 | self.as_usize().checked_add(1).unwrap() |
332 | } |
333 | |
334 | /// Decode this state ID from the bytes given using the native endian byte |
335 | /// order for the current target. |
336 | /// |
337 | /// If the decoded integer is not representable as a state ID for the |
338 | /// current target, then this returns an error. |
339 | #[inline ] |
340 | pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> { |
341 | let id = u32::from_ne_bytes(bytes); |
342 | if id > StateID::MAX.as_u32() { |
343 | return Err(StateIDError { attempted: id as u64 }); |
344 | } |
345 | Ok(StateID::new_unchecked(id as usize)) |
346 | } |
347 | |
348 | /// Decode this state ID from the bytes given using the native endian |
349 | /// byte order for the current target. |
350 | /// |
351 | /// This is analogous to [`StateID::new_unchecked`] in that is does not |
352 | /// check whether the decoded integer is representable as a state ID. |
353 | #[inline ] |
354 | pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID { |
355 | StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize) |
356 | } |
357 | |
358 | /// Return the underlying state ID integer as raw bytes in native endian |
359 | /// format. |
360 | #[inline ] |
361 | pub fn to_ne_bytes(&self) -> [u8; 4] { |
362 | self.0.to_ne_bytes() |
363 | } |
364 | |
365 | /// Returns an iterator over all state IDs from 0 up to and not including |
366 | /// the given length. |
367 | /// |
368 | /// If the given length exceeds [`StateID::LIMIT`], then this panics. |
369 | #[cfg (feature = "alloc" )] |
370 | pub(crate) fn iter(len: usize) -> StateIDIter { |
371 | StateIDIter::new(len) |
372 | } |
373 | } |
374 | |
375 | /// This error occurs when a state ID could not be constructed. |
376 | /// |
377 | /// This occurs when given an integer exceeding the maximum state ID value. |
378 | /// |
379 | /// When the `std` feature is enabled, this implements the `Error` trait. |
380 | #[derive (Clone, Debug, Eq, PartialEq)] |
381 | pub struct StateIDError { |
382 | attempted: u64, |
383 | } |
384 | |
385 | impl StateIDError { |
386 | /// Returns the value that failed to constructed a state ID. |
387 | pub fn attempted(&self) -> u64 { |
388 | self.attempted |
389 | } |
390 | } |
391 | |
392 | #[cfg (feature = "std" )] |
393 | impl std::error::Error for StateIDError {} |
394 | |
395 | impl core::fmt::Display for StateIDError { |
396 | fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { |
397 | write!( |
398 | f, |
399 | "failed to create StateID from {:?}, which exceeds {:?}" , |
400 | self.attempted(), |
401 | StateID::MAX, |
402 | ) |
403 | } |
404 | } |
405 | |
406 | /// A macro for defining exactly identical (modulo names) impls for ID types. |
407 | macro_rules! impls { |
408 | ($ty:ident, $tyerr:ident, $tyiter:ident) => { |
409 | #[derive(Clone, Debug)] |
410 | pub(crate) struct $tyiter { |
411 | rng: ops::Range<usize>, |
412 | } |
413 | |
414 | impl $tyiter { |
415 | #[cfg(feature = "alloc" )] |
416 | fn new(len: usize) -> $tyiter { |
417 | assert!( |
418 | len <= $ty::LIMIT, |
419 | "cannot create iterator with IDs when number of \ |
420 | elements exceed {:?}" , |
421 | $ty::LIMIT, |
422 | ); |
423 | $tyiter { rng: 0..len } |
424 | } |
425 | } |
426 | |
427 | impl Iterator for $tyiter { |
428 | type Item = $ty; |
429 | |
430 | fn next(&mut self) -> Option<$ty> { |
431 | if self.rng.start >= self.rng.end { |
432 | return None; |
433 | } |
434 | let next_id = self.rng.start + 1; |
435 | let id = mem::replace(&mut self.rng.start, next_id); |
436 | // new_unchecked is OK since we asserted that the number of |
437 | // elements in this iterator will fit in an ID at construction. |
438 | Some($ty::new_unchecked(id)) |
439 | } |
440 | } |
441 | |
442 | impl<T> core::ops::Index<$ty> for [T] { |
443 | type Output = T; |
444 | |
445 | #[inline] |
446 | fn index(&self, index: $ty) -> &T { |
447 | &self[index.as_usize()] |
448 | } |
449 | } |
450 | |
451 | impl<T> core::ops::IndexMut<$ty> for [T] { |
452 | #[inline] |
453 | fn index_mut(&mut self, index: $ty) -> &mut T { |
454 | &mut self[index.as_usize()] |
455 | } |
456 | } |
457 | |
458 | #[cfg(feature = "alloc" )] |
459 | impl<T> core::ops::Index<$ty> for Vec<T> { |
460 | type Output = T; |
461 | |
462 | #[inline] |
463 | fn index(&self, index: $ty) -> &T { |
464 | &self[index.as_usize()] |
465 | } |
466 | } |
467 | |
468 | #[cfg(feature = "alloc" )] |
469 | impl<T> core::ops::IndexMut<$ty> for Vec<T> { |
470 | #[inline] |
471 | fn index_mut(&mut self, index: $ty) -> &mut T { |
472 | &mut self[index.as_usize()] |
473 | } |
474 | } |
475 | |
476 | impl TryFrom<usize> for $ty { |
477 | type Error = $tyerr; |
478 | |
479 | fn try_from(id: usize) -> Result<$ty, $tyerr> { |
480 | if id > $ty::MAX.as_usize() { |
481 | return Err($tyerr { attempted: id as u64 }); |
482 | } |
483 | Ok($ty::new_unchecked(id)) |
484 | } |
485 | } |
486 | |
487 | impl TryFrom<u8> for $ty { |
488 | type Error = Infallible; |
489 | |
490 | fn try_from(id: u8) -> Result<$ty, Infallible> { |
491 | Ok($ty::new_unchecked(id as usize)) |
492 | } |
493 | } |
494 | |
495 | impl TryFrom<u16> for $ty { |
496 | type Error = $tyerr; |
497 | |
498 | fn try_from(id: u16) -> Result<$ty, $tyerr> { |
499 | if id as u32 > $ty::MAX.as_u32() { |
500 | return Err($tyerr { attempted: id as u64 }); |
501 | } |
502 | Ok($ty::new_unchecked(id as usize)) |
503 | } |
504 | } |
505 | |
506 | impl TryFrom<u32> for $ty { |
507 | type Error = $tyerr; |
508 | |
509 | fn try_from(id: u32) -> Result<$ty, $tyerr> { |
510 | if id > $ty::MAX.as_u32() { |
511 | return Err($tyerr { attempted: id as u64 }); |
512 | } |
513 | Ok($ty::new_unchecked(id as usize)) |
514 | } |
515 | } |
516 | |
517 | impl TryFrom<u64> for $ty { |
518 | type Error = $tyerr; |
519 | |
520 | fn try_from(id: u64) -> Result<$ty, $tyerr> { |
521 | if id > $ty::MAX.as_u32() as u64 { |
522 | return Err($tyerr { attempted: id }); |
523 | } |
524 | Ok($ty::new_unchecked(id as usize)) |
525 | } |
526 | } |
527 | |
528 | #[cfg(test)] |
529 | impl quickcheck::Arbitrary for $ty { |
530 | fn arbitrary(gen: &mut quickcheck::Gen) -> $ty { |
531 | use core::cmp::max; |
532 | |
533 | let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs(); |
534 | if id > $ty::MAX.as_i32() { |
535 | $ty::MAX |
536 | } else { |
537 | $ty::new(usize::try_from(id).unwrap()).unwrap() |
538 | } |
539 | } |
540 | } |
541 | }; |
542 | } |
543 | |
544 | impls!(PatternID, PatternIDError, PatternIDIter); |
545 | impls!(StateID, StateIDError, StateIDIter); |
546 | |
547 | /// A utility trait that defines a couple of adapters for making it convenient |
548 | /// to access indices as ID types. We require ExactSizeIterator so that |
549 | /// iterator construction can do a single check to make sure the index of each |
550 | /// element is representable by its ID type. |
551 | #[cfg (feature = "alloc" )] |
552 | pub(crate) trait IteratorIDExt: Iterator { |
553 | fn with_pattern_ids(self) -> WithPatternIDIter<Self> |
554 | where |
555 | Self: Sized + ExactSizeIterator, |
556 | { |
557 | WithPatternIDIter::new(self) |
558 | } |
559 | |
560 | fn with_state_ids(self) -> WithStateIDIter<Self> |
561 | where |
562 | Self: Sized + ExactSizeIterator, |
563 | { |
564 | WithStateIDIter::new(self) |
565 | } |
566 | } |
567 | |
568 | #[cfg (feature = "alloc" )] |
569 | impl<I: Iterator> IteratorIDExt for I {} |
570 | |
571 | #[cfg (feature = "alloc" )] |
572 | macro_rules! iditer { |
573 | ($ty:ident, $iterty:ident, $withiterty:ident) => { |
574 | /// An iterator adapter that is like std::iter::Enumerate, but attaches |
575 | /// IDs. It requires ExactSizeIterator. At construction, it ensures |
576 | /// that the index of each element in the iterator is representable in |
577 | /// the corresponding ID type. |
578 | #[derive(Clone, Debug)] |
579 | pub(crate) struct $withiterty<I> { |
580 | it: I, |
581 | ids: $iterty, |
582 | } |
583 | |
584 | impl<I: Iterator + ExactSizeIterator> $withiterty<I> { |
585 | fn new(it: I) -> $withiterty<I> { |
586 | let ids = $ty::iter(it.len()); |
587 | $withiterty { it, ids } |
588 | } |
589 | } |
590 | |
591 | impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> { |
592 | type Item = ($ty, I::Item); |
593 | |
594 | fn next(&mut self) -> Option<($ty, I::Item)> { |
595 | let item = self.it.next()?; |
596 | // Number of elements in this iterator must match, according |
597 | // to contract of ExactSizeIterator. |
598 | let id = self.ids.next().unwrap(); |
599 | Some((id, item)) |
600 | } |
601 | } |
602 | }; |
603 | } |
604 | |
605 | #[cfg (feature = "alloc" )] |
606 | iditer!(PatternID, PatternIDIter, WithPatternIDIter); |
607 | #[cfg (feature = "alloc" )] |
608 | iditer!(StateID, StateIDIter, WithStateIDIter); |
609 | |