1/*!
2Type definitions for identifier types.
3
4A [`StateID`] represents the possible set of identifiers used in regex engine
5implementations in this crate. For example, they are used to identify both NFA
6and DFA states.
7
8A [`PatternID`] represents the possible set of identifiers for patterns. All
9regex engine implementations in this crate support searching for multiple
10patterns simultaneously. A `PatternID` is how each pattern is uniquely
11identified for a particular instance of a regex engine. Namely, a pattern is
12assigned an auto-incrementing integer, starting at `0`, based on the order of
13patterns supplied during the construction of the regex engine.
14
15These identifier types represent a way for this crate to make correctness
16guarantees around the possible set of values that a `StateID` or a `PatternID`
17might represent. Similarly, they also provide a way of constraining the size of
18these identifiers to reduce space usage while still guaranteeing that all such
19identifiers are repsentable by a `usize` for the current target.
20
21Moreover, the identifier types clamp the range of permissible values to a range
22that is typically smaller than its internal representation. (With the maximum
23value being, e.g., `StateID::MAX`.) Users of these types may not rely this
24clamping for the purpose of memory safety. Users may, however, rely on these
25invariants to avoid panics or other types of logic bugs.
26*/
27
28// Continuing from the above comment about correctness guarantees, an example
29// of a way in which we use the guarantees on these types is delta encoding.
30// Namely, we require that IDs can be at most 2^31 - 2, which means the
31// difference between any two IDs is always representable as an i32.
32
33use core::{
34 convert::{Infallible, TryFrom},
35 mem, ops,
36};
37
38#[cfg(feature = "alloc")]
39use alloc::vec::Vec;
40
41/// An identifier for a regex pattern.
42///
43/// The identifier for a pattern corresponds to its relative position among
44/// other patterns in a single finite state machine. Namely, when building
45/// a multi-pattern regex engine, one must supply a sequence of patterns to
46/// match. The position (starting at 0) of each pattern in that sequence
47/// represents its identifier. This identifier is in turn used to identify and
48/// report matches of that pattern in various APIs.
49///
50/// A pattern ID is guaranteed to be representable by a `usize`. Similarly,
51/// the number of patterns in any regex engine in this crate is guaranteed to
52/// be representable by a `usize`. This applies to regex engines that have
53/// been deserialized; a deserialization error will be returned if it contains
54/// pattern IDs that violate these requirements in your current environment.
55///
56/// For extra convenience in some cases, this type also guarantees that all
57/// IDs can fit into an `i32` and an `isize` without overflowing.
58///
59/// # Representation
60///
61/// This type is always represented internally by a `u32` and is marked as
62/// `repr(transparent)`. Thus, this type always has the same representation as
63/// a `u32`.
64///
65/// # Indexing
66///
67/// For convenience, callers may use a `PatternID` to index slices.
68///
69/// # Safety
70///
71/// While a `PatternID` is meant to guarantee that its value fits into `usize`
72/// (while using a possibly smaller representation than `usize` on some
73/// targets), callers must not rely on this property for safety. Callers may
74/// choose to rely on this property for correctness however.
75#[repr(transparent)]
76#[derive(
77 Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
78)]
79pub struct PatternID(u32);
80
81impl PatternID {
82 /// The maximum pattern ID value, represented as a `usize`.
83 #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
84 pub const MAX: PatternID =
85 PatternID::new_unchecked(core::i32::MAX as usize - 1);
86
87 /// The maximum pattern ID value, represented as a `usize`.
88 #[cfg(target_pointer_width = "16")]
89 pub const MAX: PatternID = PatternID::new_unchecked(core::isize::MAX - 1);
90
91 /// The total number of patterns that are allowed in any single regex
92 /// engine.
93 pub const LIMIT: usize = PatternID::MAX.as_usize() + 1;
94
95 /// The zero pattern ID value.
96 pub const ZERO: PatternID = PatternID::new_unchecked(0);
97
98 /// The number of bytes that a single `PatternID` uses in memory.
99 pub const SIZE: usize = core::mem::size_of::<PatternID>();
100
101 /// Create a new pattern ID.
102 ///
103 /// If the given identifier exceeds [`PatternID::MAX`], then this returns
104 /// an error.
105 #[inline]
106 pub fn new(id: usize) -> Result<PatternID, PatternIDError> {
107 PatternID::try_from(id)
108 }
109
110 /// Create a new pattern ID without checking whether the given value
111 /// exceeds [`PatternID::MAX`].
112 ///
113 /// While this is unchecked, providing an incorrect value must never
114 /// sacrifice memory safety, as documented above.
115 #[inline]
116 pub const fn new_unchecked(id: usize) -> PatternID {
117 PatternID(id as u32)
118 }
119
120 /// Like [`PatternID::new`], but panics if the given ID is not valid.
121 #[inline]
122 pub fn must(id: usize) -> PatternID {
123 PatternID::new(id).unwrap()
124 }
125
126 /// Return this pattern ID as a `usize`.
127 #[inline]
128 pub const fn as_usize(&self) -> usize {
129 self.0 as usize
130 }
131
132 /// Return the internal u32 of this pattern ID.
133 #[inline]
134 pub const fn as_u32(&self) -> u32 {
135 self.0
136 }
137
138 /// Return the internal u32 of this pattern ID represented as an i32.
139 ///
140 /// This is guaranteed to never overflow an `i32`.
141 #[inline]
142 pub const fn as_i32(&self) -> i32 {
143 self.0 as i32
144 }
145
146 /// Returns one more than this pattern ID as a usize.
147 ///
148 /// Since a pattern ID has constraints on its maximum value, adding `1` to
149 /// it will always fit in a `usize` (and a `u32`).
150 #[inline]
151 pub fn one_more(&self) -> usize {
152 self.as_usize().checked_add(1).unwrap()
153 }
154
155 /// Decode this pattern ID from the bytes given using the native endian
156 /// byte order for the current target.
157 ///
158 /// If the decoded integer is not representable as a pattern ID for the
159 /// current target, then this returns an error.
160 #[inline]
161 pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<PatternID, PatternIDError> {
162 let id = u32::from_ne_bytes(bytes);
163 if id > PatternID::MAX.as_u32() {
164 return Err(PatternIDError { attempted: id as u64 });
165 }
166 Ok(PatternID::new_unchecked(id as usize))
167 }
168
169 /// Decode this pattern ID from the bytes given using the native endian
170 /// byte order for the current target.
171 ///
172 /// This is analogous to [`PatternID::new_unchecked`] in that is does not
173 /// check whether the decoded integer is representable as a pattern ID.
174 #[inline]
175 pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> PatternID {
176 PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
177 }
178
179 /// Return the underlying pattern ID integer as raw bytes in native endian
180 /// format.
181 #[inline]
182 pub fn to_ne_bytes(&self) -> [u8; 4] {
183 self.0.to_ne_bytes()
184 }
185
186 /// Returns an iterator over all pattern IDs from 0 up to and not including
187 /// the given length.
188 ///
189 /// If the given length exceeds [`PatternID::LIMIT`], then this panics.
190 #[cfg(feature = "alloc")]
191 pub(crate) fn iter(len: usize) -> PatternIDIter {
192 PatternIDIter::new(len)
193 }
194}
195
196/// This error occurs when a pattern ID could not be constructed.
197///
198/// This occurs when given an integer exceeding the maximum pattern ID value.
199///
200/// When the `std` feature is enabled, this implements the `Error` trait.
201#[derive(Clone, Debug, Eq, PartialEq)]
202pub struct PatternIDError {
203 attempted: u64,
204}
205
206impl PatternIDError {
207 /// Returns the value that failed to constructed a pattern ID.
208 pub fn attempted(&self) -> u64 {
209 self.attempted
210 }
211}
212
213#[cfg(feature = "std")]
214impl std::error::Error for PatternIDError {}
215
216impl core::fmt::Display for PatternIDError {
217 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
218 write!(
219 f,
220 "failed to create PatternID from {:?}, which exceeds {:?}",
221 self.attempted(),
222 PatternID::MAX,
223 )
224 }
225}
226
227/// An identifier for a state in a regex engine.
228///
229/// A state ID is guaranteed to be representable by a `usize`. Similarly, the
230/// number of states in any regex engine in this crate is guaranteed to be
231/// representable by a `usize`. This applies to regex engines that have been
232/// deserialized; a deserialization error will be returned if it contains state
233/// IDs that violate these requirements in your current environment.
234///
235/// For extra convenience in some cases, this type also guarantees that all
236/// IDs can fit into an `i32` and an `isize` without overflowing.
237///
238/// # Representation
239///
240/// This type is always represented internally by a `u32` and is marked as
241/// `repr(transparent)`. Thus, this type always has the same representation as
242/// a `u32`.
243///
244/// # Indexing
245///
246/// For convenience, callers may use a `StateID` to index slices.
247///
248/// # Safety
249///
250/// While a `StateID` is meant to guarantee that its value fits into `usize`
251/// (while using a possibly smaller representation than `usize` on some
252/// targets), callers must not rely on this property for safety. Callers may
253/// choose to rely on this property for correctness however.
254#[repr(transparent)]
255#[derive(
256 Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
257)]
258pub struct StateID(u32);
259
260impl StateID {
261 /// The maximum state ID value.
262 #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
263 pub const MAX: StateID =
264 StateID::new_unchecked(core::i32::MAX as usize - 1);
265
266 /// The maximum state ID value.
267 #[cfg(target_pointer_width = "16")]
268 pub const MAX: StateID = StateID::new_unchecked(core::isize::MAX - 1);
269
270 /// The total number of states that are allowed in any single regex
271 /// engine, represented as a `usize`.
272 pub const LIMIT: usize = StateID::MAX.as_usize() + 1;
273
274 /// The zero state ID value.
275 pub const ZERO: StateID = StateID::new_unchecked(0);
276
277 /// The number of bytes that a single `StateID` uses in memory.
278 pub const SIZE: usize = core::mem::size_of::<StateID>();
279
280 /// Create a new state ID.
281 ///
282 /// If the given identifier exceeds [`StateID::MAX`], then this returns
283 /// an error.
284 #[inline]
285 pub fn new(id: usize) -> Result<StateID, StateIDError> {
286 StateID::try_from(id)
287 }
288
289 /// Create a new state ID without checking whether the given value
290 /// exceeds [`StateID::MAX`].
291 ///
292 /// While this is unchecked, providing an incorrect value must never
293 /// sacrifice memory safety, as documented above.
294 #[inline]
295 pub const fn new_unchecked(id: usize) -> StateID {
296 StateID(id as u32)
297 }
298
299 /// Like [`StateID::new`], but panics if the given ID is not valid.
300 #[inline]
301 pub fn must(id: usize) -> StateID {
302 StateID::new(id).unwrap()
303 }
304
305 /// Return this state ID as a `usize`.
306 #[inline]
307 pub const fn as_usize(&self) -> usize {
308 self.0 as usize
309 }
310
311 /// Return the internal u32 of this state ID.
312 #[inline]
313 pub const fn as_u32(&self) -> u32 {
314 self.0
315 }
316
317 /// Return the internal u32 of this pattern ID represented as an i32.
318 ///
319 /// This is guaranteed to never overflow an `i32`.
320 #[inline]
321 pub const fn as_i32(&self) -> i32 {
322 self.0 as i32
323 }
324
325 /// Returns one more than this state ID as a usize.
326 ///
327 /// Since a state ID has constraints on its maximum value, adding `1` to
328 /// it will always fit in a `usize` (and a `u32`).
329 #[inline]
330 pub fn one_more(&self) -> usize {
331 self.as_usize().checked_add(1).unwrap()
332 }
333
334 /// Decode this state ID from the bytes given using the native endian byte
335 /// order for the current target.
336 ///
337 /// If the decoded integer is not representable as a state ID for the
338 /// current target, then this returns an error.
339 #[inline]
340 pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<StateID, StateIDError> {
341 let id = u32::from_ne_bytes(bytes);
342 if id > StateID::MAX.as_u32() {
343 return Err(StateIDError { attempted: id as u64 });
344 }
345 Ok(StateID::new_unchecked(id as usize))
346 }
347
348 /// Decode this state ID from the bytes given using the native endian
349 /// byte order for the current target.
350 ///
351 /// This is analogous to [`StateID::new_unchecked`] in that is does not
352 /// check whether the decoded integer is representable as a state ID.
353 #[inline]
354 pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> StateID {
355 StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
356 }
357
358 /// Return the underlying state ID integer as raw bytes in native endian
359 /// format.
360 #[inline]
361 pub fn to_ne_bytes(&self) -> [u8; 4] {
362 self.0.to_ne_bytes()
363 }
364
365 /// Returns an iterator over all state IDs from 0 up to and not including
366 /// the given length.
367 ///
368 /// If the given length exceeds [`StateID::LIMIT`], then this panics.
369 #[cfg(feature = "alloc")]
370 pub(crate) fn iter(len: usize) -> StateIDIter {
371 StateIDIter::new(len)
372 }
373}
374
375/// This error occurs when a state ID could not be constructed.
376///
377/// This occurs when given an integer exceeding the maximum state ID value.
378///
379/// When the `std` feature is enabled, this implements the `Error` trait.
380#[derive(Clone, Debug, Eq, PartialEq)]
381pub struct StateIDError {
382 attempted: u64,
383}
384
385impl StateIDError {
386 /// Returns the value that failed to constructed a state ID.
387 pub fn attempted(&self) -> u64 {
388 self.attempted
389 }
390}
391
392#[cfg(feature = "std")]
393impl std::error::Error for StateIDError {}
394
395impl core::fmt::Display for StateIDError {
396 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
397 write!(
398 f,
399 "failed to create StateID from {:?}, which exceeds {:?}",
400 self.attempted(),
401 StateID::MAX,
402 )
403 }
404}
405
406/// A macro for defining exactly identical (modulo names) impls for ID types.
407macro_rules! impls {
408 ($ty:ident, $tyerr:ident, $tyiter:ident) => {
409 #[derive(Clone, Debug)]
410 pub(crate) struct $tyiter {
411 rng: ops::Range<usize>,
412 }
413
414 impl $tyiter {
415 #[cfg(feature = "alloc")]
416 fn new(len: usize) -> $tyiter {
417 assert!(
418 len <= $ty::LIMIT,
419 "cannot create iterator with IDs when number of \
420 elements exceed {:?}",
421 $ty::LIMIT,
422 );
423 $tyiter { rng: 0..len }
424 }
425 }
426
427 impl Iterator for $tyiter {
428 type Item = $ty;
429
430 fn next(&mut self) -> Option<$ty> {
431 if self.rng.start >= self.rng.end {
432 return None;
433 }
434 let next_id = self.rng.start + 1;
435 let id = mem::replace(&mut self.rng.start, next_id);
436 // new_unchecked is OK since we asserted that the number of
437 // elements in this iterator will fit in an ID at construction.
438 Some($ty::new_unchecked(id))
439 }
440 }
441
442 impl<T> core::ops::Index<$ty> for [T] {
443 type Output = T;
444
445 #[inline]
446 fn index(&self, index: $ty) -> &T {
447 &self[index.as_usize()]
448 }
449 }
450
451 impl<T> core::ops::IndexMut<$ty> for [T] {
452 #[inline]
453 fn index_mut(&mut self, index: $ty) -> &mut T {
454 &mut self[index.as_usize()]
455 }
456 }
457
458 #[cfg(feature = "alloc")]
459 impl<T> core::ops::Index<$ty> for Vec<T> {
460 type Output = T;
461
462 #[inline]
463 fn index(&self, index: $ty) -> &T {
464 &self[index.as_usize()]
465 }
466 }
467
468 #[cfg(feature = "alloc")]
469 impl<T> core::ops::IndexMut<$ty> for Vec<T> {
470 #[inline]
471 fn index_mut(&mut self, index: $ty) -> &mut T {
472 &mut self[index.as_usize()]
473 }
474 }
475
476 impl TryFrom<usize> for $ty {
477 type Error = $tyerr;
478
479 fn try_from(id: usize) -> Result<$ty, $tyerr> {
480 if id > $ty::MAX.as_usize() {
481 return Err($tyerr { attempted: id as u64 });
482 }
483 Ok($ty::new_unchecked(id))
484 }
485 }
486
487 impl TryFrom<u8> for $ty {
488 type Error = Infallible;
489
490 fn try_from(id: u8) -> Result<$ty, Infallible> {
491 Ok($ty::new_unchecked(id as usize))
492 }
493 }
494
495 impl TryFrom<u16> for $ty {
496 type Error = $tyerr;
497
498 fn try_from(id: u16) -> Result<$ty, $tyerr> {
499 if id as u32 > $ty::MAX.as_u32() {
500 return Err($tyerr { attempted: id as u64 });
501 }
502 Ok($ty::new_unchecked(id as usize))
503 }
504 }
505
506 impl TryFrom<u32> for $ty {
507 type Error = $tyerr;
508
509 fn try_from(id: u32) -> Result<$ty, $tyerr> {
510 if id > $ty::MAX.as_u32() {
511 return Err($tyerr { attempted: id as u64 });
512 }
513 Ok($ty::new_unchecked(id as usize))
514 }
515 }
516
517 impl TryFrom<u64> for $ty {
518 type Error = $tyerr;
519
520 fn try_from(id: u64) -> Result<$ty, $tyerr> {
521 if id > $ty::MAX.as_u32() as u64 {
522 return Err($tyerr { attempted: id });
523 }
524 Ok($ty::new_unchecked(id as usize))
525 }
526 }
527
528 #[cfg(test)]
529 impl quickcheck::Arbitrary for $ty {
530 fn arbitrary(gen: &mut quickcheck::Gen) -> $ty {
531 use core::cmp::max;
532
533 let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs();
534 if id > $ty::MAX.as_i32() {
535 $ty::MAX
536 } else {
537 $ty::new(usize::try_from(id).unwrap()).unwrap()
538 }
539 }
540 }
541 };
542}
543
544impls!(PatternID, PatternIDError, PatternIDIter);
545impls!(StateID, StateIDError, StateIDIter);
546
547/// A utility trait that defines a couple of adapters for making it convenient
548/// to access indices as ID types. We require ExactSizeIterator so that
549/// iterator construction can do a single check to make sure the index of each
550/// element is representable by its ID type.
551#[cfg(feature = "alloc")]
552pub(crate) trait IteratorIDExt: Iterator {
553 fn with_pattern_ids(self) -> WithPatternIDIter<Self>
554 where
555 Self: Sized + ExactSizeIterator,
556 {
557 WithPatternIDIter::new(self)
558 }
559
560 fn with_state_ids(self) -> WithStateIDIter<Self>
561 where
562 Self: Sized + ExactSizeIterator,
563 {
564 WithStateIDIter::new(self)
565 }
566}
567
568#[cfg(feature = "alloc")]
569impl<I: Iterator> IteratorIDExt for I {}
570
571#[cfg(feature = "alloc")]
572macro_rules! iditer {
573 ($ty:ident, $iterty:ident, $withiterty:ident) => {
574 /// An iterator adapter that is like std::iter::Enumerate, but attaches
575 /// IDs. It requires ExactSizeIterator. At construction, it ensures
576 /// that the index of each element in the iterator is representable in
577 /// the corresponding ID type.
578 #[derive(Clone, Debug)]
579 pub(crate) struct $withiterty<I> {
580 it: I,
581 ids: $iterty,
582 }
583
584 impl<I: Iterator + ExactSizeIterator> $withiterty<I> {
585 fn new(it: I) -> $withiterty<I> {
586 let ids = $ty::iter(it.len());
587 $withiterty { it, ids }
588 }
589 }
590
591 impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> {
592 type Item = ($ty, I::Item);
593
594 fn next(&mut self) -> Option<($ty, I::Item)> {
595 let item = self.it.next()?;
596 // Number of elements in this iterator must match, according
597 // to contract of ExactSizeIterator.
598 let id = self.ids.next().unwrap();
599 Some((id, item))
600 }
601 }
602 };
603}
604
605#[cfg(feature = "alloc")]
606iditer!(PatternID, PatternIDIter, WithPatternIDIter);
607#[cfg(feature = "alloc")]
608iditer!(StateID, StateIDIter, WithStateIDIter);
609