id.rs source code [crates/regex_automata/src/util/id.rs]

1	/!*
2	Type definitions for identifier types.
3
4	A [`StateID`] represents the possible set of identifiers used in regex engine
5	implementations in this crate. For example, they are used to identify both NFA
6	and DFA states.
7
8	A [`PatternID`] represents the possible set of identifiers for patterns. All
9	regex engine implementations in this crate support searching for multiple
10	patterns simultaneously. A `PatternID` is how each pattern is uniquely
11	identified for a particular instance of a regex engine. Namely, a pattern is
12	assigned an auto-incrementing integer, starting at `0`, based on the order of
13	patterns supplied during the construction of the regex engine.
14
15	These identifier types represent a way for this crate to make correctness
16	guarantees around the possible set of values that a `StateID` or a `PatternID`
17	might represent. Similarly, they also provide a way of constraining the size of
18	these identifiers to reduce space usage while still guaranteeing that all such
19	identifiers are repsentable by a `usize` for the current target.
20
21	Moreover, the identifier types clamp the range of permissible values to a range
22	that is typically smaller than its internal representation. (With the maximum
23	value being, e.g., `StateID::MAX`.) Users of these types may not rely this
24	clamping for the purpose of memory safety. Users may, however, rely on these
25	invariants to avoid panics or other types of logic bugs.
26	*/
27
28	// Continuing from the above comment about correctness guarantees, an example
29	// of a way in which we use the guarantees on these types is delta encoding.
30	// Namely, we require that IDs can be at most 2^31 - 2, which means the
31	// difference between any two IDs is always representable as an i32.
32
33	use core::{
34	convert::{Infallible, TryFrom},
35	mem, ops,
36	};
37
38	#[cfg(feature = "alloc")]
39	use alloc::vec::Vec;
40
41	/// An identifier for a regex pattern.
42	///
43	/// The identifier for a pattern corresponds to its relative position among
44	/// other patterns in a single finite state machine. Namely, when building
45	/// a multi-pattern regex engine, one must supply a sequence of patterns to
46	/// match. The position (starting at 0) of each pattern in that sequence
47	/// represents its identifier. This identifier is in turn used to identify and
48	/// report matches of that pattern in various APIs.
49	///
50	/// A pattern ID is guaranteed to be representable by a `usize`. Similarly,
51	/// the number of patterns in any regex engine in this crate is guaranteed to
52	/// be representable by a `usize`. This applies to regex engines that have
53	/// been deserialized; a deserialization error will be returned if it contains
54	/// pattern IDs that violate these requirements in your current environment.
55	///
56	/// For extra convenience in some cases, this type also guarantees that all
57	/// IDs can fit into an `i32` and an `isize` without overflowing.
58	///
59	/// # Representation
60	///
61	/// This type is always represented internally by a `u32` and is marked as
62	/// `repr(transparent)`. Thus, this type always has the same representation as
63	/// a `u32`.
64	///
65	/// # Indexing
66	///
67	/// For convenience, callers may use a `PatternID` to index slices.
68	///
69	/// # Safety
70	///
71	/// While a `PatternID` is meant to guarantee that its value fits into `usize`
72	/// (while using a possibly smaller representation than `usize` on some
73	/// targets), callers must not rely on this property for safety. Callers may
74	/// choose to rely on this property for correctness however.
75	#[repr(transparent)]
76	#[derive(
77	Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
78	)]
79	pub struct PatternID(u32);
80
81	impl PatternID {
82	/// The maximum pattern ID value, represented as a `usize`.
83	#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
84	pub const MAX: PatternID =
85	PatternID::new_unchecked(core::i32::MAX as usize - `1`);
86
87	/// The maximum pattern ID value, represented as a `usize`.
88	#[cfg(target_pointer_width = "16")]
89	pub const MAX: PatternID = PatternID::new_unchecked(core::isize::MAX - `1`);
90
91	/// The total number of patterns that are allowed in any single regex
92	/// engine.
93	pub const LIMIT: usize = PatternID::MAX.as_usize() + `1`;
94
95	/// The zero pattern ID value.
96	pub const ZERO: PatternID = PatternID::new_unchecked(`0`);
97
98	/// The number of bytes that a single `PatternID` uses in memory.
99	pub const SIZE: usize = core::mem::size_of::<PatternID>();
100
101	/// Create a new pattern ID.
102	///
103	/// If the given identifier exceeds [`PatternID::MAX`], then this returns
104	/// an error.
105	#[inline]
106	pub fn new(id: usize) -> Result<PatternID, PatternIDError> {
107	PatternID::try_from(id)
108	}
109
110	/// Create a new pattern ID without checking whether the given value
111	/// exceeds [`PatternID::MAX`].
112	///
113	/// While this is unchecked, providing an incorrect value must never
114	/// sacrifice memory safety, as documented above.
115	#[inline]
116	pub const fn new_unchecked(id: usize) -> PatternID {
117	PatternID(id as u32)
118	}
119
120	/// Like [`PatternID::new`], but panics if the given ID is not valid.
121	#[inline]
122	pub fn must(id: usize) -> PatternID {
123	PatternID::new(id).unwrap()
124	}
125
126	/// Return this pattern ID as a `usize`.
127	#[inline]
128	pub const fn as_usize(&self) -> usize {
129	self.0 as usize
130	}
131
132	/// Return the internal u32 of this pattern ID.
133	#[inline]
134	pub const fn as_u32(&self) -> u32 {
135	self.0
136	}
137
138	/// Return the internal u32 of this pattern ID represented as an i32.
139	///
140	/// This is guaranteed to never overflow an `i32`.
141	#[inline]
142	pub const fn as_i32(&self) -> i32 {
143	self.0 as i32
144	}
145
146	/// Returns one more than this pattern ID as a usize.
147	///
148	/// Since a pattern ID has constraints on its maximum value, adding `1` to
149	/// it will always fit in a `usize` (and a `u32`).
150	#[inline]
151	pub fn one_more(&self) -> usize {
152	self.as_usize().checked_add(`1`).unwrap()
153	}
154
155	/// Decode this pattern ID from the bytes given using the native endian
156	/// byte order for the current target.
157	///
158	/// If the decoded integer is not representable as a pattern ID for the
159	/// current target, then this returns an error.
160	#[inline]
161	pub fn from_ne_bytes(bytes: [u8; `4`]) -> Result<PatternID, PatternIDError> {
162	let id = u32::from_ne_bytes(bytes);
163	if id > PatternID::MAX.as_u32() {
164	return Err(PatternIDError { attempted: id as u64 });
165	}
166	Ok(PatternID::new_unchecked(id as usize))
167	}
168
169	/// Decode this pattern ID from the bytes given using the native endian
170	/// byte order for the current target.
171	///
172	/// This is analogous to [`PatternID::new_unchecked`] in that is does not
173	/// check whether the decoded integer is representable as a pattern ID.
174	#[inline]
175	pub fn from_ne_bytes_unchecked(bytes: [u8; `4`]) -> PatternID {
176	PatternID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
177	}
178
179	/// Return the underlying pattern ID integer as raw bytes in native endian
180	/// format.
181	#[inline]
182	pub fn to_ne_bytes(&self) -> [u8; `4`] {
183	self.0.to_ne_bytes()
184	}
185
186	/// Returns an iterator over all pattern IDs from 0 up to and not including
187	/// the given length.
188	///
189	/// If the given length exceeds [`PatternID::LIMIT`], then this panics.
190	#[cfg(feature = "alloc")]
191	pub(crate) fn iter(len: usize) -> PatternIDIter {
192	PatternIDIter::new(len)
193	}
194	}
195
196	/// This error occurs when a pattern ID could not be constructed.
197	///
198	/// This occurs when given an integer exceeding the maximum pattern ID value.
199	///
200	/// When the `std` feature is enabled, this implements the `Error` trait.
201	#[derive(Clone, Debug, Eq, PartialEq)]
202	pub struct PatternIDError {
203	attempted: u64,
204	}
205
206	impl PatternIDError {
207	/// Returns the value that failed to constructed a pattern ID.
208	pub fn attempted(&self) -> u64 {
209	self.attempted
210	}
211	}
212
213	#[cfg(feature = "std")]
214	impl std::error::Error for PatternIDError {}
215
216	impl core::fmt::Display for PatternIDError {
217	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
218	write!(
219	f,
220	"failed to create PatternID from {:?}, which exceeds {:?}",
221	self.attempted(),
222	PatternID::MAX,
223	)
224	}
225	}
226
227	/// An identifier for a state in a regex engine.
228	///
229	/// A state ID is guaranteed to be representable by a `usize`. Similarly, the
230	/// number of states in any regex engine in this crate is guaranteed to be
231	/// representable by a `usize`. This applies to regex engines that have been
232	/// deserialized; a deserialization error will be returned if it contains state
233	/// IDs that violate these requirements in your current environment.
234	///
235	/// For extra convenience in some cases, this type also guarantees that all
236	/// IDs can fit into an `i32` and an `isize` without overflowing.
237	///
238	/// # Representation
239	///
240	/// This type is always represented internally by a `u32` and is marked as
241	/// `repr(transparent)`. Thus, this type always has the same representation as
242	/// a `u32`.
243	///
244	/// # Indexing
245	///
246	/// For convenience, callers may use a `StateID` to index slices.
247	///
248	/// # Safety
249	///
250	/// While a `StateID` is meant to guarantee that its value fits into `usize`
251	/// (while using a possibly smaller representation than `usize` on some
252	/// targets), callers must not rely on this property for safety. Callers may
253	/// choose to rely on this property for correctness however.
254	#[repr(transparent)]
255	#[derive(
256	Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
257	)]
258	pub struct StateID(u32);
259
260	impl StateID {
261	/// The maximum state ID value.
262	#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
263	pub const MAX: StateID =
264	StateID::new_unchecked(core::i32::MAX as usize - `1`);
265
266	/// The maximum state ID value.
267	#[cfg(target_pointer_width = "16")]
268	pub const MAX: StateID = StateID::new_unchecked(core::isize::MAX - `1`);
269
270	/// The total number of states that are allowed in any single regex
271	/// engine, represented as a `usize`.
272	pub const LIMIT: usize = StateID::MAX.as_usize() + `1`;
273
274	/// The zero state ID value.
275	pub const ZERO: StateID = StateID::new_unchecked(`0`);
276
277	/// The number of bytes that a single `StateID` uses in memory.
278	pub const SIZE: usize = core::mem::size_of::<StateID>();
279
280	/// Create a new state ID.
281	///
282	/// If the given identifier exceeds [`StateID::MAX`], then this returns
283	/// an error.
284	#[inline]
285	pub fn new(id: usize) -> Result<StateID, StateIDError> {
286	StateID::try_from(id)
287	}
288
289	/// Create a new state ID without checking whether the given value
290	/// exceeds [`StateID::MAX`].
291	///
292	/// While this is unchecked, providing an incorrect value must never
293	/// sacrifice memory safety, as documented above.
294	#[inline]
295	pub const fn new_unchecked(id: usize) -> StateID {
296	StateID(id as u32)
297	}
298
299	/// Like [`StateID::new`], but panics if the given ID is not valid.
300	#[inline]
301	pub fn must(id: usize) -> StateID {
302	StateID::new(id).unwrap()
303	}
304
305	/// Return this state ID as a `usize`.
306	#[inline]
307	pub const fn as_usize(&self) -> usize {
308	self.0 as usize
309	}
310
311	/// Return the internal u32 of this state ID.
312	#[inline]
313	pub const fn as_u32(&self) -> u32 {
314	self.0
315	}
316
317	/// Return the internal u32 of this pattern ID represented as an i32.
318	///
319	/// This is guaranteed to never overflow an `i32`.
320	#[inline]
321	pub const fn as_i32(&self) -> i32 {
322	self.0 as i32
323	}
324
325	/// Returns one more than this state ID as a usize.
326	///
327	/// Since a state ID has constraints on its maximum value, adding `1` to
328	/// it will always fit in a `usize` (and a `u32`).
329	#[inline]
330	pub fn one_more(&self) -> usize {
331	self.as_usize().checked_add(`1`).unwrap()
332	}
333
334	/// Decode this state ID from the bytes given using the native endian byte
335	/// order for the current target.
336	///
337	/// If the decoded integer is not representable as a state ID for the
338	/// current target, then this returns an error.
339	#[inline]
340	pub fn from_ne_bytes(bytes: [u8; `4`]) -> Result<StateID, StateIDError> {
341	let id = u32::from_ne_bytes(bytes);
342	if id > StateID::MAX.as_u32() {
343	return Err(StateIDError { attempted: id as u64 });
344	}
345	Ok(StateID::new_unchecked(id as usize))
346	}
347
348	/// Decode this state ID from the bytes given using the native endian
349	/// byte order for the current target.
350	///
351	/// This is analogous to [`StateID::new_unchecked`] in that is does not
352	/// check whether the decoded integer is representable as a state ID.
353	#[inline]
354	pub fn from_ne_bytes_unchecked(bytes: [u8; `4`]) -> StateID {
355	StateID::new_unchecked(u32::from_ne_bytes(bytes) as usize)
356	}
357
358	/// Return the underlying state ID integer as raw bytes in native endian
359	/// format.
360	#[inline]
361	pub fn to_ne_bytes(&self) -> [u8; `4`] {
362	self.0.to_ne_bytes()
363	}
364
365	/// Returns an iterator over all state IDs from 0 up to and not including
366	/// the given length.
367	///
368	/// If the given length exceeds [`StateID::LIMIT`], then this panics.
369	#[cfg(feature = "alloc")]
370	pub(crate) fn iter(len: usize) -> StateIDIter {
371	StateIDIter::new(len)
372	}
373	}
374
375	/// This error occurs when a state ID could not be constructed.
376	///
377	/// This occurs when given an integer exceeding the maximum state ID value.
378	///
379	/// When the `std` feature is enabled, this implements the `Error` trait.
380	#[derive(Clone, Debug, Eq, PartialEq)]
381	pub struct StateIDError {
382	attempted: u64,
383	}
384
385	impl StateIDError {
386	/// Returns the value that failed to constructed a state ID.
387	pub fn attempted(&self) -> u64 {
388	self.attempted
389	}
390	}
391
392	#[cfg(feature = "std")]
393	impl std::error::Error for StateIDError {}
394
395	impl core::fmt::Display for StateIDError {
396	fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
397	write!(
398	f,
399	"failed to create StateID from {:?}, which exceeds {:?}",
400	self.attempted(),
401	StateID::MAX,
402	)
403	}
404	}
405
406	/// A macro for defining exactly identical (modulo names) impls for ID types.
407	macro_rules! impls {
408	($ty:ident, $tyerr:ident, $tyiter:ident) => {
409	#[derive(Clone, Debug)]
410	pub(crate) struct $tyiter {
411	rng: ops::Range<usize>,
412	}
413
414	impl $tyiter {
415	#[cfg(feature = "alloc")]
416	fn new(len: usize) -> $tyiter {
417	assert!(
418	len <= $ty::LIMIT,
419	"cannot create iterator with IDs when number of \
420	elements exceed {:?}",
421	$ty::LIMIT,
422	);
423	$tyiter { rng: `0`..len }
424	}
425	}
426
427	impl Iterator for $tyiter {
428	type Item = $ty;
429
430	fn next(&mut self) -> Option<$ty> {
431	if self.rng.start >= self.rng.end {
432	return None;
433	}
434	let next_id = self.rng.start + `1`;
435	let id = mem::replace(&mut self.rng.start, next_id);
436	// new_unchecked is OK since we asserted that the number of
437	// elements in this iterator will fit in an ID at construction.
438	Some($ty::new_unchecked(id))
439	}
440	}
441
442	impl<T> core::ops::Index<$ty> for [T] {
443	type Output = T;
444
445	#[inline]
446	fn index(&self, index: $ty) -> &T {
447	&self[index.as_usize()]
448	}
449	}
450
451	impl<T> core::ops::IndexMut<$ty> for [T] {
452	#[inline]
453	fn index_mut(&mut self, index: $ty) -> &mut T {
454	&mut self[index.as_usize()]
455	}
456	}
457
458	#[cfg(feature = "alloc")]
459	impl<T> core::ops::Index<$ty> for Vec<T> {
460	type Output = T;
461
462	#[inline]
463	fn index(&self, index: $ty) -> &T {
464	&self[index.as_usize()]
465	}
466	}
467
468	#[cfg(feature = "alloc")]
469	impl<T> core::ops::IndexMut<$ty> for Vec<T> {
470	#[inline]
471	fn index_mut(&mut self, index: $ty) -> &mut T {
472	&mut self[index.as_usize()]
473	}
474	}
475
476	impl TryFrom<usize> for $ty {
477	type Error = $tyerr;
478
479	fn try_from(id: usize) -> Result<$ty, $tyerr> {
480	if id > $ty::MAX.as_usize() {
481	return Err($tyerr { attempted: id as u64 });
482	}
483	Ok($ty::new_unchecked(id))
484	}
485	}
486
487	impl TryFrom<u8> for $ty {
488	type Error = Infallible;
489
490	fn try_from(id: u8) -> Result<$ty, Infallible> {
491	Ok($ty::new_unchecked(id as usize))
492	}
493	}
494
495	impl TryFrom<u16> for $ty {
496	type Error = $tyerr;
497
498	fn try_from(id: u16) -> Result<$ty, $tyerr> {
499	if id as u32 > $ty::MAX.as_u32() {
500	return Err($tyerr { attempted: id as u64 });
501	}
502	Ok($ty::new_unchecked(id as usize))
503	}
504	}
505
506	impl TryFrom<u32> for $ty {
507	type Error = $tyerr;
508
509	fn try_from(id: u32) -> Result<$ty, $tyerr> {
510	if id > $ty::MAX.as_u32() {
511	return Err($tyerr { attempted: id as u64 });
512	}
513	Ok($ty::new_unchecked(id as usize))
514	}
515	}
516
517	impl TryFrom<u64> for $ty {
518	type Error = $tyerr;
519
520	fn try_from(id: u64) -> Result<$ty, $tyerr> {
521	if id > $ty::MAX.as_u32() as u64 {
522	return Err($tyerr { attempted: id });
523	}
524	Ok($ty::new_unchecked(id as usize))
525	}
526	}
527
528	#[cfg(test)]
529	impl quickcheck::Arbitrary for $ty {
530	fn arbitrary(gen: &mut quickcheck::Gen) -> $ty {
531	use core::cmp::max;
532
533	let id = max(i32::MIN + `1`, i32::arbitrary(gen)).abs();
534	if id > $ty::MAX.as_i32() {
535	$ty::MAX
536	} else {
537	$ty::new(usize::try_from(id).unwrap()).unwrap()
538	}
539	}
540	}
541	};
542	}
543
544	impls!(PatternID, PatternIDError, PatternIDIter);
545	impls!(StateID, StateIDError, StateIDIter);
546
547	/// A utility trait that defines a couple of adapters for making it convenient
548	/// to access indices as ID types. We require ExactSizeIterator so that
549	/// iterator construction can do a single check to make sure the index of each
550	/// element is representable by its ID type.
551	#[cfg(feature = "alloc")]
552	pub(crate) trait IteratorIDExt: Iterator {
553	fn with_pattern_ids(self) -> WithPatternIDIter<Self>
554	where
555	Self: Sized + ExactSizeIterator,
556	{
557	WithPatternIDIter::new(self)
558	}
559
560	fn with_state_ids(self) -> WithStateIDIter<Self>
561	where
562	Self: Sized + ExactSizeIterator,
563	{
564	WithStateIDIter::new(self)
565	}
566	}
567
568	#[cfg(feature = "alloc")]
569	impl<I: Iterator> IteratorIDExt for I {}
570
571	#[cfg(feature = "alloc")]
572	macro_rules! iditer {
573	($ty:ident, $iterty:ident, $withiterty:ident) => {
574	/// An iterator adapter that is like std::iter::Enumerate, but attaches
575	/// IDs. It requires ExactSizeIterator. At construction, it ensures
576	/// that the index of each element in the iterator is representable in
577	/// the corresponding ID type.
578	#[derive(Clone, Debug)]
579	pub(crate) struct $withiterty<I> {
580	it: I,
581	ids: $iterty,
582	}
583
584	impl<I: Iterator + ExactSizeIterator> $withiterty<I> {
585	fn new(it: I) -> $withiterty<I> {
586	let ids = $ty::iter(it.len());
587	$withiterty { it, ids }
588	}
589	}
590
591	impl<I: Iterator + ExactSizeIterator> Iterator for $withiterty<I> {
592	type Item = ($ty, I::Item);
593
594	fn next(&mut self) -> Option<($ty, I::Item)> {
595	let item = self.it.next()?;
596	// Number of elements in this iterator must match, according
597	// to contract of ExactSizeIterator.
598	let id = self.ids.next().unwrap();
599	Some((id, item))
600	}
601	}
602	};
603	}
604
605	#[cfg(feature = "alloc")]
606	iditer!(PatternID, PatternIDIter, WithPatternIDIter);
607	#[cfg(feature = "alloc")]
608	iditer!(StateID, StateIDIter, WithStateIDIter);
609