rarebytes.rs source code [crates/memchr-2.5.0/src/memmem/rarebytes.rs]

1	/// A heuristic frequency based detection of rare bytes for substring search.
2	///
3	/// This detector attempts to pick out two bytes in a needle that are predicted
4	/// to occur least frequently. The purpose is to use these bytes to implement
5	/// fast candidate search using vectorized code.
6	///
7	/// A set of offsets is only computed for needles of length 2 or greater.
8	/// Smaller needles should be special cased by the substring search algorithm
9	/// in use. (e.g., Use memchr for single byte needles.)
10	///
11	/// Note that we use `u8` to represent the offsets of the rare bytes in a
12	/// needle to reduce space usage. This means that rare byte occurring after the
13	/// first 255 bytes in a needle will never be used.
14	#[derive(Clone, Copy, Debug, Default)]
15	pub(crate) struct RareNeedleBytes {
16	/// The leftmost offset of the rarest byte in the needle, according to
17	/// pre-computed frequency analysis. The "leftmost offset" means that
18	/// rare1i <= i for all i where needle[i] == needle[rare1i].
19	rare1i: u8,
20	/// The leftmost offset of the second rarest byte in the needle, according
21	/// to pre-computed frequency analysis. The "leftmost offset" means that
22	/// rare2i <= i for all i where needle[i] == needle[rare2i].
23	///
24	/// The second rarest byte is used as a type of guard for quickly detecting
25	/// a mismatch if the first byte matches. This is a hedge against
26	/// pathological cases where the pre-computed frequency analysis may be
27	/// off. (But of course, does not prevent all* pathological cases.)*
28	///
29	/// In general, rare1i != rare2i by construction, although there is no hard
30	/// requirement that they be different. However, since the case of a single
31	/// byte needle is handled specially by memchr itself, rare2i generally
32	/// always should be different from rare1i since it would otherwise be
33	/// ineffective as a guard.
34	rare2i: u8,
35	}
36
37	impl RareNeedleBytes {
38	/// Create a new pair of rare needle bytes with the given offsets. This is
39	/// only used in tests for generating input data.
40	#[cfg(all(test, feature = "std"))]
41	pub(crate) fn new(rare1i: u8, rare2i: u8) -> RareNeedleBytes {
42	RareNeedleBytes { rare1i, rare2i }
43	}
44
45	/// Detect the leftmost offsets of the two rarest bytes in the given
46	/// needle.
47	pub(crate) fn forward(needle: &[u8]) -> RareNeedleBytes {
48	if needle.len() <= `1` \|\| needle.len() > core::u8::MAX as usize {
49	// For needles bigger than u8::MAX, our offsets aren't big enough.
50	// (We make our offsets small to reduce stack copying.)
51	// If you have a use case for it, please file an issue. In that
52	// case, we should probably just adjust the routine below to pick
53	// some rare bytes from the first 255 bytes of the needle.
54	//
55	// Also note that for needles of size 0 or 1, they are special
56	// cased in Two-Way.
57	//
58	// TODO: Benchmar this.
59	return RareNeedleBytes { rare1i: `0`, rare2i: `0` };
60	}
61
62	// Find the rarest two bytes. We make them distinct by construction.
63	let (mut rare1, mut rare1i) = (needle[`0`], `0`);
64	let (mut rare2, mut rare2i) = (needle[`1`], `1`);
65	if rank(rare2) < rank(rare1) {
66	core::mem::swap(&mut rare1, &mut rare2);
67	core::mem::swap(&mut rare1i, &mut rare2i);
68	}
69	for (i, &b) in needle.iter().enumerate().skip(`2`) {
70	if rank(b) < rank(rare1) {
71	rare2 = rare1;
72	rare2i = rare1i;
73	rare1 = b;
74	rare1i = i as u8;
75	} else if b != rare1 && rank(b) < rank(rare2) {
76	rare2 = b;
77	rare2i = i as u8;
78	}
79	}
80	// While not strictly required, we really don't want these to be
81	// equivalent. If they were, it would reduce the effectiveness of
82	// candidate searching using these rare bytes by increasing the rate of
83	// false positives.
84	assert_ne!(rare1i, rare2i);
85	RareNeedleBytes { rare1i, rare2i }
86	}
87
88	/// Return the rare bytes in the given needle in the forward direction.
89	/// The needle given must be the same one given to the RareNeedleBytes
90	/// constructor.
91	pub(crate) fn as_rare_bytes(&self, needle: &[u8]) -> (u8, u8) {
92	(needle[self.rare1i as usize], needle[self.rare2i as usize])
93	}
94
95	/// Return the rare offsets such that the first offset is always <= to the
96	/// second offset. This is useful when the caller doesn't care whether
97	/// rare1 is rarer than rare2, but just wants to ensure that they are
98	/// ordered with respect to one another.
99	#[cfg(memchr_runtime_simd)]
100	pub(crate) fn as_rare_ordered_usize(&self) -> (usize, usize) {
101	let (rare1i, rare2i) = self.as_rare_ordered_u8();
102	(rare1i as usize, rare2i as usize)
103	}
104
105	/// Like as_rare_ordered_usize, but returns the offsets as their native
106	/// u8 values.
107	#[cfg(memchr_runtime_simd)]
108	pub(crate) fn as_rare_ordered_u8(&self) -> (u8, u8) {
109	if self.rare1i <= self.rare2i {
110	(self.rare1i, self.rare2i)
111	} else {
112	(self.rare2i, self.rare1i)
113	}
114	}
115
116	/// Return the rare offsets as usize values in the order in which they were
117	/// constructed. rare1, for example, is constructed as the "rarer" byte,
118	/// and thus, callers may want to treat it differently from rare2.
119	pub(crate) fn as_rare_usize(&self) -> (usize, usize) {
120	(self.rare1i as usize, self.rare2i as usize)
121	}
122
123	/// Return the byte frequency rank of each byte. The higher the rank, the
124	/// more frequency the byte is predicted to be. The needle given must be
125	/// the same one given to the RareNeedleBytes constructor.
126	pub(crate) fn as_ranks(&self, needle: &[u8]) -> (usize, usize) {
127	let (b1, b2) = self.as_rare_bytes(needle);
128	(rank(b1), rank(b2))
129	}
130	}
131
132	/// Return the heuristical frequency rank of the given byte. A lower rank
133	/// means the byte is believed to occur less frequently.
134	fn rank(b: u8) -> usize {
135	crate::memmem::byte_frequencies::BYTE_FREQUENCIES[b as usize] as usize
136	}
137

Provided by KDAB

Definitions

Learn Rust with the experts

Find out more