buffer.rs source code [crates/aho-corasick-1.0.2/src/util/buffer.rs]

1	use alloc::{vec, vec::Vec};
2
3	/// The default buffer capacity that we use for the stream buffer.
4	const DEFAULT_BUFFER_CAPACITY: usize = `64` * (`1` << `10`); // 64 KB
5
6	/// A fairly simple roll buffer for supporting stream searches.
7	///
8	/// This buffer acts as a temporary place to store a fixed amount of data when
9	/// reading from a stream. Its central purpose is to allow "rolling" some
10	/// suffix of the data to the beginning of the buffer before refilling it with
11	/// more data from the stream. For example, let's say we are trying to match
12	/// "foobar" on a stream. When we report the match, we'd like to not only
13	/// report the correct offsets at which the match occurs, but also the matching
14	/// bytes themselves. So let's say our stream is a file with the following
15	/// contents: `test test foobar test test`. Now assume that we happen to read
16	/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
17	/// Naively, it would not be possible to report a single contiguous `foobar`
18	/// match, but this roll buffer allows us to do that. Namely, after the second
19	/// read, the contents of the buffer should be `st foobar test test`, where the
20	/// search should ultimately resume immediately after `foo`. (The prefix `st `
21	/// is included because the roll buffer saves N bytes at the end of the buffer,
22	/// where N is the maximum possible length of a match.)
23	///
24	/// A lot of the logic for dealing with this is unfortunately split out between
25	/// this roll buffer and the `StreamChunkIter`.
26	///
27	/// Note also that this buffer is not actually required to just report matches.
28	/// Because a `Match` is just some offsets. But it is* required for supporting*
29	/// things like `try_stream_replace_all` because that needs some mechanism for
30	/// knowing which bytes in the stream correspond to a match and which don't. So
31	/// when a match occurs across two `read` calls, something* needs to retain*
32	/// the bytes from the previous `read` call because you don't know before the
33	/// second read call whether a match exists or not.
34	#[derive(Debug)]
35	pub(crate) struct Buffer {
36	/// The raw buffer contents. This has a fixed size and never increases.
37	buf: Vec<u8>,
38	/// The minimum size of the buffer, which is equivalent to the maximum
39	/// possible length of a match. This corresponds to the amount that we
40	/// roll
41	min: usize,
42	/// The end of the contents of this buffer.
43	end: usize,
44	}
45
46	impl Buffer {
47	/// Create a new buffer for stream searching. The minimum buffer length
48	/// given should be the size of the maximum possible match length.
49	pub(crate) fn new(min_buffer_len: usize) -> Buffer {
50	let min = core::cmp::max(`1`, min_buffer_len);
51	// The minimum buffer amount is also the amount that we roll our
52	// buffer in order to support incremental searching. To this end,
53	// our actual capacity needs to be at least 1 byte bigger than our
54	// minimum amount, otherwise we won't have any overlap. In actuality,
55	// we want our buffer to be a bit bigger than that for performance
56	// reasons, so we set a lower bound of `8 min`.*
57	//
58	// TODO: It would be good to find a way to test the streaming
59	// implementation with the minimal buffer size. For now, we just
60	// uncomment out the next line and comment out the subsequent line.
61	// let capacity = 1 + min;
62	let capacity = core::cmp::max(min * `8`, DEFAULT_BUFFER_CAPACITY);
63	Buffer { buf: vec![`0`; capacity], min, end: `0` }
64	}
65
66	/// Return the contents of this buffer.
67	#[inline]
68	pub(crate) fn buffer(&self) -> &[u8] {
69	&self.buf[..self.end]
70	}
71
72	/// Return the minimum size of the buffer. The only way a buffer may be
73	/// smaller than this is if the stream itself contains less than the
74	/// minimum buffer amount.
75	#[inline]
76	pub(crate) fn min_buffer_len(&self) -> usize {
77	self.min
78	}
79
80	/// Return all free capacity in this buffer.
81	fn free_buffer(&mut self) -> &mut [u8] {
82	&mut self.buf[self.end..]
83	}
84
85	/// Refill the contents of this buffer by reading as much as possible into
86	/// this buffer's free capacity. If no more bytes could be read, then this
87	/// returns false. Otherwise, this reads until it has filled the buffer
88	/// past the minimum amount.
89	pub(crate) fn fill<R: std::io::Read>(
90	&mut self,
91	mut rdr: R,
92	) -> std::io::Result<bool> {
93	let mut readany = `false`;
94	loop {
95	let readlen = rdr.read(self.free_buffer())?;
96	if readlen == `0` {
97	return Ok(readany);
98	}
99	readany = `true`;
100	self.end += readlen;
101	if self.buffer().len() >= self.min {
102	return Ok(`true`);
103	}
104	}
105	}
106
107	/// Roll the contents of the buffer so that the suffix of this buffer is
108	/// moved to the front and all other contents are dropped. The size of the
109	/// suffix corresponds precisely to the minimum buffer length.
110	///
111	/// This should only be called when the entire contents of this buffer have
112	/// been searched.
113	pub(crate) fn roll(&mut self) {
114	let roll_start = self
115	.end
116	.checked_sub(self.min)
117	.expect("buffer capacity should be bigger than minimum amount");
118	let roll_end = roll_start + self.min;
119
120	assert!(roll_end <= self.end);
121	self.buf.copy_within(roll_start..roll_end, `0`);
122	self.end = self.min;
123	}
124	}
125