buffer.rs source code [crates/aho-corasick-0.7.20/src/buffer.rs]

1	use std::cmp;
2	use std::io;
3	use std::ptr;
4
5	/// The default buffer capacity that we use for the stream buffer.
6	const DEFAULT_BUFFER_CAPACITY: usize = `8` * (`1` << `10`); // 8 KB
7
8	/// A fairly simple roll buffer for supporting stream searches.
9	///
10	/// This buffer acts as a temporary place to store a fixed amount of data when
11	/// reading from a stream. Its central purpose is to allow "rolling" some
12	/// suffix of the data to the beginning of the buffer before refilling it with
13	/// more data from the stream. For example, let's say we are trying to match
14	/// "foobar" on a stream. When we report the match, we'd like to not only
15	/// report the correct offsets at which the match occurs, but also the matching
16	/// bytes themselves. So let's say our stream is a file with the following
17	/// contents: `test test foobar test test`. Now assume that we happen to read
18	/// the aforementioned file in two chunks: `test test foo` and `bar test test`.
19	/// Naively, it would not be possible to report a single contiguous `foobar`
20	/// match, but this roll buffer allows us to do that. Namely, after the second
21	/// read, the contents of the buffer should be `st foobar test test`, where the
22	/// search should ultimately resume immediately after `foo`. (The prefix `st `
23	/// is included because the roll buffer saves N bytes at the end of the buffer,
24	/// where N is the maximum possible length of a match.)
25	///
26	/// A lot of the logic for dealing with this is unfortunately split out between
27	/// this roll buffer and the `StreamChunkIter`.
28	#[derive(Debug)]
29	pub struct Buffer {
30	/// The raw buffer contents. This has a fixed size and never increases.
31	buf: Vec<u8>,
32	/// The minimum size of the buffer, which is equivalent to the maximum
33	/// possible length of a match. This corresponds to the amount that we
34	/// roll
35	min: usize,
36	/// The end of the contents of this buffer.
37	end: usize,
38	}
39
40	impl Buffer {
41	/// Create a new buffer for stream searching. The minimum buffer length
42	/// given should be the size of the maximum possible match length.
43	pub fn new(min_buffer_len: usize) -> Buffer {
44	let min = cmp::max(`1`, min_buffer_len);
45	// The minimum buffer amount is also the amount that we roll our
46	// buffer in order to support incremental searching. To this end,
47	// our actual capacity needs to be at least 1 byte bigger than our
48	// minimum amount, otherwise we won't have any overlap. In actuality,
49	// we want our buffer to be a bit bigger than that for performance
50	// reasons, so we set a lower bound of `8 min`.*
51	//
52	// TODO: It would be good to find a way to test the streaming
53	// implementation with the minimal buffer size. For now, we just
54	// uncomment out the next line and comment out the subsequent line.
55	// let capacity = 1 + min;
56	let capacity = cmp::max(min * `8`, DEFAULT_BUFFER_CAPACITY);
57	Buffer { buf: vec![`0`; capacity], min, end: `0` }
58	}
59
60	/// Return the contents of this buffer.
61	#[inline]
62	pub fn buffer(&self) -> &[u8] {
63	&self.buf[..self.end]
64	}
65
66	/// Return the minimum size of the buffer. The only way a buffer may be
67	/// smaller than this is if the stream itself contains less than the
68	/// minimum buffer amount.
69	#[inline]
70	pub fn min_buffer_len(&self) -> usize {
71	self.min
72	}
73
74	/// Return the total length of the contents in the buffer.
75	#[inline]
76	pub fn len(&self) -> usize {
77	self.end
78	}
79
80	/// Return all free capacity in this buffer.
81	fn free_buffer(&mut self) -> &mut [u8] {
82	&mut self.buf[self.end..]
83	}
84
85	/// Refill the contents of this buffer by reading as much as possible into
86	/// this buffer's free capacity. If no more bytes could be read, then this
87	/// returns false. Otherwise, this reads until it has filled the buffer
88	/// past the minimum amount.
89	pub fn fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool> {
90	let mut readany = `false`;
91	loop {
92	let readlen = rdr.read(self.free_buffer())?;
93	if readlen == `0` {
94	return Ok(readany);
95	}
96	readany = `true`;
97	self.end += readlen;
98	if self.len() >= self.min {
99	return Ok(`true`);
100	}
101	}
102	}
103
104	/// Roll the contents of the buffer so that the suffix of this buffer is
105	/// moved to the front and all other contents are dropped. The size of the
106	/// suffix corresponds precisely to the minimum buffer length.
107	///
108	/// This should only be called when the entire contents of this buffer have
109	/// been searched.
110	pub fn roll(&mut self) {
111	let roll_start = self
112	.end
113	.checked_sub(self.min)
114	.expect("buffer capacity should be bigger than minimum amount");
115	let roll_len = self.min;
116
117	assert!(roll_start + roll_len <= self.end);
118	unsafe {
119	// SAFETY: A buffer contains Copy data, so there's no problem
120	// moving it around. Safety also depends on our indices being in
121	// bounds, which they always should be, given the assert above.
122	//
123	// TODO: Switch to [T]::copy_within once our MSRV is high enough.
124	ptr::copy(
125	self.buf[roll_start..].as_ptr(),
126	self.buf.as_mut_ptr(),
127	roll_len,
128	);
129	}
130	self.end = roll_len;
131	}
132	}
133