1 | use alloc::{vec, vec::Vec}; |
2 | |
3 | /// The default buffer capacity that we use for the stream buffer. |
4 | const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB |
5 | |
6 | /// A fairly simple roll buffer for supporting stream searches. |
7 | /// |
8 | /// This buffer acts as a temporary place to store a fixed amount of data when |
9 | /// reading from a stream. Its central purpose is to allow "rolling" some |
10 | /// suffix of the data to the beginning of the buffer before refilling it with |
11 | /// more data from the stream. For example, let's say we are trying to match |
12 | /// "foobar" on a stream. When we report the match, we'd like to not only |
13 | /// report the correct offsets at which the match occurs, but also the matching |
14 | /// bytes themselves. So let's say our stream is a file with the following |
15 | /// contents: `test test foobar test test`. Now assume that we happen to read |
16 | /// the aforementioned file in two chunks: `test test foo` and `bar test test`. |
17 | /// Naively, it would not be possible to report a single contiguous `foobar` |
18 | /// match, but this roll buffer allows us to do that. Namely, after the second |
19 | /// read, the contents of the buffer should be `st foobar test test`, where the |
20 | /// search should ultimately resume immediately after `foo`. (The prefix `st ` |
21 | /// is included because the roll buffer saves N bytes at the end of the buffer, |
22 | /// where N is the maximum possible length of a match.) |
23 | /// |
24 | /// A lot of the logic for dealing with this is unfortunately split out between |
25 | /// this roll buffer and the `StreamChunkIter`. |
26 | /// |
27 | /// Note also that this buffer is not actually required to just report matches. |
28 | /// Because a `Match` is just some offsets. But it *is* required for supporting |
29 | /// things like `try_stream_replace_all` because that needs some mechanism for |
30 | /// knowing which bytes in the stream correspond to a match and which don't. So |
31 | /// when a match occurs across two `read` calls, *something* needs to retain |
32 | /// the bytes from the previous `read` call because you don't know before the |
33 | /// second read call whether a match exists or not. |
34 | #[derive(Debug)] |
35 | pub(crate) struct Buffer { |
36 | /// The raw buffer contents. This has a fixed size and never increases. |
37 | buf: Vec<u8>, |
38 | /// The minimum size of the buffer, which is equivalent to the maximum |
39 | /// possible length of a match. This corresponds to the amount that we |
40 | /// roll |
41 | min: usize, |
42 | /// The end of the contents of this buffer. |
43 | end: usize, |
44 | } |
45 | |
46 | impl Buffer { |
47 | /// Create a new buffer for stream searching. The minimum buffer length |
48 | /// given should be the size of the maximum possible match length. |
49 | pub(crate) fn new(min_buffer_len: usize) -> Buffer { |
50 | let min = core::cmp::max(1, min_buffer_len); |
51 | // The minimum buffer amount is also the amount that we roll our |
52 | // buffer in order to support incremental searching. To this end, |
53 | // our actual capacity needs to be at least 1 byte bigger than our |
54 | // minimum amount, otherwise we won't have any overlap. In actuality, |
55 | // we want our buffer to be a bit bigger than that for performance |
56 | // reasons, so we set a lower bound of `8 * min`. |
57 | // |
58 | // TODO: It would be good to find a way to test the streaming |
59 | // implementation with the minimal buffer size. For now, we just |
60 | // uncomment out the next line and comment out the subsequent line. |
61 | // let capacity = 1 + min; |
62 | let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); |
63 | Buffer { buf: vec![0; capacity], min, end: 0 } |
64 | } |
65 | |
66 | /// Return the contents of this buffer. |
67 | #[inline ] |
68 | pub(crate) fn buffer(&self) -> &[u8] { |
69 | &self.buf[..self.end] |
70 | } |
71 | |
72 | /// Return the minimum size of the buffer. The only way a buffer may be |
73 | /// smaller than this is if the stream itself contains less than the |
74 | /// minimum buffer amount. |
75 | #[inline ] |
76 | pub(crate) fn min_buffer_len(&self) -> usize { |
77 | self.min |
78 | } |
79 | |
80 | /// Return all free capacity in this buffer. |
81 | fn free_buffer(&mut self) -> &mut [u8] { |
82 | &mut self.buf[self.end..] |
83 | } |
84 | |
85 | /// Refill the contents of this buffer by reading as much as possible into |
86 | /// this buffer's free capacity. If no more bytes could be read, then this |
87 | /// returns false. Otherwise, this reads until it has filled the buffer |
88 | /// past the minimum amount. |
89 | pub(crate) fn fill<R: std::io::Read>( |
90 | &mut self, |
91 | mut rdr: R, |
92 | ) -> std::io::Result<bool> { |
93 | let mut readany = false; |
94 | loop { |
95 | let readlen = rdr.read(self.free_buffer())?; |
96 | if readlen == 0 { |
97 | return Ok(readany); |
98 | } |
99 | readany = true; |
100 | self.end += readlen; |
101 | if self.buffer().len() >= self.min { |
102 | return Ok(true); |
103 | } |
104 | } |
105 | } |
106 | |
107 | /// Roll the contents of the buffer so that the suffix of this buffer is |
108 | /// moved to the front and all other contents are dropped. The size of the |
109 | /// suffix corresponds precisely to the minimum buffer length. |
110 | /// |
111 | /// This should only be called when the entire contents of this buffer have |
112 | /// been searched. |
113 | pub(crate) fn roll(&mut self) { |
114 | let roll_start = self |
115 | .end |
116 | .checked_sub(self.min) |
117 | .expect("buffer capacity should be bigger than minimum amount" ); |
118 | let roll_end = roll_start + self.min; |
119 | |
120 | assert!(roll_end <= self.end); |
121 | self.buf.copy_within(roll_start..roll_end, 0); |
122 | self.end = self.min; |
123 | } |
124 | } |
125 | |