1 | use std::cmp; |
2 | use std::io; |
3 | use std::ptr; |
4 | |
5 | /// The default buffer capacity that we use for the stream buffer. |
6 | const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB |
7 | |
8 | /// A fairly simple roll buffer for supporting stream searches. |
9 | /// |
10 | /// This buffer acts as a temporary place to store a fixed amount of data when |
11 | /// reading from a stream. Its central purpose is to allow "rolling" some |
12 | /// suffix of the data to the beginning of the buffer before refilling it with |
13 | /// more data from the stream. For example, let's say we are trying to match |
14 | /// "foobar" on a stream. When we report the match, we'd like to not only |
15 | /// report the correct offsets at which the match occurs, but also the matching |
16 | /// bytes themselves. So let's say our stream is a file with the following |
17 | /// contents: `test test foobar test test`. Now assume that we happen to read |
18 | /// the aforementioned file in two chunks: `test test foo` and `bar test test`. |
19 | /// Naively, it would not be possible to report a single contiguous `foobar` |
20 | /// match, but this roll buffer allows us to do that. Namely, after the second |
21 | /// read, the contents of the buffer should be `st foobar test test`, where the |
22 | /// search should ultimately resume immediately after `foo`. (The prefix `st ` |
23 | /// is included because the roll buffer saves N bytes at the end of the buffer, |
24 | /// where N is the maximum possible length of a match.) |
25 | /// |
26 | /// A lot of the logic for dealing with this is unfortunately split out between |
27 | /// this roll buffer and the `StreamChunkIter`. |
28 | #[derive (Debug)] |
29 | pub struct Buffer { |
30 | /// The raw buffer contents. This has a fixed size and never increases. |
31 | buf: Vec<u8>, |
32 | /// The minimum size of the buffer, which is equivalent to the maximum |
33 | /// possible length of a match. This corresponds to the amount that we |
34 | /// roll |
35 | min: usize, |
36 | /// The end of the contents of this buffer. |
37 | end: usize, |
38 | } |
39 | |
40 | impl Buffer { |
41 | /// Create a new buffer for stream searching. The minimum buffer length |
42 | /// given should be the size of the maximum possible match length. |
43 | pub fn new(min_buffer_len: usize) -> Buffer { |
44 | let min = cmp::max(1, min_buffer_len); |
45 | // The minimum buffer amount is also the amount that we roll our |
46 | // buffer in order to support incremental searching. To this end, |
47 | // our actual capacity needs to be at least 1 byte bigger than our |
48 | // minimum amount, otherwise we won't have any overlap. In actuality, |
49 | // we want our buffer to be a bit bigger than that for performance |
50 | // reasons, so we set a lower bound of `8 * min`. |
51 | // |
52 | // TODO: It would be good to find a way to test the streaming |
53 | // implementation with the minimal buffer size. For now, we just |
54 | // uncomment out the next line and comment out the subsequent line. |
55 | // let capacity = 1 + min; |
56 | let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); |
57 | Buffer { buf: vec![0; capacity], min, end: 0 } |
58 | } |
59 | |
60 | /// Return the contents of this buffer. |
61 | #[inline ] |
62 | pub fn buffer(&self) -> &[u8] { |
63 | &self.buf[..self.end] |
64 | } |
65 | |
66 | /// Return the minimum size of the buffer. The only way a buffer may be |
67 | /// smaller than this is if the stream itself contains less than the |
68 | /// minimum buffer amount. |
69 | #[inline ] |
70 | pub fn min_buffer_len(&self) -> usize { |
71 | self.min |
72 | } |
73 | |
74 | /// Return the total length of the contents in the buffer. |
75 | #[inline ] |
76 | pub fn len(&self) -> usize { |
77 | self.end |
78 | } |
79 | |
80 | /// Return all free capacity in this buffer. |
81 | fn free_buffer(&mut self) -> &mut [u8] { |
82 | &mut self.buf[self.end..] |
83 | } |
84 | |
85 | /// Refill the contents of this buffer by reading as much as possible into |
86 | /// this buffer's free capacity. If no more bytes could be read, then this |
87 | /// returns false. Otherwise, this reads until it has filled the buffer |
88 | /// past the minimum amount. |
89 | pub fn fill<R: io::Read>(&mut self, mut rdr: R) -> io::Result<bool> { |
90 | let mut readany = false; |
91 | loop { |
92 | let readlen = rdr.read(self.free_buffer())?; |
93 | if readlen == 0 { |
94 | return Ok(readany); |
95 | } |
96 | readany = true; |
97 | self.end += readlen; |
98 | if self.len() >= self.min { |
99 | return Ok(true); |
100 | } |
101 | } |
102 | } |
103 | |
104 | /// Roll the contents of the buffer so that the suffix of this buffer is |
105 | /// moved to the front and all other contents are dropped. The size of the |
106 | /// suffix corresponds precisely to the minimum buffer length. |
107 | /// |
108 | /// This should only be called when the entire contents of this buffer have |
109 | /// been searched. |
110 | pub fn roll(&mut self) { |
111 | let roll_start = self |
112 | .end |
113 | .checked_sub(self.min) |
114 | .expect("buffer capacity should be bigger than minimum amount" ); |
115 | let roll_len = self.min; |
116 | |
117 | assert!(roll_start + roll_len <= self.end); |
118 | unsafe { |
119 | // SAFETY: A buffer contains Copy data, so there's no problem |
120 | // moving it around. Safety also depends on our indices being in |
121 | // bounds, which they always should be, given the assert above. |
122 | // |
123 | // TODO: Switch to [T]::copy_within once our MSRV is high enough. |
124 | ptr::copy( |
125 | self.buf[roll_start..].as_ptr(), |
126 | self.buf.as_mut_ptr(), |
127 | roll_len, |
128 | ); |
129 | } |
130 | self.end = roll_len; |
131 | } |
132 | } |
133 | |