use std::fmt;
use std::ops::{Deref, DerefMut};
use std::ptr;

use crate::bytes;
use crate::error::{Error, Result};
use crate::{MAX_BLOCK_SIZE, MAX_INPUT_SIZE};

/// The total number of slots we permit for our hash table of 4 byte repeat
/// sequences.
const MAX_TABLE_SIZE: usize = 1 << 14;

/// The size of a small hash table. This is useful for reducing overhead when
/// compressing very small blocks of bytes.
const SMALL_TABLE_SIZE: usize = 1 << 10;

/// The total number of bytes that we always leave uncompressed at the end
/// of the buffer. This in particular affords us some wiggle room during
/// compression such that faster copy operations can be used.
const INPUT_MARGIN: usize = 16 - 1;

/// The minimum block size that we're willing to consider for compression.
/// Anything smaller than this gets emitted as a literal.
const MIN_NON_LITERAL_BLOCK_SIZE: usize = 1 + 1 + INPUT_MARGIN;

/// Nice names for the various Snappy tags.
enum Tag {
    Literal = 0b00,
    Copy1 = 0b01,
    Copy2 = 0b10,
    // Compression never actually emits a Copy4 operation and decompression
    // uses tricks so that we never explicitly do case analysis on the copy
    // operation type, which means Copy4 is never used.
    #[allow(dead_code)]
    Copy4 = 0b11,
}

/// Returns the maximum compressed size given the uncompressed size.
///
/// If the uncompressed size exceeds the maximum allowable size then this
/// returns 0.
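///
/// # Example
///
/// A minimal sketch of the bound. The expected values follow from the
/// `32 + n + n/6` formula used below; the snippet is marked `ignore` since it
/// assumes this function is in scope:
///
/// ```ignore
/// assert_eq!(max_compress_len(0), 32);
/// assert_eq!(max_compress_len(6), 32 + 6 + 1);
/// assert_eq!(max_compress_len(65_536), 32 + 65_536 + 10_922);
/// ```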
pub fn max_compress_len(input_len: usize) -> usize {
    let input_len: u64 = input_len as u64;
    if input_len > MAX_INPUT_SIZE {
        return 0;
    }
    let max: u64 = 32 + input_len + (input_len / 6);
    if max > MAX_INPUT_SIZE {
        0
    } else {
        max as usize
    }
}

/// Encoder is a raw encoder for compressing bytes in the Snappy format.
///
/// This encoder does not use the Snappy frame format and simply compresses
/// the given bytes in one big Snappy block (that is, it has a single header).
///
/// Unless you explicitly need the low-level control, you should use
/// [`read::FrameEncoder`](../read/struct.FrameEncoder.html)
/// or
/// [`write::FrameEncoder`](../write/struct.FrameEncoder.html)
/// instead, which compress to the Snappy frame format.
///
/// It is beneficial to reuse an Encoder when possible.
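///
/// # Example
///
/// A minimal usage sketch. The `snap::raw` import path is an assumption about
/// how this type is re-exported, so the block is not compiled as a doc test:
///
/// ```ignore
/// // Hypothetical re-export path; adjust to wherever `Encoder` lives.
/// use snap::raw::Encoder;
///
/// let mut enc = Encoder::new();
/// let compressed = enc.compress_vec(b"some bytes to compress").unwrap();
/// ```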
pub struct Encoder {
    small: [u16; SMALL_TABLE_SIZE],
    big: Vec<u16>,
}

impl fmt::Debug for Encoder {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Encoder(...)")
    }
}

impl Encoder {
    /// Return a new encoder that can be used for compressing bytes.
    pub fn new() -> Encoder {
        Encoder { small: [0; SMALL_TABLE_SIZE], big: vec![] }
    }

    /// Compresses all bytes in `input` into `output`.
    ///
    /// `input` can be any arbitrary sequence of bytes.
    ///
    /// `output` must be large enough to hold the maximum possible compressed
    /// size of `input`, which can be computed using `max_compress_len`.
    ///
    /// On success, this returns the number of bytes written to `output`.
    ///
    /// # Errors
    ///
    /// This method returns an error in the following circumstances:
    ///
    /// * The total number of bytes to compress exceeds `2^32 - 1`.
    /// * `output` has length less than `max_compress_len(input.len())`.
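    ///
    /// # Example
    ///
    /// A sketch of compressing into a caller-provided buffer. It is not
    /// compiled as a doc test because the items in scope are assumptions:
    ///
    /// ```ignore
    /// let input = b"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa";
    /// // Allocate the worst-case output size up front.
    /// let mut output = vec![0; max_compress_len(input.len())];
    /// let mut enc = Encoder::new();
    /// let n = enc.compress(input, &mut output).unwrap();
    /// let compressed = &output[..n];
    /// ```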
    pub fn compress(
        &mut self,
        mut input: &[u8],
        output: &mut [u8],
    ) -> Result<usize> {
        match max_compress_len(input.len()) {
            0 => {
                return Err(Error::TooBig {
                    given: input.len() as u64,
                    max: MAX_INPUT_SIZE,
                });
            }
            min if output.len() < min => {
                return Err(Error::BufferTooSmall {
                    given: output.len() as u64,
                    min: min as u64,
                });
            }
            _ => {}
        }
        // Handle an edge case specially.
        if input.is_empty() {
            // Encodes a varint of 0, denoting the total size of uncompressed
            // bytes.
            output[0] = 0;
            return Ok(1);
        }
        // Write the Snappy header, which is just the total number of
        // uncompressed bytes.
        let mut d = bytes::write_varu64(output, input.len() as u64);
        while !input.is_empty() {
            // Find the next block.
            let mut src = input;
            if src.len() > MAX_BLOCK_SIZE as usize {
                src = &src[..MAX_BLOCK_SIZE as usize];
            }
            input = &input[src.len()..];

            // If the block is smallish, then don't waste time on it and just
            // emit a literal.
            let mut block = Block::new(src, output, d);
            if block.src.len() < MIN_NON_LITERAL_BLOCK_SIZE {
                let lit_end = block.src.len();
                unsafe {
                    // SAFETY: next_emit is zero (in bounds) and the end is
                    // the length of the block (in bounds).
                    block.emit_literal(lit_end);
                }
            } else {
                let table = self.block_table(block.src.len());
                block.compress(table);
            }
            d = block.d;
        }
        Ok(d)
    }

    /// Compresses all bytes in `input` into a freshly allocated `Vec`.
    ///
    /// This is just like the `compress` method, except it allocates a `Vec`
    /// with the right size for you. (This is intended to be a convenience
    /// method.)
    ///
    /// This method returns an error under the same circumstances that
    /// `compress` does.
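    ///
    /// # Example
    ///
    /// A sketch of the convenience method; unlike `compress`, no output
    /// buffer management is needed. Not compiled as a doc test because the
    /// items in scope are assumptions:
    ///
    /// ```ignore
    /// let mut enc = Encoder::new();
    /// // The returned Vec is already truncated to the compressed length.
    /// let compressed = enc.compress_vec(b"hello hello hello hello").unwrap();
    /// ```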
    pub fn compress_vec(&mut self, input: &[u8]) -> Result<Vec<u8>> {
        let mut buf = vec![0; max_compress_len(input.len())];
        let n = self.compress(input, &mut buf)?;
        buf.truncate(n);
        Ok(buf)
    }
}

struct Block<'s, 'd> {
    /// The bytes of the block being compressed.
    src: &'s [u8],
    /// The current position in `src`.
    s: usize,
    /// The position in `src` at which the match-finding loop stops.
    s_limit: usize,
    /// The destination buffer for compressed bytes.
    dst: &'d mut [u8],
    /// The current position in `dst`.
    d: usize,
    /// The position in `src` of the first byte not yet emitted as a literal
    /// or copy.
    next_emit: usize,
}

impl<'s, 'd> Block<'s, 'd> {
    #[inline(always)]
    fn new(src: &'s [u8], dst: &'d mut [u8], d: usize) -> Block<'s, 'd> {
        Block {
            src: src,
            s: 0,
            s_limit: src.len(),
            dst: dst,
            d: d,
            next_emit: 0,
        }
    }

    #[inline(always)]
    fn compress(&mut self, mut table: BlockTable<'_>) {
        debug_assert!(!table.is_empty());
        debug_assert!(self.src.len() >= MIN_NON_LITERAL_BLOCK_SIZE);

        self.s += 1;
        self.s_limit -= INPUT_MARGIN;
        let mut next_hash =
            table.hash(bytes::read_u32_le(&self.src[self.s..]));
        loop {
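            // Note: `bytes_between_hash_lookups` below is `skip >> 5`, so the
            // first 32 probes advance one byte at a time and the stride grows
            // as probes keep missing. This bounds the time spent scanning
            // data that does not compress well.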
            let mut skip = 32;
            let mut candidate;
            let mut s_next = self.s;
            loop {
                self.s = s_next;
                let bytes_between_hash_lookups = skip >> 5;
                s_next = self.s + bytes_between_hash_lookups;
                skip += bytes_between_hash_lookups;
                if s_next > self.s_limit {
                    return self.done();
                }
                unsafe {
                    // SAFETY: next_hash is always computed by table.hash
                    // which is guaranteed to be in bounds.
                    candidate = *table.get_unchecked(next_hash) as usize;
                    *table.get_unchecked_mut(next_hash) = self.s as u16;

                    let srcp = self.src.as_ptr();
                    // SAFETY: s_next is guaranteed to be less than s_limit by
                    // the conditional above, which implies s_next is in
                    // bounds.
                    let x = bytes::loadu_u32_le(srcp.add(s_next));
                    next_hash = table.hash(x);
                    // SAFETY: self.s is always less than s_next, so it is
                    // also in bounds by the argument above.
                    //
                    // candidate is extracted from table, which is only ever
                    // set to valid positions in the block and is therefore
                    // also in bounds.
                    //
                    // We only need to compare cur and cand for equality, so
                    // we don't need to bother with endianness. cur holds the
                    // bytes at the current position and cand holds the bytes
                    // at a potential match. If they're equal, we declare
                    // victory and move below to try and extend the match.
                    let cur = bytes::loadu_u32_ne(srcp.add(self.s));
                    let cand = bytes::loadu_u32_ne(srcp.add(candidate));
                    if cur == cand {
                        break;
                    }
                }
            }
            // While the above found a candidate for compression, before we
            // emit a copy operation for it, we need to make sure that we emit
            // any bytes between the last copy operation and this one as a
            // literal.
            let lit_end = self.s;
            unsafe {
                // SAFETY: next_emit is set to a previous value of self.s,
                // which is guaranteed to be less than s_limit (in bounds).
                // lit_end is set to the current value of self.s, also
                // guaranteed to be less than s_limit (in bounds).
                self.emit_literal(lit_end);
            }
            loop {
                // Look for more matching bytes starting at the position of
                // the candidate and the current src position. We increment
                // self.s and candidate by 4 since we already know the first 4
                // bytes match.
                let base = self.s;
                self.s += 4;
                unsafe {
                    // SAFETY: candidate is always set to a value from our
                    // hash table, which only contains positions in self.src
                    // that have been seen for this block that occurred before
                    // self.s.
                    self.extend_match(candidate + 4);
                }
                let (offset, len) = (base - candidate, self.s - base);
                self.emit_copy(offset, len);
                self.next_emit = self.s;
                if self.s >= self.s_limit {
                    return self.done();
                }
                // Update the hash table with the byte sequences
                // self.src[self.s - 1..self.s + 3] and
                // self.src[self.s..self.s + 4]. Instead of reading 4 bytes
                // twice, we read 8 bytes once.
                //
                // If we happen to get a hit on self.src[self.s..self.s + 4],
                // then continue this loop and extend the match.
                unsafe {
                    let srcp = self.src.as_ptr();
                    // SAFETY: self.s can never exceed s_limit given by the
                    // conditional above and self.s is guaranteed to be
                    // non-zero and is therefore in bounds.
                    let x = bytes::loadu_u64_le(srcp.add(self.s - 1));
                    // The lower 4 bytes of x correspond to
                    // self.src[self.s - 1..self.s + 3].
                    let prev_hash = table.hash(x as u32);
                    // SAFETY: Hash values are guaranteed to be in bounds.
                    *table.get_unchecked_mut(prev_hash) = (self.s - 1) as u16;
                    // The lower 4 bytes of x>>8 correspond to
                    // self.src[self.s..self.s + 4].
                    let cur_hash = table.hash((x >> 8) as u32);
                    // SAFETY: Hash values are guaranteed to be in bounds.
                    candidate = *table.get_unchecked(cur_hash) as usize;
                    *table.get_unchecked_mut(cur_hash) = self.s as u16;

                    // SAFETY: candidate is set from table, which always
                    // contains valid positions in the current block.
                    let y = bytes::loadu_u32_le(srcp.add(candidate));
                    if (x >> 8) as u32 != y {
                        // If we didn't get a hit, update the next hash
                        // and move on. Our initial 8 byte read continues to
                        // pay off.
                        next_hash = table.hash((x >> 16) as u32);
                        self.s += 1;
                        break;
                    }
                }
            }
        }
    }

    /// Emits one or more copy operations with the given offset and length.
    /// offset must be in the range [1, 65535] and len must be in the range
    /// [4, 65535].
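    ///
    /// For example (following the encoding below), a copy with offset 10 and
    /// len 7 fits in a copy 1 operation: the tag byte is
    /// ((10 >> 8) << 5) | ((7 - 4) << 2) | 0b01 = 13, followed by the low
    /// offset byte 10.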
    #[inline(always)]
    fn emit_copy(&mut self, offset: usize, mut len: usize) {
        debug_assert!(1 <= offset && offset <= 65535);
        // Copy operations only allow lengths up to 64, but we'll allow bigger
        // lengths and emit as many operations as we need.
        //
        // N.B. Since our block size is 64KB, we never actually emit a copy 4
        // operation.
        debug_assert!(4 <= len && len <= 65535);

        // Emit copy 2 operations until we don't have to.
        // We check on 68 here and emit a shorter copy than 64 below because
        // it is cheaper to, e.g., encode a length 67 copy as a length 60
        // copy 2 followed by a length 7 copy 1 than to encode it as a length
        // 64 copy 2 followed by a length 3 copy 2. The key here is that a
        // copy 1 operation requires at least length 4 which forces a length 3
        // copy to use a copy 2 operation.
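        //
        // Concretely, a copy 2 costs 3 bytes and a copy 1 costs 2 bytes (see
        // the emitters below), so the 60 + 7 split costs 3 + 2 = 5 bytes
        // while a 64 + 3 split would cost 3 + 3 = 6 bytes.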
        while len >= 68 {
            self.emit_copy2(offset, 64);
            len -= 64;
        }
        if len > 64 {
            self.emit_copy2(offset, 60);
            len -= 60;
        }
        // If we can squeeze the last copy into a copy 1 operation, do it.
        if len <= 11 && offset <= 2047 {
            self.dst[self.d] = (((offset >> 8) as u8) << 5)
                | (((len - 4) as u8) << 2)
                | (Tag::Copy1 as u8);
            self.dst[self.d + 1] = offset as u8;
            self.d += 2;
        } else {
            self.emit_copy2(offset, len);
        }
    }

    /// Emits a "copy 2" operation with the given offset and length. The
    /// offset and length must be valid for a copy 2 operation. i.e., offset
    /// must be in the range [1, 65535] and len must be in the range [1, 64].
    #[inline(always)]
    fn emit_copy2(&mut self, offset: usize, len: usize) {
        debug_assert!(1 <= offset && offset <= 65535);
        debug_assert!(1 <= len && len <= 64);
        self.dst[self.d] = (((len - 1) as u8) << 2) | (Tag::Copy2 as u8);
        bytes::write_u16_le(offset as u16, &mut self.dst[self.d + 1..]);
        self.d += 3;
    }

    /// Attempts to extend a match from the current position in self.src with
    /// the candidate position given.
    ///
    /// This method uses unaligned loads and elides bounds checks, so the
    /// caller must guarantee that cand points to a valid location in self.src
    /// and is less than the current position in src.
    #[inline(always)]
    unsafe fn extend_match(&mut self, mut cand: usize) {
        debug_assert!(cand < self.s);
        while self.s + 8 <= self.src.len() {
            let srcp = self.src.as_ptr();
            // SAFETY: The loop invariant guarantees that there is at least
            // 8 bytes to read at self.src + self.s. Since cand must be
            // guaranteed by the caller to be valid and less than self.s, it
            // also has enough room to read 8 bytes.
            //
            // TODO(ag): Despite my best efforts, I couldn't get this to
            // autovectorize with 128-bit loads. The logic after the loads
            // appears to be a little too clever...
            let x = bytes::loadu_u64_ne(srcp.add(self.s));
            let y = bytes::loadu_u64_ne(srcp.add(cand));
            if x == y {
                // If all 8 bytes are equal, move on...
                self.s += 8;
                cand += 8;
            } else {
                // Otherwise, find the last byte that was equal. We can do
                // this efficiently by interpreting x and y as little endian
                // numbers, which lets us use the number of trailing zeroes
                // as a proxy for the number of equivalent bits (after an XOR).
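                //
                // For example, if the first three bytes (in memory order)
                // match and the fourth differs, then the XOR has its lowest
                // set bit somewhere in bits 24..32, so trailing_zeros() / 8
                // is 3 and we advance self.s to the first mismatched byte.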
                let z = x.to_le() ^ y.to_le();
                self.s += z.trailing_zeros() as usize / 8;
                return;
            }
        }
        // When we have fewer than 8 bytes left in the block, fall back to the
        // slow loop.
        while self.s < self.src.len() && self.src[self.s] == self.src[cand] {
            self.s += 1;
            cand += 1;
        }
    }

    /// Executes any cleanup when the current block has finished compressing.
    /// In particular, it emits any leftover bytes as a literal.
    #[inline(always)]
    fn done(&mut self) {
        if self.next_emit < self.src.len() {
            let lit_end = self.src.len();
            unsafe {
                // SAFETY: Both next_emit and lit_end are trivially in bounds
                // given the conditional and definition above.
                self.emit_literal(lit_end);
            }
        }
    }

    /// Emits a literal from self.src[self.next_emit..lit_end].
    ///
    /// This uses unaligned loads and elides bounds checks, so the caller must
    /// guarantee that self.src[self.next_emit..lit_end] is valid.
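    ///
    /// For example (per the encoding below), a 5 byte literal is emitted as
    /// the tag byte (4 << 2) | 0b00 followed by the 5 literal bytes, while a
    /// 300 byte literal is emitted as the tag byte (61 << 2) | 0b00, the
    /// little endian u16 299 and then the 300 literal bytes.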
    #[inline(always)]
    unsafe fn emit_literal(&mut self, lit_end: usize) {
        let lit_start = self.next_emit;
        let len = lit_end - lit_start;
        let n = len.checked_sub(1).unwrap();
        if n <= 59 {
            self.dst[self.d] = ((n as u8) << 2) | (Tag::Literal as u8);
            self.d += 1;
            if len <= 16 && lit_start + 16 <= self.src.len() {
                // SAFETY: lit_start is equivalent to self.next_emit, which is
                // only set to self.s immediately following a copy emit. The
                // conditional above also ensures that there is at least 16
                // bytes of room in both src and dst.
                //
                // dst is big enough because the buffer is guaranteed to be
                // big enough to hold the biggest possible compressed size
                // plus an extra 32 bytes, which exceeds the 16 byte copy
                // here.
                let srcp = self.src.as_ptr().add(lit_start);
                let dstp = self.dst.as_mut_ptr().add(self.d);
                ptr::copy_nonoverlapping(srcp, dstp, 16);
                self.d += len;
                return;
            }
        } else if n < 256 {
            self.dst[self.d] = (60 << 2) | (Tag::Literal as u8);
            self.dst[self.d + 1] = n as u8;
            self.d += 2;
        } else {
            self.dst[self.d] = (61 << 2) | (Tag::Literal as u8);
            bytes::write_u16_le(n as u16, &mut self.dst[self.d + 1..]);
            self.d += 3;
        }
        // SAFETY: lit_start is equivalent to self.next_emit, which is only
        // set to self.s immediately following a copy, which implies that it
        // always points to valid bytes in self.src.
        //
        // We can't guarantee that there are at least len bytes though, which
        // must be guaranteed by the caller and is why this method is unsafe.
        let srcp = self.src.as_ptr().add(lit_start);
        let dstp = self.dst.as_mut_ptr().add(self.d);
        ptr::copy_nonoverlapping(srcp, dstp, len);
        self.d += len;
    }
}

/// `BlockTable` is a map from 4 byte sequences to positions of their most
/// recent occurrence in a block. In particular, this table lets us quickly
/// find candidates for compression.
///
/// We expose the `hash` method so that callers can be fastidious about the
/// number of times a hash is computed.
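///
/// For example, a full table has `1 << 14` slots and a corresponding shift of
/// `32 - 14 = 18`, so `hash` maps any `u32` into `[0, 1 << 14)`.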
struct BlockTable<'a> {
    table: &'a mut [u16],
    /// The number of bits required to shift the hash such that the result
    /// is less than table.len().
    shift: u32,
}

impl Encoder {
    fn block_table(&mut self, block_size: usize) -> BlockTable<'_> {
        let mut shift: u32 = 32 - 8;
        let mut table_size = 256;
        while table_size < MAX_TABLE_SIZE && table_size < block_size {
            shift -= 1;
            table_size *= 2;
        }
        // If our block size is small, then use a small stack allocated table
        // instead of putting a bigger one on the heap. This particular
        // optimization is important if the caller is using Snappy to compress
        // many small blocks. (The memset savings alone is considerable.)
        let table: &mut [u16] = if table_size <= SMALL_TABLE_SIZE {
            &mut self.small[0..table_size]
        } else {
            if self.big.is_empty() {
                // Interestingly, using `self.big.resize` here led to some
                // very weird code getting generated that led to a large
                // slow down. Forcing the issue with a new vec seems to
                // fix it. ---AG
                self.big = vec![0; MAX_TABLE_SIZE];
            }
            &mut self.big[0..table_size]
        };
        for x in &mut *table {
            *x = 0;
        }
        BlockTable { table: table, shift: shift }
    }
}

impl<'a> BlockTable<'a> {
    #[inline(always)]
    fn hash(&self, x: u32) -> usize {
        (x.wrapping_mul(0x1E35A7BD) >> self.shift) as usize
    }
}

impl<'a> Deref for BlockTable<'a> {
    type Target = [u16];
    fn deref(&self) -> &[u16] {
        self.table
    }
}

impl<'a> DerefMut for BlockTable<'a> {
    fn deref_mut(&mut self) -> &mut [u16] {
        self.table
    }
}