huff0_decoder.rs source code [crates/ruzstd/src/huff0/huff0_decoder.rs]

1	//! Utilities for decoding Huff0 encoded huffman data.
2
3	use crate::decoding::bit_reader_reverse::{BitReaderReversed, GetBitsError};
4	use crate::fse::{FSEDecoder, FSEDecoderError, FSETable, FSETableError};
5	use alloc::vec::Vec;
6	#[cfg(feature = "std")]
7	use std::error::Error as StdError;
8
9	pub struct HuffmanTable {
10	decode: Vec<Entry>,
11	/// The weight of a symbol is the number of occurences in a table.
12	/// This value is used in constructing a binary tree referred to as
13	/// a huffman tree.
14	weights: Vec<u8>,
15	/// The maximum size in bits a prefix code in the encoded data can be.
16	/// This value is used so that the decoder knows how many bits
17	/// to read from the bitstream before checking the table. This
18	/// value must be 11 or lower.
19	pub max_num_bits: u8,
20	bits: Vec<u8>,
21	bit_ranks: Vec<u32>,
22	rank_indexes: Vec<usize>,
23	/// In some cases, the list of weights is compressed using FSE compression.
24	fse_table: FSETable,
25	}
26
27	#[derive(Debug)]
28	#[non_exhaustive]
29	pub enum HuffmanTableError {
30	GetBitsError(GetBitsError),
31	FSEDecoderError(FSEDecoderError),
32	FSETableError(FSETableError),
33	SourceIsEmpty,
34	NotEnoughBytesForWeights {
35	got_bytes: usize,
36	expected_bytes: u8,
37	},
38	ExtraPadding {
39	skipped_bits: i32,
40	},
41	TooManyWeights {
42	got: usize,
43	},
44	MissingWeights,
45	LeftoverIsNotAPowerOf2 {
46	got: u32,
47	},
48	NotEnoughBytesToDecompressWeights {
49	have: usize,
50	need: usize,
51	},
52	FSETableUsedTooManyBytes {
53	used: usize,
54	available_bytes: u8,
55	},
56	NotEnoughBytesInSource {
57	got: usize,
58	need: usize,
59	},
60	WeightBiggerThanMaxNumBits {
61	got: u8,
62	},
63	MaxBitsTooHigh {
64	got: u8,
65	},
66	}
67
68	#[cfg(feature = "std")]
69	impl StdError for HuffmanTableError {
70	fn source(&self) -> Option<&(dyn StdError + 'static)> {
71	match self {
72	HuffmanTableError::GetBitsError(source: &GetBitsError) => Some(source),
73	HuffmanTableError::FSEDecoderError(source: &FSEDecoderError) => Some(source),
74	HuffmanTableError::FSETableError(source: &FSETableError) => Some(source),
75	_ => None,
76	}
77	}
78	}
79
80	impl core::fmt::Display for HuffmanTableError {
81	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> ::core::fmt::Result {
82	match self {
83	HuffmanTableError::GetBitsError(e) => write!(f, "{:?}", e),
84	HuffmanTableError::FSEDecoderError(e) => write!(f, "{:?}", e),
85	HuffmanTableError::FSETableError(e) => write!(f, "{:?}", e),
86	HuffmanTableError::SourceIsEmpty => write!(f, "Source needs to have at least one byte"),
87	HuffmanTableError::NotEnoughBytesForWeights {
88	got_bytes,
89	expected_bytes,
90	} => {
91	write!(f, "Header says there should be {} bytes for the weights but there are only {} bytes in the stream",
92	expected_bytes,
93	got_bytes)
94	}
95	HuffmanTableError::ExtraPadding { skipped_bits } => {
96	write!(f,
97	"Padding at the end of the sequence_section was more than a byte long: {} bits. Probably caused by data corruption",
98	skipped_bits,
99	)
100	}
101	HuffmanTableError::TooManyWeights { got } => {
102	write!(
103	f,
104	"More than 255 weights decoded (got {} weights). Stream is probably corrupted",
105	got,
106	)
107	}
108	HuffmanTableError::MissingWeights => {
109	write!(f, "Can`\'`t build huffman table without any weights")
110	}
111	HuffmanTableError::LeftoverIsNotAPowerOf2 { got } => {
112	write!(f, "Leftover must be power of two but is: {}", got)
113	}
114	HuffmanTableError::NotEnoughBytesToDecompressWeights { have, need } => {
115	write!(
116	f,
117	"Not enough bytes in stream to decompress weights. Is: {}, Should be: {}",
118	have, need,
119	)
120	}
121	HuffmanTableError::FSETableUsedTooManyBytes {
122	used,
123	available_bytes,
124	} => {
125	write!(f,
126	"FSE table used more bytes: {} than were meant to be used for the whole stream of huffman weights ({})",
127	used,
128	available_bytes,
129	)
130	}
131	HuffmanTableError::NotEnoughBytesInSource { got, need } => {
132	write!(
133	f,
134	"Source needs to have at least {} bytes, got: {}",
135	need, got,
136	)
137	}
138	HuffmanTableError::WeightBiggerThanMaxNumBits { got } => {
139	write!(
140	f,
141	"Cant have weight: {} bigger than max_num_bits: {}",
142	got, MAX_MAX_NUM_BITS,
143	)
144	}
145	HuffmanTableError::MaxBitsTooHigh { got } => {
146	write!(
147	f,
148	"max_bits derived from weights is: {} should be lower than: {}",
149	got, MAX_MAX_NUM_BITS,
150	)
151	}
152	}
153	}
154	}
155
156	impl From<GetBitsError> for HuffmanTableError {
157	fn from(val: GetBitsError) -> Self {
158	Self::GetBitsError(val)
159	}
160	}
161
162	impl From<FSEDecoderError> for HuffmanTableError {
163	fn from(val: FSEDecoderError) -> Self {
164	Self::FSEDecoderError(val)
165	}
166	}
167
168	impl From<FSETableError> for HuffmanTableError {
169	fn from(val: FSETableError) -> Self {
170	Self::FSETableError(val)
171	}
172	}
173
174	/// An interface around a huffman table used to decode data.
175	pub struct HuffmanDecoder<'table> {
176	table: &'table HuffmanTable,
177	/// State is used to index into the table.
178	pub state: u64,
179	}
180
181	#[derive(Debug)]
182	#[non_exhaustive]
183	pub enum HuffmanDecoderError {
184	GetBitsError(GetBitsError),
185	}
186
187	impl core::fmt::Display for HuffmanDecoderError {
188	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
189	match self {
190	HuffmanDecoderError::GetBitsError(e: &GetBitsError) => write!(f, "{:?}", e),
191	}
192	}
193	}
194
195	#[cfg(feature = "std")]
196	impl StdError for HuffmanDecoderError {
197	fn source(&self) -> Option<&(dyn StdError + 'static)> {
198	match self {
199	HuffmanDecoderError::GetBitsError(source: &GetBitsError) => Some(source),
200	}
201	}
202	}
203
204	impl From<GetBitsError> for HuffmanDecoderError {
205	fn from(val: GetBitsError) -> Self {
206	Self::GetBitsError(val)
207	}
208	}
209
210	/// A single entry in the table contains the decoded symbol/literal and the
211	/// size of the prefix code.
212	#[derive(Copy, Clone)]
213	pub struct Entry {
214	/// The byte that the prefix code replaces during encoding.
215	symbol: u8,
216	/// The number of bits the prefix code occupies.
217	num_bits: u8,
218	}
219
220	/// The Zstandard specification limits the maximum length of a code to 11 bits.
221	const MAX_MAX_NUM_BITS: u8 = `11`;
222
223	/// Assert that the provided value is greater than zero, and returns the
224	/// 32 - the number of leading zeros
225	fn highest_bit_set(x: u32) -> u32 {
226	assert!(x > `0`);
227	u32::BITS - x.leading_zeros()
228	}
229
230	impl<'t> HuffmanDecoder<'t> {
231	/// Create a new decoder with the provided table
232	pub fn new(table: &'t HuffmanTable) -> HuffmanDecoder<'t> {
233	HuffmanDecoder { table, state: `0` }
234	}
235
236	/// Re-initialize the decoder, using the new table if one is provided.
237	/// This might used for treeless blocks, because they re-use the table from old
238	/// data.
239	pub fn reset(mut self, new_table: Option<&'t HuffmanTable>) {
240	self.state = `0`;
241	if let Some(next_table) = new_table {
242	self.table = next_table;
243	}
244	}
245
246	/// Decode the symbol the internal state (cursor) is pointed at and return the
247	/// decoded literal.
248	pub fn decode_symbol(&mut self) -> u8 {
249	self.table.decode[self.state as usize].symbol
250	}
251
252	/// Initialize internal state and prepare to decode data. Then, `decode_symbol` can be called
253	/// to read the byte the internal cursor is pointing at, and `next_state` can be called to advance
254	/// the cursor until the max number of bits has been read.
255	pub fn init_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 {
256	let num_bits = self.table.max_num_bits;
257	let new_bits = br.get_bits(num_bits);
258	self.state = new_bits;
259	num_bits
260	}
261
262	/// Advance the internal cursor to the next symbol. After this, you can call `decode_symbol`
263	/// to read from the new position.
264	pub fn next_state(&mut self, br: &mut BitReaderReversed<'_>) -> u8 {
265	// self.state stores a small section, or a window of the bit stream. The table can be indexed via this state,
266	// telling you how many bits identify the current symbol.
267	let num_bits = self.table.decode[self.state as usize].num_bits;
268	// New bits are read from the stream
269	let new_bits = br.get_bits(num_bits);
270	// Shift and mask out the bits that identify the current symbol
271	self.state <<= num_bits;
272	self.state &= self.table.decode.len() as u64 - `1`;
273	// The new bits are appended at the end of the current state.
274	self.state \|= new_bits;
275	num_bits
276	}
277	}
278
279	impl Default for HuffmanTable {
280	fn default() -> Self {
281	Self::new()
282	}
283	}
284
285	impl HuffmanTable {
286	/// Create a new, empty table.
287	pub fn new() -> HuffmanTable {
288	HuffmanTable {
289	decode: Vec::new(),
290
291	weights: Vec::with_capacity(`256`),
292	max_num_bits: `0`,
293	bits: Vec::with_capacity(`256`),
294	bit_ranks: Vec::with_capacity(`11`),
295	rank_indexes: Vec::with_capacity(`11`),
296	fse_table: FSETable::new(`100`),
297	}
298	}
299
300	/// Completely empty the table then repopulate as a replica
301	/// of `other`.
302	pub fn reinit_from(&mut self, other: &Self) {
303	self.reset();
304	self.decode.extend_from_slice(&other.decode);
305	self.weights.extend_from_slice(&other.weights);
306	self.max_num_bits = other.max_num_bits;
307	self.bits.extend_from_slice(&other.bits);
308	self.rank_indexes.extend_from_slice(&other.rank_indexes);
309	self.fse_table.reinit_from(&other.fse_table);
310	}
311
312	/// Completely empty the table of all data.
313	pub fn reset(&mut self) {
314	self.decode.clear();
315	self.weights.clear();
316	self.max_num_bits = `0`;
317	self.bits.clear();
318	self.bit_ranks.clear();
319	self.rank_indexes.clear();
320	self.fse_table.reset();
321	}
322
323	/// Read from `source` and parse it into a huffman table.
324	///
325	/// Returns the number of bytes read.
326	pub fn build_decoder(&mut self, source: &[u8]) -> Result<u32, HuffmanTableError> {
327	self.decode.clear();
328
329	let bytes_used = self.read_weights(source)?;
330	self.build_table_from_weights()?;
331	Ok(bytes_used)
332	}
333
334	/// Read weights from the provided source.
335	///
336	/// The huffman table is represented in the encoded data as a list of weights
337	/// at the most basic level. After the header, weights are read, then the table
338	/// can be built using that list of weights.
339	///
340	/// Returns the number of bytes read.
341	fn read_weights(&mut self, source: &[u8]) -> Result<u32, HuffmanTableError> {
342	use HuffmanTableError as err;
343
344	if source.is_empty() {
345	return Err(err::SourceIsEmpty);
346	}
347	let header = source[`0`];
348	let mut bits_read = `8`;
349
350	match header {
351	// If the header byte is less than 128, the series of weights
352	// is compressed using two interleaved FSE streams that share
353	// a distribution table.
354	`0`..=`127` => {
355	let fse_stream = &source[`1`..];
356	if header as usize > fse_stream.len() {
357	return Err(err::NotEnoughBytesForWeights {
358	got_bytes: fse_stream.len(),
359	expected_bytes: header,
360	});
361	}
362	//fse decompress weights
363	let bytes_used_by_fse_header = self
364	.fse_table
365	.build_decoder(fse_stream, /TODO find actual max/ `100`)?;
366
367	if bytes_used_by_fse_header > header as usize {
368	return Err(err::FSETableUsedTooManyBytes {
369	used: bytes_used_by_fse_header,
370	available_bytes: header,
371	});
372	}
373
374	vprintln!(
375	"Building fse table for huffman weights used: {}",
376	bytes_used_by_fse_header
377	);
378	// Huffman headers are compressed using two interleaved
379	// FSE bitstreams, where the first state (decoder) handles
380	// even symbols, and the second handles odd symbols.
381	let mut dec1 = FSEDecoder::new(&self.fse_table);
382	let mut dec2 = FSEDecoder::new(&self.fse_table);
383
384	let compressed_start = bytes_used_by_fse_header;
385	let compressed_length = header as usize - bytes_used_by_fse_header;
386
387	let compressed_weights = &fse_stream[compressed_start..];
388	if compressed_weights.len() < compressed_length {
389	return Err(err::NotEnoughBytesToDecompressWeights {
390	have: compressed_weights.len(),
391	need: compressed_length,
392	});
393	}
394	let compressed_weights = &compressed_weights[..compressed_length];
395	let mut br = BitReaderReversed::new(compressed_weights);
396
397	bits_read += (bytes_used_by_fse_header + compressed_length) * `8`;
398
399	//skip the 0 padding at the end of the last byte of the bit stream and throw away the first 1 found
400	let mut skipped_bits = `0`;
401	loop {
402	let val = br.get_bits(`1`);
403	skipped_bits += `1`;
404	if val == `1` \|\| skipped_bits > `8` {
405	break;
406	}
407	}
408	if skipped_bits > `8` {
409	//if more than 7 bits are 0, this is not the correct end of the bitstream. Either a bug or corrupted data
410	return Err(err::ExtraPadding { skipped_bits });
411	}
412
413	dec1.init_state(&mut br)?;
414	dec2.init_state(&mut br)?;
415
416	self.weights.clear();
417
418	// The two decoders take turns decoding a single symbol and updating their state.
419	loop {
420	let w = dec1.decode_symbol();
421	self.weights.push(w);
422	dec1.update_state(&mut br);
423
424	if br.bits_remaining() <= `-1` {
425	//collect final states
426	self.weights.push(dec2.decode_symbol());
427	break;
428	}
429
430	let w = dec2.decode_symbol();
431	self.weights.push(w);
432	dec2.update_state(&mut br);
433
434	if br.bits_remaining() <= `-1` {
435	//collect final states
436	self.weights.push(dec1.decode_symbol());
437	break;
438	}
439	//maximum number of weights is 255 because we use u8 symbols and the last weight is inferred from the sum of all others
440	if self.weights.len() > `255` {
441	return Err(err::TooManyWeights {
442	got: self.weights.len(),
443	});
444	}
445	}
446	}
447	// If the header byte is greater than or equal to 128,
448	// weights are directly represented, where each weight is
449	// encoded directly as a 4 bit field. The weights will
450	// always be encoded with full bytes, meaning if there's
451	// an odd number of weights, the last weight will still
452	// occupy a full byte.
453	_ => {
454	// weights are directly encoded
455	let weights_raw = &source[`1`..];
456	let num_weights = header - `127`;
457	self.weights.resize(num_weights as usize, `0`);
458
459	let bytes_needed = if num_weights % `2` == `0` {
460	num_weights as usize / `2`
461	} else {
462	(num_weights as usize / `2`) + `1`
463	};
464
465	if weights_raw.len() < bytes_needed {
466	return Err(err::NotEnoughBytesInSource {
467	got: weights_raw.len(),
468	need: bytes_needed,
469	});
470	}
471
472	for idx in `0`..num_weights {
473	if idx % `2` == `0` {
474	self.weights[idx as usize] = weights_raw[idx as usize / `2`] >> `4`;
475	} else {
476	self.weights[idx as usize] = weights_raw[idx as usize / `2`] & `0xF`;
477	}
478	bits_read += `4`;
479	}
480	}
481	}
482
483	let bytes_read = if bits_read % `8` == `0` {
484	bits_read / `8`
485	} else {
486	(bits_read / `8`) + `1`
487	};
488	Ok(bytes_read as u32)
489	}
490
491	/// Once the weights have been read from the data, you can decode the weights
492	/// into a table, and use that table to decode the actual compressed data.
493	///
494	/// This function populates the rest of the table from the series of weights.
495	fn build_table_from_weights(&mut self) -> Result<(), HuffmanTableError> {
496	use HuffmanTableError as err;
497
498	self.bits.clear();
499	self.bits.resize(self.weights.len() + `1`, `0`);
500
501	let mut weight_sum: u32 = `0`;
502	for w in &self.weights {
503	if *w > MAX_MAX_NUM_BITS {
504	return Err(err::WeightBiggerThanMaxNumBits { got: *w });
505	}
506	weight_sum += if w > `0` { `1_u32` << (w - `1`) } else { `0` };
507	}
508
509	if weight_sum == `0` {
510	return Err(err::MissingWeights);
511	}
512
513	let max_bits = highest_bit_set(weight_sum) as u8;
514	let left_over = (`1` << max_bits) - weight_sum;
515
516	//left_over must be power of two
517	if !left_over.is_power_of_two() {
518	return Err(err::LeftoverIsNotAPowerOf2 { got: left_over });
519	}
520
521	let last_weight = highest_bit_set(left_over) as u8;
522
523	for symbol in `0`..self.weights.len() {
524	let bits = if self.weights[symbol] > `0` {
525	max_bits + `1` - self.weights[symbol]
526	} else {
527	`0`
528	};
529	self.bits[symbol] = bits;
530	}
531
532	self.bits[self.weights.len()] = max_bits + `1` - last_weight;
533	self.max_num_bits = max_bits;
534
535	if max_bits > MAX_MAX_NUM_BITS {
536	return Err(err::MaxBitsTooHigh { got: max_bits });
537	}
538
539	self.bit_ranks.clear();
540	self.bit_ranks.resize((max_bits + `1`) as usize, `0`);
541	for num_bits in &self.bits {
542	self.bit_ranks[(num_bits) as usize*] += `1`;
543	}
544
545	//fill with dummy symbols
546	self.decode.resize(
547	`1` << self.max_num_bits,
548	Entry {
549	symbol: `0`,
550	num_bits: `0`,
551	},
552	);
553
554	//starting codes for each rank
555	self.rank_indexes.clear();
556	self.rank_indexes.resize((max_bits + `1`) as usize, `0`);
557
558	self.rank_indexes[max_bits as usize] = `0`;
559	for bits in (`1`..self.rank_indexes.len() as u8).rev() {
560	self.rank_indexes[bits as usize - `1`] = self.rank_indexes[bits as usize]
561	+ self.bit_ranks[bits as usize] as usize * (`1` << (max_bits - bits));
562	}
563
564	assert!(
565	self.rank_indexes[`0`] == self.decode.len(),
566	"rank_idx[0]: {} should be: {}",
567	self.rank_indexes[`0`],
568	self.decode.len()
569	);
570
571	for symbol in `0`..self.bits.len() {
572	let bits_for_symbol = self.bits[symbol];
573	if bits_for_symbol != `0` {
574	// allocate code for the symbol and set in the table
575	// a code ignores all max_bits - bits[symbol] bits, so it gets
576	// a range that spans all of those in the decoding table
577	let base_idx = self.rank_indexes[bits_for_symbol as usize];
578	let len = `1` << (max_bits - bits_for_symbol);
579	self.rank_indexes[bits_for_symbol as usize] += len;
580	for idx in `0`..len {
581	self.decode[base_idx + idx].symbol = symbol as u8;
582	self.decode[base_idx + idx].num_bits = bits_for_symbol;
583	}
584	}
585	}
586
587	Ok(())
588	}
589	}
590