// Source: crates/ruzstd/src/blocks/sequence_section.rs

//! Utilities and representations for the second half of a block, the sequence section.
//! Executing the sequences in this section copies literals from the literals section, and
//! matches from previously decompressed data, into the decompressed output.
3
/// Largest literal-length code (the Zstandard format defines literal-length codes `0..=35`).
pub(crate) const MAX_LITERAL_LENGTH_CODE: u8 = 35;
/// Largest match-length code (the Zstandard format defines match-length codes `0..=52`).
pub(crate) const MAX_MATCH_LENGTH_CODE: u8 = 52;
/// Largest offset code.
// NOTE(review): presumably the highest offset code this decoder accepts; confirm at the use sites.
pub(crate) const MAX_OFFSET_CODE: u8 = 31;
7
8	pub struct SequencesHeader {
9	pub num_sequences: u32,
10	pub modes: Option<CompressionModes>,
11	}
12
/// A sequence represents potentially redundant data, and it can be broken up into 2 steps:
/// - A copy step, where data is copied from the literals section to the decompressed output
/// - A *match* copy step that copies data from within the previously decompressed output.
///
/// <https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#sequence-execution>
#[derive(Clone, Copy)]
pub struct Sequence {
    /// Literal length, or the number of bytes to be copied from the literals section
    /// in the copy step.
    pub ll: u32,
    /// The length of the match to make during the match copy step.
    pub ml: u32,
    /// How far back to go in the decompressed data to read from the match copy step.
    /// If this value is greater than 3, then the offset is `of - 3`. If `of` is from 1-3,
    /// then it has special handling:
    ///
    /// The first 3 values define 3 different repeated offsets, with 1 referring to the most
    /// recent, 2 the second recent, and so on. When the current sequence has a literal length of 0,
    /// then the repeated offsets are shifted by 1. So an offset value of 1 refers to 2, 2 refers to 3,
    /// and 3 refers to the most recent offset minus one. If that value is equal to zero, the data
    /// is considered corrupted.
    pub of: u32,
}
36
37	impl core::fmt::Display for Sequence {
38	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> Result<(), core::fmt::Error> {
39	write!(f, "LL: {}, ML: {}, OF: {}", self.ll, self.ml, self.of)
40	}
41	}
42
/// This byte defines the compression mode of each symbol type.
///
/// Each mode is a two bit field: bits 7-6 hold the literal lengths mode, bits 5-4 the
/// offsets mode and bits 3-2 the match lengths mode.
#[derive(Copy, Clone)]
pub struct CompressionModes(u8);
/// The compression mode used for symbol compression.
///
/// The variants are listed in the order of their two bit encoding, `0` through `3`.
pub enum ModeType {
    /// A predefined FSE distribution table is used, and no distribution table
    /// will be present.
    Predefined,
    /// The table consists of a single byte, which contains the symbol's value.
    RLE,
    /// Standard FSE compression, a distribution table will be present. This
    /// mode should not be used when only one symbol is present.
    FSECompressed,
    /// The table used in the previous compressed block with at least one sequence
    /// will be used again. If this is the first block, the table in the dictionary will
    /// be used.
    Repeat,
}
61
62	impl CompressionModes {
63	/// Deserialize a two bit mode value into a [ModeType]
64	pub fn decode_mode(m: u8) -> ModeType {
65	match m {
66	`0` => ModeType::Predefined,
67	`1` => ModeType::RLE,
68	`2` => ModeType::FSECompressed,
69	`3` => ModeType::Repeat,
70	_ => panic!("This can never happen"),
71	}
72	}
73	/// Read the compression mode of the literal lengths field.
74	pub fn ll_mode(self) -> ModeType {
75	Self::decode_mode(self.0 >> `6`)
76	}
77
78	/// Read the compression mode of the offset value field.
79	pub fn of_mode(self) -> ModeType {
80	Self::decode_mode((self.0 >> `4`) & `0x3`)
81	}
82
83	/// Read the compression mode of the match lengths field.
84	pub fn ml_mode(self) -> ModeType {
85	Self::decode_mode((self.0 >> `2`) & `0x3`)
86	}
87	}
88
89	impl Default for SequencesHeader {
90	fn default() -> Self {
91	Self::new()
92	}
93	}
94
/// Errors produced while parsing a [`SequencesHeader`] from raw bytes.
#[derive(Debug)]
#[non_exhaustive]
pub enum SequencesHeaderParseError {
    /// The source buffer ended before the header was complete.
    NotEnoughBytes {
        /// Minimum number of bytes the source needed to contain.
        need_at_least: u8,
        /// Number of bytes the source actually contained.
        got: usize,
    },
}
100
// `std::error::Error` is only reachable through `std`, so this impl is compiled solely
// when the `std` feature is enabled.
#[cfg(feature = "std")]
impl std::error::Error for SequencesHeaderParseError {}
103
104	impl core::fmt::Display for SequencesHeaderParseError {
105	fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
106	match self {
107	SequencesHeaderParseError::NotEnoughBytes { need_at_least: &u8, got: &usize } => {
108	write!(
109	f,
110	"source must have at least {} bytes to parse header; got {} bytes",
111	need_at_least, got,
112	)
113	}
114	}
115	}
116	}
117
118	impl SequencesHeader {
119	/// Create a new [SequencesHeader].
120	pub fn new() -> SequencesHeader {
121	SequencesHeader {
122	num_sequences: `0`,
123	modes: None,
124	}
125	}
126
127	/// Attempt to deserialize the provided buffer into `self`, returning the number of bytes read.
128	pub fn parse_from_header(&mut self, source: &[u8]) -> Result<u8, SequencesHeaderParseError> {
129	let mut bytes_read = `0`;
130	if source.is_empty() {
131	return Err(SequencesHeaderParseError::NotEnoughBytes {
132	need_at_least: `1`,
133	got: `0`,
134	});
135	}
136
137	let source = match source[`0`] {
138	`0` => {
139	self.num_sequences = `0`;
140	return Ok(`1`);
141	}
142	`1`..=`127` => {
143	if source.len() < `2` {
144	return Err(SequencesHeaderParseError::NotEnoughBytes {
145	need_at_least: `2`,
146	got: source.len(),
147	});
148	}
149	self.num_sequences = u32::from(source[`0`]);
150	bytes_read += `1`;
151	&source[`1`..]
152	}
153	`128`..=`254` => {
154	if source.len() < `3` {
155	return Err(SequencesHeaderParseError::NotEnoughBytes {
156	need_at_least: `3`,
157	got: source.len(),
158	});
159	}
160	self.num_sequences = ((u32::from(source[`0`]) - `128`) << `8`) + u32::from(source[`1`]);
161	bytes_read += `2`;
162	&source[`2`..]
163	}
164	`255` => {
165	if source.len() < `4` {
166	return Err(SequencesHeaderParseError::NotEnoughBytes {
167	need_at_least: `4`,
168	got: source.len(),
169	});
170	}
171	self.num_sequences = u32::from(source[`1`]) + (u32::from(source[`2`]) << `8`) + `0x7F00`;
172	bytes_read += `3`;
173	&source[`3`..]
174	}
175	};
176
177	self.modes = Some(CompressionModes(source[`0`]));
178	bytes_read += `1`;
179
180	Ok(bytes_read)
181	}
182	}
183