1 | // Copyright 2017 Brian Langenberger |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
4 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
5 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
6 | // option. This file may not be copied, modified, or distributed |
7 | // except according to those terms. |
8 | |
9 | //! Traits and implementations for reading or writing Huffman codes |
10 | //! from or to a stream. |
11 | |
12 | #![warn (missing_docs)] |
13 | |
14 | use super::BitQueue; |
15 | use super::Endianness; |
16 | #[cfg (feature = "alloc" )] |
17 | use alloc::boxed::Box; |
18 | #[cfg (feature = "alloc" )] |
19 | use alloc::collections::BTreeMap; |
20 | #[cfg (feature = "alloc" )] |
21 | use alloc::vec::Vec; |
22 | #[cfg (feature = "alloc" )] |
23 | use core::fmt; |
24 | #[cfg (feature = "alloc" )] |
25 | use core::marker::PhantomData; |
26 | #[cfg (feature = "alloc" )] |
27 | use core2::error::Error; |
28 | |
29 | #[cfg (not(feature = "alloc" ))] |
30 | use std::collections::BTreeMap; |
31 | #[cfg (not(feature = "alloc" ))] |
32 | use std::error::Error; |
33 | #[cfg (not(feature = "alloc" ))] |
34 | use std::fmt; |
35 | #[cfg (not(feature = "alloc" ))] |
36 | use std::marker::PhantomData; |
37 | |
38 | /// A compiled Huffman tree element for use with the `read_huffman` method. |
39 | /// Returned by `compile_read_tree`. |
40 | /// |
41 | /// Compiled read trees are optimized for faster lookup |
42 | /// and are therefore endian-specific. |
43 | /// |
44 | /// In addition, each symbol in the source tree may occur many times |
45 | /// in the compiled tree. If symbols require a nontrivial amount of space, |
46 | /// consider using reference counting so that they may be cloned |
47 | /// more efficiently. |
48 | pub enum ReadHuffmanTree<E: Endianness, T: Clone> { |
49 | /// The final value and new reader state |
50 | Done(T, u8, u32, PhantomData<E>), |
51 | /// Another byte is necessary to determine final value |
52 | Continue(Box<[ReadHuffmanTree<E, T>]>), |
53 | /// An invalid reader state has been used |
54 | InvalidState, |
55 | } |
56 | |
57 | /// Given a vector of symbol/code pairs, compiles a Huffman tree |
58 | /// for reading. |
59 | /// |
60 | /// Code must be 0 or 1 bits and are always read from the stream |
61 | /// from least-significant in the list to most signficant |
62 | /// (which makes them easier to read for humans). |
63 | /// |
64 | /// All possible codes must be assigned some symbol, |
65 | /// and it is acceptable for the same symbol to occur multiple times. |
66 | /// |
67 | /// ## Examples |
68 | /// ``` |
69 | /// use bitstream_io::huffman::compile_read_tree; |
70 | /// use bitstream_io::BigEndian; |
71 | /// assert!(compile_read_tree::<BigEndian,i32>( |
72 | /// vec![(1, vec![0]), |
73 | /// (2, vec![1, 0]), |
74 | /// (3, vec![1, 1])]).is_ok()); |
75 | /// ``` |
76 | /// |
77 | /// ``` |
78 | /// use std::io::{Read, Cursor}; |
79 | /// use bitstream_io::{BigEndian, BitReader, HuffmanRead}; |
80 | /// use bitstream_io::huffman::compile_read_tree; |
81 | /// let tree = compile_read_tree( |
82 | /// vec![('a' , vec![0]), |
83 | /// ('b' , vec![1, 0]), |
84 | /// ('c' , vec![1, 1, 0]), |
85 | /// ('d' , vec![1, 1, 1])]).unwrap(); |
86 | /// let data = [0b10110111]; |
87 | /// let mut cursor = Cursor::new(&data); |
88 | /// let mut reader = BitReader::endian(&mut cursor, BigEndian); |
89 | /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'b' ); |
90 | /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'c' ); |
91 | /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'd' ); |
92 | /// ``` |
93 | pub fn compile_read_tree<E, T>( |
94 | values: Vec<(T, Vec<u8>)>, |
95 | ) -> Result<Box<[ReadHuffmanTree<E, T>]>, HuffmanTreeError> |
96 | where |
97 | E: Endianness, |
98 | T: Clone, |
99 | { |
100 | let tree: FinalHuffmanTree = FinalHuffmanTree::new(values)?; |
101 | |
102 | let mut result: Vec> = Vec::with_capacity(256); |
103 | result.extend((0..256).map(|_| ReadHuffmanTree::InvalidState)); |
104 | let queue: BitQueue = BitQueue::from_value(value:0, bits:0); |
105 | let i: usize = queue.to_state(); |
106 | result[i] = compile_queue(queue, &tree); |
107 | for bits: u32 in 1..8 { |
108 | for value: u8 in 0..(1 << bits) { |
109 | let queue: BitQueue = BitQueue::from_value(value, bits); |
110 | let i: usize = queue.to_state(); |
111 | result[i] = compile_queue(queue, &tree); |
112 | } |
113 | } |
114 | assert_eq!(result.len(), 256); |
115 | Ok(result.into_boxed_slice()) |
116 | } |
117 | |
118 | fn compile_queue<E, T>( |
119 | mut queue: BitQueue<E, u8>, |
120 | tree: &FinalHuffmanTree<T>, |
121 | ) -> ReadHuffmanTree<E, T> |
122 | where |
123 | E: Endianness, |
124 | T: Clone, |
125 | { |
126 | match tree { |
127 | FinalHuffmanTree::Leaf(ref value: &T) => { |
128 | let len: u32 = queue.len(); |
129 | ReadHuffmanTree::Done(value.clone(), queue.value(), len, PhantomData) |
130 | } |
131 | FinalHuffmanTree::Tree(ref bit0: &Box>, ref bit1: &Box>) => { |
132 | if queue.is_empty() { |
133 | ReadHuffmanTree::Continue( |
134 | (0..256) |
135 | .map(|byte: i32| compile_queue(queue:BitQueue::from_value(value:byte as u8, bits:8), tree)) |
136 | .collect::<Vec<ReadHuffmanTree<E, T>>>() |
137 | .into_boxed_slice(), |
138 | ) |
139 | } else if queue.pop(bits:1) == 0 { |
140 | compile_queue(queue, tree:bit0) |
141 | } else { |
142 | compile_queue(queue, tree:bit1) |
143 | } |
144 | } |
145 | } |
146 | } |
147 | |
148 | // A complete Huffman tree with no empty nodes |
149 | enum FinalHuffmanTree<T: Clone> { |
150 | Leaf(T), |
151 | Tree(Box<FinalHuffmanTree<T>>, Box<FinalHuffmanTree<T>>), |
152 | } |
153 | |
154 | impl<T: Clone> FinalHuffmanTree<T> { |
155 | fn new(values: Vec<(T, Vec<u8>)>) -> Result<FinalHuffmanTree<T>, HuffmanTreeError> { |
156 | let mut tree: WipHuffmanTree = WipHuffmanTree::new_empty(); |
157 | |
158 | for (symbol: T, code: Vec) in values { |
159 | tree.add(code.as_slice(), symbol)?; |
160 | } |
161 | |
162 | tree.into_read_tree() |
163 | } |
164 | } |
165 | |
166 | // Work-in-progress trees may have empty nodes during construction |
167 | // but those are not allowed in a finalized tree. |
168 | // If the user wants some codes to be None or an error symbol of some sort, |
169 | // those will need to be specified explicitly. |
170 | enum WipHuffmanTree<T: Clone> { |
171 | Empty, |
172 | Leaf(T), |
173 | Tree(Box<WipHuffmanTree<T>>, Box<WipHuffmanTree<T>>), |
174 | } |
175 | |
176 | impl<T: Clone> WipHuffmanTree<T> { |
177 | fn new_empty() -> WipHuffmanTree<T> { |
178 | WipHuffmanTree::Empty |
179 | } |
180 | |
181 | fn new_leaf(value: T) -> WipHuffmanTree<T> { |
182 | WipHuffmanTree::Leaf(value) |
183 | } |
184 | |
185 | fn new_tree() -> WipHuffmanTree<T> { |
186 | WipHuffmanTree::Tree(Box::new(Self::new_empty()), Box::new(Self::new_empty())) |
187 | } |
188 | |
189 | fn into_read_tree(self) -> Result<FinalHuffmanTree<T>, HuffmanTreeError> { |
190 | match self { |
191 | WipHuffmanTree::Empty => Err(HuffmanTreeError::MissingLeaf), |
192 | WipHuffmanTree::Leaf(v) => Ok(FinalHuffmanTree::Leaf(v)), |
193 | WipHuffmanTree::Tree(zero, one) => { |
194 | let zero = zero.into_read_tree()?; |
195 | let one = one.into_read_tree()?; |
196 | Ok(FinalHuffmanTree::Tree(Box::new(zero), Box::new(one))) |
197 | } |
198 | } |
199 | } |
200 | |
201 | fn add(&mut self, code: &[u8], symbol: T) -> Result<(), HuffmanTreeError> { |
202 | match self { |
203 | WipHuffmanTree::Empty => { |
204 | if code.is_empty() { |
205 | *self = WipHuffmanTree::new_leaf(symbol); |
206 | Ok(()) |
207 | } else { |
208 | *self = WipHuffmanTree::new_tree(); |
209 | self.add(code, symbol) |
210 | } |
211 | } |
212 | WipHuffmanTree::Leaf(_) => Err(if code.is_empty() { |
213 | HuffmanTreeError::DuplicateLeaf |
214 | } else { |
215 | HuffmanTreeError::OrphanedLeaf |
216 | }), |
217 | WipHuffmanTree::Tree(ref mut zero, ref mut one) => { |
218 | if code.is_empty() { |
219 | Err(HuffmanTreeError::DuplicateLeaf) |
220 | } else { |
221 | match code[0] { |
222 | 0 => zero.add(&code[1..], symbol), |
223 | 1 => one.add(&code[1..], symbol), |
224 | _ => Err(HuffmanTreeError::InvalidBit), |
225 | } |
226 | } |
227 | } |
228 | } |
229 | } |
230 | } |
231 | |
232 | /// An error type during Huffman tree compilation. |
233 | #[derive (PartialEq, Eq, Copy, Clone, Debug)] |
234 | pub enum HuffmanTreeError { |
235 | /// One of the bits in a Huffman code is not 0 or 1 |
236 | InvalidBit, |
237 | /// A Huffman code in the specification has no defined symbol |
238 | MissingLeaf, |
239 | /// The same Huffman code specifies multiple symbols |
240 | DuplicateLeaf, |
241 | /// A Huffman code is the prefix of some longer code |
242 | OrphanedLeaf, |
243 | } |
244 | |
245 | impl fmt::Display for HuffmanTreeError { |
246 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
247 | match *self { |
248 | HuffmanTreeError::InvalidBit => write!(f, "invalid bit in code" ), |
249 | HuffmanTreeError::MissingLeaf => write!(f, "missing leaf node in specification" ), |
250 | HuffmanTreeError::DuplicateLeaf => write!(f, "duplicate leaf node in specification" ), |
251 | HuffmanTreeError::OrphanedLeaf => write!(f, "orphaned leaf node in specification" ), |
252 | } |
253 | } |
254 | } |
255 | |
256 | impl Error for HuffmanTreeError {} |
257 | |
258 | /// Given a vector of symbol/code pairs, compiles a Huffman tree |
259 | /// for writing. |
260 | /// |
261 | /// Code must be 0 or 1 bits and are always written to the stream |
262 | /// from least-significant in the list to most signficant |
263 | /// (which makes them easier to read for humans). |
264 | /// |
265 | /// If the same symbol occurs multiple times, the first code is used. |
266 | /// Unlike in read trees, not all possible codes need to be |
267 | /// assigned a symbol. |
268 | /// |
269 | /// ## Examples |
270 | /// ``` |
271 | /// use bitstream_io::huffman::compile_write_tree; |
272 | /// use bitstream_io::BigEndian; |
273 | /// assert!(compile_write_tree::<BigEndian,i32>( |
274 | /// vec![(1, vec![0]), |
275 | /// (2, vec![1, 0]), |
276 | /// (3, vec![1, 1])]).is_ok()); |
277 | /// ``` |
278 | /// |
279 | /// ``` |
280 | /// use std::io::Write; |
281 | /// use bitstream_io::{BigEndian, BitWriter, HuffmanWrite}; |
282 | /// use bitstream_io::huffman::compile_write_tree; |
283 | /// let tree = compile_write_tree( |
284 | /// vec![('a' , vec![0]), |
285 | /// ('b' , vec![1, 0]), |
286 | /// ('c' , vec![1, 1, 0]), |
287 | /// ('d' , vec![1, 1, 1])]).unwrap(); |
288 | /// let mut data = Vec::new(); |
289 | /// { |
290 | /// let mut writer = BitWriter::endian(&mut data, BigEndian); |
291 | /// writer.write_huffman(&tree, 'b' ).unwrap(); |
292 | /// writer.write_huffman(&tree, 'c' ).unwrap(); |
293 | /// writer.write_huffman(&tree, 'd' ).unwrap(); |
294 | /// } |
295 | /// assert_eq!(data, [0b10110111]); |
296 | /// ``` |
297 | pub fn compile_write_tree<E, T>( |
298 | values: Vec<(T, Vec<u8>)>, |
299 | ) -> Result<WriteHuffmanTree<E, T>, HuffmanTreeError> |
300 | where |
301 | E: Endianness, |
302 | T: Ord + Clone, |
303 | { |
304 | let mut map = BTreeMap::new(); |
305 | |
306 | for (symbol, code) in values { |
307 | let mut encoded = Vec::new(); |
308 | for bits in code.chunks(32) { |
309 | let mut acc = BitQueue::<E, u32>::new(); |
310 | for bit in bits { |
311 | match *bit { |
312 | 0 => acc.push(1, 0), |
313 | 1 => acc.push(1, 1), |
314 | _ => return Err(HuffmanTreeError::InvalidBit), |
315 | } |
316 | } |
317 | let len = acc.len(); |
318 | encoded.push((len, acc.value())) |
319 | } |
320 | map.entry(symbol) |
321 | .or_insert_with(|| encoded.into_boxed_slice()); |
322 | } |
323 | |
324 | Ok(WriteHuffmanTree { |
325 | map, |
326 | phantom: PhantomData, |
327 | }) |
328 | } |
329 | |
330 | /// A compiled Huffman tree for use with the `write_huffman` method. |
331 | /// Returned by `compiled_write_tree`. |
332 | pub struct WriteHuffmanTree<E: Endianness, T: Ord> { |
333 | map: BTreeMap<T, Box<[(u32, u32)]>>, |
334 | phantom: PhantomData<E>, |
335 | } |
336 | |
337 | impl<E: Endianness, T: Ord + Clone> WriteHuffmanTree<E, T> { |
338 | /// Returns true if symbol is in tree. |
339 | #[inline ] |
340 | pub fn has_symbol(&self, symbol: &T) -> bool { |
341 | self.map.contains_key(symbol) |
342 | } |
343 | |
344 | /// Given symbol, returns iterator of |
345 | /// (bits, value) pairs for writing code. |
346 | /// Panics if symbol is not found. |
347 | #[inline ] |
348 | pub fn get(&self, symbol: &T) -> impl Iterator<Item = &(u32, u32)> { |
349 | self.map[symbol].iter() |
350 | } |
351 | } |
352 | |