| 1 | // Copyright 2017 Brian Langenberger |
| 2 | // |
| 3 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 4 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 5 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 6 | // option. This file may not be copied, modified, or distributed |
| 7 | // except according to those terms. |
| 8 | |
| 9 | //! Traits and implementations for reading or writing Huffman codes |
| 10 | //! from or to a stream. |
| 11 | |
| 12 | #![warn (missing_docs)] |
| 13 | |
| 14 | use super::BitQueue; |
| 15 | use super::Endianness; |
| 16 | #[cfg (feature = "alloc" )] |
| 17 | use alloc::boxed::Box; |
| 18 | #[cfg (feature = "alloc" )] |
| 19 | use alloc::collections::BTreeMap; |
| 20 | #[cfg (feature = "alloc" )] |
| 21 | use alloc::vec::Vec; |
| 22 | #[cfg (feature = "alloc" )] |
| 23 | use core::fmt; |
| 24 | #[cfg (feature = "alloc" )] |
| 25 | use core::marker::PhantomData; |
| 26 | #[cfg (feature = "alloc" )] |
| 27 | use core2::error::Error; |
| 28 | |
| 29 | #[cfg (not(feature = "alloc" ))] |
| 30 | use std::collections::BTreeMap; |
| 31 | #[cfg (not(feature = "alloc" ))] |
| 32 | use std::error::Error; |
| 33 | #[cfg (not(feature = "alloc" ))] |
| 34 | use std::fmt; |
| 35 | #[cfg (not(feature = "alloc" ))] |
| 36 | use std::marker::PhantomData; |
| 37 | |
| 38 | /// A compiled Huffman tree element for use with the `read_huffman` method. |
| 39 | /// Returned by `compile_read_tree`. |
| 40 | /// |
| 41 | /// Compiled read trees are optimized for faster lookup |
| 42 | /// and are therefore endian-specific. |
| 43 | /// |
| 44 | /// In addition, each symbol in the source tree may occur many times |
| 45 | /// in the compiled tree. If symbols require a nontrivial amount of space, |
| 46 | /// consider using reference counting so that they may be cloned |
| 47 | /// more efficiently. |
| 48 | pub enum ReadHuffmanTree<E: Endianness, T: Clone> { |
| 49 | /// The final value and new reader state |
| 50 | Done(T, u8, u32, PhantomData<E>), |
| 51 | /// Another byte is necessary to determine final value |
| 52 | Continue(Box<[ReadHuffmanTree<E, T>]>), |
| 53 | /// An invalid reader state has been used |
| 54 | InvalidState, |
| 55 | } |
| 56 | |
| 57 | /// Given a vector of symbol/code pairs, compiles a Huffman tree |
| 58 | /// for reading. |
| 59 | /// |
| 60 | /// Code must be 0 or 1 bits and are always read from the stream |
| 61 | /// from least-significant in the list to most signficant |
| 62 | /// (which makes them easier to read for humans). |
| 63 | /// |
| 64 | /// All possible codes must be assigned some symbol, |
| 65 | /// and it is acceptable for the same symbol to occur multiple times. |
| 66 | /// |
| 67 | /// ## Examples |
| 68 | /// ``` |
| 69 | /// use bitstream_io::huffman::compile_read_tree; |
| 70 | /// use bitstream_io::BigEndian; |
| 71 | /// assert!(compile_read_tree::<BigEndian,i32>( |
| 72 | /// vec![(1, vec![0]), |
| 73 | /// (2, vec![1, 0]), |
| 74 | /// (3, vec![1, 1])]).is_ok()); |
| 75 | /// ``` |
| 76 | /// |
| 77 | /// ``` |
| 78 | /// use std::io::{Read, Cursor}; |
| 79 | /// use bitstream_io::{BigEndian, BitReader, HuffmanRead}; |
| 80 | /// use bitstream_io::huffman::compile_read_tree; |
| 81 | /// let tree = compile_read_tree( |
| 82 | /// vec![('a' , vec![0]), |
| 83 | /// ('b' , vec![1, 0]), |
| 84 | /// ('c' , vec![1, 1, 0]), |
| 85 | /// ('d' , vec![1, 1, 1])]).unwrap(); |
| 86 | /// let data = [0b10110111]; |
| 87 | /// let mut cursor = Cursor::new(&data); |
| 88 | /// let mut reader = BitReader::endian(&mut cursor, BigEndian); |
| 89 | /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'b' ); |
| 90 | /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'c' ); |
| 91 | /// assert_eq!(reader.read_huffman(&tree).unwrap(), 'd' ); |
| 92 | /// ``` |
| 93 | pub fn compile_read_tree<E, T>( |
| 94 | values: Vec<(T, Vec<u8>)>, |
| 95 | ) -> Result<Box<[ReadHuffmanTree<E, T>]>, HuffmanTreeError> |
| 96 | where |
| 97 | E: Endianness, |
| 98 | T: Clone, |
| 99 | { |
| 100 | let tree: FinalHuffmanTree = FinalHuffmanTree::new(values)?; |
| 101 | |
| 102 | let mut result: Vec> = Vec::with_capacity(256); |
| 103 | result.extend((0..256).map(|_| ReadHuffmanTree::InvalidState)); |
| 104 | let queue: BitQueue = BitQueue::from_value(value:0, bits:0); |
| 105 | let i: usize = queue.to_state(); |
| 106 | result[i] = compile_queue(queue, &tree); |
| 107 | for bits: u32 in 1..8 { |
| 108 | for value: u8 in 0..(1 << bits) { |
| 109 | let queue: BitQueue = BitQueue::from_value(value, bits); |
| 110 | let i: usize = queue.to_state(); |
| 111 | result[i] = compile_queue(queue, &tree); |
| 112 | } |
| 113 | } |
| 114 | assert_eq!(result.len(), 256); |
| 115 | Ok(result.into_boxed_slice()) |
| 116 | } |
| 117 | |
| 118 | fn compile_queue<E, T>( |
| 119 | mut queue: BitQueue<E, u8>, |
| 120 | tree: &FinalHuffmanTree<T>, |
| 121 | ) -> ReadHuffmanTree<E, T> |
| 122 | where |
| 123 | E: Endianness, |
| 124 | T: Clone, |
| 125 | { |
| 126 | match tree { |
| 127 | FinalHuffmanTree::Leaf(ref value: &T) => { |
| 128 | let len: u32 = queue.len(); |
| 129 | ReadHuffmanTree::Done(value.clone(), queue.value(), len, PhantomData) |
| 130 | } |
| 131 | FinalHuffmanTree::Tree(ref bit0: &Box>, ref bit1: &Box>) => { |
| 132 | if queue.is_empty() { |
| 133 | ReadHuffmanTree::Continue( |
| 134 | (0..256) |
| 135 | .map(|byte: i32| compile_queue(queue:BitQueue::from_value(value:byte as u8, bits:8), tree)) |
| 136 | .collect::<Vec<ReadHuffmanTree<E, T>>>() |
| 137 | .into_boxed_slice(), |
| 138 | ) |
| 139 | } else if queue.pop(bits:1) == 0 { |
| 140 | compile_queue(queue, tree:bit0) |
| 141 | } else { |
| 142 | compile_queue(queue, tree:bit1) |
| 143 | } |
| 144 | } |
| 145 | } |
| 146 | } |
| 147 | |
| 148 | // A complete Huffman tree with no empty nodes |
| 149 | enum FinalHuffmanTree<T: Clone> { |
| 150 | Leaf(T), |
| 151 | Tree(Box<FinalHuffmanTree<T>>, Box<FinalHuffmanTree<T>>), |
| 152 | } |
| 153 | |
| 154 | impl<T: Clone> FinalHuffmanTree<T> { |
| 155 | fn new(values: Vec<(T, Vec<u8>)>) -> Result<FinalHuffmanTree<T>, HuffmanTreeError> { |
| 156 | let mut tree: WipHuffmanTree = WipHuffmanTree::new_empty(); |
| 157 | |
| 158 | for (symbol: T, code: Vec) in values { |
| 159 | tree.add(code.as_slice(), symbol)?; |
| 160 | } |
| 161 | |
| 162 | tree.into_read_tree() |
| 163 | } |
| 164 | } |
| 165 | |
| 166 | // Work-in-progress trees may have empty nodes during construction |
| 167 | // but those are not allowed in a finalized tree. |
| 168 | // If the user wants some codes to be None or an error symbol of some sort, |
| 169 | // those will need to be specified explicitly. |
| 170 | enum WipHuffmanTree<T: Clone> { |
| 171 | Empty, |
| 172 | Leaf(T), |
| 173 | Tree(Box<WipHuffmanTree<T>>, Box<WipHuffmanTree<T>>), |
| 174 | } |
| 175 | |
| 176 | impl<T: Clone> WipHuffmanTree<T> { |
| 177 | fn new_empty() -> WipHuffmanTree<T> { |
| 178 | WipHuffmanTree::Empty |
| 179 | } |
| 180 | |
| 181 | fn new_leaf(value: T) -> WipHuffmanTree<T> { |
| 182 | WipHuffmanTree::Leaf(value) |
| 183 | } |
| 184 | |
| 185 | fn new_tree() -> WipHuffmanTree<T> { |
| 186 | WipHuffmanTree::Tree(Box::new(Self::new_empty()), Box::new(Self::new_empty())) |
| 187 | } |
| 188 | |
| 189 | fn into_read_tree(self) -> Result<FinalHuffmanTree<T>, HuffmanTreeError> { |
| 190 | match self { |
| 191 | WipHuffmanTree::Empty => Err(HuffmanTreeError::MissingLeaf), |
| 192 | WipHuffmanTree::Leaf(v) => Ok(FinalHuffmanTree::Leaf(v)), |
| 193 | WipHuffmanTree::Tree(zero, one) => { |
| 194 | let zero = zero.into_read_tree()?; |
| 195 | let one = one.into_read_tree()?; |
| 196 | Ok(FinalHuffmanTree::Tree(Box::new(zero), Box::new(one))) |
| 197 | } |
| 198 | } |
| 199 | } |
| 200 | |
| 201 | fn add(&mut self, code: &[u8], symbol: T) -> Result<(), HuffmanTreeError> { |
| 202 | match self { |
| 203 | WipHuffmanTree::Empty => { |
| 204 | if code.is_empty() { |
| 205 | *self = WipHuffmanTree::new_leaf(symbol); |
| 206 | Ok(()) |
| 207 | } else { |
| 208 | *self = WipHuffmanTree::new_tree(); |
| 209 | self.add(code, symbol) |
| 210 | } |
| 211 | } |
| 212 | WipHuffmanTree::Leaf(_) => Err(if code.is_empty() { |
| 213 | HuffmanTreeError::DuplicateLeaf |
| 214 | } else { |
| 215 | HuffmanTreeError::OrphanedLeaf |
| 216 | }), |
| 217 | WipHuffmanTree::Tree(ref mut zero, ref mut one) => { |
| 218 | if code.is_empty() { |
| 219 | Err(HuffmanTreeError::DuplicateLeaf) |
| 220 | } else { |
| 221 | match code[0] { |
| 222 | 0 => zero.add(&code[1..], symbol), |
| 223 | 1 => one.add(&code[1..], symbol), |
| 224 | _ => Err(HuffmanTreeError::InvalidBit), |
| 225 | } |
| 226 | } |
| 227 | } |
| 228 | } |
| 229 | } |
| 230 | } |
| 231 | |
| 232 | /// An error type during Huffman tree compilation. |
| 233 | #[derive (PartialEq, Eq, Copy, Clone, Debug)] |
| 234 | pub enum HuffmanTreeError { |
| 235 | /// One of the bits in a Huffman code is not 0 or 1 |
| 236 | InvalidBit, |
| 237 | /// A Huffman code in the specification has no defined symbol |
| 238 | MissingLeaf, |
| 239 | /// The same Huffman code specifies multiple symbols |
| 240 | DuplicateLeaf, |
| 241 | /// A Huffman code is the prefix of some longer code |
| 242 | OrphanedLeaf, |
| 243 | } |
| 244 | |
| 245 | impl fmt::Display for HuffmanTreeError { |
| 246 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 247 | match *self { |
| 248 | HuffmanTreeError::InvalidBit => write!(f, "invalid bit in code" ), |
| 249 | HuffmanTreeError::MissingLeaf => write!(f, "missing leaf node in specification" ), |
| 250 | HuffmanTreeError::DuplicateLeaf => write!(f, "duplicate leaf node in specification" ), |
| 251 | HuffmanTreeError::OrphanedLeaf => write!(f, "orphaned leaf node in specification" ), |
| 252 | } |
| 253 | } |
| 254 | } |
| 255 | |
| 256 | impl Error for HuffmanTreeError {} |
| 257 | |
| 258 | /// Given a vector of symbol/code pairs, compiles a Huffman tree |
| 259 | /// for writing. |
| 260 | /// |
| 261 | /// Code must be 0 or 1 bits and are always written to the stream |
| 262 | /// from least-significant in the list to most signficant |
| 263 | /// (which makes them easier to read for humans). |
| 264 | /// |
| 265 | /// If the same symbol occurs multiple times, the first code is used. |
| 266 | /// Unlike in read trees, not all possible codes need to be |
| 267 | /// assigned a symbol. |
| 268 | /// |
| 269 | /// ## Examples |
| 270 | /// ``` |
| 271 | /// use bitstream_io::huffman::compile_write_tree; |
| 272 | /// use bitstream_io::BigEndian; |
| 273 | /// assert!(compile_write_tree::<BigEndian,i32>( |
| 274 | /// vec![(1, vec![0]), |
| 275 | /// (2, vec![1, 0]), |
| 276 | /// (3, vec![1, 1])]).is_ok()); |
| 277 | /// ``` |
| 278 | /// |
| 279 | /// ``` |
| 280 | /// use std::io::Write; |
| 281 | /// use bitstream_io::{BigEndian, BitWriter, HuffmanWrite}; |
| 282 | /// use bitstream_io::huffman::compile_write_tree; |
| 283 | /// let tree = compile_write_tree( |
| 284 | /// vec![('a' , vec![0]), |
| 285 | /// ('b' , vec![1, 0]), |
| 286 | /// ('c' , vec![1, 1, 0]), |
| 287 | /// ('d' , vec![1, 1, 1])]).unwrap(); |
| 288 | /// let mut data = Vec::new(); |
| 289 | /// { |
| 290 | /// let mut writer = BitWriter::endian(&mut data, BigEndian); |
| 291 | /// writer.write_huffman(&tree, 'b' ).unwrap(); |
| 292 | /// writer.write_huffman(&tree, 'c' ).unwrap(); |
| 293 | /// writer.write_huffman(&tree, 'd' ).unwrap(); |
| 294 | /// } |
| 295 | /// assert_eq!(data, [0b10110111]); |
| 296 | /// ``` |
| 297 | pub fn compile_write_tree<E, T>( |
| 298 | values: Vec<(T, Vec<u8>)>, |
| 299 | ) -> Result<WriteHuffmanTree<E, T>, HuffmanTreeError> |
| 300 | where |
| 301 | E: Endianness, |
| 302 | T: Ord + Clone, |
| 303 | { |
| 304 | let mut map = BTreeMap::new(); |
| 305 | |
| 306 | for (symbol, code) in values { |
| 307 | let mut encoded = Vec::new(); |
| 308 | for bits in code.chunks(32) { |
| 309 | let mut acc = BitQueue::<E, u32>::new(); |
| 310 | for bit in bits { |
| 311 | match *bit { |
| 312 | 0 => acc.push(1, 0), |
| 313 | 1 => acc.push(1, 1), |
| 314 | _ => return Err(HuffmanTreeError::InvalidBit), |
| 315 | } |
| 316 | } |
| 317 | let len = acc.len(); |
| 318 | encoded.push((len, acc.value())) |
| 319 | } |
| 320 | map.entry(symbol) |
| 321 | .or_insert_with(|| encoded.into_boxed_slice()); |
| 322 | } |
| 323 | |
| 324 | Ok(WriteHuffmanTree { |
| 325 | map, |
| 326 | phantom: PhantomData, |
| 327 | }) |
| 328 | } |
| 329 | |
| 330 | /// A compiled Huffman tree for use with the `write_huffman` method. |
| 331 | /// Returned by `compiled_write_tree`. |
| 332 | pub struct WriteHuffmanTree<E: Endianness, T: Ord> { |
| 333 | map: BTreeMap<T, Box<[(u32, u32)]>>, |
| 334 | phantom: PhantomData<E>, |
| 335 | } |
| 336 | |
| 337 | impl<E: Endianness, T: Ord + Clone> WriteHuffmanTree<E, T> { |
| 338 | /// Returns true if symbol is in tree. |
| 339 | #[inline ] |
| 340 | pub fn has_symbol(&self, symbol: &T) -> bool { |
| 341 | self.map.contains_key(symbol) |
| 342 | } |
| 343 | |
| 344 | /// Given symbol, returns iterator of |
| 345 | /// (bits, value) pairs for writing code. |
| 346 | /// Panics if symbol is not found. |
| 347 | #[inline ] |
| 348 | pub fn get(&self, symbol: &T) -> impl Iterator<Item = &(u32, u32)> { |
| 349 | self.map[symbol].iter() |
| 350 | } |
| 351 | } |
| 352 | |