1 | //! A module for all encoding needs. |
2 | use crate::error::{BufferResult, LzwError, LzwStatus, VectorResult}; |
3 | use crate::{BitOrder, Code, StreamBuf, MAX_CODESIZE, MAX_ENTRIES, STREAM_BUF_SIZE}; |
4 | |
5 | use crate::alloc::{boxed::Box, vec::Vec}; |
6 | #[cfg (feature = "std" )] |
7 | use crate::error::StreamResult; |
8 | #[cfg (feature = "std" )] |
9 | use std::io::{self, BufRead, Write}; |
10 | |
11 | /// The state for encoding data with an LZW algorithm. |
12 | /// |
13 | /// The same structure can be utilized with streams as well as your own buffers and driver logic. |
14 | /// It may even be possible to mix them if you are sufficiently careful not to lose any written |
15 | /// data in the process. |
16 | /// |
17 | /// This is a sans-IO implementation, meaning that it only contains the state of the encoder and |
18 | /// the caller will provide buffers for input and output data when calling the basic |
19 | /// [`encode_bytes`] method. Nevertheless, a number of _adapters_ are provided in the `into_*` |
20 | /// methods for enoding with a particular style of common IO. |
21 | /// |
22 | /// * [`encode`] for encoding once without any IO-loop. |
23 | /// * [`into_async`] for encoding with the `futures` traits for asynchronous IO. |
24 | /// * [`into_stream`] for encoding with the standard `io` traits. |
25 | /// * [`into_vec`] for in-memory encoding. |
26 | /// |
27 | /// [`encode_bytes`]: #method.encode_bytes |
28 | /// [`encode`]: #method.encode |
29 | /// [`into_async`]: #method.into_async |
30 | /// [`into_stream`]: #method.into_stream |
31 | /// [`into_vec`]: #method.into_vec |
32 | pub struct Encoder { |
33 | /// Internally dispatch via a dynamic trait object. This did not have any significant |
34 | /// performance impact as we batch data internally and this pointer does not change after |
35 | /// creation! |
36 | state: Box<dyn Stateful + Send + 'static>, |
37 | } |
38 | |
39 | /// A encoding stream sink. |
40 | /// |
41 | /// See [`Encoder::into_stream`] on how to create this type. |
42 | /// |
43 | /// [`Encoder::into_stream`]: struct.Encoder.html#method.into_stream |
44 | #[cfg_attr ( |
45 | not(feature = "std" ), |
46 | deprecated = "This type is only useful with the `std` feature." |
47 | )] |
48 | #[cfg_attr (not(feature = "std" ), allow(dead_code))] |
49 | pub struct IntoStream<'d, W> { |
50 | encoder: &'d mut Encoder, |
51 | writer: W, |
52 | buffer: Option<StreamBuf<'d>>, |
53 | default_size: usize, |
54 | } |
55 | |
56 | /// An async decoding sink. |
57 | /// |
58 | /// See [`Encoder::into_async`] on how to create this type. |
59 | /// |
60 | /// [`Encoder::into_async`]: struct.Encoder.html#method.into_async |
61 | #[cfg (feature = "async" )] |
62 | pub struct IntoAsync<'d, W> { |
63 | encoder: &'d mut Encoder, |
64 | writer: W, |
65 | buffer: Option<StreamBuf<'d>>, |
66 | default_size: usize, |
67 | } |
68 | |
69 | /// A encoding sink into a vector. |
70 | /// |
71 | /// See [`Encoder::into_vec`] on how to create this type. |
72 | /// |
73 | /// [`Encoder::into_vec`]: struct.Encoder.html#method.into_vec |
74 | pub struct IntoVec<'d> { |
75 | encoder: &'d mut Encoder, |
76 | vector: &'d mut Vec<u8>, |
77 | } |
78 | |
79 | trait Stateful { |
80 | fn advance(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult; |
81 | fn mark_ended(&mut self) -> bool; |
82 | /// Reset the state tracking if end code has been written. |
83 | fn restart(&mut self); |
84 | /// Reset the encoder to the beginning, dropping all buffers etc. |
85 | fn reset(&mut self); |
86 | } |
87 | |
88 | struct EncodeState<B: Buffer> { |
89 | /// The configured minimal code size. |
90 | min_size: u8, |
91 | /// The current encoding symbol tree. |
92 | tree: Tree, |
93 | /// If we have pushed the end code. |
94 | has_ended: bool, |
95 | /// If tiff then bumps are a single code sooner. |
96 | is_tiff: bool, |
97 | /// The code corresponding to the currently read characters. |
98 | current_code: Code, |
99 | /// The clear code for resetting the dictionary. |
100 | clear_code: Code, |
101 | /// The bit buffer for encoding. |
102 | buffer: B, |
103 | } |
104 | |
105 | struct MsbBuffer { |
106 | /// The current code length. |
107 | code_size: u8, |
108 | /// The buffer bits. |
109 | buffer: u64, |
110 | /// The number of valid buffer bits. |
111 | bits_in_buffer: u8, |
112 | } |
113 | |
114 | struct LsbBuffer { |
115 | /// The current code length. |
116 | code_size: u8, |
117 | /// The buffer bits. |
118 | buffer: u64, |
119 | /// The number of valid buffer bits. |
120 | bits_in_buffer: u8, |
121 | } |
122 | |
123 | trait Buffer { |
124 | fn new(size: u8) -> Self; |
125 | /// Reset the code size in the buffer. |
126 | fn reset(&mut self, min_size: u8); |
127 | /// Apply effects of a Clear Code. |
128 | fn clear(&mut self, min_size: u8); |
129 | /// Insert a code into the buffer. |
130 | fn buffer_code(&mut self, code: Code); |
131 | /// Push bytes if the buffer space is getting small. |
132 | fn push_out(&mut self, out: &mut &mut [u8]) -> bool; |
133 | /// Flush all full bytes, returning if at least one more byte remains. |
134 | fn flush_out(&mut self, out: &mut &mut [u8]) -> bool; |
135 | /// Pad the buffer to a full byte. |
136 | fn buffer_pad(&mut self); |
137 | /// Increase the maximum code size. |
138 | fn bump_code_size(&mut self); |
139 | /// Return the maximum code with the current code size. |
140 | fn max_code(&self) -> Code; |
141 | /// Return the current code size in bits. |
142 | fn code_size(&self) -> u8; |
143 | } |
144 | |
145 | /// One tree node for at most each code. |
146 | /// To avoid using too much memory we keep nodes with few successors in optimized form. This form |
147 | /// doesn't offer lookup by indexing but instead does a linear search. |
148 | #[derive (Default)] |
149 | struct Tree { |
150 | simples: Vec<Simple>, |
151 | complex: Vec<Full>, |
152 | keys: Vec<CompressedKey>, |
153 | } |
154 | |
155 | #[derive (Clone, Copy)] |
156 | enum FullKey { |
157 | NoSuccessor, |
158 | Simple(u16), |
159 | Full(u16), |
160 | } |
161 | |
162 | #[derive (Clone, Copy)] |
163 | struct CompressedKey(u16); |
164 | |
165 | const SHORT: usize = 16; |
166 | |
167 | #[derive (Clone, Copy)] |
168 | struct Simple { |
169 | codes: [Code; SHORT], |
170 | chars: [u8; SHORT], |
171 | count: u8, |
172 | } |
173 | |
174 | #[derive (Clone, Copy)] |
175 | struct Full { |
176 | char_continuation: [Code; 256], |
177 | } |
178 | |
179 | impl Encoder { |
180 | /// Create a new encoder with the specified bit order and symbol size. |
181 | /// |
182 | /// The algorithm for dynamically increasing the code symbol bit width is compatible with the |
183 | /// original specification. In particular you will need to specify an `Lsb` bit oder to encode |
184 | /// the data portion of a compressed `gif` image. |
185 | /// |
186 | /// # Panics |
187 | /// |
188 | /// The `size` needs to be in the interval `2..=12`. |
189 | pub fn new(order: BitOrder, size: u8) -> Self { |
190 | type Boxed = Box<dyn Stateful + Send + 'static>; |
191 | super::assert_encode_size(size); |
192 | let state = match order { |
193 | BitOrder::Lsb => Box::new(EncodeState::<LsbBuffer>::new(size)) as Boxed, |
194 | BitOrder::Msb => Box::new(EncodeState::<MsbBuffer>::new(size)) as Boxed, |
195 | }; |
196 | |
197 | Encoder { state } |
198 | } |
199 | |
200 | /// Create a TIFF compatible encoder with the specified bit order and symbol size. |
201 | /// |
202 | /// The algorithm for dynamically increasing the code symbol bit width is compatible with the |
203 | /// TIFF specification, which is a misinterpretation of the original algorithm for increasing |
204 | /// the code size. It switches one symbol sooner. |
205 | /// |
206 | /// # Panics |
207 | /// |
208 | /// The `size` needs to be in the interval `2..=12`. |
209 | pub fn with_tiff_size_switch(order: BitOrder, size: u8) -> Self { |
210 | type Boxed = Box<dyn Stateful + Send + 'static>; |
211 | super::assert_encode_size(size); |
212 | let state = match order { |
213 | BitOrder::Lsb => { |
214 | let mut state = Box::new(EncodeState::<LsbBuffer>::new(size)); |
215 | state.is_tiff = true; |
216 | state as Boxed |
217 | } |
218 | BitOrder::Msb => { |
219 | let mut state = Box::new(EncodeState::<MsbBuffer>::new(size)); |
220 | state.is_tiff = true; |
221 | state as Boxed |
222 | } |
223 | }; |
224 | |
225 | Encoder { state } |
226 | } |
227 | |
228 | /// Encode some bytes from `inp` into `out`. |
229 | /// |
230 | /// See [`into_stream`] for high-level functions (this interface is only available with the |
231 | /// `std` feature) and [`finish`] for marking the input data as complete. |
232 | /// |
233 | /// When some input byte is invalid, i.e. is not smaller than `1 << size`, then that byte and |
234 | /// all following ones will _not_ be consumed and the `status` of the result will signal an |
235 | /// error. The result will also indicate that all bytes up to but not including the offending |
236 | /// byte have been consumed. You may try again with a fixed byte. |
237 | /// |
238 | /// [`into_stream`]: #method.into_stream |
239 | /// [`finish`]: #method.finish |
240 | pub fn encode_bytes(&mut self, inp: &[u8], out: &mut [u8]) -> BufferResult { |
241 | self.state.advance(inp, out) |
242 | } |
243 | |
244 | /// Encode a single chunk of data. |
245 | /// |
246 | /// This method will add an end marker to the encoded chunk. |
247 | /// |
248 | /// This is a convenience wrapper around [`into_vec`]. Use the `into_vec` adapter to customize |
249 | /// buffer size, to supply an existing vector, to control whether an end marker is required, or |
250 | /// to preserve partial data in the case of a decoding error. |
251 | /// |
252 | /// [`into_vec`]: #into_vec |
253 | /// |
254 | /// # Example |
255 | /// |
256 | /// ``` |
257 | /// use weezl::{BitOrder, encode::Encoder}; |
258 | /// |
259 | /// let data = b"Hello, world" ; |
260 | /// let encoded = Encoder::new(BitOrder::Msb, 9) |
261 | /// .encode(data) |
262 | /// .expect("All bytes valid for code size" ); |
263 | /// ``` |
264 | pub fn encode(&mut self, data: &[u8]) -> Result<Vec<u8>, LzwError> { |
265 | let mut output = Vec::new(); |
266 | self.into_vec(&mut output).encode_all(data).status?; |
267 | Ok(output) |
268 | } |
269 | |
270 | /// Construct a encoder into a writer. |
271 | #[cfg (feature = "std" )] |
272 | pub fn into_stream<W: Write>(&mut self, writer: W) -> IntoStream<'_, W> { |
273 | IntoStream { |
274 | encoder: self, |
275 | writer, |
276 | buffer: None, |
277 | default_size: STREAM_BUF_SIZE, |
278 | } |
279 | } |
280 | |
281 | /// Construct a encoder into an async writer. |
282 | #[cfg (feature = "async" )] |
283 | pub fn into_async<W: futures::io::AsyncWrite>(&mut self, writer: W) -> IntoAsync<'_, W> { |
284 | IntoAsync { |
285 | encoder: self, |
286 | writer, |
287 | buffer: None, |
288 | default_size: STREAM_BUF_SIZE, |
289 | } |
290 | } |
291 | |
292 | /// Construct an encoder into a vector. |
293 | /// |
294 | /// All encoded data is appended and the vector is __not__ cleared. |
295 | /// |
296 | /// Compared to `into_stream` this interface allows a high-level access to encoding without |
297 | /// requires the `std`-feature. Also, it can make full use of the extra buffer control that the |
298 | /// special target exposes. |
299 | pub fn into_vec<'lt>(&'lt mut self, vec: &'lt mut Vec<u8>) -> IntoVec<'lt> { |
300 | IntoVec { |
301 | encoder: self, |
302 | vector: vec, |
303 | } |
304 | } |
305 | |
306 | /// Mark the encoding as in the process of finishing. |
307 | /// |
308 | /// The next following call to `encode_bytes` which is able to consume the complete input will |
309 | /// also try to emit an end code. It's not recommended, but also not unsound, to use different |
310 | /// byte slices in different calls from this point forward and thus to 'delay' the actual end |
311 | /// of the data stream. The behaviour after the end marker has been written is unspecified but |
312 | /// sound. |
313 | pub fn finish(&mut self) { |
314 | self.state.mark_ended(); |
315 | } |
316 | |
317 | /// Undo marking this data stream as ending. |
318 | /// FIXME: clarify how this interacts with padding introduced after end code. |
319 | #[allow (dead_code)] |
320 | pub(crate) fn restart(&mut self) { |
321 | self.state.restart() |
322 | } |
323 | |
324 | /// Reset all internal state. |
325 | /// |
326 | /// This produce an encoder as if just constructed with `new` but taking slightly less work. In |
327 | /// particular it will not deallocate any internal allocations. It will also avoid some |
328 | /// duplicate setup work. |
329 | pub fn reset(&mut self) { |
330 | self.state.reset() |
331 | } |
332 | } |
333 | |
334 | #[cfg (feature = "std" )] |
335 | impl<'d, W: Write> IntoStream<'d, W> { |
336 | /// Encode data from a reader. |
337 | /// |
338 | /// This will drain the supplied reader. It will not encode an end marker after all data has |
339 | /// been processed. |
340 | pub fn encode(&mut self, read: impl BufRead) -> StreamResult { |
341 | self.encode_part(read, false) |
342 | } |
343 | |
344 | /// Encode data from a reader and an end marker. |
345 | pub fn encode_all(mut self, read: impl BufRead) -> StreamResult { |
346 | self.encode_part(read, true) |
347 | } |
348 | |
349 | /// Set the size of the intermediate encode buffer. |
350 | /// |
351 | /// A buffer of this size is allocated to hold one part of the encoded stream when no buffer is |
352 | /// available and any encoding method is called. No buffer is allocated if `set_buffer` has |
353 | /// been called. The buffer is reused. |
354 | /// |
355 | /// # Panics |
356 | /// This method panics if `size` is `0`. |
357 | pub fn set_buffer_size(&mut self, size: usize) { |
358 | assert_ne!(size, 0, "Attempted to set empty buffer" ); |
359 | self.default_size = size; |
360 | } |
361 | |
362 | /// Use a particular buffer as an intermediate encode buffer. |
363 | /// |
364 | /// Calling this sets or replaces the buffer. When a buffer has been set then it is used |
365 | /// instead of a dynamically allocating a buffer. Note that the size of the buffer is relevant |
366 | /// for efficient encoding as there is additional overhead from `write` calls each time the |
367 | /// buffer has been filled. |
368 | /// |
369 | /// # Panics |
370 | /// This method panics if the `buffer` is empty. |
371 | pub fn set_buffer(&mut self, buffer: &'d mut [u8]) { |
372 | assert_ne!(buffer.len(), 0, "Attempted to set empty buffer" ); |
373 | self.buffer = Some(StreamBuf::Borrowed(buffer)); |
374 | } |
375 | |
376 | fn encode_part(&mut self, mut read: impl BufRead, finish: bool) -> StreamResult { |
377 | let IntoStream { |
378 | encoder, |
379 | writer, |
380 | buffer, |
381 | default_size, |
382 | } = self; |
383 | enum Progress { |
384 | Ok, |
385 | Done, |
386 | } |
387 | |
388 | let mut bytes_read = 0; |
389 | let mut bytes_written = 0; |
390 | |
391 | let read_bytes = &mut bytes_read; |
392 | let write_bytes = &mut bytes_written; |
393 | |
394 | let outbuf: &mut [u8] = |
395 | match { buffer.get_or_insert_with(|| StreamBuf::Owned(vec![0u8; *default_size])) } { |
396 | StreamBuf::Borrowed(slice) => &mut *slice, |
397 | StreamBuf::Owned(vec) => &mut *vec, |
398 | }; |
399 | assert!(!outbuf.is_empty()); |
400 | |
401 | let once = move || { |
402 | let data = read.fill_buf()?; |
403 | |
404 | if data.is_empty() { |
405 | if finish { |
406 | encoder.finish(); |
407 | } else { |
408 | return Ok(Progress::Done); |
409 | } |
410 | } |
411 | |
412 | let result = encoder.encode_bytes(data, &mut outbuf[..]); |
413 | *read_bytes += result.consumed_in; |
414 | *write_bytes += result.consumed_out; |
415 | read.consume(result.consumed_in); |
416 | |
417 | let done = result.status.map_err(|err| { |
418 | io::Error::new(io::ErrorKind::InvalidData, &*format!(" {:?}" , err)) |
419 | })?; |
420 | |
421 | if let LzwStatus::Done = done { |
422 | writer.write_all(&outbuf[..result.consumed_out])?; |
423 | return Ok(Progress::Done); |
424 | } |
425 | |
426 | if let LzwStatus::NoProgress = done { |
427 | return Err(io::Error::new( |
428 | io::ErrorKind::UnexpectedEof, |
429 | "No more data but no end marker detected" , |
430 | )); |
431 | } |
432 | |
433 | writer.write_all(&outbuf[..result.consumed_out])?; |
434 | Ok(Progress::Ok) |
435 | }; |
436 | |
437 | let status = core::iter::repeat_with(once) |
438 | // scan+fuse can be replaced with map_while |
439 | .scan((), |(), result| match result { |
440 | Ok(Progress::Ok) => Some(Ok(())), |
441 | Err(err) => Some(Err(err)), |
442 | Ok(Progress::Done) => None, |
443 | }) |
444 | .fuse() |
445 | .collect(); |
446 | |
447 | StreamResult { |
448 | bytes_read, |
449 | bytes_written, |
450 | status, |
451 | } |
452 | } |
453 | } |
454 | |
455 | impl IntoVec<'_> { |
456 | /// Encode data from a slice. |
457 | pub fn encode(&mut self, read: &[u8]) -> VectorResult { |
458 | self.encode_part(read, false) |
459 | } |
460 | |
461 | /// Decode data from a reader, adding an end marker. |
462 | pub fn encode_all(mut self, read: &[u8]) -> VectorResult { |
463 | self.encode_part(read, true) |
464 | } |
465 | |
466 | fn grab_buffer(&mut self) -> (&mut [u8], &mut Encoder) { |
467 | const CHUNK_SIZE: usize = 1 << 12; |
468 | let decoder = &mut self.encoder; |
469 | let length = self.vector.len(); |
470 | |
471 | // Use the vector to do overflow checks and w/e. |
472 | self.vector.reserve(CHUNK_SIZE); |
473 | // FIXME: encoding into uninit buffer? |
474 | self.vector.resize(length + CHUNK_SIZE, 0u8); |
475 | |
476 | (&mut self.vector[length..], decoder) |
477 | } |
478 | |
479 | fn encode_part(&mut self, part: &[u8], finish: bool) -> VectorResult { |
480 | let mut result = VectorResult { |
481 | consumed_in: 0, |
482 | consumed_out: 0, |
483 | status: Ok(LzwStatus::Ok), |
484 | }; |
485 | |
486 | enum Progress { |
487 | Ok, |
488 | Done, |
489 | } |
490 | |
491 | // Converting to mutable refs to move into the `once` closure. |
492 | let read_bytes = &mut result.consumed_in; |
493 | let write_bytes = &mut result.consumed_out; |
494 | let mut data = part; |
495 | |
496 | // A 64 MB buffer is quite large but should get alloc_zeroed. |
497 | // Note that the decoded size can be up to quadratic in code block. |
498 | let once = move || { |
499 | // Grab a new output buffer. |
500 | let (outbuf, encoder) = self.grab_buffer(); |
501 | |
502 | if finish { |
503 | encoder.finish(); |
504 | } |
505 | |
506 | // Decode as much of the buffer as fits. |
507 | let result = encoder.encode_bytes(data, &mut outbuf[..]); |
508 | // Do the bookkeeping and consume the buffer. |
509 | *read_bytes += result.consumed_in; |
510 | *write_bytes += result.consumed_out; |
511 | data = &data[result.consumed_in..]; |
512 | |
513 | let unfilled = outbuf.len() - result.consumed_out; |
514 | let filled = self.vector.len() - unfilled; |
515 | self.vector.truncate(filled); |
516 | |
517 | // Handle the status in the result. |
518 | let done = result.status?; |
519 | if let LzwStatus::Done = done { |
520 | Ok(Progress::Done) |
521 | } else { |
522 | Ok(Progress::Ok) |
523 | } |
524 | }; |
525 | |
526 | // Decode chunks of input data until we're done. |
527 | let status: Result<(), _> = core::iter::repeat_with(once) |
528 | // scan+fuse can be replaced with map_while |
529 | .scan((), |(), result| match result { |
530 | Ok(Progress::Ok) => Some(Ok(())), |
531 | Err(err) => Some(Err(err)), |
532 | Ok(Progress::Done) => None, |
533 | }) |
534 | .fuse() |
535 | .collect(); |
536 | |
537 | if let Err(err) = status { |
538 | result.status = Err(err); |
539 | } |
540 | |
541 | result |
542 | } |
543 | } |
544 | |
545 | // This is implemented in a separate file, so that 1.34.2 does not parse it. Otherwise, it would |
546 | // trip over the usage of await, which is a reserved keyword in that edition/version. It only |
547 | // contains an impl block. |
548 | #[cfg (feature = "async" )] |
549 | #[path = "encode_into_async.rs" ] |
550 | mod impl_encode_into_async; |
551 | |
552 | impl<B: Buffer> EncodeState<B> { |
553 | fn new(min_size: u8) -> Self { |
554 | let clear_code: u16 = 1 << min_size; |
555 | let mut tree: Tree = Tree::default(); |
556 | tree.init(min_size); |
557 | let mut state: EncodeState = EncodeState { |
558 | min_size, |
559 | tree, |
560 | has_ended: false, |
561 | is_tiff: false, |
562 | current_code: clear_code, |
563 | clear_code, |
564 | buffer: B::new(min_size), |
565 | }; |
566 | state.buffer_code(clear_code); |
567 | state |
568 | } |
569 | } |
570 | |
571 | impl<B: Buffer> Stateful for EncodeState<B> { |
572 | fn advance(&mut self, mut inp: &[u8], mut out: &mut [u8]) -> BufferResult { |
573 | let c_in = inp.len(); |
574 | let c_out = out.len(); |
575 | let mut status = Ok(LzwStatus::Ok); |
576 | |
577 | 'encoding: loop { |
578 | if self.push_out(&mut out) { |
579 | break; |
580 | } |
581 | |
582 | if inp.is_empty() && self.has_ended { |
583 | let end = self.end_code(); |
584 | if self.current_code != end { |
585 | if self.current_code != self.clear_code { |
586 | self.buffer_code(self.current_code); |
587 | |
588 | // When reading this code, the decoder will add an extra entry to its table |
589 | // before reading th end code. Thusly, it may increase its code size based |
590 | // on this additional entry. |
591 | if self.tree.keys.len() + usize::from(self.is_tiff) |
592 | > usize::from(self.buffer.max_code()) |
593 | && self.buffer.code_size() < MAX_CODESIZE |
594 | { |
595 | self.buffer.bump_code_size(); |
596 | } |
597 | } |
598 | self.buffer_code(end); |
599 | self.current_code = end; |
600 | self.buffer_pad(); |
601 | } |
602 | |
603 | break; |
604 | } |
605 | |
606 | let mut next_code = None; |
607 | let mut bytes = inp.iter(); |
608 | while let Some(&byte) = bytes.next() { |
609 | if self.min_size < 8 && byte >= 1 << self.min_size { |
610 | status = Err(LzwError::InvalidCode); |
611 | break 'encoding; |
612 | } |
613 | |
614 | inp = bytes.as_slice(); |
615 | match self.tree.iterate(self.current_code, byte) { |
616 | Ok(code) => self.current_code = code, |
617 | Err(_) => { |
618 | next_code = Some(self.current_code); |
619 | |
620 | self.current_code = u16::from(byte); |
621 | break; |
622 | } |
623 | } |
624 | } |
625 | |
626 | match next_code { |
627 | // No more bytes, no code produced. |
628 | None => break, |
629 | Some(code) => { |
630 | self.buffer_code(code); |
631 | |
632 | if self.tree.keys.len() + usize::from(self.is_tiff) |
633 | > usize::from(self.buffer.max_code()) + 1 |
634 | && self.buffer.code_size() < MAX_CODESIZE |
635 | { |
636 | self.buffer.bump_code_size(); |
637 | } |
638 | |
639 | if self.tree.keys.len() > MAX_ENTRIES { |
640 | self.buffer_code(self.clear_code); |
641 | self.tree.reset(self.min_size); |
642 | self.buffer.clear(self.min_size); |
643 | } |
644 | } |
645 | } |
646 | } |
647 | |
648 | if inp.is_empty() && self.current_code == self.end_code() { |
649 | if !self.flush_out(&mut out) { |
650 | status = Ok(LzwStatus::Done); |
651 | } |
652 | } |
653 | |
654 | BufferResult { |
655 | consumed_in: c_in - inp.len(), |
656 | consumed_out: c_out - out.len(), |
657 | status, |
658 | } |
659 | } |
660 | |
661 | fn mark_ended(&mut self) -> bool { |
662 | core::mem::replace(&mut self.has_ended, true) |
663 | } |
664 | |
665 | fn restart(&mut self) { |
666 | self.has_ended = false; |
667 | } |
668 | |
669 | fn reset(&mut self) { |
670 | self.restart(); |
671 | self.current_code = self.clear_code; |
672 | self.tree.reset(self.min_size); |
673 | self.buffer.reset(self.min_size); |
674 | self.buffer_code(self.clear_code); |
675 | } |
676 | } |
677 | |
678 | impl<B: Buffer> EncodeState<B> { |
679 | fn push_out(&mut self, out: &mut &mut [u8]) -> bool { |
680 | self.buffer.push_out(out) |
681 | } |
682 | |
683 | fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { |
684 | self.buffer.flush_out(out) |
685 | } |
686 | |
687 | fn end_code(&self) -> Code { |
688 | self.clear_code + 1 |
689 | } |
690 | |
691 | fn buffer_pad(&mut self) { |
692 | self.buffer.buffer_pad(); |
693 | } |
694 | |
695 | fn buffer_code(&mut self, code: Code) { |
696 | self.buffer.buffer_code(code); |
697 | } |
698 | } |
699 | |
700 | impl Buffer for MsbBuffer { |
701 | fn new(min_size: u8) -> Self { |
702 | MsbBuffer { |
703 | code_size: min_size + 1, |
704 | buffer: 0, |
705 | bits_in_buffer: 0, |
706 | } |
707 | } |
708 | |
709 | fn reset(&mut self, min_size: u8) { |
710 | self.code_size = min_size + 1; |
711 | self.buffer = 0; |
712 | self.bits_in_buffer = 0; |
713 | } |
714 | |
715 | fn clear(&mut self, min_size: u8) { |
716 | self.code_size = min_size + 1; |
717 | } |
718 | |
719 | fn buffer_code(&mut self, code: Code) { |
720 | let shift = 64 - self.bits_in_buffer - self.code_size; |
721 | self.buffer |= u64::from(code) << shift; |
722 | self.bits_in_buffer += self.code_size; |
723 | } |
724 | |
725 | fn push_out(&mut self, out: &mut &mut [u8]) -> bool { |
726 | if self.bits_in_buffer + 2 * self.code_size < 64 { |
727 | return false; |
728 | } |
729 | |
730 | self.flush_out(out) |
731 | } |
732 | |
733 | fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { |
734 | let want = usize::from(self.bits_in_buffer / 8); |
735 | let count = want.min((*out).len()); |
736 | let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); |
737 | *out = tail; |
738 | |
739 | for b in bytes { |
740 | *b = ((self.buffer & 0xff00_0000_0000_0000) >> 56) as u8; |
741 | self.buffer <<= 8; |
742 | self.bits_in_buffer -= 8; |
743 | } |
744 | |
745 | count < want |
746 | } |
747 | |
748 | fn buffer_pad(&mut self) { |
749 | let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; |
750 | self.bits_in_buffer += to_byte; |
751 | } |
752 | |
753 | fn bump_code_size(&mut self) { |
754 | self.code_size += 1; |
755 | } |
756 | |
757 | fn max_code(&self) -> Code { |
758 | (1 << self.code_size) - 1 |
759 | } |
760 | |
761 | fn code_size(&self) -> u8 { |
762 | self.code_size |
763 | } |
764 | } |
765 | |
766 | impl Buffer for LsbBuffer { |
767 | fn new(min_size: u8) -> Self { |
768 | LsbBuffer { |
769 | code_size: min_size + 1, |
770 | buffer: 0, |
771 | bits_in_buffer: 0, |
772 | } |
773 | } |
774 | |
775 | fn reset(&mut self, min_size: u8) { |
776 | self.code_size = min_size + 1; |
777 | self.buffer = 0; |
778 | self.bits_in_buffer = 0; |
779 | } |
780 | |
781 | fn clear(&mut self, min_size: u8) { |
782 | self.code_size = min_size + 1; |
783 | } |
784 | |
785 | fn buffer_code(&mut self, code: Code) { |
786 | self.buffer |= u64::from(code) << self.bits_in_buffer; |
787 | self.bits_in_buffer += self.code_size; |
788 | } |
789 | |
790 | fn push_out(&mut self, out: &mut &mut [u8]) -> bool { |
791 | if self.bits_in_buffer + 2 * self.code_size < 64 { |
792 | return false; |
793 | } |
794 | |
795 | self.flush_out(out) |
796 | } |
797 | |
798 | fn flush_out(&mut self, out: &mut &mut [u8]) -> bool { |
799 | let want = usize::from(self.bits_in_buffer / 8); |
800 | let count = want.min((*out).len()); |
801 | let (bytes, tail) = core::mem::replace(out, &mut []).split_at_mut(count); |
802 | *out = tail; |
803 | |
804 | for b in bytes { |
805 | *b = (self.buffer & 0x0000_0000_0000_00ff) as u8; |
806 | self.buffer >>= 8; |
807 | self.bits_in_buffer -= 8; |
808 | } |
809 | |
810 | count < want |
811 | } |
812 | |
813 | fn buffer_pad(&mut self) { |
814 | let to_byte = self.bits_in_buffer.wrapping_neg() & 0x7; |
815 | self.bits_in_buffer += to_byte; |
816 | } |
817 | |
818 | fn bump_code_size(&mut self) { |
819 | self.code_size += 1; |
820 | } |
821 | |
822 | fn max_code(&self) -> Code { |
823 | (1 << self.code_size) - 1 |
824 | } |
825 | |
826 | fn code_size(&self) -> u8 { |
827 | self.code_size |
828 | } |
829 | } |
830 | |
831 | impl Tree { |
832 | fn init(&mut self, min_size: u8) { |
833 | // We need a way to represent the state of a currently empty buffer. We use the clear code |
834 | // for this, thus create one complex mapping that leads to the one-char base codes. |
835 | self.keys |
836 | .resize((1 << min_size) + 2, FullKey::NoSuccessor.into()); |
837 | self.complex.push(Full { |
838 | char_continuation: [0; 256], |
839 | }); |
840 | let map_of_begin = self.complex.last_mut().unwrap(); |
841 | for ch in 0u16..256 { |
842 | map_of_begin.char_continuation[usize::from(ch)] = ch; |
843 | } |
844 | self.keys[1 << min_size] = FullKey::Full(0).into(); |
845 | } |
846 | |
847 | fn reset(&mut self, min_size: u8) { |
848 | self.simples.clear(); |
849 | self.keys.truncate((1 << min_size) + 2); |
850 | // Keep entry for clear code. |
851 | self.complex.truncate(1); |
852 | // The first complex is not changed.. |
853 | for k in self.keys[..(1 << min_size) + 2].iter_mut() { |
854 | *k = FullKey::NoSuccessor.into(); |
855 | } |
856 | self.keys[1 << min_size] = FullKey::Full(0).into(); |
857 | } |
858 | |
859 | fn at_key(&self, code: Code, ch: u8) -> Option<Code> { |
860 | let key = self.keys[usize::from(code)]; |
861 | match FullKey::from(key) { |
862 | FullKey::NoSuccessor => None, |
863 | FullKey::Simple(idx) => { |
864 | let nexts = &self.simples[usize::from(idx)]; |
865 | let successors = nexts |
866 | .codes |
867 | .iter() |
868 | .zip(nexts.chars.iter()) |
869 | .take(usize::from(nexts.count)); |
870 | for (&scode, &sch) in successors { |
871 | if sch == ch { |
872 | return Some(scode); |
873 | } |
874 | } |
875 | |
876 | None |
877 | } |
878 | FullKey::Full(idx) => { |
879 | let full = &self.complex[usize::from(idx)]; |
880 | let precode = full.char_continuation[usize::from(ch)]; |
881 | if usize::from(precode) < MAX_ENTRIES { |
882 | Some(precode) |
883 | } else { |
884 | None |
885 | } |
886 | } |
887 | } |
888 | } |
889 | |
890 | /// Iterate to the next char. |
891 | /// Return Ok when it was already in the tree or creates a new entry for it and returns Err. |
892 | fn iterate(&mut self, code: Code, ch: u8) -> Result<Code, Code> { |
893 | if let Some(next) = self.at_key(code, ch) { |
894 | Ok(next) |
895 | } else { |
896 | Err(self.append(code, ch)) |
897 | } |
898 | } |
899 | |
900 | fn append(&mut self, code: Code, ch: u8) -> Code { |
901 | let next: Code = self.keys.len() as u16; |
902 | let key = self.keys[usize::from(code)]; |
903 | // TODO: with debug assertions, check for non-existence |
904 | match FullKey::from(key) { |
905 | FullKey::NoSuccessor => { |
906 | let new_key = FullKey::Simple(self.simples.len() as u16); |
907 | self.simples.push(Simple::default()); |
908 | let simples = self.simples.last_mut().unwrap(); |
909 | simples.codes[0] = next; |
910 | simples.chars[0] = ch; |
911 | simples.count = 1; |
912 | self.keys[usize::from(code)] = new_key.into(); |
913 | } |
914 | FullKey::Simple(idx) if usize::from(self.simples[usize::from(idx)].count) < SHORT => { |
915 | let nexts = &mut self.simples[usize::from(idx)]; |
916 | let nidx = usize::from(nexts.count); |
917 | nexts.chars[nidx] = ch; |
918 | nexts.codes[nidx] = next; |
919 | nexts.count += 1; |
920 | } |
921 | FullKey::Simple(idx) => { |
922 | let new_key = FullKey::Full(self.complex.len() as u16); |
923 | let simples = &self.simples[usize::from(idx)]; |
924 | self.complex.push(Full { |
925 | char_continuation: [Code::max_value(); 256], |
926 | }); |
927 | let full = self.complex.last_mut().unwrap(); |
928 | for (&pch, &pcont) in simples.chars.iter().zip(simples.codes.iter()) { |
929 | full.char_continuation[usize::from(pch)] = pcont; |
930 | } |
931 | self.keys[usize::from(code)] = new_key.into(); |
932 | } |
933 | FullKey::Full(idx) => { |
934 | let full = &mut self.complex[usize::from(idx)]; |
935 | full.char_continuation[usize::from(ch)] = next; |
936 | } |
937 | } |
938 | self.keys.push(FullKey::NoSuccessor.into()); |
939 | next |
940 | } |
941 | } |
942 | |
943 | impl Default for FullKey { |
944 | fn default() -> Self { |
945 | FullKey::NoSuccessor |
946 | } |
947 | } |
948 | |
949 | impl Default for Simple { |
950 | fn default() -> Self { |
951 | Simple { |
952 | codes: [0; SHORT], |
953 | chars: [0; SHORT], |
954 | count: 0, |
955 | } |
956 | } |
957 | } |
958 | |
959 | impl From<CompressedKey> for FullKey { |
960 | fn from(CompressedKey(key: u16): CompressedKey) -> Self { |
961 | match (key >> MAX_CODESIZE) & 0xf { |
962 | 0 => FullKey::Full(key & 0xfff), |
963 | 1 => FullKey::Simple(key & 0xfff), |
964 | _ => FullKey::NoSuccessor, |
965 | } |
966 | } |
967 | } |
968 | |
969 | impl From<FullKey> for CompressedKey { |
970 | fn from(full: FullKey) -> Self { |
971 | CompressedKey(match full { |
972 | FullKey::NoSuccessor => 0x2000, |
973 | FullKey::Simple(code: u16) => 0x1000 | code, |
974 | FullKey::Full(code: u16) => code, |
975 | }) |
976 | } |
977 | } |
978 | |
979 | #[cfg (test)] |
980 | mod tests { |
981 | use super::{BitOrder, Encoder, LzwError, LzwStatus}; |
982 | use crate::alloc::vec::Vec; |
983 | use crate::decode::Decoder; |
984 | #[cfg (feature = "std" )] |
985 | use crate::StreamBuf; |
986 | |
987 | #[test ] |
988 | fn invalid_input_rejected() { |
989 | const BIT_LEN: u8 = 2; |
990 | let ref input = [0, 1 << BIT_LEN /* invalid */, 0]; |
991 | let ref mut target = [0u8; 128]; |
992 | let mut encoder = Encoder::new(BitOrder::Msb, BIT_LEN); |
993 | |
994 | encoder.finish(); |
995 | // We require simulation of normality, that is byte-for-byte compression. |
996 | let result = encoder.encode_bytes(input, target); |
997 | assert!(if let Err(LzwError::InvalidCode) = result.status { |
998 | true |
999 | } else { |
1000 | false |
1001 | }); |
1002 | assert_eq!(result.consumed_in, 1); |
1003 | |
1004 | let fixed = encoder.encode_bytes(&[1, 0], &mut target[result.consumed_out..]); |
1005 | assert!(if let Ok(LzwStatus::Done) = fixed.status { |
1006 | true |
1007 | } else { |
1008 | false |
1009 | }); |
1010 | assert_eq!(fixed.consumed_in, 2); |
1011 | |
1012 | // Okay, now test we actually fixed it. |
1013 | let ref mut compare = [0u8; 4]; |
1014 | let mut todo = &target[..result.consumed_out + fixed.consumed_out]; |
1015 | let mut free = &mut compare[..]; |
1016 | let mut decoder = Decoder::new(BitOrder::Msb, BIT_LEN); |
1017 | |
1018 | // Decode with up to 16 rounds, far too much but inconsequential. |
1019 | for _ in 0..16 { |
1020 | if decoder.has_ended() { |
1021 | break; |
1022 | } |
1023 | |
1024 | let result = decoder.decode_bytes(todo, free); |
1025 | assert!(result.status.is_ok()); |
1026 | todo = &todo[result.consumed_in..]; |
1027 | free = &mut free[result.consumed_out..]; |
1028 | } |
1029 | |
1030 | let remaining = { free }.len(); |
1031 | let len = compare.len() - remaining; |
1032 | assert_eq!(todo, &[]); |
1033 | assert_eq!(compare[..len], [0, 1, 0]); |
1034 | } |
1035 | |
1036 | #[test ] |
1037 | #[should_panic ] |
1038 | fn invalid_code_size_low() { |
1039 | let _ = Encoder::new(BitOrder::Msb, 1); |
1040 | } |
1041 | |
1042 | #[test ] |
1043 | #[should_panic ] |
1044 | fn invalid_code_size_high() { |
1045 | let _ = Encoder::new(BitOrder::Msb, 14); |
1046 | } |
1047 | |
1048 | fn make_decoded() -> Vec<u8> { |
1049 | const FILE: &'static [u8] = |
1050 | include_bytes!(concat!(env!("CARGO_MANIFEST_DIR" ), "/Cargo.lock" )); |
1051 | return Vec::from(FILE); |
1052 | } |
1053 | |
1054 | #[test ] |
1055 | #[cfg (feature = "std" )] |
1056 | fn into_stream_buffer_no_alloc() { |
1057 | let encoded = make_decoded(); |
1058 | let mut encoder = Encoder::new(BitOrder::Msb, 8); |
1059 | |
1060 | let mut output = vec![]; |
1061 | let mut buffer = [0; 512]; |
1062 | let mut istream = encoder.into_stream(&mut output); |
1063 | istream.set_buffer(&mut buffer[..]); |
1064 | istream.encode(&encoded[..]).status.unwrap(); |
1065 | |
1066 | match istream.buffer { |
1067 | Some(StreamBuf::Borrowed(_)) => {} |
1068 | None => panic!("Decoded without buffer??" ), |
1069 | Some(StreamBuf::Owned(_)) => panic!("Unexpected buffer allocation" ), |
1070 | } |
1071 | } |
1072 | |
1073 | #[test ] |
1074 | #[cfg (feature = "std" )] |
1075 | fn into_stream_buffer_small_alloc() { |
1076 | struct WriteTap<W: std::io::Write>(W); |
1077 | const BUF_SIZE: usize = 512; |
1078 | |
1079 | impl<W: std::io::Write> std::io::Write for WriteTap<W> { |
1080 | fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> { |
1081 | assert!(buf.len() <= BUF_SIZE); |
1082 | self.0.write(buf) |
1083 | } |
1084 | fn flush(&mut self) -> std::io::Result<()> { |
1085 | self.0.flush() |
1086 | } |
1087 | } |
1088 | |
1089 | let encoded = make_decoded(); |
1090 | let mut encoder = Encoder::new(BitOrder::Msb, 8); |
1091 | |
1092 | let mut output = vec![]; |
1093 | let mut istream = encoder.into_stream(WriteTap(&mut output)); |
1094 | istream.set_buffer_size(512); |
1095 | istream.encode(&encoded[..]).status.unwrap(); |
1096 | |
1097 | match istream.buffer { |
1098 | Some(StreamBuf::Owned(vec)) => assert!(vec.len() <= BUF_SIZE), |
1099 | Some(StreamBuf::Borrowed(_)) => panic!("Unexpected borrowed buffer, where from?" ), |
1100 | None => panic!("Decoded without buffer??" ), |
1101 | } |
1102 | } |
1103 | |
1104 | #[test ] |
1105 | #[cfg (feature = "std" )] |
1106 | fn reset() { |
1107 | let encoded = make_decoded(); |
1108 | let mut encoder = Encoder::new(BitOrder::Msb, 8); |
1109 | let mut reference = None; |
1110 | |
1111 | for _ in 0..2 { |
1112 | let mut output = vec![]; |
1113 | let mut buffer = [0; 512]; |
1114 | let mut istream = encoder.into_stream(&mut output); |
1115 | istream.set_buffer(&mut buffer[..]); |
1116 | istream.encode_all(&encoded[..]).status.unwrap(); |
1117 | |
1118 | encoder.reset(); |
1119 | if let Some(reference) = &reference { |
1120 | assert_eq!(output, *reference); |
1121 | } else { |
1122 | reference = Some(output); |
1123 | } |
1124 | } |
1125 | } |
1126 | } |
1127 | |