| 1 | mod lossy; |
| 2 | mod read; |
| 3 | |
| 4 | pub use lossy::LossyDecoder; |
| 5 | pub use read::{BufReadDecoder, BufReadDecoderError}; |
| 6 | |
| 7 | use std::cmp; |
| 8 | use std::error::Error; |
| 9 | use std::fmt; |
| 10 | use std::str; |
| 11 | |
/// The replacement character, U+FFFD, as a one-character string.
///
/// In lossy decoding, insert it once for every decoding error.
// The literal must contain nothing but U+FFFD: any extra character (such as a
// leading space) would be inserted into the output on every decoding error.
pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}";
| 14 | |
| 15 | #[derive (Debug, Copy, Clone)] |
| 16 | pub enum DecodeError<'a> { |
| 17 | /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`, |
| 18 | /// then call `decode()` again with `remaining_input`. |
| 19 | Invalid { |
| 20 | valid_prefix: &'a str, |
| 21 | invalid_sequence: &'a [u8], |
| 22 | remaining_input: &'a [u8], |
| 23 | }, |
| 24 | |
| 25 | /// Call the `incomplete_suffix.try_complete` method with more input when available. |
| 26 | /// If no more input is available, this is an invalid byte sequence. |
| 27 | Incomplete { |
| 28 | valid_prefix: &'a str, |
| 29 | incomplete_suffix: Incomplete, |
| 30 | }, |
| 31 | } |
| 32 | |
| 33 | impl<'a> fmt::Display for DecodeError<'a> { |
| 34 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 35 | match *self { |
| 36 | DecodeError::Invalid { |
| 37 | valid_prefix, |
| 38 | invalid_sequence, |
| 39 | remaining_input, |
| 40 | } => write!( |
| 41 | f, |
| 42 | "found invalid byte sequence {invalid_sequence:02x?} after \ |
| 43 | {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \ |
| 44 | unprocessed bytes" , |
| 45 | invalid_sequence = invalid_sequence, |
| 46 | valid_byte_count = valid_prefix.len(), |
| 47 | unprocessed_byte_count = remaining_input.len() |
| 48 | ), |
| 49 | DecodeError::Incomplete { |
| 50 | valid_prefix, |
| 51 | incomplete_suffix, |
| 52 | } => write!( |
| 53 | f, |
| 54 | "found incomplete byte sequence {incomplete_suffix:02x?} after \ |
| 55 | {valid_byte_count} bytes" , |
| 56 | incomplete_suffix = incomplete_suffix, |
| 57 | valid_byte_count = valid_prefix.len() |
| 58 | ), |
| 59 | } |
| 60 | } |
| 61 | } |
| 62 | |
| 63 | impl<'a> Error for DecodeError<'a> {} |
| 64 | |
/// The first bytes of a byte sequence whose remaining bytes have not been seen yet.
#[derive(Clone, Copy, Debug)]
pub struct Incomplete {
    /// Fixed four-byte storage; only the first `buffer_len` bytes are in use.
    pub buffer: [u8; 4],
    /// How many bytes of `buffer` are currently in use.
    pub buffer_len: u8,
}
| 70 | |
| 71 | pub fn decode(input: &[u8]) -> Result<&str, DecodeError> { |
| 72 | let error = match str::from_utf8(input) { |
| 73 | Ok(valid) => return Ok(valid), |
| 74 | Err(error) => error, |
| 75 | }; |
| 76 | |
| 77 | // FIXME: separate function from here to guide inlining? |
| 78 | let (valid, after_valid) = input.split_at(error.valid_up_to()); |
| 79 | let valid = unsafe { |
| 80 | str::from_utf8_unchecked(valid) |
| 81 | }; |
| 82 | |
| 83 | match error.error_len() { |
| 84 | Some(invalid_sequence_length) => { |
| 85 | let (invalid, rest) = after_valid.split_at(invalid_sequence_length); |
| 86 | Err(DecodeError::Invalid { |
| 87 | valid_prefix: valid, |
| 88 | invalid_sequence: invalid, |
| 89 | remaining_input: rest |
| 90 | }) |
| 91 | } |
| 92 | None => { |
| 93 | Err(DecodeError::Incomplete { |
| 94 | valid_prefix: valid, |
| 95 | incomplete_suffix: Incomplete::new(after_valid), |
| 96 | }) |
| 97 | } |
| 98 | } |
| 99 | } |
| 100 | |
| 101 | impl Incomplete { |
| 102 | pub fn empty() -> Self { |
| 103 | Incomplete { |
| 104 | buffer: [0, 0, 0, 0], |
| 105 | buffer_len: 0, |
| 106 | } |
| 107 | } |
| 108 | |
| 109 | pub fn is_empty(&self) -> bool { |
| 110 | self.buffer_len == 0 |
| 111 | } |
| 112 | |
| 113 | pub fn new(bytes: &[u8]) -> Self { |
| 114 | let mut buffer = [0, 0, 0, 0]; |
| 115 | let len = bytes.len(); |
| 116 | buffer[..len].copy_from_slice(bytes); |
| 117 | Incomplete { |
| 118 | buffer: buffer, |
| 119 | buffer_len: len as u8, |
| 120 | } |
| 121 | } |
| 122 | |
| 123 | /// * `None`: still incomplete, call `try_complete` again with more input. |
| 124 | /// If no more input is available, this is invalid byte sequence. |
| 125 | /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`. |
| 126 | /// To keep decoding, pass `remaining_input` to `decode()`. |
| 127 | pub fn try_complete<'input>(&mut self, input: &'input [u8]) |
| 128 | -> Option<(Result<&str, &[u8]>, &'input [u8])> { |
| 129 | let (consumed, opt_result) = self.try_complete_offsets(input); |
| 130 | let result = opt_result?; |
| 131 | let remaining_input = &input[consumed..]; |
| 132 | let result_bytes = self.take_buffer(); |
| 133 | let result = match result { |
| 134 | Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }), |
| 135 | Err(()) => Err(result_bytes), |
| 136 | }; |
| 137 | Some((result, remaining_input)) |
| 138 | } |
| 139 | |
| 140 | fn take_buffer(&mut self) -> &[u8] { |
| 141 | let len = self.buffer_len as usize; |
| 142 | self.buffer_len = 0; |
| 143 | &self.buffer[..len as usize] |
| 144 | } |
| 145 | |
| 146 | /// (consumed_from_input, None): not enough input |
| 147 | /// (consumed_from_input, Some(Err(()))): error bytes in buffer |
| 148 | /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer |
| 149 | fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) { |
| 150 | let initial_buffer_len = self.buffer_len as usize; |
| 151 | let copied_from_input; |
| 152 | { |
| 153 | let unwritten = &mut self.buffer[initial_buffer_len..]; |
| 154 | copied_from_input = cmp::min(unwritten.len(), input.len()); |
| 155 | unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); |
| 156 | } |
| 157 | let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; |
| 158 | match str::from_utf8(spliced) { |
| 159 | Ok(_) => { |
| 160 | self.buffer_len = spliced.len() as u8; |
| 161 | (copied_from_input, Some(Ok(()))) |
| 162 | } |
| 163 | Err(error) => { |
| 164 | let valid_up_to = error.valid_up_to(); |
| 165 | if valid_up_to > 0 { |
| 166 | let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); |
| 167 | self.buffer_len = valid_up_to as u8; |
| 168 | (consumed, Some(Ok(()))) |
| 169 | } else { |
| 170 | match error.error_len() { |
| 171 | Some(invalid_sequence_length) => { |
| 172 | let consumed = invalid_sequence_length |
| 173 | .checked_sub(initial_buffer_len).unwrap(); |
| 174 | self.buffer_len = invalid_sequence_length as u8; |
| 175 | (consumed, Some(Err(()))) |
| 176 | } |
| 177 | None => { |
| 178 | self.buffer_len = spliced.len() as u8; |
| 179 | (copied_from_input, None) |
| 180 | } |
| 181 | } |
| 182 | } |
| 183 | } |
| 184 | } |
| 185 | } |
| 186 | } |
| 187 | |