1 | mod lossy; |
2 | mod read; |
3 | |
4 | pub use lossy::LossyDecoder; |
5 | pub use read::{BufReadDecoder, BufReadDecoderError}; |
6 | |
7 | use std::cmp; |
8 | use std::error::Error; |
9 | use std::fmt; |
10 | use std::str; |
11 | |
/// The replacement character, U+FFFD, as a one-character string.
///
/// In lossy decoding, insert it once for every decoding error. The string holds exactly
/// that one code point, with no surrounding whitespace.
pub const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
14 | |
15 | #[derive (Debug, Copy, Clone)] |
16 | pub enum DecodeError<'a> { |
17 | /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`, |
18 | /// then call `decode()` again with `remaining_input`. |
19 | Invalid { |
20 | valid_prefix: &'a str, |
21 | invalid_sequence: &'a [u8], |
22 | remaining_input: &'a [u8], |
23 | }, |
24 | |
25 | /// Call the `incomplete_suffix.try_complete` method with more input when available. |
26 | /// If no more input is available, this is an invalid byte sequence. |
27 | Incomplete { |
28 | valid_prefix: &'a str, |
29 | incomplete_suffix: Incomplete, |
30 | }, |
31 | } |
32 | |
33 | impl<'a> fmt::Display for DecodeError<'a> { |
34 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
35 | match *self { |
36 | DecodeError::Invalid { |
37 | valid_prefix, |
38 | invalid_sequence, |
39 | remaining_input, |
40 | } => write!( |
41 | f, |
42 | "found invalid byte sequence {invalid_sequence:02x?} after \ |
43 | {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \ |
44 | unprocessed bytes" , |
45 | invalid_sequence = invalid_sequence, |
46 | valid_byte_count = valid_prefix.len(), |
47 | unprocessed_byte_count = remaining_input.len() |
48 | ), |
49 | DecodeError::Incomplete { |
50 | valid_prefix, |
51 | incomplete_suffix, |
52 | } => write!( |
53 | f, |
54 | "found incomplete byte sequence {incomplete_suffix:02x?} after \ |
55 | {valid_byte_count} bytes" , |
56 | incomplete_suffix = incomplete_suffix, |
57 | valid_byte_count = valid_prefix.len() |
58 | ), |
59 | } |
60 | } |
61 | } |
62 | |
63 | impl<'a> Error for DecodeError<'a> {} |
64 | |
/// Up to four bytes of a UTF-8 sequence that has not been completed yet.
///
/// Feed further input through `try_complete` to finish (or reject) the sequence.
#[derive(Clone, Copy, Debug)]
pub struct Incomplete {
    /// Storage for the pending bytes; only the first `buffer_len` entries are meaningful.
    pub buffer: [u8; 4],
    /// Number of bytes of `buffer` currently in use.
    pub buffer_len: u8,
}
70 | |
71 | pub fn decode(input: &[u8]) -> Result<&str, DecodeError> { |
72 | let error = match str::from_utf8(input) { |
73 | Ok(valid) => return Ok(valid), |
74 | Err(error) => error, |
75 | }; |
76 | |
77 | // FIXME: separate function from here to guide inlining? |
78 | let (valid, after_valid) = input.split_at(error.valid_up_to()); |
79 | let valid = unsafe { |
80 | str::from_utf8_unchecked(valid) |
81 | }; |
82 | |
83 | match error.error_len() { |
84 | Some(invalid_sequence_length) => { |
85 | let (invalid, rest) = after_valid.split_at(invalid_sequence_length); |
86 | Err(DecodeError::Invalid { |
87 | valid_prefix: valid, |
88 | invalid_sequence: invalid, |
89 | remaining_input: rest |
90 | }) |
91 | } |
92 | None => { |
93 | Err(DecodeError::Incomplete { |
94 | valid_prefix: valid, |
95 | incomplete_suffix: Incomplete::new(after_valid), |
96 | }) |
97 | } |
98 | } |
99 | } |
100 | |
101 | impl Incomplete { |
102 | pub fn empty() -> Self { |
103 | Incomplete { |
104 | buffer: [0, 0, 0, 0], |
105 | buffer_len: 0, |
106 | } |
107 | } |
108 | |
109 | pub fn is_empty(&self) -> bool { |
110 | self.buffer_len == 0 |
111 | } |
112 | |
113 | pub fn new(bytes: &[u8]) -> Self { |
114 | let mut buffer = [0, 0, 0, 0]; |
115 | let len = bytes.len(); |
116 | buffer[..len].copy_from_slice(bytes); |
117 | Incomplete { |
118 | buffer: buffer, |
119 | buffer_len: len as u8, |
120 | } |
121 | } |
122 | |
123 | /// * `None`: still incomplete, call `try_complete` again with more input. |
124 | /// If no more input is available, this is invalid byte sequence. |
125 | /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`. |
126 | /// To keep decoding, pass `remaining_input` to `decode()`. |
127 | pub fn try_complete<'input>(&mut self, input: &'input [u8]) |
128 | -> Option<(Result<&str, &[u8]>, &'input [u8])> { |
129 | let (consumed, opt_result) = self.try_complete_offsets(input); |
130 | let result = opt_result?; |
131 | let remaining_input = &input[consumed..]; |
132 | let result_bytes = self.take_buffer(); |
133 | let result = match result { |
134 | Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }), |
135 | Err(()) => Err(result_bytes), |
136 | }; |
137 | Some((result, remaining_input)) |
138 | } |
139 | |
140 | fn take_buffer(&mut self) -> &[u8] { |
141 | let len = self.buffer_len as usize; |
142 | self.buffer_len = 0; |
143 | &self.buffer[..len as usize] |
144 | } |
145 | |
146 | /// (consumed_from_input, None): not enough input |
147 | /// (consumed_from_input, Some(Err(()))): error bytes in buffer |
148 | /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer |
149 | fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) { |
150 | let initial_buffer_len = self.buffer_len as usize; |
151 | let copied_from_input; |
152 | { |
153 | let unwritten = &mut self.buffer[initial_buffer_len..]; |
154 | copied_from_input = cmp::min(unwritten.len(), input.len()); |
155 | unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]); |
156 | } |
157 | let spliced = &self.buffer[..initial_buffer_len + copied_from_input]; |
158 | match str::from_utf8(spliced) { |
159 | Ok(_) => { |
160 | self.buffer_len = spliced.len() as u8; |
161 | (copied_from_input, Some(Ok(()))) |
162 | } |
163 | Err(error) => { |
164 | let valid_up_to = error.valid_up_to(); |
165 | if valid_up_to > 0 { |
166 | let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap(); |
167 | self.buffer_len = valid_up_to as u8; |
168 | (consumed, Some(Ok(()))) |
169 | } else { |
170 | match error.error_len() { |
171 | Some(invalid_sequence_length) => { |
172 | let consumed = invalid_sequence_length |
173 | .checked_sub(initial_buffer_len).unwrap(); |
174 | self.buffer_len = invalid_sequence_length as u8; |
175 | (consumed, Some(Err(()))) |
176 | } |
177 | None => { |
178 | self.buffer_len = spliced.len() as u8; |
179 | (copied_from_input, None) |
180 | } |
181 | } |
182 | } |
183 | } |
184 | } |
185 | } |
186 | } |
187 | |