| 1 | use std::io::{self, BufRead}; |
| 2 | use std::error::Error; |
| 3 | use std::fmt; |
| 4 | use std::str; |
| 5 | use super::*; |
| 6 | |
| 7 | /// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8. |
| 8 | pub struct BufReadDecoder<B: BufRead> { |
| 9 | buf_read: B, |
| 10 | bytes_consumed: usize, |
| 11 | incomplete: Incomplete, |
| 12 | } |
| 13 | |
| 14 | #[derive (Debug)] |
| 15 | pub enum BufReadDecoderError<'a> { |
| 16 | /// Represents one UTF-8 error in the byte stream. |
| 17 | /// |
| 18 | /// In lossy decoding, each such error should be replaced with U+FFFD. |
| 19 | /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.) |
| 20 | InvalidByteSequence(&'a [u8]), |
| 21 | |
| 22 | /// An I/O error from the underlying byte stream |
| 23 | Io(io::Error), |
| 24 | } |
| 25 | |
| 26 | impl<'a> BufReadDecoderError<'a> { |
| 27 | /// Replace UTF-8 errors with U+FFFD |
| 28 | pub fn lossy(self) -> Result<&'static str, io::Error> { |
| 29 | match self { |
| 30 | BufReadDecoderError::Io(error: Error) => Err(error), |
| 31 | BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER), |
| 32 | } |
| 33 | } |
| 34 | } |
| 35 | |
| 36 | impl<'a> fmt::Display for BufReadDecoderError<'a> { |
| 37 | fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { |
| 38 | match *self { |
| 39 | BufReadDecoderError::InvalidByteSequence(bytes: &[u8]) => { |
| 40 | write!(f, "invalid byte sequence: {:02x?}" , bytes) |
| 41 | } |
| 42 | BufReadDecoderError::Io(ref err: &Error) => write!(f, "underlying bytestream error: {}" , err), |
| 43 | } |
| 44 | } |
| 45 | } |
| 46 | |
| 47 | impl<'a> Error for BufReadDecoderError<'a> { |
| 48 | fn source(&self) -> Option<&(dyn Error + 'static)> { |
| 49 | match *self { |
| 50 | BufReadDecoderError::InvalidByteSequence(_) => None, |
| 51 | BufReadDecoderError::Io(ref err: &Error) => Some(err), |
| 52 | } |
| 53 | } |
| 54 | } |
| 55 | |
| 56 | impl<B: BufRead> BufReadDecoder<B> { |
| 57 | /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`. |
| 58 | pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> { |
| 59 | let mut decoder = Self::new(buf_read); |
| 60 | let mut string = String::new(); |
| 61 | while let Some(result) = decoder.next_lossy() { |
| 62 | string.push_str(result?) |
| 63 | } |
| 64 | Ok(string) |
| 65 | } |
| 66 | |
| 67 | pub fn new(buf_read: B) -> Self { |
| 68 | Self { |
| 69 | buf_read, |
| 70 | bytes_consumed: 0, |
| 71 | incomplete: Incomplete::empty(), |
| 72 | } |
| 73 | } |
| 74 | |
| 75 | /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD. |
| 76 | pub fn next_lossy(&mut self) -> Option<io::Result<&str>> { |
| 77 | self.next_strict().map(|result| result.or_else(|e| e.lossy())) |
| 78 | } |
| 79 | |
| 80 | /// Decode and consume the next chunk of UTF-8 input. |
| 81 | /// |
| 82 | /// This method is intended to be called repeatedly until it returns `None`, |
| 83 | /// which represents EOF from the underlying byte stream. |
| 84 | /// This is similar to `Iterator::next`, |
| 85 | /// except that decoded chunks borrow the decoder (~iterator) |
| 86 | /// so they need to be handled or copied before the next chunk can start decoding. |
| 87 | pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> { |
| 88 | enum BytesSource { |
| 89 | BufRead(usize), |
| 90 | Incomplete, |
| 91 | } |
| 92 | macro_rules! try_io { |
| 93 | ($io_result: expr) => { |
| 94 | match $io_result { |
| 95 | Ok(value) => value, |
| 96 | Err(error) => return Some(Err(BufReadDecoderError::Io(error))) |
| 97 | } |
| 98 | } |
| 99 | } |
| 100 | let (source, result) = loop { |
| 101 | if self.bytes_consumed > 0 { |
| 102 | self.buf_read.consume(self.bytes_consumed); |
| 103 | self.bytes_consumed = 0; |
| 104 | } |
| 105 | let buf = try_io!(self.buf_read.fill_buf()); |
| 106 | |
| 107 | // Force loop iteration to go through an explicit `continue` |
| 108 | enum Unreachable {} |
| 109 | let _: Unreachable = if self.incomplete.is_empty() { |
| 110 | if buf.is_empty() { |
| 111 | return None // EOF |
| 112 | } |
| 113 | match str::from_utf8(buf) { |
| 114 | Ok(_) => { |
| 115 | break (BytesSource::BufRead(buf.len()), Ok(())) |
| 116 | } |
| 117 | Err(error) => { |
| 118 | let valid_up_to = error.valid_up_to(); |
| 119 | if valid_up_to > 0 { |
| 120 | break (BytesSource::BufRead(valid_up_to), Ok(())) |
| 121 | } |
| 122 | match error.error_len() { |
| 123 | Some(invalid_sequence_length) => { |
| 124 | break (BytesSource::BufRead(invalid_sequence_length), Err(())) |
| 125 | } |
| 126 | None => { |
| 127 | self.bytes_consumed = buf.len(); |
| 128 | self.incomplete = Incomplete::new(buf); |
| 129 | // need more input bytes |
| 130 | continue |
| 131 | } |
| 132 | } |
| 133 | } |
| 134 | } |
| 135 | } else { |
| 136 | if buf.is_empty() { |
| 137 | break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point |
| 138 | } |
| 139 | let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); |
| 140 | self.bytes_consumed = consumed; |
| 141 | match opt_result { |
| 142 | None => { |
| 143 | // need more input bytes |
| 144 | continue |
| 145 | } |
| 146 | Some(result) => { |
| 147 | break (BytesSource::Incomplete, result) |
| 148 | } |
| 149 | } |
| 150 | }; |
| 151 | }; |
| 152 | let bytes = match source { |
| 153 | BytesSource::BufRead(byte_count) => { |
| 154 | self.bytes_consumed = byte_count; |
| 155 | let buf = try_io!(self.buf_read.fill_buf()); |
| 156 | &buf[..byte_count] |
| 157 | } |
| 158 | BytesSource::Incomplete => { |
| 159 | self.incomplete.take_buffer() |
| 160 | } |
| 161 | }; |
| 162 | match result { |
| 163 | Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), |
| 164 | Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))), |
| 165 | } |
| 166 | } |
| 167 | } |
| 168 | |