| 1 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 2 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 3 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| 4 | // option. This file may not be copied, modified, or distributed |
| 5 | // except according to those terms. |
| 6 | |
| 7 | //! Streams of tendrils. |
| 8 | |
| 9 | use fmt; |
| 10 | use tendril::{Atomicity, NonAtomic, Tendril}; |
| 11 | |
| 12 | use std::borrow::Cow; |
| 13 | use std::fs::File; |
| 14 | use std::io; |
| 15 | use std::marker::PhantomData; |
| 16 | use std::path::Path; |
| 17 | |
| 18 | #[cfg (feature = "encoding" )] |
| 19 | use encoding; |
| 20 | #[cfg (feature = "encoding_rs" )] |
| 21 | use encoding_rs::{self, DecoderResult}; |
| 22 | use utf8; |
| 23 | |
| 24 | /// Trait for types that can process a tendril. |
| 25 | /// |
| 26 | /// This is a "push" interface, unlike the "pull" interface of |
| 27 | /// `Iterator<Item=Tendril<F>>`. The push interface matches |
| 28 | /// [html5ever][] and other incremental parsers with a similar |
| 29 | /// architecture. |
| 30 | /// |
| 31 | /// [html5ever]: https://github.com/servo/html5ever |
| 32 | pub trait TendrilSink<F, A = NonAtomic> |
| 33 | where |
| 34 | F: fmt::Format, |
| 35 | A: Atomicity, |
| 36 | { |
| 37 | /// Process this tendril. |
| 38 | fn process(&mut self, t: Tendril<F, A>); |
| 39 | |
| 40 | /// Indicates that an error has occurred. |
| 41 | fn error(&mut self, desc: Cow<'static, str>); |
| 42 | |
| 43 | /// What the overall result of processing is. |
| 44 | type Output; |
| 45 | |
| 46 | /// Indicates the end of the stream. |
| 47 | fn finish(self) -> Self::Output; |
| 48 | |
| 49 | /// Process one tendril and finish. |
| 50 | fn one<T>(mut self, t: T) -> Self::Output |
| 51 | where |
| 52 | Self: Sized, |
| 53 | T: Into<Tendril<F, A>>, |
| 54 | { |
| 55 | self.process(t.into()); |
| 56 | self.finish() |
| 57 | } |
| 58 | |
| 59 | /// Consume an iterator of tendrils, processing each item, then finish. |
| 60 | fn from_iter<I>(mut self, i: I) -> Self::Output |
| 61 | where |
| 62 | Self: Sized, |
| 63 | I: IntoIterator, |
| 64 | I::Item: Into<Tendril<F, A>>, |
| 65 | { |
| 66 | for t in i { |
| 67 | self.process(t.into()) |
| 68 | } |
| 69 | self.finish() |
| 70 | } |
| 71 | |
| 72 | /// Read from the given stream of bytes until exhaustion and process incrementally, |
| 73 | /// then finish. Return `Err` at the first I/O error. |
| 74 | fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output> |
| 75 | where |
| 76 | Self: Sized, |
| 77 | R: io::Read, |
| 78 | F: fmt::SliceFormat<Slice = [u8]>, |
| 79 | { |
| 80 | const BUFFER_SIZE: u32 = 4 * 1024; |
| 81 | loop { |
| 82 | let mut tendril = Tendril::<F, A>::new(); |
| 83 | // FIXME: this exposes uninitialized bytes to a generic R type |
| 84 | // this is fine for R=File which never reads these bytes, |
| 85 | // but user-defined types might. |
| 86 | // The standard library pushes zeros to `Vec<u8>` for that reason. |
| 87 | unsafe { |
| 88 | tendril.push_uninitialized(BUFFER_SIZE); |
| 89 | } |
| 90 | loop { |
| 91 | match r.read(&mut tendril) { |
| 92 | Ok(0) => return Ok(self.finish()), |
| 93 | Ok(n) => { |
| 94 | tendril.pop_back(BUFFER_SIZE - n as u32); |
| 95 | self.process(tendril); |
| 96 | break; |
| 97 | } |
| 98 | Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {} |
| 99 | Err(e) => return Err(e), |
| 100 | } |
| 101 | } |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | /// Read from the file at the given path and process incrementally, |
| 106 | /// then finish. Return `Err` at the first I/O error. |
| 107 | fn from_file<P>(self, path: P) -> io::Result<Self::Output> |
| 108 | where |
| 109 | Self: Sized, |
| 110 | P: AsRef<Path>, |
| 111 | F: fmt::SliceFormat<Slice = [u8]>, |
| 112 | { |
| 113 | self.read_from(&mut File::open(path)?) |
| 114 | } |
| 115 | } |
| 116 | |
| 117 | /// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8, |
| 118 | /// lossily replace ill-formed byte sequences with U+FFFD replacement characters, |
| 119 | /// and emits Unicode (`StrTendril`). |
| 120 | /// |
| 121 | /// This does not allocate memory: the output is either subtendrils on the input, |
| 122 | /// on inline tendrils for a single code point. |
| 123 | pub struct Utf8LossyDecoder<Sink, A = NonAtomic> |
| 124 | where |
| 125 | Sink: TendrilSink<fmt::UTF8, A>, |
| 126 | A: Atomicity, |
| 127 | { |
| 128 | pub inner_sink: Sink, |
| 129 | incomplete: Option<utf8::Incomplete>, |
| 130 | marker: PhantomData<A>, |
| 131 | } |
| 132 | |
| 133 | impl<Sink, A> Utf8LossyDecoder<Sink, A> |
| 134 | where |
| 135 | Sink: TendrilSink<fmt::UTF8, A>, |
| 136 | A: Atomicity, |
| 137 | { |
| 138 | /// Create a new incremental UTF-8 decoder. |
| 139 | #[inline ] |
| 140 | pub fn new(inner_sink: Sink) -> Self { |
| 141 | Utf8LossyDecoder { |
| 142 | inner_sink: inner_sink, |
| 143 | incomplete: None, |
| 144 | marker: PhantomData, |
| 145 | } |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A> |
| 150 | where |
| 151 | Sink: TendrilSink<fmt::UTF8, A>, |
| 152 | A: Atomicity, |
| 153 | { |
| 154 | #[inline ] |
| 155 | fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) { |
| 156 | // FIXME: remove take() and map() when non-lexical borrows are stable. |
| 157 | if let Some(mut incomplete) = self.incomplete.take() { |
| 158 | let resume_at = incomplete.try_complete(&t).map(|(result, rest)| { |
| 159 | match result { |
| 160 | Ok(s) => self.inner_sink.process(Tendril::from_slice(s)), |
| 161 | Err(_) => { |
| 162 | self.inner_sink.error("invalid byte sequence" .into()); |
| 163 | self.inner_sink |
| 164 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
| 165 | } |
| 166 | } |
| 167 | t.len() - rest.len() |
| 168 | }); |
| 169 | match resume_at { |
| 170 | None => { |
| 171 | self.incomplete = Some(incomplete); |
| 172 | return; |
| 173 | } |
| 174 | Some(resume_at) => t.pop_front(resume_at as u32), |
| 175 | } |
| 176 | } |
| 177 | while !t.is_empty() { |
| 178 | let unborrowed_result = match utf8::decode(&t) { |
| 179 | Ok(s) => { |
| 180 | debug_assert!(s.as_ptr() == t.as_ptr()); |
| 181 | debug_assert!(s.len() == t.len()); |
| 182 | Ok(()) |
| 183 | } |
| 184 | Err(utf8::DecodeError::Invalid { |
| 185 | valid_prefix, |
| 186 | invalid_sequence, |
| 187 | .. |
| 188 | }) => { |
| 189 | debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); |
| 190 | debug_assert!(valid_prefix.len() <= t.len()); |
| 191 | Err(( |
| 192 | valid_prefix.len(), |
| 193 | Err(valid_prefix.len() + invalid_sequence.len()), |
| 194 | )) |
| 195 | } |
| 196 | Err(utf8::DecodeError::Incomplete { |
| 197 | valid_prefix, |
| 198 | incomplete_suffix, |
| 199 | }) => { |
| 200 | debug_assert!(valid_prefix.as_ptr() == t.as_ptr()); |
| 201 | debug_assert!(valid_prefix.len() <= t.len()); |
| 202 | Err((valid_prefix.len(), Ok(incomplete_suffix))) |
| 203 | } |
| 204 | }; |
| 205 | match unborrowed_result { |
| 206 | Ok(()) => { |
| 207 | unsafe { self.inner_sink.process(t.reinterpret_without_validating()) } |
| 208 | return; |
| 209 | } |
| 210 | Err((valid_len, and_then)) => { |
| 211 | if valid_len > 0 { |
| 212 | let subtendril = t.subtendril(0, valid_len as u32); |
| 213 | unsafe { |
| 214 | self.inner_sink |
| 215 | .process(subtendril.reinterpret_without_validating()) |
| 216 | } |
| 217 | } |
| 218 | match and_then { |
| 219 | Ok(incomplete) => { |
| 220 | self.incomplete = Some(incomplete); |
| 221 | return; |
| 222 | } |
| 223 | Err(offset) => { |
| 224 | self.inner_sink.error("invalid byte sequence" .into()); |
| 225 | self.inner_sink |
| 226 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
| 227 | t.pop_front(offset as u32); |
| 228 | } |
| 229 | } |
| 230 | } |
| 231 | } |
| 232 | } |
| 233 | } |
| 234 | |
| 235 | #[inline ] |
| 236 | fn error(&mut self, desc: Cow<'static, str>) { |
| 237 | self.inner_sink.error(desc); |
| 238 | } |
| 239 | |
| 240 | type Output = Sink::Output; |
| 241 | |
| 242 | #[inline ] |
| 243 | fn finish(mut self) -> Sink::Output { |
| 244 | if self.incomplete.is_some() { |
| 245 | self.inner_sink |
| 246 | .error("incomplete byte sequence at end of stream" .into()); |
| 247 | self.inner_sink |
| 248 | .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER)); |
| 249 | } |
| 250 | self.inner_sink.finish() |
| 251 | } |
| 252 | } |
| 253 | |
/// A `TendrilSink` adaptor that accepts bytes, decodes them with the given
/// character encoding and passes the resulting Unicode (`StrTendril`) on to
/// an inner sink.
///
/// Decoding is lossy: ill-formed byte sequences are replaced with U+FFFD
/// replacement characters.
///
/// This allocates new tendrils for encodings other than UTF-8.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
pub struct LossyDecoder<Sink, A = NonAtomic>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// The concrete decoding back end together with the inner sink.
    inner: LossyDecoderInner<Sink, A>,
}
| 267 | |
/// The decoding back ends available to `LossyDecoder`.
///
/// Which variants exist depends on the enabled cargo features.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
enum LossyDecoderInner<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// UTF-8 input, handled by `Utf8LossyDecoder` (which owns the sink).
    Utf8(Utf8LossyDecoder<Sink, A>),
    /// A raw decoder from the `encoding` crate, paired with the sink.
    #[cfg(feature = "encoding")]
    Encoding(Box<encoding::RawDecoder>, Sink),
    /// A decoder from the `encoding_rs` crate, paired with the sink.
    #[cfg(feature = "encoding_rs")]
    EncodingRs(encoding_rs::Decoder, Sink),
}
| 280 | |
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Create a new incremental decoder using the encoding crate.
    ///
    /// An encoding whose name is `"utf-8"` is routed to the dedicated UTF-8
    /// decoder; any other encoding uses its raw decoder.
    #[cfg(feature = "encoding")]
    #[inline]
    pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self {
        if encoding.name() == "utf-8" {
            return Self::utf8(sink);
        }
        let inner = LossyDecoderInner::Encoding(encoding.raw_decoder(), sink);
        LossyDecoder { inner }
    }

    /// Create a new incremental decoder using the encoding_rs crate.
    ///
    /// `encoding_rs::UTF_8` is routed to the dedicated UTF-8 decoder; any
    /// other encoding gets a fresh `encoding_rs` decoder.
    #[cfg(feature = "encoding_rs")]
    #[inline]
    pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self {
        if encoding == encoding_rs::UTF_8 {
            Self::utf8(sink)
        } else {
            let inner = LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink);
            LossyDecoder { inner }
        }
    }

    /// Create a new incremental decoder for the UTF-8 encoding.
    ///
    /// This is useful for content that is known at run-time to be UTF-8
    /// (whereas `Utf8LossyDecoder` requires knowing at compile-time.)
    #[inline]
    pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> {
        let inner = LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink));
        LossyDecoder { inner }
    }

    /// Give a reference to the inner sink.
    pub fn inner_sink(&self) -> &Sink {
        match self.inner {
            LossyDecoderInner::Utf8(ref decoder) => &decoder.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref sink) => sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref sink) => sink,
        }
    }

    /// Give a mutable reference to the inner sink.
    pub fn inner_sink_mut(&mut self) -> &mut Sink {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut decoder) => &mut decoder.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref mut sink) => sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref mut sink) => sink,
        }
    }
}
| 345 | |
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    type Output = Sink::Output;

    /// Decode `t` with the selected back end and forward the result to the
    /// inner sink.
    #[inline]
    fn process(&mut self, t: Tendril<fmt::Bytes, A>) {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut inner) => inner.process(t),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => {
                let mut remaining = t;
                let mut decoded = Tendril::new();
                // Each reported error yields one U+FFFD; decoding then resumes
                // on the remainder of the input after the offending bytes.
                while let (_, Some(err)) = decoder.raw_feed(&*remaining, &mut decoded) {
                    decoded.push_char('\u{fffd}');
                    sink.error(err.cause);
                    debug_assert!(err.upto >= 0);
                    remaining.pop_front(err.upto as u32);
                }
                // Do not bother the sink with empty tendrils.
                if !decoded.is_empty() {
                    sink.process(decoded);
                }
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => {
                if !t.is_empty() {
                    decode_to_sink(t, decoder, sink, false);
                }
            }
        }
    }

    /// Forward an error description to the inner sink unchanged.
    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut inner) => inner.error(desc),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc),
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc),
        }
    }

    /// Flush whatever the decoder still holds, then finish the inner sink and
    /// return its output.
    #[inline]
    fn finish(self) -> Sink::Output {
        match self.inner {
            LossyDecoderInner::Utf8(inner) => inner.finish(),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(mut decoder, mut sink) => {
                let mut tail = Tendril::new();
                if let Some(err) = decoder.raw_finish(&mut tail) {
                    tail.push_char('\u{fffd}');
                    sink.error(err.cause);
                }
                if !tail.is_empty() {
                    sink.process(tail);
                }
                sink.finish()
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(mut decoder, mut sink) => {
                // An empty final chunk with `last = true` signals end of input
                // to the decoder.
                decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true);
                sink.finish()
            }
        }
    }
}
| 423 | |
/// Decode `t` with an `encoding_rs` decoder and push the output to `sink`.
///
/// The input is decoded in rounds, each writing into a fresh output buffer of
/// at most 8192 bytes. Decoded text is handed to `sink.process`; every
/// malformed sequence is reported through `sink.error` and replaced by a
/// U+FFFD tendril. The function returns once the decoder reports that the
/// input is exhausted or `t` has been fully consumed.
///
/// `last` is passed through to the decoder and must be `true` only for the
/// final call of a stream.
#[cfg(feature = "encoding_rs")]
fn decode_to_sink<Sink, A>(
    mut t: Tendril<fmt::Bytes, A>,
    decoder: &mut encoding_rs::Decoder,
    sink: &mut Sink,
    last: bool,
) where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Upper bound on the size of a single output buffer.
    const MAX_CHUNK: usize = 8192;

    loop {
        let mut out = <Tendril<fmt::Bytes, A>>::new();
        let max_len = decoder
            .max_utf8_buffer_length_without_replacement(t.len())
            .unwrap_or(MAX_CHUNK);
        // Clamp while still in `usize`, and only then narrow to `u32`.
        // Narrowing first would wrap an estimate of 2^32 or more to an
        // arbitrary small value (possibly zero), leaving the decoder with an
        // output buffer too small to make progress.
        let capacity = std::cmp::min(max_len, MAX_CHUNK) as u32;
        // NOTE(review): the decoder is only expected to write to this buffer;
        // just the first `bytes_written` bytes are read back below.
        unsafe {
            out.push_uninitialized(capacity);
        }
        let (result, bytes_read, bytes_written) =
            decoder.decode_to_utf8_without_replacement(&t, &mut out, last);
        if bytes_written > 0 {
            // The decoder produced these bytes as UTF-8 output.
            sink.process(unsafe {
                out.subtendril(0, bytes_written as u32)
                    .reinterpret_without_validating()
            });
        }
        match result {
            DecoderResult::InputEmpty => return,
            // Output buffer exhausted: go around again with a fresh buffer.
            DecoderResult::OutputFull => {}
            DecoderResult::Malformed(_, _) => {
                sink.error(Cow::Borrowed("invalid sequence"));
                sink.process("\u{FFFD}".into());
            }
        }
        t.pop_front(bytes_read as u32);
        if t.is_empty() {
            return;
        }
    }
}
| 464 | |
#[cfg(test)]
mod test {
    use super::{TendrilSink, Utf8LossyDecoder};
    use fmt;
    use std::borrow::Cow;
    use tendril::{Atomicity, NonAtomic, Tendril};

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use super::LossyDecoder;
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use tendril::SliceExt;

    #[cfg(feature = "encoding")]
    use encoding::all as enc;
    #[cfg(feature = "encoding_rs")]
    use encoding_rs as enc_rs;

    /// Test sink that records every tendril and every error it receives.
    struct Accumulate<A>
    where
        A: Atomicity,
    {
        tendrils: Vec<Tendril<fmt::UTF8, A>>,
        errors: Vec<String>,
    }

    impl<A> Accumulate<A>
    where
        A: Atomicity,
    {
        /// Create a sink with nothing recorded yet.
        fn new() -> Accumulate<A> {
            Accumulate {
                tendrils: Vec::new(),
                errors: Vec::new(),
            }
        }
    }

    impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A>
    where
        A: Atomicity,
    {
        type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>);

        fn process(&mut self, t: Tendril<fmt::UTF8, A>) {
            self.tendrils.push(t);
        }

        fn error(&mut self, desc: Cow<'static, str>) {
            self.errors.push(desc.into_owned());
        }

        fn finish(self) -> Self::Output {
            (self.tendrils, self.errors)
        }
    }

    /// Feed `input` chunk by chunk through a `Utf8LossyDecoder` and compare
    /// the emitted tendrils and the number of errors with the expectation.
    fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) {
        let sink = Accumulate::<NonAtomic>::new();
        let (tendrils, errors) = Utf8LossyDecoder::new(sink).from_iter(input.iter().cloned());
        let actual: Vec<&str> = tendrils.iter().map(|t| &**t).collect();
        assert_eq!(expected, &*actual);
        assert_eq!(errs, errors.len());
    }

    #[test]
    fn utf8() {
        check_utf8(&[], &[], 0);
        check_utf8(&[b""], &[], 0);
        check_utf8(&[b"xyz"], &["xyz"], 0);
        check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0);

        check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0);
        check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(
            &[b"xy\xEA", b"\x99", b"\xAEzw"],
            &["xy", "\u{a66e}z", "w"],
            0,
        );
        check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0);
        check_utf8(
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            &["\u{a66e}"],
            0,
        );

        check_utf8(
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"],
            4,
        );
        check_utf8(
            &[b"xy\xEA\x99", b"\xFFz"],
            &["xy", "\u{fffd}", "\u{fffd}", "z"],
            2,
        );

        check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0);
        check_utf8(
            &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"],
            &["ő", "\u{fffd}", "\u{fffd}", "ő"],
            2,
        );

        // incomplete char at end of input
        check_utf8(&[b"\xC0"], &["\u{fffd}"], 1);
        check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1);
    }

    /// Feed `input` chunk by chunk through `decoder`, concatenate everything
    /// the sink received and compare text and error count with the expectation.
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    fn check_decode(
        mut decoder: LossyDecoder<Accumulate<NonAtomic>>,
        input: &[&[u8]],
        expected: &str,
        errs: usize,
    ) {
        for chunk in input {
            decoder.process(chunk.to_tendril());
        }
        let (tendrils, errors) = decoder.finish();
        let mut joined: Tendril<fmt::UTF8> = Tendril::new();
        for piece in tendrils.iter() {
            joined.push_tendril(piece);
        }
        assert_eq!(expected, &*joined);
        assert_eq!(errs, errors.len());
    }

    /// Table of test cases: (input chunks, expected decoded text, expected error count).
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)];

    #[cfg(feature = "encoding")]
    const ASCII: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"xy", b"", b"", b"z"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xFF"], "\u{fffd}", 1),
        (&[b"x\xC0yz"], "x\u{fffd}yz", 1),
        (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1),
        (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_ascii() {
        for case in ASCII.iter() {
            let (input, expected, errs) = *case;
            let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const UTF_8: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xEA\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0),
        (
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            "\u{a66e}",
            0,
        ),
        (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0),
        (
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z",
            4,
        ),
        (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2),
        // incomplete char at end of input
        (&[b"\xC0"], "\u{fffd}", 1),
        (&[b"\xEA\x99"], "\u{fffd}", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_utf8() {
        for case in UTF_8.iter() {
            let (input, expected, errs) = *case;
            let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_utf8_encoding_rs() {
        for case in UTF_8.iter() {
            let (input, expected, errs) = *case;
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const KOI8_U: Tests = &[
        (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0),
        (
            &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""],
            "Энергия",
            0,
        ),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_koi8_u() {
        for case in KOI8_U.iter() {
            let (input, expected, errs) = *case;
            let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_koi8_u_encoding_rs() {
        for case in KOI8_U.iter() {
            let (input, expected, errs) = *case;
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const WINDOWS_949: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0),
        (
            &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"],
            "안녕하세요",
            0,
        ),
        (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1),
        (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1),
        (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_windows_949() {
        for case in WINDOWS_949.iter() {
            let (input, expected, errs) = *case;
            let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_windows_949_encoding_rs() {
        for case in WINDOWS_949.iter() {
            let (input, expected, errs) = *case;
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    /// `read_from` on an in-memory byte slice decodes lossily and reports the
    /// single invalid byte.
    #[test]
    fn read_from() {
        let mut source: &[u8] = b"foo\xffbar";
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let (tendrils, errors) = decoder.read_from(&mut source).unwrap();
        let pieces: Vec<&str> = tendrils.iter().map(|t| &**t).collect();
        assert_eq!(&*pieces, &["foo", "\u{FFFD}", "bar"]);
        assert_eq!(errors, &["invalid byte sequence"]);
    }
}
| 753 | |