// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Streams of tendrils.

use fmt;
use tendril::{Atomicity, NonAtomic, Tendril};

use std::borrow::Cow;
use std::fs::File;
use std::io;
use std::marker::PhantomData;
use std::path::Path;

#[cfg(feature = "encoding")]
use encoding;
#[cfg(feature = "encoding_rs")]
use encoding_rs::{self, DecoderResult};
use utf8;

/// Trait for types that can process a tendril.
///
/// This is a "push" interface, unlike the "pull" interface of
/// `Iterator<Item=Tendril<F>>`. The push interface matches
/// [html5ever][] and other incremental parsers with a similar
/// architecture.
///
/// [html5ever]: https://github.com/servo/html5ever
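///
/// # Examples
///
/// A minimal sketch of implementing and driving a sink; the `CharCount`
/// type here is hypothetical, not part of this crate:
///
/// ```
/// use std::borrow::Cow;
/// use tendril::fmt;
/// use tendril::stream::TendrilSink;
/// use tendril::StrTendril;
///
/// // Counts the characters pushed into it.
/// struct CharCount(usize);
///
/// impl TendrilSink<fmt::UTF8> for CharCount {
///     fn process(&mut self, t: StrTendril) {
///         self.0 += t.chars().count();
///     }
///
///     fn error(&mut self, _desc: Cow<'static, str>) {}
///
///     type Output = usize;
///
///     fn finish(self) -> usize {
///         self.0
///     }
/// }
///
/// // `one` pushes a single tendril and then finishes the sink.
/// assert_eq!(CharCount(0).one("hello"), 5);
/// ```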
pub trait TendrilSink<F, A = NonAtomic>
where
    F: fmt::Format,
    A: Atomicity,
{
    /// Process this tendril.
    fn process(&mut self, t: Tendril<F, A>);

    /// Indicates that an error has occurred.
    fn error(&mut self, desc: Cow<'static, str>);

    /// What the overall result of processing is.
    type Output;

    /// Indicates the end of the stream.
    fn finish(self) -> Self::Output;

    /// Process one tendril and finish.
    fn one<T>(mut self, t: T) -> Self::Output
    where
        Self: Sized,
        T: Into<Tendril<F, A>>,
    {
        self.process(t.into());
        self.finish()
    }

    /// Consume an iterator of tendrils, processing each item, then finish.
    fn from_iter<I>(mut self, i: I) -> Self::Output
    where
        Self: Sized,
        I: IntoIterator,
        I::Item: Into<Tendril<F, A>>,
    {
        for t in i {
            self.process(t.into())
        }
        self.finish()
    }

    /// Read from the given stream of bytes until exhaustion and process incrementally,
    /// then finish. Return `Err` at the first I/O error.
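    ///
    /// # Examples
    ///
    /// A minimal sketch; the `ByteLen` sink is hypothetical, and the byte
    /// slice stands in for any `std::io::Read` implementor:
    ///
    /// ```
    /// use std::borrow::Cow;
    /// use tendril::fmt;
    /// use tendril::stream::TendrilSink;
    /// use tendril::ByteTendril;
    ///
    /// // Adds up the length of every byte tendril it receives.
    /// struct ByteLen(usize);
    ///
    /// impl TendrilSink<fmt::Bytes> for ByteLen {
    ///     fn process(&mut self, t: ByteTendril) {
    ///         self.0 += t.len();
    ///     }
    ///
    ///     fn error(&mut self, _desc: Cow<'static, str>) {}
    ///
    ///     type Output = usize;
    ///
    ///     fn finish(self) -> usize {
    ///         self.0
    ///     }
    /// }
    ///
    /// let mut bytes: &[u8] = b"hello world";
    /// assert_eq!(ByteLen(0).read_from(&mut bytes).unwrap(), 11);
    /// ```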
    fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output>
    where
        Self: Sized,
        R: io::Read,
        F: fmt::SliceFormat<Slice = [u8]>,
    {
        const BUFFER_SIZE: u32 = 4 * 1024;
        loop {
            let mut tendril = Tendril::<F, A>::new();
            // FIXME: this exposes uninitialized bytes to a generic `R` type.
            // That is fine for `R = File`, which never reads these bytes,
            // but user-defined types might.
            // The standard library pushes zeros to `Vec<u8>` for that reason.
            unsafe {
                tendril.push_uninitialized(BUFFER_SIZE);
            }
            loop {
                match r.read(&mut tendril) {
                    Ok(0) => return Ok(self.finish()),
                    Ok(n) => {
                        tendril.pop_back(BUFFER_SIZE - n as u32);
                        self.process(tendril);
                        break;
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
                    Err(e) => return Err(e),
                }
            }
        }
    }

    /// Read from the file at the given path and process incrementally,
    /// then finish. Return `Err` at the first I/O error.
    fn from_file<P>(self, path: P) -> io::Result<Self::Output>
    where
        Self: Sized,
        P: AsRef<Path>,
        F: fmt::SliceFormat<Slice = [u8]>,
    {
        self.read_from(&mut File::open(path)?)
    }
}

/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8,
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
/// and emits Unicode (`StrTendril`).
///
/// This does not allocate memory: the output is either subtendrils of the input,
/// or inline tendrils for a single code point.
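///
/// # Examples
///
/// A minimal sketch; the `Concat` sink is hypothetical and simply
/// concatenates the text it receives, so the stray `0xFF` byte below
/// comes out as U+FFFD:
///
/// ```
/// use std::borrow::Cow;
/// use tendril::fmt;
/// use tendril::stream::{TendrilSink, Utf8LossyDecoder};
/// use tendril::StrTendril;
///
/// struct Concat(String);
///
/// impl TendrilSink<fmt::UTF8> for Concat {
///     fn process(&mut self, t: StrTendril) {
///         self.0.push_str(&t);
///     }
///
///     fn error(&mut self, _desc: Cow<'static, str>) {}
///
///     type Output = String;
///
///     fn finish(self) -> String {
///         self.0
///     }
/// }
///
/// let decoder: Utf8LossyDecoder<Concat> = Utf8LossyDecoder::new(Concat(String::new()));
/// let chunks: Vec<&[u8]> = vec![b"foo", b"\xFF", b"bar"];
/// assert_eq!(decoder.from_iter(chunks), "foo\u{FFFD}bar");
/// ```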
pub struct Utf8LossyDecoder<Sink, A = NonAtomic>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    pub inner_sink: Sink,
    incomplete: Option<utf8::Incomplete>,
    marker: PhantomData<A>,
}

impl<Sink, A> Utf8LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Create a new incremental UTF-8 decoder.
    #[inline]
    pub fn new(inner_sink: Sink) -> Self {
        Utf8LossyDecoder {
            inner_sink,
            incomplete: None,
            marker: PhantomData,
        }
    }
}

impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    #[inline]
    fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) {
        // FIXME: remove take() and map() when non-lexical borrows are stable.
        if let Some(mut incomplete) = self.incomplete.take() {
            let resume_at = incomplete.try_complete(&t).map(|(result, rest)| {
                match result {
                    Ok(s) => self.inner_sink.process(Tendril::from_slice(s)),
                    Err(_) => {
                        self.inner_sink.error("invalid byte sequence".into());
                        self.inner_sink
                            .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
                    }
                }
                t.len() - rest.len()
            });
            match resume_at {
                None => {
                    self.incomplete = Some(incomplete);
                    return;
                }
                Some(resume_at) => t.pop_front(resume_at as u32),
            }
        }
        while !t.is_empty() {
            let unborrowed_result = match utf8::decode(&t) {
                Ok(s) => {
                    debug_assert!(s.as_ptr() == t.as_ptr());
                    debug_assert!(s.len() == t.len());
                    Ok(())
                }
                Err(utf8::DecodeError::Invalid {
                    valid_prefix,
                    invalid_sequence,
                    ..
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
                    debug_assert!(valid_prefix.len() <= t.len());
                    Err((
                        valid_prefix.len(),
                        Err(valid_prefix.len() + invalid_sequence.len()),
                    ))
                }
                Err(utf8::DecodeError::Incomplete {
                    valid_prefix,
                    incomplete_suffix,
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
                    debug_assert!(valid_prefix.len() <= t.len());
                    Err((valid_prefix.len(), Ok(incomplete_suffix)))
                }
            };
            match unborrowed_result {
                Ok(()) => {
                    unsafe { self.inner_sink.process(t.reinterpret_without_validating()) }
                    return;
                }
                Err((valid_len, and_then)) => {
                    if valid_len > 0 {
                        let subtendril = t.subtendril(0, valid_len as u32);
                        unsafe {
                            self.inner_sink
                                .process(subtendril.reinterpret_without_validating())
                        }
                    }
                    match and_then {
                        Ok(incomplete) => {
                            self.incomplete = Some(incomplete);
                            return;
                        }
                        Err(offset) => {
                            self.inner_sink.error("invalid byte sequence".into());
                            self.inner_sink
                                .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
                            t.pop_front(offset as u32);
                        }
                    }
                }
            }
        }
    }

    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        self.inner_sink.error(desc);
    }

    type Output = Sink::Output;

    #[inline]
    fn finish(mut self) -> Sink::Output {
        if self.incomplete.is_some() {
            self.inner_sink
                .error("incomplete byte sequence at end of stream".into());
            self.inner_sink
                .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
        }
        self.inner_sink.finish()
    }
}

/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding,
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
/// and emits Unicode (`StrTendril`).
///
/// This allocates new tendrils for encodings other than UTF-8.
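///
/// # Examples
///
/// A sketch of decoding KOI8-U input; it is marked `ignore` because it relies
/// on the optional `encoding_rs` dependency, and `Concat` stands for a
/// hypothetical sink that concatenates decoded text into a `String`:
///
/// ```ignore
/// use tendril::stream::{LossyDecoder, TendrilSink};
///
/// // Requires building with the `encoding_rs` feature.
/// let decoder = LossyDecoder::new_encoding_rs(encoding_rs::KOI8_U, Concat(String::new()));
/// let chunks: Vec<&[u8]> = vec![b"\xfc\xce\xc5\xd2", b"\xc7\xc9\xd1"];
/// assert_eq!(decoder.from_iter(chunks), "Энергия");
/// ```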
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
pub struct LossyDecoder<Sink, A = NonAtomic>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    inner: LossyDecoderInner<Sink, A>,
}

#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
enum LossyDecoderInner<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    Utf8(Utf8LossyDecoder<Sink, A>),
    #[cfg(feature = "encoding")]
    Encoding(Box<encoding::RawDecoder>, Sink),
    #[cfg(feature = "encoding_rs")]
    EncodingRs(encoding_rs::Decoder, Sink),
}

#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Create a new incremental decoder using the encoding crate.
    #[cfg(feature = "encoding")]
    #[inline]
    pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self {
        if encoding.name() == "utf-8" {
            LossyDecoder::utf8(sink)
        } else {
            LossyDecoder {
                inner: LossyDecoderInner::Encoding(encoding.raw_decoder(), sink),
            }
        }
    }

    /// Create a new incremental decoder using the encoding_rs crate.
    #[cfg(feature = "encoding_rs")]
    #[inline]
    pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self {
        if encoding == encoding_rs::UTF_8 {
            return Self::utf8(sink);
        }
        Self {
            inner: LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink),
        }
    }

    /// Create a new incremental decoder for the UTF-8 encoding.
    ///
    /// This is useful for content that is known only at run time to be UTF-8
    /// (whereas `Utf8LossyDecoder` requires knowing it at compile time).
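    ///
    /// # Examples
    ///
    /// A minimal sketch; the `Concat` sink is hypothetical and simply
    /// concatenates the text it receives (this example requires the `encoding`
    /// or `encoding_rs` feature, since `LossyDecoder` is gated on them):
    ///
    /// ```
    /// use std::borrow::Cow;
    /// use tendril::fmt;
    /// use tendril::stream::{LossyDecoder, TendrilSink};
    /// use tendril::StrTendril;
    ///
    /// struct Concat(String);
    ///
    /// impl TendrilSink<fmt::UTF8> for Concat {
    ///     fn process(&mut self, t: StrTendril) {
    ///         self.0.push_str(&t);
    ///     }
    ///
    ///     fn error(&mut self, _desc: Cow<'static, str>) {}
    ///
    ///     type Output = String;
    ///
    ///     fn finish(self) -> String {
    ///         self.0
    ///     }
    /// }
    ///
    /// let decoder: LossyDecoder<Concat> = LossyDecoder::utf8(Concat(String::new()));
    /// let chunks: Vec<&[u8]> = vec![b"caf", b"\xC3\xA9"];
    /// assert_eq!(decoder.from_iter(chunks), "café");
    /// ```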
    #[inline]
    pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> {
        LossyDecoder {
            inner: LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink)),
        }
    }

    /// Give a reference to the inner sink.
    pub fn inner_sink(&self) -> &Sink {
        match self.inner {
            LossyDecoderInner::Utf8(ref utf8) => &utf8.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref inner_sink) => inner_sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref inner_sink) => inner_sink,
        }
    }

    /// Give a mutable reference to the inner sink.
    pub fn inner_sink_mut(&mut self) -> &mut Sink {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut utf8) => &mut utf8.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref mut inner_sink) => inner_sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref mut inner_sink) => inner_sink,
        }
    }
}

#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    #[inline]
    fn process(&mut self, t: Tendril<fmt::Bytes, A>) {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut utf8) => return utf8.process(t),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(ref mut decoder, ref mut sink) => {
                let mut out = Tendril::new();
                let mut t = t;
                loop {
                    match decoder.raw_feed(&*t, &mut out) {
                        (_, Some(err)) => {
                            out.push_char('\u{fffd}');
                            sink.error(err.cause);
                            debug_assert!(err.upto >= 0);
                            t.pop_front(err.upto as u32);
                            // continue loop and process remainder of t
                        }
                        (_, None) => break,
                    }
                }
                if out.len() > 0 {
                    sink.process(out);
                }
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(ref mut decoder, ref mut sink) => {
                if t.is_empty() {
                    return;
                }
                decode_to_sink(t, decoder, sink, false);
            }
        }
    }

    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        match self.inner {
            LossyDecoderInner::Utf8(ref mut utf8) => utf8.error(desc),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, ref mut sink) => sink.error(desc),
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, ref mut sink) => sink.error(desc),
        }
    }

    type Output = Sink::Output;

    #[inline]
    fn finish(self) -> Sink::Output {
        match self.inner {
            LossyDecoderInner::Utf8(utf8) => return utf8.finish(),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(mut decoder, mut sink) => {
                let mut out = Tendril::new();
                if let Some(err) = decoder.raw_finish(&mut out) {
                    out.push_char('\u{fffd}');
                    sink.error(err.cause);
                }
                if out.len() > 0 {
                    sink.process(out);
                }
                sink.finish()
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(mut decoder, mut sink) => {
                decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true);
                sink.finish()
            }
        }
    }
}

/// Feed the bytes in `t` to `decoder`, pushing decoded text (and U+FFFD
/// replacements for malformed sequences) into `sink`; `last` signals the end
/// of the input stream.
#[cfg(feature = "encoding_rs")]
fn decode_to_sink<Sink, A>(
    mut t: Tendril<fmt::Bytes, A>,
    decoder: &mut encoding_rs::Decoder,
    sink: &mut Sink,
    last: bool,
) where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    loop {
        let mut out = <Tendril<fmt::Bytes, A>>::new();
        let max_len = decoder
            .max_utf8_buffer_length_without_replacement(t.len())
            .unwrap_or(8192);
        unsafe {
            out.push_uninitialized(std::cmp::min(max_len as u32, 8192));
        }
        let (result, bytes_read, bytes_written) =
            decoder.decode_to_utf8_without_replacement(&t, &mut out, last);
        if bytes_written > 0 {
            sink.process(unsafe {
                out.subtendril(0, bytes_written as u32)
                    .reinterpret_without_validating()
            });
        }
        match result {
            DecoderResult::InputEmpty => return,
            DecoderResult::OutputFull => {}
            DecoderResult::Malformed(_, _) => {
                sink.error(Cow::Borrowed("invalid sequence"));
                sink.process("\u{FFFD}".into());
            }
        }
        t.pop_front(bytes_read as u32);
        if t.is_empty() {
            return;
        }
    }
}

#[cfg(test)]
mod test {
    use super::{TendrilSink, Utf8LossyDecoder};
    use fmt;
    use std::borrow::Cow;
    use tendril::{Atomicity, NonAtomic, Tendril};

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use super::LossyDecoder;
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use tendril::SliceExt;

    #[cfg(feature = "encoding")]
    use encoding::all as enc;
    #[cfg(feature = "encoding_rs")]
    use encoding_rs as enc_rs;

    struct Accumulate<A>
    where
        A: Atomicity,
    {
        tendrils: Vec<Tendril<fmt::UTF8, A>>,
        errors: Vec<String>,
    }

    impl<A> Accumulate<A>
    where
        A: Atomicity,
    {
        fn new() -> Accumulate<A> {
            Accumulate {
                tendrils: vec![],
                errors: vec![],
            }
        }
    }

    impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A>
    where
        A: Atomicity,
    {
        fn process(&mut self, t: Tendril<fmt::UTF8, A>) {
            self.tendrils.push(t);
        }

        fn error(&mut self, desc: Cow<'static, str>) {
            self.errors.push(desc.into_owned());
        }

        type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>);

        fn finish(self) -> Self::Output {
            (self.tendrils, self.errors)
        }
    }

    fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) {
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let (tendrils, errors) = decoder.from_iter(input.iter().cloned());
        assert_eq!(
            expected,
            &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>()
        );
        assert_eq!(errs, errors.len());
    }

    #[test]
    fn utf8() {
        check_utf8(&[], &[], 0);
        check_utf8(&[b""], &[], 0);
        check_utf8(&[b"xyz"], &["xyz"], 0);
        check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0);

        check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0);
        check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(
            &[b"xy\xEA", b"\x99", b"\xAEzw"],
            &["xy", "\u{a66e}z", "w"],
            0,
        );
        check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0);
        check_utf8(
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            &["\u{a66e}"],
            0,
        );

        check_utf8(
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"],
            4,
        );
        check_utf8(
            &[b"xy\xEA\x99", b"\xFFz"],
            &["xy", "\u{fffd}", "\u{fffd}", "z"],
            2,
        );

        check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0);
        check_utf8(
            &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"],
            &["ő", "\u{fffd}", "\u{fffd}", "ő"],
            2,
        );

        // incomplete char at end of input
        check_utf8(&[b"\xC0"], &["\u{fffd}"], 1);
        check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1);
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    fn check_decode(
        mut decoder: LossyDecoder<Accumulate<NonAtomic>>,
        input: &[&[u8]],
        expected: &str,
        errs: usize,
    ) {
        for x in input {
            decoder.process(x.to_tendril());
        }
        let (tendrils, errors) = decoder.finish();
        let mut tendril: Tendril<fmt::UTF8> = Tendril::new();
        for t in tendrils {
            tendril.push_tendril(&t);
        }
        assert_eq!(expected, &*tendril);
        assert_eq!(errs, errors.len());
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)];

    #[cfg(any(feature = "encoding"))]
    const ASCII: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"xy", b"", b"", b"z"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xFF"], "\u{fffd}", 1),
        (&[b"x\xC0yz"], "x\u{fffd}yz", 1),
        (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1),
        (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_ascii() {
        for &(input, expected, errs) in ASCII {
            let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const UTF_8: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xEA\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0),
        (
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            "\u{a66e}",
            0,
        ),
        (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0),
        (
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z",
            4,
        ),
        (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2),
        // incomplete char at end of input
        (&[b"\xC0"], "\u{fffd}", 1),
        (&[b"\xEA\x99"], "\u{fffd}", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_utf8() {
        for &(input, expected, errs) in UTF_8 {
            let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_utf8_encoding_rs() {
        for &(input, expected, errs) in UTF_8 {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const KOI8_U: Tests = &[
        (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0),
        (
            &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""],
            "Энергия",
            0,
        ),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_koi8_u() {
        for &(input, expected, errs) in KOI8_U {
            let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_koi8_u_encoding_rs() {
        for &(input, expected, errs) in KOI8_U {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const WINDOWS_949: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0),
        (
            &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"],
            "안녕하세요",
            0,
        ),
        (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1),
        (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1),
        (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_windows_949() {
        for &(input, expected, errs) in WINDOWS_949 {
            let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_windows_949_encoding_rs() {
        for &(input, expected, errs) in WINDOWS_949 {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[test]
    fn read_from() {
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let mut bytes: &[u8] = b"foo\xffbar";
        let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap();
        assert_eq!(
            &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>(),
            &["foo", "\u{FFFD}", "bar"]
        );
        assert_eq!(errors, &["invalid byte sequence"]);
    }
}