1// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4// option. This file may not be copied, modified, or distributed
5// except according to those terms.
6
7//! Streams of tendrils.
8
9use fmt;
10use tendril::{Atomicity, NonAtomic, Tendril};
11
12use std::borrow::Cow;
13use std::fs::File;
14use std::io;
15use std::marker::PhantomData;
16use std::path::Path;
17
18#[cfg(feature = "encoding")]
19use encoding;
20#[cfg(feature = "encoding_rs")]
21use encoding_rs::{self, DecoderResult};
22use utf8;
23
24/// Trait for types that can process a tendril.
25///
26/// This is a "push" interface, unlike the "pull" interface of
27/// `Iterator<Item=Tendril<F>>`. The push interface matches
28/// [html5ever][] and other incremental parsers with a similar
29/// architecture.
30///
31/// [html5ever]: https://github.com/servo/html5ever
32pub trait TendrilSink<F, A = NonAtomic>
33where
34 F: fmt::Format,
35 A: Atomicity,
36{
37 /// Process this tendril.
38 fn process(&mut self, t: Tendril<F, A>);
39
40 /// Indicates that an error has occurred.
41 fn error(&mut self, desc: Cow<'static, str>);
42
43 /// What the overall result of processing is.
44 type Output;
45
46 /// Indicates the end of the stream.
47 fn finish(self) -> Self::Output;
48
49 /// Process one tendril and finish.
50 fn one<T>(mut self, t: T) -> Self::Output
51 where
52 Self: Sized,
53 T: Into<Tendril<F, A>>,
54 {
55 self.process(t.into());
56 self.finish()
57 }
58
59 /// Consume an iterator of tendrils, processing each item, then finish.
60 fn from_iter<I>(mut self, i: I) -> Self::Output
61 where
62 Self: Sized,
63 I: IntoIterator,
64 I::Item: Into<Tendril<F, A>>,
65 {
66 for t in i {
67 self.process(t.into())
68 }
69 self.finish()
70 }
71
72 /// Read from the given stream of bytes until exhaustion and process incrementally,
73 /// then finish. Return `Err` at the first I/O error.
74 fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output>
75 where
76 Self: Sized,
77 R: io::Read,
78 F: fmt::SliceFormat<Slice = [u8]>,
79 {
80 const BUFFER_SIZE: u32 = 4 * 1024;
81 loop {
82 let mut tendril = Tendril::<F, A>::new();
83 // FIXME: this exposes uninitialized bytes to a generic R type
84 // this is fine for R=File which never reads these bytes,
85 // but user-defined types might.
86 // The standard library pushes zeros to `Vec<u8>` for that reason.
87 unsafe {
88 tendril.push_uninitialized(BUFFER_SIZE);
89 }
90 loop {
91 match r.read(&mut tendril) {
92 Ok(0) => return Ok(self.finish()),
93 Ok(n) => {
94 tendril.pop_back(BUFFER_SIZE - n as u32);
95 self.process(tendril);
96 break;
97 }
98 Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
99 Err(e) => return Err(e),
100 }
101 }
102 }
103 }
104
105 /// Read from the file at the given path and process incrementally,
106 /// then finish. Return `Err` at the first I/O error.
107 fn from_file<P>(self, path: P) -> io::Result<Self::Output>
108 where
109 Self: Sized,
110 P: AsRef<Path>,
111 F: fmt::SliceFormat<Slice = [u8]>,
112 {
113 self.read_from(&mut File::open(path)?)
114 }
115}
116
/// A `TendrilSink` adaptor that takes bytes, decodes them as UTF-8,
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
/// and emits Unicode (`StrTendril`).
///
/// This does not allocate memory: the output is either subtendrils of the input,
/// or inline tendrils for a single code point.
pub struct Utf8LossyDecoder<Sink, A = NonAtomic>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// The downstream sink that receives the decoded UTF-8 tendrils and
    /// the error notifications.
    pub inner_sink: Sink,
    /// State of a byte sequence that was still incomplete at the end of the
    /// previously processed input, if any; `None` when nothing is pending.
    incomplete: Option<utf8::Incomplete>,
    /// Zero-sized marker tying the decoder to the atomicity parameter `A`.
    marker: PhantomData<A>,
}
132
133impl<Sink, A> Utf8LossyDecoder<Sink, A>
134where
135 Sink: TendrilSink<fmt::UTF8, A>,
136 A: Atomicity,
137{
138 /// Create a new incremental UTF-8 decoder.
139 #[inline]
140 pub fn new(inner_sink: Sink) -> Self {
141 Utf8LossyDecoder {
142 inner_sink: inner_sink,
143 incomplete: None,
144 marker: PhantomData,
145 }
146 }
147}
148
/// Byte-oriented sink implementation: incoming byte tendrils are decoded as
/// UTF-8 and the resulting string tendrils are pushed to `inner_sink`.
impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Decode one chunk of bytes and forward the result to the inner sink.
    ///
    /// Valid runs are forwarded as tendrils. Every invalid sequence is
    /// reported through the inner sink's `error` and replaced by
    /// `utf8::REPLACEMENT_CHARACTER`. A sequence that is still incomplete at
    /// the end of `t` is remembered in `self.incomplete` and resumed on the
    /// next call.
    #[inline]
    fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) {
        // First, try to finish a sequence left pending by a previous call.
        // FIXME: remove take() and map() when non-lexical borrows are stable.
        if let Some(mut incomplete) = self.incomplete.take() {
            let resume_at = incomplete.try_complete(&t).map(|(result, rest)| {
                match result {
                    Ok(s) => self.inner_sink.process(Tendril::from_slice(s)),
                    Err(_) => {
                        self.inner_sink.error("invalid byte sequence".into());
                        self.inner_sink
                            .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
                    }
                }
                // Number of bytes of `t` consumed while completing the sequence.
                t.len() - rest.len()
            });
            match resume_at {
                None => {
                    // Still not resolved with this input: keep the pending
                    // state and wait for more bytes.
                    self.incomplete = Some(incomplete);
                    return;
                }
                // Skip the bytes that were consumed above.
                Some(resume_at) => t.pop_front(resume_at as u32),
            }
        }
        while !t.is_empty() {
            // Reduce the decode result to plain lengths (plus the owned
            // incomplete state) so that no borrow of `t` outlives this
            // `match`; `t` is mutated and moved below.
            let unborrowed_result = match utf8::decode(&t) {
                Ok(s) => {
                    debug_assert!(s.as_ptr() == t.as_ptr());
                    debug_assert!(s.len() == t.len());
                    Ok(())
                }
                Err(utf8::DecodeError::Invalid {
                    valid_prefix,
                    invalid_sequence,
                    ..
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
                    debug_assert!(valid_prefix.len() <= t.len());
                    Err((
                        valid_prefix.len(),
                        // Offset at which decoding resumes: just past the
                        // invalid sequence.
                        Err(valid_prefix.len() + invalid_sequence.len()),
                    ))
                }
                Err(utf8::DecodeError::Incomplete {
                    valid_prefix,
                    incomplete_suffix,
                }) => {
                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
                    debug_assert!(valid_prefix.len() <= t.len());
                    Err((valid_prefix.len(), Ok(incomplete_suffix)))
                }
            };
            match unborrowed_result {
                Ok(()) => {
                    // SAFETY: `utf8::decode` accepted the input above; the
                    // debug assertions check that the decoded string spans
                    // all of `t`.
                    unsafe { self.inner_sink.process(t.reinterpret_without_validating()) }
                    return;
                }
                Err((valid_len, and_then)) => {
                    if valid_len > 0 {
                        let subtendril = t.subtendril(0, valid_len as u32);
                        // SAFETY: the first `valid_len` bytes are the valid
                        // prefix reported by `utf8::decode` above.
                        unsafe {
                            self.inner_sink
                                .process(subtendril.reinterpret_without_validating())
                        }
                    }
                    match and_then {
                        Ok(incomplete) => {
                            // The input ends in the middle of a sequence:
                            // remember it for the next call (or `finish`).
                            self.incomplete = Some(incomplete);
                            return;
                        }
                        Err(offset) => {
                            self.inner_sink.error("invalid byte sequence".into());
                            self.inner_sink
                                .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
                            // Drop the valid prefix and the invalid sequence,
                            // then keep decoding the remainder.
                            t.pop_front(offset as u32);
                        }
                    }
                }
            }
        }
    }

    /// Forward an error description unchanged to the inner sink.
    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        self.inner_sink.error(desc);
    }

    /// The output is whatever the inner sink produces.
    type Output = Sink::Output;

    /// Finish the stream and return the inner sink's output.
    ///
    /// If a byte sequence is still pending, it is reported as an error and
    /// replaced by `utf8::REPLACEMENT_CHARACTER` before the inner sink is
    /// finished.
    #[inline]
    fn finish(mut self) -> Sink::Output {
        if self.incomplete.is_some() {
            self.inner_sink
                .error("incomplete byte sequence at end of stream".into());
            self.inner_sink
                .process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
        }
        self.inner_sink.finish()
    }
}
253
/// A `TendrilSink` adaptor that takes bytes, decodes them as the given character encoding,
/// lossily replaces ill-formed byte sequences with U+FFFD replacement characters,
/// and emits Unicode (`StrTendril`).
///
/// This allocates new tendrils for encodings other than UTF-8.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
pub struct LossyDecoder<Sink, A = NonAtomic>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// The concrete decoding backend, paired with the downstream sink.
    inner: LossyDecoderInner<Sink, A>,
}
267
/// Decoding backend used by `LossyDecoder`.
///
/// Every variant owns the downstream sink, either directly or through the
/// wrapped `Utf8LossyDecoder`.
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
enum LossyDecoderInner<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// UTF-8 input, handled by the dedicated `Utf8LossyDecoder`.
    Utf8(Utf8LossyDecoder<Sink, A>),
    /// Any other encoding, decoded with a raw decoder from the `encoding` crate.
    // `dyn` makes the trait object explicit; the bare `Box<Trait>` form is
    // deprecated and rejected by newer editions. The type is unchanged.
    #[cfg(feature = "encoding")]
    Encoding(Box<dyn encoding::RawDecoder>, Sink),
    /// Any other encoding, decoded with a decoder from the `encoding_rs` crate.
    #[cfg(feature = "encoding_rs")]
    EncodingRs(encoding_rs::Decoder, Sink),
}
280
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Build an incremental decoder backed by the `encoding` crate.
    ///
    /// An encoding whose name is `"utf-8"` is routed to the dedicated UTF-8
    /// decoder; every other encoding uses its raw decoder.
    #[cfg(feature = "encoding")]
    #[inline]
    pub fn new(encoding: encoding::EncodingRef, sink: Sink) -> Self {
        match encoding.name() {
            "utf-8" => LossyDecoder::utf8(sink),
            _ => {
                let raw = encoding.raw_decoder();
                LossyDecoder {
                    inner: LossyDecoderInner::Encoding(raw, sink),
                }
            }
        }
    }

    /// Build an incremental decoder backed by the `encoding_rs` crate.
    ///
    /// `encoding_rs::UTF_8` is routed to the dedicated UTF-8 decoder; every
    /// other encoding gets a fresh `encoding_rs` decoder.
    #[cfg(feature = "encoding_rs")]
    #[inline]
    pub fn new_encoding_rs(encoding: &'static encoding_rs::Encoding, sink: Sink) -> Self {
        let inner = if encoding == encoding_rs::UTF_8 {
            LossyDecoderInner::Utf8(Utf8LossyDecoder::new(sink))
        } else {
            LossyDecoderInner::EncodingRs(encoding.new_decoder(), sink)
        };
        LossyDecoder { inner }
    }

    /// Build an incremental decoder for the UTF-8 encoding.
    ///
    /// This is useful for content that is known at run-time to be UTF-8
    /// (whereas `Utf8LossyDecoder` requires knowing at compile-time.)
    #[inline]
    pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> {
        let decoder = Utf8LossyDecoder::new(sink);
        LossyDecoder {
            inner: LossyDecoderInner::Utf8(decoder),
        }
    }

    /// Borrow the inner sink, whichever backend is in use.
    pub fn inner_sink(&self) -> &Sink {
        match &self.inner {
            LossyDecoderInner::Utf8(decoder) => &decoder.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, sink) => sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, sink) => sink,
        }
    }

    /// Mutably borrow the inner sink, whichever backend is in use.
    pub fn inner_sink_mut(&mut self) -> &mut Sink {
        match &mut self.inner {
            LossyDecoderInner::Utf8(decoder) => &mut decoder.inner_sink,
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, sink) => sink,
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, sink) => sink,
        }
    }
}
345
#[cfg(any(feature = "encoding", feature = "encoding_rs"))]
impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A>
where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    /// Decode one chunk of bytes with the configured backend and push the
    /// decoded text to the inner sink.
    #[inline]
    fn process(&mut self, t: Tendril<fmt::Bytes, A>) {
        match &mut self.inner {
            LossyDecoderInner::Utf8(inner) => inner.process(t),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(decoder, sink) => {
                let mut remaining = t;
                let mut decoded = Tendril::new();
                // Each reported error yields one replacement character; decoding
                // then resumes at the offset the decoder says it got up to.
                while let (_, Some(err)) = decoder.raw_feed(&*remaining, &mut decoded) {
                    decoded.push_char('\u{fffd}');
                    sink.error(err.cause);
                    debug_assert!(err.upto >= 0);
                    remaining.pop_front(err.upto as u32);
                }
                if !decoded.is_empty() {
                    sink.process(decoded);
                }
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(decoder, sink) => {
                // Empty input is skipped entirely.
                if !t.is_empty() {
                    decode_to_sink(t, decoder, sink, false);
                }
            }
        }
    }

    /// Pass an error description through to the inner sink.
    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        match &mut self.inner {
            LossyDecoderInner::Utf8(inner) => inner.error(desc),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(_, sink) => sink.error(desc),
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(_, sink) => sink.error(desc),
        }
    }

    /// The output is whatever the inner sink produces.
    type Output = Sink::Output;

    /// Flush the decoder, then finish the inner sink and return its output.
    #[inline]
    fn finish(self) -> Sink::Output {
        match self.inner {
            LossyDecoderInner::Utf8(inner) => inner.finish(),
            #[cfg(feature = "encoding")]
            LossyDecoderInner::Encoding(mut decoder, mut sink) => {
                let mut tail = Tendril::new();
                let failure = decoder.raw_finish(&mut tail);
                if let Some(err) = failure {
                    tail.push_char('\u{fffd}');
                    sink.error(err.cause);
                }
                if !tail.is_empty() {
                    sink.process(tail);
                }
                sink.finish()
            }
            #[cfg(feature = "encoding_rs")]
            LossyDecoderInner::EncodingRs(mut decoder, mut sink) => {
                // An empty final chunk with `last = true` flushes the decoder.
                let nothing = Tendril::new();
                decode_to_sink(nothing, &mut decoder, &mut sink, true);
                sink.finish()
            }
        }
    }
}
423
/// Decode `t` with an `encoding_rs` decoder and push the result to `sink`.
///
/// The input is decoded in pieces into scratch buffers of at most 8192 bytes.
/// Each non-empty piece of output is forwarded to `sink.process`. Every
/// malformed sequence is reported through `sink.error` and replaced by
/// U+FFFD. `last` is passed to the decoder unchanged to tell it whether this
/// is the final input of the stream.
#[cfg(feature = "encoding_rs")]
fn decode_to_sink<Sink, A>(
    mut t: Tendril<fmt::Bytes, A>,
    decoder: &mut encoding_rs::Decoder,
    sink: &mut Sink,
    last: bool,
) where
    Sink: TendrilSink<fmt::UTF8, A>,
    A: Atomicity,
{
    // Upper bound on the size of each scratch output buffer.
    const MAX_CHUNK: usize = 8192;
    loop {
        let mut out = <Tendril<fmt::Bytes, A>>::new();
        let max_len = decoder
            .max_utf8_buffer_length_without_replacement(t.len())
            .unwrap_or(MAX_CHUNK);
        // Clamp in `usize` BEFORE narrowing to `u32`. Casting first would
        // truncate a `max_len` above `u32::MAX`, possibly down to a tiny or
        // zero-sized buffer in which the decoder can make no progress.
        let out_len = std::cmp::min(max_len, MAX_CHUNK) as u32;
        // NOTE(review): the scratch buffer is left uninitialized on the
        // assumption that the decoder only writes to it; only the first
        // `bytes_written` bytes are forwarded below.
        unsafe {
            out.push_uninitialized(out_len);
        }
        let (result, bytes_read, bytes_written) =
            decoder.decode_to_utf8_without_replacement(&t, &mut out, last);
        if bytes_written > 0 {
            // The decoder produced these bytes as UTF-8, so they are
            // forwarded without re-validation.
            sink.process(unsafe {
                out.subtendril(0, bytes_written as u32)
                    .reinterpret_without_validating()
            });
        }
        match result {
            DecoderResult::InputEmpty => return,
            // Output buffer exhausted: loop again with a fresh buffer.
            DecoderResult::OutputFull => {}
            DecoderResult::Malformed(_, _) => {
                sink.error(Cow::Borrowed("invalid sequence"));
                sink.process("\u{FFFD}".into());
            }
        }
        // Drop the bytes the decoder consumed and continue with the rest.
        t.pop_front(bytes_read as u32);
        if t.is_empty() {
            return;
        }
    }
}
464
// Unit tests for the decoders and for `TendrilSink::read_from`.
#[cfg(test)]
mod test {
    use super::{TendrilSink, Utf8LossyDecoder};
    use fmt;
    use std::borrow::Cow;
    use tendril::{Atomicity, NonAtomic, Tendril};

    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use super::LossyDecoder;
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    use tendril::SliceExt;

    #[cfg(feature = "encoding")]
    use encoding::all as enc;
    #[cfg(feature = "encoding_rs")]
    use encoding_rs as enc_rs;

    /// Test sink that records every tendril and every error description it receives.
    struct Accumulate<A>
    where
        A: Atomicity,
    {
        // Tendrils received through `process`, in arrival order.
        tendrils: Vec<Tendril<fmt::UTF8, A>>,
        // Error descriptions received through `error`, in arrival order.
        errors: Vec<String>,
    }

    impl<A> Accumulate<A>
    where
        A: Atomicity,
    {
        /// Create an empty accumulator.
        fn new() -> Accumulate<A> {
            Accumulate {
                tendrils: vec![],
                errors: vec![],
            }
        }
    }

    impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A>
    where
        A: Atomicity,
    {
        fn process(&mut self, t: Tendril<fmt::UTF8, A>) {
            self.tendrils.push(t);
        }

        fn error(&mut self, desc: Cow<'static, str>) {
            self.errors.push(desc.into_owned());
        }

        // Both recorded lists are handed back to the test.
        type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>);

        fn finish(self) -> Self::Output {
            (self.tendrils, self.errors)
        }
    }

    /// Feed `input` chunk by chunk through a `Utf8LossyDecoder` and check
    /// that the emitted tendrils equal `expected` (one entry per tendril)
    /// and that exactly `errs` errors were reported.
    fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) {
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let (tendrils, errors) = decoder.from_iter(input.iter().cloned());
        assert_eq!(
            expected,
            &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>()
        );
        assert_eq!(errs, errors.len());
    }

    #[test]
    fn utf8() {
        check_utf8(&[], &[], 0);
        check_utf8(&[b""], &[], 0);
        check_utf8(&[b"xyz"], &["xyz"], 0);
        check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0);

        // A multi-byte character split across chunk boundaries.
        check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0);
        check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(
            &[b"xy\xEA", b"\x99", b"\xAEzw"],
            &["xy", "\u{a66e}z", "w"],
            0,
        );
        check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0);
        check_utf8(
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            &["\u{a66e}"],
            0,
        );

        // Invalid bytes in the middle of a split sequence.
        check_utf8(
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"],
            4,
        );
        check_utf8(
            &[b"xy\xEA\x99", b"\xFFz"],
            &["xy", "\u{fffd}", "\u{fffd}", "z"],
            2,
        );

        check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0);
        check_utf8(
            &[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"],
            &["ő", "ő", "ő"],
            0,
        );
        check_utf8(
            &[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"],
            &["ő", "\u{fffd}", "\u{fffd}", "ő"],
            2,
        );

        // incomplete char at end of input
        check_utf8(&[b"\xC0"], &["\u{fffd}"], 1);
        check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1);
    }

    /// Feed `input` chunk by chunk through `decoder`, concatenate all emitted
    /// tendrils, and check that the result equals `expected` and that exactly
    /// `errs` errors were reported.
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    fn check_decode(
        mut decoder: LossyDecoder<Accumulate<NonAtomic>>,
        input: &[&[u8]],
        expected: &str,
        errs: usize,
    ) {
        for x in input {
            decoder.process(x.to_tendril());
        }
        let (tendrils, errors) = decoder.finish();
        let mut tendril: Tendril<fmt::UTF8> = Tendril::new();
        for t in tendrils {
            tendril.push_tendril(&t);
        }
        assert_eq!(expected, &*tendril);
        assert_eq!(errs, errors.len());
    }

    /// Table of test cases: (input chunks, expected decoded text, expected error count).
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)];

    // Cases for the ASCII decoder of the `encoding` crate.
    #[cfg(any(feature = "encoding"))]
    const ASCII: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"xy", b"", b"", b"z"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xFF"], "\u{fffd}", 1),
        (&[b"x\xC0yz"], "x\u{fffd}yz", 1),
        (&[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1),
        (&[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_ascii() {
        for &(input, expected, errs) in ASCII {
            let decoder = LossyDecoder::new(enc::ASCII, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    // UTF-8 cases, shared by the `encoding` and `encoding_rs` tests.
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const UTF_8: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xEA\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0),
        (
            &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""],
            "\u{a66e}",
            0,
        ),
        (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0),
        (
            &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
            "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z",
            4,
        ),
        (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2),
        // incomplete char at end of input
        (&[b"\xC0"], "\u{fffd}", 1),
        (&[b"\xEA\x99"], "\u{fffd}", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_utf8() {
        for &(input, expected, errs) in UTF_8 {
            let decoder = LossyDecoder::new(enc::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_utf8_encoding_rs() {
        for &(input, expected, errs) in UTF_8 {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::UTF_8, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    // KOI8-U cases, shared by the `encoding` and `encoding_rs` tests.
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const KOI8_U: Tests = &[
        (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0),
        (
            &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""],
            "Энергия",
            0,
        ),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_koi8_u() {
        for &(input, expected, errs) in KOI8_U {
            let decoder = LossyDecoder::new(enc::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_koi8_u_encoding_rs() {
        for &(input, expected, errs) in KOI8_U {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::KOI8_U, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    // Windows-949 cases; the `encoding_rs` test runs them against `EUC_KR`.
    #[cfg(any(feature = "encoding", feature = "encoding_rs"))]
    const WINDOWS_949: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0),
        (
            &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"],
            "안녕하세요",
            0,
        ),
        (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1),
        (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1),
        (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1),
    ];

    #[cfg(feature = "encoding")]
    #[test]
    fn decode_windows_949() {
        for &(input, expected, errs) in WINDOWS_949 {
            let decoder = LossyDecoder::new(enc::WINDOWS_949, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    #[cfg(feature = "encoding_rs")]
    #[test]
    fn decode_windows_949_encoding_rs() {
        for &(input, expected, errs) in WINDOWS_949 {
            let decoder = LossyDecoder::new_encoding_rs(enc_rs::EUC_KR, Accumulate::new());
            check_decode(decoder, input, expected, errs);
        }
    }

    // `read_from` driven by an in-memory byte slice used as the reader.
    #[test]
    fn read_from() {
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let mut bytes: &[u8] = b"foo\xffbar";
        let (tendrils, errors) = decoder.read_from(&mut bytes).unwrap();
        assert_eq!(
            &*tendrils.iter().map(|t| &**t).collect::<Vec<_>>(),
            &["foo", "\u{FFFD}", "bar"]
        );
        assert_eq!(errors, &["invalid byte sequence"]);
    }
}
753