1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10#![cfg_attr(
11 feature = "cargo-clippy",
12 allow(doc_markdown, inline_always, new_ret_no_self)
13)]
14
15//! encoding_rs is a Gecko-oriented Free Software / Open Source implementation
16//! of the [Encoding Standard](https://encoding.spec.whatwg.org/) in Rust.
17//! Gecko-oriented means that converting to and from UTF-16 is supported in
18//! addition to converting to and from UTF-8, that the performance and
19//! streamability goals are browser-oriented, and that FFI-friendliness is a
20//! goal.
21//!
22//! Additionally, the `mem` module provides functions that are useful for
23//! applications that need to be able to deal with legacy in-memory
24//! representations of Unicode.
25//!
26//! For expectation setting, please be sure to read the sections
27//! [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes),
28//! [_ISO-8859-1_](#iso-8859-1) and [_Web / Browser Focus_](#web--browser-focus) below.
29//!
30//! There is a [long-form write-up](https://hsivonen.fi/encoding_rs/) about the
31//! design and internals of the crate.
32//!
33//! # Availability
34//!
35//! The code is available under the
36//! [Apache license, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0)
37//! or the [MIT license](https://opensource.org/licenses/MIT), at your option.
38//! See the
39//! [`COPYRIGHT`](https://github.com/hsivonen/encoding_rs/blob/master/COPYRIGHT)
40//! file for details.
41//! The [repository is on GitHub](https://github.com/hsivonen/encoding_rs). The
42//! [crate is available on crates.io](https://crates.io/crates/encoding_rs).
43//!
44//! # Integration with `std::io`
45//!
46//! This crate doesn't implement traits from `std::io`. However, the case of
47//! wrapping a `std::io::Read` in a decoder that implements `std::io::Read` and
48//! presents the data from the wrapped `std::io::Read` as UTF-8 is addressed by
49//! the [`encoding_rs_io`](https://docs.rs/encoding_rs_io/) crate.
50//!
51//! # Examples
52//!
53//! Example programs:
54//!
55//! * [Rust](https://github.com/hsivonen/recode_rs)
56//! * [C](https://github.com/hsivonen/recode_c)
57//! * [C++](https://github.com/hsivonen/recode_cpp)
58//!
59//! Decode using the non-streaming API:
60//!
61//! ```
62//! #[cfg(feature = "alloc")] {
63//! use encoding_rs::*;
64//!
65//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
66//! let bytes = b"\x83n\x83\x8D\x81[\x81E\x83\x8F\x81[\x83\x8B\x83h";
67//!
68//! let (cow, encoding_used, had_errors) = SHIFT_JIS.decode(bytes);
69//! assert_eq!(&cow[..], expectation);
70//! assert_eq!(encoding_used, SHIFT_JIS);
71//! assert!(!had_errors);
72//! }
73//! ```
74//!
75//! Decode using the streaming API with minimal `unsafe`:
76//!
77//! ```
78//! use encoding_rs::*;
79//!
80//! let expectation = "\u{30CF}\u{30ED}\u{30FC}\u{30FB}\u{30EF}\u{30FC}\u{30EB}\u{30C9}";
81//!
82//! // Use an array of byte slices to demonstrate content arriving piece by
83//! // piece from the network.
84//! let bytes: [&'static [u8]; 4] = [b"\x83",
85//! b"n\x83\x8D\x81",
86//! b"[\x81E\x83\x8F\x81[\x83",
87//! b"\x8B\x83h"];
88//!
89//! // Very short output buffer to demonstrate the output buffer getting full.
90//! // Normally, you'd use something like `[0u8; 2048]`.
91//! let mut buffer_bytes = [0u8; 8];
92//! let mut buffer: &mut str = std::str::from_utf8_mut(&mut buffer_bytes[..]).unwrap();
93//!
94//! // How many bytes in the buffer currently hold significant data.
95//! let mut bytes_in_buffer = 0usize;
96//!
97//! // Collect the output to a string for demonstration purposes.
98//! let mut output = String::new();
99//!
100//! // The `Decoder`
101//! let mut decoder = SHIFT_JIS.new_decoder();
102//!
103//! // Track whether we see errors.
104//! let mut total_had_errors = false;
105//!
106//! // Decode using a fixed-size intermediate buffer (for demonstrating the
107//! // use of a fixed-size buffer; normally when the output of an incremental
108//! // decode goes to a `String` one would use `Decoder.decode_to_string()` to
109//! // avoid the intermediate buffer).
110//! for input in &bytes[..] {
111//! // The number of bytes already read from current `input` in total.
112//! let mut total_read_from_current_input = 0usize;
113//!
114//! loop {
115//! let (result, read, written, had_errors) =
116//! decoder.decode_to_str(&input[total_read_from_current_input..],
117//! &mut buffer[bytes_in_buffer..],
118//! false);
119//! total_read_from_current_input += read;
120//! bytes_in_buffer += written;
121//! total_had_errors |= had_errors;
122//! match result {
123//! CoderResult::InputEmpty => {
124//! // We have consumed the current input buffer. Break out of
125//! // the inner loop to get the next input buffer from the
126//! // outer loop.
127//! break;
128//! },
129//! CoderResult::OutputFull => {
130//! // Write the current buffer out and consider the buffer
131//! // empty.
132//! output.push_str(&buffer[..bytes_in_buffer]);
133//! bytes_in_buffer = 0usize;
134//! continue;
135//! }
136//! }
137//! }
138//! }
139//!
140//! // Process EOF
141//! loop {
142//! let (result, _, written, had_errors) =
143//! decoder.decode_to_str(b"",
144//! &mut buffer[bytes_in_buffer..],
145//! true);
146//! bytes_in_buffer += written;
147//! total_had_errors |= had_errors;
148//! // Write the current buffer out and consider the buffer empty.
149//! // Need to do this here for both `match` arms, because we exit the
150//! // loop on `CoderResult::InputEmpty`.
151//! output.push_str(&buffer[..bytes_in_buffer]);
152//! bytes_in_buffer = 0usize;
153//! match result {
154//! CoderResult::InputEmpty => {
155//! // Done!
156//! break;
157//! },
158//! CoderResult::OutputFull => {
159//! continue;
160//! }
161//! }
162//! }
163//!
164//! assert_eq!(&output[..], expectation);
165//! assert!(!total_had_errors);
166//! ```
167//!
168//! ## UTF-16LE, UTF-16BE and Unicode Encoding Schemes
169//!
170//! The Encoding Standard doesn't specify encoders for UTF-16LE and UTF-16BE,
171//! __so this crate does not provide encoders for those encodings__!
172//! Along with the replacement encoding, their _output encoding_ (i.e. the
173//! encoding used for form submission and error handling in the query string
174//! of URLs) is UTF-8, so you get a UTF-8 encoder if you request an encoder
175//! for them.
176//!
177//! Additionally, the Encoding Standard factors BOM handling into wrapper
178//! algorithms so that BOM handling isn't part of the definition of the
179//! encodings themselves. The Unicode _encoding schemes_ in the Unicode
180//! Standard define BOM handling or lack thereof as part of the encoding
181//! scheme.
182//!
183//! When used with the `_without_bom_handling` entry points, the UTF-16LE
184//! and UTF-16BE _encodings_ match the same-named _encoding schemes_ from
185//! the Unicode Standard.
186//!
187//! When used with the `_with_bom_removal` entry points, the UTF-8
188//! _encoding_ matches the UTF-8 _encoding scheme_ from the Unicode
189//! Standard.
190//!
191//! This crate does not provide a mode that matches the UTF-16 _encoding
192//! scheme_ from the Unicode Standard. The UTF-16BE encoding used with
193//! the entry points without `_bom_` qualifiers is the closest match,
194//! but in that case, the UTF-8 BOM triggers UTF-8 decoding, which is
195//! not part of the behavior of the UTF-16 _encoding scheme_ per the
196//! Unicode Standard.
197//!
198//! The UTF-32 family of Unicode encoding schemes is not supported
199//! by this crate. The Encoding Standard doesn't define any UTF-32
200//! family encodings, since they aren't necessary for consuming Web
201//! content.
202//!
203//! While gb18030 is capable of representing U+FEFF, the Encoding
204//! Standard does not treat the gb18030 byte representation of U+FEFF
205//! as a BOM, so neither does this crate.
206//!
207//! ## ISO-8859-1
208//!
209//! ISO-8859-1 does not exist as a distinct encoding from windows-1252 in
210//! the Encoding Standard. Therefore, an encoding that maps the unsigned
211//! byte value to the same Unicode scalar value is not available via
212//! `Encoding` in this crate.
213//!
214//! However, the functions whose name starts with `convert` and contains
215//! `latin1` in the `mem` module support such conversions, which are known as
216//! [_isomorphic decode_](https://infra.spec.whatwg.org/#isomorphic-decode)
217//! and [_isomorphic encode_](https://infra.spec.whatwg.org/#isomorphic-encode)
218//! in the [Infra Standard](https://infra.spec.whatwg.org/).
219//!
220//! ## Web / Browser Focus
221//!
222//! Both in terms of scope and performance, the focus is on the Web. For scope,
223//! this means that encoding_rs implements the Encoding Standard fully and
224//! doesn't implement encodings that are not specified in the Encoding
225//! Standard. For performance, this means that decoding performance is
226//! important as well as performance for encoding into UTF-8 or encoding the
227//! Basic Latin range (ASCII) into legacy encodings. Non-Basic Latin needs to
228//! be encoded into legacy encodings in only two places in the Web platform: in
229//! the query part of URLs, in which case it's a matter of relatively rare
230//! error handling, and in form submission, in which case the user action and
231//! networking tend to hide the performance of the encoder.
232//!
233//! Deemphasizing performance of encoding non-Basic Latin text into legacy
234//! encodings enables smaller code size thanks to the encoder side using the
235//! decode-optimized data tables without having encode-optimized data tables at
236//! all. Even in decoders, smaller lookup table size is preferred over avoiding
237//! multiplication operations.
238//!
239//! Additionally, performance is a non-goal for the ASCII-incompatible
240//! ISO-2022-JP encoding, which is rarely used on the Web. Instead of
241//! performance, the decoder for ISO-2022-JP optimizes for ease/clarity
242//! of implementation.
243//!
244//! Despite the browser focus, the hope is that non-browser applications
245//! that wish to consume Web content or submit Web forms in a Web-compatible
246//! way will find encoding_rs useful. While encoding_rs does not try to match
247//! Windows behavior, many of the encodings are close enough to legacy
248//! encodings implemented by Windows that applications that need to consume
249//! data in legacy Windows encodings may find encoding_rs useful. The
250//! [codepage](https://crates.io/crates/codepage) crate maps from Windows
251//! code page identifiers onto encoding_rs `Encoding`s and vice versa.
252//!
253//! For decoding email, UTF-7 support is needed (unfortunately) in addition
254//! to the encodings defined in the Encoding Standard. The
255//! [charset](https://crates.io/crates/charset) crate wraps encoding_rs and adds
256//! UTF-7 decoding for email purposes.
257//!
258//! For single-byte DOS encodings beyond the ones supported by the Encoding
259//! Standard, there is the [`oem_cp`](https://crates.io/crates/oem_cp) crate.
260//!
261//! # Preparing Text for the Encoders
262//!
263//! Normalizing text into Unicode Normalization Form C prior to encoding text
264//! into a legacy encoding minimizes unmappable characters. Text can be
265//! normalized to Unicode Normalization Form C using the
266//! [`icu_normalizer`](https://crates.io/crates/icu_normalizer) crate, which
267//! is part of [ICU4X](https://icu4x.unicode.org/).
268//!
269//! The exception is windows-1258, which after normalizing to Unicode
270//! Normalization Form C requires tone marks to be decomposed in order to
271//! minimize unmappable characters. Vietnamese tone marks can be decomposed
272//! using the [`detone`](https://crates.io/crates/detone) crate.
273//!
274//! # Streaming & Non-Streaming; Rust & C/C++
275//!
276//! The API in Rust has two modes of operation: streaming and non-streaming.
277//! The streaming API is the foundation of the implementation and should be
278//! used when processing data that arrives piecemeal from an i/o stream. The
279//! streaming API has an FFI wrapper (as a [separate crate][1]) that exposes it
280//! to C callers. The non-streaming part of the API is for Rust callers only and
281//! is smart about borrowing instead of copying when possible. When
282//! streamability is not needed, the non-streaming API should be preferred in
283//! order to avoid copying data when a borrow suffices.
284//!
285//! There is no analogous C API exposed via FFI, mainly because C doesn't have
286//! standard types for growable byte buffers and Unicode strings that know
287//! their length.
288//!
289//! The C API (header file generated at `target/include/encoding_rs.h` when
290//! building encoding_rs) can, in turn, be wrapped for use from C++. Such a
291//! C++ wrapper can re-create the non-streaming API in C++ for C++ callers.
292//! The C binding comes with a [C++17 wrapper][2] that uses standard library +
293//! [GSL][3] types and that recreates the non-streaming API in C++ on top of
294//! the streaming API. A C++ wrapper with XPCOM/MFBT types is available as
295//! [`mozilla::Encoding`][4].
296//!
297//! The `Encoding` type is common to both the streaming and non-streaming
298//! modes. In the streaming mode, decoding operations are performed with a
299//! `Decoder` and encoding operations with an `Encoder` object obtained via
300//! `Encoding`. In the non-streaming mode, decoding and encoding operations are
301//! performed using methods on `Encoding` objects themselves, so the `Decoder`
302//! and `Encoder` objects are not used at all.
303//!
304//! [1]: https://github.com/hsivonen/encoding_c
305//! [2]: https://github.com/hsivonen/encoding_c/blob/master/include/encoding_rs_cpp.h
306//! [3]: https://github.com/Microsoft/GSL/
307//! [4]: https://searchfox.org/mozilla-central/source/intl/Encoding.h
308//!
309//! # Memory management
310//!
311//! The streaming mode never performs heap allocations (even the methods
312//! that write into a `Vec<u8>` or a `String` by taking them as arguments do
313//! not reallocate the backing buffer of the `Vec<u8>` or the `String`). That
314//! is, the streaming mode uses caller-allocated buffers exclusively.
315//!
316//! The methods of the non-streaming mode that return a `Vec<u8>` or a `String`
317//! perform heap allocations but only to allocate the backing buffer of the
318//! `Vec<u8>` or the `String`.
319//!
320//! `Encoding` is always statically allocated. `Decoder` and `Encoder` need no
321//! `Drop` cleanup.
322//!
323//! # Buffer reading and writing behavior
324//!
325//! Based on experience gained with the `java.nio.charset` encoding converter
326//! API and with the Gecko uconv encoding converter API, the buffer reading
327//! and writing behaviors of encoding_rs are asymmetric: input buffers are
328//! fully drained but output buffers are not always fully filled.
329//!
330//! When reading from an input buffer, encoding_rs always consumes all input
331//! up to the next error or to the end of the buffer. In particular, when
332//! decoding, even if the input buffer ends in the middle of a byte sequence
333//! for a character, the decoder consumes all input. This has the benefit that
334//! the caller of the API can always fill the next buffer from the start from
335//! whatever source the bytes come from and never has to first copy the last
336//! bytes of the previous buffer to the start of the next buffer. However, when
337//! encoding, the UTF-8 input buffers have to end at a character boundary, which
338//! is a requirement for the Rust `str` type anyway, and UTF-16 input buffer
339//! boundaries falling in the middle of a surrogate pair result in both
340//! surrogates being treated individually as unpaired surrogates.
341//!
342//! Additionally, decoders guarantee that they can be fed even one byte at a
343//! time and encoders guarantee that they can be fed even one code point at a
344//! time. This has the benefit of not placing restrictions on the size of
345//! chunks in which the content arrives, e.g., from the network.
346//!
347//! When writing into an output buffer, encoding_rs makes sure that the code
348//! unit sequence for a character is never split across output buffer
349//! boundaries. This may result in wasted space at the end of an output buffer,
350//! but the advantages are that the output side of both decoders and encoders
351//! is greatly simplified compared to designs that attempt to fill output
352//! buffers exactly even when that entails splitting a code unit sequence and
353//! when encoding_rs methods return to the caller, the output produced thus
354//! far is always valid taken as a whole. (In the case of encoding to ISO-2022-JP,
355//! the output needs to be considered as a whole, because the latest output
356//! buffer taken alone might not be valid if the transition away
357//! from the ASCII state occurred in an earlier output buffer. However, since
358//! the ISO-2022-JP decoder doesn't treat streams that don't end in the ASCII
359//! state as being in error despite the encoder generating a transition to the
360//! ASCII state at the end, the claim about the partial output taken as a whole
361//! being valid is true even for ISO-2022-JP.)
362//!
363//! # Error Reporting
364//!
365//! Based on experience gained with the `java.nio.charset` encoding converter
366//! API and with the Gecko uconv encoding converter API, the error reporting
367//! behaviors of encoding_rs are asymmetric: decoder errors include offsets
368//! that leave it up to the caller to extract the erroneous bytes from the
369//! input stream if the caller wishes to do so but encoder errors provide the
370//! code point associated with the error without requiring the caller to
371//! extract it from the input on its own.
372//!
373//! On the encoder side, an error is always triggered by the most recently
374//! pushed Unicode scalar, which makes it simple to pass the `char` to the
375//! caller. Also, it's very typical for the caller to wish to do something with
376//! this data: generate a numeric escape for the character. Additionally, the
377//! ISO-2022-JP encoder reports U+FFFD instead of the actual input character in
378//! certain cases, so requiring the caller to extract the character from the
379//! input buffer would require the caller to handle ISO-2022-JP details.
380//! Furthermore, requiring the caller to extract the character from the input
381//! buffer would require the caller to implement UTF-8 or UTF-16 math, which is
382//! the job of an encoding conversion library.
383//!
384//! On the decoder side, errors are triggered in more complex ways. For
385//! example, when decoding the sequence ESC, '$', _buffer boundary_, 'A' as
386//! ISO-2022-JP, the ESC byte is in error, but this is discovered only after
387//! the buffer boundary when processing 'A'. Thus, the bytes in error might not
388//! be the ones most recently pushed to the decoder and the error might not even
389//! be in the current buffer.
390//!
391//! Some encoding conversion APIs address the problem by not acknowledging
392//! trailing bytes of an input buffer as consumed if it's still possible for
393//! future bytes to cause the trailing bytes to be in error. This way, error
394//! reporting can always refer to the most recently pushed buffer. This has the
395//! problem that the caller of the API has to copy the unconsumed trailing
396//! bytes to the start of the next buffer before being able to fill the rest
397//! of the next buffer. This is annoying, error-prone and inefficient.
398//!
399//! A possible solution would be making the decoder remember recently consumed
400//! bytes in order to be able to include a copy of the erroneous bytes when
401//! reporting an error. This has two problems: First, callers are rarely
402//! interested in the erroneous bytes, so attempts to identify them are most
403//! often just overhead anyway. Second, the rare applications that are
404//! interested typically care about the location of the error in the input
405//! stream.
406//!
407//! To keep the API convenient for common uses and the overhead low while making
408//! it possible to develop applications, such as HTML validators, that care
409//! about which bytes were in error, encoding_rs reports the length of the
410//! erroneous sequence and the number of bytes consumed after the erroneous
411//! sequence. As long as the caller doesn't discard the 6 most recent bytes,
412//! this makes it possible for callers that care about the erroneous bytes to
413//! locate them.
414//!
415//! # No Convenience API for Custom Replacements
416//!
417//! The Web Platform and, therefore, the Encoding Standard supports only one
418//! error recovery mode for decoders and only one error recovery mode for
419//! encoders. The supported error recovery mode for decoders is emitting the
420//! REPLACEMENT CHARACTER on error. The supported error recovery mode for
421//! encoders is emitting an HTML decimal numeric character reference for
422//! unmappable characters.
423//!
424//! Since encoding_rs is Web-focused, these are the only error recovery modes
425//! for which convenient support is provided. Moreover, on the decoder side,
426//! there aren't really good alternatives to emitting the REPLACEMENT CHARACTER
427//! on error (other than treating errors as fatal). In particular, simply
428//! ignoring errors is a
429//! [security problem](http://www.unicode.org/reports/tr36/#Substituting_for_Ill_Formed_Subsequences),
430//! so it would be a bad idea for encoding_rs to provide a mode that encouraged
431//! callers to ignore errors.
432//!
433//! On the encoder side, there are plausible alternatives for HTML decimal
434//! numeric character references. For example, when outputting CSS, CSS-style
435//! escapes would seem to make sense. However, instead of facilitating the
436//! output of CSS, JS, etc. in non-UTF-8 encodings, encoding_rs takes the design
437//! position that you shouldn't generate output in encodings other than UTF-8,
438//! except where backward compatibility with interacting with the legacy Web
439//! requires it. The legacy Web requires it only when parsing the query strings
440//! of URLs and when submitting forms, and those two both use HTML decimal
441//! numeric character references.
442//!
443//! While encoding_rs doesn't make encoder replacements other than HTML decimal
444//! numeric character references easy, it does make them _possible_.
445//! `encode_from_utf8()`, which emits HTML decimal numeric character references
446//! for unmappable characters, is implemented on top of
447//! `encode_from_utf8_without_replacement()`. Applications that really, really
448//! want other replacement schemes for unmappable characters can likewise
449//! implement them on top of `encode_from_utf8_without_replacement()`.
450//!
451//! # No Extensibility by Design
452//!
453//! The set of encodings supported by encoding_rs is not extensible by design.
454//! That is, `Encoding`, `Decoder` and `Encoder` are intentionally `struct`s
455//! rather than `trait`s. encoding_rs takes the design position that all future
456//! text interchange should be done using UTF-8, which can represent all of
457//! Unicode. (It is, in fact, the only encoding supported by the Encoding
458//! Standard and encoding_rs that can represent all of Unicode and that has
459//! encoder support. UTF-16LE and UTF-16BE don't have encoder support, and
460//! gb18030 cannot encode U+E5E5.) The other encodings are supported merely for
461//! legacy compatibility and not due to non-UTF-8 encodings having benefits
462//! other than being able to consume legacy content.
463//!
464//! Considering that UTF-8 can represent all of Unicode and is already supported
465//! by all Web browsers, introducing a new encoding wouldn't add to the
466//! expressiveness but would add to compatibility problems. In that sense,
467//! adding new encodings to the Web Platform doesn't make sense, and, in fact,
468//! post-UTF-8 attempts at encodings, such as BOCU-1, have been rejected from
469//! the Web Platform. On the other hand, the set of legacy encodings that must
470//! be supported for a Web browser to be able to be successful is not going to
471//! expand. Empirically, the set of encodings specified in the Encoding Standard
472//! is already sufficient and the set of legacy encodings won't grow
473//! retroactively.
474//!
475//! Since extensibility doesn't make sense considering the Web focus of
476//! encoding_rs and adding encodings to Web clients would be actively harmful,
477//! it makes sense to make the set of encodings that encoding_rs supports
478//! non-extensible and to take the (admittedly small) benefits arising from
479//! that, such as the size of `Decoder` and `Encoder` objects being known ahead
480//! of time, which enables stack allocation thereof.
481//!
482//! This does have downsides for applications that might want to put encoding_rs
483//! to non-Web uses if those non-Web uses involve legacy encodings that aren't
484//! needed for Web uses. The needs of such applications should not complicate
485//! encoding_rs itself, though. It is up to those applications to provide a
486//! framework that delegates the operations with encodings that encoding_rs
487//! supports to encoding_rs and operations with other encodings to something
488//! else (as opposed to encoding_rs itself providing an extensibility
489//! framework).
490//!
491//! # Panics
492//!
493//! Methods in encoding_rs can panic if the API is used against the requirements
494//! stated in the documentation, if a state that's supposed to be impossible
495//! is reached due to an internal bug or on integer overflow. When used
496//! according to documentation with buffer sizes that stay below integer
497//! overflow, in the absence of internal bugs, encoding_rs does not panic.
498//!
499//! Panics arising from API misuse aren't documented beyond this on individual
500//! methods.
501//!
502//! # At-Risk Parts of the API
503//!
504//! The foreseeable source of partially backward-incompatible API change is the
505//! way the instances of `Encoding` are made available.
506//!
507//! If Rust changes to allow the entries of `[&'static Encoding; N]` to be
508//! initialized with `static`s of type `&'static Encoding`, the non-reference
509//! `FOO_INIT` public `Encoding` instances will be removed from the public API.
510//!
511//! If Rust changes to make the referent of `pub const FOO: &'static Encoding`
512//! unique when the constant is used in different crates, the reference-typed
513//! `static`s for the encoding instances will be changed from `static` to
514//! `const` and the non-reference-typed `_INIT` instances will be removed.
515//!
516//! # Mapping Spec Concepts onto the API
517//!
518//! <table>
519//! <thead>
520//! <tr><th>Spec Concept</th><th>Streaming</th><th>Non-Streaming</th></tr>
521//! </thead>
522//! <tbody>
523//! <tr><td><a href="https://encoding.spec.whatwg.org/#encoding">encoding</a></td><td><code>&amp;'static Encoding</code></td><td><code>&amp;'static Encoding</code></td></tr>
524//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8">UTF-8 encoding</a></td><td><code>UTF_8</code></td><td><code>UTF_8</code></td></tr>
525//! <tr><td><a href="https://encoding.spec.whatwg.org/#concept-encoding-get">get an encoding</a></td><td><code>Encoding::for_label(<var>label</var>)</code></td><td><code>Encoding::for_label(<var>label</var>)</code></td></tr>
526//! <tr><td><a href="https://encoding.spec.whatwg.org/#name">name</a></td><td><code><var>encoding</var>.name()</code></td><td><code><var>encoding</var>.name()</code></td></tr>
527//! <tr><td><a href="https://encoding.spec.whatwg.org/#get-an-output-encoding">get an output encoding</a></td><td><code><var>encoding</var>.output_encoding()</code></td><td><code><var>encoding</var>.output_encoding()</code></td></tr>
528//! <tr><td><a href="https://encoding.spec.whatwg.org/#decode">decode</a></td><td><code>let d = <var>encoding</var>.new_decoder();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.decode(<var>src</var>)</code></td></tr>
529//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode">UTF-8 decode</a></td><td><code>let d = UTF_8.new_decoder_with_bom_removal();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_with_bom_removal(<var>src</var>)</code></td></tr>
530//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom">UTF-8 decode without BOM</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = d.decode_to_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code>UTF_8.decode_without_bom_handling(<var>src</var>)</code></td></tr>
531//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail">UTF-8 decode without BOM or fail</a></td><td><code>let d = UTF_8.new_decoder_without_bom_handling();<br>let res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, false);<br>// &hellip; (fail if malformed)<br>let last_res = d.decode_to_<var>*</var>_without_replacement(<var>src</var>, <var>dst</var>, true);<br>// (fail if malformed)</code></td><td><code>UTF_8.decode_without_bom_handling_and_without_replacement(<var>src</var>)</code></td></tr>
532//! <tr><td><a href="https://encoding.spec.whatwg.org/#encode">encode</a></td><td><code>let e = <var>encoding</var>.new_encoder();<br>let res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, false);<br>// &hellip;<br>let last_res = e.encode_from_<var>*</var>(<var>src</var>, <var>dst</var>, true);</code></td><td><code><var>encoding</var>.encode(<var>src</var>)</code></td></tr>
533//! <tr><td><a href="https://encoding.spec.whatwg.org/#utf-8-encode">UTF-8 encode</a></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// refill src<br><var>write</var>(<var>src</var>.as_bytes());<br>// &hellip;</code></td><td>Use the UTF-8 nature of Rust strings directly:<br><code><var>src</var>.as_bytes()</code></td></tr>
534//! </tbody>
535//! </table>
536//!
537//! # Compatibility with the rust-encoding API
538//!
539//! The crate
540//! [encoding_rs_compat](https://github.com/hsivonen/encoding_rs_compat/)
541//! is a drop-in replacement for rust-encoding 0.2.32 that implements (most of)
542//! the API of rust-encoding 0.2.32 on top of encoding_rs.
543//!
544//! # Mapping rust-encoding concepts to encoding_rs concepts
545//!
546//! The following table provides a mapping from rust-encoding constructs to
547//! encoding_rs ones.
548//!
549//! <table>
550//! <thead>
551//! <tr><th>rust-encoding</th><th>encoding_rs</th></tr>
552//! </thead>
553//! <tbody>
554//! <tr><td><code>encoding::EncodingRef</code></td><td><code>&amp;'static encoding_rs::Encoding</code></td></tr>
555//! <tr><td><code>encoding::all::<var>WINDOWS_31J</var></code> (not based on the WHATWG name for some encodings)</td><td><code>encoding_rs::<var>SHIFT_JIS</var></code> (always the WHATWG name uppercased and hyphens replaced with underscores)</td></tr>
556//! <tr><td><code>encoding::all::ERROR</code></td><td>Not available because not in the Encoding Standard</td></tr>
557//! <tr><td><code>encoding::all::ASCII</code></td><td>Not available because not in the Encoding Standard</td></tr>
558//! <tr><td><code>encoding::all::ISO_8859_1</code></td><td>Not available because not in the Encoding Standard</td></tr>
559//! <tr><td><code>encoding::all::HZ</code></td><td>Not available because not in the Encoding Standard</td></tr>
560//! <tr><td><code>encoding::label::encoding_from_whatwg_label(<var>string</var>)</code></td><td><code>encoding_rs::Encoding::for_label(<var>string</var>)</code></td></tr>
561//! <tr><td><code><var>enc</var>.whatwg_name()</code> (always lower case)</td><td><code><var>enc</var>.name()</code> (potentially mixed case)</td></tr>
562//! <tr><td><code><var>enc</var>.name()</code></td><td>Not available because not in the Encoding Standard</td></tr>
563//! <tr><td><code>encoding::decode(<var>bytes</var>, encoding::DecoderTrap::Replace, <var>enc</var>)</code></td><td><code><var>enc</var>.decode(<var>bytes</var>)</code></td></tr>
564//! <tr><td><code><var>enc</var>.decode(<var>bytes</var>, encoding::DecoderTrap::Replace)</code></td><td><code><var>enc</var>.decode_without_bom_handling(<var>bytes</var>)</code></td></tr>
565//! <tr><td><code><var>enc</var>.encode(<var>string</var>, encoding::EncoderTrap::NcrEscape)</code></td><td><code><var>enc</var>.encode(<var>string</var>)</code></td></tr>
566//! <tr><td><code><var>enc</var>.raw_decoder()</code></td><td><code><var>enc</var>.new_decoder_without_bom_handling()</code></td></tr>
567//! <tr><td><code><var>enc</var>.raw_encoder()</code></td><td><code><var>enc</var>.new_encoder()</code></td></tr>
568//! <tr><td><code>encoding::RawDecoder</code></td><td><code>encoding_rs::Decoder</code></td></tr>
569//! <tr><td><code>encoding::RawEncoder</code></td><td><code>encoding_rs::Encoder</code></td></tr>
570//! <tr><td><code><var>raw_decoder</var>.raw_feed(<var>src</var>, <var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(<var>src</var>.len()));<br><var>decoder</var>.decode_to_string_without_replacement(<var>src</var>, <var>dst_string</var>, false)</code></td></tr>
571//! <tr><td><code><var>raw_encoder</var>.raw_feed(<var>src</var>, <var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(<var>src</var>.len()));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement(<var>src</var>, <var>dst_vec</var>, false)</code></td></tr>
572//! <tr><td><code><var>raw_decoder</var>.raw_finish(<var>dst_string</var>)</code></td><td><code><var>dst_string</var>.reserve(<var>decoder</var>.max_utf8_buffer_length_without_replacement(0));<br><var>decoder</var>.decode_to_string_without_replacement(b"", <var>dst_string</var>, true)</code></td></tr>
573//! <tr><td><code><var>raw_encoder</var>.raw_finish(<var>dst_vec</var>)</code></td><td><code><var>dst_vec</var>.reserve(<var>encoder</var>.max_buffer_length_from_utf8_without_replacement(0));<br><var>encoder</var>.encode_from_utf8_to_vec_without_replacement("", <var>dst_vec</var>, true)</code></td></tr>
574//! <tr><td><code>encoding::DecoderTrap::Strict</code></td><td><code>decode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Malformed</code> result as fatal).</td></tr>
575//! <tr><td><code>encoding::DecoderTrap::Replace</code></td><td><code>decode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
576//! <tr><td><code>encoding::DecoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
577//! <tr><td><code>encoding::DecoderTrap::Call(DecoderTrapFunc)</code></td><td>Can be implemented using <code>decode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
578//! <tr><td><code>encoding::EncoderTrap::Strict</code></td><td><code>encode*</code> methods that have <code>_without_replacement</code> in their name (and treating the <code>Unmappable</code> result as fatal).</td></tr>
579//! <tr><td><code>encoding::EncoderTrap::Replace</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
580//! <tr><td><code>encoding::EncoderTrap::Ignore</code></td><td>It is a bad idea to ignore errors due to security issues, but this could be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
581//! <tr><td><code>encoding::EncoderTrap::NcrEscape</code></td><td><code>encode*</code> methods that <i>do not</i> have <code>_without_replacement</code> in their name.</td></tr>
582//! <tr><td><code>encoding::EncoderTrap::Call(EncoderTrapFunc)</code></td><td>Can be implemented using <code>encode*</code> methods that have <code>_without_replacement</code> in their name.</td></tr>
583//! </tbody>
584//! </table>
585//!
586//! # Relationship with Windows Code Pages
587//!
588//! Despite the Web and browser focus, the encodings defined by the Encoding
589//! Standard and implemented by this crate may be useful for decoding legacy
590//! data that uses Windows code pages. The following table names the
591//! encodings (both single-byte and multibyte)
592//! that have a closely related Windows code page, the number of the closest
593//! code page, a column indicating whether Windows maps unassigned code points
594//! to the Unicode Private Use Area instead of U+FFFD and a remark number
595//! indicating remarks in the list after the table.
596//!
597//! <table>
598//! <thead>
599//! <tr><th>Encoding</th><th>Code Page</th><th>PUA</th><th>Remarks</th></tr>
600//! </thead>
601//! <tbody>
602//! <tr><td>Shift_JIS</td><td>932</td><td></td><td></td></tr>
603//! <tr><td>GBK</td><td>936</td><td></td><td></td></tr>
604//! <tr><td>EUC-KR</td><td>949</td><td></td><td></td></tr>
605//! <tr><td>Big5</td><td>950</td><td></td><td></td></tr>
606//! <tr><td>IBM866</td><td>866</td><td></td><td></td></tr>
607//! <tr><td>windows-874</td><td>874</td><td>&bullet;</td><td></td></tr>
608//! <tr><td>UTF-16LE</td><td>1200</td><td></td><td></td></tr>
609//! <tr><td>UTF-16BE</td><td>1201</td><td></td><td></td></tr>
610//! <tr><td>windows-1250</td><td>1250</td><td></td><td></td></tr>
611//! <tr><td>windows-1251</td><td>1251</td><td></td><td></td></tr>
612//! <tr><td>windows-1252</td><td>1252</td><td></td><td></td></tr>
613//! <tr><td>windows-1253</td><td>1253</td><td>&bullet;</td><td></td></tr>
614//! <tr><td>windows-1254</td><td>1254</td><td></td><td></td></tr>
615//! <tr><td>windows-1255</td><td>1255</td><td>&bullet;</td><td></td></tr>
616//! <tr><td>windows-1256</td><td>1256</td><td></td><td></td></tr>
617//! <tr><td>windows-1257</td><td>1257</td><td>&bullet;</td><td></td></tr>
618//! <tr><td>windows-1258</td><td>1258</td><td></td><td></td></tr>
619//! <tr><td>macintosh</td><td>10000</td><td></td><td>1</td></tr>
620//! <tr><td>x-mac-cyrillic</td><td>10017</td><td></td><td>2</td></tr>
621//! <tr><td>KOI8-R</td><td>20866</td><td></td><td></td></tr>
622//! <tr><td>EUC-JP</td><td>20932</td><td></td><td></td></tr>
623//! <tr><td>KOI8-U</td><td>21866</td><td></td><td></td></tr>
624//! <tr><td>ISO-8859-2</td><td>28592</td><td></td><td></td></tr>
625//! <tr><td>ISO-8859-3</td><td>28593</td><td></td><td></td></tr>
626//! <tr><td>ISO-8859-4</td><td>28594</td><td></td><td></td></tr>
627//! <tr><td>ISO-8859-5</td><td>28595</td><td></td><td></td></tr>
628//! <tr><td>ISO-8859-6</td><td>28596</td><td>&bullet;</td><td></td></tr>
629//! <tr><td>ISO-8859-7</td><td>28597</td><td>&bullet;</td><td>3</td></tr>
630//! <tr><td>ISO-8859-8</td><td>28598</td><td>&bullet;</td><td>4</td></tr>
631//! <tr><td>ISO-8859-13</td><td>28603</td><td>&bullet;</td><td></td></tr>
632//! <tr><td>ISO-8859-15</td><td>28605</td><td></td><td></td></tr>
633//! <tr><td>ISO-8859-8-I</td><td>38598</td><td></td><td>5</td></tr>
634//! <tr><td>ISO-2022-JP</td><td>50220</td><td></td><td></td></tr>
635//! <tr><td>gb18030</td><td>54936</td><td></td><td></td></tr>
636//! <tr><td>UTF-8</td><td>65001</td><td></td><td></td></tr>
637//! </tbody>
638//! </table>
639//!
640//! 1. Windows decodes 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
641//! 2. Windows decodes 0xFF to U+00A4 CURRENCY SIGN instead of U+20AC EURO SIGN.
642//! 3. Windows decodes the currency signs at 0xA4 and 0xA5 as well as 0xAA,
643//! which should be U+037A GREEK YPOGEGRAMMENI, to PUA code points. Windows
644//! decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA instead of U+2018
645//! LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER LETTER APOSTROPHE
646//! instead of U+2019 RIGHT SINGLE QUOTATION MARK.
647//! 4. Windows decodes 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to PUA instead
648//! of LRM and RLM.
649//! 5. Remarks from the previous item apply.
650//!
651//! The differences between this crate and Windows in the case of multibyte encodings
652//! are not yet fully documented here. The lack of remarks above should not be taken
653//! as indication of lack of differences.
654//!
655//! # Notable Differences from IANA Naming
656//!
657//! In some cases, the Encoding Standard specifies the popular unextended encoding
658//! name where in IANA terms one of the other labels would be more precise considering
659//! the extensions that the Encoding Standard has unified into the encoding.
660//!
661//! <table>
662//! <thead>
663//! <tr><th>Encoding</th><th>IANA</th></tr>
664//! </thead>
665//! <tbody>
666//! <tr><td>Big5</td><td>Big5-HKSCS</td></tr>
667//! <tr><td>EUC-KR</td><td>windows-949</td></tr>
668//! <tr><td>Shift_JIS</td><td>windows-31j</td></tr>
669//! <tr><td>x-mac-cyrillic</td><td>x-mac-ukrainian</td></tr>
670//! </tbody>
671//! </table>
672//!
673//! In other cases where the Encoding Standard unifies unextended and extended
674//! variants of an encoding, the encoding gets the name of the extended
675//! variant.
676//!
677//! <table>
678//! <thead>
679//! <tr><th>IANA</th><th>Unified into Encoding</th></tr>
680//! </thead>
681//! <tbody>
682//! <tr><td>ISO-8859-1</td><td>windows-1252</td></tr>
683//! <tr><td>ISO-8859-9</td><td>windows-1254</td></tr>
684//! <tr><td>TIS-620</td><td>windows-874</td></tr>
685//! </tbody>
686//! </table>
687//!
688//! See the section [_UTF-16LE, UTF-16BE and Unicode Encoding Schemes_](#utf-16le-utf-16be-and-unicode-encoding-schemes)
689//! for discussion about the UTF-16 family.
690
691#![no_std]
692#![cfg_attr(feature = "simd-accel", feature(core_intrinsics))]
693
694#[cfg(feature = "alloc")]
695#[cfg_attr(test, macro_use)]
696extern crate alloc;
697
698extern crate core;
699#[macro_use]
700extern crate cfg_if;
701
702#[cfg(all(
703 feature = "simd-accel",
704 any(
705 target_feature = "sse2",
706 all(target_endian = "little", target_arch = "aarch64"),
707 all(target_endian = "little", target_feature = "neon")
708 )
709))]
710#[macro_use(shuffle)]
711extern crate packed_simd;
712
713#[cfg(feature = "serde")]
714extern crate serde;
715
716#[cfg(all(test, feature = "serde"))]
717extern crate bincode;
718#[cfg(all(test, feature = "serde"))]
719#[macro_use]
720extern crate serde_derive;
721#[cfg(all(test, feature = "serde"))]
722extern crate serde_json;
723
724#[macro_use]
725mod macros;
726
727#[cfg(all(
728 feature = "simd-accel",
729 any(
730 target_feature = "sse2",
731 all(target_endian = "little", target_arch = "aarch64"),
732 all(target_endian = "little", target_feature = "neon")
733 )
734))]
735mod simd_funcs;
736
737#[cfg(all(test, feature = "alloc"))]
738mod testing;
739
740mod big5;
741mod euc_jp;
742mod euc_kr;
743mod gb18030;
744mod iso_2022_jp;
745mod replacement;
746mod shift_jis;
747mod single_byte;
748mod utf_16;
749mod utf_8;
750mod x_user_defined;
751
752mod ascii;
753mod data;
754mod handles;
755mod variant;
756
757pub mod mem;
758
759use crate::ascii::ascii_valid_up_to;
760use crate::ascii::iso_2022_jp_ascii_valid_up_to;
761use crate::utf_8::utf8_valid_up_to;
762use crate::variant::*;
763
764#[cfg(feature = "alloc")]
765use alloc::borrow::Cow;
766#[cfg(feature = "alloc")]
767use alloc::string::String;
768#[cfg(feature = "alloc")]
769use alloc::vec::Vec;
770use core::cmp::Ordering;
771use core::hash::Hash;
772use core::hash::Hasher;
773
774#[cfg(feature = "serde")]
775use serde::de::Visitor;
776#[cfg(feature = "serde")]
777use serde::{Deserialize, Deserializer, Serialize, Serializer};
778
779/// This has to be the max length of an NCR instead of max
780/// minus one, because we can't rely on getting the minus
781/// one from the space reserved for the current unmappable,
782/// because the ISO-2022-JP encoder can fill up that space
783/// with a state transition escape.
784const NCR_EXTRA: usize = 10; // length of "&#1114111;", a maximum-length NCR (10 bytes)
785
786// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
787// Instead, please regenerate using generate-encoding-data.py
788
789const LONGEST_LABEL_LENGTH: usize = 19; // length of "cseucpkdfmtjapanese" (19 bytes)
790
791/// The initializer for the [Big5](static.BIG5.html) encoding.
792///
793/// For use only for taking the address of this form when
794/// Rust prohibits the use of the non-`_INIT` form directly,
795/// such as in initializers of other `static`s. If in doubt,
796/// use the corresponding non-`_INIT` reference-typed `static`.
797///
798/// This part of the public API will go away if Rust changes
799/// to make the referent of `pub const FOO: &'static Encoding`
800/// unique cross-crate or if Rust starts allowing static arrays
801/// to be initialized with `pub static FOO: &'static Encoding`
802/// items.
803pub static BIG5_INIT: Encoding = Encoding {
804 name: "Big5", // name string stored for this encoding
805 variant: VariantEncoding::Big5, // variant tag; presumably dispatches to the `big5` module — confirm in variant.rs
806};
807
808/// The Big5 encoding.
809///
810/// This is Big5 with HKSCS with mappings to more recent Unicode assignments
811/// instead of the Private Use Area code points that have been used historically.
812/// It is believed to be able to decode existing Web content in a way that makes
813/// sense.
814///
815/// To avoid form submissions generating data that Web servers don't understand,
816/// the encoder doesn't use the HKSCS byte sequences that precede the unextended
817/// Big5 in the lexical order.
818///
819/// [Index visualization](https://encoding.spec.whatwg.org/big5.html),
820/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/big5-bmp.html)
821///
822/// This encoding is designed to be suited for decoding the Windows code page 950
823/// and its HKSCS patched "951" variant such that the text makes sense, given
824/// assignments that Unicode has made after those encodings used Private Use
825/// Area characters.
826///
827/// This will change from `static` to `const` if Rust changes
828/// to make the referent of `pub const FOO: &'static Encoding`
829/// unique cross-crate, so don't take the address of this
830/// `static`.
831pub static BIG5: &'static Encoding = &BIG5_INIT; // reference to the `BIG5_INIT` static
832
833/// The initializer for the [EUC-JP](static.EUC_JP.html) encoding.
834///
835/// For use only for taking the address of this form when
836/// Rust prohibits the use of the non-`_INIT` form directly,
837/// such as in initializers of other `static`s. If in doubt,
838/// use the corresponding non-`_INIT` reference-typed `static`.
839///
840/// This part of the public API will go away if Rust changes
841/// to make the referent of `pub const FOO: &'static Encoding`
842/// unique cross-crate or if Rust starts allowing static arrays
843/// to be initialized with `pub static FOO: &'static Encoding`
844/// items.
845pub static EUC_JP_INIT: Encoding = Encoding {
846 name: "EUC-JP", // name string stored for this encoding
847 variant: VariantEncoding::EucJp, // variant tag; presumably dispatches to the `euc_jp` module — confirm in variant.rs
848};
849
850/// The EUC-JP encoding.
851///
852/// This is the legacy Unix encoding for Japanese.
853///
854/// For compatibility with Web servers that don't expect three-byte sequences
855/// in form submissions, the encoder doesn't generate three-byte sequences.
856/// That is, the JIS X 0212 support is decode-only.
857///
858/// [Index visualization](https://encoding.spec.whatwg.org/euc-jp.html),
859/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-jp-bmp.html)
860///
861/// This encoding roughly matches the Windows code page 20932. There are error
862/// handling differences and a handful of 2-byte sequences that decode differently.
863/// Additionally, Windows doesn't support 3-byte sequences.
864///
865/// This will change from `static` to `const` if Rust changes
866/// to make the referent of `pub const FOO: &'static Encoding`
867/// unique cross-crate, so don't take the address of this
868/// `static`.
869pub static EUC_JP: &'static Encoding = &EUC_JP_INIT; // reference to the `EUC_JP_INIT` static
870
871/// The initializer for the [EUC-KR](static.EUC_KR.html) encoding.
872///
873/// For use only for taking the address of this form when
874/// Rust prohibits the use of the non-`_INIT` form directly,
875/// such as in initializers of other `static`s. If in doubt,
876/// use the corresponding non-`_INIT` reference-typed `static`.
877///
878/// This part of the public API will go away if Rust changes
879/// to make the referent of `pub const FOO: &'static Encoding`
880/// unique cross-crate or if Rust starts allowing static arrays
881/// to be initialized with `pub static FOO: &'static Encoding`
882/// items.
883pub static EUC_KR_INIT: Encoding = Encoding {
884 name: "EUC-KR", // name string stored for this encoding
885 variant: VariantEncoding::EucKr, // variant tag; presumably dispatches to the `euc_kr` module — confirm in variant.rs
886};
887
888/// The EUC-KR encoding.
889///
890/// This is the Korean encoding for Windows. It extends the Unix legacy encoding
891/// for Korean, based on KS X 1001 (which also formed the base of MacKorean on Mac OS
892/// Classic), with all the characters from the Hangul Syllables block of Unicode.
893///
894/// [Index visualization](https://encoding.spec.whatwg.org/euc-kr.html),
895/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/euc-kr-bmp.html)
896///
897/// This encoding matches the Windows code page 949, except Windows decodes byte 0x80
898/// to U+0080 and some byte sequences that are errors per the Encoding Standard to
899/// the question mark or the Private Use Area.
900///
901/// This will change from `static` to `const` if Rust changes
902/// to make the referent of `pub const FOO: &'static Encoding`
903/// unique cross-crate, so don't take the address of this
904/// `static`.
905pub static EUC_KR: &'static Encoding = &EUC_KR_INIT; // reference to the `EUC_KR_INIT` static
906
907/// The initializer for the [GBK](static.GBK.html) encoding.
908///
909/// For use only for taking the address of this form when
910/// Rust prohibits the use of the non-`_INIT` form directly,
911/// such as in initializers of other `static`s. If in doubt,
912/// use the corresponding non-`_INIT` reference-typed `static`.
913///
914/// This part of the public API will go away if Rust changes
915/// to make the referent of `pub const FOO: &'static Encoding`
916/// unique cross-crate or if Rust starts allowing static arrays
917/// to be initialized with `pub static FOO: &'static Encoding`
918/// items.
919pub static GBK_INIT: Encoding = Encoding {
920 name: "GBK", // name string stored for this encoding
921 variant: VariantEncoding::Gbk, // variant tag; presumably handled by the `gb18030` module — confirm in variant.rs
922};
923
924/// The GBK encoding.
925///
926/// The decoder for this encoding is the same as the decoder for gb18030.
927/// The encoder side of this encoding is GBK with Windows code page 936 euro
928/// sign behavior. GBK extends GB2312-80 to cover the CJK Unified Ideographs
929/// Unicode block as well as a handful of ideographs from the CJK Unified
930/// Ideographs Extension A and CJK Compatibility Ideographs blocks.
931///
932/// Unlike e.g. in the case of ISO-8859-1 and windows-1252, the GBK encoder wasn't
933/// unified with the gb18030 encoder in the Encoding Standard out of concern
934/// that servers that expect GBK form submissions might not be able to handle
935/// the four-byte sequences.
936///
937/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
938/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
939///
940/// The encoder of this encoding roughly matches the Windows code page 936.
941/// The decoder side is a superset.
942///
943/// This will change from `static` to `const` if Rust changes
944/// to make the referent of `pub const FOO: &'static Encoding`
945/// unique cross-crate, so don't take the address of this
946/// `static`.
947pub static GBK: &'static Encoding = &GBK_INIT; // reference to the `GBK_INIT` static
948
949/// The initializer for the [IBM866](static.IBM866.html) encoding.
950///
951/// For use only for taking the address of this form when
952/// Rust prohibits the use of the non-`_INIT` form directly,
953/// such as in initializers of other `static`s. If in doubt,
954/// use the corresponding non-`_INIT` reference-typed `static`.
955///
956/// This part of the public API will go away if Rust changes
957/// to make the referent of `pub const FOO: &'static Encoding`
958/// unique cross-crate or if Rust starts allowing static arrays
959/// to be initialized with `pub static FOO: &'static Encoding`
960/// items.
961pub static IBM866_INIT: Encoding = Encoding {
962 name: "IBM866", // name string stored for this encoding
963 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.ibm866, 0x0440, 96, 16), // ibm866 table from `data`; NOTE(review): meaning of the three numbers is set by `VariantEncoding::SingleByte`, not visible here
964};
965
966/// The IBM866 encoding.
967///
968/// This is the most notable one of the DOS Cyrillic code pages. It has the same
969/// box drawing characters as code page 437, so it can be used for decoding
970/// DOS-era ASCII + box drawing data.
971///
972/// [Index visualization](https://encoding.spec.whatwg.org/ibm866.html),
973/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/ibm866-bmp.html)
974///
975/// This encoding matches the Windows code page 866.
976///
977/// This will change from `static` to `const` if Rust changes
978/// to make the referent of `pub const FOO: &'static Encoding`
979/// unique cross-crate, so don't take the address of this
980/// `static`.
981pub static IBM866: &'static Encoding = &IBM866_INIT; // reference to the `IBM866_INIT` static
982
983/// The initializer for the [ISO-2022-JP](static.ISO_2022_JP.html) encoding.
984///
985/// For use only for taking the address of this form when
986/// Rust prohibits the use of the non-`_INIT` form directly,
987/// such as in initializers of other `static`s. If in doubt,
988/// use the corresponding non-`_INIT` reference-typed `static`.
989///
990/// This part of the public API will go away if Rust changes
991/// to make the referent of `pub const FOO: &'static Encoding`
992/// unique cross-crate or if Rust starts allowing static arrays
993/// to be initialized with `pub static FOO: &'static Encoding`
994/// items.
995pub static ISO_2022_JP_INIT: Encoding = Encoding {
996 name: "ISO-2022-JP", // name string stored for this encoding
997 variant: VariantEncoding::Iso2022Jp, // variant tag; presumably dispatches to the `iso_2022_jp` module — confirm in variant.rs
998};
999
1000/// The ISO-2022-JP encoding.
1001///
1002/// This is the primary pre-UTF-8 encoding for Japanese email. It uses the ASCII
1003/// byte range to encode non-Basic Latin characters. It's the only encoding
1004/// supported by this crate whose encoder is stateful.
1005///
1006/// [Index visualization](https://encoding.spec.whatwg.org/jis0208.html),
1007/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/jis0208-bmp.html)
1008///
1009/// This encoding roughly matches the Windows code page 50220. Notably, Windows
1010/// uses U+30FB in place of the REPLACEMENT CHARACTER and otherwise differs in
1011/// error handling.
1012///
1013/// This will change from `static` to `const` if Rust changes
1014/// to make the referent of `pub const FOO: &'static Encoding`
1015/// unique cross-crate, so don't take the address of this
1016/// `static`.
1017pub static ISO_2022_JP: &'static Encoding = &ISO_2022_JP_INIT; // reference to the `ISO_2022_JP_INIT` static
1018
1019/// The initializer for the [ISO-8859-10](static.ISO_8859_10.html) encoding.
1020///
1021/// For use only for taking the address of this form when
1022/// Rust prohibits the use of the non-`_INIT` form directly,
1023/// such as in initializers of other `static`s. If in doubt,
1024/// use the corresponding non-`_INIT` reference-typed `static`.
1025///
1026/// This part of the public API will go away if Rust changes
1027/// to make the referent of `pub const FOO: &'static Encoding`
1028/// unique cross-crate or if Rust starts allowing static arrays
1029/// to be initialized with `pub static FOO: &'static Encoding`
1030/// items.
1031pub static ISO_8859_10_INIT: Encoding = Encoding {
1032 name: "ISO-8859-10", // name string stored for this encoding
1033 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_10, 0x00DA, 90, 6), // iso_8859_10 table from `data`; NOTE(review): meaning of the three numbers is set by `VariantEncoding::SingleByte`, not visible here
1034};
1035
1036/// The ISO-8859-10 encoding.
1037///
1038/// This is the Nordic part of the ISO/IEC 8859 encoding family. This encoding
1039/// is also known as Latin 6.
1040///
1041/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-10.html),
1042/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-10-bmp.html)
1043///
1044/// The Windows code page number for this encoding is 28600, but kernel32.dll
1045/// does not support this encoding.
1046///
1047/// This will change from `static` to `const` if Rust changes
1048/// to make the referent of `pub const FOO: &'static Encoding`
1049/// unique cross-crate, so don't take the address of this
1050/// `static`.
1051pub static ISO_8859_10: &'static Encoding = &ISO_8859_10_INIT; // reference to the `ISO_8859_10_INIT` static
1052
1053/// The initializer for the [ISO-8859-13](static.ISO_8859_13.html) encoding.
1054///
1055/// For use only for taking the address of this form when
1056/// Rust prohibits the use of the non-`_INIT` form directly,
1057/// such as in initializers of other `static`s. If in doubt,
1058/// use the corresponding non-`_INIT` reference-typed `static`.
1059///
1060/// This part of the public API will go away if Rust changes
1061/// to make the referent of `pub const FOO: &'static Encoding`
1062/// unique cross-crate or if Rust starts allowing static arrays
1063/// to be initialized with `pub static FOO: &'static Encoding`
1064/// items.
1065pub static ISO_8859_13_INIT: Encoding = Encoding {
1066 name: "ISO-8859-13", // name string stored for this encoding
1067 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_13, 0x00DF, 95, 1), // iso_8859_13 table from `data`; NOTE(review): meaning of the three numbers is set by `VariantEncoding::SingleByte`, not visible here
1068};
1069
1070/// The ISO-8859-13 encoding.
1071///
1072/// This is the Baltic part of the ISO/IEC 8859 encoding family. This encoding
1073/// is also known as Latin 7.
1074///
1075/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-13.html),
1076/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-13-bmp.html)
1077///
1078/// This encoding matches the Windows code page 28603, except Windows decodes
1079/// unassigned code points to the Private Use Area of Unicode.
1080///
1081/// This will change from `static` to `const` if Rust changes
1082/// to make the referent of `pub const FOO: &'static Encoding`
1083/// unique cross-crate, so don't take the address of this
1084/// `static`.
1085pub static ISO_8859_13: &'static Encoding = &ISO_8859_13_INIT; // reference to the `ISO_8859_13_INIT` static
1086
1087/// The initializer for the [ISO-8859-14](static.ISO_8859_14.html) encoding.
1088///
1089/// For use only for taking the address of this form when
1090/// Rust prohibits the use of the non-`_INIT` form directly,
1091/// such as in initializers of other `static`s. If in doubt,
1092/// use the corresponding non-`_INIT` reference-typed `static`.
1093///
1094/// This part of the public API will go away if Rust changes
1095/// to make the referent of `pub const FOO: &'static Encoding`
1096/// unique cross-crate or if Rust starts allowing static arrays
1097/// to be initialized with `pub static FOO: &'static Encoding`
1098/// items.
1099pub static ISO_8859_14_INIT: Encoding = Encoding {
1100 name: "ISO-8859-14", // name string stored for this encoding
1101 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_14, 0x00DF, 95, 17), // iso_8859_14 table from `data`; NOTE(review): meaning of the three numbers is set by `VariantEncoding::SingleByte`, not visible here
1102};
1103
1104/// The ISO-8859-14 encoding.
1105///
1106/// This is the Celtic part of the ISO/IEC 8859 encoding family. This encoding
1107/// is also known as Latin 8.
1108///
1109/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-14.html),
1110/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-14-bmp.html)
1111///
1112/// The Windows code page number for this encoding is 28604, but kernel32.dll
1113/// does not support this encoding.
1114///
1115/// This will change from `static` to `const` if Rust changes
1116/// to make the referent of `pub const FOO: &'static Encoding`
1117/// unique cross-crate, so don't take the address of this
1118/// `static`.
1119pub static ISO_8859_14: &'static Encoding = &ISO_8859_14_INIT; // reference to the `ISO_8859_14_INIT` static
1120
1121/// The initializer for the [ISO-8859-15](static.ISO_8859_15.html) encoding.
1122///
1123/// For use only for taking the address of this form when
1124/// Rust prohibits the use of the non-`_INIT` form directly,
1125/// such as in initializers of other `static`s. If in doubt,
1126/// use the corresponding non-`_INIT` reference-typed `static`.
1127///
1128/// This part of the public API will go away if Rust changes
1129/// to make the referent of `pub const FOO: &'static Encoding`
1130/// unique cross-crate or if Rust starts allowing static arrays
1131/// to be initialized with `pub static FOO: &'static Encoding`
1132/// items.
1133pub static ISO_8859_15_INIT: Encoding = Encoding {
1134 name: "ISO-8859-15", // name string stored for this encoding
1135 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_15, 0x00BF, 63, 65), // iso_8859_15 table from `data`; NOTE(review): meaning of the three numbers is set by `VariantEncoding::SingleByte`, not visible here
1136};
1137
1138/// The ISO-8859-15 encoding.
1139///
1140/// This is the revised Western European part of the ISO/IEC 8859 encoding
1141/// family. This encoding is also known as Latin 9.
1142///
1143/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-15.html),
1144/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-15-bmp.html)
1145///
1146/// This encoding matches the Windows code page 28605.
1147///
1148/// This will change from `static` to `const` if Rust changes
1149/// to make the referent of `pub const FOO: &'static Encoding`
1150/// unique cross-crate, so don't take the address of this
1151/// `static`.
1152pub static ISO_8859_15: &'static Encoding = &ISO_8859_15_INIT; // reference to the `ISO_8859_15_INIT` static
1153
1154/// The initializer for the [ISO-8859-16](static.ISO_8859_16.html) encoding.
1155///
1156/// For use only for taking the address of this form when
1157/// Rust prohibits the use of the non-`_INIT` form directly,
1158/// such as in initializers of other `static`s. If in doubt,
1159/// use the corresponding non-`_INIT` reference-typed `static`.
1160///
1161/// This part of the public API will go away if Rust changes
1162/// to make the referent of `pub const FOO: &'static Encoding`
1163/// unique cross-crate or if Rust starts allowing static arrays
1164/// to be initialized with `pub static FOO: &'static Encoding`
1165/// items.
1166pub static ISO_8859_16_INIT: Encoding = Encoding {
1167 name: "ISO-8859-16", // name string stored for this encoding
1168 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_16, 0x00DF, 95, 4), // iso_8859_16 table from `data`; NOTE(review): meaning of the three numbers is set by `VariantEncoding::SingleByte`, not visible here
1169};
1170
1171/// The ISO-8859-16 encoding.
1172///
1173/// This is the South-Eastern European part of the ISO/IEC 8859 encoding
1174/// family. This encoding is also known as Latin 10.
1175///
1176/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-16.html),
1177/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-16-bmp.html)
1178///
1179/// The Windows code page number for this encoding is 28606, but kernel32.dll
1180/// does not support this encoding.
1181///
1182/// This will change from `static` to `const` if Rust changes
1183/// to make the referent of `pub const FOO: &'static Encoding`
1184/// unique cross-crate, so don't take the address of this
1185/// `static`.
1186pub static ISO_8859_16: &'static Encoding = &ISO_8859_16_INIT; // reference to the `ISO_8859_16_INIT` static
1187
1188/// The initializer for the [ISO-8859-2](static.ISO_8859_2.html) encoding.
1189///
1190/// For use only for taking the address of this form when
1191/// Rust prohibits the use of the non-`_INIT` form directly,
1192/// such as in initializers of other `static`s. If in doubt,
1193/// use the corresponding non-`_INIT` reference-typed `static`.
1194///
1195/// This part of the public API will go away if Rust changes
1196/// to make the referent of `pub const FOO: &'static Encoding`
1197/// unique cross-crate or if Rust starts allowing static arrays
1198/// to be initialized with `pub static FOO: &'static Encoding`
1199/// items.
1200pub static ISO_8859_2_INIT: Encoding = Encoding {
1201 name: "ISO-8859-2",
1202 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_2, 0x00DF, 95, 1),
1203};
1204
1205/// The ISO-8859-2 encoding.
1206///
1207/// This is the Central European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 2.
1208///
1209/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-2.html),
1210/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-2-bmp.html)
1211///
1212/// This encoding matches the Windows code page 28592.
1213///
1214/// This will change from `static` to `const` if Rust changes
1215/// to make the referent of `pub const FOO: &'static Encoding`
1216/// unique cross-crate, so don't take the address of this
1217/// `static`.
1218pub static ISO_8859_2: &'static Encoding = &ISO_8859_2_INIT;
1219
1220/// The initializer for the [ISO-8859-3](static.ISO_8859_3.html) encoding.
1221///
1222/// For use only for taking the address of this form when
1223/// Rust prohibits the use of the non-`_INIT` form directly,
1224/// such as in initializers of other `static`s. If in doubt,
1225/// use the corresponding non-`_INIT` reference-typed `static`.
1226///
1227/// This part of the public API will go away if Rust changes
1228/// to make the referent of `pub const FOO: &'static Encoding`
1229/// unique cross-crate or if Rust starts allowing static arrays
1230/// to be initialized with `pub static FOO: &'static Encoding`
1231/// items.
1232pub static ISO_8859_3_INIT: Encoding = Encoding {
1233 name: "ISO-8859-3",
1234 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_3, 0x00DF, 95, 4),
1235};
1236
1237/// The ISO-8859-3 encoding.
1238///
1239/// This is the South European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 3.
1240///
1241/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-3.html),
1242/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-3-bmp.html)
1243///
1244/// This encoding matches the Windows code page 28593.
1245///
1246/// This will change from `static` to `const` if Rust changes
1247/// to make the referent of `pub const FOO: &'static Encoding`
1248/// unique cross-crate, so don't take the address of this
1249/// `static`.
1250pub static ISO_8859_3: &'static Encoding = &ISO_8859_3_INIT;
1251
1252/// The initializer for the [ISO-8859-4](static.ISO_8859_4.html) encoding.
1253///
1254/// For use only for taking the address of this form when
1255/// Rust prohibits the use of the non-`_INIT` form directly,
1256/// such as in initializers of other `static`s. If in doubt,
1257/// use the corresponding non-`_INIT` reference-typed `static`.
1258///
1259/// This part of the public API will go away if Rust changes
1260/// to make the referent of `pub const FOO: &'static Encoding`
1261/// unique cross-crate or if Rust starts allowing static arrays
1262/// to be initialized with `pub static FOO: &'static Encoding`
1263/// items.
1264pub static ISO_8859_4_INIT: Encoding = Encoding {
1265 name: "ISO-8859-4",
1266 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_4, 0x00DF, 95, 1),
1267};
1268
1269/// The ISO-8859-4 encoding.
1270///
1271/// This is the North European part of the ISO/IEC 8859 encoding family. This encoding is also known as Latin 4.
1272///
1273/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-4.html),
1274/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-4-bmp.html)
1275///
1276/// This encoding matches the Windows code page 28594.
1277///
1278/// This will change from `static` to `const` if Rust changes
1279/// to make the referent of `pub const FOO: &'static Encoding`
1280/// unique cross-crate, so don't take the address of this
1281/// `static`.
1282pub static ISO_8859_4: &'static Encoding = &ISO_8859_4_INIT;
1283
1284/// The initializer for the [ISO-8859-5](static.ISO_8859_5.html) encoding.
1285///
1286/// For use only for taking the address of this form when
1287/// Rust prohibits the use of the non-`_INIT` form directly,
1288/// such as in initializers of other `static`s. If in doubt,
1289/// use the corresponding non-`_INIT` reference-typed `static`.
1290///
1291/// This part of the public API will go away if Rust changes
1292/// to make the referent of `pub const FOO: &'static Encoding`
1293/// unique cross-crate or if Rust starts allowing static arrays
1294/// to be initialized with `pub static FOO: &'static Encoding`
1295/// items.
1296pub static ISO_8859_5_INIT: Encoding = Encoding {
1297 name: "ISO-8859-5",
1298 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_5, 0x040E, 46, 66),
1299};
1300
1301/// The ISO-8859-5 encoding.
1302///
1303/// This is the Cyrillic part of the ISO/IEC 8859 encoding family.
1304///
1305/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-5.html),
1306/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-5-bmp.html)
1307///
1308/// This encoding matches the Windows code page 28595.
1309///
1310/// This will change from `static` to `const` if Rust changes
1311/// to make the referent of `pub const FOO: &'static Encoding`
1312/// unique cross-crate, so don't take the address of this
1313/// `static`.
1314pub static ISO_8859_5: &'static Encoding = &ISO_8859_5_INIT;
1315
1316/// The initializer for the [ISO-8859-6](static.ISO_8859_6.html) encoding.
1317///
1318/// For use only for taking the address of this form when
1319/// Rust prohibits the use of the non-`_INIT` form directly,
1320/// such as in initializers of other `static`s. If in doubt,
1321/// use the corresponding non-`_INIT` reference-typed `static`.
1322///
1323/// This part of the public API will go away if Rust changes
1324/// to make the referent of `pub const FOO: &'static Encoding`
1325/// unique cross-crate or if Rust starts allowing static arrays
1326/// to be initialized with `pub static FOO: &'static Encoding`
1327/// items.
1328pub static ISO_8859_6_INIT: Encoding = Encoding {
1329 name: "ISO-8859-6",
1330 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_6, 0x0621, 65, 26),
1331};
1332
1333/// The ISO-8859-6 encoding.
1334///
1335/// This is the Arabic part of the ISO/IEC 8859 encoding family.
1336///
1337/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-6.html),
1338/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-6-bmp.html)
1339///
1340/// This encoding matches the Windows code page 28596, except Windows decodes
1341/// unassigned code points to the Private Use Area of Unicode.
1342///
1343/// This will change from `static` to `const` if Rust changes
1344/// to make the referent of `pub const FOO: &'static Encoding`
1345/// unique cross-crate, so don't take the address of this
1346/// `static`.
1347pub static ISO_8859_6: &'static Encoding = &ISO_8859_6_INIT;
1348
1349/// The initializer for the [ISO-8859-7](static.ISO_8859_7.html) encoding.
1350///
1351/// For use only for taking the address of this form when
1352/// Rust prohibits the use of the non-`_INIT` form directly,
1353/// such as in initializers of other `static`s. If in doubt,
1354/// use the corresponding non-`_INIT` reference-typed `static`.
1355///
1356/// This part of the public API will go away if Rust changes
1357/// to make the referent of `pub const FOO: &'static Encoding`
1358/// unique cross-crate or if Rust starts allowing static arrays
1359/// to be initialized with `pub static FOO: &'static Encoding`
1360/// items.
1361pub static ISO_8859_7_INIT: Encoding = Encoding {
1362 name: "ISO-8859-7",
1363 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_7, 0x03A3, 83, 44),
1364};
1365
1366/// The ISO-8859-7 encoding.
1367///
1368/// This is the Greek part of the ISO/IEC 8859 encoding family.
1369///
1370/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-7.html),
1371/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-7-bmp.html)
1372///
1373/// This encoding roughly matches the Windows code page 28597. Windows decodes
1374/// unassigned code points, the currency signs at 0xA4 and 0xA5 as well as
1375/// 0xAA, which should be U+037A GREEK YPOGEGRAMMENI, to the Private Use Area
1376/// of Unicode. Windows decodes 0xA1 to U+02BD MODIFIER LETTER REVERSED COMMA
1377/// instead of U+2018 LEFT SINGLE QUOTATION MARK and 0xA2 to U+02BC MODIFIER
1378/// LETTER APOSTROPHE instead of U+2019 RIGHT SINGLE QUOTATION MARK.
1379///
1380/// This will change from `static` to `const` if Rust changes
1381/// to make the referent of `pub const FOO: &'static Encoding`
1382/// unique cross-crate, so don't take the address of this
1383/// `static`.
1384pub static ISO_8859_7: &'static Encoding = &ISO_8859_7_INIT;
1385
1386/// The initializer for the [ISO-8859-8](static.ISO_8859_8.html) encoding.
1387///
1388/// For use only for taking the address of this form when
1389/// Rust prohibits the use of the non-`_INIT` form directly,
1390/// such as in initializers of other `static`s. If in doubt,
1391/// use the corresponding non-`_INIT` reference-typed `static`.
1392///
1393/// This part of the public API will go away if Rust changes
1394/// to make the referent of `pub const FOO: &'static Encoding`
1395/// unique cross-crate or if Rust starts allowing static arrays
1396/// to be initialized with `pub static FOO: &'static Encoding`
1397/// items.
1398pub static ISO_8859_8_INIT: Encoding = Encoding {
1399 name: "ISO-8859-8",
1400 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1401};
1402
1403/// The ISO-8859-8 encoding.
1404///
1405/// This is the Hebrew part of the ISO/IEC 8859 encoding family in visual order.
1406///
1407/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1408/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1409///
1410/// This encoding roughly matches the Windows code page 28598. Windows decodes
1411/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1412/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1413/// the private use area.
1414///
1415/// This will change from `static` to `const` if Rust changes
1416/// to make the referent of `pub const FOO: &'static Encoding`
1417/// unique cross-crate, so don't take the address of this
1418/// `static`.
1419pub static ISO_8859_8: &'static Encoding = &ISO_8859_8_INIT;
1420
1421/// The initializer for the [ISO-8859-8-I](static.ISO_8859_8_I.html) encoding.
1422///
1423/// For use only for taking the address of this form when
1424/// Rust prohibits the use of the non-`_INIT` form directly,
1425/// such as in initializers of other `static`s. If in doubt,
1426/// use the corresponding non-`_INIT` reference-typed `static`.
1427///
1428/// This part of the public API will go away if Rust changes
1429/// to make the referent of `pub const FOO: &'static Encoding`
1430/// unique cross-crate or if Rust starts allowing static arrays
1431/// to be initialized with `pub static FOO: &'static Encoding`
1432/// items.
1433pub static ISO_8859_8_I_INIT: Encoding = Encoding {
1434 name: "ISO-8859-8-I",
1435 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.iso_8859_8, 0x05D0, 96, 27),
1436};
1437
1438/// The ISO-8859-8-I encoding.
1439///
1440/// This is the Hebrew part of the ISO/IEC 8859 encoding family in logical order.
1441///
1442/// [Index visualization](https://encoding.spec.whatwg.org/iso-8859-8.html),
1443/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/iso-8859-8-bmp.html)
1444///
1445/// This encoding roughly matches the Windows code page 38598. Windows decodes
1446/// 0xAF to OVERLINE instead of MACRON and 0xFE and 0xFD to the Private Use
1447/// Area instead of LRM and RLM. Windows decodes unassigned code points to
1448/// the private use area.
1449///
1450/// This will change from `static` to `const` if Rust changes
1451/// to make the referent of `pub const FOO: &'static Encoding`
1452/// unique cross-crate, so don't take the address of this
1453/// `static`.
1454pub static ISO_8859_8_I: &'static Encoding = &ISO_8859_8_I_INIT;
1455
1456/// The initializer for the [KOI8-R](static.KOI8_R.html) encoding.
1457///
1458/// For use only for taking the address of this form when
1459/// Rust prohibits the use of the non-`_INIT` form directly,
1460/// such as in initializers of other `static`s. If in doubt,
1461/// use the corresponding non-`_INIT` reference-typed `static`.
1462///
1463/// This part of the public API will go away if Rust changes
1464/// to make the referent of `pub const FOO: &'static Encoding`
1465/// unique cross-crate or if Rust starts allowing static arrays
1466/// to be initialized with `pub static FOO: &'static Encoding`
1467/// items.
1468pub static KOI8_R_INIT: Encoding = Encoding {
1469 name: "KOI8-R",
1470 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_r, 0x044E, 64, 1),
1471};
1472
1473/// The KOI8-R encoding.
1474///
1475/// This is an encoding for Russian from [RFC 1489](https://tools.ietf.org/html/rfc1489).
1476///
1477/// [Index visualization](https://encoding.spec.whatwg.org/koi8-r.html),
1478/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-r-bmp.html)
1479///
1480/// This encoding matches the Windows code page 20866.
1481///
1482/// This will change from `static` to `const` if Rust changes
1483/// to make the referent of `pub const FOO: &'static Encoding`
1484/// unique cross-crate, so don't take the address of this
1485/// `static`.
1486pub static KOI8_R: &'static Encoding = &KOI8_R_INIT;
1487
1488/// The initializer for the [KOI8-U](static.KOI8_U.html) encoding.
1489///
1490/// For use only for taking the address of this form when
1491/// Rust prohibits the use of the non-`_INIT` form directly,
1492/// such as in initializers of other `static`s. If in doubt,
1493/// use the corresponding non-`_INIT` reference-typed `static`.
1494///
1495/// This part of the public API will go away if Rust changes
1496/// to make the referent of `pub const FOO: &'static Encoding`
1497/// unique cross-crate or if Rust starts allowing static arrays
1498/// to be initialized with `pub static FOO: &'static Encoding`
1499/// items.
1500pub static KOI8_U_INIT: Encoding = Encoding {
1501 name: "KOI8-U",
1502 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.koi8_u, 0x044E, 64, 1),
1503};
1504
1505/// The KOI8-U encoding.
1506///
1507/// This is an encoding for Ukrainian adapted from KOI8-R.
1508///
1509/// [Index visualization](https://encoding.spec.whatwg.org/koi8-u.html),
1510/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/koi8-u-bmp.html)
1511///
1512/// This encoding matches the Windows code page 21866.
1513///
1514/// This will change from `static` to `const` if Rust changes
1515/// to make the referent of `pub const FOO: &'static Encoding`
1516/// unique cross-crate, so don't take the address of this
1517/// `static`.
1518pub static KOI8_U: &'static Encoding = &KOI8_U_INIT;
1519
1520/// The initializer for the [Shift_JIS](static.SHIFT_JIS.html) encoding.
1521///
1522/// For use only for taking the address of this form when
1523/// Rust prohibits the use of the non-`_INIT` form directly,
1524/// such as in initializers of other `static`s. If in doubt,
1525/// use the corresponding non-`_INIT` reference-typed `static`.
1526///
1527/// This part of the public API will go away if Rust changes
1528/// to make the referent of `pub const FOO: &'static Encoding`
1529/// unique cross-crate or if Rust starts allowing static arrays
1530/// to be initialized with `pub static FOO: &'static Encoding`
1531/// items.
1532pub static SHIFT_JIS_INIT: Encoding = Encoding {
1533 name: "Shift_JIS",
1534 variant: VariantEncoding::ShiftJis,
1535};
1536
1537/// The Shift_JIS encoding.
1538///
1539/// This is the Japanese encoding for Windows.
1540///
1541/// [Index visualization](https://encoding.spec.whatwg.org/shift_jis.html),
1542/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/shift_jis-bmp.html)
1543///
1544/// This encoding matches the Windows code page 932, except Windows decodes some byte
1545/// sequences that are error per the Encoding Standard to the question mark or the
1546/// Private Use Area and generally uses U+30FB in place of the REPLACEMENT CHARACTER.
1547///
1548/// This will change from `static` to `const` if Rust changes
1549/// to make the referent of `pub const FOO: &'static Encoding`
1550/// unique cross-crate, so don't take the address of this
1551/// `static`.
1552pub static SHIFT_JIS: &'static Encoding = &SHIFT_JIS_INIT;
1553
1554/// The initializer for the [UTF-16BE](static.UTF_16BE.html) encoding.
1555///
1556/// For use only for taking the address of this form when
1557/// Rust prohibits the use of the non-`_INIT` form directly,
1558/// such as in initializers of other `static`s. If in doubt,
1559/// use the corresponding non-`_INIT` reference-typed `static`.
1560///
1561/// This part of the public API will go away if Rust changes
1562/// to make the referent of `pub const FOO: &'static Encoding`
1563/// unique cross-crate or if Rust starts allowing static arrays
1564/// to be initialized with `pub static FOO: &'static Encoding`
1565/// items.
1566pub static UTF_16BE_INIT: Encoding = Encoding {
1567 name: "UTF-16BE",
1568 variant: VariantEncoding::Utf16Be,
1569};
1570
1571/// The UTF-16BE encoding.
1572///
1573/// This decode-only encoding uses 16-bit code units due to Unicode originally
1574/// having been designed as a 16-bit reportoire. In the absence of a byte order
1575/// mark the big endian byte order is assumed.
1576///
1577/// There is no corresponding encoder in this crate or in the Encoding
1578/// Standard. The output encoding of this encoding is UTF-8.
1579///
1580/// This encoding matches the Windows code page 1201.
1581///
1582/// This will change from `static` to `const` if Rust changes
1583/// to make the referent of `pub const FOO: &'static Encoding`
1584/// unique cross-crate, so don't take the address of this
1585/// `static`.
1586pub static UTF_16BE: &'static Encoding = &UTF_16BE_INIT;
1587
1588/// The initializer for the [UTF-16LE](static.UTF_16LE.html) encoding.
1589///
1590/// For use only for taking the address of this form when
1591/// Rust prohibits the use of the non-`_INIT` form directly,
1592/// such as in initializers of other `static`s. If in doubt,
1593/// use the corresponding non-`_INIT` reference-typed `static`.
1594///
1595/// This part of the public API will go away if Rust changes
1596/// to make the referent of `pub const FOO: &'static Encoding`
1597/// unique cross-crate or if Rust starts allowing static arrays
1598/// to be initialized with `pub static FOO: &'static Encoding`
1599/// items.
1600pub static UTF_16LE_INIT: Encoding = Encoding {
1601 name: "UTF-16LE",
1602 variant: VariantEncoding::Utf16Le,
1603};
1604
1605/// The UTF-16LE encoding.
1606///
1607/// This decode-only encoding uses 16-bit code units due to Unicode originally
1608/// having been designed as a 16-bit reportoire. In the absence of a byte order
1609/// mark the little endian byte order is assumed.
1610///
1611/// There is no corresponding encoder in this crate or in the Encoding
1612/// Standard. The output encoding of this encoding is UTF-8.
1613///
1614/// This encoding matches the Windows code page 1200.
1615///
1616/// This will change from `static` to `const` if Rust changes
1617/// to make the referent of `pub const FOO: &'static Encoding`
1618/// unique cross-crate, so don't take the address of this
1619/// `static`.
1620pub static UTF_16LE: &'static Encoding = &UTF_16LE_INIT;
1621
1622/// The initializer for the [UTF-8](static.UTF_8.html) encoding.
1623///
1624/// For use only for taking the address of this form when
1625/// Rust prohibits the use of the non-`_INIT` form directly,
1626/// such as in initializers of other `static`s. If in doubt,
1627/// use the corresponding non-`_INIT` reference-typed `static`.
1628///
1629/// This part of the public API will go away if Rust changes
1630/// to make the referent of `pub const FOO: &'static Encoding`
1631/// unique cross-crate or if Rust starts allowing static arrays
1632/// to be initialized with `pub static FOO: &'static Encoding`
1633/// items.
1634pub static UTF_8_INIT: Encoding = Encoding {
1635 name: "UTF-8",
1636 variant: VariantEncoding::Utf8,
1637};
1638
1639/// The UTF-8 encoding.
1640///
1641/// This is the encoding that should be used for all new development it can
1642/// represent all of Unicode.
1643///
1644/// This encoding matches the Windows code page 65001, except Windows differs
1645/// in the number of errors generated for some erroneous byte sequences.
1646///
1647/// This will change from `static` to `const` if Rust changes
1648/// to make the referent of `pub const FOO: &'static Encoding`
1649/// unique cross-crate, so don't take the address of this
1650/// `static`.
1651pub static UTF_8: &'static Encoding = &UTF_8_INIT;
1652
1653/// The initializer for the [gb18030](static.GB18030.html) encoding.
1654///
1655/// For use only for taking the address of this form when
1656/// Rust prohibits the use of the non-`_INIT` form directly,
1657/// such as in initializers of other `static`s. If in doubt,
1658/// use the corresponding non-`_INIT` reference-typed `static`.
1659///
1660/// This part of the public API will go away if Rust changes
1661/// to make the referent of `pub const FOO: &'static Encoding`
1662/// unique cross-crate or if Rust starts allowing static arrays
1663/// to be initialized with `pub static FOO: &'static Encoding`
1664/// items.
1665pub static GB18030_INIT: Encoding = Encoding {
1666 name: "gb18030",
1667 variant: VariantEncoding::Gb18030,
1668};
1669
1670/// The gb18030 encoding.
1671///
1672/// This encoding matches GB18030-2005 except the two-byte sequence 0xA3 0xA0
1673/// maps to U+3000 for compatibility with existing Web content. As a result,
1674/// this encoding can represent all of Unicode except for the private-use
1675/// character U+E5E5.
1676///
1677/// [Index visualization for the two-byte sequences](https://encoding.spec.whatwg.org/gb18030.html),
1678/// [Visualization of BMP coverage of the two-byte index](https://encoding.spec.whatwg.org/gb18030-bmp.html)
1679///
1680/// This encoding matches the Windows code page 54936.
1681///
1682/// This will change from `static` to `const` if Rust changes
1683/// to make the referent of `pub const FOO: &'static Encoding`
1684/// unique cross-crate, so don't take the address of this
1685/// `static`.
1686pub static GB18030: &'static Encoding = &GB18030_INIT;
1687
1688/// The initializer for the [macintosh](static.MACINTOSH.html) encoding.
1689///
1690/// For use only for taking the address of this form when
1691/// Rust prohibits the use of the non-`_INIT` form directly,
1692/// such as in initializers of other `static`s. If in doubt,
1693/// use the corresponding non-`_INIT` reference-typed `static`.
1694///
1695/// This part of the public API will go away if Rust changes
1696/// to make the referent of `pub const FOO: &'static Encoding`
1697/// unique cross-crate or if Rust starts allowing static arrays
1698/// to be initialized with `pub static FOO: &'static Encoding`
1699/// items.
1700pub static MACINTOSH_INIT: Encoding = Encoding {
1701 name: "macintosh",
1702 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.macintosh, 0x00CD, 106, 3),
1703};
1704
1705/// The macintosh encoding.
1706///
1707/// This is the MacRoman encoding from Mac OS Classic.
1708///
1709/// [Index visualization](https://encoding.spec.whatwg.org/macintosh.html),
1710/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/macintosh-bmp.html)
1711///
1712/// This encoding matches the Windows code page 10000, except Windows decodes
1713/// 0xBD to U+2126 OHM SIGN instead of U+03A9 GREEK CAPITAL LETTER OMEGA.
1714///
1715/// This will change from `static` to `const` if Rust changes
1716/// to make the referent of `pub const FOO: &'static Encoding`
1717/// unique cross-crate, so don't take the address of this
1718/// `static`.
1719pub static MACINTOSH: &'static Encoding = &MACINTOSH_INIT;
1720
1721/// The initializer for the [replacement](static.REPLACEMENT.html) encoding.
1722///
1723/// For use only for taking the address of this form when
1724/// Rust prohibits the use of the non-`_INIT` form directly,
1725/// such as in initializers of other `static`s. If in doubt,
1726/// use the corresponding non-`_INIT` reference-typed `static`.
1727///
1728/// This part of the public API will go away if Rust changes
1729/// to make the referent of `pub const FOO: &'static Encoding`
1730/// unique cross-crate or if Rust starts allowing static arrays
1731/// to be initialized with `pub static FOO: &'static Encoding`
1732/// items.
1733pub static REPLACEMENT_INIT: Encoding = Encoding {
1734 name: "replacement",
1735 variant: VariantEncoding::Replacement,
1736};
1737
1738/// The replacement encoding.
1739///
1740/// This decode-only encoding decodes all non-zero-length streams to a single
1741/// REPLACEMENT CHARACTER. Its purpose is to avoid the use of an
1742/// ASCII-compatible fallback encoding (typically windows-1252) for some
1743/// encodings that are no longer supported by the Web Platform and that
1744/// would be dangerous to treat as ASCII-compatible.
1745///
1746/// There is no corresponding encoder. The output encoding of this encoding
1747/// is UTF-8.
1748///
1749/// This encoding does not have a Windows code page number.
1750///
1751/// This will change from `static` to `const` if Rust changes
1752/// to make the referent of `pub const FOO: &'static Encoding`
1753/// unique cross-crate, so don't take the address of this
1754/// `static`.
1755pub static REPLACEMENT: &'static Encoding = &REPLACEMENT_INIT;
1756
1757/// The initializer for the [windows-1250](static.WINDOWS_1250.html) encoding.
1758///
1759/// For use only for taking the address of this form when
1760/// Rust prohibits the use of the non-`_INIT` form directly,
1761/// such as in initializers of other `static`s. If in doubt,
1762/// use the corresponding non-`_INIT` reference-typed `static`.
1763///
1764/// This part of the public API will go away if Rust changes
1765/// to make the referent of `pub const FOO: &'static Encoding`
1766/// unique cross-crate or if Rust starts allowing static arrays
1767/// to be initialized with `pub static FOO: &'static Encoding`
1768/// items.
1769pub static WINDOWS_1250_INIT: Encoding = Encoding {
1770 name: "windows-1250",
1771 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1250, 0x00DC, 92, 2),
1772};
1773
1774/// The windows-1250 encoding.
1775///
1776/// This is the Central European encoding for Windows.
1777///
1778/// [Index visualization](https://encoding.spec.whatwg.org/windows-1250.html),
1779/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1250-bmp.html)
1780///
1781/// This encoding matches the Windows code page 1250.
1782///
1783/// This will change from `static` to `const` if Rust changes
1784/// to make the referent of `pub const FOO: &'static Encoding`
1785/// unique cross-crate, so don't take the address of this
1786/// `static`.
1787pub static WINDOWS_1250: &'static Encoding = &WINDOWS_1250_INIT;
1788
1789/// The initializer for the [windows-1251](static.WINDOWS_1251.html) encoding.
1790///
1791/// For use only for taking the address of this form when
1792/// Rust prohibits the use of the non-`_INIT` form directly,
1793/// such as in initializers of other `static`s. If in doubt,
1794/// use the corresponding non-`_INIT` reference-typed `static`.
1795///
1796/// This part of the public API will go away if Rust changes
1797/// to make the referent of `pub const FOO: &'static Encoding`
1798/// unique cross-crate or if Rust starts allowing static arrays
1799/// to be initialized with `pub static FOO: &'static Encoding`
1800/// items.
1801pub static WINDOWS_1251_INIT: Encoding = Encoding {
1802 name: "windows-1251",
1803 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1251, 0x0410, 64, 64),
1804};
1805
1806/// The windows-1251 encoding.
1807///
1808/// This is the Cyrillic encoding for Windows.
1809///
1810/// [Index visualization](https://encoding.spec.whatwg.org/windows-1251.html),
1811/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1251-bmp.html)
1812///
1813/// This encoding matches the Windows code page 1251.
1814///
1815/// This will change from `static` to `const` if Rust changes
1816/// to make the referent of `pub const FOO: &'static Encoding`
1817/// unique cross-crate, so don't take the address of this
1818/// `static`.
1819pub static WINDOWS_1251: &'static Encoding = &WINDOWS_1251_INIT;
1820
1821/// The initializer for the [windows-1252](static.WINDOWS_1252.html) encoding.
1822///
1823/// For use only for taking the address of this form when
1824/// Rust prohibits the use of the non-`_INIT` form directly,
1825/// such as in initializers of other `static`s. If in doubt,
1826/// use the corresponding non-`_INIT` reference-typed `static`.
1827///
1828/// This part of the public API will go away if Rust changes
1829/// to make the referent of `pub const FOO: &'static Encoding`
1830/// unique cross-crate or if Rust starts allowing static arrays
1831/// to be initialized with `pub static FOO: &'static Encoding`
1832/// items.
1833pub static WINDOWS_1252_INIT: Encoding = Encoding {
1834 name: "windows-1252",
1835 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1252, 0x00A0, 32, 96),
1836};
1837
1838/// The windows-1252 encoding.
1839///
1840/// This is the Western encoding for Windows. It is an extension of ISO-8859-1,
1841/// which is known as Latin 1.
1842///
1843/// [Index visualization](https://encoding.spec.whatwg.org/windows-1252.html),
1844/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1252-bmp.html)
1845///
1846/// This encoding matches the Windows code page 1252.
1847///
1848/// This will change from `static` to `const` if Rust changes
1849/// to make the referent of `pub const FOO: &'static Encoding`
1850/// unique cross-crate, so don't take the address of this
1851/// `static`.
1852pub static WINDOWS_1252: &'static Encoding = &WINDOWS_1252_INIT;
1853
1854/// The initializer for the [windows-1253](static.WINDOWS_1253.html) encoding.
1855///
1856/// For use only for taking the address of this form when
1857/// Rust prohibits the use of the non-`_INIT` form directly,
1858/// such as in initializers of other `static`s. If in doubt,
1859/// use the corresponding non-`_INIT` reference-typed `static`.
1860///
1861/// This part of the public API will go away if Rust changes
1862/// to make the referent of `pub const FOO: &'static Encoding`
1863/// unique cross-crate or if Rust starts allowing static arrays
1864/// to be initialized with `pub static FOO: &'static Encoding`
1865/// items.
1866pub static WINDOWS_1253_INIT: Encoding = Encoding {
1867 name: "windows-1253",
1868 variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1253, 0x03A3, 83, 44),
1869};
1870
/// The windows-1253 encoding.
///
/// This is the Greek encoding for Windows. It is mostly an extension of
/// ISO-8859-7, but U+0386 is mapped to a different byte.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1253.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1253-bmp.html)
///
/// This encoding matches the Windows code page 1253, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `WINDOWS_1253_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1253: &'static Encoding = &WINDOWS_1253_INIT;
1887
/// The initializer for the [windows-1254](static.WINDOWS_1254.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1254_INIT: Encoding = Encoding {
    // The string handed back by `Encoding::name()`.
    name: "windows-1254",
    // References `data::SINGLE_BYTE_DATA.windows_1254`.
    // NOTE(review): the three trailing numbers are generated arguments of
    // `VariantEncoding::SingleByte`; confirm their meaning at that variant's declaration.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1254, 0x00DF, 95, 17),
};
1904
/// The windows-1254 encoding.
///
/// This is the Turkish encoding for Windows. It is an extension of ISO-8859-9,
/// which is known as Latin 5.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1254.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1254-bmp.html)
///
/// This encoding matches the Windows code page 1254.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `WINDOWS_1254_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1254: &'static Encoding = &WINDOWS_1254_INIT;
1920
/// The initializer for the [windows-1255](static.WINDOWS_1255.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1255_INIT: Encoding = Encoding {
    // The string handed back by `Encoding::name()`.
    name: "windows-1255",
    // References `data::SINGLE_BYTE_DATA.windows_1255`.
    // NOTE(review): the three trailing numbers are generated arguments of
    // `VariantEncoding::SingleByte`; confirm their meaning at that variant's declaration.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1255, 0x05D0, 96, 27),
};
1937
/// The windows-1255 encoding.
///
/// This is the Hebrew encoding for Windows. It is an extension of ISO-8859-8-I,
/// except for a currency sign swap.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1255.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1255-bmp.html)
///
/// This encoding matches the Windows code page 1255, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `WINDOWS_1255_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1255: &'static Encoding = &WINDOWS_1255_INIT;
1954
/// The initializer for the [windows-1256](static.WINDOWS_1256.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1256_INIT: Encoding = Encoding {
    // The string handed back by `Encoding::name()`.
    name: "windows-1256",
    // References `data::SINGLE_BYTE_DATA.windows_1256`.
    // NOTE(review): the three trailing numbers are generated arguments of
    // `VariantEncoding::SingleByte`; confirm their meaning at that variant's declaration.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1256, 0x0621, 65, 22),
};
1971
/// The windows-1256 encoding.
///
/// This is the Arabic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1256.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1256-bmp.html)
///
/// This encoding matches the Windows code page 1256.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `WINDOWS_1256_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1256: &'static Encoding = &WINDOWS_1256_INIT;
1986
/// The initializer for the [windows-1257](static.WINDOWS_1257.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1257_INIT: Encoding = Encoding {
    // The string handed back by `Encoding::name()`.
    name: "windows-1257",
    // References `data::SINGLE_BYTE_DATA.windows_1257`.
    // NOTE(review): the three trailing numbers are generated arguments of
    // `VariantEncoding::SingleByte`; confirm their meaning at that variant's declaration.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1257, 0x00DF, 95, 1),
};
2003
/// The windows-1257 encoding.
///
/// This is the Baltic encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1257.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1257-bmp.html)
///
/// This encoding matches the Windows code page 1257, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `WINDOWS_1257_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1257: &'static Encoding = &WINDOWS_1257_INIT;
2019
/// The initializer for the [windows-1258](static.WINDOWS_1258.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_1258_INIT: Encoding = Encoding {
    // The string handed back by `Encoding::name()`.
    name: "windows-1258",
    // References `data::SINGLE_BYTE_DATA.windows_1258`.
    // NOTE(review): the three trailing numbers are generated arguments of
    // `VariantEncoding::SingleByte`; confirm their meaning at that variant's declaration.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_1258, 0x00DF, 95, 4),
};
2036
/// The windows-1258 encoding.
///
/// This is the Vietnamese encoding for Windows.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-1258.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-1258-bmp.html)
///
/// This encoding matches the Windows code page 1258 when used in the
/// non-normalizing mode. Unlike with the other single-byte encodings, the
/// result of decoding is not necessarily in Normalization Form C. On the
/// other hand, input in the Normalization Form C is not encoded without
/// replacement. In general, it's a bad idea to encode to encodings other
/// than UTF-8, but this encoding is especially hazardous to encode to.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `WINDOWS_1258_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_1258: &'static Encoding = &WINDOWS_1258_INIT;
2056
/// The initializer for the [windows-874](static.WINDOWS_874.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static WINDOWS_874_INIT: Encoding = Encoding {
    // The string handed back by `Encoding::name()`.
    name: "windows-874",
    // References `data::SINGLE_BYTE_DATA.windows_874`.
    // NOTE(review): the three trailing numbers are generated arguments of
    // `VariantEncoding::SingleByte`; confirm their meaning at that variant's declaration.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.windows_874, 0x0E01, 33, 58),
};
2073
/// The windows-874 encoding.
///
/// This is the Thai encoding for Windows. It is an extension of TIS-620 / ISO-8859-11.
///
/// [Index visualization](https://encoding.spec.whatwg.org/windows-874.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/windows-874-bmp.html)
///
/// This encoding matches the Windows code page 874, except Windows decodes
/// unassigned code points to the Private Use Area of Unicode.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `WINDOWS_874_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static WINDOWS_874: &'static Encoding = &WINDOWS_874_INIT;
2089
/// The initializer for the [x-mac-cyrillic](static.X_MAC_CYRILLIC.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_MAC_CYRILLIC_INIT: Encoding = Encoding {
    // The string handed back by `Encoding::name()`.
    name: "x-mac-cyrillic",
    // References `data::SINGLE_BYTE_DATA.x_mac_cyrillic`.
    // NOTE(review): the three trailing numbers are generated arguments of
    // `VariantEncoding::SingleByte`; confirm their meaning at that variant's declaration.
    variant: VariantEncoding::SingleByte(&data::SINGLE_BYTE_DATA.x_mac_cyrillic, 0x0430, 96, 31),
};
2106
/// The x-mac-cyrillic encoding.
///
/// This is the MacUkrainian encoding from Mac OS Classic.
///
/// [Index visualization](https://encoding.spec.whatwg.org/x-mac-cyrillic.html),
/// [Visualization of BMP coverage](https://encoding.spec.whatwg.org/x-mac-cyrillic-bmp.html)
///
/// This encoding matches the Windows code page 10017.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `X_MAC_CYRILLIC_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_MAC_CYRILLIC: &'static Encoding = &X_MAC_CYRILLIC_INIT;
2121
/// The initializer for the [x-user-defined](static.X_USER_DEFINED.html) encoding.
///
/// For use only for taking the address of this form when
/// Rust prohibits the use of the non-`_INIT` form directly,
/// such as in initializers of other `static`s. If in doubt,
/// use the corresponding non-`_INIT` reference-typed `static`.
///
/// This part of the public API will go away if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate or if Rust starts allowing static arrays
/// to be initialized with `pub static FOO: &'static Encoding`
/// items.
pub static X_USER_DEFINED_INIT: Encoding = Encoding {
    // The string handed back by `Encoding::name()`.
    name: "x-user-defined",
    // Uses the dedicated `UserDefined` variant, which carries no data reference.
    variant: VariantEncoding::UserDefined,
};
2138
/// The x-user-defined encoding.
///
/// This encoding offsets the non-ASCII bytes by `0xF700` thereby decoding
/// them to the Private Use Area of Unicode. It was used for loading binary
/// data into a JavaScript string using `XMLHttpRequest` before XHR supported
/// the `"arraybuffer"` response type.
///
/// This encoding does not have a Windows code page number.
///
/// The value of this `static` is itself a `&'static Encoding` that refers to
/// `X_USER_DEFINED_INIT`; use the value directly.
///
/// This will change from `static` to `const` if Rust changes
/// to make the referent of `pub const FOO: &'static Encoding`
/// unique cross-crate, so don't take the address of this
/// `static`.
pub static X_USER_DEFINED: &'static Encoding = &X_USER_DEFINED_INIT;
2153
// Labels searched by `Encoding::for_label()`, all in ASCII lower case.
//
// The order is the one that method's `binary_search_by` comparator expects:
// ascending by byte length first, and among labels of equal length ascending
// by comparing their bytes from the last byte to the first. Entry `i` here
// pairs with entry `i` of `ENCODINGS_IN_LABEL_SORT`.
//
// This table is part of the generated section of the file; any hand edit must
// keep both the ordering and the index pairing intact, otherwise lookups will
// miss or return the wrong encoding.
static LABELS_SORTED: [&'static str; 228] = [
    "l1",
    "l2",
    "l3",
    "l4",
    "l5",
    "l6",
    "l9",
    "866",
    "mac",
    "koi",
    "gbk",
    "big5",
    "utf8",
    "koi8",
    "sjis",
    "ucs-2",
    "ms932",
    "cp866",
    "utf-8",
    "cp819",
    "ascii",
    "x-gbk",
    "greek",
    "cp1250",
    "cp1251",
    "latin1",
    "gb2312",
    "cp1252",
    "latin2",
    "cp1253",
    "latin3",
    "cp1254",
    "latin4",
    "cp1255",
    "csbig5",
    "latin5",
    "utf-16",
    "cp1256",
    "ibm866",
    "latin6",
    "cp1257",
    "cp1258",
    "greek8",
    "ibm819",
    "arabic",
    "visual",
    "korean",
    "euc-jp",
    "koi8-r",
    "koi8_r",
    "euc-kr",
    "x-sjis",
    "koi8-u",
    "hebrew",
    "tis-620",
    "gb18030",
    "ksc5601",
    "gb_2312",
    "dos-874",
    "cn-big5",
    "unicode",
    "chinese",
    "logical",
    "cskoi8r",
    "cseuckr",
    "koi8-ru",
    "x-cp1250",
    "ksc_5601",
    "x-cp1251",
    "iso88591",
    "csgb2312",
    "x-cp1252",
    "iso88592",
    "x-cp1253",
    "iso88593",
    "ecma-114",
    "x-cp1254",
    "iso88594",
    "x-cp1255",
    "iso88595",
    "x-x-big5",
    "x-cp1256",
    "csibm866",
    "iso88596",
    "x-cp1257",
    "iso88597",
    "asmo-708",
    "ecma-118",
    "elot_928",
    "x-cp1258",
    "iso88598",
    "iso88599",
    "cyrillic",
    "utf-16be",
    "utf-16le",
    "us-ascii",
    "ms_kanji",
    "x-euc-jp",
    "iso885910",
    "iso8859-1",
    "iso885911",
    "iso8859-2",
    "iso8859-3",
    "iso885913",
    "iso8859-4",
    "iso885914",
    "iso8859-5",
    "iso885915",
    "iso8859-6",
    "iso8859-7",
    "iso8859-8",
    "iso-ir-58",
    "iso8859-9",
    "csunicode",
    "macintosh",
    "shift-jis",
    "shift_jis",
    "iso-ir-100",
    "iso8859-10",
    "iso-ir-110",
    "gb_2312-80",
    "iso-8859-1",
    "iso_8859-1",
    "iso-ir-101",
    "iso8859-11",
    "iso-8859-2",
    "iso_8859-2",
    "hz-gb-2312",
    "iso-8859-3",
    "iso_8859-3",
    "iso8859-13",
    "iso-8859-4",
    "iso_8859-4",
    "iso8859-14",
    "iso-ir-144",
    "iso-8859-5",
    "iso_8859-5",
    "iso8859-15",
    "iso-8859-6",
    "iso_8859-6",
    "iso-ir-126",
    "iso-8859-7",
    "iso_8859-7",
    "iso-ir-127",
    "iso-ir-157",
    "iso-8859-8",
    "iso_8859-8",
    "iso-ir-138",
    "iso-ir-148",
    "iso-8859-9",
    "iso_8859-9",
    "iso-ir-109",
    "iso-ir-149",
    "big5-hkscs",
    "csshiftjis",
    "iso-8859-10",
    "iso-8859-11",
    "csisolatin1",
    "csisolatin2",
    "iso-8859-13",
    "csisolatin3",
    "iso-8859-14",
    "windows-874",
    "csisolatin4",
    "iso-8859-15",
    "iso_8859-15",
    "csisolatin5",
    "iso-8859-16",
    "csisolatin6",
    "windows-949",
    "csisolatin9",
    "csiso88596e",
    "csiso88598e",
    "unicodefffe",
    "unicodefeff",
    "csmacintosh",
    "csiso88596i",
    "csiso88598i",
    "windows-31j",
    "x-mac-roman",
    "iso-2022-cn",
    "iso-2022-jp",
    "csiso2022jp",
    "iso-2022-kr",
    "csiso2022kr",
    "replacement",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "iso-8859-6-e",
    "iso-8859-8-e",
    "iso-8859-6-i",
    "iso-8859-8-i",
    "sun_eu_greek",
    "csksc56011987",
    "unicode20utf8",
    "unicode11utf8",
    "ks_c_5601-1987",
    "ansi_x3.4-1968",
    "ks_c_5601-1989",
    "x-mac-cyrillic",
    "x-user-defined",
    "csiso58gb231280",
    "iso-10646-ucs-2",
    "iso_8859-1:1987",
    "iso_8859-2:1987",
    "iso_8859-6:1987",
    "iso_8859-7:1987",
    "iso_8859-3:1988",
    "iso_8859-4:1988",
    "iso_8859-5:1988",
    "iso_8859-8:1988",
    "x-unicode20utf8",
    "iso_8859-9:1989",
    "csisolatingreek",
    "x-mac-ukrainian",
    "iso-2022-cn-ext",
    "csisolatinarabic",
    "csisolatinhebrew",
    "unicode-1-1-utf-8",
    "csisolatincyrillic",
    "cseucpkdfmtjapanese",
];
2384
// Encodings in the same order as `LABELS_SORTED`: entry `i` is the encoding
// that `Encoding::for_label()` returns when its binary search matches
// `LABELS_SORTED[i]`. Several labels share an encoding, so the same `_INIT`
// item appears more than once.
//
// The `_INIT` forms are referenced here instead of the reference-typed
// `static`s because the latter cannot be used to initialize the items of a
// `[&'static Encoding; N]` array.
//
// This table is part of the generated section of the file and must stay
// index-parallel with `LABELS_SORTED`.
static ENCODINGS_IN_LABEL_SORT: [&'static Encoding; 228] = [
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_15_INIT,
    &IBM866_INIT,
    &MACINTOSH_INIT,
    &KOI8_R_INIT,
    &GBK_INIT,
    &BIG5_INIT,
    &UTF_8_INIT,
    &KOI8_R_INIT,
    &SHIFT_JIS_INIT,
    &UTF_16LE_INIT,
    &SHIFT_JIS_INIT,
    &IBM866_INIT,
    &UTF_8_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &BIG5_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &EUC_KR_INIT,
    &EUC_JP_INIT,
    &KOI8_R_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &SHIFT_JIS_INIT,
    &KOI8_U_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_874_INIT,
    &GB18030_INIT,
    &EUC_KR_INIT,
    &GBK_INIT,
    &WINDOWS_874_INIT,
    &BIG5_INIT,
    &UTF_16LE_INIT,
    &GBK_INIT,
    &ISO_8859_8_I_INIT,
    &KOI8_R_INIT,
    &EUC_KR_INIT,
    &KOI8_U_INIT,
    &WINDOWS_1250_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_1253_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_4_INIT,
    &WINDOWS_1255_INIT,
    &ISO_8859_5_INIT,
    &BIG5_INIT,
    &WINDOWS_1256_INIT,
    &IBM866_INIT,
    &ISO_8859_6_INIT,
    &WINDOWS_1257_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_5_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &SHIFT_JIS_INIT,
    &EUC_JP_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_8_INIT,
    &GBK_INIT,
    &WINDOWS_1254_INIT,
    &UTF_16LE_INIT,
    &MACINTOSH_INIT,
    &SHIFT_JIS_INIT,
    &SHIFT_JIS_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_4_INIT,
    &GBK_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_2_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_14_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_10_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_8_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_3_INIT,
    &EUC_KR_INIT,
    &BIG5_INIT,
    &SHIFT_JIS_INIT,
    &ISO_8859_10_INIT,
    &WINDOWS_874_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_13_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_14_INIT,
    &WINDOWS_874_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_15_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_16_INIT,
    &ISO_8859_10_INIT,
    &EUC_KR_INIT,
    &ISO_8859_15_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_16BE_INIT,
    &UTF_16LE_INIT,
    &MACINTOSH_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &SHIFT_JIS_INIT,
    &MACINTOSH_INIT,
    &REPLACEMENT_INIT,
    &ISO_2022_JP_INIT,
    &ISO_2022_JP_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &REPLACEMENT_INIT,
    &WINDOWS_1250_INIT,
    &WINDOWS_1251_INIT,
    &WINDOWS_1252_INIT,
    &WINDOWS_1253_INIT,
    &WINDOWS_1254_INIT,
    &WINDOWS_1255_INIT,
    &WINDOWS_1256_INIT,
    &WINDOWS_1257_INIT,
    &WINDOWS_1258_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_I_INIT,
    &ISO_8859_7_INIT,
    &EUC_KR_INIT,
    &UTF_8_INIT,
    &UTF_8_INIT,
    &EUC_KR_INIT,
    &WINDOWS_1252_INIT,
    &EUC_KR_INIT,
    &X_MAC_CYRILLIC_INIT,
    &X_USER_DEFINED_INIT,
    &GBK_INIT,
    &UTF_16LE_INIT,
    &WINDOWS_1252_INIT,
    &ISO_8859_2_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_7_INIT,
    &ISO_8859_3_INIT,
    &ISO_8859_4_INIT,
    &ISO_8859_5_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &WINDOWS_1254_INIT,
    &ISO_8859_7_INIT,
    &X_MAC_CYRILLIC_INIT,
    &REPLACEMENT_INIT,
    &ISO_8859_6_INIT,
    &ISO_8859_8_INIT,
    &UTF_8_INIT,
    &ISO_8859_5_INIT,
    &EUC_JP_INIT,
];
2615
2616// END GENERATED CODE
2617
/// An encoding as defined in the [Encoding Standard][1].
///
/// An _encoding_ defines a mapping from a `u8` sequence to a `char` sequence
/// and, in most cases, vice versa. Each encoding has a name, an output
/// encoding, and one or more labels.
///
/// _Labels_ are ASCII-case-insensitive strings that are used to identify an
/// encoding in formats and protocols. The _name_ of the encoding is the
/// preferred label in the case appropriate for returning from the
/// [`characterSet`][2] property of the `Document` DOM interface.
///
/// The _output encoding_ is the encoding used for form submission and URL
/// parsing on Web pages in the encoding. This is UTF-8 for the replacement,
/// UTF-16LE and UTF-16BE encodings and the encoding itself for other
/// encodings.
///
/// [1]: https://encoding.spec.whatwg.org/
/// [2]: https://dom.spec.whatwg.org/#dom-document-characterset
///
/// # Streaming vs. Non-Streaming
///
/// When you have the entire input in a single buffer, you can use the
/// methods [`decode()`][3], [`decode_with_bom_removal()`][4],
/// [`decode_without_bom_handling()`][5],
/// [`decode_without_bom_handling_and_without_replacement()`][6] and
/// [`encode()`][7]. (These methods are available to Rust callers only and are
/// not available in the C API.) Unlike the rest of the API available to Rust,
/// these methods perform heap allocations. You should use the `Decoder` and
/// `Encoder` objects when your input is split into multiple buffers or when
/// you want to control the allocation of the output buffers.
///
/// [3]: #method.decode
/// [4]: #method.decode_with_bom_removal
/// [5]: #method.decode_without_bom_handling
/// [6]: #method.decode_without_bom_handling_and_without_replacement
/// [7]: #method.encode
///
/// # Instances
///
/// All instances of `Encoding` are statically allocated and have the `'static`
/// lifetime. There is precisely one unique `Encoding` instance for each
/// encoding defined in the Encoding Standard.
///
/// To obtain a reference to a particular encoding whose identity you know at
/// compile time, use a `static` that refers to the encoding. There is a
/// `static` for each encoding. The `static`s are named in all caps with hyphens
/// replaced with underscores (and in C/C++ have `_ENCODING` appended to the
/// name). For example, if you know at compile time that you will want to
/// decode using the UTF-8 encoding, use the `UTF_8` `static` (`UTF_8_ENCODING`
/// in C/C++).
///
/// Additionally, there are non-reference-typed forms ending with `_INIT` to
/// work around the problem that `static`s of the type `&'static Encoding`
/// cannot be used to initialize items of an array whose type is
/// `[&'static Encoding; N]`.
///
/// If you don't know what encoding you need at compile time and need to
/// dynamically get an encoding by label, use
/// <code>Encoding::<a href="#method.for_label">for_label</a>(<var>label</var>)</code>.
///
/// Instances of `Encoding` can be compared with `==` (in both Rust and in
/// C/C++).
pub struct Encoding {
    // The encoding's name; `name()` returns this string as-is.
    name: &'static str,
    // The encoding-specific variant; `is_single_byte()` delegates to it.
    variant: VariantEncoding,
}
2684
2685impl Encoding {
2686 /// Implements the
2687 /// [_get an encoding_](https://encoding.spec.whatwg.org/#concept-encoding-get)
2688 /// algorithm.
2689 ///
2690 /// If, after ASCII-lowercasing and removing leading and trailing
2691 /// whitespace, the argument matches a label defined in the Encoding
2692 /// Standard, `Some(&'static Encoding)` representing the corresponding
2693 /// encoding is returned. If there is no match, `None` is returned.
2694 ///
2695 /// This is the right method to use if the action upon the method returning
2696 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`) instead.
2697 /// When the action upon the method returning `None` is not to proceed with
2698 /// a fallback but to refuse processing, `for_label_no_replacement()` is more
2699 /// appropriate.
2700 ///
2701 /// The argument is of type `&[u8]` instead of `&str` to save callers
2702 /// that are extracting the label from a non-UTF-8 protocol the trouble
2703 /// of conversion to UTF-8. (If you have a `&str`, just call `.as_bytes()`
2704 /// on it.)
2705 ///
2706 /// Available via the C wrapper.
2707 ///
2708 /// # Example
2709 /// ```
2710 /// use encoding_rs::Encoding;
2711 ///
2712 /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"utf-8"));
2713 /// assert_eq!(Some(encoding_rs::UTF_8), Encoding::for_label(b"unicode11utf8"));
2714 ///
2715 /// assert_eq!(Some(encoding_rs::ISO_8859_2), Encoding::for_label(b"latin2"));
2716 ///
2717 /// assert_eq!(Some(encoding_rs::UTF_16BE), Encoding::for_label(b"utf-16be"));
2718 ///
2719 /// assert_eq!(None, Encoding::for_label(b"unrecognized label"));
2720 /// ```
2721 pub fn for_label(label: &[u8]) -> Option<&'static Encoding> {
2722 let mut trimmed = [0u8; LONGEST_LABEL_LENGTH];
2723 let mut trimmed_pos = 0usize;
2724 let mut iter = label.into_iter();
2725 // before
2726 loop {
2727 match iter.next() {
2728 None => {
2729 return None;
2730 }
2731 Some(byte) => {
2732 // The characters used in labels are:
2733 // a-z (except q, but excluding it below seems excessive)
2734 // 0-9
2735 // . _ - :
2736 match *byte {
2737 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2738 continue;
2739 }
2740 b'A'..=b'Z' => {
2741 trimmed[trimmed_pos] = *byte + 0x20u8;
2742 trimmed_pos = 1usize;
2743 break;
2744 }
2745 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2746 trimmed[trimmed_pos] = *byte;
2747 trimmed_pos = 1usize;
2748 break;
2749 }
2750 _ => {
2751 return None;
2752 }
2753 }
2754 }
2755 }
2756 }
2757 // inside
2758 loop {
2759 match iter.next() {
2760 None => {
2761 break;
2762 }
2763 Some(byte) => {
2764 match *byte {
2765 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2766 break;
2767 }
2768 b'A'..=b'Z' => {
2769 if trimmed_pos == LONGEST_LABEL_LENGTH {
2770 // There's no encoding with a label this long
2771 return None;
2772 }
2773 trimmed[trimmed_pos] = *byte + 0x20u8;
2774 trimmed_pos += 1usize;
2775 continue;
2776 }
2777 b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_' | b':' | b'.' => {
2778 if trimmed_pos == LONGEST_LABEL_LENGTH {
2779 // There's no encoding with a label this long
2780 return None;
2781 }
2782 trimmed[trimmed_pos] = *byte;
2783 trimmed_pos += 1usize;
2784 continue;
2785 }
2786 _ => {
2787 return None;
2788 }
2789 }
2790 }
2791 }
2792 }
2793 // after
2794 loop {
2795 match iter.next() {
2796 None => {
2797 break;
2798 }
2799 Some(byte) => {
2800 match *byte {
2801 0x09u8 | 0x0Au8 | 0x0Cu8 | 0x0Du8 | 0x20u8 => {
2802 continue;
2803 }
2804 _ => {
2805 // There's no label with space in the middle
2806 return None;
2807 }
2808 }
2809 }
2810 }
2811 }
2812 let candidate = &trimmed[..trimmed_pos];
2813 match LABELS_SORTED.binary_search_by(|probe| {
2814 let bytes = probe.as_bytes();
2815 let c = bytes.len().cmp(&candidate.len());
2816 if c != Ordering::Equal {
2817 return c;
2818 }
2819 let probe_iter = bytes.iter().rev();
2820 let candidate_iter = candidate.iter().rev();
2821 probe_iter.cmp(candidate_iter)
2822 }) {
2823 Ok(i) => Some(ENCODINGS_IN_LABEL_SORT[i]),
2824 Err(_) => None,
2825 }
2826 }
2827
2828 /// This method behaves the same as `for_label()`, except when `for_label()`
2829 /// would return `Some(REPLACEMENT)`, this method returns `None` instead.
2830 ///
2831 /// This method is useful in scenarios where a fatal error is required
2832 /// upon invalid label, because in those cases the caller typically wishes
2833 /// to treat the labels that map to the replacement encoding as fatal
2834 /// errors, too.
2835 ///
2836 /// It is not OK to use this method when the action upon the method returning
2837 /// `None` is to use a fallback encoding (e.g. `WINDOWS_1252`). In such a
2838 /// case, the `for_label()` method should be used instead in order to avoid
2839 /// unsafe fallback for labels that `for_label()` maps to `Some(REPLACEMENT)`.
2840 ///
2841 /// Available via the C wrapper.
2842 #[inline]
2843 pub fn for_label_no_replacement(label: &[u8]) -> Option<&'static Encoding> {
2844 match Encoding::for_label(label) {
2845 None => None,
2846 Some(encoding) => {
2847 if encoding == REPLACEMENT {
2848 None
2849 } else {
2850 Some(encoding)
2851 }
2852 }
2853 }
2854 }
2855
2856 /// Performs non-incremental BOM sniffing.
2857 ///
2858 /// The argument must either be a buffer representing the entire input
2859 /// stream (non-streaming case) or a buffer representing at least the first
2860 /// three bytes of the input stream (streaming case).
2861 ///
2862 /// Returns `Some((UTF_8, 3))`, `Some((UTF_16LE, 2))` or
2863 /// `Some((UTF_16BE, 2))` if the argument starts with the UTF-8, UTF-16LE
2864 /// or UTF-16BE BOM or `None` otherwise.
2865 ///
2866 /// Available via the C wrapper.
2867 #[inline]
2868 pub fn for_bom(buffer: &[u8]) -> Option<(&'static Encoding, usize)> {
2869 if buffer.starts_with(b"\xEF\xBB\xBF") {
2870 Some((UTF_8, 3))
2871 } else if buffer.starts_with(b"\xFF\xFE") {
2872 Some((UTF_16LE, 2))
2873 } else if buffer.starts_with(b"\xFE\xFF") {
2874 Some((UTF_16BE, 2))
2875 } else {
2876 None
2877 }
2878 }
2879
    /// Returns the name of this encoding.
    ///
    /// This name is appropriate to return as-is from the DOM
    /// `document.characterSet` property.
    ///
    /// The returned string is the `name` stored in this `Encoding` instance
    /// and has the `'static` lifetime.
    ///
    /// Available via the C wrapper.
    #[inline]
    pub fn name(&'static self) -> &'static str {
        self.name
    }
2890
2891 /// Checks whether the _output encoding_ of this encoding can encode every
2892 /// `char`. (Only true if the output encoding is UTF-8.)
2893 ///
2894 /// Available via the C wrapper.
2895 #[inline]
2896 pub fn can_encode_everything(&'static self) -> bool {
2897 self.output_encoding() == UTF_8
2898 }
2899
2900 /// Checks whether the bytes 0x00...0x7F map exclusively to the characters
2901 /// U+0000...U+007F and vice versa.
2902 ///
2903 /// Available via the C wrapper.
2904 #[inline]
2905 pub fn is_ascii_compatible(&'static self) -> bool {
2906 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE || self == ISO_2022_JP)
2907 }
2908
2909 /// Checks whether this encoding maps one byte to one Basic Multilingual
2910 /// Plane code point (i.e. byte length equals decoded UTF-16 length) and
2911 /// vice versa (for mappable characters).
2912 ///
2913 /// `true` iff this encoding is on the list of [Legacy single-byte
2914 /// encodings](https://encoding.spec.whatwg.org/#legacy-single-byte-encodings)
2915 /// in the spec or x-user-defined.
2916 ///
2917 /// Available via the C wrapper.
2918 #[inline]
2919 pub fn is_single_byte(&'static self) -> bool {
2920 self.variant.is_single_byte()
2921 }
2922
2923 /// Checks whether the bytes 0x00...0x7F map mostly to the characters
2924 /// U+0000...U+007F and vice versa.
2925 #[cfg(feature = "alloc")]
2926 #[inline]
2927 fn is_potentially_borrowable(&'static self) -> bool {
2928 !(self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE)
2929 }
2930
2931 /// Returns the _output encoding_ of this encoding. This is UTF-8 for
2932 /// UTF-16BE, UTF-16LE, and replacement and the encoding itself otherwise.
2933 ///
2934 /// _Note:_ The _output encoding_ concept is needed for form submission and
2935 /// error handling in the query strings of URLs in the Web Platform.
2936 ///
2937 /// Available via the C wrapper.
2938 #[inline]
2939 pub fn output_encoding(&'static self) -> &'static Encoding {
2940 if self == REPLACEMENT || self == UTF_16BE || self == UTF_16LE {
2941 UTF_8
2942 } else {
2943 self
2944 }
2945 }
2946
2947 /// Decode complete input to `Cow<'a, str>` _with BOM sniffing_ and with
2948 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2949 /// entire input is available as a single buffer (i.e. the end of the
2950 /// buffer marks the end of the stream).
2951 ///
2952 /// The BOM, if any, does not appear in the output.
2953 ///
2954 /// This method implements the (non-streaming version of) the
2955 /// [_decode_](https://encoding.spec.whatwg.org/#decode) spec concept.
2956 ///
2957 /// The second item in the returned tuple is the encoding that was actually
2958 /// used (which may differ from this encoding thanks to BOM sniffing).
2959 ///
2960 /// The third item in the returned tuple indicates whether there were
2961 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
2962 ///
2963 /// _Note:_ It is wrong to use this when the input buffer represents only
2964 /// a segment of the input instead of the whole input. Use `new_decoder()`
2965 /// when decoding segmented input.
2966 ///
2967 /// This method performs a one or two heap allocations for the backing
2968 /// buffer of the `String` when unable to borrow. (One allocation if not
2969 /// errors and potentially another one in the presence of errors.) The
2970 /// first allocation assumes jemalloc and may not be optimal with
2971 /// allocators that do not use power-of-two buckets. A borrow is performed
2972 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
2973 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
2974 /// ISO-2022-JP and the input is entirely in the ASCII state without state
2975 /// transitions.
2976 ///
2977 /// # Panics
2978 ///
2979 /// If the size calculation for a heap-allocated backing buffer overflows
2980 /// `usize`.
2981 ///
2982 /// Available to Rust only and only with the `alloc` feature enabled (enabled
2983 /// by default).
2984 #[cfg(feature = "alloc")]
2985 #[inline]
2986 pub fn decode<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, &'static Encoding, bool) {
2987 let (encoding, without_bom) = match Encoding::for_bom(bytes) {
2988 Some((encoding, bom_length)) => (encoding, &bytes[bom_length..]),
2989 None => (self, bytes),
2990 };
2991 let (cow, had_errors) = encoding.decode_without_bom_handling(without_bom);
2992 (cow, encoding, had_errors)
2993 }
2994
2995 /// Decode complete input to `Cow<'a, str>` _with BOM removal_ and with
2996 /// malformed sequences replaced with the REPLACEMENT CHARACTER when the
2997 /// entire input is available as a single buffer (i.e. the end of the
2998 /// buffer marks the end of the stream).
2999 ///
3000 /// Only an initial byte sequence that is a BOM for this encoding is removed.
3001 ///
3002 /// When invoked on `UTF_8`, this method implements the (non-streaming
3003 /// version of) the
3004 /// [_UTF-8 decode_](https://encoding.spec.whatwg.org/#utf-8-decode) spec
3005 /// concept.
3006 ///
3007 /// The second item in the returned pair indicates whether there were
3008 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3009 ///
3010 /// _Note:_ It is wrong to use this when the input buffer represents only
3011 /// a segment of the input instead of the whole input. Use
3012 /// `new_decoder_with_bom_removal()` when decoding segmented input.
3013 ///
3014 /// This method performs a one or two heap allocations for the backing
3015 /// buffer of the `String` when unable to borrow. (One allocation if not
3016 /// errors and potentially another one in the presence of errors.) The
3017 /// first allocation assumes jemalloc and may not be optimal with
3018 /// allocators that do not use power-of-two buckets. A borrow is performed
3019 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3020 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3021 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3022 /// transitions.
3023 ///
3024 /// # Panics
3025 ///
3026 /// If the size calculation for a heap-allocated backing buffer overflows
3027 /// `usize`.
3028 ///
3029 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3030 /// by default).
3031 #[cfg(feature = "alloc")]
3032 #[inline]
3033 pub fn decode_with_bom_removal<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3034 let without_bom = if self == UTF_8 && bytes.starts_with(b"\xEF\xBB\xBF") {
3035 &bytes[3..]
3036 } else if (self == UTF_16LE && bytes.starts_with(b"\xFF\xFE"))
3037 || (self == UTF_16BE && bytes.starts_with(b"\xFE\xFF"))
3038 {
3039 &bytes[2..]
3040 } else {
3041 bytes
3042 };
3043 self.decode_without_bom_handling(without_bom)
3044 }
3045
3046 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3047 /// with malformed sequences replaced with the REPLACEMENT CHARACTER when
3048 /// the entire input is available as a single buffer (i.e. the end of the
3049 /// buffer marks the end of the stream).
3050 ///
3051 /// When invoked on `UTF_8`, this method implements the (non-streaming
3052 /// version of) the
3053 /// [_UTF-8 decode without BOM_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom)
3054 /// spec concept.
3055 ///
3056 /// The second item in the returned pair indicates whether there were
3057 /// malformed sequences (that were replaced with the REPLACEMENT CHARACTER).
3058 ///
3059 /// _Note:_ It is wrong to use this when the input buffer represents only
3060 /// a segment of the input instead of the whole input. Use
3061 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3062 ///
3063 /// This method performs a one or two heap allocations for the backing
3064 /// buffer of the `String` when unable to borrow. (One allocation if not
3065 /// errors and potentially another one in the presence of errors.) The
3066 /// first allocation assumes jemalloc and may not be optimal with
3067 /// allocators that do not use power-of-two buckets. A borrow is performed
3068 /// if decoding UTF-8 and the input is valid UTF-8, if decoding an
3069 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3070 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3071 /// transitions.
3072 ///
3073 /// # Panics
3074 ///
3075 /// If the size calculation for a heap-allocated backing buffer overflows
3076 /// `usize`.
3077 ///
3078 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3079 /// by default).
3080 #[cfg(feature = "alloc")]
3081 pub fn decode_without_bom_handling<'a>(&'static self, bytes: &'a [u8]) -> (Cow<'a, str>, bool) {
3082 let (mut decoder, mut string, mut total_read) = if self.is_potentially_borrowable() {
3083 let valid_up_to = if self == UTF_8 {
3084 utf8_valid_up_to(bytes)
3085 } else if self == ISO_2022_JP {
3086 iso_2022_jp_ascii_valid_up_to(bytes)
3087 } else {
3088 ascii_valid_up_to(bytes)
3089 };
3090 if valid_up_to == bytes.len() {
3091 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3092 return (Cow::Borrowed(str), false);
3093 }
3094 let decoder = self.new_decoder_without_bom_handling();
3095
3096 let rounded_without_replacement = checked_next_power_of_two(checked_add(
3097 valid_up_to,
3098 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3099 ));
3100 let with_replacement = checked_add(
3101 valid_up_to,
3102 decoder.max_utf8_buffer_length(bytes.len() - valid_up_to),
3103 );
3104 let mut string = String::with_capacity(
3105 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3106 );
3107 unsafe {
3108 let vec = string.as_mut_vec();
3109 vec.set_len(valid_up_to);
3110 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3111 }
3112 (decoder, string, valid_up_to)
3113 } else {
3114 let decoder = self.new_decoder_without_bom_handling();
3115 let rounded_without_replacement = checked_next_power_of_two(
3116 decoder.max_utf8_buffer_length_without_replacement(bytes.len()),
3117 );
3118 let with_replacement = decoder.max_utf8_buffer_length(bytes.len());
3119 let string = String::with_capacity(
3120 checked_min(rounded_without_replacement, with_replacement).unwrap(),
3121 );
3122 (decoder, string, 0)
3123 };
3124
3125 let mut total_had_errors = false;
3126 loop {
3127 let (result, read, had_errors) =
3128 decoder.decode_to_string(&bytes[total_read..], &mut string, true);
3129 total_read += read;
3130 total_had_errors |= had_errors;
3131 match result {
3132 CoderResult::InputEmpty => {
3133 debug_assert_eq!(total_read, bytes.len());
3134 return (Cow::Owned(string), total_had_errors);
3135 }
3136 CoderResult::OutputFull => {
3137 // Allocate for the worst case. That is, we should come
3138 // here at most once per invocation of this method.
3139 let needed = decoder.max_utf8_buffer_length(bytes.len() - total_read);
3140 string.reserve(needed.unwrap());
3141 }
3142 }
3143 }
3144 }
3145
3146 /// Decode complete input to `Cow<'a, str>` _without BOM handling_ and
3147 /// _with malformed sequences treated as fatal_ when the entire input is
3148 /// available as a single buffer (i.e. the end of the buffer marks the end
3149 /// of the stream).
3150 ///
3151 /// When invoked on `UTF_8`, this method implements the (non-streaming
3152 /// version of) the
3153 /// [_UTF-8 decode without BOM or fail_](https://encoding.spec.whatwg.org/#utf-8-decode-without-bom-or-fail)
3154 /// spec concept.
3155 ///
3156 /// Returns `None` if a malformed sequence was encountered and the result
3157 /// of the decode as `Some(String)` otherwise.
3158 ///
3159 /// _Note:_ It is wrong to use this when the input buffer represents only
3160 /// a segment of the input instead of the whole input. Use
3161 /// `new_decoder_without_bom_handling()` when decoding segmented input.
3162 ///
3163 /// This method performs a single heap allocation for the backing
3164 /// buffer of the `String` when unable to borrow. A borrow is performed if
3165 /// decoding UTF-8 and the input is valid UTF-8, if decoding an
3166 /// ASCII-compatible encoding and the input is ASCII-only, or when decoding
3167 /// ISO-2022-JP and the input is entirely in the ASCII state without state
3168 /// transitions.
3169 ///
3170 /// # Panics
3171 ///
3172 /// If the size calculation for a heap-allocated backing buffer overflows
3173 /// `usize`.
3174 ///
3175 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3176 /// by default).
3177 #[cfg(feature = "alloc")]
3178 pub fn decode_without_bom_handling_and_without_replacement<'a>(
3179 &'static self,
3180 bytes: &'a [u8],
3181 ) -> Option<Cow<'a, str>> {
3182 if self == UTF_8 {
3183 let valid_up_to = utf8_valid_up_to(bytes);
3184 if valid_up_to == bytes.len() {
3185 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3186 return Some(Cow::Borrowed(str));
3187 }
3188 return None;
3189 }
3190 let (mut decoder, mut string, input) = if self.is_potentially_borrowable() {
3191 let valid_up_to = if self == ISO_2022_JP {
3192 iso_2022_jp_ascii_valid_up_to(bytes)
3193 } else {
3194 ascii_valid_up_to(bytes)
3195 };
3196 if valid_up_to == bytes.len() {
3197 let str: &str = unsafe { core::str::from_utf8_unchecked(bytes) };
3198 return Some(Cow::Borrowed(str));
3199 }
3200 let decoder = self.new_decoder_without_bom_handling();
3201 let mut string = String::with_capacity(
3202 checked_add(
3203 valid_up_to,
3204 decoder.max_utf8_buffer_length_without_replacement(bytes.len() - valid_up_to),
3205 )
3206 .unwrap(),
3207 );
3208 unsafe {
3209 let vec = string.as_mut_vec();
3210 vec.set_len(valid_up_to);
3211 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3212 }
3213 (decoder, string, &bytes[valid_up_to..])
3214 } else {
3215 let decoder = self.new_decoder_without_bom_handling();
3216 let string = String::with_capacity(
3217 decoder
3218 .max_utf8_buffer_length_without_replacement(bytes.len())
3219 .unwrap(),
3220 );
3221 (decoder, string, bytes)
3222 };
3223 let (result, read) = decoder.decode_to_string_without_replacement(input, &mut string, true);
3224 match result {
3225 DecoderResult::InputEmpty => {
3226 debug_assert_eq!(read, input.len());
3227 Some(Cow::Owned(string))
3228 }
3229 DecoderResult::Malformed(_, _) => None,
3230 DecoderResult::OutputFull => unreachable!(),
3231 }
3232 }
3233
3234 /// Encode complete input to `Cow<'a, [u8]>` using the
3235 /// [_output encoding_](Encoding::output_encoding) of this encoding with
3236 /// unmappable characters replaced with decimal numeric character references
3237 /// when the entire input is available as a single buffer (i.e. the end of
3238 /// the buffer marks the end of the stream).
3239 ///
3240 /// This method implements the (non-streaming version of) the
3241 /// [_encode_](https://encoding.spec.whatwg.org/#encode) spec concept. For
3242 /// the [_UTF-8 encode_](https://encoding.spec.whatwg.org/#utf-8-encode)
3243 /// spec concept, it is slightly more efficient to use
3244 /// <code><var>string</var>.as_bytes()</code> instead of invoking this
3245 /// method on `UTF_8`.
3246 ///
3247 /// The second item in the returned tuple is the encoding that was actually
3248 /// used (*which may differ from this encoding thanks to some encodings
3249 /// having UTF-8 as their output encoding*).
3250 ///
3251 /// The third item in the returned tuple indicates whether there were
3252 /// unmappable characters (that were replaced with HTML numeric character
3253 /// references).
3254 ///
3255 /// _Note:_ It is wrong to use this when the input buffer represents only
3256 /// a segment of the input instead of the whole input. Use `new_encoder()`
3257 /// when encoding segmented output.
3258 ///
3259 /// When encoding to UTF-8 or when encoding an ASCII-only input to a
3260 /// ASCII-compatible encoding, this method returns a borrow of the input
3261 /// without a heap allocation. Otherwise, this method performs a single
3262 /// heap allocation for the backing buffer of the `Vec<u8>` if there are no
3263 /// unmappable characters and potentially multiple heap allocations if
3264 /// there are. These allocations are tuned for jemalloc and may not be
3265 /// optimal when using a different allocator that doesn't use power-of-two
3266 /// buckets.
3267 ///
3268 /// # Panics
3269 ///
3270 /// If the size calculation for a heap-allocated backing buffer overflows
3271 /// `usize`.
3272 ///
3273 /// Available to Rust only and only with the `alloc` feature enabled (enabled
3274 /// by default).
3275 #[cfg(feature = "alloc")]
3276 pub fn encode<'a>(&'static self, string: &'a str) -> (Cow<'a, [u8]>, &'static Encoding, bool) {
3277 let output_encoding = self.output_encoding();
3278 if output_encoding == UTF_8 {
3279 return (Cow::Borrowed(string.as_bytes()), output_encoding, false);
3280 }
3281 debug_assert!(output_encoding.is_potentially_borrowable());
3282 let bytes = string.as_bytes();
3283 let valid_up_to = if output_encoding == ISO_2022_JP {
3284 iso_2022_jp_ascii_valid_up_to(bytes)
3285 } else {
3286 ascii_valid_up_to(bytes)
3287 };
3288 if valid_up_to == bytes.len() {
3289 return (Cow::Borrowed(bytes), output_encoding, false);
3290 }
3291 let mut encoder = output_encoding.new_encoder();
3292 let mut vec: Vec<u8> = Vec::with_capacity(
3293 (checked_add(
3294 valid_up_to,
3295 encoder.max_buffer_length_from_utf8_if_no_unmappables(string.len() - valid_up_to),
3296 ))
3297 .unwrap()
3298 .next_power_of_two(),
3299 );
3300 unsafe {
3301 vec.set_len(valid_up_to);
3302 core::ptr::copy_nonoverlapping(bytes.as_ptr(), vec.as_mut_ptr(), valid_up_to);
3303 }
3304 let mut total_read = valid_up_to;
3305 let mut total_had_errors = false;
3306 loop {
3307 let (result, read, had_errors) =
3308 encoder.encode_from_utf8_to_vec(&string[total_read..], &mut vec, true);
3309 total_read += read;
3310 total_had_errors |= had_errors;
3311 match result {
3312 CoderResult::InputEmpty => {
3313 debug_assert_eq!(total_read, string.len());
3314 return (Cow::Owned(vec), output_encoding, total_had_errors);
3315 }
3316 CoderResult::OutputFull => {
3317 // reserve_exact wants to know how much more on top of current
3318 // length--not current capacity.
3319 let needed = encoder
3320 .max_buffer_length_from_utf8_if_no_unmappables(string.len() - total_read);
3321 let rounded = (checked_add(vec.capacity(), needed))
3322 .unwrap()
3323 .next_power_of_two();
3324 let additional = rounded - vec.len();
3325 vec.reserve_exact(additional);
3326 }
3327 }
3328 }
3329 }
3330
3331 fn new_variant_decoder(&'static self) -> VariantDecoder {
3332 self.variant.new_variant_decoder()
3333 }
3334
3335 /// Instantiates a new decoder for this encoding with BOM sniffing enabled.
3336 ///
3337 /// BOM sniffing may cause the returned decoder to morph into a decoder
3338 /// for UTF-8, UTF-16LE or UTF-16BE instead of this encoding. The BOM
3339 /// does not appear in the output.
3340 ///
3341 /// Available via the C wrapper.
3342 #[inline]
3343 pub fn new_decoder(&'static self) -> Decoder {
3344 Decoder::new(self, self.new_variant_decoder(), BomHandling::Sniff)
3345 }
3346
3347 /// Instantiates a new decoder for this encoding with BOM removal.
3348 ///
3349 /// If the input starts with bytes that are the BOM for this encoding,
3350 /// those bytes are removed. However, the decoder never morphs into a
3351 /// decoder for another encoding: A BOM for another encoding is treated as
3352 /// (potentially malformed) input to the decoding algorithm for this
3353 /// encoding.
3354 ///
3355 /// Available via the C wrapper.
3356 #[inline]
3357 pub fn new_decoder_with_bom_removal(&'static self) -> Decoder {
3358 Decoder::new(self, self.new_variant_decoder(), BomHandling::Remove)
3359 }
3360
3361 /// Instantiates a new decoder for this encoding with BOM handling disabled.
3362 ///
3363 /// If the input starts with bytes that look like a BOM, those bytes are
3364 /// not treated as a BOM. (Hence, the decoder never morphs into a decoder
3365 /// for another encoding.)
3366 ///
3367 /// _Note:_ If the caller has performed BOM sniffing on its own but has not
3368 /// removed the BOM, the caller should use `new_decoder_with_bom_removal()`
3369 /// instead of this method to cause the BOM to be removed.
3370 ///
3371 /// Available via the C wrapper.
3372 #[inline]
3373 pub fn new_decoder_without_bom_handling(&'static self) -> Decoder {
3374 Decoder::new(self, self.new_variant_decoder(), BomHandling::Off)
3375 }
3376
3377 /// Instantiates a new encoder for the [_output encoding_](Encoding::output_encoding)
3378 /// of this encoding.
3379 ///
3380 /// _Note:_ The output encoding of UTF-16BE, UTF-16LE, and replacement is UTF-8. There
3381 /// is no encoder for UTF-16BE, UTF-16LE, and replacement themselves.
3382 ///
3383 /// Available via the C wrapper.
3384 #[inline]
3385 pub fn new_encoder(&'static self) -> Encoder {
3386 let enc = self.output_encoding();
3387 enc.variant.new_encoder(enc)
3388 }
3389
3390 /// Validates UTF-8.
3391 ///
3392 /// Returns the index of the first byte that makes the input malformed as
3393 /// UTF-8 or the length of the slice if the slice is entirely valid.
3394 ///
3395 /// This is currently faster than the corresponding standard library
3396 /// functionality. If this implementation gets upstreamed to the standard
3397 /// library, this method may be removed in the future.
3398 ///
3399 /// Available via the C wrapper.
3400 pub fn utf8_valid_up_to(bytes: &[u8]) -> usize {
3401 utf8_valid_up_to(bytes)
3402 }
3403
3404 /// Validates ASCII.
3405 ///
3406 /// Returns the index of the first byte that makes the input malformed as
3407 /// ASCII or the length of the slice if the slice is entirely valid.
3408 ///
3409 /// Available via the C wrapper.
3410 pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
3411 ascii_valid_up_to(bytes)
3412 }
3413
3414 /// Validates ISO-2022-JP ASCII-state data.
3415 ///
3416 /// Returns the index of the first byte that makes the input not
3417 /// representable in the ASCII state of ISO-2022-JP or the length of the
3418 /// slice if the slice is entirely representable in the ASCII state of
3419 /// ISO-2022-JP.
3420 ///
3421 /// Available via the C wrapper.
3422 pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
3423 iso_2022_jp_ascii_valid_up_to(bytes)
3424 }
3425}
3426
3427impl PartialEq for Encoding {
3428 #[inline]
3429 fn eq(&self, other: &Encoding) -> bool {
3430 (self as *const Encoding) == (other as *const Encoding)
3431 }
3432}
3433
3434impl Eq for Encoding {}
3435
/// Test-only ordering of encodings by the address of the `Encoding` object.
#[cfg(test)]
impl PartialOrd for Encoding {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Addresses are totally ordered, so defer to the `Ord` impl, which
        // performs the same address comparison.
        Some(Ord::cmp(self, other))
    }
}
3442
/// Test-only total ordering of encodings by the address of the `Encoding`
/// object.
#[cfg(test)]
impl Ord for Encoding {
    fn cmp(&self, other: &Self) -> Ordering {
        let lhs = self as *const Encoding as usize;
        let rhs = other as *const Encoding as usize;
        lhs.cmp(&rhs)
    }
}
3449
3450impl Hash for Encoding {
3451 #[inline]
3452 fn hash<H: Hasher>(&self, state: &mut H) {
3453 (self as *const Encoding).hash(state);
3454 }
3455}
3456
3457impl core::fmt::Debug for Encoding {
3458 #[inline]
3459 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
3460 write!(f, "Encoding {{ {} }}", self.name)
3461 }
3462}
3463
/// Serializes an encoding as a string holding its name.
#[cfg(feature = "serde")]
impl Serialize for Encoding {
    #[inline]
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        let encoding_name = self.name;
        serializer.serialize_str(encoding_name)
    }
}
3474
/// Serde visitor that resolves a string to an encoding by treating the string
/// as an encoding label.
#[cfg(feature = "serde")]
struct EncodingVisitor;

#[cfg(feature = "serde")]
impl<'de> Visitor<'de> for EncodingVisitor {
    type Value = &'static Encoding;

    /// Describes the expected input for error messages.
    fn expecting(&self, formatter: &mut core::fmt::Formatter) -> core::fmt::Result {
        formatter.write_str("a valid encoding label")
    }

    /// Looks `value` up with `Encoding::for_label()` and turns a failed
    /// lookup into a custom deserialization error.
    fn visit_str<E>(self, value: &str) -> Result<&'static Encoding, E>
    where
        E: serde::de::Error,
    {
        Encoding::for_label(value.as_bytes())
            .ok_or_else(|| E::custom(alloc::format!("invalid encoding label: {}", value)))
    }
}
3500
/// Deserializes an encoding from a string by way of `EncodingVisitor`.
#[cfg(feature = "serde")]
impl<'de> Deserialize<'de> for &'static Encoding {
    fn deserialize<D>(deserializer: D) -> Result<&'static Encoding, D::Error>
    where
        D: Deserializer<'de>,
    {
        let visitor = EncodingVisitor;
        deserializer.deserialize_str(visitor)
    }
}
3510
/// The stages a decoder passes through, from BOM sniffing via conversion to
/// the end of the stream.
#[derive(Debug, Clone, Copy, PartialEq)]
enum DecoderLifeCycle {
    /// No input has been seen so far.
    AtStart,
    /// No input has been seen so far, but UTF-8 is expected.
    AtUtf8Start,
    /// No input has been seen so far, but UTF-16BE is expected.
    AtUtf16BeStart,
    /// No input has been seen so far, but UTF-16LE is expected.
    AtUtf16LeStart,
    /// The byte EF has been seen.
    SeenUtf8First,
    /// The bytes EF, BB have been seen.
    SeenUtf8Second,
    /// The byte FE has been seen.
    SeenUtf16BeFirst,
    /// The byte FF has been seen.
    SeenUtf16LeFirst,
    /// EF, BB were seen but BF was not, a buffer boundary followed BB and the
    /// underlying decoder reported EF as an error; BB therefore still has to
    /// be pushed ahead of the next buffer.
    ConvertingWithPendingBB,
    /// BOM detection is over and EOF has not been seen yet.
    Converting,
    /// EOF has been seen.
    Finished,
}
3539
/// Selects how a decoder treats a BOM at the start of the input.
#[derive(Clone, Copy, Debug)]
enum BomHandling {
    /// No BOM handling at all.
    Off,
    /// Sniff for a UTF-8, UTF-16BE or UTF-16LE BOM.
    Sniff,
    /// Remove a BOM, but only the one that belongs to this encoding.
    Remove,
}
3550
/// Outcome of a (potentially partial) decode or encode operation that
/// performs replacement.
#[must_use]
#[derive(Debug, Eq, PartialEq)]
pub enum CoderResult {
    /// All of the input was consumed.
    ///
    /// When the call that returned this had `last` set to `true`, the
    /// conversion process is complete. Otherwise, the caller should invoke a
    /// decode or encode method again with more input.
    InputEmpty,

    /// The output buffer lacks the space for the converter to produce
    /// another unit of output.
    ///
    /// On the next call, the caller must supply more output space and
    /// re-push the remaining input to the converter.
    OutputFull,
}
3570
/// Outcome of a (potentially partial) decode operation that performs no
/// replacement.
#[must_use]
#[derive(Debug, Eq, PartialEq)]
pub enum DecoderResult {
    /// All of the input was consumed.
    ///
    /// When the call that returned this had `last` set to `true`, the
    /// decoding process is complete. Otherwise, the caller should invoke a
    /// decode method again with more input.
    InputEmpty,

    /// The output buffer lacks the space for the decoder to produce another
    /// unit of output.
    ///
    /// On the next call, the caller must supply more output space and
    /// re-push the remaining input to the decoder.
    OutputFull,

    /// A malformed byte sequence was encountered by the decoder.
    ///
    /// The caller has two options: treat this as a fatal error, or append
    /// one REPLACEMENT CHARACTER (U+FFFD) to the output and then re-push the
    /// remaining input to the decoder.
    ///
    /// The first wrapped integer is the length of the malformed byte
    /// sequence. The second wrapped integer is the number of bytes that were
    /// consumed after the malformed sequence; when it is zero, the last
    /// consumed byte is the last byte of the malformed sequence. Note that
    /// the malformed bytes may have belonged to an earlier input buffer.
    ///
    /// Possible values are 1, 2, 3 or 4 for the first wrapped integer and
    /// 0, 1, 2 or 3 for the second one. Their sum is 6 in the worst case,
    /// which occurs with ISO-2022-JP.
    Malformed(u8, u8), // u8 instead of usize to avoid useless bloat
}
3607
3608/// A converter that decodes a byte stream into Unicode according to a
3609/// character encoding in a streaming (incremental) manner.
3610///
3611/// The various `decode_*` methods take an input buffer (`src`) and an output
3612/// buffer `dst` both of which are caller-allocated. There are variants for
3613/// both UTF-8 and UTF-16 output buffers.
3614///
3615/// A `decode_*` method decodes bytes from `src` into Unicode characters stored
3616/// into `dst` until one of the following three things happens:
3617///
3618/// 1. A malformed byte sequence is encountered (`*_without_replacement`
3619/// variants only).
3620///
3621/// 2. The output buffer has been filled so near capacity that the decoder
3622/// cannot be sure that processing an additional byte of input wouldn't
3623/// cause so much output that the output buffer would overflow.
3624///
3625/// 3. All the input bytes have been processed.
3626///
/// The `decode_*` method then returns a tuple of a status indicating which one
3628/// of the three reasons to return happened, how many input bytes were read,
3629/// how many output code units (`u8` when decoding into UTF-8 and `u16`
3630/// when decoding to UTF-16) were written (except when decoding into `String`,
3631/// whose length change indicates this), and in the case of the
3632/// variants performing replacement, a boolean indicating whether an error was
3633/// replaced with the REPLACEMENT CHARACTER during the call.
3634///
3635/// The number of bytes "written" is what's logically written. Garbage may be
3636/// written in the output buffer beyond the point logically written to.
3637/// Therefore, if you wish to decode into an `&mut str`, you should use the
3638/// methods that take an `&mut str` argument instead of the ones that take an
3639/// `&mut [u8]` argument. The former take care of overwriting the trailing
3640/// garbage to ensure the UTF-8 validity of the `&mut str` as a whole, but the
3641/// latter don't.
3642///
3643/// In the case of the `*_without_replacement` variants, the status is a
3644/// [`DecoderResult`][1] enumeration (possibilities `Malformed`, `OutputFull` and
3645/// `InputEmpty` corresponding to the three cases listed above).
3646///
3647/// In the case of methods whose name does not end with
3648/// `*_without_replacement`, malformed sequences are automatically replaced
3649/// with the REPLACEMENT CHARACTER and errors do not cause the methods to
3650/// return early.
3651///
3652/// When decoding to UTF-8, the output buffer must have at least 4 bytes of
3653/// space. When decoding to UTF-16, the output buffer must have at least two
3654/// UTF-16 code units (`u16`) of space.
3655///
3656/// When decoding to UTF-8 without replacement, the methods are guaranteed
3657/// not to return indicating that more output space is needed if the length
3658/// of the output buffer is at least the length returned by
3659/// [`max_utf8_buffer_length_without_replacement()`][2]. When decoding to UTF-8
3660/// with replacement, the length of the output buffer that guarantees the
3661/// methods not to return indicating that more output space is needed is given
3662/// by [`max_utf8_buffer_length()`][3]. When decoding to UTF-16 with
3663/// or without replacement, the length of the output buffer that guarantees
3664/// the methods not to return indicating that more output space is needed is
3665/// given by [`max_utf16_buffer_length()`][4].
3666///
3667/// The output written into `dst` is guaranteed to be valid UTF-8 or UTF-16,
3668/// and the output after each `decode_*` call is guaranteed to consist of
3669/// complete characters. (I.e. the code unit sequence for the last character is
3670/// guaranteed not to be split across output buffers.)
3671///
3672/// The boolean argument `last` indicates that the end of the stream is reached
3673/// when all the bytes in `src` have been consumed.
3674///
3675/// A `Decoder` object can be used to incrementally decode a byte stream.
3676///
3677/// During the processing of a single stream, the caller must call `decode_*`
3678/// zero or more times with `last` set to `false` and then call `decode_*` at
3679/// least once with `last` set to `true`. If `decode_*` returns `InputEmpty`,
3680/// the processing of the stream has ended. Otherwise, the caller must call
3681/// `decode_*` again with `last` set to `true` (or treat a `Malformed` result as
3682/// a fatal error).
3683///
3684/// Once the stream has ended, the `Decoder` object must not be used anymore.
3685/// That is, you need to create another one to process another stream.
3686///
3687/// When the decoder returns `OutputFull` or the decoder returns `Malformed` and
3688/// the caller does not wish to treat it as a fatal error, the input buffer
3689/// `src` may not have been completely consumed. In that case, the caller must
3690/// pass the unconsumed contents of `src` to `decode_*` again upon the next
3691/// call.
3692///
3693/// [1]: enum.DecoderResult.html
3694/// [2]: #method.max_utf8_buffer_length_without_replacement
3695/// [3]: #method.max_utf8_buffer_length
3696/// [4]: #method.max_utf16_buffer_length
3697///
3698/// # Infinite loops
3699///
3700/// When converting with a fixed-size output buffer whose size is too small to
3701/// accommodate one character or (when applicable) one numeric character
3702/// reference of output, an infinite loop ensues. When converting with a
/// fixed-size output buffer, it generally makes sense to make the buffer
/// fairly large (e.g. a couple of kilobytes).
pub struct Decoder {
    /// The encoding that `encoding()` reports for this decoder.
    encoding: &'static Encoding,
    /// The encoding-specific decoder that the buffer length queries and
    /// `latin1_byte_compatible_up_to()` delegate to.
    variant: VariantDecoder,
    /// BOM handling / stream life cycle state. It selects the branch taken by
    /// the buffer length queries; once it is `Finished`, those queries panic.
    life_cycle: DecoderLifeCycle,
}
3710
3711impl Decoder {
3712 fn new(enc: &'static Encoding, decoder: VariantDecoder, sniffing: BomHandling) -> Decoder {
3713 Decoder {
3714 encoding: enc,
3715 variant: decoder,
3716 life_cycle: match sniffing {
3717 BomHandling::Off => DecoderLifeCycle::Converting,
3718 BomHandling::Sniff => DecoderLifeCycle::AtStart,
3719 BomHandling::Remove => {
3720 if enc == UTF_8 {
3721 DecoderLifeCycle::AtUtf8Start
3722 } else if enc == UTF_16BE {
3723 DecoderLifeCycle::AtUtf16BeStart
3724 } else if enc == UTF_16LE {
3725 DecoderLifeCycle::AtUtf16LeStart
3726 } else {
3727 DecoderLifeCycle::Converting
3728 }
3729 }
3730 },
3731 }
3732 }
3733
/// The `Encoding` this `Decoder` is for.
///
/// Returns the value currently stored in the decoder's `encoding` field.
/// BOM sniffing can change the return value of this method during the life
/// of the decoder.
///
/// Available via the C wrapper.
#[inline]
pub fn encoding(&self) -> &'static Encoding {
    self.encoding
}
3744
3745 /// Query the worst-case UTF-8 output size _with replacement_.
3746 ///
3747 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3748 /// that will not overflow given the current state of the decoder and
3749 /// `byte_length` number of additional input bytes when decoding with
3750 /// errors handled by outputting a REPLACEMENT CHARACTER for each malformed
3751 /// sequence or `None` if `usize` would overflow.
3752 ///
3753 /// Available via the C wrapper.
3754 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
3755 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3756 // BOM getting pushed to the underlying decoder.
3757 match self.life_cycle {
3758 DecoderLifeCycle::Converting
3759 | DecoderLifeCycle::AtUtf8Start
3760 | DecoderLifeCycle::AtUtf16LeStart
3761 | DecoderLifeCycle::AtUtf16BeStart => {
3762 return self.variant.max_utf8_buffer_length(byte_length);
3763 }
3764 DecoderLifeCycle::AtStart => {
3765 if let Some(utf8_bom) = checked_add(3, byte_length.checked_mul(3)) {
3766 if let Some(utf16_bom) = checked_add(
3767 1,
3768 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3769 ) {
3770 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3771 let encoding = self.encoding();
3772 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3773 // No need to consider the internal state of the underlying decoder,
3774 // because it is at start, because no data has reached it yet.
3775 return Some(utf_bom);
3776 } else if let Some(non_bom) =
3777 self.variant.max_utf8_buffer_length(byte_length)
3778 {
3779 return Some(core::cmp::max(utf_bom, non_bom));
3780 }
3781 }
3782 }
3783 }
3784 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3785 // Add two bytes even when only one byte has been seen,
3786 // because the one byte can become a lead byte in multibyte
3787 // decoders, but only after the decoder has been queried
3788 // for max length, so the decoder's own logic for adding
3789 // one for a pending lead cannot work.
3790 if let Some(sum) = byte_length.checked_add(2) {
3791 if let Some(utf8_bom) = checked_add(3, sum.checked_mul(3)) {
3792 if self.encoding() == UTF_8 {
3793 // No need to consider the internal state of the underlying decoder,
3794 // because it is at start, because no data has reached it yet.
3795 return Some(utf8_bom);
3796 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3797 return Some(core::cmp::max(utf8_bom, non_bom));
3798 }
3799 }
3800 }
3801 }
3802 DecoderLifeCycle::ConvertingWithPendingBB => {
3803 if let Some(sum) = byte_length.checked_add(2) {
3804 return self.variant.max_utf8_buffer_length(sum);
3805 }
3806 }
3807 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3808 // Add two bytes even when only one byte has been seen,
3809 // because the one byte can become a lead byte in multibyte
3810 // decoders, but only after the decoder has been queried
3811 // for max length, so the decoder's own logic for adding
3812 // one for a pending lead cannot work.
3813 if let Some(sum) = byte_length.checked_add(2) {
3814 if let Some(utf16_bom) =
3815 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3816 {
3817 let encoding = self.encoding();
3818 if encoding == UTF_16LE || encoding == UTF_16BE {
3819 // No need to consider the internal state of the underlying decoder,
3820 // because it is at start, because no data has reached it yet.
3821 return Some(utf16_bom);
3822 } else if let Some(non_bom) = self.variant.max_utf8_buffer_length(sum) {
3823 return Some(core::cmp::max(utf16_bom, non_bom));
3824 }
3825 }
3826 }
3827 }
3828 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3829 }
3830 None
3831 }
3832
3833 /// Query the worst-case UTF-8 output size _without replacement_.
3834 ///
3835 /// Returns the size of the output buffer in UTF-8 code units (`u8`)
3836 /// that will not overflow given the current state of the decoder and
3837 /// `byte_length` number of additional input bytes when decoding without
3838 /// replacement error handling or `None` if `usize` would overflow.
3839 ///
3840 /// Note that this value may be too small for the `_with_replacement` case.
3841 /// Use `max_utf8_buffer_length()` for that case.
3842 ///
3843 /// Available via the C wrapper.
3844 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
3845 // Need to consider a) the decoder morphing due to the BOM and b) a partial
3846 // BOM getting pushed to the underlying decoder.
3847 match self.life_cycle {
3848 DecoderLifeCycle::Converting
3849 | DecoderLifeCycle::AtUtf8Start
3850 | DecoderLifeCycle::AtUtf16LeStart
3851 | DecoderLifeCycle::AtUtf16BeStart => {
3852 return self
3853 .variant
3854 .max_utf8_buffer_length_without_replacement(byte_length);
3855 }
3856 DecoderLifeCycle::AtStart => {
3857 if let Some(utf8_bom) = byte_length.checked_add(3) {
3858 if let Some(utf16_bom) = checked_add(
3859 1,
3860 checked_mul(3, checked_div(byte_length.checked_add(1), 2)),
3861 ) {
3862 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
3863 let encoding = self.encoding();
3864 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
3865 // No need to consider the internal state of the underlying decoder,
3866 // because it is at start, because no data has reached it yet.
3867 return Some(utf_bom);
3868 } else if let Some(non_bom) = self
3869 .variant
3870 .max_utf8_buffer_length_without_replacement(byte_length)
3871 {
3872 return Some(core::cmp::max(utf_bom, non_bom));
3873 }
3874 }
3875 }
3876 }
3877 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
3878 // Add two bytes even when only one byte has been seen,
3879 // because the one byte can become a lead byte in multibyte
3880 // decoders, but only after the decoder has been queried
3881 // for max length, so the decoder's own logic for adding
3882 // one for a pending lead cannot work.
3883 if let Some(sum) = byte_length.checked_add(2) {
3884 if let Some(utf8_bom) = sum.checked_add(3) {
3885 if self.encoding() == UTF_8 {
3886 // No need to consider the internal state of the underlying decoder,
3887 // because it is at start, because no data has reached it yet.
3888 return Some(utf8_bom);
3889 } else if let Some(non_bom) =
3890 self.variant.max_utf8_buffer_length_without_replacement(sum)
3891 {
3892 return Some(core::cmp::max(utf8_bom, non_bom));
3893 }
3894 }
3895 }
3896 }
3897 DecoderLifeCycle::ConvertingWithPendingBB => {
3898 if let Some(sum) = byte_length.checked_add(2) {
3899 return self.variant.max_utf8_buffer_length_without_replacement(sum);
3900 }
3901 }
3902 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
3903 // Add two bytes even when only one byte has been seen,
3904 // because the one byte can become a lead byte in multibyte
3905 // decoders, but only after the decoder has been queried
3906 // for max length, so the decoder's own logic for adding
3907 // one for a pending lead cannot work.
3908 if let Some(sum) = byte_length.checked_add(2) {
3909 if let Some(utf16_bom) =
3910 checked_add(1, checked_mul(3, checked_div(sum.checked_add(1), 2)))
3911 {
3912 let encoding = self.encoding();
3913 if encoding == UTF_16LE || encoding == UTF_16BE {
3914 // No need to consider the internal state of the underlying decoder,
3915 // because it is at start, because no data has reached it yet.
3916 return Some(utf16_bom);
3917 } else if let Some(non_bom) =
3918 self.variant.max_utf8_buffer_length_without_replacement(sum)
3919 {
3920 return Some(core::cmp::max(utf16_bom, non_bom));
3921 }
3922 }
3923 }
3924 }
3925 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
3926 }
3927 None
3928 }
3929
3930 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3931 /// replaced with the REPLACEMENT CHARACTER.
3932 ///
3933 /// See the documentation of the struct for documentation for `decode_*`
3934 /// methods collectively.
3935 ///
3936 /// Available via the C wrapper.
3937 pub fn decode_to_utf8(
3938 &mut self,
3939 src: &[u8],
3940 dst: &mut [u8],
3941 last: bool,
3942 ) -> (CoderResult, usize, usize, bool) {
3943 let mut had_errors = false;
3944 let mut total_read = 0usize;
3945 let mut total_written = 0usize;
3946 loop {
3947 let (result, read, written) = self.decode_to_utf8_without_replacement(
3948 &src[total_read..],
3949 &mut dst[total_written..],
3950 last,
3951 );
3952 total_read += read;
3953 total_written += written;
3954 match result {
3955 DecoderResult::InputEmpty => {
3956 return (
3957 CoderResult::InputEmpty,
3958 total_read,
3959 total_written,
3960 had_errors,
3961 );
3962 }
3963 DecoderResult::OutputFull => {
3964 return (
3965 CoderResult::OutputFull,
3966 total_read,
3967 total_written,
3968 had_errors,
3969 );
3970 }
3971 DecoderResult::Malformed(_, _) => {
3972 had_errors = true;
3973 // There should always be space for the U+FFFD, because
3974 // otherwise we'd have gotten OutputFull already.
3975 // XXX: is the above comment actually true for UTF-8 itself?
3976 // TODO: Consider having fewer bound checks here.
3977 dst[total_written] = 0xEFu8;
3978 total_written += 1;
3979 dst[total_written] = 0xBFu8;
3980 total_written += 1;
3981 dst[total_written] = 0xBDu8;
3982 total_written += 1;
3983 }
3984 }
3985 }
3986 }
3987
3988 /// Incrementally decode a byte stream into UTF-8 with malformed sequences
3989 /// replaced with the REPLACEMENT CHARACTER with type system signaling
3990 /// of UTF-8 validity.
3991 ///
3992 /// This methods calls `decode_to_utf8` and then zeroes
3993 /// out up to three bytes that aren't logically part of the write in order
3994 /// to retain the UTF-8 validity even for the unwritten part of the buffer.
3995 ///
3996 /// See the documentation of the struct for documentation for `decode_*`
3997 /// methods collectively.
3998 ///
3999 /// Available to Rust only.
4000 pub fn decode_to_str(
4001 &mut self,
4002 src: &[u8],
4003 dst: &mut str,
4004 last: bool,
4005 ) -> (CoderResult, usize, usize, bool) {
4006 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4007 let (result, read, written, replaced) = self.decode_to_utf8(src, bytes, last);
4008 let len = bytes.len();
4009 let mut trail = written;
4010 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4011 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4012 // encodings to avoid overwriting here.
4013 if self.encoding != UTF_8 {
4014 let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4015 while trail < max {
4016 bytes[trail] = 0;
4017 trail += 1;
4018 }
4019 }
4020 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4021 bytes[trail] = 0;
4022 trail += 1;
4023 }
4024 (result, read, written, replaced)
4025 }
4026
/// Incrementally decode a byte stream into UTF-8 with malformed sequences
/// replaced with the REPLACEMENT CHARACTER using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a tuple that contains the `CoderResult`, the
/// number of bytes read and a boolean indicating whether replacements
/// were done. The number of bytes written is signaled via the length of
/// the `String` changing.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn decode_to_string(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (CoderResult, usize, bool) {
    // The `String` is manipulated through its backing `Vec<u8>`; the final
    // `set_len` keeps only the pre-existing content plus the bytes that
    // `decode_to_utf8` reports as logically written.
    // NOTE(review): this relies on `decode_to_utf8` leaving valid UTF-8 in
    // the first `written` bytes of the slice it is given -- confirm against
    // the decoder contract.
    // NOTE(review): if `decode_to_utf8` panics, the length appears to stay
    // at `capacity` while unwinding -- confirm whether that is acceptable.
    unsafe {
        let vec = dst.as_mut_vec();
        let old_len = vec.len();
        let capacity = vec.capacity();
        // Expose the spare capacity so that it can be handed to the decoder
        // as the output slice without reallocating.
        vec.set_len(capacity);
        let (result, read, written, replaced) =
            self.decode_to_utf8(src, &mut vec[old_len..], last);
        // Cut the length back to what was logically written.
        vec.set_len(old_len + written);
        (result, read, replaced)
    }
}
4063
// Invocation of `public_decode_function!`, whose definition lies outside
// this part of the file. The arguments are the doc comment for the
// generated item, a list of method identifiers and the output code unit
// type (`u8`). The first identifier, `decode_to_utf8_without_replacement`,
// is the method that `decode_to_utf8`, `decode_to_str_without_replacement`
// and `decode_to_string_without_replacement` call.
// NOTE(review): presumably the remaining identifiers name the internal
// helpers used for the raw conversion and the BOM-handling states --
// confirm against the macro definition.
public_decode_function!(/// Incrementally decode a byte stream into UTF-8
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf8_without_replacement,
                        decode_to_utf8_raw,
                        decode_to_utf8_checking_end,
                        decode_to_utf8_after_one_potential_bom_byte,
                        decode_to_utf8_after_two_potential_bom_bytes,
                        decode_to_utf8_checking_end_with_offset,
                        u8);
4080
4081 /// Incrementally decode a byte stream into UTF-8 with type system signaling
4082 /// of UTF-8 validity.
4083 ///
4084 /// This methods calls `decode_to_utf8` and then zeroes out up to three
4085 /// bytes that aren't logically part of the write in order to retain the
4086 /// UTF-8 validity even for the unwritten part of the buffer.
4087 ///
4088 /// See the documentation of the struct for documentation for `decode_*`
4089 /// methods collectively.
4090 ///
4091 /// Available to Rust only.
4092 pub fn decode_to_str_without_replacement(
4093 &mut self,
4094 src: &[u8],
4095 dst: &mut str,
4096 last: bool,
4097 ) -> (DecoderResult, usize, usize) {
4098 let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
4099 let (result, read, written) = self.decode_to_utf8_without_replacement(src, bytes, last);
4100 let len = bytes.len();
4101 let mut trail = written;
4102 // Non-UTF-8 ASCII-compatible decoders may write up to `MAX_STRIDE_SIZE`
4103 // bytes of trailing garbage. No need to optimize non-ASCII-compatible
4104 // encodings to avoid overwriting here.
4105 if self.encoding != UTF_8 {
4106 let max = core::cmp::min(len, trail + ascii::MAX_STRIDE_SIZE);
4107 while trail < max {
4108 bytes[trail] = 0;
4109 trail += 1;
4110 }
4111 }
4112 while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
4113 bytes[trail] = 0;
4114 trail += 1;
4115 }
4116 (result, read, written)
4117 }
4118
/// Incrementally decode a byte stream into UTF-8 using a `String` receiver.
///
/// Like the others, this method follows the logic that the output buffer is
/// caller-allocated. This method treats the capacity of the `String` as
/// the output limit. That is, this method guarantees not to cause a
/// reallocation of the backing buffer of `String`.
///
/// The return value is a pair that contains the `DecoderResult` and the
/// number of bytes read. The number of bytes written is signaled via
/// the length of the `String` changing.
///
/// See the documentation of the struct for documentation for `decode_*`
/// methods collectively.
///
/// Available to Rust only and only with the `alloc` feature enabled (enabled
/// by default).
#[cfg(feature = "alloc")]
pub fn decode_to_string_without_replacement(
    &mut self,
    src: &[u8],
    dst: &mut String,
    last: bool,
) -> (DecoderResult, usize) {
    // The `String` is manipulated through its backing `Vec<u8>`; the final
    // `set_len` keeps only the pre-existing content plus the bytes that
    // `decode_to_utf8_without_replacement` reports as logically written.
    // NOTE(review): this relies on the decoder leaving valid UTF-8 in the
    // first `written` bytes of the slice it is given -- confirm against the
    // decoder contract.
    // NOTE(review): if the decode call panics, the length appears to stay
    // at `capacity` while unwinding -- confirm whether that is acceptable.
    unsafe {
        let vec = dst.as_mut_vec();
        let old_len = vec.len();
        let capacity = vec.capacity();
        // Expose the spare capacity so that it can be handed to the decoder
        // as the output slice without reallocating.
        vec.set_len(capacity);
        let (result, read, written) =
            self.decode_to_utf8_without_replacement(src, &mut vec[old_len..], last);
        // Cut the length back to what was logically written.
        vec.set_len(old_len + written);
        (result, read)
    }
}
4153
4154 /// Query the worst-case UTF-16 output size (with or without replacement).
4155 ///
4156 /// Returns the size of the output buffer in UTF-16 code units (`u16`)
4157 /// that will not overflow given the current state of the decoder and
4158 /// `byte_length` number of additional input bytes or `None` if `usize`
4159 /// would overflow.
4160 ///
4161 /// Since the REPLACEMENT CHARACTER fits into one UTF-16 code unit, the
4162 /// return value of this method applies also in the
4163 /// `_without_replacement` case.
4164 ///
4165 /// Available via the C wrapper.
4166 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
4167 // Need to consider a) the decoder morphing due to the BOM and b) a partial
4168 // BOM getting pushed to the underlying decoder.
4169 match self.life_cycle {
4170 DecoderLifeCycle::Converting
4171 | DecoderLifeCycle::AtUtf8Start
4172 | DecoderLifeCycle::AtUtf16LeStart
4173 | DecoderLifeCycle::AtUtf16BeStart => {
4174 return self.variant.max_utf16_buffer_length(byte_length);
4175 }
4176 DecoderLifeCycle::AtStart => {
4177 if let Some(utf8_bom) = byte_length.checked_add(1) {
4178 if let Some(utf16_bom) =
4179 checked_add(1, checked_div(byte_length.checked_add(1), 2))
4180 {
4181 let utf_bom = core::cmp::max(utf8_bom, utf16_bom);
4182 let encoding = self.encoding();
4183 if encoding == UTF_8 || encoding == UTF_16LE || encoding == UTF_16BE {
4184 // No need to consider the internal state of the underlying decoder,
4185 // because it is at start, because no data has reached it yet.
4186 return Some(utf_bom);
4187 } else if let Some(non_bom) =
4188 self.variant.max_utf16_buffer_length(byte_length)
4189 {
4190 return Some(core::cmp::max(utf_bom, non_bom));
4191 }
4192 }
4193 }
4194 }
4195 DecoderLifeCycle::SeenUtf8First | DecoderLifeCycle::SeenUtf8Second => {
4196 // Add two bytes even when only one byte has been seen,
4197 // because the one byte can become a lead byte in multibyte
4198 // decoders, but only after the decoder has been queried
4199 // for max length, so the decoder's own logic for adding
4200 // one for a pending lead cannot work.
4201 if let Some(sum) = byte_length.checked_add(2) {
4202 if let Some(utf8_bom) = sum.checked_add(1) {
4203 if self.encoding() == UTF_8 {
4204 // No need to consider the internal state of the underlying decoder,
4205 // because it is at start, because no data has reached it yet.
4206 return Some(utf8_bom);
4207 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4208 return Some(core::cmp::max(utf8_bom, non_bom));
4209 }
4210 }
4211 }
4212 }
4213 DecoderLifeCycle::ConvertingWithPendingBB => {
4214 if let Some(sum) = byte_length.checked_add(2) {
4215 return self.variant.max_utf16_buffer_length(sum);
4216 }
4217 }
4218 DecoderLifeCycle::SeenUtf16LeFirst | DecoderLifeCycle::SeenUtf16BeFirst => {
4219 // Add two bytes even when only one byte has been seen,
4220 // because the one byte can become a lead byte in multibyte
4221 // decoders, but only after the decoder has been queried
4222 // for max length, so the decoder's own logic for adding
4223 // one for a pending lead cannot work.
4224 if let Some(sum) = byte_length.checked_add(2) {
4225 if let Some(utf16_bom) = checked_add(1, checked_div(sum.checked_add(1), 2)) {
4226 let encoding = self.encoding();
4227 if encoding == UTF_16LE || encoding == UTF_16BE {
4228 // No need to consider the internal state of the underlying decoder,
4229 // because it is at start, because no data has reached it yet.
4230 return Some(utf16_bom);
4231 } else if let Some(non_bom) = self.variant.max_utf16_buffer_length(sum) {
4232 return Some(core::cmp::max(utf16_bom, non_bom));
4233 }
4234 }
4235 }
4236 }
4237 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4238 }
4239 None
4240 }
4241
4242 /// Incrementally decode a byte stream into UTF-16 with malformed sequences
4243 /// replaced with the REPLACEMENT CHARACTER.
4244 ///
4245 /// See the documentation of the struct for documentation for `decode_*`
4246 /// methods collectively.
4247 ///
4248 /// Available via the C wrapper.
4249 pub fn decode_to_utf16(
4250 &mut self,
4251 src: &[u8],
4252 dst: &mut [u16],
4253 last: bool,
4254 ) -> (CoderResult, usize, usize, bool) {
4255 let mut had_errors = false;
4256 let mut total_read = 0usize;
4257 let mut total_written = 0usize;
4258 loop {
4259 let (result, read, written) = self.decode_to_utf16_without_replacement(
4260 &src[total_read..],
4261 &mut dst[total_written..],
4262 last,
4263 );
4264 total_read += read;
4265 total_written += written;
4266 match result {
4267 DecoderResult::InputEmpty => {
4268 return (
4269 CoderResult::InputEmpty,
4270 total_read,
4271 total_written,
4272 had_errors,
4273 );
4274 }
4275 DecoderResult::OutputFull => {
4276 return (
4277 CoderResult::OutputFull,
4278 total_read,
4279 total_written,
4280 had_errors,
4281 );
4282 }
4283 DecoderResult::Malformed(_, _) => {
4284 had_errors = true;
4285 // There should always be space for the U+FFFD, because
4286 // otherwise we'd have gotten OutputFull already.
4287 dst[total_written] = 0xFFFD;
4288 total_written += 1;
4289 }
4290 }
4291 }
4292 }
4293
// Invocation of `public_decode_function!`, whose definition lies outside
// this part of the file. The arguments are the doc comment for the
// generated item, a list of method identifiers and the output code unit
// type (`u16`). The first identifier, `decode_to_utf16_without_replacement`,
// is the method that `decode_to_utf16` calls.
// NOTE(review): presumably the remaining identifiers name the internal
// helpers used for the raw conversion and the BOM-handling states --
// confirm against the macro definition.
public_decode_function!(/// Incrementally decode a byte stream into UTF-16
                        /// _without replacement_.
                        ///
                        /// See the documentation of the struct for
                        /// documentation for `decode_*` methods
                        /// collectively.
                        ///
                        /// Available via the C wrapper.
                        ,
                        decode_to_utf16_without_replacement,
                        decode_to_utf16_raw,
                        decode_to_utf16_checking_end,
                        decode_to_utf16_after_one_potential_bom_byte,
                        decode_to_utf16_after_two_potential_bom_bytes,
                        decode_to_utf16_checking_end_with_offset,
                        u16);
4310
4311 /// Checks for compatibility with storing Unicode scalar values as unsigned
4312 /// bytes taking into account the state of the decoder.
4313 ///
4314 /// Returns `None` if the decoder is not in a neutral state, including waiting
4315 /// for the BOM, or if the encoding is never Latin1-byte-compatible.
4316 ///
4317 /// Otherwise returns the index of the first byte whose unsigned value doesn't
4318 /// directly correspond to the decoded Unicode scalar value, or the length
4319 /// of the input if all bytes in the input decode directly to scalar values
4320 /// corresponding to the unsigned byte values.
4321 ///
4322 /// Does not change the state of the decoder.
4323 ///
4324 /// Do not use this unless you are supporting SpiderMonkey/V8-style string
4325 /// storage optimizations.
4326 ///
4327 /// Available via the C wrapper.
4328 pub fn latin1_byte_compatible_up_to(&self, bytes: &[u8]) -> Option<usize> {
4329 match self.life_cycle {
4330 DecoderLifeCycle::Converting => {
4331 return self.variant.latin1_byte_compatible_up_to(bytes);
4332 }
4333 DecoderLifeCycle::Finished => panic!("Must not use a decoder that has finished."),
4334 _ => None,
4335 }
4336 }
4337}
4338
/// Result of a (potentially partial) encode operation without replacement.
#[must_use]
#[derive(Debug, PartialEq, Eq)]
pub enum EncoderResult {
    /// The input was exhausted.
    ///
    /// If this result was returned from a call where `last` was `true`, the
    /// encoding process has completed. Otherwise, the caller should call an
    /// encode method again with more input.
    InputEmpty,

    /// The encoder cannot produce another unit of output, because the output
    /// buffer does not have enough space left.
    ///
    /// The caller must provide more output space upon the next call and
    /// re-push the remaining input to the encoder.
    OutputFull,

    /// The encoder encountered an unmappable character.
    ///
    /// The caller must either treat this as a fatal error or must append
    /// a placeholder to the output and then re-push the remaining input to
    /// the encoder.
    Unmappable(char),
}

impl EncoderResult {
    /// Wraps the BMP code point `bmp` in `EncoderResult::Unmappable`.
    ///
    /// # Panics
    ///
    /// Panics if `bmp` is not a Unicode scalar value (i.e. it is a
    /// surrogate), because the conversion to `char` is unwrapped.
    fn unmappable_from_bmp(bmp: u16) -> EncoderResult {
        let scalar = u32::from(bmp);
        let unmappable = ::core::char::from_u32(scalar).unwrap();
        EncoderResult::Unmappable(unmappable)
    }
}
4370
4371/// A converter that encodes a Unicode stream into bytes according to a
4372/// character encoding in a streaming (incremental) manner.
4373///
4374/// The various `encode_*` methods take an input buffer (`src`) and an output
4375/// buffer `dst` both of which are caller-allocated. There are variants for
4376/// both UTF-8 and UTF-16 input buffers.
4377///
4378/// An `encode_*` method encode characters from `src` into bytes characters
4379/// stored into `dst` until one of the following three things happens:
4380///
4381/// 1. An unmappable character is encountered (`*_without_replacement` variants
4382/// only).
4383///
4384/// 2. The output buffer has been filled so near capacity that the decoder
4385/// cannot be sure that processing an additional character of input wouldn't
4386/// cause so much output that the output buffer would overflow.
4387///
4388/// 3. All the input characters have been processed.
4389///
4390/// The `encode_*` method then returns tuple of a status indicating which one
4391/// of the three reasons to return happened, how many input code units (`u8`
4392/// when encoding from UTF-8 and `u16` when encoding from UTF-16) were read,
4393/// how many output bytes were written (except when encoding into `Vec<u8>`,
4394/// whose length change indicates this), and in the case of the variants that
4395/// perform replacement, a boolean indicating whether an unmappable
4396/// character was replaced with a numeric character reference during the call.
4397///
4398/// The number of bytes "written" is what's logically written. Garbage may be
4399/// written in the output buffer beyond the point logically written to.
4400///
4401/// In the case of the methods whose name ends with
4402/// `*_without_replacement`, the status is an [`EncoderResult`][1] enumeration
4403/// (possibilities `Unmappable`, `OutputFull` and `InputEmpty` corresponding to
4404/// the three cases listed above).
4405///
4406/// In the case of methods whose name does not end with
4407/// `*_without_replacement`, unmappable characters are automatically replaced
4408/// with the corresponding numeric character references and unmappable
4409/// characters do not cause the methods to return early.
4410///
4411/// When encoding from UTF-8 without replacement, the methods are guaranteed
4412/// not to return indicating that more output space is needed if the length
4413/// of the output buffer is at least the length returned by
4414/// [`max_buffer_length_from_utf8_without_replacement()`][2]. When encoding from
4415/// UTF-8 with replacement, the length of the output buffer that guarantees the
4416/// methods not to return indicating that more output space is needed in the
4417/// absence of unmappable characters is given by
4418/// [`max_buffer_length_from_utf8_if_no_unmappables()`][3]. When encoding from
4419/// UTF-16 without replacement, the methods are guaranteed not to return
4420/// indicating that more output space is needed if the length of the output
4421/// buffer is at least the length returned by
4422/// [`max_buffer_length_from_utf16_without_replacement()`][4]. When encoding
4423/// from UTF-16 with replacement, the the length of the output buffer that
4424/// guarantees the methods not to return indicating that more output space is
4425/// needed in the absence of unmappable characters is given by
4426/// [`max_buffer_length_from_utf16_if_no_unmappables()`][5].
4427/// When encoding with replacement, applications are not expected to size the
4428/// buffer for the worst case ahead of time but to resize the buffer if there
4429/// are unmappable characters. This is why max length queries are only available
4430/// for the case where there are no unmappable characters.
4431///
4432/// When encoding from UTF-8, each `src` buffer _must_ be valid UTF-8. (When
4433/// calling from Rust, the type system takes care of this.) When encoding from
4434/// UTF-16, unpaired surrogates in the input are treated as U+FFFD REPLACEMENT
4435/// CHARACTERS. Therefore, in order for astral characters not to turn into a
4436/// pair of REPLACEMENT CHARACTERS, the caller must ensure that surrogate pairs
4437/// are not split across input buffer boundaries.
4438///
4439/// After an `encode_*` call returns, the output produced so far, taken as a
4440/// whole from the start of the stream, is guaranteed to consist of a valid
4441/// byte sequence in the target encoding. (I.e. the code unit sequence for a
4442/// character is guaranteed not to be split across output buffers. However, due
4443/// to the stateful nature of ISO-2022-JP, the stream needs to be considered
4444/// from the start for it to be valid. For other encodings, the validity holds
4445/// on a per-output buffer basis.)
4446///
4447/// The boolean argument `last` indicates that the end of the stream is reached
4448/// when all the characters in `src` have been consumed. This argument is needed
4449/// for ISO-2022-JP and is ignored for other encodings.
4450///
4451/// An `Encoder` object can be used to incrementally encode a byte stream.
4452///
4453/// During the processing of a single stream, the caller must call `encode_*`
4454/// zero or more times with `last` set to `false` and then call `encode_*` at
4455/// least once with `last` set to `true`. If `encode_*` returns `InputEmpty`,
4456/// the processing of the stream has ended. Otherwise, the caller must call
4457/// `encode_*` again with `last` set to `true` (or treat an `Unmappable` result
4458/// as a fatal error).
4459///
4460/// Once the stream has ended, the `Encoder` object must not be used anymore.
4461/// That is, you need to create another one to process another stream.
4462///
4463/// When the encoder returns `OutputFull` or the encoder returns `Unmappable`
4464/// and the caller does not wish to treat it as a fatal error, the input buffer
4465/// `src` may not have been completely consumed. In that case, the caller must
4466/// pass the unconsumed contents of `src` to `encode_*` again upon the next
4467/// call.
4468///
4469/// [1]: enum.EncoderResult.html
4470/// [2]: #method.max_buffer_length_from_utf8_without_replacement
4471/// [3]: #method.max_buffer_length_from_utf8_if_no_unmappables
4472/// [4]: #method.max_buffer_length_from_utf16_without_replacement
4473/// [5]: #method.max_buffer_length_from_utf16_if_no_unmappables
4474///
4475/// # Infinite loops
4476///
/// When converting with a fixed-size output buffer whose size is too small to
/// accommodate one character of output, an infinite loop ensues. When
/// converting with a fixed-size output buffer, it generally makes sense to
/// make the buffer fairly large (e.g. a couple of kilobytes).
pub struct Encoder {
    // The `Encoding` this `Encoder` is for; returned as-is by `encoding()`.
    encoding: &'static Encoding,
    // The variant-specific encoder that the `Encoder` methods forward to
    // (pending-state query, buffer length queries and the raw encode calls).
    variant: VariantEncoder,
}
4485
4486impl Encoder {
4487 fn new(enc: &'static Encoding, encoder: VariantEncoder) -> Encoder {
4488 Encoder {
4489 encoding: enc,
4490 variant: encoder,
4491 }
4492 }
4493
4494 /// The `Encoding` this `Encoder` is for.
4495 #[inline]
4496 pub fn encoding(&self) -> &'static Encoding {
4497 self.encoding
4498 }
4499
4500 /// Returns `true` if this is an ISO-2022-JP encoder that's not in the
4501 /// ASCII state and `false` otherwise.
4502 #[inline]
4503 pub fn has_pending_state(&self) -> bool {
4504 self.variant.has_pending_state()
4505 }
4506
4507 /// Query the worst-case output size when encoding from UTF-8 with
4508 /// replacement.
4509 ///
4510 /// Returns the size of the output buffer in bytes that will not overflow
4511 /// given the current state of the encoder and `byte_length` number of
4512 /// additional input code units if there are no unmappable characters in
4513 /// the input or `None` if `usize` would overflow.
4514 ///
4515 /// Available via the C wrapper.
4516 pub fn max_buffer_length_from_utf8_if_no_unmappables(
4517 &self,
4518 byte_length: usize,
4519 ) -> Option<usize> {
4520 checked_add(
4521 if self.encoding().can_encode_everything() {
4522 0
4523 } else {
4524 NCR_EXTRA
4525 },
4526 self.max_buffer_length_from_utf8_without_replacement(byte_length),
4527 )
4528 }
4529
4530 /// Query the worst-case output size when encoding from UTF-8 without
4531 /// replacement.
4532 ///
4533 /// Returns the size of the output buffer in bytes that will not overflow
4534 /// given the current state of the encoder and `byte_length` number of
4535 /// additional input code units or `None` if `usize` would overflow.
4536 ///
4537 /// Available via the C wrapper.
4538 pub fn max_buffer_length_from_utf8_without_replacement(
4539 &self,
4540 byte_length: usize,
4541 ) -> Option<usize> {
4542 self.variant
4543 .max_buffer_length_from_utf8_without_replacement(byte_length)
4544 }
4545
4546 /// Incrementally encode into byte stream from UTF-8 with unmappable
4547 /// characters replaced with HTML (decimal) numeric character references.
4548 ///
4549 /// See the documentation of the struct for documentation for `encode_*`
4550 /// methods collectively.
4551 ///
4552 /// Available via the C wrapper.
4553 pub fn encode_from_utf8(
4554 &mut self,
4555 src: &str,
4556 dst: &mut [u8],
4557 last: bool,
4558 ) -> (CoderResult, usize, usize, bool) {
4559 let dst_len = dst.len();
4560 let effective_dst_len = if self.encoding().can_encode_everything() {
4561 dst_len
4562 } else {
4563 if dst_len < NCR_EXTRA {
4564 if src.is_empty() && !(last && self.has_pending_state()) {
4565 return (CoderResult::InputEmpty, 0, 0, false);
4566 }
4567 return (CoderResult::OutputFull, 0, 0, false);
4568 }
4569 dst_len - NCR_EXTRA
4570 };
4571 let mut had_unmappables = false;
4572 let mut total_read = 0usize;
4573 let mut total_written = 0usize;
4574 loop {
4575 let (result, read, written) = self.encode_from_utf8_without_replacement(
4576 &src[total_read..],
4577 &mut dst[total_written..effective_dst_len],
4578 last,
4579 );
4580 total_read += read;
4581 total_written += written;
4582 match result {
4583 EncoderResult::InputEmpty => {
4584 return (
4585 CoderResult::InputEmpty,
4586 total_read,
4587 total_written,
4588 had_unmappables,
4589 );
4590 }
4591 EncoderResult::OutputFull => {
4592 return (
4593 CoderResult::OutputFull,
4594 total_read,
4595 total_written,
4596 had_unmappables,
4597 );
4598 }
4599 EncoderResult::Unmappable(unmappable) => {
4600 had_unmappables = true;
4601 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4602 debug_assert_ne!(self.encoding(), UTF_16BE);
4603 debug_assert_ne!(self.encoding(), UTF_16LE);
4604 // Additionally, Iso2022JpEncoder is responsible for
4605 // transitioning to ASCII when returning with Unmappable.
4606 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4607 if total_written >= effective_dst_len {
4608 if total_read == src.len() && !(last && self.has_pending_state()) {
4609 return (
4610 CoderResult::InputEmpty,
4611 total_read,
4612 total_written,
4613 had_unmappables,
4614 );
4615 }
4616 return (
4617 CoderResult::OutputFull,
4618 total_read,
4619 total_written,
4620 had_unmappables,
4621 );
4622 }
4623 }
4624 }
4625 }
4626 }
4627
4628 /// Incrementally encode into byte stream from UTF-8 with unmappable
4629 /// characters replaced with HTML (decimal) numeric character references.
4630 ///
4631 /// See the documentation of the struct for documentation for `encode_*`
4632 /// methods collectively.
4633 ///
4634 /// Available to Rust only and only with the `alloc` feature enabled (enabled
4635 /// by default).
4636 #[cfg(feature = "alloc")]
4637 pub fn encode_from_utf8_to_vec(
4638 &mut self,
4639 src: &str,
4640 dst: &mut Vec<u8>,
4641 last: bool,
4642 ) -> (CoderResult, usize, bool) {
4643 unsafe {
4644 let old_len = dst.len();
4645 let capacity = dst.capacity();
4646 dst.set_len(capacity);
4647 let (result, read, written, replaced) =
4648 self.encode_from_utf8(src, &mut dst[old_len..], last);
4649 dst.set_len(old_len + written);
4650 (result, read, replaced)
4651 }
4652 }
4653
4654 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4655 ///
4656 /// See the documentation of the struct for documentation for `encode_*`
4657 /// methods collectively.
4658 ///
4659 /// Available via the C wrapper.
4660 pub fn encode_from_utf8_without_replacement(
4661 &mut self,
4662 src: &str,
4663 dst: &mut [u8],
4664 last: bool,
4665 ) -> (EncoderResult, usize, usize) {
4666 self.variant.encode_from_utf8_raw(src, dst, last)
4667 }
4668
4669 /// Incrementally encode into byte stream from UTF-8 _without replacement_.
4670 ///
4671 /// See the documentation of the struct for documentation for `encode_*`
4672 /// methods collectively.
4673 ///
4674 /// Available to Rust only and only with the `alloc` feature enabled (enabled
4675 /// by default).
4676 #[cfg(feature = "alloc")]
4677 pub fn encode_from_utf8_to_vec_without_replacement(
4678 &mut self,
4679 src: &str,
4680 dst: &mut Vec<u8>,
4681 last: bool,
4682 ) -> (EncoderResult, usize) {
4683 unsafe {
4684 let old_len = dst.len();
4685 let capacity = dst.capacity();
4686 dst.set_len(capacity);
4687 let (result, read, written) =
4688 self.encode_from_utf8_without_replacement(src, &mut dst[old_len..], last);
4689 dst.set_len(old_len + written);
4690 (result, read)
4691 }
4692 }
4693
4694 /// Query the worst-case output size when encoding from UTF-16 with
4695 /// replacement.
4696 ///
4697 /// Returns the size of the output buffer in bytes that will not overflow
4698 /// given the current state of the encoder and `u16_length` number of
4699 /// additional input code units if there are no unmappable characters in
4700 /// the input or `None` if `usize` would overflow.
4701 ///
4702 /// Available via the C wrapper.
4703 pub fn max_buffer_length_from_utf16_if_no_unmappables(
4704 &self,
4705 u16_length: usize,
4706 ) -> Option<usize> {
4707 checked_add(
4708 if self.encoding().can_encode_everything() {
4709 0
4710 } else {
4711 NCR_EXTRA
4712 },
4713 self.max_buffer_length_from_utf16_without_replacement(u16_length),
4714 )
4715 }
4716
4717 /// Query the worst-case output size when encoding from UTF-16 without
4718 /// replacement.
4719 ///
4720 /// Returns the size of the output buffer in bytes that will not overflow
4721 /// given the current state of the encoder and `u16_length` number of
4722 /// additional input code units or `None` if `usize` would overflow.
4723 ///
4724 /// Available via the C wrapper.
4725 pub fn max_buffer_length_from_utf16_without_replacement(
4726 &self,
4727 u16_length: usize,
4728 ) -> Option<usize> {
4729 self.variant
4730 .max_buffer_length_from_utf16_without_replacement(u16_length)
4731 }
4732
4733 /// Incrementally encode into byte stream from UTF-16 with unmappable
4734 /// characters replaced with HTML (decimal) numeric character references.
4735 ///
4736 /// See the documentation of the struct for documentation for `encode_*`
4737 /// methods collectively.
4738 ///
4739 /// Available via the C wrapper.
4740 pub fn encode_from_utf16(
4741 &mut self,
4742 src: &[u16],
4743 dst: &mut [u8],
4744 last: bool,
4745 ) -> (CoderResult, usize, usize, bool) {
4746 let dst_len = dst.len();
4747 let effective_dst_len = if self.encoding().can_encode_everything() {
4748 dst_len
4749 } else {
4750 if dst_len < NCR_EXTRA {
4751 if src.is_empty() && !(last && self.has_pending_state()) {
4752 return (CoderResult::InputEmpty, 0, 0, false);
4753 }
4754 return (CoderResult::OutputFull, 0, 0, false);
4755 }
4756 dst_len - NCR_EXTRA
4757 };
4758 let mut had_unmappables = false;
4759 let mut total_read = 0usize;
4760 let mut total_written = 0usize;
4761 loop {
4762 let (result, read, written) = self.encode_from_utf16_without_replacement(
4763 &src[total_read..],
4764 &mut dst[total_written..effective_dst_len],
4765 last,
4766 );
4767 total_read += read;
4768 total_written += written;
4769 match result {
4770 EncoderResult::InputEmpty => {
4771 return (
4772 CoderResult::InputEmpty,
4773 total_read,
4774 total_written,
4775 had_unmappables,
4776 );
4777 }
4778 EncoderResult::OutputFull => {
4779 return (
4780 CoderResult::OutputFull,
4781 total_read,
4782 total_written,
4783 had_unmappables,
4784 );
4785 }
4786 EncoderResult::Unmappable(unmappable) => {
4787 had_unmappables = true;
4788 debug_assert!(dst.len() - total_written >= NCR_EXTRA);
4789 // There are no UTF-16 encoders and even if there were,
4790 // they'd never have unmappables.
4791 debug_assert_ne!(self.encoding(), UTF_16BE);
4792 debug_assert_ne!(self.encoding(), UTF_16LE);
4793 // Additionally, Iso2022JpEncoder is responsible for
4794 // transitioning to ASCII when returning with Unmappable
4795 // from the jis0208 state. That is, when we encode
4796 // ISO-2022-JP and come here, the encoder is in either the
4797 // ASCII or the Roman state. We are allowed to generate any
4798 // printable ASCII excluding \ and ~.
4799 total_written += write_ncr(unmappable, &mut dst[total_written..]);
4800 if total_written >= effective_dst_len {
4801 if total_read == src.len() && !(last && self.has_pending_state()) {
4802 return (
4803 CoderResult::InputEmpty,
4804 total_read,
4805 total_written,
4806 had_unmappables,
4807 );
4808 }
4809 return (
4810 CoderResult::OutputFull,
4811 total_read,
4812 total_written,
4813 had_unmappables,
4814 );
4815 }
4816 }
4817 }
4818 }
4819 }
4820
4821 /// Incrementally encode into byte stream from UTF-16 _without replacement_.
4822 ///
4823 /// See the documentation of the struct for documentation for `encode_*`
4824 /// methods collectively.
4825 ///
4826 /// Available via the C wrapper.
4827 pub fn encode_from_utf16_without_replacement(
4828 &mut self,
4829 src: &[u16],
4830 dst: &mut [u8],
4831 last: bool,
4832 ) -> (EncoderResult, usize, usize) {
4833 self.variant.encode_from_utf16_raw(src, dst, last)
4834 }
4835}
4836
/// Writes `unmappable` to the start of `dst` as a decimal HTML numeric
/// character reference (`&#` digits `;`) without heap allocation.
///
/// Returns the number of bytes written. `dst` must be long enough for the
/// whole reference, and the scalar value is expected to be at least 10 (both
/// are checked by debug assertions).
fn write_ncr(unmappable: char, dst: &mut [u8]) -> usize {
    let scalar = unmappable as u32;
    // The length is the number of decimal digits plus 3 (for "&#" and ";").
    let len = match scalar {
        // Review the outcome of https://github.com/whatwg/encoding/issues/15
        // to see if this case is possible
        0..=99 => 5usize,
        100..=999 => 6usize,
        1_000..=9_999 => 7usize,
        10_000..=99_999 => 8usize,
        100_000..=999_999 => 9usize,
        _ => 10usize,
    };
    debug_assert!(scalar >= 10u32);
    debug_assert!(len <= dst.len());
    dst[len - 1] = b';';
    // Emit the digits right to left, starting just before the semicolon.
    let mut remaining = scalar;
    let mut cursor = len - 2;
    loop {
        dst[cursor] = (remaining % 10) as u8 + b'0';
        if remaining < 10 {
            break;
        }
        remaining /= 10;
        cursor -= 1;
    }
    dst[1] = b'#';
    dst[0] = b'&';
    len
}
4875
/// Returns `true` if `i` lies in the half-open range `start..end`.
///
/// Uses a single unsigned comparison on the wrapped offset from `start`.
#[inline(always)]
fn in_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset < span
}
4880
/// Returns `true` if `i` lies in the half-open range `start..end`.
///
/// Uses a single unsigned comparison on the wrapped offset from `start`.
#[inline(always)]
fn in_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset < span
}
4885
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// Uses a single unsigned comparison on the wrapped offset from `start`.
#[inline(always)]
fn in_inclusive_range8(i: u8, start: u8, end: u8) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4890
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// Uses a single unsigned comparison on the wrapped offset from `start`.
#[inline(always)]
fn in_inclusive_range16(i: u16, start: u16, end: u16) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4895
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// Uses a single unsigned comparison on the wrapped offset from `start`.
#[inline(always)]
fn in_inclusive_range32(i: u32, start: u32, end: u32) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4900
/// Returns `true` if `i` lies in the closed range `start..=end`.
///
/// Uses a single unsigned comparison on the wrapped offset from `start`.
#[inline(always)]
fn in_inclusive_range(i: usize, start: usize, end: usize) -> bool {
    let offset = i.wrapping_sub(start);
    let span = end - start;
    offset <= span
}
4905
/// Adds `num` to the value in `opt`.
///
/// Returns `None` if `opt` is `None` or if the sum would overflow `usize`.
#[inline(always)]
fn checked_add(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_add(num))
}
4914
/// Adds two optional values.
///
/// Returns `None` if either operand is `None` or if the sum would overflow
/// `usize`.
#[inline(always)]
fn checked_add_opt(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => a.checked_add(b),
        _ => None,
    }
}
4923
/// Multiplies the value in `opt` by `num`.
///
/// Returns `None` if `opt` is `None` or if the product would overflow
/// `usize`.
#[inline(always)]
fn checked_mul(num: usize, opt: Option<usize>) -> Option<usize> {
    opt.and_then(|n| n.checked_mul(num))
}
4932
/// Divides the value in `opt` by `num`.
///
/// Returns `None` if `opt` is `None` or if `num` is zero.
#[inline(always)]
fn checked_div(opt: Option<usize>, num: usize) -> Option<usize> {
    opt.and_then(|n| n.checked_div(num))
}
4941
4942#[cfg(feature = "alloc")]
/// Rounds the value in `opt` up to the next power of two.
///
/// Returns `None` if `opt` is `None` or if the next power of two does not
/// fit in `usize`, so overflow is reported the same way as in the other
/// `checked_*` helpers instead of panicking (debug builds) or wrapping to
/// zero (release builds).
#[inline(always)]
fn checked_next_power_of_two(opt: Option<usize>) -> Option<usize> {
    opt.and_then(usize::checked_next_power_of_two)
}
4947
4948#[cfg(feature = "alloc")]
/// Returns the smaller of two optional values.
///
/// A `None` operand is ignored rather than propagated: if only one side
/// holds a value, that value is returned, and the result is `None` only
/// when both sides are `None`.
#[inline(always)]
fn checked_min(one: Option<usize>, other: Option<usize>) -> Option<usize> {
    match (one, other) {
        (Some(a), Some(b)) => Some(a.min(b)),
        (Some(a), None) => Some(a),
        (None, rest) => rest,
    }
}
4961
4962// ############## TESTS ###############
4963
// Test-only struct, compiled only for test builds with the `serde` feature.
// It derives `Serialize`/`Deserialize` around a `&'static Encoding` field
// next to plain `u32` and `String` fields.
// NOTE(review): presumably used by serde round-trip tests outside this
// view — confirm.
#[cfg(all(test, feature = "serde"))]
#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct Demo {
    num: u32,
    name: String,
    enc: &'static Encoding,
}
4971
4972#[cfg(test)]
4973mod test_labels_names;
4974
4975#[cfg(all(test, feature = "alloc"))]
4976mod tests {
4977 use super::*;
4978 use alloc::borrow::Cow;
4979
4980 fn sniff_to_utf16(
4981 initial_encoding: &'static Encoding,
4982 expected_encoding: &'static Encoding,
4983 bytes: &[u8],
4984 expect: &[u16],
4985 breaks: &[usize],
4986 ) {
4987 let mut decoder = initial_encoding.new_decoder();
4988
4989 let mut dest: Vec<u16> =
4990 Vec::with_capacity(decoder.max_utf16_buffer_length(bytes.len()).unwrap());
4991 let capacity = dest.capacity();
4992 dest.resize(capacity, 0u16);
4993
4994 let mut total_written = 0usize;
4995 let mut start = 0usize;
4996 for br in breaks {
4997 let (result, read, written, _) =
4998 decoder.decode_to_utf16(&bytes[start..*br], &mut dest[total_written..], false);
4999 total_written += written;
5000 assert_eq!(read, *br - start);
5001 match result {
5002 CoderResult::InputEmpty => {}
5003 CoderResult::OutputFull => {
5004 unreachable!();
5005 }
5006 }
5007 start = *br;
5008 }
5009 let (result, read, written, _) =
5010 decoder.decode_to_utf16(&bytes[start..], &mut dest[total_written..], true);
5011 total_written += written;
5012 match result {
5013 CoderResult::InputEmpty => {}
5014 CoderResult::OutputFull => {
5015 unreachable!();
5016 }
5017 }
5018 assert_eq!(read, bytes.len() - start);
5019 assert_eq!(total_written, expect.len());
5020 assert_eq!(&dest[..total_written], expect);
5021 assert_eq!(decoder.encoding(), expected_encoding);
5022 }
5023
5024 // Any copyright to the test code below this comment is dedicated to the
5025 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
5026
#[test]
fn test_bom_sniffing() {
    // Every case starts out with a windows-1252 decoder.
    fn sniff(expected: &'static Encoding, bytes: &[u8], expect: &[u16], breaks: &[usize]) {
        sniff_to_utf16(WINDOWS_1252, expected, bytes, expect, breaks);
    }

    let ab = [0x0061u16, 0x0062u16];

    // ASCII
    sniff(WINDOWS_1252, b"\x61\x62", &ab, &[]);

    // UTF-8
    let bom_then_ab = b"\xEF\xBB\xBF\x61\x62";
    let splits: [&[usize]; 9] = [
        &[],
        &[1],
        &[2],
        &[3],
        &[4],
        &[2, 3],
        &[1, 2],
        &[1, 3],
        &[1, 2, 3, 4],
    ];
    for &breaks in splits.iter() {
        sniff(UTF_8, bom_then_ab, &ab, breaks);
    }
    sniff(UTF_8, b"\xEF\xBB\xBF", &[], &[]);

    // Not UTF-8
    let ef_bb_ab = [0x00EFu16, 0x00BBu16, 0x0061u16, 0x0062u16];
    sniff(WINDOWS_1252, b"\xEF\xBB\x61\x62", &ef_bb_ab, &[]);
    sniff(WINDOWS_1252, b"\xEF\xBB\x61\x62", &ef_bb_ab, &[1]);
    let ef_ab = [0x00EFu16, 0x0061u16, 0x0062u16];
    sniff(WINDOWS_1252, b"\xEF\x61\x62", &ef_ab, &[]);
    sniff(WINDOWS_1252, b"\xEF\x61\x62", &ef_ab, &[1]);
    let ef_bb = [0x00EFu16, 0x00BBu16];
    sniff(WINDOWS_1252, b"\xEF\xBB", &ef_bb, &[]);
    sniff(WINDOWS_1252, b"\xEF\xBB", &ef_bb, &[1]);
    sniff(WINDOWS_1252, b"\xEF", &[0x00EFu16], &[]);

    // Not UTF-16
    let fe_ab = [0x00FEu16, 0x0061u16, 0x0062u16];
    sniff(WINDOWS_1252, b"\xFE\x61\x62", &fe_ab, &[]);
    sniff(WINDOWS_1252, b"\xFE\x61\x62", &fe_ab, &[1]);
    sniff(WINDOWS_1252, b"\xFE", &[0x00FEu16], &[]);
    let ff_ab = [0x00FFu16, 0x0061u16, 0x0062u16];
    sniff(WINDOWS_1252, b"\xFF\x61\x62", &ff_ab, &[]);
    sniff(WINDOWS_1252, b"\xFF\x61\x62", &ff_ab, &[1]);
    sniff(WINDOWS_1252, b"\xFF", &[0x00FFu16], &[]);

    // UTF-16
    sniff(UTF_16BE, b"\xFE\xFF", &[], &[]);
    sniff(UTF_16BE, b"\xFE\xFF", &[], &[1]);
    sniff(UTF_16LE, b"\xFF\xFE", &[], &[]);
    sniff(UTF_16LE, b"\xFF\xFE", &[], &[1]);
}
5183
#[test]
fn test_output_encoding() {
    // (encoding, expected output encoding)
    let expectations: [(&'static Encoding, &'static Encoding); 5] = [
        (REPLACEMENT, UTF_8),
        (UTF_16BE, UTF_8),
        (UTF_16LE, UTF_8),
        (UTF_8, UTF_8),
        (WINDOWS_1252, WINDOWS_1252),
    ];
    for &(input, output) in expectations.iter() {
        assert_eq!(input.output_encoding(), output);
    }
    // A new encoder reports the output encoding, too.
    for &(input, output) in expectations.iter() {
        assert_eq!(input.new_encoder().encoding(), output);
    }
}
5197
#[test]
fn test_label_resolution() {
    // Labels that resolve to UTF-8: exact, upper case, and padded with
    // whitespace on both sides.
    let utf8_labels: [&[u8]; 3] = [
        b"utf-8",
        b"UTF-8",
        b" \t \n \x0C \n utf-8 \r \n \t \x0C ",
    ];
    for &label in utf8_labels.iter() {
        assert_eq!(Encoding::for_label(label), Some(UTF_8));
    }
    // Labels that do not resolve to any encoding.
    let unknown_labels: [&[u8]; 3] = [b"utf-8 _", b"bogus", b"bogusbogusbogusbogus"];
    for &label in unknown_labels.iter() {
        assert_eq!(Encoding::for_label(label), None);
    }
}
5210
#[test]
fn test_decode_valid_windows_1257_to_cow() {
    // Valid non-ASCII input is expected to come back as an owned string.
    let (text, detected, malformed) = WINDOWS_1257.decode(b"abc\x80\xE4");
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "abc\u{20AC}\u{00E4}");
    assert_eq!(detected, WINDOWS_1257);
    assert!(!malformed);
}
5223
#[test]
fn test_decode_invalid_windows_1257_to_cow() {
    // The byte 0xA1 is expected to decode to U+FFFD and set the error flag.
    let (text, detected, malformed) = WINDOWS_1257.decode(b"abc\x80\xA1\xE4");
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(detected, WINDOWS_1257);
    assert!(malformed);
}
5236
#[test]
fn test_decode_ascii_only_windows_1257_to_cow() {
    // ASCII-only input is expected to be borrowed rather than copied.
    let (text, detected, malformed) = WINDOWS_1257.decode(b"abc");
    let borrowed = match text {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(borrowed, "abc");
    assert_eq!(detected, WINDOWS_1257);
    assert!(!malformed);
}
5249
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow() {
    // A UTF-8 BOM is expected to override windows-1257; the valid UTF-8 that
    // follows is borrowed without the BOM.
    let (text, detected, malformed) = WINDOWS_1257.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let borrowed = match text {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(borrowed, "\u{20AC}\u{00E4}");
    assert_eq!(detected, UTF_8);
    assert!(!malformed);
}
5262
#[test]
fn test_decode_bomful_invalid_utf8_as_windows_1257_to_cow() {
    // A UTF-8 BOM is expected to override windows-1257; the stray 0x80 is
    // replaced with U+FFFD, which forces an owned result.
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let (text, detected, malformed) = WINDOWS_1257.decode(input);
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(detected, UTF_8);
    assert!(malformed);
}
5276
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow() {
    // Valid UTF-8 after a BOM is expected to be borrowed without the BOM.
    let (text, detected, malformed) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let borrowed = match text {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(borrowed, "\u{20AC}\u{00E4}");
    assert_eq!(detected, UTF_8);
    assert!(!malformed);
}
5289
#[test]
fn test_decode_bomful_invalid_utf8_as_utf_8_to_cow() {
    // The stray 0x80 is expected to become U+FFFD, forcing an owned result.
    let (text, detected, malformed) = UTF_8.decode(b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4");
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "\u{20AC}\u{FFFD}\u{00E4}");
    assert_eq!(detected, UTF_8);
    assert!(malformed);
}
5302
#[test]
fn test_decode_bomful_valid_utf8_as_utf_8_to_cow_with_bom_removal() {
    // The BOM is expected to be dropped and the rest borrowed.
    let (text, malformed) = UTF_8.decode_with_bom_removal(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    let borrowed = match text {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(borrowed, "\u{20AC}\u{00E4}");
    assert!(!malformed);
}
5314
#[test]
fn test_decode_bomful_valid_utf8_as_windows_1257_to_cow_with_bom_removal() {
    // Without sniffing, the UTF-8 BOM bytes are expected to be decoded as
    // ordinary windows-1257 characters.
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    let (text, malformed) = WINDOWS_1257.decode_with_bom_removal(input);
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(
        owned,
        "\u{013C}\u{00BB}\u{00E6}\u{0101}\u{201A}\u{00AC}\u{0106}\u{00A4}"
    );
    assert!(!malformed);
}
5330
#[test]
fn test_decode_valid_windows_1257_to_cow_with_bom_removal() {
    // Valid non-ASCII input is expected to come back as an owned string.
    let (text, malformed) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xE4");
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "abc\u{20AC}\u{00E4}");
    assert!(!malformed);
}
5342
#[test]
fn test_decode_invalid_windows_1257_to_cow_with_bom_removal() {
    // The byte 0xA1 is expected to decode to U+FFFD and set the error flag.
    let (text, malformed) = WINDOWS_1257.decode_with_bom_removal(b"abc\x80\xA1\xE4");
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert!(malformed);
}
5354
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_with_bom_removal() {
    // ASCII-only input is expected to be borrowed rather than copied.
    let (text, malformed) = WINDOWS_1257.decode_with_bom_removal(b"abc");
    let borrowed = match text {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(borrowed, "abc");
    assert!(!malformed);
}
5366
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling() {
    // With BOM handling off, the BOM is expected to stay in the borrowed
    // output as U+FEFF.
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4";
    let (text, malformed) = UTF_8.decode_without_bom_handling(input);
    let borrowed = match text {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(borrowed, "\u{FEFF}\u{20AC}\u{00E4}");
    assert!(!malformed);
}
5379
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling() {
    // The BOM is expected to stay as U+FEFF and the stray 0x80 to become
    // U+FFFD, forcing an owned result.
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let (text, malformed) = UTF_8.decode_without_bom_handling(input);
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "\u{FEFF}\u{20AC}\u{FFFD}\u{00E4}");
    assert!(malformed);
}
5392
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling() {
    // Valid non-ASCII input is expected to come back as an owned string.
    let (text, malformed) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xE4");
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "abc\u{20AC}\u{00E4}");
    assert!(!malformed);
}
5404
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling() {
    // The byte 0xA1 is expected to decode to U+FFFD and set the error flag.
    let (text, malformed) = WINDOWS_1257.decode_without_bom_handling(b"abc\x80\xA1\xE4");
    let owned = match text {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, "abc\u{20AC}\u{FFFD}\u{00E4}");
    assert!(malformed);
}
5416
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling() {
    // ASCII-only input is expected to be borrowed rather than copied.
    let (text, malformed) = WINDOWS_1257.decode_without_bom_handling(b"abc");
    let borrowed = match text {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(borrowed, "abc");
    assert!(!malformed);
}
5428
#[test]
fn test_decode_bomful_valid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    // Valid input is expected to be borrowed, with the BOM kept as U+FEFF.
    let decoded = UTF_8
        .decode_without_bom_handling_and_without_replacement(b"\xEF\xBB\xBF\xE2\x82\xAC\xC3\xA4");
    match decoded {
        Some(Cow::Borrowed(s)) => {
            assert_eq!(s, "\u{FEFF}\u{20AC}\u{00E4}");
        }
        Some(Cow::Owned(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5443
#[test]
fn test_decode_bomful_invalid_utf8_to_cow_without_bom_handling_and_without_replacement() {
    // Malformed input is expected to yield `None` when replacement is off.
    let input = b"\xEF\xBB\xBF\xE2\x82\xAC\x80\xC3\xA4";
    let decoded = UTF_8.decode_without_bom_handling_and_without_replacement(input);
    assert!(decoded.is_none());
}
5452
#[test]
fn test_decode_valid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    // Valid non-ASCII input is expected to come back as an owned string.
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xE4");
    match decoded {
        Some(Cow::Owned(s)) => {
            assert_eq!(s, "abc\u{20AC}\u{00E4}");
        }
        Some(Cow::Borrowed(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5465
#[test]
fn test_decode_invalid_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    // Malformed input is expected to yield `None` when replacement is off.
    let decoded =
        WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc\x80\xA1\xE4");
    assert!(decoded.is_none());
}
5472
#[test]
fn test_decode_ascii_only_windows_1257_to_cow_without_bom_handling_and_without_replacement() {
    // ASCII-only input is expected to be borrowed rather than copied.
    let decoded = WINDOWS_1257.decode_without_bom_handling_and_without_replacement(b"abc");
    match decoded {
        Some(Cow::Borrowed(s)) => {
            assert_eq!(s, "abc");
        }
        Some(Cow::Owned(_)) => unreachable!(),
        None => unreachable!(),
    }
}
5485
#[test]
fn test_encode_ascii_only_windows_1257_to_cow() {
    // ASCII-only input is expected to be borrowed rather than copied.
    let (bytes, used, unmappable) = WINDOWS_1257.encode("abc");
    let borrowed = match bytes {
        Cow::Borrowed(s) => s,
        Cow::Owned(_) => unreachable!(),
    };
    assert_eq!(borrowed, b"abc");
    assert_eq!(used, WINDOWS_1257);
    assert!(!unmappable);
}
5498
#[test]
fn test_encode_valid_windows_1257_to_cow() {
    // Non-ASCII input is expected to come back as an owned byte vector.
    let (bytes, used, unmappable) = WINDOWS_1257.encode("abc\u{20AC}\u{00E4}");
    let owned = match bytes {
        Cow::Owned(s) => s,
        Cow::Borrowed(_) => unreachable!(),
    };
    assert_eq!(owned, b"abc\x80\xE4");
    assert_eq!(used, WINDOWS_1257);
    assert!(!unmappable);
}
5511
#[test]
fn test_utf16_space_with_one_bom_byte() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    // Feed a lone 0xFF twice, first mid-stream and then as the last chunk,
    // each time into a buffer of exactly the size the decoder asks for.
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5527
#[test]
fn test_utf8_space_with_one_bom_byte() {
    let mut decoder = UTF_8.new_decoder();
    let mut dst = [0u16; 12];
    // Feed a lone 0xFF twice, first mid-stream and then as the last chunk,
    // each time into a buffer of exactly the size the decoder asks for.
    for &last in [false, true].iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(b"\xFF", &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5543
#[test]
fn test_utf16_space_with_two_bom_bytes() {
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    // One byte per call; only the final call ends the stream. Each call
    // gets a buffer of exactly the size the decoder asks for.
    let steps: [(&[u8], bool); 3] = [
        (&b"\xEF"[..], false),
        (&b"\xBB"[..], false),
        (&b"\xFF"[..], true),
    ];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let (result, _, _, _) = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
        assert_eq!(result, CoderResult::InputEmpty);
    }
}
5564
#[test]
fn test_utf8_space_with_two_bom_bytes() {
    let mut dst = [0u16; 12];
    let mut decoder = UTF_8.new_decoder();
    // One byte per call: 0xEF and 0xBB as non-final chunks, then 0xFF as the
    // final chunk. The buffer length promised by `max_utf16_buffer_length(1)`
    // must be sufficient at every step.
    let steps = [(b"\xEF", false), (b"\xBB", false), (b"\xFF", true)];
    for &(chunk, last) in steps.iter() {
        let needed = decoder.max_utf16_buffer_length(1).unwrap();
        let outcome = decoder.decode_to_utf16(chunk, &mut dst[..needed], last);
        assert_eq!(outcome.0, CoderResult::InputEmpty);
    }
}
5585
#[test]
fn test_utf16_space_with_one_bom_byte_and_a_second_byte_in_same_call() {
    // Both bytes arrive in a single, final call; the output slice is sized by
    // `max_utf16_buffer_length(2)` and must be large enough for the result.
    let mut decoder = UTF_16LE.new_decoder();
    let mut dst = [0u16; 12];
    let needed = decoder.max_utf16_buffer_length(2).unwrap();
    let outcome = decoder.decode_to_utf16(b"\xFF\xFF", &mut dst[..needed], true);
    assert_eq!(outcome.0, CoderResult::InputEmpty);
}
5596
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf8() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 8];
    // With nothing encoded yet, an empty input must be accepted with an
    // 8-byte buffer both mid-stream and at end of stream.
    for &last in [false, true].iter() {
        let outcome = encoder.encode_from_utf8("", &mut dst[..], last);
        assert_eq!(outcome.0, CoderResult::InputEmpty);
    }
}
5610
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf8() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];

    // U+00A5 is encoded first with the whole 16-byte buffer on offer.
    let after_yen = encoder.encode_from_utf8("\u{A5}", &mut dst[..], false);
    assert_eq!(after_yen.0, CoderResult::InputEmpty);

    // With only 8 bytes on offer, an empty non-final call still succeeds ...
    let mid_stream = encoder.encode_from_utf8("", &mut dst[..8], false);
    assert_eq!(mid_stream.0, CoderResult::InputEmpty);

    // ... but the final call reports that the buffer is too small.
    // NOTE(review): presumably because the end-of-stream escape is sized by a
    // worst-case estimate -- confirm against the encoder implementation.
    let end_of_stream = encoder.encode_from_utf8("", &mut dst[..8], true);
    assert_eq!(end_of_stream.0, CoderResult::OutputFull);
}
5628
#[test]
fn test_buffer_end_iso_2022_jp_from_utf8() {
    let mut dst = [0u8; 18];
    // Each case runs on a brand-new encoder and offers only the first `cap`
    // bytes of the shared 18-byte buffer.
    let mut encode_fresh = |src: &str, cap: usize, last: bool| -> CoderResult {
        let mut encoder = ISO_2022_JP.new_encoder();
        encoder.encode_from_utf8(src, &mut dst[..cap], last).0
    };

    // U+00A5 followed by an unmappable character: 18 bytes are enough for a
    // non-final call but not for a final one.
    assert_eq!(
        encode_fresh("\u{A5}\u{1F4A9}", 18, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh("\u{A5}\u{1F4A9}", 18, true),
        CoderResult::OutputFull
    );

    // The unmappable character alone fits in 13 bytes regardless of `last`.
    assert_eq!(encode_fresh("\u{1F4A9}", 13, false), CoderResult::InputEmpty);
    assert_eq!(encode_fresh("\u{1F4A9}", 13, true), CoderResult::InputEmpty);
}
5654
#[test]
fn test_too_short_buffer_with_iso_2022_jp_ascii_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 8];
    let nothing = [0u16; 0];
    // With nothing encoded yet, an empty input must be accepted with an
    // 8-byte buffer both mid-stream and at end of stream.
    for &last in [false, true].iter() {
        let outcome = encoder.encode_from_utf16(&nothing, &mut dst[..], last);
        assert_eq!(outcome.0, CoderResult::InputEmpty);
    }
}
5668
#[test]
fn test_too_short_buffer_with_iso_2022_jp_roman_from_utf16() {
    let mut encoder = ISO_2022_JP.new_encoder();
    let mut dst = [0u8; 16];
    let nothing = [0u16; 0];

    // U+00A5 is encoded first with the whole 16-byte buffer on offer.
    let after_yen = encoder.encode_from_utf16(&[0xA5u16], &mut dst[..], false);
    assert_eq!(after_yen.0, CoderResult::InputEmpty);

    // With only 8 bytes on offer, an empty non-final call still succeeds ...
    let mid_stream = encoder.encode_from_utf16(&nothing, &mut dst[..8], false);
    assert_eq!(mid_stream.0, CoderResult::InputEmpty);

    // ... but the final call reports that the buffer is too small.
    // NOTE(review): presumably because the end-of-stream escape is sized by a
    // worst-case estimate -- confirm against the encoder implementation.
    let end_of_stream = encoder.encode_from_utf16(&nothing, &mut dst[..8], true);
    assert_eq!(end_of_stream.0, CoderResult::OutputFull);
}
5686
#[test]
fn test_buffer_end_iso_2022_jp_from_utf16() {
    let mut dst = [0u8; 18];
    // Each case runs on a brand-new encoder and offers only the first `cap`
    // bytes of the shared 18-byte buffer.
    let mut encode_fresh = |src: &[u16], cap: usize, last: bool| -> CoderResult {
        let mut encoder = ISO_2022_JP.new_encoder();
        encoder.encode_from_utf16(src, &mut dst[..cap], last).0
    };

    // U+00A5 followed by the surrogate pair for U+1F4A9 (unmappable).
    let yen_then_astral = [0xA5u16, 0xD83Du16, 0xDCA9u16];
    // The surrogate pair for U+1F4A9 on its own.
    let astral_only = [0xD83Du16, 0xDCA9u16];

    // 18 bytes are enough for a non-final call but not for a final one.
    assert_eq!(
        encode_fresh(&yen_then_astral, 18, false),
        CoderResult::InputEmpty
    );
    assert_eq!(
        encode_fresh(&yen_then_astral, 18, true),
        CoderResult::OutputFull
    );

    // The unmappable character alone fits in 13 bytes regardless of `last`.
    assert_eq!(encode_fresh(&astral_only, 13, false), CoderResult::InputEmpty);
    assert_eq!(encode_fresh(&astral_only, 13, true), CoderResult::InputEmpty);
}
5715
#[test]
fn test_buffer_end_utf16be() {
    let mut dest = [0u8; 4];
    let mut decoder = UTF_16BE.new_decoder_without_bom_handling();
    // 0xD8 0x00 is the big-endian code unit 0xD800, a lone high surrogate.
    let high_surrogate = [0xD8u8, 0x00];

    // Mid-stream, both bytes are consumed and nothing is written yet.
    let first = decoder.decode_to_utf8(&high_surrogate, &mut dest, false);
    assert_eq!(first, (CoderResult::InputEmpty, 2, 0, false));

    // The final call only has to complete without panicking; its result is
    // deliberately ignored.
    let _ = decoder.decode_to_utf8(&high_surrogate, &mut dest, true);
}
5728
#[test]
fn test_hash() {
    // Checks that encodings can be stored in, looked up in and removed from a
    // set. Note that the container is a `BTreeSet`, so despite the test name
    // this exercises ordered-set membership rather than a hash-based set.
    let mut seen = ::alloc::collections::btree_set::BTreeSet::new();
    for &encoding in [UTF_8, ISO_2022_JP].iter() {
        seen.insert(encoding);
    }

    assert!(seen.contains(UTF_8));
    assert!(seen.contains(ISO_2022_JP));
    assert!(!seen.contains(WINDOWS_1252));

    seen.remove(ISO_2022_JP);
    assert!(!seen.contains(ISO_2022_JP));
}
5740
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf16() {
    // U+3041 followed by U+FFFF in one final call must not fit in 17 bytes.
    let src = [0x3041u16, 0xFFFFu16];
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let outcome = encoder.encode_from_utf16(&src, &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5751
#[test]
fn test_iso_2022_jp_ncr_extra_from_utf8() {
    // U+3041 followed by U+FFFF in one final call must not fit in 17 bytes.
    let src = "\u{3041}\u{FFFF}";
    let mut dst = [0u8; 17];
    let mut encoder = ISO_2022_JP.new_encoder();
    let outcome = encoder.encode_from_utf8(src, &mut dst[..], true);
    assert_eq!(outcome.0, CoderResult::OutputFull);
}
5762
#[test]
fn test_max_length_with_bom_to_utf8() {
    // A BOM-sniffing decoder created from REPLACEMENT is fed a UTF-8 BOM
    // followed by "A". The buffer size reported by
    // `max_utf8_buffer_length_without_replacement` must be enough, and only
    // the "A" may end up in the output.
    let input = b"\xEF\xBB\xBFA";
    let mut decoder = REPLACEMENT.new_decoder();
    let mut output = [0u8; 20];

    let needed = decoder
        .max_utf8_buffer_length_without_replacement(input.len())
        .unwrap();
    let (result, read, written) =
        decoder.decode_to_utf8_without_replacement(input, &mut output[..needed], true);

    assert_eq!(result, DecoderResult::InputEmpty);
    assert_eq!((read, written), (input.len(), 1));
    assert_eq!(output[0], 0x41);
}
5780
#[cfg(feature = "serde")]
#[test]
fn test_serde() {
    // A struct holding an encoding must survive a round trip through both a
    // JSON serializer and bincode.
    let original = Demo {
        num: 42,
        name: "foo".into(),
        enc: UTF_8,
    };

    // JSON round trip.
    let json = serde_json::to_string(&original).unwrap();
    let from_json: Demo = serde_json::from_str(&json).unwrap();
    assert_eq!(from_json, original);

    // bincode round trip.
    let bytes = bincode::serialize(&original).unwrap();
    let from_bytes: Demo = bincode::deserialize(&bytes[..]).unwrap();
    assert_eq!(from_bytes, original);
}
5799
#[test]
fn test_is_single_byte() {
    // Encodings for which `is_single_byte()` must report `false`.
    let not_single_byte = [
        BIG5,
        EUC_JP,
        EUC_KR,
        GB18030,
        GBK,
        REPLACEMENT,
        SHIFT_JIS,
        UTF_8,
        UTF_16BE,
        UTF_16LE,
        ISO_2022_JP,
    ];
    for &encoding in not_single_byte.iter() {
        assert!(!encoding.is_single_byte(), "{}", encoding.name());
    }

    // Encodings for which `is_single_byte()` must report `true`.
    let single_byte = [
        IBM866,
        ISO_8859_2,
        ISO_8859_3,
        ISO_8859_4,
        ISO_8859_5,
        ISO_8859_6,
        ISO_8859_7,
        ISO_8859_8,
        ISO_8859_10,
        ISO_8859_13,
        ISO_8859_14,
        ISO_8859_15,
        ISO_8859_16,
        ISO_8859_8_I,
        KOI8_R,
        KOI8_U,
        MACINTOSH,
        WINDOWS_874,
        WINDOWS_1250,
        WINDOWS_1251,
        WINDOWS_1252,
        WINDOWS_1253,
        WINDOWS_1254,
        WINDOWS_1255,
        WINDOWS_1256,
        WINDOWS_1257,
        WINDOWS_1258,
        X_MAC_CYRILLIC,
        X_USER_DEFINED,
    ];
    for &encoding in single_byte.iter() {
        assert!(encoding.is_single_byte(), "{}", encoding.name());
    }
}
5844
#[test]
fn test_latin1_byte_compatible_up_to() {
    let buffer = b"a\x81\xB6\xF6\xF0\x82\xB4";

    // Expected result of `latin1_byte_compatible_up_to(buffer)` on a fresh
    // decoder without BOM handling, per encoding. `None` marks the encodings
    // for which the query must not yield an index.
    let expectations = [
        (BIG5, Some(1)),
        (EUC_JP, Some(1)),
        (EUC_KR, Some(1)),
        (GB18030, Some(1)),
        (GBK, Some(1)),
        (REPLACEMENT, None),
        (SHIFT_JIS, Some(1)),
        (UTF_8, Some(1)),
        (UTF_16BE, None),
        (UTF_16LE, None),
        (ISO_2022_JP, Some(1)),
        (IBM866, Some(1)),
        (ISO_8859_2, Some(2)),
        (ISO_8859_3, Some(2)),
        (ISO_8859_4, Some(2)),
        (ISO_8859_5, Some(2)),
        (ISO_8859_6, Some(2)),
        (ISO_8859_7, Some(2)),
        (ISO_8859_8, Some(3)),
        (ISO_8859_10, Some(2)),
        (ISO_8859_13, Some(4)),
        (ISO_8859_14, Some(4)),
        (ISO_8859_15, Some(6)),
        (ISO_8859_16, Some(4)),
        (ISO_8859_8_I, Some(3)),
        (KOI8_R, Some(1)),
        (KOI8_U, Some(1)),
        (MACINTOSH, Some(1)),
        (WINDOWS_874, Some(2)),
        (WINDOWS_1250, Some(4)),
        (WINDOWS_1251, Some(1)),
        (WINDOWS_1252, Some(5)),
        (WINDOWS_1253, Some(3)),
        (WINDOWS_1254, Some(4)),
        (WINDOWS_1255, Some(3)),
        (WINDOWS_1256, Some(1)),
        (WINDOWS_1257, Some(4)),
        (WINDOWS_1258, Some(4)),
        (X_MAC_CYRILLIC, Some(1)),
        (X_USER_DEFINED, Some(1)),
    ];
    for &(encoding, expected) in expectations.iter() {
        let actual = encoding
            .new_decoder_without_bom_handling()
            .latin1_byte_compatible_up_to(buffer);
        assert_eq!(actual, expected, "{}", encoding.name());
    }

    // A BOM-sniffing UTF-8 decoder that has seen no input yet gives no answer.
    let untouched = UTF_8.new_decoder();
    assert!(untouched.latin1_byte_compatible_up_to(buffer).is_none());

    // The answer follows the decoder state as input is fed piecewise.
    let mut decoder = UTF_8.new_decoder();
    let mut scratch = [0u16; 4];

    // After only the first BOM byte: no answer.
    let _ = decoder.decode_to_utf16(b"\xEF", &mut scratch, false);
    assert!(decoder.latin1_byte_compatible_up_to(buffer).is_none());

    // After the remaining two BOM bytes: an index is reported.
    let _ = decoder.decode_to_utf16(b"\xBB\xBF", &mut scratch, false);
    assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), Some(1));

    // After a further lone 0xEF lead byte: no answer again.
    let _ = decoder.decode_to_utf16(b"\xEF", &mut scratch, false);
    assert_eq!(decoder.latin1_byte_compatible_up_to(buffer), None);
}
6133}
6134