single_byte.rs source code [crates/encoding_rs/src/single_byte.rs]

1	// Copyright Mozilla Foundation. See the COPYRIGHT
2	// file at the top-level directory of this distribution.
3	//
4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7	// option. This file may not be copied, modified, or distributed
8	// except according to those terms.
9
10	use super::*;
11	use crate::ascii::*;
12	use crate::data::position;
13	use crate::handles::*;
14	use crate::variant::*;
15
/// Decoder for single-byte encodings whose upper half (bytes `0x80..=0xFF`)
/// is described by a 128-entry lookup table.
pub struct SingleByteDecoder {
    /// Indexed by `byte - 0x80`; yields the UTF-16 code unit for that byte.
    /// The decode methods treat an entry of `0` as an unmapped (malformed) byte.
    table: &'static [u16; 128],
}
19
20	impl SingleByteDecoder {
21	pub fn new(data: &'static [u16; `128`]) -> VariantDecoder {
22	VariantDecoder::SingleByte(SingleByteDecoder { table: data })
23	}
24
25	pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
26	Some(byte_length)
27	}
28
29	pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
30	byte_length.checked_mul(`3`)
31	}
32
33	pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
34	byte_length.checked_mul(`3`)
35	}
36
37	pub fn decode_to_utf8_raw(
38	&mut self,
39	src: &[u8],
40	dst: &mut [u8],
41	_last: bool,
42	) -> (DecoderResult, usize, usize) {
43	let mut source = ByteSource::new(src);
44	let mut dest = Utf8Destination::new(dst);
45	'outermost: loop {
46	match dest.copy_ascii_from_check_space_bmp(&mut source) {
47	CopyAsciiResult::Stop(ret) => return ret,
48	CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
49	// Start non-boilerplate
50	//
51	// Since the non-ASCIIness of `non_ascii` is hidden from
52	// the optimizer, it can't figure out that it's OK to
53	// statically omit the bound check when accessing
54	// `[u16; 128]` with an index
55	// `non_ascii as usize - 0x80usize`.
56	//
57	// Safety: `non_ascii` is a u8 byte >=0x80, from the invariants
58	// on Utf8Destination::copy_ascii_from_check_space_bmp()
59	let mapped =
60	unsafe { (self.table.get_unchecked(non_ascii as usize* - `0x80usize`)) };
61	// let mapped = self.table[non_ascii as usize - 0x80usize];
62	if mapped == `0u16` {
63	return (
64	DecoderResult::Malformed(`1`, `0`),
65	source.consumed(),
66	handle.written(),
67	);
68	}
69	let dest_again = handle.write_bmp_excl_ascii(mapped);
70	// End non-boilerplate
71	match source.check_available() {
72	Space::Full(src_consumed) => {
73	return (
74	DecoderResult::InputEmpty,
75	src_consumed,
76	dest_again.written(),
77	);
78	}
79	Space::Available(source_handle) => {
80	match dest_again.check_space_bmp() {
81	Space::Full(dst_written) => {
82	return (
83	DecoderResult::OutputFull,
84	source_handle.consumed(),
85	dst_written,
86	);
87	}
88	Space::Available(mut destination_handle) => {
89	let (mut b, unread_handle) = source_handle.read();
90	let source_again = unread_handle.commit();
91	'innermost: loop {
92	if b > `127` {
93	non_ascii = b;
94	handle = destination_handle;
95	continue 'middle;
96	}
97	// Testing on Haswell says that we should write the
98	// byte unconditionally instead of trying to unread it
99	// to make it part of the next SIMD stride.
100	let dest_again_again = destination_handle.write_ascii(b);
101	if b < `60` {
102	// We've got punctuation
103	match source_again.check_available() {
104	Space::Full(src_consumed_again) => {
105	return (
106	DecoderResult::InputEmpty,
107	src_consumed_again,
108	dest_again_again.written(),
109	);
110	}
111	Space::Available(source_handle_again) => {
112	match dest_again_again.check_space_bmp() {
113	Space::Full(dst_written_again) => {
114	return (
115	DecoderResult::OutputFull,
116	source_handle_again.consumed(),
117	dst_written_again,
118	);
119	}
120	Space::Available(
121	destination_handle_again,
122	) => {
123	let (b_again, _unread_handle_again) =
124	source_handle_again.read();
125	b = b_again;
126	destination_handle =
127	destination_handle_again;
128	continue 'innermost;
129	}
130	}
131	}
132	}
133	}
134	// We've got markup or ASCII text
135	continue 'outermost;
136	}
137	}
138	}
139	}
140	}
141	},
142	}
143	}
144	}
145
146	pub fn decode_to_utf16_raw(
147	&mut self,
148	src: &[u8],
149	dst: &mut [u16],
150	_last: bool,
151	) -> (DecoderResult, usize, usize) {
152	let (pending, length) = if dst.len() < src.len() {
153	(DecoderResult::OutputFull, dst.len())
154	} else {
155	(DecoderResult::InputEmpty, src.len())
156	};
157	// Safety invariant: converted <= length. Quite often we have `converted < length`
158	// which will be separately marked.
159	let mut converted = `0usize`;
160	'outermost: loop {
161	match unsafe {
162	// Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
163	ascii_to_basic_latin(
164	src.as_ptr().add(converted),
165	dst.as_mut_ptr().add(converted),
166	length - converted,
167	)
168	} {
169	None => {
170	return (pending, length, length);
171	}
172	Some((mut non_ascii, consumed)) => {
173	// Safety invariant: `converted <= length` upheld, since this can only consume
174	// up to `length - converted` bytes.
175	//
176	// Furthermore, in this context,
177	// we can assume `converted < length` since this branch is only ever hit when
178	// ascii_to_basic_latin fails to consume the entire slice
179	converted += consumed;
180	'middle: loop {
181	// `converted` doesn't count the reading of `non_ascii` yet.
182	// Since the non-ASCIIness of `non_ascii` is hidden from
183	// the optimizer, it can't figure out that it's OK to
184	// statically omit the bound check when accessing
185	// `[u16; 128]` with an index
186	// `non_ascii as usize - 0x80usize`.
187	//
188	// Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
189	// the invariants of `ascii_to_basic_latin()`, and our table has enough space for that.
190	let mapped =
191	unsafe { (self.table.get_unchecked(non_ascii as usize* - `0x80usize`)) };
192	// let mapped = self.table[non_ascii as usize - 0x80usize];
193	if mapped == `0u16` {
194	return (
195	DecoderResult::Malformed(`1`, `0`),
196	converted + `1`, // +1 `for non_ascii`
197	converted,
198	);
199	}
200	unsafe {
201	// Safety: As mentioned above, `converted < length`
202	*(dst.get_unchecked_mut(converted)) = mapped;
203	}
204	// Safety: `converted <= length` upheld, since `converted < length` before this
205	converted += `1`;
206	// Next, handle ASCII punctuation and non-ASCII without
207	// going back to ASCII acceleration. Non-ASCII scripts
208	// use ASCII punctuation, so this avoid going to
209	// acceleration just for punctuation/space and then
210	// failing. This is a significant boost to non-ASCII
211	// scripts.
212	// TODO: Split out Latin converters without this part
213	// this stuff makes Latin script-conversion slower.
214	if converted == length {
215	return (pending, length, length);
216	}
217	// Safety: We are back to `converted < length` because of the == above
218	// and can perform this check.
219	let mut b = unsafe { *(src.get_unchecked(converted)) };
220	// Safety: `converted < length` is upheld for this loop
221	'innermost: loop {
222	if b > `127` {
223	non_ascii = b;
224	continue 'middle;
225	}
226	// Testing on Haswell says that we should write the
227	// byte unconditionally instead of trying to unread it
228	// to make it part of the next SIMD stride.
229	unsafe {
230	// Safety: `converted < length` is true for this loop
231	(dst.get_unchecked_mut(converted)) = u16*::from(b);
232	}
233	// Safety: We are now at `converted <= length`. We should not* `continue`*
234	// the loop without reverifying
235	converted += `1`;
236	if b < `60` {
237	// We've got punctuation
238	if converted == length {
239	return (pending, length, length);
240	}
241	// Safety: we're back to `converted <= length` because of the == above
242	b = unsafe { *(src.get_unchecked(converted)) };
243	// Safety: The loop continues as `converted < length`
244	continue 'innermost;
245	}
246	// We've got markup or ASCII text
247	continue 'outermost;
248	}
249	}
250	}
251	}
252	}
253	}
254
255	pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize {
256	let mut bytes = buffer;
257	let mut total = `0`;
258	loop {
259	if let Some((non_ascii, offset)) = validate_ascii(bytes) {
260	total += offset;
261	// Safety: We can rely on `non_ascii` being between `0x80` and `0xFF` due to
262	// the invariants of `ascii_to_basic_latin()`, and our table has enough space for that.
263	let mapped = unsafe { (self.table.get_unchecked(non_ascii as usize* - `0x80usize`)) };
264	if mapped != u16::from(non_ascii) {
265	return total;
266	}
267	total += `1`;
268	bytes = &bytes[offset + `1`..];
269	} else {
270	return total;
271	}
272	}
273	}
274	}
275
/// Encoder for single-byte encodings, driven by the same 128-entry
/// upper-half table the decoder uses plus a description of one "run" of
/// consecutive code units that can be encoded by plain offset arithmetic.
pub struct SingleByteEncoder {
    /// Indexed by `byte - 0x80`; searched linearly for code units outside the run.
    table: &'static [u16; 128],
    /// Code unit at which the run starts.
    run_bmp_offset: usize,
    /// Table index (`byte - 0x80`) at which the run starts.
    run_byte_offset: usize,
    /// Number of consecutive code units in the run.
    run_length: usize,
}
282
283	impl SingleByteEncoder {
284	pub fn new(
285	encoding: &'static Encoding,
286	data: &'static [u16; `128`],
287	run_bmp_offset: u16,
288	run_byte_offset: u8,
289	run_length: u8,
290	) -> Encoder {
291	Encoder::new(
292	encoding,
293	VariantEncoder::SingleByte(SingleByteEncoder {
294	table: data,
295	run_bmp_offset: run_bmp_offset as usize,
296	run_byte_offset: run_byte_offset as usize,
297	run_length: run_length as usize,
298	}),
299	)
300	}
301
302	pub fn max_buffer_length_from_utf16_without_replacement(
303	&self,
304	u16_length: usize,
305	) -> Option<usize> {
306	Some(u16_length)
307	}
308
309	pub fn max_buffer_length_from_utf8_without_replacement(
310	&self,
311	byte_length: usize,
312	) -> Option<usize> {
313	Some(byte_length)
314	}
315
316	#[inline(always)]
317	fn encode_u16(&self, code_unit: u16) -> Option<u8> {
318	// First, we see if the code unit falls into a run of consecutive
319	// code units that can be mapped by offset. This is very efficient
320	// for most non-Latin encodings as well as Latin1-ish encodings.
321	//
322	// For encodings that don't fit this pattern, the run (which may
323	// have the length of just one) just establishes the starting point
324	// for the next rule.
325	//
326	// Next, we do a forward linear search in the part of the index
327	// after the run. Even in non-Latin1-ish Latin encodings (except
328	// macintosh), the lower case letters are here.
329	//
330	// Next, we search the third quadrant up to the start of the run
331	// (upper case letters in Latin encodings except macintosh, in
332	// Greek and in KOI encodings) and then the second quadrant,
333	// except if the run stared before the third quadrant, we search
334	// the second quadrant up to the run.
335	//
336	// Last, we search the first quadrant, which has unused controls
337	// or punctuation in most encodings. This is bad for macintosh
338	// and IBM866, but those are rare.
339
340	// Run of consecutive units
341	let unit_as_usize = code_unit as usize;
342	let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset);
343	if offset < self.run_length {
344	return Some((`128` + self.run_byte_offset + offset) as u8);
345	}
346
347	// Search after the run
348	let tail_start = self.run_byte_offset + self.run_length;
349	if let Some(pos) = position(&self.table[tail_start..], code_unit) {
350	return Some((`128` + tail_start + pos) as u8);
351	}
352
353	if self.run_byte_offset >= `64` {
354	// Search third quadrant before the run
355	if let Some(pos) = position(&self.table[`64`..self.run_byte_offset], code_unit) {
356	return Some(((`128` + `64`) + pos) as u8);
357	}
358
359	// Search second quadrant
360	if let Some(pos) = position(&self.table[`32`..`64`], code_unit) {
361	return Some(((`128` + `32`) + pos) as u8);
362	}
363	} else if let Some(pos) = position(&self.table[`32`..self.run_byte_offset], code_unit) {
364	// windows-1252, windows-874, ISO-8859-15 and ISO-8859-5
365	// Search second quadrant before the run
366	return Some(((`128` + `32`) + pos) as u8);
367	}
368
369	// Search first quadrant
370	if let Some(pos) = position(&self.table[..`32`], code_unit) {
371	return Some((`128` + pos) as u8);
372	}
373
374	None
375	}
376
377	ascii_compatible_bmp_encoder_function!(
378	{
379	match self.encode_u16(bmp) {
380	Some(byte) => handle.write_one(byte),
381	None => {
382	return (
383	EncoderResult::unmappable_from_bmp(bmp),
384	source.consumed(),
385	handle.written(),
386	);
387	}
388	}
389	},
390	bmp,
391	self,
392	source,
393	handle,
394	copy_ascii_to_check_space_one,
395	check_space_one,
396	encode_from_utf8_raw,
397	str,
398	Utf8Source,
399	`true`
400	);
401
402	pub fn encode_from_utf16_raw(
403	&mut self,
404	src: &[u16],
405	dst: &mut [u8],
406	_last: bool,
407	) -> (EncoderResult, usize, usize) {
408	let (pending, length) = if dst.len() < src.len() {
409	(EncoderResult::OutputFull, dst.len())
410	} else {
411	(EncoderResult::InputEmpty, src.len())
412	};
413	// Safety invariant: converted <= length. Quite often we have `converted < length`
414	// which will be separately marked.
415	let mut converted = `0usize`;
416	'outermost: loop {
417	match unsafe {
418	// Safety: length is the minimum length, `src/dst + x` will always be valid for reads/writes of `len - x`
419	basic_latin_to_ascii(
420	src.as_ptr().add(converted),
421	dst.as_mut_ptr().add(converted),
422	length - converted,
423	)
424	} {
425	None => {
426	return (pending, length, length);
427	}
428	Some((mut non_ascii, consumed)) => {
429	// Safety invariant: `converted <= length` upheld, since this can only consume
430	// up to `length - converted` bytes.
431	//
432	// Furthermore, in this context,
433	// we can assume `converted < length` since this branch is only ever hit when
434	// ascii_to_basic_latin fails to consume the entire slice
435	converted += consumed;
436	'middle: loop {
437	// `converted` doesn't count the reading of `non_ascii` yet.
438	match self.encode_u16(non_ascii) {
439	Some(byte) => {
440	unsafe {
441	// Safety: we're allowed this access since `converted < length`
442	*(dst.get_unchecked_mut(converted)) = byte;
443	}
444	converted += `1`;
445	// `converted <= length` now
446	}
447	None => {
448	// At this point, we need to know if we
449	// have a surrogate.
450	let high_bits = non_ascii & `0xFC00u16`;
451	if high_bits == `0xD800u16` {
452	// high surrogate
453	if converted + `1` == length {
454	// End of buffer. This surrogate is unpaired.
455	return (
456	EncoderResult::Unmappable('`\u{FFFD}`'),
457	converted + `1`, // +1 `for non_ascii`
458	converted,
459	);
460	}
461	// Safety: convered < length from outside the match, and `converted + 1 != length`,
462	// So `converted + 1 < length` as well. We're in bounds
463	let second =
464	u32::from(unsafe { *src.get_unchecked(converted + `1`) });
465	if second & `0xFC00u32` != `0xDC00u32` {
466	return (
467	EncoderResult::Unmappable('`\u{FFFD}`'),
468	converted + `1`, // +1 `for non_ascii`
469	converted,
470	);
471	}
472	// The next code unit is a low surrogate.
473	let astral: char = unsafe {
474	// Safety: We can rely on non_ascii being 0xD800-0xDBFF since the high bits are 0xD800
475	// Then, (non_ascii << 10 - 0xD800 << 10) becomes between (0 to 0x3FF) << 10, which is between
476	// 0x400 to 0xffc00. Adding the 0x10000 gives a range of 0x10400 to 0x10fc00. Subtracting the 0xDC00
477	// gives 0x2800 to 0x102000
478	// The second term is between 0xDC00 and 0xDFFF from the check above. This gives a maximum
479	// possible range of (0x10400 + 0xDC00) to (0x102000 + 0xDFFF) which is 0x1E000 to 0x10ffff.
480	// This is in range.
481	//
482	// From a Unicode principles perspective this can also be verified as we have checked that `non_ascii` is a high surrogate
483	// (0xD800..=0xDBFF), and that `second` is a low surrogate (`0xDC00..=0xDFFF`), and we are applying reverse of the UTC16 transformation
484	// algorithm <https://en.wikipedia.org/wiki/UTF-16#Code_points_from_U+010000_to_U+10FFFF>, by applying the high surrogate - 0xD800 to the
485	// high ten bits, and the low surrogate - 0xDc00 to the low ten bits, and then adding 0x10000
486	::core::char::from_u32_unchecked(
487	(u32::from(non_ascii) << `10`) + second
488	- (((`0xD800u32` << `10`) - `0x1_0000u32`) + `0xDC00u32`),
489	)
490	};
491	return (
492	EncoderResult::Unmappable(astral),
493	converted + `2`, // +2 `for non_ascii` and `second`
494	converted,
495	);
496	}
497	if high_bits == `0xDC00u16` {
498	// Unpaired low surrogate
499	return (
500	EncoderResult::Unmappable('`\u{FFFD}`'),
501	converted + `1`, // +1 `for non_ascii`
502	converted,
503	);
504	}
505	return (
506	EncoderResult::unmappable_from_bmp(non_ascii),
507	converted + `1`, // +1 `for non_ascii`
508	converted,
509	);
510	// Safety: This branch diverges, so no need to uphold invariants on `converted`
511	}
512	}
513	// Next, handle ASCII punctuation and non-ASCII without
514	// going back to ASCII acceleration. Non-ASCII scripts
515	// use ASCII punctuation, so this avoid going to
516	// acceleration just for punctuation/space and then
517	// failing. This is a significant boost to non-ASCII
518	// scripts.
519	// TODO: Split out Latin converters without this part
520	// this stuff makes Latin script-conversion slower.
521	if converted == length {
522	return (pending, length, length);
523	}
524	// Safety: we're back to `converted < length` due to the == above and can perform
525	// the unchecked read
526	let mut unit = unsafe { *(src.get_unchecked(converted)) };
527	'innermost: loop {
528	// Safety: This loop always begins with `converted < length`, see
529	// the invariant outside and the comment on the continue below
530	if unit > `127` {
531	non_ascii = unit;
532	continue 'middle;
533	}
534	// Testing on Haswell says that we should write the
535	// byte unconditionally instead of trying to unread it
536	// to make it part of the next SIMD stride.
537	unsafe {
538	// Safety: Can rely on converted < length
539	*(dst.get_unchecked_mut(converted)) = unit as u8;
540	}
541	converted += `1`;
542	// `converted <= length` here
543	if unit < `60` {
544	// We've got punctuation
545	if converted == length {
546	return (pending, length, length);
547	}
548	// Safety: `converted < length` due to the == above. The read is safe.
549	unit = unsafe { *(src.get_unchecked(converted)) };
550	// Safety: This only happens if `converted < length`, maintaining it
551	continue 'innermost;
552	}
553	// We've got markup or ASCII text
554	continue 'outermost;
555	// Safety: All other routes to here diverge so the continue is the only
556	// way to run the innermost loop.
557	}
558	}
559	}
560	}
561	}
562	}
563	}
564
565	// Any copyright to the test code below this comment is dedicated to the
566	// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
567
568	#[cfg(all(test, feature = "alloc"))]
569	mod tests {
570	use super::super::testing::*;
571	use super::super::*;
572
// Byte 0xCA in windows-1255 must round-trip with U+05BA.
#[test]
fn test_windows_1255_ca() {
    decode(WINDOWS_1255, b"\xCA", "\u{05BA}");
    encode(WINDOWS_1255, "\u{05BA}", b"\xCA");
}
578
// Greek text interleaved with ASCII spaces and full stops, exercising the
// paths that handle ASCII punctuation between non-ASCII characters.
#[test]
fn test_ascii_punctuation() {
    let bytes = b"\xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4.";
    let characters = "\u{0391}\u{03C5}\u{03C4}\u{03CC} \
                      \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                      \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \
                      \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \
                      \u{03C4}\u{03B5}\u{03C3}\u{03C4}.";
    decode(WINDOWS_1253, bytes, characters);
    encode(WINDOWS_1253, characters, bytes);
}
590
// Byte 0xD2 is expected to decode to U+FFFD in windows-1253; the bytes
// around it must still decode normally.
#[test]
fn test_decode_malformed() {
    decode(
        WINDOWS_1253,
        b"\xC1\xF5\xD2\xF4\xFC",
        "\u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}",
    );
}
599
// Unmappable characters are expected as decimal numeric character
// references in the output: U+2603 -> "&#9731;", U+1F4A9 -> "&#128169;".
#[test]
fn test_encode_unmappables() {
    encode(
        WINDOWS_1253,
        "\u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}",
        b"\xC1\xF5&#9731;\xF4\xFC",
    );
    encode(
        WINDOWS_1253,
        "\u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}",
        b"\xC1\xF5&#128169;\xF4\xFC",
    );
}
613
// An unpaired surrogate is reported as U+FFFD, which is unmappable in
// windows-1253 and is therefore expected as "&#65533;". Covers a lone low
// surrogate, a lone high surrogate mid-input, and a high surrogate at the
// very end of the input.
#[test]
fn test_encode_unpaired_surrogates() {
    encode_from_utf16(
        WINDOWS_1253,
        &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16],
        b"\xC1\xF5&#65533;\xF4\xFC",
    );
    encode_from_utf16(
        WINDOWS_1253,
        &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16],
        b"\xC1\xF5&#65533;\xF4\xFC",
    );
    encode_from_utf16(
        WINDOWS_1253,
        &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16],
        b"\xC1\xF5\xF4\xFC&#65533;",
    );
}
632
/// Every non-ASCII byte value, `0x80..=0xFF`, in ascending order; entry `i`
/// is the byte that corresponds to index `i` of a single-byte table.
pub const HIGH_BYTES: &'static [u8; 128] = &[
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E,
    0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D,
    0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC,
    0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB,
    0xBC, 0xBD, 0xBE, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA,
    0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9,
    0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8,
    0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
];
644
645	fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; `128`]) {
646	let mut with_replacement = [`0u16`; `128`];
647	let mut it = data.iter().enumerate();
648	loop {
649	match it.next() {
650	Some((i, code_point)) => {
651	if *code_point == `0` {
652	with_replacement[i] = `0xFFFD`;
653	} else {
654	with_replacement[i] = *code_point;
655	}
656	}
657	None => {
658	break;
659	}
660	}
661	}
662
663	decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]);
664	}
665
666	fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; `128`]) {
667	let mut with_zeros = [`0u8`; `128`];
668	let mut it = data.iter().enumerate();
669	loop {
670	match it.next() {
671	Some((i, code_point)) => {
672	if *code_point == `0` {
673	with_zeros[i] = `0`;
674	} else {
675	with_zeros[i] = HIGH_BYTES[i];
676	}
677	}
678	None => {
679	break;
680	}
681	}
682	}
683
684	encode_from_utf16(encoding, data, &with_zeros[..]);
685	}
686
// Two consecutive unpaired low surrogates must each be replaced separately,
// yielding two "&#65533;" numeric character references.
#[test]
fn test_single_byte_from_two_low_surrogates() {
    let expectation = b"&#65533;&#65533;";
    let mut output = [0u8; 40];
    let mut encoder = WINDOWS_1253.new_encoder();
    let (result, read, written, had_errors) =
        encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
    assert_eq!(result, CoderResult::InputEmpty);
    assert_eq!(read, 2);
    assert_eq!(written, expectation.len());
    assert!(had_errors);
    assert_eq!(&output[..written], expectation);
}
700
701	// These tests are so self-referential that they are pretty useless.
702
703	// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
704	// Instead, please regenerate using generate-encoding-data.py
705
// Runs `decode_single_byte` for every single-byte encoding table; under Miri
// only the first two encodings are checked.
#[test]
fn test_single_byte_decode() {
    decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
    decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
    if cfg!(miri) {
        // Miri is too slow
        return;
    }
    decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
    decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
    decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
    decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
    decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
    decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
    decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
    decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
    decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
    decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
    decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
    decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
    decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
    decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
    decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
    decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
    decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
    decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
    decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
    decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
    decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
    decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
    decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
    decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
    decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
}
740
// Runs `encode_single_byte` for every single-byte encoding table; under Miri
// only the first two encodings are checked.
#[test]
fn test_single_byte_encode() {
    encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
    encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
    if cfg!(miri) {
        // Miri is too slow
        return;
    }
    encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
    encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
    encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
    encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
    encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
    encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
    encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
    encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
    encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
    encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
    encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
    encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
    encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
    encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
    encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
    encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
    encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
    encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
    encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
    encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
    encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
    encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
    encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
    encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
    encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
}
775	// END GENERATED CODE
776	}
777