gb18030.rs source code [crates/encoding_rs/src/gb18030.rs]

1	// Copyright Mozilla Foundation. See the COPYRIGHT
2	// file at the top-level directory of this distribution.
3	//
4	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7	// option. This file may not be copied, modified, or distributed
8	// except according to those terms.
9
10	use super::*;
11	use crate::data::*;
12	use crate::gb18030_2022::*;
13	use crate::handles::*;
14	use crate::variant::*;
15	// Rust 1.14.0 requires the following despite the asterisk above.
16	use super::in_inclusive_range16;
17	use super::in_range16;
18
/// Bytes that the decoder has set aside for reprocessing: none, or one to
/// three stored bytes.
enum Gb18030Pending {
    /// Nothing is queued.
    None,
    /// One queued byte.
    One(u8),
    /// Two queued bytes.
    Two(u8, u8),
    /// Three queued bytes.
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` only for the `None` variant.
    fn is_none(&self) -> bool {
        match *self {
            Gb18030Pending::None => true,
            Gb18030Pending::One(..) | Gb18030Pending::Two(..) | Gb18030Pending::Three(..) => false,
        }
    }

    /// Returns how many bytes the variant carries (0 through 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::None => 0,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::Three(..) => 3,
        }
    }
}
43
44	pub struct Gb18030Decoder {
45	first: Option<u8>,
46	second: Option<u8>,
47	third: Option<u8>,
48	pending: Gb18030Pending,
49	pending_ascii: Option<u8>,
50	}
51
52	impl Gb18030Decoder {
53	pub fn new() -> VariantDecoder {
54	VariantDecoder::Gb18030(Gb18030Decoder {
55	first: None,
56	second: None,
57	third: None,
58	pending: Gb18030Pending::None,
59	pending_ascii: None,
60	})
61	}
62
63	pub fn in_neutral_state(&self) -> bool {
64	self.first.is_none()
65	&& self.second.is_none()
66	&& self.third.is_none()
67	&& self.pending.is_none()
68	&& self.pending_ascii.is_none()
69	}
70
71	fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
72	byte_length.checked_add(
73	self.pending.count()
74	+ match self.first {
75	None => `0`,
76	Some(_) => `1`,
77	}
78	+ match self.second {
79	None => `0`,
80	Some(_) => `1`,
81	}
82	+ match self.third {
83	None => `0`,
84	Some(_) => `1`,
85	}
86	+ match self.pending_ascii {
87	None => `0`,
88	Some(_) => `1`,
89	},
90	)
91	}
92
93	pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
94	// ASCII: 1 to 1 (worst case)
95	// gbk: 2 to 1
96	// ranges: 4 to 1 or 4 to 2
97	checked_add(`1`, self.extra_from_state(byte_length))
98	}
99
100	pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
101	// ASCII: 1 to 1
102	// gbk: 2 to 2 or 2 to 3
103	// ranges: 4 to 2, 4 to 3 or 4 to 4
104	// 0x80: 1 to 3 (worst case)
105	self.max_utf8_buffer_length(byte_length)
106	}
107
108	pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
109	checked_add(`1`, checked_mul(`3`, self.extra_from_state(byte_length)))
110	}
111
112	gb18030_decoder_functions!(
113	{
114	// If first is between 0x81 and 0xFE, inclusive,
115	// subtract offset 0x81.
116	let non_ascii_minus_offset = non_ascii.wrapping_sub(`0x81`);
117	if non_ascii_minus_offset > (`0xFE` - `0x81`) {
118	if non_ascii == `0x80` {
119	handle.write_upper_bmp(`0x20ACu16`);
120	continue 'outermost;
121	}
122	return (DecoderResult::Malformed(`1`, `0`),
123	source.consumed(),
124	handle.written());
125	}
126	non_ascii_minus_offset
127	},
128	{
129	// Two-byte (or error)
130	if first_minus_offset >= `0x20` {
131	// Not the gbk ideograph range above GB2312
132	let trail_minus_offset = second.wrapping_sub(`0xA1`);
133	if trail_minus_offset <= (`0xFE` - `0xA1`) {
134	// GB2312
135	let hanzi_lead = first_minus_offset.wrapping_sub(`0x2F`);
136	if hanzi_lead < (`0x77` - `0x2F`) {
137	// Level 1 Hanzi, Level 2 Hanzi
138	// or one of the 5 PUA code
139	// points in between.
140	let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
141	let upper_bmp = GB2312_HANZI[hanzi_pointer];
142	handle.write_upper_bmp(upper_bmp)
143	} else if first_minus_offset == `0x20` {
144	// Symbols (starting with ideographic space)
145	let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
146	handle.write_bmp_excl_ascii(bmp)
147	} else if first_minus_offset == `0x25` && ((trail_minus_offset.wrapping_sub(`63`) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
148	handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(`63`) as usize])
149	} else if first_minus_offset == `0x27` && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
150	handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
151	} else if first_minus_offset > `0x76` {
152	// Bottom PUA
153	let pua = (`0xE234` + mul_94(first_minus_offset - `0x77`) + trail_minus_offset as usize) as u16;
154	handle.write_upper_bmp(pua)
155	} else {
156	let bmp = gb2312_other_decode((mul_94(first_minus_offset - `0x21`) + (trail_minus_offset as usize)) as u16);
157	handle.write_bmp_excl_ascii(bmp)
158	}
159	} else {
160	// gbk range on the left
161	let mut trail_minus_offset = second.wrapping_sub(`0x40`);
162	if trail_minus_offset > (`0x7E` - `0x40`) {
163	let trail_minus_range_start = second.wrapping_sub(`0x80`);
164	if trail_minus_range_start > (`0xA0` - `0x80`) {
165	if second < `0x80` {
166	return (DecoderResult::Malformed(`1`, `0`),
167	unread_handle_second.unread(),
168	handle.written());
169	}
170	return (DecoderResult::Malformed(`2`, `0`),
171	unread_handle_second.consumed(),
172	handle.written());
173	}
174	trail_minus_offset = second - `0x41`;
175	}
176	// Zero-base lead
177	let left_lead = first_minus_offset - `0x20`;
178	let left_pointer = left_lead as usize * (`190` - `94`) +
179	trail_minus_offset as usize;
180	let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((`0x29` - `0x20`) * (`190` - `94`));
181	if gbk_left_ideograph_pointer < (((`0x7D` - `0x29`) * (`190` - `94`)) - `5`) {
182	let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
183	handle.write_upper_bmp(upper_bmp)
184	} else if left_pointer < ((`0x29` - `0x20`) * (`190` - `94`)) {
185	let bmp = gbk_other_decode(left_pointer as u16);
186	handle.write_bmp_excl_ascii(bmp)
187	} else {
188	let bottom_pointer = left_pointer - (((`0x7D` - `0x20`) * (`190` - `94`)) - `5`);
189	let upper_bmp = GBK_BOTTOM[bottom_pointer];
190	handle.write_upper_bmp(upper_bmp)
191	}
192	}
193	} else {
194	// gbk ideograph range above GB2312
195	let mut trail_minus_offset = second.wrapping_sub(`0x40`);
196	if trail_minus_offset > (`0x7E` - `0x40`) {
197	let trail_minus_range_start = second.wrapping_sub(`0x80`);
198	if trail_minus_range_start > (`0xFE` - `0x80`) {
199	if second < `0x80` {
200	return (DecoderResult::Malformed(`1`, `0`),
201	unread_handle_second.unread(),
202	handle.written());
203	}
204	return (DecoderResult::Malformed(`2`, `0`),
205	unread_handle_second.consumed(),
206	handle.written());
207	}
208	trail_minus_offset = second - `0x41`;
209	}
210	let pointer = first_minus_offset as usize * `190usize` +
211	trail_minus_offset as usize;
212	let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
213	handle.write_upper_bmp(upper_bmp)
214	}
215	},
216	{
217	// If third is between 0x81 and 0xFE, inclusive,
218	// subtract offset 0x81.
219	let third_minus_offset = third.wrapping_sub(`0x81`);
220	if third_minus_offset > (`0xFE` - `0x81`) {
221	// We have an error. Let's inline what's going
222	// to happen when `second` is
223	// reprocessed. (`third` gets unread.)
224	// `second` is guaranteed ASCII, so let's
225	// put it in `pending_ascii`. Recompute
226	// `second` from `second_minus_offset`.
227	self.pending_ascii = Some(second_minus_offset + `0x30`);
228	// Now unread `third` and designate the previous
229	// `first` as being in error.
230	return (DecoderResult::Malformed(`1`, `1`),
231	unread_handle_third.unread(),
232	handle.written());
233	}
234	third_minus_offset
235	},
236	{
237	// If fourth is between 0x30 and 0x39, inclusive,
238	// subtract offset 0x30.
239	//
240	// If we have an error, we'll inline what's going
241	// to happen when `second` and `third` are
242	// reprocessed. (`fourth` gets unread.)
243	// `second` is guaranteed ASCII, so let's
244	// put it in `pending_ascii`. Recompute
245	// `second` from `second_minus_offset` to
246	// make this block reusable when `second`
247	// is not in scope.
248	//
249	// `third` is guaranteed to be in the range
250	// that makes it become the new `self.first`.
251	//
252	// `fourth` gets unread and the previous
253	// `first` gets designates as being in error.
254	let fourth_minus_offset = fourth.wrapping_sub(`0x30`);
255	if fourth_minus_offset > (`0x39` - `0x30`) {
256	self.pending_ascii = Some(second_minus_offset + `0x30`);
257	self.pending = Gb18030Pending::One(third_minus_offset);
258	return (DecoderResult::Malformed(`1`, `2`),
259	unread_handle_fourth.unread(),
260	handle.written());
261	}
262	let pointer = (first_minus_offset as usize * (`10` * `126` * `10`)) +
263	(second_minus_offset as usize * (`10` * `126`)) +
264	(third_minus_offset as usize * `10`) +
265	fourth_minus_offset as usize;
266	if pointer <= `39419` {
267	// BMP
268	if pointer == `7457` {
269	handle.write_upper_bmp(`0xE7C7`)
270	} else {
271	handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
272	}
273	} else if pointer >= `189_000` && pointer <= `1_237_575` {
274	// Astral
275	handle.write_astral((pointer - (`189_000usize` - `0x1_0000usize`)) as u32)
276	} else {
277	return (DecoderResult::Malformed(`4`, `0`),
278	unread_handle_fourth.consumed(),
279	handle.written());
280	}
281	},
282	self,
283	non_ascii,
284	first_minus_offset,
285	second,
286	second_minus_offset,
287	unread_handle_second,
288	third,
289	third_minus_offset,
290	unread_handle_third,
291	fourth,
292	fourth_minus_offset,
293	unread_handle_fourth,
294	source,
295	handle,
296	'outermost);
297	}
298
299	// XXX Experiment with inline directives
300	fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
301	// Try ideographic punctuation first as it's the most likely case.
302	// Throwing in the check for full-width currencies and tilde is probably
303	// more size-efficient here than elsewhere.
304	if in_inclusive_range16(bmp, `0x2014`, `0x3017`) \|\| in_inclusive_range16(bmp, `0xFF04`, `0xFFE1`) {
305	if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
306	return Some((`0xA1`, pos + `0xA1`));
307	}
308	}
309	// Ext A
310	if in_range16(bmp, `0x3400`, `0x4E00`) {
311	return position(&GBK_BOTTOM[`21`..`100`], bmp).map(\|pos\| {
312	(
313	`0xFE`,
314	pos + if pos < (`0x3F` - `16`) {
315	`0x40` + `16`
316	} else {
317	`0x41` + `16`
318	},
319	)
320	});
321	}
322	// Compatibility ideographs
323	if in_range16(bmp, `0xF900`, `0xFB00`) {
324	return position(&GBK_BOTTOM[`0`..`21`], bmp).map(\|pos\| {
325	if pos < `5` {
326	// end of second to last row
327	(`0xFD`, pos + (`190` - `94` - `5` + `0x41`))
328	} else {
329	// last row
330	(`0xFE`, pos + (`0x40` - `5`))
331	}
332	});
333	}
334	// Handle everything below U+02CA, which is in GBK_OTHER.
335	if bmp < `0x02CA` {
336	if in_range16(bmp, `0x00E0`, `0x0262`) && bmp != `0x00F7` {
337	// Pinyin except U+1E3F
338	if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
339	return Some((`0xA8`, pos + `0xA1`));
340	}
341	} else if in_inclusive_range16(bmp, `0x00A4`, `0x00F7`)
342	\|\| in_inclusive_range16(bmp, `0x02C7`, `0x02C9`)
343	{
344	// Diacritics and Latin 1 symbols
345	if let Some(pos) = position(&GB2312_SYMBOLS[`3`..(`0xAC` - `0x60`)], bmp) {
346	return Some((`0xA1`, pos + `0xA1` + `3`));
347	}
348	}
349	return None;
350	}
351
352	if in_inclusive_range16(bmp, `0xE78D`, `0xE864`) {
353	// The array is sorted but short, so let's do linear search.
354	if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) {
355	let pair = &GB18030_2022_OVERRIDE_BYTES[pos];
356	return Some((pair[`0`].into(), pair[`1`].into()));
357	}
358	} else if bmp >= `0xFE17` {
359	// Various brackets, all in full-width regions
360	if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
361	return Some((`0xA6`, pos + (`0x9F` - `0x60` + `0xA1`)));
362	}
363	} else if bmp == `0x1E3F` {
364	// The one Pinyin placed elsewhere on the BMP
365	return Some((`0xA8`, `0x7B` - `0x60` + `0xA1`));
366	} else if in_range16(bmp, `0xA000`, `0xD800`) {
367	// Since Korean has usage in China, let's spend a branch to fast-track
368	// Hangul.
369	return None;
370	}
371	// GB2312 other (except bottom PUA and PUA between Hanzi levels).
372	if let Some(other_pointer) = gb2312_other_encode(bmp) {
373	let other_lead = other_pointer as usize / `94`;
374	let other_trail = other_pointer as usize % `94`;
375	return Some((`0xA2` + other_lead, `0xA1` + other_trail));
376	}
377	// At this point, we've handled all mappable characters above U+02D9 but
378	// below U+2010. Let's check for that range in order to let lower BMP
379	// characters used for minority languages in China avoid the subsequent
380	// search that deals mainly with various symbols.
381	if in_range16(bmp, `0x02DA`, `0x2010`) {
382	return None;
383	}
384	// GBK other (except radicals and PUA in GBK_BOTTOM).
385	if let Some(other_pointer) = gbk_other_encode(bmp) {
386	let other_lead = other_pointer as usize / (`190` - `94`);
387	let other_trail = other_pointer as usize % (`190` - `94`);
388	let offset = if other_trail < `0x3F` { `0x40` } else { `0x41` };
389	return Some((other_lead + (`0x81` + `0x20`), other_trail + offset));
390	}
391	// CJK Radicals Supplement, PUA, and U+9FBx ideographs in GBK_BOTTOM
392	if in_inclusive_range16(bmp, `0x2E81`, `0x2ECA`)
393	\|\| in_inclusive_range16(bmp, `0x9FB4`, `0x9FBB`)
394	\|\| in_inclusive_range16(bmp, `0xE816`, `0xE855`)
395	{
396	if let Some(pos) = position(&GBK_BOTTOM[`21`..], bmp) {
397	let trail = pos + `16`;
398	let offset = if trail < `0x3F` { `0x40` } else { `0x41` };
399	return Some((`0xFE`, trail + offset));
400	}
401	}
402	// GB2312 bottom PUA
403	let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(`0xE234`);
404	if bmp_minus_gb2312_bottom_pua <= (`0xE4C5` - `0xE234`) {
405	let pua_lead = bmp_minus_gb2312_bottom_pua as usize / `94`;
406	let pua_trail = bmp_minus_gb2312_bottom_pua as usize % `94`;
407	return Some((`0x81` + `0x77` + pua_lead, `0xA1` + pua_trail));
408	}
409	// PUA between Hanzi Levels
410	let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(`0xE810`);
411	if bmp_minus_pua_between_hanzi < `5` {
412	return Some((`0x81` + `0x56`, `0xFF` - `5` + bmp_minus_pua_between_hanzi as usize));
413	}
414	None
415	}
416
417	#[cfg(not(feature = "fast-gb-hanzi-encode"))]
418	#[inline(always)]
419	fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
420	if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
421	(lead, trail)
422	} else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
423	let hanzi_lead = (hanzi_pointer / `94`) + (`0xD8`);
424	let hanzi_trail = (hanzi_pointer % `94`) + `0xA1`;
425	(hanzi_lead as u8, hanzi_trail as u8)
426	} else {
427	let (lead, gbk_trail) = if bmp < `0x72DC` {
428	// Above GB2312
429	let pointer = gbk_top_ideograph_encode(bmp) as usize;
430	let lead = (pointer / `190`) + `0x81`;
431	let gbk_trail = pointer % `190`;
432	(lead, gbk_trail)
433	} else {
434	// To the left of GB2312
435	let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
436	let lead = (gbk_left_ideograph_pointer / (`190` - `94`)) + (`0x81` + `0x29`);
437	let gbk_trail = gbk_left_ideograph_pointer % (`190` - `94`);
438	(lead, gbk_trail)
439	};
440	let offset = if gbk_trail < `0x3F` { `0x40` } else { `0x41` };
441	(lead as u8, (gbk_trail + offset) as u8)
442	}
443	}
444
/// Encodes a CJK Unified Ideograph as a two-byte GBK sequence by delegating
/// to `gbk_hanzi_encode`.
///
/// The first argument is ignored by this variant; the second is the code
/// point minus U+4E00, as computed by the caller.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    let (lead, trail) = gbk_hanzi_encode(bmp_minus_unified_start);
    (lead, trail)
}
450
/// Encoder shared by the gb18030 and GBK encodings.
pub struct Gb18030Encoder {
    /// When `true`, code points without a two-byte mapping are written as
    /// four-byte sequences. When `false`, they are reported as unmappable and
    /// U+20AC is written as the single byte 0x80.
    extended: bool,
}
454
455	impl Gb18030Encoder {
456	pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
457	Encoder::new(
458	encoding,
459	VariantEncoder::Gb18030(Gb18030Encoder {
460	extended: extended_range,
461	}),
462	)
463	}
464
465	pub fn max_buffer_length_from_utf16_without_replacement(
466	&self,
467	u16_length: usize,
468	) -> Option<usize> {
469	if self.extended {
470	u16_length.checked_mul(`4`)
471	} else {
472	// Need to add, because space check is done with the four-byte
473	// assumption.
474	checked_add(`2`, u16_length.checked_mul(`2`))
475	}
476	}
477
478	pub fn max_buffer_length_from_utf8_without_replacement(
479	&self,
480	byte_length: usize,
481	) -> Option<usize> {
482	if self.extended {
483	// 1 to 1
484	// 2 to 2
485	// 3 to 2
486	// 2 to 4 (worst)
487	// 3 to 4
488	// 4 to 4
489	checked_add(`2`, byte_length.checked_mul(`2`))
490	} else {
491	// 1 to 1
492	// 2 to 2
493	// 3 to 2
494	// Need to add, because space check is done with the four-byte
495	// assumption.
496	byte_length.checked_add(`3`)
497	}
498	}
499
500	ascii_compatible_encoder_functions!(
501	{
502	let bmp_minus_unified_start = bmp.wrapping_sub(`0x4E00`);
503	if bmp_minus_unified_start < (`0x9FA6` - `0x4E00`) {
504	// CJK Unified Ideographs
505	// Can't fail now, since all are
506	// mapped.
507	let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
508	handle.write_two(lead, trail)
509	} else if bmp == `0xE5E5` {
510	// It's not optimal to check for the unmappable
511	// and for euro at this stage, but getting
512	// the out of the way makes the rest of the
513	// code less messy.
514	return (
515	EncoderResult::unmappable_from_bmp(bmp),
516	source.consumed(),
517	handle.written(),
518	);
519	} else if bmp == `0x20AC` && !self.extended {
520	handle.write_one(`0x80u8`)
521	} else {
522	match gbk_encode_non_unified(bmp) {
523	Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
524	None => {
525	if !self.extended {
526	return (
527	EncoderResult::unmappable_from_bmp(bmp),
528	source.consumed(),
529	handle.written(),
530	);
531	}
532	let range_pointer = gb18030_range_encode(bmp);
533	let first = range_pointer / (`10` * `126` * `10`);
534	let rem_first = range_pointer % (`10` * `126` * `10`);
535	let second = rem_first / (`10` * `126`);
536	let rem_second = rem_first % (`10` * `126`);
537	let third = rem_second / `10`;
538	let fourth = rem_second % `10`;
539	handle.write_four(
540	(first + `0x81`) as u8,
541	(second + `0x30`) as u8,
542	(third + `0x81`) as u8,
543	(fourth + `0x30`) as u8,
544	)
545	}
546	}
547	}
548	},
549	{
550	if !self.extended {
551	return (
552	EncoderResult::Unmappable(astral),
553	source.consumed(),
554	handle.written(),
555	);
556	}
557	let range_pointer = astral as usize + (`189_000usize` - `0x1_0000usize`);
558	let first = range_pointer / (`10` * `126` * `10`);
559	let rem_first = range_pointer % (`10` * `126` * `10`);
560	let second = rem_first / (`10` * `126`);
561	let rem_second = rem_first % (`10` * `126`);
562	let third = rem_second / `10`;
563	let fourth = rem_second % `10`;
564	handle.write_four(
565	(first + `0x81`) as u8,
566	(second + `0x30`) as u8,
567	(third + `0x81`) as u8,
568	(fourth + `0x30`) as u8,
569	)
570	},
571	bmp,
572	astral,
573	self,
574	source,
575	handle,
576	copy_ascii_to_check_space_four,
577	check_space_four,
578	`false`
579	);
580	}
581
582	// Any copyright to the test code below this comment is dedicated to the
583	// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
584
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as gb18030 and checks the result against `expect`.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Encodes `string` as gb18030 and checks the result against `expect`.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Encodes `string` as GBK and checks the result against `expect`.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", "");

        // ASCII
        decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");

        // euro
        decode_gb18030(b"\x80", "\u{20AC}");
        decode_gb18030(b"\xA2\xE3", "\u{20AC}");

        // two bytes
        decode_gb18030(b"\x81\x40", "\u{4E02}");
        decode_gb18030(b"\x81\x7E", "\u{4E8A}");
        decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\x81\x80", "\u{4E90}");
        decode_gb18030(b"\x81\xFE", "\u{4FA2}");
        decode_gb18030(b"\xFE\x40", "\u{FA0C}");
        decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\xFE\x80", "\u{4723}");
        decode_gb18030(b"\xFE\xFE", "\u{E4C5}");

        // Changes between GB18030-2005 and GB18030-2022
        decode_gb18030(b"\xFE\x7E", "\u{9FB9}");
        decode_gb18030(b"\xA6\xDD", "\u{FE14}");

        // These mappings remain in place the GB18030-2005 way despite GB18030-2022
        decode_gb18030(b"\x82\x35\x91\x32", "\u{9FB9}");
        decode_gb18030(b"\x84\x31\x83\x30", "\u{FE14}");

        // The difference from the original GB18030
        decode_gb18030(b"\xA3\xA0", "\u{3000}");
        decode_gb18030(b"\xA1\xA1", "\u{3000}");

        // 0xFF
        decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
        decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b"\xFF\x32\x9A\x33\x00",
            "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
        decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
        decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
        decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
        decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
        decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gb18030("\u{20AC}", b"\xA2\xE3");

        // two bytes
        encode_gb18030("\u{4E02}", b"\x81\x40");
        encode_gb18030("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{4E90}", b"\x81\x80");
            encode_gb18030("\u{4FA2}", b"\x81\xFE");
            encode_gb18030("\u{FA0C}", b"\xFE\x40");
            encode_gb18030("\u{E843}", b"\xFE\x7E");
            encode_gb18030("\u{4723}", b"\xFE\x80");
            encode_gb18030("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original GB18030
        // (U+E5E5 is unmappable, so a numeric character reference is
        // expected.)
        encode_gb18030("\u{E5E5}", b"&#58853;");
        encode_gb18030("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
        encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
            encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
            encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
        }

        // Edge cases
        encode_gb18030("\u{00F7}", b"\xA1\xC2");

        // GB18030-2022
        encode_gb18030("\u{9FB9}", b"\xFE\x7E");
        encode_gb18030("\u{FE14}", b"\xA6\xDD");
        encode_gb18030("\u{E843}", b"\xFE\x7E");
        encode_gb18030("\u{E791}", b"\xA6\xDD");

        // Non-change in GB18030-2022
        encode_gb18030("\u{E817}", b"\xFE\x52");
    }

    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gbk("\u{20AC}", b"\x80");

        // two bytes
        encode_gbk("\u{4E02}", b"\x81\x40");
        encode_gbk("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{4E90}", b"\x81\x80");
            encode_gbk("\u{4FA2}", b"\x81\xFE");
            encode_gbk("\u{FA0C}", b"\xFE\x40");
            encode_gbk("\u{E843}", b"\xFE\x7E");
            encode_gbk("\u{4723}", b"\xFE\x80");
            encode_gbk("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original gb18030
        encode_gbk("\u{E5E5}", b"&#58853;");
        encode_gbk("\u{3000}", b"\xA1\xA1");

        // Four bytes
        // (GBK has no four-byte sequences, so numeric character references
        // are expected instead. Byte string literals must be ASCII-only.)
        encode_gbk("\u{0080}", b"&#128;");
        encode_gbk("\u{E7C7}", b"&#59335;");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{2603}", b"&#9731;");
            encode_gbk("\u{1F4A9}", b"&#128169;");
            encode_gbk("\u{10FFFF}", b"&#1114111;");
        }

        // Edge cases
        encode_gbk("\u{00F7}", b"\xA1\xC2");

        // GB18030-2022
        // These two-byte mappings come from the lookup shared by both
        // variants, so exercise them through the GBK encoder here.
        encode_gbk("\u{9FB9}", b"\xFE\x7E");
        encode_gbk("\u{FE14}", b"\xA6\xDD");
        encode_gbk("\u{E843}", b"\xFE\x7E");
        encode_gbk("\u{E791}", b"\xA6\xDD");

        // Non-change in GB18030-2022
        encode_gbk("\u{E817}", b"\xFE\x52");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            // The buffer is sized by the encoder's own estimate; a two-byte
            // character must fit in it.
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}
804