// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

use super::*;
use crate::data::*;
use crate::handles::*;
use crate::variant::*;
// Rust 1.14.0 requires the following despite the asterisk above.
use super::in_inclusive_range32;

/// Decoder state for Big5.
///
/// `lead` is `None` when the decoder is in its neutral state (see
/// `in_neutral_state`). Presumably it holds a lead byte that was read at the
/// end of one input buffer and still awaits its trail byte; the code that
/// stores it lives in the decoder macro, outside this file.
pub struct Big5Decoder {
    lead: Option<u8>,
}

21	impl Big5Decoder {
22	pub fn new() -> VariantDecoder {
23	VariantDecoder::Big5(Big5Decoder { lead: None })
24	}
25
26	pub fn in_neutral_state(&self) -> bool {
27	self.lead.is_none()
28	}
29
30	fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
31	byte_length.checked_add(match self.lead {
32	None => `0`,
33	Some(_) => `1`,
34	})
35	}
36
37	pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
38	// If there is a lead but the next byte isn't a valid trail, an
39	// error is generated for the lead (+1). Then another iteration checks
40	// space, which needs +1 to account for the possibility of astral
41	// output or combining pair.
42	checked_add(`1`, self.plus_one_if_lead(byte_length))
43	}
44
45	pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
46	// No need to account for REPLACEMENT CHARACTERS.
47	// Cases:
48	// ASCII: 1 to 1
49	// Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
50	// lead set and first byte is trail: 1 to 4 worst case
51	//
52	// When checking for space for the last byte:
53	// no lead: the last byte must be ASCII (or fatal error): 1 to 1
54	// lead set: space for 4 bytes was already checked when reading the
55	// lead, hence the last lead and the last trail together are worst
56	// case 2 to 4.
57	//
58	// If lead set and the input is a single trail byte, the worst-case
59	// output is 4, so we need to add one before multiplying if lead is
60	// set.
61	//
62	// Finally, add two so that if input is non-zero, the output is at
63	// least 4.
64	checked_add(`2`, checked_mul(`2`, self.plus_one_if_lead(byte_length)))
65	}
66
67	pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
68	// If there is a lead but the next byte isn't a valid trail, an
69	// error is generated for the lead (+(13)). Then another iteration*
70	// checks space, which needs +3 to account for the possibility of astral
71	// output or combining pair. In between start and end, the worst case
72	// is that every byte is bad: 3.*
73	checked_add(`3`, checked_mul(`3`, self.plus_one_if_lead(byte_length)))
74	}
75
76	ascii_compatible_two_byte_decoder_functions!(
77	{
78	// If lead is between 0x81 and 0xFE, inclusive,
79	// subtract offset 0x81.
80	let non_ascii_minus_offset =
81	non_ascii.wrapping_sub(`0x81`);
82	if non_ascii_minus_offset > (`0xFE` - `0x81`) {
83	return (DecoderResult::Malformed(`1`, `0`),
84	source.consumed(),
85	handle.written());
86	}
87	non_ascii_minus_offset
88	},
89	{
90	// If trail is between 0x40 and 0x7E, inclusive,
91	// subtract offset 0x40. Else if trail is
92	// between 0xA1 and 0xFE, inclusive, subtract
93	// offset 0x62.
94	// TODO: Find out which range is more probable.
95	let mut trail_minus_offset =
96	byte.wrapping_sub(`0x40`);
97	if trail_minus_offset > (`0x7E` - `0x40`) {
98	let trail_minus_range_start =
99	byte.wrapping_sub(`0xA1`);
100	if trail_minus_range_start >
101	(`0xFE` - `0xA1`) {
102	if byte < `0x80` {
103	return (DecoderResult::Malformed(`1`, `0`),
104	unread_handle_trail.unread(),
105	handle.written());
106	}
107	return (DecoderResult::Malformed(`2`, `0`),
108	unread_handle_trail.consumed(),
109	handle.written());
110	}
111	trail_minus_offset = byte - `0x62`;
112	}
113	let pointer = lead_minus_offset as usize *
114	`157usize` +
115	trail_minus_offset as usize;
116	let rebased_pointer = pointer.wrapping_sub(`942`);
117	let low_bits = big5_low_bits(rebased_pointer);
118	if low_bits == `0` {
119	match pointer {
120	`1133` => {
121	handle.write_big5_combination(`0x00CAu16`,
122	`0x0304u16`)
123	}
124	`1135` => {
125	handle.write_big5_combination(`0x00CAu16`,
126	`0x030Cu16`)
127	}
128	`1164` => {
129	handle.write_big5_combination(`0x00EAu16`,
130	`0x0304u16`)
131	}
132	`1166` => {
133	handle.write_big5_combination(`0x00EAu16`,
134	`0x030Cu16`)
135	}
136	_ => {
137	if byte < `0x80` {
138	return (DecoderResult::Malformed(`1`, `0`),
139	unread_handle_trail.unread(),
140	handle.written());
141	}
142	return (DecoderResult::Malformed(`2`, `0`),
143	unread_handle_trail.consumed(),
144	handle.written());
145	}
146	}
147	} else if big5_is_astral(rebased_pointer) {
148	handle.write_astral(u32::from(low_bits) \|
149	`0x20000u32`)
150	} else {
151	handle.write_bmp_excl_ascii(low_bits)
152	}
153	},
154	self,
155	non_ascii,
156	byte,
157	lead_minus_offset,
158	unread_handle_trail,
159	source,
160	handle,
161	'outermost,
162	copy_ascii_from_check_space_astral,
163	check_space_astral,
164	`false`);
165	}
166
/// Encoder for Big5.
///
/// A unit struct: it has no fields, so the encoder carries no state between
/// calls.
pub struct Big5Encoder;

169	impl Big5Encoder {
170	pub fn new(encoding: &'static Encoding) -> Encoder {
171	Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
172	}
173
174	pub fn max_buffer_length_from_utf16_without_replacement(
175	&self,
176	u16_length: usize,
177	) -> Option<usize> {
178	// Astral: 2 to 2
179	// ASCII: 1 to 1
180	// Other: 1 to 2
181	u16_length.checked_mul(`2`)
182	}
183
184	pub fn max_buffer_length_from_utf8_without_replacement(
185	&self,
186	byte_length: usize,
187	) -> Option<usize> {
188	// Astral: 4 to 2
189	// Upper BMP: 3 to 2
190	// Lower BMP: 2 to 2
191	// ASCII: 1 to 1
192	byte_length.checked_add(`1`)
193	}
194
195	ascii_compatible_encoder_functions!(
196	{
197	// For simplicity, unified ideographs
198	// in the pointer range 11206...11212 are handled
199	// as Level 1 Hanzi.
200	if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
201	handle.write_two(lead, trail)
202	} else {
203	let pointer = if let Some(pointer) = big5_box_encode(bmp) {
204	pointer
205	} else if let Some(pointer) = big5_other_encode(bmp) {
206	pointer
207	} else {
208	return (
209	EncoderResult::unmappable_from_bmp(bmp),
210	source.consumed(),
211	handle.written(),
212	);
213	};
214	let lead = pointer / `157` + `0x81`;
215	let remainder = pointer % `157`;
216	let trail = if remainder < `0x3F` {
217	remainder + `0x40`
218	} else {
219	remainder + `0x62`
220	};
221	handle.write_two(lead as u8, trail as u8)
222	}
223	},
224	{
225	if in_inclusive_range32(astral as u32, `0x2008A`, `0x2F8A6`) {
226	if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
227	// big5_astral_encode returns rebased pointer,
228	// so adding 0x87 instead of 0x81.
229	let lead = rebased_pointer / `157` + `0x87`;
230	let remainder = rebased_pointer % `157`;
231	let trail = if remainder < `0x3F` {
232	remainder + `0x40`
233	} else {
234	remainder + `0x62`
235	};
236	handle.write_two(lead as u8, trail as u8)
237	} else {
238	return (
239	EncoderResult::Unmappable(astral),
240	source.consumed(),
241	handle.written(),
242	);
243	}
244	} else {
245	return (
246	EncoderResult::Unmappable(astral),
247	source.consumed(),
248	handle.written(),
249	);
250	}
251	},
252	bmp,
253	astral,
254	self,
255	source,
256	handle,
257	copy_ascii_to_check_space_two,
258	check_space_two,
259	`false`
260	);
261	}
262
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Big5 and checks the result against `expect`.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Encodes `string` to Big5 and checks the result against `expect`.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // Empty
        decode_big5(b"", &"");

        // ASCII
        decode_big5(&[0x61u8, 0x62u8], &"\u{0061}\u{0062}");

        // Edge cases
        decode_big5(&[0x87u8, 0x40u8], &"\u{43F0}");
        decode_big5(&[0xFEu8, 0xFEu8], &"\u{79D4}");
        decode_big5(&[0xFEu8, 0xFDu8], &"\u{2910D}");
        decode_big5(&[0x88u8, 0x62u8], &"\u{00CA}\u{0304}");
        decode_big5(&[0x88u8, 0x64u8], &"\u{00CA}\u{030C}");
        decode_big5(&[0x88u8, 0x66u8], &"\u{00CA}");
        decode_big5(&[0x88u8, 0xA3u8], &"\u{00EA}\u{0304}");
        decode_big5(&[0x88u8, 0xA5u8], &"\u{00EA}\u{030C}");
        decode_big5(&[0x88u8, 0xA7u8], &"\u{00EA}");
        decode_big5(&[0x99u8, 0xD4u8], &"\u{8991}");
        decode_big5(&[0x99u8, 0xD5u8], &"\u{27967}");
        decode_big5(&[0x99u8, 0xD6u8], &"\u{8A29}");

        // Edge cases surrounded with ASCII
        decode_big5(
            &[0x61u8, 0x87u8, 0x40u8, 0x62u8],
            &"\u{0061}\u{43F0}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8],
            &"\u{0061}\u{79D4}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8],
            &"\u{0061}\u{2910D}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x62u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x64u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x66u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA3u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA5u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA7u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD4u8, 0x62u8],
            &"\u{0061}\u{8991}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD5u8, 0x62u8],
            &"\u{0061}\u{27967}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD6u8, 0x62u8],
            &"\u{0061}\u{8A29}\u{0062}",
        );

        // Bad sequences
        decode_big5(&[0x80u8, 0x61u8], &"\u{FFFD}\u{0061}");
        decode_big5(&[0xFFu8, 0x61u8], &"\u{FFFD}\u{0061}");
        decode_big5(&[0xFEu8, 0x39u8], &"\u{FFFD}\u{0039}");
        decode_big5(&[0x87u8, 0x66u8], &"\u{FFFD}\u{0066}");
        decode_big5(&[0x81u8, 0x40u8], &"\u{FFFD}\u{0040}");
        decode_big5(&[0x61u8, 0x81u8], &"\u{0061}\u{FFFD}");
    }

    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        if !cfg!(miri) {
            // Miri is too slow
            // Edge cases
            // Unmappable characters are expected as decimal numeric
            // character references (0x9EA6 == 40614, 0x2626B == 156267).
            encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
            encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
            encode_big5("\u{3000}", b"\xA1\x40");
            encode_big5("\u{20AC}", b"\xA3\xE1");
            encode_big5("\u{4E00}", b"\xA4\x40");
            encode_big5("\u{27607}", b"\xC8\xA4");
            encode_big5("\u{FFE2}", b"\xC8\xCD");
            encode_big5("\u{79D4}", b"\xFE\xFE");

            // Not in index (0x2603 == 9731)
            encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
        }

        // duplicate low bits
        encode_big5("\u{203B5}", b"\xFD\x6A");
        encode_big5("\u{25605}", b"\xFE\x46");

        // prefer last
        encode_big5("\u{2550}", b"\xF9\xF9");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let (cow, encoding, had_errors) = BIG5.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        // The expectation is the numeric character reference for U+FFFD
        // (65533), once per lone low surrogate in the input.
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}