// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range32;
16
/// State of a Big5 decoder.
///
/// The only state kept between calls is an optional lead byte that has been
/// read but not yet combined with a trail byte.
pub struct Big5Decoder {
    /// The pending lead byte, or `None` when no lead byte is pending.
    lead: Option<u8>,
}

21impl Big5Decoder {
22 pub fn new() -> VariantDecoder {
23 VariantDecoder::Big5(Big5Decoder { lead: None })
24 }
25
26 pub fn in_neutral_state(&self) -> bool {
27 self.lead.is_none()
28 }
29
30 fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
31 byte_length.checked_add(match self.lead {
32 None => 0,
33 Some(_) => 1,
34 })
35 }
36
37 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
38 // If there is a lead but the next byte isn't a valid trail, an
39 // error is generated for the lead (+1). Then another iteration checks
40 // space, which needs +1 to account for the possibility of astral
41 // output or combining pair.
42 checked_add(1, self.plus_one_if_lead(byte_length))
43 }
44
45 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
46 // No need to account for REPLACEMENT CHARACTERS.
47 // Cases:
48 // ASCII: 1 to 1
49 // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4
50 // lead set and first byte is trail: 1 to 4 worst case
51 //
52 // When checking for space for the last byte:
53 // no lead: the last byte must be ASCII (or fatal error): 1 to 1
54 // lead set: space for 4 bytes was already checked when reading the
55 // lead, hence the last lead and the last trail together are worst
56 // case 2 to 4.
57 //
58 // If lead set and the input is a single trail byte, the worst-case
59 // output is 4, so we need to add one before multiplying if lead is
60 // set.
61 //
62 // Finally, add two so that if input is non-zero, the output is at
63 // least 4.
64 checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length)))
65 }
66
67 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
68 // If there is a lead but the next byte isn't a valid trail, an
69 // error is generated for the lead (+(1*3)). Then another iteration
70 // checks space, which needs +3 to account for the possibility of astral
71 // output or combining pair. In between start and end, the worst case
72 // is that every byte is bad: *3.
73 checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length)))
74 }
75
76 ascii_compatible_two_byte_decoder_functions!(
77 {
78 // If lead is between 0x81 and 0xFE, inclusive,
79 // subtract offset 0x81.
80 let non_ascii_minus_offset =
81 non_ascii.wrapping_sub(0x81);
82 if non_ascii_minus_offset > (0xFE - 0x81) {
83 return (DecoderResult::Malformed(1, 0),
84 source.consumed(),
85 handle.written());
86 }
87 non_ascii_minus_offset
88 },
89 {
90 // If trail is between 0x40 and 0x7E, inclusive,
91 // subtract offset 0x40. Else if trail is
92 // between 0xA1 and 0xFE, inclusive, subtract
93 // offset 0x62.
94 // TODO: Find out which range is more probable.
95 let mut trail_minus_offset =
96 byte.wrapping_sub(0x40);
97 if trail_minus_offset > (0x7E - 0x40) {
98 let trail_minus_range_start =
99 byte.wrapping_sub(0xA1);
100 if trail_minus_range_start >
101 (0xFE - 0xA1) {
102 if byte < 0x80 {
103 return (DecoderResult::Malformed(1, 0),
104 unread_handle_trail.unread(),
105 handle.written());
106 }
107 return (DecoderResult::Malformed(2, 0),
108 unread_handle_trail.consumed(),
109 handle.written());
110 }
111 trail_minus_offset = byte - 0x62;
112 }
113 let pointer = lead_minus_offset as usize *
114 157usize +
115 trail_minus_offset as usize;
116 let rebased_pointer = pointer.wrapping_sub(942);
117 let low_bits = big5_low_bits(rebased_pointer);
118 if low_bits == 0 {
119 match pointer {
120 1133 => {
121 handle.write_big5_combination(0x00CAu16,
122 0x0304u16)
123 }
124 1135 => {
125 handle.write_big5_combination(0x00CAu16,
126 0x030Cu16)
127 }
128 1164 => {
129 handle.write_big5_combination(0x00EAu16,
130 0x0304u16)
131 }
132 1166 => {
133 handle.write_big5_combination(0x00EAu16,
134 0x030Cu16)
135 }
136 _ => {
137 if byte < 0x80 {
138 return (DecoderResult::Malformed(1, 0),
139 unread_handle_trail.unread(),
140 handle.written());
141 }
142 return (DecoderResult::Malformed(2, 0),
143 unread_handle_trail.consumed(),
144 handle.written());
145 }
146 }
147 } else if big5_is_astral(rebased_pointer) {
148 handle.write_astral(u32::from(low_bits) |
149 0x20000u32)
150 } else {
151 handle.write_bmp_excl_ascii(low_bits)
152 }
153 },
154 self,
155 non_ascii,
156 byte,
157 lead_minus_offset,
158 unread_handle_trail,
159 source,
160 handle,
161 'outermost,
162 copy_ascii_from_check_space_astral,
163 check_space_astral,
164 false);
165}
166
/// Big5 encoder. It is a unit struct, so it carries no state of its own.
pub struct Big5Encoder;

169impl Big5Encoder {
170 pub fn new(encoding: &'static Encoding) -> Encoder {
171 Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder))
172 }
173
174 pub fn max_buffer_length_from_utf16_without_replacement(
175 &self,
176 u16_length: usize,
177 ) -> Option<usize> {
178 // Astral: 2 to 2
179 // ASCII: 1 to 1
180 // Other: 1 to 2
181 u16_length.checked_mul(2)
182 }
183
184 pub fn max_buffer_length_from_utf8_without_replacement(
185 &self,
186 byte_length: usize,
187 ) -> Option<usize> {
188 // Astral: 4 to 2
189 // Upper BMP: 3 to 2
190 // Lower BMP: 2 to 2
191 // ASCII: 1 to 1
192 byte_length.checked_add(1)
193 }
194
195 ascii_compatible_encoder_functions!(
196 {
197 // For simplicity, unified ideographs
198 // in the pointer range 11206...11212 are handled
199 // as Level 1 Hanzi.
200 if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) {
201 handle.write_two(lead, trail)
202 } else {
203 let pointer = if let Some(pointer) = big5_box_encode(bmp) {
204 pointer
205 } else if let Some(pointer) = big5_other_encode(bmp) {
206 pointer
207 } else {
208 return (
209 EncoderResult::unmappable_from_bmp(bmp),
210 source.consumed(),
211 handle.written(),
212 );
213 };
214 let lead = pointer / 157 + 0x81;
215 let remainder = pointer % 157;
216 let trail = if remainder < 0x3F {
217 remainder + 0x40
218 } else {
219 remainder + 0x62
220 };
221 handle.write_two(lead as u8, trail as u8)
222 }
223 },
224 {
225 if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) {
226 if let Some(rebased_pointer) = big5_astral_encode(astral as u16) {
227 // big5_astral_encode returns rebased pointer,
228 // so adding 0x87 instead of 0x81.
229 let lead = rebased_pointer / 157 + 0x87;
230 let remainder = rebased_pointer % 157;
231 let trail = if remainder < 0x3F {
232 remainder + 0x40
233 } else {
234 remainder + 0x62
235 };
236 handle.write_two(lead as u8, trail as u8)
237 } else {
238 return (
239 EncoderResult::Unmappable(astral),
240 source.consumed(),
241 handle.written(),
242 );
243 }
244 } else {
245 return (
246 EncoderResult::Unmappable(astral),
247 source.consumed(),
248 handle.written(),
249 );
250 }
251 },
252 bmp,
253 astral,
254 self,
255 source,
256 handle,
257 copy_ascii_to_check_space_two,
258 check_space_two,
259 false
260 );
261}
262
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Big5 and checks the result against `expect`
    /// using the shared `decode` test helper.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Encodes `string` as Big5 and checks the result against `expect`
    /// using the shared `encode` test helper.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // Empty
        decode_big5(b"", &"");

        // ASCII
        decode_big5(&[0x61u8, 0x62u8], &"\u{0061}\u{0062}");

        // Edge cases
        decode_big5(&[0x87u8, 0x40u8], &"\u{43F0}");
        decode_big5(&[0xFEu8, 0xFEu8], &"\u{79D4}");
        decode_big5(&[0xFEu8, 0xFDu8], &"\u{2910D}");
        decode_big5(&[0x88u8, 0x62u8], &"\u{00CA}\u{0304}");
        decode_big5(&[0x88u8, 0x64u8], &"\u{00CA}\u{030C}");
        decode_big5(&[0x88u8, 0x66u8], &"\u{00CA}");
        decode_big5(&[0x88u8, 0xA3u8], &"\u{00EA}\u{0304}");
        decode_big5(&[0x88u8, 0xA5u8], &"\u{00EA}\u{030C}");
        decode_big5(&[0x88u8, 0xA7u8], &"\u{00EA}");
        decode_big5(&[0x99u8, 0xD4u8], &"\u{8991}");
        decode_big5(&[0x99u8, 0xD5u8], &"\u{27967}");
        decode_big5(&[0x99u8, 0xD6u8], &"\u{8A29}");

        // Edge cases surrounded with ASCII
        decode_big5(
            &[0x61u8, 0x87u8, 0x40u8, 0x62u8],
            &"\u{0061}\u{43F0}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8],
            &"\u{0061}\u{79D4}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8],
            &"\u{0061}\u{2910D}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x62u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x64u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x66u8, 0x62u8],
            &"\u{0061}\u{00CA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA3u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA5u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA7u8, 0x62u8],
            &"\u{0061}\u{00EA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD4u8, 0x62u8],
            &"\u{0061}\u{8991}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD5u8, 0x62u8],
            &"\u{0061}\u{27967}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD6u8, 0x62u8],
            &"\u{0061}\u{8A29}\u{0062}",
        );

        // Bad sequences
        decode_big5(&[0x80u8, 0x61u8], &"\u{FFFD}\u{0061}");
        decode_big5(&[0xFFu8, 0x61u8], &"\u{FFFD}\u{0061}");
        decode_big5(&[0xFEu8, 0x39u8], &"\u{FFFD}\u{0039}");
        decode_big5(&[0x87u8, 0x66u8], &"\u{FFFD}\u{0066}");
        decode_big5(&[0x81u8, 0x40u8], &"\u{FFFD}\u{0040}");
        decode_big5(&[0x61u8, 0x81u8], &"\u{0061}\u{FFFD}");
    }

    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        if !cfg!(miri) {
            // Miri is too slow
            // Edge cases
            encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
            encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
            encode_big5("\u{3000}", b"\xA1\x40");
            encode_big5("\u{20AC}", b"\xA3\xE1");
            encode_big5("\u{4E00}", b"\xA4\x40");
            encode_big5("\u{27607}", b"\xC8\xA4");
            encode_big5("\u{FFE2}", b"\xC8\xCD");
            encode_big5("\u{79D4}", b"\xFE\xFE");

            // Not in index
            encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
        }

        // duplicate low bits
        encode_big5("\u{203B5}", b"\xFD\x6A");
        encode_big5("\u{25605}", b"\xFE\x46");

        // prefer last
        encode_big5("\u{2550}", b"\xF9\xF9");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let (cow, encoding, had_errors) = BIG5.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        // Two unpaired low surrogates are each expected to come out as the
        // numeric character reference for U+FFFD.
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
