1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use super::*; |
11 | use crate::data::*; |
12 | use crate::handles::*; |
13 | use crate::variant::*; |
14 | // Rust 1.14.0 requires the following despite the asterisk above. |
15 | use super::in_inclusive_range32; |
16 | |
/// Decoder state for the Big5 encoding.
///
/// Instances are created through `Big5Decoder::new`, which wraps the value in
/// `VariantDecoder::Big5`.
pub struct Big5Decoder {
    /// Pending state of a two-byte sequence whose second byte has not been
    /// processed yet. `None` means the decoder is in its neutral state.
    // NOTE(review): the value is written by the decode macro, which is not
    // visible in this file; presumably it is the lead byte or its
    // offset-adjusted form. Confirm against the macro definition.
    lead: Option<u8>,
}
20 | |
21 | impl Big5Decoder { |
22 | pub fn new() -> VariantDecoder { |
23 | VariantDecoder::Big5(Big5Decoder { lead: None }) |
24 | } |
25 | |
26 | pub fn in_neutral_state(&self) -> bool { |
27 | self.lead.is_none() |
28 | } |
29 | |
30 | fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> { |
31 | byte_length.checked_add(match self.lead { |
32 | None => 0, |
33 | Some(_) => 1, |
34 | }) |
35 | } |
36 | |
37 | pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
38 | // If there is a lead but the next byte isn't a valid trail, an |
39 | // error is generated for the lead (+1). Then another iteration checks |
40 | // space, which needs +1 to account for the possibility of astral |
41 | // output or combining pair. |
42 | checked_add(1, self.plus_one_if_lead(byte_length)) |
43 | } |
44 | |
45 | pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
46 | // No need to account for REPLACEMENT CHARACTERS. |
47 | // Cases: |
48 | // ASCII: 1 to 1 |
49 | // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4 |
50 | // lead set and first byte is trail: 1 to 4 worst case |
51 | // |
52 | // When checking for space for the last byte: |
53 | // no lead: the last byte must be ASCII (or fatal error): 1 to 1 |
54 | // lead set: space for 4 bytes was already checked when reading the |
55 | // lead, hence the last lead and the last trail together are worst |
56 | // case 2 to 4. |
57 | // |
58 | // If lead set and the input is a single trail byte, the worst-case |
59 | // output is 4, so we need to add one before multiplying if lead is |
60 | // set. |
61 | // |
62 | // Finally, add two so that if input is non-zero, the output is at |
63 | // least 4. |
64 | checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length))) |
65 | } |
66 | |
67 | pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
68 | // If there is a lead but the next byte isn't a valid trail, an |
69 | // error is generated for the lead (+(1*3)). Then another iteration |
70 | // checks space, which needs +3 to account for the possibility of astral |
71 | // output or combining pair. In between start and end, the worst case |
72 | // is that every byte is bad: *3. |
73 | checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length))) |
74 | } |
75 | |
76 | ascii_compatible_two_byte_decoder_functions!( |
77 | { |
78 | // If lead is between 0x81 and 0xFE, inclusive, |
79 | // subtract offset 0x81. |
80 | let non_ascii_minus_offset = |
81 | non_ascii.wrapping_sub(0x81); |
82 | if non_ascii_minus_offset > (0xFE - 0x81) { |
83 | return (DecoderResult::Malformed(1, 0), |
84 | source.consumed(), |
85 | handle.written()); |
86 | } |
87 | non_ascii_minus_offset |
88 | }, |
89 | { |
90 | // If trail is between 0x40 and 0x7E, inclusive, |
91 | // subtract offset 0x40. Else if trail is |
92 | // between 0xA1 and 0xFE, inclusive, subtract |
93 | // offset 0x62. |
94 | // TODO: Find out which range is more probable. |
95 | let mut trail_minus_offset = |
96 | byte.wrapping_sub(0x40); |
97 | if trail_minus_offset > (0x7E - 0x40) { |
98 | let trail_minus_range_start = |
99 | byte.wrapping_sub(0xA1); |
100 | if trail_minus_range_start > |
101 | (0xFE - 0xA1) { |
102 | if byte < 0x80 { |
103 | return (DecoderResult::Malformed(1, 0), |
104 | unread_handle_trail.unread(), |
105 | handle.written()); |
106 | } |
107 | return (DecoderResult::Malformed(2, 0), |
108 | unread_handle_trail.consumed(), |
109 | handle.written()); |
110 | } |
111 | trail_minus_offset = byte - 0x62; |
112 | } |
113 | let pointer = lead_minus_offset as usize * |
114 | 157usize + |
115 | trail_minus_offset as usize; |
116 | let rebased_pointer = pointer.wrapping_sub(942); |
117 | let low_bits = big5_low_bits(rebased_pointer); |
118 | if low_bits == 0 { |
119 | match pointer { |
120 | 1133 => { |
121 | handle.write_big5_combination(0x00CAu16, |
122 | 0x0304u16) |
123 | } |
124 | 1135 => { |
125 | handle.write_big5_combination(0x00CAu16, |
126 | 0x030Cu16) |
127 | } |
128 | 1164 => { |
129 | handle.write_big5_combination(0x00EAu16, |
130 | 0x0304u16) |
131 | } |
132 | 1166 => { |
133 | handle.write_big5_combination(0x00EAu16, |
134 | 0x030Cu16) |
135 | } |
136 | _ => { |
137 | if byte < 0x80 { |
138 | return (DecoderResult::Malformed(1, 0), |
139 | unread_handle_trail.unread(), |
140 | handle.written()); |
141 | } |
142 | return (DecoderResult::Malformed(2, 0), |
143 | unread_handle_trail.consumed(), |
144 | handle.written()); |
145 | } |
146 | } |
147 | } else if big5_is_astral(rebased_pointer) { |
148 | handle.write_astral(u32::from(low_bits) | |
149 | 0x20000u32) |
150 | } else { |
151 | handle.write_bmp_excl_ascii(low_bits) |
152 | } |
153 | }, |
154 | self, |
155 | non_ascii, |
156 | byte, |
157 | lead_minus_offset, |
158 | unread_handle_trail, |
159 | source, |
160 | handle, |
161 | 'outermost, |
162 | copy_ascii_from_check_space_astral, |
163 | check_space_astral, |
164 | false); |
165 | } |
166 | |
/// Encoder for the Big5 encoding.
///
/// This is a unit struct with no fields. `Big5Encoder::new` wraps it in
/// `VariantEncoder::Big5`.
pub struct Big5Encoder;
168 | |
169 | impl Big5Encoder { |
170 | pub fn new(encoding: &'static Encoding) -> Encoder { |
171 | Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder)) |
172 | } |
173 | |
174 | pub fn max_buffer_length_from_utf16_without_replacement( |
175 | &self, |
176 | u16_length: usize, |
177 | ) -> Option<usize> { |
178 | // Astral: 2 to 2 |
179 | // ASCII: 1 to 1 |
180 | // Other: 1 to 2 |
181 | u16_length.checked_mul(2) |
182 | } |
183 | |
184 | pub fn max_buffer_length_from_utf8_without_replacement( |
185 | &self, |
186 | byte_length: usize, |
187 | ) -> Option<usize> { |
188 | // Astral: 4 to 2 |
189 | // Upper BMP: 3 to 2 |
190 | // Lower BMP: 2 to 2 |
191 | // ASCII: 1 to 1 |
192 | byte_length.checked_add(1) |
193 | } |
194 | |
195 | ascii_compatible_encoder_functions!( |
196 | { |
197 | // For simplicity, unified ideographs |
198 | // in the pointer range 11206...11212 are handled |
199 | // as Level 1 Hanzi. |
200 | if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) { |
201 | handle.write_two(lead, trail) |
202 | } else { |
203 | let pointer = if let Some(pointer) = big5_box_encode(bmp) { |
204 | pointer |
205 | } else if let Some(pointer) = big5_other_encode(bmp) { |
206 | pointer |
207 | } else { |
208 | return ( |
209 | EncoderResult::unmappable_from_bmp(bmp), |
210 | source.consumed(), |
211 | handle.written(), |
212 | ); |
213 | }; |
214 | let lead = pointer / 157 + 0x81; |
215 | let remainder = pointer % 157; |
216 | let trail = if remainder < 0x3F { |
217 | remainder + 0x40 |
218 | } else { |
219 | remainder + 0x62 |
220 | }; |
221 | handle.write_two(lead as u8, trail as u8) |
222 | } |
223 | }, |
224 | { |
225 | if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) { |
226 | if let Some(rebased_pointer) = big5_astral_encode(astral as u16) { |
227 | // big5_astral_encode returns rebased pointer, |
228 | // so adding 0x87 instead of 0x81. |
229 | let lead = rebased_pointer / 157 + 0x87; |
230 | let remainder = rebased_pointer % 157; |
231 | let trail = if remainder < 0x3F { |
232 | remainder + 0x40 |
233 | } else { |
234 | remainder + 0x62 |
235 | }; |
236 | handle.write_two(lead as u8, trail as u8) |
237 | } else { |
238 | return ( |
239 | EncoderResult::Unmappable(astral), |
240 | source.consumed(), |
241 | handle.written(), |
242 | ); |
243 | } |
244 | } else { |
245 | return ( |
246 | EncoderResult::Unmappable(astral), |
247 | source.consumed(), |
248 | handle.written(), |
249 | ); |
250 | } |
251 | }, |
252 | bmp, |
253 | astral, |
254 | self, |
255 | source, |
256 | handle, |
257 | copy_ascii_to_check_space_two, |
258 | check_space_two, |
259 | false |
260 | ); |
261 | } |
262 | |
263 | // Any copyright to the test code below this comment is dedicated to the |
264 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
265 | |
// Unit tests for the Big5 decoder and encoder. Compiled only for test builds
// with the `alloc` feature enabled.
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Big5 via the shared `decode` test helper and
    /// compares the result with `expect`.
    fn decode_big5(bytes: &[u8], expect: &str) {
        decode(BIG5, bytes, expect);
    }

    /// Encodes `string` to Big5 via the shared `encode` test helper and
    /// compares the result with `expect`.
    fn encode_big5(string: &str, expect: &[u8]) {
        encode(BIG5, string, expect);
    }

    #[test]
    fn test_big5_decode() {
        // Empty
        decode_big5(b"", "");

        // ASCII
        decode_big5(&[0x61u8, 0x62u8], "\u{0061}\u{0062}");

        // Edge cases
        decode_big5(&[0x87u8, 0x40u8], "\u{43F0}");
        decode_big5(&[0xFEu8, 0xFEu8], "\u{79D4}");
        decode_big5(&[0xFEu8, 0xFDu8], "\u{2910D}");
        decode_big5(&[0x88u8, 0x62u8], "\u{00CA}\u{0304}");
        decode_big5(&[0x88u8, 0x64u8], "\u{00CA}\u{030C}");
        decode_big5(&[0x88u8, 0x66u8], "\u{00CA}");
        decode_big5(&[0x88u8, 0xA3u8], "\u{00EA}\u{0304}");
        decode_big5(&[0x88u8, 0xA5u8], "\u{00EA}\u{030C}");
        decode_big5(&[0x88u8, 0xA7u8], "\u{00EA}");
        decode_big5(&[0x99u8, 0xD4u8], "\u{8991}");
        decode_big5(&[0x99u8, 0xD5u8], "\u{27967}");
        decode_big5(&[0x99u8, 0xD6u8], "\u{8A29}");

        // Edge cases surrounded with ASCII
        decode_big5(
            &[0x61u8, 0x87u8, 0x40u8, 0x62u8],
            "\u{0061}\u{43F0}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8],
            "\u{0061}\u{79D4}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8],
            "\u{0061}\u{2910D}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x62u8, 0x62u8],
            "\u{0061}\u{00CA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x64u8, 0x62u8],
            "\u{0061}\u{00CA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0x66u8, 0x62u8],
            "\u{0061}\u{00CA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA3u8, 0x62u8],
            "\u{0061}\u{00EA}\u{0304}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA5u8, 0x62u8],
            "\u{0061}\u{00EA}\u{030C}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x88u8, 0xA7u8, 0x62u8],
            "\u{0061}\u{00EA}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD4u8, 0x62u8],
            "\u{0061}\u{8991}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD5u8, 0x62u8],
            "\u{0061}\u{27967}\u{0062}",
        );
        decode_big5(
            &[0x61u8, 0x99u8, 0xD6u8, 0x62u8],
            "\u{0061}\u{8A29}\u{0062}",
        );

        // Bad sequences
        decode_big5(&[0x80u8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFFu8, 0x61u8], "\u{FFFD}\u{0061}");
        decode_big5(&[0xFEu8, 0x39u8], "\u{FFFD}\u{0039}");
        decode_big5(&[0x87u8, 0x66u8], "\u{FFFD}\u{0066}");
        decode_big5(&[0x81u8, 0x40u8], "\u{FFFD}\u{0040}");
        decode_big5(&[0x61u8, 0x81u8], "\u{0061}\u{FFFD}");
    }

    #[test]
    fn test_big5_encode() {
        // Empty
        encode_big5("", b"");

        // ASCII
        encode_big5("\u{0061}\u{0062}", b"\x61\x62");

        if !cfg!(miri) {
            // Miri is too slow
            // Edge cases
            //
            // Unmappable characters are expected as decimal numeric character
            // references. Byte string literals must be ASCII, so the
            // reference is spelled out rather than written as the character.
            encode_big5("\u{9EA6}\u{0061}", b"&#40614;\x61");
            encode_big5("\u{2626B}\u{0061}", b"&#156267;\x61");
            encode_big5("\u{3000}", b"\xA1\x40");
            encode_big5("\u{20AC}", b"\xA3\xE1");
            encode_big5("\u{4E00}", b"\xA4\x40");
            encode_big5("\u{27607}", b"\xC8\xA4");
            encode_big5("\u{FFE2}", b"\xC8\xCD");
            encode_big5("\u{79D4}", b"\xFE\xFE");

            // Not in index
            encode_big5("\u{2603}\u{0061}", b"&#9731;\x61");
        }

        // duplicate low bits
        encode_big5("\u{203B5}", b"\xFD\x6A");
        encode_big5("\u{25605}", b"\xFE\x46");

        // prefer last
        encode_big5("\u{2550}", b"\xF9\xF9");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_decode_all() {
        let input = include_bytes!("test_data/big5_in.txt");
        let expectation = include_str!("test_data/big5_in_ref.txt");
        let (cow, had_errors) = BIG5.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_all() {
        let input = include_str!("test_data/big5_out.txt");
        let expectation = include_bytes!("test_data/big5_out_ref.txt");
        let (cow, encoding, had_errors) = BIG5.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, BIG5);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_big5_encode_from_two_low_surrogates() {
        // Two unpaired low surrogates are each expected as the numeric
        // character reference for U+FFFD (65533).
        let expectation = b"&#65533;&#65533;";
        let mut output = [0u8; 40];
        let mut encoder = BIG5.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
428 | |