1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use super::*; |
11 | use crate::data::*; |
12 | use crate::handles::*; |
13 | use crate::variant::*; |
14 | // Rust 1.14.0 requires the following despite the asterisk above. |
15 | use super::in_inclusive_range16; |
16 | |
/// Marker for a multi-byte EUC-JP sequence that the decoder has started
/// reading but has not finished yet.
enum EucJpPending {
    /// No sequence is in progress.
    None,
    /// A two-byte sequence is in progress. The payload is the value kept for
    /// its lead byte (presumably the lead with the 0xA1 offset already
    /// removed -- confirm against `euc_jp_decoder_functions!`).
    Jis0208Lead(u8),
    /// Presumably: the 0x8F prefix of a three-byte sequence has been seen and
    /// its lead byte is still missing -- confirm against the decoder macro.
    Jis0212Shift,
    /// A three-byte sequence is in progress and the value kept for its lead
    /// byte is stored in the payload.
    Jis0212Lead(u8),
    /// Presumably: the 0x8E prefix of a half-width Katakana sequence has been
    /// seen -- confirm against the decoder macro.
    HalfWidthKatakana,
}

impl EucJpPending {
    /// Reports whether this is the `None` variant.
    fn is_none(&self) -> bool {
        if let EucJpPending::None = *self {
            true
        } else {
            false
        }
    }

    /// Maps the variant to a count: 0 for `None`, 2 for `Jis0212Lead`, and 1
    /// for every other variant. (Presumably the number of input bytes already
    /// taken for the unfinished sequence.)
    fn count(&self) -> usize {
        // Kept exhaustive on purpose: a new variant must be given a count.
        match *self {
            EucJpPending::Jis0212Lead(_) => 2,
            EucJpPending::Jis0208Lead(_) => 1,
            EucJpPending::Jis0212Shift => 1,
            EucJpPending::HalfWidthKatakana => 1,
            EucJpPending::None => 0,
        }
    }
}
43 | |
/// Decoder state for EUC-JP input.
pub struct EucJpDecoder {
    /// Marker for a multi-byte sequence that has been started but not yet
    /// completed; `EucJpPending::None` when there is none.
    pending: EucJpPending,
}

impl EucJpDecoder {
    /// Creates a decoder with nothing pending, wrapped in
    /// `VariantDecoder::EucJp`.
    pub fn new() -> VariantDecoder {
        VariantDecoder::EucJp(EucJpDecoder {
            pending: EucJpPending::None,
        })
    }

    /// Returns `true` when `pending` is `EucJpPending::None`.
    pub fn in_neutral_state(&self) -> bool {
        self.pending.is_none()
    }

    /// Returns `byte_length`, plus one when something is pending, or `None`
    /// if that addition overflows `usize`.
    fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
    }

    /// Upper bound on UTF-16 output for `byte_length` more input bytes:
    /// the pending-adjusted input length itself.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        self.plus_one_if_lead(byte_length)
    }

    /// Upper bound on UTF-8 output when malformed input is not replaced.
    ///
    /// With `len` being the pending-adjusted input length, the expression
    /// below is `2 + (len + (1 + len) / 2)`.
    /// NOTE(review): assumes the `checked_*` helpers from the parent module
    /// propagate `None` on overflow -- confirm there.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        // worst case: 2 to 3
        let len = self.plus_one_if_lead(byte_length);
        checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
    }

    /// Upper bound on UTF-8 output: three times the pending-adjusted input
    /// length.
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        checked_mul(3, self.plus_one_if_lead(byte_length))
    }

    // The decode functions themselves are generated by
    // `euc_jp_decoder_functions!`. The four blocks passed below are, in
    // order: the two-byte (JIS X 0208) trail handler, the JIS X 0212 lead
    // handler, the JIS X 0212 trail handler and the half-width Katakana trail
    // handler. The identifiers after the blocks are presumably the names the
    // macro binds for use inside them -- confirm in the macro definition.
    euc_jp_decoder_functions!(
        {
            // JIS X 0208 trail byte. Subtracting with wrap-around maps every
            // byte below 0xA1 to a large value, so one unsigned comparison
            // checks both ends of a range.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            // Fast-track Hiragana (60% according to Lunde)
            // and Katakana (10% according to Lunde).
            if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
                // Hiragana
                handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
            } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
                // Katakana
                handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
            } else if trail_minus_offset > (0xFE - 0xA1) {
                // Trail outside 0xA1..=0xFE. A byte below 0x80 is unread and
                // only one byte is reported as malformed; the tests in this
                // file expect such a byte to be decoded as itself after the
                // U+FFFD.
                if byte < 0x80 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                // Any other bad trail is reported together with the lead.
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            } else {
                // Index into the 94-column table (assuming `mul_94`
                // multiplies by 94, as its name says).
                let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
                // 1410 == 15 * 94. A pointer below the table start wraps to a
                // huge value and fails the length check.
                let level1_pointer = pointer.wrapping_sub(1410);
                if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
                    handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
                } else {
                    // 4418 == 47 * 94.
                    let level2_pointer = pointer.wrapping_sub(4418);
                    if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
                        handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
                    } else {
                        // 8272 == 88 * 94.
                        let ibm_pointer = pointer.wrapping_sub(8272);
                        if ibm_pointer < IBM_KANJI.len() {
                            handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
                        } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
                            handle.write_bmp_excl_ascii(bmp)
                        } else if let Some(bmp) = jis0208_range_decode(pointer) {
                            handle.write_bmp_excl_ascii(bmp)
                        } else {
                            // Well-formed byte pair with no mapping.
                            return (
                                DecoderResult::Malformed(2, 0),
                                unread_handle_trail.consumed(),
                                handle.written(),
                            );
                        }
                    }
                }
            }
        },
        {
            // JIS X 0212 lead byte; this block evaluates to the lead with the
            // offset removed.
            // If lead is between 0xA1 and 0xFE, inclusive,
            // subtract 0xA1.
            let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
            if jis0212_lead_minus_offset > (0xFE - 0xA1) {
                // A byte below 0x80 is unread and one byte is reported;
                // otherwise the bad lead is consumed and two are reported.
                if lead < 0x80 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_jis0212.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_jis0212.consumed(),
                    handle.written(),
                );
            }
            jis0212_lead_minus_offset
        },
        {
            // JIS X 0212 trail byte.
            // If trail is between 0xA1 and 0xFE, inclusive,
            // subtract 0xA1.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            if trail_minus_offset > (0xFE - 0xA1) {
                // Same split as above, with one more byte reported in each
                // case than for a two-byte sequence.
                if byte < 0x80 {
                    return (
                        DecoderResult::Malformed(2, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(3, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            }
            let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
            // 1410 == 15 * 94; smaller pointers wrap and fail the length check.
            let pointer_minus_kanji = pointer.wrapping_sub(1410);
            if pointer_minus_kanji < JIS0212_KANJI.len() {
                handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
            } else if let Some(bmp) = jis0212_accented_decode(pointer) {
                handle.write_bmp_excl_ascii(bmp)
            } else {
                // Pointers 597..=607 map linearly onto U+0402..=U+040C.
                let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
                if pointer_minus_upper_cyrillic <= (607 - 597) {
                    handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
                } else {
                    // Pointers 645..=655 map linearly onto U+0452..=U+045C.
                    let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
                    if pointer_minus_lower_cyrillic <= (655 - 645) {
                        handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
                    } else {
                        // Well-formed three-byte sequence with no mapping.
                        return (
                            DecoderResult::Malformed(3, 0),
                            unread_handle_trail.consumed(),
                            handle.written(),
                        );
                    }
                }
            }
        },
        {
            // Half-width Katakana trail byte.
            // If trail is between 0xA1 and 0xDF, inclusive,
            // subtract 0xA1 and map to half-width Katakana.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            if trail_minus_offset > (0xDF - 0xA1) {
                // A byte below 0x80 is unread and one byte is reported;
                // otherwise the bad trail is consumed and two are reported.
                if byte < 0x80 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            }
            handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
        },
        self,
        non_ascii,
        jis0208_lead_minus_offset,
        byte,
        unread_handle_trail,
        jis0212_lead_minus_offset,
        lead,
        unread_handle_jis0212,
        source,
        handle
    );
}
223 | |
224 | #[cfg (feature = "fast-kanji-encode" )] |
225 | #[inline (always)] |
226 | fn encode_kanji(bmp: u16) -> Option<(u8, u8)> { |
227 | jis0208_kanji_euc_jp_encode(bmp) |
228 | } |
229 | |
230 | #[cfg (not(feature = "fast-kanji-encode" ))] |
231 | #[inline (always)] |
232 | fn encode_kanji(bmp: u16) -> Option<(u8, u8)> { |
233 | if 0x4EDD == bmp { |
234 | // Ideograph on the symbol row! |
235 | Some((0xA1, 0xB8)) |
236 | } else if let Some((lead: u8, trail: u8)) = jis0208_level1_kanji_euc_jp_encode(bmp) { |
237 | Some((lead, trail)) |
238 | } else if let Some(pos: usize) = jis0208_level2_and_additional_kanji_encode(bmp) { |
239 | let lead: usize = (pos / 94) + 0xD0; |
240 | let trail: usize = (pos % 94) + 0xA1; |
241 | Some((lead as u8, trail as u8)) |
242 | } else if let Some(pos: usize) = position(&IBM_KANJI[..], needle:bmp) { |
243 | let lead: usize = (pos / 94) + 0xF9; |
244 | let trail: usize = (pos % 94) + 0xA1; |
245 | Some((lead as u8, trail as u8)) |
246 | } else { |
247 | None |
248 | } |
249 | } |
250 | |
/// Encoder for EUC-JP; a unit struct with no fields.
pub struct EucJpEncoder;

impl EucJpEncoder {
    /// Creates an `Encoder` for `encoding` backed by
    /// `VariantEncoder::EucJp`.
    pub fn new(encoding: &'static Encoding) -> Encoder {
        Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
    }

    /// Upper bound on output bytes for `u16_length` UTF-16 code units when
    /// unmappable characters are not replaced: twice the input length, or
    /// `None` if that overflows `usize`.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        u16_length.checked_mul(2)
    }

    /// Upper bound on output bytes for `byte_length` bytes of UTF-8 input
    /// when unmappable characters are not replaced: the input length plus
    /// one, or `None` if that overflows `usize`.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        byte_length.checked_add(1)
    }

    // The encode functions are generated by
    // `ascii_compatible_bmp_encoder_functions!`. The block below handles one
    // non-ASCII BMP code unit `bmp`; the arguments after it are presumably
    // the names the macro binds and the helpers it should use -- confirm in
    // the macro definition.
    ascii_compatible_bmp_encoder_functions!(
        {
            // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
            // Wrapping subtraction turns each range test into a single
            // unsigned comparison: values below the base wrap to large ones.
            let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
            if bmp_minus_hiragana < 0x53 {
                // Hiragana: lead 0xA4, trail offset by 0xA1.
                handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
            } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
                // Kanji range: use the table lookup; unmappable on a miss.
                if let Some((lead, trail)) = encode_kanji(bmp) {
                    handle.write_two(lead, trail)
                } else {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            } else {
                let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
                if bmp_minus_katakana < 0x56 {
                    // Katakana: lead 0xA5, trail offset by 0xA1.
                    handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
                } else {
                    let bmp_minus_space = bmp.wrapping_sub(0x3000);
                    if bmp_minus_space < 3 {
                        // fast-track common punctuation
                        handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
                    } else if bmp == 0xA5 {
                        // U+00A5 is written as the single byte 0x5C.
                        handle.write_one(0x5Cu8)
                    } else if bmp == 0x203E {
                        // U+203E is written as the single byte 0x7E.
                        handle.write_one(0x7Eu8)
                    } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
                        // Half-width Katakana: 0x8E prefix, then a trail that
                        // starts at 0xA1 for U+FF61.
                        handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
                    } else if bmp == 0x2212 {
                        // U+2212 is written as the pair 0xA1 0xDD.
                        handle.write_two(0xA1u8, 0xDDu8)
                    } else if let Some(pointer) = jis0208_range_encode(bmp) {
                        // Split the pointer into a 94-column row and column.
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
                        || bmp == 0xF929
                        || bmp == 0xF9DC
                    {
                        // Guaranteed to be found in IBM_KANJI
                        let pos = position(&IBM_KANJI[..], bmp).unwrap();
                        // 0xF9 is 0xA1 + 88, matching the 88 * 94 offset the
                        // decoder subtracts for `IBM_KANJI`.
                        let lead = (pos / 94) + 0xF9;
                        let trail = (pos % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if let Some(pointer) = ibm_symbol_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else {
                        // No lookup matched: report the code unit as
                        // unmappable.
                        return (
                            EncoderResult::unmappable_from_bmp(bmp),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_two,
        check_space_two,
        false
    );
}
345 | |
346 | // Any copyright to the test code below this comment is dedicated to the |
347 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
348 | |
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper for `EUC_JP`.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }

    /// Runs the shared `encode` test helper for `EUC_JP`.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }

    /// Spot checks for each kind of EUC-JP sequence, including truncated and
    /// out-of-range ones that must decode to U+FFFD.
    #[test]
    fn test_euc_jp_decode() {
        // Empty
        decode_euc_jp(b"", "");

        // ASCII
        decode_euc_jp(b" \x61\x62", " \u{0061}\u{0062}");

        // Half-width
        decode_euc_jp(b" \x8E\xA1", " \u{FF61}");
        decode_euc_jp(b" \x8E\xDF", " \u{FF9F}");
        decode_euc_jp(b" \x8E\xA0", " \u{FFFD}");
        decode_euc_jp(b" \x8E\xE0", " \u{FFFD}");
        decode_euc_jp(b" \x8E\xFF", " \u{FFFD}");
        decode_euc_jp(b" \x8E", " \u{FFFD}");

        // JIS 0212
        decode_euc_jp(b" \x8F\xA1\xA1", " \u{FFFD}");
        decode_euc_jp(b" \x8F\xA2\xAF", " \u{02D8}");
        decode_euc_jp(b" \x8F\xA2\xFF", " \u{FFFD}");
        decode_euc_jp(b" \x8F\xA1", " \u{FFFD}");
        decode_euc_jp(b" \x8F", " \u{FFFD}");

        // JIS 0208
        decode_euc_jp(b" \xA1\xA1", " \u{3000}");
        decode_euc_jp(b" \xA1\xA0", " \u{FFFD}");
        decode_euc_jp(b" \xFC\xFE", " \u{FF02}");
        decode_euc_jp(b" \xFE\xFE", " \u{FFFD}");
        decode_euc_jp(b" \xA1", " \u{FFFD}");

        // Bad leads
        decode_euc_jp(b" \xFF\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \xA0\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x80\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x81\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x82\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x83\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x84\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x85\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x86\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x87\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x88\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x89\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x8A\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x8B\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x8C\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x8D\xA1\xA1", " \u{FFFD}\u{3000}");

        // Bad ASCII trail
        decode_euc_jp(b" \xA1\x40", " \u{FFFD}\u{0040}");
    }

    /// Spot checks for the encoder, including the special-cased code points
    /// and a character that EUC-JP cannot encode.
    #[test]
    fn test_euc_jp_encode() {
        // Empty
        encode_euc_jp("", b"");

        // ASCII
        encode_euc_jp(" \u{0061}\u{0062}", b" \x61\x62");

        // Exceptional code points
        encode_euc_jp(" \u{00A5}", b" \x5C");
        encode_euc_jp(" \u{203E}", b" \x7E");
        encode_euc_jp(" \u{2212}", b" \xA1\xDD");

        // Half-width
        encode_euc_jp(" \u{FF61}", b" \x8E\xA1");
        encode_euc_jp(" \u{FF9F}", b" \x8E\xDF");

        // JIS 0212
        // U+02D8 decodes from JIS X 0212 but is not encodable, so the
        // expected output is its decimal numeric character reference
        // (0x02D8 == 728). A byte string literal must stay ASCII, hence the
        // reference is spelled out rather than written as the character.
        encode_euc_jp(" \u{02D8}", b" &#728;");

        // JIS 0208
        encode_euc_jp(" \u{3000}", b" \xA1\xA1");
        encode_euc_jp(" \u{FF02}", b" \xFC\xFE");
    }

    /// Decodes the JIS X 0208 fixture and compares it with the reference
    /// text; the fixture is expected to contain errors.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_decode_all() {
        let input = include_bytes!("test_data/jis0208_in.txt");
        let expectation = include_str!("test_data/jis0208_in_ref.txt");
        let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    /// Encodes the JIS X 0208 fixture and compares it with the reference
    /// bytes; no unmappable characters are expected.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_encode_all() {
        let input = include_str!("test_data/jis0208_out.txt");
        let expectation = include_bytes!("test_data/jis0208_out_ref.txt");
        let (cow, encoding, had_errors) = EUC_JP.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, EUC_JP);
        assert_eq!(&cow[..], &expectation[..]);
    }

    /// Decodes the JIS X 0212 fixture and compares it with the reference
    /// text; the fixture is expected to contain errors.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0212_decode_all() {
        let input = include_bytes!("test_data/jis0212_in.txt");
        let expectation = include_str!("test_data/jis0212_in_ref.txt");
        let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }
}
470 | |