| 1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
| 2 | // file at the top-level directory of this distribution. |
| 3 | // |
| 4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| 7 | // option. This file may not be copied, modified, or distributed |
| 8 | // except according to those terms. |
| 9 | |
| 10 | use super::*; |
| 11 | use crate::data::*; |
| 12 | use crate::handles::*; |
| 13 | use crate::variant::*; |
| 14 | // Rust 1.14.0 requires the following despite the asterisk above. |
| 15 | use super::in_inclusive_range16; |
| 16 | |
/// Decoder state describing a multi-byte sequence that has been started but
/// not yet completed.
///
/// The payloads and the exact byte values that trigger each state are set by
/// the decoder macro, which is not part of this file; the descriptions below
/// are therefore partly inferred from the variant names and from `count()`.
enum EucJpPending {
    /// No sequence is in progress.
    None,
    /// One byte of a two-byte JIS X 0208 sequence is pending (presumably the
    /// lead, stored with an offset already applied -- confirm in the macro).
    Jis0208Lead(u8),
    /// The JIS X 0212 prefix byte is pending and nothing else yet.
    Jis0212Shift,
    /// Two bytes of a three-byte JIS X 0212 sequence are pending.
    Jis0212Lead(u8),
    /// The half-width Katakana prefix byte is pending.
    HalfWidthKatakana,
}

impl EucJpPending {
    /// Tells whether the state is `EucJpPending::None`.
    fn is_none(&self) -> bool {
        if let EucJpPending::None = *self {
            true
        } else {
            false
        }
    }

    /// Number of already-consumed input bytes that the state stands for.
    fn count(&self) -> usize {
        match *self {
            EucJpPending::None => 0,
            EucJpPending::Jis0212Lead(_) => 2,
            EucJpPending::Jis0208Lead(_) => 1,
            EucJpPending::Jis0212Shift => 1,
            EucJpPending::HalfWidthKatakana => 1,
        }
    }
}
| 43 | |
/// Streaming EUC-JP decoder.
///
/// The only state it carries is `pending`, which records a multi-byte
/// sequence that has been started but not yet completed.
pub struct EucJpDecoder {
    pending: EucJpPending,
}
| 47 | |
impl EucJpDecoder {
    /// Creates a decoder with no pending state, wrapped in
    /// `VariantDecoder::EucJp`.
    pub fn new() -> VariantDecoder {
        VariantDecoder::EucJp(EucJpDecoder {
            pending: EucJpPending::None,
        })
    }

    /// Returns `true` when no partially read sequence is pending.
    pub fn in_neutral_state(&self) -> bool {
        self.pending.is_none()
    }

    /// Returns `byte_length`, plus one if any state is pending, or `None` if
    /// the addition overflows.
    ///
    /// Exactly one is added for every non-`None` state, including
    /// `Jis0212Lead`, whose `count()` is two.
    fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
    }

    /// Upper bound on the UTF-16 output length, in code units, for
    /// `byte_length` more input bytes: the input length adjusted for pending
    /// state. `None` on overflow.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        self.plus_one_if_lead(byte_length)
    }

    /// Upper bound on the UTF-8 output length when malformed input is not
    /// replaced.
    ///
    /// With `len` being the adjusted input length, the expression evaluates
    /// `len + (len + 1) / 2 + 2` through the `checked_*` helpers (defined
    /// outside this file; presumably they yield `None` on overflow).
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        // worst case: 2 to 3
        let len = self.plus_one_if_lead(byte_length);
        checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
    }

    /// Upper bound on the UTF-8 output length with replacement: three times
    /// the adjusted input length.
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        checked_mul(3, self.plus_one_if_lead(byte_length))
    }

    // The four brace-delimited blocks below are handed to
    // `euc_jp_decoder_functions!` (defined elsewhere) and, judging by the
    // identifiers they use, handle in order: the JIS X 0208 trail byte, the
    // JIS X 0212 lead byte, the JIS X 0212 trail byte and the half-width
    // Katakana trail byte. The identifiers after the blocks name the bindings
    // that the blocks refer to.
    euc_jp_decoder_functions!(
        {
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            // Fast-track Hiragana (60% according to Lunde)
            // and Katakana (10% according to Lunde).
            if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
                // Hiragana
                handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
            } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
                // Katakana
                handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
            } else if trail_minus_offset > (0xFE - 0xA1) {
                // Trail outside 0xA1..=0xFE (the wrapping subtraction also
                // sends bytes below 0xA1 here).
                if byte < 0x80 {
                    // ASCII-range trail: a one-byte error is reported and the
                    // trail is unread, presumably so that it is decoded again
                    // as ASCII.
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                // Non-ASCII bad trail: both bytes are reported as malformed.
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            } else {
                // Linear index into the 94-column table (assuming `mul_94`
                // multiplies by 94). The offsets subtracted below are
                // 1410 = 15 * 94, 4418 = 47 * 94 and 8272 = 88 * 94; an
                // underflowing `wrapping_sub` fails the `< len()` check.
                let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
                let level1_pointer = pointer.wrapping_sub(1410);
                if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
                    handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
                } else {
                    let level2_pointer = pointer.wrapping_sub(4418);
                    if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
                        handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
                    } else {
                        let ibm_pointer = pointer.wrapping_sub(8272);
                        if ibm_pointer < IBM_KANJI.len() {
                            handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
                        } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
                            handle.write_bmp_excl_ascii(bmp)
                        } else if let Some(bmp) = jis0208_range_decode(pointer) {
                            handle.write_bmp_excl_ascii(bmp)
                        } else {
                            // No table maps this pointer.
                            return (
                                DecoderResult::Malformed(2, 0),
                                unread_handle_trail.consumed(),
                                handle.written(),
                            );
                        }
                    }
                }
            }
        },
        {
            // If lead is between 0xA1 and 0xFE, inclusive,
            // subtract 0xA1.
            let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
            if jis0212_lead_minus_offset > (0xFE - 0xA1) {
                if lead < 0x80 {
                    // ASCII-range byte after the JIS X 0212 prefix: a one-byte
                    // error is reported and the byte is unread.
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_jis0212.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_jis0212.consumed(),
                    handle.written(),
                );
            }
            // The block evaluates to the validated, offset-adjusted lead.
            jis0212_lead_minus_offset
        },
        {
            // If trail is between 0xA1 and 0xFE, inclusive,
            // subtract 0xA1.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            if trail_minus_offset > (0xFE - 0xA1) {
                if byte < 0x80 {
                    // ASCII-range trail: the two preceding bytes are reported
                    // as malformed and the trail is unread.
                    return (
                        DecoderResult::Malformed(2, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(3, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            }
            let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
            let pointer_minus_kanji = pointer.wrapping_sub(1410);
            if pointer_minus_kanji < JIS0212_KANJI.len() {
                handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
            } else if let Some(bmp) = jis0212_accented_decode(pointer) {
                handle.write_bmp_excl_ascii(bmp)
            } else {
                // Pointers 597..=607 map linearly onto U+0402..=U+040C.
                let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
                if pointer_minus_upper_cyrillic <= (607 - 597) {
                    handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
                } else {
                    // Pointers 645..=655 map linearly onto U+0452..=U+045C.
                    let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
                    if pointer_minus_lower_cyrillic <= (655 - 645) {
                        handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
                    } else {
                        return (
                            DecoderResult::Malformed(3, 0),
                            unread_handle_trail.consumed(),
                            handle.written(),
                        );
                    }
                }
            }
        },
        {
            // If trail is between 0xA1 and 0xDF, inclusive,
            // subtract 0xA1 and map to half-width Katakana.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            if trail_minus_offset > (0xDF - 0xA1) {
                if byte < 0x80 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            }
            handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
        },
        self,
        non_ascii,
        jis0208_lead_minus_offset,
        byte,
        unread_handle_trail,
        jis0212_lead_minus_offset,
        lead,
        unread_handle_jis0212,
        source,
        handle
    );
}
| 223 | |
/// Kanji encoding used when the `fast-kanji-encode` feature is enabled.
///
/// Forwards to `jis0208_kanji_euc_jp_encode` (defined elsewhere) and returns
/// its `(lead, trail)` byte pair, or `None` when it reports no mapping.
#[cfg (feature = "fast-kanji-encode" )]
#[inline (always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_euc_jp_encode(bmp)
}
| 229 | |
| 230 | #[cfg (not(feature = "fast-kanji-encode" ))] |
| 231 | #[inline (always)] |
| 232 | fn encode_kanji(bmp: u16) -> Option<(u8, u8)> { |
| 233 | if 0x4EDD == bmp { |
| 234 | // Ideograph on the symbol row! |
| 235 | Some((0xA1, 0xB8)) |
| 236 | } else if let Some((lead: u8, trail: u8)) = jis0208_level1_kanji_euc_jp_encode(bmp) { |
| 237 | Some((lead, trail)) |
| 238 | } else if let Some(pos: usize) = jis0208_level2_and_additional_kanji_encode(bmp) { |
| 239 | let lead: usize = (pos / 94) + 0xD0; |
| 240 | let trail: usize = (pos % 94) + 0xA1; |
| 241 | Some((lead as u8, trail as u8)) |
| 242 | } else if let Some(pos: usize) = position(&IBM_KANJI[..], needle:bmp) { |
| 243 | let lead: usize = (pos / 94) + 0xF9; |
| 244 | let trail: usize = (pos % 94) + 0xA1; |
| 245 | Some((lead as u8, trail as u8)) |
| 246 | } else { |
| 247 | None |
| 248 | } |
| 249 | } |
| 250 | |
/// EUC-JP encoder. It is a unit struct and so carries no state.
pub struct EucJpEncoder;
| 252 | |
impl EucJpEncoder {
    /// Creates an `Encoder` for `encoding` backed by
    /// `VariantEncoder::EucJp`.
    pub fn new(encoding: &'static Encoding) -> Encoder {
        Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
    }

    /// Upper bound on the output length for `u16_length` UTF-16 code units
    /// when unmappable characters are not replaced: twice the input length.
    /// `None` on overflow.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        u16_length.checked_mul(2)
    }

    /// Upper bound on the output length for `byte_length` UTF-8 bytes when
    /// unmappable characters are not replaced: the input length plus one.
    /// `None` on overflow.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        byte_length.checked_add(1)
    }

    // The block below is handed to `ascii_compatible_bmp_encoder_functions!`
    // (defined elsewhere), which presumably runs it for each non-ASCII BMP
    // code point bound to `bmp`. The branches are ordered by expected
    // frequency. The meaning of the trailing arguments is set by the macro
    // -- confirm there.
    ascii_compatible_bmp_encoder_functions!(
        {
            // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
            let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
            if bmp_minus_hiragana < 0x53 {
                // U+3041 onwards maps linearly onto lead 0xA4.
                handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
            } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
                if let Some((lead, trail)) = encode_kanji(bmp) {
                    handle.write_two(lead, trail)
                } else {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            } else {
                let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
                if bmp_minus_katakana < 0x56 {
                    // U+30A1 onwards maps linearly onto lead 0xA5.
                    handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
                } else {
                    let bmp_minus_space = bmp.wrapping_sub(0x3000);
                    if bmp_minus_space < 3 {
                        // fast-track common punctuation
                        handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
                    } else if bmp == 0xA5 {
                        // U+00A5 is written as the single byte 0x5C.
                        handle.write_one(0x5Cu8)
                    } else if bmp == 0x203E {
                        // U+203E is written as the single byte 0x7E.
                        handle.write_one(0x7Eu8)
                    } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
                        // Half-width Katakana: 0x8E followed by a trail in
                        // 0xA1..=0xDF.
                        handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
                    } else if bmp == 0x2212 {
                        // U+2212 is written as 0xA1 0xDD.
                        handle.write_two(0xA1u8, 0xDDu8)
                    } else if let Some(pointer) = jis0208_range_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
                        || bmp == 0xF929
                        || bmp == 0xF9DC
                    {
                        // Guaranteed to be found in IBM_KANJI
                        let pos = position(&IBM_KANJI[..], bmp).unwrap();
                        let lead = (pos / 94) + 0xF9;
                        let trail = (pos % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if let Some(pointer) = ibm_symbol_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else {
                        // No branch matched: report the code point as
                        // unmappable.
                        return (
                            EncoderResult::unmappable_from_bmp(bmp),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_two,
        check_space_two,
        false
    );
}
| 345 | |
| 346 | // Any copyright to the test code below this comment is dedicated to the |
| 347 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
| 348 | |
// Unit tests for the EUC-JP decoder and encoder. They go through the
// `decode` / `encode` helpers of the crate's `testing` module and, for the
// exhaustive tests, through fixture files under `test_data/`.
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as EUC-JP and checks the result against `expect`.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }

    /// Encodes `string` as EUC-JP and checks the result against `expect`.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }

    #[test]
    fn test_euc_jp_decode() {
        // Empty
        decode_euc_jp(b"", &"");

        // ASCII
        decode_euc_jp(b" \x61\x62", " \u{0061}\u{0062}");

        // Half-width
        decode_euc_jp(b" \x8E\xA1", " \u{FF61}");
        decode_euc_jp(b" \x8E\xDF", " \u{FF9F}");
        decode_euc_jp(b" \x8E\xA0", " \u{FFFD}");
        decode_euc_jp(b" \x8E\xE0", " \u{FFFD}");
        decode_euc_jp(b" \x8E\xFF", " \u{FFFD}");
        decode_euc_jp(b" \x8E", " \u{FFFD}");

        // JIS 0212
        decode_euc_jp(b" \x8F\xA1\xA1", " \u{FFFD}");
        decode_euc_jp(b" \x8F\xA2\xAF", " \u{02D8}");
        decode_euc_jp(b" \x8F\xA2\xFF", " \u{FFFD}");
        decode_euc_jp(b" \x8F\xA1", " \u{FFFD}");
        decode_euc_jp(b" \x8F", " \u{FFFD}");

        // JIS 0208
        decode_euc_jp(b" \xA1\xA1", " \u{3000}");
        decode_euc_jp(b" \xA1\xA0", " \u{FFFD}");
        decode_euc_jp(b" \xFC\xFE", " \u{FF02}");
        decode_euc_jp(b" \xFE\xFE", " \u{FFFD}");
        decode_euc_jp(b" \xA1", " \u{FFFD}");

        // Bad leads
        decode_euc_jp(b" \xFF\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \xA0\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x80\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x81\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x82\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x83\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x84\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x85\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x86\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x87\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x88\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x89\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x8A\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x8B\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x8C\xA1\xA1", " \u{FFFD}\u{3000}");
        decode_euc_jp(b" \x8D\xA1\xA1", " \u{FFFD}\u{3000}");

        // Bad ASCII trail
        decode_euc_jp(b" \xA1\x40", " \u{FFFD}\u{0040}");
    }

    #[test]
    fn test_euc_jp_encode() {
        // Empty
        encode_euc_jp("", b"");

        // ASCII
        encode_euc_jp(" \u{0061}\u{0062}", b" \x61\x62");

        // Exceptional code points
        encode_euc_jp(" \u{00A5}", b" \x5C");
        encode_euc_jp(" \u{203E}", b" \x7E");
        encode_euc_jp(" \u{2212}", b" \xA1\xDD");

        // Half-width
        encode_euc_jp(" \u{FF61}", b" \x8E\xA1");
        encode_euc_jp(" \u{FF9F}", b" \x8E\xDF");

        // JIS 0212
        // The encoder in this file has no JIS X 0212 branch, so U+02D8 is
        // unmappable and the expected output is its decimal numeric character
        // reference. The expectation has to be spelled in ASCII: a raw
        // U+02D8 is not permitted inside a byte string literal.
        encode_euc_jp(" \u{02D8}", b" &#728;");

        // JIS 0208
        encode_euc_jp(" \u{3000}", b" \xA1\xA1");
        encode_euc_jp(" \u{FF02}", b" \xFC\xFE");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_decode_all() {
        let input = include_bytes!("test_data/jis0208_in.txt");
        let expectation = include_str!("test_data/jis0208_in_ref.txt");
        let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_encode_all() {
        let input = include_str!("test_data/jis0208_out.txt");
        let expectation = include_bytes!("test_data/jis0208_out_ref.txt");
        let (cow, encoding, had_errors) = EUC_JP.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, EUC_JP);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0212_decode_all() {
        let input = include_bytes!("test_data/jis0212_in.txt");
        let expectation = include_str!("test_data/jis0212_in_ref.txt");
        let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }
}
| 470 | |