1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range16;
16
/// Partial-sequence state that the EUC-JP decoder keeps between calls.
///
/// NOTE(review): the per-variant descriptions below are inferred from the
/// variant names and from how the decoder body is written; confirm against
/// the `euc_jp_decoder_functions!` macro definition.
enum EucJpPending {
    /// Nothing is buffered.
    None,
    /// Presumably a JIS X 0208 lead byte has been read; the payload looks
    /// like the lead byte minus its 0xA1 offset.
    Jis0208Lead(u8),
    /// Presumably the byte that introduces a three-byte JIS X 0212 sequence
    /// has been read and the lead byte is still outstanding.
    Jis0212Shift,
    /// Presumably both the JIS X 0212 introducer and the lead byte have been
    /// read; the payload looks like the lead byte minus its 0xA1 offset.
    Jis0212Lead(u8),
    /// Presumably the byte that introduces a half-width Katakana sequence
    /// has been read.
    HalfWidthKatakana,
}

impl EucJpPending {
    /// Returns `true` exactly when the state is `EucJpPending::None`.
    fn is_none(&self) -> bool {
        if let EucJpPending::None = *self {
            true
        } else {
            false
        }
    }

    /// Returns 0 for `None`, 2 for `Jis0212Lead` and 1 for every other state.
    ///
    /// NOTE(review): this is presumably the number of input bytes already
    /// absorbed into the pending state -- confirm at the call site.
    fn count(&self) -> usize {
        match *self {
            EucJpPending::None => 0,
            EucJpPending::Jis0212Lead(_) => 2,
            EucJpPending::Jis0208Lead(_)
            | EucJpPending::Jis0212Shift
            | EucJpPending::HalfWidthKatakana => 1,
        }
    }
}
43
44pub struct EucJpDecoder {
45 pending: EucJpPending,
46}
47
48impl EucJpDecoder {
49 pub fn new() -> VariantDecoder {
50 VariantDecoder::EucJp(EucJpDecoder {
51 pending: EucJpPending::None,
52 })
53 }
54
55 pub fn in_neutral_state(&self) -> bool {
56 self.pending.is_none()
57 }
58
59 fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
60 byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
61 }
62
63 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
64 self.plus_one_if_lead(byte_length)
65 }
66
67 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
68 // worst case: 2 to 3
69 let len = self.plus_one_if_lead(byte_length);
70 checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
71 }
72
73 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
74 checked_mul(3, self.plus_one_if_lead(byte_length))
75 }
76
77 euc_jp_decoder_functions!(
78 {
79 let trail_minus_offset = byte.wrapping_sub(0xA1);
80 // Fast-track Hiragana (60% according to Lunde)
81 // and Katakana (10% acconding to Lunde).
82 if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
83 // Hiragana
84 handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
85 } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
86 // Katakana
87 handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
88 } else if trail_minus_offset > (0xFE - 0xA1) {
89 if byte < 0x80 {
90 return (
91 DecoderResult::Malformed(1, 0),
92 unread_handle_trail.unread(),
93 handle.written(),
94 );
95 }
96 return (
97 DecoderResult::Malformed(2, 0),
98 unread_handle_trail.consumed(),
99 handle.written(),
100 );
101 } else {
102 let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
103 let level1_pointer = pointer.wrapping_sub(1410);
104 if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
105 handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
106 } else {
107 let level2_pointer = pointer.wrapping_sub(4418);
108 if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
109 handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
110 } else {
111 let ibm_pointer = pointer.wrapping_sub(8272);
112 if ibm_pointer < IBM_KANJI.len() {
113 handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
114 } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
115 handle.write_bmp_excl_ascii(bmp)
116 } else if let Some(bmp) = jis0208_range_decode(pointer) {
117 handle.write_bmp_excl_ascii(bmp)
118 } else {
119 return (
120 DecoderResult::Malformed(2, 0),
121 unread_handle_trail.consumed(),
122 handle.written(),
123 );
124 }
125 }
126 }
127 }
128 },
129 {
130 // If lead is between 0xA1 and 0xFE, inclusive,
131 // subtract 0xA1.
132 let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
133 if jis0212_lead_minus_offset > (0xFE - 0xA1) {
134 if lead < 0x80 {
135 return (
136 DecoderResult::Malformed(1, 0),
137 unread_handle_jis0212.unread(),
138 handle.written(),
139 );
140 }
141 return (
142 DecoderResult::Malformed(2, 0),
143 unread_handle_jis0212.consumed(),
144 handle.written(),
145 );
146 }
147 jis0212_lead_minus_offset
148 },
149 {
150 // If trail is between 0xA1 and 0xFE, inclusive,
151 // subtract 0xA1.
152 let trail_minus_offset = byte.wrapping_sub(0xA1);
153 if trail_minus_offset > (0xFE - 0xA1) {
154 if byte < 0x80 {
155 return (
156 DecoderResult::Malformed(2, 0),
157 unread_handle_trail.unread(),
158 handle.written(),
159 );
160 }
161 return (
162 DecoderResult::Malformed(3, 0),
163 unread_handle_trail.consumed(),
164 handle.written(),
165 );
166 }
167 let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
168 let pointer_minus_kanji = pointer.wrapping_sub(1410);
169 if pointer_minus_kanji < JIS0212_KANJI.len() {
170 handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
171 } else if let Some(bmp) = jis0212_accented_decode(pointer) {
172 handle.write_bmp_excl_ascii(bmp)
173 } else {
174 let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
175 if pointer_minus_upper_cyrillic <= (607 - 597) {
176 handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
177 } else {
178 let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
179 if pointer_minus_lower_cyrillic <= (655 - 645) {
180 handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
181 } else {
182 return (
183 DecoderResult::Malformed(3, 0),
184 unread_handle_trail.consumed(),
185 handle.written(),
186 );
187 }
188 }
189 }
190 },
191 {
192 // If trail is between 0xA1 and 0xDF, inclusive,
193 // subtract 0xA1 and map to half-width Katakana.
194 let trail_minus_offset = byte.wrapping_sub(0xA1);
195 if trail_minus_offset > (0xDF - 0xA1) {
196 if byte < 0x80 {
197 return (
198 DecoderResult::Malformed(1, 0),
199 unread_handle_trail.unread(),
200 handle.written(),
201 );
202 }
203 return (
204 DecoderResult::Malformed(2, 0),
205 unread_handle_trail.consumed(),
206 handle.written(),
207 );
208 }
209 handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
210 },
211 self,
212 non_ascii,
213 jis0208_lead_minus_offset,
214 byte,
215 unread_handle_trail,
216 jis0212_lead_minus_offset,
217 lead,
218 unread_handle_jis0212,
219 source,
220 handle
221 );
222}
223
224#[cfg(feature = "fast-kanji-encode")]
225#[inline(always)]
226fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
227 jis0208_kanji_euc_jp_encode(bmp)
228}
229
230#[cfg(not(feature = "fast-kanji-encode"))]
231#[inline(always)]
232fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
233 if 0x4EDD == bmp {
234 // Ideograph on the symbol row!
235 Some((0xA1, 0xB8))
236 } else if let Some((lead: u8, trail: u8)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
237 Some((lead, trail))
238 } else if let Some(pos: usize) = jis0208_level2_and_additional_kanji_encode(bmp) {
239 let lead: usize = (pos / 94) + 0xD0;
240 let trail: usize = (pos % 94) + 0xA1;
241 Some((lead as u8, trail as u8))
242 } else if let Some(pos: usize) = position(&IBM_KANJI[..], needle:bmp) {
243 let lead: usize = (pos / 94) + 0xF9;
244 let trail: usize = (pos % 94) + 0xA1;
245 Some((lead as u8, trail as u8))
246 } else {
247 None
248 }
249}
250
251pub struct EucJpEncoder;
252
253impl EucJpEncoder {
254 pub fn new(encoding: &'static Encoding) -> Encoder {
255 Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
256 }
257
258 pub fn max_buffer_length_from_utf16_without_replacement(
259 &self,
260 u16_length: usize,
261 ) -> Option<usize> {
262 u16_length.checked_mul(2)
263 }
264
265 pub fn max_buffer_length_from_utf8_without_replacement(
266 &self,
267 byte_length: usize,
268 ) -> Option<usize> {
269 byte_length.checked_add(1)
270 }
271
272 ascii_compatible_bmp_encoder_functions!(
273 {
274 // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
275 let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
276 if bmp_minus_hiragana < 0x53 {
277 handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
278 } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
279 if let Some((lead, trail)) = encode_kanji(bmp) {
280 handle.write_two(lead, trail)
281 } else {
282 return (
283 EncoderResult::unmappable_from_bmp(bmp),
284 source.consumed(),
285 handle.written(),
286 );
287 }
288 } else {
289 let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
290 if bmp_minus_katakana < 0x56 {
291 handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
292 } else {
293 let bmp_minus_space = bmp.wrapping_sub(0x3000);
294 if bmp_minus_space < 3 {
295 // fast-track common punctuation
296 handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
297 } else if bmp == 0xA5 {
298 handle.write_one(0x5Cu8)
299 } else if bmp == 0x203E {
300 handle.write_one(0x7Eu8)
301 } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
302 handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
303 } else if bmp == 0x2212 {
304 handle.write_two(0xA1u8, 0xDDu8)
305 } else if let Some(pointer) = jis0208_range_encode(bmp) {
306 let lead = (pointer / 94) + 0xA1;
307 let trail = (pointer % 94) + 0xA1;
308 handle.write_two(lead as u8, trail as u8)
309 } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
310 || bmp == 0xF929
311 || bmp == 0xF9DC
312 {
313 // Guaranteed to be found in IBM_KANJI
314 let pos = position(&IBM_KANJI[..], bmp).unwrap();
315 let lead = (pos / 94) + 0xF9;
316 let trail = (pos % 94) + 0xA1;
317 handle.write_two(lead as u8, trail as u8)
318 } else if let Some(pointer) = ibm_symbol_encode(bmp) {
319 let lead = (pointer / 94) + 0xA1;
320 let trail = (pointer % 94) + 0xA1;
321 handle.write_two(lead as u8, trail as u8)
322 } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
323 let lead = (pointer / 94) + 0xA1;
324 let trail = (pointer % 94) + 0xA1;
325 handle.write_two(lead as u8, trail as u8)
326 } else {
327 return (
328 EncoderResult::unmappable_from_bmp(bmp),
329 source.consumed(),
330 handle.written(),
331 );
332 }
333 }
334 }
335 },
336 bmp,
337 self,
338 source,
339 handle,
340 copy_ascii_to_check_space_two,
341 check_space_two,
342 false
343 );
344}
345
346// Any copyright to the test code below this comment is dedicated to the
347// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
348
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as EUC-JP via the shared `decode` test helper and
    /// compares against `expect`.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }

    /// Encodes `string` as EUC-JP via the shared `encode` test helper and
    /// compares against `expect`.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }

    #[test]
    fn test_euc_jp_decode() {
        // Empty
        decode_euc_jp(b"", "");

        // ASCII
        decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}");

        // Half-width
        decode_euc_jp(b"\x8E\xA1", "\u{FF61}");
        decode_euc_jp(b"\x8E\xDF", "\u{FF9F}");
        decode_euc_jp(b"\x8E\xA0", "\u{FFFD}");
        decode_euc_jp(b"\x8E\xE0", "\u{FFFD}");
        decode_euc_jp(b"\x8E\xFF", "\u{FFFD}");
        decode_euc_jp(b"\x8E", "\u{FFFD}");

        // JIS 0212
        decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}");
        decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}");
        decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}");
        decode_euc_jp(b"\x8F\xA1", "\u{FFFD}");
        decode_euc_jp(b"\x8F", "\u{FFFD}");

        // JIS 0208
        decode_euc_jp(b"\xA1\xA1", "\u{3000}");
        decode_euc_jp(b"\xA1\xA0", "\u{FFFD}");
        decode_euc_jp(b"\xFC\xFE", "\u{FF02}");
        decode_euc_jp(b"\xFE\xFE", "\u{FFFD}");
        decode_euc_jp(b"\xA1", "\u{FFFD}");

        // Bad leads: 0xFF, 0xA0 and every byte in 0x80..=0x8D, each followed
        // by a valid two-byte sequence that must still decode.
        decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}");
        decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}");
        for bad_lead in 0x80u8..=0x8D {
            decode_euc_jp(&[bad_lead, 0xA1, 0xA1], "\u{FFFD}\u{3000}");
        }

        // Bad ASCII trail
        decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
    }

    #[test]
    fn test_euc_jp_encode() {
        // Empty
        encode_euc_jp("", b"");

        // ASCII
        encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62");

        // Exceptional code points
        encode_euc_jp("\u{00A5}", b"\x5C");
        encode_euc_jp("\u{203E}", b"\x7E");
        encode_euc_jp("\u{2212}", b"\xA1\xDD");

        // Half-width
        encode_euc_jp("\u{FF61}", b"\x8E\xA1");
        encode_euc_jp("\u{FF9F}", b"\x8E\xDF");

        // JIS 0212
        encode_euc_jp("\u{02D8}", b"&#728;");

        // JIS 0208
        encode_euc_jp("\u{3000}", b"\xA1\xA1");
        encode_euc_jp("\u{FF02}", b"\xFC\xFE");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_decode_all() {
        let input = include_bytes!("test_data/jis0208_in.txt");
        let expectation = include_str!("test_data/jis0208_in_ref.txt");
        let (decoded, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0208_encode_all() {
        let input = include_str!("test_data/jis0208_out.txt");
        let expectation = include_bytes!("test_data/jis0208_out_ref.txt");
        let (encoded, used_encoding, had_errors) = EUC_JP.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, EUC_JP);
        assert_eq!(&encoded[..], &expectation[..]);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_jis0212_decode_all() {
        let input = include_bytes!("test_data/jis0212_in.txt");
        let expectation = include_str!("test_data/jis0212_in_ref.txt");
        let (decoded, had_errors) = EUC_JP.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], expectation);
    }
}
470