1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range16;
16
/// States of the ISO-2022-JP decoder state machine.
///
/// The first four variants double as the "output state", i.e. the
/// character set that the most recent valid escape sequence selected.
/// The remaining three are transient states that only `decoder_state`
/// takes.
#[derive(Copy, Clone, PartialEq)]
enum Iso2022JpDecoderState {
    /// ASCII; the initial state, also selected by `ESC ( B`.
    Ascii,
    /// JIS X 0201 Roman (0x5C is U+00A5, 0x7E is U+203E); selected by
    /// `ESC ( J`.
    Roman,
    /// Half-width katakana (0x21..=0x5F map to U+FF61..=U+FF9F);
    /// selected by `ESC ( I`.
    Katakana,
    /// Expecting the first byte of a two-byte JIS X 0208 sequence;
    /// selected by `ESC $ @` or `ESC $ B`.
    LeadByte,
    /// A lead byte is stored in `lead`; expecting the second byte.
    TrailByte,
    /// `ESC` (0x1B) has been seen; expecting `$` (0x24) or `(` (0x28).
    EscapeStart,
    /// `ESC` plus `$` or `(` has been seen (the latter stored in `lead`);
    /// expecting the final byte of the escape sequence.
    Escape,
}
27
/// Streaming decoder for ISO-2022-JP.
pub struct Iso2022JpDecoder {
    /// Current state of the state machine.
    decoder_state: Iso2022JpDecoderState,
    /// Character set selected by the last complete escape sequence; the
    /// state machine falls back to this when an escape sequence turns out
    /// to be invalid or a two-byte sequence is interrupted.
    output_state: Iso2022JpDecoderState, // only takes 1 of first 4 values
    /// Either the JIS X 0208 lead byte (while in `TrailByte`) or the byte
    /// following `ESC` (`$` or `(`, while in `Escape`); zero otherwise.
    lead: u8,
    /// Set when an escape sequence has just been completed and cleared as
    /// soon as a non-escape byte is processed. Used to flag an escape
    /// sequence that immediately follows another one as malformed.
    output_flag: bool,
    /// When `true`, the byte held in `lead` was consumed as part of a
    /// failed escape sequence and still has to be processed as ordinary
    /// input at the start of the next decode call.
    pending_prepended: bool,
}
35
36impl Iso2022JpDecoder {
37 pub fn new() -> VariantDecoder {
38 VariantDecoder::Iso2022Jp(Iso2022JpDecoder {
39 decoder_state: Iso2022JpDecoderState::Ascii,
40 output_state: Iso2022JpDecoderState::Ascii,
41 lead: 0u8,
42 output_flag: false,
43 pending_prepended: false,
44 })
45 }
46
47 pub fn in_neutral_state(&self) -> bool {
48 self.decoder_state == Iso2022JpDecoderState::Ascii
49 && self.output_state == Iso2022JpDecoderState::Ascii
50 && self.lead == 0u8
51 && !self.output_flag
52 && !self.pending_prepended
53 }
54
55 fn extra_to_input_from_state(&self, byte_length: usize) -> Option<usize> {
56 byte_length.checked_add(
57 if self.lead == 0 || self.pending_prepended {
58 0
59 } else {
60 1
61 } + match self.decoder_state {
62 Iso2022JpDecoderState::Escape | Iso2022JpDecoderState::EscapeStart => 1,
63 _ => 0,
64 },
65 )
66 }
67
68 fn extra_to_output_from_state(&self) -> usize {
69 if self.lead != 0 && self.pending_prepended {
70 1 + self.output_flag as usize
71 } else {
72 self.output_flag as usize
73 }
74 }
75
76 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
77 checked_add(
78 self.extra_to_output_from_state(),
79 self.extra_to_input_from_state(byte_length),
80 )
81 }
82
    /// Returns an upper bound for the number of UTF-8 bytes needed to
    /// decode `byte_length` more input bytes without replacement, or
    /// `None` on overflow.
    ///
    /// Delegates to `max_utf8_buffer_length`, so the bound is the same
    /// with and without replacement.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        // worst case: 1 to 3 (half-width katakana)
        self.max_utf8_buffer_length(byte_length)
    }
87
88 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
89 checked_mul(
90 3,
91 checked_add(
92 self.extra_to_output_from_state(),
93 self.extra_to_input_from_state(byte_length),
94 ),
95 )
96 }
97
    // Generates the decode entry points from the crate-level
    // `decoder_functions!` macro.
    //
    // NOTE(review): the macro is defined elsewhere. The roles given to the
    // four code blocks below (call preamble, empty loop preamble,
    // end-of-input handler, per-byte body) are inferred from their
    // contents and from the "preamble" comment in the `Escape` arm;
    // confirm against the macro definition. `DecoderResult::Malformed(a, b)`
    // presumably reports a malformed sequence of `a` bytes that ended `b`
    // bytes before the current read position.
    decoder_functions!(
        {
            // Call preamble: if the previous call ended with a failed
            // escape sequence, the byte after ESC is still in `lead` and
            // has to be decoded as ordinary input first.
            if self.pending_prepended {
                // lead was set in EscapeStart and "prepended"
                // in Escape.
                debug_assert!(self.lead == 0x24u8 || self.lead == 0x28u8);
                match dest.check_space_bmp() {
                    Space::Full(_) => {
                        // Nothing was read or written; state is untouched
                        // so the caller can retry with more space.
                        return (DecoderResult::OutputFull, 0, 0);
                    }
                    Space::Available(destination_handle) => {
                        self.pending_prepended = false;
                        self.output_flag = false;
                        match self.decoder_state {
                            Iso2022JpDecoderState::Ascii | Iso2022JpDecoderState::Roman => {
                                // `$` and `(` are the same in ASCII and Roman.
                                destination_handle.write_ascii(self.lead);
                                self.lead = 0x0u8;
                            }
                            Iso2022JpDecoderState::Katakana => {
                                destination_handle
                                    .write_upper_bmp(u16::from(self.lead) - 0x21u16 + 0xFF61u16);
                                self.lead = 0x0u8;
                            }
                            Iso2022JpDecoderState::LeadByte => {
                                // `lead` stays set: the byte becomes the
                                // lead of a two-byte sequence.
                                self.decoder_state = Iso2022JpDecoderState::TrailByte;
                            }
                            _ => unreachable!(),
                        }
                    }
                }
            }
        },
        {},
        {
            // End of input: an unfinished two-byte or escape sequence is
            // an error; the state falls back to the selected character set.
            match self.decoder_state {
                Iso2022JpDecoderState::TrailByte | Iso2022JpDecoderState::EscapeStart => {
                    self.decoder_state = self.output_state;
                    return (DecoderResult::Malformed(1, 0), src_consumed, dest.written());
                }
                Iso2022JpDecoderState::Escape => {
                    // The lone ESC is the error; the byte after it is
                    // kept in `lead` for the preamble of the next call.
                    self.pending_prepended = true;
                    self.decoder_state = self.output_state;
                    return (DecoderResult::Malformed(1, 1), src_consumed, dest.written());
                }
                _ => {}
            }
        },
        {
            // Per-byte body: `b` is the current input byte.
            match self.decoder_state {
                Iso2022JpDecoderState::Ascii => {
                    if b == 0x1Bu8 {
                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                        continue;
                    }
                    self.output_flag = false;
                    // Non-ASCII bytes and SO/SI (0x0E/0x0F) are malformed.
                    if b > 0x7Fu8 || b == 0x0Eu8 || b == 0x0Fu8 {
                        return (
                            DecoderResult::Malformed(1, 0),
                            unread_handle.consumed(),
                            destination_handle.written(),
                        );
                    }
                    destination_handle.write_ascii(b);
                    continue;
                }
                Iso2022JpDecoderState::Roman => {
                    if b == 0x1Bu8 {
                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                        continue;
                    }
                    self.output_flag = false;
                    // Roman differs from ASCII in two positions:
                    // 0x5C is YEN SIGN and 0x7E is OVERLINE.
                    if b == 0x5Cu8 {
                        destination_handle.write_mid_bmp(0x00A5u16);
                        continue;
                    }
                    if b == 0x7Eu8 {
                        destination_handle.write_upper_bmp(0x203Eu16);
                        continue;
                    }
                    if b > 0x7Fu8 || b == 0x0Eu8 || b == 0x0Fu8 {
                        return (
                            DecoderResult::Malformed(1, 0),
                            unread_handle.consumed(),
                            destination_handle.written(),
                        );
                    }
                    destination_handle.write_ascii(b);
                    continue;
                }
                Iso2022JpDecoderState::Katakana => {
                    if b == 0x1Bu8 {
                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                        continue;
                    }
                    self.output_flag = false;
                    // 0x21..=0x5F map linearly onto U+FF61..=U+FF9F.
                    if b >= 0x21u8 && b <= 0x5Fu8 {
                        destination_handle.write_upper_bmp(u16::from(b) - 0x21u16 + 0xFF61u16);
                        continue;
                    }
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                Iso2022JpDecoderState::LeadByte => {
                    if b == 0x1Bu8 {
                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                        continue;
                    }
                    self.output_flag = false;
                    if b >= 0x21u8 && b <= 0x7Eu8 {
                        // Remember the lead and wait for the trail byte.
                        self.lead = b;
                        self.decoder_state = Iso2022JpDecoderState::TrailByte;
                        continue;
                    }
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                Iso2022JpDecoderState::TrailByte => {
                    if b == 0x1Bu8 {
                        self.decoder_state = Iso2022JpDecoderState::EscapeStart;
                        // The byte in error is the previous
                        // lead byte.
                        return (
                            DecoderResult::Malformed(1, 1),
                            unread_handle.consumed(),
                            destination_handle.written(),
                        );
                    }
                    self.decoder_state = Iso2022JpDecoderState::LeadByte;
                    let jis0208_lead_minus_offset = self.lead - 0x21;
                    let byte = b;
                    let handle = destination_handle;
                    // The code below uses else after continue in
                    // order to retain the structure seen in EUC-JP.
                    let trail_minus_offset = byte.wrapping_sub(0x21);
                    // Fast-track Hiragana (60% according to Lunde)
                    // and Katakana (10% according to Lunde).
                    if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
                        // Hiragana
                        handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset));
                        continue;
                    } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
                        // Katakana
                        handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset));
                        continue;
                    } else if trail_minus_offset > (0xFE - 0xA1) {
                        // Trail byte outside 0x21..=0x7E (wrapping makes
                        // bytes below 0x21 large): both bytes are in error.
                        return (
                            DecoderResult::Malformed(2, 0),
                            unread_handle.consumed(),
                            handle.written(),
                        );
                    } else {
                        // General case: compute the JIS X 0208 pointer and
                        // try the lookup tables in turn.
                        let pointer =
                            mul_94(jis0208_lead_minus_offset) + trail_minus_offset as usize;
                        let level1_pointer = pointer.wrapping_sub(1410);
                        if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
                            handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer]);
                            continue;
                        } else {
                            let level2_pointer = pointer.wrapping_sub(4418);
                            if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
                                handle.write_upper_bmp(
                                    JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer],
                                );
                                continue;
                            } else {
                                let ibm_pointer = pointer.wrapping_sub(8272);
                                if ibm_pointer < IBM_KANJI.len() {
                                    handle.write_upper_bmp(IBM_KANJI[ibm_pointer]);
                                    continue;
                                } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
                                    handle.write_bmp_excl_ascii(bmp);
                                    continue;
                                } else if let Some(bmp) = jis0208_range_decode(pointer) {
                                    handle.write_bmp_excl_ascii(bmp);
                                    continue;
                                } else {
                                    // Unassigned pointer.
                                    return (
                                        DecoderResult::Malformed(2, 0),
                                        unread_handle.consumed(),
                                        handle.written(),
                                    );
                                }
                            }
                        }
                    }
                }
                Iso2022JpDecoderState::EscapeStart => {
                    // Only `$` and `(` can continue an escape sequence.
                    if b == 0x24u8 || b == 0x28u8 {
                        self.lead = b;
                        self.decoder_state = Iso2022JpDecoderState::Escape;
                        continue;
                    }
                    self.output_flag = false;
                    self.decoder_state = self.output_state;
                    // The lone ESC is malformed; the current byte is
                    // unread so that it gets decoded as ordinary input.
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle.unread(),
                        destination_handle.written(),
                    );
                }
                Iso2022JpDecoderState::Escape => {
                    // Map the two bytes after ESC to the character set
                    // they select, if any:
                    //   ( B -> ASCII      ( J -> Roman
                    //   ( I -> Katakana   $ @ / $ B -> JIS X 0208
                    let mut state: Option<Iso2022JpDecoderState> = None;
                    if self.lead == 0x28u8 && b == 0x42u8 {
                        state = Some(Iso2022JpDecoderState::Ascii);
                    } else if self.lead == 0x28u8 && b == 0x4Au8 {
                        state = Some(Iso2022JpDecoderState::Roman);
                    } else if self.lead == 0x28u8 && b == 0x49u8 {
                        state = Some(Iso2022JpDecoderState::Katakana);
                    } else if self.lead == 0x24u8 && (b == 0x40u8 || b == 0x42u8) {
                        state = Some(Iso2022JpDecoderState::LeadByte);
                    }
                    match state {
                        Some(s) => {
                            self.lead = 0x0u8;
                            self.decoder_state = s;
                            self.output_state = s;
                            let flag = self.output_flag;
                            self.output_flag = true;
                            if flag {
                                // We had an escape sequence
                                // immediately following another
                                // escape sequence. Therefore,
                                // the first one of these was
                                // useless.
                                return (
                                    DecoderResult::Malformed(3, 3),
                                    unread_handle.consumed(),
                                    destination_handle.written(),
                                );
                            }
                            continue;
                        }
                        None => {
                            // self.lead is still the previous
                            // byte. It will be processed in
                            // the preamble upon next call.
                            self.pending_prepended = true;
                            self.output_flag = false;
                            self.decoder_state = self.output_state;
                            // The byte in error is not the
                            // current or the previous byte but
                            // the one before those (lone 0x1B).
                            return (
                                DecoderResult::Malformed(1, 1),
                                unread_handle.unread(),
                                destination_handle.written(),
                            );
                        }
                    }
                }
            }
        },
        self,
        src_consumed,
        dest,
        source,
        b,
        destination_handle,
        unread_handle,
        check_space_bmp
    );
365}
366
/// Returns `true` if `bmp` is a kanji that has a JIS X 0208 mapping
/// (table-driven variant used with the `fast-kanji-encode` feature).
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn is_kanji_mapped(bmp: u16) -> bool {
    // Only the existence of a mapping matters here, not the byte values,
    // so the Shift_JIS flavor of the lookup is as good as any.
    match jis0208_kanji_shift_jis_encode(bmp) {
        Some(_) => true,
        None => false,
    }
}
374
375#[cfg(not(feature = "fast-kanji-encode"))]
376#[cfg_attr(
377 feature = "cargo-clippy",
378 allow(if_let_redundant_pattern_matching, if_same_then_else)
379)]
380#[inline(always)]
381fn is_kanji_mapped(bmp: u16) -> bool {
382 if 0x4EDD == bmp {
383 true
384 } else if let Some(_) = jis0208_level1_kanji_shift_jis_encode(bmp) {
385 // Use the shift_jis variant, because we don't care about the
386 // byte values here.
387 true
388 } else if let Some(_) = jis0208_level2_and_additional_kanji_encode(bmp) {
389 true
390 } else if let Some(_) = position(&IBM_KANJI[..], needle:bmp) {
391 true
392 } else {
393 false
394 }
395}
396
397#[cfg_attr(
398 feature = "cargo-clippy",
399 allow(if_let_redundant_pattern_matching, if_same_then_else)
400)]
401fn is_mapped_for_two_byte_encode(bmp: u16) -> bool {
402 // The code below uses else after return to
403 // keep the same structure as in EUC-JP.
404 // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
405 let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
406 if bmp_minus_hiragana < 0x53 {
407 true
408 } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
409 is_kanji_mapped(bmp)
410 } else {
411 let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
412 if bmp_minus_katakana < 0x56 {
413 true
414 } else {
415 let bmp_minus_space = bmp.wrapping_sub(0x3000);
416 if bmp_minus_space < 3 {
417 // fast-track common punctuation
418 true
419 } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
420 true
421 } else if bmp == 0x2212 {
422 true
423 } else if let Some(_) = jis0208_range_encode(bmp) {
424 true
425 } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 || bmp == 0xF9DC {
426 true
427 } else if let Some(_) = ibm_symbol_encode(bmp) {
428 true
429 } else if let Some(_) = jis0208_symbol_encode(bmp) {
430 true
431 } else {
432 false
433 }
434 }
435 }
436}
437
/// Encodes the kanji `bmp` as an ISO-2022-JP `(lead, trail)` byte pair,
/// or returns `None` if it has no mapping (variant used with the
/// `fast-kanji-encode` feature; delegates to
/// `jis0208_kanji_iso_2022_jp_encode`).
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_iso_2022_jp_encode(bmp)
}
443
444#[cfg(not(feature = "fast-kanji-encode"))]
445#[inline(always)]
446fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
447 if 0x4EDD == bmp {
448 // Ideograph on the symbol row!
449 Some((0x21, 0xB8 - 0x80))
450 } else if let Some((lead: u8, trail: u8)) = jis0208_level1_kanji_iso_2022_jp_encode(bmp) {
451 Some((lead, trail))
452 } else if let Some(pos: usize) = jis0208_level2_and_additional_kanji_encode(bmp) {
453 let lead: usize = (pos / 94) + (0xD0 - 0x80);
454 let trail: usize = (pos % 94) + 0x21;
455 Some((lead as u8, trail as u8))
456 } else if let Some(pos: usize) = position(&IBM_KANJI[..], needle:bmp) {
457 let lead: usize = (pos / 94) + (0xF9 - 0x80);
458 let trail: usize = (pos % 94) + 0x21;
459 Some((lead as u8, trail as u8))
460 } else {
461 None
462 }
463}
464
/// Character set the encoder has currently switched its output to.
enum Iso2022JpEncoderState {
    /// ASCII; the initial state, entered by emitting `ESC ( B`.
    Ascii,
    /// JIS X 0201 Roman, entered by emitting `ESC ( J`; used for U+00A5
    /// and U+203E.
    Roman,
    /// Two-byte JIS X 0208, entered by emitting `ESC $ B`.
    Jis0208,
}
470
/// Streaming encoder for ISO-2022-JP.
pub struct Iso2022JpEncoder {
    /// Character set selected by the most recently emitted escape
    /// sequence (ASCII if none has been emitted).
    state: Iso2022JpEncoderState,
}
474
475impl Iso2022JpEncoder {
476 pub fn new(encoding: &'static Encoding) -> Encoder {
477 Encoder::new(
478 encoding,
479 VariantEncoder::Iso2022Jp(Iso2022JpEncoder {
480 state: Iso2022JpEncoderState::Ascii,
481 }),
482 )
483 }
484
485 pub fn has_pending_state(&self) -> bool {
486 match self.state {
487 Iso2022JpEncoderState::Ascii => false,
488 _ => true,
489 }
490 }
491
492 pub fn max_buffer_length_from_utf16_without_replacement(
493 &self,
494 u16_length: usize,
495 ) -> Option<usize> {
496 // Worst case: every other character is ASCII/Roman and every other
497 // JIS0208.
498 // Two UTF-16 input units:
499 // Transition to Roman: 3
500 // Roman/ASCII: 1
501 // Transition to JIS0208: 3
502 // JIS0208: 2
503 // End transition: 3
504 checked_add_opt(
505 checked_add(3, u16_length.checked_mul(4)),
506 checked_div(u16_length.checked_add(1), 2),
507 )
508 }
509
510 pub fn max_buffer_length_from_utf8_without_replacement(
511 &self,
512 byte_length: usize,
513 ) -> Option<usize> {
514 // Worst case: every other character is ASCII/Roman and every other
515 // JIS0208.
516 // Three UTF-8 input units: 1 ASCII, 2 JIS0208
517 // Transition to ASCII: 3
518 // Roman/ASCII: 1
519 // Transition to JIS0208: 3
520 // JIS0208: 2
521 // End transition: 3
522 checked_add(3, byte_length.checked_mul(3))
523 }
524
    // Generates the encode entry points from the crate-level
    // `encoder_functions!` macro.
    //
    // NOTE(review): the macro is defined elsewhere. The first block
    // presumably runs once the input is exhausted on the last call
    // (it restores the ASCII state), and the second block is the
    // per-character body with `c` as the current `char`; confirm against
    // the macro definition.
    //
    // Escape sequences written below:
    //   ESC ( B  (0x1B 0x28 0x42)  -> ASCII
    //   ESC ( J  (0x1B 0x28 0x4A)  -> JIS X 0201 Roman
    //   ESC $ B  (0x1B 0x24 0x42)  -> JIS X 0208
    encoder_functions!(
        {
            // If output is not in the ASCII state, switch back to ASCII.
            match self.state {
                Iso2022JpEncoderState::Ascii => {}
                _ => match dest.check_space_three() {
                    Space::Full(dst_written) => {
                        return (EncoderResult::OutputFull, src_consumed, dst_written);
                    }
                    Space::Available(destination_handle) => {
                        self.state = Iso2022JpEncoderState::Ascii;
                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
                    }
                },
            }
        },
        {
            match self.state {
                Iso2022JpEncoderState::Ascii => {
                    // SO, SI and ESC would be interpreted as control
                    // functions by a decoder, so they are reported as
                    // unmappable U+FFFD instead of being written.
                    if c == '\u{0E}' || c == '\u{0F}' || c == '\u{1B}' {
                        return (
                            EncoderResult::Unmappable('\u{FFFD}'),
                            unread_handle.consumed(),
                            destination_handle.written(),
                        );
                    }
                    if c <= '\u{7F}' {
                        destination_handle.write_one(c as u8);
                        continue;
                    }
                    // YEN SIGN and OVERLINE need the Roman set: switch
                    // state, then unread so `c` is handled in that state.
                    if c == '\u{A5}' || c == '\u{203E}' {
                        self.state = Iso2022JpEncoderState::Roman;
                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x4Au8);
                        unread_handle.unread();
                        continue;
                    }
                    // Nothing outside the BMP is encodable.
                    if c > '\u{FFFF}' {
                        return (
                            EncoderResult::Unmappable(c),
                            unread_handle.consumed(),
                            destination_handle.written(),
                        );
                    }
                    // Yes, if c is in index, we'll search
                    // again in the Jis0208 state, but this
                    // encoder is not worth optimizing.
                    if is_mapped_for_two_byte_encode(c as u16) {
                        self.state = Iso2022JpEncoderState::Jis0208;
                        destination_handle.write_three(0x1Bu8, 0x24u8, 0x42u8);
                        unread_handle.unread();
                        continue;
                    }
                    return (
                        EncoderResult::Unmappable(c),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                Iso2022JpEncoderState::Roman => {
                    if c == '\u{0E}' || c == '\u{0F}' || c == '\u{1B}' {
                        return (
                            EncoderResult::Unmappable('\u{FFFD}'),
                            unread_handle.consumed(),
                            destination_handle.written(),
                        );
                    }
                    // Backslash and tilde do not exist in Roman (their
                    // byte values mean YEN SIGN and OVERLINE there), so
                    // switch back to ASCII and retry.
                    if c == '\u{5C}' || c == '\u{7E}' {
                        self.state = Iso2022JpEncoderState::Ascii;
                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
                        unread_handle.unread();
                        continue;
                    }
                    if c <= '\u{7F}' {
                        destination_handle.write_one(c as u8);
                        continue;
                    }
                    if c == '\u{A5}' {
                        destination_handle.write_one(0x5Cu8);
                        continue;
                    }
                    if c == '\u{203E}' {
                        destination_handle.write_one(0x7Eu8);
                        continue;
                    }
                    if c > '\u{FFFF}' {
                        return (
                            EncoderResult::Unmappable(c),
                            unread_handle.consumed(),
                            destination_handle.written(),
                        );
                    }
                    // Yes, if c is in index, we'll search
                    // again in the Jis0208 state, but this
                    // encoder is not worth optimizing.
                    if is_mapped_for_two_byte_encode(c as u16) {
                        self.state = Iso2022JpEncoderState::Jis0208;
                        destination_handle.write_three(0x1Bu8, 0x24u8, 0x42u8);
                        unread_handle.unread();
                        continue;
                    }
                    return (
                        EncoderResult::Unmappable(c),
                        unread_handle.consumed(),
                        destination_handle.written(),
                    );
                }
                Iso2022JpEncoderState::Jis0208 => {
                    // ASCII (including SO/SI/ESC, which the Ascii state
                    // then rejects) is handled after switching to ASCII.
                    if c <= '\u{7F}' {
                        self.state = Iso2022JpEncoderState::Ascii;
                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x42u8);
                        unread_handle.unread();
                        continue;
                    }
                    if c == '\u{A5}' || c == '\u{203E}' {
                        self.state = Iso2022JpEncoderState::Roman;
                        destination_handle.write_three(0x1Bu8, 0x28u8, 0x4Au8);
                        unread_handle.unread();
                        continue;
                    }
                    if c > '\u{FFFF}' {
                        // Transition to ASCII here in order
                        // not to make it the responsibility
                        // of the caller.
                        self.state = Iso2022JpEncoderState::Ascii;
                        return (
                            EncoderResult::Unmappable(c),
                            unread_handle.consumed(),
                            destination_handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8),
                        );
                    }
                    let bmp = c as u16;
                    let handle = destination_handle;
                    // The code below uses else after continue to
                    // keep the same structure as in EUC-JP.
                    // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
                    let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
                    if bmp_minus_hiragana < 0x53 {
                        // Hiragana: row lead 0x24, trail linear from 0x21.
                        handle.write_two(0x24, 0x21 + bmp_minus_hiragana as u8);
                        continue;
                    } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
                        if let Some((lead, trail)) = encode_kanji(bmp) {
                            handle.write_two(lead, trail);
                            continue;
                        } else {
                            // Unmappable kanji: return to ASCII before
                            // reporting, as for non-BMP characters above.
                            self.state = Iso2022JpEncoderState::Ascii;
                            return (
                                EncoderResult::Unmappable(c),
                                unread_handle.consumed(),
                                handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8),
                            );
                        }
                    } else {
                        let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
                        if bmp_minus_katakana < 0x56 {
                            // Katakana: row lead 0x25, trail linear from 0x21.
                            handle.write_two(0x25, 0x21 + bmp_minus_katakana as u8);
                            continue;
                        } else {
                            let bmp_minus_space = bmp.wrapping_sub(0x3000);
                            if bmp_minus_space < 3 {
                                // fast-track common punctuation
                                handle.write_two(0x21, 0x21 + bmp_minus_space as u8);
                                continue;
                            }
                            let bmp_minus_half_width = bmp.wrapping_sub(0xFF61);
                            if bmp_minus_half_width <= (0xFF9F - 0xFF61) {
                                // We have half-width katakana. The lead is either
                                // row 1 or 5 of JIS X 0208, so the lookup table
                                // only stores the trail.
                                let lead =
                                    if bmp != 0xFF70 && in_inclusive_range16(bmp, 0xFF66, 0xFF9D) {
                                        0x25u8
                                    } else {
                                        0x21u8
                                    };
                                let trail =
                                    ISO_2022_JP_HALF_WIDTH_TRAIL[bmp_minus_half_width as usize];
                                handle.write_two(lead, trail);
                                continue;
                            } else if bmp == 0x2212 {
                                // MINUS SIGN
                                handle.write_two(0x21, 0x5D);
                                continue;
                            } else if let Some(pointer) = jis0208_range_encode(bmp) {
                                // Pointer to (lead, trail): 94 cells per row,
                                // both offset by 0x21.
                                let lead = (pointer / 94) + 0x21;
                                let trail = (pointer % 94) + 0x21;
                                handle.write_two(lead as u8, trail as u8);
                                continue;
                            } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
                                || bmp == 0xF929
                                || bmp == 0xF9DC
                            {
                                // Guaranteed to be found in IBM_KANJI
                                let pos = position(&IBM_KANJI[..], bmp).unwrap();
                                let lead = (pos / 94) + (0xF9 - 0x80);
                                let trail = (pos % 94) + 0x21;
                                handle.write_two(lead as u8, trail as u8);
                                continue;
                            } else if let Some(pointer) = ibm_symbol_encode(bmp) {
                                let lead = (pointer / 94) + 0x21;
                                let trail = (pointer % 94) + 0x21;
                                handle.write_two(lead as u8, trail as u8);
                                continue;
                            } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
                                let lead = (pointer / 94) + 0x21;
                                let trail = (pointer % 94) + 0x21;
                                handle.write_two(lead as u8, trail as u8);
                                continue;
                            } else {
                                // No mapping at all: return to ASCII and
                                // report the character as unmappable.
                                self.state = Iso2022JpEncoderState::Ascii;
                                return (
                                    EncoderResult::Unmappable(c),
                                    unread_handle.consumed(),
                                    handle.write_three_return_written(0x1Bu8, 0x28u8, 0x42u8),
                                );
                            }
                        }
                    }
                }
            }
        },
        self,
        src_consumed,
        source,
        dest,
        c,
        destination_handle,
        unread_handle,
        check_space_three
    );
752}
753
754// Any copyright to the test code below this comment is dedicated to the
755// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
756
757#[cfg(all(test, feature = "alloc"))]
758mod tests {
759 use super::super::testing::*;
760 use super::super::*;
761
    /// Runs the shared `decode` test helper for ISO-2022-JP with `bytes`
    /// as input and `expect` as the expected decoded text.
    fn decode_iso_2022_jp(bytes: &[u8], expect: &str) {
        decode(ISO_2022_JP, bytes, expect);
    }
765
    /// Runs the shared `encode` test helper for ISO-2022-JP with `string`
    /// as input and `expect` as the expected encoded bytes.
    fn encode_iso_2022_jp(string: &str, expect: &[u8]) {
        encode(ISO_2022_JP, string, expect);
    }
769
    /// Table-driven decoder test: each call feeds a byte sequence to the
    /// decoder and compares the result with the expected text, where
    /// U+FFFD marks input that must be reported as malformed.
    #[test]
    fn test_iso_2022_jp_decode() {
        // Empty
        decode_iso_2022_jp(b"", &"");

        // ASCII
        decode_iso_2022_jp(b"\x61\x62", "\u{0061}\u{0062}");
        decode_iso_2022_jp(b"\x7F\x0E\x0F", "\u{007F}\u{FFFD}\u{FFFD}");

        // Partial escapes
        decode_iso_2022_jp(b"\x1B", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B$", "\u{FFFD}$");
        decode_iso_2022_jp(b"\x1B(", "\u{FFFD}(");
        decode_iso_2022_jp(b"\x1B.", "\u{FFFD}.");

        // ISO escapes
        decode_iso_2022_jp(b"\x1B(B", ""); // ASCII
        decode_iso_2022_jp(b"\x1B(J", ""); // Roman
        decode_iso_2022_jp(b"\x1B$@", ""); // 0208
        decode_iso_2022_jp(b"\x1B$B", ""); // 0208
        decode_iso_2022_jp(b"\x1B$(D", "\u{FFFD}$(D"); // 2012
        decode_iso_2022_jp(b"\x1B$A", "\u{FFFD}$A"); // GB2312
        decode_iso_2022_jp(b"\x1B$(C", "\u{FFFD}$(C"); // KR
        decode_iso_2022_jp(b"\x1B.A", "\u{FFFD}.A"); // Latin-1
        decode_iso_2022_jp(b"\x1B.F", "\u{FFFD}.F"); // Greek
        decode_iso_2022_jp(b"\x1B(I", ""); // Half-width Katakana
        decode_iso_2022_jp(b"\x1B$(O", "\u{FFFD}$(O"); // 2013
        decode_iso_2022_jp(b"\x1B$(P", "\u{FFFD}$(P"); // 2013
        decode_iso_2022_jp(b"\x1B$(Q", "\u{FFFD}$(Q"); // 2013
        decode_iso_2022_jp(b"\x1B$)C", "\u{FFFD}$)C"); // KR
        decode_iso_2022_jp(b"\x1B$)A", "\u{FFFD}$)A"); // GB2312
        decode_iso_2022_jp(b"\x1B$)G", "\u{FFFD}$)G"); // CNS
        decode_iso_2022_jp(b"\x1B$*H", "\u{FFFD}$*H"); // CNS
        decode_iso_2022_jp(b"\x1B$)E", "\u{FFFD}$)E"); // IR
        decode_iso_2022_jp(b"\x1B$+I", "\u{FFFD}$+I"); // CNS
        decode_iso_2022_jp(b"\x1B$+J", "\u{FFFD}$+J"); // CNS
        decode_iso_2022_jp(b"\x1B$+K", "\u{FFFD}$+K"); // CNS
        decode_iso_2022_jp(b"\x1B$+L", "\u{FFFD}$+L"); // CNS
        decode_iso_2022_jp(b"\x1B$+M", "\u{FFFD}$+M"); // CNS
        decode_iso_2022_jp(b"\x1B$(@", "\u{FFFD}$(@"); // 0208
        decode_iso_2022_jp(b"\x1B$(A", "\u{FFFD}$(A"); // GB2312
        decode_iso_2022_jp(b"\x1B$(B", "\u{FFFD}$(B"); // 0208
        decode_iso_2022_jp(b"\x1B%G", "\u{FFFD}%G"); // UTF-8

        // ASCII
        decode_iso_2022_jp(b"\x5B", "\u{005B}");
        decode_iso_2022_jp(b"\x5C", "\u{005C}");
        decode_iso_2022_jp(b"\x7E", "\u{007E}");
        decode_iso_2022_jp(b"\x0E", "\u{FFFD}");
        decode_iso_2022_jp(b"\x0F", "\u{FFFD}");
        decode_iso_2022_jp(b"\x80", "\u{FFFD}");
        decode_iso_2022_jp(b"\xFF", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(B\x5B", "\u{005B}");
        decode_iso_2022_jp(b"\x1B(B\x5C", "\u{005C}");
        decode_iso_2022_jp(b"\x1B(B\x7E", "\u{007E}");
        decode_iso_2022_jp(b"\x1B(B\x0E", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(B\x0F", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(B\x80", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(B\xFF", "\u{FFFD}");

        // Roman
        decode_iso_2022_jp(b"\x1B(J\x5B", "\u{005B}");
        decode_iso_2022_jp(b"\x1B(J\x5C", "\u{00A5}");
        decode_iso_2022_jp(b"\x1B(J\x7E", "\u{203E}");
        decode_iso_2022_jp(b"\x1B(J\x0E", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(J\x0F", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(J\x80", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(J\xFF", "\u{FFFD}");

        // Katakana
        decode_iso_2022_jp(b"\x1B(I\x20", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(I\x21", "\u{FF61}");
        decode_iso_2022_jp(b"\x1B(I\x5F", "\u{FF9F}");
        decode_iso_2022_jp(b"\x1B(I\x60", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(I\x0E", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(I\x0F", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(I\x80", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(I\xFF", "\u{FFFD}");

        // 0208 differences from 1978 to 1983
        decode_iso_2022_jp(b"\x1B$@\x54\x64", "\u{58FA}");
        decode_iso_2022_jp(b"\x1B$@\x44\x5B", "\u{58F7}");
        decode_iso_2022_jp(b"\x1B$@\x74\x21", "\u{582F}");
        decode_iso_2022_jp(b"\x1B$@\x36\x46", "\u{5C2D}");
        decode_iso_2022_jp(b"\x1B$@\x28\x2E", "\u{250F}");
        decode_iso_2022_jp(b"\x1B$B\x54\x64", "\u{58FA}");
        decode_iso_2022_jp(b"\x1B$B\x44\x5B", "\u{58F7}");
        decode_iso_2022_jp(b"\x1B$B\x74\x21", "\u{582F}");
        decode_iso_2022_jp(b"\x1B$B\x36\x46", "\u{5C2D}");
        decode_iso_2022_jp(b"\x1B$B\x28\x2E", "\u{250F}");

        // Broken 0208
        decode_iso_2022_jp(b"\x1B$B\x28\x41", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B$@\x80\x54\x64", "\u{FFFD}\u{58FA}");
        decode_iso_2022_jp(b"\x1B$B\x28\x80", "\u{FFFD}");

        if cfg!(miri) {
            // Miri is too slow
            return;
        }

        // Transitions
        decode_iso_2022_jp(b"\x1B(B\x5C\x1B(J\x5C", "\u{005C}\u{00A5}");
        decode_iso_2022_jp(b"\x1B(B\x5C\x1B(I\x21", "\u{005C}\u{FF61}");
        decode_iso_2022_jp(b"\x1B(B\x5C\x1B$@\x54\x64", "\u{005C}\u{58FA}");
        decode_iso_2022_jp(b"\x1B(B\x5C\x1B$B\x54\x64", "\u{005C}\u{58FA}");

        decode_iso_2022_jp(b"\x1B(J\x5C\x1B(B\x5C", "\u{00A5}\u{005C}");
        decode_iso_2022_jp(b"\x1B(J\x5C\x1B(I\x21", "\u{00A5}\u{FF61}");
        decode_iso_2022_jp(b"\x1B(J\x5C\x1B$@\x54\x64", "\u{00A5}\u{58FA}");
        decode_iso_2022_jp(b"\x1B(J\x5C\x1B$B\x54\x64", "\u{00A5}\u{58FA}");

        decode_iso_2022_jp(b"\x1B(I\x21\x1B(J\x5C", "\u{FF61}\u{00A5}");
        decode_iso_2022_jp(b"\x1B(I\x21\x1B(B\x5C", "\u{FF61}\u{005C}");
        decode_iso_2022_jp(b"\x1B(I\x21\x1B$@\x54\x64", "\u{FF61}\u{58FA}");
        decode_iso_2022_jp(b"\x1B(I\x21\x1B$B\x54\x64", "\u{FF61}\u{58FA}");

        decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(J\x5C", "\u{58FA}\u{00A5}");
        decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(I\x21", "\u{58FA}\u{FF61}");
        decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B(B\x5C", "\u{58FA}\u{005C}");
        decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B$B\x54\x64", "\u{58FA}\u{58FA}");

        decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(J\x5C", "\u{58FA}\u{00A5}");
        decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(I\x21", "\u{58FA}\u{FF61}");
        decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B$@\x54\x64", "\u{58FA}\u{58FA}");
        decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B(B\x5C", "\u{58FA}\u{005C}");

        // Empty transitions
        decode_iso_2022_jp(b"\x1B(B\x1B(J", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(B\x1B(I", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(B\x1B$@", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(B\x1B$B", "\u{FFFD}");

        decode_iso_2022_jp(b"\x1B(J\x1B(B", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(J\x1B(I", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(J\x1B$@", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(J\x1B$B", "\u{FFFD}");

        decode_iso_2022_jp(b"\x1B(I\x1B(J", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(I\x1B(B", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(I\x1B$@", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B(I\x1B$B", "\u{FFFD}");

        decode_iso_2022_jp(b"\x1B$@\x1B(J", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B$@\x1B(I", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B$@\x1B(B", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B$@\x1B$B", "\u{FFFD}");

        decode_iso_2022_jp(b"\x1B$B\x1B(J", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B$B\x1B(I", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B$B\x1B$@", "\u{FFFD}");
        decode_iso_2022_jp(b"\x1B$B\x1B(B", "\u{FFFD}");

        // Transitions to self
        decode_iso_2022_jp(b"\x1B(B\x5C\x1B(B\x5C", "\u{005C}\u{005C}");
        decode_iso_2022_jp(b"\x1B(J\x5C\x1B(J\x5C", "\u{00A5}\u{00A5}");
        decode_iso_2022_jp(b"\x1B(I\x21\x1B(I\x21", "\u{FF61}\u{FF61}");
        decode_iso_2022_jp(b"\x1B$@\x54\x64\x1B$@\x54\x64", "\u{58FA}\u{58FA}");
        decode_iso_2022_jp(b"\x1B$B\x54\x64\x1B$B\x54\x64", "\u{58FA}\u{58FA}");
    }
930
    /// Table-driven encoder test: each call encodes a string and compares
    /// the result with the expected bytes. Unmappable characters are
    /// expected to show up as decimal numeric character references, and
    /// any output that left the ASCII state must end with `ESC ( B`.
    #[test]
    fn test_iso_2022_jp_encode() {
        // Empty
        encode_iso_2022_jp("", b"");

        // ASCII
        encode_iso_2022_jp("ab", b"ab");
        encode_iso_2022_jp("\u{1F4A9}", b"&#128169;");
        encode_iso_2022_jp("\x1B", b"&#65533;");
        encode_iso_2022_jp("\x0E", b"&#65533;");
        encode_iso_2022_jp("\x0F", b"&#65533;");

        // Roman
        encode_iso_2022_jp("a\u{00A5}b", b"a\x1B(J\x5Cb\x1B(B");
        encode_iso_2022_jp("a\u{203E}b", b"a\x1B(J\x7Eb\x1B(B");
        if !cfg!(miri) {
            // Miri is too slow
            encode_iso_2022_jp("a\u{00A5}b\x5C", b"a\x1B(J\x5Cb\x1B(B\x5C");
            encode_iso_2022_jp("a\u{203E}b\x7E", b"a\x1B(J\x7Eb\x1B(B\x7E");
            encode_iso_2022_jp("\u{00A5}\u{1F4A9}", b"\x1B(J\x5C&#128169;\x1B(B");
            encode_iso_2022_jp("\u{00A5}\x1B", b"\x1B(J\x5C&#65533;\x1B(B");
            encode_iso_2022_jp("\u{00A5}\x0E", b"\x1B(J\x5C&#65533;\x1B(B");
            encode_iso_2022_jp("\u{00A5}\x0F", b"\x1B(J\x5C&#65533;\x1B(B");
            encode_iso_2022_jp("\u{00A5}\u{58FA}", b"\x1B(J\x5C\x1B$B\x54\x64\x1B(B");
        }

        // Half-width Katakana
        encode_iso_2022_jp("\u{FF61}", b"\x1B$B\x21\x23\x1B(B");
        encode_iso_2022_jp("\u{FF65}", b"\x1B$B\x21\x26\x1B(B");
        if !cfg!(miri) {
            // Miri is too slow
            encode_iso_2022_jp("\u{FF66}", b"\x1B$B\x25\x72\x1B(B");
            encode_iso_2022_jp("\u{FF70}", b"\x1B$B\x21\x3C\x1B(B");
            encode_iso_2022_jp("\u{FF9D}", b"\x1B$B\x25\x73\x1B(B");
            encode_iso_2022_jp("\u{FF9E}", b"\x1B$B\x21\x2B\x1B(B");
            encode_iso_2022_jp("\u{FF9F}", b"\x1B$B\x21\x2C\x1B(B");
        }

        // 0208
        encode_iso_2022_jp("\u{58FA}", b"\x1B$B\x54\x64\x1B(B");
        encode_iso_2022_jp("\u{58FA}\u{250F}", b"\x1B$B\x54\x64\x28\x2E\x1B(B");
        if !cfg!(miri) {
            // Miri is too slow
            encode_iso_2022_jp("\u{58FA}\u{1F4A9}", b"\x1B$B\x54\x64\x1B(B&#128169;");
            encode_iso_2022_jp("\u{58FA}\x1B", b"\x1B$B\x54\x64\x1B(B&#65533;");
            encode_iso_2022_jp("\u{58FA}\x0E", b"\x1B$B\x54\x64\x1B(B&#65533;");
            encode_iso_2022_jp("\u{58FA}\x0F", b"\x1B$B\x54\x64\x1B(B&#65533;");
            encode_iso_2022_jp("\u{58FA}\u{00A5}", b"\x1B$B\x54\x64\x1B(J\x5C\x1B(B");
            encode_iso_2022_jp("\u{58FA}a", b"\x1B$B\x54\x64\x1B(Ba");
        }
    }
982
983 #[test]
984 #[cfg_attr(miri, ignore)] // Miri is too slow
985 fn test_iso_2022_jp_decode_all() {
986 let input = include_bytes!("test_data/iso_2022_jp_in.txt");
987 let expectation = include_str!("test_data/iso_2022_jp_in_ref.txt");
988 let (cow, had_errors) = ISO_2022_JP.decode_without_bom_handling(input);
989 assert!(had_errors, "Should have had errors.");
990 assert_eq!(&cow[..], expectation);
991 }
992
993 #[test]
994 #[cfg_attr(miri, ignore)] // Miri is too slow
995 fn test_iso_2022_jp_encode_all() {
996 let input = include_str!("test_data/iso_2022_jp_out.txt");
997 let expectation = include_bytes!("test_data/iso_2022_jp_out_ref.txt");
998 let (cow, encoding, had_errors) = ISO_2022_JP.encode(input);
999 assert!(!had_errors, "Should not have had errors.");
1000 assert_eq!(encoding, ISO_2022_JP);
1001 assert_eq!(&cow[..], &expectation[..]);
1002 }
1003
1004 #[test]
1005 fn test_iso_2022_jp_half_width_katakana_length() {
1006 let mut output = [0u8; 20];
1007 let mut decoder = ISO_2022_JP.new_decoder();
1008 {
1009 let (result, read, written) =
1010 decoder.decode_to_utf8_without_replacement(b"\x1B\x28\x49", &mut output, false);
1011 assert_eq!(result, DecoderResult::InputEmpty);
1012 assert_eq!(read, 3);
1013 assert_eq!(written, 0);
1014 }
1015 {
1016 let needed = decoder
1017 .max_utf8_buffer_length_without_replacement(1)
1018 .unwrap();
1019 let (result, read, written) =
1020 decoder.decode_to_utf8_without_replacement(b"\x21", &mut output[..needed], true);
1021 assert_eq!(result, DecoderResult::InputEmpty);
1022 assert_eq!(read, 1);
1023 assert_eq!(written, 3);
1024 assert_eq!(output[0], 0xEF);
1025 assert_eq!(output[1], 0xBD);
1026 assert_eq!(output[2], 0xA1);
1027 }
1028 }
1029
1030 #[test]
1031 fn test_iso_2022_jp_length_after_escape() {
1032 let mut output = [0u16; 20];
1033 let mut decoder = ISO_2022_JP.new_decoder();
1034 {
1035 let (result, read, written, had_errors) =
1036 decoder.decode_to_utf16(b"\x1B", &mut output, false);
1037 assert_eq!(result, CoderResult::InputEmpty);
1038 assert_eq!(read, 1);
1039 assert_eq!(written, 0);
1040 assert!(!had_errors);
1041 }
1042 {
1043 let needed = decoder.max_utf16_buffer_length(1).unwrap();
1044 let (result, read, written, had_errors) =
1045 decoder.decode_to_utf16(b"A", &mut output[..needed], true);
1046 assert_eq!(result, CoderResult::InputEmpty);
1047 assert_eq!(read, 1);
1048 assert_eq!(written, 2);
1049 assert!(had_errors);
1050 assert_eq!(output[0], 0xFFFD);
1051 assert_eq!(output[1], 0x0041);
1052 }
1053 }
1054
1055 #[test]
1056 fn test_iso_2022_jp_encode_from_two_low_surrogates() {
1057 let expectation = b"&#65533;&#65533;";
1058 let mut output = [0u8; 40];
1059 let mut encoder = ISO_2022_JP.new_encoder();
1060 let (result, read, written, had_errors) =
1061 encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true);
1062 assert_eq!(result, CoderResult::InputEmpty);
1063 assert_eq!(read, 2);
1064 assert_eq!(written, expectation.len());
1065 assert!(had_errors);
1066 assert_eq!(&output[..written], expectation);
1067 }
1068}
1069