1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::data::*;
12use crate::handles::*;
13use crate::variant::*;
14// Rust 1.14.0 requires the following despite the asterisk above.
15use super::in_inclusive_range16;
16use super::in_range16;
17
/// State for the EUC-KR decoder.
pub struct EucKrDecoder {
    /// `None` in the neutral state (see `in_neutral_state`); `Some(_)` makes
    /// the buffer-length estimates count one extra input byte.
    /// NOTE(review): presumably this holds a lead byte whose trail byte has
    /// not arrived yet; the code that stores it is generated by the decoder
    /// macro and is not visible in this file.
    lead: Option<u8>,
}
21
22impl EucKrDecoder {
23 pub fn new() -> VariantDecoder {
24 VariantDecoder::EucKr(EucKrDecoder { lead: None })
25 }
26
27 pub fn in_neutral_state(&self) -> bool {
28 self.lead.is_none()
29 }
30
31 fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
32 byte_length.checked_add(match self.lead {
33 None => 0,
34 Some(_) => 1,
35 })
36 }
37
38 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
39 self.plus_one_if_lead(byte_length)
40 }
41
42 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
43 // worst case: 2 to 3
44 let len = self.plus_one_if_lead(byte_length);
45 checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
46 }
47
48 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
49 checked_mul(3, self.plus_one_if_lead(byte_length))
50 }
51
52 ascii_compatible_two_byte_decoder_functions!(
53 {
54 // If lead is between 0x81 and 0xFE, inclusive,
55 // subtract offset 0x81.
56 let non_ascii_minus_offset =
57 non_ascii.wrapping_sub(0x81);
58 if non_ascii_minus_offset > (0xFE - 0x81) {
59 return (DecoderResult::Malformed(1, 0),
60 source.consumed(),
61 handle.written());
62 }
63 non_ascii_minus_offset
64 },
65 {
66 if lead_minus_offset >= 0x20 {
67 // Not the extension range above KS X 1001
68 let trail_minus_offset =
69 byte.wrapping_sub(0xA1);
70 if trail_minus_offset <= (0xFE - 0xA1) {
71 // KS X 1001
72 let ksx_pointer = mul_94(lead_minus_offset - 0x20) + trail_minus_offset as usize;
73 let hangul_pointer = ksx_pointer.wrapping_sub((0x2F - 0x20) * 94);
74 if hangul_pointer < KSX1001_HANGUL.len() {
75 let upper_bmp = KSX1001_HANGUL[hangul_pointer];
76 handle.write_upper_bmp(upper_bmp)
77 } else if ksx_pointer < KSX1001_SYMBOLS.len() {
78 let bmp = KSX1001_SYMBOLS[ksx_pointer];
79 handle.write_bmp_excl_ascii(bmp)
80 } else {
81 let hanja_pointer = ksx_pointer.wrapping_sub((0x49 - 0x20) * 94);
82 if hanja_pointer < KSX1001_HANJA.len() {
83 let upper_bmp = KSX1001_HANJA[hanja_pointer];
84 handle.write_upper_bmp(upper_bmp)
85 } else if (lead_minus_offset == 0x27) && ((trail_minus_offset as usize) < KSX1001_UPPERCASE.len()) {
86 let mid_bmp = KSX1001_UPPERCASE[trail_minus_offset as usize];
87 if mid_bmp == 0 {
88 return (DecoderResult::Malformed(2, 0),
89 unread_handle_trail.consumed(),
90 handle.written());
91 }
92 handle.write_mid_bmp(mid_bmp)
93 } else if (lead_minus_offset == 0x28) && ((trail_minus_offset as usize) < KSX1001_LOWERCASE.len()) {
94 let mid_bmp = KSX1001_LOWERCASE[trail_minus_offset as usize];
95 handle.write_mid_bmp(mid_bmp)
96 } else if (lead_minus_offset == 0x25) && ((trail_minus_offset as usize) < KSX1001_BOX.len()) {
97 let upper_bmp = KSX1001_BOX[trail_minus_offset as usize];
98 handle.write_upper_bmp(upper_bmp)
99 } else {
100 let other_pointer = ksx_pointer.wrapping_sub(2 * 94);
101 if other_pointer < 0x039F {
102 let bmp = ksx1001_other_decode(other_pointer as u16);
103 // ASCII range means unassigned
104 if bmp < 0x80 {
105 return (DecoderResult::Malformed(2, 0),
106 unread_handle_trail.consumed(),
107 handle.written());
108 }
109 handle.write_bmp_excl_ascii(bmp)
110 } else {
111 return (DecoderResult::Malformed(2, 0),
112 unread_handle_trail.consumed(),
113 handle.written());
114 }
115 }
116 }
117 } else {
118 // Extension range to the left of
119 // KS X 1001
120 let left_lead = lead_minus_offset - 0x20;
121 let left_trail = if byte.wrapping_sub(0x40 + 0x41) < (0x60 - 0x40) {
122 byte - (12 + 0x41)
123 } else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
124 byte - (6 + 0x41)
125 } else if byte.wrapping_sub(0x41) < 0x1A {
126 byte - 0x41
127 } else {
128 if byte < 0x80 {
129 return (DecoderResult::Malformed(1, 0),
130 unread_handle_trail.unread(),
131 handle.written());
132 }
133 return (DecoderResult::Malformed(2, 0),
134 unread_handle_trail.consumed(),
135 handle.written());
136 };
137 let left_pointer = ((left_lead as usize) * (190 - 94 - 12)) + left_trail as usize;
138 if left_pointer < (0x45 - 0x20) * (190 - 94 - 12) + 0x12 {
139 let upper_bmp = cp949_left_hangul_decode(left_pointer as u16);
140 handle.write_upper_bmp(upper_bmp)
141 } else {
142 if byte < 0x80 {
143 return (DecoderResult::Malformed(1, 0),
144 unread_handle_trail.unread(),
145 handle.written());
146 }
147 return (DecoderResult::Malformed(2, 0),
148 unread_handle_trail.consumed(),
149 handle.written());
150 }
151 }
152 } else {
153 // Extension range above KS X 1001
154 let top_trail = if byte.wrapping_sub(0x40 + 0x41) < (0xBE - 0x40) {
155 byte - (12 + 0x41)
156 } else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
157 byte - (6 + 0x41)
158 } else if byte.wrapping_sub(0x41) < 0x1A {
159 byte - 0x41
160 } else {
161 if byte < 0x80 {
162 return (DecoderResult::Malformed(1, 0),
163 unread_handle_trail.unread(),
164 handle.written());
165 }
166 return (DecoderResult::Malformed(2, 0),
167 unread_handle_trail.consumed(),
168 handle.written());
169 };
170 let top_pointer = ((lead_minus_offset as usize) * (190 - 12)) + top_trail as usize;
171 let upper_bmp = cp949_top_hangul_decode(top_pointer as u16);
172 handle.write_upper_bmp(upper_bmp)
173 }
174 },
175 self,
176 non_ascii,
177 byte,
178 lead_minus_offset,
179 unread_handle_trail,
180 source,
181 handle,
182 'outermost,
183 copy_ascii_from_check_space_bmp,
184 check_space_bmp,
185 true);
186}
187
188fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
189 if in_inclusive_range16(bmp, 0x3000, 0x3015) {
190 if let Some(pos) = position(&KSX1001_SYMBOLS[..(0xAB - 0x60)], bmp) {
191 return Some((0xA1, pos + 0xA1));
192 }
193 }
194 if let Some(other_pointer) = ksx1001_other_encode(bmp) {
195 let other_lead = ((other_pointer as usize) / 94) + (0x81 + 0x22);
196 let other_trail = ((other_pointer as usize) % 94) + 0xA1;
197 return Some((other_lead, other_trail));
198 }
199 if in_range16(bmp, 0x00AA, 0x0168) {
200 // Latin
201 if let Some(pos) = position(&KSX1001_LOWERCASE[..], bmp) {
202 return Some((0x81 + 0x28, 0xA1 + pos));
203 }
204 if let Some(pos) = position(&KSX1001_UPPERCASE[..], bmp) {
205 return Some((0x81 + 0x27, 0xA1 + pos));
206 }
207 } else if in_range16(bmp, 0x2500, 0x254C) {
208 if let Some(pos) = position(&KSX1001_BOX[..], bmp) {
209 return Some((0x81 + 0x25, 0xA1 + pos));
210 }
211 }
212 if in_inclusive_range16(bmp, 0x2015, 0x266D)
213 || in_inclusive_range16(bmp, 0x321C, 0x33D8)
214 || in_inclusive_range16(bmp, 0xFF3C, 0xFFE5)
215 || in_inclusive_range16(bmp, 0x00A1, 0x00F7)
216 || in_inclusive_range16(bmp, 0x02C7, 0x02DD)
217 {
218 if let Some(pos) = position(&KSX1001_SYMBOLS[3..], bmp) {
219 if pos < (94 - 3) {
220 return Some((0xA1, pos + 0xA1 + 3));
221 }
222 return Some((0xA2, pos - (94 - 3) + 0xA1));
223 }
224 }
225 None
226}
227
228#[cfg(not(feature = "fast-hangul-encode"))]
229#[inline(always)]
230fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
231 match KSX1001_HANGUL.binary_search(&bmp) {
232 Ok(ksx_hangul_pointer) => {
233 let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
234 let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
235 (ksx_hangul_lead as u8, ksx_hangul_trail as u8)
236 }
237 Err(_) => {
238 let (lead, cp949_trail) = if bmp < 0xC8A5 {
239 // Above KS X 1001
240 let top_pointer = cp949_top_hangul_encode(bmp) as usize;
241 let top_lead = (top_pointer / (190 - 12)) + 0x81;
242 let top_trail = top_pointer % (190 - 12);
243 (top_lead as u8, top_trail as u8)
244 } else {
245 // To the left of KS X 1001
246 let left_pointer = cp949_left_hangul_encode(bmp) as usize;
247 let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
248 let left_trail = left_pointer % (190 - 94 - 12);
249 (left_lead as u8, left_trail as u8)
250 };
251 let offset = if cp949_trail >= (0x40 - 12) {
252 0x41 + 12
253 } else if cp949_trail >= (0x20 - 6) {
254 0x41 + 6
255 } else {
256 0x41
257 };
258 (lead as u8, (cp949_trail + offset) as u8)
259 }
260 }
261}
262
/// Variant compiled when the `fast-hangul-encode` feature is enabled.
///
/// Ignores the first argument and forwards `bmp_minus_hangul_start` to
/// `cp949_hangul_encode`, returning its `(u8, u8)` result unchanged. The
/// caller in this file computes that argument as `bmp.wrapping_sub(0xAC00)`.
#[cfg(feature = "fast-hangul-encode")]
#[inline(always)]
fn ksx1001_encode_hangul(_: u16, bmp_minus_hangul_start: u16) -> (u8, u8) {
    cp949_hangul_encode(bmp_minus_hangul_start)
}
268
269#[cfg(not(feature = "fast-hanja-encode"))]
270#[inline(always)]
271fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
272 if let Some(hanja_pointer: usize) = position(&KSX1001_HANJA[..], needle:bmp) {
273 let hanja_lead: usize = (hanja_pointer / 94) + (0x81 + 0x49);
274 let hanja_trail: usize = (hanja_pointer % 94) + 0xA1;
275 Some((hanja_lead as u8, hanja_trail as u8))
276 } else {
277 None
278 }
279}
280
/// Variant compiled when the `fast-hanja-encode` feature is enabled.
///
/// Code units at or above 0xF900 go to the compatibility encoder, whose
/// result is always wrapped in `Some`; everything below goes to the unified
/// encoder, which may itself return `None`.
/// NOTE(review): both helper names say `hangul` although this function
/// encodes Hanja; they are called exactly as before.
#[cfg(feature = "fast-hanja-encode")]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    if bmp >= 0xF900 {
        return Some(ksx1001_compatibility_hangul_encode(bmp));
    }
    ksx1001_unified_hangul_encode(bmp)
}
290
/// EUC-KR encoder. A unit struct: it has no fields.
pub struct EucKrEncoder;
292
293impl EucKrEncoder {
294 pub fn new(encoding: &'static Encoding) -> Encoder {
295 Encoder::new(encoding, VariantEncoder::EucKr(EucKrEncoder))
296 }
297
298 pub fn max_buffer_length_from_utf16_without_replacement(
299 &self,
300 u16_length: usize,
301 ) -> Option<usize> {
302 u16_length.checked_mul(2)
303 }
304
305 pub fn max_buffer_length_from_utf8_without_replacement(
306 &self,
307 byte_length: usize,
308 ) -> Option<usize> {
309 byte_length.checked_add(1)
310 }
311
312 ascii_compatible_bmp_encoder_functions!(
313 {
314 let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
315 let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
316 // Hangul
317 ksx1001_encode_hangul(bmp, bmp_minus_hangul_start)
318 } else if in_range16(bmp, 0x33DE, 0xFF01) {
319 // Vast range that includes no other
320 // mappables except Hangul (already
321 // processed) and Hanja.
322 // Narrow the range further to Unified and
323 // Compatibility ranges of Hanja.
324 if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
325 if let Some((hanja_lead, hanja_trail)) = ksx1001_encode_hanja(bmp) {
326 (hanja_lead, hanja_trail)
327 } else {
328 return (
329 EncoderResult::unmappable_from_bmp(bmp),
330 source.consumed(),
331 handle.written(),
332 );
333 }
334 } else {
335 return (
336 EncoderResult::unmappable_from_bmp(bmp),
337 source.consumed(),
338 handle.written(),
339 );
340 }
341 } else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
342 (lead as u8, trail as u8)
343 } else {
344 return (
345 EncoderResult::unmappable_from_bmp(bmp),
346 source.consumed(),
347 handle.written(),
348 );
349 };
350 handle.write_two(lead, trail)
351 },
352 bmp,
353 self,
354 source,
355 handle,
356 copy_ascii_to_check_space_two,
357 check_space_two,
358 true
359 );
360}
361
362// Any copyright to the test code below this comment is dedicated to the
363// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
364
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper with `EUC_KR`.
    fn decode_euc_kr(bytes: &[u8], expect: &str) {
        decode(EUC_KR, bytes, expect);
    }

    /// Runs the shared `encode` test helper with `EUC_KR`.
    fn encode_euc_kr(string: &str, expect: &[u8]) {
        encode(EUC_KR, string, expect);
    }

    #[test]
    fn test_euc_kr_decode() {
        let cases: [(&[u8], &str); 10] = [
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            // Two-byte sequences, valid and invalid
            (b"\x81\x41", "\u{AC02}"),
            (b"\x81\x5B", "\u{FFFD}\x5B"),
            (b"\xFD\xFE", "\u{8A70}"),
            (b"\xFE\x41", "\u{FFFD}\x41"),
            (b"\xFF\x41", "\u{FFFD}\x41"),
            (b"\x80\x41", "\u{FFFD}\x41"),
            (b"\xA1\xFF", "\u{FFFD}"),
            (b"\x81\xFF", "\u{FFFD}"),
        ];
        for &(bytes, expect) in cases.iter() {
            decode_euc_kr(bytes, expect);
        }
    }

    #[test]
    fn test_euc_kr_encode() {
        let cases: [(&str, &[u8]); 4] = [
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // Two-byte sequences
            ("\u{AC02}", b"\x81\x41"),
            ("\u{8A70}", b"\xFD\xFE"),
        ];
        for &(string, expect) in cases.iter() {
            encode_euc_kr(string, expect);
        }
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_decode_all() {
        let encoded = include_bytes!("test_data/euc_kr_in.txt");
        let reference = include_str!("test_data/euc_kr_in_ref.txt");
        let (decoded, had_errors) = EUC_KR.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_encode_all() {
        let text = include_str!("test_data/euc_kr_out.txt");
        let reference = include_bytes!("test_data/euc_kr_out_ref.txt");
        let (encoded, used_encoding, had_errors) = EUC_KR.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, EUC_KR);
        assert_eq!(&encoded[..], &reference[..]);
    }

    #[test]
    fn test_euc_kr_encode_from_two_low_surrogates() {
        // Two unpaired low surrogates: each is expected to come out as the
        // numeric character reference for U+FFFD.
        let expectation = b"&#65533;&#65533;";
        let input = [0xDC00u16, 0xDEDEu16];
        let mut output = [0u8; 40];
        let mut encoder = EUC_KR.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&input, &mut output[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&output[..written], expectation);
    }
}
443