1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use super::*; |
11 | use crate::data::*; |
12 | use crate::handles::*; |
13 | use crate::variant::*; |
14 | // Rust 1.14.0 requires the following despite the asterisk above. |
15 | use super::in_inclusive_range16; |
16 | use super::in_range16; |
17 | |
/// Up to three bytes held in the decoder state, queued ahead of new input.
///
/// Each variant carries exactly as many bytes as its name says.
/// NOTE(review): the code that drains this queue lives in the
/// `gb18030_decoder_functions!` macro, which is not visible in this file.
enum Gb18030Pending {
    /// Nothing is queued.
    None,
    /// One queued byte.
    One(u8),
    /// Two queued bytes.
    Two(u8, u8),
    /// Three queued bytes.
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` when no bytes are queued.
    fn is_none(&self) -> bool {
        self.count() == 0
    }

    /// Returns the number of queued bytes (0 through 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::Three(..) => 3,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::None => 0,
        }
    }
}
42 | |
/// Decoder state for gb18030.
///
/// Instances are created by `Gb18030Decoder::new()`, which wraps the state in
/// `VariantDecoder::Gb18030`. The decoder is in its neutral state when every
/// field below is empty (see `in_neutral_state`).
pub struct Gb18030Decoder {
    // NOTE(review): `first`, `second` and `third` are presumably the
    // already-consumed bytes of a sequence that straddles a buffer boundary;
    // they are written inside `gb18030_decoder_functions!`, which is not
    // visible in this file -- confirm there.
    first: Option<u8>,
    second: Option<u8>,
    third: Option<u8>,
    // Queued bytes; set to `One(third_minus_offset)` when a four-byte
    // sequence turns out to be malformed at its fourth byte.
    pending: Gb18030Pending,
    // The ASCII second byte of an abandoned four-byte sequence, kept so that
    // it can still be processed after the error has been reported.
    pending_ascii: Option<u8>,
}
50 | |
51 | impl Gb18030Decoder { |
52 | pub fn new() -> VariantDecoder { |
53 | VariantDecoder::Gb18030(Gb18030Decoder { |
54 | first: None, |
55 | second: None, |
56 | third: None, |
57 | pending: Gb18030Pending::None, |
58 | pending_ascii: None, |
59 | }) |
60 | } |
61 | |
62 | pub fn in_neutral_state(&self) -> bool { |
63 | self.first.is_none() |
64 | && self.second.is_none() |
65 | && self.third.is_none() |
66 | && self.pending.is_none() |
67 | && self.pending_ascii.is_none() |
68 | } |
69 | |
70 | fn extra_from_state(&self, byte_length: usize) -> Option<usize> { |
71 | byte_length.checked_add( |
72 | self.pending.count() |
73 | + match self.first { |
74 | None => 0, |
75 | Some(_) => 1, |
76 | } |
77 | + match self.second { |
78 | None => 0, |
79 | Some(_) => 1, |
80 | } |
81 | + match self.third { |
82 | None => 0, |
83 | Some(_) => 1, |
84 | } |
85 | + match self.pending_ascii { |
86 | None => 0, |
87 | Some(_) => 1, |
88 | }, |
89 | ) |
90 | } |
91 | |
92 | pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
93 | // ASCII: 1 to 1 (worst case) |
94 | // gbk: 2 to 1 |
95 | // ranges: 4 to 1 or 4 to 2 |
96 | checked_add(1, self.extra_from_state(byte_length)) |
97 | } |
98 | |
99 | pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
100 | // ASCII: 1 to 1 |
101 | // gbk: 2 to 2 or 2 to 3 |
102 | // ranges: 4 to 2, 4 to 3 or 4 to 4 |
103 | // 0x80: 1 to 3 (worst case) |
104 | self.max_utf8_buffer_length(byte_length) |
105 | } |
106 | |
107 | pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
108 | checked_add(1, checked_mul(3, self.extra_from_state(byte_length))) |
109 | } |
110 | |
    // Expands `gb18030_decoder_functions!` (defined elsewhere) with four code
    // blocks followed by the identifiers those blocks refer to:
    //   1. lead byte check, evaluating to the lead byte minus 0x81;
    //   2. decoding of a two-byte sequence (or reporting it as malformed);
    //   3. third byte check of a four-byte sequence, evaluating to the third
    //      byte minus 0x81;
    //   4. fourth byte check and decoding of the four-byte pointer.
    // NOTE(review): how the macro sequences these blocks and maintains
    // `first`/`second`/`third` across calls is not visible in this file.
    gb18030_decoder_functions!(
        {
            // If first is between 0x81 and 0xFE, inclusive,
            // subtract offset 0x81.
            // The wrapping subtraction makes any byte below 0x81 wrap around
            // to a value above the limit, so a single comparison rejects
            // everything outside 0x81..=0xFE.
            let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
            if non_ascii_minus_offset > (0xFE - 0x81) {
                if non_ascii == 0x80 {
                    // A lone 0x80 decodes to U+20AC (euro sign).
                    handle.write_upper_bmp(0x20ACu16);
                    continue 'outermost;
                }
                return (DecoderResult::Malformed(1, 0),
                        source.consumed(),
                        handle.written());
            }
            non_ascii_minus_offset
        },
        {
            // Two-byte (or error)
            if first_minus_offset >= 0x20 {
                // Not the gbk ideograph range above GB2312
                let trail_minus_offset = second.wrapping_sub(0xA1);
                if trail_minus_offset <= (0xFE - 0xA1) {
                    // GB2312
                    let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
                    if hanzi_lead < (0x77 - 0x2F) {
                        // Level 1 Hanzi, Level 2 Hanzi
                        // or one of the 5 PUA code
                        // points in between.
                        let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
                        let upper_bmp = GB2312_HANZI[hanzi_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    } else if first_minus_offset == 0x20 {
                        // Symbols (starting with ideographic space)
                        let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
                        handle.write_bmp_excl_ascii(bmp)
                    } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
                        // Lead 0xA6: the table starts 63 trail positions into
                        // the row.
                        handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
                    } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
                        // Lead 0xA8: Pinyin table starts at the row start.
                        handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
                    } else if first_minus_offset > 0x76 {
                        // Bottom PUA
                        let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
                        handle.write_upper_bmp(pua)
                    } else {
                        let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
                        handle.write_bmp_excl_ascii(bmp)
                    }
                } else {
                    // gbk range on the left
                    let mut trail_minus_offset = second.wrapping_sub(0x40);
                    if trail_minus_offset > (0x7E - 0x40) {
                        let trail_minus_range_start = second.wrapping_sub(0x80);
                        if trail_minus_range_start > (0xA0 - 0x80) {
                            if second < 0x80 {
                                // Only the lead is in error; the second byte
                                // is unread so that it is processed on its
                                // own.
                                return (DecoderResult::Malformed(1, 0),
                                        unread_handle_second.unread(),
                                        handle.written());
                            }
                            return (DecoderResult::Malformed(2, 0),
                                    unread_handle_second.consumed(),
                                    handle.written());
                        }
                        // Trail bytes from 0x80 up use offset 0x41 instead of
                        // 0x40 because 0x7F is not a valid trail byte.
                        trail_minus_offset = second - 0x41;
                    }
                    // Zero-base lead
                    let left_lead = first_minus_offset - 0x20;
                    let left_pointer = left_lead as usize * (190 - 94) +
                                       trail_minus_offset as usize;
                    // Wrapping subtraction: pointers below the ideograph
                    // region wrap to huge values and fail the `<` test.
                    let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
                    if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
                        let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
                        handle.write_upper_bmp(upper_bmp)
                    } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
                        let bmp = gbk_other_decode(left_pointer as u16);
                        handle.write_bmp_excl_ascii(bmp)
                    } else {
                        // Last 5 slots of the second to last row plus the
                        // last row.
                        let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
                        let upper_bmp = GBK_BOTTOM[bottom_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    }
                }
            } else {
                // gbk ideograph range above GB2312
                let mut trail_minus_offset = second.wrapping_sub(0x40);
                if trail_minus_offset > (0x7E - 0x40) {
                    let trail_minus_range_start = second.wrapping_sub(0x80);
                    if trail_minus_range_start > (0xFE - 0x80) {
                        if second < 0x80 {
                            // Only the lead is in error; the second byte is
                            // unread so that it is processed on its own.
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_second.unread(),
                                    handle.written());
                        }
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_second.consumed(),
                                handle.written());
                    }
                    // 0x7F is not a valid trail byte, hence offset 0x41 for
                    // trail bytes from 0x80 up.
                    trail_minus_offset = second - 0x41;
                }
                let pointer = first_minus_offset as usize * 190usize +
                              trail_minus_offset as usize;
                let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
                handle.write_upper_bmp(upper_bmp)
            }
        },
        {
            // If third is between 0x81 and 0xFE, inclusive,
            // subtract offset 0x81.
            let third_minus_offset = third.wrapping_sub(0x81);
            if third_minus_offset > (0xFE - 0x81) {
                // We have an error. Let's inline what's going
                // to happen when `second` is
                // reprocessed. (`third` gets unread.)
                // `second` is guaranteed ASCII, so let's
                // put it in `pending_ascii`. Recompute
                // `second` from `second_minus_offset`.
                self.pending_ascii = Some(second_minus_offset + 0x30);
                // Now unread `third` and designate the previous
                // `first` as being in error.
                return (DecoderResult::Malformed(1, 1),
                        unread_handle_third.unread(),
                        handle.written());
            }
            third_minus_offset
        },
        {
            // If fourth is between 0x30 and 0x39, inclusive,
            // subtract offset 0x30.
            //
            // If we have an error, we'll inline what's going
            // to happen when `second` and `third` are
            // reprocessed. (`fourth` gets unread.)
            // `second` is guaranteed ASCII, so let's
            // put it in `pending_ascii`. Recompute
            // `second` from `second_minus_offset` to
            // make this block reusable when `second`
            // is not in scope.
            //
            // `third` is guaranteed to be in the range
            // that makes it become the new `self.first`.
            //
            // `fourth` gets unread and the previous
            // `first` gets designated as being in error.
            let fourth_minus_offset = fourth.wrapping_sub(0x30);
            if fourth_minus_offset > (0x39 - 0x30) {
                self.pending_ascii = Some(second_minus_offset + 0x30);
                self.pending = Gb18030Pending::One(third_minus_offset);
                return (DecoderResult::Malformed(1, 2),
                        unread_handle_fourth.unread(),
                        handle.written());
            }
            // Mixed-radix pointer: the second and fourth bytes have 10
            // possible values each, the third byte has 126.
            let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
                          (second_minus_offset as usize * (10 * 126)) +
                          (third_minus_offset as usize * 10) +
                          fourth_minus_offset as usize;
            if pointer <= 39419 {
                // BMP
                if pointer == 7457 {
                    // Special-cased pointer that maps to U+E7C7.
                    handle.write_upper_bmp(0xE7C7)
                } else {
                    handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
                }
            } else if pointer >= 189_000 && pointer <= 1_237_575 {
                // Astral
                // Pointer 189_000 corresponds to U+10000 and
                // pointer 1_237_575 to U+10FFFF.
                handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
            } else {
                return (DecoderResult::Malformed(4, 0),
                        unread_handle_fourth.consumed(),
                        handle.written());
            }
        },
        self,
        non_ascii,
        first_minus_offset,
        second,
        second_minus_offset,
        unread_handle_second,
        third,
        third_minus_offset,
        unread_handle_third,
        fourth,
        fourth_minus_offset,
        unread_handle_fourth,
        source,
        handle,
        'outermost);
296 | } |
297 | |
298 | // XXX Experiment with inline directives |
/// Looks up the two-byte GBK encoding of a BMP code point that is not handled
/// by the CJK Unified Ideographs path of the encoder.
///
/// Returns `Some((lead, trail))`, with both byte values held as `usize` (the
/// caller narrows them to `u8`), or `None` when no two-byte mapping is found.
///
/// The checks are order-dependent: several of the early `return None`
/// statements rely on earlier branches having already dealt with every
/// mappable code point of the range in question.
fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
    // Try ideographic punctuation first as it's the most likely case.
    // Throwing in the check for full-width currencies and tilde is probably
    // more size-efficient here than elsewhere.
    if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
        if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
            return Some((0xA1, pos + 0xA1));
        }
    }
    // Ext A
    if in_range16(bmp, 0x3400, 0x4E00) {
        return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
            (
                0xFE,
                // The slice starts at zero-based trail 16 of the last row.
                // Zero-based trails below 0x3F map to bytes from 0x40 up;
                // the rest use offset 0x41, skipping byte 0x7F.
                pos + if pos < (0x3F - 16) {
                    0x40 + 16
                } else {
                    0x41 + 16
                },
            )
        });
    }
    // Compatibility ideographs
    if in_range16(bmp, 0xF900, 0xFB00) {
        return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
            if pos < 5 {
                // end of second to last row
                (0xFD, pos + (190 - 94 - 5 + 0x41))
            } else {
                // last row
                (0xFE, pos + (0x40 - 5))
            }
        });
    }
    // Handle everything below U+02CA, which is in GBK_OTHER.
    if bmp < 0x02CA {
        if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
            // Pinyin except U+1E3F
            if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
                return Some((0xA8, pos + 0xA1));
            }
        } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
            || in_inclusive_range16(bmp, 0x02C7, 0x02C9)
        {
            // Diacritics and Latin 1 symbols
            // Only a sub-slice of the symbol table is searched; the `+ 3`
            // below compensates for the slice start.
            if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
                return Some((0xA1, pos + 0xA1 + 3));
            }
        }
        return None;
    }
    if bmp >= 0xE794 {
        // Various brackets, all in PUA or full-width regions
        // 0x9F - 0x60 == 63, the same row offset the decoder subtracts for
        // lead 0xA6.
        if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
            return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
        }
    } else if bmp == 0x1E3F {
        // The one Pinyin placed elsewhere on the BMP
        return Some((0xA8, 0x7B - 0x60 + 0xA1));
    } else if in_range16(bmp, 0xA000, 0xD800) {
        // Since Korean has usage in China, let's spend a branch to fast-track
        // Hangul.
        return None;
    }
    // GB2312 other (except bottom PUA and PUA between Hanzi levels).
    if let Some(other_pointer) = gb2312_other_encode(bmp) {
        let other_lead = other_pointer as usize / 94;
        let other_trail = other_pointer as usize % 94;
        return Some((0xA2 + other_lead, 0xA1 + other_trail));
    }
    // At this point, we've handled all mappable characters above U+02D9 but
    // below U+2010. Let's check for that range in order to let lower BMP
    // characters used for minority languages in China avoid the subsequent
    // search that deals mainly with various symbols.
    if in_range16(bmp, 0x02DA, 0x2010) {
        return None;
    }
    // GBK other (except radicals and PUA in GBK_BOTTOM).
    if let Some(other_pointer) = gbk_other_encode(bmp) {
        let other_lead = other_pointer as usize / (190 - 94);
        let other_trail = other_pointer as usize % (190 - 94);
        // Skip byte 0x7F, which is not a valid trail byte.
        let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
        return Some((other_lead + (0x81 + 0x20), other_trail + offset));
    }
    // CJK Radicals Supplement or PUA in GBK_BOTTOM
    if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
        if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
            // The slice starts at zero-based trail 16 of the last row.
            let trail = pos + 16;
            let offset = if trail < 0x3F { 0x40 } else { 0x41 };
            return Some((0xFE, trail + offset));
        }
    }
    // GB2312 bottom PUA
    // Wrapping subtraction: code points below U+E234 wrap to large values and
    // fail the range test.
    let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
    if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
        let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
        let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
        return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
    }
    // PUA between Hanzi Levels
    // U+E810..=U+E814 map to the last five trail bytes (0xFA..=0xFE) of
    // lead 0xD7.
    let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
    if bmp_minus_pua_between_hanzi < 5 {
        return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
    }
    None
}
405 | |
406 | #[cfg (not(feature = "fast-gb-hanzi-encode" ))] |
407 | #[inline (always)] |
408 | fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) { |
409 | if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) { |
410 | (lead, trail) |
411 | } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) { |
412 | let hanzi_lead = (hanzi_pointer / 94) + (0xD8); |
413 | let hanzi_trail = (hanzi_pointer % 94) + 0xA1; |
414 | (hanzi_lead as u8, hanzi_trail as u8) |
415 | } else { |
416 | let (lead, gbk_trail) = if bmp < 0x72DC { |
417 | // Above GB2312 |
418 | let pointer = gbk_top_ideograph_encode(bmp) as usize; |
419 | let lead = (pointer / 190) + 0x81; |
420 | let gbk_trail = pointer % 190; |
421 | (lead, gbk_trail) |
422 | } else { |
423 | // To the left of GB2312 |
424 | let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize; |
425 | let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29); |
426 | let gbk_trail = gbk_left_ideograph_pointer % (190 - 94); |
427 | (lead, gbk_trail) |
428 | }; |
429 | let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 }; |
430 | (lead as u8, (gbk_trail + offset) as u8) |
431 | } |
432 | } |
433 | |
/// Encodes a CJK Unified Ideograph as a two-byte `(lead, trail)` pair.
///
/// This is the variant compiled when the `fast-gb-hanzi-encode` feature is
/// on. The code point itself (first parameter) is ignored; its offset from
/// U+4E00, as computed by the caller, is passed to `gbk_hanzi_encode`
/// (defined elsewhere).
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    gbk_hanzi_encode(bmp_minus_unified_start)
}
439 | |
/// Encoder state shared by the gb18030 and GBK encoders.
pub struct Gb18030Encoder {
    // When `true`, code points without a two-byte mapping are written as
    // four-byte sequences. When `false`, such code points are reported as
    // unmappable and U+20AC is written as the single byte 0x80.
    extended: bool,
}
443 | |
444 | impl Gb18030Encoder { |
445 | pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder { |
446 | Encoder::new( |
447 | encoding, |
448 | VariantEncoder::Gb18030(Gb18030Encoder { |
449 | extended: extended_range, |
450 | }), |
451 | ) |
452 | } |
453 | |
454 | pub fn max_buffer_length_from_utf16_without_replacement( |
455 | &self, |
456 | u16_length: usize, |
457 | ) -> Option<usize> { |
458 | if self.extended { |
459 | u16_length.checked_mul(4) |
460 | } else { |
461 | // Need to add, because space check is done with the four-byte |
462 | // assumption. |
463 | checked_add(2, u16_length.checked_mul(2)) |
464 | } |
465 | } |
466 | |
467 | pub fn max_buffer_length_from_utf8_without_replacement( |
468 | &self, |
469 | byte_length: usize, |
470 | ) -> Option<usize> { |
471 | if self.extended { |
472 | // 1 to 1 |
473 | // 2 to 2 |
474 | // 3 to 2 |
475 | // 2 to 4 (worst) |
476 | // 3 to 4 |
477 | // 4 to 4 |
478 | checked_add(2, byte_length.checked_mul(2)) |
479 | } else { |
480 | // 1 to 1 |
481 | // 2 to 2 |
482 | // 3 to 2 |
483 | // Need to add, because space check is done with the four-byte |
484 | // assumption. |
485 | byte_length.checked_add(3) |
486 | } |
487 | } |
488 | |
    // Expands `ascii_compatible_encoder_functions!` (defined elsewhere) with
    // two code blocks followed by the identifiers they refer to:
    //   1. encoding of a non-ASCII BMP code point (`bmp`);
    //   2. encoding of an astral code point (`astral`).
    // NOTE(review): the meaning of the trailing macro arguments
    // (`copy_ascii_to_check_space_four`, `check_space_four`, `false`) is
    // defined by the macro and not visible in this file.
    ascii_compatible_encoder_functions!(
        {
            // Wrapping subtraction: code points below U+4E00 wrap to large
            // values, so one comparison tests for U+4E00..=U+9FA5.
            let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
            if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
                // CJK Unified Ideographs
                // Can't fail now, since all are
                // mapped.
                let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
                handle.write_two(lead, trail)
            } else if bmp == 0xE5E5 {
                // It's not optimal to check for the unmappable
                // and for euro at this stage, but getting
                // them out of the way makes the rest of the
                // code less messy.
                return (
                    EncoderResult::unmappable_from_bmp(bmp),
                    source.consumed(),
                    handle.written(),
                );
            } else if bmp == 0x20AC && !self.extended {
                // Without the extended range the euro sign is written as the
                // single byte 0x80.
                handle.write_one(0x80u8)
            } else {
                match gbk_encode_non_unified(bmp) {
                    Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
                    None => {
                        // No two-byte mapping: unmappable unless four-byte
                        // sequences are allowed.
                        if !self.extended {
                            return (
                                EncoderResult::unmappable_from_bmp(bmp),
                                source.consumed(),
                                handle.written(),
                            );
                        }
                        // Split the pointer into four mixed-radix digits:
                        // the second and fourth bytes have 10 possible
                        // values each, the third byte has 126.
                        let range_pointer = gb18030_range_encode(bmp);
                        let first = range_pointer / (10 * 126 * 10);
                        let rem_first = range_pointer % (10 * 126 * 10);
                        let second = rem_first / (10 * 126);
                        let rem_second = rem_first % (10 * 126);
                        let third = rem_second / 10;
                        let fourth = rem_second % 10;
                        handle.write_four(
                            (first + 0x81) as u8,
                            (second + 0x30) as u8,
                            (third + 0x81) as u8,
                            (fourth + 0x30) as u8,
                        )
                    }
                }
            }
        },
        {
            // Astral code points only have four-byte encodings.
            if !self.extended {
                return (
                    EncoderResult::Unmappable(astral),
                    source.consumed(),
                    handle.written(),
                );
            }
            // U+10000 corresponds to pointer 189_000; the pointer is then
            // split into the same mixed-radix digits as in the BMP case.
            let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
            let first = range_pointer / (10 * 126 * 10);
            let rem_first = range_pointer % (10 * 126 * 10);
            let second = rem_first / (10 * 126);
            let rem_second = rem_first % (10 * 126);
            let third = rem_second / 10;
            let fourth = rem_second % 10;
            handle.write_four(
                (first + 0x81) as u8,
                (second + 0x30) as u8,
                (third + 0x81) as u8,
                (fourth + 0x30) as u8,
            )
        },
        bmp,
        astral,
        self,
        source,
        handle,
        copy_ascii_to_check_space_four,
        check_space_four,
        false
    );
569 | } |
570 | |
571 | // Any copyright to the test code below this comment is dedicated to the |
572 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
573 | |
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    // Thin wrappers around the shared `decode`/`encode` test helpers.
    //
    // NOTE(review): the expectations for unmappable code points below assume
    // that the `encode` helper encodes with replacement, i.e. that an
    // unmappable code point comes out as a decimal numeric character
    // reference (`&#NNNN;`) -- confirm against the `testing` module.

    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", &"");

        // ASCII
        decode_gb18030(b" \x61\x62", " \u{0061}\u{0062}");

        // euro
        decode_gb18030(b" \x80", " \u{20AC}");
        decode_gb18030(b" \xA2\xE3", " \u{20AC}");

        // two bytes
        decode_gb18030(b" \x81\x40", " \u{4E02}");
        decode_gb18030(b" \x81\x7E", " \u{4E8A}");
        decode_gb18030(b" \x81\x7F", " \u{FFFD}\u{007F}");
        decode_gb18030(b" \x81\x80", " \u{4E90}");
        decode_gb18030(b" \x81\xFE", " \u{4FA2}");
        decode_gb18030(b" \xFE\x40", " \u{FA0C}");
        decode_gb18030(b" \xFE\x7E", " \u{E843}");
        decode_gb18030(b" \xFE\x7F", " \u{FFFD}\u{007F}");
        decode_gb18030(b" \xFE\x80", " \u{4723}");
        decode_gb18030(b" \xFE\xFE", " \u{E4C5}");

        // The difference from the original GB18030
        decode_gb18030(b" \xA3\xA0", " \u{3000}");
        decode_gb18030(b" \xA1\xA1", " \u{3000}");

        // 0xFF
        decode_gb18030(b" \xFF\x40", " \u{FFFD}\u{0040}");
        decode_gb18030(b" \xE3\xFF\x9A\x33", " \u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b" \xFF\x32\x9A\x33", " \u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b" \xFF\x40\x00", " \u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b" \xE3\xFF\x9A\x33\x00", " \u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b" \xFF\x32\x9A\x33\x00",
            " \u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        decode_gb18030(b" \x81\x30\x81\x30", " \u{0080}");
        decode_gb18030(b" \x81\x35\xF4\x37", " \u{E7C7}");
        decode_gb18030(b" \x81\x37\xA3\x30", " \u{2603}");
        decode_gb18030(b" \x94\x39\xDA\x33", " \u{1F4A9}");
        decode_gb18030(b" \xE3\x32\x9A\x35", " \u{10FFFF}");
        decode_gb18030(b" \xE3\x32\x9A\x36\x81\x30", " \u{FFFD}\u{FFFD}");
        decode_gb18030(b" \xE3\x32\x9A\x36\x81\x40", " \u{FFFD}\u{4E02}");
        decode_gb18030(b" \xE3\x32\x9A", " \u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b" \xE3\x32\x9A\x00", " \u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030(" \u{0061}\u{0062}", b" \x61\x62");

        // euro
        encode_gb18030(" \u{20AC}", b" \xA2\xE3");

        // two bytes
        encode_gb18030(" \u{4E02}", b" \x81\x40");
        encode_gb18030(" \u{4E8A}", b" \x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030(" \u{4E90}", b" \x81\x80");
            encode_gb18030(" \u{4FA2}", b" \x81\xFE");
            encode_gb18030(" \u{FA0C}", b" \xFE\x40");
            encode_gb18030(" \u{E843}", b" \xFE\x7E");
            encode_gb18030(" \u{4723}", b" \xFE\x80");
            encode_gb18030(" \u{E4C5}", b" \xFE\xFE");
        }

        // The difference from the original GB18030
        // U+E5E5 is deliberately unmappable, so it comes out as a numeric
        // character reference (0xE5E5 == 58853).
        encode_gb18030(" \u{E5E5}", b" &#58853;");
        encode_gb18030(" \u{3000}", b" \xA1\xA1");

        // Four bytes
        encode_gb18030(" \u{0080}", b" \x81\x30\x81\x30");
        encode_gb18030(" \u{E7C7}", b" \x81\x35\xF4\x37");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030(" \u{2603}", b" \x81\x37\xA3\x30");
            encode_gb18030(" \u{1F4A9}", b" \x94\x39\xDA\x33");
            encode_gb18030(" \u{10FFFF}", b" \xE3\x32\x9A\x35");
        }

        // Edge cases
        encode_gb18030(" \u{00F7}", b" \xA1\xC2");
    }

    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk(" \u{0061}\u{0062}", b" \x61\x62");

        // euro
        encode_gbk(" \u{20AC}", b" \x80");

        // two bytes
        encode_gbk(" \u{4E02}", b" \x81\x40");
        encode_gbk(" \u{4E8A}", b" \x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk(" \u{4E90}", b" \x81\x80");
            encode_gbk(" \u{4FA2}", b" \x81\xFE");
            encode_gbk(" \u{FA0C}", b" \xFE\x40");
            encode_gbk(" \u{E843}", b" \xFE\x7E");
            encode_gbk(" \u{4723}", b" \xFE\x80");
            encode_gbk(" \u{E4C5}", b" \xFE\xFE");
        }

        // The difference from the original gb18030
        encode_gbk(" \u{E5E5}", b" &#58853;");
        encode_gbk(" \u{3000}", b" \xA1\xA1");

        // Four bytes
        // GBK has no four-byte sequences, so everything that needs one in
        // gb18030 is unmappable here and comes out as a numeric character
        // reference.
        encode_gbk(" \u{0080}", b" &#128;");
        encode_gbk(" \u{E7C7}", b" &#59335;");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk(" \u{2603}", b" &#9731;");
            encode_gbk(" \u{1F4A9}", b" &#128169;");
            encode_gbk(" \u{10FFFF}", b" &#1114111;");
        }

        // Edge cases
        encode_gbk(" \u{00F7}", b" \xA1\xC2");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        // A buffer of exactly the advertised maximum length must be enough
        // to encode a single two-byte character.
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}
768 | |