1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use super::*; |
11 | use crate::data::*; |
12 | use crate::gb18030_2022::*; |
13 | use crate::handles::*; |
14 | use crate::variant::*; |
15 | // Rust 1.14.0 requires the following despite the asterisk above. |
16 | use super::in_inclusive_range16; |
17 | use super::in_range16; |
18 | |
/// Bytes that the decoder has queued up for later processing.
///
/// Each variant carries between zero and three `u8` values; the variant
/// name states how many.
enum Gb18030Pending {
    /// Nothing is queued.
    None,
    /// One queued byte.
    One(u8),
    /// Two queued bytes.
    Two(u8, u8),
    /// Three queued bytes.
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` exactly when no byte is queued.
    fn is_none(&self) -> bool {
        // Only the `None` variant has a count of zero.
        self.count() == 0
    }

    /// Returns the number of queued bytes (0 through 3).
    fn count(&self) -> usize {
        match self {
            &Gb18030Pending::None => 0,
            &Gb18030Pending::One(..) => 1,
            &Gb18030Pending::Two(..) => 2,
            &Gb18030Pending::Three(..) => 3,
        }
    }
}
43 | |
/// State of the gb18030 decoder between calls.
///
/// Every populated field is counted as extra pending input by
/// `extra_from_state` when computing worst-case output buffer sizes.
pub struct Gb18030Decoder {
    // `first`, `second` and `third` are read and written by the code that
    // `gb18030_decoder_functions!` generates, which is not visible in this
    // file. Presumably they hold the already-consumed bytes of an incomplete
    // multi-byte sequence -- TODO confirm against the macro definition.
    first: Option<u8>,
    second: Option<u8>,
    third: Option<u8>,
    // Queued bytes; set to `One(third_minus_offset)` when the fourth byte of
    // a four-byte sequence turns out to be invalid.
    pending: Gb18030Pending,
    // An ASCII byte (the former `second` byte) saved for reprocessing after a
    // malformed three- or four-byte sequence.
    pending_ascii: Option<u8>,
}
51 | |
52 | impl Gb18030Decoder { |
53 | pub fn new() -> VariantDecoder { |
54 | VariantDecoder::Gb18030(Gb18030Decoder { |
55 | first: None, |
56 | second: None, |
57 | third: None, |
58 | pending: Gb18030Pending::None, |
59 | pending_ascii: None, |
60 | }) |
61 | } |
62 | |
63 | pub fn in_neutral_state(&self) -> bool { |
64 | self.first.is_none() |
65 | && self.second.is_none() |
66 | && self.third.is_none() |
67 | && self.pending.is_none() |
68 | && self.pending_ascii.is_none() |
69 | } |
70 | |
71 | fn extra_from_state(&self, byte_length: usize) -> Option<usize> { |
72 | byte_length.checked_add( |
73 | self.pending.count() |
74 | + match self.first { |
75 | None => 0, |
76 | Some(_) => 1, |
77 | } |
78 | + match self.second { |
79 | None => 0, |
80 | Some(_) => 1, |
81 | } |
82 | + match self.third { |
83 | None => 0, |
84 | Some(_) => 1, |
85 | } |
86 | + match self.pending_ascii { |
87 | None => 0, |
88 | Some(_) => 1, |
89 | }, |
90 | ) |
91 | } |
92 | |
93 | pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
94 | // ASCII: 1 to 1 (worst case) |
95 | // gbk: 2 to 1 |
96 | // ranges: 4 to 1 or 4 to 2 |
97 | checked_add(1, self.extra_from_state(byte_length)) |
98 | } |
99 | |
100 | pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
101 | // ASCII: 1 to 1 |
102 | // gbk: 2 to 2 or 2 to 3 |
103 | // ranges: 4 to 2, 4 to 3 or 4 to 4 |
104 | // 0x80: 1 to 3 (worst case) |
105 | self.max_utf8_buffer_length(byte_length) |
106 | } |
107 | |
108 | pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
109 | checked_add(1, checked_mul(3, self.extra_from_state(byte_length))) |
110 | } |
111 | |
// Invocation of `gb18030_decoder_functions!`, which is defined elsewhere in
// the crate; how it stitches the pieces together is not visible here.
// The arguments are four code blocks followed by the identifiers (and the
// loop label) that those blocks refer to:
//   1. handling of a non-ASCII first byte (`non_ascii`),
//   2. handling of `second` as the trail byte of a two-byte sequence,
//   3. validation of `third` in a four-byte sequence,
//   4. validation of `fourth` and output of the four-byte result.
gb18030_decoder_functions!(
    {
        // If first is between 0x81 and 0xFE, inclusive,
        // subtract offset 0x81.
        let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
        if non_ascii_minus_offset > (0xFE - 0x81) {
            // Out of lead-byte range: 0x80 is decoded as the euro sign
            // (U+20AC); anything else is a one-byte malformed sequence.
            if non_ascii == 0x80 {
                handle.write_upper_bmp(0x20ACu16);
                continue 'outermost;
            }
            return (DecoderResult::Malformed(1, 0),
                    source.consumed(),
                    handle.written());
        }
        non_ascii_minus_offset
    },
    {
        // Two-byte (or error)
        if first_minus_offset >= 0x20 {
            // Not the gbk ideograph range above GB2312
            let trail_minus_offset = second.wrapping_sub(0xA1);
            if trail_minus_offset <= (0xFE - 0xA1) {
                // GB2312
                let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
                if hanzi_lead < (0x77 - 0x2F) {
                    // Level 1 Hanzi, Level 2 Hanzi
                    // or one of the 5 PUA code
                    // points in between.
                    let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
                    let upper_bmp = GB2312_HANZI[hanzi_pointer];
                    handle.write_upper_bmp(upper_bmp)
                } else if first_minus_offset == 0x20 {
                    // Symbols (starting with ideographic space)
                    let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
                    handle.write_bmp_excl_ascii(bmp)
                } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
                    // Lead 0xA6 (0x81 + 0x25): table lookup starting at
                    // trail offset 63.
                    handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
                } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
                    // Lead 0xA8 (0x81 + 0x27): Pinyin table lookup.
                    handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
                } else if first_minus_offset > 0x76 {
                    // Bottom PUA
                    let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
                    handle.write_upper_bmp(pua)
                } else {
                    // Everything else in the GB2312 area goes through
                    // `gb2312_other_decode`.
                    let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
                    handle.write_bmp_excl_ascii(bmp)
                }
            } else {
                // gbk range on the left
                let mut trail_minus_offset = second.wrapping_sub(0x40);
                if trail_minus_offset > (0x7E - 0x40) {
                    // Trail is not in 0x40..=0x7E; it must be in
                    // 0x80..=0xA0 to be valid here.
                    let trail_minus_range_start = second.wrapping_sub(0x80);
                    if trail_minus_range_start > (0xA0 - 0x80) {
                        if second < 0x80 {
                            // ASCII-range trail: unread it so it is
                            // reprocessed, and report only the lead
                            // byte as malformed.
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_second.unread(),
                                    handle.written());
                        }
                        // Non-ASCII invalid trail: both bytes are
                        // reported as malformed.
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_second.consumed(),
                                handle.written());
                    }
                    // 0x7F is not a valid trail, so trails from 0x80
                    // upward are shifted by one more.
                    trail_minus_offset = second - 0x41;
                }
                // Zero-base lead
                let left_lead = first_minus_offset - 0x20;
                let left_pointer = left_lead as usize * (190 - 94) +
                                   trail_minus_offset as usize;
                // `wrapping_sub` makes pointers below the ideograph
                // area wrap to a huge value, so the single `<` check
                // below rejects them.
                let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
                if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
                    let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
                    handle.write_upper_bmp(upper_bmp)
                } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
                    let bmp = gbk_other_decode(left_pointer as u16);
                    handle.write_bmp_excl_ascii(bmp)
                } else {
                    // Remaining pointers index into `GBK_BOTTOM`.
                    let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
                    let upper_bmp = GBK_BOTTOM[bottom_pointer];
                    handle.write_upper_bmp(upper_bmp)
                }
            }
        } else {
            // gbk ideograph range above GB2312
            let mut trail_minus_offset = second.wrapping_sub(0x40);
            if trail_minus_offset > (0x7E - 0x40) {
                // Trail is not in 0x40..=0x7E; it must be in
                // 0x80..=0xFE to be valid here.
                let trail_minus_range_start = second.wrapping_sub(0x80);
                if trail_minus_range_start > (0xFE - 0x80) {
                    if second < 0x80 {
                        // ASCII-range trail: unread it and report
                        // only the lead byte as malformed.
                        return (DecoderResult::Malformed(1, 0),
                                unread_handle_second.unread(),
                                handle.written());
                    }
                    return (DecoderResult::Malformed(2, 0),
                            unread_handle_second.consumed(),
                            handle.written());
                }
                // Skip the invalid trail value 0x7F.
                trail_minus_offset = second - 0x41;
            }
            let pointer = first_minus_offset as usize * 190usize +
                          trail_minus_offset as usize;
            let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
            handle.write_upper_bmp(upper_bmp)
        }
    },
    {
        // If third is between 0x81 and 0xFE, inclusive,
        // subtract offset 0x81.
        let third_minus_offset = third.wrapping_sub(0x81);
        if third_minus_offset > (0xFE - 0x81) {
            // We have an error. Let's inline what's going
            // to happen when `second` is
            // reprocessed. (`third` gets unread.)
            // `second` is guaranteed ASCII, so let's
            // put it in `pending_ascii`. Recompute
            // `second` from `second_minus_offset`.
            self.pending_ascii = Some(second_minus_offset + 0x30);
            // Now unread `third` and designate the previous
            // `first` as being in error.
            return (DecoderResult::Malformed(1, 1),
                    unread_handle_third.unread(),
                    handle.written());
        }
        third_minus_offset
    },
    {
        // If fourth is between 0x30 and 0x39, inclusive,
        // subtract offset 0x30.
        //
        // If we have an error, we'll inline what's going
        // to happen when `second` and `third` are
        // reprocessed. (`fourth` gets unread.)
        // `second` is guaranteed ASCII, so let's
        // put it in `pending_ascii`. Recompute
        // `second` from `second_minus_offset` to
        // make this block reusable when `second`
        // is not in scope.
        //
        // `third` is guaranteed to be in the range
        // that makes it become the new `self.first`.
        //
        // `fourth` gets unread and the previous
        // `first` gets designated as being in error.
        let fourth_minus_offset = fourth.wrapping_sub(0x30);
        if fourth_minus_offset > (0x39 - 0x30) {
            self.pending_ascii = Some(second_minus_offset + 0x30);
            self.pending = Gb18030Pending::One(third_minus_offset);
            return (DecoderResult::Malformed(1, 2),
                    unread_handle_fourth.unread(),
                    handle.written());
        }
        // Linear four-byte pointer: the second and fourth bytes have 10
        // possible values each, the third has 126.
        let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
                      (second_minus_offset as usize * (10 * 126)) +
                      (third_minus_offset as usize * 10) +
                      fourth_minus_offset as usize;
        if pointer <= 39419 {
            // BMP
            if pointer == 7457 {
                // Special-cased pointer that maps to U+E7C7.
                handle.write_upper_bmp(0xE7C7)
            } else {
                handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
            }
        } else if pointer >= 189_000 && pointer <= 1_237_575 {
            // Astral
            handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
        } else {
            // Pointer falls in neither mapped range: all four bytes
            // are reported as malformed.
            return (DecoderResult::Malformed(4, 0),
                    unread_handle_fourth.consumed(),
                    handle.written());
        }
    },
    self,
    non_ascii,
    first_minus_offset,
    second,
    second_minus_offset,
    unread_handle_second,
    third,
    third_minus_offset,
    unread_handle_third,
    fourth,
    fourth_minus_offset,
    unread_handle_fourth,
    source,
    handle,
    'outermost);
297 | } |
298 | |
299 | // XXX Experiment with inline directives |
300 | fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { |
301 | // Try ideographic punctuation first as it's the most likely case. |
302 | // Throwing in the check for full-width currencies and tilde is probably |
303 | // more size-efficient here than elsewhere. |
304 | if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) { |
305 | if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) { |
306 | return Some((0xA1, pos + 0xA1)); |
307 | } |
308 | } |
309 | // Ext A |
310 | if in_range16(bmp, 0x3400, 0x4E00) { |
311 | return position(&GBK_BOTTOM[21..100], bmp).map(|pos| { |
312 | ( |
313 | 0xFE, |
314 | pos + if pos < (0x3F - 16) { |
315 | 0x40 + 16 |
316 | } else { |
317 | 0x41 + 16 |
318 | }, |
319 | ) |
320 | }); |
321 | } |
322 | // Compatibility ideographs |
323 | if in_range16(bmp, 0xF900, 0xFB00) { |
324 | return position(&GBK_BOTTOM[0..21], bmp).map(|pos| { |
325 | if pos < 5 { |
326 | // end of second to last row |
327 | (0xFD, pos + (190 - 94 - 5 + 0x41)) |
328 | } else { |
329 | // last row |
330 | (0xFE, pos + (0x40 - 5)) |
331 | } |
332 | }); |
333 | } |
334 | // Handle everything below U+02CA, which is in GBK_OTHER. |
335 | if bmp < 0x02CA { |
336 | if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 { |
337 | // Pinyin except U+1E3F |
338 | if let Some(pos) = position(&GB2312_PINYIN[..], bmp) { |
339 | return Some((0xA8, pos + 0xA1)); |
340 | } |
341 | } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7) |
342 | || in_inclusive_range16(bmp, 0x02C7, 0x02C9) |
343 | { |
344 | // Diacritics and Latin 1 symbols |
345 | if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) { |
346 | return Some((0xA1, pos + 0xA1 + 3)); |
347 | } |
348 | } |
349 | return None; |
350 | } |
351 | |
352 | if in_inclusive_range16(bmp, 0xE78D, 0xE864) { |
353 | // The array is sorted but short, so let's do linear search. |
354 | if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) { |
355 | let pair = &GB18030_2022_OVERRIDE_BYTES[pos]; |
356 | return Some((pair[0].into(), pair[1].into())); |
357 | } |
358 | } else if bmp >= 0xFE17 { |
359 | // Various brackets, all in full-width regions |
360 | if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) { |
361 | return Some((0xA6, pos + (0x9F - 0x60 + 0xA1))); |
362 | } |
363 | } else if bmp == 0x1E3F { |
364 | // The one Pinyin placed elsewhere on the BMP |
365 | return Some((0xA8, 0x7B - 0x60 + 0xA1)); |
366 | } else if in_range16(bmp, 0xA000, 0xD800) { |
367 | // Since Korean has usage in China, let's spend a branch to fast-track |
368 | // Hangul. |
369 | return None; |
370 | } |
371 | // GB2312 other (except bottom PUA and PUA between Hanzi levels). |
372 | if let Some(other_pointer) = gb2312_other_encode(bmp) { |
373 | let other_lead = other_pointer as usize / 94; |
374 | let other_trail = other_pointer as usize % 94; |
375 | return Some((0xA2 + other_lead, 0xA1 + other_trail)); |
376 | } |
377 | // At this point, we've handled all mappable characters above U+02D9 but |
378 | // below U+2010. Let's check for that range in order to let lower BMP |
379 | // characters used for minority languages in China avoid the subsequent |
380 | // search that deals mainly with various symbols. |
381 | if in_range16(bmp, 0x02DA, 0x2010) { |
382 | return None; |
383 | } |
384 | // GBK other (except radicals and PUA in GBK_BOTTOM). |
385 | if let Some(other_pointer) = gbk_other_encode(bmp) { |
386 | let other_lead = other_pointer as usize / (190 - 94); |
387 | let other_trail = other_pointer as usize % (190 - 94); |
388 | let offset = if other_trail < 0x3F { 0x40 } else { 0x41 }; |
389 | return Some((other_lead + (0x81 + 0x20), other_trail + offset)); |
390 | } |
391 | // CJK Radicals Supplement, PUA, and U+9FBx ideographs in GBK_BOTTOM |
392 | if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) |
393 | || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB) |
394 | || in_inclusive_range16(bmp, 0xE816, 0xE855) |
395 | { |
396 | if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) { |
397 | let trail = pos + 16; |
398 | let offset = if trail < 0x3F { 0x40 } else { 0x41 }; |
399 | return Some((0xFE, trail + offset)); |
400 | } |
401 | } |
402 | // GB2312 bottom PUA |
403 | let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234); |
404 | if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) { |
405 | let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94; |
406 | let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94; |
407 | return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail)); |
408 | } |
409 | // PUA between Hanzi Levels |
410 | let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810); |
411 | if bmp_minus_pua_between_hanzi < 5 { |
412 | return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize)); |
413 | } |
414 | None |
415 | } |
416 | |
417 | #[cfg (not(feature = "fast-gb-hanzi-encode" ))] |
418 | #[inline (always)] |
419 | fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) { |
420 | if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) { |
421 | (lead, trail) |
422 | } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) { |
423 | let hanzi_lead = (hanzi_pointer / 94) + (0xD8); |
424 | let hanzi_trail = (hanzi_pointer % 94) + 0xA1; |
425 | (hanzi_lead as u8, hanzi_trail as u8) |
426 | } else { |
427 | let (lead, gbk_trail) = if bmp < 0x72DC { |
428 | // Above GB2312 |
429 | let pointer = gbk_top_ideograph_encode(bmp) as usize; |
430 | let lead = (pointer / 190) + 0x81; |
431 | let gbk_trail = pointer % 190; |
432 | (lead, gbk_trail) |
433 | } else { |
434 | // To the left of GB2312 |
435 | let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize; |
436 | let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29); |
437 | let gbk_trail = gbk_left_ideograph_pointer % (190 - 94); |
438 | (lead, gbk_trail) |
439 | }; |
440 | let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 }; |
441 | (lead as u8, (gbk_trail + offset) as u8) |
442 | } |
443 | } |
444 | |
/// Encodes a CJK Unified Ideograph as a two-byte sequence by delegating to
/// `gbk_hanzi_encode` with the code point's offset from U+4E00.
///
/// The first argument is unused in this configuration; it exists so that
/// both `encode_hanzi` variants share a signature.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_bmp: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    let (lead, trail) = gbk_hanzi_encode(bmp_minus_unified_start);
    (lead, trail)
}
450 | |
/// Encoder shared by gb18030 and GBK.
pub struct Gb18030Encoder {
    // `true`: code points without a two-byte mapping are written as
    // four-byte sequences. `false`: such code points are reported as
    // unmappable and U+20AC is written as the single byte 0x80.
    extended: bool,
}
454 | |
455 | impl Gb18030Encoder { |
456 | pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder { |
457 | Encoder::new( |
458 | encoding, |
459 | VariantEncoder::Gb18030(Gb18030Encoder { |
460 | extended: extended_range, |
461 | }), |
462 | ) |
463 | } |
464 | |
465 | pub fn max_buffer_length_from_utf16_without_replacement( |
466 | &self, |
467 | u16_length: usize, |
468 | ) -> Option<usize> { |
469 | if self.extended { |
470 | u16_length.checked_mul(4) |
471 | } else { |
472 | // Need to add, because space check is done with the four-byte |
473 | // assumption. |
474 | checked_add(2, u16_length.checked_mul(2)) |
475 | } |
476 | } |
477 | |
478 | pub fn max_buffer_length_from_utf8_without_replacement( |
479 | &self, |
480 | byte_length: usize, |
481 | ) -> Option<usize> { |
482 | if self.extended { |
483 | // 1 to 1 |
484 | // 2 to 2 |
485 | // 3 to 2 |
486 | // 2 to 4 (worst) |
487 | // 3 to 4 |
488 | // 4 to 4 |
489 | checked_add(2, byte_length.checked_mul(2)) |
490 | } else { |
491 | // 1 to 1 |
492 | // 2 to 2 |
493 | // 3 to 2 |
494 | // Need to add, because space check is done with the four-byte |
495 | // assumption. |
496 | byte_length.checked_add(3) |
497 | } |
498 | } |
499 | |
// Invocation of `ascii_compatible_encoder_functions!`, which is defined
// elsewhere in the crate; how it uses the pieces is not visible here.
// The first block handles a non-ASCII BMP code point (`bmp`), the second an
// astral code point (`astral`). The remaining arguments name the identifiers
// the blocks refer to, two space-check helpers and a trailing flag.
ascii_compatible_encoder_functions!(
    {
        // Offset from U+4E00; wraps to a large value for code points below
        // it, so the single `<` check below rejects them.
        let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
        if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
            // CJK Unified Ideographs
            // Can't fail now, since all are
            // mapped.
            let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
            handle.write_two(lead, trail)
        } else if bmp == 0xE5E5 {
            // It's not optimal to check for the unmappable
            // and for euro at this stage, but getting
            // them out of the way makes the rest of the
            // code less messy.
            return (
                EncoderResult::unmappable_from_bmp(bmp),
                source.consumed(),
                handle.written(),
            );
        } else if bmp == 0x20AC && !self.extended {
            // Without the extended range the euro sign is the single
            // byte 0x80.
            handle.write_one(0x80u8)
        } else {
            match gbk_encode_non_unified(bmp) {
                Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
                None => {
                    // No two-byte form. Without the extended range the
                    // code point is unmappable.
                    if !self.extended {
                        return (
                            EncoderResult::unmappable_from_bmp(bmp),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                    // Split the range pointer into the four bytes: the
                    // second and fourth have 10 possible values each,
                    // the third has 126.
                    let range_pointer = gb18030_range_encode(bmp);
                    let first = range_pointer / (10 * 126 * 10);
                    let rem_first = range_pointer % (10 * 126 * 10);
                    let second = rem_first / (10 * 126);
                    let rem_second = rem_first % (10 * 126);
                    let third = rem_second / 10;
                    let fourth = rem_second % 10;
                    handle.write_four(
                        (first + 0x81) as u8,
                        (second + 0x30) as u8,
                        (third + 0x81) as u8,
                        (fourth + 0x30) as u8,
                    )
                }
            }
        }
    },
    {
        // Astral code points only have four-byte forms.
        if !self.extended {
            return (
                EncoderResult::Unmappable(astral),
                source.consumed(),
                handle.written(),
            );
        }
        // U+10000 corresponds to pointer 189000; the pointer is then
        // split into four bytes exactly as in the BMP case above.
        let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
        let first = range_pointer / (10 * 126 * 10);
        let rem_first = range_pointer % (10 * 126 * 10);
        let second = rem_first / (10 * 126);
        let rem_second = rem_first % (10 * 126);
        let third = rem_second / 10;
        let fourth = rem_second % 10;
        handle.write_four(
            (first + 0x81) as u8,
            (second + 0x30) as u8,
            (third + 0x81) as u8,
            (fourth + 0x30) as u8,
        )
    },
    bmp,
    astral,
    self,
    source,
    handle,
    copy_ascii_to_check_space_four,
    check_space_four,
    false
);
580 | } |
581 | |
582 | // Any copyright to the test code below this comment is dedicated to the |
583 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
584 | |
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as gb18030 and checks the result against `expect`.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Encodes `string` as gb18030 and checks the result against `expect`.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Encodes `string` as GBK and checks the result against `expect`.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", &"");

        // ASCII
        decode_gb18030(b" \x61\x62", " \u{0061}\u{0062}");

        // euro
        decode_gb18030(b" \x80", " \u{20AC}");
        decode_gb18030(b" \xA2\xE3", " \u{20AC}");

        // two bytes
        decode_gb18030(b" \x81\x40", " \u{4E02}");
        decode_gb18030(b" \x81\x7E", " \u{4E8A}");
        decode_gb18030(b" \x81\x7F", " \u{FFFD}\u{007F}");
        decode_gb18030(b" \x81\x80", " \u{4E90}");
        decode_gb18030(b" \x81\xFE", " \u{4FA2}");
        decode_gb18030(b" \xFE\x40", " \u{FA0C}");
        decode_gb18030(b" \xFE\x7F", " \u{FFFD}\u{007F}");
        decode_gb18030(b" \xFE\x80", " \u{4723}");
        decode_gb18030(b" \xFE\xFE", " \u{E4C5}");

        // Changes between GB18030-2005 and GB18030-2022
        decode_gb18030(b" \xFE\x7E", " \u{9FB9}");
        decode_gb18030(b" \xA6\xDD", " \u{FE14}");

        // These mappings remain in place the GB18030-2005 way despite GB18030-2022
        decode_gb18030(b" \x82\x35\x91\x32", " \u{9FB9}");
        decode_gb18030(b" \x84\x31\x83\x30", " \u{FE14}");

        // The difference from the original GB18030
        decode_gb18030(b" \xA3\xA0", " \u{3000}");
        decode_gb18030(b" \xA1\xA1", " \u{3000}");

        // 0xFF
        decode_gb18030(b" \xFF\x40", " \u{FFFD}\u{0040}");
        decode_gb18030(b" \xE3\xFF\x9A\x33", " \u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b" \xFF\x32\x9A\x33", " \u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b" \xFF\x40\x00", " \u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b" \xE3\xFF\x9A\x33\x00", " \u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b" \xFF\x32\x9A\x33\x00",
            " \u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        decode_gb18030(b" \x81\x30\x81\x30", " \u{0080}");
        decode_gb18030(b" \x81\x35\xF4\x37", " \u{E7C7}");
        decode_gb18030(b" \x81\x37\xA3\x30", " \u{2603}");
        decode_gb18030(b" \x94\x39\xDA\x33", " \u{1F4A9}");
        decode_gb18030(b" \xE3\x32\x9A\x35", " \u{10FFFF}");
        decode_gb18030(b" \xE3\x32\x9A\x36\x81\x30", " \u{FFFD}\u{FFFD}");
        decode_gb18030(b" \xE3\x32\x9A\x36\x81\x40", " \u{FFFD}\u{4E02}");
        decode_gb18030(b" \xE3\x32\x9A", " \u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b" \xE3\x32\x9A\x00", " \u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030(" \u{0061}\u{0062}", b" \x61\x62");

        // euro
        encode_gb18030(" \u{20AC}", b" \xA2\xE3");

        // two bytes
        encode_gb18030(" \u{4E02}", b" \x81\x40");
        encode_gb18030(" \u{4E8A}", b" \x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030(" \u{4E90}", b" \x81\x80");
            encode_gb18030(" \u{4FA2}", b" \x81\xFE");
            encode_gb18030(" \u{FA0C}", b" \xFE\x40");
            encode_gb18030(" \u{E843}", b" \xFE\x7E");
            encode_gb18030(" \u{4723}", b" \xFE\x80");
            encode_gb18030(" \u{E4C5}", b" \xFE\xFE");
        }

        // The difference from the original GB18030
        // U+E5E5 is unmappable, so the expected output is its decimal
        // numeric character reference (0xE5E5 == 58853), written in ASCII.
        encode_gb18030(" \u{E5E5}", b" &#58853;");
        encode_gb18030(" \u{3000}", b" \xA1\xA1");

        // Four bytes
        encode_gb18030(" \u{0080}", b" \x81\x30\x81\x30");
        encode_gb18030(" \u{E7C7}", b" \x81\x35\xF4\x37");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030(" \u{2603}", b" \x81\x37\xA3\x30");
            encode_gb18030(" \u{1F4A9}", b" \x94\x39\xDA\x33");
            encode_gb18030(" \u{10FFFF}", b" \xE3\x32\x9A\x35");
        }

        // Edge cases
        encode_gb18030(" \u{00F7}", b" \xA1\xC2");

        // GB18030-2022
        encode_gb18030(" \u{9FB9}", b" \xFE\x7E");
        encode_gb18030(" \u{FE14}", b" \xA6\xDD");
        encode_gb18030(" \u{E843}", b" \xFE\x7E");
        encode_gb18030(" \u{E791}", b" \xA6\xDD");

        // Non-change in GB18030-2022
        encode_gb18030(" \u{E817}", b" \xFE\x52");
    }

    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk(" \u{0061}\u{0062}", b" \x61\x62");

        // euro
        encode_gbk(" \u{20AC}", b" \x80");

        // two bytes
        encode_gbk(" \u{4E02}", b" \x81\x40");
        encode_gbk(" \u{4E8A}", b" \x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk(" \u{4E90}", b" \x81\x80");
            encode_gbk(" \u{4FA2}", b" \x81\xFE");
            encode_gbk(" \u{FA0C}", b" \xFE\x40");
            encode_gbk(" \u{E843}", b" \xFE\x7E");
            encode_gbk(" \u{4723}", b" \xFE\x80");
            encode_gbk(" \u{E4C5}", b" \xFE\xFE");
        }

        // The difference from the original gb18030
        // Unmappable: expected output is the numeric character reference.
        encode_gbk(" \u{E5E5}", b" &#58853;");
        encode_gbk(" \u{3000}", b" \xA1\xA1");

        // Four bytes
        // GBK has no four-byte sequences, so each of these is unmappable and
        // comes out as a decimal numeric character reference. Byte string
        // literals must be ASCII, hence the references are spelled out.
        encode_gbk(" \u{0080}", b" &#128;");
        encode_gbk(" \u{E7C7}", b" &#59335;");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk(" \u{2603}", b" &#9731;");
            encode_gbk(" \u{1F4A9}", b" &#128169;");
            encode_gbk(" \u{10FFFF}", b" &#1114111;");
        }

        // Edge cases
        encode_gbk(" \u{00F7}", b" \xA1\xC2");

        // GB18030-2022
        // These all have two-byte forms, which the encoder writes before it
        // consults the extended-range flag, so GBK yields the same bytes as
        // gb18030.
        encode_gbk(" \u{9FB9}", b" \xFE\x7E");
        encode_gbk(" \u{FE14}", b" \xA6\xDD");
        encode_gbk(" \u{E843}", b" \xFE\x7E");
        encode_gbk(" \u{E791}", b" \xA6\xDD");

        // Non-change in GB18030-2022
        encode_gbk(" \u{E817}", b" \xFE\x52");
    }

    /// Decodes the full reference input and compares it with the reference
    /// output.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    /// Encodes the full reference input and compares it with the reference
    /// output.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    /// Checks that a buffer sized by
    /// `max_buffer_length_from_utf16_without_replacement` is large enough to
    /// encode a single two-byte character.
    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}
804 | |