| 1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
| 2 | // file at the top-level directory of this distribution. |
| 3 | // |
| 4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| 7 | // option. This file may not be copied, modified, or distributed |
| 8 | // except according to those terms. |
| 9 | |
| 10 | use super::*; |
| 11 | use crate::data::*; |
| 12 | use crate::gb18030_2022::*; |
| 13 | use crate::handles::*; |
| 14 | use crate::variant::*; |
| 15 | // Rust 1.14.0 requires the following despite the asterisk above. |
| 16 | use super::in_inclusive_range16; |
| 17 | use super::in_range16; |
| 18 | |
/// Up to three bytes that the decoder has set aside.
///
/// The only assignment visible in this file stores the third byte (minus
/// 0x81) of a failed four-byte sequence as `One`. How the variants are
/// consumed is defined by the decoder macro elsewhere.
enum Gb18030Pending {
    None,
    One(u8),
    Two(u8, u8),
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` when no byte is pending.
    fn is_none(&self) -> bool {
        if let Gb18030Pending::None = *self {
            true
        } else {
            false
        }
    }

    /// Returns the number of bytes carried by this value (0 through 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::Three(..) => 3,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::None => 0,
        }
    }
}
| 43 | |
/// State carried by the gb18030 decoder between calls.
///
/// A freshly constructed decoder has every field empty, and the decoder
/// is considered to be in its neutral state only when all of them are empty.
pub struct Gb18030Decoder {
    // NOTE(review): `first`, `second` and `third` presumably hold the
    // already-consumed bytes of an incomplete multi-byte sequence; the code
    // that fills them lives in the decoder macro, so confirm there. Each
    // occupied slot is counted as one extra input byte when sizing buffers.
    first: Option<u8>,
    second: Option<u8>,
    third: Option<u8>,
    // Bytes set aside for later handling; each one counts as an extra
    // input byte when sizing buffers.
    pending: Gb18030Pending,
    // An ASCII byte set aside when a four-byte sequence turns out to be
    // malformed at its third or fourth byte (the second byte of such a
    // sequence is guaranteed to be ASCII).
    pending_ascii: Option<u8>,
}
| 51 | |
| 52 | impl Gb18030Decoder { |
| 53 | pub fn new() -> VariantDecoder { |
| 54 | VariantDecoder::Gb18030(Gb18030Decoder { |
| 55 | first: None, |
| 56 | second: None, |
| 57 | third: None, |
| 58 | pending: Gb18030Pending::None, |
| 59 | pending_ascii: None, |
| 60 | }) |
| 61 | } |
| 62 | |
| 63 | pub fn in_neutral_state(&self) -> bool { |
| 64 | self.first.is_none() |
| 65 | && self.second.is_none() |
| 66 | && self.third.is_none() |
| 67 | && self.pending.is_none() |
| 68 | && self.pending_ascii.is_none() |
| 69 | } |
| 70 | |
| 71 | fn extra_from_state(&self, byte_length: usize) -> Option<usize> { |
| 72 | byte_length.checked_add( |
| 73 | self.pending.count() |
| 74 | + match self.first { |
| 75 | None => 0, |
| 76 | Some(_) => 1, |
| 77 | } |
| 78 | + match self.second { |
| 79 | None => 0, |
| 80 | Some(_) => 1, |
| 81 | } |
| 82 | + match self.third { |
| 83 | None => 0, |
| 84 | Some(_) => 1, |
| 85 | } |
| 86 | + match self.pending_ascii { |
| 87 | None => 0, |
| 88 | Some(_) => 1, |
| 89 | }, |
| 90 | ) |
| 91 | } |
| 92 | |
| 93 | pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
| 94 | // ASCII: 1 to 1 (worst case) |
| 95 | // gbk: 2 to 1 |
| 96 | // ranges: 4 to 1 or 4 to 2 |
| 97 | checked_add(1, self.extra_from_state(byte_length)) |
| 98 | } |
| 99 | |
| 100 | pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
| 101 | // ASCII: 1 to 1 |
| 102 | // gbk: 2 to 2 or 2 to 3 |
| 103 | // ranges: 4 to 2, 4 to 3 or 4 to 4 |
| 104 | // 0x80: 1 to 3 (worst case) |
| 105 | self.max_utf8_buffer_length(byte_length) |
| 106 | } |
| 107 | |
| 108 | pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
| 109 | checked_add(1, checked_mul(3, self.extra_from_state(byte_length))) |
| 110 | } |
| 111 | |
| 112 | gb18030_decoder_functions!( |
| 113 | { |
| 114 | // If first is between 0x81 and 0xFE, inclusive, |
| 115 | // subtract offset 0x81. |
| 116 | let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81); |
| 117 | if non_ascii_minus_offset > (0xFE - 0x81) { |
| 118 | if non_ascii == 0x80 { |
| 119 | handle.write_upper_bmp(0x20ACu16); |
| 120 | continue 'outermost; |
| 121 | } |
| 122 | return (DecoderResult::Malformed(1, 0), |
| 123 | source.consumed(), |
| 124 | handle.written()); |
| 125 | } |
| 126 | non_ascii_minus_offset |
| 127 | }, |
| 128 | { |
| 129 | // Two-byte (or error) |
| 130 | if first_minus_offset >= 0x20 { |
| 131 | // Not the gbk ideograph range above GB2312 |
| 132 | let trail_minus_offset = second.wrapping_sub(0xA1); |
| 133 | if trail_minus_offset <= (0xFE - 0xA1) { |
| 134 | // GB2312 |
| 135 | let hanzi_lead = first_minus_offset.wrapping_sub(0x2F); |
| 136 | if hanzi_lead < (0x77 - 0x2F) { |
| 137 | // Level 1 Hanzi, Level 2 Hanzi |
| 138 | // or one of the 5 PUA code |
| 139 | // points in between. |
| 140 | let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize; |
| 141 | let upper_bmp = GB2312_HANZI[hanzi_pointer]; |
| 142 | handle.write_upper_bmp(upper_bmp) |
| 143 | } else if first_minus_offset == 0x20 { |
| 144 | // Symbols (starting with ideographic space) |
| 145 | let bmp = GB2312_SYMBOLS[trail_minus_offset as usize]; |
| 146 | handle.write_bmp_excl_ascii(bmp) |
| 147 | } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) { |
| 148 | handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize]) |
| 149 | } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() { |
| 150 | handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize]) |
| 151 | } else if first_minus_offset > 0x76 { |
| 152 | // Bottom PUA |
| 153 | let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16; |
| 154 | handle.write_upper_bmp(pua) |
| 155 | } else { |
| 156 | let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16); |
| 157 | handle.write_bmp_excl_ascii(bmp) |
| 158 | } |
| 159 | } else { |
| 160 | // gbk range on the left |
| 161 | let mut trail_minus_offset = second.wrapping_sub(0x40); |
| 162 | if trail_minus_offset > (0x7E - 0x40) { |
| 163 | let trail_minus_range_start = second.wrapping_sub(0x80); |
| 164 | if trail_minus_range_start > (0xA0 - 0x80) { |
| 165 | if second < 0x80 { |
| 166 | return (DecoderResult::Malformed(1, 0), |
| 167 | unread_handle_second.unread(), |
| 168 | handle.written()); |
| 169 | } |
| 170 | return (DecoderResult::Malformed(2, 0), |
| 171 | unread_handle_second.consumed(), |
| 172 | handle.written()); |
| 173 | } |
| 174 | trail_minus_offset = second - 0x41; |
| 175 | } |
| 176 | // Zero-base lead |
| 177 | let left_lead = first_minus_offset - 0x20; |
| 178 | let left_pointer = left_lead as usize * (190 - 94) + |
| 179 | trail_minus_offset as usize; |
| 180 | let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94)); |
| 181 | if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) { |
| 182 | let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16); |
| 183 | handle.write_upper_bmp(upper_bmp) |
| 184 | } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) { |
| 185 | let bmp = gbk_other_decode(left_pointer as u16); |
| 186 | handle.write_bmp_excl_ascii(bmp) |
| 187 | } else { |
| 188 | let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5); |
| 189 | let upper_bmp = GBK_BOTTOM[bottom_pointer]; |
| 190 | handle.write_upper_bmp(upper_bmp) |
| 191 | } |
| 192 | } |
| 193 | } else { |
| 194 | // gbk ideograph range above GB2312 |
| 195 | let mut trail_minus_offset = second.wrapping_sub(0x40); |
| 196 | if trail_minus_offset > (0x7E - 0x40) { |
| 197 | let trail_minus_range_start = second.wrapping_sub(0x80); |
| 198 | if trail_minus_range_start > (0xFE - 0x80) { |
| 199 | if second < 0x80 { |
| 200 | return (DecoderResult::Malformed(1, 0), |
| 201 | unread_handle_second.unread(), |
| 202 | handle.written()); |
| 203 | } |
| 204 | return (DecoderResult::Malformed(2, 0), |
| 205 | unread_handle_second.consumed(), |
| 206 | handle.written()); |
| 207 | } |
| 208 | trail_minus_offset = second - 0x41; |
| 209 | } |
| 210 | let pointer = first_minus_offset as usize * 190usize + |
| 211 | trail_minus_offset as usize; |
| 212 | let upper_bmp = gbk_top_ideograph_decode(pointer as u16); |
| 213 | handle.write_upper_bmp(upper_bmp) |
| 214 | } |
| 215 | }, |
| 216 | { |
| 217 | // If third is between 0x81 and 0xFE, inclusive, |
| 218 | // subtract offset 0x81. |
| 219 | let third_minus_offset = third.wrapping_sub(0x81); |
| 220 | if third_minus_offset > (0xFE - 0x81) { |
| 221 | // We have an error. Let's inline what's going |
| 222 | // to happen when `second` is |
| 223 | // reprocessed. (`third` gets unread.) |
| 224 | // `second` is guaranteed ASCII, so let's |
| 225 | // put it in `pending_ascii`. Recompute |
| 226 | // `second` from `second_minus_offset`. |
| 227 | self.pending_ascii = Some(second_minus_offset + 0x30); |
| 228 | // Now unread `third` and designate the previous |
| 229 | // `first` as being in error. |
| 230 | return (DecoderResult::Malformed(1, 1), |
| 231 | unread_handle_third.unread(), |
| 232 | handle.written()); |
| 233 | } |
| 234 | third_minus_offset |
| 235 | }, |
| 236 | { |
| 237 | // If fourth is between 0x30 and 0x39, inclusive, |
| 238 | // subtract offset 0x30. |
| 239 | // |
| 240 | // If we have an error, we'll inline what's going |
| 241 | // to happen when `second` and `third` are |
| 242 | // reprocessed. (`fourth` gets unread.) |
| 243 | // `second` is guaranteed ASCII, so let's |
| 244 | // put it in `pending_ascii`. Recompute |
| 245 | // `second` from `second_minus_offset` to |
| 246 | // make this block reusable when `second` |
| 247 | // is not in scope. |
| 248 | // |
| 249 | // `third` is guaranteed to be in the range |
| 250 | // that makes it become the new `self.first`. |
| 251 | // |
| 252 | // `fourth` gets unread and the previous |
| 253 | // `first` gets designates as being in error. |
| 254 | let fourth_minus_offset = fourth.wrapping_sub(0x30); |
| 255 | if fourth_minus_offset > (0x39 - 0x30) { |
| 256 | self.pending_ascii = Some(second_minus_offset + 0x30); |
| 257 | self.pending = Gb18030Pending::One(third_minus_offset); |
| 258 | return (DecoderResult::Malformed(1, 2), |
| 259 | unread_handle_fourth.unread(), |
| 260 | handle.written()); |
| 261 | } |
| 262 | let pointer = (first_minus_offset as usize * (10 * 126 * 10)) + |
| 263 | (second_minus_offset as usize * (10 * 126)) + |
| 264 | (third_minus_offset as usize * 10) + |
| 265 | fourth_minus_offset as usize; |
| 266 | if pointer <= 39419 { |
| 267 | // BMP |
| 268 | if pointer == 7457 { |
| 269 | handle.write_upper_bmp(0xE7C7) |
| 270 | } else { |
| 271 | handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16)) |
| 272 | } |
| 273 | } else if pointer >= 189_000 && pointer <= 1_237_575 { |
| 274 | // Astral |
| 275 | handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32) |
| 276 | } else { |
| 277 | return (DecoderResult::Malformed(4, 0), |
| 278 | unread_handle_fourth.consumed(), |
| 279 | handle.written()); |
| 280 | } |
| 281 | }, |
| 282 | self, |
| 283 | non_ascii, |
| 284 | first_minus_offset, |
| 285 | second, |
| 286 | second_minus_offset, |
| 287 | unread_handle_second, |
| 288 | third, |
| 289 | third_minus_offset, |
| 290 | unread_handle_third, |
| 291 | fourth, |
| 292 | fourth_minus_offset, |
| 293 | unread_handle_fourth, |
| 294 | source, |
| 295 | handle, |
| 296 | 'outermost); |
| 297 | } |
| 298 | |
| 299 | // XXX Experiment with inline directives |
| 300 | fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> { |
| 301 | // Try ideographic punctuation first as it's the most likely case. |
| 302 | // Throwing in the check for full-width currencies and tilde is probably |
| 303 | // more size-efficient here than elsewhere. |
| 304 | if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) { |
| 305 | if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) { |
| 306 | return Some((0xA1, pos + 0xA1)); |
| 307 | } |
| 308 | } |
| 309 | // Ext A |
| 310 | if in_range16(bmp, 0x3400, 0x4E00) { |
| 311 | return position(&GBK_BOTTOM[21..100], bmp).map(|pos| { |
| 312 | ( |
| 313 | 0xFE, |
| 314 | pos + if pos < (0x3F - 16) { |
| 315 | 0x40 + 16 |
| 316 | } else { |
| 317 | 0x41 + 16 |
| 318 | }, |
| 319 | ) |
| 320 | }); |
| 321 | } |
| 322 | // Compatibility ideographs |
| 323 | if in_range16(bmp, 0xF900, 0xFB00) { |
| 324 | return position(&GBK_BOTTOM[0..21], bmp).map(|pos| { |
| 325 | if pos < 5 { |
| 326 | // end of second to last row |
| 327 | (0xFD, pos + (190 - 94 - 5 + 0x41)) |
| 328 | } else { |
| 329 | // last row |
| 330 | (0xFE, pos + (0x40 - 5)) |
| 331 | } |
| 332 | }); |
| 333 | } |
| 334 | // Handle everything below U+02CA, which is in GBK_OTHER. |
| 335 | if bmp < 0x02CA { |
| 336 | if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 { |
| 337 | // Pinyin except U+1E3F |
| 338 | if let Some(pos) = position(&GB2312_PINYIN[..], bmp) { |
| 339 | return Some((0xA8, pos + 0xA1)); |
| 340 | } |
| 341 | } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7) |
| 342 | || in_inclusive_range16(bmp, 0x02C7, 0x02C9) |
| 343 | { |
| 344 | // Diacritics and Latin 1 symbols |
| 345 | if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) { |
| 346 | return Some((0xA1, pos + 0xA1 + 3)); |
| 347 | } |
| 348 | } |
| 349 | return None; |
| 350 | } |
| 351 | |
| 352 | if in_inclusive_range16(bmp, 0xE78D, 0xE864) { |
| 353 | // The array is sorted but short, so let's do linear search. |
| 354 | if let Some(pos) = position(&GB18030_2022_OVERRIDE_PUA[..], bmp) { |
| 355 | let pair = &GB18030_2022_OVERRIDE_BYTES[pos]; |
| 356 | return Some((pair[0].into(), pair[1].into())); |
| 357 | } |
| 358 | } else if bmp >= 0xFE17 { |
| 359 | // Various brackets, all in full-width regions |
| 360 | if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) { |
| 361 | return Some((0xA6, pos + (0x9F - 0x60 + 0xA1))); |
| 362 | } |
| 363 | } else if bmp == 0x1E3F { |
| 364 | // The one Pinyin placed elsewhere on the BMP |
| 365 | return Some((0xA8, 0x7B - 0x60 + 0xA1)); |
| 366 | } else if in_range16(bmp, 0xA000, 0xD800) { |
| 367 | // Since Korean has usage in China, let's spend a branch to fast-track |
| 368 | // Hangul. |
| 369 | return None; |
| 370 | } |
| 371 | // GB2312 other (except bottom PUA and PUA between Hanzi levels). |
| 372 | if let Some(other_pointer) = gb2312_other_encode(bmp) { |
| 373 | let other_lead = other_pointer as usize / 94; |
| 374 | let other_trail = other_pointer as usize % 94; |
| 375 | return Some((0xA2 + other_lead, 0xA1 + other_trail)); |
| 376 | } |
| 377 | // At this point, we've handled all mappable characters above U+02D9 but |
| 378 | // below U+2010. Let's check for that range in order to let lower BMP |
| 379 | // characters used for minority languages in China avoid the subsequent |
| 380 | // search that deals mainly with various symbols. |
| 381 | if in_range16(bmp, 0x02DA, 0x2010) { |
| 382 | return None; |
| 383 | } |
| 384 | // GBK other (except radicals and PUA in GBK_BOTTOM). |
| 385 | if let Some(other_pointer) = gbk_other_encode(bmp) { |
| 386 | let other_lead = other_pointer as usize / (190 - 94); |
| 387 | let other_trail = other_pointer as usize % (190 - 94); |
| 388 | let offset = if other_trail < 0x3F { 0x40 } else { 0x41 }; |
| 389 | return Some((other_lead + (0x81 + 0x20), other_trail + offset)); |
| 390 | } |
| 391 | // CJK Radicals Supplement, PUA, and U+9FBx ideographs in GBK_BOTTOM |
| 392 | if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) |
| 393 | || in_inclusive_range16(bmp, 0x9FB4, 0x9FBB) |
| 394 | || in_inclusive_range16(bmp, 0xE816, 0xE855) |
| 395 | { |
| 396 | if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) { |
| 397 | let trail = pos + 16; |
| 398 | let offset = if trail < 0x3F { 0x40 } else { 0x41 }; |
| 399 | return Some((0xFE, trail + offset)); |
| 400 | } |
| 401 | } |
| 402 | // GB2312 bottom PUA |
| 403 | let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234); |
| 404 | if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) { |
| 405 | let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94; |
| 406 | let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94; |
| 407 | return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail)); |
| 408 | } |
| 409 | // PUA between Hanzi Levels |
| 410 | let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810); |
| 411 | if bmp_minus_pua_between_hanzi < 5 { |
| 412 | return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize)); |
| 413 | } |
| 414 | None |
| 415 | } |
| 416 | |
| 417 | #[cfg (not(feature = "fast-gb-hanzi-encode" ))] |
| 418 | #[inline (always)] |
| 419 | fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) { |
| 420 | if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) { |
| 421 | (lead, trail) |
| 422 | } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) { |
| 423 | let hanzi_lead = (hanzi_pointer / 94) + (0xD8); |
| 424 | let hanzi_trail = (hanzi_pointer % 94) + 0xA1; |
| 425 | (hanzi_lead as u8, hanzi_trail as u8) |
| 426 | } else { |
| 427 | let (lead, gbk_trail) = if bmp < 0x72DC { |
| 428 | // Above GB2312 |
| 429 | let pointer = gbk_top_ideograph_encode(bmp) as usize; |
| 430 | let lead = (pointer / 190) + 0x81; |
| 431 | let gbk_trail = pointer % 190; |
| 432 | (lead, gbk_trail) |
| 433 | } else { |
| 434 | // To the left of GB2312 |
| 435 | let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize; |
| 436 | let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29); |
| 437 | let gbk_trail = gbk_left_ideograph_pointer % (190 - 94); |
| 438 | (lead, gbk_trail) |
| 439 | }; |
| 440 | let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 }; |
| 441 | (lead as u8, (gbk_trail + offset) as u8) |
| 442 | } |
| 443 | } |
| 444 | |
/// Encodes a CJK Unified Ideograph as a two-byte `(lead, trail)` pair by
/// delegating to `gbk_hanzi_encode`.
///
/// The first argument is unused in this variant of the function; the
/// second is the code point minus 0x4E00, as computed by the caller.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    gbk_hanzi_encode(bmp_minus_unified_start)
}
| 450 | |
/// Encoder shared by the two-byte-only mode and the extended mode that
/// also emits four-byte sequences.
pub struct Gb18030Encoder {
    // When `true`, code points without a two-byte mapping are written as
    // four-byte sequences. When `false`, they are reported as unmappable
    // and U+20AC is written as the single byte 0x80.
    extended: bool,
}
| 454 | |
| 455 | impl Gb18030Encoder { |
| 456 | pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder { |
| 457 | Encoder::new( |
| 458 | encoding, |
| 459 | VariantEncoder::Gb18030(Gb18030Encoder { |
| 460 | extended: extended_range, |
| 461 | }), |
| 462 | ) |
| 463 | } |
| 464 | |
| 465 | pub fn max_buffer_length_from_utf16_without_replacement( |
| 466 | &self, |
| 467 | u16_length: usize, |
| 468 | ) -> Option<usize> { |
| 469 | if self.extended { |
| 470 | u16_length.checked_mul(4) |
| 471 | } else { |
| 472 | // Need to add, because space check is done with the four-byte |
| 473 | // assumption. |
| 474 | checked_add(2, u16_length.checked_mul(2)) |
| 475 | } |
| 476 | } |
| 477 | |
| 478 | pub fn max_buffer_length_from_utf8_without_replacement( |
| 479 | &self, |
| 480 | byte_length: usize, |
| 481 | ) -> Option<usize> { |
| 482 | if self.extended { |
| 483 | // 1 to 1 |
| 484 | // 2 to 2 |
| 485 | // 3 to 2 |
| 486 | // 2 to 4 (worst) |
| 487 | // 3 to 4 |
| 488 | // 4 to 4 |
| 489 | checked_add(2, byte_length.checked_mul(2)) |
| 490 | } else { |
| 491 | // 1 to 1 |
| 492 | // 2 to 2 |
| 493 | // 3 to 2 |
| 494 | // Need to add, because space check is done with the four-byte |
| 495 | // assumption. |
| 496 | byte_length.checked_add(3) |
| 497 | } |
| 498 | } |
| 499 | |
| 500 | ascii_compatible_encoder_functions!( |
| 501 | { |
| 502 | let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00); |
| 503 | if bmp_minus_unified_start < (0x9FA6 - 0x4E00) { |
| 504 | // CJK Unified Ideographs |
| 505 | // Can't fail now, since all are |
| 506 | // mapped. |
| 507 | let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start); |
| 508 | handle.write_two(lead, trail) |
| 509 | } else if bmp == 0xE5E5 { |
| 510 | // It's not optimal to check for the unmappable |
| 511 | // and for euro at this stage, but getting |
| 512 | // the out of the way makes the rest of the |
| 513 | // code less messy. |
| 514 | return ( |
| 515 | EncoderResult::unmappable_from_bmp(bmp), |
| 516 | source.consumed(), |
| 517 | handle.written(), |
| 518 | ); |
| 519 | } else if bmp == 0x20AC && !self.extended { |
| 520 | handle.write_one(0x80u8) |
| 521 | } else { |
| 522 | match gbk_encode_non_unified(bmp) { |
| 523 | Some((lead, trail)) => handle.write_two(lead as u8, trail as u8), |
| 524 | None => { |
| 525 | if !self.extended { |
| 526 | return ( |
| 527 | EncoderResult::unmappable_from_bmp(bmp), |
| 528 | source.consumed(), |
| 529 | handle.written(), |
| 530 | ); |
| 531 | } |
| 532 | let range_pointer = gb18030_range_encode(bmp); |
| 533 | let first = range_pointer / (10 * 126 * 10); |
| 534 | let rem_first = range_pointer % (10 * 126 * 10); |
| 535 | let second = rem_first / (10 * 126); |
| 536 | let rem_second = rem_first % (10 * 126); |
| 537 | let third = rem_second / 10; |
| 538 | let fourth = rem_second % 10; |
| 539 | handle.write_four( |
| 540 | (first + 0x81) as u8, |
| 541 | (second + 0x30) as u8, |
| 542 | (third + 0x81) as u8, |
| 543 | (fourth + 0x30) as u8, |
| 544 | ) |
| 545 | } |
| 546 | } |
| 547 | } |
| 548 | }, |
| 549 | { |
| 550 | if !self.extended { |
| 551 | return ( |
| 552 | EncoderResult::Unmappable(astral), |
| 553 | source.consumed(), |
| 554 | handle.written(), |
| 555 | ); |
| 556 | } |
| 557 | let range_pointer = astral as usize + (189_000usize - 0x1_0000usize); |
| 558 | let first = range_pointer / (10 * 126 * 10); |
| 559 | let rem_first = range_pointer % (10 * 126 * 10); |
| 560 | let second = rem_first / (10 * 126); |
| 561 | let rem_second = rem_first % (10 * 126); |
| 562 | let third = rem_second / 10; |
| 563 | let fourth = rem_second % 10; |
| 564 | handle.write_four( |
| 565 | (first + 0x81) as u8, |
| 566 | (second + 0x30) as u8, |
| 567 | (third + 0x81) as u8, |
| 568 | (fourth + 0x30) as u8, |
| 569 | ) |
| 570 | }, |
| 571 | bmp, |
| 572 | astral, |
| 573 | self, |
| 574 | source, |
| 575 | handle, |
| 576 | copy_ascii_to_check_space_four, |
| 577 | check_space_four, |
| 578 | false |
| 579 | ); |
| 580 | } |
| 581 | |
| 582 | // Any copyright to the test code below this comment is dedicated to the |
| 583 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
| 584 | |
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as gb18030 and checks the result against `expect`.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Encodes `string` as gb18030 and checks the result against `expect`.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Encodes `string` as GBK and checks the result against `expect`.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", "");

        // ASCII
        decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");

        // euro
        decode_gb18030(b"\x80", "\u{20AC}");
        decode_gb18030(b"\xA2\xE3", "\u{20AC}");

        // two bytes
        decode_gb18030(b"\x81\x40", "\u{4E02}");
        decode_gb18030(b"\x81\x7E", "\u{4E8A}");
        decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\x81\x80", "\u{4E90}");
        decode_gb18030(b"\x81\xFE", "\u{4FA2}");
        decode_gb18030(b"\xFE\x40", "\u{FA0C}");
        decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\xFE\x80", "\u{4723}");
        decode_gb18030(b"\xFE\xFE", "\u{E4C5}");

        // Changes between GB18030-2005 and GB18030-2022
        decode_gb18030(b"\xFE\x7E", "\u{9FB9}");
        decode_gb18030(b"\xA6\xDD", "\u{FE14}");

        // These mappings remain in place the GB18030-2005 way despite GB18030-2022
        decode_gb18030(b"\x82\x35\x91\x32", "\u{9FB9}");
        decode_gb18030(b"\x84\x31\x83\x30", "\u{FE14}");

        // The difference from the original GB18030
        decode_gb18030(b"\xA3\xA0", "\u{3000}");
        decode_gb18030(b"\xA1\xA1", "\u{3000}");

        // 0xFF
        decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
        decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b"\xFF\x32\x9A\x33\x00",
            "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
        decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
        decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
        decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
        decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
        decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gb18030("\u{20AC}", b"\xA2\xE3");

        // two bytes
        encode_gb18030("\u{4E02}", b"\x81\x40");
        encode_gb18030("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{4E90}", b"\x81\x80");
            encode_gb18030("\u{4FA2}", b"\x81\xFE");
            encode_gb18030("\u{FA0C}", b"\xFE\x40");
            encode_gb18030("\u{E843}", b"\xFE\x7E");
            encode_gb18030("\u{4723}", b"\xFE\x80");
            encode_gb18030("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original GB18030: U+E5E5 is unmappable,
        // so it comes out as a decimal numeric character reference.
        encode_gb18030("\u{E5E5}", b"&#58853;");
        encode_gb18030("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
        encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
            encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
            encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
        }

        // Edge cases
        encode_gb18030("\u{00F7}", b"\xA1\xC2");

        // GB18030-2022
        encode_gb18030("\u{9FB9}", b"\xFE\x7E");
        encode_gb18030("\u{FE14}", b"\xA6\xDD");
        encode_gb18030("\u{E843}", b"\xFE\x7E");
        encode_gb18030("\u{E791}", b"\xA6\xDD");

        // Non-change in GB18030-2022
        encode_gb18030("\u{E817}", b"\xFE\x52");
    }

    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        encode_gbk("\u{20AC}", b"\x80");

        // two bytes
        encode_gbk("\u{4E02}", b"\x81\x40");
        encode_gbk("\u{4E8A}", b"\x81\x7E");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{4E90}", b"\x81\x80");
            encode_gbk("\u{4FA2}", b"\x81\xFE");
            encode_gbk("\u{FA0C}", b"\xFE\x40");
            encode_gbk("\u{E843}", b"\xFE\x7E");
            encode_gbk("\u{4723}", b"\xFE\x80");
            encode_gbk("\u{E4C5}", b"\xFE\xFE");
        }

        // The difference from the original gb18030
        encode_gbk("\u{E5E5}", b"&#58853;");
        encode_gbk("\u{3000}", b"\xA1\xA1");

        // Four bytes: GBK has no four-byte sequences, so these are
        // unmappable and come out as decimal numeric character references.
        encode_gbk("\u{0080}", b"&#128;");
        encode_gbk("\u{E7C7}", b"&#59335;");
        if !cfg!(miri) {
            // Miri is too slow
            encode_gbk("\u{2603}", b"&#9731;");
            encode_gbk("\u{1F4A9}", b"&#128169;");
            encode_gbk("\u{10FFFF}", b"&#1114111;");
        }

        // Edge cases
        encode_gbk("\u{00F7}", b"\xA1\xC2");

        // GB18030-2022. These go through the GBK encoder so that this test
        // exercises GBK; the two-byte lookup is shared with gb18030.
        encode_gbk("\u{9FB9}", b"\xFE\x7E");
        encode_gbk("\u{FE14}", b"\xA6\xDD");
        encode_gbk("\u{E843}", b"\xFE\x7E");
        encode_gbk("\u{E791}", b"\xA6\xDD");

        // Non-change in GB18030-2022
        encode_gbk("\u{E817}", b"\xFE\x52");
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            // The buffer is cut to exactly the advertised worst case to
            // check that the bound is sufficient for a two-byte result.
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}
| 804 | |