1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | use super::*; |
11 | use crate::ascii::*; |
12 | use crate::data::position; |
13 | use crate::handles::*; |
14 | use crate::variant::*; |
15 | |
/// Decoder state for a single-byte encoding driven by a 128-entry table.
///
/// The decode methods look up a byte `b >= 0x80` at `table[b - 0x80]`.
pub struct SingleByteDecoder {
    /// UTF-16 code unit for each byte in `0x80..=0xFF`, indexed by
    /// `byte - 0x80`. An entry of `0` makes the decode methods report the
    /// byte as malformed.
    table: &'static [u16; 128],
}
19 | |
20 | impl SingleByteDecoder { |
21 | pub fn new(data: &'static [u16; 128]) -> VariantDecoder { |
22 | VariantDecoder::SingleByte(SingleByteDecoder { table: data }) |
23 | } |
24 | |
25 | pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { |
26 | Some(byte_length) |
27 | } |
28 | |
29 | pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { |
30 | byte_length.checked_mul(3) |
31 | } |
32 | |
33 | pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { |
34 | byte_length.checked_mul(3) |
35 | } |
36 | |
    /// Decodes bytes from `src` into UTF-8 written to `dst`.
    ///
    /// Returns `(result, read, written)`:
    ///
    /// * `DecoderResult::Malformed(1, 0)` when a byte's table entry is `0`;
    /// * `DecoderResult::InputEmpty` when the source has no more bytes;
    /// * `DecoderResult::OutputFull` when the destination reports no room
    ///   for another BMP character;
    /// * whatever the ASCII fast path returns in `CopyAsciiResult::Stop`.
    ///
    /// `_last` is not used by this decoder.
    pub fn decode_to_utf8_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u8],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        let mut source = ByteSource::new(src);
        let mut dest = Utf8Destination::new(dst);
        // Loop structure:
        //   'outermost - bulk ASCII copy until a non-ASCII byte shows up;
        //   'middle    - map one non-ASCII byte through the table;
        //   'innermost - handle the bytes that follow one at a time while
        //                they are non-ASCII or below 60, without going
        //                back to the bulk ASCII copy.
        'outermost: loop {
            match dest.copy_ascii_from_check_space_bmp(&mut source) {
                CopyAsciiResult::Stop(ret) => return ret,
                // NOTE(review): `non_ascii` is presumed to be >= 0x80 here and
                // `handle` presumed to have room for one BMP character;
                // confirm against `copy_ascii_from_check_space_bmp`.
                CopyAsciiResult::GoOn((mut non_ascii, mut handle)) => 'middle: loop {
                    // Start non-boilerplate
                    //
                    // Since the non-ASCIIness of `non_ascii` is hidden from
                    // the optimizer, it can't figure out that it's OK to
                    // statically omit the bound check when accessing
                    // `[u16; 128]` with an index
                    // `non_ascii as usize - 0x80usize`.
                    //
                    // SAFETY: relies on `non_ascii >= 0x80`, which makes the
                    // index fall in `0..128`. Re-entry from 'innermost only
                    // happens when `b > 127`.
                    let mapped =
                        unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                    // let mapped = self.table[non_ascii as usize - 0x80usize];
                    if mapped == 0u16 {
                        // A zero table entry marks a byte with no mapping.
                        return (
                            DecoderResult::Malformed(1, 0),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                    let dest_again = handle.write_bmp_excl_ascii(mapped);
                    // End non-boilerplate
                    match source.check_available() {
                        Space::Full(src_consumed) => {
                            return (
                                DecoderResult::InputEmpty,
                                src_consumed,
                                dest_again.written(),
                            );
                        }
                        Space::Available(source_handle) => {
                            match dest_again.check_space_bmp() {
                                Space::Full(dst_written) => {
                                    return (
                                        DecoderResult::OutputFull,
                                        source_handle.consumed(),
                                        dst_written,
                                    );
                                }
                                Space::Available(mut destination_handle) => {
                                    let (mut b, unread_handle) = source_handle.read();
                                    let source_again = unread_handle.commit();
                                    'innermost: loop {
                                        if b > 127 {
                                            // Another non-ASCII byte: map it
                                            // using the space already checked.
                                            non_ascii = b;
                                            handle = destination_handle;
                                            continue 'middle;
                                        }
                                        // Testing on Haswell says that we should write the
                                        // byte unconditionally instead of trying to unread it
                                        // to make it part of the next SIMD stride.
                                        let dest_again_again = destination_handle.write_ascii(b);
                                        // 60 is `b'<'`; everything below it is
                                        // treated as punctuation here.
                                        if b < 60 {
                                            // We've got punctuation
                                            match source_again.check_available() {
                                                Space::Full(src_consumed_again) => {
                                                    return (
                                                        DecoderResult::InputEmpty,
                                                        src_consumed_again,
                                                        dest_again_again.written(),
                                                    );
                                                }
                                                Space::Available(source_handle_again) => {
                                                    match dest_again_again.check_space_bmp() {
                                                        Space::Full(dst_written_again) => {
                                                            return (
                                                                DecoderResult::OutputFull,
                                                                source_handle_again.consumed(),
                                                                dst_written_again,
                                                            );
                                                        }
                                                        Space::Available(
                                                            destination_handle_again,
                                                        ) => {
                                                            let (b_again, _unread_handle_again) =
                                                                source_handle_again.read();
                                                            b = b_again;
                                                            destination_handle =
                                                                destination_handle_again;
                                                            continue 'innermost;
                                                        }
                                                    }
                                                }
                                            }
                                        }
                                        // We've got markup or ASCII text
                                        continue 'outermost;
                                    }
                                }
                            }
                        }
                    }
                },
            }
        }
    }
142 | |
    /// Decodes bytes from `src` into UTF-16 code units written to `dst`.
    ///
    /// Exactly one code unit is written per input byte, so at most
    /// `min(src.len(), dst.len())` bytes are processed.
    ///
    /// Returns `(result, read, written)`:
    ///
    /// * `DecoderResult::Malformed(1, 0)` when a byte's table entry is `0`;
    ///   `read` then includes the offending byte and `written` does not;
    /// * `DecoderResult::OutputFull` when `dst` is shorter than `src` and
    ///   all of `dst` was filled;
    /// * `DecoderResult::InputEmpty` otherwise, once all of `src` was read.
    ///
    /// `_last` is not used by this decoder.
    pub fn decode_to_utf16_raw(
        &mut self,
        src: &[u8],
        dst: &mut [u16],
        _last: bool,
    ) -> (DecoderResult, usize, usize) {
        // `length` is the number of units that can be converted; `pending`
        // is the result to report once that many have been converted.
        let (pending, length) = if dst.len() < src.len() {
            (DecoderResult::OutputFull, dst.len())
        } else {
            (DecoderResult::InputEmpty, src.len())
        };
        let mut converted = 0usize;
        'outermost: loop {
            // SAFETY: `converted <= length <= min(src.len(), dst.len())`, so
            // both offset pointers stay within (or one past) their buffers.
            // NOTE(review): presumes `ascii_to_basic_latin` touches at most
            // `length - converted` elements; confirm against its contract.
            match unsafe {
                ascii_to_basic_latin(
                    src.as_ptr().add(converted),
                    dst.as_mut_ptr().add(converted),
                    length - converted,
                )
            } {
                None => {
                    return (pending, length, length);
                }
                Some((mut non_ascii, consumed)) => {
                    converted += consumed;
                    'middle: loop {
                        // `converted` doesn't count the reading of `non_ascii` yet.
                        // Since the non-ASCIIness of `non_ascii` is hidden from
                        // the optimizer, it can't figure out that it's OK to
                        // statically omit the bound check when accessing
                        // `[u16; 128]` with an index
                        // `non_ascii as usize - 0x80usize`.
                        //
                        // SAFETY: relies on `non_ascii >= 0x80`, which makes
                        // the index fall in `0..128`. Re-entry from
                        // 'innermost only happens when `b > 127`.
                        let mapped =
                            unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) };
                        // let mapped = self.table[non_ascii as usize - 0x80usize];
                        if mapped == 0u16 {
                            return (
                                DecoderResult::Malformed(1, 0),
                                converted + 1, // +1 `for non_ascii`
                                converted,
                            );
                        }
                        unsafe {
                            // The bound check has already been performed
                            *(dst.get_unchecked_mut(converted)) = mapped;
                        }
                        converted += 1;
                        // Next, handle ASCII punctuation and non-ASCII without
                        // going back to ASCII acceleration. Non-ASCII scripts
                        // use ASCII punctuation, so this avoids going to
                        // acceleration just for punctuation/space and then
                        // failing. This is a significant boost to non-ASCII
                        // scripts.
                        // TODO: Split out Latin converters without this part
                        // this stuff makes Latin script-conversion slower.
                        if converted == length {
                            return (pending, length, length);
                        }
                        // SAFETY: `converted < length <= src.len()` after the
                        // check above.
                        let mut b = unsafe { *(src.get_unchecked(converted)) };
                        'innermost: loop {
                            if b > 127 {
                                non_ascii = b;
                                continue 'middle;
                            }
                            // Testing on Haswell says that we should write the
                            // byte unconditionally instead of trying to unread it
                            // to make it part of the next SIMD stride.
                            //
                            // SAFETY: `converted < length <= dst.len()`.
                            unsafe {
                                *(dst.get_unchecked_mut(converted)) = u16::from(b);
                            }
                            converted += 1;
                            // 60 is `b'<'`; everything below it is treated
                            // as punctuation here.
                            if b < 60 {
                                // We've got punctuation
                                if converted == length {
                                    return (pending, length, length);
                                }
                                // SAFETY: `converted < length <= src.len()`.
                                b = unsafe { *(src.get_unchecked(converted)) };
                                continue 'innermost;
                            }
                            // We've got markup or ASCII text
                            continue 'outermost;
                        }
                    }
                }
            }
        }
    }
230 | |
231 | pub fn latin1_byte_compatible_up_to(&self, buffer: &[u8]) -> usize { |
232 | let mut bytes = buffer; |
233 | let mut total = 0; |
234 | loop { |
235 | if let Some((non_ascii, offset)) = validate_ascii(bytes) { |
236 | total += offset; |
237 | let mapped = unsafe { *(self.table.get_unchecked(non_ascii as usize - 0x80usize)) }; |
238 | if mapped != u16::from(non_ascii) { |
239 | return total; |
240 | } |
241 | total += 1; |
242 | bytes = &bytes[offset + 1..]; |
243 | } else { |
244 | return total; |
245 | } |
246 | } |
247 | } |
248 | } |
249 | |
/// Encoder state for a single-byte encoding.
///
/// `encode_u16` first tries an arithmetic mapping over one run of
/// consecutive code units and falls back to searching `table`.
pub struct SingleByteEncoder {
    /// Code unit for each byte in `0x80..=0xFF`, indexed by `byte - 0x80`.
    table: &'static [u16; 128],
    /// Code unit at which the consecutive run starts.
    run_bmp_offset: usize,
    /// Index into `table` at which the consecutive run starts.
    run_byte_offset: usize,
    /// Number of entries in the consecutive run.
    run_length: usize,
}
256 | |
257 | impl SingleByteEncoder { |
258 | pub fn new( |
259 | encoding: &'static Encoding, |
260 | data: &'static [u16; 128], |
261 | run_bmp_offset: u16, |
262 | run_byte_offset: u8, |
263 | run_length: u8, |
264 | ) -> Encoder { |
265 | Encoder::new( |
266 | encoding, |
267 | VariantEncoder::SingleByte(SingleByteEncoder { |
268 | table: data, |
269 | run_bmp_offset: run_bmp_offset as usize, |
270 | run_byte_offset: run_byte_offset as usize, |
271 | run_length: run_length as usize, |
272 | }), |
273 | ) |
274 | } |
275 | |
276 | pub fn max_buffer_length_from_utf16_without_replacement( |
277 | &self, |
278 | u16_length: usize, |
279 | ) -> Option<usize> { |
280 | Some(u16_length) |
281 | } |
282 | |
283 | pub fn max_buffer_length_from_utf8_without_replacement( |
284 | &self, |
285 | byte_length: usize, |
286 | ) -> Option<usize> { |
287 | Some(byte_length) |
288 | } |
289 | |
290 | #[inline (always)] |
291 | fn encode_u16(&self, code_unit: u16) -> Option<u8> { |
292 | // First, we see if the code unit falls into a run of consecutive |
293 | // code units that can be mapped by offset. This is very efficient |
294 | // for most non-Latin encodings as well as Latin1-ish encodings. |
295 | // |
296 | // For encodings that don't fit this pattern, the run (which may |
297 | // have the length of just one) just establishes the starting point |
298 | // for the next rule. |
299 | // |
300 | // Next, we do a forward linear search in the part of the index |
301 | // after the run. Even in non-Latin1-ish Latin encodings (except |
302 | // macintosh), the lower case letters are here. |
303 | // |
304 | // Next, we search the third quadrant up to the start of the run |
305 | // (upper case letters in Latin encodings except macintosh, in |
306 | // Greek and in KOI encodings) and then the second quadrant, |
307 | // except if the run stared before the third quadrant, we search |
308 | // the second quadrant up to the run. |
309 | // |
310 | // Last, we search the first quadrant, which has unused controls |
311 | // or punctuation in most encodings. This is bad for macintosh |
312 | // and IBM866, but those are rare. |
313 | |
314 | // Run of consecutive units |
315 | let unit_as_usize = code_unit as usize; |
316 | let offset = unit_as_usize.wrapping_sub(self.run_bmp_offset); |
317 | if offset < self.run_length { |
318 | return Some((128 + self.run_byte_offset + offset) as u8); |
319 | } |
320 | |
321 | // Search after the run |
322 | let tail_start = self.run_byte_offset + self.run_length; |
323 | if let Some(pos) = position(&self.table[tail_start..], code_unit) { |
324 | return Some((128 + tail_start + pos) as u8); |
325 | } |
326 | |
327 | if self.run_byte_offset >= 64 { |
328 | // Search third quadrant before the run |
329 | if let Some(pos) = position(&self.table[64..self.run_byte_offset], code_unit) { |
330 | return Some(((128 + 64) + pos) as u8); |
331 | } |
332 | |
333 | // Search second quadrant |
334 | if let Some(pos) = position(&self.table[32..64], code_unit) { |
335 | return Some(((128 + 32) + pos) as u8); |
336 | } |
337 | } else if let Some(pos) = position(&self.table[32..self.run_byte_offset], code_unit) { |
338 | // windows-1252, windows-874, ISO-8859-15 and ISO-8859-5 |
339 | // Search second quadrant before the run |
340 | return Some(((128 + 32) + pos) as u8); |
341 | } |
342 | |
343 | // Search first quadrant |
344 | if let Some(pos) = position(&self.table[..32], code_unit) { |
345 | return Some((128 + pos) as u8); |
346 | } |
347 | |
348 | None |
349 | } |
350 | |
    // Expands the crate's `ascii_compatible_bmp_encoder_function!` macro
    // with `encode_from_utf8_raw` as the function name and `str` /
    // `Utf8Source` as the input types. The block passed first is the
    // per-character step: it encodes `bmp` with `encode_u16`, writes the
    // byte through `handle`, and returns an unmappable result together with
    // the source/destination progress when there is no mapping.
    // NOTE(review): the loop around that step and the meaning of the
    // trailing `true` come from the macro definition, which is not in this
    // file; confirm there.
    ascii_compatible_bmp_encoder_function!(
        {
            match self.encode_u16(bmp) {
                Some(byte) => handle.write_one(byte),
                None => {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_one,
        check_space_one,
        encode_from_utf8_raw,
        str,
        Utf8Source,
        true
    );
375 | |
376 | pub fn encode_from_utf16_raw( |
377 | &mut self, |
378 | src: &[u16], |
379 | dst: &mut [u8], |
380 | _last: bool, |
381 | ) -> (EncoderResult, usize, usize) { |
382 | let (pending, length) = if dst.len() < src.len() { |
383 | (EncoderResult::OutputFull, dst.len()) |
384 | } else { |
385 | (EncoderResult::InputEmpty, src.len()) |
386 | }; |
387 | let mut converted = 0usize; |
388 | 'outermost: loop { |
389 | match unsafe { |
390 | basic_latin_to_ascii( |
391 | src.as_ptr().add(converted), |
392 | dst.as_mut_ptr().add(converted), |
393 | length - converted, |
394 | ) |
395 | } { |
396 | None => { |
397 | return (pending, length, length); |
398 | } |
399 | Some((mut non_ascii, consumed)) => { |
400 | converted += consumed; |
401 | 'middle: loop { |
402 | // `converted` doesn't count the reading of `non_ascii` yet. |
403 | match self.encode_u16(non_ascii) { |
404 | Some(byte) => { |
405 | unsafe { |
406 | *(dst.get_unchecked_mut(converted)) = byte; |
407 | } |
408 | converted += 1; |
409 | } |
410 | None => { |
411 | // At this point, we need to know if we |
412 | // have a surrogate. |
413 | let high_bits = non_ascii & 0xFC00u16; |
414 | if high_bits == 0xD800u16 { |
415 | // high surrogate |
416 | if converted + 1 == length { |
417 | // End of buffer. This surrogate is unpaired. |
418 | return ( |
419 | EncoderResult::Unmappable(' \u{FFFD}' ), |
420 | converted + 1, // +1 `for non_ascii` |
421 | converted, |
422 | ); |
423 | } |
424 | let second = |
425 | u32::from(unsafe { *src.get_unchecked(converted + 1) }); |
426 | if second & 0xFC00u32 != 0xDC00u32 { |
427 | return ( |
428 | EncoderResult::Unmappable(' \u{FFFD}' ), |
429 | converted + 1, // +1 `for non_ascii` |
430 | converted, |
431 | ); |
432 | } |
433 | // The next code unit is a low surrogate. |
434 | let astral: char = unsafe { |
435 | ::core::char::from_u32_unchecked( |
436 | (u32::from(non_ascii) << 10) + second |
437 | - (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32), |
438 | ) |
439 | }; |
440 | return ( |
441 | EncoderResult::Unmappable(astral), |
442 | converted + 2, // +2 `for non_ascii` and `second` |
443 | converted, |
444 | ); |
445 | } |
446 | if high_bits == 0xDC00u16 { |
447 | // Unpaired low surrogate |
448 | return ( |
449 | EncoderResult::Unmappable(' \u{FFFD}' ), |
450 | converted + 1, // +1 `for non_ascii` |
451 | converted, |
452 | ); |
453 | } |
454 | return ( |
455 | EncoderResult::unmappable_from_bmp(non_ascii), |
456 | converted + 1, // +1 `for non_ascii` |
457 | converted, |
458 | ); |
459 | } |
460 | } |
461 | // Next, handle ASCII punctuation and non-ASCII without |
462 | // going back to ASCII acceleration. Non-ASCII scripts |
463 | // use ASCII punctuation, so this avoid going to |
464 | // acceleration just for punctuation/space and then |
465 | // failing. This is a significant boost to non-ASCII |
466 | // scripts. |
467 | // TODO: Split out Latin converters without this part |
468 | // this stuff makes Latin script-conversion slower. |
469 | if converted == length { |
470 | return (pending, length, length); |
471 | } |
472 | let mut unit = unsafe { *(src.get_unchecked(converted)) }; |
473 | 'innermost: loop { |
474 | if unit > 127 { |
475 | non_ascii = unit; |
476 | continue 'middle; |
477 | } |
478 | // Testing on Haswell says that we should write the |
479 | // byte unconditionally instead of trying to unread it |
480 | // to make it part of the next SIMD stride. |
481 | unsafe { |
482 | *(dst.get_unchecked_mut(converted)) = unit as u8; |
483 | } |
484 | converted += 1; |
485 | if unit < 60 { |
486 | // We've got punctuation |
487 | if converted == length { |
488 | return (pending, length, length); |
489 | } |
490 | unit = unsafe { *(src.get_unchecked(converted)) }; |
491 | continue 'innermost; |
492 | } |
493 | // We've got markup or ASCII text |
494 | continue 'outermost; |
495 | } |
496 | } |
497 | } |
498 | } |
499 | } |
500 | } |
501 | } |
502 | |
503 | // Any copyright to the test code below this comment is dedicated to the |
504 | // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ |
505 | |
506 | #[cfg (all(test, feature = "alloc" ))] |
507 | mod tests { |
508 | use super::super::testing::*; |
509 | use super::super::*; |
510 | |
511 | #[test ] |
512 | fn test_windows_1255_ca() { |
513 | decode(WINDOWS_1255, b" \xCA" , " \u{05BA}" ); |
514 | encode(WINDOWS_1255, " \u{05BA}" , b" \xCA" ); |
515 | } |
516 | |
517 | #[test ] |
518 | fn test_ascii_punctuation() { |
519 | let bytes = b" \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4. \xC1\xF5\xF4\xFC \xE5\xDF\xED\xE1\xE9 \xDD\xED\xE1 \xF4\xE5\xF3\xF4." ; |
520 | let characters = " \u{0391}\u{03C5}\u{03C4}\u{03CC} \ |
521 | \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \ |
522 | \u{03C4}\u{03B5}\u{03C3}\u{03C4}. \u{0391}\u{03C5}\u{03C4}\u{03CC} \ |
523 | \u{03B5}\u{03AF}\u{03BD}\u{03B1}\u{03B9} \u{03AD}\u{03BD}\u{03B1} \ |
524 | \u{03C4}\u{03B5}\u{03C3}\u{03C4}." ; |
525 | decode(WINDOWS_1253, bytes, characters); |
526 | encode(WINDOWS_1253, characters, bytes); |
527 | } |
528 | |
529 | #[test ] |
530 | fn test_decode_malformed() { |
531 | decode( |
532 | WINDOWS_1253, |
533 | b" \xC1\xF5\xD2\xF4\xFC" , |
534 | " \u{0391}\u{03C5}\u{FFFD}\u{03C4}\u{03CC}" , |
535 | ); |
536 | } |
537 | |
538 | #[test ] |
539 | fn test_encode_unmappables() { |
540 | encode( |
541 | WINDOWS_1253, |
542 | " \u{0391}\u{03C5}\u{2603}\u{03C4}\u{03CC}" , |
543 | b" \xC1\xF5☃ \xF4\xFC" , |
544 | ); |
545 | encode( |
546 | WINDOWS_1253, |
547 | " \u{0391}\u{03C5}\u{1F4A9}\u{03C4}\u{03CC}" , |
548 | b" \xC1\xF5💩 \xF4\xFC" , |
549 | ); |
550 | } |
551 | |
552 | #[test ] |
553 | fn test_encode_unpaired_surrogates() { |
554 | encode_from_utf16( |
555 | WINDOWS_1253, |
556 | &[0x0391u16, 0x03C5u16, 0xDCA9u16, 0x03C4u16, 0x03CCu16], |
557 | b" \xC1\xF5� \xF4\xFC" , |
558 | ); |
559 | encode_from_utf16( |
560 | WINDOWS_1253, |
561 | &[0x0391u16, 0x03C5u16, 0xD83Du16, 0x03C4u16, 0x03CCu16], |
562 | b" \xC1\xF5� \xF4\xFC" , |
563 | ); |
564 | encode_from_utf16( |
565 | WINDOWS_1253, |
566 | &[0x0391u16, 0x03C5u16, 0x03C4u16, 0x03CCu16, 0xD83Du16], |
567 | b" \xC1\xF5\xF4\xFC�" , |
568 | ); |
569 | } |
570 | |
571 | pub const HIGH_BYTES: &'static [u8; 128] = &[ |
572 | 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, |
573 | 0x8F, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, |
574 | 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, |
575 | 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, |
576 | 0xBC, 0xBD, 0xBE, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, |
577 | 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, |
578 | 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, |
579 | 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, |
580 | 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF, |
581 | ]; |
582 | |
583 | fn decode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) { |
584 | let mut with_replacement = [0u16; 128]; |
585 | let mut it = data.iter().enumerate(); |
586 | loop { |
587 | match it.next() { |
588 | Some((i, code_point)) => { |
589 | if *code_point == 0 { |
590 | with_replacement[i] = 0xFFFD; |
591 | } else { |
592 | with_replacement[i] = *code_point; |
593 | } |
594 | } |
595 | None => { |
596 | break; |
597 | } |
598 | } |
599 | } |
600 | |
601 | decode_to_utf16(encoding, HIGH_BYTES, &with_replacement[..]); |
602 | } |
603 | |
604 | fn encode_single_byte(encoding: &'static Encoding, data: &'static [u16; 128]) { |
605 | let mut with_zeros = [0u8; 128]; |
606 | let mut it = data.iter().enumerate(); |
607 | loop { |
608 | match it.next() { |
609 | Some((i, code_point)) => { |
610 | if *code_point == 0 { |
611 | with_zeros[i] = 0; |
612 | } else { |
613 | with_zeros[i] = HIGH_BYTES[i]; |
614 | } |
615 | } |
616 | None => { |
617 | break; |
618 | } |
619 | } |
620 | } |
621 | |
622 | encode_from_utf16(encoding, data, &with_zeros[..]); |
623 | } |
624 | |
625 | #[test ] |
626 | fn test_single_byte_from_two_low_surrogates() { |
627 | let expectation = b"��" ; |
628 | let mut output = [0u8; 40]; |
629 | let mut encoder = WINDOWS_1253.new_encoder(); |
630 | let (result, read, written, had_errors) = |
631 | encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true); |
632 | assert_eq!(result, CoderResult::InputEmpty); |
633 | assert_eq!(read, 2); |
634 | assert_eq!(written, expectation.len()); |
635 | assert!(had_errors); |
636 | assert_eq!(&output[..written], expectation); |
637 | } |
638 | |
639 | // These tests are so self-referential that they are pretty useless. |
640 | |
641 | // BEGIN GENERATED CODE. PLEASE DO NOT EDIT. |
642 | // Instead, please regenerate using generate-encoding-data.py |
643 | |
    // Runs `decode_single_byte` for each single-byte encoding against its
    // table in `data::SINGLE_BYTE_DATA`. Under Miri only the first two
    // encodings are checked. This function is inside the generated region
    // of the file; change the generator script rather than this list.
    #[test ]
    fn test_single_byte_decode() {
        decode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        decode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        if cfg!(miri) {
            // Miri is too slow
            return;
        }
        decode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        decode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        decode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        decode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        decode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        decode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        decode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        decode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        decode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        decode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        decode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        decode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        decode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        decode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        decode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        decode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        decode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        decode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        decode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        decode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        decode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        decode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        decode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        decode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        decode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }
678 | |
    // Runs `encode_single_byte` for each single-byte encoding against its
    // table in `data::SINGLE_BYTE_DATA`. Under Miri only the first two
    // encodings are checked. This function is inside the generated region
    // of the file; change the generator script rather than this list.
    #[test ]
    fn test_single_byte_encode() {
        encode_single_byte(IBM866, &data::SINGLE_BYTE_DATA.ibm866);
        encode_single_byte(ISO_8859_10, &data::SINGLE_BYTE_DATA.iso_8859_10);
        if cfg!(miri) {
            // Miri is too slow
            return;
        }
        encode_single_byte(ISO_8859_13, &data::SINGLE_BYTE_DATA.iso_8859_13);
        encode_single_byte(ISO_8859_14, &data::SINGLE_BYTE_DATA.iso_8859_14);
        encode_single_byte(ISO_8859_15, &data::SINGLE_BYTE_DATA.iso_8859_15);
        encode_single_byte(ISO_8859_16, &data::SINGLE_BYTE_DATA.iso_8859_16);
        encode_single_byte(ISO_8859_2, &data::SINGLE_BYTE_DATA.iso_8859_2);
        encode_single_byte(ISO_8859_3, &data::SINGLE_BYTE_DATA.iso_8859_3);
        encode_single_byte(ISO_8859_4, &data::SINGLE_BYTE_DATA.iso_8859_4);
        encode_single_byte(ISO_8859_5, &data::SINGLE_BYTE_DATA.iso_8859_5);
        encode_single_byte(ISO_8859_6, &data::SINGLE_BYTE_DATA.iso_8859_6);
        encode_single_byte(ISO_8859_7, &data::SINGLE_BYTE_DATA.iso_8859_7);
        encode_single_byte(ISO_8859_8, &data::SINGLE_BYTE_DATA.iso_8859_8);
        encode_single_byte(KOI8_R, &data::SINGLE_BYTE_DATA.koi8_r);
        encode_single_byte(KOI8_U, &data::SINGLE_BYTE_DATA.koi8_u);
        encode_single_byte(MACINTOSH, &data::SINGLE_BYTE_DATA.macintosh);
        encode_single_byte(WINDOWS_1250, &data::SINGLE_BYTE_DATA.windows_1250);
        encode_single_byte(WINDOWS_1251, &data::SINGLE_BYTE_DATA.windows_1251);
        encode_single_byte(WINDOWS_1252, &data::SINGLE_BYTE_DATA.windows_1252);
        encode_single_byte(WINDOWS_1253, &data::SINGLE_BYTE_DATA.windows_1253);
        encode_single_byte(WINDOWS_1254, &data::SINGLE_BYTE_DATA.windows_1254);
        encode_single_byte(WINDOWS_1255, &data::SINGLE_BYTE_DATA.windows_1255);
        encode_single_byte(WINDOWS_1256, &data::SINGLE_BYTE_DATA.windows_1256);
        encode_single_byte(WINDOWS_1257, &data::SINGLE_BYTE_DATA.windows_1257);
        encode_single_byte(WINDOWS_1258, &data::SINGLE_BYTE_DATA.windows_1258);
        encode_single_byte(WINDOWS_874, &data::SINGLE_BYTE_DATA.windows_874);
        encode_single_byte(X_MAC_CYRILLIC, &data::SINGLE_BYTE_DATA.x_mac_cyrillic);
    }
713 | // END GENERATED CODE |
714 | } |
715 | |