1 | // Copyright Mozilla Foundation. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution. |
3 | // |
4 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
5 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
6 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
7 | // option. This file may not be copied, modified, or distributed |
8 | // except according to those terms. |
9 | |
10 | //! This module provides structs that use lifetimes to couple bounds checking |
11 | //! and space availability checking and detaching those from actual slice |
12 | //! reading/writing. |
13 | //! |
14 | //! At present, the internals of the implementation are safe code, so the |
15 | //! bound checks currently also happen on read/write. Once this code works, |
16 | //! the plan is to replace the internals with unsafe code that omits the |
17 | //! bound check at the read/write time. |
18 | |
19 | #[cfg (all( |
20 | feature = "simd-accel" , |
21 | any( |
22 | target_feature = "sse2" , |
23 | all(target_endian = "little" , target_arch = "aarch64" ), |
24 | all(target_endian = "little" , target_feature = "neon" ) |
25 | ) |
26 | ))] |
27 | use crate::simd_funcs::*; |
28 | |
29 | #[cfg (all( |
30 | feature = "simd-accel" , |
31 | any( |
32 | target_feature = "sse2" , |
33 | all(target_endian = "little" , target_arch = "aarch64" ), |
34 | all(target_endian = "little" , target_feature = "neon" ) |
35 | ) |
36 | ))] |
37 | use core::simd::u16x8; |
38 | |
39 | use super::DecoderResult; |
40 | use super::EncoderResult; |
41 | use crate::ascii::*; |
42 | use crate::utf_8::convert_utf8_to_utf16_up_to_invalid; |
43 | use crate::utf_8::utf8_valid_up_to; |
44 | |
/// Answer to "is there room for one more unit?" as given by the
/// `check_available` / `check_space_*` methods in this module.
pub enum Space<T> {
    /// There is room. The payload is the handle through which the caller
    /// performs the read or write that was just checked for.
    Available(T),
    /// There is no room. The payload is the number of units consumed from
    /// the source, or written to the destination, so far.
    Full(usize),
}
49 | |
/// Outcome of one of the ASCII fast-path copy routines in this module.
pub enum CopyAsciiResult<T, U> {
    /// The copy cannot continue (input exhausted or output full). The
    /// payload reports how far the copy got.
    Stop(T),
    /// A non-ASCII unit was encountered. The payload carries that unit plus
    /// what the caller needs to carry on: a position or a write handle,
    /// depending on the routine.
    GoOn(U),
}
54 | |
/// A non-ASCII value, split by how it is carried.
///
/// NOTE(review): no user of this enum is visible in this part of the file;
/// the variant descriptions below are inferred from the names and payload
/// types only. Confirm against the encoder code that consumes it.
pub enum NonAscii {
    /// A value that fits in 16 bits; presumably a BMP code point above the
    /// ASCII range.
    BmpExclAscii(u16),
    /// A value carried as `char`; presumably a code point above U+FFFF.
    Astral(char),
}
59 | |
/// A value classified as either ASCII or non-ASCII.
///
/// NOTE(review): no user of this enum is visible in this part of the file;
/// confirm the exact contract against its callers.
pub enum Unicode {
    /// A single byte; presumably always below 0x80.
    Ascii(u8),
    /// Anything else, see [`NonAscii`].
    NonAscii(NonAscii),
}
64 | |
65 | // Start UTF-16LE/BE fast path |
66 | |
/// Compile-time description of a UTF-16 byte order relative to the target
/// the crate is being built for.
pub trait Endian {
    /// `true` when this byte order differs from the target's native byte
    /// order, meaning loaded code units have to be byte-swapped.
    const OPPOSITE_ENDIAN: bool;
}

/// Marker for big-endian UTF-16.
pub struct BigEndian;

impl Endian for BigEndian {
    // Big-endian data is "opposite" exactly on little-endian targets.
    const OPPOSITE_ENDIAN: bool = cfg!(target_endian = "little");
}

/// Marker for little-endian UTF-16.
pub struct LittleEndian;

impl Endian for LittleEndian {
    // Little-endian data is "opposite" exactly on big-endian targets.
    const OPPOSITE_ENDIAN: bool = cfg!(target_endian = "big");
}
90 | |
91 | #[derive (Debug, Copy, Clone)] |
92 | struct UnalignedU16Slice { |
93 | // Safety invariant: ptr must be valid for reading 2*len bytes |
94 | ptr: *const u8, |
95 | len: usize, |
96 | } |
97 | |
98 | impl UnalignedU16Slice { |
99 | /// Safety: ptr must be valid for reading 2*len bytes |
100 | #[inline (always)] |
101 | pub unsafe fn new(ptr: *const u8, len: usize) -> UnalignedU16Slice { |
102 | // Safety: field invariant passed up to caller here |
103 | UnalignedU16Slice { ptr, len } |
104 | } |
105 | |
106 | #[inline (always)] |
107 | pub fn trim_last(&mut self) { |
108 | assert!(self.len > 0); |
109 | // Safety: invariant upheld here: a slice is still valid with a shorter len |
110 | self.len -= 1; |
111 | } |
112 | |
113 | #[inline (always)] |
114 | pub fn at(&self, i: usize) -> u16 { |
115 | use core::mem::MaybeUninit; |
116 | |
117 | assert!(i < self.len); |
118 | unsafe { |
119 | let mut u: MaybeUninit<u16> = MaybeUninit::uninit(); |
120 | // Safety: i is at most len - 1, which works here |
121 | ::core::ptr::copy_nonoverlapping(self.ptr.add(i * 2), u.as_mut_ptr() as *mut u8, 2); |
122 | // Safety: valid read above lets us do this |
123 | u.assume_init() |
124 | } |
125 | } |
126 | |
127 | #[cfg (feature = "simd-accel" )] |
128 | #[inline (always)] |
129 | pub fn simd_at(&self, i: usize) -> u16x8 { |
130 | // Safety: i/len are on the scale of u16s, each one corresponds to 2 u8s |
131 | assert!(i + SIMD_STRIDE_SIZE / 2 <= self.len); |
132 | let byte_index = i * 2; |
133 | // Safety: load16_unaligned needs SIMD_STRIDE_SIZE=16 u8 elements to read, |
134 | // or 16/2 = 8 u16 elements to read. |
135 | // We have checked that we have at least that many above. |
136 | |
137 | unsafe { to_u16_lanes(load16_unaligned(self.ptr.add(byte_index))) } |
138 | } |
139 | |
140 | #[inline (always)] |
141 | pub fn len(&self) -> usize { |
142 | self.len |
143 | } |
144 | |
145 | #[inline (always)] |
146 | pub fn tail(&self, from: usize) -> UnalignedU16Slice { |
147 | // XXX the return value should be restricted not to |
148 | // outlive self. |
149 | assert!(from <= self.len); |
150 | // Safety: This upholds the same invariant: `from` is in bounds and we're returning a shorter slice |
151 | unsafe { UnalignedU16Slice::new(self.ptr.add(from * 2), self.len - from) } |
152 | } |
153 | |
154 | #[cfg (feature = "simd-accel" )] |
155 | #[inline (always)] |
156 | pub fn copy_bmp_to<E: Endian>(&self, other: &mut [u16]) -> Option<(u16, usize)> { |
157 | assert!(self.len <= other.len()); |
158 | let mut offset = 0; |
159 | // Safety: SIMD_STRIDE_SIZE is measured in bytes, whereas len is in u16s. We check we can |
160 | // munch SIMD_STRIDE_SIZE / 2 u16s which means we can write SIMD_STRIDE_SIZE u8s |
161 | if SIMD_STRIDE_SIZE / 2 <= self.len { |
162 | let len_minus_stride = self.len - SIMD_STRIDE_SIZE / 2; |
163 | loop { |
164 | let mut simd = self.simd_at(offset); |
165 | if E::OPPOSITE_ENDIAN { |
166 | simd = simd_byte_swap(simd); |
167 | } |
168 | // Safety: we have enough space on the other side to write this |
169 | unsafe { |
170 | store8_unaligned(other.as_mut_ptr().add(offset), simd); |
171 | } |
172 | if contains_surrogates(simd) { |
173 | break; |
174 | } |
175 | offset += SIMD_STRIDE_SIZE / 2; |
176 | // Safety: This ensures we still have space for writing SIMD_STRIDE_SIZE u8s |
177 | if offset > len_minus_stride { |
178 | break; |
179 | } |
180 | } |
181 | } |
182 | while offset < self.len { |
183 | let unit = swap_if_opposite_endian::<E>(self.at(offset)); |
184 | other[offset] = unit; |
185 | if super::in_range16(unit, 0xD800, 0xE000) { |
186 | return Some((unit, offset)); |
187 | } |
188 | offset += 1; |
189 | } |
190 | None |
191 | } |
192 | |
193 | #[cfg (not(feature = "simd-accel" ))] |
194 | #[inline (always)] |
195 | fn copy_bmp_to<E: Endian>(&self, other: &mut [u16]) -> Option<(u16, usize)> { |
196 | assert!(self.len <= other.len()); |
197 | for (i, target) in other.iter_mut().enumerate().take(self.len) { |
198 | let unit = swap_if_opposite_endian::<E>(self.at(i)); |
199 | *target = unit; |
200 | if super::in_range16(unit, 0xD800, 0xE000) { |
201 | return Some((unit, i)); |
202 | } |
203 | } |
204 | None |
205 | } |
206 | } |
207 | |
208 | #[inline (always)] |
209 | fn copy_unaligned_basic_latin_to_ascii_alu<E: Endian>( |
210 | src: UnalignedU16Slice, |
211 | dst: &mut [u8], |
212 | offset: usize, |
213 | ) -> CopyAsciiResult<usize, (u16, usize)> { |
214 | let len: usize = ::core::cmp::min(v1:src.len(), v2:dst.len()); |
215 | let mut i: usize = 0usize; |
216 | loop { |
217 | if i == len { |
218 | return CopyAsciiResult::Stop(i + offset); |
219 | } |
220 | let unit: u16 = swap_if_opposite_endian::<E>(unit:src.at(i)); |
221 | if unit > 0x7F { |
222 | return CopyAsciiResult::GoOn((unit, i + offset)); |
223 | } |
224 | dst[i] = unit as u8; |
225 | i += 1; |
226 | } |
227 | } |
228 | |
229 | #[inline (always)] |
230 | fn swap_if_opposite_endian<E: Endian>(unit: u16) -> u16 { |
231 | if E::OPPOSITE_ENDIAN { |
232 | unit.swap_bytes() |
233 | } else { |
234 | unit |
235 | } |
236 | } |
237 | |
238 | #[cfg (not(feature = "simd-accel" ))] |
239 | #[inline (always)] |
240 | fn copy_unaligned_basic_latin_to_ascii<E: Endian>( |
241 | src: UnalignedU16Slice, |
242 | dst: &mut [u8], |
243 | ) -> CopyAsciiResult<usize, (u16, usize)> { |
244 | copy_unaligned_basic_latin_to_ascii_alu::<E>(src, dst, offset:0) |
245 | } |
246 | |
/// Basic Latin fast path used when the `simd-accel` feature is on.
///
/// While at least `SIMD_STRIDE_SIZE` code units can still be both read from
/// `src` and written to `dst`, two SIMD registers of code units are loaded,
/// byte-swapped if `E` is the opposite byte order, and, if
/// `simd_is_basic_latin` accepts them, packed into `SIMD_STRIDE_SIZE` bytes
/// and stored. Whatever is left (including a stride that was rejected) is
/// handed to `copy_unaligned_basic_latin_to_ascii_alu`, whose result is
/// returned; positions in it are relative to the start of `src` and `dst`
/// because the number of units already handled is passed along as `offset`.
///
/// NOTE(review): `simd_is_basic_latin` and `simd_pack` are defined outside
/// this part of the file; presumably they test for "all lanes <= 0x7F" and
/// narrow sixteen u16 lanes to sixteen bytes respectively. Confirm.
#[cfg(feature = "simd-accel")]
#[inline(always)]
fn copy_unaligned_basic_latin_to_ascii<E: Endian>(
    src: UnalignedU16Slice,
    dst: &mut [u8],
) -> CopyAsciiResult<usize, (u16, usize)> {
    // Both sides are indexed in lock step: one source code unit becomes one
    // destination byte.
    let len = ::core::cmp::min(src.len(), dst.len());
    let mut offset = 0;
    // Safety: This check ensures we are able to read/write at least SIMD_STRIDE_SIZE elements
    if SIMD_STRIDE_SIZE <= len {
        // Largest `offset` at which a full stride still fits.
        let len_minus_stride = len - SIMD_STRIDE_SIZE;
        loop {
            // One stride of output bytes needs two registers of input u16s.
            let mut first = src.simd_at(offset);
            let mut second = src.simd_at(offset + (SIMD_STRIDE_SIZE / 2));
            if E::OPPOSITE_ENDIAN {
                first = simd_byte_swap(first);
                second = simd_byte_swap(second);
            }
            // OR-ing lets a single test cover both registers.
            if !simd_is_basic_latin(first | second) {
                // Leave `offset` at the start of this stride; the scalar
                // code below finds the exact non-ASCII unit.
                break;
            }
            let packed = simd_pack(first, second);
            // Safety: We are able to write SIMD_STRIDE_SIZE elements in this iteration
            unsafe {
                store16_unaligned(dst.as_mut_ptr().add(offset), packed);
            }
            offset += SIMD_STRIDE_SIZE;
            // Safety: This is `offset > len - SIMD_STRIDE_SIZE`, which ensures that we can write at least SIMD_STRIDE_SIZE elements
            // in the next iteration
            if offset > len_minus_stride {
                break;
            }
        }
    }
    copy_unaligned_basic_latin_to_ascii_alu::<E>(src.tail(offset), &mut dst[offset..], offset)
}
283 | |
/// Converts potentially unaligned UTF-16 code units of byte order `E` into
/// UTF-8.
///
/// Returns `(read, written, unpaired)`:
///
/// * `read` is the number of code units of `src` consumed,
/// * `written` is the number of bytes stored into `dst`,
/// * `unpaired` is `true` when conversion stopped at an unpaired surrogate.
///   In that case `read` already counts the offending surrogate, nothing was
///   written for it, and a non-low-surrogate unit that followed a high
///   surrogate has not been consumed.
///
/// When `unpaired` is `false`, conversion stopped because `src` ran out or
/// because fewer than four bytes of `dst` remained (four being the longest
/// sequence written for one character here). A `dst` shorter than four bytes
/// converts nothing and yields `(0, 0, false)`.
#[inline(always)]
fn convert_unaligned_utf16_to_utf8<E: Endian>(
    src: UnalignedU16Slice,
    dst: &mut [u8],
) -> (usize, usize, bool) {
    if dst.len() < 4 {
        return (0, 0, false);
    }
    let mut src_pos = 0usize;
    let mut dst_pos = 0usize;
    let src_len = src.len();
    // Writing a non-ASCII character is only attempted while
    // `dst_pos < dst.len() - 3`, i.e. while at least four bytes are free.
    let dst_len_minus_three = dst.len() - 3;
    'outer: loop {
        // Fast path: copy a run of ASCII and obtain the unit that ended it.
        let mut non_ascii = match copy_unaligned_basic_latin_to_ascii::<E>(
            src.tail(src_pos),
            &mut dst[dst_pos..],
        ) {
            CopyAsciiResult::GoOn((unit, read_written)) => {
                // The non-ASCII unit itself is not counted yet.
                src_pos += read_written;
                dst_pos += read_written;
                unit
            }
            CopyAsciiResult::Stop(read_written) => {
                return (src_pos + read_written, dst_pos + read_written, false);
            }
        };
        if dst_pos >= dst_len_minus_three {
            break 'outer;
        }
        // We have enough destination space to commit to
        // having read `non_ascii`.
        src_pos += 1;
        'inner: loop {
            // Subtracting 0xD800 with wrap-around maps the surrogate range
            // 0xD800..=0xDFFF onto 0..=0x7FF and everything else above it.
            let non_ascii_minus_surrogate_start = non_ascii.wrapping_sub(0xD800);
            if non_ascii_minus_surrogate_start > (0xDFFF - 0xD800) {
                // Not a surrogate: two- or three-byte UTF-8 sequence.
                if non_ascii < 0x800 {
                    dst[dst_pos] = ((non_ascii >> 6) | 0xC0) as u8;
                    dst_pos += 1;
                    dst[dst_pos] = ((non_ascii & 0x3F) | 0x80) as u8;
                    dst_pos += 1;
                } else {
                    dst[dst_pos] = ((non_ascii >> 12) | 0xE0) as u8;
                    dst_pos += 1;
                    dst[dst_pos] = (((non_ascii & 0xFC0) >> 6) | 0x80) as u8;
                    dst_pos += 1;
                    dst[dst_pos] = ((non_ascii & 0x3F) | 0x80) as u8;
                    dst_pos += 1;
                }
            } else if non_ascii_minus_surrogate_start <= (0xDBFF - 0xD800) {
                // high surrogate
                if src_pos < src_len {
                    let second = swap_if_opposite_endian::<E>(src.at(src_pos));
                    let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
                    if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) {
                        // The next code unit is a low surrogate. Advance position.
                        src_pos += 1;
                        // Combine the pair into a scalar value; the constant
                        // folds the two surrogate bases and the 0x10000
                        // offset into a single subtraction.
                        let point = (u32::from(non_ascii) << 10) + u32::from(second)
                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);

                        // Four-byte UTF-8 sequence.
                        dst[dst_pos] = ((point >> 18) | 0xF0u32) as u8;
                        dst_pos += 1;
                        dst[dst_pos] = (((point & 0x3F000u32) >> 12) | 0x80u32) as u8;
                        dst_pos += 1;
                        dst[dst_pos] = (((point & 0xFC0u32) >> 6) | 0x80u32) as u8;
                        dst_pos += 1;
                        dst[dst_pos] = ((point & 0x3Fu32) | 0x80u32) as u8;
                        dst_pos += 1;
                    } else {
                        // The next code unit is not a low surrogate. Don't advance
                        // position and treat the high surrogate as unpaired.
                        return (src_pos, dst_pos, true);
                    }
                } else {
                    // Unpaired surrogate at the end of buffer
                    return (src_pos, dst_pos, true);
                }
            } else {
                // Unpaired low surrogate
                return (src_pos, dst_pos, true);
            }
            if dst_pos >= dst_len_minus_three || src_pos == src_len {
                break 'outer;
            }
            // Peek at the next unit without the fast path: stay in the inner
            // loop for consecutive non-ASCII, otherwise write the single
            // ASCII byte and go back to the ASCII fast path.
            let unit = swap_if_opposite_endian::<E>(src.at(src_pos));
            src_pos += 1;
            if unit > 0x7F {
                non_ascii = unit;
                continue 'inner;
            }
            dst[dst_pos] = unit as u8;
            dst_pos += 1;
            continue 'outer;
        }
    }
    (src_pos, dst_pos, false)
}
380 | |
381 | // Byte source |
382 | |
383 | pub struct ByteSource<'a> { |
384 | slice: &'a [u8], |
385 | pos: usize, |
386 | } |
387 | |
388 | impl<'a> ByteSource<'a> { |
389 | #[inline (always)] |
390 | pub fn new(src: &[u8]) -> ByteSource { |
391 | ByteSource { slice: src, pos: 0 } |
392 | } |
393 | #[inline (always)] |
394 | pub fn check_available<'b>(&'b mut self) -> Space<ByteReadHandle<'b, 'a>> { |
395 | if self.pos < self.slice.len() { |
396 | Space::Available(ByteReadHandle::new(self)) |
397 | } else { |
398 | Space::Full(self.consumed()) |
399 | } |
400 | } |
401 | #[inline (always)] |
402 | fn read(&mut self) -> u8 { |
403 | let ret = self.slice[self.pos]; |
404 | self.pos += 1; |
405 | ret |
406 | } |
407 | #[inline (always)] |
408 | fn unread(&mut self) -> usize { |
409 | self.pos -= 1; |
410 | self.pos |
411 | } |
412 | #[inline (always)] |
413 | pub fn consumed(&self) -> usize { |
414 | self.pos |
415 | } |
416 | } |
417 | |
418 | pub struct ByteReadHandle<'a, 'b> |
419 | where |
420 | 'b: 'a, |
421 | { |
422 | source: &'a mut ByteSource<'b>, |
423 | } |
424 | |
425 | impl<'a, 'b> ByteReadHandle<'a, 'b> |
426 | where |
427 | 'b: 'a, |
428 | { |
429 | #[inline (always)] |
430 | fn new(src: &'a mut ByteSource<'b>) -> ByteReadHandle<'a, 'b> { |
431 | ByteReadHandle { source: src } |
432 | } |
433 | #[inline (always)] |
434 | pub fn read(self) -> (u8, ByteUnreadHandle<'a, 'b>) { |
435 | let byte: u8 = self.source.read(); |
436 | let handle: ByteUnreadHandle<'_, '_> = ByteUnreadHandle::new(self.source); |
437 | (byte, handle) |
438 | } |
439 | #[inline (always)] |
440 | pub fn consumed(&self) -> usize { |
441 | self.source.consumed() |
442 | } |
443 | } |
444 | |
445 | pub struct ByteUnreadHandle<'a, 'b> |
446 | where |
447 | 'b: 'a, |
448 | { |
449 | source: &'a mut ByteSource<'b>, |
450 | } |
451 | |
452 | impl<'a, 'b> ByteUnreadHandle<'a, 'b> |
453 | where |
454 | 'b: 'a, |
455 | { |
456 | #[inline (always)] |
457 | fn new(src: &'a mut ByteSource<'b>) -> ByteUnreadHandle<'a, 'b> { |
458 | ByteUnreadHandle { source: src } |
459 | } |
460 | #[inline (always)] |
461 | pub fn unread(self) -> usize { |
462 | self.source.unread() |
463 | } |
464 | #[inline (always)] |
465 | pub fn consumed(&self) -> usize { |
466 | self.source.consumed() |
467 | } |
468 | #[inline (always)] |
469 | pub fn commit(self) -> &'a mut ByteSource<'b> { |
470 | self.source |
471 | } |
472 | } |
473 | |
474 | // UTF-16 destination |
475 | |
476 | pub struct Utf16BmpHandle<'a, 'b> |
477 | where |
478 | 'b: 'a, |
479 | { |
480 | dest: &'a mut Utf16Destination<'b>, |
481 | } |
482 | |
483 | impl<'a, 'b> Utf16BmpHandle<'a, 'b> |
484 | where |
485 | 'b: 'a, |
486 | { |
487 | #[inline (always)] |
488 | fn new(dst: &'a mut Utf16Destination<'b>) -> Utf16BmpHandle<'a, 'b> { |
489 | Utf16BmpHandle { dest: dst } |
490 | } |
491 | #[inline (always)] |
492 | pub fn written(&self) -> usize { |
493 | self.dest.written() |
494 | } |
495 | #[inline (always)] |
496 | pub fn write_ascii(self, ascii: u8) -> &'a mut Utf16Destination<'b> { |
497 | self.dest.write_ascii(ascii); |
498 | self.dest |
499 | } |
500 | #[inline (always)] |
501 | pub fn write_bmp(self, bmp: u16) -> &'a mut Utf16Destination<'b> { |
502 | self.dest.write_bmp(bmp); |
503 | self.dest |
504 | } |
505 | #[inline (always)] |
506 | pub fn write_bmp_excl_ascii(self, bmp: u16) -> &'a mut Utf16Destination<'b> { |
507 | self.dest.write_bmp_excl_ascii(bmp); |
508 | self.dest |
509 | } |
510 | #[inline (always)] |
511 | pub fn write_mid_bmp(self, bmp: u16) -> &'a mut Utf16Destination<'b> { |
512 | self.dest.write_mid_bmp(bmp); |
513 | self.dest |
514 | } |
515 | #[inline (always)] |
516 | pub fn write_upper_bmp(self, bmp: u16) -> &'a mut Utf16Destination<'b> { |
517 | self.dest.write_upper_bmp(bmp); |
518 | self.dest |
519 | } |
520 | #[inline (always)] |
521 | pub fn commit(self) -> &'a mut Utf16Destination<'b> { |
522 | self.dest |
523 | } |
524 | } |
525 | |
526 | pub struct Utf16AstralHandle<'a, 'b> |
527 | where |
528 | 'b: 'a, |
529 | { |
530 | dest: &'a mut Utf16Destination<'b>, |
531 | } |
532 | |
533 | impl<'a, 'b> Utf16AstralHandle<'a, 'b> |
534 | where |
535 | 'b: 'a, |
536 | { |
537 | #[inline (always)] |
538 | fn new(dst: &'a mut Utf16Destination<'b>) -> Utf16AstralHandle<'a, 'b> { |
539 | Utf16AstralHandle { dest: dst } |
540 | } |
541 | #[inline (always)] |
542 | pub fn written(&self) -> usize { |
543 | self.dest.written() |
544 | } |
545 | #[inline (always)] |
546 | pub fn write_ascii(self, ascii: u8) -> &'a mut Utf16Destination<'b> { |
547 | self.dest.write_ascii(ascii); |
548 | self.dest |
549 | } |
550 | #[inline (always)] |
551 | pub fn write_bmp(self, bmp: u16) -> &'a mut Utf16Destination<'b> { |
552 | self.dest.write_bmp(bmp); |
553 | self.dest |
554 | } |
555 | #[inline (always)] |
556 | pub fn write_bmp_excl_ascii(self, bmp: u16) -> &'a mut Utf16Destination<'b> { |
557 | self.dest.write_bmp_excl_ascii(bmp); |
558 | self.dest |
559 | } |
560 | #[inline (always)] |
561 | pub fn write_upper_bmp(self, bmp: u16) -> &'a mut Utf16Destination<'b> { |
562 | self.dest.write_upper_bmp(bmp); |
563 | self.dest |
564 | } |
565 | #[inline (always)] |
566 | pub fn write_astral(self, astral: u32) -> &'a mut Utf16Destination<'b> { |
567 | self.dest.write_astral(astral); |
568 | self.dest |
569 | } |
570 | #[inline (always)] |
571 | pub fn write_surrogate_pair(self, high: u16, low: u16) -> &'a mut Utf16Destination<'b> { |
572 | self.dest.write_surrogate_pair(high, low); |
573 | self.dest |
574 | } |
575 | #[inline (always)] |
576 | pub fn write_big5_combination( |
577 | self, |
578 | combined: u16, |
579 | combining: u16, |
580 | ) -> &'a mut Utf16Destination<'b> { |
581 | self.dest.write_big5_combination(combined, combining); |
582 | self.dest |
583 | } |
584 | #[inline (always)] |
585 | pub fn commit(self) -> &'a mut Utf16Destination<'b> { |
586 | self.dest |
587 | } |
588 | } |
589 | |
/// Write cursor over a borrowed `u16` buffer.
///
/// Individual code units are stored without a bounds check, so every write
/// has to go through a handle obtained from one of the `check_space_*` or
/// `copy_ascii_from_check_space_*` methods, which establish that enough room
/// is left before handing the handle out.
pub struct Utf16Destination<'a> {
    slice: &'a mut [u16],
    pos: usize,
}

impl<'a> Utf16Destination<'a> {
    /// Creates a destination positioned at the start of `dst`.
    #[inline(always)]
    pub fn new(dst: &mut [u16]) -> Utf16Destination {
        Utf16Destination { slice: dst, pos: 0 }
    }
    /// Returns a handle if at least one code unit of space is left,
    /// otherwise the number of code units written so far.
    #[inline(always)]
    pub fn check_space_bmp<'b>(&'b mut self) -> Space<Utf16BmpHandle<'b, 'a>> {
        if self.pos < self.slice.len() {
            Space::Available(Utf16BmpHandle::new(self))
        } else {
            Space::Full(self.written())
        }
    }
    /// Returns a handle if at least two code units of space are left,
    /// otherwise the number of code units written so far.
    #[inline(always)]
    pub fn check_space_astral<'b>(&'b mut self) -> Space<Utf16AstralHandle<'b, 'a>> {
        if self.pos + 1 < self.slice.len() {
            Space::Available(Utf16AstralHandle::new(self))
        } else {
            Space::Full(self.written())
        }
    }
    /// Number of code units written so far.
    #[inline(always)]
    pub fn written(&self) -> usize {
        self.pos
    }
    /// Stores one code unit at the cursor, without a bounds check, and
    /// advances the cursor.
    #[inline(always)]
    fn write_code_unit(&mut self, u: u16) {
        unsafe {
            // OK, because we checked before handing out a handle.
            *(self.slice.get_unchecked_mut(self.pos)) = u;
        }
        self.pos += 1;
    }
    /// Writes an ASCII byte as one code unit.
    #[inline(always)]
    fn write_ascii(&mut self, ascii: u8) {
        debug_assert!(ascii < 0x80);
        self.write_code_unit(u16::from(ascii));
    }
    /// Writes one code unit as is.
    #[inline(always)]
    fn write_bmp(&mut self, bmp: u16) {
        self.write_code_unit(bmp);
    }
    /// Writes one code unit that is expected to be non-ASCII (checked in
    /// debug builds only).
    #[inline(always)]
    fn write_bmp_excl_ascii(&mut self, bmp: u16) {
        debug_assert!(bmp >= 0x80);
        self.write_code_unit(bmp);
    }
    /// Writes one code unit that is expected to be non-ASCII (checked in
    /// debug builds only). For UTF-16 this is the same operation as
    /// `write_bmp_excl_ascii`.
    #[inline(always)]
    fn write_mid_bmp(&mut self, bmp: u16) {
        debug_assert!(bmp >= 0x80); // XXX
        self.write_code_unit(bmp);
    }
    /// Writes one code unit that is expected to be non-ASCII (checked in
    /// debug builds only). For UTF-16 this is the same operation as
    /// `write_bmp_excl_ascii`.
    #[inline(always)]
    fn write_upper_bmp(&mut self, bmp: u16) {
        debug_assert!(bmp >= 0x80);
        self.write_code_unit(bmp);
    }
    /// Writes a code point in the range U+10000..=U+10FFFF as two code units.
    #[inline(always)]
    fn write_astral(&mut self, astral: u32) {
        debug_assert!(astral > 0xFFFF);
        debug_assert!(astral <= 0x10_FFFF);
        // 0xD7C0 is 0xD800 - (0x10000 >> 10): the subtraction of 0x10000
        // required by UTF-16 is folded into the high-surrogate base.
        self.write_code_unit((0xD7C0 + (astral >> 10)) as u16);
        self.write_code_unit((0xDC00 + (astral & 0x3FF)) as u16);
    }
    /// Writes two code units as given, `high` first.
    #[inline(always)]
    fn write_surrogate_pair(&mut self, high: u16, low: u16) {
        self.write_code_unit(high);
        self.write_code_unit(low);
    }
    /// Writes two non-ASCII code units, `combined` first.
    #[inline(always)]
    fn write_big5_combination(&mut self, combined: u16, combining: u16) {
        self.write_bmp_excl_ascii(combined);
        self.write_bmp_excl_ascii(combining);
    }
    /// Copies leading ASCII bytes from `source`, zero-extended to code
    /// units, and stops at the first non-ASCII byte.
    ///
    /// Returns `Stop((result, source_pos, dest_pos))` when the shorter of
    /// the two buffers was used up without meeting a non-ASCII byte;
    /// `result` is `OutputFull` if the destination was the shorter one and
    /// `InputEmpty` otherwise. Returns `GoOn((byte, handle))` with the
    /// non-ASCII byte, which has been consumed from `source`, and a handle
    /// with room for one code unit.
    // Safety-usable invariant: CopyAsciiResult::GoOn will only contain bytes >=0x80
    #[inline(always)]
    pub fn copy_ascii_from_check_space_bmp<'b>(
        &'b mut self,
        source: &mut ByteSource,
    ) -> CopyAsciiResult<(DecoderResult, usize, usize), (u8, Utf16BmpHandle<'b, 'a>)> {
        let non_ascii_ret = {
            let src_remaining = &source.slice[source.pos..];
            let dst_remaining = &mut self.slice[self.pos..];
            // Decide up front what to report if the whole run is ASCII.
            let (pending, length) = if dst_remaining.len() < src_remaining.len() {
                (DecoderResult::OutputFull, dst_remaining.len())
            } else {
                (DecoderResult::InputEmpty, src_remaining.len())
            };
            // Safety: This function is documented as needing valid pointers for src/dest and len, which
            // is true since we've passed the minimum length of the two
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    source.pos += length;
                    self.pos += length;
                    return CopyAsciiResult::Stop((pending, source.pos, self.pos));
                }
                // Safety: the function is documented as returning bytes >=0x80 in the Some
                Some((non_ascii, consumed)) => {
                    // `consumed` is less than `length`, so at least one code
                    // unit of space is left for the handle returned below.
                    source.pos += consumed;
                    self.pos += consumed;
                    source.pos += 1; // +1 for non_ascii
                                     // Safety: non-ascii bubbled out here
                    non_ascii
                }
            }
        };
        // Safety: non-ascii returned here
        CopyAsciiResult::GoOn((non_ascii_ret, Utf16BmpHandle::new(self)))
    }
    /// Like `copy_ascii_from_check_space_bmp`, but the handle returned with
    /// `GoOn` has room for two code units.
    ///
    /// If a non-ASCII byte is found while fewer than two code units of space
    /// remain, the byte is left unconsumed and
    /// `Stop((OutputFull, source_pos, dest_pos))` is returned instead.
    // Safety-usable invariant: CopyAsciiResult::GoOn will only contain bytes >=0x80
    #[inline(always)]
    pub fn copy_ascii_from_check_space_astral<'b>(
        &'b mut self,
        source: &mut ByteSource,
    ) -> CopyAsciiResult<(DecoderResult, usize, usize), (u8, Utf16AstralHandle<'b, 'a>)> {
        let non_ascii_ret = {
            let dst_len = self.slice.len();
            let src_remaining = &source.slice[source.pos..];
            let dst_remaining = &mut self.slice[self.pos..];
            // Decide up front what to report if the whole run is ASCII.
            let (pending, length) = if dst_remaining.len() < src_remaining.len() {
                (DecoderResult::OutputFull, dst_remaining.len())
            } else {
                (DecoderResult::InputEmpty, src_remaining.len())
            };
            // Safety: This function is documented as needing valid pointers for src/dest and len, which
            // is true since we've passed the minimum length of the two
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    source.pos += length;
                    self.pos += length;
                    return CopyAsciiResult::Stop((pending, source.pos, self.pos));
                }
                // Safety: the function is documented as returning bytes >=0x80 in the Some
                Some((non_ascii, consumed)) => {
                    source.pos += consumed;
                    self.pos += consumed;
                    // Same condition as `check_space_astral`.
                    if self.pos + 1 < dst_len {
                        source.pos += 1; // +1 for non_ascii
                                         // Safety: non-ascii bubbled out here
                        non_ascii
                    } else {
                        return CopyAsciiResult::Stop((
                            DecoderResult::OutputFull,
                            source.pos,
                            self.pos,
                        ));
                    }
                }
            }
        };
        // Safety: non-ascii returned here
        CopyAsciiResult::GoOn((non_ascii_ret, Utf16AstralHandle::new(self)))
    }
    /// Converts UTF-8 from `source` with
    /// `convert_utf8_to_utf16_up_to_invalid` and advances both cursors by
    /// the amounts it reports as read and written.
    #[inline(always)]
    pub fn copy_utf8_up_to_invalid_from(&mut self, source: &mut ByteSource) {
        let src_remaining = &source.slice[source.pos..];
        let dst_remaining = &mut self.slice[self.pos..];
        let (read, written) = convert_utf8_to_utf16_up_to_invalid(src_remaining, dst_remaining);
        source.pos += read;
        self.pos += written;
    }
    /// Copies UTF-16 code units of byte order `E` from `source`, converting
    /// them to native byte order.
    ///
    /// As many whole code units are considered as fit in both buffers; a
    /// high surrogate in the last considered position is left unread because
    /// its partner may arrive later.
    ///
    /// Returns `None` when all considered code units were copied (including
    /// the case where there were none). Returns
    /// `Some((source_pos, dest_pos))` on an unpaired surrogate: the source
    /// position is just past the surrogate, and the destination position
    /// does not count it.
    #[inline(always)]
    pub fn copy_utf16_from<E: Endian>(
        &mut self,
        source: &mut ByteSource,
    ) -> Option<(usize, usize)> {
        let src_remaining = &source.slice[source.pos..];
        let dst_remaining = &mut self.slice[self.pos..];

        // Safety: the length is at most `src_remaining.len() / 2` code units,
        // so 2 * len bytes are readable from the start of `src_remaining`.
        let mut src_unaligned = unsafe {
            UnalignedU16Slice::new(
                src_remaining.as_ptr(),
                ::core::cmp::min(src_remaining.len() / 2, dst_remaining.len()),
            )
        };
        if src_unaligned.len() == 0 {
            return None;
        }
        let last_unit = swap_if_opposite_endian::<E>(src_unaligned.at(src_unaligned.len() - 1));
        if super::in_range16(last_unit, 0xD800, 0xDC00) {
            // Last code unit is a high surrogate. It might
            // legitimately form a pair later, so let's not
            // include it.
            src_unaligned.trim_last();
        }
        // Index, in code units, of the next unit to handle.
        let mut offset = 0usize;
        loop {
            if let Some((surrogate, bmp_len)) = {
                let src_left = src_unaligned.tail(offset);
                let dst_left = &mut dst_remaining[offset..src_unaligned.len()];
                src_left.copy_bmp_to::<E>(dst_left)
            } {
                offset += bmp_len; // surrogate has not been consumed yet
                let second_pos = offset + 1;
                // A low surrogate first, or a high surrogate with nothing
                // after it, cannot be part of a pair.
                if surrogate > 0xDBFF || second_pos == src_unaligned.len() {
                    // Unpaired surrogate
                    source.pos += second_pos * 2;
                    self.pos += offset;
                    return Some((source.pos, self.pos));
                }
                let second = swap_if_opposite_endian::<E>(src_unaligned.at(second_pos));
                if !super::in_range16(second, 0xDC00, 0xE000) {
                    // Unpaired surrogate
                    source.pos += second_pos * 2;
                    self.pos += offset;
                    return Some((source.pos, self.pos));
                }
                // `surrogate` was already speculatively written
                dst_remaining[second_pos] = second;
                offset += 2;
                continue;
            } else {
                source.pos += src_unaligned.len() * 2;
                self.pos += src_unaligned.len();
                return None;
            }
        }
    }
}
818 | |
819 | // UTF-8 destination |
820 | |
821 | pub struct Utf8BmpHandle<'a, 'b> |
822 | where |
823 | 'b: 'a, |
824 | { |
825 | dest: &'a mut Utf8Destination<'b>, |
826 | } |
827 | |
828 | impl<'a, 'b> Utf8BmpHandle<'a, 'b> |
829 | where |
830 | 'b: 'a, |
831 | { |
832 | #[inline (always)] |
833 | fn new(dst: &'a mut Utf8Destination<'b>) -> Utf8BmpHandle<'a, 'b> { |
834 | Utf8BmpHandle { dest: dst } |
835 | } |
836 | #[inline (always)] |
837 | pub fn written(&self) -> usize { |
838 | self.dest.written() |
839 | } |
840 | #[inline (always)] |
841 | pub fn write_ascii(self, ascii: u8) -> &'a mut Utf8Destination<'b> { |
842 | self.dest.write_ascii(ascii); |
843 | self.dest |
844 | } |
845 | #[inline (always)] |
846 | pub fn write_bmp(self, bmp: u16) -> &'a mut Utf8Destination<'b> { |
847 | self.dest.write_bmp(bmp); |
848 | self.dest |
849 | } |
850 | #[inline (always)] |
851 | pub fn write_bmp_excl_ascii(self, bmp: u16) -> &'a mut Utf8Destination<'b> { |
852 | self.dest.write_bmp_excl_ascii(bmp); |
853 | self.dest |
854 | } |
855 | #[inline (always)] |
856 | pub fn write_mid_bmp(self, bmp: u16) -> &'a mut Utf8Destination<'b> { |
857 | self.dest.write_mid_bmp(bmp); |
858 | self.dest |
859 | } |
860 | #[inline (always)] |
861 | pub fn write_upper_bmp(self, bmp: u16) -> &'a mut Utf8Destination<'b> { |
862 | self.dest.write_upper_bmp(bmp); |
863 | self.dest |
864 | } |
865 | #[inline (always)] |
866 | pub fn commit(self) -> &'a mut Utf8Destination<'b> { |
867 | self.dest |
868 | } |
869 | } |
870 | |
871 | pub struct Utf8AstralHandle<'a, 'b> |
872 | where |
873 | 'b: 'a, |
874 | { |
875 | dest: &'a mut Utf8Destination<'b>, |
876 | } |
877 | |
878 | impl<'a, 'b> Utf8AstralHandle<'a, 'b> |
879 | where |
880 | 'b: 'a, |
881 | { |
882 | #[inline (always)] |
883 | fn new(dst: &'a mut Utf8Destination<'b>) -> Utf8AstralHandle<'a, 'b> { |
884 | Utf8AstralHandle { dest: dst } |
885 | } |
886 | #[inline (always)] |
887 | pub fn written(&self) -> usize { |
888 | self.dest.written() |
889 | } |
890 | #[inline (always)] |
891 | pub fn write_ascii(self, ascii: u8) -> &'a mut Utf8Destination<'b> { |
892 | self.dest.write_ascii(ascii); |
893 | self.dest |
894 | } |
895 | #[inline (always)] |
896 | pub fn write_bmp(self, bmp: u16) -> &'a mut Utf8Destination<'b> { |
897 | self.dest.write_bmp(bmp); |
898 | self.dest |
899 | } |
900 | #[inline (always)] |
901 | pub fn write_bmp_excl_ascii(self, bmp: u16) -> &'a mut Utf8Destination<'b> { |
902 | self.dest.write_bmp_excl_ascii(bmp); |
903 | self.dest |
904 | } |
905 | #[inline (always)] |
906 | pub fn write_upper_bmp(self, bmp: u16) -> &'a mut Utf8Destination<'b> { |
907 | self.dest.write_upper_bmp(bmp); |
908 | self.dest |
909 | } |
910 | #[inline (always)] |
911 | pub fn write_astral(self, astral: u32) -> &'a mut Utf8Destination<'b> { |
912 | self.dest.write_astral(astral); |
913 | self.dest |
914 | } |
915 | #[inline (always)] |
916 | pub fn write_surrogate_pair(self, high: u16, low: u16) -> &'a mut Utf8Destination<'b> { |
917 | self.dest.write_surrogate_pair(high, low); |
918 | self.dest |
919 | } |
920 | #[inline (always)] |
921 | pub fn write_big5_combination( |
922 | self, |
923 | combined: u16, |
924 | combining: u16, |
925 | ) -> &'a mut Utf8Destination<'b> { |
926 | self.dest.write_big5_combination(combined, combining); |
927 | self.dest |
928 | } |
929 | #[inline (always)] |
930 | pub fn commit(self) -> &'a mut Utf8Destination<'b> { |
931 | self.dest |
932 | } |
933 | } |
934 | |
/// Write cursor over a borrowed byte buffer that receives UTF-8.
pub struct Utf8Destination<'a> {
    /// The output buffer.
    slice: &'a mut [u8],
    /// Index of the next byte to write; also the number of bytes written.
    pos: usize,
}
939 | |
940 | impl<'a> Utf8Destination<'a> { |
941 | #[inline (always)] |
942 | pub fn new(dst: &mut [u8]) -> Utf8Destination { |
943 | Utf8Destination { slice: dst, pos: 0 } |
944 | } |
945 | #[inline (always)] |
946 | pub fn check_space_bmp<'b>(&'b mut self) -> Space<Utf8BmpHandle<'b, 'a>> { |
947 | if self.pos + 2 < self.slice.len() { |
948 | Space::Available(Utf8BmpHandle::new(self)) |
949 | } else { |
950 | Space::Full(self.written()) |
951 | } |
952 | } |
953 | #[inline (always)] |
954 | pub fn check_space_astral<'b>(&'b mut self) -> Space<Utf8AstralHandle<'b, 'a>> { |
955 | if self.pos + 3 < self.slice.len() { |
956 | Space::Available(Utf8AstralHandle::new(self)) |
957 | } else { |
958 | Space::Full(self.written()) |
959 | } |
960 | } |
961 | #[inline (always)] |
962 | pub fn written(&self) -> usize { |
963 | self.pos |
964 | } |
965 | #[inline (always)] |
966 | fn write_code_unit(&mut self, u: u8) { |
967 | unsafe { |
968 | // OK, because we checked before handing out a handle. |
969 | *(self.slice.get_unchecked_mut(self.pos)) = u; |
970 | } |
971 | self.pos += 1; |
972 | } |
973 | #[inline (always)] |
974 | fn write_ascii(&mut self, ascii: u8) { |
975 | debug_assert!(ascii < 0x80); |
976 | self.write_code_unit(ascii); |
977 | } |
978 | #[inline (always)] |
979 | fn write_bmp(&mut self, bmp: u16) { |
980 | if bmp < 0x80u16 { |
981 | self.write_ascii(bmp as u8); |
982 | } else if bmp < 0x800u16 { |
983 | self.write_mid_bmp(bmp); |
984 | } else { |
985 | self.write_upper_bmp(bmp); |
986 | } |
987 | } |
988 | #[inline (always)] |
989 | fn write_mid_bmp(&mut self, mid_bmp: u16) { |
990 | debug_assert!(mid_bmp >= 0x80); |
991 | debug_assert!(mid_bmp < 0x800); |
992 | self.write_code_unit(((mid_bmp >> 6) | 0xC0) as u8); |
993 | self.write_code_unit(((mid_bmp & 0x3F) | 0x80) as u8); |
994 | } |
995 | #[inline (always)] |
996 | fn write_upper_bmp(&mut self, upper_bmp: u16) { |
997 | debug_assert!(upper_bmp >= 0x800); |
998 | self.write_code_unit(((upper_bmp >> 12) | 0xE0) as u8); |
999 | self.write_code_unit((((upper_bmp & 0xFC0) >> 6) | 0x80) as u8); |
1000 | self.write_code_unit(((upper_bmp & 0x3F) | 0x80) as u8); |
1001 | } |
1002 | #[inline (always)] |
1003 | fn write_bmp_excl_ascii(&mut self, bmp: u16) { |
1004 | if bmp < 0x800u16 { |
1005 | self.write_mid_bmp(bmp); |
1006 | } else { |
1007 | self.write_upper_bmp(bmp); |
1008 | } |
1009 | } |
1010 | #[inline (always)] |
1011 | fn write_astral(&mut self, astral: u32) { |
1012 | debug_assert!(astral > 0xFFFF); |
1013 | debug_assert!(astral <= 0x10_FFFF); |
1014 | self.write_code_unit(((astral >> 18) | 0xF0) as u8); |
1015 | self.write_code_unit((((astral & 0x3F000) >> 12) | 0x80) as u8); |
1016 | self.write_code_unit((((astral & 0xFC0) >> 6) | 0x80) as u8); |
1017 | self.write_code_unit(((astral & 0x3F) | 0x80) as u8); |
1018 | } |
1019 | #[inline (always)] |
1020 | pub fn write_surrogate_pair(&mut self, high: u16, low: u16) { |
1021 | self.write_astral( |
1022 | (u32::from(high) << 10) + u32::from(low) |
1023 | - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32), |
1024 | ); |
1025 | } |
1026 | #[inline (always)] |
1027 | fn write_big5_combination(&mut self, combined: u16, combining: u16) { |
1028 | self.write_mid_bmp(combined); |
1029 | self.write_mid_bmp(combining); |
1030 | } |
1031 | #[inline (always)] |
1032 | pub fn copy_ascii_from_check_space_bmp<'b>( |
1033 | &'b mut self, |
1034 | source: &mut ByteSource, |
1035 | ) -> CopyAsciiResult<(DecoderResult, usize, usize), (u8, Utf8BmpHandle<'b, 'a>)> { |
1036 | let non_ascii_ret = { |
1037 | let dst_len = self.slice.len(); |
1038 | let src_remaining = &source.slice[source.pos..]; |
1039 | let dst_remaining = &mut self.slice[self.pos..]; |
1040 | let (pending, length) = if dst_remaining.len() < src_remaining.len() { |
1041 | (DecoderResult::OutputFull, dst_remaining.len()) |
1042 | } else { |
1043 | (DecoderResult::InputEmpty, src_remaining.len()) |
1044 | }; |
1045 | match unsafe { |
1046 | ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) |
1047 | } { |
1048 | None => { |
1049 | source.pos += length; |
1050 | self.pos += length; |
1051 | return CopyAsciiResult::Stop((pending, source.pos, self.pos)); |
1052 | } |
1053 | Some((non_ascii, consumed)) => { |
1054 | source.pos += consumed; |
1055 | self.pos += consumed; |
1056 | if self.pos + 2 < dst_len { |
1057 | source.pos += 1; // +1 for non_ascii |
1058 | non_ascii |
1059 | } else { |
1060 | return CopyAsciiResult::Stop(( |
1061 | DecoderResult::OutputFull, |
1062 | source.pos, |
1063 | self.pos, |
1064 | )); |
1065 | } |
1066 | } |
1067 | } |
1068 | }; |
1069 | CopyAsciiResult::GoOn((non_ascii_ret, Utf8BmpHandle::new(self))) |
1070 | } |
1071 | #[inline (always)] |
1072 | pub fn copy_ascii_from_check_space_astral<'b>( |
1073 | &'b mut self, |
1074 | source: &mut ByteSource, |
1075 | ) -> CopyAsciiResult<(DecoderResult, usize, usize), (u8, Utf8AstralHandle<'b, 'a>)> { |
1076 | let non_ascii_ret = { |
1077 | let dst_len = self.slice.len(); |
1078 | let src_remaining = &source.slice[source.pos..]; |
1079 | let dst_remaining = &mut self.slice[self.pos..]; |
1080 | let (pending, length) = if dst_remaining.len() < src_remaining.len() { |
1081 | (DecoderResult::OutputFull, dst_remaining.len()) |
1082 | } else { |
1083 | (DecoderResult::InputEmpty, src_remaining.len()) |
1084 | }; |
1085 | match unsafe { |
1086 | ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) |
1087 | } { |
1088 | None => { |
1089 | source.pos += length; |
1090 | self.pos += length; |
1091 | return CopyAsciiResult::Stop((pending, source.pos, self.pos)); |
1092 | } |
1093 | Some((non_ascii, consumed)) => { |
1094 | source.pos += consumed; |
1095 | self.pos += consumed; |
1096 | if self.pos + 3 < dst_len { |
1097 | source.pos += 1; // +1 for non_ascii |
1098 | non_ascii |
1099 | } else { |
1100 | return CopyAsciiResult::Stop(( |
1101 | DecoderResult::OutputFull, |
1102 | source.pos, |
1103 | self.pos, |
1104 | )); |
1105 | } |
1106 | } |
1107 | } |
1108 | }; |
1109 | CopyAsciiResult::GoOn((non_ascii_ret, Utf8AstralHandle::new(self))) |
1110 | } |
1111 | #[inline (always)] |
1112 | pub fn copy_utf8_up_to_invalid_from(&mut self, source: &mut ByteSource) { |
1113 | let src_remaining = &source.slice[source.pos..]; |
1114 | let dst_remaining = &mut self.slice[self.pos..]; |
1115 | let min_len = ::core::cmp::min(src_remaining.len(), dst_remaining.len()); |
1116 | // Validate first, then memcpy to let memcpy do its thing even for |
1117 | // non-ASCII. (And potentially do something better than SSE2 for ASCII.) |
1118 | let valid_len = utf8_valid_up_to(&src_remaining[..min_len]); |
1119 | (&mut dst_remaining[..valid_len]).copy_from_slice(&src_remaining[..valid_len]); |
1120 | source.pos += valid_len; |
1121 | self.pos += valid_len; |
1122 | } |
1123 | #[inline (always)] |
1124 | pub fn copy_utf16_from<E: Endian>( |
1125 | &mut self, |
1126 | source: &mut ByteSource, |
1127 | ) -> Option<(usize, usize)> { |
1128 | let src_remaining = &source.slice[source.pos..]; |
1129 | let dst_remaining = &mut self.slice[self.pos..]; |
1130 | |
1131 | let mut src_unaligned = |
1132 | unsafe { UnalignedU16Slice::new(src_remaining.as_ptr(), src_remaining.len() / 2) }; |
1133 | if src_unaligned.len() == 0 { |
1134 | return None; |
1135 | } |
1136 | let mut last_unit = src_unaligned.at(src_unaligned.len() - 1); |
1137 | if E::OPPOSITE_ENDIAN { |
1138 | last_unit = last_unit.swap_bytes(); |
1139 | } |
1140 | if super::in_range16(last_unit, 0xD800, 0xDC00) { |
1141 | // Last code unit is a high surrogate. It might |
1142 | // legitimately form a pair later, so let's not |
1143 | // include it. |
1144 | src_unaligned.trim_last(); |
1145 | } |
1146 | let (read, written, had_error) = |
1147 | convert_unaligned_utf16_to_utf8::<E>(src_unaligned, dst_remaining); |
1148 | source.pos += read * 2; |
1149 | self.pos += written; |
1150 | if had_error { |
1151 | Some((source.pos, self.pos)) |
1152 | } else { |
1153 | None |
1154 | } |
1155 | } |
1156 | } |
1157 | |
1158 | // UTF-16 source |
1159 | |
1160 | pub struct Utf16Source<'a> { |
1161 | slice: &'a [u16], |
1162 | pos: usize, |
1163 | old_pos: usize, |
1164 | } |
1165 | |
1166 | impl<'a> Utf16Source<'a> { |
1167 | #[inline (always)] |
1168 | pub fn new(src: &[u16]) -> Utf16Source { |
1169 | Utf16Source { |
1170 | slice: src, |
1171 | pos: 0, |
1172 | old_pos: 0, |
1173 | } |
1174 | } |
1175 | #[inline (always)] |
1176 | pub fn check_available<'b>(&'b mut self) -> Space<Utf16ReadHandle<'b, 'a>> { |
1177 | if self.pos < self.slice.len() { |
1178 | Space::Available(Utf16ReadHandle::new(self)) |
1179 | } else { |
1180 | Space::Full(self.consumed()) |
1181 | } |
1182 | } |
1183 | #[cfg_attr (feature = "cargo-clippy" , allow(collapsible_if))] |
1184 | #[inline (always)] |
1185 | fn read(&mut self) -> char { |
1186 | self.old_pos = self.pos; |
1187 | let unit = self.slice[self.pos]; |
1188 | self.pos += 1; |
1189 | let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); |
1190 | if unit_minus_surrogate_start > (0xDFFF - 0xD800) { |
1191 | return unsafe { ::core::char::from_u32_unchecked(u32::from(unit)) }; |
1192 | } |
1193 | if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { |
1194 | // high surrogate |
1195 | if self.pos < self.slice.len() { |
1196 | let second = self.slice[self.pos]; |
1197 | let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00); |
1198 | if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) { |
1199 | // The next code unit is a low surrogate. Advance position. |
1200 | self.pos += 1; |
1201 | return unsafe { |
1202 | ::core::char::from_u32_unchecked( |
1203 | (u32::from(unit) << 10) + u32::from(second) |
1204 | - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32), |
1205 | ) |
1206 | }; |
1207 | } |
1208 | // The next code unit is not a low surrogate. Don't advance |
1209 | // position and treat the high surrogate as unpaired. |
1210 | // fall through |
1211 | } |
1212 | // Unpaired surrogate at the end of buffer, fall through |
1213 | } |
1214 | // Unpaired low surrogate |
1215 | ' \u{FFFD}' |
1216 | } |
1217 | #[cfg_attr (feature = "cargo-clippy" , allow(collapsible_if))] |
1218 | #[inline (always)] |
1219 | fn read_enum(&mut self) -> Unicode { |
1220 | self.old_pos = self.pos; |
1221 | let unit = self.slice[self.pos]; |
1222 | self.pos += 1; |
1223 | if unit < 0x80 { |
1224 | return Unicode::Ascii(unit as u8); |
1225 | } |
1226 | let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); |
1227 | if unit_minus_surrogate_start > (0xDFFF - 0xD800) { |
1228 | return Unicode::NonAscii(NonAscii::BmpExclAscii(unit)); |
1229 | } |
1230 | if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { |
1231 | // high surrogate |
1232 | if self.pos < self.slice.len() { |
1233 | let second = self.slice[self.pos]; |
1234 | let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00); |
1235 | if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) { |
1236 | // The next code unit is a low surrogate. Advance position. |
1237 | self.pos += 1; |
1238 | return Unicode::NonAscii(NonAscii::Astral(unsafe { |
1239 | ::core::char::from_u32_unchecked( |
1240 | (u32::from(unit) << 10) + u32::from(second) |
1241 | - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32), |
1242 | ) |
1243 | })); |
1244 | } |
1245 | // The next code unit is not a low surrogate. Don't advance |
1246 | // position and treat the high surrogate as unpaired. |
1247 | // fall through |
1248 | } |
1249 | // Unpaired surrogate at the end of buffer, fall through |
1250 | } |
1251 | // Unpaired low surrogate |
1252 | Unicode::NonAscii(NonAscii::BmpExclAscii(0xFFFDu16)) |
1253 | } |
1254 | #[inline (always)] |
1255 | fn unread(&mut self) -> usize { |
1256 | self.pos = self.old_pos; |
1257 | self.pos |
1258 | } |
1259 | #[inline (always)] |
1260 | pub fn consumed(&self) -> usize { |
1261 | self.pos |
1262 | } |
1263 | #[inline (always)] |
1264 | pub fn copy_ascii_to_check_space_two<'b>( |
1265 | &mut self, |
1266 | dest: &'b mut ByteDestination<'a>, |
1267 | ) -> CopyAsciiResult<(EncoderResult, usize, usize), (NonAscii, ByteTwoHandle<'b, 'a>)> { |
1268 | let non_ascii_ret = { |
1269 | let dst_len = dest.slice.len(); |
1270 | let src_remaining = &self.slice[self.pos..]; |
1271 | let dst_remaining = &mut dest.slice[dest.pos..]; |
1272 | let (pending, length) = if dst_remaining.len() < src_remaining.len() { |
1273 | (EncoderResult::OutputFull, dst_remaining.len()) |
1274 | } else { |
1275 | (EncoderResult::InputEmpty, src_remaining.len()) |
1276 | }; |
1277 | match unsafe { |
1278 | basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) |
1279 | } { |
1280 | None => { |
1281 | self.pos += length; |
1282 | dest.pos += length; |
1283 | return CopyAsciiResult::Stop((pending, self.pos, dest.pos)); |
1284 | } |
1285 | Some((non_ascii, consumed)) => { |
1286 | self.pos += consumed; |
1287 | dest.pos += consumed; |
1288 | if dest.pos + 1 < dst_len { |
1289 | self.pos += 1; // commit to reading `non_ascii` |
1290 | let unit = non_ascii; |
1291 | let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); |
1292 | if unit_minus_surrogate_start > (0xDFFF - 0xD800) { |
1293 | NonAscii::BmpExclAscii(unit) |
1294 | } else if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { |
1295 | // high surrogate |
1296 | if self.pos < self.slice.len() { |
1297 | let second = self.slice[self.pos]; |
1298 | let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00); |
1299 | if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) { |
1300 | // The next code unit is a low surrogate. Advance position. |
1301 | self.pos += 1; |
1302 | NonAscii::Astral(unsafe { |
1303 | ::core::char::from_u32_unchecked( |
1304 | (u32::from(unit) << 10) + u32::from(second) |
1305 | - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32), |
1306 | ) |
1307 | }) |
1308 | } else { |
1309 | // The next code unit is not a low surrogate. Don't advance |
1310 | // position and treat the high surrogate as unpaired. |
1311 | NonAscii::BmpExclAscii(0xFFFDu16) |
1312 | } |
1313 | } else { |
1314 | // Unpaired surrogate at the end of the buffer. |
1315 | NonAscii::BmpExclAscii(0xFFFDu16) |
1316 | } |
1317 | } else { |
1318 | // Unpaired low surrogate |
1319 | NonAscii::BmpExclAscii(0xFFFDu16) |
1320 | } |
1321 | } else { |
1322 | return CopyAsciiResult::Stop(( |
1323 | EncoderResult::OutputFull, |
1324 | self.pos, |
1325 | dest.pos, |
1326 | )); |
1327 | } |
1328 | } |
1329 | } |
1330 | }; |
1331 | CopyAsciiResult::GoOn((non_ascii_ret, ByteTwoHandle::new(dest))) |
1332 | } |
1333 | #[inline (always)] |
1334 | pub fn copy_ascii_to_check_space_four<'b>( |
1335 | &mut self, |
1336 | dest: &'b mut ByteDestination<'a>, |
1337 | ) -> CopyAsciiResult<(EncoderResult, usize, usize), (NonAscii, ByteFourHandle<'b, 'a>)> { |
1338 | let non_ascii_ret = { |
1339 | let dst_len = dest.slice.len(); |
1340 | let src_remaining = &self.slice[self.pos..]; |
1341 | let dst_remaining = &mut dest.slice[dest.pos..]; |
1342 | let (pending, length) = if dst_remaining.len() < src_remaining.len() { |
1343 | (EncoderResult::OutputFull, dst_remaining.len()) |
1344 | } else { |
1345 | (EncoderResult::InputEmpty, src_remaining.len()) |
1346 | }; |
1347 | match unsafe { |
1348 | basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) |
1349 | } { |
1350 | None => { |
1351 | self.pos += length; |
1352 | dest.pos += length; |
1353 | return CopyAsciiResult::Stop((pending, self.pos, dest.pos)); |
1354 | } |
1355 | Some((non_ascii, consumed)) => { |
1356 | self.pos += consumed; |
1357 | dest.pos += consumed; |
1358 | if dest.pos + 3 < dst_len { |
1359 | self.pos += 1; // commit to reading `non_ascii` |
1360 | let unit = non_ascii; |
1361 | let unit_minus_surrogate_start = unit.wrapping_sub(0xD800); |
1362 | if unit_minus_surrogate_start > (0xDFFF - 0xD800) { |
1363 | NonAscii::BmpExclAscii(unit) |
1364 | } else if unit_minus_surrogate_start <= (0xDBFF - 0xD800) { |
1365 | // high surrogate |
1366 | if self.pos == self.slice.len() { |
1367 | // Unpaired surrogate at the end of the buffer. |
1368 | NonAscii::BmpExclAscii(0xFFFDu16) |
1369 | } else { |
1370 | let second = self.slice[self.pos]; |
1371 | let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00); |
1372 | if second_minus_low_surrogate_start <= (0xDFFF - 0xDC00) { |
1373 | // The next code unit is a low surrogate. Advance position. |
1374 | self.pos += 1; |
1375 | NonAscii::Astral(unsafe { |
1376 | ::core::char::from_u32_unchecked( |
1377 | (u32::from(unit) << 10) + u32::from(second) |
1378 | - (((0xD800u32 << 10) - 0x1_0000u32) + 0xDC00u32), |
1379 | ) |
1380 | }) |
1381 | } else { |
1382 | // The next code unit is not a low surrogate. Don't advance |
1383 | // position and treat the high surrogate as unpaired. |
1384 | NonAscii::BmpExclAscii(0xFFFDu16) |
1385 | } |
1386 | } |
1387 | } else { |
1388 | // Unpaired low surrogate |
1389 | NonAscii::BmpExclAscii(0xFFFDu16) |
1390 | } |
1391 | } else { |
1392 | return CopyAsciiResult::Stop(( |
1393 | EncoderResult::OutputFull, |
1394 | self.pos, |
1395 | dest.pos, |
1396 | )); |
1397 | } |
1398 | } |
1399 | } |
1400 | }; |
1401 | CopyAsciiResult::GoOn((non_ascii_ret, ByteFourHandle::new(dest))) |
1402 | } |
1403 | } |
1404 | |
1405 | pub struct Utf16ReadHandle<'a, 'b> |
1406 | where |
1407 | 'b: 'a, |
1408 | { |
1409 | source: &'a mut Utf16Source<'b>, |
1410 | } |
1411 | |
1412 | impl<'a, 'b> Utf16ReadHandle<'a, 'b> |
1413 | where |
1414 | 'b: 'a, |
1415 | { |
1416 | #[inline (always)] |
1417 | fn new(src: &'a mut Utf16Source<'b>) -> Utf16ReadHandle<'a, 'b> { |
1418 | Utf16ReadHandle { source: src } |
1419 | } |
1420 | #[inline (always)] |
1421 | pub fn read(self) -> (char, Utf16UnreadHandle<'a, 'b>) { |
1422 | let character: char = self.source.read(); |
1423 | let handle: Utf16UnreadHandle<'_, '_> = Utf16UnreadHandle::new(self.source); |
1424 | (character, handle) |
1425 | } |
1426 | #[inline (always)] |
1427 | pub fn read_enum(self) -> (Unicode, Utf16UnreadHandle<'a, 'b>) { |
1428 | let character: Unicode = self.source.read_enum(); |
1429 | let handle: Utf16UnreadHandle<'_, '_> = Utf16UnreadHandle::new(self.source); |
1430 | (character, handle) |
1431 | } |
1432 | #[inline (always)] |
1433 | pub fn consumed(&self) -> usize { |
1434 | self.source.consumed() |
1435 | } |
1436 | } |
1437 | |
1438 | pub struct Utf16UnreadHandle<'a, 'b> |
1439 | where |
1440 | 'b: 'a, |
1441 | { |
1442 | source: &'a mut Utf16Source<'b>, |
1443 | } |
1444 | |
1445 | impl<'a, 'b> Utf16UnreadHandle<'a, 'b> |
1446 | where |
1447 | 'b: 'a, |
1448 | { |
1449 | #[inline (always)] |
1450 | fn new(src: &'a mut Utf16Source<'b>) -> Utf16UnreadHandle<'a, 'b> { |
1451 | Utf16UnreadHandle { source: src } |
1452 | } |
1453 | #[inline (always)] |
1454 | pub fn unread(self) -> usize { |
1455 | self.source.unread() |
1456 | } |
1457 | #[inline (always)] |
1458 | pub fn consumed(&self) -> usize { |
1459 | self.source.consumed() |
1460 | } |
1461 | #[inline (always)] |
1462 | pub fn commit(self) -> &'a mut Utf16Source<'b> { |
1463 | self.source |
1464 | } |
1465 | } |
1466 | |
1467 | // UTF-8 source |
1468 | |
1469 | pub struct Utf8Source<'a> { |
1470 | slice: &'a [u8], |
1471 | pos: usize, |
1472 | old_pos: usize, |
1473 | } |
1474 | |
1475 | impl<'a> Utf8Source<'a> { |
1476 | #[inline (always)] |
1477 | pub fn new(src: &str) -> Utf8Source { |
1478 | Utf8Source { |
1479 | slice: src.as_bytes(), |
1480 | pos: 0, |
1481 | old_pos: 0, |
1482 | } |
1483 | } |
1484 | #[inline (always)] |
1485 | pub fn check_available<'b>(&'b mut self) -> Space<Utf8ReadHandle<'b, 'a>> { |
1486 | if self.pos < self.slice.len() { |
1487 | Space::Available(Utf8ReadHandle::new(self)) |
1488 | } else { |
1489 | Space::Full(self.consumed()) |
1490 | } |
1491 | } |
1492 | #[inline (always)] |
1493 | fn read(&mut self) -> char { |
1494 | self.old_pos = self.pos; |
1495 | let unit = self.slice[self.pos]; |
1496 | if unit < 0x80 { |
1497 | self.pos += 1; |
1498 | return char::from(unit); |
1499 | } |
1500 | if unit < 0xE0 { |
1501 | let point = |
1502 | ((u32::from(unit) & 0x1F) << 6) | (u32::from(self.slice[self.pos + 1]) & 0x3F); |
1503 | self.pos += 2; |
1504 | return unsafe { ::core::char::from_u32_unchecked(point) }; |
1505 | } |
1506 | if unit < 0xF0 { |
1507 | let point = ((u32::from(unit) & 0xF) << 12) |
1508 | | ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 6) |
1509 | | (u32::from(self.slice[self.pos + 2]) & 0x3F); |
1510 | self.pos += 3; |
1511 | return unsafe { ::core::char::from_u32_unchecked(point) }; |
1512 | } |
1513 | let point = ((u32::from(unit) & 0x7) << 18) |
1514 | | ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12) |
1515 | | ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6) |
1516 | | (u32::from(self.slice[self.pos + 3]) & 0x3F); |
1517 | self.pos += 4; |
1518 | unsafe { ::core::char::from_u32_unchecked(point) } |
1519 | } |
1520 | #[inline (always)] |
1521 | fn read_enum(&mut self) -> Unicode { |
1522 | self.old_pos = self.pos; |
1523 | let unit = self.slice[self.pos]; |
1524 | if unit < 0x80 { |
1525 | self.pos += 1; |
1526 | return Unicode::Ascii(unit); |
1527 | } |
1528 | if unit < 0xE0 { |
1529 | let point = |
1530 | ((u16::from(unit) & 0x1F) << 6) | (u16::from(self.slice[self.pos + 1]) & 0x3F); |
1531 | self.pos += 2; |
1532 | return Unicode::NonAscii(NonAscii::BmpExclAscii(point)); |
1533 | } |
1534 | if unit < 0xF0 { |
1535 | let point = ((u16::from(unit) & 0xF) << 12) |
1536 | | ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6) |
1537 | | (u16::from(self.slice[self.pos + 2]) & 0x3F); |
1538 | self.pos += 3; |
1539 | return Unicode::NonAscii(NonAscii::BmpExclAscii(point)); |
1540 | } |
1541 | let point = ((u32::from(unit) & 0x7) << 18) |
1542 | | ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12) |
1543 | | ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6) |
1544 | | (u32::from(self.slice[self.pos + 3]) & 0x3F); |
1545 | self.pos += 4; |
1546 | Unicode::NonAscii(NonAscii::Astral(unsafe { |
1547 | ::core::char::from_u32_unchecked(point) |
1548 | })) |
1549 | } |
1550 | #[inline (always)] |
1551 | fn unread(&mut self) -> usize { |
1552 | self.pos = self.old_pos; |
1553 | self.pos |
1554 | } |
1555 | #[inline (always)] |
1556 | pub fn consumed(&self) -> usize { |
1557 | self.pos |
1558 | } |
1559 | #[inline (always)] |
1560 | pub fn copy_ascii_to_check_space_one<'b>( |
1561 | &mut self, |
1562 | dest: &'b mut ByteDestination<'a>, |
1563 | ) -> CopyAsciiResult<(EncoderResult, usize, usize), (NonAscii, ByteOneHandle<'b, 'a>)> { |
1564 | let non_ascii_ret = { |
1565 | let src_remaining = &self.slice[self.pos..]; |
1566 | let dst_remaining = &mut dest.slice[dest.pos..]; |
1567 | let (pending, length) = if dst_remaining.len() < src_remaining.len() { |
1568 | (EncoderResult::OutputFull, dst_remaining.len()) |
1569 | } else { |
1570 | (EncoderResult::InputEmpty, src_remaining.len()) |
1571 | }; |
1572 | match unsafe { |
1573 | ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) |
1574 | } { |
1575 | None => { |
1576 | self.pos += length; |
1577 | dest.pos += length; |
1578 | return CopyAsciiResult::Stop((pending, self.pos, dest.pos)); |
1579 | } |
1580 | Some((non_ascii, consumed)) => { |
1581 | self.pos += consumed; |
1582 | dest.pos += consumed; |
1583 | // We don't need to check space in destination, because |
1584 | // `ascii_to_ascii()` already did. |
1585 | if non_ascii < 0xE0 { |
1586 | let point = ((u16::from(non_ascii) & 0x1F) << 6) |
1587 | | (u16::from(self.slice[self.pos + 1]) & 0x3F); |
1588 | self.pos += 2; |
1589 | NonAscii::BmpExclAscii(point) |
1590 | } else if non_ascii < 0xF0 { |
1591 | let point = ((u16::from(non_ascii) & 0xF) << 12) |
1592 | | ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6) |
1593 | | (u16::from(self.slice[self.pos + 2]) & 0x3F); |
1594 | self.pos += 3; |
1595 | NonAscii::BmpExclAscii(point) |
1596 | } else { |
1597 | let point = ((u32::from(non_ascii) & 0x7) << 18) |
1598 | | ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12) |
1599 | | ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6) |
1600 | | (u32::from(self.slice[self.pos + 3]) & 0x3F); |
1601 | self.pos += 4; |
1602 | NonAscii::Astral(unsafe { ::core::char::from_u32_unchecked(point) }) |
1603 | } |
1604 | } |
1605 | } |
1606 | }; |
1607 | CopyAsciiResult::GoOn((non_ascii_ret, ByteOneHandle::new(dest))) |
1608 | } |
1609 | #[inline (always)] |
1610 | pub fn copy_ascii_to_check_space_two<'b>( |
1611 | &mut self, |
1612 | dest: &'b mut ByteDestination<'a>, |
1613 | ) -> CopyAsciiResult<(EncoderResult, usize, usize), (NonAscii, ByteTwoHandle<'b, 'a>)> { |
1614 | let non_ascii_ret = { |
1615 | let dst_len = dest.slice.len(); |
1616 | let src_remaining = &self.slice[self.pos..]; |
1617 | let dst_remaining = &mut dest.slice[dest.pos..]; |
1618 | let (pending, length) = if dst_remaining.len() < src_remaining.len() { |
1619 | (EncoderResult::OutputFull, dst_remaining.len()) |
1620 | } else { |
1621 | (EncoderResult::InputEmpty, src_remaining.len()) |
1622 | }; |
1623 | match unsafe { |
1624 | ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) |
1625 | } { |
1626 | None => { |
1627 | self.pos += length; |
1628 | dest.pos += length; |
1629 | return CopyAsciiResult::Stop((pending, self.pos, dest.pos)); |
1630 | } |
1631 | Some((non_ascii, consumed)) => { |
1632 | self.pos += consumed; |
1633 | dest.pos += consumed; |
1634 | if dest.pos + 1 < dst_len { |
1635 | if non_ascii < 0xE0 { |
1636 | let point = ((u16::from(non_ascii) & 0x1F) << 6) |
1637 | | (u16::from(self.slice[self.pos + 1]) & 0x3F); |
1638 | self.pos += 2; |
1639 | NonAscii::BmpExclAscii(point) |
1640 | } else if non_ascii < 0xF0 { |
1641 | let point = ((u16::from(non_ascii) & 0xF) << 12) |
1642 | | ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6) |
1643 | | (u16::from(self.slice[self.pos + 2]) & 0x3F); |
1644 | self.pos += 3; |
1645 | NonAscii::BmpExclAscii(point) |
1646 | } else { |
1647 | let point = ((u32::from(non_ascii) & 0x7) << 18) |
1648 | | ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12) |
1649 | | ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6) |
1650 | | (u32::from(self.slice[self.pos + 3]) & 0x3F); |
1651 | self.pos += 4; |
1652 | NonAscii::Astral(unsafe { ::core::char::from_u32_unchecked(point) }) |
1653 | } |
1654 | } else { |
1655 | return CopyAsciiResult::Stop(( |
1656 | EncoderResult::OutputFull, |
1657 | self.pos, |
1658 | dest.pos, |
1659 | )); |
1660 | } |
1661 | } |
1662 | } |
1663 | }; |
1664 | CopyAsciiResult::GoOn((non_ascii_ret, ByteTwoHandle::new(dest))) |
1665 | } |
1666 | #[inline (always)] |
1667 | pub fn copy_ascii_to_check_space_four<'b>( |
1668 | &mut self, |
1669 | dest: &'b mut ByteDestination<'a>, |
1670 | ) -> CopyAsciiResult<(EncoderResult, usize, usize), (NonAscii, ByteFourHandle<'b, 'a>)> { |
1671 | let non_ascii_ret = { |
1672 | let dst_len = dest.slice.len(); |
1673 | let src_remaining = &self.slice[self.pos..]; |
1674 | let dst_remaining = &mut dest.slice[dest.pos..]; |
1675 | let (pending, length) = if dst_remaining.len() < src_remaining.len() { |
1676 | (EncoderResult::OutputFull, dst_remaining.len()) |
1677 | } else { |
1678 | (EncoderResult::InputEmpty, src_remaining.len()) |
1679 | }; |
1680 | match unsafe { |
1681 | ascii_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length) |
1682 | } { |
1683 | None => { |
1684 | self.pos += length; |
1685 | dest.pos += length; |
1686 | return CopyAsciiResult::Stop((pending, self.pos, dest.pos)); |
1687 | } |
1688 | Some((non_ascii, consumed)) => { |
1689 | self.pos += consumed; |
1690 | dest.pos += consumed; |
1691 | if dest.pos + 3 < dst_len { |
1692 | if non_ascii < 0xE0 { |
1693 | let point = ((u16::from(non_ascii) & 0x1F) << 6) |
1694 | | (u16::from(self.slice[self.pos + 1]) & 0x3F); |
1695 | self.pos += 2; |
1696 | NonAscii::BmpExclAscii(point) |
1697 | } else if non_ascii < 0xF0 { |
1698 | let point = ((u16::from(non_ascii) & 0xF) << 12) |
1699 | | ((u16::from(self.slice[self.pos + 1]) & 0x3F) << 6) |
1700 | | (u16::from(self.slice[self.pos + 2]) & 0x3F); |
1701 | self.pos += 3; |
1702 | NonAscii::BmpExclAscii(point) |
1703 | } else { |
1704 | let point = ((u32::from(non_ascii) & 0x7) << 18) |
1705 | | ((u32::from(self.slice[self.pos + 1]) & 0x3F) << 12) |
1706 | | ((u32::from(self.slice[self.pos + 2]) & 0x3F) << 6) |
1707 | | (u32::from(self.slice[self.pos + 3]) & 0x3F); |
1708 | self.pos += 4; |
1709 | NonAscii::Astral(unsafe { ::core::char::from_u32_unchecked(point) }) |
1710 | } |
1711 | } else { |
1712 | return CopyAsciiResult::Stop(( |
1713 | EncoderResult::OutputFull, |
1714 | self.pos, |
1715 | dest.pos, |
1716 | )); |
1717 | } |
1718 | } |
1719 | } |
1720 | }; |
1721 | CopyAsciiResult::GoOn((non_ascii_ret, ByteFourHandle::new(dest))) |
1722 | } |
1723 | } |
1724 | |
1725 | pub struct Utf8ReadHandle<'a, 'b> |
1726 | where |
1727 | 'b: 'a, |
1728 | { |
1729 | source: &'a mut Utf8Source<'b>, |
1730 | } |
1731 | |
1732 | impl<'a, 'b> Utf8ReadHandle<'a, 'b> |
1733 | where |
1734 | 'b: 'a, |
1735 | { |
1736 | #[inline (always)] |
1737 | fn new(src: &'a mut Utf8Source<'b>) -> Utf8ReadHandle<'a, 'b> { |
1738 | Utf8ReadHandle { source: src } |
1739 | } |
1740 | #[inline (always)] |
1741 | pub fn read(self) -> (char, Utf8UnreadHandle<'a, 'b>) { |
1742 | let character: char = self.source.read(); |
1743 | let handle: Utf8UnreadHandle<'_, '_> = Utf8UnreadHandle::new(self.source); |
1744 | (character, handle) |
1745 | } |
1746 | #[inline (always)] |
1747 | pub fn read_enum(self) -> (Unicode, Utf8UnreadHandle<'a, 'b>) { |
1748 | let character: Unicode = self.source.read_enum(); |
1749 | let handle: Utf8UnreadHandle<'_, '_> = Utf8UnreadHandle::new(self.source); |
1750 | (character, handle) |
1751 | } |
1752 | #[inline (always)] |
1753 | pub fn consumed(&self) -> usize { |
1754 | self.source.consumed() |
1755 | } |
1756 | } |
1757 | |
1758 | pub struct Utf8UnreadHandle<'a, 'b> |
1759 | where |
1760 | 'b: 'a, |
1761 | { |
1762 | source: &'a mut Utf8Source<'b>, |
1763 | } |
1764 | |
1765 | impl<'a, 'b> Utf8UnreadHandle<'a, 'b> |
1766 | where |
1767 | 'b: 'a, |
1768 | { |
1769 | #[inline (always)] |
1770 | fn new(src: &'a mut Utf8Source<'b>) -> Utf8UnreadHandle<'a, 'b> { |
1771 | Utf8UnreadHandle { source: src } |
1772 | } |
1773 | #[inline (always)] |
1774 | pub fn unread(self) -> usize { |
1775 | self.source.unread() |
1776 | } |
1777 | #[inline (always)] |
1778 | pub fn consumed(&self) -> usize { |
1779 | self.source.consumed() |
1780 | } |
1781 | #[inline (always)] |
1782 | pub fn commit(self) -> &'a mut Utf8Source<'b> { |
1783 | self.source |
1784 | } |
1785 | } |
1786 | |
1787 | // Byte destination |
1788 | |
1789 | pub struct ByteOneHandle<'a, 'b> |
1790 | where |
1791 | 'b: 'a, |
1792 | { |
1793 | dest: &'a mut ByteDestination<'b>, |
1794 | } |
1795 | |
1796 | impl<'a, 'b> ByteOneHandle<'a, 'b> |
1797 | where |
1798 | 'b: 'a, |
1799 | { |
1800 | #[inline (always)] |
1801 | fn new(dst: &'a mut ByteDestination<'b>) -> ByteOneHandle<'a, 'b> { |
1802 | ByteOneHandle { dest: dst } |
1803 | } |
1804 | #[inline (always)] |
1805 | pub fn written(&self) -> usize { |
1806 | self.dest.written() |
1807 | } |
1808 | #[inline (always)] |
1809 | pub fn write_one(self, first: u8) -> &'a mut ByteDestination<'b> { |
1810 | self.dest.write_one(first); |
1811 | self.dest |
1812 | } |
1813 | } |
1814 | |
1815 | pub struct ByteTwoHandle<'a, 'b> |
1816 | where |
1817 | 'b: 'a, |
1818 | { |
1819 | dest: &'a mut ByteDestination<'b>, |
1820 | } |
1821 | |
1822 | impl<'a, 'b> ByteTwoHandle<'a, 'b> |
1823 | where |
1824 | 'b: 'a, |
1825 | { |
1826 | #[inline (always)] |
1827 | fn new(dst: &'a mut ByteDestination<'b>) -> ByteTwoHandle<'a, 'b> { |
1828 | ByteTwoHandle { dest: dst } |
1829 | } |
1830 | #[inline (always)] |
1831 | pub fn written(&self) -> usize { |
1832 | self.dest.written() |
1833 | } |
1834 | #[inline (always)] |
1835 | pub fn write_one(self, first: u8) -> &'a mut ByteDestination<'b> { |
1836 | self.dest.write_one(first); |
1837 | self.dest |
1838 | } |
1839 | #[inline (always)] |
1840 | pub fn write_two(self, first: u8, second: u8) -> &'a mut ByteDestination<'b> { |
1841 | self.dest.write_two(first, second); |
1842 | self.dest |
1843 | } |
1844 | } |
1845 | |
1846 | pub struct ByteThreeHandle<'a, 'b> |
1847 | where |
1848 | 'b: 'a, |
1849 | { |
1850 | dest: &'a mut ByteDestination<'b>, |
1851 | } |
1852 | |
1853 | impl<'a, 'b> ByteThreeHandle<'a, 'b> |
1854 | where |
1855 | 'b: 'a, |
1856 | { |
1857 | #[inline (always)] |
1858 | fn new(dst: &'a mut ByteDestination<'b>) -> ByteThreeHandle<'a, 'b> { |
1859 | ByteThreeHandle { dest: dst } |
1860 | } |
1861 | #[inline (always)] |
1862 | pub fn written(&self) -> usize { |
1863 | self.dest.written() |
1864 | } |
1865 | #[inline (always)] |
1866 | pub fn write_one(self, first: u8) -> &'a mut ByteDestination<'b> { |
1867 | self.dest.write_one(first); |
1868 | self.dest |
1869 | } |
1870 | #[inline (always)] |
1871 | pub fn write_two(self, first: u8, second: u8) -> &'a mut ByteDestination<'b> { |
1872 | self.dest.write_two(first, second); |
1873 | self.dest |
1874 | } |
1875 | #[inline (always)] |
1876 | pub fn write_three(self, first: u8, second: u8, third: u8) -> &'a mut ByteDestination<'b> { |
1877 | self.dest.write_three(first, second, third); |
1878 | self.dest |
1879 | } |
1880 | #[inline (always)] |
1881 | pub fn write_three_return_written(self, first: u8, second: u8, third: u8) -> usize { |
1882 | self.dest.write_three(first, second, third); |
1883 | self.dest.written() |
1884 | } |
1885 | } |
1886 | |
1887 | pub struct ByteFourHandle<'a, 'b> |
1888 | where |
1889 | 'b: 'a, |
1890 | { |
1891 | dest: &'a mut ByteDestination<'b>, |
1892 | } |
1893 | |
1894 | impl<'a, 'b> ByteFourHandle<'a, 'b> |
1895 | where |
1896 | 'b: 'a, |
1897 | { |
1898 | #[inline (always)] |
1899 | fn new(dst: &'a mut ByteDestination<'b>) -> ByteFourHandle<'a, 'b> { |
1900 | ByteFourHandle { dest: dst } |
1901 | } |
1902 | #[inline (always)] |
1903 | pub fn written(&self) -> usize { |
1904 | self.dest.written() |
1905 | } |
1906 | #[inline (always)] |
1907 | pub fn write_one(self, first: u8) -> &'a mut ByteDestination<'b> { |
1908 | self.dest.write_one(first); |
1909 | self.dest |
1910 | } |
1911 | #[inline (always)] |
1912 | pub fn write_two(self, first: u8, second: u8) -> &'a mut ByteDestination<'b> { |
1913 | self.dest.write_two(first, second); |
1914 | self.dest |
1915 | } |
1916 | #[inline (always)] |
1917 | pub fn write_four( |
1918 | self, |
1919 | first: u8, |
1920 | second: u8, |
1921 | third: u8, |
1922 | fourth: u8, |
1923 | ) -> &'a mut ByteDestination<'b> { |
1924 | self.dest.write_four(first, second, third, fourth); |
1925 | self.dest |
1926 | } |
1927 | } |
1928 | |
/// Write cursor over a mutable byte slice.
///
/// Bytes are appended at `pos`, which advances with every write.
pub struct ByteDestination<'a> {
    /// The output buffer being filled.
    slice: &'a mut [u8],
    /// Index of the next byte to write; equal to the number of bytes
    /// written so far.
    pos: usize,
}
1933 | |
1934 | impl<'a> ByteDestination<'a> { |
1935 | #[inline (always)] |
1936 | pub fn new(dst: &mut [u8]) -> ByteDestination { |
1937 | ByteDestination { slice: dst, pos: 0 } |
1938 | } |
1939 | #[inline (always)] |
1940 | pub fn check_space_one<'b>(&'b mut self) -> Space<ByteOneHandle<'b, 'a>> { |
1941 | if self.pos < self.slice.len() { |
1942 | Space::Available(ByteOneHandle::new(self)) |
1943 | } else { |
1944 | Space::Full(self.written()) |
1945 | } |
1946 | } |
1947 | #[inline (always)] |
1948 | pub fn check_space_two<'b>(&'b mut self) -> Space<ByteTwoHandle<'b, 'a>> { |
1949 | if self.pos + 1 < self.slice.len() { |
1950 | Space::Available(ByteTwoHandle::new(self)) |
1951 | } else { |
1952 | Space::Full(self.written()) |
1953 | } |
1954 | } |
1955 | #[inline (always)] |
1956 | pub fn check_space_three<'b>(&'b mut self) -> Space<ByteThreeHandle<'b, 'a>> { |
1957 | if self.pos + 2 < self.slice.len() { |
1958 | Space::Available(ByteThreeHandle::new(self)) |
1959 | } else { |
1960 | Space::Full(self.written()) |
1961 | } |
1962 | } |
1963 | #[inline (always)] |
1964 | pub fn check_space_four<'b>(&'b mut self) -> Space<ByteFourHandle<'b, 'a>> { |
1965 | if self.pos + 3 < self.slice.len() { |
1966 | Space::Available(ByteFourHandle::new(self)) |
1967 | } else { |
1968 | Space::Full(self.written()) |
1969 | } |
1970 | } |
1971 | #[inline (always)] |
1972 | pub fn written(&self) -> usize { |
1973 | self.pos |
1974 | } |
1975 | #[inline (always)] |
1976 | fn write_one(&mut self, first: u8) { |
1977 | self.slice[self.pos] = first; |
1978 | self.pos += 1; |
1979 | } |
1980 | #[inline (always)] |
1981 | fn write_two(&mut self, first: u8, second: u8) { |
1982 | self.slice[self.pos] = first; |
1983 | self.slice[self.pos + 1] = second; |
1984 | self.pos += 2; |
1985 | } |
1986 | #[inline (always)] |
1987 | fn write_three(&mut self, first: u8, second: u8, third: u8) { |
1988 | self.slice[self.pos] = first; |
1989 | self.slice[self.pos + 1] = second; |
1990 | self.slice[self.pos + 2] = third; |
1991 | self.pos += 3; |
1992 | } |
1993 | #[inline (always)] |
1994 | fn write_four(&mut self, first: u8, second: u8, third: u8, fourth: u8) { |
1995 | self.slice[self.pos] = first; |
1996 | self.slice[self.pos + 1] = second; |
1997 | self.slice[self.pos + 2] = third; |
1998 | self.slice[self.pos + 3] = fourth; |
1999 | self.pos += 4; |
2000 | } |
2001 | } |
2002 | |