1 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
2 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
3 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
4 | // option. This file may not be copied, modified, or distributed |
5 | // except according to those terms. |
6 | |
7 | //! Marker types for formats. |
8 | //! |
9 | //! This module defines the types and traits used to mark a `Tendril` |
10 | //! with the format of data it contains. It includes those formats |
11 | //! for which `Tendril` supports at least some operations without |
12 | //! conversion. |
13 | //! |
14 | //! To convert a string tendril to/from a byte tendril in an arbitrary |
15 | //! character encoding, see the `encode` and `decode` methods on |
16 | //! `Tendril`. |
17 | //! |
18 | //! `Tendril` operations may become memory-unsafe if data invalid for |
19 | //! the format sneaks in. For that reason, these traits require |
20 | //! `unsafe impl`. |
21 | |
22 | use std::default::Default; |
23 | use std::{char, mem, str}; |
24 | |
25 | use futf::{self, Codepoint, Meaning}; |
26 | |
/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::default::Default;
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    pub struct Fixup {
        /// Amount to drop from the end of the left-hand buffer.
        pub drop_left: u32,
        /// Amount to drop from the start of the right-hand buffer.
        pub drop_right: u32,
        /// Number of leading bytes of `insert_bytes` that are meaningful.
        pub insert_len: u32,
        /// Storage for the bytes to insert between the two buffers.
        pub insert_bytes: [u8; 4],
    }

    impl Default for Fixup {
        /// The "do nothing" fixup: drop nothing, insert nothing.
        #[inline(always)]
        fn default() -> Fixup {
            Fixup {
                drop_left: 0,
                drop_right: 0,
                insert_len: 0,
                insert_bytes: [0; 4],
            }
        }
    }

    /// Iterator over `(byte index, char)` pairs of a buffer in which
    /// every byte stands for the Unicode code point of the same value.
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // Every `u8` maps to a valid `char` (U+0000 through U+00FF), so the
            // safe `From<u8>` conversion replaces the former `transmute`.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }

        /// Forwards the exact remaining length of the underlying slice
        /// iterator so that consumers such as `collect` can preallocate.
        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Create an iterator over the bytes of `buf`, starting at index 0.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
87 | |
88 | /// Trait for format marker types. |
89 | /// |
90 | /// The type implementing this trait is usually not instantiated. |
91 | /// It's used with a phantom type parameter of `Tendril`. |
92 | pub unsafe trait Format { |
93 | /// Check whether the buffer is valid for this format. |
94 | fn validate(buf: &[u8]) -> bool; |
95 | |
96 | /// Check whether the buffer is valid for this format. |
97 | /// |
98 | /// You may assume the buffer is a prefix of a valid buffer. |
99 | #[inline ] |
100 | fn validate_prefix(buf: &[u8]) -> bool { |
101 | <Self as Format>::validate(buf) |
102 | } |
103 | |
104 | /// Check whether the buffer is valid for this format. |
105 | /// |
106 | /// You may assume the buffer is a suffix of a valid buffer. |
107 | #[inline ] |
108 | fn validate_suffix(buf: &[u8]) -> bool { |
109 | <Self as Format>::validate(buf) |
110 | } |
111 | |
112 | /// Check whether the buffer is valid for this format. |
113 | /// |
114 | /// You may assume the buffer is a contiguous subsequence |
115 | /// of a valid buffer, but not necessarily a prefix or |
116 | /// a suffix. |
117 | #[inline ] |
118 | fn validate_subseq(buf: &[u8]) -> bool { |
119 | <Self as Format>::validate(buf) |
120 | } |
121 | |
122 | /// Compute any fixup needed when concatenating buffers. |
123 | /// |
124 | /// The default is to do nothing. |
125 | /// |
126 | /// The function is `unsafe` because it may assume the input |
127 | /// buffers are already valid for the format. Also, no |
128 | /// bounds-checking is performed on the return value! |
129 | #[inline (always)] |
130 | unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { |
131 | Default::default() |
132 | } |
133 | } |
134 | |
135 | /// Indicates that one format is a subset of another. |
136 | /// |
137 | /// The subset format can be converted to the superset format |
138 | /// for free. |
139 | pub unsafe trait SubsetOf<Super>: Format |
140 | where |
141 | Super: Format, |
142 | { |
143 | /// Validate the *other* direction of conversion; check if |
144 | /// this buffer from the superset format conforms to the |
145 | /// subset format. |
146 | /// |
147 | /// The default calls `Self::validate`, but some conversions |
148 | /// may implement a check which is cheaper than validating |
149 | /// from scratch. |
150 | fn revalidate_subset(x: &[u8]) -> bool { |
151 | Self::validate(buf:x) |
152 | } |
153 | } |
154 | |
/// Indicates a format which corresponds to a Rust slice type,
/// representing exactly the same invariants.
///
/// In this file, `Bytes` is paired with `[u8]` and `UTF8` with `str`.
pub unsafe trait SliceFormat: Format + Sized {
    /// The Rust slice type whose invariants match this format.
    /// It may be unsized, and must be viewable as bytes via `Slice`.
    type Slice: ?Sized + Slice;
}
160 | |
161 | /// Indicates a format which contains characters from Unicode |
162 | /// (all of it, or some proper subset). |
163 | pub unsafe trait CharFormat<'a>: Format { |
164 | /// Iterator for characters and their byte indices. |
165 | type Iter: Iterator<Item = (usize, char)>; |
166 | |
167 | /// Iterate over the characters of the string and their byte |
168 | /// indices. |
169 | /// |
170 | /// You may assume the buffer is *already validated* for `Format`. |
171 | unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; |
172 | |
173 | /// Encode the character as bytes and pass them to a continuation. |
174 | /// |
175 | /// Returns `Err(())` iff the character cannot be represented. |
176 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> |
177 | where |
178 | F: FnOnce(&[u8]); |
179 | } |
180 | |
/// Indicates a Rust slice type that is represented in memory as bytes.
///
/// Implemented in this file for `[u8]` and `str`.
pub unsafe trait Slice {
    /// Access the raw bytes of the slice.
    fn as_bytes(&self) -> &[u8];

    /// Convert a byte slice to this kind of slice.
    ///
    /// You may assume the buffer is *already validated*
    /// for `Format`.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that `x` is valid data for this
    /// slice type; implementations are allowed to skip validation.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Convert a byte slice to this kind of slice.
    ///
    /// You may assume the buffer is *already validated*
    /// for `Format`.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that `x` is valid data for this
    /// slice type; implementations are allowed to skip validation.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
198 | |
199 | /// Marker type for uninterpreted bytes. |
200 | /// |
201 | /// Validation will never fail for this format. |
202 | #[derive (Copy, Clone, Default, Debug)] |
203 | pub struct Bytes; |
204 | |
205 | unsafe impl Format for Bytes { |
206 | #[inline (always)] |
207 | fn validate(_: &[u8]) -> bool { |
208 | true |
209 | } |
210 | } |
211 | |
/// Uninterpreted bytes are exposed as a plain `[u8]` slice.
unsafe impl SliceFormat for Bytes {
    type Slice = [u8];
}
215 | |
216 | unsafe impl Slice for [u8] { |
217 | #[inline (always)] |
218 | fn as_bytes(&self) -> &[u8] { |
219 | self |
220 | } |
221 | |
222 | #[inline (always)] |
223 | unsafe fn from_bytes(x: &[u8]) -> &[u8] { |
224 | x |
225 | } |
226 | |
227 | #[inline (always)] |
228 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { |
229 | x |
230 | } |
231 | } |
232 | |
233 | /// Marker type for ASCII text. |
234 | #[derive (Copy, Clone, Default, Debug)] |
235 | pub struct ASCII; |
236 | |
237 | unsafe impl Format for ASCII { |
238 | #[inline ] |
239 | fn validate(buf: &[u8]) -> bool { |
240 | buf.iter().all(|&n: u8| n <= 127) |
241 | } |
242 | |
243 | #[inline (always)] |
244 | fn validate_prefix(_: &[u8]) -> bool { |
245 | true |
246 | } |
247 | |
248 | #[inline (always)] |
249 | fn validate_suffix(_: &[u8]) -> bool { |
250 | true |
251 | } |
252 | |
253 | #[inline (always)] |
254 | fn validate_subseq(_: &[u8]) -> bool { |
255 | true |
256 | } |
257 | } |
258 | |
// ASCII is declared a subset of both UTF-8 and Latin-1. Neither impl
// overrides `revalidate_subset`, so converting back from the superset
// re-runs `ASCII::validate`.
unsafe impl SubsetOf<UTF8> for ASCII {}
unsafe impl SubsetOf<Latin1> for ASCII {}
261 | |
262 | unsafe impl<'a> CharFormat<'a> for ASCII { |
263 | type Iter = imp::SingleByteCharIndices<'a>; |
264 | |
265 | #[inline ] |
266 | unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { |
267 | imp::SingleByteCharIndices::new(buf) |
268 | } |
269 | |
270 | #[inline ] |
271 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> |
272 | where |
273 | F: FnOnce(&[u8]), |
274 | { |
275 | let n: u32 = ch as u32; |
276 | if n > 0x7F { |
277 | return Err(()); |
278 | } |
279 | cont(&[n as u8]); |
280 | Ok(()) |
281 | } |
282 | } |
283 | |
284 | /// Marker type for UTF-8 text. |
285 | #[derive (Copy, Clone, Default, Debug)] |
286 | pub struct UTF8; |
287 | |
288 | unsafe impl Format for UTF8 { |
289 | #[inline ] |
290 | fn validate(buf: &[u8]) -> bool { |
291 | str::from_utf8(buf).is_ok() |
292 | } |
293 | |
294 | #[inline ] |
295 | fn validate_prefix(buf: &[u8]) -> bool { |
296 | if buf.len() == 0 { |
297 | return true; |
298 | } |
299 | match futf::classify(buf, buf.len() - 1) { |
300 | Some(Codepoint { |
301 | meaning: Meaning::Whole(_), |
302 | .. |
303 | }) => true, |
304 | _ => false, |
305 | } |
306 | } |
307 | |
308 | #[inline ] |
309 | fn validate_suffix(buf: &[u8]) -> bool { |
310 | if buf.len() == 0 { |
311 | return true; |
312 | } |
313 | match futf::classify(buf, 0) { |
314 | Some(Codepoint { |
315 | meaning: Meaning::Whole(_), |
316 | .. |
317 | }) => true, |
318 | _ => false, |
319 | } |
320 | } |
321 | |
322 | #[inline ] |
323 | fn validate_subseq(buf: &[u8]) -> bool { |
324 | <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) |
325 | } |
326 | } |
327 | |
// UTF-8 is declared a subset of WTF-8. `revalidate_subset` is not
// overridden, so converting back from WTF-8 re-runs `UTF8::validate`.
unsafe impl SubsetOf<WTF8> for UTF8 {}

/// UTF-8 data is exposed as a Rust `str`.
unsafe impl SliceFormat for UTF8 {
    type Slice = str;
}
333 | |
334 | unsafe impl Slice for str { |
335 | #[inline (always)] |
336 | fn as_bytes(&self) -> &[u8] { |
337 | str::as_bytes(self) |
338 | } |
339 | |
340 | #[inline (always)] |
341 | unsafe fn from_bytes(x: &[u8]) -> &str { |
342 | str::from_utf8_unchecked(x) |
343 | } |
344 | |
345 | #[inline (always)] |
346 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { |
347 | mem::transmute(src:x) |
348 | } |
349 | } |
350 | |
351 | unsafe impl<'a> CharFormat<'a> for UTF8 { |
352 | type Iter = str::CharIndices<'a>; |
353 | |
354 | #[inline ] |
355 | unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { |
356 | str::from_utf8_unchecked(buf).char_indices() |
357 | } |
358 | |
359 | #[inline ] |
360 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> |
361 | where |
362 | F: FnOnce(&[u8]), |
363 | { |
364 | cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); |
365 | Ok(()) |
366 | } |
367 | } |
368 | |
369 | /// Marker type for WTF-8 text. |
370 | /// |
371 | /// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/). |
372 | #[derive (Copy, Clone, Default, Debug)] |
373 | pub struct WTF8; |
374 | |
375 | #[inline ] |
376 | fn wtf8_meaningful(m: Meaning) -> bool { |
377 | match m { |
378 | Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, |
379 | _ => false, |
380 | } |
381 | } |
382 | |
383 | unsafe impl Format for WTF8 { |
384 | #[inline ] |
385 | fn validate(buf: &[u8]) -> bool { |
386 | let mut i = 0; |
387 | let mut prev_lead = false; |
388 | while i < buf.len() { |
389 | let codept = unwrap_or_return!(futf::classify(buf, i), false); |
390 | if !wtf8_meaningful(codept.meaning) { |
391 | return false; |
392 | } |
393 | i += codept.bytes.len(); |
394 | prev_lead = match codept.meaning { |
395 | Meaning::TrailSurrogate(_) if prev_lead => return false, |
396 | Meaning::LeadSurrogate(_) => true, |
397 | _ => false, |
398 | }; |
399 | } |
400 | |
401 | true |
402 | } |
403 | |
404 | #[inline ] |
405 | fn validate_prefix(buf: &[u8]) -> bool { |
406 | if buf.len() == 0 { |
407 | return true; |
408 | } |
409 | match futf::classify(buf, buf.len() - 1) { |
410 | Some(c) => wtf8_meaningful(c.meaning), |
411 | _ => false, |
412 | } |
413 | } |
414 | |
415 | #[inline ] |
416 | fn validate_suffix(buf: &[u8]) -> bool { |
417 | if buf.len() == 0 { |
418 | return true; |
419 | } |
420 | match futf::classify(buf, 0) { |
421 | Some(c) => wtf8_meaningful(c.meaning), |
422 | _ => false, |
423 | } |
424 | } |
425 | |
426 | #[inline ] |
427 | fn validate_subseq(buf: &[u8]) -> bool { |
428 | <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) |
429 | } |
430 | |
431 | #[inline ] |
432 | unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { |
433 | const ERR: &'static str = "WTF8: internal error" ; |
434 | |
435 | if lhs.len() >= 3 && rhs.len() >= 3 { |
436 | if let ( |
437 | Some(Codepoint { |
438 | meaning: Meaning::LeadSurrogate(hi), |
439 | .. |
440 | }), |
441 | Some(Codepoint { |
442 | meaning: Meaning::TrailSurrogate(lo), |
443 | .. |
444 | }), |
445 | ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) |
446 | { |
447 | let mut fixup = imp::Fixup { |
448 | drop_left: 3, |
449 | drop_right: 3, |
450 | insert_len: 0, |
451 | insert_bytes: [0_u8; 4], |
452 | }; |
453 | |
454 | let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); |
455 | |
456 | let ch = char::from_u32(n).expect(ERR); |
457 | fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; |
458 | |
459 | return fixup; |
460 | } |
461 | } |
462 | |
463 | Default::default() |
464 | } |
465 | } |
466 | |
467 | /// Marker type for the single-byte encoding of the first 256 Unicode codepoints. |
468 | /// |
469 | /// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the |
470 | /// C0 and C1 control characters from ECMA-48 / ISO 6429. |
471 | /// |
472 | /// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the |
473 | /// many other aliases), which actually stand for Windows-1252. |
474 | #[derive (Copy, Clone, Default, Debug)] |
475 | pub struct Latin1; |
476 | |
477 | unsafe impl Format for Latin1 { |
478 | #[inline (always)] |
479 | fn validate(_: &[u8]) -> bool { |
480 | true |
481 | } |
482 | |
483 | #[inline (always)] |
484 | fn validate_prefix(_: &[u8]) -> bool { |
485 | true |
486 | } |
487 | |
488 | #[inline (always)] |
489 | fn validate_suffix(_: &[u8]) -> bool { |
490 | true |
491 | } |
492 | |
493 | #[inline (always)] |
494 | fn validate_subseq(_: &[u8]) -> bool { |
495 | true |
496 | } |
497 | } |
498 | |
499 | unsafe impl<'a> CharFormat<'a> for Latin1 { |
500 | type Iter = imp::SingleByteCharIndices<'a>; |
501 | |
502 | #[inline ] |
503 | unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { |
504 | imp::SingleByteCharIndices::new(buf) |
505 | } |
506 | |
507 | #[inline ] |
508 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> |
509 | where |
510 | F: FnOnce(&[u8]), |
511 | { |
512 | let n: u32 = ch as u32; |
513 | if n > 0xFF { |
514 | return Err(()); |
515 | } |
516 | cont(&[n as u8]); |
517 | Ok(()) |
518 | } |
519 | } |
520 | |