1// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4// option. This file may not be copied, modified, or distributed
5// except according to those terms.
6
7//! Marker types for formats.
8//!
9//! This module defines the types and traits used to mark a `Tendril`
10//! with the format of data it contains. It includes those formats
11//! for which `Tendril` supports at least some operations without
12//! conversion.
13//!
14//! To convert a string tendril to/from a byte tendril in an arbitrary
15//! character encoding, see the `encode` and `decode` methods on
16//! `Tendril`.
17//!
18//! `Tendril` operations may become memory-unsafe if data invalid for
19//! the format sneaks in. For that reason, these traits require
20//! `unsafe impl`.
21
22use std::default::Default;
23use std::{char, mem, str};
24
25use futf::{self, Codepoint, Meaning};
26
/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::default::Default;
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    pub struct Fixup {
        /// How much to remove from the end of the left-hand buffer.
        pub drop_left: u32,
        /// How much to remove from the start of the right-hand buffer.
        pub drop_right: u32,
        /// How many leading bytes of `insert_bytes` are to be inserted.
        pub insert_len: u32,
        /// Storage for the bytes inserted between the two buffers.
        pub insert_bytes: [u8; 4],
    }

    impl Default for Fixup {
        /// The "do nothing" fixup: drop nothing and insert nothing.
        #[inline(always)]
        fn default() -> Fixup {
            Fixup {
                drop_left: 0,
                drop_right: 0,
                insert_len: 0,
                insert_bytes: [0; 4],
            }
        }
    }

    /// Iterator over `(byte index, char)` pairs of a buffer in which every
    /// byte stands for the Unicode scalar value with the same number.
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // Every `u8` is a valid scalar value (U+0000..=U+00FF), so the
            // safe conversion is lossless and needs no `unsafe` transmute.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }

        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            // One item per remaining byte; forward the exact bounds so
            // consumers such as `collect` can preallocate.
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Create an iterator over the bytes of `buf`.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
87
88/// Trait for format marker types.
89///
90/// The type implementing this trait is usually not instantiated.
91/// It's used with a phantom type parameter of `Tendril`.
92pub unsafe trait Format {
93 /// Check whether the buffer is valid for this format.
94 fn validate(buf: &[u8]) -> bool;
95
96 /// Check whether the buffer is valid for this format.
97 ///
98 /// You may assume the buffer is a prefix of a valid buffer.
99 #[inline]
100 fn validate_prefix(buf: &[u8]) -> bool {
101 <Self as Format>::validate(buf)
102 }
103
104 /// Check whether the buffer is valid for this format.
105 ///
106 /// You may assume the buffer is a suffix of a valid buffer.
107 #[inline]
108 fn validate_suffix(buf: &[u8]) -> bool {
109 <Self as Format>::validate(buf)
110 }
111
112 /// Check whether the buffer is valid for this format.
113 ///
114 /// You may assume the buffer is a contiguous subsequence
115 /// of a valid buffer, but not necessarily a prefix or
116 /// a suffix.
117 #[inline]
118 fn validate_subseq(buf: &[u8]) -> bool {
119 <Self as Format>::validate(buf)
120 }
121
122 /// Compute any fixup needed when concatenating buffers.
123 ///
124 /// The default is to do nothing.
125 ///
126 /// The function is `unsafe` because it may assume the input
127 /// buffers are already valid for the format. Also, no
128 /// bounds-checking is performed on the return value!
129 #[inline(always)]
130 unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
131 Default::default()
132 }
133}
134
135/// Indicates that one format is a subset of another.
136///
137/// The subset format can be converted to the superset format
138/// for free.
139pub unsafe trait SubsetOf<Super>: Format
140where
141 Super: Format,
142{
143 /// Validate the *other* direction of conversion; check if
144 /// this buffer from the superset format conforms to the
145 /// subset format.
146 ///
147 /// The default calls `Self::validate`, but some conversions
148 /// may implement a check which is cheaper than validating
149 /// from scratch.
150 fn revalidate_subset(x: &[u8]) -> bool {
151 Self::validate(buf:x)
152 }
153}
154
/// Indicates a format which corresponds to a Rust slice type,
/// representing exactly the same invariants.
///
/// In this file `Bytes` maps to `[u8]` and `UTF8` maps to `str`.
pub unsafe trait SliceFormat: Format + Sized {
    /// The (possibly unsized) Rust slice type with the same invariants
    /// as this format.
    type Slice: ?Sized + Slice;
}
160
161/// Indicates a format which contains characters from Unicode
162/// (all of it, or some proper subset).
163pub unsafe trait CharFormat<'a>: Format {
164 /// Iterator for characters and their byte indices.
165 type Iter: Iterator<Item = (usize, char)>;
166
167 /// Iterate over the characters of the string and their byte
168 /// indices.
169 ///
170 /// You may assume the buffer is *already validated* for `Format`.
171 unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;
172
173 /// Encode the character as bytes and pass them to a continuation.
174 ///
175 /// Returns `Err(())` iff the character cannot be represented.
176 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
177 where
178 F: FnOnce(&[u8]);
179}
180
/// Indicates a Rust slice type that is represented in memory as bytes.
pub unsafe trait Slice {
    /// Access the raw bytes of the slice.
    fn as_bytes(&self) -> &[u8];

    /// Convert a shared byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is *already validated*
    /// for `Format`; the caller is responsible for that.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Convert a mutable byte slice to a mutable slice of this kind.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is *already validated*
    /// for `Format`; the caller is responsible for that.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
198
199/// Marker type for uninterpreted bytes.
200///
201/// Validation will never fail for this format.
202#[derive(Copy, Clone, Default, Debug)]
203pub struct Bytes;
204
205unsafe impl Format for Bytes {
206 #[inline(always)]
207 fn validate(_: &[u8]) -> bool {
208 true
209 }
210}
211
212unsafe impl SliceFormat for Bytes {
213 type Slice = [u8];
214}
215
216unsafe impl Slice for [u8] {
217 #[inline(always)]
218 fn as_bytes(&self) -> &[u8] {
219 self
220 }
221
222 #[inline(always)]
223 unsafe fn from_bytes(x: &[u8]) -> &[u8] {
224 x
225 }
226
227 #[inline(always)]
228 unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
229 x
230 }
231}
232
233/// Marker type for ASCII text.
234#[derive(Copy, Clone, Default, Debug)]
235pub struct ASCII;
236
237unsafe impl Format for ASCII {
238 #[inline]
239 fn validate(buf: &[u8]) -> bool {
240 buf.iter().all(|&n: u8| n <= 127)
241 }
242
243 #[inline(always)]
244 fn validate_prefix(_: &[u8]) -> bool {
245 true
246 }
247
248 #[inline(always)]
249 fn validate_suffix(_: &[u8]) -> bool {
250 true
251 }
252
253 #[inline(always)]
254 fn validate_subseq(_: &[u8]) -> bool {
255 true
256 }
257}
258
259unsafe impl SubsetOf<UTF8> for ASCII {}
260unsafe impl SubsetOf<Latin1> for ASCII {}
261
262unsafe impl<'a> CharFormat<'a> for ASCII {
263 type Iter = imp::SingleByteCharIndices<'a>;
264
265 #[inline]
266 unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
267 imp::SingleByteCharIndices::new(buf)
268 }
269
270 #[inline]
271 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
272 where
273 F: FnOnce(&[u8]),
274 {
275 let n: u32 = ch as u32;
276 if n > 0x7F {
277 return Err(());
278 }
279 cont(&[n as u8]);
280 Ok(())
281 }
282}
283
284/// Marker type for UTF-8 text.
285#[derive(Copy, Clone, Default, Debug)]
286pub struct UTF8;
287
288unsafe impl Format for UTF8 {
289 #[inline]
290 fn validate(buf: &[u8]) -> bool {
291 str::from_utf8(buf).is_ok()
292 }
293
294 #[inline]
295 fn validate_prefix(buf: &[u8]) -> bool {
296 if buf.len() == 0 {
297 return true;
298 }
299 match futf::classify(buf, buf.len() - 1) {
300 Some(Codepoint {
301 meaning: Meaning::Whole(_),
302 ..
303 }) => true,
304 _ => false,
305 }
306 }
307
308 #[inline]
309 fn validate_suffix(buf: &[u8]) -> bool {
310 if buf.len() == 0 {
311 return true;
312 }
313 match futf::classify(buf, 0) {
314 Some(Codepoint {
315 meaning: Meaning::Whole(_),
316 ..
317 }) => true,
318 _ => false,
319 }
320 }
321
322 #[inline]
323 fn validate_subseq(buf: &[u8]) -> bool {
324 <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
325 }
326}
327
328unsafe impl SubsetOf<WTF8> for UTF8 {}
329
330unsafe impl SliceFormat for UTF8 {
331 type Slice = str;
332}
333
334unsafe impl Slice for str {
335 #[inline(always)]
336 fn as_bytes(&self) -> &[u8] {
337 str::as_bytes(self)
338 }
339
340 #[inline(always)]
341 unsafe fn from_bytes(x: &[u8]) -> &str {
342 str::from_utf8_unchecked(x)
343 }
344
345 #[inline(always)]
346 unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
347 mem::transmute(src:x)
348 }
349}
350
351unsafe impl<'a> CharFormat<'a> for UTF8 {
352 type Iter = str::CharIndices<'a>;
353
354 #[inline]
355 unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
356 str::from_utf8_unchecked(buf).char_indices()
357 }
358
359 #[inline]
360 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
361 where
362 F: FnOnce(&[u8]),
363 {
364 cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes());
365 Ok(())
366 }
367}
368
369/// Marker type for WTF-8 text.
370///
371/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
372#[derive(Copy, Clone, Default, Debug)]
373pub struct WTF8;
374
375#[inline]
376fn wtf8_meaningful(m: Meaning) -> bool {
377 match m {
378 Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true,
379 _ => false,
380 }
381}
382
383unsafe impl Format for WTF8 {
384 #[inline]
385 fn validate(buf: &[u8]) -> bool {
386 let mut i = 0;
387 let mut prev_lead = false;
388 while i < buf.len() {
389 let codept = unwrap_or_return!(futf::classify(buf, i), false);
390 if !wtf8_meaningful(codept.meaning) {
391 return false;
392 }
393 i += codept.bytes.len();
394 prev_lead = match codept.meaning {
395 Meaning::TrailSurrogate(_) if prev_lead => return false,
396 Meaning::LeadSurrogate(_) => true,
397 _ => false,
398 };
399 }
400
401 true
402 }
403
404 #[inline]
405 fn validate_prefix(buf: &[u8]) -> bool {
406 if buf.len() == 0 {
407 return true;
408 }
409 match futf::classify(buf, buf.len() - 1) {
410 Some(c) => wtf8_meaningful(c.meaning),
411 _ => false,
412 }
413 }
414
415 #[inline]
416 fn validate_suffix(buf: &[u8]) -> bool {
417 if buf.len() == 0 {
418 return true;
419 }
420 match futf::classify(buf, 0) {
421 Some(c) => wtf8_meaningful(c.meaning),
422 _ => false,
423 }
424 }
425
426 #[inline]
427 fn validate_subseq(buf: &[u8]) -> bool {
428 <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
429 }
430
431 #[inline]
432 unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
433 const ERR: &'static str = "WTF8: internal error";
434
435 if lhs.len() >= 3 && rhs.len() >= 3 {
436 if let (
437 Some(Codepoint {
438 meaning: Meaning::LeadSurrogate(hi),
439 ..
440 }),
441 Some(Codepoint {
442 meaning: Meaning::TrailSurrogate(lo),
443 ..
444 }),
445 ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0))
446 {
447 let mut fixup = imp::Fixup {
448 drop_left: 3,
449 drop_right: 3,
450 insert_len: 0,
451 insert_bytes: [0_u8; 4],
452 };
453
454 let n = 0x10000 + ((hi as u32) << 10) + (lo as u32);
455
456 let ch = char::from_u32(n).expect(ERR);
457 fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32;
458
459 return fixup;
460 }
461 }
462
463 Default::default()
464 }
465}
466
467/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
468///
469/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
470/// C0 and C1 control characters from ECMA-48 / ISO 6429.
471///
472/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
473/// many other aliases), which actually stand for Windows-1252.
474#[derive(Copy, Clone, Default, Debug)]
475pub struct Latin1;
476
477unsafe impl Format for Latin1 {
478 #[inline(always)]
479 fn validate(_: &[u8]) -> bool {
480 true
481 }
482
483 #[inline(always)]
484 fn validate_prefix(_: &[u8]) -> bool {
485 true
486 }
487
488 #[inline(always)]
489 fn validate_suffix(_: &[u8]) -> bool {
490 true
491 }
492
493 #[inline(always)]
494 fn validate_subseq(_: &[u8]) -> bool {
495 true
496 }
497}
498
499unsafe impl<'a> CharFormat<'a> for Latin1 {
500 type Iter = imp::SingleByteCharIndices<'a>;
501
502 #[inline]
503 unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
504 imp::SingleByteCharIndices::new(buf)
505 }
506
507 #[inline]
508 fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
509 where
510 F: FnOnce(&[u8]),
511 {
512 let n: u32 = ch as u32;
513 if n > 0xFF {
514 return Err(());
515 }
516 cont(&[n as u8]);
517 Ok(())
518 }
519}
520