| 1 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 2 | // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 3 | // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your |
| 4 | // option. This file may not be copied, modified, or distributed |
| 5 | // except according to those terms. |
| 6 | |
| 7 | //! Marker types for formats. |
| 8 | //! |
| 9 | //! This module defines the types and traits used to mark a `Tendril` |
| 10 | //! with the format of data it contains. It includes those formats |
| 11 | //! for which `Tendril` supports at least some operations without |
| 12 | //! conversion. |
| 13 | //! |
| 14 | //! To convert a string tendril to/from a byte tendril in an arbitrary |
| 15 | //! character encoding, see the `encode` and `decode` methods on |
| 16 | //! `Tendril`. |
| 17 | //! |
| 18 | //! `Tendril` operations may become memory-unsafe if data invalid for |
| 19 | //! the format sneaks in. For that reason, these traits require |
| 20 | //! `unsafe impl`. |
| 21 | |
| 22 | use std::default::Default; |
| 23 | use std::{char, mem, str}; |
| 24 | |
| 25 | use futf::{self, Codepoint, Meaning}; |
| 26 | |
/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    ///
    /// The `Default` value is the "no fixup" value: every field is zero.
    #[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
    pub struct Fixup {
        /// How much to drop from the end of the left-hand side of the splice.
        pub drop_left: u32,
        /// How much to drop from the start of the right-hand side of the splice.
        pub drop_right: u32,
        /// How many leading bytes of `insert_bytes` to insert at the splice.
        pub insert_len: u32,
        /// Storage for the bytes to insert; only the first `insert_len` are used.
        pub insert_bytes: [u8; 4],
    }

    /// Iterator over the bytes of a single-byte encoding, yielding each
    /// byte's index together with the `char` whose code point equals the
    /// byte value (U+0000 through U+00FF).
    #[derive(Clone, Debug)]
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // `char::from(u8)` is the safe, lossless mapping of a byte onto
            // the code point with the same numeric value, so no `unsafe`
            // transmute is needed here.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }

        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            // One item per remaining byte; let consumers preallocate.
            self.inner.size_hint()
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Creates an iterator over `buf`, starting at index 0.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
| 87 | |
| 88 | /// Trait for format marker types. |
| 89 | /// |
| 90 | /// The type implementing this trait is usually not instantiated. |
| 91 | /// It's used with a phantom type parameter of `Tendril`. |
| 92 | pub unsafe trait Format { |
| 93 | /// Check whether the buffer is valid for this format. |
| 94 | fn validate(buf: &[u8]) -> bool; |
| 95 | |
| 96 | /// Check whether the buffer is valid for this format. |
| 97 | /// |
| 98 | /// You may assume the buffer is a prefix of a valid buffer. |
| 99 | #[inline ] |
| 100 | fn validate_prefix(buf: &[u8]) -> bool { |
| 101 | <Self as Format>::validate(buf) |
| 102 | } |
| 103 | |
| 104 | /// Check whether the buffer is valid for this format. |
| 105 | /// |
| 106 | /// You may assume the buffer is a suffix of a valid buffer. |
| 107 | #[inline ] |
| 108 | fn validate_suffix(buf: &[u8]) -> bool { |
| 109 | <Self as Format>::validate(buf) |
| 110 | } |
| 111 | |
| 112 | /// Check whether the buffer is valid for this format. |
| 113 | /// |
| 114 | /// You may assume the buffer is a contiguous subsequence |
| 115 | /// of a valid buffer, but not necessarily a prefix or |
| 116 | /// a suffix. |
| 117 | #[inline ] |
| 118 | fn validate_subseq(buf: &[u8]) -> bool { |
| 119 | <Self as Format>::validate(buf) |
| 120 | } |
| 121 | |
| 122 | /// Compute any fixup needed when concatenating buffers. |
| 123 | /// |
| 124 | /// The default is to do nothing. |
| 125 | /// |
| 126 | /// The function is `unsafe` because it may assume the input |
| 127 | /// buffers are already valid for the format. Also, no |
| 128 | /// bounds-checking is performed on the return value! |
| 129 | #[inline (always)] |
| 130 | unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup { |
| 131 | Default::default() |
| 132 | } |
| 133 | } |
| 134 | |
| 135 | /// Indicates that one format is a subset of another. |
| 136 | /// |
| 137 | /// The subset format can be converted to the superset format |
| 138 | /// for free. |
| 139 | pub unsafe trait SubsetOf<Super>: Format |
| 140 | where |
| 141 | Super: Format, |
| 142 | { |
| 143 | /// Validate the *other* direction of conversion; check if |
| 144 | /// this buffer from the superset format conforms to the |
| 145 | /// subset format. |
| 146 | /// |
| 147 | /// The default calls `Self::validate`, but some conversions |
| 148 | /// may implement a check which is cheaper than validating |
| 149 | /// from scratch. |
| 150 | fn revalidate_subset(x: &[u8]) -> bool { |
| 151 | Self::validate(buf:x) |
| 152 | } |
| 153 | } |
| 154 | |
| 155 | /// Indicates a format which corresponds to a Rust slice type, |
| 156 | /// representing exactly the same invariants. |
| 157 | pub unsafe trait SliceFormat: Format + Sized { |
| 158 | type Slice: ?Sized + Slice; |
| 159 | } |
| 160 | |
| 161 | /// Indicates a format which contains characters from Unicode |
| 162 | /// (all of it, or some proper subset). |
| 163 | pub unsafe trait CharFormat<'a>: Format { |
| 164 | /// Iterator for characters and their byte indices. |
| 165 | type Iter: Iterator<Item = (usize, char)>; |
| 166 | |
| 167 | /// Iterate over the characters of the string and their byte |
| 168 | /// indices. |
| 169 | /// |
| 170 | /// You may assume the buffer is *already validated* for `Format`. |
| 171 | unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter; |
| 172 | |
| 173 | /// Encode the character as bytes and pass them to a continuation. |
| 174 | /// |
| 175 | /// Returns `Err(())` iff the character cannot be represented. |
| 176 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> |
| 177 | where |
| 178 | F: FnOnce(&[u8]); |
| 179 | } |
| 180 | |
/// Marks a Rust slice type whose in-memory representation is plain bytes.
pub unsafe trait Slice {
    /// Returns the raw bytes backing this slice.
    fn as_bytes(&self) -> &[u8];

    /// Reinterprets a byte slice as this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`; the caller must guarantee that.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Reinterprets a mutable byte slice as this kind of slice.
    ///
    /// # Safety
    ///
    /// Implementations may assume the buffer is *already validated*
    /// for `Format`; the caller must guarantee that.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
| 198 | |
/// Marker type for uninterpreted bytes.
///
/// Every buffer is valid for this format, so validation never fails.
#[derive(Clone, Copy, Debug, Default)]
pub struct Bytes;
| 204 | |
| 205 | unsafe impl Format for Bytes { |
| 206 | #[inline (always)] |
| 207 | fn validate(_: &[u8]) -> bool { |
| 208 | true |
| 209 | } |
| 210 | } |
| 211 | |
| 212 | unsafe impl SliceFormat for Bytes { |
| 213 | type Slice = [u8]; |
| 214 | } |
| 215 | |
| 216 | unsafe impl Slice for [u8] { |
| 217 | #[inline (always)] |
| 218 | fn as_bytes(&self) -> &[u8] { |
| 219 | self |
| 220 | } |
| 221 | |
| 222 | #[inline (always)] |
| 223 | unsafe fn from_bytes(x: &[u8]) -> &[u8] { |
| 224 | x |
| 225 | } |
| 226 | |
| 227 | #[inline (always)] |
| 228 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] { |
| 229 | x |
| 230 | } |
| 231 | } |
| 232 | |
/// Marker type for ASCII text (every byte is at most 127).
#[derive(Clone, Copy, Debug, Default)]
pub struct ASCII;
| 236 | |
| 237 | unsafe impl Format for ASCII { |
| 238 | #[inline ] |
| 239 | fn validate(buf: &[u8]) -> bool { |
| 240 | buf.iter().all(|&n: u8| n <= 127) |
| 241 | } |
| 242 | |
| 243 | #[inline (always)] |
| 244 | fn validate_prefix(_: &[u8]) -> bool { |
| 245 | true |
| 246 | } |
| 247 | |
| 248 | #[inline (always)] |
| 249 | fn validate_suffix(_: &[u8]) -> bool { |
| 250 | true |
| 251 | } |
| 252 | |
| 253 | #[inline (always)] |
| 254 | fn validate_subseq(_: &[u8]) -> bool { |
| 255 | true |
| 256 | } |
| 257 | } |
| 258 | |
| 259 | unsafe impl SubsetOf<UTF8> for ASCII {} |
| 260 | unsafe impl SubsetOf<Latin1> for ASCII {} |
| 261 | |
| 262 | unsafe impl<'a> CharFormat<'a> for ASCII { |
| 263 | type Iter = imp::SingleByteCharIndices<'a>; |
| 264 | |
| 265 | #[inline ] |
| 266 | unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { |
| 267 | imp::SingleByteCharIndices::new(buf) |
| 268 | } |
| 269 | |
| 270 | #[inline ] |
| 271 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> |
| 272 | where |
| 273 | F: FnOnce(&[u8]), |
| 274 | { |
| 275 | let n: u32 = ch as u32; |
| 276 | if n > 0x7F { |
| 277 | return Err(()); |
| 278 | } |
| 279 | cont(&[n as u8]); |
| 280 | Ok(()) |
| 281 | } |
| 282 | } |
| 283 | |
/// Marker type for UTF-8 encoded text.
#[derive(Clone, Copy, Debug, Default)]
pub struct UTF8;
| 287 | |
| 288 | unsafe impl Format for UTF8 { |
| 289 | #[inline ] |
| 290 | fn validate(buf: &[u8]) -> bool { |
| 291 | str::from_utf8(buf).is_ok() |
| 292 | } |
| 293 | |
| 294 | #[inline ] |
| 295 | fn validate_prefix(buf: &[u8]) -> bool { |
| 296 | if buf.len() == 0 { |
| 297 | return true; |
| 298 | } |
| 299 | match futf::classify(buf, buf.len() - 1) { |
| 300 | Some(Codepoint { |
| 301 | meaning: Meaning::Whole(_), |
| 302 | .. |
| 303 | }) => true, |
| 304 | _ => false, |
| 305 | } |
| 306 | } |
| 307 | |
| 308 | #[inline ] |
| 309 | fn validate_suffix(buf: &[u8]) -> bool { |
| 310 | if buf.len() == 0 { |
| 311 | return true; |
| 312 | } |
| 313 | match futf::classify(buf, 0) { |
| 314 | Some(Codepoint { |
| 315 | meaning: Meaning::Whole(_), |
| 316 | .. |
| 317 | }) => true, |
| 318 | _ => false, |
| 319 | } |
| 320 | } |
| 321 | |
| 322 | #[inline ] |
| 323 | fn validate_subseq(buf: &[u8]) -> bool { |
| 324 | <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) |
| 325 | } |
| 326 | } |
| 327 | |
| 328 | unsafe impl SubsetOf<WTF8> for UTF8 {} |
| 329 | |
| 330 | unsafe impl SliceFormat for UTF8 { |
| 331 | type Slice = str; |
| 332 | } |
| 333 | |
| 334 | unsafe impl Slice for str { |
| 335 | #[inline (always)] |
| 336 | fn as_bytes(&self) -> &[u8] { |
| 337 | str::as_bytes(self) |
| 338 | } |
| 339 | |
| 340 | #[inline (always)] |
| 341 | unsafe fn from_bytes(x: &[u8]) -> &str { |
| 342 | str::from_utf8_unchecked(x) |
| 343 | } |
| 344 | |
| 345 | #[inline (always)] |
| 346 | unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str { |
| 347 | mem::transmute(src:x) |
| 348 | } |
| 349 | } |
| 350 | |
| 351 | unsafe impl<'a> CharFormat<'a> for UTF8 { |
| 352 | type Iter = str::CharIndices<'a>; |
| 353 | |
| 354 | #[inline ] |
| 355 | unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> { |
| 356 | str::from_utf8_unchecked(buf).char_indices() |
| 357 | } |
| 358 | |
| 359 | #[inline ] |
| 360 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> |
| 361 | where |
| 362 | F: FnOnce(&[u8]), |
| 363 | { |
| 364 | cont(ch.encode_utf8(&mut [0_u8; 4]).as_bytes()); |
| 365 | Ok(()) |
| 366 | } |
| 367 | } |
| 368 | |
/// Marker type for WTF-8 encoded text.
///
/// The format is described by the
/// [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
#[derive(Clone, Copy, Debug, Default)]
pub struct WTF8;
| 374 | |
| 375 | #[inline ] |
| 376 | fn wtf8_meaningful(m: Meaning) -> bool { |
| 377 | match m { |
| 378 | Meaning::Whole(_) | Meaning::LeadSurrogate(_) | Meaning::TrailSurrogate(_) => true, |
| 379 | _ => false, |
| 380 | } |
| 381 | } |
| 382 | |
| 383 | unsafe impl Format for WTF8 { |
| 384 | #[inline ] |
| 385 | fn validate(buf: &[u8]) -> bool { |
| 386 | let mut i = 0; |
| 387 | let mut prev_lead = false; |
| 388 | while i < buf.len() { |
| 389 | let codept = unwrap_or_return!(futf::classify(buf, i), false); |
| 390 | if !wtf8_meaningful(codept.meaning) { |
| 391 | return false; |
| 392 | } |
| 393 | i += codept.bytes.len(); |
| 394 | prev_lead = match codept.meaning { |
| 395 | Meaning::TrailSurrogate(_) if prev_lead => return false, |
| 396 | Meaning::LeadSurrogate(_) => true, |
| 397 | _ => false, |
| 398 | }; |
| 399 | } |
| 400 | |
| 401 | true |
| 402 | } |
| 403 | |
| 404 | #[inline ] |
| 405 | fn validate_prefix(buf: &[u8]) -> bool { |
| 406 | if buf.len() == 0 { |
| 407 | return true; |
| 408 | } |
| 409 | match futf::classify(buf, buf.len() - 1) { |
| 410 | Some(c) => wtf8_meaningful(c.meaning), |
| 411 | _ => false, |
| 412 | } |
| 413 | } |
| 414 | |
| 415 | #[inline ] |
| 416 | fn validate_suffix(buf: &[u8]) -> bool { |
| 417 | if buf.len() == 0 { |
| 418 | return true; |
| 419 | } |
| 420 | match futf::classify(buf, 0) { |
| 421 | Some(c) => wtf8_meaningful(c.meaning), |
| 422 | _ => false, |
| 423 | } |
| 424 | } |
| 425 | |
| 426 | #[inline ] |
| 427 | fn validate_subseq(buf: &[u8]) -> bool { |
| 428 | <Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf) |
| 429 | } |
| 430 | |
| 431 | #[inline ] |
| 432 | unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup { |
| 433 | const ERR: &'static str = "WTF8: internal error" ; |
| 434 | |
| 435 | if lhs.len() >= 3 && rhs.len() >= 3 { |
| 436 | if let ( |
| 437 | Some(Codepoint { |
| 438 | meaning: Meaning::LeadSurrogate(hi), |
| 439 | .. |
| 440 | }), |
| 441 | Some(Codepoint { |
| 442 | meaning: Meaning::TrailSurrogate(lo), |
| 443 | .. |
| 444 | }), |
| 445 | ) = (futf::classify(lhs, lhs.len() - 1), futf::classify(rhs, 0)) |
| 446 | { |
| 447 | let mut fixup = imp::Fixup { |
| 448 | drop_left: 3, |
| 449 | drop_right: 3, |
| 450 | insert_len: 0, |
| 451 | insert_bytes: [0_u8; 4], |
| 452 | }; |
| 453 | |
| 454 | let n = 0x10000 + ((hi as u32) << 10) + (lo as u32); |
| 455 | |
| 456 | let ch = char::from_u32(n).expect(ERR); |
| 457 | fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32; |
| 458 | |
| 459 | return fixup; |
| 460 | } |
| 461 | } |
| 462 | |
| 463 | Default::default() |
| 464 | } |
| 465 | } |
| 466 | |
/// Marker type for the single-byte encoding of the first 256 Unicode
/// codepoints.
///
/// This is what IANA calls "ISO-8859-1": ISO's "ISO 8859-1" plus the
/// C0 and C1 control characters from ECMA-48 / ISO 6429.
///
/// It is *not* WHATWG's "latin1" or "iso8859-1" labels (nor their many
/// aliases), which actually stand for Windows-1252.
#[derive(Clone, Copy, Debug, Default)]
pub struct Latin1;
| 476 | |
| 477 | unsafe impl Format for Latin1 { |
| 478 | #[inline (always)] |
| 479 | fn validate(_: &[u8]) -> bool { |
| 480 | true |
| 481 | } |
| 482 | |
| 483 | #[inline (always)] |
| 484 | fn validate_prefix(_: &[u8]) -> bool { |
| 485 | true |
| 486 | } |
| 487 | |
| 488 | #[inline (always)] |
| 489 | fn validate_suffix(_: &[u8]) -> bool { |
| 490 | true |
| 491 | } |
| 492 | |
| 493 | #[inline (always)] |
| 494 | fn validate_subseq(_: &[u8]) -> bool { |
| 495 | true |
| 496 | } |
| 497 | } |
| 498 | |
| 499 | unsafe impl<'a> CharFormat<'a> for Latin1 { |
| 500 | type Iter = imp::SingleByteCharIndices<'a>; |
| 501 | |
| 502 | #[inline ] |
| 503 | unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> { |
| 504 | imp::SingleByteCharIndices::new(buf) |
| 505 | } |
| 506 | |
| 507 | #[inline ] |
| 508 | fn encode_char<F>(ch: char, cont: F) -> Result<(), ()> |
| 509 | where |
| 510 | F: FnOnce(&[u8]), |
| 511 | { |
| 512 | let n: u32 = ch as u32; |
| 513 | if n > 0xFF { |
| 514 | return Err(()); |
| 515 | } |
| 516 | cont(&[n as u8]); |
| 517 | Ok(()) |
| 518 | } |
| 519 | } |
| 520 | |