fmt.rs source code [crates/tendril/src/fmt.rs]

1	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
2	// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
3	// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
4	// option. This file may not be copied, modified, or distributed
5	// except according to those terms.
6
7	//! Marker types for formats.
8	//!
9	//! This module defines the types and traits used to mark a `Tendril`
10	//! with the format of data it contains. It includes those formats
11	//! for which `Tendril` supports at least some operations without
12	//! conversion.
13	//!
14	//! To convert a string tendril to/from a byte tendril in an arbitrary
15	//! character encoding, see the `encode` and `decode` methods on
16	//! `Tendril`.
17	//!
18	//! `Tendril` operations may become memory-unsafe if data invalid for
19	//! the format sneaks in. For that reason, these traits require
20	//! `unsafe impl`.
21
22	use std::default::Default;
23	use std::{char, mem, str};
24
25	use futf::{self, Codepoint, Meaning};
26
/// Implementation details.
///
/// You don't need these unless you are implementing
/// a new format.
pub mod imp {
    use std::default::Default;
    use std::{iter, slice};

    /// Describes how to fix up encodings when concatenating.
    ///
    /// We can drop characters on either side of the splice,
    /// and insert up to 4 bytes in the middle.
    pub struct Fixup {
        /// How much to drop from the end of the left-hand buffer.
        pub drop_left: u32,
        /// How much to drop from the start of the right-hand buffer.
        pub drop_right: u32,
        /// How many leading bytes of `insert_bytes` are to be inserted.
        pub insert_len: u32,
        /// Storage for the bytes inserted between the two buffers.
        pub insert_bytes: [u8; 4],
    }

    impl Default for Fixup {
        /// The "do nothing" fixup: drop nothing, insert nothing.
        #[inline(always)]
        fn default() -> Fixup {
            Fixup {
                drop_left: 0,
                drop_right: 0,
                insert_len: 0,
                insert_bytes: [0; 4],
            }
        }
    }

    /// Iterator over `(byte index, char)` pairs of a buffer in which
    /// every byte stands for the code point with the same numeric value.
    pub struct SingleByteCharIndices<'a> {
        inner: iter::Enumerate<slice::Iter<'a, u8>>,
    }

    impl<'a> Iterator for SingleByteCharIndices<'a> {
        type Item = (usize, char);

        #[inline]
        fn next(&mut self) -> Option<(usize, char)> {
            // Every `u8` value is a valid Unicode scalar value
            // (U+0000..=U+00FF), so the safe conversion is lossless and
            // no `unsafe` transmute is needed.
            self.inner.next().map(|(i, &b)| (i, char::from(b)))
        }
    }

    impl<'a> SingleByteCharIndices<'a> {
        /// Create an iterator over the bytes of `buf`.
        #[inline]
        pub fn new(buf: &'a [u8]) -> SingleByteCharIndices<'a> {
            SingleByteCharIndices {
                inner: buf.iter().enumerate(),
            }
        }
    }
}
87
88	/// Trait for format marker types.
89	///
90	/// The type implementing this trait is usually not instantiated.
91	/// It's used with a phantom type parameter of `Tendril`.
92	pub unsafe trait Format {
93	/// Check whether the buffer is valid for this format.
94	fn validate(buf: &[u8]) -> bool;
95
96	/// Check whether the buffer is valid for this format.
97	///
98	/// You may assume the buffer is a prefix of a valid buffer.
99	#[inline]
100	fn validate_prefix(buf: &[u8]) -> bool {
101	<Self as Format>::validate(buf)
102	}
103
104	/// Check whether the buffer is valid for this format.
105	///
106	/// You may assume the buffer is a suffix of a valid buffer.
107	#[inline]
108	fn validate_suffix(buf: &[u8]) -> bool {
109	<Self as Format>::validate(buf)
110	}
111
112	/// Check whether the buffer is valid for this format.
113	///
114	/// You may assume the buffer is a contiguous subsequence
115	/// of a valid buffer, but not necessarily a prefix or
116	/// a suffix.
117	#[inline]
118	fn validate_subseq(buf: &[u8]) -> bool {
119	<Self as Format>::validate(buf)
120	}
121
122	/// Compute any fixup needed when concatenating buffers.
123	///
124	/// The default is to do nothing.
125	///
126	/// The function is `unsafe` because it may assume the input
127	/// buffers are already valid for the format. Also, no
128	/// bounds-checking is performed on the return value!
129	#[inline(always)]
130	unsafe fn fixup(_lhs: &[u8], _rhs: &[u8]) -> imp::Fixup {
131	Default::default()
132	}
133	}
134
135	/// Indicates that one format is a subset of another.
136	///
137	/// The subset format can be converted to the superset format
138	/// for free.
139	pub unsafe trait SubsetOf<Super>: Format
140	where
141	Super: Format,
142	{
143	/// Validate the other* direction of conversion; check if*
144	/// this buffer from the superset format conforms to the
145	/// subset format.
146	///
147	/// The default calls `Self::validate`, but some conversions
148	/// may implement a check which is cheaper than validating
149	/// from scratch.
150	fn revalidate_subset(x: &[u8]) -> bool {
151	Self::validate(buf:x)
152	}
153	}
154
155	/// Indicates a format which corresponds to a Rust slice type,
156	/// representing exactly the same invariants.
157	pub unsafe trait SliceFormat: Format + Sized {
158	type Slice: ?Sized + Slice;
159	}
160
161	/// Indicates a format which contains characters from Unicode
162	/// (all of it, or some proper subset).
163	pub unsafe trait CharFormat<'a>: Format {
164	/// Iterator for characters and their byte indices.
165	type Iter: Iterator<Item = (usize, char)>;
166
167	/// Iterate over the characters of the string and their byte
168	/// indices.
169	///
170	/// You may assume the buffer is already validated* for `Format`.*
171	unsafe fn char_indices(buf: &'a [u8]) -> Self::Iter;
172
173	/// Encode the character as bytes and pass them to a continuation.
174	///
175	/// Returns `Err(())` iff the character cannot be represented.
176	fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
177	where
178	F: FnOnce(&[u8]);
179	}
180
/// Indicates a Rust slice type that is represented in memory as bytes.
pub unsafe trait Slice {
    /// Access the raw bytes of the slice.
    fn as_bytes(&self) -> &[u8];

    /// Convert a byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is already validated
    /// for `Format`; callers must guarantee that.
    unsafe fn from_bytes(x: &[u8]) -> &Self;

    /// Convert a mutable byte slice to this kind of slice.
    ///
    /// # Safety
    ///
    /// You may assume the buffer is already validated
    /// for `Format`; callers must guarantee that.
    unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut Self;
}
198
199	/// Marker type for uninterpreted bytes.
200	///
201	/// Validation will never fail for this format.
202	#[derive(Copy, Clone, Default, Debug)]
203	pub struct Bytes;
204
205	unsafe impl Format for Bytes {
206	#[inline(always)]
207	fn validate(_: &[u8]) -> bool {
208	`true`
209	}
210	}
211
212	unsafe impl SliceFormat for Bytes {
213	type Slice = [u8];
214	}
215
216	unsafe impl Slice for [u8] {
217	#[inline(always)]
218	fn as_bytes(&self) -> &[u8] {
219	self
220	}
221
222	#[inline(always)]
223	unsafe fn from_bytes(x: &[u8]) -> &[u8] {
224	x
225	}
226
227	#[inline(always)]
228	unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut [u8] {
229	x
230	}
231	}
232
233	/// Marker type for ASCII text.
234	#[derive(Copy, Clone, Default, Debug)]
235	pub struct ASCII;
236
237	unsafe impl Format for ASCII {
238	#[inline]
239	fn validate(buf: &[u8]) -> bool {
240	buf.iter().all(\|&n: u8\| n <= `127`)
241	}
242
243	#[inline(always)]
244	fn validate_prefix(_: &[u8]) -> bool {
245	`true`
246	}
247
248	#[inline(always)]
249	fn validate_suffix(_: &[u8]) -> bool {
250	`true`
251	}
252
253	#[inline(always)]
254	fn validate_subseq(_: &[u8]) -> bool {
255	`true`
256	}
257	}
258
259	unsafe impl SubsetOf<UTF8> for ASCII {}
260	unsafe impl SubsetOf<Latin1> for ASCII {}
261
262	unsafe impl<'a> CharFormat<'a> for ASCII {
263	type Iter = imp::SingleByteCharIndices<'a>;
264
265	#[inline]
266	unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
267	imp::SingleByteCharIndices::new(buf)
268	}
269
270	#[inline]
271	fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
272	where
273	F: FnOnce(&[u8]),
274	{
275	let n: u32 = ch as u32;
276	if n > `0x7F` {
277	return Err(());
278	}
279	cont(&[n as u8]);
280	Ok(())
281	}
282	}
283
284	/// Marker type for UTF-8 text.
285	#[derive(Copy, Clone, Default, Debug)]
286	pub struct UTF8;
287
288	unsafe impl Format for UTF8 {
289	#[inline]
290	fn validate(buf: &[u8]) -> bool {
291	str::from_utf8(buf).is_ok()
292	}
293
294	#[inline]
295	fn validate_prefix(buf: &[u8]) -> bool {
296	if buf.len() == `0` {
297	return `true`;
298	}
299	match futf::classify(buf, buf.len() - `1`) {
300	Some(Codepoint {
301	meaning: Meaning::Whole(_),
302	..
303	}) => `true`,
304	_ => `false`,
305	}
306	}
307
308	#[inline]
309	fn validate_suffix(buf: &[u8]) -> bool {
310	if buf.len() == `0` {
311	return `true`;
312	}
313	match futf::classify(buf, `0`) {
314	Some(Codepoint {
315	meaning: Meaning::Whole(_),
316	..
317	}) => `true`,
318	_ => `false`,
319	}
320	}
321
322	#[inline]
323	fn validate_subseq(buf: &[u8]) -> bool {
324	<Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
325	}
326	}
327
328	unsafe impl SubsetOf<WTF8> for UTF8 {}
329
330	unsafe impl SliceFormat for UTF8 {
331	type Slice = str;
332	}
333
334	unsafe impl Slice for str {
335	#[inline(always)]
336	fn as_bytes(&self) -> &[u8] {
337	str::as_bytes(self)
338	}
339
340	#[inline(always)]
341	unsafe fn from_bytes(x: &[u8]) -> &str {
342	str::from_utf8_unchecked(x)
343	}
344
345	#[inline(always)]
346	unsafe fn from_mut_bytes(x: &mut [u8]) -> &mut str {
347	mem::transmute(src:x)
348	}
349	}
350
351	unsafe impl<'a> CharFormat<'a> for UTF8 {
352	type Iter = str::CharIndices<'a>;
353
354	#[inline]
355	unsafe fn char_indices(buf: &'a [u8]) -> str::CharIndices<'a> {
356	str::from_utf8_unchecked(buf).char_indices()
357	}
358
359	#[inline]
360	fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
361	where
362	F: FnOnce(&[u8]),
363	{
364	cont(ch.encode_utf8(&mut [`0_u8`; `4`]).as_bytes());
365	Ok(())
366	}
367	}
368
/// Marker type for WTF-8 text.
///
/// See the [WTF-8 spec](https://simonsapin.github.io/wtf-8/).
#[derive(Copy, Clone, Default, Debug)]
pub struct WTF8;
374
375	#[inline]
376	fn wtf8_meaningful(m: Meaning) -> bool {
377	match m {
378	Meaning::Whole(_) \| Meaning::LeadSurrogate(_) \| Meaning::TrailSurrogate(_) => `true`,
379	_ => `false`,
380	}
381	}
382
383	unsafe impl Format for WTF8 {
384	#[inline]
385	fn validate(buf: &[u8]) -> bool {
386	let mut i = `0`;
387	let mut prev_lead = `false`;
388	while i < buf.len() {
389	let codept = unwrap_or_return!(futf::classify(buf, i), `false`);
390	if !wtf8_meaningful(codept.meaning) {
391	return `false`;
392	}
393	i += codept.bytes.len();
394	prev_lead = match codept.meaning {
395	Meaning::TrailSurrogate(_) if prev_lead => return `false`,
396	Meaning::LeadSurrogate(_) => `true`,
397	_ => `false`,
398	};
399	}
400
401	`true`
402	}
403
404	#[inline]
405	fn validate_prefix(buf: &[u8]) -> bool {
406	if buf.len() == `0` {
407	return `true`;
408	}
409	match futf::classify(buf, buf.len() - `1`) {
410	Some(c) => wtf8_meaningful(c.meaning),
411	_ => `false`,
412	}
413	}
414
415	#[inline]
416	fn validate_suffix(buf: &[u8]) -> bool {
417	if buf.len() == `0` {
418	return `true`;
419	}
420	match futf::classify(buf, `0`) {
421	Some(c) => wtf8_meaningful(c.meaning),
422	_ => `false`,
423	}
424	}
425
426	#[inline]
427	fn validate_subseq(buf: &[u8]) -> bool {
428	<Self as Format>::validate_prefix(buf) && <Self as Format>::validate_suffix(buf)
429	}
430
431	#[inline]
432	unsafe fn fixup(lhs: &[u8], rhs: &[u8]) -> imp::Fixup {
433	const ERR: &'static str = "WTF8: internal error";
434
435	if lhs.len() >= `3` && rhs.len() >= `3` {
436	if let (
437	Some(Codepoint {
438	meaning: Meaning::LeadSurrogate(hi),
439	..
440	}),
441	Some(Codepoint {
442	meaning: Meaning::TrailSurrogate(lo),
443	..
444	}),
445	) = (futf::classify(lhs, lhs.len() - `1`), futf::classify(rhs, `0`))
446	{
447	let mut fixup = imp::Fixup {
448	drop_left: `3`,
449	drop_right: `3`,
450	insert_len: `0`,
451	insert_bytes: [`0_u8`; `4`],
452	};
453
454	let n = `0x10000` + ((hi as u32) << `10`) + (lo as u32);
455
456	let ch = char::from_u32(n).expect(ERR);
457	fixup.insert_len = ch.encode_utf8(&mut fixup.insert_bytes).len() as u32;
458
459	return fixup;
460	}
461	}
462
463	Default::default()
464	}
465	}
466
467	/// Marker type for the single-byte encoding of the first 256 Unicode codepoints.
468	///
469	/// This is IANA's "ISO-8859-1". It's ISO's "ISO 8859-1" with the addition of the
470	/// C0 and C1 control characters from ECMA-48 / ISO 6429.
471	///
472	/// Not to be confused with WHATWG's "latin1" or "iso8859-1" labels (or the
473	/// many other aliases), which actually stand for Windows-1252.
474	#[derive(Copy, Clone, Default, Debug)]
475	pub struct Latin1;
476
477	unsafe impl Format for Latin1 {
478	#[inline(always)]
479	fn validate(_: &[u8]) -> bool {
480	`true`
481	}
482
483	#[inline(always)]
484	fn validate_prefix(_: &[u8]) -> bool {
485	`true`
486	}
487
488	#[inline(always)]
489	fn validate_suffix(_: &[u8]) -> bool {
490	`true`
491	}
492
493	#[inline(always)]
494	fn validate_subseq(_: &[u8]) -> bool {
495	`true`
496	}
497	}
498
499	unsafe impl<'a> CharFormat<'a> for Latin1 {
500	type Iter = imp::SingleByteCharIndices<'a>;
501
502	#[inline]
503	unsafe fn char_indices(buf: &'a [u8]) -> imp::SingleByteCharIndices<'a> {
504	imp::SingleByteCharIndices::new(buf)
505	}
506
507	#[inline]
508	fn encode_char<F>(ch: char, cont: F) -> Result<(), ()>
509	where
510	F: FnOnce(&[u8]),
511	{
512	let n: u32 = ch as u32;
513	if n > `0xFF` {
514	return Err(());
515	}
516	cont(&[n as u8]);
517	Ok(())
518	}
519	}
520

Provided by KDAB

Definitions