unicode.rs source code [crates/fluent_syntax/src/unicode.rs]

//! A set of helper functions for unescaping Fluent unicode escape sequences.
//!
//! # Unicode
//!
//! Fluent supports UTF-8 in all FTL resources, but it also allows
//! unicode sequences to be escaped in [`String
//! Literals`](super::ast::InlineExpression::StringLiteral).
//!
//! Code points written with four hexadecimal digits are escaped as `\uHHHH`,
//! and code points written with six hexadecimal digits as `\UHHHHHH`.
//!
//! ## Example
//!
//! ```
//! use fluent_syntax::unicode::unescape_unicode_to_string;
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\u5bd2 Bar"),
//!     "Foo 寒 Bar"
//! );
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\U01F68A Bar"),
//!     "Foo 🚊 Bar"
//! );
//! ```
//!
//! # Other unescapes
//!
//! A `"` character can also appear inside an FTL string literal when it is
//! escaped as `\"`, and `\` itself is escaped as `\\`.
//!
//! ## Example
//!
//! ```
//! use fluent_syntax::unicode::unescape_unicode_to_string;
//!
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\\" Bar"),
//!     "Foo \" Bar"
//! );
//! assert_eq!(
//!     unescape_unicode_to_string("Foo \\\\ Bar"),
//!     "Foo \\ Bar"
//! );
//! ```
46	use std::borrow::Cow;
47	use std::char;
48	use std::fmt;
49
/// Character substituted for any escape sequence that cannot be decoded
/// (U+FFFD REPLACEMENT CHARACTER).
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Decodes the hexadecimal digits of a `\uHHHH` / `\UHHHHHH` escape into a `char`.
///
/// Returns [`UNKNOWN_CHAR`] when `s` is `None`, when it is empty, when it
/// contains anything other than ASCII hexadecimal digits, or when the parsed
/// value is not a valid `char` (for example a surrogate such as `D800`).
fn encode_unicode(s: Option<&str>) -> char {
    // `from_str_radix` on its own also accepts a leading `+` sign, which is not
    // part of an escape sequence, so the digits are validated before parsing.
    s.filter(|digits| digits.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|digits| u32::from_str_radix(digits, 16).ok())
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}
56
57	/// Unescapes to a writer without allocating.
58	///
59	/// ## Example
60	///
61	/// ```
62	/// use fluent_syntax::unicode::unescape_unicode;
63	///
64	/// let mut s = String::new();
65	/// unescape_unicode(&mut s, "Foo `\\`U01F60A Bar");
66	/// assert_eq!(s, "Foo 😊 Bar");
67	/// ```
68	pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result
69	where
70	W: fmt::Write,
71	{
72	let bytes = input.as_bytes();
73
74	let mut start = `0`;
75	let mut ptr = `0`;
76
77	while let Some(b) = bytes.get(ptr) {
78	if b != &b'`\\`' {
79	ptr += `1`;
80	continue;
81	}
82	if start != ptr {
83	w.write_str(&input[start..ptr])?;
84	}
85
86	ptr += `1`;
87
88	let new_char = match bytes.get(ptr) {
89	Some(b'`\\`') => '`\\`',
90	Some(b'"') => '"',
91	Some(u @ b'u') \| Some(u @ b'U') => {
92	let seq_start = ptr + `1`;
93	let len = if u == &b'u' { `4` } else { `6` };
94	ptr += len;
95	encode_unicode(input.get(seq_start..seq_start + len))
96	}
97	_ => UNKNOWN_CHAR,
98	};
99	ptr += `1`;
100	w.write_char(new_char)?;
101	start = ptr;
102	}
103	if start != ptr {
104	w.write_str(&input[start..ptr])?;
105	}
106	Ok(())
107	}
108
109	/// Unescapes to a `Cow<str>` optionally allocating.
110	///
111	/// ## Example
112	///
113	/// ```
114	/// use fluent_syntax::unicode::unescape_unicode_to_string;
115	///
116	/// assert_eq!(
117	/// unescape_unicode_to_string("Foo `\\`U01F60A Bar"),
118	/// "Foo 😊 Bar"
119	/// );
120	/// ```
121	pub fn unescape_unicode_to_string(input: &str) -> Cow<str> {
122	let bytes = input.as_bytes();
123	let mut result = Cow::from(input);
124
125	let mut ptr = `0`;
126
127	while let Some(b) = bytes.get(ptr) {
128	if b != &b'`\\`' {
129	if let Cow::Owned(ref mut s) = result {
130	s.push(b as char*);
131	}
132	ptr += `1`;
133	continue;
134	}
135
136	if let Cow::Borrowed(_) = result {
137	result = Cow::from(&input[`0`..ptr]);
138	}
139
140	ptr += `1`;
141
142	let new_char = match bytes.get(ptr) {
143	Some(b'`\\`') => '`\\`',
144	Some(b'"') => '"',
145	Some(u @ b'u') \| Some(u @ b'U') => {
146	let start = ptr + `1`;
147	let len = if u == &b'u' { `4` } else { `6` };
148	ptr += len;
149	input
150	.get(start..(start + len))
151	.map_or(UNKNOWN_CHAR, \|slice\| encode_unicode(Some(slice)))
152	}
153	_ => UNKNOWN_CHAR,
154	};
155	result.to_mut().push(new_char);
156	ptr += `1`;
157	}
158	result
159	}
160