1 | //! A set of helper functions for unescaping Fluent unicode escape sequences. |
2 | //! |
3 | //! # Unicode |
4 | //! |
5 | //! Fluent supports UTF-8 in all FTL resources, but it also allows |
6 | //! unicode sequences to be escaped in [`String |
7 | //! Literals`](super::ast::InlineExpression::StringLiteral). |
8 | //! |
9 | //! Sequences of four hexadecimal digits are escaped with `\u` and |
10 | //! sequences of six hexadecimal digits with `\U`. |
11 | //! ## Example |
12 | //! |
13 | //! ``` |
14 | //! use fluent_syntax::unicode::unescape_unicode_to_string; |
15 | //! |
16 | //! assert_eq!( |
17 | //! unescape_unicode_to_string("Foo \\u5bd2 Bar" ), |
18 | //! "Foo 寒 Bar" |
19 | //! ); |
20 | //! |
21 | //! assert_eq!( |
22 | //! unescape_unicode_to_string("Foo \\U01F68A Bar" ), |
23 | //! "Foo 🚊 Bar" |
24 | //! ); |
25 | //! ``` |
26 | //! |
27 | //! # Other unescapes |
28 | //! |
29 | //! This also allows for a char `"` to be present inside an FTL string literal, |
30 | //! and for `\` itself to be escaped. |
31 | //! |
32 | //! ## Example |
33 | //! |
34 | //! ``` |
35 | //! use fluent_syntax::unicode::unescape_unicode_to_string; |
36 | //! |
37 | //! assert_eq!( |
38 | //! unescape_unicode_to_string("Foo \\\" Bar" ), |
39 | //! "Foo \" Bar" |
40 | //! ); |
41 | //! assert_eq!( |
42 | //! unescape_unicode_to_string("Foo \\\\ Bar" ), |
43 | //! "Foo \\ Bar" |
44 | //! ); |
45 | //! ``` |
46 | use std::borrow::Cow; |
47 | use std::char; |
48 | use std::fmt; |
49 | |
/// Replacement character (U+FFFD) emitted whenever an escape sequence cannot
/// be decoded.
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Decodes the hexadecimal digits of a unicode escape into a `char`.
///
/// `s` is the digit run that followed `\u` or `\U`, or `None` when the input
/// did not contain a full run. Returns [`UNKNOWN_CHAR`] when `s` is `None`,
/// is empty, contains anything other than ASCII hexadecimal digits, or names
/// a code point that is not a valid `char` (a surrogate, or a value above
/// U+10FFFF).
fn encode_unicode(s: Option<&str>) -> char {
    // `u32::from_str_radix` tolerates a leading `+`, which is not a hex digit,
    // so the digits are validated before parsing.
    s.filter(|hex| hex.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|hex| u32::from_str_radix(hex, 16).ok())
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}
56 | |
57 | /// Unescapes to a writer without allocating. |
58 | /// |
59 | /// ## Example |
60 | /// |
61 | /// ``` |
62 | /// use fluent_syntax::unicode::unescape_unicode; |
63 | /// |
64 | /// let mut s = String::new(); |
65 | /// unescape_unicode(&mut s, "Foo \\U01F60A Bar" ); |
66 | /// assert_eq!(s, "Foo 😊 Bar" ); |
67 | /// ``` |
68 | pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result |
69 | where |
70 | W: fmt::Write, |
71 | { |
72 | let bytes = input.as_bytes(); |
73 | |
74 | let mut start = 0; |
75 | let mut ptr = 0; |
76 | |
77 | while let Some(b) = bytes.get(ptr) { |
78 | if b != &b' \\' { |
79 | ptr += 1; |
80 | continue; |
81 | } |
82 | if start != ptr { |
83 | w.write_str(&input[start..ptr])?; |
84 | } |
85 | |
86 | ptr += 1; |
87 | |
88 | let new_char = match bytes.get(ptr) { |
89 | Some(b' \\' ) => ' \\' , |
90 | Some(b'"' ) => '"' , |
91 | Some(u @ b'u' ) | Some(u @ b'U' ) => { |
92 | let seq_start = ptr + 1; |
93 | let len = if u == &b'u' { 4 } else { 6 }; |
94 | ptr += len; |
95 | encode_unicode(input.get(seq_start..seq_start + len)) |
96 | } |
97 | _ => UNKNOWN_CHAR, |
98 | }; |
99 | ptr += 1; |
100 | w.write_char(new_char)?; |
101 | start = ptr; |
102 | } |
103 | if start != ptr { |
104 | w.write_str(&input[start..ptr])?; |
105 | } |
106 | Ok(()) |
107 | } |
108 | |
109 | /// Unescapes to a `Cow<str>` optionally allocating. |
110 | /// |
111 | /// ## Example |
112 | /// |
113 | /// ``` |
114 | /// use fluent_syntax::unicode::unescape_unicode_to_string; |
115 | /// |
116 | /// assert_eq!( |
117 | /// unescape_unicode_to_string("Foo \\U01F60A Bar" ), |
118 | /// "Foo 😊 Bar" |
119 | /// ); |
120 | /// ``` |
121 | pub fn unescape_unicode_to_string(input: &str) -> Cow<str> { |
122 | let bytes = input.as_bytes(); |
123 | let mut result = Cow::from(input); |
124 | |
125 | let mut ptr = 0; |
126 | |
127 | while let Some(b) = bytes.get(ptr) { |
128 | if b != &b' \\' { |
129 | if let Cow::Owned(ref mut s) = result { |
130 | s.push(*b as char); |
131 | } |
132 | ptr += 1; |
133 | continue; |
134 | } |
135 | |
136 | if let Cow::Borrowed(_) = result { |
137 | result = Cow::from(&input[0..ptr]); |
138 | } |
139 | |
140 | ptr += 1; |
141 | |
142 | let new_char = match bytes.get(ptr) { |
143 | Some(b' \\' ) => ' \\' , |
144 | Some(b'"' ) => '"' , |
145 | Some(u @ b'u' ) | Some(u @ b'U' ) => { |
146 | let start = ptr + 1; |
147 | let len = if u == &b'u' { 4 } else { 6 }; |
148 | ptr += len; |
149 | input |
150 | .get(start..(start + len)) |
151 | .map_or(UNKNOWN_CHAR, |slice| encode_unicode(Some(slice))) |
152 | } |
153 | _ => UNKNOWN_CHAR, |
154 | }; |
155 | result.to_mut().push(new_char); |
156 | ptr += 1; |
157 | } |
158 | result |
159 | } |
160 | |