| 1 | //! A set of helper functions for unescaping Fluent unicode escape sequences. |
| 2 | //! |
| 3 | //! # Unicode |
| 4 | //! |
| 5 | //! Fluent supports UTF-8 in all FTL resources, but it also allows |
| 6 | //! unicode sequences to be escaped in [`String |
| 7 | //! Literals`](super::ast::InlineExpression::StringLiteral). |
| 8 | //! |
//! Code points written with four hexadecimal digits are escaped with `\u`,
//! and code points written with six hexadecimal digits are escaped with `\U`.
//!
| 11 | //! ## Example |
| 12 | //! |
| 13 | //! ``` |
| 14 | //! use fluent_syntax::unicode::unescape_unicode_to_string; |
| 15 | //! |
| 16 | //! assert_eq!( |
| 17 | //! unescape_unicode_to_string("Foo \\u5bd2 Bar" ), |
| 18 | //! "Foo 寒 Bar" |
| 19 | //! ); |
| 20 | //! |
| 21 | //! assert_eq!( |
| 22 | //! unescape_unicode_to_string("Foo \\U01F68A Bar" ), |
| 23 | //! "Foo 🚊 Bar" |
| 24 | //! ); |
| 25 | //! ``` |
| 26 | //! |
| 27 | //! # Other unescapes |
| 28 | //! |
| 29 | //! This also allows for a char `"` to be present inside an FTL string literal, |
| 30 | //! and for `\` itself to be escaped. |
| 31 | //! |
| 32 | //! ## Example |
| 33 | //! |
| 34 | //! ``` |
| 35 | //! use fluent_syntax::unicode::unescape_unicode_to_string; |
| 36 | //! |
| 37 | //! assert_eq!( |
| 38 | //! unescape_unicode_to_string("Foo \\\" Bar" ), |
| 39 | //! "Foo \" Bar" |
| 40 | //! ); |
| 41 | //! assert_eq!( |
| 42 | //! unescape_unicode_to_string("Foo \\\\ Bar" ), |
| 43 | //! "Foo \\ Bar" |
| 44 | //! ); |
| 45 | //! ``` |
| 46 | use std::borrow::Cow; |
| 47 | use std::char; |
| 48 | use std::fmt; |
| 49 | |
/// Character emitted in place of any escape sequence that cannot be decoded
/// (U+FFFD REPLACEMENT CHARACTER).
const UNKNOWN_CHAR: char = '\u{FFFD}';

/// Decodes the hexadecimal digits of a unicode escape into a `char`.
///
/// Returns `UNKNOWN_CHAR` when `s` is `None`, is empty, contains anything
/// other than ASCII hexadecimal digits, or names a value that is not a valid
/// Unicode scalar value (a surrogate, or anything above U+10FFFF).
fn encode_unicode(s: Option<&str>) -> char {
    // `u32::from_str_radix` tolerates a leading `+`, which is not part of a
    // hex escape, so the digits are validated explicitly before parsing.
    s.filter(|digits| digits.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|digits| u32::from_str_radix(digits, 16).ok())
        .and_then(char::from_u32)
        .unwrap_or(UNKNOWN_CHAR)
}
| 56 | |
| 57 | /// Unescapes to a writer without allocating. |
| 58 | /// |
| 59 | /// ## Example |
| 60 | /// |
| 61 | /// ``` |
| 62 | /// use fluent_syntax::unicode::unescape_unicode; |
| 63 | /// |
| 64 | /// let mut s = String::new(); |
| 65 | /// unescape_unicode(&mut s, "Foo \\U01F60A Bar" ); |
| 66 | /// assert_eq!(s, "Foo 😊 Bar" ); |
| 67 | /// ``` |
| 68 | pub fn unescape_unicode<W>(w: &mut W, input: &str) -> fmt::Result |
| 69 | where |
| 70 | W: fmt::Write, |
| 71 | { |
| 72 | let bytes = input.as_bytes(); |
| 73 | |
| 74 | let mut start = 0; |
| 75 | let mut ptr = 0; |
| 76 | |
| 77 | while let Some(b) = bytes.get(ptr) { |
| 78 | if b != &b' \\' { |
| 79 | ptr += 1; |
| 80 | continue; |
| 81 | } |
| 82 | if start != ptr { |
| 83 | w.write_str(&input[start..ptr])?; |
| 84 | } |
| 85 | |
| 86 | ptr += 1; |
| 87 | |
| 88 | let new_char = match bytes.get(ptr) { |
| 89 | Some(b' \\' ) => ' \\' , |
| 90 | Some(b'"' ) => '"' , |
| 91 | Some(u @ b'u' ) | Some(u @ b'U' ) => { |
| 92 | let seq_start = ptr + 1; |
| 93 | let len = if u == &b'u' { 4 } else { 6 }; |
| 94 | ptr += len; |
| 95 | encode_unicode(input.get(seq_start..seq_start + len)) |
| 96 | } |
| 97 | _ => UNKNOWN_CHAR, |
| 98 | }; |
| 99 | ptr += 1; |
| 100 | w.write_char(new_char)?; |
| 101 | start = ptr; |
| 102 | } |
| 103 | if start != ptr { |
| 104 | w.write_str(&input[start..ptr])?; |
| 105 | } |
| 106 | Ok(()) |
| 107 | } |
| 108 | |
| 109 | /// Unescapes to a `Cow<str>` optionally allocating. |
| 110 | /// |
| 111 | /// ## Example |
| 112 | /// |
| 113 | /// ``` |
| 114 | /// use fluent_syntax::unicode::unescape_unicode_to_string; |
| 115 | /// |
| 116 | /// assert_eq!( |
| 117 | /// unescape_unicode_to_string("Foo \\U01F60A Bar" ), |
| 118 | /// "Foo 😊 Bar" |
| 119 | /// ); |
| 120 | /// ``` |
| 121 | pub fn unescape_unicode_to_string(input: &str) -> Cow<str> { |
| 122 | let bytes = input.as_bytes(); |
| 123 | let mut result = Cow::from(input); |
| 124 | |
| 125 | let mut ptr = 0; |
| 126 | |
| 127 | while let Some(b) = bytes.get(ptr) { |
| 128 | if b != &b' \\' { |
| 129 | if let Cow::Owned(ref mut s) = result { |
| 130 | s.push(*b as char); |
| 131 | } |
| 132 | ptr += 1; |
| 133 | continue; |
| 134 | } |
| 135 | |
| 136 | if let Cow::Borrowed(_) = result { |
| 137 | result = Cow::from(&input[0..ptr]); |
| 138 | } |
| 139 | |
| 140 | ptr += 1; |
| 141 | |
| 142 | let new_char = match bytes.get(ptr) { |
| 143 | Some(b' \\' ) => ' \\' , |
| 144 | Some(b'"' ) => '"' , |
| 145 | Some(u @ b'u' ) | Some(u @ b'U' ) => { |
| 146 | let start = ptr + 1; |
| 147 | let len = if u == &b'u' { 4 } else { 6 }; |
| 148 | ptr += len; |
| 149 | input |
| 150 | .get(start..(start + len)) |
| 151 | .map_or(UNKNOWN_CHAR, |slice| encode_unicode(Some(slice))) |
| 152 | } |
| 153 | _ => UNKNOWN_CHAR, |
| 154 | }; |
| 155 | result.to_mut().push(new_char); |
| 156 | ptr += 1; |
| 157 | } |
| 158 | result |
| 159 | } |
| 160 | |