| 1 | // This file is part of ICU4X. For terms of use, please see the file |
| 2 | // called LICENSE at the top level of the ICU4X source tree |
| 3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| 4 | |
| 5 | pub mod errors; |
| 6 | mod langid; |
| 7 | mod locale; |
| 8 | |
| 9 | pub use errors::ParserError; |
| 10 | pub use langid::{ |
| 11 | parse_language_identifier, parse_language_identifier_from_iter, |
| 12 | parse_language_identifier_with_single_variant, |
| 13 | parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter, ParserMode, |
| 14 | }; |
| 15 | |
| 16 | pub use locale::{ |
| 17 | parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, |
| 18 | }; |
| 19 | |
/// Returns `true` when the byte at `idx` is a subtag separator (`-` or `_`).
///
/// # Panics
///
/// Panics if `idx` is out of bounds for `slice`.
#[inline]
const fn is_separator(slice: &[u8], idx: usize) -> bool {
    // Every call site in this file checks `idx` against the slice length first,
    // so plain indexing is intentional here.
    #[allow(clippy::indexing_slicing)]
    let byte = slice[idx];
    matches!(byte, b'-' | b'_')
}
| 26 | |
| 27 | const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) { |
| 28 | debug_assert!(idx < slice.len()); |
| 29 | |
| 30 | // This function is called only on the idx == 0 or on a separator. |
| 31 | let (start: usize, mut end: usize) = if is_separator(slice, idx) { |
| 32 | // If it's a separator, set the start to idx+1 and advance the idx to the next char. |
| 33 | (idx + 1, idx + 1) |
| 34 | } else { |
| 35 | // If it's idx=0, start is 0 and end is set to 1 |
| 36 | debug_assert!(idx == 0); |
| 37 | (0, 1) |
| 38 | }; |
| 39 | |
| 40 | while end < slice.len() && !is_separator(slice, idx:end) { |
| 41 | // Advance until we reach end of slice or a separator. |
| 42 | end += 1; |
| 43 | } |
| 44 | // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"` |
| 45 | (start, end) |
| 46 | } |
| 47 | |
/// `SubtagIterator` is a helper iterator for `LanguageIdentifier` and `Locale` parsing.
///
/// It splits a byte slice into subtags at every `-` or `_` separator. Its shape is
/// somewhat unusual due to the focus on performance and Rust limitations for `const`
/// functions.
///
/// The iterator is eager: the range of the upcoming subtag is computed ahead of time
/// and stored, so peeking does no scanning. It never returns an error; malformed input
/// such as `"-"`, `"-en"` or `"en-"` shows up as empty subtags, which lets the caller
/// detect and reject it.
///
/// The iterator provides methods available for static (`const`) users - `next_manual`
/// and `peek_manual`, which return `(start, end)` byte ranges into `slice` - as well as
/// the typical `Peekable`-like iterator APIs - `next` and `peek`, which return the
/// subtag bytes themselves.
///
/// All of these return an `Option`, with `None` meaning that the iterator is exhausted.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
    /// The complete input that is being split into subtags.
    pub slice: &'a [u8],
    /// Set once the final subtag has been handed out.
    done: bool,
    // done + subtag is faster than Option<(usize, usize)>
    // at the time of writing.
    /// `(start, end)` range of the subtag the next call will return; `end` is exclusive.
    subtag: (usize, usize),
}
| 68 | |
| 69 | impl<'a> SubtagIterator<'a> { |
| 70 | pub const fn new(slice: &'a [u8]) -> Self { |
| 71 | let subtag = if slice.is_empty() || is_separator(slice, 0) { |
| 72 | // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"` |
| 73 | (0, 0) |
| 74 | } else { |
| 75 | get_current_subtag(slice, 0) |
| 76 | }; |
| 77 | Self { |
| 78 | slice, |
| 79 | done: false, |
| 80 | subtag, |
| 81 | } |
| 82 | } |
| 83 | |
| 84 | pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) { |
| 85 | if self.done { |
| 86 | return (self, None); |
| 87 | } |
| 88 | let result = self.subtag; |
| 89 | if result.1 < self.slice.len() { |
| 90 | self.subtag = get_current_subtag(self.slice, result.1); |
| 91 | } else { |
| 92 | self.done = true; |
| 93 | } |
| 94 | (self, Some(result)) |
| 95 | } |
| 96 | |
| 97 | pub const fn peek_manual(&self) -> Option<(usize, usize)> { |
| 98 | if self.done { |
| 99 | return None; |
| 100 | } |
| 101 | Some(self.subtag) |
| 102 | } |
| 103 | |
| 104 | pub fn peek(&self) -> Option<&'a [u8]> { |
| 105 | #[allow (clippy::indexing_slicing)] // peek_manual returns valid indices |
| 106 | self.peek_manual().map(|(s, e)| &self.slice[s..e]) |
| 107 | } |
| 108 | } |
| 109 | |
| 110 | impl<'a> Iterator for SubtagIterator<'a> { |
| 111 | type Item = &'a [u8]; |
| 112 | |
| 113 | fn next(&mut self) -> Option<Self::Item> { |
| 114 | let (s: SubtagIterator<'_>, res: Option<(usize, usize)>) = self.next_manual(); |
| 115 | *self = s; |
| 116 | #[allow (clippy::indexing_slicing)] // next_manual returns valid indices |
| 117 | res.map(|(s: usize, e: usize)| &self.slice[s..e]) |
| 118 | } |
| 119 | } |
| 120 | |
#[cfg(test)]
mod test {
    use super::*;

    /// Views a subtag as `&str`; every input used in these tests is ASCII.
    fn slice_to_str(input: &[u8]) -> &str {
        std::str::from_utf8(input).unwrap()
    }

    /// Runs `SubtagIterator` over `input` to exhaustion and collects the subtags.
    fn subtags(input: &str) -> Vec<&str> {
        SubtagIterator::new(input.as_bytes())
            .map(slice_to_str)
            .collect()
    }

    #[test]
    fn subtag_iterator_peek_test() {
        let slice = "de_at-u-ca-foobar";
        let mut si = SubtagIterator::new(slice.as_bytes());

        // Peeking is repeatable and does not advance the iterator.
        assert_eq!(si.peek().map(slice_to_str), Some("de"));
        assert_eq!(si.peek().map(slice_to_str), Some("de"));
        assert_eq!(si.next().map(slice_to_str), Some("de"));

        assert_eq!(si.peek().map(slice_to_str), Some("at"));
        assert_eq!(si.peek().map(slice_to_str), Some("at"));
        assert_eq!(si.next().map(slice_to_str), Some("at"));

        // Once the last subtag has been consumed there is nothing left to peek at.
        let mut si = SubtagIterator::new("en".as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some("en"));
        assert_eq!(si.peek(), None);
        assert_eq!(si.peek_manual(), None);
    }

    #[test]
    fn subtag_iterator_test() {
        // Collecting checks both the yielded subtags and that iteration terminates.
        assert_eq!(subtags(""), vec![""]);
        assert_eq!(subtags("-"), vec!["", ""]);
        assert_eq!(subtags("-en"), vec!["", "en"]);
        assert_eq!(subtags("en"), vec!["en"]);
        assert_eq!(subtags("en-"), vec!["en", ""]);
        assert_eq!(subtags("--"), vec!["", "", ""]);
        assert_eq!(subtags("-en-"), vec!["", "en", ""]);
        assert_eq!(
            subtags("de_at-u-ca-foobar"),
            vec!["de", "at", "u", "ca", "foobar"]
        );

        // Drive `next` by hand once, and check it keeps returning `None` when exhausted.
        let mut si = SubtagIterator::new("-en-".as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next().map(slice_to_str), Some("en"));
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next(), None);
        assert_eq!(si.next(), None);
    }

    #[test]
    fn get_current_subtag_test() {
        // (input, idx, expected `(start, end)` range)
        let cases = [
            ("-", 0, (1, 1)),
            ("-en", 0, (1, 3)),
            ("-en-", 3, (4, 4)),
            ("en-", 0, (0, 2)),
            ("en-", 2, (3, 3)),
            ("en--US", 0, (0, 2)),
            ("en--US", 2, (3, 3)),
            ("en--US", 3, (4, 6)),
            ("--", 0, (1, 1)),
            ("--", 1, (2, 2)),
        ];

        for &(input, idx, expected) in &cases {
            assert_eq!(
                get_current_subtag(input.as_bytes(), idx),
                expected,
                "input: {:?}, idx: {}",
                input,
                idx
            );
        }
    }
}
| 232 | |