1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | pub mod errors; |
6 | mod langid; |
7 | mod locale; |
8 | |
9 | pub use errors::ParserError; |
10 | pub use langid::{ |
11 | parse_language_identifier, parse_language_identifier_from_iter, |
12 | parse_language_identifier_with_single_variant, |
13 | parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter, ParserMode, |
14 | }; |
15 | |
16 | pub use locale::{ |
17 | parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension, |
18 | }; |
19 | |
/// Reports whether the byte at `idx` in `slice` is a subtag separator,
/// i.e. either `-` or `_`.
///
/// Indexing is unchecked by design: `idx` must be within `slice`, otherwise
/// this panics.
#[inline]
const fn is_separator(slice: &[u8], idx: usize) -> bool {
    #[allow(clippy::indexing_slicing)]
    let byte = slice[idx];
    matches!(byte, b'-' | b'_')
}
26 | |
27 | const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) { |
28 | debug_assert!(idx < slice.len()); |
29 | |
30 | // This function is called only on the idx == 0 or on a separator. |
31 | let (start: usize, mut end: usize) = if is_separator(slice, idx) { |
32 | // If it's a separator, set the start to idx+1 and advance the idx to the next char. |
33 | (idx + 1, idx + 1) |
34 | } else { |
35 | // If it's idx=0, start is 0 and end is set to 1 |
36 | debug_assert!(idx == 0); |
37 | (0, 1) |
38 | }; |
39 | |
40 | while end < slice.len() && !is_separator(slice, idx:end) { |
41 | // Advance until we reach end of slice or a separator. |
42 | end += 1; |
43 | } |
44 | // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"` |
45 | (start, end) |
46 | } |
47 | |
// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
//
// Its design is unusual because it is tuned for performance and has to work within the
// limitations Rust places on `const` functions.
//
// The iterator is eager: the range of the upcoming subtag is computed ahead of time and
// stored in `subtag`. Malformed inputs such as `"-"`, `"-en"` or `"en-"` are not rejected
// by the iterator itself; they yield empty subtags, which callers can detect.
//
// Two flavours of API are provided:
// - `next_manual` and `peek_manual`, which are `const fn`s and return `(start, end)`
//   index ranges into `slice`;
// - `next` and `peek`, the typical `Peekable`-style iterator APIs, which return byte
//   slices.
//
// All of them report the subtag as an `Option`, with `None` meaning the iterator is
// exhausted.
#[derive (Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
    // The complete input that is being split into subtags.
    pub slice: &'a [u8],
    // Set once the last subtag has been handed out.
    done: bool,
    // done + subtag is faster than Option<(usize, usize)>
    // at the time of writing.
    //
    // `(start, end)` range (end exclusive) of the next subtag to be yielded.
    subtag: (usize, usize),
}
68 | |
69 | impl<'a> SubtagIterator<'a> { |
70 | pub const fn new(slice: &'a [u8]) -> Self { |
71 | let subtag = if slice.is_empty() || is_separator(slice, 0) { |
72 | // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"` |
73 | (0, 0) |
74 | } else { |
75 | get_current_subtag(slice, 0) |
76 | }; |
77 | Self { |
78 | slice, |
79 | done: false, |
80 | subtag, |
81 | } |
82 | } |
83 | |
84 | pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) { |
85 | if self.done { |
86 | return (self, None); |
87 | } |
88 | let result = self.subtag; |
89 | if result.1 < self.slice.len() { |
90 | self.subtag = get_current_subtag(self.slice, result.1); |
91 | } else { |
92 | self.done = true; |
93 | } |
94 | (self, Some(result)) |
95 | } |
96 | |
97 | pub const fn peek_manual(&self) -> Option<(usize, usize)> { |
98 | if self.done { |
99 | return None; |
100 | } |
101 | Some(self.subtag) |
102 | } |
103 | |
104 | pub fn peek(&self) -> Option<&'a [u8]> { |
105 | #[allow (clippy::indexing_slicing)] // peek_manual returns valid indices |
106 | self.peek_manual().map(|(s, e)| &self.slice[s..e]) |
107 | } |
108 | } |
109 | |
110 | impl<'a> Iterator for SubtagIterator<'a> { |
111 | type Item = &'a [u8]; |
112 | |
113 | fn next(&mut self) -> Option<Self::Item> { |
114 | let (s: SubtagIterator<'_>, res: Option<(usize, usize)>) = self.next_manual(); |
115 | *self = s; |
116 | #[allow (clippy::indexing_slicing)] // next_manual returns valid indices |
117 | res.map(|(s: usize, e: usize)| &self.slice[s..e]) |
118 | } |
119 | } |
120 | |
#[cfg(test)]
mod test {
    use super::*;

    /// Views a yielded subtag as `&str` so assertions can use string literals.
    fn slice_to_str(input: &[u8]) -> &str {
        std::str::from_utf8(input).unwrap()
    }

    /// Peeking must be repeatable and must agree with the following `next`.
    #[test]
    fn subtag_iterator_peek_test() {
        let input = "de_at-u-ca-foobar";
        let mut iter = SubtagIterator::new(input.as_bytes());

        for &expected in &["de", "at"] {
            assert_eq!(iter.peek().map(slice_to_str), Some(expected));
            assert_eq!(iter.peek().map(slice_to_str), Some(expected));
            assert_eq!(iter.next().map(slice_to_str), Some(expected));
        }
    }

    /// Checks the subtags yielded for well-formed and malformed inputs.
    #[test]
    fn subtag_iterator_test() {
        // For these inputs only the first yielded subtag is checked.
        for &input in &["", "-"] {
            let mut iter = SubtagIterator::new(input.as_bytes());
            assert_eq!(iter.next().map(slice_to_str), Some(""));
        }

        // For these inputs the complete sequence of subtags is checked.
        let cases: &[(&str, &[&str])] = &[
            ("-en", &["", "en"]),
            ("en", &["en"]),
            ("en-", &["en", ""]),
            ("--", &["", "", ""]),
            ("-en-", &["", "en", ""]),
            ("de_at-u-ca-foobar", &["de", "at", "u", "ca", "foobar"]),
        ];
        for &(input, expected) in cases {
            let actual: Vec<&str> = SubtagIterator::new(input.as_bytes())
                .map(slice_to_str)
                .collect();
            assert_eq!(actual, expected, "input: {:?}", input);
        }
    }

    /// Checks the `(start, end)` ranges computed from index 0 and from separators.
    #[test]
    fn get_current_subtag_test() {
        // (input, idx, expected range)
        let cases: &[(&str, usize, (usize, usize))] = &[
            ("-", 0, (1, 1)),
            ("-en", 0, (1, 3)),
            ("-en-", 3, (4, 4)),
            ("en-", 0, (0, 2)),
            ("en-", 2, (3, 3)),
            ("en--US", 0, (0, 2)),
            ("en--US", 2, (3, 3)),
            ("en--US", 3, (4, 6)),
            ("--", 0, (1, 1)),
            ("--", 1, (2, 2)),
            ("-", 0, (1, 1)),
        ];
        for &(input, idx, expected) in cases {
            let actual = get_current_subtag(input.as_bytes(), idx);
            assert_eq!(actual, expected, "input: {:?}, idx: {}", input, idx);
        }
    }
}
232 | |