1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5pub mod errors;
6mod langid;
7mod locale;
8
9pub use errors::ParserError;
10pub use langid::{
11 parse_language_identifier, parse_language_identifier_from_iter,
12 parse_language_identifier_with_single_variant,
13 parse_locale_with_single_variant_single_keyword_unicode_extension_from_iter, ParserMode,
14};
15
16pub use locale::{
17 parse_locale, parse_locale_with_single_variant_single_keyword_unicode_keyword_extension,
18};
19
/// Returns `true` when the byte at `slice[idx]` is a subtag separator (`-` or `_`).
///
/// # Panics
///
/// Panics if `idx >= slice.len()`.
#[inline]
const fn is_separator(slice: &[u8], idx: usize) -> bool {
    // Direct indexing is intentional: an out-of-range `idx` is a caller bug and should panic.
    #[allow(clippy::indexing_slicing)]
    let b = slice[idx];
    matches!(b, b'-' | b'_')
}
26
27const fn get_current_subtag(slice: &[u8], idx: usize) -> (usize, usize) {
28 debug_assert!(idx < slice.len());
29
30 // This function is called only on the idx == 0 or on a separator.
31 let (start: usize, mut end: usize) = if is_separator(slice, idx) {
32 // If it's a separator, set the start to idx+1 and advance the idx to the next char.
33 (idx + 1, idx + 1)
34 } else {
35 // If it's idx=0, start is 0 and end is set to 1
36 debug_assert!(idx == 0);
37 (0, 1)
38 };
39
40 while end < slice.len() && !is_separator(slice, idx:end) {
41 // Advance until we reach end of slice or a separator.
42 end += 1;
43 }
44 // Notice: this slice may be empty (start == end) for cases like `"en-"` or `"en--US"`
45 (start, end)
46}
47
// `SubtagIterator` is a helper iterator for [`LanguageIdentifier`] and [`Locale`] parsing.
//
// Its design is unusual because it is optimized for performance and has to work within the
// limitations Rust places on `const` functions.
//
// The iterator is eager: the next subtag is located before it is requested. Malformed input
// such as `"-"`, `"-en"` or `"en-"` is not skipped; the missing subtag is yielded as an empty
// slice so that the caller can reject it.
//
// The iterator provides methods usable from `const` code - `next_manual` and `peek_manual` -
// as well as the typical `Peekable` iterator APIs - `next` and `peek`.
//
// The `*_manual` methods return an `Option` of a `(start, end)` index pair into the slice;
// `next` and `peek` return an `Option` of the subtag bytes.
#[derive(Copy, Clone, Debug)]
pub struct SubtagIterator<'a> {
    /// The complete input that is being split into subtags.
    pub slice: &'a [u8],
    /// Set once the final subtag has been handed out; from then on `None` is returned.
    done: bool,
    // done + subtag is faster than Option<(usize, usize)>
    // at the time of writing.
    /// `(start, end)` byte range within `slice` of the subtag that will be yielded next.
    subtag: (usize, usize),
}
68
69impl<'a> SubtagIterator<'a> {
70 pub const fn new(slice: &'a [u8]) -> Self {
71 let subtag = if slice.is_empty() || is_separator(slice, 0) {
72 // This returns (0, 0) which returns Some(b"") for slices like `"-en"` or `"-"`
73 (0, 0)
74 } else {
75 get_current_subtag(slice, 0)
76 };
77 Self {
78 slice,
79 done: false,
80 subtag,
81 }
82 }
83
84 pub const fn next_manual(mut self) -> (Self, Option<(usize, usize)>) {
85 if self.done {
86 return (self, None);
87 }
88 let result = self.subtag;
89 if result.1 < self.slice.len() {
90 self.subtag = get_current_subtag(self.slice, result.1);
91 } else {
92 self.done = true;
93 }
94 (self, Some(result))
95 }
96
97 pub const fn peek_manual(&self) -> Option<(usize, usize)> {
98 if self.done {
99 return None;
100 }
101 Some(self.subtag)
102 }
103
104 pub fn peek(&self) -> Option<&'a [u8]> {
105 #[allow(clippy::indexing_slicing)] // peek_manual returns valid indices
106 self.peek_manual().map(|(s, e)| &self.slice[s..e])
107 }
108}
109
110impl<'a> Iterator for SubtagIterator<'a> {
111 type Item = &'a [u8];
112
113 fn next(&mut self) -> Option<Self::Item> {
114 let (s: SubtagIterator<'_>, res: Option<(usize, usize)>) = self.next_manual();
115 *self = s;
116 #[allow(clippy::indexing_slicing)] // next_manual returns valid indices
117 res.map(|(s: usize, e: usize)| &self.slice[s..e])
118 }
119}
120
#[cfg(test)]
mod test {
    use super::*;

    /// Views a subtag as `&str` so that expectations can be written as string literals.
    ///
    /// Panics if `input` is not valid UTF-8.
    fn slice_to_str(input: &[u8]) -> &str {
        std::str::from_utf8(input).unwrap()
    }

    /// Runs a fresh iterator over `input` to exhaustion and returns every subtag it yields.
    fn collect_subtags(input: &str) -> Vec<&str> {
        SubtagIterator::new(input.as_bytes())
            .map(slice_to_str)
            .collect()
    }

    /// `peek` must be repeatable and must not consume the subtag that `next` then returns.
    #[test]
    fn subtag_iterator_peek_test() {
        let slice = "de_at-u-ca-foobar";
        let mut si = SubtagIterator::new(slice.as_bytes());

        assert_eq!(si.peek().map(slice_to_str), Some("de"));
        assert_eq!(si.peek().map(slice_to_str), Some("de"));
        assert_eq!(si.next().map(slice_to_str), Some("de"));

        assert_eq!(si.peek().map(slice_to_str), Some("at"));
        assert_eq!(si.peek().map(slice_to_str), Some("at"));
        assert_eq!(si.next().map(slice_to_str), Some("at"));
    }

    /// Missing subtags (leading, trailing or doubled separators) are yielded as `""`.
    #[test]
    fn subtag_iterator_test() {
        // Empty input and a lone separator both start with an empty subtag.
        for &input in &["", "-"] {
            let mut si = SubtagIterator::new(input.as_bytes());
            assert_eq!(si.next().map(slice_to_str), Some(""), "input: {:?}", input);
        }

        // Full sequence of subtags produced before the iterator returns `None`.
        let cases: &[(&str, &[&str])] = &[
            ("-en", &["", "en"]),
            ("en", &["en"]),
            ("en-", &["en", ""]),
            ("--", &["", "", ""]),
            ("-en-", &["", "en", ""]),
            ("de_at-u-ca-foobar", &["de", "at", "u", "ca", "foobar"]),
        ];
        for &(input, expected) in cases {
            assert_eq!(collect_subtags(input), expected, "input: {:?}", input);
        }

        // Once exhausted, the iterator keeps returning `None`.
        let mut si = SubtagIterator::new("-en".as_bytes());
        assert_eq!(si.next().map(slice_to_str), Some(""));
        assert_eq!(si.next().map(slice_to_str), Some("en"));
        assert_eq!(si.next(), None);
        assert_eq!(si.next(), None);
    }

    /// `get_current_subtag` is only ever called on index 0 or on a separator.
    #[test]
    fn get_current_subtag_test() {
        // (input, index passed in, expected `(start, end)` range)
        let cases: &[(&str, usize, (usize, usize))] = &[
            ("-", 0, (1, 1)),
            ("-en", 0, (1, 3)),
            ("-en-", 3, (4, 4)),
            ("en-", 0, (0, 2)),
            ("en-", 2, (3, 3)),
            ("en--US", 0, (0, 2)),
            ("en--US", 2, (3, 3)),
            ("en--US", 3, (4, 6)),
            ("--", 0, (1, 1)),
            ("--", 1, (2, 2)),
        ];
        for &(input, idx, expected) in cases {
            assert_eq!(
                get_current_subtag(input.as_bytes(), idx),
                expected,
                "input: {:?}, idx: {}",
                input,
                idx
            );
        }
    }
}
232