1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use zerofrom::ZeroFrom; |
6 | use zerovec::{ZeroSlice, ZeroVec}; |
7 | |
8 | // Match-node lead unit values, after masking off intermediate-value bits: |
9 | |
10 | // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise |
11 | // the length is one more than the next byte. |
12 | |
13 | // For a branch sub-node with at most this many entries, we drop down |
14 | // to a linear search. |
15 | const MAX_BRANCH_LINEAR_SUB_NODE_LENGTH: usize = 5; |
16 | |
17 | // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. |
18 | const MIN_LINEAR_MATCH: u16 = 0x30; |
19 | const MAX_LINEAR_MATCH_LENGTH: u16 = 0x10; |
20 | |
21 | // Match-node lead unit bits 14..6 for the optional intermediate value. |
22 | // If these bits are 0, then there is no intermediate value. |
23 | // Otherwise, see the *NodeValue* constants below. |
24 | const MIN_VALUE_LEAD: u16 = MIN_LINEAR_MATCH + MAX_LINEAR_MATCH_LENGTH; // 0x40 |
25 | const NODE_TYPE_MASK: u16 = MIN_VALUE_LEAD - 1; // 0x003f |
26 | |
27 | // A final-value node has bit 15 set. |
28 | const VALUE_IS_FINAL: u16 = 0x8000; |
29 | |
30 | // Compact value: After testing bit 0, shift right by 15 and then use the following thresholds. |
31 | const MAX_ONE_UNIT_VALUE: u16 = 0x3fff; |
32 | |
33 | const MIN_TWO_UNIT_VALUE_LEAD: u16 = MAX_ONE_UNIT_VALUE + 1; // 0x4000 |
34 | |
35 | const MAX_ONE_UNIT_NODE_VALUE: u16 = 0xff; |
36 | |
37 | const MIN_TWO_UNIT_NODE_VALUE_LEAD: u16 = MIN_VALUE_LEAD + ((MAX_ONE_UNIT_NODE_VALUE + 1) << 6); // 0x4040 |
38 | |
39 | const THREE_UNIT_NODE_VALUE_LEAD: u16 = 0x7fc0; |
40 | |
41 | const THREE_UNIT_VALUE_LEAD: u16 = 0x7fff; |
42 | |
43 | // Compact delta integers. |
44 | const MAX_ONE_UNIT_DELTA: u16 = 0xfbff; |
45 | const MIN_TWO_UNIT_DELTA_LEAD: u16 = MAX_ONE_UNIT_DELTA + 1; // 0xfc00 |
46 | const THREE_UNIT_DELTA_LEAD: u16 = 0xffff; |
47 | |
48 | fn skip_value(pos: usize, lead: u16) -> usize { |
49 | if lead < MIN_TWO_UNIT_VALUE_LEAD { |
50 | pos |
51 | } else if lead < THREE_UNIT_VALUE_LEAD { |
52 | pos + 1 |
53 | } else { |
54 | pos + 2 |
55 | } |
56 | } |
57 | |
58 | fn skip_node_value(pos: usize, lead: u16) -> usize { |
59 | if lead < MIN_TWO_UNIT_NODE_VALUE_LEAD { |
60 | pos |
61 | } else if lead < THREE_UNIT_NODE_VALUE_LEAD { |
62 | pos + 1 |
63 | } else { |
64 | pos + 2 |
65 | } |
66 | } |
67 | |
68 | /// This struct represents a de-serialized `Char16Trie` that was exported from |
69 | /// ICU binary data. |
70 | /// |
71 | /// Light-weight, non-const reader class for a `CharsTrie`. Traverses a |
72 | /// char-serialized data structure with minimal state, for mapping 16-bit-unit |
73 | /// sequences to non-negative integer values. |
74 | /// |
75 | /// For more information: |
76 | /// - [ICU4C UCharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UCharsTrie.html) |
77 | /// - [ICU4J CharsTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4j/com/ibm/icu/util/CharsTrie.html) API. |
78 | #[cfg_attr (feature = "serde" , derive(serde::Deserialize, serde::Serialize))] |
79 | #[cfg_attr (feature = "databake" , derive(databake::Bake), databake(path = icu_collections::char16trie))] |
80 | #[derive (Clone, Debug, PartialEq, Eq, ZeroFrom)] |
81 | pub struct Char16Trie<'data> { |
82 | /// An array of u16 containing the trie data. |
83 | #[cfg_attr (feature = "serde" , serde(borrow))] |
84 | #[doc (hidden)] // #2417 |
85 | pub data: ZeroVec<'data, u16>, |
86 | } |
87 | |
88 | impl<'data> Char16Trie<'data> { |
89 | /// Returns a new [`Char16Trie`] with ownership of the provided data. |
90 | pub fn new(data: ZeroVec<'data, u16>) -> Self { |
91 | Self { data } |
92 | } |
93 | |
94 | /// Returns a new [`Char16TrieIterator`] backed by borrowed data from the `trie` data |
95 | pub fn iter(&self) -> Char16TrieIterator { |
96 | Char16TrieIterator::new(&self.data) |
97 | } |
98 | } |
99 | |
100 | /// This struct represents an iterator over a [`Char16Trie`]. |
101 | #[derive (Clone)] |
102 | pub struct Char16TrieIterator<'a> { |
103 | /// A reference to the Char16Trie data to iterate over. |
104 | trie: &'a ZeroSlice<u16>, |
105 | /// Index of next trie unit to read, or `None` if there are no more matches. |
106 | pos: Option<usize>, |
107 | /// Remaining length of a linear-match node, minus 1, or `None` if not in |
108 | /// such a node. |
109 | remaining_match_length: Option<usize>, |
110 | } |
111 | |
112 | /// An enum representing the return value from a lookup in [`Char16Trie`]. |
113 | #[derive (Clone, Copy, Debug, PartialEq)] |
114 | pub enum TrieResult { |
115 | /// The input unit(s) did not continue a matching string. |
116 | /// Once `next()` returns `TrieResult::NoMatch`, all further calls to `next()` |
117 | /// will also return `TrieResult::NoMatch`. |
118 | NoMatch, |
119 | /// The input unit(s) matched a string but there is no value for the string |
120 | /// so far. (It is a prefix of a longer string.) |
121 | NoValue, |
122 | /// The input unit(s) continued a matching string and there is a value for |
123 | /// the string so far. No further input byte/unit can continue a matching |
124 | /// string. |
125 | FinalValue(i32), |
126 | /// The input unit(s) continued a matching string and there is a value for |
127 | /// the string so far. Another input byte/unit can continue a matching |
128 | /// string. |
129 | Intermediate(i32), |
130 | } |
131 | |
132 | // Get the lead surrogate (0xd800..0xdbff) for a |
133 | // supplementary code point (0x10000..0x10ffff). |
134 | // @param supplementary 32-bit code point (U+10000..U+10ffff) |
135 | // @return lead surrogate (U+d800..U+dbff) for supplementary |
136 | fn u16_lead(supplementary: i32) -> u16 { |
137 | (((supplementary) >> 10) + 0xd7c0) as u16 |
138 | } |
139 | |
140 | // Get the trail surrogate (0xdc00..0xdfff) for a |
141 | // supplementary code point (0x10000..0x10ffff). |
142 | // @param supplementary 32-bit code point (U+10000..U+10ffff) |
143 | // @return trail surrogate (U+dc00..U+dfff) for supplementary |
144 | fn u16_tail(supplementary: i32) -> u16 { |
145 | (((supplementary) & 0x3ff) | 0xdc00) as u16 |
146 | } |
147 | |
148 | /// A macro that takes an `Option` argument and either unwraps it if it has a value or |
149 | /// causes the function to return `TrieResult::NoMatch` if there is no value. |
150 | /// This could perhaps be done with `std::ops::Try` once stabilized. |
151 | macro_rules! trie_unwrap { |
152 | ($option:expr) => { |
153 | match $option { |
154 | Some(x) => x, |
155 | None => { |
156 | // Unexpected |
157 | debug_assert!(false); |
158 | return TrieResult::NoMatch; |
159 | } |
160 | } |
161 | }; |
162 | } |
163 | |
164 | impl<'a> Char16TrieIterator<'a> { |
165 | /// Returns a new [`Char16TrieIterator`] backed by borrowed data for the `trie` array |
166 | pub fn new(trie: &'a ZeroSlice<u16>) -> Self { |
167 | Self { |
168 | trie, |
169 | pos: Some(0), |
170 | remaining_match_length: None, |
171 | } |
172 | } |
173 | |
174 | /// Traverses the trie from the current state for this input char. |
175 | /// |
176 | /// # Examples |
177 | /// |
178 | /// ``` |
179 | /// use icu::collections::char16trie::{Char16Trie, TrieResult}; |
180 | /// use zerovec::ZeroVec; |
181 | /// |
182 | /// // A Char16Trie containing the ASCII characters 'a' and 'b'. |
183 | /// let trie_data = [48, 97, 176, 98, 32868]; |
184 | /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data)); |
185 | /// |
186 | /// let mut iter = trie.iter(); |
187 | /// let res = iter.next('a' ); |
188 | /// assert_eq!(res, TrieResult::Intermediate(1)); |
189 | /// let res = iter.next('b' ); |
190 | /// assert_eq!(res, TrieResult::FinalValue(100)); |
191 | /// let res = iter.next('c' ); |
192 | /// assert_eq!(res, TrieResult::NoMatch); |
193 | /// ``` |
194 | pub fn next(&mut self, c: char) -> TrieResult { |
195 | if (c as u32) <= 0xffff { |
196 | self.next16(c as u16) |
197 | } else { |
198 | match self.next16(u16_lead(c as i32)) { |
199 | TrieResult::NoValue | TrieResult::Intermediate(_) => { |
200 | self.next16(u16_tail(c as i32)) |
201 | } |
202 | _ => TrieResult::NoMatch, |
203 | } |
204 | } |
205 | } |
206 | |
207 | /// Traverses the trie from the current state for this input char. |
208 | /// |
209 | /// # Examples |
210 | /// |
211 | /// ``` |
212 | /// use icu::collections::char16trie::{Char16Trie, TrieResult}; |
213 | /// use zerovec::ZeroVec; |
214 | /// |
215 | /// // A Char16Trie containing the ASCII characters 'a' and 'b'. |
216 | /// let trie_data = [48, 97, 176, 98, 32868]; |
217 | /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data)); |
218 | /// |
219 | /// let mut iter = trie.iter(); |
220 | /// let res = iter.next('a' ); |
221 | /// assert_eq!(res, TrieResult::Intermediate(1)); |
222 | /// let res = iter.next('b' ); |
223 | /// assert_eq!(res, TrieResult::FinalValue(100)); |
224 | /// let res = iter.next('c' ); |
225 | /// assert_eq!(res, TrieResult::NoMatch); |
226 | /// ``` |
227 | pub fn next32(&mut self, c: u32) -> TrieResult { |
228 | if c <= 0xffff { |
229 | self.next16(c as u16) |
230 | } else { |
231 | match self.next16(u16_lead(c as i32)) { |
232 | TrieResult::NoValue | TrieResult::Intermediate(_) => { |
233 | self.next16(u16_tail(c as i32)) |
234 | } |
235 | _ => TrieResult::NoMatch, |
236 | } |
237 | } |
238 | } |
239 | |
240 | /// Traverses the trie from the current state for this input char. |
241 | /// |
242 | /// # Examples |
243 | /// |
244 | /// ``` |
245 | /// use icu::collections::char16trie::{Char16Trie, TrieResult}; |
246 | /// use zerovec::ZeroVec; |
247 | /// |
248 | /// // A Char16Trie containing the ASCII characters 'a' and 'b'. |
249 | /// let trie_data = [48, 97, 176, 98, 32868]; |
250 | /// let trie = Char16Trie::new(ZeroVec::from_slice_or_alloc(&trie_data)); |
251 | /// |
252 | /// let mut iter = trie.iter(); |
253 | /// let res = iter.next16('a' as u16); |
254 | /// assert_eq!(res, TrieResult::Intermediate(1)); |
255 | /// let res = iter.next16('b' as u16); |
256 | /// assert_eq!(res, TrieResult::FinalValue(100)); |
257 | /// let res = iter.next16('c' as u16); |
258 | /// assert_eq!(res, TrieResult::NoMatch); |
259 | /// ``` |
260 | pub fn next16(&mut self, c: u16) -> TrieResult { |
261 | let mut pos = match self.pos { |
262 | Some(p) => p, |
263 | None => return TrieResult::NoMatch, |
264 | }; |
265 | if let Some(length) = self.remaining_match_length { |
266 | // Remaining part of a linear-match node |
267 | if c == trie_unwrap!(self.trie.get(pos)) { |
268 | pos += 1; |
269 | self.pos = Some(pos); |
270 | if length == 0 { |
271 | self.remaining_match_length = None; |
272 | let node = trie_unwrap!(self.trie.get(pos)); |
273 | if node >= MIN_VALUE_LEAD { |
274 | return self.value_result(pos); |
275 | } |
276 | } else { |
277 | self.remaining_match_length = Some(length - 1); |
278 | } |
279 | return TrieResult::NoValue; |
280 | } |
281 | self.stop(); |
282 | TrieResult::NoMatch |
283 | } else { |
284 | self.next_impl(pos, c) |
285 | } |
286 | } |
287 | |
288 | fn branch_next(&mut self, pos: usize, length: usize, in_unit: u16) -> TrieResult { |
289 | let mut pos = pos; |
290 | let mut length = length; |
291 | if length == 0 { |
292 | length = trie_unwrap!(self.trie.get(pos)) as usize; |
293 | pos += 1; |
294 | } |
295 | length += 1; |
296 | |
297 | // The length of the branch is the number of units to select from. |
298 | // The data structure encodes a binary search. |
299 | while length > MAX_BRANCH_LINEAR_SUB_NODE_LENGTH { |
300 | if in_unit < trie_unwrap!(self.trie.get(pos)) { |
301 | length >>= 1; |
302 | pos = trie_unwrap!(self.jump_by_delta(pos + 1)); |
303 | } else { |
304 | length = length - (length >> 1); |
305 | pos = trie_unwrap!(self.skip_delta(pos + 1)); |
306 | } |
307 | } |
308 | // Drop down to linear search for the last few bytes. |
309 | // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 |
310 | // and divides length by 2. |
311 | loop { |
312 | if in_unit == trie_unwrap!(self.trie.get(pos)) { |
313 | pos += 1; |
314 | let mut node = trie_unwrap!(self.trie.get(pos)); |
315 | if node & VALUE_IS_FINAL != 0 { |
316 | self.pos = Some(pos); |
317 | return self.value_result(pos); |
318 | } |
319 | // Use the non-final value as the jump delta. |
320 | pos += 1; |
321 | |
322 | if node < MIN_TWO_UNIT_VALUE_LEAD { |
323 | pos += node as usize; |
324 | } else if node < THREE_UNIT_VALUE_LEAD { |
325 | pos += (((node - MIN_TWO_UNIT_VALUE_LEAD) as u32) << 16) as usize |
326 | | trie_unwrap!(self.trie.get(pos)) as usize; |
327 | pos += 1; |
328 | } else { |
329 | pos += (trie_unwrap!(self.trie.get(pos)) as usize) << 16 |
330 | | trie_unwrap!(self.trie.get(pos + 1)) as usize; |
331 | pos += 2; |
332 | } |
333 | node = trie_unwrap!(self.trie.get(pos)); |
334 | self.pos = Some(pos); |
335 | |
336 | if node >= MIN_VALUE_LEAD { |
337 | return self.value_result(pos); |
338 | } |
339 | return TrieResult::NoValue; |
340 | } |
341 | length -= 1; |
342 | pos = trie_unwrap!(self.skip_value(pos + 1)); |
343 | if length <= 1 { |
344 | break; |
345 | } |
346 | } |
347 | |
348 | if in_unit == trie_unwrap!(self.trie.get(pos)) { |
349 | pos += 1; |
350 | self.pos = Some(pos); |
351 | let node = trie_unwrap!(self.trie.get(pos)); |
352 | if node >= MIN_VALUE_LEAD { |
353 | return self.value_result(pos); |
354 | } |
355 | TrieResult::NoValue |
356 | } else { |
357 | self.stop(); |
358 | TrieResult::NoMatch |
359 | } |
360 | } |
361 | |
362 | fn next_impl(&mut self, pos: usize, in_unit: u16) -> TrieResult { |
363 | let mut node = trie_unwrap!(self.trie.get(pos)); |
364 | let mut pos = pos + 1; |
365 | loop { |
366 | if node < MIN_LINEAR_MATCH { |
367 | return self.branch_next(pos, node as usize, in_unit); |
368 | } else if node < MIN_VALUE_LEAD { |
369 | // Match the first of length+1 units. |
370 | let length = node - MIN_LINEAR_MATCH; |
371 | if in_unit == trie_unwrap!(self.trie.get(pos)) { |
372 | pos += 1; |
373 | if length == 0 { |
374 | self.remaining_match_length = None; |
375 | self.pos = Some(pos); |
376 | node = trie_unwrap!(self.trie.get(pos)); |
377 | if node >= MIN_VALUE_LEAD { |
378 | return self.value_result(pos); |
379 | } |
380 | return TrieResult::NoValue; |
381 | } |
382 | self.remaining_match_length = Some(length as usize - 1); |
383 | self.pos = Some(pos); |
384 | return TrieResult::NoValue; |
385 | } |
386 | // No match |
387 | break; |
388 | } else if (node & VALUE_IS_FINAL) != 0 { |
389 | // No further matching units. |
390 | break; |
391 | } else { |
392 | // Skip intermediate value. |
393 | pos = skip_node_value(pos, node); |
394 | node &= NODE_TYPE_MASK; |
395 | } |
396 | } |
397 | self.stop(); |
398 | TrieResult::NoMatch |
399 | } |
400 | |
401 | fn stop(&mut self) { |
402 | self.pos = None; |
403 | } |
404 | |
405 | #[inline (always)] // 1 call site and we want the Option to go away |
406 | fn jump_by_delta(&self, pos: usize) -> Option<usize> { |
407 | let delta = self.trie.get(pos)?; |
408 | let v = if delta < MIN_TWO_UNIT_DELTA_LEAD { |
409 | // nothing to do |
410 | pos + 1 + delta as usize |
411 | } else if delta == THREE_UNIT_DELTA_LEAD { |
412 | let delta = |
413 | ((self.trie.get(pos + 1)? as usize) << 16) | (self.trie.get(pos + 2)? as usize); |
414 | pos + delta + 3 |
415 | } else { |
416 | let delta = ((delta - MIN_TWO_UNIT_DELTA_LEAD) as usize) << 16 |
417 | | (self.trie.get(pos + 1)? as usize); |
418 | pos + delta + 2 |
419 | }; |
420 | Some(v) |
421 | } |
422 | |
423 | #[inline (always)] // 1 call site and we want the Option to go away |
424 | fn skip_value(&self, pos: usize) -> Option<usize> { |
425 | let lead_unit = self.trie.get(pos)?; |
426 | Some(skip_value(pos + 1, lead_unit & 0x7fff)) |
427 | } |
428 | |
429 | #[inline (always)] // 1 call site and we want the Option to go away |
430 | fn skip_delta(&self, pos: usize) -> Option<usize> { |
431 | let delta = self.trie.get(pos)?; |
432 | let v = if delta < MIN_TWO_UNIT_DELTA_LEAD { |
433 | pos + 1 |
434 | } else if delta == THREE_UNIT_DELTA_LEAD { |
435 | pos + 3 |
436 | } else { |
437 | pos + 2 |
438 | }; |
439 | Some(v) |
440 | } |
441 | |
442 | fn value_result(&self, pos: usize) -> TrieResult { |
443 | match self.get_value(pos) { |
444 | Some(result) => result, |
445 | None => { |
446 | // Unexpected |
447 | debug_assert!(false); |
448 | TrieResult::NoMatch |
449 | } |
450 | } |
451 | } |
452 | |
453 | #[inline (always)] // 1 call site and we want the Option to go away |
454 | fn get_value(&self, pos: usize) -> Option<TrieResult> { |
455 | let lead_unit = self.trie.get(pos)?; |
456 | if lead_unit & VALUE_IS_FINAL == VALUE_IS_FINAL { |
457 | self.read_value(pos + 1, lead_unit & 0x7fff) |
458 | .map(TrieResult::FinalValue) |
459 | } else { |
460 | self.read_node_value(pos + 1, lead_unit) |
461 | .map(TrieResult::Intermediate) |
462 | } |
463 | } |
464 | |
465 | #[inline (always)] // 1 call site and we want the Option to go away |
466 | fn read_value(&self, pos: usize, lead_unit: u16) -> Option<i32> { |
467 | let v = if lead_unit < MIN_TWO_UNIT_VALUE_LEAD { |
468 | lead_unit.into() |
469 | } else if lead_unit < THREE_UNIT_VALUE_LEAD { |
470 | ((lead_unit - MIN_TWO_UNIT_VALUE_LEAD) as i32) << 16 | self.trie.get(pos)? as i32 |
471 | } else { |
472 | (self.trie.get(pos)? as i32) << 16 | self.trie.get(pos + 1)? as i32 |
473 | }; |
474 | Some(v) |
475 | } |
476 | |
477 | #[inline (always)] // 1 call site and we want the Option to go away |
478 | fn read_node_value(&self, pos: usize, lead_unit: u16) -> Option<i32> { |
479 | let v = if lead_unit < (MIN_TWO_UNIT_NODE_VALUE_LEAD) { |
480 | ((lead_unit >> 6) - 1).into() |
481 | } else if lead_unit < THREE_UNIT_NODE_VALUE_LEAD { |
482 | (((lead_unit & 0x7fc0) - MIN_TWO_UNIT_NODE_VALUE_LEAD) as i32) << 10 |
483 | | self.trie.get(pos)? as i32 |
484 | } else { |
485 | (self.trie.get(pos)? as i32) << 16 | self.trie.get(pos + 1)? as i32 |
486 | }; |
487 | Some(v) |
488 | } |
489 | } |
490 | |