1 | /*! |
2 | The ucd-trie crate provides a compressed trie set specifically tailored for |
3 | Unicode codepoints. The principle use case for such a trie is to represent |
4 | properties defined by Unicode that correspond to sets of Unicode codepoints. |
5 | (These properties are formally called boolean properties or "single valued" |
6 | properties. See |
7 | [UTR#23 S3.3](https://www.unicode.org/reports/tr23/#PropertyTypeDefinitions) |
8 | for more details.) |
9 | |
10 | This crate has two principle types: `TrieSetOwned` and `TrieSetSlice`, |
11 | corresponding to a similar split as there is between `Vec<T>` and `&[T]`. |
12 | `TrieSetOwned` is the only way to construct a trie from a set of Unicode |
13 | codepoints. |
14 | |
15 | The intended use of this library is to embed a static instance of |
16 | `TrieSetSlice` into your source code, and then use its methods as defined in |
17 | this crate to test membership. (The `ucd-generate` tool can likely generate |
18 | this code for you.) |
19 | |
20 | Finally, while this crate uses the standard library by default, it provides |
21 | `no_std` functionality by disabling the `std` feature. When `no_std` is |
22 | enabled, then `TrieSetOwned` is not provided. Instead, only `TrieSetSlice` is |
23 | provided, which means `no_std` crates can still embed tries into their code. |
24 | */ |
25 | |
26 | #![deny (missing_docs)] |
27 | #![cfg_attr (not(feature = "std" ), no_std)] |
28 | |
29 | use core::fmt; |
30 | |
31 | #[cfg (feature = "std" )] |
32 | pub use crate::owned::{Error, Result, TrieSetOwned}; |
33 | |
34 | #[cfg (test)] |
35 | #[allow (dead_code)] |
36 | mod general_category; |
37 | #[cfg (feature = "std" )] |
38 | mod owned; |
39 | |
40 | const CHUNK_SIZE: usize = 64; |
41 | |
42 | /// A type alias for `TrieSetSlice<'static>`. |
43 | pub type TrieSet = TrieSetSlice<'static>; |
44 | |
45 | /// A borrowed trie set. |
46 | #[derive (Clone, Copy)] |
47 | pub struct TrieSetSlice<'a> { |
48 | /// first tree, one level |
49 | #[doc (hidden)] |
50 | pub tree1_level1: &'a [u64], |
51 | /// second tree, first level |
52 | #[doc (hidden)] |
53 | pub tree2_level1: &'a [u8], |
54 | /// second tree, second level |
55 | #[doc (hidden)] |
56 | pub tree2_level2: &'a [u64], |
57 | /// third tree, first level |
58 | #[doc (hidden)] |
59 | pub tree3_level1: &'a [u8], |
60 | /// third tree, second level |
61 | #[doc (hidden)] |
62 | pub tree3_level2: &'a [u8], |
63 | /// third tree, third level |
64 | #[doc (hidden)] |
65 | pub tree3_level3: &'a [u64], |
66 | } |
67 | |
68 | impl<'a> fmt::Debug for TrieSetSlice<'a> { |
69 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
70 | write!(f, "TrieSetSlice(...)" ) |
71 | } |
72 | } |
73 | |
74 | impl<'a> TrieSetSlice<'a> { |
75 | /// Returns true if and only if the given Unicode scalar value is in this |
76 | /// set. |
77 | pub fn contains_char(&self, c: char) -> bool { |
78 | self.contains(c as usize) |
79 | } |
80 | |
81 | /// Returns true if and only if the given codepoint is in this set. |
82 | /// |
83 | /// If the given value exceeds the codepoint range (i.e., it's greater |
84 | /// than `0x10FFFF`), then this returns false. |
85 | pub fn contains_u32(&self, cp: u32) -> bool { |
86 | if cp > 0x10FFFF { |
87 | return false; |
88 | } |
89 | self.contains(cp as usize) |
90 | } |
91 | |
92 | #[inline (always)] |
93 | fn contains(&self, cp: usize) -> bool { |
94 | if cp < 0x800 { |
95 | self.chunk_contains(cp, self.tree1_level1[cp >> 6]) |
96 | } else if cp < 0x10000 { |
97 | let leaf = match self.tree2_level1.get((cp >> 6) - 0x20) { |
98 | None => return false, |
99 | Some(&leaf) => leaf, |
100 | }; |
101 | self.chunk_contains(cp, self.tree2_level2[leaf as usize]) |
102 | } else { |
103 | let child = match self.tree3_level1.get((cp >> 12) - 0x10) { |
104 | None => return false, |
105 | Some(&child) => child, |
106 | }; |
107 | let i = ((child as usize) * CHUNK_SIZE) + ((cp >> 6) & 0b111111); |
108 | let leaf = self.tree3_level2[i]; |
109 | self.chunk_contains(cp, self.tree3_level3[leaf as usize]) |
110 | } |
111 | } |
112 | |
113 | #[inline (always)] |
114 | fn chunk_contains(&self, cp: usize, chunk: u64) -> bool { |
115 | ((chunk >> (cp & 0b111111)) & 1) == 1 |
116 | } |
117 | } |
118 | |