1 | use alloc::string::String; |
2 | use core::cmp::Ordering; |
3 | use core::hash::{Hash, Hasher}; |
4 | |
5 | use self::map::lookup; |
6 | mod map; |
7 | |
/// Wrapper that gives a string-like value case-insensitive semantics.
///
/// The trait impls in this file (`PartialEq`, `Ord`, `Hash`) do not look at
/// the wrapped value's raw characters; they run every character through the
/// folding table from the `map` module first.
#[derive(Debug, Default, Clone, Copy)]
pub struct Unicode<S>(pub S);
10 | |
11 | impl<S: AsRef<str>> Unicode<S> { |
12 | pub fn to_folded_case(&self) -> String { |
13 | self.0.as_ref().chars().flat_map(lookup).collect() |
14 | } |
15 | } |
16 | |
17 | impl<S1: AsRef<str>, S2: AsRef<str>> PartialEq<Unicode<S2>> for Unicode<S1> { |
18 | #[inline ] |
19 | fn eq(&self, other: &Unicode<S2>) -> bool { |
20 | let mut left: impl Iterator = self.0.as_ref().chars().flat_map(lookup); |
21 | let mut right: impl Iterator = other.0.as_ref().chars().flat_map(lookup); |
22 | |
23 | // inline Iterator::eq since not added until Rust 1.5 |
24 | loop { |
25 | let x: char = match left.next() { |
26 | None => return right.next().is_none(), |
27 | Some(val: char) => val, |
28 | }; |
29 | |
30 | let y: char = match right.next() { |
31 | None => return false, |
32 | Some(val: char) => val, |
33 | }; |
34 | |
35 | if x != y { |
36 | return false; |
37 | } |
38 | } |
39 | } |
40 | } |
41 | |
42 | impl<S: AsRef<str>> Eq for Unicode<S> {} |
43 | |
44 | impl<T: AsRef<str>> PartialOrd for Unicode<T> { |
45 | #[inline ] |
46 | fn partial_cmp(&self, other: &Self) -> Option<Ordering> { |
47 | Some(self.cmp(other)) |
48 | } |
49 | } |
50 | |
51 | impl<T: AsRef<str>> Ord for Unicode<T> { |
52 | #[inline ] |
53 | fn cmp(&self, other: &Self) -> Ordering { |
54 | let self_chars: impl Iterator = self.0.as_ref().chars().flat_map(lookup); |
55 | let other_chars: impl Iterator = other.0.as_ref().chars().flat_map(lookup); |
56 | self_chars.cmp(other_chars) |
57 | } |
58 | } |
59 | |
60 | impl<S: AsRef<str>> Hash for Unicode<S> { |
61 | #[inline ] |
62 | fn hash<H: Hasher>(&self, hasher: &mut H) { |
63 | let mut buf: [u8; 4] = [0; 4]; |
64 | for c: char in self.0.as_ref().chars().flat_map(|c: char| lookup(orig:c)) { |
65 | let len: usize = char_to_utf8(c, &mut buf); |
66 | // we can't use `write(buf)` because the ASCII variant uses |
67 | // `write_u8`. The docs for Hash say that's technically different. |
68 | // ¯\_(ツ)_/¯ |
69 | for &b: u8 in &buf[..len] { |
70 | hasher.write_u8(b); |
71 | } |
72 | } |
73 | // prefix-freedom |
74 | hasher.write_u8(0xFF); |
75 | } |
76 | } |
77 | |
/// Encodes `c` as UTF-8 into the front of `dst` and returns how many bytes
/// (1 to 4) were written. Bytes of `dst` past the returned length are left
/// untouched.
#[inline]
fn char_to_utf8(c: char, dst: &mut [u8; 4]) -> usize {
    // `char::encode_utf8` lives in `core`, so this works without `std`, and a
    // 4-byte buffer is large enough for any `char`, so it cannot panic here.
    c.encode_utf8(&mut dst[..]).len()
}
106 | |
107 | // internal mod so that the enum can be 'pub' |
108 | // thanks privacy-checker :___( |
mod fold {
    /// The result of folding one character: between zero and three
    /// characters, stored inline.
    ///
    /// `Fold` is its own iterator. Each call to `next` hands out the first
    /// remaining character and shrinks the value to the next smaller variant,
    /// ending at `Zero`.
    #[derive(Clone, Copy)]
    pub enum Fold {
        Zero,
        One(char),
        Two(char, char),
        Three(char, char, char),
    }

    impl Iterator for Fold {
        type Item = char;

        /// Yields the stored characters front to back: `one`, then `two`,
        /// then `three`.
        #[inline]
        fn next(&mut self) -> Option<char> {
            match *self {
                Fold::Zero => None,
                Fold::One(one) => {
                    *self = Fold::Zero;
                    Some(one)
                }
                Fold::Two(one, two) => {
                    *self = Fold::One(two);
                    Some(one)
                }
                Fold::Three(one, two, three) => {
                    // Hand out the *first* character and keep the remaining
                    // two in order, the same way `Two` does. Returning
                    // `three` first would emit the sequence as
                    // three, one, two.
                    *self = Fold::Two(two, three);
                    Some(one)
                }
            }
        }

        /// Exact: the variant encodes how many characters remain.
        #[inline]
        fn size_hint(&self) -> (usize, Option<usize>) {
            match *self {
                Fold::Zero => (0, Some(0)),
                Fold::One(..) => (1, Some(1)),
                Fold::Two(..) => (2, Some(2)),
                Fold::Three(..) => (3, Some(3)),
            }
        }
    }

    // Tuple conversions: a 1-, 2- or 3-tuple of chars becomes the variant of
    // matching arity, preserving element order.

    impl From<(char,)> for Fold {
        #[inline]
        fn from((one,): (char,)) -> Fold {
            Fold::One(one)
        }
    }

    impl From<(char, char)> for Fold {
        #[inline]
        fn from((one, two): (char, char)) -> Fold {
            Fold::Two(one, two)
        }
    }

    impl From<(char, char, char)> for Fold {
        #[inline]
        fn from((one, two, three): (char, char, char)) -> Fold {
            Fold::Three(one, two, three)
        }
    }
}
170 | |
#[cfg(test)]
mod tests {
    use super::Unicode;

    /// Asserts that two strings compare equal once both are wrapped in
    /// `Unicode`.
    macro_rules! eq {
        ($left:expr, $right:expr) => {{
            assert_eq!(Unicode($left), Unicode($right));
        }};
    }

    #[test]
    fn test_ascii_folding() {
        eq!("foo bar", "FoO BAR");
    }

    #[test]
    fn test_simple_case_folding() {
        eq!("στιγμας", "στιγμασ");
    }

    #[test]
    fn test_full_case_folding() {
        // U+FB02 (LATIN SMALL LIGATURE FL) is one character that must fold
        // to the two characters "fl". It is written as an escape so that
        // text normalisation cannot silently turn this into a comparison of
        // two identical ASCII strings.
        eq!("\u{FB02}our", "flour");
        eq!("Maße", "MASSE");
        eq!("ᾲ στο διάολο", "ὰι στο διάολο");
    }

    #[test]
    fn test_to_folded_case() {
        assert_eq!(Unicode("Maße").to_folded_case(), "masse");
    }

    // Benchmarks need the unstable `test` crate, hence the feature gate.
    #[cfg(feature = "nightly")]
    #[bench]
    fn bench_ascii_folding(b: &mut ::test::Bencher) {
        b.bytes = b"foo bar".len() as u64;
        b.iter(|| eq!("foo bar", "FoO BAR"));
    }

    #[cfg(feature = "nightly")]
    #[bench]
    fn bench_simple_case_folding(b: &mut ::test::Bencher) {
        b.bytes = "στιγμας".len() as u64;
        b.iter(|| eq!("στιγμας", "στιγμασ"));
    }
}
217 | |