1 | use core::fmt::{self, Debug}; |
2 | |
/// Runs `$code` once for every value in the inclusive range `$start..=$end`,
/// with the current value bound to the mutable variable `$current`.
///
/// This is a plain `while` loop so it can be used where `for` loops are not
/// available (the file uses it inside a `const` initializer).
///
/// The loop stops *before* incrementing once `$current` has reached `$end`,
/// so a range that ends at the maximum value of the integer type
/// (e.g. `0u8, 255u8`) does not overflow.
///
/// Notes:
/// - The macro expands to statements in the caller's scope, so `$current`
///   stays declared after the loop (holding `$end` if the body ran at all).
/// - `$end` is evaluated exactly once, before the first iteration.
/// - A `continue` inside `$code` would skip the increment.
macro_rules! for_range_inc {
    ($current:ident in $start:expr, $end:expr => $($code:tt)*) => {
        let mut $current = $start;
        let end = $end;

        while $current <= end {
            $($code)*

            // Leave before the increment on the last value; incrementing
            // past `end` would overflow when `end` is the type's maximum.
            if $current == end {
                break;
            }
            $current += 1;
        }
    };
}
15 | |
16 | use core::ops::Range; |
17 | |
/// Classification of a single byte, stored as a bit mask in the wrapped `u8`.
///
/// The associated constants of this type assign the bits, and the `eq`/`ne`
/// methods compare two kinds by checking whether their masks overlap.
#[derive(Clone, Copy)]
struct ByteKind(u8);
20 | |
21 | impl Debug for ByteKind { |
22 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
23 | f.write_str(data:match () { |
24 | _ if self.0 == Self::Other.0 => "Other" , |
25 | _ if self.0 == Self::Number.0 => "Number" , |
26 | _ if self.0 == Self::LowerCase.0 => "LowerCase" , |
27 | _ if self.0 == Self::UpperCase.0 => "UpperCase" , |
28 | _ if self.0 == Self::NonAscii.0 => "NonAscii" , |
29 | _ => unreachable!(), |
30 | }) |
31 | } |
32 | } |
33 | |
34 | #[allow (non_upper_case_globals)] |
35 | impl ByteKind { |
36 | const Other: Self = Self(0b0001); |
37 | const Number: Self = Self(0b0010); |
38 | const LowerCase: Self = Self(0b0100); |
39 | const UpperCase: Self = Self(0b1000); |
40 | const Alphabetic: Self = Self(Self::LowerCase.0 | Self::UpperCase.0); |
41 | // Assumes that non-ascii chars are mostly alphabetic, |
42 | // this should work out fine most of the time. |
43 | const NonAscii: Self = Self(0b1100); |
44 | } |
45 | |
46 | impl ByteKind { |
47 | #[allow (dead_code)] |
48 | #[inline (always)] |
49 | pub const fn eq(self, other: Self) -> bool { |
50 | (self.0 & other.0) != 0 |
51 | } |
52 | |
53 | #[inline (always)] |
54 | pub const fn ne(self, other: Self) -> bool { |
55 | (self.0 & other.0) == 0 |
56 | } |
57 | |
58 | #[inline (always)] |
59 | pub const fn is_alphabetic(self) -> bool { |
60 | self.0 == Self::LowerCase.0 || self.0 == Self::UpperCase.0 |
61 | } |
62 | |
63 | pub const fn is_end_of_word(mut self, prev: Self, other: Self) -> bool { |
64 | if self.0 == Self::NonAscii.0 { |
65 | self = prev; |
66 | } |
67 | |
68 | if self.0 == Self::UpperCase.0 { |
69 | other.ne(Self::Alphabetic) |
70 | } else { |
71 | self.ne(other) |
72 | } |
73 | } |
74 | } |
75 | |
/// Cursor over a byte slice that is advanced one word at a time.
///
/// The value is `Copy`; advancing produces a new cursor rather than
/// mutating in place.
#[derive(Clone, Copy, Debug)]
pub(crate) struct WordIterator<'a> {
    /// Bytes that have not been consumed yet.
    bytes: &'a [u8],
    /// Number of bytes consumed so far, i.e. the offset of `bytes[0]`
    /// from where iteration began.
    start: usize,
}
81 | |
82 | const BYTE_KIND: &[ByteKind; 256] = &{ |
83 | let mut out: [ByteKind; 256] = [ByteKind::NonAscii; 256]; |
84 | |
85 | // Make sure that this goes first |
86 | for_range_inc! {i in 0, 127 => out[i as usize] = ByteKind::Other; } |
87 | for_range_inc! {i in b'A' , b'Z' => out[i as usize] = ByteKind::UpperCase; } |
88 | for_range_inc! {i in b'a' , b'z' => out[i as usize] = ByteKind::LowerCase; } |
89 | for_range_inc! {i in b'0' , b'9' => out[i as usize] = ByteKind::Number; } |
90 | |
91 | out |
92 | }; |
93 | |
94 | impl<'a> WordIterator<'a> { |
95 | pub(crate) const fn new(bytes: &'a [u8]) -> Self { |
96 | Self { bytes, start: 0 } |
97 | } |
98 | |
99 | const fn skip_same_kind(mut self, mut kind: ByteKind) -> (Self, ByteKind) { |
100 | let orig_bytes_len = self.bytes.len(); |
101 | |
102 | let mut prev_kind = kind; |
103 | while let [b, rem @ ..] = self.bytes { |
104 | let next_kind = BYTE_KIND[*b as usize]; |
105 | let cmp = kind.is_end_of_word(prev_kind, next_kind); |
106 | if kind.is_alphabetic() { |
107 | prev_kind = kind; |
108 | } |
109 | kind = next_kind; |
110 | if cmp { |
111 | break; |
112 | } |
113 | self.bytes = rem; |
114 | } |
115 | |
116 | // Advance until a char boundary is found |
117 | while let [b, rem @ ..] = self.bytes { |
118 | if (*b as i8) >= -0x40 { |
119 | break; |
120 | } |
121 | self.bytes = rem; |
122 | } |
123 | |
124 | // Remember not to add return statements to the function |
125 | self.start += orig_bytes_len - self.bytes.len(); |
126 | |
127 | (self, kind) |
128 | } |
129 | |
130 | pub(crate) const fn next(self) -> Option<(Self, Range<usize>)> { |
131 | let (this, fkind) = self.skip_same_kind(ByteKind::Other); |
132 | if let [] = this.bytes { |
133 | None |
134 | } else { |
135 | let (next, _) = this.skip_same_kind(fkind); |
136 | let range = this.start..next.start; |
137 | Some((next, range)) |
138 | } |
139 | } |
140 | } |
141 | |
#[cfg(test)]
mod tests {
    use super::*;

    use arrayvec::ArrayVec;

    /// Splits `text` with `WordIterator` and returns the words as string
    /// slices, stored in a fixed-capacity vector with room for 20 entries.
    fn get_words(text: &str) -> ArrayVec<[&str; 20]> {
        let mut words = <ArrayVec<[&str; 20]>>::new();
        let mut cursor = WordIterator::new(text.as_bytes());

        loop {
            match cursor.next() {
                Some((advanced, span)) => {
                    words.push(&text[span]);
                    cursor = advanced;
                }
                None => break words,
            }
        }
    }

    #[test]
    fn test_word_iter() {
        // Each entry pairs an input with the words expected from it.
        let cases: [(&str, &[&str]); 8] = [
            (
                "01934324ñmaniÑNnFooBar",
                &["01934324", "ñmaniÑ", "Nn", "Foo", "Bar"],
            ),
            (
                "01934 324 ñmani-嶲Nn____FOOOBar",
                &["01934", "324", "ñmani", "嶲Nn", "FOOOBar"],
            ),
            (" 01934 1111 ", &["01934", "1111"]),
            (" 嶲01934 ", &["嶲", "01934"]),
            (" 嶲A01934 ", &["嶲A", "01934"]),
            (" 嶲a01934 ", &["嶲a", "01934"]),
            (" ñA01934 ", &["ñA", "01934"]),
            (" ña01934 ", &["ña", "01934"]),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(get_words(input)[..], expected[..]);
        }
    }
}
185 | |