1use core::fmt::{self, Debug};
2
/// Runs `$code` once for every value of `$current` in the inclusive range
/// `$start ..= $end`, using only constructs that are usable in `const` contexts
/// (where `for` loops over ranges are not available).
///
/// The expansion is a sequence of statements, not a block: `$current` is
/// declared as a `let mut` binding in the caller's scope and stays visible
/// after the loop. `$end` is evaluated exactly once, before the first
/// iteration. If `$start > $end` the body never runs.
///
/// The counter is never incremented past `$end`, so a range that ends at the
/// maximum value of its integer type (e.g. `0u8, 255u8`) does not overflow.
/// After a loop that ran to completion, `$current` is left equal to `$end`.
macro_rules! for_range_inc {
    ($current:ident in $start:expr, $end:expr => $($code:tt)*) => {
        let mut $current = $start;
        let end = $end;
        // Tracked separately from the counter so that the final iteration does
        // not need an increment that could overflow.
        let mut in_range = $current <= end;

        while in_range {
            $($code)*

            if $current < end {
                $current += 1;
            } else {
                in_range = false;
            }
        }
    };
}
15
16use core::ops::Range;
17
/// Classification of a single byte, stored as a bit flag.
///
/// Kinds are compared with a bitwise AND (see [`ByteKind::eq`] and
/// [`ByteKind::ne`]), so a kind whose value has several bits set
/// "matches" every kind it overlaps with.
#[derive(Copy, Clone)]
struct ByteKind(u8);

impl Debug for ByteKind {
    /// Writes the name of the constant whose bits equal `self`.
    ///
    /// `Alphabetic` has the same bits as `NonAscii` and is therefore printed
    /// as `"NonAscii"`.
    ///
    /// # Panics
    ///
    /// Panics if the value matches none of the named constants below.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // `ByteKind` does not derive `PartialEq`/`Eq`, so its constants cannot
        // be used as patterns; the raw bits are compared in guards instead.
        f.write_str(match () {
            _ if self.0 == Self::Other.0 => "Other",
            _ if self.0 == Self::Number.0 => "Number",
            _ if self.0 == Self::LowerCase.0 => "LowerCase",
            _ if self.0 == Self::UpperCase.0 => "UpperCase",
            _ if self.0 == Self::NonAscii.0 => "NonAscii",
            _ => unreachable!(),
        })
    }
}

// The constants are deliberately named like enum variants.
#[allow(non_upper_case_globals)]
impl ByteKind {
    const Other: Self = Self(0b0001);
    const Number: Self = Self(0b0010);
    const LowerCase: Self = Self(0b0100);
    const UpperCase: Self = Self(0b1000);
    /// Union of the `LowerCase` and `UpperCase` bits.
    const Alphabetic: Self = Self(Self::LowerCase.0 | Self::UpperCase.0);
    // Assumes that non-ascii chars are mostly alphabetic,
    // this should work out fine most of the time.
    const NonAscii: Self = Self(0b1100);
}

impl ByteKind {
    /// Returns `true` if `self` and `other` have at least one bit in common.
    ///
    /// This is an overlap test, not an exact equality test.
    // Not called by the rest of this file; kept as the counterpart of `ne`.
    #[allow(dead_code)]
    #[inline(always)]
    pub const fn eq(self, other: Self) -> bool {
        (self.0 & other.0) != 0
    }

    /// Returns `true` if `self` and `other` have no bit in common.
    #[inline(always)]
    pub const fn ne(self, other: Self) -> bool {
        (self.0 & other.0) == 0
    }

    /// Returns `true` only for exactly `LowerCase` or exactly `UpperCase`.
    ///
    /// `NonAscii` overlaps both of them bitwise but is *not* reported as
    /// alphabetic here.
    #[inline(always)]
    pub const fn is_alphabetic(self) -> bool {
        self.0 == Self::LowerCase.0 || self.0 == Self::UpperCase.0
    }

    /// Decides whether a word ends between a byte of kind `self` and the
    /// following byte of kind `other`.
    ///
    /// - If `self` is `NonAscii`, `prev` is used in its place.
    /// - An `UpperCase` byte ends the word only when `other` is not
    ///   alphabetic (does not overlap `Alphabetic`), so an uppercase byte may
    ///   be followed by either case.
    /// - Any other kind ends the word when `other` shares no bits with it.
    pub const fn is_end_of_word(mut self, prev: Self, other: Self) -> bool {
        if self.0 == Self::NonAscii.0 {
            self = prev;
        }

        if self.0 == Self::UpperCase.0 {
            other.ne(Self::Alphabetic)
        } else {
            self.ne(other)
        }
    }
}
75
/// Cursor over a byte slice that is advanced one word at a time.
#[derive(Clone, Copy, Debug)]
pub(crate) struct WordIterator<'a> {
    /// The bytes that have not been consumed yet.
    bytes: &'a [u8],
    /// Number of bytes consumed so far, i.e. the offset of `bytes` within the
    /// slice the iterator was created from.
    start: usize,
}
81
82const BYTE_KIND: &[ByteKind; 256] = &{
83 let mut out: [ByteKind; 256] = [ByteKind::NonAscii; 256];
84
85 // Make sure that this goes first
86 for_range_inc! {i in 0, 127 => out[i as usize] = ByteKind::Other; }
87 for_range_inc! {i in b'A', b'Z' => out[i as usize] = ByteKind::UpperCase; }
88 for_range_inc! {i in b'a', b'z' => out[i as usize] = ByteKind::LowerCase; }
89 for_range_inc! {i in b'0', b'9' => out[i as usize] = ByteKind::Number; }
90
91 out
92};
93
94impl<'a> WordIterator<'a> {
95 pub(crate) const fn new(bytes: &'a [u8]) -> Self {
96 Self { bytes, start: 0 }
97 }
98
99 const fn skip_same_kind(mut self, mut kind: ByteKind) -> (Self, ByteKind) {
100 let orig_bytes_len = self.bytes.len();
101
102 let mut prev_kind = kind;
103 while let [b, rem @ ..] = self.bytes {
104 let next_kind = BYTE_KIND[*b as usize];
105 let cmp = kind.is_end_of_word(prev_kind, next_kind);
106 if kind.is_alphabetic() {
107 prev_kind = kind;
108 }
109 kind = next_kind;
110 if cmp {
111 break;
112 }
113 self.bytes = rem;
114 }
115
116 // Advance until a char boundary is found
117 while let [b, rem @ ..] = self.bytes {
118 if (*b as i8) >= -0x40 {
119 break;
120 }
121 self.bytes = rem;
122 }
123
124 // Remember not to add return statements to the function
125 self.start += orig_bytes_len - self.bytes.len();
126
127 (self, kind)
128 }
129
130 pub(crate) const fn next(self) -> Option<(Self, Range<usize>)> {
131 let (this, fkind) = self.skip_same_kind(ByteKind::Other);
132 if let [] = this.bytes {
133 None
134 } else {
135 let (next, _) = this.skip_same_kind(fkind);
136 let range = this.start..next.start;
137 Some((next, range))
138 }
139 }
140}
141
#[cfg(test)]
mod tests {
    use super::*;

    use arrayvec::ArrayVec;

    /// Collects every word that `WordIterator` finds in `text` (at most 20).
    fn get_words(text: &str) -> ArrayVec<[&str; 20]> {
        let mut words = ArrayVec::<[&str; 20]>::new();
        let mut cursor = WordIterator::new(text.as_bytes());

        loop {
            match cursor.next() {
                Some((advanced, span)) => {
                    cursor = advanced;
                    words.push(&text[span]);
                }
                None => break words,
            }
        }
    }

    #[test]
    fn test_word_iter() {
        // Each case pairs an input with the words expected from it, in order.
        let cases: &[(&str, &[&str])] = &[
            (
                "01934324ñmaniÑNnFooBar",
                &["01934324", "ñmaniÑ", "Nn", "Foo", "Bar"],
            ),
            (
                "01934 324 ñmani-嶲Nn____FOOOBar",
                &["01934", "324", "ñmani", "嶲Nn", "FOOOBar"],
            ),
            (" 01934 1111 ", &["01934", "1111"]),
            (" 嶲01934 ", &["嶲", "01934"]),
            (" 嶲A01934 ", &["嶲A", "01934"]),
            (" 嶲a01934 ", &["嶲a", "01934"]),
            (" ñA01934 ", &["ñA", "01934"]),
            (" ña01934 ", &["ña", "01934"]),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(get_words(input)[..], expected[..]);
        }
    }
}
185