| 1 | use core::fmt::{self, Debug}; |
| 2 | |
/// Runs `$code` once for every value of `$current` from `$start` through
/// `$end`, both bounds included.
///
/// The expansion only uses `let`, `if` and `loop`, so it can be used inside
/// `const` initializers, where `for` loops are not available.
///
/// Notes:
/// - `$start` and `$end` are each evaluated exactly once, in that order.
/// - `$current` is declared with `let mut` directly in the caller's scope,
///   so it stays visible after the expansion (holding the last value it
///   took, or `$start` if the range was empty).
/// - The counter is never incremented past `$end`, which means an `$end`
///   equal to the maximum of the integer type (e.g. `255u8`) does not
///   overflow.
macro_rules! for_range_inc {
    ($current:ident in $start:expr, $end:expr => $($code:tt)*) => {
        let mut $current = $start;
        let end = $end;

        if $current <= end {
            loop {
                $($code)*

                // Stop *before* incrementing, so that `$current` never
                // has to represent `end + 1`.
                if $current >= end {
                    break;
                }
                $current += 1;
            }
        }
    };
}
| 15 | |
| 16 | use core::ops::Range; |
| 17 | |
/// Classification of a single byte, stored as a small bit mask in a `u8`.
///
/// The named masks are the associated constants of this type; comparisons
/// between kinds are done on the raw bits (`self.0`).
#[derive(Clone, Copy)]
struct ByteKind(u8);
| 20 | |
| 21 | impl Debug for ByteKind { |
| 22 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
| 23 | f.write_str(data:match () { |
| 24 | _ if self.0 == Self::Other.0 => "Other" , |
| 25 | _ if self.0 == Self::Number.0 => "Number" , |
| 26 | _ if self.0 == Self::LowerCase.0 => "LowerCase" , |
| 27 | _ if self.0 == Self::UpperCase.0 => "UpperCase" , |
| 28 | _ if self.0 == Self::NonAscii.0 => "NonAscii" , |
| 29 | _ => unreachable!(), |
| 30 | }) |
| 31 | } |
| 32 | } |
| 33 | |
| 34 | #[allow (non_upper_case_globals)] |
| 35 | impl ByteKind { |
| 36 | const Other: Self = Self(0b0001); |
| 37 | const Number: Self = Self(0b0010); |
| 38 | const LowerCase: Self = Self(0b0100); |
| 39 | const UpperCase: Self = Self(0b1000); |
| 40 | const Alphabetic: Self = Self(Self::LowerCase.0 | Self::UpperCase.0); |
| 41 | // Assumes that non-ascii chars are mostly alphabetic, |
| 42 | // this should work out fine most of the time. |
| 43 | const NonAscii: Self = Self(0b1100); |
| 44 | } |
| 45 | |
| 46 | impl ByteKind { |
| 47 | #[allow (dead_code)] |
| 48 | #[inline (always)] |
| 49 | pub const fn eq(self, other: Self) -> bool { |
| 50 | (self.0 & other.0) != 0 |
| 51 | } |
| 52 | |
| 53 | #[inline (always)] |
| 54 | pub const fn ne(self, other: Self) -> bool { |
| 55 | (self.0 & other.0) == 0 |
| 56 | } |
| 57 | |
| 58 | #[inline (always)] |
| 59 | pub const fn is_alphabetic(self) -> bool { |
| 60 | self.0 == Self::LowerCase.0 || self.0 == Self::UpperCase.0 |
| 61 | } |
| 62 | |
| 63 | pub const fn is_end_of_word(mut self, prev: Self, other: Self) -> bool { |
| 64 | if self.0 == Self::NonAscii.0 { |
| 65 | self = prev; |
| 66 | } |
| 67 | |
| 68 | if self.0 == Self::UpperCase.0 { |
| 69 | other.ne(Self::Alphabetic) |
| 70 | } else { |
| 71 | self.ne(other) |
| 72 | } |
| 73 | } |
| 74 | } |
| 75 | |
/// Cursor that walks over the words of a byte string.
///
/// The cursor is `Copy`; advancing it (see `next`) takes it by value and
/// hands back the advanced cursor instead of mutating in place.
#[derive(Clone, Copy, Debug)]
pub(crate) struct WordIterator<'a> {
    /// Bytes that have not been consumed yet.
    bytes: &'a [u8],
    /// Number of bytes consumed so far, i.e. the offset of `bytes` within
    /// the slice the cursor was created from.
    start: usize,
}
| 81 | |
| 82 | const BYTE_KIND: &[ByteKind; 256] = &{ |
| 83 | let mut out: [ByteKind; 256] = [ByteKind::NonAscii; 256]; |
| 84 | |
| 85 | // Make sure that this goes first |
| 86 | for_range_inc! {i in 0, 127 => out[i as usize] = ByteKind::Other; } |
| 87 | for_range_inc! {i in b'A' , b'Z' => out[i as usize] = ByteKind::UpperCase; } |
| 88 | for_range_inc! {i in b'a' , b'z' => out[i as usize] = ByteKind::LowerCase; } |
| 89 | for_range_inc! {i in b'0' , b'9' => out[i as usize] = ByteKind::Number; } |
| 90 | |
| 91 | out |
| 92 | }; |
| 93 | |
| 94 | impl<'a> WordIterator<'a> { |
| 95 | pub(crate) const fn new(bytes: &'a [u8]) -> Self { |
| 96 | Self { bytes, start: 0 } |
| 97 | } |
| 98 | |
| 99 | const fn skip_same_kind(mut self, mut kind: ByteKind) -> (Self, ByteKind) { |
| 100 | let orig_bytes_len = self.bytes.len(); |
| 101 | |
| 102 | let mut prev_kind = kind; |
| 103 | while let [b, rem @ ..] = self.bytes { |
| 104 | let next_kind = BYTE_KIND[*b as usize]; |
| 105 | let cmp = kind.is_end_of_word(prev_kind, next_kind); |
| 106 | if kind.is_alphabetic() { |
| 107 | prev_kind = kind; |
| 108 | } |
| 109 | kind = next_kind; |
| 110 | if cmp { |
| 111 | break; |
| 112 | } |
| 113 | self.bytes = rem; |
| 114 | } |
| 115 | |
| 116 | // Advance until a char boundary is found |
| 117 | while let [b, rem @ ..] = self.bytes { |
| 118 | if (*b as i8) >= -0x40 { |
| 119 | break; |
| 120 | } |
| 121 | self.bytes = rem; |
| 122 | } |
| 123 | |
| 124 | // Remember not to add return statements to the function |
| 125 | self.start += orig_bytes_len - self.bytes.len(); |
| 126 | |
| 127 | (self, kind) |
| 128 | } |
| 129 | |
| 130 | pub(crate) const fn next(self) -> Option<(Self, Range<usize>)> { |
| 131 | let (this, fkind) = self.skip_same_kind(ByteKind::Other); |
| 132 | if let [] = this.bytes { |
| 133 | None |
| 134 | } else { |
| 135 | let (next, _) = this.skip_same_kind(fkind); |
| 136 | let range = this.start..next.start; |
| 137 | Some((next, range)) |
| 138 | } |
| 139 | } |
| 140 | } |
| 141 | |
#[cfg(test)]
mod tests {
    use super::*;

    use arrayvec::ArrayVec;

    /// Collects every word that `WordIterator` finds in `text`.
    ///
    /// The result is a fixed-capacity vector of 20 entries.
    fn get_words(text: &str) -> ArrayVec<[&str; 20]> {
        let mut words = ArrayVec::<[&str; 20]>::new();
        let mut cursor = WordIterator::new(text.as_bytes());

        loop {
            let (advanced, span) = match cursor.next() {
                Some(step) => step,
                None => break,
            };
            cursor = advanced;
            words.push(&text[span]);
        }

        words
    }

    #[test]
    fn test_word_iter() {
        // (input, expected words)
        let cases: [(&str, &[&str]); 8] = [
            (
                "01934324ñmaniÑNnFooBar",
                &["01934324", "ñmaniÑ", "Nn", "Foo", "Bar"],
            ),
            (
                "01934 324 ñmani-嶲Nn____FOOOBar",
                &["01934", "324", "ñmani", "嶲Nn", "FOOOBar"],
            ),
            (" 01934 1111 ", &["01934", "1111"]),
            (" 嶲01934 ", &["嶲", "01934"]),
            (" 嶲A01934 ", &["嶲A", "01934"]),
            (" 嶲a01934 ", &["嶲a", "01934"]),
            (" ñA01934 ", &["ñA", "01934"]),
            (" ña01934 ", &["ña", "01934"]),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(get_words(input)[..], expected[..], "input: {:?}", input);
        }
    }
}
| 185 | |