word_iterator.rs source code [crates/const_format-0.2.32/src/__ascii_case_conv/word_iterator.rs]

1	use core::fmt::{self, Debug};
2
/// Runs `$code` once for every value of `$current` in the inclusive range
/// `$start ..= $end`.
///
/// Written with `loop`/`if` instead of `for` so that it also works inside
/// `const` initializers, where `for` loops are not available.
///
/// The loop exits *before* stepping past `$end`, so an `$end` equal to the
/// maximum of the integer type (e.g. `255u8`) does not overflow. If `$start`
/// is greater than `$end` the body never runs.
macro_rules! for_range_inc {
    ($current:ident in $start:expr, $end:expr => $($code:tt)*) => {
        let mut $current = $start;
        let end = $end;

        if $current <= end {
            loop {
                $($code)*

                // Stop on the last value instead of incrementing beyond it.
                if $current >= end {
                    break;
                }
                $current += 1;
            }
        }
    };
}
15
16	use core::ops::Range;
17
/// Classification of a single byte, stored as a bit flag so that two kinds can
/// be tested for overlap with one bitwise AND.
#[derive(Copy, Clone)]
struct ByteKind(u8);

impl Debug for ByteKind {
    /// Writes the name of the constant whose bit pattern equals `self`.
    ///
    /// `Alphabetic` and `NonAscii` share the pattern `0b1100`, so both are
    /// printed as `"NonAscii"`. A pattern that matches none of the named
    /// constants is printed as its raw bits instead of panicking.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match () {
            _ if self.0 == Self::Other.0 => "Other",
            _ if self.0 == Self::Number.0 => "Number",
            _ if self.0 == Self::LowerCase.0 => "LowerCase",
            _ if self.0 == Self::UpperCase.0 => "UpperCase",
            _ if self.0 == Self::NonAscii.0 => "NonAscii",
            _ => return write!(f, "ByteKind({:#06b})", self.0),
        };
        f.write_str(name)
    }
}

// The constants are named like enum variants on purpose.
#[allow(non_upper_case_globals)]
impl ByteKind {
    const Other: Self = Self(0b0001);
    const Number: Self = Self(0b0010);
    const LowerCase: Self = Self(0b0100);
    const UpperCase: Self = Self(0b1000);
    /// Mask that overlaps both `LowerCase` and `UpperCase`.
    const Alphabetic: Self = Self(Self::LowerCase.0 | Self::UpperCase.0);
    // Assumes that non-ascii chars are mostly alphabetic,
    // this should work out fine most of the time.
    // (Same bit pattern as `Alphabetic`.)
    const NonAscii: Self = Self(0b1100);
}

impl ByteKind {
    /// Returns `true` when `self` and `other` share at least one bit.
    ///
    /// This is an overlap test, not strict equality: `LowerCase` "equals"
    /// `Alphabetic`, for example.
    #[allow(dead_code)]
    #[inline(always)]
    pub const fn eq(self, other: Self) -> bool {
        (self.0 & other.0) != 0
    }

    /// Returns `true` when `self` and `other` share no bits.
    #[inline(always)]
    pub const fn ne(self, other: Self) -> bool {
        (self.0 & other.0) == 0
    }

    /// Returns `true` only for exactly `LowerCase` or exactly `UpperCase`;
    /// `NonAscii` and the `Alphabetic` mask are not matched.
    #[inline(always)]
    pub const fn is_alphabetic(self) -> bool {
        self.0 == Self::LowerCase.0 || self.0 == Self::UpperCase.0
    }

    /// Decides whether a word ends between a byte of kind `self` and the
    /// following byte of kind `other`.
    ///
    /// * If `self` is `NonAscii`, `prev` is used in its place.
    /// * After an `UpperCase` byte the word ends only if `other` does not
    ///   overlap `Alphabetic` (so `"Fo"` stays together).
    /// * Otherwise the word ends when the two kinds share no bits.
    pub const fn is_end_of_word(mut self, prev: Self, other: Self) -> bool {
        if self.0 == Self::NonAscii.0 {
            self = prev;
        }

        if self.0 == Self::UpperCase.0 {
            other.ne(Self::Alphabetic)
        } else {
            self.ne(other)
        }
    }
}
75
/// Cursor over a byte slice that yields the byte ranges of successive words.
///
/// The value is `Copy`; advancing produces a new cursor rather than mutating
/// the old one.
#[derive(Debug, Copy, Clone)]
pub(crate) struct WordIterator<'a> {
    /// Bytes that have not been consumed yet.
    bytes: &'a [u8],
    /// Number of bytes consumed so far, i.e. the offset of `bytes` within the
    /// slice the iterator was created from.
    start: usize,
}
81
82	const BYTE_KIND: &[ByteKind; `256`] = &{
83	let mut out: [ByteKind; 256] = [ByteKind::NonAscii; `256`];
84
85	// Make sure that this goes first
86	for_range_inc! {i in `0`, `127` => out[i as usize] = ByteKind::Other; }
87	for_range_inc! {i in b'A', b'Z' => out[i as usize] = ByteKind::UpperCase; }
88	for_range_inc! {i in b'a', b'z' => out[i as usize] = ByteKind::LowerCase; }
89	for_range_inc! {i in b'0', b'9' => out[i as usize] = ByteKind::Number; }
90
91	out
92	};
93
94	impl<'a> WordIterator<'a> {
95	pub(crate) const fn new(bytes: &'a [u8]) -> Self {
96	Self { bytes, start: `0` }
97	}
98
99	const fn skip_same_kind(mut self, mut kind: ByteKind) -> (Self, ByteKind) {
100	let orig_bytes_len = self.bytes.len();
101
102	let mut prev_kind = kind;
103	while let [b, rem @ ..] = self.bytes {
104	let next_kind = BYTE_KIND[b as usize*];
105	let cmp = kind.is_end_of_word(prev_kind, next_kind);
106	if kind.is_alphabetic() {
107	prev_kind = kind;
108	}
109	kind = next_kind;
110	if cmp {
111	break;
112	}
113	self.bytes = rem;
114	}
115
116	// Advance until a char boundary is found
117	while let [b, rem @ ..] = self.bytes {
118	if (*b as i8) >= `-0x40` {
119	break;
120	}
121	self.bytes = rem;
122	}
123
124	// Remember not to add return statements to the function
125	self.start += orig_bytes_len - self.bytes.len();
126
127	(self, kind)
128	}
129
130	pub(crate) const fn next(self) -> Option<(Self, Range<usize>)> {
131	let (this, fkind) = self.skip_same_kind(ByteKind::Other);
132	if let [] = this.bytes {
133	None
134	} else {
135	let (next, _) = this.skip_same_kind(fkind);
136	let range = this.start..next.start;
137	Some((next, range))
138	}
139	}
140	}
141
142	#[cfg(test)]
143	mod tests {
144	use super::*;
145
146	use arrayvec::ArrayVec;
147
148	fn get_words(text: &str) -> ArrayVec<[&str; `20`]> {
149	let mut list = <ArrayVec<[&str; `20`]>>::new();
150	let mut word_iter = WordIterator::new(text.as_bytes());
151
152	while let Some((niter, word_range)) = word_iter.next() {
153	word_iter = niter;
154	list.push(&text[word_range]);
155	}
156
157	list
158	}
159
160	#[test]
161	fn test_word_iter() {
162	assert_eq!(
163	get_words("01934324ñmaniÑNnFooBar")[..],
164	["01934324", "ñmaniÑ", "Nn", "Foo", "Bar"],
165	);
166
167	assert_eq!(
168	get_words("01934 324 ñmani-嶲Nn____FOOOBar")[..],
169	["01934", "324", "ñmani", "嶲Nn", "FOOOBar"],
170	);
171
172	assert_eq!(get_words(" 01934 1111 ")[..], ["01934", "1111"],);
173
174	assert_eq!(get_words(" 嶲01934 ")[..], ["嶲", "01934"],);
175
176	assert_eq!(get_words(" 嶲A01934 ")[..], ["嶲A", "01934"],);
177
178	assert_eq!(get_words(" 嶲a01934 ")[..], ["嶲a", "01934"],);
179
180	assert_eq!(get_words(" ñA01934 ")[..], ["ñA", "01934"],);
181
182	assert_eq!(get_words(" ña01934 ")[..], ["ña", "01934"],);
183	}
184	}
185