word.rs source code [crates/bstr-1.5.0/src/unicode/word.rs]

1	use regex_automata::DFA;
2
3	use crate::{
4	ext_slice::ByteSlice,
5	unicode::fsm::{
6	simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
7	},
8	utf8,
9	};
10
11	/// An iterator over words in a byte string.
12	///
13	/// This iterator is typically constructed by
14	/// [`ByteSlice::words`](trait.ByteSlice.html#method.words).
15	///
16	/// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html)
17	/// iterator, except it only returns elements that contain a "word" character.
18	/// A word character is defined by UTS #18 (Annex C) to be the combination
19	/// of the `Alphabetic` and `Join_Control` properties, along with the
20	/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
21	///
22	/// Since words are made up of one or more codepoints, this iterator yields
23	/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
24	/// are [substituted](index.html#handling-of-invalid-utf-8).
25	///
26	/// This iterator yields words in accordance with the default word boundary
27	/// rules specified in
28	/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
29	/// In particular, this may not be suitable for Japanese and Chinese scripts
30	/// that do not use spaces between words.
31	#[derive(Clone, Debug)]
32	pub struct Words<'a>(WordsWithBreaks<'a>);
33
34	impl<'a> Words<'a> {
35	pub(crate) fn new(bs: &'a [u8]) -> Words<'a> {
36	Words(WordsWithBreaks::new(bs))
37	}
38
39	/// View the underlying data as a subslice of the original data.
40	///
41	/// The slice returned has the same lifetime as the original slice, and so
42	/// the iterator can continue to be used while this exists.
43	///
44	/// # Examples
45	///
46	/// ```
47	/// use bstr::ByteSlice;
48	///
49	/// let mut it = b"foo bar baz".words();
50	///
51	/// assert_eq!(b"foo bar baz", it.as_bytes());
52	/// it.next();
53	/// it.next();
54	/// assert_eq!(b" baz", it.as_bytes());
55	/// it.next();
56	/// assert_eq!(b"", it.as_bytes());
57	/// ```
58	#[inline]
59	pub fn as_bytes(&self) -> &'a [u8] {
60	self.0.as_bytes()
61	}
62	}
63
64	impl<'a> Iterator for Words<'a> {
65	type Item = &'a str;
66
67	#[inline]
68	fn next(&mut self) -> Option<&'a str> {
69	while let Some(word: &str) = self.0.next() {
70	if SIMPLE_WORD_FWD.is_match(word.as_bytes()) {
71	return Some(word);
72	}
73	}
74	None
75	}
76	}
77
78	/// An iterator over words in a byte string and their byte index positions.
79	///
80	/// This iterator is typically constructed by
81	/// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices).
82	///
83	/// This is similar to the
84	/// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator,
85	/// except it only returns elements that contain a "word" character. A
86	/// word character is defined by UTS #18 (Annex C) to be the combination
87	/// of the `Alphabetic` and `Join_Control` properties, along with the
88	/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
89	///
90	/// Since words are made up of one or more codepoints, this iterator
91	/// yields `&str` elements (along with their start and end byte offsets).
92	/// When invalid UTF-8 is encountered, replacement codepoints are
93	/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
94	/// indices yielded by this iterator may not correspond to the length of the
95	/// word yielded with those indices. For example, when this iterator encounters
96	/// `\xFF` in the byte string, then it will yield a pair of indices ranging
97	/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
98	/// which is three bytes in length. However, when given only valid UTF-8, then
99	/// all indices are in exact correspondence with their paired word.
100	///
101	/// This iterator yields words in accordance with the default word boundary
102	/// rules specified in
103	/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
104	/// In particular, this may not be suitable for Japanese and Chinese scripts
105	/// that do not use spaces between words.
106	#[derive(Clone, Debug)]
107	pub struct WordIndices<'a>(WordsWithBreakIndices<'a>);
108
109	impl<'a> WordIndices<'a> {
110	pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> {
111	WordIndices(WordsWithBreakIndices::new(bs))
112	}
113
114	/// View the underlying data as a subslice of the original data.
115	///
116	/// The slice returned has the same lifetime as the original slice, and so
117	/// the iterator can continue to be used while this exists.
118	///
119	/// # Examples
120	///
121	/// ```
122	/// use bstr::ByteSlice;
123	///
124	/// let mut it = b"foo bar baz".word_indices();
125	///
126	/// assert_eq!(b"foo bar baz", it.as_bytes());
127	/// it.next();
128	/// it.next();
129	/// assert_eq!(b" baz", it.as_bytes());
130	/// it.next();
131	/// it.next();
132	/// assert_eq!(b"", it.as_bytes());
133	/// ```
134	#[inline]
135	pub fn as_bytes(&self) -> &'a [u8] {
136	self.0.as_bytes()
137	}
138	}
139
140	impl<'a> Iterator for WordIndices<'a> {
141	type Item = (usize, usize, &'a str);
142
143	#[inline]
144	fn next(&mut self) -> Option<(usize, usize, &'a str)> {
145	while let Some((start: usize, end: usize, word: &str)) = self.0.next() {
146	if SIMPLE_WORD_FWD.is_match(word.as_bytes()) {
147	return Some((start, end, word));
148	}
149	}
150	None
151	}
152	}
153
154	/// An iterator over all word breaks in a byte string.
155	///
156	/// This iterator is typically constructed by
157	/// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks).
158	///
159	/// This iterator yields not only all words, but the content that comes between
160	/// words. In particular, if all elements yielded by this iterator are
161	/// concatenated, then the result is the original string (subject to Unicode
162	/// replacement codepoint substitutions).
163	///
164	/// Since words are made up of one or more codepoints, this iterator yields
165	/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
166	/// are [substituted](index.html#handling-of-invalid-utf-8).
167	///
168	/// This iterator yields words in accordance with the default word boundary
169	/// rules specified in
170	/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
171	/// In particular, this may not be suitable for Japanese and Chinese scripts
172	/// that do not use spaces between words.
173	#[derive(Clone, Debug)]
174	pub struct WordsWithBreaks<'a> {
175	bs: &'a [u8],
176	}
177
178	impl<'a> WordsWithBreaks<'a> {
179	pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> {
180	WordsWithBreaks { bs }
181	}
182
183	/// View the underlying data as a subslice of the original data.
184	///
185	/// The slice returned has the same lifetime as the original slice, and so
186	/// the iterator can continue to be used while this exists.
187	///
188	/// # Examples
189	///
190	/// ```
191	/// use bstr::ByteSlice;
192	///
193	/// let mut it = b"foo bar baz".words_with_breaks();
194	///
195	/// assert_eq!(b"foo bar baz", it.as_bytes());
196	/// it.next();
197	/// assert_eq!(b" bar baz", it.as_bytes());
198	/// it.next();
199	/// it.next();
200	/// assert_eq!(b" baz", it.as_bytes());
201	/// it.next();
202	/// it.next();
203	/// assert_eq!(b"", it.as_bytes());
204	/// ```
205	#[inline]
206	pub fn as_bytes(&self) -> &'a [u8] {
207	self.bs
208	}
209	}
210
211	impl<'a> Iterator for WordsWithBreaks<'a> {
212	type Item = &'a str;
213
214	#[inline]
215	fn next(&mut self) -> Option<&'a str> {
216	let (word: &str, size: usize) = decode_word(self.bs);
217	if size == `0` {
218	return None;
219	}
220	self.bs = &self.bs[size..];
221	Some(word)
222	}
223	}
224
225	/// An iterator over all word breaks in a byte string, along with their byte
226	/// index positions.
227	///
228	/// This iterator is typically constructed by
229	/// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices).
230	///
231	/// This iterator yields not only all words, but the content that comes between
232	/// words. In particular, if all elements yielded by this iterator are
233	/// concatenated, then the result is the original string (subject to Unicode
234	/// replacement codepoint substitutions).
235	///
236	/// Since words are made up of one or more codepoints, this iterator
237	/// yields `&str` elements (along with their start and end byte offsets).
238	/// When invalid UTF-8 is encountered, replacement codepoints are
239	/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
240	/// indices yielded by this iterator may not correspond to the length of the
241	/// word yielded with those indices. For example, when this iterator encounters
242	/// `\xFF` in the byte string, then it will yield a pair of indices ranging
243	/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
244	/// which is three bytes in length. However, when given only valid UTF-8, then
245	/// all indices are in exact correspondence with their paired word.
246	///
247	/// This iterator yields words in accordance with the default word boundary
248	/// rules specified in
249	/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
250	/// In particular, this may not be suitable for Japanese and Chinese scripts
251	/// that do not use spaces between words.
252	#[derive(Clone, Debug)]
253	pub struct WordsWithBreakIndices<'a> {
254	bs: &'a [u8],
255	forward_index: usize,
256	}
257
258	impl<'a> WordsWithBreakIndices<'a> {
259	pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
260	WordsWithBreakIndices { bs, forward_index: `0` }
261	}
262
263	/// View the underlying data as a subslice of the original data.
264	///
265	/// The slice returned has the same lifetime as the original slice, and so
266	/// the iterator can continue to be used while this exists.
267	///
268	/// # Examples
269	///
270	/// ```
271	/// use bstr::ByteSlice;
272	///
273	/// let mut it = b"foo bar baz".words_with_break_indices();
274	///
275	/// assert_eq!(b"foo bar baz", it.as_bytes());
276	/// it.next();
277	/// assert_eq!(b" bar baz", it.as_bytes());
278	/// it.next();
279	/// it.next();
280	/// assert_eq!(b" baz", it.as_bytes());
281	/// it.next();
282	/// it.next();
283	/// assert_eq!(b"", it.as_bytes());
284	/// ```
285	#[inline]
286	pub fn as_bytes(&self) -> &'a [u8] {
287	self.bs
288	}
289	}
290
291	impl<'a> Iterator for WordsWithBreakIndices<'a> {
292	type Item = (usize, usize, &'a str);
293
294	#[inline]
295	fn next(&mut self) -> Option<(usize, usize, &'a str)> {
296	let index: usize = self.forward_index;
297	let (word: &str, size: usize) = decode_word(self.bs);
298	if size == `0` {
299	return None;
300	}
301	self.bs = &self.bs[size..];
302	self.forward_index += size;
303	Some((index, index + size, word))
304	}
305	}
306
307	fn decode_word(bs: &[u8]) -> (&str, usize) {
308	if bs.is_empty() {
309	("", `0`)
310	} else if let Some(end: usize) = WORD_BREAK_FWD.find(bytes:bs) {
311	// Safe because a match can only occur for valid UTF-8.
312	let word: &str = unsafe { bs[..end].to_str_unchecked() };
313	(word, word.len())
314	} else {
315	const INVALID: &'static str = "`\u{FFFD}`";
316	// No match on non-empty bytes implies we found invalid UTF-8.
317	let (_, size: usize) = utf8::decode_lossy(slice:bs);
318	(INVALID, size)
319	}
320	}
321
322	#[cfg(all(test, feature = "std"))]
323	mod tests {
324	#[cfg(not(miri))]
325	use ucd_parse::WordBreakTest;
326
327	use crate::ext_slice::ByteSlice;
328
329	#[test]
330	#[cfg(not(miri))]
331	fn forward_ucd() {
332	for (i, test) in ucdtests().into_iter().enumerate() {
333	let given = test.words.concat();
334	let got = words(given.as_bytes());
335	assert_eq!(
336	test.words,
337	got,
338	"`\n\n`word forward break test {} failed:`\n`\
339	given: {:?}`\n`\
340	expected: {:?}`\n`\
341	got: {:?}`\n`",
342	i,
343	given,
344	strs_to_bstrs(&test.words),
345	strs_to_bstrs(&got),
346	);
347	}
348	}
349
350	// Some additional tests that don't seem to be covered by the UCD tests.
351	//
352	// It's pretty amazing that the UCD tests miss these cases. I only found
353	// them by running this crate's segmenter and ICU's segmenter on the same
354	// text and comparing the output.
355	#[test]
356	fn forward_additional() {
357	assert_eq!(vec!["a", ".", " ", "Y"], words(b"a. Y"));
358	assert_eq!(vec!["r", ".", " ", "Yo"], words(b"r. Yo"));
359	assert_eq!(
360	vec!["whatsoever", ".", " ", "You", " ", "may"],
361	words(b"whatsoever. You may")
362	);
363	assert_eq!(
364	vec!["21stcentury'syesterday"],
365	words(b"21stcentury'syesterday")
366	);
367
368	assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s"));
369	assert_eq!(vec!["_vhat's"], words(b"_vhat's"));
370	assert_eq!(vec!["__on'anima"], words(b"__on'anima"));
371	assert_eq!(vec!["123_", "'", "4"], words(b"123_'4"));
372	assert_eq!(vec!["_123'4"], words(b"_123'4"));
373	assert_eq!(vec!["__12'345"], words(b"__12'345"));
374
375	assert_eq!(
376	vec!["tomorrowat4", ":", "00", ","],
377	words(b"tomorrowat4:00,")
378	);
379	assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's"));
380	assert_eq!(vec!["X38"], words(b"X38"));
381
382	assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,"));
383	assert_eq!(vec!["12S", "'", "1"], words(b"12S'1"));
384	assert_eq!(vec!["1XY"], words(b"1XY"));
385
386	assert_eq!(vec!["`\u{FEFF}`", "Ты"], words("`\u{FEFF}`Ты".as_bytes()));
387
388	// Tests that Vithkuqi works, which was introduced in Unicode 14.
389	// This test fails prior to Unicode 14.
390	assert_eq!(
391	vec!["`\u{10570}\u{10597}`"],
392	words("`\u{10570}\u{10597}`".as_bytes())
393	);
394	}
395
396	fn words(bytes: &[u8]) -> Vec<&str> {
397	bytes.words_with_breaks().collect()
398	}
399
400	#[cfg(not(miri))]
401	fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
402	strs.iter().map(\|s\| s.as_ref().as_bytes()).collect()
403	}
404
405	/// Return all of the UCD for word breaks.
406	#[cfg(not(miri))]
407	fn ucdtests() -> Vec<WordBreakTest> {
408	const TESTDATA: &'static str = include_str!("data/WordBreakTest.txt");
409
410	let mut tests = vec![];
411	for mut line in TESTDATA.lines() {
412	line = line.trim();
413	if line.starts_with("#") \|\| line.contains("surrogate") {
414	continue;
415	}
416	tests.push(line.parse().unwrap());
417	}
418	tests
419	}
420	}
421