use regex_automata::{dfa::Automaton, Anchored, Input};

use crate::{
    ext_slice::ByteSlice,
    unicode::fsm::{
        simple_word_fwd::SIMPLE_WORD_FWD, word_break_fwd::WORD_BREAK_FWD,
    },
    utf8,
};

/// An iterator over words in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::words`](trait.ByteSlice.html#method.words).
///
/// This is similar to the [`WordsWithBreaks`](struct.WordsWithBreaks.html)
/// iterator, except it only returns elements that contain a "word" character.
/// A word character is defined by UTS #18 (Annex C) to be the combination
/// of the `Alphabetic` and `Join_Control` properties, along with the
/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
///
/// Since words are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields words in accordance with the default word boundary
/// rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// In particular, this may not be suitable for Japanese and Chinese scripts
/// that do not use spaces between words.
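///
/// # Examples
///
/// An illustrative example (the exact segmentation follows the UAX #29 rules
/// described above): only the word elements are yielded, while punctuation
/// and whitespace between them are skipped.
///
/// ```
/// use bstr::ByteSlice;
///
/// let bs = br#"The quick ("brown") fox can't jump 32.3 feet, right?"#;
/// let words: Vec<&str> = bs.words().collect();
/// assert_eq!(words, vec![
///     "The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right",
/// ]);
/// ```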
#[derive(Clone, Debug)]
pub struct Words<'a>(WordsWithBreaks<'a>);

impl<'a> Words<'a> {
    pub(crate) fn new(bs: &'a [u8]) -> Words<'a> {
        Words(WordsWithBreaks::new(bs))
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"foo bar baz".words();
    ///
    /// assert_eq!(b"foo bar baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b" baz", it.as_bytes());
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.0.as_bytes()
    }
}

impl<'a> Iterator for Words<'a> {
    type Item = &'a str;

    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        for word in self.0.by_ref() {
            let input =
                Input::new(word).anchored(Anchored::Yes).earliest(true);
            if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() {
                return Some(word);
            }
        }
        None
    }
}

/// An iterator over words in a byte string and their byte index positions.
///
/// This iterator is typically constructed by
/// [`ByteSlice::word_indices`](trait.ByteSlice.html#method.word_indices).
///
/// This is similar to the
/// [`WordsWithBreakIndices`](struct.WordsWithBreakIndices.html) iterator,
/// except it only returns elements that contain a "word" character. A
/// word character is defined by UTS #18 (Annex C) to be the combination
/// of the `Alphabetic` and `Join_Control` properties, along with the
/// `Decimal_Number`, `Mark` and `Connector_Punctuation` general categories.
///
/// Since words are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// word yielded with those indices. For example, when this iterator encounters
/// `\xFF` in the byte string, then it will yield a pair of indices ranging
/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
/// which is three bytes in length. However, when given only valid UTF-8, then
/// all indices are in exact correspondence with their paired word.
///
/// This iterator yields words in accordance with the default word boundary
/// rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// In particular, this may not be suitable for Japanese and Chinese scripts
/// that do not use spaces between words.
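///
/// # Examples
///
/// An illustrative example (segmentation per the UAX #29 rules described
/// above); the offsets are byte indices into the original slice:
///
/// ```
/// use bstr::ByteSlice;
///
/// let bs = b"can't jump 32.3 feet";
/// let words: Vec<(usize, usize, &str)> = bs.word_indices().collect();
/// assert_eq!(words, vec![
///     (0, 5, "can't"),
///     (6, 10, "jump"),
///     (11, 15, "32.3"),
///     (16, 20, "feet"),
/// ]);
/// ```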
#[derive(Clone, Debug)]
pub struct WordIndices<'a>(WordsWithBreakIndices<'a>);

impl<'a> WordIndices<'a> {
    pub(crate) fn new(bs: &'a [u8]) -> WordIndices<'a> {
        WordIndices(WordsWithBreakIndices::new(bs))
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"foo bar baz".word_indices();
    ///
    /// assert_eq!(b"foo bar baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b" baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.0.as_bytes()
    }
}

impl<'a> Iterator for WordIndices<'a> {
    type Item = (usize, usize, &'a str);

    #[inline]
    fn next(&mut self) -> Option<(usize, usize, &'a str)> {
        for (start, end, word) in self.0.by_ref() {
            let input =
                Input::new(word).anchored(Anchored::Yes).earliest(true);
            if SIMPLE_WORD_FWD.try_search_fwd(&input).unwrap().is_some() {
                return Some((start, end, word));
            }
        }
        None
    }
}

/// An iterator over all word breaks in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::words_with_breaks`](trait.ByteSlice.html#method.words_with_breaks).
///
/// This iterator yields not only all words, but the content that comes between
/// words. In particular, if all elements yielded by this iterator are
/// concatenated, then the result is the original string (subject to Unicode
/// replacement codepoint substitutions).
///
/// Since words are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields words in accordance with the default word boundary
/// rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// In particular, this may not be suitable for Japanese and Chinese scripts
/// that do not use spaces between words.
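///
/// # Examples
///
/// An illustrative example (segmentation per the UAX #29 rules described
/// above); note that concatenating the yielded elements reproduces the input:
///
/// ```
/// use bstr::ByteSlice;
///
/// let bs = b"can't jump 32.3 feet";
/// let words: Vec<&str> = bs.words_with_breaks().collect();
/// assert_eq!(words, vec![
///     "can't", " ", "jump", " ", "32.3", " ", "feet",
/// ]);
/// assert_eq!(words.concat(), "can't jump 32.3 feet");
/// ```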
#[derive(Clone, Debug)]
pub struct WordsWithBreaks<'a> {
    bs: &'a [u8],
}

impl<'a> WordsWithBreaks<'a> {
    pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreaks<'a> {
        WordsWithBreaks { bs }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"foo bar baz".words_with_breaks();
    ///
    /// assert_eq!(b"foo bar baz", it.as_bytes());
    /// it.next();
    /// assert_eq!(b" bar baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b" baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}

impl<'a> Iterator for WordsWithBreaks<'a> {
    type Item = &'a str;

    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        let (word, size) = decode_word(self.bs);
        if size == 0 {
            return None;
        }
        self.bs = &self.bs[size..];
        Some(word)
    }
}

/// An iterator over all word breaks in a byte string, along with their byte
/// index positions.
///
/// This iterator is typically constructed by
/// [`ByteSlice::words_with_break_indices`](trait.ByteSlice.html#method.words_with_break_indices).
///
/// This iterator yields not only all words, but the content that comes between
/// words. In particular, if all elements yielded by this iterator are
/// concatenated, then the result is the original string (subject to Unicode
/// replacement codepoint substitutions).
///
/// Since words are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// word yielded with those indices. For example, when this iterator encounters
/// `\xFF` in the byte string, then it will yield a pair of indices ranging
/// over a single byte, but will provide an `&str` equivalent to `"\u{FFFD}"`,
/// which is three bytes in length. However, when given only valid UTF-8, then
/// all indices are in exact correspondence with their paired word.
///
/// This iterator yields words in accordance with the default word boundary
/// rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Word_Boundaries).
/// In particular, this may not be suitable for Japanese and Chinese scripts
/// that do not use spaces between words.
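///
/// # Examples
///
/// An illustrative example (segmentation per the UAX #29 rules described
/// above); with valid UTF-8, each pair of offsets spans exactly the bytes of
/// its paired element:
///
/// ```
/// use bstr::ByteSlice;
///
/// let bs = b"can't jump 32.3 feet";
/// let elems: Vec<(usize, usize, &str)> =
///     bs.words_with_break_indices().collect();
/// assert_eq!(elems, vec![
///     (0, 5, "can't"),
///     (5, 6, " "),
///     (6, 10, "jump"),
///     (10, 11, " "),
///     (11, 15, "32.3"),
///     (15, 16, " "),
///     (16, 20, "feet"),
/// ]);
/// ```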
#[derive(Clone, Debug)]
pub struct WordsWithBreakIndices<'a> {
    bs: &'a [u8],
    forward_index: usize,
}

impl<'a> WordsWithBreakIndices<'a> {
    pub(crate) fn new(bs: &'a [u8]) -> WordsWithBreakIndices<'a> {
        WordsWithBreakIndices { bs, forward_index: 0 }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"foo bar baz".words_with_break_indices();
    ///
    /// assert_eq!(b"foo bar baz", it.as_bytes());
    /// it.next();
    /// assert_eq!(b" bar baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b" baz", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}

impl<'a> Iterator for WordsWithBreakIndices<'a> {
    type Item = (usize, usize, &'a str);

    #[inline]
    fn next(&mut self) -> Option<(usize, usize, &'a str)> {
        let index = self.forward_index;
        let (word, size) = decode_word(self.bs);
        if size == 0 {
            return None;
        }
        self.bs = &self.bs[size..];
        self.forward_index += size;
        Some((index, index + size, word))
    }
}

/// Decode a single word segment from the beginning of `bs`.
///
/// This returns the segment as a `&str` along with the number of bytes of
/// `bs` it occupies. If the leading bytes of `bs` are invalid UTF-8, then the
/// segment returned is `"\u{FFFD}"` (which is three bytes long as a `&str`)
/// while the size reflects only the invalid bytes skipped. A size of `0` is
/// returned only when `bs` is empty.
fn decode_word(bs: &[u8]) -> (&str, usize) {
    if bs.is_empty() {
        ("", 0)
    } else if let Some(hm) = {
        let input = Input::new(bs).anchored(Anchored::Yes);
        WORD_BREAK_FWD.try_search_fwd(&input).unwrap()
    } {
        // Safe because a match can only occur for valid UTF-8.
        let word = unsafe { bs[..hm.offset()].to_str_unchecked() };
        (word, word.len())
    } else {
        const INVALID: &str = "\u{FFFD}";
        // No match on non-empty bytes implies we found invalid UTF-8.
        let (_, size) = utf8::decode_lossy(bs);
        (INVALID, size)
    }
}

#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::{vec, vec::Vec};

    #[cfg(not(miri))]
    use ucd_parse::WordBreakTest;

    use crate::ext_slice::ByteSlice;

    #[test]
    #[cfg(not(miri))]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            let given = test.words.concat();
            let got = words(given.as_bytes());
            assert_eq!(
                test.words,
                got,
                "\n\nword forward break test {} failed:\n\
                 given:    {:?}\n\
                 expected: {:?}\n\
                 got:      {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.words),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    //
    // It's pretty amazing that the UCD tests miss these cases. I only found
    // them by running this crate's segmenter and ICU's segmenter on the same
    // text and comparing the output.
    #[test]
    fn forward_additional() {
        assert_eq!(vec!["a", ".", " ", "Y"], words(b"a. Y"));
        assert_eq!(vec!["r", ".", " ", "Yo"], words(b"r. Yo"));
        assert_eq!(
            vec!["whatsoever", ".", " ", "You", " ", "may"],
            words(b"whatsoever. You may")
        );
        assert_eq!(
            vec!["21stcentury'syesterday"],
            words(b"21stcentury'syesterday")
        );

        assert_eq!(vec!["Bonta_", "'", "s"], words(b"Bonta_'s"));
        assert_eq!(vec!["_vhat's"], words(b"_vhat's"));
        assert_eq!(vec!["__on'anima"], words(b"__on'anima"));
        assert_eq!(vec!["123_", "'", "4"], words(b"123_'4"));
        assert_eq!(vec!["_123'4"], words(b"_123'4"));
        assert_eq!(vec!["__12'345"], words(b"__12'345"));

        assert_eq!(
            vec!["tomorrowat4", ":", "00", ","],
            words(b"tomorrowat4:00,")
        );
        assert_eq!(vec!["RS1", "'", "s"], words(b"RS1's"));
        assert_eq!(vec!["X38"], words(b"X38"));

        assert_eq!(vec!["4abc", ":", "00", ","], words(b"4abc:00,"));
        assert_eq!(vec!["12S", "'", "1"], words(b"12S'1"));
        assert_eq!(vec!["1XY"], words(b"1XY"));

        assert_eq!(vec!["\u{FEFF}", "Ты"], words("\u{FEFF}Ты".as_bytes()));

        // Tests that Vithkuqi works, which was introduced in Unicode 14.
        // This test fails prior to Unicode 14.
        assert_eq!(
            vec!["\u{10570}\u{10597}"],
            words("\u{10570}\u{10597}".as_bytes())
        );
    }

    fn words(bytes: &[u8]) -> Vec<&str> {
        bytes.words_with_breaks().collect()
    }

    #[cfg(not(miri))]
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        strs.iter().map(|s| s.as_ref().as_bytes()).collect()
    }

    /// Return all of the UCD for word breaks.
    #[cfg(not(miri))]
    fn ucdtests() -> Vec<WordBreakTest> {
        const TESTDATA: &str = include_str!("data/WordBreakTest.txt");

        let mut tests = vec![];
        for mut line in TESTDATA.lines() {
            line = line.trim();
            if line.starts_with("#") || line.contains("surrogate") {
                continue;
            }
            tests.push(line.parse().unwrap());
        }
        tests
    }
}