sentence.rs source code [crates/bstr/src/unicode/sentence.rs]

1	use regex_automata::{dfa::Automaton, Anchored, Input};
2
3	use crate::{
4	ext_slice::ByteSlice,
5	unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
6	};
7
/// An iterator over sentences in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct Sentences<'a> {
    /// The bytes that have not been yielded yet.
    bs: &'a [u8],
}

impl<'a> Sentences<'a> {
    /// Create a sentence iterator positioned at the start of `bs`.
    pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> {
        Sentences { bs }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"I want this. Not that. Right now.".sentences();
    ///
    /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
    /// it.next();
    /// assert_eq!(b"Not that. Right now.", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
56
57	impl<'a> Iterator for Sentences<'a> {
58	type Item = &'a str;
59
60	#[inline]
61	fn next(&mut self) -> Option<&'a str> {
62	let (sentence: &str, size: usize) = decode_sentence(self.bs);
63	if size == `0` {
64	return None;
65	}
66	self.bs = &self.bs[size..];
67	Some(sentence)
68	}
69	}
70
/// An iterator over sentences in a byte string, along with their byte offsets.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// sentence yielded with those indices. For example, when this iterator
/// encounters `\xFF` in the byte string, then it will yield a pair of indices
/// ranging over a single byte, but will provide an `&str` equivalent to
/// `"\u{FFFD}"`, which is three bytes in length. However, when given only
/// valid UTF-8, then all indices are in exact correspondence with their paired
/// sentence.
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct SentenceIndices<'a> {
    /// The bytes that have not been yielded yet.
    bs: &'a [u8],
    /// Offset of `bs` relative to the start of the original byte string.
    forward_index: usize,
}

impl<'a> SentenceIndices<'a> {
    /// Create a sentence-index iterator positioned at offset zero of `bs`.
    pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
        SentenceIndices { bs, forward_index: 0 }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
    ///
    /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
    /// it.next();
    /// assert_eq!(b"Not that. Right now.", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
128
129	impl<'a> Iterator for SentenceIndices<'a> {
130	type Item = (usize, usize, &'a str);
131
132	#[inline]
133	fn next(&mut self) -> Option<(usize, usize, &'a str)> {
134	let index: usize = self.forward_index;
135	let (word: &str, size: usize) = decode_sentence(self.bs);
136	if size == `0` {
137	return None;
138	}
139	self.bs = &self.bs[size..];
140	self.forward_index += size;
141	Some((index, index + size, word))
142	}
143	}
144
145	fn decode_sentence(bs: &[u8]) -> (&str, usize) {
146	if bs.is_empty() {
147	("", `0`)
148	} else if let Some(hm: HalfMatch) = {
149	let input: Input<'_> = Input::new(bs).anchored(mode:Anchored::Yes);
150	SENTENCE_BREAK_FWD.try_search_fwd(&input).unwrap()
151	} {
152	// Safe because a match can only occur for valid UTF-8.
153	let sentence: &str = unsafe { bs[..hm.offset()].to_str_unchecked() };
154	(sentence, sentence.len())
155	} else {
156	const INVALID: &str = "`\u{FFFD}`";
157	// No match on non-empty bytes implies we found invalid UTF-8.
158	let (_, size: usize) = utf8::decode_lossy(slice:bs);
159	(INVALID, size)
160	}
161	}
162
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::{vec, vec::Vec};

    #[cfg(not(miri))]
    use ucd_parse::SentenceBreakTest;

    use crate::ext_slice::ByteSlice;

    /// Run every applicable UCD sentence-break test case through the forward
    /// iterator and compare against the expected segmentation.
    #[test]
    #[cfg(not(miri))]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            let given = test.sentences.concat();
            let got = sentences(given.as_bytes());
            assert_eq!(
                test.sentences,
                got,
                "\n\nsentence forward break test {} failed:\n\
                 given: {:?}\n\
                 expected: {:?}\n\
                 got: {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.sentences),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    #[test]
    fn forward_additional() {
        assert_eq!(vec!["a.. ", "A"], sentences(b"a.. A"));
        assert_eq!(vec!["a.. a"], sentences(b"a.. a"));

        assert_eq!(vec!["a... ", "A"], sentences(b"a... A"));
        assert_eq!(vec!["a... a"], sentences(b"a... a"));

        assert_eq!(vec!["a...,..., a"], sentences(b"a...,..., a"));
    }

    /// Collect every sentence of `bytes` into a vector.
    fn sentences(bytes: &[u8]) -> Vec<&str> {
        bytes.sentences().collect()
    }

    /// View each string as raw bytes, for use in failure messages.
    #[cfg(not(miri))]
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        strs.iter().map(|s| s.as_ref().as_bytes()).collect()
    }

    /// Return all of the UCD for sentence breaks.
    ///
    /// Lines starting with `#` and lines mentioning "surrogate" are skipped.
    #[cfg(not(miri))]
    fn ucdtests() -> Vec<SentenceBreakTest> {
        const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt");

        let mut tests = vec![];
        for mut line in TESTDATA.lines() {
            line = line.trim();
            if line.starts_with('#') || line.contains("surrogate") {
                continue;
            }
            tests.push(line.parse().unwrap());
        }
        tests
    }
}
230