sentence.rs source code [crates/bstr-1.5.0/src/unicode/sentence.rs]

1	use regex_automata::DFA;
2
3	use crate::{
4	ext_slice::ByteSlice,
5	unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
6	};
7
/// An iterator over sentences in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct Sentences<'a> {
    /// The bytes that have not been yielded yet.
    bs: &'a [u8],
}

impl<'a> Sentences<'a> {
    /// Create an iterator over the sentences in `bs`.
    pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> {
        Sentences { bs }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"I want this. Not that. Right now.".sentences();
    ///
    /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
    /// it.next();
    /// assert_eq!(b"Not that. Right now.", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
56
57	impl<'a> Iterator for Sentences<'a> {
58	type Item = &'a str;
59
60	#[inline]
61	fn next(&mut self) -> Option<&'a str> {
62	let (sentence: &str, size: usize) = decode_sentence(self.bs);
63	if size == `0` {
64	return None;
65	}
66	self.bs = &self.bs[size..];
67	Some(sentence)
68	}
69	}
70
/// An iterator over sentences in a byte string, along with their byte offsets.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// sentence yielded with those indices. For example, when this iterator
/// encounters `\xFF` in the byte string, then it will yield a pair of indices
/// ranging over a single byte, but will provide an `&str` equivalent to
/// `"\u{FFFD}"`, which is three bytes in length. However, when given only
/// valid UTF-8, then all indices are in exact correspondence with their paired
/// sentence.
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct SentenceIndices<'a> {
    /// The bytes that have not been yielded yet.
    bs: &'a [u8],
    /// Total number of bytes consumed so far, i.e. the offset of `bs`
    /// relative to the slice this iterator was created with.
    forward_index: usize,
}

impl<'a> SentenceIndices<'a> {
    /// Create an iterator over the sentences in `bs`, with offsets starting
    /// at zero.
    pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
        SentenceIndices { bs, forward_index: 0 }
    }

    /// View the underlying data as a subslice of the original data.
    ///
    /// The slice returned has the same lifetime as the original slice, and so
    /// the iterator can continue to be used while this exists.
    ///
    /// # Examples
    ///
    /// ```
    /// use bstr::ByteSlice;
    ///
    /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
    ///
    /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
    /// it.next();
    /// assert_eq!(b"Not that. Right now.", it.as_bytes());
    /// it.next();
    /// it.next();
    /// assert_eq!(b"", it.as_bytes());
    /// ```
    #[inline]
    pub fn as_bytes(&self) -> &'a [u8] {
        self.bs
    }
}
128
129	impl<'a> Iterator for SentenceIndices<'a> {
130	type Item = (usize, usize, &'a str);
131
132	#[inline]
133	fn next(&mut self) -> Option<(usize, usize, &'a str)> {
134	let index: usize = self.forward_index;
135	let (word: &str, size: usize) = decode_sentence(self.bs);
136	if size == `0` {
137	return None;
138	}
139	self.bs = &self.bs[size..];
140	self.forward_index += size;
141	Some((index, index + size, word))
142	}
143	}
144
145	fn decode_sentence(bs: &[u8]) -> (&str, usize) {
146	if bs.is_empty() {
147	("", `0`)
148	} else if let Some(end: usize) = SENTENCE_BREAK_FWD.find(bytes:bs) {
149	// Safe because a match can only occur for valid UTF-8.
150	let sentence: &str = unsafe { bs[..end].to_str_unchecked() };
151	(sentence, sentence.len())
152	} else {
153	const INVALID: &'static str = "`\u{FFFD}`";
154	// No match on non-empty bytes implies we found invalid UTF-8.
155	let (_, size: usize) = utf8::decode_lossy(slice:bs);
156	(INVALID, size)
157	}
158	}
159
#[cfg(all(test, feature = "std"))]
mod tests {
    #[cfg(not(miri))]
    use ucd_parse::SentenceBreakTest;

    use crate::ext_slice::ByteSlice;

    /// Check the forward sentence iterator against every case loaded by
    /// `ucdtests`: concatenating a test's sentences and re-splitting them must
    /// give back the same sentences.
    #[test]
    #[cfg(not(miri))]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            let given = test.sentences.concat();
            let got = sentences(given.as_bytes());
            assert_eq!(
                test.sentences,
                got,
                "\n\nsentence forward break test {} failed:\n\
                 given: {:?}\n\
                 expected: {:?}\n\
                 got: {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.sentences),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    #[test]
    fn forward_additional() {
        assert_eq!(vec!["a.. ", "A"], sentences(b"a.. A"));
        assert_eq!(vec!["a.. a"], sentences(b"a.. a"));

        assert_eq!(vec!["a... ", "A"], sentences(b"a... A"));
        assert_eq!(vec!["a... a"], sentences(b"a... a"));

        assert_eq!(vec!["a...,..., a"], sentences(b"a...,..., a"));
    }

    /// Collect every sentence yielded for `bytes`.
    fn sentences(bytes: &[u8]) -> Vec<&str> {
        bytes.sentences().collect()
    }

    /// View each string as its raw bytes, for use in failure messages.
    #[cfg(not(miri))]
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        strs.iter().map(|s| s.as_ref().as_bytes()).collect()
    }

    /// Return all of the UCD for sentence breaks.
    ///
    /// Lines starting with `#` and lines mentioning "surrogate" are skipped;
    /// every other line must parse as a `SentenceBreakTest`.
    #[cfg(not(miri))]
    fn ucdtests() -> Vec<SentenceBreakTest> {
        const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt");

        let mut tests = vec![];
        for mut line in TESTDATA.lines() {
            line = line.trim();
            if line.starts_with("#") || line.contains("surrogate") {
                continue;
            }
            tests.push(line.parse().unwrap());
        }
        tests
    }
}
226