1use regex_automata::DFA;
2
3use crate::{
4 ext_slice::ByteSlice,
5 unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8,
6};
7
/// An iterator over sentences in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct Sentences<'a> {
    /// The bytes that have not yet been yielded as sentences.
    bs: &'a [u8],
}
26
27impl<'a> Sentences<'a> {
28 pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> {
29 Sentences { bs }
30 }
31
32 /// View the underlying data as a subslice of the original data.
33 ///
34 /// The slice returned has the same lifetime as the original slice, and so
35 /// the iterator can continue to be used while this exists.
36 ///
37 /// # Examples
38 ///
39 /// ```
40 /// use bstr::ByteSlice;
41 ///
42 /// let mut it = b"I want this. Not that. Right now.".sentences();
43 ///
44 /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
45 /// it.next();
46 /// assert_eq!(b"Not that. Right now.", it.as_bytes());
47 /// it.next();
48 /// it.next();
49 /// assert_eq!(b"", it.as_bytes());
50 /// ```
51 #[inline]
52 pub fn as_bytes(&self) -> &'a [u8] {
53 self.bs
54 }
55}
56
57impl<'a> Iterator for Sentences<'a> {
58 type Item = &'a str;
59
60 #[inline]
61 fn next(&mut self) -> Option<&'a str> {
62 let (sentence: &str, size: usize) = decode_sentence(self.bs);
63 if size == 0 {
64 return None;
65 }
66 self.bs = &self.bs[size..];
67 Some(sentence)
68 }
69}
70
/// An iterator over sentences in a byte string, along with their byte offsets.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// sentence yielded with those indices. For example, when this iterator
/// encounters `\xFF` in the byte string, then it will yield a pair of indices
/// ranging over a single byte, but will provide an `&str` equivalent to
/// `"\u{FFFD}"`, which is three bytes in length. However, when given only
/// valid UTF-8, then all indices are in exact correspondence with their paired
/// sentence.
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct SentenceIndices<'a> {
    /// The bytes that have not yet been yielded as sentences.
    bs: &'a [u8],
    /// Total number of bytes consumed so far; this is the start offset of the
    /// next sentence.
    forward_index: usize,
}
98
99impl<'a> SentenceIndices<'a> {
100 pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> {
101 SentenceIndices { bs, forward_index: 0 }
102 }
103
104 /// View the underlying data as a subslice of the original data.
105 ///
106 /// The slice returned has the same lifetime as the original slice, and so
107 /// the iterator can continue to be used while this exists.
108 ///
109 /// # Examples
110 ///
111 /// ```
112 /// use bstr::ByteSlice;
113 ///
114 /// let mut it = b"I want this. Not that. Right now.".sentence_indices();
115 ///
116 /// assert_eq!(&b"I want this. Not that. Right now."[..], it.as_bytes());
117 /// it.next();
118 /// assert_eq!(b"Not that. Right now.", it.as_bytes());
119 /// it.next();
120 /// it.next();
121 /// assert_eq!(b"", it.as_bytes());
122 /// ```
123 #[inline]
124 pub fn as_bytes(&self) -> &'a [u8] {
125 self.bs
126 }
127}
128
129impl<'a> Iterator for SentenceIndices<'a> {
130 type Item = (usize, usize, &'a str);
131
132 #[inline]
133 fn next(&mut self) -> Option<(usize, usize, &'a str)> {
134 let index: usize = self.forward_index;
135 let (word: &str, size: usize) = decode_sentence(self.bs);
136 if size == 0 {
137 return None;
138 }
139 self.bs = &self.bs[size..];
140 self.forward_index += size;
141 Some((index, index + size, word))
142 }
143}
144
145fn decode_sentence(bs: &[u8]) -> (&str, usize) {
146 if bs.is_empty() {
147 ("", 0)
148 } else if let Some(end: usize) = SENTENCE_BREAK_FWD.find(bytes:bs) {
149 // Safe because a match can only occur for valid UTF-8.
150 let sentence: &str = unsafe { bs[..end].to_str_unchecked() };
151 (sentence, sentence.len())
152 } else {
153 const INVALID: &'static str = "\u{FFFD}";
154 // No match on non-empty bytes implies we found invalid UTF-8.
155 let (_, size: usize) = utf8::decode_lossy(slice:bs);
156 (INVALID, size)
157 }
158}
159
#[cfg(all(test, feature = "std"))]
mod tests {
    #[cfg(not(miri))]
    use ucd_parse::SentenceBreakTest;

    use crate::ext_slice::ByteSlice;

    /// Runs every UCD sentence-break test case through the forward iterator
    /// and checks the resulting segmentation.
    #[test]
    #[cfg(not(miri))]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            // The test input is the expected sentences glued back together.
            let given = test.sentences.concat();
            let got = sentences(given.as_bytes());
            assert_eq!(
                test.sentences,
                got,
                "\n\nsentence forward break test {} failed:\n\
                 given: {:?}\n\
                 expected: {:?}\n\
                 got: {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.sentences),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    #[test]
    fn forward_additional() {
        // Each entry pairs an input with its expected segmentation.
        let cases: &[(&[u8], &[&str])] = &[
            (b"a.. A", &["a.. ", "A"]),
            (b"a.. a", &["a.. a"]),
            (b"a... A", &["a... ", "A"]),
            (b"a... a", &["a... a"]),
            (b"a...,..., a", &["a...,..., a"]),
        ];
        for &(input, expected) in cases {
            assert_eq!(expected.to_vec(), sentences(input));
        }
    }

    /// Collects every sentence yielded for `bytes`.
    fn sentences(bytes: &[u8]) -> Vec<&str> {
        bytes.sentences().collect()
    }

    /// Views each string as its raw bytes, used for the failure message.
    #[cfg(not(miri))]
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        let mut out = Vec::with_capacity(strs.len());
        for s in strs {
            out.push(s.as_ref().as_bytes());
        }
        out
    }

    /// Return all of the UCD for sentence breaks.
    ///
    /// Lines starting with `#` and lines mentioning "surrogate" are skipped.
    #[cfg(not(miri))]
    fn ucdtests() -> Vec<SentenceBreakTest> {
        const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt");

        TESTDATA
            .lines()
            .map(str::trim)
            .filter(|line| {
                !(line.starts_with("#") || line.contains("surrogate"))
            })
            .map(|line| line.parse::<SentenceBreakTest>().unwrap())
            .collect()
    }
}
226