1 | use regex_automata::DFA; |
2 | |
3 | use crate::{ |
4 | ext_slice::ByteSlice, |
5 | unicode::fsm::sentence_break_fwd::SENTENCE_BREAK_FWD, utf8, |
6 | }; |
7 | |
/// An iterator over sentences in a byte string.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentences`](trait.ByteSlice.html#method.sentences).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator yields
/// `&str` elements. When invalid UTF-8 is encountered, replacement codepoints
/// are [substituted](index.html#handling-of-invalid-utf-8).
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct Sentences<'a> {
    /// The bytes that have not been yielded by the iterator yet.
    bs: &'a [u8],
}
26 | |
27 | impl<'a> Sentences<'a> { |
28 | pub(crate) fn new(bs: &'a [u8]) -> Sentences<'a> { |
29 | Sentences { bs } |
30 | } |
31 | |
32 | /// View the underlying data as a subslice of the original data. |
33 | /// |
34 | /// The slice returned has the same lifetime as the original slice, and so |
35 | /// the iterator can continue to be used while this exists. |
36 | /// |
37 | /// # Examples |
38 | /// |
39 | /// ``` |
40 | /// use bstr::ByteSlice; |
41 | /// |
42 | /// let mut it = b"I want this. Not that. Right now." .sentences(); |
43 | /// |
44 | /// assert_eq!(&b"I want this. Not that. Right now." [..], it.as_bytes()); |
45 | /// it.next(); |
46 | /// assert_eq!(b"Not that. Right now." , it.as_bytes()); |
47 | /// it.next(); |
48 | /// it.next(); |
49 | /// assert_eq!(b"" , it.as_bytes()); |
50 | /// ``` |
51 | #[inline ] |
52 | pub fn as_bytes(&self) -> &'a [u8] { |
53 | self.bs |
54 | } |
55 | } |
56 | |
57 | impl<'a> Iterator for Sentences<'a> { |
58 | type Item = &'a str; |
59 | |
60 | #[inline ] |
61 | fn next(&mut self) -> Option<&'a str> { |
62 | let (sentence: &str, size: usize) = decode_sentence(self.bs); |
63 | if size == 0 { |
64 | return None; |
65 | } |
66 | self.bs = &self.bs[size..]; |
67 | Some(sentence) |
68 | } |
69 | } |
70 | |
/// An iterator over sentences in a byte string, along with their byte offsets.
///
/// This iterator is typically constructed by
/// [`ByteSlice::sentence_indices`](trait.ByteSlice.html#method.sentence_indices).
///
/// Sentences typically include their trailing punctuation and whitespace.
///
/// Since sentences are made up of one or more codepoints, this iterator
/// yields `&str` elements (along with their start and end byte offsets).
/// When invalid UTF-8 is encountered, replacement codepoints are
/// [substituted](index.html#handling-of-invalid-utf-8). Because of this, the
/// indices yielded by this iterator may not correspond to the length of the
/// sentence yielded with those indices. For example, when this iterator
/// encounters `\xFF` in the byte string, then it will yield a pair of indices
/// ranging over a single byte, but will provide an `&str` equivalent to
/// `"\u{FFFD}"`, which is three bytes in length. However, when given only
/// valid UTF-8, then all indices are in exact correspondence with their paired
/// sentence.
///
/// This iterator yields sentences in accordance with the default sentence
/// boundary rules specified in
/// [UAX #29](https://www.unicode.org/reports/tr29/tr29-33.html#Sentence_Boundaries).
#[derive(Clone, Debug)]
pub struct SentenceIndices<'a> {
    /// The bytes that have not been yielded by the iterator yet.
    bs: &'a [u8],
    /// Number of bytes consumed so far; the start offset of the next sentence.
    forward_index: usize,
}
98 | |
99 | impl<'a> SentenceIndices<'a> { |
100 | pub(crate) fn new(bs: &'a [u8]) -> SentenceIndices<'a> { |
101 | SentenceIndices { bs, forward_index: 0 } |
102 | } |
103 | |
104 | /// View the underlying data as a subslice of the original data. |
105 | /// |
106 | /// The slice returned has the same lifetime as the original slice, and so |
107 | /// the iterator can continue to be used while this exists. |
108 | /// |
109 | /// # Examples |
110 | /// |
111 | /// ``` |
112 | /// use bstr::ByteSlice; |
113 | /// |
114 | /// let mut it = b"I want this. Not that. Right now." .sentence_indices(); |
115 | /// |
116 | /// assert_eq!(&b"I want this. Not that. Right now." [..], it.as_bytes()); |
117 | /// it.next(); |
118 | /// assert_eq!(b"Not that. Right now." , it.as_bytes()); |
119 | /// it.next(); |
120 | /// it.next(); |
121 | /// assert_eq!(b"" , it.as_bytes()); |
122 | /// ``` |
123 | #[inline ] |
124 | pub fn as_bytes(&self) -> &'a [u8] { |
125 | self.bs |
126 | } |
127 | } |
128 | |
129 | impl<'a> Iterator for SentenceIndices<'a> { |
130 | type Item = (usize, usize, &'a str); |
131 | |
132 | #[inline ] |
133 | fn next(&mut self) -> Option<(usize, usize, &'a str)> { |
134 | let index: usize = self.forward_index; |
135 | let (word: &str, size: usize) = decode_sentence(self.bs); |
136 | if size == 0 { |
137 | return None; |
138 | } |
139 | self.bs = &self.bs[size..]; |
140 | self.forward_index += size; |
141 | Some((index, index + size, word)) |
142 | } |
143 | } |
144 | |
145 | fn decode_sentence(bs: &[u8]) -> (&str, usize) { |
146 | if bs.is_empty() { |
147 | ("" , 0) |
148 | } else if let Some(end: usize) = SENTENCE_BREAK_FWD.find(bytes:bs) { |
149 | // Safe because a match can only occur for valid UTF-8. |
150 | let sentence: &str = unsafe { bs[..end].to_str_unchecked() }; |
151 | (sentence, sentence.len()) |
152 | } else { |
153 | const INVALID: &'static str = " \u{FFFD}" ; |
154 | // No match on non-empty bytes implies we found invalid UTF-8. |
155 | let (_, size: usize) = utf8::decode_lossy(slice:bs); |
156 | (INVALID, size) |
157 | } |
158 | } |
159 | |
#[cfg(all(test, feature = "std"))]
mod tests {
    #[cfg(not(miri))]
    use ucd_parse::SentenceBreakTest;

    use crate::ext_slice::ByteSlice;

    // Run every case from the UCD sentence break test data: concatenating the
    // expected sentences and splitting again must reproduce them.
    #[test]
    #[cfg(not(miri))]
    fn forward_ucd() {
        for (i, test) in ucdtests().into_iter().enumerate() {
            let given = test.sentences.concat();
            let got = sentences(given.as_bytes());
            assert_eq!(
                test.sentences,
                got,
                " \n\nsentence forward break test {} failed: \n\
                 given: {:?}\n\
                 expected: {:?}\n\
                 got: {:?}\n",
                i,
                given,
                strs_to_bstrs(&test.sentences),
                strs_to_bstrs(&got),
            );
        }
    }

    // Some additional tests that don't seem to be covered by the UCD tests.
    #[test]
    fn forward_additional() {
        assert_eq!(vec!["a.. ", "A"], sentences(b"a.. A"));
        assert_eq!(vec!["a.. a"], sentences(b"a.. a"));

        assert_eq!(vec!["a... ", "A"], sentences(b"a... A"));
        assert_eq!(vec!["a... a"], sentences(b"a... a"));

        assert_eq!(vec!["a...,..., a"], sentences(b"a...,..., a"));
    }

    /// Collect every sentence yielded for `bytes`.
    fn sentences(bytes: &[u8]) -> Vec<&str> {
        bytes.sentences().collect()
    }

    /// Borrow each string as raw bytes, used for the failure message above.
    #[cfg(not(miri))]
    fn strs_to_bstrs<S: AsRef<str>>(strs: &[S]) -> Vec<&[u8]> {
        strs.iter().map(|s| s.as_ref().as_bytes()).collect()
    }

    /// Return all of the UCD for sentence breaks.
    ///
    /// Comment lines (starting with `#`) and lines mentioning "surrogate" are
    /// skipped; every remaining line must parse or the test panics.
    #[cfg(not(miri))]
    fn ucdtests() -> Vec<SentenceBreakTest> {
        const TESTDATA: &str = include_str!("data/SentenceBreakTest.txt");

        TESTDATA
            .lines()
            .map(str::trim)
            .filter(|line| {
                !line.starts_with('#') && !line.contains("surrogate")
            })
            .map(|line| line.parse::<SentenceBreakTest>().unwrap())
            .collect()
    }
}
226 | |