1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at |
3 | // http://rust-lang.org/COPYRIGHT. |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | // option. This file may not be copied, modified, or distributed |
9 | // except according to those terms. |
10 | |
11 | use core::cmp; |
12 | use core::iter::Filter; |
13 | |
14 | // All of the logic for forward iteration over sentences |
15 | mod fwd { |
16 | use crate::tables::sentence::SentenceCat; |
17 | use core::cmp; |
18 | |
// One element of the look-behind window kept while scanning a string.
// Each variant corresponds to a building block of the patterns listed in
// the default sentence boundary rule table:
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
#[derive(Copy, Clone, Eq, PartialEq)]
enum StatePart {
    // Placeholder filling the window before any character has been consumed
    Sot,
    // Marker pushed once the input has been exhausted
    Eot,
    // Any category without a dedicated variant
    Other,
    // A character of category `SC_CR`
    CR,
    // A character of category `SC_LF`
    LF,
    // A character of category `SC_Sep`
    Sep,
    // A character of category `SC_ATerm`
    ATerm,
    // A character of category `SC_Upper` or `SC_Lower`
    UpperLower,
    // A run of one or more `SC_Close` characters
    ClosePlus,
    // A run of one or more `SC_Sp` characters
    SpPlus,
    // A character of category `SC_STerm`
    STerm,
}
35 | |
36 | #[derive (Clone, PartialEq, Eq)] |
37 | struct SentenceBreaksState(pub [StatePart; 4]); |
38 | |
39 | const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([ |
40 | StatePart::Sot, |
41 | StatePart::Sot, |
42 | StatePart::Sot, |
43 | StatePart::Sot, |
44 | ]); |
45 | |
46 | #[derive (Clone)] |
47 | pub struct SentenceBreaks<'a> { |
48 | pub string: &'a str, |
49 | pos: usize, |
50 | state: SentenceBreaksState, |
51 | } |
52 | |
53 | impl SentenceBreaksState { |
54 | // Attempt to advance the internal state by one part |
55 | // Whitespace and some punctutation will be collapsed |
56 | fn next(&self, cat: SentenceCat) -> SentenceBreaksState { |
57 | let &SentenceBreaksState(parts) = self; |
58 | let parts = match (parts[3], cat) { |
59 | (StatePart::ClosePlus, SentenceCat::SC_Close) => parts, |
60 | (StatePart::SpPlus, SentenceCat::SC_Sp) => parts, |
61 | _ => [ |
62 | parts[1], |
63 | parts[2], |
64 | parts[3], |
65 | match cat { |
66 | SentenceCat::SC_CR => StatePart::CR, |
67 | SentenceCat::SC_LF => StatePart::LF, |
68 | SentenceCat::SC_Sep => StatePart::Sep, |
69 | SentenceCat::SC_ATerm => StatePart::ATerm, |
70 | SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower, |
71 | SentenceCat::SC_Close => StatePart::ClosePlus, |
72 | SentenceCat::SC_Sp => StatePart::SpPlus, |
73 | SentenceCat::SC_STerm => StatePart::STerm, |
74 | _ => StatePart::Other, |
75 | }, |
76 | ], |
77 | }; |
78 | SentenceBreaksState(parts) |
79 | } |
80 | |
81 | fn end(&self) -> SentenceBreaksState { |
82 | let &SentenceBreaksState(parts) = self; |
83 | SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot]) |
84 | } |
85 | |
86 | // Helper function to check if state head matches a single `StatePart` |
87 | fn match1(&self, part: StatePart) -> bool { |
88 | let &SentenceBreaksState(parts) = self; |
89 | part == parts[3] |
90 | } |
91 | |
92 | // Helper function to check if first two `StateParts` in state match |
93 | // the given two |
94 | fn match2(&self, part1: StatePart, part2: StatePart) -> bool { |
95 | let &SentenceBreaksState(parts) = self; |
96 | part1 == parts[2] && part2 == parts[3] |
97 | } |
98 | } |
99 | |
100 | // https://unicode.org/reports/tr29/#SB8 |
101 | // TODO cache this, it is currently quadratic |
102 | fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool { |
103 | let &SentenceBreaksState(parts) = state; |
104 | let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; |
105 | if parts[idx] == StatePart::ClosePlus { |
106 | idx -= 1 |
107 | } |
108 | |
109 | if parts[idx] == StatePart::ATerm { |
110 | use crate::tables::sentence as se; |
111 | |
112 | for next_char in ahead.chars() { |
113 | //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower |
114 | match se::sentence_category(next_char).2 { |
115 | se::SC_Lower => return true, |
116 | se::SC_OLetter |
117 | | se::SC_Upper |
118 | | se::SC_Sep |
119 | | se::SC_CR |
120 | | se::SC_LF |
121 | | se::SC_STerm |
122 | | se::SC_ATerm => return false, |
123 | _ => continue, |
124 | } |
125 | } |
126 | } |
127 | |
128 | false |
129 | } |
130 | |
131 | // https://unicode.org/reports/tr29/#SB8a |
132 | fn match_sb8a(state: &SentenceBreaksState) -> bool { |
133 | // SATerm Close* Sp* |
134 | let &SentenceBreaksState(parts) = state; |
135 | let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; |
136 | if parts[idx] == StatePart::ClosePlus { |
137 | idx -= 1 |
138 | } |
139 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
140 | } |
141 | |
142 | // https://unicode.org/reports/tr29/#SB9 |
143 | fn match_sb9(state: &SentenceBreaksState) -> bool { |
144 | // SATerm Close* |
145 | let &SentenceBreaksState(parts) = state; |
146 | let idx = if parts[3] == StatePart::ClosePlus { |
147 | 2 |
148 | } else { |
149 | 3 |
150 | }; |
151 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
152 | } |
153 | |
154 | // https://unicode.org/reports/tr29/#SB11 |
155 | fn match_sb11(state: &SentenceBreaksState) -> bool { |
156 | // SATerm Close* Sp* ParaSep? |
157 | let &SentenceBreaksState(parts) = state; |
158 | let mut idx = match parts[3] { |
159 | StatePart::Sep | StatePart::CR | StatePart::LF => 2, |
160 | _ => 3, |
161 | }; |
162 | |
163 | if parts[idx] == StatePart::SpPlus { |
164 | idx -= 1 |
165 | } |
166 | if parts[idx] == StatePart::ClosePlus { |
167 | idx -= 1 |
168 | } |
169 | |
170 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
171 | } |
172 | |
173 | impl<'a> Iterator for SentenceBreaks<'a> { |
174 | // Returns the index of the character which follows a break |
175 | type Item = usize; |
176 | |
177 | #[inline ] |
178 | fn size_hint(&self) -> (usize, Option<usize>) { |
179 | let slen = self.string.len(); |
180 | // A sentence could be one character |
181 | (cmp::min(slen, 2), Some(slen + 1)) |
182 | } |
183 | |
184 | #[inline ] |
185 | fn next(&mut self) -> Option<usize> { |
186 | use crate::tables::sentence as se; |
187 | |
188 | for next_char in self.string[self.pos..].chars() { |
189 | let position_before = self.pos; |
190 | let state_before = self.state.clone(); |
191 | |
192 | let next_cat = se::sentence_category(next_char).2; |
193 | |
194 | self.pos += next_char.len_utf8(); |
195 | self.state = self.state.next(next_cat); |
196 | |
197 | match next_cat { |
198 | // SB1 https://unicode.org/reports/tr29/#SB1 |
199 | _ if state_before.match1(StatePart::Sot) => return Some(position_before), |
200 | |
201 | // SB2 is handled when inner iterator (chars) is finished |
202 | |
203 | // SB3 https://unicode.org/reports/tr29/#SB3 |
204 | SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue, |
205 | |
206 | // SB4 https://unicode.org/reports/tr29/#SB4 |
207 | _ if state_before.match1(StatePart::Sep) |
208 | || state_before.match1(StatePart::CR) |
209 | || state_before.match1(StatePart::LF) => |
210 | { |
211 | return Some(position_before) |
212 | } |
213 | |
214 | // SB5 https://unicode.org/reports/tr29/#SB5 |
215 | SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before, |
216 | |
217 | // SB6 https://unicode.org/reports/tr29/#SB6 |
218 | SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue, |
219 | |
220 | // SB7 https://unicode.org/reports/tr29/#SB7 |
221 | SentenceCat::SC_Upper |
222 | if state_before.match2(StatePart::UpperLower, StatePart::ATerm) => |
223 | { |
224 | continue |
225 | } |
226 | |
227 | // SB8 https://unicode.org/reports/tr29/#SB8 |
228 | _ if match_sb8(&state_before, &self.string[position_before..]) => continue, |
229 | |
230 | // SB8a https://unicode.org/reports/tr29/#SB8a |
231 | SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm |
232 | if match_sb8a(&state_before) => |
233 | { |
234 | continue |
235 | } |
236 | |
237 | // SB9 https://unicode.org/reports/tr29/#SB9 |
238 | SentenceCat::SC_Close |
239 | | SentenceCat::SC_Sp |
240 | | SentenceCat::SC_Sep |
241 | | SentenceCat::SC_CR |
242 | | SentenceCat::SC_LF |
243 | if match_sb9(&state_before) => |
244 | { |
245 | continue |
246 | } |
247 | |
248 | // SB10 https://unicode.org/reports/tr29/#SB10 |
249 | SentenceCat::SC_Sp |
250 | | SentenceCat::SC_Sep |
251 | | SentenceCat::SC_CR |
252 | | SentenceCat::SC_LF |
253 | if match_sb8a(&state_before) => |
254 | { |
255 | continue |
256 | } |
257 | |
258 | // SB11 https://unicode.org/reports/tr29/#SB11 |
259 | _ if match_sb11(&state_before) => return Some(position_before), |
260 | |
261 | // SB998 https://unicode.org/reports/tr29/#SB998 |
262 | _ => continue, |
263 | } |
264 | } |
265 | |
266 | // SB2 https://unicode.org/reports/tr29/#SB2 |
267 | if self.state.match1(StatePart::Sot) { |
268 | None |
269 | } else if self.state.match1(StatePart::Eot) { |
270 | None |
271 | } else { |
272 | self.state = self.state.end(); |
273 | Some(self.pos) |
274 | } |
275 | } |
276 | } |
277 | |
278 | pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> { |
279 | SentenceBreaks { |
280 | string: source, |
281 | pos: 0, |
282 | state: INITIAL_STATE, |
283 | } |
284 | } |
285 | } |
286 | |
287 | /// An iterator over the substrings of a string which, after splitting the string on |
288 | /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries), |
289 | /// contain any characters with the |
290 | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
291 | /// property, or with |
292 | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
293 | /// |
294 | /// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`] |
295 | /// trait. See its documentation for more. |
296 | /// |
297 | /// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences |
298 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
299 | #[derive (Clone)] |
300 | pub struct UnicodeSentences<'a> { |
301 | inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>, |
302 | } |
303 | |
304 | /// External iterator for a string's |
305 | /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). |
306 | /// |
307 | /// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`] |
308 | /// trait. See its documentation for more. |
309 | /// |
310 | /// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds |
311 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
312 | #[derive (Clone)] |
313 | pub struct USentenceBounds<'a> { |
314 | iter: fwd::SentenceBreaks<'a>, |
315 | sentence_start: Option<usize>, |
316 | } |
317 | |
318 | /// External iterator for sentence boundaries and byte offsets. |
319 | /// |
320 | /// This struct is created by the [`split_sentence_bound_indices`] method on the |
321 | /// [`UnicodeSegmentation`] trait. See its documentation for more. |
322 | /// |
323 | /// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices |
324 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
325 | #[derive (Clone)] |
326 | pub struct USentenceBoundIndices<'a> { |
327 | start_offset: usize, |
328 | iter: USentenceBounds<'a>, |
329 | } |
330 | |
331 | #[inline ] |
332 | pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> { |
333 | USentenceBounds { |
334 | iter: fwd::new_sentence_breaks(source), |
335 | sentence_start: None, |
336 | } |
337 | } |
338 | |
339 | #[inline ] |
340 | pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> { |
341 | USentenceBoundIndices { |
342 | start_offset: source.as_ptr() as usize, |
343 | iter: new_sentence_bounds(source), |
344 | } |
345 | } |
346 | |
347 | #[inline ] |
348 | pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> { |
349 | use super::UnicodeSegmentation; |
350 | use crate::tables::util::is_alphanumeric; |
351 | |
352 | fn has_alphanumeric(s: &&str) -> bool { |
353 | s.chars().any(|c: char| is_alphanumeric(c)) |
354 | } |
355 | let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer |
356 | |
357 | UnicodeSentences { |
358 | inner: s.split_sentence_bounds().filter(has_alphanumeric), |
359 | } |
360 | } |
361 | |
362 | impl<'a> Iterator for UnicodeSentences<'a> { |
363 | type Item = &'a str; |
364 | |
365 | #[inline ] |
366 | fn next(&mut self) -> Option<&'a str> { |
367 | self.inner.next() |
368 | } |
369 | } |
370 | |
371 | impl<'a> Iterator for USentenceBounds<'a> { |
372 | type Item = &'a str; |
373 | |
374 | #[inline ] |
375 | fn size_hint(&self) -> (usize, Option<usize>) { |
376 | let (lower, upper) = self.iter.size_hint(); |
377 | (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1))) |
378 | } |
379 | |
380 | #[inline ] |
381 | fn next(&mut self) -> Option<&'a str> { |
382 | if self.sentence_start == None { |
383 | if let Some(start_pos) = self.iter.next() { |
384 | self.sentence_start = Some(start_pos) |
385 | } else { |
386 | return None; |
387 | } |
388 | } |
389 | |
390 | if let Some(break_pos) = self.iter.next() { |
391 | let start_pos = self.sentence_start.unwrap(); |
392 | let sentence = &self.iter.string[start_pos..break_pos]; |
393 | self.sentence_start = Some(break_pos); |
394 | Some(sentence) |
395 | } else { |
396 | None |
397 | } |
398 | } |
399 | } |
400 | |
401 | impl<'a> Iterator for USentenceBoundIndices<'a> { |
402 | type Item = (usize, &'a str); |
403 | |
404 | #[inline ] |
405 | fn next(&mut self) -> Option<(usize, &'a str)> { |
406 | self.iter |
407 | .next() |
408 | .map(|s: &str| (s.as_ptr() as usize - self.start_offset, s)) |
409 | } |
410 | |
411 | #[inline ] |
412 | fn size_hint(&self) -> (usize, Option<usize>) { |
413 | self.iter.size_hint() |
414 | } |
415 | } |
416 | |