1// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp;
12use core::iter::Filter;
13
14// All of the logic for forward iteration over sentences
15mod fwd {
16 use crate::tables::sentence::SentenceCat;
17 use core::cmp;
18
    // Describe a parsed part of source string as described in this table:
    // https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
    //
    // Each value is one slot of the `SentenceBreaksState` window.
    #[derive(Clone, Copy, PartialEq, Eq)]
    enum StatePart {
        // Start of text: fills every slot of `INITIAL_STATE`
        Sot,
        // End of text: pushed by `SentenceBreaksState::end`
        Eot,
        // Any category that has no dedicated variant below
        Other,
        CR,
        LF,
        Sep,
        ATerm,
        // Either an Upper or a Lower character; the two share one variant
        UpperLower,
        // One or more consecutive Close characters (collapsed into one part)
        ClosePlus,
        // One or more consecutive Sp characters (collapsed into one part)
        SpPlus,
        STerm,
    }
35
    // Window over the four most recently recorded `StatePart`s.
    // Index 0 holds the oldest part and index 3 the newest one: new parts are
    // appended at index 3 while the part at index 0 is dropped.
    #[derive(Clone, PartialEq, Eq)]
    struct SentenceBreaksState(pub [StatePart; 4]);
38
39 const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40 StatePart::Sot,
41 StatePart::Sot,
42 StatePart::Sot,
43 StatePart::Sot,
44 ]);
45
    // Iterator over the byte offsets at which sentence breaks occur in `string`.
    #[derive(Clone)]
    pub struct SentenceBreaks<'a> {
        // The text being segmented
        pub string: &'a str,
        // Byte offset into `string` of the next character to examine
        pos: usize,
        // Window of recently seen parts that the break rules are matched against
        state: SentenceBreaksState,
    }
52
53 impl SentenceBreaksState {
54 // Attempt to advance the internal state by one part
55 // Whitespace and some punctutation will be collapsed
56 fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57 let &SentenceBreaksState(parts) = self;
58 let parts = match (parts[3], cat) {
59 (StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60 (StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61 _ => [
62 parts[1],
63 parts[2],
64 parts[3],
65 match cat {
66 SentenceCat::SC_CR => StatePart::CR,
67 SentenceCat::SC_LF => StatePart::LF,
68 SentenceCat::SC_Sep => StatePart::Sep,
69 SentenceCat::SC_ATerm => StatePart::ATerm,
70 SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower,
71 SentenceCat::SC_Close => StatePart::ClosePlus,
72 SentenceCat::SC_Sp => StatePart::SpPlus,
73 SentenceCat::SC_STerm => StatePart::STerm,
74 _ => StatePart::Other,
75 },
76 ],
77 };
78 SentenceBreaksState(parts)
79 }
80
81 fn end(&self) -> SentenceBreaksState {
82 let &SentenceBreaksState(parts) = self;
83 SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot])
84 }
85
86 // Helper function to check if state head matches a single `StatePart`
87 fn match1(&self, part: StatePart) -> bool {
88 let &SentenceBreaksState(parts) = self;
89 part == parts[3]
90 }
91
92 // Helper function to check if first two `StateParts` in state match
93 // the given two
94 fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
95 let &SentenceBreaksState(parts) = self;
96 part1 == parts[2] && part2 == parts[3]
97 }
98 }
99
100 // https://unicode.org/reports/tr29/#SB8
101 // TODO cache this, it is currently quadratic
102 fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
103 let &SentenceBreaksState(parts) = state;
104 let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
105 if parts[idx] == StatePart::ClosePlus {
106 idx -= 1
107 }
108
109 if parts[idx] == StatePart::ATerm {
110 use crate::tables::sentence as se;
111
112 for next_char in ahead.chars() {
113 //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
114 match se::sentence_category(next_char).2 {
115 se::SC_Lower => return true,
116 se::SC_OLetter
117 | se::SC_Upper
118 | se::SC_Sep
119 | se::SC_CR
120 | se::SC_LF
121 | se::SC_STerm
122 | se::SC_ATerm => return false,
123 _ => continue,
124 }
125 }
126 }
127
128 false
129 }
130
131 // https://unicode.org/reports/tr29/#SB8a
132 fn match_sb8a(state: &SentenceBreaksState) -> bool {
133 // SATerm Close* Sp*
134 let &SentenceBreaksState(parts) = state;
135 let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 };
136 if parts[idx] == StatePart::ClosePlus {
137 idx -= 1
138 }
139 parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
140 }
141
142 // https://unicode.org/reports/tr29/#SB9
143 fn match_sb9(state: &SentenceBreaksState) -> bool {
144 // SATerm Close*
145 let &SentenceBreaksState(parts) = state;
146 let idx = if parts[3] == StatePart::ClosePlus {
147 2
148 } else {
149 3
150 };
151 parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
152 }
153
154 // https://unicode.org/reports/tr29/#SB11
155 fn match_sb11(state: &SentenceBreaksState) -> bool {
156 // SATerm Close* Sp* ParaSep?
157 let &SentenceBreaksState(parts) = state;
158 let mut idx = match parts[3] {
159 StatePart::Sep | StatePart::CR | StatePart::LF => 2,
160 _ => 3,
161 };
162
163 if parts[idx] == StatePart::SpPlus {
164 idx -= 1
165 }
166 if parts[idx] == StatePart::ClosePlus {
167 idx -= 1
168 }
169
170 parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm
171 }
172
173 impl<'a> Iterator for SentenceBreaks<'a> {
174 // Returns the index of the character which follows a break
175 type Item = usize;
176
177 #[inline]
178 fn size_hint(&self) -> (usize, Option<usize>) {
179 let slen = self.string.len();
180 // A sentence could be one character
181 (cmp::min(slen, 2), Some(slen + 1))
182 }
183
184 #[inline]
185 fn next(&mut self) -> Option<usize> {
186 use crate::tables::sentence as se;
187
188 for next_char in self.string[self.pos..].chars() {
189 let position_before = self.pos;
190 let state_before = self.state.clone();
191
192 let next_cat = se::sentence_category(next_char).2;
193
194 self.pos += next_char.len_utf8();
195 self.state = self.state.next(next_cat);
196
197 match next_cat {
198 // SB1 https://unicode.org/reports/tr29/#SB1
199 _ if state_before.match1(StatePart::Sot) => return Some(position_before),
200
201 // SB2 is handled when inner iterator (chars) is finished
202
203 // SB3 https://unicode.org/reports/tr29/#SB3
204 SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
205
206 // SB4 https://unicode.org/reports/tr29/#SB4
207 _ if state_before.match1(StatePart::Sep)
208 || state_before.match1(StatePart::CR)
209 || state_before.match1(StatePart::LF) =>
210 {
211 return Some(position_before)
212 }
213
214 // SB5 https://unicode.org/reports/tr29/#SB5
215 SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before,
216
217 // SB6 https://unicode.org/reports/tr29/#SB6
218 SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
219
220 // SB7 https://unicode.org/reports/tr29/#SB7
221 SentenceCat::SC_Upper
222 if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
223 {
224 continue
225 }
226
227 // SB8 https://unicode.org/reports/tr29/#SB8
228 _ if match_sb8(&state_before, &self.string[position_before..]) => continue,
229
230 // SB8a https://unicode.org/reports/tr29/#SB8a
231 SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm
232 if match_sb8a(&state_before) =>
233 {
234 continue
235 }
236
237 // SB9 https://unicode.org/reports/tr29/#SB9
238 SentenceCat::SC_Close
239 | SentenceCat::SC_Sp
240 | SentenceCat::SC_Sep
241 | SentenceCat::SC_CR
242 | SentenceCat::SC_LF
243 if match_sb9(&state_before) =>
244 {
245 continue
246 }
247
248 // SB10 https://unicode.org/reports/tr29/#SB10
249 SentenceCat::SC_Sp
250 | SentenceCat::SC_Sep
251 | SentenceCat::SC_CR
252 | SentenceCat::SC_LF
253 if match_sb8a(&state_before) =>
254 {
255 continue
256 }
257
258 // SB11 https://unicode.org/reports/tr29/#SB11
259 _ if match_sb11(&state_before) => return Some(position_before),
260
261 // SB998 https://unicode.org/reports/tr29/#SB998
262 _ => continue,
263 }
264 }
265
266 // SB2 https://unicode.org/reports/tr29/#SB2
267 if self.state.match1(StatePart::Sot) {
268 None
269 } else if self.state.match1(StatePart::Eot) {
270 None
271 } else {
272 self.state = self.state.end();
273 Some(self.pos)
274 }
275 }
276 }
277
278 pub fn new_sentence_breaks<'a>(source: &'a str) -> SentenceBreaks<'a> {
279 SentenceBreaks {
280 string: source,
281 pos: 0,
282 state: INITIAL_STATE,
283 }
284 }
285}
286
/// An iterator over the substrings of a string which, after splitting the string on
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
/// contain any characters with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
///
/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UnicodeSentences<'a> {
    // Sentence-bound iterator wrapped in a filter; the predicate is stored as
    // a plain function pointer so that the field type can be named.
    inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
}
303
/// External iterator for a string's
/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBounds<'a> {
    // Source of the byte offsets at which sentence breaks occur
    iter: fwd::SentenceBreaks<'a>,
    // Byte offset at which the next sentence to be returned starts; `None`
    // until the first break has been read from `iter`
    sentence_start: Option<usize>,
}
317
/// External iterator for sentence boundaries and byte offsets.
///
/// This struct is created by the [`split_sentence_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct USentenceBoundIndices<'a> {
    // Address of the first byte of the source string; subtracted from the
    // address of each sentence to obtain that sentence's byte offset
    start_offset: usize,
    // Yields the sentence substrings themselves
    iter: USentenceBounds<'a>,
}
330
331#[inline]
332pub fn new_sentence_bounds<'a>(source: &'a str) -> USentenceBounds<'a> {
333 USentenceBounds {
334 iter: fwd::new_sentence_breaks(source),
335 sentence_start: None,
336 }
337}
338
339#[inline]
340pub fn new_sentence_bound_indices<'a>(source: &'a str) -> USentenceBoundIndices<'a> {
341 USentenceBoundIndices {
342 start_offset: source.as_ptr() as usize,
343 iter: new_sentence_bounds(source),
344 }
345}
346
347#[inline]
348pub fn new_unicode_sentences<'b>(s: &'b str) -> UnicodeSentences<'b> {
349 use super::UnicodeSegmentation;
350 use crate::tables::util::is_alphanumeric;
351
352 fn has_alphanumeric(s: &&str) -> bool {
353 s.chars().any(|c: char| is_alphanumeric(c))
354 }
355 let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
356
357 UnicodeSentences {
358 inner: s.split_sentence_bounds().filter(has_alphanumeric),
359 }
360}
361
// Every method delegates to the wrapped filter iterator.
impl<'a> Iterator for UnicodeSentences<'a> {
    // A sentence substring borrowed from the source string
    type Item = &'a str;

    // Returns the next sentence accepted by the filter, or `None` once the
    // wrapped iterator is exhausted.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        self.inner.next()
    }

    // Reports the bounds given by the wrapped filter iterator.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}
375
376impl<'a> Iterator for USentenceBounds<'a> {
377 type Item = &'a str;
378
379 #[inline]
380 fn size_hint(&self) -> (usize, Option<usize>) {
381 let (lower, upper) = self.iter.size_hint();
382 (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1)))
383 }
384
385 #[inline]
386 fn next(&mut self) -> Option<&'a str> {
387 if self.sentence_start == None {
388 if let Some(start_pos) = self.iter.next() {
389 self.sentence_start = Some(start_pos)
390 } else {
391 return None;
392 }
393 }
394
395 if let Some(break_pos) = self.iter.next() {
396 let start_pos = self.sentence_start.unwrap();
397 let sentence = &self.iter.string[start_pos..break_pos];
398 self.sentence_start = Some(break_pos);
399 Some(sentence)
400 } else {
401 None
402 }
403 }
404}
405
406impl<'a> Iterator for USentenceBoundIndices<'a> {
407 type Item = (usize, &'a str);
408
409 #[inline]
410 fn next(&mut self) -> Option<(usize, &'a str)> {
411 self.iter
412 .next()
413 .map(|s: &str| (s.as_ptr() as usize - self.start_offset, s))
414 }
415
416 #[inline]
417 fn size_hint(&self) -> (usize, Option<usize>) {
418 self.iter.size_hint()
419 }
420}
421