| 1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
| 2 | // file at the top-level directory of this distribution and at |
| 3 | // http://rust-lang.org/COPYRIGHT. |
| 4 | // |
| 5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
| 6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
| 7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
| 8 | // option. This file may not be copied, modified, or distributed |
| 9 | // except according to those terms. |
| 10 | |
| 11 | use core::cmp; |
| 12 | use core::iter::Filter; |
| 13 | |
| 14 | // All of the logic for forward iteration over sentences |
| 15 | mod fwd { |
| 16 | use crate::tables::sentence::SentenceCat; |
| 17 | use core::cmp; |
| 18 | |
// One element of the history kept by the sentence-break state machine.
// The variants mirror the categories used by the rule table at
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StatePart {
    // Start of text: fills the history before any character has been read
    Sot,
    // End of text: pushed once the input has been exhausted
    Eot,
    // Any category that has no dedicated variant
    Other,
    // Carriage return
    CR,
    // Line feed
    LF,
    // Separator
    Sep,
    // ATerm
    ATerm,
    // An Upper or a Lower character (the two share one variant)
    UpperLower,
    // A run of one or more Close characters, collapsed into one part
    ClosePlus,
    // A run of one or more Sp characters, collapsed into one part
    SpPlus,
    // STerm
    STerm,
}
| 35 | |
| 36 | #[derive (Debug, Clone, PartialEq, Eq)] |
| 37 | struct SentenceBreaksState(pub [StatePart; 4]); |
| 38 | |
| 39 | const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([ |
| 40 | StatePart::Sot, |
| 41 | StatePart::Sot, |
| 42 | StatePart::Sot, |
| 43 | StatePart::Sot, |
| 44 | ]); |
| 45 | |
| 46 | #[derive (Debug, Clone)] |
| 47 | pub struct SentenceBreaks<'a> { |
| 48 | pub string: &'a str, |
| 49 | pos: usize, |
| 50 | state: SentenceBreaksState, |
| 51 | } |
| 52 | |
| 53 | impl SentenceBreaksState { |
| 54 | // Attempt to advance the internal state by one part |
| 55 | // Whitespace and some punctutation will be collapsed |
| 56 | fn next(&self, cat: SentenceCat) -> SentenceBreaksState { |
| 57 | let &SentenceBreaksState(parts) = self; |
| 58 | let parts = match (parts[3], cat) { |
| 59 | (StatePart::ClosePlus, SentenceCat::SC_Close) => parts, |
| 60 | (StatePart::SpPlus, SentenceCat::SC_Sp) => parts, |
| 61 | _ => [ |
| 62 | parts[1], |
| 63 | parts[2], |
| 64 | parts[3], |
| 65 | match cat { |
| 66 | SentenceCat::SC_CR => StatePart::CR, |
| 67 | SentenceCat::SC_LF => StatePart::LF, |
| 68 | SentenceCat::SC_Sep => StatePart::Sep, |
| 69 | SentenceCat::SC_ATerm => StatePart::ATerm, |
| 70 | SentenceCat::SC_Upper | SentenceCat::SC_Lower => StatePart::UpperLower, |
| 71 | SentenceCat::SC_Close => StatePart::ClosePlus, |
| 72 | SentenceCat::SC_Sp => StatePart::SpPlus, |
| 73 | SentenceCat::SC_STerm => StatePart::STerm, |
| 74 | _ => StatePart::Other, |
| 75 | }, |
| 76 | ], |
| 77 | }; |
| 78 | SentenceBreaksState(parts) |
| 79 | } |
| 80 | |
| 81 | fn end(&self) -> SentenceBreaksState { |
| 82 | let &SentenceBreaksState(parts) = self; |
| 83 | SentenceBreaksState([parts[1], parts[2], parts[3], StatePart::Eot]) |
| 84 | } |
| 85 | |
| 86 | // Helper function to check if state head matches a single `StatePart` |
| 87 | fn match1(&self, part: StatePart) -> bool { |
| 88 | let &SentenceBreaksState(parts) = self; |
| 89 | part == parts[3] |
| 90 | } |
| 91 | |
| 92 | // Helper function to check if first two `StateParts` in state match |
| 93 | // the given two |
| 94 | fn match2(&self, part1: StatePart, part2: StatePart) -> bool { |
| 95 | let &SentenceBreaksState(parts) = self; |
| 96 | part1 == parts[2] && part2 == parts[3] |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | // https://unicode.org/reports/tr29/#SB8 |
| 101 | // TODO cache this, it is currently quadratic |
| 102 | fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool { |
| 103 | let &SentenceBreaksState(parts) = state; |
| 104 | let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; |
| 105 | if parts[idx] == StatePart::ClosePlus { |
| 106 | idx -= 1 |
| 107 | } |
| 108 | |
| 109 | if parts[idx] == StatePart::ATerm { |
| 110 | use crate::tables::sentence as se; |
| 111 | |
| 112 | for next_char in ahead.chars() { |
| 113 | //( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower |
| 114 | match se::sentence_category(next_char).2 { |
| 115 | se::SC_Lower => return true, |
| 116 | se::SC_OLetter |
| 117 | | se::SC_Upper |
| 118 | | se::SC_Sep |
| 119 | | se::SC_CR |
| 120 | | se::SC_LF |
| 121 | | se::SC_STerm |
| 122 | | se::SC_ATerm => return false, |
| 123 | _ => continue, |
| 124 | } |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | false |
| 129 | } |
| 130 | |
| 131 | // https://unicode.org/reports/tr29/#SB8a |
| 132 | fn match_sb8a(state: &SentenceBreaksState) -> bool { |
| 133 | // SATerm Close* Sp* |
| 134 | let &SentenceBreaksState(parts) = state; |
| 135 | let mut idx = if parts[3] == StatePart::SpPlus { 2 } else { 3 }; |
| 136 | if parts[idx] == StatePart::ClosePlus { |
| 137 | idx -= 1 |
| 138 | } |
| 139 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
| 140 | } |
| 141 | |
| 142 | // https://unicode.org/reports/tr29/#SB9 |
| 143 | fn match_sb9(state: &SentenceBreaksState) -> bool { |
| 144 | // SATerm Close* |
| 145 | let &SentenceBreaksState(parts) = state; |
| 146 | let idx = if parts[3] == StatePart::ClosePlus { |
| 147 | 2 |
| 148 | } else { |
| 149 | 3 |
| 150 | }; |
| 151 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
| 152 | } |
| 153 | |
| 154 | // https://unicode.org/reports/tr29/#SB11 |
| 155 | fn match_sb11(state: &SentenceBreaksState) -> bool { |
| 156 | // SATerm Close* Sp* ParaSep? |
| 157 | let &SentenceBreaksState(parts) = state; |
| 158 | let mut idx = match parts[3] { |
| 159 | StatePart::Sep | StatePart::CR | StatePart::LF => 2, |
| 160 | _ => 3, |
| 161 | }; |
| 162 | |
| 163 | if parts[idx] == StatePart::SpPlus { |
| 164 | idx -= 1 |
| 165 | } |
| 166 | if parts[idx] == StatePart::ClosePlus { |
| 167 | idx -= 1 |
| 168 | } |
| 169 | |
| 170 | parts[idx] == StatePart::STerm || parts[idx] == StatePart::ATerm |
| 171 | } |
| 172 | |
| 173 | impl<'a> Iterator for SentenceBreaks<'a> { |
| 174 | // Returns the index of the character which follows a break |
| 175 | type Item = usize; |
| 176 | |
| 177 | #[inline ] |
| 178 | fn size_hint(&self) -> (usize, Option<usize>) { |
| 179 | let slen = self.string.len(); |
| 180 | // A sentence could be one character |
| 181 | (cmp::min(slen, 2), Some(slen + 1)) |
| 182 | } |
| 183 | |
| 184 | #[inline ] |
| 185 | fn next(&mut self) -> Option<usize> { |
| 186 | use crate::tables::sentence as se; |
| 187 | |
| 188 | for next_char in self.string[self.pos..].chars() { |
| 189 | let position_before = self.pos; |
| 190 | let state_before = self.state.clone(); |
| 191 | |
| 192 | let next_cat = se::sentence_category(next_char).2; |
| 193 | |
| 194 | self.pos += next_char.len_utf8(); |
| 195 | self.state = self.state.next(next_cat); |
| 196 | |
| 197 | match next_cat { |
| 198 | // SB1 https://unicode.org/reports/tr29/#SB1 |
| 199 | _ if state_before.match1(StatePart::Sot) => return Some(position_before), |
| 200 | |
| 201 | // SB2 is handled when inner iterator (chars) is finished |
| 202 | |
| 203 | // SB3 https://unicode.org/reports/tr29/#SB3 |
| 204 | SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue, |
| 205 | |
| 206 | // SB4 https://unicode.org/reports/tr29/#SB4 |
| 207 | _ if state_before.match1(StatePart::Sep) |
| 208 | || state_before.match1(StatePart::CR) |
| 209 | || state_before.match1(StatePart::LF) => |
| 210 | { |
| 211 | return Some(position_before) |
| 212 | } |
| 213 | |
| 214 | // SB5 https://unicode.org/reports/tr29/#SB5 |
| 215 | SentenceCat::SC_Extend | SentenceCat::SC_Format => self.state = state_before, |
| 216 | |
| 217 | // SB6 https://unicode.org/reports/tr29/#SB6 |
| 218 | SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue, |
| 219 | |
| 220 | // SB7 https://unicode.org/reports/tr29/#SB7 |
| 221 | SentenceCat::SC_Upper |
| 222 | if state_before.match2(StatePart::UpperLower, StatePart::ATerm) => |
| 223 | { |
| 224 | continue |
| 225 | } |
| 226 | |
| 227 | // SB8 https://unicode.org/reports/tr29/#SB8 |
| 228 | _ if match_sb8(&state_before, &self.string[position_before..]) => continue, |
| 229 | |
| 230 | // SB8a https://unicode.org/reports/tr29/#SB8a |
| 231 | SentenceCat::SC_SContinue | SentenceCat::SC_STerm | SentenceCat::SC_ATerm |
| 232 | if match_sb8a(&state_before) => |
| 233 | { |
| 234 | continue |
| 235 | } |
| 236 | |
| 237 | // SB9 https://unicode.org/reports/tr29/#SB9 |
| 238 | SentenceCat::SC_Close |
| 239 | | SentenceCat::SC_Sp |
| 240 | | SentenceCat::SC_Sep |
| 241 | | SentenceCat::SC_CR |
| 242 | | SentenceCat::SC_LF |
| 243 | if match_sb9(&state_before) => |
| 244 | { |
| 245 | continue |
| 246 | } |
| 247 | |
| 248 | // SB10 https://unicode.org/reports/tr29/#SB10 |
| 249 | SentenceCat::SC_Sp |
| 250 | | SentenceCat::SC_Sep |
| 251 | | SentenceCat::SC_CR |
| 252 | | SentenceCat::SC_LF |
| 253 | if match_sb8a(&state_before) => |
| 254 | { |
| 255 | continue |
| 256 | } |
| 257 | |
| 258 | // SB11 https://unicode.org/reports/tr29/#SB11 |
| 259 | _ if match_sb11(&state_before) => return Some(position_before), |
| 260 | |
| 261 | // SB998 https://unicode.org/reports/tr29/#SB998 |
| 262 | _ => continue, |
| 263 | } |
| 264 | } |
| 265 | |
| 266 | // SB2 https://unicode.org/reports/tr29/#SB2 |
| 267 | if self.state.match1(StatePart::Sot) || self.state.match1(StatePart::Eot) { |
| 268 | None |
| 269 | } else { |
| 270 | self.state = self.state.end(); |
| 271 | Some(self.pos) |
| 272 | } |
| 273 | } |
| 274 | } |
| 275 | |
| 276 | pub fn new_sentence_breaks(source: &str) -> SentenceBreaks<'_> { |
| 277 | SentenceBreaks { |
| 278 | string: source, |
| 279 | pos: 0, |
| 280 | state: INITIAL_STATE, |
| 281 | } |
| 282 | } |
| 283 | } |
| 284 | |
| 285 | /// An iterator over the substrings of a string which, after splitting the string on |
| 286 | /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries), |
| 287 | /// contain any characters with the |
| 288 | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
| 289 | /// property, or with |
| 290 | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
| 291 | /// |
| 292 | /// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`] |
| 293 | /// trait. See its documentation for more. |
| 294 | /// |
| 295 | /// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences |
| 296 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
| 297 | #[derive (Debug, Clone)] |
| 298 | pub struct UnicodeSentences<'a> { |
| 299 | inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>, |
| 300 | } |
| 301 | |
| 302 | /// External iterator for a string's |
| 303 | /// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries). |
| 304 | /// |
| 305 | /// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`] |
| 306 | /// trait. See its documentation for more. |
| 307 | /// |
| 308 | /// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds |
| 309 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
| 310 | #[derive (Debug, Clone)] |
| 311 | pub struct USentenceBounds<'a> { |
| 312 | iter: fwd::SentenceBreaks<'a>, |
| 313 | sentence_start: Option<usize>, |
| 314 | } |
| 315 | |
| 316 | /// External iterator for sentence boundaries and byte offsets. |
| 317 | /// |
| 318 | /// This struct is created by the [`split_sentence_bound_indices`] method on the |
| 319 | /// [`UnicodeSegmentation`] trait. See its documentation for more. |
| 320 | /// |
| 321 | /// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices |
| 322 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
| 323 | #[derive (Debug, Clone)] |
| 324 | pub struct USentenceBoundIndices<'a> { |
| 325 | start_offset: usize, |
| 326 | iter: USentenceBounds<'a>, |
| 327 | } |
| 328 | |
| 329 | #[inline ] |
| 330 | pub fn new_sentence_bounds(source: &str) -> USentenceBounds<'_> { |
| 331 | USentenceBounds { |
| 332 | iter: fwd::new_sentence_breaks(source), |
| 333 | sentence_start: None, |
| 334 | } |
| 335 | } |
| 336 | |
| 337 | #[inline ] |
| 338 | pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> { |
| 339 | USentenceBoundIndices { |
| 340 | start_offset: source.as_ptr() as usize, |
| 341 | iter: new_sentence_bounds(source), |
| 342 | } |
| 343 | } |
| 344 | |
| 345 | #[inline ] |
| 346 | pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> { |
| 347 | use super::UnicodeSegmentation; |
| 348 | use crate::tables::util::is_alphanumeric; |
| 349 | |
| 350 | fn has_alphanumeric(s: &&str) -> bool { |
| 351 | s.chars().any(is_alphanumeric) |
| 352 | } |
| 353 | let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer |
| 354 | |
| 355 | UnicodeSentences { |
| 356 | inner: s.split_sentence_bounds().filter(has_alphanumeric), |
| 357 | } |
| 358 | } |
| 359 | |
| 360 | impl<'a> Iterator for UnicodeSentences<'a> { |
| 361 | type Item = &'a str; |
| 362 | |
| 363 | #[inline ] |
| 364 | fn next(&mut self) -> Option<&'a str> { |
| 365 | self.inner.next() |
| 366 | } |
| 367 | |
| 368 | #[inline ] |
| 369 | fn size_hint(&self) -> (usize, Option<usize>) { |
| 370 | self.inner.size_hint() |
| 371 | } |
| 372 | } |
| 373 | |
| 374 | impl<'a> Iterator for USentenceBounds<'a> { |
| 375 | type Item = &'a str; |
| 376 | |
| 377 | #[inline ] |
| 378 | fn size_hint(&self) -> (usize, Option<usize>) { |
| 379 | let (lower, upper) = self.iter.size_hint(); |
| 380 | (cmp::max(0, lower - 1), upper.map(|u| cmp::max(0, u - 1))) |
| 381 | } |
| 382 | |
| 383 | #[inline ] |
| 384 | fn next(&mut self) -> Option<&'a str> { |
| 385 | if self.sentence_start.is_none() { |
| 386 | if let Some(start_pos) = self.iter.next() { |
| 387 | self.sentence_start = Some(start_pos) |
| 388 | } else { |
| 389 | return None; |
| 390 | } |
| 391 | } |
| 392 | |
| 393 | if let Some(break_pos) = self.iter.next() { |
| 394 | let start_pos = self.sentence_start.unwrap(); |
| 395 | let sentence = &self.iter.string[start_pos..break_pos]; |
| 396 | self.sentence_start = Some(break_pos); |
| 397 | Some(sentence) |
| 398 | } else { |
| 399 | None |
| 400 | } |
| 401 | } |
| 402 | } |
| 403 | |
| 404 | impl<'a> Iterator for USentenceBoundIndices<'a> { |
| 405 | type Item = (usize, &'a str); |
| 406 | |
| 407 | #[inline ] |
| 408 | fn next(&mut self) -> Option<(usize, &'a str)> { |
| 409 | self.iter |
| 410 | .next() |
| 411 | .map(|s: &'a str| (s.as_ptr() as usize - self.start_offset, s)) |
| 412 | } |
| 413 | |
| 414 | #[inline ] |
| 415 | fn size_hint(&self) -> (usize, Option<usize>) { |
| 416 | self.iter.size_hint() |
| 417 | } |
| 418 | } |
| 419 | |