sentence.rs source code [crates/unicode_segmentation/src/sentence.rs]

1	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2	// file at the top-level directory of this distribution and at
3	// http://rust-lang.org/COPYRIGHT.
4	//
5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8	// option. This file may not be copied, modified, or distributed
9	// except according to those terms.
10
11	use core::cmp;
12	use core::iter::Filter;
13
14	// All of the logic for forward iteration over sentences
15	mod fwd {
16	use crate::tables::sentence::SentenceCat;
17	use core::cmp;
18
// One slot of the look-behind window used while scanning for sentence
// boundaries. The variants correspond to the character classes of the rule
// table at
// https://unicode.org/reports/tr29/#Default_Sentence_Boundaries
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StatePart {
    // Start of text: nothing has been consumed yet.
    Sot,
    // End of text: pushed once the input has been exhausted.
    Eot,
    // Any category that has no dedicated variant below.
    Other,
    // Carriage return.
    CR,
    // Line feed.
    LF,
    // Paragraph separator.
    Sep,
    // Ambiguous terminator.
    ATerm,
    // An upper-case or a lower-case letter.
    UpperLower,
    // A run of one or more `Close` characters, collapsed into one slot.
    ClosePlus,
    // A run of one or more `Sp` characters, collapsed into one slot.
    SpPlus,
    // Sentence terminator.
    STerm,
}
35
36	#[derive(Debug, Clone, PartialEq, Eq)]
37	struct SentenceBreaksState(pub [StatePart; `4`]);
38
39	const INITIAL_STATE: SentenceBreaksState = SentenceBreaksState([
40	StatePart::Sot,
41	StatePart::Sot,
42	StatePart::Sot,
43	StatePart::Sot,
44	]);
45
46	#[derive(Debug, Clone)]
47	pub struct SentenceBreaks<'a> {
48	pub string: &'a str,
49	pos: usize,
50	state: SentenceBreaksState,
51	}
52
53	impl SentenceBreaksState {
54	// Attempt to advance the internal state by one part
55	// Whitespace and some punctutation will be collapsed
56	fn next(&self, cat: SentenceCat) -> SentenceBreaksState {
57	let &SentenceBreaksState(parts) = self;
58	let parts = match (parts[`3`], cat) {
59	(StatePart::ClosePlus, SentenceCat::SC_Close) => parts,
60	(StatePart::SpPlus, SentenceCat::SC_Sp) => parts,
61	_ => [
62	parts[`1`],
63	parts[`2`],
64	parts[`3`],
65	match cat {
66	SentenceCat::SC_CR => StatePart::CR,
67	SentenceCat::SC_LF => StatePart::LF,
68	SentenceCat::SC_Sep => StatePart::Sep,
69	SentenceCat::SC_ATerm => StatePart::ATerm,
70	SentenceCat::SC_Upper \| SentenceCat::SC_Lower => StatePart::UpperLower,
71	SentenceCat::SC_Close => StatePart::ClosePlus,
72	SentenceCat::SC_Sp => StatePart::SpPlus,
73	SentenceCat::SC_STerm => StatePart::STerm,
74	_ => StatePart::Other,
75	},
76	],
77	};
78	SentenceBreaksState(parts)
79	}
80
81	fn end(&self) -> SentenceBreaksState {
82	let &SentenceBreaksState(parts) = self;
83	SentenceBreaksState([parts[`1`], parts[`2`], parts[`3`], StatePart::Eot])
84	}
85
86	// Helper function to check if state head matches a single `StatePart`
87	fn match1(&self, part: StatePart) -> bool {
88	let &SentenceBreaksState(parts) = self;
89	part == parts[`3`]
90	}
91
92	// Helper function to check if first two `StateParts` in state match
93	// the given two
94	fn match2(&self, part1: StatePart, part2: StatePart) -> bool {
95	let &SentenceBreaksState(parts) = self;
96	part1 == parts[`2`] && part2 == parts[`3`]
97	}
98	}
99
100	// https://unicode.org/reports/tr29/#SB8
101	// TODO cache this, it is currently quadratic
102	fn match_sb8(state: &SentenceBreaksState, ahead: &str) -> bool {
103	let &SentenceBreaksState(parts) = state;
104	let mut idx = if parts[`3`] == StatePart::SpPlus { `2` } else { `3` };
105	if parts[idx] == StatePart::ClosePlus {
106	idx -= `1`
107	}
108
109	if parts[idx] == StatePart::ATerm {
110	use crate::tables::sentence as se;
111
112	for next_char in ahead.chars() {
113	//( ¬(OLetter \| Upper \| Lower \| ParaSep \| SATerm) ) Lower*
114	match se::sentence_category(next_char).2 {
115	se::SC_Lower => return `true`,
116	se::SC_OLetter
117	\| se::SC_Upper
118	\| se::SC_Sep
119	\| se::SC_CR
120	\| se::SC_LF
121	\| se::SC_STerm
122	\| se::SC_ATerm => return `false`,
123	_ => continue,
124	}
125	}
126	}
127
128	`false`
129	}
130
131	// https://unicode.org/reports/tr29/#SB8a
132	fn match_sb8a(state: &SentenceBreaksState) -> bool {
133	// SATerm Close* Sp*
134	let &SentenceBreaksState(parts) = state;
135	let mut idx = if parts[`3`] == StatePart::SpPlus { `2` } else { `3` };
136	if parts[idx] == StatePart::ClosePlus {
137	idx -= `1`
138	}
139	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
140	}
141
142	// https://unicode.org/reports/tr29/#SB9
143	fn match_sb9(state: &SentenceBreaksState) -> bool {
144	// SATerm Close*
145	let &SentenceBreaksState(parts) = state;
146	let idx = if parts[`3`] == StatePart::ClosePlus {
147	`2`
148	} else {
149	`3`
150	};
151	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
152	}
153
154	// https://unicode.org/reports/tr29/#SB11
155	fn match_sb11(state: &SentenceBreaksState) -> bool {
156	// SATerm Close Sp* ParaSep?*
157	let &SentenceBreaksState(parts) = state;
158	let mut idx = match parts[`3`] {
159	StatePart::Sep \| StatePart::CR \| StatePart::LF => `2`,
160	_ => `3`,
161	};
162
163	if parts[idx] == StatePart::SpPlus {
164	idx -= `1`
165	}
166	if parts[idx] == StatePart::ClosePlus {
167	idx -= `1`
168	}
169
170	parts[idx] == StatePart::STerm \|\| parts[idx] == StatePart::ATerm
171	}
172
173	impl<'a> Iterator for SentenceBreaks<'a> {
174	// Returns the index of the character which follows a break
175	type Item = usize;
176
177	#[inline]
178	fn size_hint(&self) -> (usize, Option<usize>) {
179	let slen = self.string.len();
180	// A sentence could be one character
181	(cmp::min(slen, `2`), Some(slen + `1`))
182	}
183
184	#[inline]
185	fn next(&mut self) -> Option<usize> {
186	use crate::tables::sentence as se;
187
188	for next_char in self.string[self.pos..].chars() {
189	let position_before = self.pos;
190	let state_before = self.state.clone();
191
192	let next_cat = se::sentence_category(next_char).2;
193
194	self.pos += next_char.len_utf8();
195	self.state = self.state.next(next_cat);
196
197	match next_cat {
198	// SB1 https://unicode.org/reports/tr29/#SB1
199	_ if state_before.match1(StatePart::Sot) => return Some(position_before),
200
201	// SB2 is handled when inner iterator (chars) is finished
202
203	// SB3 https://unicode.org/reports/tr29/#SB3
204	SentenceCat::SC_LF if state_before.match1(StatePart::CR) => continue,
205
206	// SB4 https://unicode.org/reports/tr29/#SB4
207	_ if state_before.match1(StatePart::Sep)
208	\|\| state_before.match1(StatePart::CR)
209	\|\| state_before.match1(StatePart::LF) =>
210	{
211	return Some(position_before)
212	}
213
214	// SB5 https://unicode.org/reports/tr29/#SB5
215	SentenceCat::SC_Extend \| SentenceCat::SC_Format => self.state = state_before,
216
217	// SB6 https://unicode.org/reports/tr29/#SB6
218	SentenceCat::SC_Numeric if state_before.match1(StatePart::ATerm) => continue,
219
220	// SB7 https://unicode.org/reports/tr29/#SB7
221	SentenceCat::SC_Upper
222	if state_before.match2(StatePart::UpperLower, StatePart::ATerm) =>
223	{
224	continue
225	}
226
227	// SB8 https://unicode.org/reports/tr29/#SB8
228	_ if match_sb8(&state_before, &self.string[position_before..]) => continue,
229
230	// SB8a https://unicode.org/reports/tr29/#SB8a
231	SentenceCat::SC_SContinue \| SentenceCat::SC_STerm \| SentenceCat::SC_ATerm
232	if match_sb8a(&state_before) =>
233	{
234	continue
235	}
236
237	// SB9 https://unicode.org/reports/tr29/#SB9
238	SentenceCat::SC_Close
239	\| SentenceCat::SC_Sp
240	\| SentenceCat::SC_Sep
241	\| SentenceCat::SC_CR
242	\| SentenceCat::SC_LF
243	if match_sb9(&state_before) =>
244	{
245	continue
246	}
247
248	// SB10 https://unicode.org/reports/tr29/#SB10
249	SentenceCat::SC_Sp
250	\| SentenceCat::SC_Sep
251	\| SentenceCat::SC_CR
252	\| SentenceCat::SC_LF
253	if match_sb8a(&state_before) =>
254	{
255	continue
256	}
257
258	// SB11 https://unicode.org/reports/tr29/#SB11
259	_ if match_sb11(&state_before) => return Some(position_before),
260
261	// SB998 https://unicode.org/reports/tr29/#SB998
262	_ => continue,
263	}
264	}
265
266	// SB2 https://unicode.org/reports/tr29/#SB2
267	if self.state.match1(StatePart::Sot) \|\| self.state.match1(StatePart::Eot) {
268	None
269	} else {
270	self.state = self.state.end();
271	Some(self.pos)
272	}
273	}
274	}
275
276	pub fn new_sentence_breaks(source: &str) -> SentenceBreaks<'_> {
277	SentenceBreaks {
278	string: source,
279	pos: `0`,
280	state: INITIAL_STATE,
281	}
282	}
283	}
284
285	/// An iterator over the substrings of a string which, after splitting the string on
286	/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries),
287	/// contain any characters with the
288	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
289	/// property, or with
290	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
291	///
292	/// This struct is created by the [`unicode_sentences`] method on the [`UnicodeSegmentation`]
293	/// trait. See its documentation for more.
294	///
295	/// [`unicode_sentences`]: trait.UnicodeSegmentation.html#tymethod.unicode_sentences
296	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
297	#[derive(Debug, Clone)]
298	pub struct UnicodeSentences<'a> {
299	inner: Filter<USentenceBounds<'a>, fn(&&str) -> bool>,
300	}
301
302	/// External iterator for a string's
303	/// [sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
304	///
305	/// This struct is created by the [`split_sentence_bounds`] method on the [`UnicodeSegmentation`]
306	/// trait. See its documentation for more.
307	///
308	/// [`split_sentence_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bounds
309	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
310	#[derive(Debug, Clone)]
311	pub struct USentenceBounds<'a> {
312	iter: fwd::SentenceBreaks<'a>,
313	sentence_start: Option<usize>,
314	}
315
316	/// External iterator for sentence boundaries and byte offsets.
317	///
318	/// This struct is created by the [`split_sentence_bound_indices`] method on the
319	/// [`UnicodeSegmentation`] trait. See its documentation for more.
320	///
321	/// [`split_sentence_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_sentence_bound_indices
322	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
323	#[derive(Debug, Clone)]
324	pub struct USentenceBoundIndices<'a> {
325	start_offset: usize,
326	iter: USentenceBounds<'a>,
327	}
328
329	#[inline]
330	pub fn new_sentence_bounds(source: &str) -> USentenceBounds<'_> {
331	USentenceBounds {
332	iter: fwd::new_sentence_breaks(source),
333	sentence_start: None,
334	}
335	}
336
337	#[inline]
338	pub fn new_sentence_bound_indices(source: &str) -> USentenceBoundIndices<'_> {
339	USentenceBoundIndices {
340	start_offset: source.as_ptr() as usize,
341	iter: new_sentence_bounds(source),
342	}
343	}
344
345	#[inline]
346	pub fn new_unicode_sentences(s: &str) -> UnicodeSentences<'_> {
347	use super::UnicodeSegmentation;
348	use crate::tables::util::is_alphanumeric;
349
350	fn has_alphanumeric(s: &&str) -> bool {
351	s.chars().any(is_alphanumeric)
352	}
353	let has_alphanumeric: fn(&&str) -> bool = has_alphanumeric; // coerce to fn pointer
354
355	UnicodeSentences {
356	inner: s.split_sentence_bounds().filter(has_alphanumeric),
357	}
358	}
359
360	impl<'a> Iterator for UnicodeSentences<'a> {
361	type Item = &'a str;
362
363	#[inline]
364	fn next(&mut self) -> Option<&'a str> {
365	self.inner.next()
366	}
367
368	#[inline]
369	fn size_hint(&self) -> (usize, Option<usize>) {
370	self.inner.size_hint()
371	}
372	}
373
374	impl<'a> Iterator for USentenceBounds<'a> {
375	type Item = &'a str;
376
377	#[inline]
378	fn size_hint(&self) -> (usize, Option<usize>) {
379	let (lower, upper) = self.iter.size_hint();
380	(cmp::max(`0`, lower - `1`), upper.map(\|u\| cmp::max(`0`, u - `1`)))
381	}
382
383	#[inline]
384	fn next(&mut self) -> Option<&'a str> {
385	if self.sentence_start.is_none() {
386	if let Some(start_pos) = self.iter.next() {
387	self.sentence_start = Some(start_pos)
388	} else {
389	return None;
390	}
391	}
392
393	if let Some(break_pos) = self.iter.next() {
394	let start_pos = self.sentence_start.unwrap();
395	let sentence = &self.iter.string[start_pos..break_pos];
396	self.sentence_start = Some(break_pos);
397	Some(sentence)
398	} else {
399	None
400	}
401	}
402	}
403
404	impl<'a> Iterator for USentenceBoundIndices<'a> {
405	type Item = (usize, &'a str);
406
407	#[inline]
408	fn next(&mut self) -> Option<(usize, &'a str)> {
409	self.iter
410	.next()
411	.map(\|s: &'a str\| (s.as_ptr() as usize - self.start_offset, s))
412	}
413
414	#[inline]
415	fn size_hint(&self) -> (usize, Option<usize>) {
416	self.iter.size_hint()
417	}
418	}
419