lib.rs source code [crates/unicode_segmentation/src/lib.rs]

1	// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2	// file at the top-level directory of this distribution and at
3	// http://rust-lang.org/COPYRIGHT.
4	//
5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8	// option. This file may not be copied, modified, or distributed
9	// except according to those terms.
10
11	//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
12	//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
13	//!
14	//! ```rust
15	//! extern crate unicode_segmentation;
16	//!
17	//! use unicode_segmentation::UnicodeSegmentation;
18	//!
19	//! fn main() {
20	//! let s = "a̐éö̲\r\n";
21	//! let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
22	//! let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
23	//! assert_eq!(g, b);
24	//!
25	//! let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
26	//! let w = s.unicode_words().collect::<Vec<&str>>();
27	//! let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
28	//! assert_eq!(w, b);
29	//!
30	//! let s = "The quick (\"brown\") fox";
31	//! let w = s.split_word_bounds().collect::<Vec<&str>>();
32	//! let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
33	//! assert_eq!(w, b);
34	//! }
35	//! ```
36	//!
37	//! # no_std
38	//!
39	//! unicode-segmentation does not depend on libstd, so it can be used in crates
40	//! with the `#![no_std]` attribute.
41	//!
42	//! # crates.io
43	//!
44	//! You can use this package in your project by adding the following
45	//! to your `Cargo.toml`:
46	//!
47	//! ```toml
48	//! [dependencies]
49	//! unicode-segmentation = "1.9.0"
50	//! ```
51
52	#![deny(missing_docs, unsafe_code)]
53	#![doc(
54	html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
55	html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
56	)]
57	#![no_std]
58
59	pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
60	pub use grapheme::{GraphemeIndices, Graphemes};
61	pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
62	pub use tables::UNICODE_VERSION;
63	pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
64
65	mod grapheme;
66	mod sentence;
67	#[rustfmt::skip]
68	mod tables;
69	mod word;
70
71	/// Methods for segmenting strings according to
72	/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
73	pub trait UnicodeSegmentation {
74	/// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
75	///
76	/// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
77	///
78	/// If `is_extended` is true, the iterator is over the
79	/// extended grapheme clusters;
80	/// otherwise, the iterator is over the legacy grapheme clusters.
81	/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
82	/// recommends extended grapheme cluster boundaries for general processing.
83	///
84	/// # Examples
85	///
86	/// ```
87	/// # use self::unicode_segmentation::UnicodeSegmentation;
88	/// let gr1 = UnicodeSegmentation::graphemes("a`\u{310}`e`\u{301}`o`\u{308}\u{332}`", `true`)
89	/// .collect::<Vec<&str>>();
90	/// let b: &[_] = &["a`\u{310}`", "e`\u{301}`", "o`\u{308}\u{332}`"];
91	///
92	/// assert_eq!(&gr1[..], b);
93	///
94	/// let gr2 = UnicodeSegmentation::graphemes("a`\r\n`b🇷🇺🇸🇹", `true`).collect::<Vec<&str>>();
95	/// let b: &[_] = &["a", "`\r\n`", "b", "🇷🇺", "🇸🇹"];
96	///
97	/// assert_eq!(&gr2[..], b);
98	/// ```
99	fn graphemes(&self, is_extended: bool) -> Graphemes<'_>;
100
101	/// Returns an iterator over the grapheme clusters of `self` and their
102	/// byte offsets. See `graphemes()` for more information.
103	///
104	/// # Examples
105	///
106	/// ```
107	/// # use self::unicode_segmentation::UnicodeSegmentation;
108	/// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲`\r\n`", `true`)
109	/// .collect::<Vec<(usize, &str)>>();
110	/// let b: &[_] = &[(`0`, "a̐"), (`3`, "é"), (`6`, "ö̲"), (`11`, "`\r\n`")];
111	///
112	/// assert_eq!(&gr_inds[..], b);
113	/// ```
114	fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices<'_>;
115
116	/// Returns an iterator over the words of `self`, separated on
117	/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
118	///
119	/// Here, "words" are just those substrings which, after splitting on
120	/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
121	/// substring must contain at least one character with the
122	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
123	/// property, or with
124	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
125	///
126	/// # Example
127	///
128	/// ```
129	/// # use self::unicode_segmentation::UnicodeSegmentation;
130	/// let uws = "The quick (`\"`brown`\"`) fox can't jump 32.3 feet, right?";
131	/// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
132	/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
133	///
134	/// assert_eq!(&uw1[..], b);
135	/// ```
136	fn unicode_words(&self) -> UnicodeWords<'_>;
137
138	/// Returns an iterator over the words of `self`, separated on
139	/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
140	/// offsets.
141	///
142	/// Here, "words" are just those substrings which, after splitting on
143	/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
144	/// substring must contain at least one character with the
145	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
146	/// property, or with
147	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
148	///
149	/// # Example
150	///
151	/// ```
152	/// # use self::unicode_segmentation::UnicodeSegmentation;
153	/// let uwis = "The quick (`\"`brown`\"`) fox can't jump 32.3 feet, right?";
154	/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
155	/// let b: &[_] = &[(`0`, "The"), (`4`, "quick"), (`12`, "brown"), (`20`, "fox"), (`24`, "can't"),
156	/// (`30`, "jump"), (`35`, "32.3"), (`40`, "feet"), (`46`, "right")];
157	///
158	/// assert_eq!(&uwi1[..], b);
159	/// ```
160	fn unicode_word_indices(&self) -> UnicodeWordIndices<'_>;
161
162	/// Returns an iterator over substrings of `self` separated on
163	/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
164	///
165	/// The concatenation of the substrings returned by this function is just the original string.
166	///
167	/// # Example
168	///
169	/// ```
170	/// # use self::unicode_segmentation::UnicodeSegmentation;
171	/// let swu1 = "The quick (`\"`brown`\"`) fox".split_word_bounds().collect::<Vec<&str>>();
172	/// let b: &[_] = &["The", " ", "quick", " ", "(", "`\"`", "brown", "`\"`", ")", " ", "fox"];
173	///
174	/// assert_eq!(&swu1[..], b);
175	/// ```
176	fn split_word_bounds(&self) -> UWordBounds<'_>;
177
178	/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
179	/// and their offsets. See `split_word_bounds()` for more information.
180	///
181	/// # Example
182	///
183	/// ```
184	/// # use self::unicode_segmentation::UnicodeSegmentation;
185	/// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
186	/// let b: &[_] = &[(`0`, "Brr"), (`3`, ","), (`4`, " "), (`5`, "it's"), (`9`, " "), (`10`, "29.3"),
187	/// (`14`, "°"), (`16`, "F"), (`17`, "!")];
188	///
189	/// assert_eq!(&swi1[..], b);
190	/// ```
191	fn split_word_bound_indices(&self) -> UWordBoundIndices<'_>;
192
193	/// Returns an iterator over substrings of `self` separated on
194	/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
195	///
196	/// Here, "sentences" are just those substrings which, after splitting on
197	/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
198	/// substring must contain at least one character with the
199	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
200	/// property, or with
201	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
202	///
203	/// # Example
204	///
205	/// ```
206	/// # use self::unicode_segmentation::UnicodeSegmentation;
207	/// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
208	/// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
209	/// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
210	///
211	/// assert_eq!(&us1[..], b);
212	/// ```
213	fn unicode_sentences(&self) -> UnicodeSentences<'_>;
214
215	/// Returns an iterator over substrings of `self` separated on
216	/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
217	///
218	/// The concatenation of the substrings returned by this function is just the original string.
219	///
220	/// # Example
221	///
222	/// ```
223	/// # use self::unicode_segmentation::UnicodeSegmentation;
224	/// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
225	/// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
226	/// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
227	///
228	/// assert_eq!(&ssb1[..], b);
229	/// ```
230	fn split_sentence_bounds(&self) -> USentenceBounds<'_>;
231
232	/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
233	/// and their offsets. See `split_sentence_bounds()` for more information.
234	///
235	/// # Example
236	///
237	/// ```
238	/// # use self::unicode_segmentation::UnicodeSegmentation;
239	/// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
240	/// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
241	/// let b: &[_] = &[(`0`, "Mr. "), (`4`, "Fox jumped. "), (`16`, "[...] "),
242	/// (`22`, "The dog was too lazy.")];
243	///
244	/// assert_eq!(&ssi1[..], b);
245	/// ```
246	fn split_sentence_bound_indices(&self) -> USentenceBoundIndices<'_>;
247	}
248
249	impl UnicodeSegmentation for str {
250	#[inline]
251	fn graphemes(&self, is_extended: bool) -> Graphemes {
252	grapheme::new_graphemes(self, is_extended)
253	}
254
255	#[inline]
256	fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
257	grapheme::new_grapheme_indices(self, is_extended)
258	}
259
260	#[inline]
261	fn unicode_words(&self) -> UnicodeWords {
262	word::new_unicode_words(self)
263	}
264
265	#[inline]
266	fn unicode_word_indices(&self) -> UnicodeWordIndices {
267	word::new_unicode_word_indices(self)
268	}
269
270	#[inline]
271	fn split_word_bounds(&self) -> UWordBounds {
272	word::new_word_bounds(self)
273	}
274
275	#[inline]
276	fn split_word_bound_indices(&self) -> UWordBoundIndices {
277	word::new_word_bound_indices(self)
278	}
279
280	#[inline]
281	fn unicode_sentences(&self) -> UnicodeSentences {
282	sentence::new_unicode_sentences(self)
283	}
284
285	#[inline]
286	fn split_sentence_bounds(&self) -> USentenceBounds {
287	sentence::new_sentence_bounds(self)
288	}
289
290	#[inline]
291	fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
292	sentence::new_sentence_bound_indices(self)
293	}
294	}
295