lib.rs source code [crates/unicode-segmentation/src/lib.rs]

1	// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2	// file at the top-level directory of this distribution and at
3	// http://rust-lang.org/COPYRIGHT.
4	//
5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8	// option. This file may not be copied, modified, or distributed
9	// except according to those terms.
10
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
//! extern crate unicode_segmentation;
//!
//! use unicode_segmentation::UnicodeSegmentation;
//!
//! fn main() {
//!     let s = "a̐éö̲\r\n";
//!     let g = UnicodeSegmentation::graphemes(s, true).collect::<Vec<&str>>();
//!     let b: &[_] = &["a̐", "é", "ö̲", "\r\n"];
//!     assert_eq!(g, b);
//!
//!     let s = "The quick (\"brown\") fox can't jump 32.3 feet, right?";
//!     let w = s.unicode_words().collect::<Vec<&str>>();
//!     let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
//!     assert_eq!(w, b);
//!
//!     let s = "The quick (\"brown\") fox";
//!     let w = s.split_word_bounds().collect::<Vec<&str>>();
//!     let b: &[_] = &["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", "fox"];
//!     assert_eq!(w, b);
//! }
//! ```
//!
//! # no_std
//!
//! unicode-segmentation does not depend on libstd, so it can be used in crates
//! with the `#![no_std]` attribute.
//!
//! # crates.io
//!
//! You can use this package in your project by adding the following
//! to your `Cargo.toml`:
//!
//! ```toml
//! [dependencies]
//! unicode-segmentation = "1.9.0"
//! ```
51
52	#![deny(missing_docs, unsafe_code)]
53	#![doc(
54	html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
55	html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
56	)]
57	#![no_std]
58
59	#[cfg(test)]
60	#[macro_use]
61	extern crate std;
62
63	#[cfg(test)]
64	#[macro_use]
65	extern crate quickcheck;
66
67	pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
68	pub use grapheme::{GraphemeIndices, Graphemes};
69	pub use sentence::{USentenceBoundIndices, USentenceBounds, UnicodeSentences};
70	pub use tables::UNICODE_VERSION;
71	pub use word::{UWordBoundIndices, UWordBounds, UnicodeWordIndices, UnicodeWords};
72
73	mod grapheme;
74	#[rustfmt::skip]
75	mod tables;
76	mod sentence;
77	mod word;
78
79	#[cfg(test)]
80	mod test;
81	#[cfg(test)]
82	mod testdata;
83
84	/// Methods for segmenting strings according to
85	/// [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/).
86	pub trait UnicodeSegmentation {
87	/// Returns an iterator over the [grapheme clusters][graphemes] of `self`.
88	///
89	/// [graphemes]: http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
90	///
91	/// If `is_extended` is true, the iterator is over the
92	/// extended grapheme clusters;
93	/// otherwise, the iterator is over the legacy grapheme clusters.
94	/// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
95	/// recommends extended grapheme cluster boundaries for general processing.
96	///
97	/// # Examples
98	///
99	/// ```
100	/// # use self::unicode_segmentation::UnicodeSegmentation;
101	/// let gr1 = UnicodeSegmentation::graphemes("a`\u{310}`e`\u{301}`o`\u{308}\u{332}`", `true`)
102	/// .collect::<Vec<&str>>();
103	/// let b: &[_] = &["a`\u{310}`", "e`\u{301}`", "o`\u{308}\u{332}`"];
104	///
105	/// assert_eq!(&gr1[..], b);
106	///
107	/// let gr2 = UnicodeSegmentation::graphemes("a`\r\n`b🇷🇺🇸🇹", `true`).collect::<Vec<&str>>();
108	/// let b: &[_] = &["a", "`\r\n`", "b", "🇷🇺", "🇸🇹"];
109	///
110	/// assert_eq!(&gr2[..], b);
111	/// ```
112	fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
113
114	/// Returns an iterator over the grapheme clusters of `self` and their
115	/// byte offsets. See `graphemes()` for more information.
116	///
117	/// # Examples
118	///
119	/// ```
120	/// # use self::unicode_segmentation::UnicodeSegmentation;
121	/// let gr_inds = UnicodeSegmentation::grapheme_indices("a̐éö̲`\r\n`", `true`)
122	/// .collect::<Vec<(usize, &str)>>();
123	/// let b: &[_] = &[(`0`, "a̐"), (`3`, "é"), (`6`, "ö̲"), (`11`, "`\r\n`")];
124	///
125	/// assert_eq!(&gr_inds[..], b);
126	/// ```
127	fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
128
129	/// Returns an iterator over the words of `self`, separated on
130	/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
131	///
132	/// Here, "words" are just those substrings which, after splitting on
133	/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
134	/// substring must contain at least one character with the
135	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
136	/// property, or with
137	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
138	///
139	/// # Example
140	///
141	/// ```
142	/// # use self::unicode_segmentation::UnicodeSegmentation;
143	/// let uws = "The quick (`\"`brown`\"`) fox can't jump 32.3 feet, right?";
144	/// let uw1 = uws.unicode_words().collect::<Vec<&str>>();
145	/// let b: &[_] = &["The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"];
146	///
147	/// assert_eq!(&uw1[..], b);
148	/// ```
149	fn unicode_words<'a>(&'a self) -> UnicodeWords<'a>;
150
151	/// Returns an iterator over the words of `self`, separated on
152	/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), and their
153	/// offsets.
154	///
155	/// Here, "words" are just those substrings which, after splitting on
156	/// UAX#29 word boundaries, contain any alphanumeric characters. That is, the
157	/// substring must contain at least one character with the
158	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
159	/// property, or with
160	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
161	///
162	/// # Example
163	///
164	/// ```
165	/// # use self::unicode_segmentation::UnicodeSegmentation;
166	/// let uwis = "The quick (`\"`brown`\"`) fox can't jump 32.3 feet, right?";
167	/// let uwi1 = uwis.unicode_word_indices().collect::<Vec<(usize, &str)>>();
168	/// let b: &[_] = &[(`0`, "The"), (`4`, "quick"), (`12`, "brown"), (`20`, "fox"), (`24`, "can't"),
169	/// (`30`, "jump"), (`35`, "32.3"), (`40`, "feet"), (`46`, "right")];
170	///
171	/// assert_eq!(&uwi1[..], b);
172	/// ```
173	fn unicode_word_indices<'a>(&'a self) -> UnicodeWordIndices<'a>;
174
175	/// Returns an iterator over substrings of `self` separated on
176	/// [UAX#29 word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
177	///
178	/// The concatenation of the substrings returned by this function is just the original string.
179	///
180	/// # Example
181	///
182	/// ```
183	/// # use self::unicode_segmentation::UnicodeSegmentation;
184	/// let swu1 = "The quick (`\"`brown`\"`) fox".split_word_bounds().collect::<Vec<&str>>();
185	/// let b: &[_] = &["The", " ", "quick", " ", "(", "`\"`", "brown", "`\"`", ")", " ", "fox"];
186	///
187	/// assert_eq!(&swu1[..], b);
188	/// ```
189	fn split_word_bounds<'a>(&'a self) -> UWordBounds<'a>;
190
191	/// Returns an iterator over substrings of `self`, split on UAX#29 word boundaries,
192	/// and their offsets. See `split_word_bounds()` for more information.
193	///
194	/// # Example
195	///
196	/// ```
197	/// # use self::unicode_segmentation::UnicodeSegmentation;
198	/// let swi1 = "Brr, it's 29.3°F!".split_word_bound_indices().collect::<Vec<(usize, &str)>>();
199	/// let b: &[_] = &[(`0`, "Brr"), (`3`, ","), (`4`, " "), (`5`, "it's"), (`9`, " "), (`10`, "29.3"),
200	/// (`14`, "°"), (`16`, "F"), (`17`, "!")];
201	///
202	/// assert_eq!(&swi1[..], b);
203	/// ```
204	fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
205
206	/// Returns an iterator over substrings of `self` separated on
207	/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
208	///
209	/// Here, "sentences" are just those substrings which, after splitting on
210	/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
211	/// substring must contain at least one character with the
212	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
213	/// property, or with
214	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
215	///
216	/// # Example
217	///
218	/// ```
219	/// # use self::unicode_segmentation::UnicodeSegmentation;
220	/// let uss = "Mr. Fox jumped. [...] The dog was too lazy.";
221	/// let us1 = uss.unicode_sentences().collect::<Vec<&str>>();
222	/// let b: &[_] = &["Mr. ", "Fox jumped. ", "The dog was too lazy."];
223	///
224	/// assert_eq!(&us1[..], b);
225	/// ```
226	fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
227
228	/// Returns an iterator over substrings of `self` separated on
229	/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
230	///
231	/// The concatenation of the substrings returned by this function is just the original string.
232	///
233	/// # Example
234	///
235	/// ```
236	/// # use self::unicode_segmentation::UnicodeSegmentation;
237	/// let ssbs = "Mr. Fox jumped. [...] The dog was too lazy.";
238	/// let ssb1 = ssbs.split_sentence_bounds().collect::<Vec<&str>>();
239	/// let b: &[_] = &["Mr. ", "Fox jumped. ", "[...] ", "The dog was too lazy."];
240	///
241	/// assert_eq!(&ssb1[..], b);
242	/// ```
243	fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
244
245	/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
246	/// and their offsets. See `split_sentence_bounds()` for more information.
247	///
248	/// # Example
249	///
250	/// ```
251	/// # use self::unicode_segmentation::UnicodeSegmentation;
252	/// let ssis = "Mr. Fox jumped. [...] The dog was too lazy.";
253	/// let ssi1 = ssis.split_sentence_bound_indices().collect::<Vec<(usize, &str)>>();
254	/// let b: &[_] = &[(`0`, "Mr. "), (`4`, "Fox jumped. "), (`16`, "[...] "),
255	/// (`22`, "The dog was too lazy.")];
256	///
257	/// assert_eq!(&ssi1[..], b);
258	/// ```
259	fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
260	}
261
262	impl UnicodeSegmentation for str {
263	#[inline]
264	fn graphemes(&self, is_extended: bool) -> Graphemes {
265	grapheme::new_graphemes(self, is_extended)
266	}
267
268	#[inline]
269	fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
270	grapheme::new_grapheme_indices(self, is_extended)
271	}
272
273	#[inline]
274	fn unicode_words(&self) -> UnicodeWords {
275	word::new_unicode_words(self)
276	}
277
278	#[inline]
279	fn unicode_word_indices(&self) -> UnicodeWordIndices {
280	word::new_unicode_word_indices(self)
281	}
282
283	#[inline]
284	fn split_word_bounds(&self) -> UWordBounds {
285	word::new_word_bounds(self)
286	}
287
288	#[inline]
289	fn split_word_bound_indices(&self) -> UWordBoundIndices {
290	word::new_word_bound_indices(self)
291	}
292
293	#[inline]
294	fn unicode_sentences(&self) -> UnicodeSentences {
295	sentence::new_unicode_sentences(self)
296	}
297
298	#[inline]
299	fn split_sentence_bounds(&self) -> USentenceBounds {
300	sentence::new_sentence_bounds(self)
301	}
302
303	#[inline]
304	fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
305	sentence::new_sentence_bound_indices(self)
306	}
307	}
308