word.rs source code [crates/unicode-segmentation/src/word.rs]

1	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2	// file at the top-level directory of this distribution and at
3	// http://rust-lang.org/COPYRIGHT.
4	//
5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8	// option. This file may not be copied, modified, or distributed
9	// except according to those terms.
10
11	use core::cmp;
12	use core::iter::Filter;
13
14	use crate::tables::word::WordCat;
15
16	/// An iterator over the substrings of a string which, after splitting the string on
17	/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18	/// contain any characters with the
19	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20	/// property, or with
21	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22	///
23	/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
24	/// its documentation for more.
25	///
26	/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
27	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
28	pub struct UnicodeWords<'a> {
29	inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
30	}
31
32	impl<'a> Iterator for UnicodeWords<'a> {
33	type Item = &'a str;
34
35	#[inline]
36	fn next(&mut self) -> Option<&'a str> {
37	self.inner.next()
38	}
39	}
40	impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
41	#[inline]
42	fn next_back(&mut self) -> Option<&'a str> {
43	self.inner.next_back()
44	}
45	}
46
47	/// An iterator over the substrings of a string which, after splitting the string on
48	/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
49	/// contain any characters with the
50	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
51	/// property, or with
52	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
53	/// This iterator also provides the byte offsets for each substring.
54	///
55	/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
56	/// its documentation for more.
57	///
58	/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
59	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
60	pub struct UnicodeWordIndices<'a> {
61	inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
62	}
63
64	impl<'a> Iterator for UnicodeWordIndices<'a> {
65	type Item = (usize, &'a str);
66
67	#[inline]
68	fn next(&mut self) -> Option<(usize, &'a str)> {
69	self.inner.next()
70	}
71	}
72	impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
73	#[inline]
74	fn next_back(&mut self) -> Option<(usize, &'a str)> {
75	self.inner.next_back()
76	}
77	}
78
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// Segments can be pulled from either end; each call shrinks the remaining
/// slice by the segment it returns.
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
    // The part of the input that has not been yielded yet.
    string: &'a str,
    // Word category cached by forward iteration for the character at the
    // front of `string`, so it is not looked up again on the next call.
    cat: Option<WordCat>,
    // Word category cached by backward iteration for the character at the
    // back of `string`.
    catb: Option<WordCat>,
}
93
/// External iterator for word boundaries and byte offsets.
///
/// Wraps a [`UWordBounds`] and pairs every segment with its byte offset from
/// the start of the string the iterator was created over.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string; a segment's offset is
    // its own address minus this value.
    start_offset: usize,
    // The underlying segment iterator.
    iter: UWordBounds<'a>,
}
106
107	impl<'a> UWordBoundIndices<'a> {
108	#[inline]
109	/// View the underlying data (the part yet to be iterated) as a slice of the original string.
110	///
111	/// ```rust
112	/// # use unicode_segmentation::UnicodeSegmentation;
113	/// let mut iter = "Hello world".split_word_bound_indices();
114	/// assert_eq!(iter.as_str(), "Hello world");
115	/// iter.next();
116	/// assert_eq!(iter.as_str(), " world");
117	/// iter.next();
118	/// assert_eq!(iter.as_str(), "world");
119	/// ```
120	pub fn as_str(&self) -> &'a str {
121	self.iter.as_str()
122	}
123	}
124
125	impl<'a> Iterator for UWordBoundIndices<'a> {
126	type Item = (usize, &'a str);
127
128	#[inline]
129	fn next(&mut self) -> Option<(usize, &'a str)> {
130	self.iter
131	.next()
132	.map(\|s: &str\| (s.as_ptr() as usize - self.start_offset, s))
133	}
134
135	#[inline]
136	fn size_hint(&self) -> (usize, Option<usize>) {
137	self.iter.size_hint()
138	}
139	}
140
141	impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
142	#[inline]
143	fn next_back(&mut self) -> Option<(usize, &'a str)> {
144	self.iter
145	.next_back()
146	.map(\|s: &str\| (s.as_ptr() as usize - self.start_offset, s))
147	}
148	}
149
// State machine for the word boundary rules.
//
// Each variant records what the scanner has just consumed while a segment is
// being assembled; the forward and backward iterators both start every call
// in `Start`.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // Nothing consumed yet for the current segment.
    Start,
    // Last significant character was an ALetter.
    Letter,
    // Last significant character was a Hebrew_Letter.
    HLetter,
    // Last significant character was Numeric.
    Numeric,
    // Last significant character was Katakana.
    Katakana,
    // Last significant character was ExtendNumLet.
    ExtendNumLet,
    // Inside a run of Regional_Indicator characters; see `RegionalState`.
    Regional(RegionalState),
    // Scanning Format/Extend characters or waiting on a look-ahead
    // requirement; see `FormatExtendType`.
    FormatExtend(FormatExtendType),
    // Forward: the segment started with a ZWJ. Backward: an emoji was
    // consumed and a ZWJ before it would continue the sequence.
    Zwj,
    // An emoji joined to the preceding ZWJ was consumed (rule WB3c).
    Emoji,
    // Last significant character was WSegSpace (rule WB3d).
    WSegSpace,
}
165
// Subtypes for the FormatExtend state in UWordBoundsState.
//
// The `Require*` variants mark a tentative extension past a mid-word character:
// if the required category does not follow, the iterator rewinds to the index
// it saved before that character.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Backward iteration only: treated like `Start` for the next character.
    AcceptAny,
    // A run of Format/Extend/ZWJ characters is being collected; nothing else
    // may join the segment through this state.
    AcceptNone,
    // An ALetter or Hebrew_Letter must come next (rules WB6/WB7).
    RequireLetter,
    // A Hebrew_Letter must come next (rules WB7b/WB7c).
    RequireHLetter,
    // A single quote next to a Hebrew_Letter context; a letter may follow but
    // is not required (rule WB7a).
    AcceptQLetter,
    // A Numeric must come next (rules WB11/WB12).
    RequireNumeric,
}
176
// Progress through a pair of Regional_Indicator characters (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator consumed; a second one may still pair with it.
    Half,
    // A complete pair consumed; another indicator starts a new segment.
    Full,
    // Backward iteration only: the pairing is not known until the indicators
    // preceding the current position have been counted.
    Unknown,
}
183
184	fn is_emoji(ch: char) -> bool {
185	use crate::tables::emoji;
186	emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
187	}
188
189	impl<'a> Iterator for UWordBounds<'a> {
190	type Item = &'a str;
191
192	#[inline]
193	fn size_hint(&self) -> (usize, Option<usize>) {
194	let slen = self.string.len();
195	(cmp::min(slen, `1`), Some(slen))
196	}
197
198	#[inline]
199	fn next(&mut self) -> Option<&'a str> {
200	use self::FormatExtendType::*;
201	use self::UWordBoundsState::*;
202	use crate::tables::word as wd;
203	if self.string.len() == `0` {
204	return None;
205	}
206
207	let mut take_curr = `true`;
208	let mut take_cat = `true`;
209	let mut idx = `0`;
210	let mut saveidx = `0`;
211	let mut state = Start;
212	let mut cat = wd::WC_Any;
213	let mut savecat = wd::WC_Any;
214
215	// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
216	let mut skipped_format_extend = `false`;
217	for (curr, ch) in self.string.char_indices() {
218	idx = curr;
219	// Whether or not the previous category was ZWJ
220	// ZWJs get collapsed, so this handles precedence of WB3c over WB4
221	let prev_zwj = cat == wd::WC_ZWJ;
222	// if there's a category cached, grab it
223	cat = match self.cat {
224	None => wd::word_category(ch).2,
225	_ => self.cat.take().unwrap(),
226	};
227	take_cat = `true`;
228
229	// handle rule WB4
230	// just skip all format, extend, and zwj chars
231	// note that Start is a special case: if there's a bunch of Format \| Extend
232	// characters at the beginning of a block of text, dump them out as one unit.
233	//
234	// (This is not obvious from the wording of UAX#29, but if you look at the
235	// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
236	// then the "correct" interpretation of WB4 becomes apparent.)
237	if state != Start {
238	match cat {
239	wd::WC_Extend \| wd::WC_Format \| wd::WC_ZWJ => {
240	skipped_format_extend = `true`;
241	continue;
242	}
243	_ => {}
244	}
245	}
246
247	// rule WB3c
248	// WB4 makes all ZWJs collapse into the previous state
249	// but you can still be in a Zwj state if you started with Zwj
250	//
251	// This means that an EP + Zwj will collapse into EP, which is wrong,
252	// since EP+EP is not a boundary but EP+ZWJ+EP is
253	//
254	// Thus, we separately keep track of whether or not the last character
255	// was a ZWJ. This is an additional bit of state tracked outside of the
256	// state enum; the state enum represents the last non-zwj state encountered.
257	// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
258	// however we are in the previous state for the purposes of all other rules.
259	if prev_zwj {
260	if is_emoji(ch) {
261	state = Emoji;
262	continue;
263	}
264	}
265	// Don't use `continue` in this match without updating `cat`
266	state = match state {
267	Start if cat == wd::WC_CR => {
268	idx += match self.get_next_cat(idx) {
269	Some(ncat) if ncat == wd::WC_LF => `1`, // rule WB3
270	_ => `0`,
271	};
272	break; // rule WB3a
273	}
274	Start => match cat {
275	wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
276	wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
277	wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
278	wd::WC_Katakana => Katakana, // rule WB13, WB13a
279	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
280	wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
281	wd::WC_LF \| wd::WC_Newline => break, // rule WB3a
282	wd::WC_ZWJ => Zwj, // rule WB3c
283	wd::WC_WSegSpace => WSegSpace, // rule WB3d
284	_ => {
285	if let Some(ncat) = self.get_next_cat(idx) {
286	// rule WB4
287	if ncat == wd::WC_Format \|\| ncat == wd::WC_Extend \|\| ncat == wd::WC_ZWJ
288	{
289	state = FormatExtend(AcceptNone);
290	self.cat = Some(ncat);
291	continue;
292	}
293	}
294	break; // rule WB999
295	}
296	},
297	WSegSpace => match cat {
298	wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
299	_ => {
300	take_curr = `false`;
301	break;
302	}
303	},
304	Zwj => {
305	// We already handle WB3c above.
306	take_curr = `false`;
307	break;
308	}
309	Letter \| HLetter => match cat {
310	wd::WC_ALetter => Letter, // rule WB5
311	wd::WC_Hebrew_Letter => HLetter, // rule WB5
312	wd::WC_Numeric => Numeric, // rule WB9
313	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
314	wd::WC_Double_Quote if state == HLetter => {
315	savecat = cat;
316	saveidx = idx;
317	FormatExtend(RequireHLetter) // rule WB7b
318	}
319	wd::WC_Single_Quote if state == HLetter => {
320	FormatExtend(AcceptQLetter) // rule WB7a
321	}
322	wd::WC_MidLetter \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
323	savecat = cat;
324	saveidx = idx;
325	FormatExtend(RequireLetter) // rule WB6
326	}
327	_ => {
328	take_curr = `false`;
329	break;
330	}
331	},
332	Numeric => match cat {
333	wd::WC_Numeric => Numeric, // rule WB8
334	wd::WC_ALetter => Letter, // rule WB10
335	wd::WC_Hebrew_Letter => HLetter, // rule WB10
336	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
337	wd::WC_MidNum \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
338	savecat = cat;
339	saveidx = idx;
340	FormatExtend(RequireNumeric) // rule WB12
341	}
342	_ => {
343	take_curr = `false`;
344	break;
345	}
346	},
347	Katakana => match cat {
348	wd::WC_Katakana => Katakana, // rule WB13
349	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
350	_ => {
351	take_curr = `false`;
352	break;
353	}
354	},
355	ExtendNumLet => match cat {
356	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
357	wd::WC_ALetter => Letter, // rule WB13b
358	wd::WC_Hebrew_Letter => HLetter, // rule WB13b
359	wd::WC_Numeric => Numeric, // rule WB13b
360	wd::WC_Katakana => Katakana, // rule WB13b
361	_ => {
362	take_curr = `false`;
363	break;
364	}
365	},
366	Regional(RegionalState::Full) => {
367	// if it reaches here we've gone too far,
368	// a full flag can only compose with ZWJ/Extend/Format
369	// proceeding it.
370	take_curr = `false`;
371	break;
372	}
373	Regional(RegionalState::Half) => match cat {
374	wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
375	_ => {
376	take_curr = `false`;
377	break;
378	}
379	},
380	Regional(_) => {
381	unreachable!("RegionalState::Unknown should not occur on forward iteration")
382	}
383	Emoji => {
384	// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
385	take_curr = `false`;
386	break;
387	}
388	FormatExtend(t) => match t {
389	// handle FormatExtends depending on what type
390	RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
391	RequireLetter \| AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
392	RequireLetter \| AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
393	RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
394	AcceptNone \| AcceptQLetter => {
395	take_curr = `false`; // emit all the Format\|Extend characters
396	take_cat = `false`;
397	break;
398	}
399	_ => break, // rewind (in if statement below)
400	},
401	}
402	}
403
404	if let FormatExtend(t) = state {
405	// we were looking for something and didn't find it; we have to back up
406	if t == RequireLetter \|\| t == RequireHLetter \|\| t == RequireNumeric {
407	idx = saveidx;
408	cat = savecat;
409	take_curr = `false`;
410	}
411	}
412
413	self.cat = if take_curr {
414	idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
415	None
416	} else if take_cat {
417	Some(cat)
418	} else {
419	None
420	};
421
422	let retstr = &self.string[..idx];
423	self.string = &self.string[idx..];
424	Some(retstr)
425	}
426	}
427
428	impl<'a> DoubleEndedIterator for UWordBounds<'a> {
429	#[inline]
430	fn next_back(&mut self) -> Option<&'a str> {
431	use self::FormatExtendType::*;
432	use self::UWordBoundsState::*;
433	use crate::tables::word as wd;
434	if self.string.len() == `0` {
435	return None;
436	}
437
438	let mut take_curr = `true`;
439	let mut take_cat = `true`;
440	let mut idx = self.string.len();
441	idx -= self.string.chars().next_back().unwrap().len_utf8();
442	let mut previdx = idx;
443	let mut saveidx = idx;
444	let mut state = Start;
445	let mut savestate = Start;
446	let mut cat = wd::WC_Any;
447
448	let mut skipped_format_extend = `false`;
449
450	for (curr, ch) in self.string.char_indices().rev() {
451	previdx = idx;
452	idx = curr;
453
454	// if there's a category cached, grab it
455	cat = match self.catb {
456	None => wd::word_category(ch).2,
457	_ => self.catb.take().unwrap(),
458	};
459	take_cat = `true`;
460
461	// backward iterator over word boundaries. Mostly the same as the forward
462	// iterator, with two weirdnesses:
463	// (1) If we encounter a single quote in the Start state, we have to check for a
464	// Hebrew Letter immediately before it.
465	// (2) Format and Extend char handling takes some gymnastics.
466
467	if cat == wd::WC_Extend \|\| cat == wd::WC_Format \|\| (cat == wd::WC_ZWJ && state != Zwj) {
468	// WB3c has more priority so we should not
469	// fold in that case
470	if match state {
471	FormatExtend(_) \| Start => `false`,
472	_ => `true`,
473	} {
474	saveidx = previdx;
475	savestate = state;
476	state = FormatExtend(AcceptNone);
477	}
478
479	if state != Start {
480	continue;
481	}
482	} else if state == FormatExtend(AcceptNone) {
483	// finished a scan of some Format\|Extend chars, restore previous state
484	state = savestate;
485	previdx = saveidx;
486	take_cat = `false`;
487	skipped_format_extend = `true`;
488	}
489
490	// Don't use `continue` in this match without updating `catb`
491	state = match state {
492	Start \| FormatExtend(AcceptAny) => match cat {
493	_ if is_emoji(ch) => Zwj,
494	wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
495	wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
496	wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
497	wd::WC_Katakana => Katakana, // rule WB13, WB13b
498	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
499	wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
500	// rule WB4:
501	wd::WC_Extend \| wd::WC_Format \| wd::WC_ZWJ => FormatExtend(AcceptAny),
502	wd::WC_Single_Quote => {
503	saveidx = idx;
504	FormatExtend(AcceptQLetter) // rule WB7a
505	}
506	wd::WC_WSegSpace => WSegSpace,
507	wd::WC_CR \| wd::WC_LF \| wd::WC_Newline => {
508	if state == Start {
509	if cat == wd::WC_LF {
510	idx -= match self.get_prev_cat(idx) {
511	Some(pcat) if pcat == wd::WC_CR => `1`, // rule WB3
512	_ => `0`,
513	};
514	}
515	} else {
516	take_curr = `false`;
517	}
518	break; // rule WB3a
519	}
520	_ => break, // rule WB999
521	},
522	Zwj => match cat {
523	// rule WB3c
524	wd::WC_ZWJ => FormatExtend(AcceptAny),
525	_ => {
526	take_curr = `false`;
527	break;
528	}
529	},
530	WSegSpace => match cat {
531	// rule WB3d
532	wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
533	_ => {
534	take_curr = `false`;
535	break;
536	}
537	},
538	Letter \| HLetter => match cat {
539	wd::WC_ALetter => Letter, // rule WB5
540	wd::WC_Hebrew_Letter => HLetter, // rule WB5
541	wd::WC_Numeric => Numeric, // rule WB10
542	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
543	wd::WC_Double_Quote if state == HLetter => {
544	saveidx = previdx;
545	FormatExtend(RequireHLetter) // rule WB7c
546	}
547	wd::WC_MidLetter \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
548	saveidx = previdx;
549	FormatExtend(RequireLetter) // rule WB7
550	}
551	_ => {
552	take_curr = `false`;
553	break;
554	}
555	},
556	Numeric => match cat {
557	wd::WC_Numeric => Numeric, // rule WB8
558	wd::WC_ALetter => Letter, // rule WB9
559	wd::WC_Hebrew_Letter => HLetter, // rule WB9
560	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
561	wd::WC_MidNum \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
562	saveidx = previdx;
563	FormatExtend(RequireNumeric) // rule WB11
564	}
565	_ => {
566	take_curr = `false`;
567	break;
568	}
569	},
570	Katakana => match cat {
571	wd::WC_Katakana => Katakana, // rule WB13
572	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
573	_ => {
574	take_curr = `false`;
575	break;
576	}
577	},
578	ExtendNumLet => match cat {
579	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
580	wd::WC_ALetter => Letter, // rule WB13a
581	wd::WC_Hebrew_Letter => HLetter, // rule WB13a
582	wd::WC_Numeric => Numeric, // rule WB13a
583	wd::WC_Katakana => Katakana, // rule WB13a
584	_ => {
585	take_curr = `false`;
586	break;
587	}
588	},
589	Regional(mut regional_state) => match cat {
590	// rule WB13c
591	wd::WC_Regional_Indicator => {
592	if regional_state == RegionalState::Unknown {
593	let count = self.string[..previdx]
594	.chars()
595	.rev()
596	.map(\|c\| wd::word_category(c).2)
597	.filter(\|&c\| {
598	!(c == wd::WC_ZWJ \|\| c == wd::WC_Extend \|\| c == wd::WC_Format)
599	})
600	.take_while(\|&c\| c == wd::WC_Regional_Indicator)
601	.count();
602	regional_state = if count % `2` == `0` {
603	RegionalState::Full
604	} else {
605	RegionalState::Half
606	};
607	}
608	if regional_state == RegionalState::Full {
609	take_curr = `false`;
610	break;
611	} else {
612	Regional(RegionalState::Full)
613	}
614	}
615	_ => {
616	take_curr = `false`;
617	break;
618	}
619	},
620	Emoji => {
621	if is_emoji(ch) {
622	// rule WB3c
623	Zwj
624	} else {
625	take_curr = `false`;
626	break;
627	}
628	}
629	FormatExtend(t) => match t {
630	RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
631	RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
632	RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
633	AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
634	RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
635	_ => break, // backtrack will happens
636	},
637	}
638	}
639
640	if let FormatExtend(t) = state {
641	// if we required something but didn't find it, backtrack
642	if t == RequireLetter
643	\|\| t == RequireHLetter
644	\|\| t == RequireNumeric
645	\|\| t == AcceptNone
646	\|\| t == AcceptQLetter
647	{
648	previdx = saveidx;
649	take_cat = `false`;
650	take_curr = `false`;
651	}
652	}
653
654	self.catb = if take_curr {
655	None
656	} else {
657	idx = previdx;
658	if take_cat {
659	Some(cat)
660	} else {
661	None
662	}
663	};
664
665	let retstr = &self.string[idx..];
666	self.string = &self.string[..idx];
667	Some(retstr)
668	}
669	}
670
671	impl<'a> UWordBounds<'a> {
672	#[inline]
673	/// View the underlying data (the part yet to be iterated) as a slice of the original string.
674	///
675	/// ```rust
676	/// # use unicode_segmentation::UnicodeSegmentation;
677	/// let mut iter = "Hello world".split_word_bounds();
678	/// assert_eq!(iter.as_str(), "Hello world");
679	/// iter.next();
680	/// assert_eq!(iter.as_str(), " world");
681	/// iter.next();
682	/// assert_eq!(iter.as_str(), "world");
683	/// ```
684	pub fn as_str(&self) -> &'a str {
685	self.string
686	}
687
688	#[inline]
689	fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
690	use crate::tables::word as wd;
691	let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
692	if nidx < self.string.len() {
693	let nch = self.string[nidx..].chars().next().unwrap();
694	Some(wd::word_category(nch).2)
695	} else {
696	None
697	}
698	}
699
700	#[inline]
701	fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
702	use crate::tables::word as wd;
703	if idx > `0` {
704	let nch = self.string[..idx].chars().next_back().unwrap();
705	Some(wd::word_category(nch).2)
706	} else {
707	None
708	}
709	}
710	}
711
712	#[inline]
713	pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> {
714	UWordBounds {
715	string: s,
716	cat: None,
717	catb: None,
718	}
719	}
720
721	#[inline]
722	pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> {
723	UWordBoundIndices {
724	start_offset: s.as_ptr() as usize,
725	iter: new_word_bounds(s),
726	}
727	}
728
729	#[inline]
730	fn has_alphanumeric(s: &&str) -> bool {
731	use crate::tables::util::is_alphanumeric;
732
733	s.chars().any(\|c: char\| is_alphanumeric(c))
734	}
735
736	#[inline]
737	pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> {
738	use super::UnicodeSegmentation;
739
740	UnicodeWords {
741	inner: s.split_word_bounds().filter(has_alphanumeric),
742	}
743	}
744
745	#[inline]
746	pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> {
747	use super::UnicodeSegmentation;
748
749	UnicodeWordIndices {
750	inner: sUWordBoundIndices<'_>
751	.split_word_bound_indices()
752	.filter(\|(_, c: &&str)\| has_alphanumeric(c)),
753	}
754	}
755