// Source: crates/unicode_segmentation/src/word.rs

1	// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2	// file at the top-level directory of this distribution and at
3	// http://rust-lang.org/COPYRIGHT.
4	//
5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8	// option. This file may not be copied, modified, or distributed
9	// except according to those terms.
10
11	use core::cmp;
12	use core::iter::Filter;
13
14	use crate::tables::word::WordCat;
15
16	/// An iterator over the substrings of a string which, after splitting the string on
17	/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
18	/// contain any characters with the
19	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
20	/// property, or with
21	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
22	///
23	/// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See
24	/// its documentation for more.
25	///
26	/// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words
27	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
28	#[derive(Debug)]
29	pub struct UnicodeWords<'a> {
30	inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>,
31	}
32
33	impl<'a> Iterator for UnicodeWords<'a> {
34	type Item = &'a str;
35
36	#[inline]
37	fn next(&mut self) -> Option<&'a str> {
38	self.inner.next()
39	}
40
41	#[inline]
42	fn size_hint(&self) -> (usize, Option<usize>) {
43	self.inner.size_hint()
44	}
45	}
46	impl<'a> DoubleEndedIterator for UnicodeWords<'a> {
47	#[inline]
48	fn next_back(&mut self) -> Option<&'a str> {
49	self.inner.next_back()
50	}
51	}
52
53	/// An iterator over the substrings of a string which, after splitting the string on
54	/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries),
55	/// contain any characters with the
56	/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
57	/// property, or with
58	/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
59	/// This iterator also provides the byte offsets for each substring.
60	///
61	/// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See
62	/// its documentation for more.
63	///
64	/// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices
65	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
66	#[derive(Debug)]
67	pub struct UnicodeWordIndices<'a> {
68	#[allow(clippy::type_complexity)]
69	inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>,
70	}
71
72	impl<'a> Iterator for UnicodeWordIndices<'a> {
73	type Item = (usize, &'a str);
74
75	#[inline]
76	fn next(&mut self) -> Option<(usize, &'a str)> {
77	self.inner.next()
78	}
79
80	#[inline]
81	fn size_hint(&self) -> (usize, Option<usize>) {
82	self.inner.size_hint()
83	}
84	}
85	impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> {
86	#[inline]
87	fn next_back(&mut self) -> Option<(usize, &'a str)> {
88	self.inner.next_back()
89	}
90	}
91
92	/// External iterator for a string's
93	/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
94	///
95	/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
96	/// trait. See its documentation for more.
97	///
98	/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
99	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
100	#[derive(Debug, Clone)]
101	pub struct UWordBounds<'a> {
102	string: &'a str,
103	cat: Option<WordCat>,
104	catb: Option<WordCat>,
105	}
106
107	/// External iterator for word boundaries and byte offsets.
108	///
109	/// This struct is created by the [`split_word_bound_indices`] method on the
110	/// [`UnicodeSegmentation`] trait. See its documentation for more.
111	///
112	/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
113	/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
114	#[derive(Debug, Clone)]
115	pub struct UWordBoundIndices<'a> {
116	start_offset: usize,
117	iter: UWordBounds<'a>,
118	}
119
120	impl<'a> UWordBoundIndices<'a> {
121	#[inline]
122	/// View the underlying data (the part yet to be iterated) as a slice of the original string.
123	///
124	/// ```rust
125	/// # use unicode_segmentation::UnicodeSegmentation;
126	/// let mut iter = "Hello world".split_word_bound_indices();
127	/// assert_eq!(iter.as_str(), "Hello world");
128	/// iter.next();
129	/// assert_eq!(iter.as_str(), " world");
130	/// iter.next();
131	/// assert_eq!(iter.as_str(), "world");
132	/// ```
133	pub fn as_str(&self) -> &'a str {
134	self.iter.as_str()
135	}
136	}
137
138	impl<'a> Iterator for UWordBoundIndices<'a> {
139	type Item = (usize, &'a str);
140
141	#[inline]
142	fn next(&mut self) -> Option<(usize, &'a str)> {
143	self.iter
144	.next()
145	.map(\|s: &'a str\| (s.as_ptr() as usize - self.start_offset, s))
146	}
147
148	#[inline]
149	fn size_hint(&self) -> (usize, Option<usize>) {
150	self.iter.size_hint()
151	}
152	}
153
154	impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> {
155	#[inline]
156	fn next_back(&mut self) -> Option<(usize, &'a str)> {
157	self.iter
158	.next_back()
159	.map(\|s: &'a str\| (s.as_ptr() as usize - self.start_offset, s))
160	}
161	}
162
163	// state machine for word boundary rules
164	#[derive(Clone, Copy, PartialEq, Eq, Debug)]
165	enum UWordBoundsState {
166	Start,
167	Letter,
168	HLetter,
169	Numeric,
170	Katakana,
171	ExtendNumLet,
172	Regional(RegionalState),
173	FormatExtend(FormatExtendType),
174	Zwj,
175	Emoji,
176	WSegSpace,
177	}
178
// Payload of `UWordBoundsState::FormatExtend`: what the state machine is willing to
// accept, or must see, after the character(s) that put it into that state.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    AcceptAny,
    AcceptNone,
    // A letter must follow, otherwise the scan is rewound.
    RequireLetter,
    // A Hebrew letter must follow, otherwise the scan is rewound.
    RequireHLetter,
    AcceptQLetter,
    // A numeric character must follow, otherwise the scan is rewound.
    RequireNumeric,
}
189
// Payload of `UWordBoundsState::Regional`: pairing status of regional indicators.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator of a pair has been seen.
    Half,
    // A pair is complete; another indicator starts a new segment.
    Full,
    // Backward iteration only: pairing has not been determined yet.
    Unknown,
}
196
197	fn is_emoji(ch: char) -> bool {
198	use crate::tables::emoji;
199	emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic
200	}
201
202	impl<'a> Iterator for UWordBounds<'a> {
203	type Item = &'a str;
204
205	#[inline]
206	fn size_hint(&self) -> (usize, Option<usize>) {
207	let slen = self.string.len();
208	(cmp::min(slen, `1`), Some(slen))
209	}
210
211	#[inline]
212	fn next(&mut self) -> Option<&'a str> {
213	use self::FormatExtendType::*;
214	use self::UWordBoundsState::*;
215	use crate::tables::word as wd;
216	if self.string.is_empty() {
217	return None;
218	}
219
220	let mut take_curr = `true`;
221	let mut take_cat = `true`;
222	let mut idx = `0`;
223	let mut saveidx = `0`;
224	let mut state = Start;
225	let mut cat = wd::WC_Any;
226	let mut savecat = wd::WC_Any;
227
228	// If extend/format/zwj were skipped. Handles precedence of WB3d over WB4
229	let mut skipped_format_extend = `false`;
230	for (curr, ch) in self.string.char_indices() {
231	idx = curr;
232	// Whether or not the previous category was ZWJ
233	// ZWJs get collapsed, so this handles precedence of WB3c over WB4
234	let prev_zwj = cat == wd::WC_ZWJ;
235	// if there's a category cached, grab it
236	cat = match self.cat {
237	None => wd::word_category(ch).2,
238	_ => self.cat.take().unwrap(),
239	};
240	take_cat = `true`;
241
242	// handle rule WB4
243	// just skip all format, extend, and zwj chars
244	// note that Start is a special case: if there's a bunch of Format \| Extend
245	// characters at the beginning of a block of text, dump them out as one unit.
246	//
247	// (This is not obvious from the wording of UAX#29, but if you look at the
248	// test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt
249	// then the "correct" interpretation of WB4 becomes apparent.)
250	if state != Start {
251	match cat {
252	wd::WC_Extend \| wd::WC_Format \| wd::WC_ZWJ => {
253	skipped_format_extend = `true`;
254	continue;
255	}
256	_ => {}
257	}
258	}
259
260	// rule WB3c
261	// WB4 makes all ZWJs collapse into the previous state
262	// but you can still be in a Zwj state if you started with Zwj
263	//
264	// This means that an EP + Zwj will collapse into EP, which is wrong,
265	// since EP+EP is not a boundary but EP+ZWJ+EP is
266	//
267	// Thus, we separately keep track of whether or not the last character
268	// was a ZWJ. This is an additional bit of state tracked outside of the
269	// state enum; the state enum represents the last non-zwj state encountered.
270	// When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state,
271	// however we are in the previous state for the purposes of all other rules.
272	if prev_zwj && is_emoji(ch) {
273	state = Emoji;
274	continue;
275	}
276	// Don't use `continue` in this match without updating `cat`
277	state = match state {
278	Start if cat == wd::WC_CR => {
279	idx += match self.get_next_cat(idx) {
280	Some(wd::WC_LF) => `1`, // rule WB3
281	_ => `0`,
282	};
283	break; // rule WB3a
284	}
285	Start => match cat {
286	wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a
287	wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a
288	wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a
289	wd::WC_Katakana => Katakana, // rule WB13, WB13a
290	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b
291	wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c
292	wd::WC_LF \| wd::WC_Newline => break, // rule WB3a
293	wd::WC_ZWJ => Zwj, // rule WB3c
294	wd::WC_WSegSpace => WSegSpace, // rule WB3d
295	_ => {
296	if let Some(ncat) = self.get_next_cat(idx) {
297	// rule WB4
298	if ncat == wd::WC_Format \|\| ncat == wd::WC_Extend \|\| ncat == wd::WC_ZWJ
299	{
300	state = FormatExtend(AcceptNone);
301	self.cat = Some(ncat);
302	continue;
303	}
304	}
305	break; // rule WB999
306	}
307	},
308	WSegSpace => match cat {
309	wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
310	_ => {
311	take_curr = `false`;
312	break;
313	}
314	},
315	Zwj => {
316	// We already handle WB3c above.
317	take_curr = `false`;
318	break;
319	}
320	Letter \| HLetter => match cat {
321	wd::WC_ALetter => Letter, // rule WB5
322	wd::WC_Hebrew_Letter => HLetter, // rule WB5
323	wd::WC_Numeric => Numeric, // rule WB9
324	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
325	wd::WC_Double_Quote if state == HLetter => {
326	savecat = cat;
327	saveidx = idx;
328	FormatExtend(RequireHLetter) // rule WB7b
329	}
330	wd::WC_Single_Quote if state == HLetter => {
331	FormatExtend(AcceptQLetter) // rule WB7a
332	}
333	wd::WC_MidLetter \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
334	savecat = cat;
335	saveidx = idx;
336	FormatExtend(RequireLetter) // rule WB6
337	}
338	_ => {
339	take_curr = `false`;
340	break;
341	}
342	},
343	Numeric => match cat {
344	wd::WC_Numeric => Numeric, // rule WB8
345	wd::WC_ALetter => Letter, // rule WB10
346	wd::WC_Hebrew_Letter => HLetter, // rule WB10
347	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
348	wd::WC_MidNum \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
349	savecat = cat;
350	saveidx = idx;
351	FormatExtend(RequireNumeric) // rule WB12
352	}
353	_ => {
354	take_curr = `false`;
355	break;
356	}
357	},
358	Katakana => match cat {
359	wd::WC_Katakana => Katakana, // rule WB13
360	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
361	_ => {
362	take_curr = `false`;
363	break;
364	}
365	},
366	ExtendNumLet => match cat {
367	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
368	wd::WC_ALetter => Letter, // rule WB13b
369	wd::WC_Hebrew_Letter => HLetter, // rule WB13b
370	wd::WC_Numeric => Numeric, // rule WB13b
371	wd::WC_Katakana => Katakana, // rule WB13b
372	_ => {
373	take_curr = `false`;
374	break;
375	}
376	},
377	Regional(RegionalState::Full) => {
378	// if it reaches here we've gone too far,
379	// a full flag can only compose with ZWJ/Extend/Format
380	// proceeding it.
381	take_curr = `false`;
382	break;
383	}
384	Regional(RegionalState::Half) => match cat {
385	wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c
386	_ => {
387	take_curr = `false`;
388	break;
389	}
390	},
391	Regional(_) => {
392	unreachable!("RegionalState::Unknown should not occur on forward iteration")
393	}
394	Emoji => {
395	// We already handle WB3c above. If you've reached this point, the emoji sequence is over.
396	take_curr = `false`;
397	break;
398	}
399	FormatExtend(t) => match t {
400	// handle FormatExtends depending on what type
401	RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11
402	RequireLetter \| AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7
403	RequireLetter \| AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a
404	RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
405	AcceptNone \| AcceptQLetter => {
406	take_curr = `false`; // emit all the Format\|Extend characters
407	take_cat = `false`;
408	break;
409	}
410	_ => break, // rewind (in if statement below)
411	},
412	}
413	}
414
415	if let FormatExtend(t) = state {
416	// we were looking for something and didn't find it; we have to back up
417	if t == RequireLetter \|\| t == RequireHLetter \|\| t == RequireNumeric {
418	idx = saveidx;
419	cat = savecat;
420	take_curr = `false`;
421	}
422	}
423
424	self.cat = if take_curr {
425	idx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
426	None
427	} else if take_cat {
428	Some(cat)
429	} else {
430	None
431	};
432
433	let retstr = &self.string[..idx];
434	self.string = &self.string[idx..];
435	Some(retstr)
436	}
437	}
438
439	impl<'a> DoubleEndedIterator for UWordBounds<'a> {
440	#[inline]
441	fn next_back(&mut self) -> Option<&'a str> {
442	use self::FormatExtendType::*;
443	use self::UWordBoundsState::*;
444	use crate::tables::word as wd;
445	if self.string.is_empty() {
446	return None;
447	}
448
449	let mut take_curr = `true`;
450	let mut take_cat = `true`;
451	let mut idx = self.string.len();
452	idx -= self.string.chars().next_back().unwrap().len_utf8();
453	let mut previdx = idx;
454	let mut saveidx = idx;
455	let mut state = Start;
456	let mut savestate = Start;
457	let mut cat = wd::WC_Any;
458
459	let mut skipped_format_extend = `false`;
460
461	for (curr, ch) in self.string.char_indices().rev() {
462	previdx = idx;
463	idx = curr;
464
465	// if there's a category cached, grab it
466	cat = match self.catb {
467	None => wd::word_category(ch).2,
468	_ => self.catb.take().unwrap(),
469	};
470	take_cat = `true`;
471
472	// backward iterator over word boundaries. Mostly the same as the forward
473	// iterator, with two weirdnesses:
474	// (1) If we encounter a single quote in the Start state, we have to check for a
475	// Hebrew Letter immediately before it.
476	// (2) Format and Extend char handling takes some gymnastics.
477
478	if cat == wd::WC_Extend \|\| cat == wd::WC_Format \|\| (cat == wd::WC_ZWJ && state != Zwj) {
479	// WB3c has more priority so we should not
480	// fold in that case
481	if !matches!(state, FormatExtend(_) \| Start) {
482	saveidx = previdx;
483	savestate = state;
484	state = FormatExtend(AcceptNone);
485	}
486
487	if state != Start {
488	continue;
489	}
490	} else if state == FormatExtend(AcceptNone) {
491	// finished a scan of some Format\|Extend chars, restore previous state
492	state = savestate;
493	previdx = saveidx;
494	take_cat = `false`;
495	skipped_format_extend = `true`;
496	}
497
498	// Don't use `continue` in this match without updating `catb`
499	state = match state {
500	Start \| FormatExtend(AcceptAny) => match cat {
501	_ if is_emoji(ch) => Zwj,
502	wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b
503	wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b
504	wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b
505	wd::WC_Katakana => Katakana, // rule WB13, WB13b
506	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
507	wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c
508	// rule WB4:
509	wd::WC_Extend \| wd::WC_Format \| wd::WC_ZWJ => FormatExtend(AcceptAny),
510	wd::WC_Single_Quote => {
511	saveidx = idx;
512	FormatExtend(AcceptQLetter) // rule WB7a
513	}
514	wd::WC_WSegSpace => WSegSpace,
515	wd::WC_CR \| wd::WC_LF \| wd::WC_Newline => {
516	if state == Start {
517	if cat == wd::WC_LF {
518	idx -= match self.get_prev_cat(idx) {
519	Some(wd::WC_CR) => `1`, // rule WB3
520	_ => `0`,
521	};
522	}
523	} else {
524	take_curr = `false`;
525	}
526	break; // rule WB3a
527	}
528	_ => break, // rule WB999
529	},
530	Zwj => match cat {
531	// rule WB3c
532	wd::WC_ZWJ => FormatExtend(AcceptAny),
533	_ => {
534	take_curr = `false`;
535	break;
536	}
537	},
538	WSegSpace => match cat {
539	// rule WB3d
540	wd::WC_WSegSpace if !skipped_format_extend => WSegSpace,
541	_ => {
542	take_curr = `false`;
543	break;
544	}
545	},
546	Letter \| HLetter => match cat {
547	wd::WC_ALetter => Letter, // rule WB5
548	wd::WC_Hebrew_Letter => HLetter, // rule WB5
549	wd::WC_Numeric => Numeric, // rule WB10
550	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
551	wd::WC_Double_Quote if state == HLetter => {
552	saveidx = previdx;
553	FormatExtend(RequireHLetter) // rule WB7c
554	}
555	wd::WC_MidLetter \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
556	saveidx = previdx;
557	FormatExtend(RequireLetter) // rule WB7
558	}
559	_ => {
560	take_curr = `false`;
561	break;
562	}
563	},
564	Numeric => match cat {
565	wd::WC_Numeric => Numeric, // rule WB8
566	wd::WC_ALetter => Letter, // rule WB9
567	wd::WC_Hebrew_Letter => HLetter, // rule WB9
568	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
569	wd::WC_MidNum \| wd::WC_MidNumLet \| wd::WC_Single_Quote => {
570	saveidx = previdx;
571	FormatExtend(RequireNumeric) // rule WB11
572	}
573	_ => {
574	take_curr = `false`;
575	break;
576	}
577	},
578	Katakana => match cat {
579	wd::WC_Katakana => Katakana, // rule WB13
580	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b
581	_ => {
582	take_curr = `false`;
583	break;
584	}
585	},
586	ExtendNumLet => match cat {
587	wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a
588	wd::WC_ALetter => Letter, // rule WB13a
589	wd::WC_Hebrew_Letter => HLetter, // rule WB13a
590	wd::WC_Numeric => Numeric, // rule WB13a
591	wd::WC_Katakana => Katakana, // rule WB13a
592	_ => {
593	take_curr = `false`;
594	break;
595	}
596	},
597	Regional(mut regional_state) => match cat {
598	// rule WB13c
599	wd::WC_Regional_Indicator => {
600	if regional_state == RegionalState::Unknown {
601	let count = self.string[..previdx]
602	.chars()
603	.rev()
604	.map(\|c\| wd::word_category(c).2)
605	.filter(\|&c\| {
606	!(c == wd::WC_ZWJ \|\| c == wd::WC_Extend \|\| c == wd::WC_Format)
607	})
608	.take_while(\|&c\| c == wd::WC_Regional_Indicator)
609	.count();
610	regional_state = if count % `2` == `0` {
611	RegionalState::Full
612	} else {
613	RegionalState::Half
614	};
615	}
616	if regional_state == RegionalState::Full {
617	take_curr = `false`;
618	break;
619	} else {
620	Regional(RegionalState::Full)
621	}
622	}
623	_ => {
624	take_curr = `false`;
625	break;
626	}
627	},
628	Emoji => {
629	if is_emoji(ch) {
630	// rule WB3c
631	Zwj
632	} else {
633	take_curr = `false`;
634	break;
635	}
636	}
637	FormatExtend(t) => match t {
638	RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12
639	RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6
640	RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6
641	AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a
642	RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b
643	_ => break, // backtrack will happens
644	},
645	}
646	}
647
648	if let FormatExtend(t) = state {
649	// if we required something but didn't find it, backtrack
650	if t == RequireLetter
651	\|\| t == RequireHLetter
652	\|\| t == RequireNumeric
653	\|\| t == AcceptNone
654	\|\| t == AcceptQLetter
655	{
656	previdx = saveidx;
657	take_cat = `false`;
658	take_curr = `false`;
659	}
660	}
661
662	self.catb = if take_curr {
663	None
664	} else {
665	idx = previdx;
666	if take_cat {
667	Some(cat)
668	} else {
669	None
670	}
671	};
672
673	let retstr = &self.string[idx..];
674	self.string = &self.string[..idx];
675	Some(retstr)
676	}
677	}
678
679	impl<'a> UWordBounds<'a> {
680	#[inline]
681	/// View the underlying data (the part yet to be iterated) as a slice of the original string.
682	///
683	/// ```rust
684	/// # use unicode_segmentation::UnicodeSegmentation;
685	/// let mut iter = "Hello world".split_word_bounds();
686	/// assert_eq!(iter.as_str(), "Hello world");
687	/// iter.next();
688	/// assert_eq!(iter.as_str(), " world");
689	/// iter.next();
690	/// assert_eq!(iter.as_str(), "world");
691	/// ```
692	pub fn as_str(&self) -> &'a str {
693	self.string
694	}
695
696	#[inline]
697	fn get_next_cat(&self, idx: usize) -> Option<WordCat> {
698	use crate::tables::word as wd;
699	let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8();
700	if nidx < self.string.len() {
701	let nch = self.string[nidx..].chars().next().unwrap();
702	Some(wd::word_category(nch).2)
703	} else {
704	None
705	}
706	}
707
708	#[inline]
709	fn get_prev_cat(&self, idx: usize) -> Option<WordCat> {
710	use crate::tables::word as wd;
711	if idx > `0` {
712	let nch = self.string[..idx].chars().next_back().unwrap();
713	Some(wd::word_category(nch).2)
714	} else {
715	None
716	}
717	}
718	}
719
720	#[inline]
721	pub fn new_word_bounds(s: &str) -> UWordBounds<'_> {
722	UWordBounds {
723	string: s,
724	cat: None,
725	catb: None,
726	}
727	}
728
729	#[inline]
730	pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> {
731	UWordBoundIndices {
732	start_offset: s.as_ptr() as usize,
733	iter: new_word_bounds(s),
734	}
735	}
736
737	#[inline]
738	fn has_alphanumeric(s: &&str) -> bool {
739	use crate::tables::util::is_alphanumeric;
740
741	s.chars().any(is_alphanumeric)
742	}
743
744	#[inline]
745	pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> {
746	use super::UnicodeSegmentation;
747
748	UnicodeWords {
749	inner: s.split_word_bounds().filter(has_alphanumeric),
750	}
751	}
752
753	#[inline]
754	pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> {
755	use super::UnicodeSegmentation;
756
757	UnicodeWordIndices {
758	inner: sUWordBoundIndices<'_>
759	.split_word_bound_indices()
760	.filter(\|(_, c: &&str)\| has_alphanumeric(c)),
761	}
762	}
763
#[cfg(test)]
mod tests {
    /// U+070F (Syriac abbreviation mark) must be categorised as `ALetter`.
    #[test]
    fn test_syriac_abbr_mark() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{70f}');
        assert_eq!(cat, wd::WC_ALetter);
    }

    /// U+06DD (Arabic end of ayah) must be categorised as `Numeric`.
    #[test]
    fn test_end_of_ayah_cat() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{6dd}');
        assert_eq!(cat, wd::WC_Numeric);
    }
}
780