1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at |
3 | // http://rust-lang.org/COPYRIGHT. |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | // option. This file may not be copied, modified, or distributed |
9 | // except according to those terms. |
10 | |
11 | use core::cmp; |
12 | use core::iter::Filter; |
13 | |
14 | use crate::tables::word::WordCat; |
15 | |
16 | /// An iterator over the substrings of a string which, after splitting the string on |
17 | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), |
18 | /// contain any characters with the |
19 | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
20 | /// property, or with |
21 | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
22 | /// |
23 | /// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See |
24 | /// its documentation for more. |
25 | /// |
26 | /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words |
27 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
28 | pub struct UnicodeWords<'a> { |
29 | inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>, |
30 | } |
31 | |
32 | impl<'a> Iterator for UnicodeWords<'a> { |
33 | type Item = &'a str; |
34 | |
35 | #[inline ] |
36 | fn next(&mut self) -> Option<&'a str> { |
37 | self.inner.next() |
38 | } |
39 | } |
40 | impl<'a> DoubleEndedIterator for UnicodeWords<'a> { |
41 | #[inline ] |
42 | fn next_back(&mut self) -> Option<&'a str> { |
43 | self.inner.next_back() |
44 | } |
45 | } |
46 | |
47 | /// An iterator over the substrings of a string which, after splitting the string on |
48 | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), |
49 | /// contain any characters with the |
50 | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
51 | /// property, or with |
52 | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
53 | /// This iterator also provides the byte offsets for each substring. |
54 | /// |
55 | /// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See |
56 | /// its documentation for more. |
57 | /// |
58 | /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices |
59 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
60 | pub struct UnicodeWordIndices<'a> { |
61 | inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>, |
62 | } |
63 | |
64 | impl<'a> Iterator for UnicodeWordIndices<'a> { |
65 | type Item = (usize, &'a str); |
66 | |
67 | #[inline ] |
68 | fn next(&mut self) -> Option<(usize, &'a str)> { |
69 | self.inner.next() |
70 | } |
71 | } |
72 | impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { |
73 | #[inline ] |
74 | fn next_back(&mut self) -> Option<(usize, &'a str)> { |
75 | self.inner.next_back() |
76 | } |
77 | } |
78 | |
/// External iterator for a string's
/// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries).
///
/// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`]
/// trait. See its documentation for more.
///
/// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBounds<'a> {
    // The part of the input not yet yielded; it shrinks from the front on `next`
    // and from the back on `next_back`.
    string: &'a str,
    // Word category cached by forward iteration for the first remaining character,
    // so the next `next` call does not have to look it up again.
    cat: Option<WordCat>,
    // Word category cached by backward iteration for the last remaining character.
    catb: Option<WordCat>,
}
93 | |
/// External iterator for word boundaries and byte offsets.
///
/// This struct is created by the [`split_word_bound_indices`] method on the
/// [`UnicodeSegmentation`] trait. See its documentation for more.
///
/// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices
/// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html
#[derive(Clone)]
pub struct UWordBoundIndices<'a> {
    // Address of the first byte of the original string, stored as an integer; each
    // segment's offset is its own address minus this value.
    start_offset: usize,
    // The underlying segment iterator.
    iter: UWordBounds<'a>,
}
106 | |
107 | impl<'a> UWordBoundIndices<'a> { |
108 | #[inline ] |
109 | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
110 | /// |
111 | /// ```rust |
112 | /// # use unicode_segmentation::UnicodeSegmentation; |
113 | /// let mut iter = "Hello world" .split_word_bound_indices(); |
114 | /// assert_eq!(iter.as_str(), "Hello world" ); |
115 | /// iter.next(); |
116 | /// assert_eq!(iter.as_str(), " world" ); |
117 | /// iter.next(); |
118 | /// assert_eq!(iter.as_str(), "world" ); |
119 | /// ``` |
120 | pub fn as_str(&self) -> &'a str { |
121 | self.iter.as_str() |
122 | } |
123 | } |
124 | |
125 | impl<'a> Iterator for UWordBoundIndices<'a> { |
126 | type Item = (usize, &'a str); |
127 | |
128 | #[inline ] |
129 | fn next(&mut self) -> Option<(usize, &'a str)> { |
130 | self.iter |
131 | .next() |
132 | .map(|s: &str| (s.as_ptr() as usize - self.start_offset, s)) |
133 | } |
134 | |
135 | #[inline ] |
136 | fn size_hint(&self) -> (usize, Option<usize>) { |
137 | self.iter.size_hint() |
138 | } |
139 | } |
140 | |
141 | impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> { |
142 | #[inline ] |
143 | fn next_back(&mut self) -> Option<(usize, &'a str)> { |
144 | self.iter |
145 | .next_back() |
146 | .map(|s: &str| (s.as_ptr() as usize - self.start_offset, s)) |
147 | } |
148 | } |
149 | |
// State machine for the word boundary rules. Both the forward and the backward
// iterator start each segment in `Start` and move between these states as they
// classify successive characters.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum UWordBoundsState {
    // No character of the current segment has been classified yet.
    Start,
    // The last significant character was an ALetter.
    Letter,
    // The last significant character was a Hebrew_Letter.
    HLetter,
    // The last significant character was Numeric.
    Numeric,
    // The last significant character was Katakana.
    Katakana,
    // The last significant character was ExtendNumLet.
    ExtendNumLet,
    // Inside a run of Regional_Indicator characters; the payload tracks pairing.
    Regional(RegionalState),
    // Scanning Format/Extend/ZWJ characters or waiting for a required follow-up
    // character; the payload says what is acceptable next.
    FormatExtend(FormatExtendType),
    // Forward: the segment started with a ZWJ. Backward: an emoji was seen and a
    // preceding ZWJ would join it (rule WB3c).
    Zwj,
    // An emoji has been joined through a ZWJ (rule WB3c).
    Emoji,
    // Inside a run of WSegSpace characters (rule WB3d).
    WSegSpace,
}
165 | |
// Subtypes for the FormatExtend state in UWordBoundsState. They describe which
// character category may legally continue the segment.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Backward iteration only: treated like `Start`, so any category may follow.
    AcceptAny,
    // Only Format/Extend/ZWJ characters are being absorbed; anything else ends the scan.
    AcceptNone,
    // A mid-letter character was seen next to a letter; a letter must follow or the
    // iterator backtracks (rules WB6/WB7).
    RequireLetter,
    // A double quote was seen next to a Hebrew letter; a Hebrew letter must follow
    // or the iterator backtracks (rules WB7b/WB7c).
    RequireHLetter,
    // A single quote was seen next to a Hebrew letter (rule WB7a); a letter may
    // follow but is not required.
    AcceptQLetter,
    // A mid-number character was seen next to a digit; a digit must follow or the
    // iterator backtracks (rules WB11/WB12).
    RequireNumeric,
}
176 | |
// Pairing state for runs of Regional_Indicator characters (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator of a pair has been seen.
    Half,
    // A complete pair has been seen; another indicator starts a new segment.
    Full,
    // Backward iteration only: the pairing has not been determined yet and is
    // computed by counting the indicators that precede the current one.
    Unknown,
}
183 | |
184 | fn is_emoji(ch: char) -> bool { |
185 | use crate::tables::emoji; |
186 | emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic |
187 | } |
188 | |
189 | impl<'a> Iterator for UWordBounds<'a> { |
190 | type Item = &'a str; |
191 | |
192 | #[inline ] |
193 | fn size_hint(&self) -> (usize, Option<usize>) { |
194 | let slen = self.string.len(); |
195 | (cmp::min(slen, 1), Some(slen)) |
196 | } |
197 | |
198 | #[inline ] |
199 | fn next(&mut self) -> Option<&'a str> { |
200 | use self::FormatExtendType::*; |
201 | use self::UWordBoundsState::*; |
202 | use crate::tables::word as wd; |
203 | if self.string.len() == 0 { |
204 | return None; |
205 | } |
206 | |
207 | let mut take_curr = true; |
208 | let mut take_cat = true; |
209 | let mut idx = 0; |
210 | let mut saveidx = 0; |
211 | let mut state = Start; |
212 | let mut cat = wd::WC_Any; |
213 | let mut savecat = wd::WC_Any; |
214 | |
215 | // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 |
216 | let mut skipped_format_extend = false; |
217 | for (curr, ch) in self.string.char_indices() { |
218 | idx = curr; |
219 | // Whether or not the previous category was ZWJ |
220 | // ZWJs get collapsed, so this handles precedence of WB3c over WB4 |
221 | let prev_zwj = cat == wd::WC_ZWJ; |
222 | // if there's a category cached, grab it |
223 | cat = match self.cat { |
224 | None => wd::word_category(ch).2, |
225 | _ => self.cat.take().unwrap(), |
226 | }; |
227 | take_cat = true; |
228 | |
229 | // handle rule WB4 |
230 | // just skip all format, extend, and zwj chars |
231 | // note that Start is a special case: if there's a bunch of Format | Extend |
232 | // characters at the beginning of a block of text, dump them out as one unit. |
233 | // |
234 | // (This is not obvious from the wording of UAX#29, but if you look at the |
235 | // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt |
236 | // then the "correct" interpretation of WB4 becomes apparent.) |
237 | if state != Start { |
238 | match cat { |
239 | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => { |
240 | skipped_format_extend = true; |
241 | continue; |
242 | } |
243 | _ => {} |
244 | } |
245 | } |
246 | |
247 | // rule WB3c |
248 | // WB4 makes all ZWJs collapse into the previous state |
249 | // but you can still be in a Zwj state if you started with Zwj |
250 | // |
251 | // This means that an EP + Zwj will collapse into EP, which is wrong, |
252 | // since EP+EP is not a boundary but EP+ZWJ+EP is |
253 | // |
254 | // Thus, we separately keep track of whether or not the last character |
255 | // was a ZWJ. This is an additional bit of state tracked outside of the |
256 | // state enum; the state enum represents the last non-zwj state encountered. |
257 | // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state, |
258 | // however we are in the previous state for the purposes of all other rules. |
259 | if prev_zwj { |
260 | if is_emoji(ch) { |
261 | state = Emoji; |
262 | continue; |
263 | } |
264 | } |
265 | // Don't use `continue` in this match without updating `cat` |
266 | state = match state { |
267 | Start if cat == wd::WC_CR => { |
268 | idx += match self.get_next_cat(idx) { |
269 | Some(ncat) if ncat == wd::WC_LF => 1, // rule WB3 |
270 | _ => 0, |
271 | }; |
272 | break; // rule WB3a |
273 | } |
274 | Start => match cat { |
275 | wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a |
276 | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a |
277 | wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a |
278 | wd::WC_Katakana => Katakana, // rule WB13, WB13a |
279 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b |
280 | wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c |
281 | wd::WC_LF | wd::WC_Newline => break, // rule WB3a |
282 | wd::WC_ZWJ => Zwj, // rule WB3c |
283 | wd::WC_WSegSpace => WSegSpace, // rule WB3d |
284 | _ => { |
285 | if let Some(ncat) = self.get_next_cat(idx) { |
286 | // rule WB4 |
287 | if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ |
288 | { |
289 | state = FormatExtend(AcceptNone); |
290 | self.cat = Some(ncat); |
291 | continue; |
292 | } |
293 | } |
294 | break; // rule WB999 |
295 | } |
296 | }, |
297 | WSegSpace => match cat { |
298 | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, |
299 | _ => { |
300 | take_curr = false; |
301 | break; |
302 | } |
303 | }, |
304 | Zwj => { |
305 | // We already handle WB3c above. |
306 | take_curr = false; |
307 | break; |
308 | } |
309 | Letter | HLetter => match cat { |
310 | wd::WC_ALetter => Letter, // rule WB5 |
311 | wd::WC_Hebrew_Letter => HLetter, // rule WB5 |
312 | wd::WC_Numeric => Numeric, // rule WB9 |
313 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
314 | wd::WC_Double_Quote if state == HLetter => { |
315 | savecat = cat; |
316 | saveidx = idx; |
317 | FormatExtend(RequireHLetter) // rule WB7b |
318 | } |
319 | wd::WC_Single_Quote if state == HLetter => { |
320 | FormatExtend(AcceptQLetter) // rule WB7a |
321 | } |
322 | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
323 | savecat = cat; |
324 | saveidx = idx; |
325 | FormatExtend(RequireLetter) // rule WB6 |
326 | } |
327 | _ => { |
328 | take_curr = false; |
329 | break; |
330 | } |
331 | }, |
332 | Numeric => match cat { |
333 | wd::WC_Numeric => Numeric, // rule WB8 |
334 | wd::WC_ALetter => Letter, // rule WB10 |
335 | wd::WC_Hebrew_Letter => HLetter, // rule WB10 |
336 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
337 | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
338 | savecat = cat; |
339 | saveidx = idx; |
340 | FormatExtend(RequireNumeric) // rule WB12 |
341 | } |
342 | _ => { |
343 | take_curr = false; |
344 | break; |
345 | } |
346 | }, |
347 | Katakana => match cat { |
348 | wd::WC_Katakana => Katakana, // rule WB13 |
349 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
350 | _ => { |
351 | take_curr = false; |
352 | break; |
353 | } |
354 | }, |
355 | ExtendNumLet => match cat { |
356 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
357 | wd::WC_ALetter => Letter, // rule WB13b |
358 | wd::WC_Hebrew_Letter => HLetter, // rule WB13b |
359 | wd::WC_Numeric => Numeric, // rule WB13b |
360 | wd::WC_Katakana => Katakana, // rule WB13b |
361 | _ => { |
362 | take_curr = false; |
363 | break; |
364 | } |
365 | }, |
366 | Regional(RegionalState::Full) => { |
367 | // if it reaches here we've gone too far, |
368 | // a full flag can only compose with ZWJ/Extend/Format |
369 | // proceeding it. |
370 | take_curr = false; |
371 | break; |
372 | } |
373 | Regional(RegionalState::Half) => match cat { |
374 | wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c |
375 | _ => { |
376 | take_curr = false; |
377 | break; |
378 | } |
379 | }, |
380 | Regional(_) => { |
381 | unreachable!("RegionalState::Unknown should not occur on forward iteration" ) |
382 | } |
383 | Emoji => { |
384 | // We already handle WB3c above. If you've reached this point, the emoji sequence is over. |
385 | take_curr = false; |
386 | break; |
387 | } |
388 | FormatExtend(t) => match t { |
389 | // handle FormatExtends depending on what type |
390 | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11 |
391 | RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7 |
392 | RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a |
393 | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b |
394 | AcceptNone | AcceptQLetter => { |
395 | take_curr = false; // emit all the Format|Extend characters |
396 | take_cat = false; |
397 | break; |
398 | } |
399 | _ => break, // rewind (in if statement below) |
400 | }, |
401 | } |
402 | } |
403 | |
404 | if let FormatExtend(t) = state { |
405 | // we were looking for something and didn't find it; we have to back up |
406 | if t == RequireLetter || t == RequireHLetter || t == RequireNumeric { |
407 | idx = saveidx; |
408 | cat = savecat; |
409 | take_curr = false; |
410 | } |
411 | } |
412 | |
413 | self.cat = if take_curr { |
414 | idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); |
415 | None |
416 | } else if take_cat { |
417 | Some(cat) |
418 | } else { |
419 | None |
420 | }; |
421 | |
422 | let retstr = &self.string[..idx]; |
423 | self.string = &self.string[idx..]; |
424 | Some(retstr) |
425 | } |
426 | } |
427 | |
428 | impl<'a> DoubleEndedIterator for UWordBounds<'a> { |
429 | #[inline ] |
430 | fn next_back(&mut self) -> Option<&'a str> { |
431 | use self::FormatExtendType::*; |
432 | use self::UWordBoundsState::*; |
433 | use crate::tables::word as wd; |
434 | if self.string.len() == 0 { |
435 | return None; |
436 | } |
437 | |
438 | let mut take_curr = true; |
439 | let mut take_cat = true; |
440 | let mut idx = self.string.len(); |
441 | idx -= self.string.chars().next_back().unwrap().len_utf8(); |
442 | let mut previdx = idx; |
443 | let mut saveidx = idx; |
444 | let mut state = Start; |
445 | let mut savestate = Start; |
446 | let mut cat = wd::WC_Any; |
447 | |
448 | let mut skipped_format_extend = false; |
449 | |
450 | for (curr, ch) in self.string.char_indices().rev() { |
451 | previdx = idx; |
452 | idx = curr; |
453 | |
454 | // if there's a category cached, grab it |
455 | cat = match self.catb { |
456 | None => wd::word_category(ch).2, |
457 | _ => self.catb.take().unwrap(), |
458 | }; |
459 | take_cat = true; |
460 | |
461 | // backward iterator over word boundaries. Mostly the same as the forward |
462 | // iterator, with two weirdnesses: |
463 | // (1) If we encounter a single quote in the Start state, we have to check for a |
464 | // Hebrew Letter immediately before it. |
465 | // (2) Format and Extend char handling takes some gymnastics. |
466 | |
467 | if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) { |
468 | // WB3c has more priority so we should not |
469 | // fold in that case |
470 | if match state { |
471 | FormatExtend(_) | Start => false, |
472 | _ => true, |
473 | } { |
474 | saveidx = previdx; |
475 | savestate = state; |
476 | state = FormatExtend(AcceptNone); |
477 | } |
478 | |
479 | if state != Start { |
480 | continue; |
481 | } |
482 | } else if state == FormatExtend(AcceptNone) { |
483 | // finished a scan of some Format|Extend chars, restore previous state |
484 | state = savestate; |
485 | previdx = saveidx; |
486 | take_cat = false; |
487 | skipped_format_extend = true; |
488 | } |
489 | |
490 | // Don't use `continue` in this match without updating `catb` |
491 | state = match state { |
492 | Start | FormatExtend(AcceptAny) => match cat { |
493 | _ if is_emoji(ch) => Zwj, |
494 | wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b |
495 | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b |
496 | wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b |
497 | wd::WC_Katakana => Katakana, // rule WB13, WB13b |
498 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
499 | wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c |
500 | // rule WB4: |
501 | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny), |
502 | wd::WC_Single_Quote => { |
503 | saveidx = idx; |
504 | FormatExtend(AcceptQLetter) // rule WB7a |
505 | } |
506 | wd::WC_WSegSpace => WSegSpace, |
507 | wd::WC_CR | wd::WC_LF | wd::WC_Newline => { |
508 | if state == Start { |
509 | if cat == wd::WC_LF { |
510 | idx -= match self.get_prev_cat(idx) { |
511 | Some(pcat) if pcat == wd::WC_CR => 1, // rule WB3 |
512 | _ => 0, |
513 | }; |
514 | } |
515 | } else { |
516 | take_curr = false; |
517 | } |
518 | break; // rule WB3a |
519 | } |
520 | _ => break, // rule WB999 |
521 | }, |
522 | Zwj => match cat { |
523 | // rule WB3c |
524 | wd::WC_ZWJ => FormatExtend(AcceptAny), |
525 | _ => { |
526 | take_curr = false; |
527 | break; |
528 | } |
529 | }, |
530 | WSegSpace => match cat { |
531 | // rule WB3d |
532 | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, |
533 | _ => { |
534 | take_curr = false; |
535 | break; |
536 | } |
537 | }, |
538 | Letter | HLetter => match cat { |
539 | wd::WC_ALetter => Letter, // rule WB5 |
540 | wd::WC_Hebrew_Letter => HLetter, // rule WB5 |
541 | wd::WC_Numeric => Numeric, // rule WB10 |
542 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
543 | wd::WC_Double_Quote if state == HLetter => { |
544 | saveidx = previdx; |
545 | FormatExtend(RequireHLetter) // rule WB7c |
546 | } |
547 | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
548 | saveidx = previdx; |
549 | FormatExtend(RequireLetter) // rule WB7 |
550 | } |
551 | _ => { |
552 | take_curr = false; |
553 | break; |
554 | } |
555 | }, |
556 | Numeric => match cat { |
557 | wd::WC_Numeric => Numeric, // rule WB8 |
558 | wd::WC_ALetter => Letter, // rule WB9 |
559 | wd::WC_Hebrew_Letter => HLetter, // rule WB9 |
560 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
561 | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
562 | saveidx = previdx; |
563 | FormatExtend(RequireNumeric) // rule WB11 |
564 | } |
565 | _ => { |
566 | take_curr = false; |
567 | break; |
568 | } |
569 | }, |
570 | Katakana => match cat { |
571 | wd::WC_Katakana => Katakana, // rule WB13 |
572 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
573 | _ => { |
574 | take_curr = false; |
575 | break; |
576 | } |
577 | }, |
578 | ExtendNumLet => match cat { |
579 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
580 | wd::WC_ALetter => Letter, // rule WB13a |
581 | wd::WC_Hebrew_Letter => HLetter, // rule WB13a |
582 | wd::WC_Numeric => Numeric, // rule WB13a |
583 | wd::WC_Katakana => Katakana, // rule WB13a |
584 | _ => { |
585 | take_curr = false; |
586 | break; |
587 | } |
588 | }, |
589 | Regional(mut regional_state) => match cat { |
590 | // rule WB13c |
591 | wd::WC_Regional_Indicator => { |
592 | if regional_state == RegionalState::Unknown { |
593 | let count = self.string[..previdx] |
594 | .chars() |
595 | .rev() |
596 | .map(|c| wd::word_category(c).2) |
597 | .filter(|&c| { |
598 | !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format) |
599 | }) |
600 | .take_while(|&c| c == wd::WC_Regional_Indicator) |
601 | .count(); |
602 | regional_state = if count % 2 == 0 { |
603 | RegionalState::Full |
604 | } else { |
605 | RegionalState::Half |
606 | }; |
607 | } |
608 | if regional_state == RegionalState::Full { |
609 | take_curr = false; |
610 | break; |
611 | } else { |
612 | Regional(RegionalState::Full) |
613 | } |
614 | } |
615 | _ => { |
616 | take_curr = false; |
617 | break; |
618 | } |
619 | }, |
620 | Emoji => { |
621 | if is_emoji(ch) { |
622 | // rule WB3c |
623 | Zwj |
624 | } else { |
625 | take_curr = false; |
626 | break; |
627 | } |
628 | } |
629 | FormatExtend(t) => match t { |
630 | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12 |
631 | RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6 |
632 | RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6 |
633 | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a |
634 | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b |
635 | _ => break, // backtrack will happens |
636 | }, |
637 | } |
638 | } |
639 | |
640 | if let FormatExtend(t) = state { |
641 | // if we required something but didn't find it, backtrack |
642 | if t == RequireLetter |
643 | || t == RequireHLetter |
644 | || t == RequireNumeric |
645 | || t == AcceptNone |
646 | || t == AcceptQLetter |
647 | { |
648 | previdx = saveidx; |
649 | take_cat = false; |
650 | take_curr = false; |
651 | } |
652 | } |
653 | |
654 | self.catb = if take_curr { |
655 | None |
656 | } else { |
657 | idx = previdx; |
658 | if take_cat { |
659 | Some(cat) |
660 | } else { |
661 | None |
662 | } |
663 | }; |
664 | |
665 | let retstr = &self.string[idx..]; |
666 | self.string = &self.string[..idx]; |
667 | Some(retstr) |
668 | } |
669 | } |
670 | |
671 | impl<'a> UWordBounds<'a> { |
672 | #[inline ] |
673 | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
674 | /// |
675 | /// ```rust |
676 | /// # use unicode_segmentation::UnicodeSegmentation; |
677 | /// let mut iter = "Hello world" .split_word_bounds(); |
678 | /// assert_eq!(iter.as_str(), "Hello world" ); |
679 | /// iter.next(); |
680 | /// assert_eq!(iter.as_str(), " world" ); |
681 | /// iter.next(); |
682 | /// assert_eq!(iter.as_str(), "world" ); |
683 | /// ``` |
684 | pub fn as_str(&self) -> &'a str { |
685 | self.string |
686 | } |
687 | |
688 | #[inline ] |
689 | fn get_next_cat(&self, idx: usize) -> Option<WordCat> { |
690 | use crate::tables::word as wd; |
691 | let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); |
692 | if nidx < self.string.len() { |
693 | let nch = self.string[nidx..].chars().next().unwrap(); |
694 | Some(wd::word_category(nch).2) |
695 | } else { |
696 | None |
697 | } |
698 | } |
699 | |
700 | #[inline ] |
701 | fn get_prev_cat(&self, idx: usize) -> Option<WordCat> { |
702 | use crate::tables::word as wd; |
703 | if idx > 0 { |
704 | let nch = self.string[..idx].chars().next_back().unwrap(); |
705 | Some(wd::word_category(nch).2) |
706 | } else { |
707 | None |
708 | } |
709 | } |
710 | } |
711 | |
712 | #[inline ] |
713 | pub fn new_word_bounds<'b>(s: &'b str) -> UWordBounds<'b> { |
714 | UWordBounds { |
715 | string: s, |
716 | cat: None, |
717 | catb: None, |
718 | } |
719 | } |
720 | |
721 | #[inline ] |
722 | pub fn new_word_bound_indices<'b>(s: &'b str) -> UWordBoundIndices<'b> { |
723 | UWordBoundIndices { |
724 | start_offset: s.as_ptr() as usize, |
725 | iter: new_word_bounds(s), |
726 | } |
727 | } |
728 | |
729 | #[inline ] |
730 | fn has_alphanumeric(s: &&str) -> bool { |
731 | use crate::tables::util::is_alphanumeric; |
732 | |
733 | s.chars().any(|c: char| is_alphanumeric(c)) |
734 | } |
735 | |
736 | #[inline ] |
737 | pub fn new_unicode_words<'b>(s: &'b str) -> UnicodeWords<'b> { |
738 | use super::UnicodeSegmentation; |
739 | |
740 | UnicodeWords { |
741 | inner: s.split_word_bounds().filter(has_alphanumeric), |
742 | } |
743 | } |
744 | |
745 | #[inline ] |
746 | pub fn new_unicode_word_indices<'b>(s: &'b str) -> UnicodeWordIndices<'b> { |
747 | use super::UnicodeSegmentation; |
748 | |
749 | UnicodeWordIndices { |
750 | inner: sUWordBoundIndices<'_> |
751 | .split_word_bound_indices() |
752 | .filter(|(_, c: &&str)| has_alphanumeric(c)), |
753 | } |
754 | } |
755 | |