1 | // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at |
3 | // http://rust-lang.org/COPYRIGHT. |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
8 | // option. This file may not be copied, modified, or distributed |
9 | // except according to those terms. |
10 | |
11 | use core::cmp; |
12 | use core::iter::Filter; |
13 | |
14 | use crate::tables::word::WordCat; |
15 | |
16 | /// An iterator over the substrings of a string which, after splitting the string on |
17 | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), |
18 | /// contain any characters with the |
19 | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
20 | /// property, or with |
21 | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
22 | /// |
23 | /// This struct is created by the [`unicode_words`] method on the [`UnicodeSegmentation`] trait. See |
24 | /// its documentation for more. |
25 | /// |
26 | /// [`unicode_words`]: trait.UnicodeSegmentation.html#tymethod.unicode_words |
27 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
28 | #[derive (Debug)] |
29 | pub struct UnicodeWords<'a> { |
30 | inner: Filter<UWordBounds<'a>, fn(&&str) -> bool>, |
31 | } |
32 | |
33 | impl<'a> Iterator for UnicodeWords<'a> { |
34 | type Item = &'a str; |
35 | |
36 | #[inline ] |
37 | fn next(&mut self) -> Option<&'a str> { |
38 | self.inner.next() |
39 | } |
40 | |
41 | #[inline ] |
42 | fn size_hint(&self) -> (usize, Option<usize>) { |
43 | self.inner.size_hint() |
44 | } |
45 | } |
46 | impl<'a> DoubleEndedIterator for UnicodeWords<'a> { |
47 | #[inline ] |
48 | fn next_back(&mut self) -> Option<&'a str> { |
49 | self.inner.next_back() |
50 | } |
51 | } |
52 | |
53 | /// An iterator over the substrings of a string which, after splitting the string on |
54 | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries), |
55 | /// contain any characters with the |
56 | /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic) |
57 | /// property, or with |
58 | /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values). |
59 | /// This iterator also provides the byte offsets for each substring. |
60 | /// |
61 | /// This struct is created by the [`unicode_word_indices`] method on the [`UnicodeSegmentation`] trait. See |
62 | /// its documentation for more. |
63 | /// |
64 | /// [`unicode_word_indices`]: trait.UnicodeSegmentation.html#tymethod.unicode_word_indices |
65 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
66 | #[derive (Debug)] |
67 | pub struct UnicodeWordIndices<'a> { |
68 | #[allow (clippy::type_complexity)] |
69 | inner: Filter<UWordBoundIndices<'a>, fn(&(usize, &str)) -> bool>, |
70 | } |
71 | |
72 | impl<'a> Iterator for UnicodeWordIndices<'a> { |
73 | type Item = (usize, &'a str); |
74 | |
75 | #[inline ] |
76 | fn next(&mut self) -> Option<(usize, &'a str)> { |
77 | self.inner.next() |
78 | } |
79 | |
80 | #[inline ] |
81 | fn size_hint(&self) -> (usize, Option<usize>) { |
82 | self.inner.size_hint() |
83 | } |
84 | } |
85 | impl<'a> DoubleEndedIterator for UnicodeWordIndices<'a> { |
86 | #[inline ] |
87 | fn next_back(&mut self) -> Option<(usize, &'a str)> { |
88 | self.inner.next_back() |
89 | } |
90 | } |
91 | |
92 | /// External iterator for a string's |
93 | /// [word boundaries](http://www.unicode.org/reports/tr29/#Word_Boundaries). |
94 | /// |
95 | /// This struct is created by the [`split_word_bounds`] method on the [`UnicodeSegmentation`] |
96 | /// trait. See its documentation for more. |
97 | /// |
98 | /// [`split_word_bounds`]: trait.UnicodeSegmentation.html#tymethod.split_word_bounds |
99 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
100 | #[derive (Debug, Clone)] |
101 | pub struct UWordBounds<'a> { |
102 | string: &'a str, |
103 | cat: Option<WordCat>, |
104 | catb: Option<WordCat>, |
105 | } |
106 | |
107 | /// External iterator for word boundaries and byte offsets. |
108 | /// |
109 | /// This struct is created by the [`split_word_bound_indices`] method on the |
110 | /// [`UnicodeSegmentation`] trait. See its documentation for more. |
111 | /// |
112 | /// [`split_word_bound_indices`]: trait.UnicodeSegmentation.html#tymethod.split_word_bound_indices |
113 | /// [`UnicodeSegmentation`]: trait.UnicodeSegmentation.html |
114 | #[derive (Debug, Clone)] |
115 | pub struct UWordBoundIndices<'a> { |
116 | start_offset: usize, |
117 | iter: UWordBounds<'a>, |
118 | } |
119 | |
120 | impl<'a> UWordBoundIndices<'a> { |
121 | #[inline ] |
122 | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
123 | /// |
124 | /// ```rust |
125 | /// # use unicode_segmentation::UnicodeSegmentation; |
126 | /// let mut iter = "Hello world" .split_word_bound_indices(); |
127 | /// assert_eq!(iter.as_str(), "Hello world" ); |
128 | /// iter.next(); |
129 | /// assert_eq!(iter.as_str(), " world" ); |
130 | /// iter.next(); |
131 | /// assert_eq!(iter.as_str(), "world" ); |
132 | /// ``` |
133 | pub fn as_str(&self) -> &'a str { |
134 | self.iter.as_str() |
135 | } |
136 | } |
137 | |
138 | impl<'a> Iterator for UWordBoundIndices<'a> { |
139 | type Item = (usize, &'a str); |
140 | |
141 | #[inline ] |
142 | fn next(&mut self) -> Option<(usize, &'a str)> { |
143 | self.iter |
144 | .next() |
145 | .map(|s: &'a str| (s.as_ptr() as usize - self.start_offset, s)) |
146 | } |
147 | |
148 | #[inline ] |
149 | fn size_hint(&self) -> (usize, Option<usize>) { |
150 | self.iter.size_hint() |
151 | } |
152 | } |
153 | |
154 | impl<'a> DoubleEndedIterator for UWordBoundIndices<'a> { |
155 | #[inline ] |
156 | fn next_back(&mut self) -> Option<(usize, &'a str)> { |
157 | self.iter |
158 | .next_back() |
159 | .map(|s: &'a str| (s.as_ptr() as usize - self.start_offset, s)) |
160 | } |
161 | } |
162 | |
163 | // state machine for word boundary rules |
164 | #[derive (Clone, Copy, PartialEq, Eq, Debug)] |
165 | enum UWordBoundsState { |
166 | Start, |
167 | Letter, |
168 | HLetter, |
169 | Numeric, |
170 | Katakana, |
171 | ExtendNumLet, |
172 | Regional(RegionalState), |
173 | FormatExtend(FormatExtendType), |
174 | Zwj, |
175 | Emoji, |
176 | WSegSpace, |
177 | } |
178 | |
// Payload of `UWordBoundsState::FormatExtend`: what the state machine is
// prepared to see next while a look-ahead is pending.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
    // Anything may follow (only entered during backward iteration).
    AcceptAny,
    // Nothing further joins; the collected Format/Extend characters are emitted.
    AcceptNone,
    // A letter must follow, otherwise the iterator rewinds to the saved index.
    RequireLetter,
    // A Hebrew letter must follow, otherwise the iterator rewinds.
    RequireHLetter,
    // A single quote next to a Hebrew letter (rule WB7a); a letter may follow.
    AcceptQLetter,
    // A numeric must follow, otherwise the iterator rewinds.
    RequireNumeric,
}
189 | |
// Payload of `UWordBoundsState::Regional`: pairing status of the regional
// indicators seen so far (rule WB13c).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
    // One indicator of a pair has been seen.
    Half,
    // A complete pair has been seen.
    Full,
    // Pairing not determined yet; only used during backward iteration.
    Unknown,
}
196 | |
197 | fn is_emoji(ch: char) -> bool { |
198 | use crate::tables::emoji; |
199 | emoji::emoji_category(ch).2 == emoji::EmojiCat::EC_Extended_Pictographic |
200 | } |
201 | |
202 | impl<'a> Iterator for UWordBounds<'a> { |
203 | type Item = &'a str; |
204 | |
205 | #[inline ] |
206 | fn size_hint(&self) -> (usize, Option<usize>) { |
207 | let slen = self.string.len(); |
208 | (cmp::min(slen, 1), Some(slen)) |
209 | } |
210 | |
211 | #[inline ] |
212 | fn next(&mut self) -> Option<&'a str> { |
213 | use self::FormatExtendType::*; |
214 | use self::UWordBoundsState::*; |
215 | use crate::tables::word as wd; |
216 | if self.string.is_empty() { |
217 | return None; |
218 | } |
219 | |
220 | let mut take_curr = true; |
221 | let mut take_cat = true; |
222 | let mut idx = 0; |
223 | let mut saveidx = 0; |
224 | let mut state = Start; |
225 | let mut cat = wd::WC_Any; |
226 | let mut savecat = wd::WC_Any; |
227 | |
228 | // If extend/format/zwj were skipped. Handles precedence of WB3d over WB4 |
229 | let mut skipped_format_extend = false; |
230 | for (curr, ch) in self.string.char_indices() { |
231 | idx = curr; |
232 | // Whether or not the previous category was ZWJ |
233 | // ZWJs get collapsed, so this handles precedence of WB3c over WB4 |
234 | let prev_zwj = cat == wd::WC_ZWJ; |
235 | // if there's a category cached, grab it |
236 | cat = match self.cat { |
237 | None => wd::word_category(ch).2, |
238 | _ => self.cat.take().unwrap(), |
239 | }; |
240 | take_cat = true; |
241 | |
242 | // handle rule WB4 |
243 | // just skip all format, extend, and zwj chars |
244 | // note that Start is a special case: if there's a bunch of Format | Extend |
245 | // characters at the beginning of a block of text, dump them out as one unit. |
246 | // |
247 | // (This is not obvious from the wording of UAX#29, but if you look at the |
248 | // test cases http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt |
249 | // then the "correct" interpretation of WB4 becomes apparent.) |
250 | if state != Start { |
251 | match cat { |
252 | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => { |
253 | skipped_format_extend = true; |
254 | continue; |
255 | } |
256 | _ => {} |
257 | } |
258 | } |
259 | |
260 | // rule WB3c |
261 | // WB4 makes all ZWJs collapse into the previous state |
262 | // but you can still be in a Zwj state if you started with Zwj |
263 | // |
264 | // This means that an EP + Zwj will collapse into EP, which is wrong, |
265 | // since EP+EP is not a boundary but EP+ZWJ+EP is |
266 | // |
267 | // Thus, we separately keep track of whether or not the last character |
268 | // was a ZWJ. This is an additional bit of state tracked outside of the |
269 | // state enum; the state enum represents the last non-zwj state encountered. |
270 | // When prev_zwj is true, for the purposes of WB3c, we are in the Zwj state, |
271 | // however we are in the previous state for the purposes of all other rules. |
272 | if prev_zwj && is_emoji(ch) { |
273 | state = Emoji; |
274 | continue; |
275 | } |
276 | // Don't use `continue` in this match without updating `cat` |
277 | state = match state { |
278 | Start if cat == wd::WC_CR => { |
279 | idx += match self.get_next_cat(idx) { |
280 | Some(wd::WC_LF) => 1, // rule WB3 |
281 | _ => 0, |
282 | }; |
283 | break; // rule WB3a |
284 | } |
285 | Start => match cat { |
286 | wd::WC_ALetter => Letter, // rule WB5, WB6, WB9, WB13a |
287 | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB6, WB7a, WB7b, WB9, WB13a |
288 | wd::WC_Numeric => Numeric, // rule WB8, WB10, WB12, WB13a |
289 | wd::WC_Katakana => Katakana, // rule WB13, WB13a |
290 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a, WB13b |
291 | wd::WC_Regional_Indicator => Regional(RegionalState::Half), // rule WB13c |
292 | wd::WC_LF | wd::WC_Newline => break, // rule WB3a |
293 | wd::WC_ZWJ => Zwj, // rule WB3c |
294 | wd::WC_WSegSpace => WSegSpace, // rule WB3d |
295 | _ => { |
296 | if let Some(ncat) = self.get_next_cat(idx) { |
297 | // rule WB4 |
298 | if ncat == wd::WC_Format || ncat == wd::WC_Extend || ncat == wd::WC_ZWJ |
299 | { |
300 | state = FormatExtend(AcceptNone); |
301 | self.cat = Some(ncat); |
302 | continue; |
303 | } |
304 | } |
305 | break; // rule WB999 |
306 | } |
307 | }, |
308 | WSegSpace => match cat { |
309 | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, |
310 | _ => { |
311 | take_curr = false; |
312 | break; |
313 | } |
314 | }, |
315 | Zwj => { |
316 | // We already handle WB3c above. |
317 | take_curr = false; |
318 | break; |
319 | } |
320 | Letter | HLetter => match cat { |
321 | wd::WC_ALetter => Letter, // rule WB5 |
322 | wd::WC_Hebrew_Letter => HLetter, // rule WB5 |
323 | wd::WC_Numeric => Numeric, // rule WB9 |
324 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
325 | wd::WC_Double_Quote if state == HLetter => { |
326 | savecat = cat; |
327 | saveidx = idx; |
328 | FormatExtend(RequireHLetter) // rule WB7b |
329 | } |
330 | wd::WC_Single_Quote if state == HLetter => { |
331 | FormatExtend(AcceptQLetter) // rule WB7a |
332 | } |
333 | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
334 | savecat = cat; |
335 | saveidx = idx; |
336 | FormatExtend(RequireLetter) // rule WB6 |
337 | } |
338 | _ => { |
339 | take_curr = false; |
340 | break; |
341 | } |
342 | }, |
343 | Numeric => match cat { |
344 | wd::WC_Numeric => Numeric, // rule WB8 |
345 | wd::WC_ALetter => Letter, // rule WB10 |
346 | wd::WC_Hebrew_Letter => HLetter, // rule WB10 |
347 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
348 | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
349 | savecat = cat; |
350 | saveidx = idx; |
351 | FormatExtend(RequireNumeric) // rule WB12 |
352 | } |
353 | _ => { |
354 | take_curr = false; |
355 | break; |
356 | } |
357 | }, |
358 | Katakana => match cat { |
359 | wd::WC_Katakana => Katakana, // rule WB13 |
360 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
361 | _ => { |
362 | take_curr = false; |
363 | break; |
364 | } |
365 | }, |
366 | ExtendNumLet => match cat { |
367 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
368 | wd::WC_ALetter => Letter, // rule WB13b |
369 | wd::WC_Hebrew_Letter => HLetter, // rule WB13b |
370 | wd::WC_Numeric => Numeric, // rule WB13b |
371 | wd::WC_Katakana => Katakana, // rule WB13b |
372 | _ => { |
373 | take_curr = false; |
374 | break; |
375 | } |
376 | }, |
377 | Regional(RegionalState::Full) => { |
378 | // if it reaches here we've gone too far, |
379 | // a full flag can only compose with ZWJ/Extend/Format |
380 | // proceeding it. |
381 | take_curr = false; |
382 | break; |
383 | } |
384 | Regional(RegionalState::Half) => match cat { |
385 | wd::WC_Regional_Indicator => Regional(RegionalState::Full), // rule WB13c |
386 | _ => { |
387 | take_curr = false; |
388 | break; |
389 | } |
390 | }, |
391 | Regional(_) => { |
392 | unreachable!("RegionalState::Unknown should not occur on forward iteration" ) |
393 | } |
394 | Emoji => { |
395 | // We already handle WB3c above. If you've reached this point, the emoji sequence is over. |
396 | take_curr = false; |
397 | break; |
398 | } |
399 | FormatExtend(t) => match t { |
400 | // handle FormatExtends depending on what type |
401 | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB11 |
402 | RequireLetter | AcceptQLetter if cat == wd::WC_ALetter => Letter, // rule WB7 |
403 | RequireLetter | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // WB7a |
404 | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b |
405 | AcceptNone | AcceptQLetter => { |
406 | take_curr = false; // emit all the Format|Extend characters |
407 | take_cat = false; |
408 | break; |
409 | } |
410 | _ => break, // rewind (in if statement below) |
411 | }, |
412 | } |
413 | } |
414 | |
415 | if let FormatExtend(t) = state { |
416 | // we were looking for something and didn't find it; we have to back up |
417 | if t == RequireLetter || t == RequireHLetter || t == RequireNumeric { |
418 | idx = saveidx; |
419 | cat = savecat; |
420 | take_curr = false; |
421 | } |
422 | } |
423 | |
424 | self.cat = if take_curr { |
425 | idx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); |
426 | None |
427 | } else if take_cat { |
428 | Some(cat) |
429 | } else { |
430 | None |
431 | }; |
432 | |
433 | let retstr = &self.string[..idx]; |
434 | self.string = &self.string[idx..]; |
435 | Some(retstr) |
436 | } |
437 | } |
438 | |
439 | impl<'a> DoubleEndedIterator for UWordBounds<'a> { |
440 | #[inline ] |
441 | fn next_back(&mut self) -> Option<&'a str> { |
442 | use self::FormatExtendType::*; |
443 | use self::UWordBoundsState::*; |
444 | use crate::tables::word as wd; |
445 | if self.string.is_empty() { |
446 | return None; |
447 | } |
448 | |
449 | let mut take_curr = true; |
450 | let mut take_cat = true; |
451 | let mut idx = self.string.len(); |
452 | idx -= self.string.chars().next_back().unwrap().len_utf8(); |
453 | let mut previdx = idx; |
454 | let mut saveidx = idx; |
455 | let mut state = Start; |
456 | let mut savestate = Start; |
457 | let mut cat = wd::WC_Any; |
458 | |
459 | let mut skipped_format_extend = false; |
460 | |
461 | for (curr, ch) in self.string.char_indices().rev() { |
462 | previdx = idx; |
463 | idx = curr; |
464 | |
465 | // if there's a category cached, grab it |
466 | cat = match self.catb { |
467 | None => wd::word_category(ch).2, |
468 | _ => self.catb.take().unwrap(), |
469 | }; |
470 | take_cat = true; |
471 | |
472 | // backward iterator over word boundaries. Mostly the same as the forward |
473 | // iterator, with two weirdnesses: |
474 | // (1) If we encounter a single quote in the Start state, we have to check for a |
475 | // Hebrew Letter immediately before it. |
476 | // (2) Format and Extend char handling takes some gymnastics. |
477 | |
478 | if cat == wd::WC_Extend || cat == wd::WC_Format || (cat == wd::WC_ZWJ && state != Zwj) { |
479 | // WB3c has more priority so we should not |
480 | // fold in that case |
481 | if !matches!(state, FormatExtend(_) | Start) { |
482 | saveidx = previdx; |
483 | savestate = state; |
484 | state = FormatExtend(AcceptNone); |
485 | } |
486 | |
487 | if state != Start { |
488 | continue; |
489 | } |
490 | } else if state == FormatExtend(AcceptNone) { |
491 | // finished a scan of some Format|Extend chars, restore previous state |
492 | state = savestate; |
493 | previdx = saveidx; |
494 | take_cat = false; |
495 | skipped_format_extend = true; |
496 | } |
497 | |
498 | // Don't use `continue` in this match without updating `catb` |
499 | state = match state { |
500 | Start | FormatExtend(AcceptAny) => match cat { |
501 | _ if is_emoji(ch) => Zwj, |
502 | wd::WC_ALetter => Letter, // rule WB5, WB7, WB10, WB13b |
503 | wd::WC_Hebrew_Letter => HLetter, // rule WB5, WB7, WB7c, WB10, WB13b |
504 | wd::WC_Numeric => Numeric, // rule WB8, WB9, WB11, WB13b |
505 | wd::WC_Katakana => Katakana, // rule WB13, WB13b |
506 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
507 | wd::WC_Regional_Indicator => Regional(RegionalState::Unknown), // rule WB13c |
508 | // rule WB4: |
509 | wd::WC_Extend | wd::WC_Format | wd::WC_ZWJ => FormatExtend(AcceptAny), |
510 | wd::WC_Single_Quote => { |
511 | saveidx = idx; |
512 | FormatExtend(AcceptQLetter) // rule WB7a |
513 | } |
514 | wd::WC_WSegSpace => WSegSpace, |
515 | wd::WC_CR | wd::WC_LF | wd::WC_Newline => { |
516 | if state == Start { |
517 | if cat == wd::WC_LF { |
518 | idx -= match self.get_prev_cat(idx) { |
519 | Some(wd::WC_CR) => 1, // rule WB3 |
520 | _ => 0, |
521 | }; |
522 | } |
523 | } else { |
524 | take_curr = false; |
525 | } |
526 | break; // rule WB3a |
527 | } |
528 | _ => break, // rule WB999 |
529 | }, |
530 | Zwj => match cat { |
531 | // rule WB3c |
532 | wd::WC_ZWJ => FormatExtend(AcceptAny), |
533 | _ => { |
534 | take_curr = false; |
535 | break; |
536 | } |
537 | }, |
538 | WSegSpace => match cat { |
539 | // rule WB3d |
540 | wd::WC_WSegSpace if !skipped_format_extend => WSegSpace, |
541 | _ => { |
542 | take_curr = false; |
543 | break; |
544 | } |
545 | }, |
546 | Letter | HLetter => match cat { |
547 | wd::WC_ALetter => Letter, // rule WB5 |
548 | wd::WC_Hebrew_Letter => HLetter, // rule WB5 |
549 | wd::WC_Numeric => Numeric, // rule WB10 |
550 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
551 | wd::WC_Double_Quote if state == HLetter => { |
552 | saveidx = previdx; |
553 | FormatExtend(RequireHLetter) // rule WB7c |
554 | } |
555 | wd::WC_MidLetter | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
556 | saveidx = previdx; |
557 | FormatExtend(RequireLetter) // rule WB7 |
558 | } |
559 | _ => { |
560 | take_curr = false; |
561 | break; |
562 | } |
563 | }, |
564 | Numeric => match cat { |
565 | wd::WC_Numeric => Numeric, // rule WB8 |
566 | wd::WC_ALetter => Letter, // rule WB9 |
567 | wd::WC_Hebrew_Letter => HLetter, // rule WB9 |
568 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
569 | wd::WC_MidNum | wd::WC_MidNumLet | wd::WC_Single_Quote => { |
570 | saveidx = previdx; |
571 | FormatExtend(RequireNumeric) // rule WB11 |
572 | } |
573 | _ => { |
574 | take_curr = false; |
575 | break; |
576 | } |
577 | }, |
578 | Katakana => match cat { |
579 | wd::WC_Katakana => Katakana, // rule WB13 |
580 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13b |
581 | _ => { |
582 | take_curr = false; |
583 | break; |
584 | } |
585 | }, |
586 | ExtendNumLet => match cat { |
587 | wd::WC_ExtendNumLet => ExtendNumLet, // rule WB13a |
588 | wd::WC_ALetter => Letter, // rule WB13a |
589 | wd::WC_Hebrew_Letter => HLetter, // rule WB13a |
590 | wd::WC_Numeric => Numeric, // rule WB13a |
591 | wd::WC_Katakana => Katakana, // rule WB13a |
592 | _ => { |
593 | take_curr = false; |
594 | break; |
595 | } |
596 | }, |
597 | Regional(mut regional_state) => match cat { |
598 | // rule WB13c |
599 | wd::WC_Regional_Indicator => { |
600 | if regional_state == RegionalState::Unknown { |
601 | let count = self.string[..previdx] |
602 | .chars() |
603 | .rev() |
604 | .map(|c| wd::word_category(c).2) |
605 | .filter(|&c| { |
606 | !(c == wd::WC_ZWJ || c == wd::WC_Extend || c == wd::WC_Format) |
607 | }) |
608 | .take_while(|&c| c == wd::WC_Regional_Indicator) |
609 | .count(); |
610 | regional_state = if count % 2 == 0 { |
611 | RegionalState::Full |
612 | } else { |
613 | RegionalState::Half |
614 | }; |
615 | } |
616 | if regional_state == RegionalState::Full { |
617 | take_curr = false; |
618 | break; |
619 | } else { |
620 | Regional(RegionalState::Full) |
621 | } |
622 | } |
623 | _ => { |
624 | take_curr = false; |
625 | break; |
626 | } |
627 | }, |
628 | Emoji => { |
629 | if is_emoji(ch) { |
630 | // rule WB3c |
631 | Zwj |
632 | } else { |
633 | take_curr = false; |
634 | break; |
635 | } |
636 | } |
637 | FormatExtend(t) => match t { |
638 | RequireNumeric if cat == wd::WC_Numeric => Numeric, // rule WB12 |
639 | RequireLetter if cat == wd::WC_ALetter => Letter, // rule WB6 |
640 | RequireLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB6 |
641 | AcceptQLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7a |
642 | RequireHLetter if cat == wd::WC_Hebrew_Letter => HLetter, // rule WB7b |
643 | _ => break, // backtrack will happens |
644 | }, |
645 | } |
646 | } |
647 | |
648 | if let FormatExtend(t) = state { |
649 | // if we required something but didn't find it, backtrack |
650 | if t == RequireLetter |
651 | || t == RequireHLetter |
652 | || t == RequireNumeric |
653 | || t == AcceptNone |
654 | || t == AcceptQLetter |
655 | { |
656 | previdx = saveidx; |
657 | take_cat = false; |
658 | take_curr = false; |
659 | } |
660 | } |
661 | |
662 | self.catb = if take_curr { |
663 | None |
664 | } else { |
665 | idx = previdx; |
666 | if take_cat { |
667 | Some(cat) |
668 | } else { |
669 | None |
670 | } |
671 | }; |
672 | |
673 | let retstr = &self.string[idx..]; |
674 | self.string = &self.string[..idx]; |
675 | Some(retstr) |
676 | } |
677 | } |
678 | |
679 | impl<'a> UWordBounds<'a> { |
680 | #[inline ] |
681 | /// View the underlying data (the part yet to be iterated) as a slice of the original string. |
682 | /// |
683 | /// ```rust |
684 | /// # use unicode_segmentation::UnicodeSegmentation; |
685 | /// let mut iter = "Hello world" .split_word_bounds(); |
686 | /// assert_eq!(iter.as_str(), "Hello world" ); |
687 | /// iter.next(); |
688 | /// assert_eq!(iter.as_str(), " world" ); |
689 | /// iter.next(); |
690 | /// assert_eq!(iter.as_str(), "world" ); |
691 | /// ``` |
692 | pub fn as_str(&self) -> &'a str { |
693 | self.string |
694 | } |
695 | |
696 | #[inline ] |
697 | fn get_next_cat(&self, idx: usize) -> Option<WordCat> { |
698 | use crate::tables::word as wd; |
699 | let nidx = idx + self.string[idx..].chars().next().unwrap().len_utf8(); |
700 | if nidx < self.string.len() { |
701 | let nch = self.string[nidx..].chars().next().unwrap(); |
702 | Some(wd::word_category(nch).2) |
703 | } else { |
704 | None |
705 | } |
706 | } |
707 | |
708 | #[inline ] |
709 | fn get_prev_cat(&self, idx: usize) -> Option<WordCat> { |
710 | use crate::tables::word as wd; |
711 | if idx > 0 { |
712 | let nch = self.string[..idx].chars().next_back().unwrap(); |
713 | Some(wd::word_category(nch).2) |
714 | } else { |
715 | None |
716 | } |
717 | } |
718 | } |
719 | |
720 | #[inline ] |
721 | pub fn new_word_bounds(s: &str) -> UWordBounds<'_> { |
722 | UWordBounds { |
723 | string: s, |
724 | cat: None, |
725 | catb: None, |
726 | } |
727 | } |
728 | |
729 | #[inline ] |
730 | pub fn new_word_bound_indices(s: &str) -> UWordBoundIndices<'_> { |
731 | UWordBoundIndices { |
732 | start_offset: s.as_ptr() as usize, |
733 | iter: new_word_bounds(s), |
734 | } |
735 | } |
736 | |
737 | #[inline ] |
738 | fn has_alphanumeric(s: &&str) -> bool { |
739 | use crate::tables::util::is_alphanumeric; |
740 | |
741 | s.chars().any(is_alphanumeric) |
742 | } |
743 | |
744 | #[inline ] |
745 | pub fn new_unicode_words(s: &str) -> UnicodeWords<'_> { |
746 | use super::UnicodeSegmentation; |
747 | |
748 | UnicodeWords { |
749 | inner: s.split_word_bounds().filter(has_alphanumeric), |
750 | } |
751 | } |
752 | |
753 | #[inline ] |
754 | pub fn new_unicode_word_indices(s: &str) -> UnicodeWordIndices<'_> { |
755 | use super::UnicodeSegmentation; |
756 | |
757 | UnicodeWordIndices { |
758 | inner: sUWordBoundIndices<'_> |
759 | .split_word_bound_indices() |
760 | .filter(|(_, c: &&str)| has_alphanumeric(c)), |
761 | } |
762 | } |
763 | |
#[cfg(test)]
mod tests {
    /// U+070F (Syriac abbreviation mark) must be categorized as `ALetter`.
    #[test]
    fn test_syriac_abbr_mark() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{70f}');
        assert_eq!(cat, wd::WC_ALetter);
    }

    /// U+06DD (Arabic end of ayah) must be categorized as `Numeric`.
    #[test]
    fn test_end_of_ayah_cat() {
        use crate::tables::word as wd;
        let (_, _, cat) = wd::word_category('\u{6dd}');
        assert_eq!(cat, wd::WC_Numeric);
    }
}
780 | |