1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | use crate::codepointtrie::CodePointMapRange; |
6 | |
7 | /// This is an iterator that coalesces adjacent ranges in an iterator over code |
8 | /// point ranges |
9 | pub(crate) struct RangeListIteratorCoalescer<I, T> { |
10 | iter: I, |
11 | peek: Option<CodePointMapRange<T>>, |
12 | } |
13 | |
14 | impl<I, T: Eq> RangeListIteratorCoalescer<I, T> |
15 | where |
16 | I: Iterator<Item = CodePointMapRange<T>>, |
17 | { |
18 | pub fn new(iter: I) -> Self { |
19 | Self { iter, peek: None } |
20 | } |
21 | } |
22 | |
23 | impl<I, T: Eq> Iterator for RangeListIteratorCoalescer<I, T> |
24 | where |
25 | I: Iterator<Item = CodePointMapRange<T>>, |
26 | { |
27 | type Item = CodePointMapRange<T>; |
28 | |
29 | fn next(&mut self) -> Option<Self::Item> { |
30 | // Get the initial range we're working with: either a leftover |
31 | // range from last time, or the next range |
32 | let mut ret = if let Some(peek) = self.peek.take() { |
33 | peek |
34 | } else if let Some(next) = self.iter.next() { |
35 | next |
36 | } else { |
37 | // No ranges, exit early |
38 | return None; |
39 | }; |
40 | |
41 | // Keep pulling ranges |
42 | #[allow (clippy::while_let_on_iterator)] |
43 | // can't move the iterator, also we want it to be explicit that we're not draining the iterator |
44 | while let Some(next) = self.iter.next() { |
45 | if *next.range.start() == ret.range.end() + 1 && next.value == ret.value { |
46 | // Range has no gap, coalesce |
47 | ret.range = *ret.range.start()..=*next.range.end(); |
48 | } else { |
49 | // Range has a gap, return what we have so far, update |
50 | // peek |
51 | self.peek = Some(next); |
52 | return Some(ret); |
53 | } |
54 | } |
55 | |
56 | // Ran out of elements, exit |
57 | Some(ret) |
58 | } |
59 | } |
60 | |
61 | #[cfg (test)] |
62 | mod tests { |
63 | use core::fmt::Debug; |
64 | use icu::collections::codepointinvlist::CodePointInversionListBuilder; |
65 | use icu::collections::codepointtrie::TrieValue; |
66 | use icu::properties::maps::{self, CodePointMapDataBorrowed}; |
67 | use icu::properties::sets::{self, CodePointSetDataBorrowed}; |
68 | use icu::properties::{GeneralCategory, Script}; |
69 | |
70 | fn test_set(data: CodePointSetDataBorrowed<'static>, name: &str) { |
71 | let mut builder = CodePointInversionListBuilder::new(); |
72 | let mut builder_complement = CodePointInversionListBuilder::new(); |
73 | |
74 | for range in data.iter_ranges() { |
75 | builder.add_range32(&range) |
76 | } |
77 | |
78 | for range in data.iter_ranges_complemented() { |
79 | builder_complement.add_range32(&range) |
80 | } |
81 | |
82 | builder.complement(); |
83 | let set1 = builder.build(); |
84 | let set2 = builder_complement.build(); |
85 | assert_eq!(set1, set2, "Set {name} failed to complement correctly" ); |
86 | } |
87 | |
88 | fn test_map<T: TrieValue + Debug>( |
89 | data: &CodePointMapDataBorrowed<'static, T>, |
90 | value: T, |
91 | name: &str, |
92 | ) { |
93 | let mut builder = CodePointInversionListBuilder::new(); |
94 | let mut builder_complement = CodePointInversionListBuilder::new(); |
95 | |
96 | for range in data.iter_ranges_for_value(value) { |
97 | builder.add_range32(&range) |
98 | } |
99 | |
100 | for range in data.iter_ranges_for_value_complemented(value) { |
101 | builder_complement.add_range32(&range) |
102 | } |
103 | |
104 | builder.complement(); |
105 | let set1 = builder.build(); |
106 | let set2 = builder_complement.build(); |
107 | assert_eq!( |
108 | set1, set2, |
109 | "Map {name} failed to complement correctly with value {value:?}" |
110 | ); |
111 | } |
112 | |
113 | #[test ] |
114 | fn test_complement_sets() { |
115 | // Stress test the RangeListIteratorComplementer logic by ensuring it works for |
116 | // a whole bunch of binary properties |
117 | test_set(sets::ascii_hex_digit(), "ASCII_Hex_Digit" ); |
118 | test_set(sets::alnum(), "Alnum" ); |
119 | test_set(sets::alphabetic(), "Alphabetic" ); |
120 | test_set(sets::bidi_control(), "Bidi_Control" ); |
121 | test_set(sets::bidi_mirrored(), "Bidi_Mirrored" ); |
122 | test_set(sets::blank(), "Blank" ); |
123 | test_set(sets::cased(), "Cased" ); |
124 | test_set(sets::case_ignorable(), "Case_Ignorable" ); |
125 | test_set( |
126 | sets::full_composition_exclusion(), |
127 | "Full_Composition_Exclusion" , |
128 | ); |
129 | test_set(sets::changes_when_casefolded(), "Changes_When_Casefolded" ); |
130 | test_set(sets::changes_when_casemapped(), "Changes_When_Casemapped" ); |
131 | test_set( |
132 | sets::changes_when_nfkc_casefolded(), |
133 | "Changes_When_NFKC_Casefolded" , |
134 | ); |
135 | test_set(sets::changes_when_lowercased(), "Changes_When_Lowercased" ); |
136 | test_set(sets::changes_when_titlecased(), "Changes_When_Titlecased" ); |
137 | test_set(sets::changes_when_uppercased(), "Changes_When_Uppercased" ); |
138 | test_set(sets::dash(), "Dash" ); |
139 | test_set(sets::deprecated(), "Deprecated" ); |
140 | test_set( |
141 | sets::default_ignorable_code_point(), |
142 | "Default_Ignorable_Code_Point" , |
143 | ); |
144 | test_set(sets::diacritic(), "Diacritic" ); |
145 | test_set(sets::emoji_modifier_base(), "Emoji_Modifier_Base" ); |
146 | test_set(sets::emoji_component(), "Emoji_Component" ); |
147 | test_set(sets::emoji_modifier(), "Emoji_Modifier" ); |
148 | test_set(sets::emoji(), "Emoji" ); |
149 | test_set(sets::emoji_presentation(), "Emoji_Presentation" ); |
150 | test_set(sets::extender(), "Extender" ); |
151 | test_set(sets::extended_pictographic(), "Extended_Pictographic" ); |
152 | test_set(sets::graph(), "Graph" ); |
153 | test_set(sets::grapheme_base(), "Grapheme_Base" ); |
154 | test_set(sets::grapheme_extend(), "Grapheme_Extend" ); |
155 | test_set(sets::grapheme_link(), "Grapheme_Link" ); |
156 | test_set(sets::hex_digit(), "Hex_Digit" ); |
157 | test_set(sets::hyphen(), "Hyphen" ); |
158 | test_set(sets::id_continue(), "Id_Continue" ); |
159 | test_set(sets::ideographic(), "Ideographic" ); |
160 | test_set(sets::id_start(), "Id_Start" ); |
161 | test_set(sets::ids_binary_operator(), "Ids_Binary_Operator" ); |
162 | test_set(sets::ids_trinary_operator(), "Ids_Trinary_Operator" ); |
163 | test_set(sets::join_control(), "Join_Control" ); |
164 | test_set(sets::logical_order_exception(), "Logical_Order_Exception" ); |
165 | test_set(sets::lowercase(), "Lowercase" ); |
166 | test_set(sets::math(), "Math" ); |
167 | test_set(sets::noncharacter_code_point(), "Noncharacter_Code_Point" ); |
168 | test_set(sets::nfc_inert(), "NFC_Inert" ); |
169 | test_set(sets::nfd_inert(), "NFD_Inert" ); |
170 | test_set(sets::nfkc_inert(), "NFKC_Inert" ); |
171 | test_set(sets::nfkd_inert(), "NFKD_Inert" ); |
172 | test_set(sets::pattern_syntax(), "Pattern_Syntax" ); |
173 | test_set(sets::pattern_white_space(), "Pattern_White_Space" ); |
174 | test_set( |
175 | sets::prepended_concatenation_mark(), |
176 | "Prepended_Concatenation_Mark" , |
177 | ); |
178 | test_set(sets::print(), "Print" ); |
179 | test_set(sets::quotation_mark(), "Quotation_Mark" ); |
180 | test_set(sets::radical(), "Radical" ); |
181 | test_set(sets::regional_indicator(), "Regional_Indicator" ); |
182 | test_set(sets::soft_dotted(), "Soft_Dotted" ); |
183 | test_set(sets::segment_starter(), "Segment_Starter" ); |
184 | test_set(sets::case_sensitive(), "Case_Sensitive" ); |
185 | test_set(sets::sentence_terminal(), "Sentence_Terminal" ); |
186 | test_set(sets::terminal_punctuation(), "Terminal_Punctuation" ); |
187 | test_set(sets::unified_ideograph(), "Unified_Ideograph" ); |
188 | test_set(sets::uppercase(), "Uppercase" ); |
189 | test_set(sets::variation_selector(), "Variation_Selector" ); |
190 | test_set(sets::white_space(), "White_Space" ); |
191 | test_set(sets::xdigit(), "Xdigit" ); |
192 | test_set(sets::xid_continue(), "XID_Continue" ); |
193 | test_set(sets::xid_start(), "XID_Start" ); |
194 | } |
195 | |
196 | #[test ] |
197 | fn test_complement_maps() { |
198 | let gc = maps::general_category(); |
199 | let script = maps::script(); |
200 | test_map(&gc, GeneralCategory::UppercaseLetter, "gc" ); |
201 | test_map(&gc, GeneralCategory::OtherPunctuation, "gc" ); |
202 | test_map(&script, Script::Devanagari, "script" ); |
203 | test_map(&script, Script::Latin, "script" ); |
204 | test_map(&script, Script::Common, "script" ); |
205 | } |
206 | } |
207 | |