1// This file is part of ICU4X. For terms of use, please see the file
2// called LICENSE at the top level of the ICU4X source tree
3// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5#[cfg(feature = "serde")]
6use alloc::format;
7#[cfg(feature = "serde")]
8use alloc::string::String;
9use alloc::vec::Vec;
10use core::{char, ops::RangeBounds, ops::RangeInclusive};
11use yoke::Yokeable;
12use zerofrom::ZeroFrom;
13use zerovec::{ule::AsULE, zerovec, ZeroVec};
14
15use super::CodePointInversionListError;
16use crate::codepointinvlist::utils::{deconstruct_range, is_valid_zv};
17
18/// Represents the end code point of the Basic Multilingual Plane range, starting from code point 0, inclusive
19const BMP_MAX: u32 = 0xFFFF;
20
21/// Represents the inversion list for a set of all code points in the Basic Multilingual Plane.
22const BMP_INV_LIST_VEC: ZeroVec<u32> =
23 zerovec!(u32; <u32 as AsULE>::ULE::from_unsigned; [0x0, BMP_MAX + 1]);
24
25/// Represents the inversion list for all of the code points in the Unicode range.
26const ALL_VEC: ZeroVec<u32> =
27 zerovec!(u32; <u32 as AsULE>::ULE::from_unsigned; [0x0, (char::MAX as u32) + 1]);
28
29/// A membership wrapper for [`CodePointInversionList`].
30///
31/// Provides exposure to membership functions and constructors from serialized `CodePointSet`s (sets of code points)
32/// and predefined ranges.
33#[zerovec::make_varule(CodePointInversionListULE)]
34#[zerovec::skip_derive(Ord)]
35#[zerovec::derive(Debug)]
36#[derive(Debug, Eq, PartialEq, Clone, Yokeable, ZeroFrom)]
37pub struct CodePointInversionList<'data> {
38 // If we wanted to use an array to keep the memory on the stack, there is an unsafe nightly feature
39 // https://doc.rust-lang.org/nightly/core/array/trait.FixedSizeArray.html
40 // Allows for traits of fixed size arrays
41
42 // Implements an [inversion list.](https://en.wikipedia.org/wiki/Inversion_list)
43 inv_list: ZeroVec<'data, u32>,
44 size: u32,
45}
46
47#[cfg(feature = "serde")]
48impl<'de: 'a, 'a> serde::Deserialize<'de> for CodePointInversionList<'a> {
49 fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
50 where
51 D: serde::Deserializer<'de>,
52 {
53 use serde::de::Error;
54
55 let parsed_inv_list = if deserializer.is_human_readable() {
56 #[derive(serde::Deserialize)]
57 #[serde(untagged)]
58 pub enum De<'data> {
59 // TODO(#2856): Remove in ICU4X 2.0
60 #[serde(borrow)]
61 OldStyle(ZeroVec<'data, u32>),
62 #[serde(borrow)]
63 NewStyle(Vec<alloc::borrow::Cow<'data, str>>),
64 }
65
66 match De::<'de>::deserialize(deserializer)? {
67 De::OldStyle(parsed_inv_list) => parsed_inv_list,
68 De::NewStyle(parsed_strings) => {
69 let mut inv_list =
70 ZeroVec::new_owned(Vec::with_capacity(parsed_strings.len() * 2));
71 for range in parsed_strings {
72 fn internal(range: &str) -> Option<(u32, u32)> {
73 let (start, range) = UnicodeCodePoint::parse(range)?;
74 if range.is_empty() {
75 return Some((start.0, start.0));
76 }
77 let (hyphen, range) = UnicodeCodePoint::parse(range)?;
78 if hyphen.0 != '-' as u32 {
79 return None;
80 }
81 let (end, range) = UnicodeCodePoint::parse(range)?;
82 range.is_empty().then_some((start.0, end.0))
83 }
84 let (start, end) = internal(&range).ok_or_else(|| Error::custom(format!(
85 "Cannot deserialize invalid inversion list for CodePointInversionList: {range:?}"
86 )))?;
87 inv_list.with_mut(|v| {
88 v.push(start.to_unaligned());
89 v.push((end + 1).to_unaligned());
90 });
91 }
92 inv_list
93 }
94 }
95 } else {
96 ZeroVec::<u32>::deserialize(deserializer)?
97 };
98 CodePointInversionList::try_from_inversion_list(parsed_inv_list).map_err(|e| {
99 Error::custom(format!(
100 "Cannot deserialize invalid inversion list for CodePointInversionList: {e:?}"
101 ))
102 })
103 }
104}
105
106#[cfg(feature = "databake")]
107impl databake::Bake for CodePointInversionList<'_> {
108 fn bake(&self, env: &databake::CrateEnv) -> databake::TokenStream {
109 env.insert("icu_collections");
110 let inv_list = self.inv_list.bake(env);
111 let size = self.size.bake(env);
112 // Safe because our parts are safe.
113 databake::quote! { unsafe {
114 #[allow(unused_unsafe)]
115 icu_collections::codepointinvlist::CodePointInversionList::from_parts_unchecked(#inv_list, #size)
116 }}
117 }
118}
119
120#[cfg(feature = "serde")]
121#[derive(Debug, Copy, Clone)]
122struct UnicodeCodePoint(u32);
123
124#[cfg(feature = "serde")]
125impl UnicodeCodePoint {
126 fn from_u32(cp: u32) -> Result<Self, String> {
127 if cp <= char::MAX as u32 {
128 Ok(Self(cp))
129 } else {
130 Err(format!("Not a Unicode code point {}", cp))
131 }
132 }
133
134 fn parse(value: &str) -> Option<(Self, &str)> {
135 Some(if let Some(hex) = value.strip_prefix("U+") {
136 let (escape, remainder) = (hex.get(..4)?, hex.get(4..)?);
137 (Self(u32::from_str_radix(escape, 16).ok()?), remainder)
138 } else {
139 let c = value.chars().next()?;
140 (Self(c as u32), value.get(c.len_utf8()..)?)
141 })
142 }
143}
144
145#[cfg(feature = "serde")]
146impl core::fmt::Display for UnicodeCodePoint {
147 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
148 match self.0 {
149 s @ 0xD800..=0xDFFF => write!(f, "U+{s:X}"),
150 // SAFETY: c <= char::MAX by construction, and not a surrogate
151 c => write!(f, "{}", unsafe { char::from_u32_unchecked(c) }),
152 }
153 }
154}
155
156#[cfg(feature = "serde")]
157impl<'data> serde::Serialize for CodePointInversionList<'data> {
158 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
159 where
160 S: serde::Serializer,
161 {
162 if serializer.is_human_readable() {
163 use serde::ser::Error;
164 use serde::ser::SerializeSeq;
165 let mut seq = serializer.serialize_seq(Some(self.inv_list.len() / 2))?;
166 for range in self.iter_ranges() {
167 let start = UnicodeCodePoint::from_u32(*range.start()).map_err(S::Error::custom)?;
168 if range.start() == range.end() {
169 seq.serialize_element(&format!("{start}"))?;
170 } else {
171 let end = UnicodeCodePoint::from_u32(*range.end()).map_err(S::Error::custom)?;
172 seq.serialize_element(&format!("{start}-{end}",))?;
173 }
174 }
175 seq.end()
176 } else {
177 // Note: serde(flatten) currently does not promote a struct field of type Vec
178 // to replace the struct when serializing. The error message from the default
179 // serialization is: "can only flatten structs and maps (got a sequence)".
180 self.inv_list.serialize(serializer)
181 }
182 }
183}
184
185impl<'data> CodePointInversionList<'data> {
186 /// Returns a new [`CodePointInversionList`] from an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
187 /// represented as a [`ZeroVec`]`<`[`u32`]`>` of code points.
188 ///
189 /// The inversion list must be of even length, sorted ascending non-overlapping,
190 /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
191 ///
192 /// # Examples
193 ///
194 /// ```
195 /// use icu::collections::codepointinvlist::CodePointInversionList;
196 /// use icu::collections::codepointinvlist::CodePointInversionListError;
197 /// use zerovec::ZeroVec;
198 /// let valid = [0x0, 0x10000];
199 /// let inv_list: ZeroVec<u32> = ZeroVec::from_slice_or_alloc(&valid);
200 /// let result = CodePointInversionList::try_from_inversion_list(inv_list);
201 /// assert!(matches!(result, CodePointInversionList));
202 ///
203 /// let invalid: Vec<u32> = vec![0x0, 0x80, 0x3];
204 /// let inv_list: ZeroVec<u32> = ZeroVec::from_slice_or_alloc(&invalid);
205 /// let result = CodePointInversionList::try_from_inversion_list(inv_list);
206 /// assert!(matches!(
207 /// result,
208 /// Err(CodePointInversionListError::InvalidSet(_))
209 /// ));
210 /// if let Err(CodePointInversionListError::InvalidSet(actual)) = result {
211 /// assert_eq!(&invalid, &actual);
212 /// }
213 /// ```
214 pub fn try_from_inversion_list(
215 inv_list: ZeroVec<'data, u32>,
216 ) -> Result<Self, CodePointInversionListError> {
217 #[allow(clippy::indexing_slicing)] // chunks
218 if is_valid_zv(&inv_list) {
219 let size = inv_list
220 .as_ule_slice()
221 .chunks(2)
222 .map(|end_points| {
223 <u32 as AsULE>::from_unaligned(end_points[1])
224 - <u32 as AsULE>::from_unaligned(end_points[0])
225 })
226 .sum::<u32>();
227 Ok(Self { inv_list, size })
228 } else {
229 Err(CodePointInversionListError::InvalidSet(inv_list.to_vec()))
230 }
231 }
232
233 #[doc(hidden)] // databake internal
234 pub const unsafe fn from_parts_unchecked(inv_list: ZeroVec<'data, u32>, size: u32) -> Self {
235 Self { inv_list, size }
236 }
237
238 /// Returns a new [`CodePointInversionList`] by borrowing an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
239 /// represented as a slice of [`u32`] code points.
240 ///
241 /// The inversion list must be of even length, sorted ascending non-overlapping,
242 /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
243 ///
244 /// Note: The slice may be cloned on certain platforms; for more information, see [`ZeroVec::from_slice_or_alloc`].
245 ///
246 /// # Examples
247 ///
248 /// ```
249 /// use icu::collections::codepointinvlist::CodePointInversionList;
250 /// use icu::collections::codepointinvlist::CodePointInversionListError;
251 /// let valid = [0x0, 0x10000];
252 /// let result = CodePointInversionList::try_from_inversion_list_slice(&valid);
253 /// assert!(matches!(result, CodePointInversionList));
254 ///
255 /// let invalid: Vec<u32> = vec![0x0, 0x80, 0x3];
256 /// let result =
257 /// CodePointInversionList::try_from_inversion_list_slice(&invalid);
258 /// assert!(matches!(
259 /// result,
260 /// Err(CodePointInversionListError::InvalidSet(_))
261 /// ));
262 /// if let Err(CodePointInversionListError::InvalidSet(actual)) = result {
263 /// assert_eq!(&invalid, &actual);
264 /// }
265 /// ```
266 pub fn try_from_inversion_list_slice(
267 inv_list: &'data [u32],
268 ) -> Result<Self, CodePointInversionListError> {
269 let inv_list_zv: ZeroVec<u32> = ZeroVec::from_slice_or_alloc(inv_list);
270 CodePointInversionList::try_from_inversion_list(inv_list_zv)
271 }
272
273 /// Returns a new, fully-owned [`CodePointInversionList`] by cloning an [inversion list](https://en.wikipedia.org/wiki/Inversion_list)
274 /// represented as a slice of [`u32`] code points.
275 ///
276 /// The inversion list must be of even length, sorted ascending non-overlapping,
277 /// and within the bounds of `0x0 -> 0x10FFFF` inclusive, and end points being exclusive.
278 ///
279 /// # Examples
280 ///
281 /// ```
282 /// use icu::collections::codepointinvlist::CodePointInversionList;
283 ///
284 /// let bmp_list = &[0x0, 0x10000];
285 /// let smp_list = &[0x10000, 0x20000];
286 /// let sip_list = &[0x20000, 0x30000];
287 ///
288 /// let lists: Vec<CodePointInversionList> =
289 /// [&bmp_list[..], smp_list, sip_list]
290 /// .into_iter()
291 /// .map(|l| {
292 /// CodePointInversionList::try_clone_from_inversion_list_slice(l)
293 /// .unwrap()
294 /// })
295 /// .collect();
296 ///
297 /// let bmp = &lists[0];
298 /// assert!(bmp.contains32(0xFFFF));
299 /// assert!(!bmp.contains32(0x10000));
300 ///
301 /// assert!(!lists.iter().any(|set| set.contains32(0x40000)));
302 /// ```
303 pub fn try_clone_from_inversion_list_slice(
304 inv_list: &[u32],
305 ) -> Result<Self, CodePointInversionListError> {
306 let inv_list_zv: ZeroVec<u32> = ZeroVec::alloc_from_slice(inv_list);
307 CodePointInversionList::try_from_inversion_list(inv_list_zv)
308 }
309
310 /// Attempts to convert this list into a fully-owned one. No-op if already fully owned
311 pub fn into_owned(self) -> CodePointInversionList<'static> {
312 CodePointInversionList {
313 inv_list: self.inv_list.into_owned(),
314 size: self.size,
315 }
316 }
317
318 /// Returns an owned inversion list representing the current [`CodePointInversionList`]
319 pub fn get_inversion_list_vec(&self) -> Vec<u32> {
320 let result: Vec<u32> = self.as_inversion_list().to_vec(); // Only crate public, to not leak impl
321 result
322 }
323
324 /// Returns [`CodePointInversionList`] spanning entire Unicode range
325 ///
326 /// The range spans from `0x0 -> 0x10FFFF` inclusive.
327 ///
328 /// # Examples
329 ///
330 /// ```
331 /// use icu::collections::codepointinvlist::CodePointInversionList;
332 ///
333 /// let expected = [0x0, (char::MAX as u32) + 1];
334 /// assert_eq!(
335 /// CodePointInversionList::all().get_inversion_list_vec(),
336 /// expected
337 /// );
338 /// assert_eq!(
339 /// CodePointInversionList::all().size(),
340 /// (expected[1] - expected[0]) as usize
341 /// );
342 /// ```
343 pub fn all() -> Self {
344 Self {
345 inv_list: ALL_VEC,
346 size: (char::MAX as u32) + 1,
347 }
348 }
349
350 /// Returns [`CodePointInversionList`] spanning BMP range
351 ///
352 /// The range spans from `0x0 -> 0xFFFF` inclusive.
353 ///
354 /// # Examples
355 ///
356 /// ```
357 /// use icu::collections::codepointinvlist::CodePointInversionList;
358 ///
359 /// const BMP_MAX: u32 = 0xFFFF;
360 ///
361 /// let expected = [0x0, BMP_MAX + 1];
362 /// assert_eq!(
363 /// CodePointInversionList::bmp().get_inversion_list_vec(),
364 /// expected
365 /// );
366 /// assert_eq!(
367 /// CodePointInversionList::bmp().size(),
368 /// (expected[1] - expected[0]) as usize
369 /// );
370 /// ```
371 pub fn bmp() -> Self {
372 Self {
373 inv_list: BMP_INV_LIST_VEC,
374 size: BMP_MAX + 1,
375 }
376 }
377
378 /// Returns the inversion list as a slice
379 ///
380 /// Public only to the crate, not exposed to public
381 pub(crate) fn as_inversion_list(&self) -> &ZeroVec<u32> {
382 &self.inv_list
383 }
384
385 /// Yields an [`Iterator`] going through the character set in the [`CodePointInversionList`]
386 ///
387 /// # Examples
388 ///
389 /// ```
390 /// use icu::collections::codepointinvlist::CodePointInversionList;
391 /// let example_list = [0x41, 0x44, 0x45, 0x46];
392 /// let example =
393 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
394 /// .unwrap();
395 /// let mut ex_iter_chars = example.iter_chars();
396 /// assert_eq!(Some('A'), ex_iter_chars.next());
397 /// assert_eq!(Some('B'), ex_iter_chars.next());
398 /// assert_eq!(Some('C'), ex_iter_chars.next());
399 /// assert_eq!(Some('E'), ex_iter_chars.next());
400 /// assert_eq!(None, ex_iter_chars.next());
401 /// ```
402 pub fn iter_chars(&self) -> impl Iterator<Item = char> + '_ {
403 #[allow(clippy::indexing_slicing)] // chunks
404 self.inv_list
405 .as_ule_slice()
406 .chunks(2)
407 .flat_map(|pair| (AsULE::from_unaligned(pair[0])..AsULE::from_unaligned(pair[1])))
408 .filter_map(char::from_u32)
409 }
410
411 /// Yields an [`Iterator`] returning the ranges of the code points that are
412 /// included in the [`CodePointInversionList`]
413 ///
414 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
415 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
416 /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
417 ///
418 /// # Example
419 ///
420 /// ```
421 /// use icu::collections::codepointinvlist::CodePointInversionList;
422 /// let example_list = [0x41, 0x44, 0x45, 0x46];
423 /// let example =
424 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
425 /// .unwrap();
426 /// let mut example_iter_ranges = example.iter_ranges();
427 /// assert_eq!(Some(0x41..=0x43), example_iter_ranges.next());
428 /// assert_eq!(Some(0x45..=0x45), example_iter_ranges.next());
429 /// assert_eq!(None, example_iter_ranges.next());
430 /// ```
431 pub fn iter_ranges(&self) -> impl ExactSizeIterator<Item = RangeInclusive<u32>> + '_ {
432 #[allow(clippy::indexing_slicing)] // chunks
433 self.inv_list.as_ule_slice().chunks(2).map(|pair| {
434 let range_start: u32 = AsULE::from_unaligned(pair[0]);
435 let range_limit: u32 = AsULE::from_unaligned(pair[1]);
436 RangeInclusive::new(range_start, range_limit - 1)
437 })
438 }
439
440 /// Yields an [`Iterator`] returning the ranges of the code points that are
441 /// *not* included in the [`CodePointInversionList`]
442 ///
443 /// Ranges are returned as [`RangeInclusive`], which is inclusive of its
444 /// `end` bound value. An end-inclusive behavior matches the ICU4C/J
445 /// behavior of ranges, ex: `CodePointInversionList::contains(UChar32 start, UChar32 end)`.
446 ///
447 /// # Example
448 ///
449 /// ```
450 /// use icu::collections::codepointinvlist::CodePointInversionList;
451 /// let example_list = [0x41, 0x44, 0x45, 0x46];
452 /// let example =
453 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
454 /// .unwrap();
455 /// let mut example_iter_ranges = example.iter_ranges_complemented();
456 /// assert_eq!(Some(0..=0x40), example_iter_ranges.next());
457 /// assert_eq!(Some(0x44..=0x44), example_iter_ranges.next());
458 /// assert_eq!(Some(0x46..=char::MAX as u32), example_iter_ranges.next());
459 /// assert_eq!(None, example_iter_ranges.next());
460 /// ```
461 pub fn iter_ranges_complemented(&self) -> impl Iterator<Item = RangeInclusive<u32>> + '_ {
462 let inv_ule = self.inv_list.as_ule_slice();
463 let middle = inv_ule.get(1..inv_ule.len() - 1).unwrap_or(&[]);
464 let beginning = if let Some(first) = self.inv_list.first() {
465 if first == 0 {
466 None
467 } else {
468 Some(0..=first - 1)
469 }
470 } else {
471 None
472 };
473 let end = if let Some(last) = self.inv_list.last() {
474 if last == char::MAX as u32 {
475 None
476 } else {
477 Some(last..=char::MAX as u32)
478 }
479 } else {
480 None
481 };
482 #[allow(clippy::indexing_slicing)] // chunks
483 let chunks = middle.chunks(2).map(|pair| {
484 let range_start: u32 = AsULE::from_unaligned(pair[0]);
485 let range_limit: u32 = AsULE::from_unaligned(pair[1]);
486 RangeInclusive::new(range_start, range_limit - 1)
487 });
488 beginning.into_iter().chain(chunks).chain(end)
489 }
490
491 /// Returns the number of ranges contained in this [`CodePointInversionList`]
492 pub fn get_range_count(&self) -> usize {
493 self.inv_list.len() / 2
494 }
495
496 /// Returns a specific range contained in this [`CodePointInversionList`] by index.
497 /// Intended for use in FFI.
498 pub fn get_nth_range(&self, idx: usize) -> Option<RangeInclusive<u32>> {
499 let start_idx = idx * 2;
500 let end_idx = start_idx + 1;
501 let start = self.inv_list.get(start_idx)?;
502 let end = self.inv_list.get(end_idx)?;
503 Some(RangeInclusive::new(start, end - 1))
504 }
505
506 /// Returns the number of elements of the [`CodePointInversionList`]
507 pub fn size(&self) -> usize {
508 if self.is_empty() {
509 return 0;
510 }
511 self.size as usize
512 }
513
514 /// Returns whether or not the [`CodePointInversionList`] is empty
515 pub fn is_empty(&self) -> bool {
516 self.inv_list.is_empty()
517 }
518
519 /// Wrapper for contains
520 ///
521 /// Returns an [`Option`] as to whether or not it is possible for the query to be contained.
522 /// The value in the [`Option`] is the start index of the range that contains the query.
523 fn contains_query(&self, query: u32) -> Option<usize> {
524 match self.inv_list.binary_search(&query) {
525 Ok(pos) => {
526 if pos % 2 == 0 {
527 Some(pos)
528 } else {
529 None
530 }
531 }
532 Err(pos) => {
533 if pos % 2 != 0 && pos < self.inv_list.len() {
534 Some(pos - 1)
535 } else {
536 None
537 }
538 }
539 }
540 }
541
542 /// Checks to see the query is in the [`CodePointInversionList`]
543 ///
544 /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
545 /// in the set using [`core`] implementation
546 ///
547 /// # Examples
548 ///
549 /// ```
550 /// use icu::collections::codepointinvlist::CodePointInversionList;
551 /// let example_list = [0x41, 0x43, 0x44, 0x45];
552 /// let example =
553 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
554 /// .unwrap();
555 /// assert!(example.contains('A'));
556 /// assert!(!example.contains('C'));
557 /// ```
558 pub fn contains(&self, query: char) -> bool {
559 self.contains_query(query as u32).is_some()
560 }
561
562 /// Checks to see the unsigned int is in the [`CodePointInversionList::all()`](CodePointInversionList::all())
563 ///
564 /// Note: Even though [`u32`] and [`prim@char`] in Rust are non-negative 4-byte
565 /// values, there is an important difference. A [`u32`] can take values up to
566 /// a very large integer value, while a [`prim@char`] in Rust is defined to be in
567 /// the range from 0 to the maximum valid Unicode Scalar Value.
568 ///
569 /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
570 /// in the set using [`core`] implementation
571 ///
572 /// # Examples
573 ///
574 /// ```
575 /// use icu::collections::codepointinvlist::CodePointInversionList;
576 /// let example_list = [0x41, 0x43, 0x44, 0x45];
577 /// let example =
578 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
579 /// .unwrap();
580 /// assert!(example.contains32(0x41));
581 /// assert!(!example.contains32(0x43));
582 /// ```
583 pub fn contains32(&self, query: u32) -> bool {
584 self.contains_query(query).is_some()
585 }
586
587 /// Checks to see if the range is in the [`CodePointInversionList`]
588 ///
589 /// Runs a binary search in `O(log(n))` where `n` is the number of start and end points
590 /// in the set using [`Vec`] implementation. Only runs the search once on the `start`
591 /// parameter, while the `end` parameter is checked in a single `O(1)` step.
592 ///
593 /// # Examples
594 ///
595 /// ```
596 /// use icu::collections::codepointinvlist::CodePointInversionList;
597 /// let example_list = [0x41, 0x43, 0x44, 0x45];
598 /// let example =
599 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
600 /// .unwrap();
601 /// assert!(example.contains_range(&('A'..'C')));
602 /// assert!(example.contains_range(&('A'..='B')));
603 /// assert!(!example.contains_range(&('A'..='C')));
604 /// ```
605 ///
606 /// Surrogate points (`0xD800 -> 0xDFFF`) will return [`false`] if the Range contains them but the
607 /// [`CodePointInversionList`] does not.
608 ///
609 /// Note: when comparing to ICU4C/J, keep in mind that `Range`s in Rust are
610 /// constructed inclusive of start boundary and exclusive of end boundary.
611 /// The ICU4C/J `CodePointInversionList::contains(UChar32 start, UChar32 end)` method
612 /// differs by including the end boundary.
613 ///
614 /// # Examples
615 ///
616 /// ```
617 /// use icu::collections::codepointinvlist::CodePointInversionList;
618 /// use std::char;
619 /// let check =
620 /// char::from_u32(0xD7FE).unwrap()..char::from_u32(0xE001).unwrap();
621 /// let example_list = [0xD7FE, 0xD7FF, 0xE000, 0xE001];
622 /// let example =
623 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
624 /// .unwrap();
625 /// assert!(!example.contains_range(&(check)));
626 /// ```
627 pub fn contains_range(&self, range: &impl RangeBounds<char>) -> bool {
628 let (from, till) = deconstruct_range(range);
629 if from >= till {
630 return false;
631 }
632 match self.contains_query(from) {
633 Some(pos) => {
634 if let Some(x) = self.inv_list.get(pos + 1) {
635 (till) <= x
636 } else {
637 debug_assert!(
638 false,
639 "Inversion list query should not return out of bounds index"
640 );
641 false
642 }
643 }
644 None => false,
645 }
646 }
647
648 /// Check if the calling [`CodePointInversionList`] contains all the characters of the given [`CodePointInversionList`]
649 ///
650 /// # Examples
651 ///
652 /// ```
653 /// use icu::collections::codepointinvlist::CodePointInversionList;
654 /// let example_list = [0x41, 0x46, 0x55, 0x5B]; // A - E, U - Z
655 /// let example =
656 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
657 /// .unwrap();
658 /// let a_to_d =
659 /// CodePointInversionList::try_from_inversion_list_slice(&[0x41, 0x45])
660 /// .unwrap();
661 /// let f_to_t =
662 /// CodePointInversionList::try_from_inversion_list_slice(&[0x46, 0x55])
663 /// .unwrap();
664 /// let r_to_x =
665 /// CodePointInversionList::try_from_inversion_list_slice(&[0x52, 0x58])
666 /// .unwrap();
667 /// assert!(example.contains_set(&a_to_d)); // contains all
668 /// assert!(!example.contains_set(&f_to_t)); // contains none
669 /// assert!(!example.contains_set(&r_to_x)); // contains some
670 /// ```
671 pub fn contains_set(&self, set: &Self) -> bool {
672 if set.size() > self.size() {
673 return false;
674 }
675
676 let mut set_ranges = set.iter_ranges();
677 let mut check_elem = set_ranges.next();
678
679 let ranges = self.iter_ranges();
680 for range in ranges {
681 match check_elem {
682 Some(ref check_range) => {
683 if check_range.start() >= range.start()
684 && check_range.end() <= &(range.end() + 1)
685 {
686 check_elem = set_ranges.next();
687 }
688 }
689 _ => break,
690 }
691 }
692 check_elem.is_none()
693 }
694
695 /// Returns the end of the initial substring where the characters are either contained/not contained
696 /// in the set.
697 ///
698 /// # Examples
699 ///
700 /// ```
701 /// use icu::collections::codepointinvlist::CodePointInversionList;
702 /// let example_list = [0x41, 0x44]; // {A, B, C}
703 /// let example =
704 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
705 /// .unwrap();
706 /// assert_eq!(example.span("CABXYZ", true), 3);
707 /// assert_eq!(example.span("XYZC", false), 3);
708 /// assert_eq!(example.span("XYZ", true), 0);
709 /// assert_eq!(example.span("ABC", false), 0);
710 /// ```
711 pub fn span(&self, span_str: &str, contained: bool) -> usize {
712 span_str
713 .chars()
714 .take_while(|&x| self.contains(x) == contained)
715 .count()
716 }
717
718 /// Returns the start of the trailing substring (starting from end of string) where the characters are
719 /// either contained/not contained in the set. Returns the length of the string if no valid return.
720 ///
721 /// # Examples
722 ///
723 /// ```
724 /// use icu::collections::codepointinvlist::CodePointInversionList;
725 /// let example_list = [0x41, 0x44]; // {A, B, C}
726 /// let example =
727 /// CodePointInversionList::try_from_inversion_list_slice(&example_list)
728 /// .unwrap();
729 /// assert_eq!(example.span_back("XYZCAB", true), 3);
730 /// assert_eq!(example.span_back("ABCXYZ", true), 6);
731 /// assert_eq!(example.span_back("CABXYZ", false), 3);
732 /// ```
733 pub fn span_back(&self, span_str: &str, contained: bool) -> usize {
734 span_str.len()
735 - span_str
736 .chars()
737 .rev()
738 .take_while(|&x| self.contains(x) == contained)
739 .count()
740 }
741}
742
743#[cfg(test)]
744mod tests {
745 use super::{CodePointInversionList, CodePointInversionListError};
746 use std::{char, vec::Vec};
747 use zerovec::ZeroVec;
748
749 #[test]
750 fn test_codepointinversionlist_try_from_vec() {
751 let ex = vec![0x2, 0x3, 0x4, 0x5];
752 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
753 assert_eq!(ex, check.get_inversion_list_vec());
754 assert_eq!(2, check.size());
755 }
756
757 #[test]
758 fn test_codepointinversionlist_try_from_vec_error() {
759 let check = vec![0x1, 0x1, 0x2, 0x3, 0x4];
760 let inv_list = ZeroVec::from_slice_or_alloc(&check);
761 let set = CodePointInversionList::try_from_inversion_list(inv_list);
762 assert!(matches!(
763 set,
764 Err(CodePointInversionListError::InvalidSet(_))
765 ));
766 if let Err(CodePointInversionListError::InvalidSet(actual)) = set {
767 assert_eq!(&check, &actual);
768 }
769 }
770
771 // CodePointInversionList membership functions
772 #[test]
773 fn test_codepointinversionlist_contains_query() {
774 let ex = vec![0x41, 0x46, 0x4B, 0x55];
775 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
776 assert!(check.contains_query(0x40).is_none());
777 assert_eq!(check.contains_query(0x41).unwrap(), 0);
778 assert_eq!(check.contains_query(0x44).unwrap(), 0);
779 assert!(check.contains_query(0x46).is_none());
780 assert_eq!(check.contains_query(0x4C).unwrap(), 2);
781 assert!(check.contains_query(0x56).is_none());
782 }
783
784 #[test]
785 fn test_codepointinversionlist_contains() {
786 let ex = vec![0x2, 0x5, 0xA, 0xF];
787 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
788 assert!(check.contains(0x2 as char));
789 assert!(check.contains(0x4 as char));
790 assert!(check.contains(0xA as char));
791 assert!(check.contains(0xE as char));
792 }
793
794 #[test]
795 fn test_codepointinversionlist_contains_false() {
796 let ex = vec![0x2, 0x5, 0xA, 0xF];
797 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
798 assert!(!check.contains(0x1 as char));
799 assert!(!check.contains(0x5 as char));
800 assert!(!check.contains(0x9 as char));
801 assert!(!check.contains(0xF as char));
802 assert!(!check.contains(0x10 as char));
803 }
804
805 #[test]
806 fn test_codepointinversionlist_contains_range() {
807 let ex = vec![0x41, 0x46, 0x4B, 0x55];
808 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
809 assert!(check.contains_range(&('A'..='E'))); // 65 - 69
810 assert!(check.contains_range(&('C'..'D'))); // 67 - 67
811 assert!(check.contains_range(&('L'..'P'))); // 76 - 80
812 assert!(!check.contains_range(&('L'..='U'))); // 76 - 85
813 }
814
815 #[test]
816 fn test_codepointinversionlist_contains_range_false() {
817 let ex = vec![0x41, 0x46, 0x4B, 0x55];
818 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
819 assert!(!check.contains_range(&('!'..'A'))); // 33 - 65
820 assert!(!check.contains_range(&('F'..'K'))); // 70 - 74
821 assert!(!check.contains_range(&('U'..))); // 85 - ..
822 }
823
824 #[test]
825 fn test_codepointinversionlist_contains_range_invalid() {
826 let check = CodePointInversionList::all();
827 assert!(!check.contains_range(&('A'..'!'))); // 65 - 33
828 assert!(!check.contains_range(&('A'..'A'))); // 65 - 65
829 }
830
831 #[test]
832 fn test_codepointinversionlist_contains_set_u() {
833 let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x6E];
834 let u = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
835 let inside = vec![0xF, 0x14, 0x2C, 0x31, 0x46, 0x50, 0x64, 0x6D];
836 let s = CodePointInversionList::try_from_inversion_list_slice(&inside).unwrap();
837 assert!(u.contains_set(&s));
838 }
839
840 #[test]
841 fn test_codepointinversionlist_contains_set_u_false() {
842 let ex = vec![0xA, 0x14, 0x28, 0x32, 0x46, 0x50, 0x64, 0x78];
843 let u = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
844 let outside = vec![0x0, 0xA, 0x16, 0x2C, 0x32, 0x46, 0x4F, 0x51, 0x6D, 0x6F];
845 let s = CodePointInversionList::try_from_inversion_list_slice(&outside).unwrap();
846 assert!(!u.contains_set(&s));
847 }
848
849 #[test]
850 fn test_codepointinversionlist_size() {
851 let ex = vec![0x2, 0x5, 0xA, 0xF];
852 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
853 assert_eq!(8, check.size());
854 let check = CodePointInversionList::all();
855 let expected = (char::MAX as u32) + 1;
856 assert_eq!(expected as usize, check.size());
857 let inv_list_vec: Vec<u32> = vec![];
858 let check = CodePointInversionList {
859 inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
860 size: 0,
861 };
862 assert_eq!(check.size(), 0);
863 }
864
865 #[test]
866 fn test_codepointinversionlist_is_empty() {
867 let inv_list_vec: Vec<u32> = vec![];
868 let check = CodePointInversionList {
869 inv_list: ZeroVec::from_slice_or_alloc(&inv_list_vec),
870 size: 0,
871 };
872 assert!(check.is_empty());
873 }
874
875 #[test]
876 fn test_codepointinversionlist_is_not_empty() {
877 let check = CodePointInversionList::all();
878 assert!(!check.is_empty());
879 }
880
881 #[test]
882 fn test_codepointinversionlist_iter_chars() {
883 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
884 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
885 let mut iter = check.iter_chars();
886 assert_eq!(Some('A'), iter.next());
887 assert_eq!(Some('B'), iter.next());
888 assert_eq!(Some('C'), iter.next());
889 assert_eq!(Some('E'), iter.next());
890 assert_eq!(None, iter.next());
891 }
892
893 #[test]
894 fn test_codepointinversionlist_iter_ranges() {
895 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
896 let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
897 let mut ranges = set.iter_ranges();
898 assert_eq!(Some(0x41..=0x43), ranges.next());
899 assert_eq!(Some(0x45..=0x45), ranges.next());
900 assert_eq!(Some(0xD800..=0xD800), ranges.next());
901 assert_eq!(None, ranges.next());
902 }
903
904 #[test]
905 fn test_codepointinversionlist_iter_ranges_exactsizeiter_trait() {
906 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
907 let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
908 let ranges = set.iter_ranges();
909 assert_eq!(3, ranges.len());
910 }
911
912 #[test]
913 fn test_codepointinversionlist_range_count() {
914 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
915 let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
916 assert_eq!(3, set.get_range_count());
917 }
918
919 #[test]
920 fn test_codepointinversionlist_get_nth_range() {
921 let ex = vec![0x41, 0x44, 0x45, 0x46, 0xD800, 0xD801];
922 let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
923 assert_eq!(Some(0x41..=0x43), set.get_nth_range(0));
924 assert_eq!(Some(0x45..=0x45), set.get_nth_range(1));
925 assert_eq!(Some(0xD800..=0xD800), set.get_nth_range(2));
926 assert_eq!(None, set.get_nth_range(3));
927 }
928
929 // Range<char> cannot represent the upper bound (non-inclusive) for
930 // char::MAX, whereas Range<u32> can.
931 #[test]
932 fn test_codepointinversionlist_iter_ranges_with_max_code_point() {
933 let ex = vec![0x80, (char::MAX as u32) + 1];
934 let set = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
935 let mut ranges = set.iter_ranges();
936 assert_eq!(Some(0x80..=(char::MAX as u32)), ranges.next());
937 assert_eq!(None, ranges.next());
938 }
939
940 #[test]
941 fn test_codepointinversionlist_span_contains() {
942 let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
943 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
944 assert_eq!(check.span("ABCDE", true), 3);
945 assert_eq!(check.span("E", true), 0);
946 }
947
948 #[test]
949 fn test_codepointinversionlist_span_does_not_contain() {
950 let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
951 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
952 assert_eq!(check.span("DEF", false), 2);
953 assert_eq!(check.span("KLMA", false), 3);
954 }
955
956 #[test]
957 fn test_codepointinversionlist_span_back_contains() {
958 let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
959 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
960 assert_eq!(check.span_back("XYZABFH", true), 3);
961 assert_eq!(check.span_back("ABCXYZ", true), 6);
962 }
963
964 #[test]
965 fn test_codepointinversionlist_span_back_does_not_contain() {
966 let ex = vec![0x41, 0x44, 0x46, 0x4B]; // A - D, F - K
967 let check = CodePointInversionList::try_from_inversion_list_slice(&ex).unwrap();
968 assert_eq!(check.span_back("ABCXYZ", false), 3);
969 assert_eq!(check.span_back("XYZABC", false), 6);
970 }
971
972 #[test]
973 fn test_uniset_to_inv_list() {
974 let inv_list = [
975 0x9, 0xE, 0x20, 0x21, 0x85, 0x86, 0xA0, 0xA1, 0x1626, 0x1627, 0x2000, 0x2003, 0x2028,
976 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
977 ];
978 let s: CodePointInversionList =
979 CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
980 let round_trip_inv_list = s.get_inversion_list_vec();
981 assert_eq!(round_trip_inv_list, inv_list);
982 }
983
984 #[test]
985 fn test_serde_serialize() {
986 let inv_list = [0x41, 0x46, 0x4B, 0x55];
987 let uniset = CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
988 let json_str = serde_json::to_string(&uniset).unwrap();
989 assert_eq!(json_str, r#"["A-E","K-T"]"#);
990 }
991
992 #[test]
993 fn test_serde_serialize_surrogates() {
994 let inv_list = [0xDFAB, 0xDFFF];
995 let uniset = CodePointInversionList::try_from_inversion_list_slice(&inv_list).unwrap();
996 let json_str = serde_json::to_string(&uniset).unwrap();
997 assert_eq!(json_str, r#"["U+DFAB-U+DFFE"]"#);
998 }
999
1000 #[test]
1001 fn test_serde_deserialize() {
1002 let inv_list_str = r#"["A-E","K-T"]"#;
1003 let exp_inv_list = [0x41, 0x46, 0x4B, 0x55];
1004 let exp_uniset =
1005 CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
1006 let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1007 assert_eq!(act_uniset, exp_uniset);
1008 }
1009
1010 #[test]
1011 fn test_serde_deserialize_surrogates() {
1012 let inv_list_str = r#"["U+DFAB-U+DFFE"]"#;
1013 let exp_inv_list = [0xDFAB, 0xDFFF];
1014 let exp_uniset =
1015 CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
1016 let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1017 assert_eq!(act_uniset, exp_uniset);
1018 }
1019
1020 #[test]
1021 fn test_serde_deserialize_legacy() {
1022 let inv_list_str = "[65,70,75,85]";
1023 let exp_inv_list = [0x41, 0x46, 0x4B, 0x55];
1024 let exp_uniset =
1025 CodePointInversionList::try_from_inversion_list_slice(&exp_inv_list).unwrap();
1026 let act_uniset: CodePointInversionList = serde_json::from_str(inv_list_str).unwrap();
1027 assert_eq!(act_uniset, exp_uniset);
1028 }
1029
1030 #[test]
1031 fn test_serde_deserialize_invalid() {
1032 assert!(serde_json::from_str::<CodePointInversionList>("[65,70,98775,85]").is_err());
1033 assert!(serde_json::from_str::<CodePointInversionList>("[65,70,U+FFFFFFFFFF,85]").is_err());
1034 }
1035
1036 #[test]
1037 fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> {
1038 let set = CodePointInversionList::bmp();
1039 let set_serialized: Vec<u8> = postcard::to_allocvec(&set).unwrap();
1040 let set_deserialized: CodePointInversionList =
1041 postcard::from_bytes::<CodePointInversionList>(&set_serialized)?;
1042
1043 assert_eq!(&set, &set_deserialized);
1044 assert!(!set_deserialized.inv_list.is_owned());
1045
1046 Ok(())
1047 }
1048
1049 #[test]
1050 fn databake() {
1051 databake::test_bake!(
1052 CodePointInversionList<'static>,
1053 const: unsafe {
1054 #[allow(unused_unsafe)]
1055 crate::codepointinvlist::CodePointInversionList::from_parts_unchecked(
1056 unsafe {
1057 zerovec::ZeroVec::from_bytes_unchecked(
1058 b"0\0\0\0:\0\0\0A\0\0\0G\0\0\0a\0\0\0g\0\0\0"
1059 )
1060 },
1061 22u32,
1062 )
1063 },
1064 icu_collections,
1065 [zerovec],
1066 );
1067 }
1068}
1069