1 | // This file is part of ICU4X. For terms of use, please see the file |
2 | // called LICENSE at the top level of the ICU4X source tree |
3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
4 | |
5 | //! Bundles the part of UTS 46 that makes sense to implement as a |
6 | //! normalization. |
7 | //! |
8 | //! This is meant to be used as a building block of a UTS 46 |
9 | //! implementation, such as the `idna` crate. |
10 | |
11 | use crate::CanonicalCompositionsV1Marker; |
12 | use crate::CanonicalDecompositionDataV1Marker; |
13 | use crate::CanonicalDecompositionTablesV1Marker; |
14 | use crate::CompatibilityDecompositionTablesV1Marker; |
15 | use crate::ComposingNormalizer; |
16 | use crate::NormalizerError; |
17 | use crate::Uts46DecompositionSupplementV1Marker; |
18 | use icu_provider::DataProvider; |
19 | |
20 | // Implementation note: Despite merely wrapping a `ComposingNormalizer`, |
21 | // having a `Uts46Mapper` serves two purposes: |
22 | // |
23 | // 1. Denying public access to parts of the `ComposingNormalizer` API |
24 | // that don't work when the data contains markers for ignorables. |
25 | // 2. Providing a place where additional iterator pre-processing or |
26 | // post-processing can take place if needed in the future. (When |
27 | // writing this, it looked like such processing was needed but |
28 | // now isn't needed after all.) |
29 | |
30 | /// A mapper that knows how to performs the subsets of UTS 46 processing |
31 | /// documented on the methods. |
32 | #[derive (Debug)] |
33 | pub struct Uts46Mapper { |
34 | normalizer: ComposingNormalizer, |
35 | } |
36 | |
// `Default` is only offered together with the `compiled_data` feature,
// because it relies on the compiled-data constructor `Uts46Mapper::new`.
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    fn default() -> Self {
        Uts46Mapper::new()
    }
}
43 | |
44 | impl Uts46Mapper { |
45 | /// Construct with compiled data. |
46 | #[cfg (feature = "compiled_data" )] |
47 | pub const fn new() -> Self { |
48 | Uts46Mapper { |
49 | normalizer: ComposingNormalizer::new_uts46(), |
50 | } |
51 | } |
52 | |
53 | /// Construct with provider. |
54 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] |
55 | pub fn try_new<D>(provider: &D) -> Result<Self, NormalizerError> |
56 | where |
57 | D: DataProvider<CanonicalDecompositionDataV1Marker> |
58 | + DataProvider<Uts46DecompositionSupplementV1Marker> |
59 | + DataProvider<CanonicalDecompositionTablesV1Marker> |
60 | + DataProvider<CompatibilityDecompositionTablesV1Marker> |
61 | // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker |
62 | + DataProvider<CanonicalCompositionsV1Marker> |
63 | + ?Sized, |
64 | { |
65 | let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?; |
66 | |
67 | Ok(Uts46Mapper { normalizer }) |
68 | } |
69 | |
70 | /// Returns an iterator adaptor that turns an `Iterator` over `char` |
71 | /// into an iterator yielding a `char` sequence that gets the following |
72 | /// operations from the "Map" and "Normalize" steps of the "Processing" |
73 | /// section of UTS 46 lazily applied to it: |
74 | /// |
75 | /// 1. The _ignored_ characters are ignored. |
76 | /// 2. The _mapped_ characters are mapped. |
77 | /// 3. The _disallowed_ characters are replaced with U+FFFD, |
78 | /// which itself is a disallowed character. |
79 | /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ |
80 | /// as appropriate. |
81 | /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. |
82 | /// 6. The _disallowed_STD3_mapped_ characters are treated as |
83 | /// _mapped_. |
84 | /// 7. The result is normalized to NFC. |
85 | /// |
86 | /// Notably: |
87 | /// |
88 | /// * The STD3 or WHATWG ASCII deny list should be implemented as a |
89 | /// post-processing step. |
90 | /// * Transitional processing is not performed. Transitional mapping |
91 | /// would be a pre-processing step, but transitional processing is |
92 | /// deprecated, and none of Firefox, Safari, or Chrome use it. |
93 | pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>( |
94 | &'delegate self, |
95 | iter: I, |
96 | ) -> impl Iterator<Item = char> + 'delegate { |
97 | self.normalizer |
98 | .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored) |
99 | } |
100 | |
101 | /// Returns an iterator adaptor that turns an `Iterator` over `char` |
102 | /// into an iterator yielding a `char` sequence that gets the following |
103 | /// operations from the NFC check and statucs steps of the "Validity |
104 | /// Criteria" section of UTS 46 lazily applied to it: |
105 | /// |
106 | /// 1. The _ignored_ characters are treated as _disallowed_. |
107 | /// 2. The _mapped_ characters are mapped. |
108 | /// 3. The _disallowed_ characters are replaced with U+FFFD, |
109 | /// which itself is a disallowed character. |
110 | /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ |
111 | /// as appropriate. |
112 | /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. |
113 | /// 6. The _disallowed_STD3_mapped_ characters are treated as |
114 | /// _mapped_. |
115 | /// 7. The result is normalized to NFC. |
116 | /// |
117 | /// Notably: |
118 | /// |
119 | /// * The STD3 or WHATWG ASCII deny list should be implemented as a |
120 | /// post-processing step. |
121 | /// * Transitional processing is not performed. Transitional mapping |
122 | /// would be a pre-processing step, but transitional processing is |
123 | /// deprecated, and none of Firefox, Safari, or Chrome use it. |
124 | /// * The output needs to be compared with input to see if anything |
125 | /// changed. This check catches failures to adhere to the normalization |
126 | /// and status requirements. In particular, this comparison results |
127 | /// in _mapped_ characters resulting in error like "Validity Criteria" |
128 | /// requires. |
129 | pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>( |
130 | &'delegate self, |
131 | iter: I, |
132 | ) -> impl Iterator<Item = char> + 'delegate { |
133 | self.normalizer |
134 | .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter) |
135 | } |
136 | } |
137 | |