| 1 | // This file is part of ICU4X. For terms of use, please see the file |
| 2 | // called LICENSE at the top level of the ICU4X source tree |
| 3 | // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). |
| 4 | |
//! Bundles the part of UTS 46 that makes sense to implement as a
//! normalization.
//!
//! This is meant to be used as a building block of a UTS 46
//! implementation, such as the `idna` crate.
| 10 | |
| 11 | use crate::CanonicalCompositionsV1Marker; |
| 12 | use crate::CanonicalDecompositionDataV1Marker; |
| 13 | use crate::CanonicalDecompositionTablesV1Marker; |
| 14 | use crate::CompatibilityDecompositionTablesV1Marker; |
| 15 | use crate::ComposingNormalizer; |
| 16 | use crate::NormalizerError; |
| 17 | use crate::Uts46DecompositionSupplementV1Marker; |
| 18 | use icu_provider::DataProvider; |
| 19 | |
| 20 | // Implementation note: Despite merely wrapping a `ComposingNormalizer`, |
| 21 | // having a `Uts46Mapper` serves two purposes: |
| 22 | // |
| 23 | // 1. Denying public access to parts of the `ComposingNormalizer` API |
| 24 | // that don't work when the data contains markers for ignorables. |
| 25 | // 2. Providing a place where additional iterator pre-processing or |
| 26 | // post-processing can take place if needed in the future. (When |
| 27 | // writing this, it looked like such processing was needed but |
| 28 | // now isn't needed after all.) |
| 29 | |
| 30 | /// A mapper that knows how to performs the subsets of UTS 46 processing |
| 31 | /// documented on the methods. |
| 32 | #[derive (Debug)] |
| 33 | pub struct Uts46Mapper { |
| 34 | normalizer: ComposingNormalizer, |
| 35 | } |
| 36 | |
#[cfg(feature = "compiled_data")]
impl Default for Uts46Mapper {
    /// Equivalent to calling [`Uts46Mapper::new`].
    fn default() -> Self {
        Uts46Mapper::new()
    }
}
| 43 | |
| 44 | impl Uts46Mapper { |
| 45 | /// Construct with compiled data. |
| 46 | #[cfg (feature = "compiled_data" )] |
| 47 | pub const fn new() -> Self { |
| 48 | Uts46Mapper { |
| 49 | normalizer: ComposingNormalizer::new_uts46(), |
| 50 | } |
| 51 | } |
| 52 | |
| 53 | /// Construct with provider. |
| 54 | #[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::new)] |
| 55 | pub fn try_new<D>(provider: &D) -> Result<Self, NormalizerError> |
| 56 | where |
| 57 | D: DataProvider<CanonicalDecompositionDataV1Marker> |
| 58 | + DataProvider<Uts46DecompositionSupplementV1Marker> |
| 59 | + DataProvider<CanonicalDecompositionTablesV1Marker> |
| 60 | + DataProvider<CompatibilityDecompositionTablesV1Marker> |
| 61 | // UTS 46 tables merged into CompatibilityDecompositionTablesV1Marker |
| 62 | + DataProvider<CanonicalCompositionsV1Marker> |
| 63 | + ?Sized, |
| 64 | { |
| 65 | let normalizer = ComposingNormalizer::try_new_uts46_unstable(provider)?; |
| 66 | |
| 67 | Ok(Uts46Mapper { normalizer }) |
| 68 | } |
| 69 | |
| 70 | /// Returns an iterator adaptor that turns an `Iterator` over `char` |
| 71 | /// into an iterator yielding a `char` sequence that gets the following |
| 72 | /// operations from the "Map" and "Normalize" steps of the "Processing" |
| 73 | /// section of UTS 46 lazily applied to it: |
| 74 | /// |
| 75 | /// 1. The _ignored_ characters are ignored. |
| 76 | /// 2. The _mapped_ characters are mapped. |
| 77 | /// 3. The _disallowed_ characters are replaced with U+FFFD, |
| 78 | /// which itself is a disallowed character. |
| 79 | /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ |
| 80 | /// as appropriate. |
| 81 | /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. |
| 82 | /// 6. The _disallowed_STD3_mapped_ characters are treated as |
| 83 | /// _mapped_. |
| 84 | /// 7. The result is normalized to NFC. |
| 85 | /// |
| 86 | /// Notably: |
| 87 | /// |
| 88 | /// * The STD3 or WHATWG ASCII deny list should be implemented as a |
| 89 | /// post-processing step. |
| 90 | /// * Transitional processing is not performed. Transitional mapping |
| 91 | /// would be a pre-processing step, but transitional processing is |
| 92 | /// deprecated, and none of Firefox, Safari, or Chrome use it. |
| 93 | pub fn map_normalize<'delegate, I: Iterator<Item = char> + 'delegate>( |
| 94 | &'delegate self, |
| 95 | iter: I, |
| 96 | ) -> impl Iterator<Item = char> + 'delegate { |
| 97 | self.normalizer |
| 98 | .normalize_iter_private(iter, crate::IgnorableBehavior::Ignored) |
| 99 | } |
| 100 | |
| 101 | /// Returns an iterator adaptor that turns an `Iterator` over `char` |
| 102 | /// into an iterator yielding a `char` sequence that gets the following |
| 103 | /// operations from the NFC check and statucs steps of the "Validity |
| 104 | /// Criteria" section of UTS 46 lazily applied to it: |
| 105 | /// |
| 106 | /// 1. The _ignored_ characters are treated as _disallowed_. |
| 107 | /// 2. The _mapped_ characters are mapped. |
| 108 | /// 3. The _disallowed_ characters are replaced with U+FFFD, |
| 109 | /// which itself is a disallowed character. |
| 110 | /// 4. The _deviation_ characters are treated as _mapped_ or _valid_ |
| 111 | /// as appropriate. |
| 112 | /// 5. The _disallowed_STD3_valid_ characters are treated as allowed. |
| 113 | /// 6. The _disallowed_STD3_mapped_ characters are treated as |
| 114 | /// _mapped_. |
| 115 | /// 7. The result is normalized to NFC. |
| 116 | /// |
| 117 | /// Notably: |
| 118 | /// |
| 119 | /// * The STD3 or WHATWG ASCII deny list should be implemented as a |
| 120 | /// post-processing step. |
| 121 | /// * Transitional processing is not performed. Transitional mapping |
| 122 | /// would be a pre-processing step, but transitional processing is |
| 123 | /// deprecated, and none of Firefox, Safari, or Chrome use it. |
| 124 | /// * The output needs to be compared with input to see if anything |
| 125 | /// changed. This check catches failures to adhere to the normalization |
| 126 | /// and status requirements. In particular, this comparison results |
| 127 | /// in _mapped_ characters resulting in error like "Validity Criteria" |
| 128 | /// requires. |
| 129 | pub fn normalize_validate<'delegate, I: Iterator<Item = char> + 'delegate>( |
| 130 | &'delegate self, |
| 131 | iter: I, |
| 132 | ) -> impl Iterator<Item = char> + 'delegate { |
| 133 | self.normalizer |
| 134 | .normalize_iter_private(iter, crate::IgnorableBehavior::ReplacementCharacter) |
| 135 | } |
| 136 | } |
| 137 | |