| 1 | // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT | 
| 2 | // file at the top-level directory of this distribution and at | 
|---|
| 3 | // http://rust-lang.org/COPYRIGHT. | 
|---|
| 4 | // | 
|---|
| 5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | 
|---|
| 6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | 
|---|
| 7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | 
|---|
| 8 | // option. This file may not be copied, modified, or distributed | 
|---|
| 9 | // except according to those terms. | 
|---|
| 10 |  | 
|---|
| 11 | //! Unicode character composition and decomposition utilities | 
|---|
| 12 | //! as described in | 
|---|
| 13 | //! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/). | 
|---|
| 14 | //! | 
|---|
| 15 | //! ```rust | 
|---|
| 16 | //! extern crate unicode_normalization; | 
|---|
| 17 | //! | 
|---|
| 18 | //! use unicode_normalization::char::compose; | 
|---|
| 19 | //! use unicode_normalization::UnicodeNormalization; | 
|---|
| 20 | //! | 
|---|
| 21 | //! fn main() { | 
|---|
| 22 | //!     assert_eq!(compose('A', '\u{30a}'), Some('Å')); | 
|---|
| 23 | //! | 
|---|
| 24 | //!     let s = "ÅΩ"; | 
|---|
| 25 | //!     let c = s.nfc().collect::<String>(); | 
|---|
| 26 | //!     assert_eq!(c, "ÅΩ"); | 
|---|
| 27 | //! } | 
|---|
| 28 | //! ``` | 
|---|
| 29 | //! | 
|---|
| 30 | //! # crates.io | 
|---|
| 31 | //! | 
|---|
| 32 | //! You can use this package in your project by adding the following | 
|---|
| 33 | //! to your `Cargo.toml`: | 
|---|
| 34 | //! | 
|---|
| 35 | //! ```toml | 
|---|
| 36 | //! [dependencies] | 
|---|
| 37 | //! unicode-normalization = "0.1.20" | 
|---|
| 38 | //! ``` | 
|---|
| 39 |  | 
|---|
| 40 | #![ deny(missing_docs, unsafe_code)] | 
|---|
| 41 | #![ doc( | 
|---|
| 42 | html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", | 
|---|
| 43 | html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" | 
|---|
| 44 | )] | 
|---|
| 45 | #![ cfg_attr(not(feature = "std"), no_std)] | 
|---|
| 46 |  | 
|---|
| 47 | #[ cfg(not(feature = "std"))] | 
|---|
| 48 | extern crate alloc; | 
|---|
| 49 |  | 
|---|
| 50 | #[ cfg(feature = "std")] | 
|---|
| 51 | extern crate core; | 
|---|
| 52 |  | 
|---|
| 53 | extern crate tinyvec; | 
|---|
| 54 |  | 
|---|
| 55 | pub use crate::decompose::Decompositions; | 
|---|
| 56 | pub use crate::quick_check::{ | 
|---|
| 57 | is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick, | 
|---|
| 58 | is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick, | 
|---|
| 59 | IsNormalized, | 
|---|
| 60 | }; | 
|---|
| 61 | pub use crate::recompose::Recompositions; | 
|---|
| 62 | pub use crate::replace::Replacements; | 
|---|
| 63 | pub use crate::stream_safe::StreamSafe; | 
|---|
| 64 | pub use crate::tables::UNICODE_VERSION; | 
|---|
| 65 | use core::{option, str::Chars}; | 
|---|
| 66 |  | 
|---|
| 67 | mod decompose; | 
|---|
| 68 | mod lookups; | 
|---|
| 69 | mod normalize; | 
|---|
| 70 | mod perfect_hash; | 
|---|
| 71 | mod quick_check; | 
|---|
| 72 | mod recompose; | 
|---|
| 73 | mod replace; | 
|---|
| 74 | mod stream_safe; | 
|---|
| 75 | mod tables; | 
|---|
| 76 |  | 
|---|
| 77 | #[ doc(hidden)] | 
|---|
| 78 | pub mod __test_api; | 
|---|
| 79 | #[ cfg(test)] | 
|---|
| 80 | mod test; | 
|---|
| 81 |  | 
|---|
| 82 | /// Methods for composing and decomposing characters. | 
|---|
| 83 | pub mod char { | 
|---|
| 84 | pub use crate::normalize::{ | 
|---|
| 85 | compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible, | 
|---|
| 86 | }; | 
|---|
| 87 |  | 
|---|
| 88 | pub use crate::lookups::{canonical_combining_class, is_combining_mark}; | 
|---|
| 89 |  | 
|---|
| 90 | /// Return whether the given character is assigned (`General_Category` != `Unassigned`) | 
|---|
| 91 | /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version | 
|---|
| 92 | /// of Unicode. | 
|---|
| 93 | pub use crate::tables::is_public_assigned; | 
|---|
| 94 | } | 
|---|
| 95 |  | 
|---|
| 96 | /// Methods for iterating over strings while applying Unicode normalizations | 
|---|
| 97 | /// as described in | 
|---|
| 98 | /// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/). | 
|---|
| 99 | pub trait UnicodeNormalization<I: Iterator<Item = char>> { | 
|---|
| 100 | /// Returns an iterator over the string in Unicode Normalization Form D | 
|---|
| 101 | /// (canonical decomposition). | 
|---|
| 102 | fn nfd(self) -> Decompositions<I>; | 
|---|
| 103 |  | 
|---|
| 104 | /// Returns an iterator over the string in Unicode Normalization Form KD | 
|---|
| 105 | /// (compatibility decomposition). | 
|---|
| 106 | fn nfkd(self) -> Decompositions<I>; | 
|---|
| 107 |  | 
|---|
| 108 | /// An Iterator over the string in Unicode Normalization Form C | 
|---|
| 109 | /// (canonical decomposition followed by canonical composition). | 
|---|
| 110 | fn nfc(self) -> Recompositions<I>; | 
|---|
| 111 |  | 
|---|
| 112 | /// An Iterator over the string in Unicode Normalization Form KC | 
|---|
| 113 | /// (compatibility decomposition followed by canonical composition). | 
|---|
| 114 | fn nfkc(self) -> Recompositions<I>; | 
|---|
| 115 |  | 
|---|
| 116 | /// A transformation which replaces CJK Compatibility Ideograph codepoints | 
|---|
| 117 | /// with normal forms using Standardized Variation Sequences. This is not | 
|---|
| 118 | /// part of the canonical or compatibility decomposition algorithms, but | 
|---|
| 119 | /// performing it before those algorithms produces normalized output which | 
|---|
| 120 | /// better preserves the intent of the original text. | 
|---|
| 121 | /// | 
|---|
| 122 | /// Note that many systems today ignore variation selectors, so these | 
|---|
| 123 | /// may not immediately help text display as intended, but they at | 
|---|
| 124 | /// least preserve the information in a standardized form, giving | 
|---|
| 125 | /// implementations the option to recognize them. | 
|---|
| 126 | fn cjk_compat_variants(self) -> Replacements<I>; | 
|---|
| 127 |  | 
|---|
| 128 | /// An Iterator over the string with Conjoining Grapheme Joiner characters | 
|---|
| 129 | /// inserted according to the Stream-Safe Text Process (UAX15-D4) | 
|---|
| 130 | fn stream_safe(self) -> StreamSafe<I>; | 
|---|
| 131 | } | 
|---|
| 132 |  | 
|---|
| 133 | impl<'a> UnicodeNormalization<Chars<'a>> for &'a str { | 
|---|
| 134 | #[ inline] | 
|---|
| 135 | fn nfd(self) -> Decompositions<Chars<'a>> { | 
|---|
| 136 | Decompositions::new_canonical(self.chars()) | 
|---|
| 137 | } | 
|---|
| 138 |  | 
|---|
| 139 | #[ inline] | 
|---|
| 140 | fn nfkd(self) -> Decompositions<Chars<'a>> { | 
|---|
| 141 | Decompositions::new_compatible(self.chars()) | 
|---|
| 142 | } | 
|---|
| 143 |  | 
|---|
| 144 | #[ inline] | 
|---|
| 145 | fn nfc(self) -> Recompositions<Chars<'a>> { | 
|---|
| 146 | Recompositions::new_canonical(self.chars()) | 
|---|
| 147 | } | 
|---|
| 148 |  | 
|---|
| 149 | #[ inline] | 
|---|
| 150 | fn nfkc(self) -> Recompositions<Chars<'a>> { | 
|---|
| 151 | Recompositions::new_compatible(self.chars()) | 
|---|
| 152 | } | 
|---|
| 153 |  | 
|---|
| 154 | #[ inline] | 
|---|
| 155 | fn cjk_compat_variants(self) -> Replacements<Chars<'a>> { | 
|---|
| 156 | replace::new_cjk_compat_variants(self.chars()) | 
|---|
| 157 | } | 
|---|
| 158 |  | 
|---|
| 159 | #[ inline] | 
|---|
| 160 | fn stream_safe(self) -> StreamSafe<Chars<'a>> { | 
|---|
| 161 | StreamSafe::new(self.chars()) | 
|---|
| 162 | } | 
|---|
| 163 | } | 
|---|
| 164 |  | 
|---|
| 165 | impl UnicodeNormalization<option::IntoIter<char>> for char { | 
|---|
| 166 | #[ inline] | 
|---|
| 167 | fn nfd(self) -> Decompositions<option::IntoIter<char>> { | 
|---|
| 168 | Decompositions::new_canonical(Some(self).into_iter()) | 
|---|
| 169 | } | 
|---|
| 170 |  | 
|---|
| 171 | #[ inline] | 
|---|
| 172 | fn nfkd(self) -> Decompositions<option::IntoIter<char>> { | 
|---|
| 173 | Decompositions::new_compatible(Some(self).into_iter()) | 
|---|
| 174 | } | 
|---|
| 175 |  | 
|---|
| 176 | #[ inline] | 
|---|
| 177 | fn nfc(self) -> Recompositions<option::IntoIter<char>> { | 
|---|
| 178 | Recompositions::new_canonical(Some(self).into_iter()) | 
|---|
| 179 | } | 
|---|
| 180 |  | 
|---|
| 181 | #[ inline] | 
|---|
| 182 | fn nfkc(self) -> Recompositions<option::IntoIter<char>> { | 
|---|
| 183 | Recompositions::new_compatible(Some(self).into_iter()) | 
|---|
| 184 | } | 
|---|
| 185 |  | 
|---|
| 186 | #[ inline] | 
|---|
| 187 | fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> { | 
|---|
| 188 | replace::new_cjk_compat_variants(Some(self).into_iter()) | 
|---|
| 189 | } | 
|---|
| 190 |  | 
|---|
| 191 | #[ inline] | 
|---|
| 192 | fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> { | 
|---|
| 193 | StreamSafe::new(Some(self).into_iter()) | 
|---|
| 194 | } | 
|---|
| 195 | } | 
|---|
| 196 |  | 
|---|
| 197 | impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I { | 
|---|
| 198 | #[ inline] | 
|---|
| 199 | fn nfd(self) -> Decompositions<I> { | 
|---|
| 200 | Decompositions::new_canonical(self) | 
|---|
| 201 | } | 
|---|
| 202 |  | 
|---|
| 203 | #[ inline] | 
|---|
| 204 | fn nfkd(self) -> Decompositions<I> { | 
|---|
| 205 | Decompositions::new_compatible(self) | 
|---|
| 206 | } | 
|---|
| 207 |  | 
|---|
| 208 | #[ inline] | 
|---|
| 209 | fn nfc(self) -> Recompositions<I> { | 
|---|
| 210 | Recompositions::new_canonical(self) | 
|---|
| 211 | } | 
|---|
| 212 |  | 
|---|
| 213 | #[ inline] | 
|---|
| 214 | fn nfkc(self) -> Recompositions<I> { | 
|---|
| 215 | Recompositions::new_compatible(self) | 
|---|
| 216 | } | 
|---|
| 217 |  | 
|---|
| 218 | #[ inline] | 
|---|
| 219 | fn cjk_compat_variants(self) -> Replacements<I> { | 
|---|
| 220 | replace::new_cjk_compat_variants(self) | 
|---|
| 221 | } | 
|---|
| 222 |  | 
|---|
| 223 | #[ inline] | 
|---|
| 224 | fn stream_safe(self) -> StreamSafe<I> { | 
|---|
| 225 | StreamSafe::new(self) | 
|---|
| 226 | } | 
|---|
| 227 | } | 
|---|
| 228 |  | 
|---|