1// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Unicode character composition and decomposition utilities
12//! as described in
13//! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
14//!
15//! ```rust
16//! extern crate unicode_normalization;
17//!
18//! use unicode_normalization::char::compose;
19//! use unicode_normalization::UnicodeNormalization;
20//!
21//! fn main() {
22//! assert_eq!(compose('A','\u{30a}'), Some('Å'));
23//!
24//! let s = "ÅΩ";
25//! let c = s.nfc().collect::<String>();
26//! assert_eq!(c, "ÅΩ");
27//! }
28//! ```
29//!
30//! # crates.io
31//!
32//! You can use this package in your project by adding the following
33//! to your `Cargo.toml`:
34//!
35//! ```toml
36//! [dependencies]
37//! unicode-normalization = "0.1.20"
38//! ```
39
40#![deny(missing_docs, unsafe_code)]
41#![doc(
42 html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
43 html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
44)]
45#![cfg_attr(not(feature = "std"), no_std)]
46
47#[cfg(not(feature = "std"))]
48extern crate alloc;
49
50#[cfg(feature = "std")]
51extern crate core;
52
53extern crate tinyvec;
54
55pub use crate::decompose::Decompositions;
56pub use crate::quick_check::{
57 is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
58 is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
59 IsNormalized,
60};
61pub use crate::recompose::Recompositions;
62pub use crate::replace::Replacements;
63pub use crate::stream_safe::StreamSafe;
64pub use crate::tables::UNICODE_VERSION;
65use core::{
66 str::Chars,
67 option,
68};
69
70mod no_std_prelude;
71
72mod decompose;
73mod lookups;
74mod normalize;
75mod perfect_hash;
76mod quick_check;
77mod recompose;
78mod replace;
79mod stream_safe;
80
81#[rustfmt::skip]
82mod tables;
83
84#[doc(hidden)]
85pub mod __test_api;
86#[cfg(test)]
87mod test;
88
89/// Methods for composing and decomposing characters.
90pub mod char {
91 pub use crate::normalize::{
92 compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
93 };
94
95 pub use crate::lookups::{canonical_combining_class, is_combining_mark};
96
97 /// Return whether the given character is assigned (`General_Category` != `Unassigned`)
98 /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
99 /// of Unicode.
100 pub use crate::tables::is_public_assigned;
101}
102
103/// Methods for iterating over strings while applying Unicode normalizations
104/// as described in
105/// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
106pub trait UnicodeNormalization<I: Iterator<Item = char>> {
107 /// Returns an iterator over the string in Unicode Normalization Form D
108 /// (canonical decomposition).
109 fn nfd(self) -> Decompositions<I>;
110
111 /// Returns an iterator over the string in Unicode Normalization Form KD
112 /// (compatibility decomposition).
113 fn nfkd(self) -> Decompositions<I>;
114
115 /// An Iterator over the string in Unicode Normalization Form C
116 /// (canonical decomposition followed by canonical composition).
117 fn nfc(self) -> Recompositions<I>;
118
119 /// An Iterator over the string in Unicode Normalization Form KC
120 /// (compatibility decomposition followed by canonical composition).
121 fn nfkc(self) -> Recompositions<I>;
122
123 /// A transformation which replaces CJK Compatibility Ideograph codepoints
124 /// with normal forms using Standardized Variation Sequences. This is not
125 /// part of the canonical or compatibility decomposition algorithms, but
126 /// performing it before those algorithms produces normalized output which
127 /// better preserves the intent of the original text.
128 ///
129 /// Note that many systems today ignore variation selectors, so these
130 /// may not immediately help text display as intended, but they at
131 /// least preserve the information in a standardized form, giving
132 /// implementations the option to recognize them.
133 fn cjk_compat_variants(self) -> Replacements<I>;
134
135 /// An Iterator over the string with Conjoining Grapheme Joiner characters
136 /// inserted according to the Stream-Safe Text Process (UAX15-D4)
137 fn stream_safe(self) -> StreamSafe<I>;
138}
139
140impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
141 #[inline]
142 fn nfd(self) -> Decompositions<Chars<'a>> {
143 decompose::new_canonical(self.chars())
144 }
145
146 #[inline]
147 fn nfkd(self) -> Decompositions<Chars<'a>> {
148 decompose::new_compatible(self.chars())
149 }
150
151 #[inline]
152 fn nfc(self) -> Recompositions<Chars<'a>> {
153 recompose::new_canonical(self.chars())
154 }
155
156 #[inline]
157 fn nfkc(self) -> Recompositions<Chars<'a>> {
158 recompose::new_compatible(self.chars())
159 }
160
161 #[inline]
162 fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
163 replace::new_cjk_compat_variants(self.chars())
164 }
165
166 #[inline]
167 fn stream_safe(self) -> StreamSafe<Chars<'a>> {
168 StreamSafe::new(self.chars())
169 }
170}
171
172
173impl UnicodeNormalization<option::IntoIter<char>> for char {
174 #[inline]
175 fn nfd(self) -> Decompositions<option::IntoIter<char>> {
176 decompose::new_canonical(Some(self).into_iter())
177 }
178
179 #[inline]
180 fn nfkd(self) -> Decompositions<option::IntoIter<char>> {
181 decompose::new_compatible(Some(self).into_iter())
182 }
183
184 #[inline]
185 fn nfc(self) -> Recompositions<option::IntoIter<char>> {
186 recompose::new_canonical(Some(self).into_iter())
187 }
188
189 #[inline]
190 fn nfkc(self) -> Recompositions<option::IntoIter<char>> {
191 recompose::new_compatible(Some(self).into_iter())
192 }
193
194 #[inline]
195 fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
196 replace::new_cjk_compat_variants(Some(self).into_iter())
197 }
198
199 #[inline]
200 fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
201 StreamSafe::new(Some(self).into_iter())
202 }
203}
204
205impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
206 #[inline]
207 fn nfd(self) -> Decompositions<I> {
208 decompose::new_canonical(self)
209 }
210
211 #[inline]
212 fn nfkd(self) -> Decompositions<I> {
213 decompose::new_compatible(self)
214 }
215
216 #[inline]
217 fn nfc(self) -> Recompositions<I> {
218 recompose::new_canonical(self)
219 }
220
221 #[inline]
222 fn nfkc(self) -> Recompositions<I> {
223 recompose::new_compatible(self)
224 }
225
226 #[inline]
227 fn cjk_compat_variants(self) -> Replacements<I> {
228 replace::new_cjk_compat_variants(self)
229 }
230
231 #[inline]
232 fn stream_safe(self) -> StreamSafe<I> {
233 StreamSafe::new(self)
234 }
235}
236