normalize.rs source code [crates/unicode_normalization/src/normalize.rs]

1	// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2	// file at the top-level directory of this distribution and at
3	// http://rust-lang.org/COPYRIGHT.
4	//
5	// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6	// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7	// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8	// option. This file may not be copied, modified, or distributed
9	// except according to those terms.
10
11	//! Functions for computing canonical and compatible decompositions for Unicode characters.
12	use crate::lookups::{
13	canonical_fully_decomposed, cjk_compat_variants_fully_decomposed,
14	compatibility_fully_decomposed, composition_table,
15	};
16
17	use core::char;
18
19	/// Compute canonical Unicode decomposition for character.
20	/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
21	/// for more information.
22	#[inline]
23	pub fn decompose_canonical<F>(c: char, emit_char: F)
24	where
25	F: FnMut(char),
26	{
27	decompose(c, decompose_char:canonical_fully_decomposed, emit_char)
28	}
29
30	/// Compute canonical or compatible Unicode decomposition for character.
31	/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
32	/// for more information.
33	#[inline]
34	pub fn decompose_compatible<F: FnMut(char)>(c: char, emit_char: F) {
35	let decompose_char: impl Fn(char) -> {unknown} =
36	\|c: char\| compatibility_fully_decomposed(c).or_else(\|\| canonical_fully_decomposed(c));
37	decompose(c, decompose_char, emit_char)
38	}
39
40	/// Compute standard-variation decomposition for character.
41	///
42	/// [Standardized Variation Sequences] are used instead of the standard canonical
43	/// decompositions, notably for CJK codepoints with singleton canonical decompositions,
44	/// to avoid losing information. See the
45	/// [Unicode Variation Sequence FAQ](http://unicode.org/faq/vs.html) and the
46	/// "Other Enhancements" section of the
47	/// [Unicode 6.3 Release Summary](https://www.unicode.org/versions/Unicode6.3.0/#Summary)
48	/// for more information.
49	#[inline]
50	pub fn decompose_cjk_compat_variants<F>(c: char, mut emit_char: F)
51	where
52	F: FnMut(char),
53	{
54	// 7-bit ASCII never decomposes
55	if c <= '`\x7f`' {
56	emit_char(c);
57	return;
58	}
59
60	// Don't perform decomposition for Hangul
61
62	if let Some(decomposed) = cjk_compat_variants_fully_decomposed(c) {
63	for &d in decomposed {
64	emit_char(d);
65	}
66	return;
67	}
68
69	// Finally bottom out.
70	emit_char(c);
71	}
72
73	#[inline]
74	#[allow(unsafe_code)]
75	fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
76	where
77	D: Fn(char) -> Option<&'static [char]>,
78	F: FnMut(char),
79	{
80	// 7-bit ASCII never decomposes
81	if c <= '`\x7f`' {
82	emit_char(c);
83	return;
84	}
85
86	// Perform decomposition for Hangul
87	if is_hangul_syllable(c) {
88	// Safety: Hangul Syllables invariant checked by is_hangul_syllable above
89	unsafe {
90	decompose_hangul(c, emit_char);
91	}
92	return;
93	}
94
95	if let Some(decomposed) = decompose_char(c) {
96	for &d in decomposed {
97	emit_char(d);
98	}
99	return;
100	}
101
102	// Finally bottom out.
103	emit_char(c);
104	}
105
106	/// Compose two characters into a single character, if possible.
107	/// See [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/)
108	/// for more information.
109	pub fn compose(a: char, b: char) -> Option<char> {
110	compose_hangul(a, b).or_else(\|\| composition_table(c1:a, c2:b))
111	}
112
// Constants from Unicode 9.0.0 Section 3.12 Conjoining Jamo Behavior
// http://www.unicode.org/versions/Unicode9.0.0/ch03.pdf#M9.32468.Heading.310.Combining.Jamo.Behavior
//
// `S` = precomposed syllable, `L` = leading consonant, `V` = vowel,
// `T` = trailing consonant.
const S_BASE: u32 = 0xAC00;
const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
// One below the first trailing consonant, so that a T index of 0 means "no trailing consonant".
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
// Number of syllables sharing one leading consonant (588).
const N_COUNT: u32 = V_COUNT * T_COUNT;
// Total number of precomposed syllables (11172).
const S_COUNT: u32 = L_COUNT * N_COUNT;

// Inclusive upper bounds of each range.
const S_LAST: u32 = S_BASE + S_COUNT - 1;
const L_LAST: u32 = L_BASE + L_COUNT - 1;
const V_LAST: u32 = V_BASE + V_COUNT - 1;
const T_LAST: u32 = T_BASE + T_COUNT - 1;

// Composition only occurs for `TPart`s in `U+11A8 ..= U+11C2`,
// i.e. `T_BASE + 1 ..= T_LAST`.
const T_FIRST: u32 = T_BASE + 1;
133
134	// Safety-usable invariant: This ensures that c is a valid Hangul Syllable character (U+AC00..U+D7AF)
135	pub(crate) fn is_hangul_syllable(c: char) -> bool {
136	// Safety: This checks the range 0xAC00 (S_BASE) to 0xD7A4 (S_BASE + S_COUNT), upholding the safety-usable invariant
137	(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
138	}
139
140	// Decompose a precomposed Hangul syllable
141	// Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF
142	#[allow(unsafe_code, unused_unsafe)]
143	#[inline(always)]
144	unsafe fn decompose_hangul<F>(s: char, mut emit_char: F)
145	where
146	F: FnMut(char),
147	{
148	// This will be at most 0x2baf, the size of the Hangul Syllables block
149	let s_index: u32 = s as u32 - S_BASE;
150	// This will be at most 0x2baf / (21 28), 19*
151	let l_index: u32 = s_index / N_COUNT;
152	unsafe {
153	// Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
154	emit_char(char::from_u32_unchecked(L_BASE + l_index));
155
156	// Safety: This will be at most (N_COUNT - 1) / T_COUNT = (VT - 1) / T, which gives us an upper bound of V_COUNT = 21*
157	let v_index: u32 = (s_index % N_COUNT) / T_COUNT;
158	// Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
159	emit_char(char::from_u32_unchecked(V_BASE + v_index));
160
161	// Safety: This will be at most T_COUNT - 1 (27)
162	let t_index: u32 = s_index % T_COUNT;
163	if t_index > `0` {
164	// Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
165	emit_char(char::from_u32_unchecked(T_BASE + t_index));
166	}
167	}
168	}
169
170	#[inline]
171	pub(crate) fn hangul_decomposition_length(s: char) -> usize {
172	let si: u32 = s as u32 - S_BASE;
173	let ti: u32 = si % T_COUNT;
174	if ti > `0` {
175	`3`
176	} else {
177	`2`
178	}
179	}
180
181	// Compose a pair of Hangul Jamo
182	#[allow(unsafe_code)]
183	#[inline(always)]
184	#[allow(ellipsis_inclusive_range_patterns)]
185	fn compose_hangul(a: char, b: char) -> Option<char> {
186	let (a, b) = (a as u32, b as u32);
187	match (a, b) {
188	// Compose a leading consonant and a vowel together into an LV_Syllable
189	(L_BASE..=L_LAST, V_BASE..=V_LAST) => {
190	// Safety: based on the above bounds, l_index will be less than or equal to L_COUNT (19)
191	// and v_index will be <= V_COUNT (21)
192	let l_index = a - L_BASE;
193	let v_index = b - V_BASE;
194	// Safety: This will be <= 19 (20 * 21) + (21 * 20), which is 8400.*
195	let lv_index = l_index * N_COUNT + v_index * T_COUNT;
196	// Safety: This is between 0xAC00 and 0xCCD0, which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range
197	// for BMP unicode
198	let s = S_BASE + lv_index;
199	// Safety: We've verified this is in-range
200	Some(unsafe { char::from_u32_unchecked(s) })
201	}
202	// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
203	(S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == `0` => {
204	// Safety: a is between 0xAC00 and (0xAC00 + 19 21 * 28). b - T_BASE is between 0 and 19.*
205	// Adding a number 0 to 19 to a number that is at largest 0xD7A4 will not go out of bounds to 0xD800 (where the
206	// surrogates start), so this is safe.
207	Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
208	}
209	_ => None,
210	}
211	}
212
#[cfg(test)]
mod tests {
    use super::compose_hangul;

    // Regression test from a bugfix where we were composing an LV_Syllable with
    // T_BASE directly. (We should only compose an LV_Syllable with a character
    // in the range `T_BASE + 1 ..= T_LAST`.)
    #[test]
    fn test_hangul_composition() {
        assert_eq!(compose_hangul('\u{c8e0}', '\u{11a7}'), None);
    }

    // Positive cases: L + V -> LV and LV + T -> LVT, at the range boundaries.
    #[test]
    fn test_hangul_composition_valid_pairs() {
        assert_eq!(compose_hangul('\u{1100}', '\u{1161}'), Some('\u{ac00}'));
        assert_eq!(compose_hangul('\u{1112}', '\u{1175}'), Some('\u{d788}'));
        assert_eq!(compose_hangul('\u{ac00}', '\u{11a8}'), Some('\u{ac01}'));
        assert_eq!(compose_hangul('\u{d788}', '\u{11c2}'), Some('\u{d7a3}'));
    }

    // An LVT_Syllable already carries a trailing consonant and must not absorb another.
    #[test]
    fn test_hangul_composition_rejects_lvt_plus_t() {
        assert_eq!(compose_hangul('\u{ac01}', '\u{11a8}'), None);
    }
}
225