normalize.rs source code [crates/rustybuzz-0.13.0/src/normalize.rs]

1	use crate::buffer::{Buffer, BufferScratchFlags, GlyphInfo};
2	use crate::complex::MAX_COMBINING_MARKS;
3	use crate::plan::ShapePlan;
4	use crate::unicode::{CharExt, GeneralCategory};
5	use crate::Face;
6
7	// HIGHLEVEL DESIGN:
8	//
9	// This file exports one main function: normalize().
10	//
11	// This function closely reflects the Unicode Normalization Algorithm,
12	// yet it's different.
13	//
14	// Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
15	// The logic however tries to use whatever the font can support.
16	//
// In general, each grapheme is decomposed in a chain of 1:2 decompositions,
// its marks are reordered, and it is then recomposed if desired. So far this
// is like Unicode Normalization. However, decomposition and recomposition
// only happen if the font supports the resulting characters.
21	//
22	// The goals are:
23	//
24	// - Try to render all canonically equivalent strings similarly. To really
25	// achieve this we have to always do the full decomposition and then
26	// selectively recompose from there. It's kinda too expensive though, so
27	// we skip some cases. For example, if composed is desired, we simply
28	// don't touch 1-character clusters that are supported by the font, even
29	// though their NFC may be different.
30	//
31	// - When a font has a precomposed character for a sequence but the 'ccmp'
32	// feature in the font is not adequate, use the precomposed character
33	// which typically has better mark positioning.
34	//
35	// - When a font does not support a combining mark, but supports it precomposed
36	// with previous base, use that. This needs the itemizer to have this
37	// knowledge too. We need to provide assistance to the itemizer.
38	//
39	// - When a font does not support a character but supports its canonical
40	// decomposition, well, use the decomposition.
41	//
42	// - The complex shapers can customize the compose and decompose functions to
43	// offload some of their requirements to the normalizer. For example, the
44	// Indic shaper may want to disallow recomposing of two matras.
45
/// State shared by the normalization rounds and handed to the compose/decompose hooks.
pub struct ShapeNormalizeContext<'a> {
    /// Shape plan in use; `normalize()` reads the shaper's `reorder_marks` hook through it.
    pub plan: &'a ShapePlan,
    /// Buffer being normalized.
    pub buffer: &'a mut Buffer,
    /// Face queried for glyph lookups.
    pub face: &'a Face<'a>,
    /// Splits a character into a pair of characters; `None` means it does not decompose.
    pub decompose: fn(&ShapeNormalizeContext, char) -> Option<(char, char)>,
    /// Combines two characters into one; `None` means they do not compose.
    pub compose: fn(&ShapeNormalizeContext, char, char) -> Option<char>,
}
53
/// Normalization strategy a shaper can ask `normalize()` to apply.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ShapeNormalizationMode {
    /// Decompose only; `normalize()` skips its recompose round in this mode.
    #[allow(dead_code)]
    Decomposed,
    /// Never composes base-to-base.
    ComposedDiacritics,
    /// Always fully decomposes and then recompose back.
    ComposedDiacriticsNoShortCircuit,
    /// Let `normalize()` pick; it currently resolves this to `ComposedDiacritics`.
    Auto,
}

impl Default for ShapeNormalizationMode {
    /// The default mode is [`ShapeNormalizationMode::Auto`].
    fn default() -> Self {
        ShapeNormalizationMode::Auto
    }
}
70
/// Normalizes the characters in `buffer` and assigns them glyph indices from `face`.
///
/// Runs up to three rounds over the buffer:
///
/// 1. decompose characters (subject to the short-circuit rules derived from the
///    shaper's normalization mode),
/// 2. reorder runs of marks by modified combining class, in place,
/// 3. recompose starter + mark pairs, only in the two `ComposedDiacritics*` modes
///    and only when the composed character has a glyph in `face`.
///
/// The shaper's `decompose`/`compose` hooks are used when present; otherwise the
/// functions in `crate::unicode` are used. An empty buffer is left untouched.
pub fn normalize(plan: &ShapePlan, face: &Face, buffer: &mut Buffer) {
    if buffer.is_empty() {
        return;
    }

    let mut mode = plan.shaper.normalization_mode;
    if mode == Some(ShapeNormalizationMode::Auto) {
        // https://github.com/harfbuzz/harfbuzz/issues/653#issuecomment-423905920
        // if plan.has_gpos_mark() {
        //     mode = ShapeNormalizationMode::Decomposed;
        // }
        mode = Some(ShapeNormalizationMode::ComposedDiacritics);
    }

    // Shaper-specific hooks win; fall back to the generic Unicode tables.
    let decompose = plan
        .shaper
        .decompose
        .unwrap_or(|_, ab| crate::unicode::decompose(ab));
    let compose = plan
        .shaper
        .compose
        .unwrap_or(|_, a, b| crate::unicode::compose(a, b));
    let mut ctx = ShapeNormalizeContext {
        plan,
        buffer,
        face,
        decompose,
        compose,
    };
    // From here on the buffer is reached through `ctx`; this binding is re-borrowed
    // after every call that takes `&mut ctx`.
    let mut buffer = &mut ctx.buffer;

    // No mode at all: always short-circuit.
    let always_short_circuit = mode.is_none();
    // Short-circuiting is ruled out only by `Decomposed` and
    // `ComposedDiacriticsNoShortCircuit`.
    let might_short_circuit = always_short_circuit
        || !matches!(
            mode,
            Some(ShapeNormalizationMode::Decomposed)
                | Some(ShapeNormalizationMode::ComposedDiacriticsNoShortCircuit)
        );

    // We do a fairly straightforward yet custom normalization process in three
    // separate rounds: decompose, reorder, recompose (if desired). Currently
    // this makes two buffer swaps. We can make it faster by moving the last
    // two rounds into the inner loop for the first round, but it's more readable
    // this way.

    // First round, decompose
    let mut all_simple = true;
    {
        let count = buffer.len;
        buffer.idx = 0;
        buffer.clear_output();
        loop {
            // Scan forward to the next mark (or the end of the buffer).
            let mut end = buffer.idx + 1;
            while end < count && !buffer.info[end].is_unicode_mark() {
                end += 1;
            }

            if end < count {
                // Leave one base for the marks to cluster with.
                end -= 1;
            }

            // From idx to end are simple clusters.
            if might_short_circuit {
                // Map characters straight to glyphs, stopping at the first one the
                // font lacks; only the mapped prefix is copied to the output.
                let len = end - buffer.idx;
                let mut done = 0;
                while done < len {
                    let cur = buffer.cur_mut(done);
                    cur.set_glyph_index(match face.glyph_index(cur.glyph_id) {
                        Some(glyph_id) => u32::from(glyph_id.0),
                        None => break,
                    });
                    done += 1;
                }
                buffer.next_glyphs(done);
            }

            // Whatever is left of the simple range goes through full decomposition.
            while buffer.idx < end && buffer.successful {
                decompose_current_character(&mut ctx, might_short_circuit);
                // Re-borrow: the call above needed `ctx` mutably.
                buffer = &mut ctx.buffer;
            }

            if buffer.idx == count || !buffer.successful {
                break;
            }

            // A mark follows, so at least one cluster is not simple.
            all_simple = false;

            // Find all the marks now.
            end = buffer.idx + 1;
            while end < count && buffer.info[end].is_unicode_mark() {
                end += 1;
            }

            // idx to end is one non-simple cluster.
            decompose_multi_char_cluster(&mut ctx, end, always_short_circuit);
            // Re-borrow: the call above needed `ctx` mutably.
            buffer = &mut ctx.buffer;

            if buffer.idx >= count || !buffer.successful {
                break;
            }
        }

        buffer.sync();
    }

    // Second round, reorder (inplace)
    if !all_simple {
        let count = buffer.len;
        let mut i = 0;
        while i < count {
            // Skip entries whose modified combining class is 0.
            if buffer.info[i].modified_combining_class() == 0 {
                i += 1;
                continue;
            }

            // Extend over the whole run of non-zero combining classes.
            let mut end = i + 1;
            while end < count && buffer.info[end].modified_combining_class() != 0 {
                end += 1;
            }

            // We are going to do a O(n^2). Only do this if the sequence is short.
            if end - i <= MAX_COMBINING_MARKS {
                buffer.sort(i, end, |a, b| {
                    a.modified_combining_class() > b.modified_combining_class()
                });

                if let Some(reorder_marks) = ctx.plan.shaper.reorder_marks {
                    reorder_marks(ctx.plan, buffer, i, end);
                }
            }

            // `info[end]`, if it exists, has combining class 0, so it can be skipped too.
            i = end + 1;
        }
    }
    if buffer.scratch_flags.contains(BufferScratchFlags::HAS_CGJ) {
        // For all CGJ, check if it prevented any reordering at all.
        // If it did NOT, then make it skippable.
        // https://github.com/harfbuzz/harfbuzz/issues/554
        for i in 1..buffer.len.saturating_sub(1) {
            if buffer.info[i].glyph_id == 0x034F
            /* CGJ */
            {
                let last = buffer.info[i - 1].modified_combining_class();
                let next = buffer.info[i + 1].modified_combining_class();
                if next == 0 || last <= next {
                    buffer.info[i].unhide();
                }
            }
        }
    }

    // Third round, recompose
    if !all_simple
        && buffer.successful
        && matches!(
            mode,
            Some(ShapeNormalizationMode::ComposedDiacritics)
                | Some(ShapeNormalizationMode::ComposedDiacriticsNoShortCircuit)
        )
    {
        // As noted in the comment earlier, we don't try to combine
        // ccc=0 chars with their previous Starter.

        let count = buffer.len;
        // Index, in the output, of the character that marks are composed onto.
        let mut starter = 0;
        buffer.clear_output();
        buffer.next_glyph();
        while buffer.idx < count && buffer.successful {
            // We don't try to compose a non-mark character with its preceding starter.
            // This is both an optimization to avoid trying to compose every two neighboring
            // glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul
            // fonts are not designed to mix-and-match pre-composed syllables and Jamo.
            let cur = buffer.cur(0);
            if cur.is_unicode_mark() &&
                // If there's anything between the starter and this char, they should have CCC
                // smaller than this character's.
                (starter == buffer.out_len - 1
                    || buffer.prev().modified_combining_class() < cur.modified_combining_class())
            {
                let a = buffer.out_info()[starter].as_char();
                let b = cur.as_char();
                if let Some(composed) = (ctx.compose)(&ctx, a, b) {
                    // Only accept the composition if the font has a glyph for it.
                    if let Some(glyph_id) = face.glyph_index(u32::from(composed)) {
                        // Copy to out-buffer.
                        buffer = &mut ctx.buffer;
                        buffer.next_glyph();
                        if !buffer.successful {
                            return;
                        }

                        // Merge and remove the second composable.
                        buffer.merge_out_clusters(starter, buffer.out_len);
                        buffer.out_len -= 1;

                        // Modify starter and carry on.
                        // `scratch_flags` is copied out and written back so that it can be
                        // passed mutably while `info` borrows the output.
                        let mut flags = buffer.scratch_flags;
                        let info = &mut buffer.out_info_mut()[starter];
                        info.glyph_id = u32::from(composed);
                        info.set_glyph_index(u32::from(glyph_id.0));
                        info.init_unicode_props(&mut flags);
                        buffer.scratch_flags = flags;

                        continue;
                    }
                }
            }

            // Blocked, or doesn't compose.
            buffer = &mut ctx.buffer;
            buffer.next_glyph();

            // A character with combining class 0 becomes the new starter.
            if buffer.prev().modified_combining_class() == 0 {
                starter = buffer.out_len - 1;
            }
        }

        buffer.sync();
    }
}
291
292	fn decompose_multi_char_cluster(ctx: &mut ShapeNormalizeContext, end: usize, short_circuit: bool) {
293	let mut i: usize = ctx.buffer.idx;
294	while i < end && ctx.buffer.successful {
295	if ctx.buffer.info[i].as_char().is_variation_selector() {
296	handle_variation_selector_cluster(ctx, end, short_circuit);
297	return;
298	}
299	i += `1`;
300	}
301
302	while ctx.buffer.idx < end && ctx.buffer.successful {
303	decompose_current_character(ctx, shortest:short_circuit);
304	}
305	}
306
307	fn handle_variation_selector_cluster(ctx: &mut ShapeNormalizeContext, end: usize, _: bool) {
308	let face = ctx.face;
309	let set_glyph = \|info: &mut GlyphInfo\| {
310	if let Some(glyph_id) = face.glyph_index(info.glyph_id) {
311	info.set_glyph_index(u32::from(glyph_id.0));
312	}
313	};
314
315	// TODO: Currently if there's a variation-selector we give-up, it's just too hard.
316	let buffer = &mut ctx.buffer;
317	while buffer.idx < end - `1` && buffer.successful {
318	if buffer.cur(`1`).as_char().is_variation_selector() {
319	if let Some(glyph_id) =
320	face.glyph_variation_index(buffer.cur(`0`).as_char(), buffer.cur(`1`).as_char())
321	{
322	buffer.cur_mut(`0`).set_glyph_index(u32::from(glyph_id.0));
323	let unicode = buffer.cur(`0`).glyph_id;
324	buffer.replace_glyphs(`2`, `1`, &[unicode]);
325	} else {
326	// Just pass on the two characters separately, let GSUB do its magic.
327	set_glyph(buffer.cur_mut(`0`));
328	buffer.next_glyph();
329	set_glyph(buffer.cur_mut(`0`));
330	buffer.next_glyph();
331	}
332
333	// Skip any further variation selectors.
334	while buffer.idx < end && buffer.cur(`0`).as_char().is_variation_selector() {
335	set_glyph(buffer.cur_mut(`0`));
336	buffer.next_glyph();
337	}
338	} else {
339	set_glyph(buffer.cur_mut(`0`));
340	buffer.next_glyph();
341	}
342	}
343
344	if ctx.buffer.idx < end {
345	set_glyph(ctx.buffer.cur_mut(`0`));
346	ctx.buffer.next_glyph();
347	}
348	}
349
350	fn decompose_current_character(ctx: &mut ShapeNormalizeContext, shortest: bool) {
351	let u = ctx.buffer.cur(`0`).as_char();
352	let glyph = ctx.face.glyph_index(u32::from(u));
353
354	if !shortest \|\| glyph.is_none() {
355	if decompose(ctx, shortest, u) > `0` {
356	ctx.buffer.skip_glyph();
357	return;
358	}
359	}
360
361	if let Some(glyph) = glyph {
362	ctx.buffer.next_char(u32::from(glyph.0));
363	return;
364	}
365
366	// Handle space characters.
367	if ctx.buffer.cur(`0`).general_category() == GeneralCategory::SpaceSeparator {
368	if let Some(space_type) = u.space_fallback() {
369	let space_glyph = ctx
370	.face
371	.glyph_index(u32::from(' '))
372	.or(ctx.buffer.invisible);
373
374	if let Some(space_glyph) = space_glyph {
375	ctx.buffer.cur_mut(`0`).set_space_fallback(space_type);
376	ctx.buffer.next_char(u32::from(space_glyph.0));
377	ctx.buffer.scratch_flags \|= BufferScratchFlags::HAS_SPACE_FALLBACK;
378	return;
379	}
380	}
381	}
382
383	// U+2011 is the only sensible character that is a no-break version of another character
384	// and not a space. The space ones are handled already. Handle this lone one.
385	if u == '`\u{2011}`' {
386	if let Some(other_glyph) = ctx.face.glyph_index(`0x2010`) {
387	ctx.buffer.next_char(u32::from(other_glyph.0));
388	return;
389	}
390	}
391
392	// Insert a .notdef glyph if decomposition failed.
393	ctx.buffer.next_char(`0`);
394	}
395
396	/// Returns 0 if didn't decompose, number of resulting characters otherwise.
397	fn decompose(ctx: &mut ShapeNormalizeContext, shortest: bool, ab: char) -> u32 {
398	let (a, b) = match (ctx.decompose)(ctx, ab) {
399	Some(decomposed) => decomposed,
400	_ => return `0`,
401	};
402
403	let a_glyph = ctx.face.glyph_index(u32::from(a));
404	let b_glyph = if b != '`\0`' {
405	match ctx.face.glyph_index(u32::from(b)) {
406	Some(glyph_id) => Some(glyph_id),
407	None => return `0`,
408	}
409	} else {
410	None
411	};
412
413	if !shortest \|\| a_glyph.is_none() {
414	let ret = decompose(ctx, shortest, a);
415	if ret != `0` {
416	if let Some(b_glyph) = b_glyph {
417	ctx.buffer.output_char(u32::from(b), u32::from(b_glyph.0));
418	return ret + `1`;
419	}
420	return ret;
421	}
422	}
423
424	if let Some(a_glyph) = a_glyph {
425	// Output a and b.
426	ctx.buffer.output_char(u32::from(a), u32::from(a_glyph.0));
427	if let Some(b_glyph) = b_glyph {
428	ctx.buffer.output_char(u32::from(b), u32::from(b_glyph.0));
429	return `2`;
430	}
431	return `1`;
432	}
433
434	`0`
435	}
436