ot_shape_normalize.rs source code [crates/rustybuzz/src/hb/ot_shape_normalize.rs]

1	use super::buffer::*;
2	use super::common::hb_codepoint_t;
3	use super::hb_font_t;
4	use super::ot_layout::*;
5	use super::ot_shape_plan::hb_ot_shape_plan_t;
6	use super::ot_shaper::{ComposeFn, DecomposeFn, MAX_COMBINING_MARKS};
7	use super::unicode::{hb_unicode_funcs_t, CharExt};
8
9	pub struct hb_ot_shape_normalize_context_t<'a> {
10	pub plan: &'a hb_ot_shape_plan_t,
11	pub buffer: &'a mut hb_buffer_t,
12	pub face: &'a hb_font_t<'a>,
13	pub decompose: DecomposeFn,
14	pub compose: ComposeFn,
15	}
16
17	impl hb_ot_shape_normalize_context_t<'_> {
18	pub(crate) fn override_decompose_and_compose(
19	&mut self,
20	decompose: Option<DecomposeFn>,
21	compose: Option<ComposeFn>,
22	) {
23	if let Some(decompose: fn(&hb_ot_shape_normalize_context_t<'_>, …) -> …) = decompose {
24	self.decompose = decompose;
25	}
26
27	if let Some(compose: fn(&hb_ot_shape_normalize_context_t<'_>, …) -> …) = compose {
28	self.compose = compose;
29	}
30	}
31	}
32
/// Normalization preference of a shaper; one of the
/// `HB_OT_SHAPE_NORMALIZATION_MODE_*` constants below.
pub type hb_ot_shape_normalization_mode_t = i32;
pub const HB_OT_SHAPE_NORMALIZATION_MODE_NONE: i32 = 0;
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED: i32 = 1;
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS: i32 = 2; // Never composes base-to-base
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT: i32 = 3; // Always fully decomposes and then recompose back
pub const HB_OT_SHAPE_NORMALIZATION_MODE_AUTO: i32 = 4; // See hb-ot-shape-normalize.cc for logic.
// Not referenced anywhere in this file; the `allow` silences the resulting
// unused-item warning.
#[allow(dead_code)]
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DEFAULT: i32 = HB_OT_SHAPE_NORMALIZATION_MODE_AUTO;
41
42	// HIGHLEVEL DESIGN:
43	//
44	// This file exports one main function: normalize().
45	//
46	// This function closely reflects the Unicode Normalization Algorithm,
47	// yet it's different.
48	//
49	// Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
50	// The logic however tries to use whatever the font can support.
51	//
52	// In general, each grapheme is decomposed in a chain of 1:2 decompositions,
53	// its marks are reordered, and it is then recomposed if desired. So far this
54	// is like Unicode Normalization. However, the decomposition and
55	// recomposition only happen if the font supports the resulting characters.
56	//
57	// The goals are:
58	//
59	// - Try to render all canonically equivalent strings similarly. To really
60	// achieve this we have to always do the full decomposition and then
61	// selectively recompose from there. It's kinda too expensive though, so
62	// we skip some cases. For example, if composed is desired, we simply
63	// don't touch 1-character clusters that are supported by the font, even
64	// though their NFC may be different.
65	//
66	// - When a font has a precomposed character for a sequence but the 'ccmp'
67	// feature in the font is not adequate, use the precomposed character
68	// which typically has better mark positioning.
69	//
70	// - When a font does not support a combining mark, but supports it precomposed
71	// with previous base, use that. This needs the itemizer to have this
72	// knowledge too. We need to provide assistance to the itemizer.
73	//
74	// - When a font does not support a character but supports its canonical
75	// decomposition, well, use the decomposition.
76	//
77	// - The shapers can customize the compose and decompose functions to
78	// offload some of their requirements to the normalizer. For example, the
79	// Indic shaper may want to disallow recomposing of two matras.
80
81	fn decompose_unicode(
82	_: &hb_ot_shape_normalize_context_t,
83	ab: hb_codepoint_t,
84	) -> Option<(hb_codepoint_t, hb_codepoint_t)> {
85	super::unicode::decompose(ab)
86	}
87
88	fn compose_unicode(
89	_: &hb_ot_shape_normalize_context_t,
90	a: hb_codepoint_t,
91	b: hb_codepoint_t,
92	) -> Option<hb_codepoint_t> {
93	super::unicode::compose(a, b)
94	}
95
96	fn set_glyph(info: &mut hb_glyph_info_t, font: &hb_font_t) {
97	if let Some(glyph_id: GlyphId) = font.get_nominal_glyph(info.glyph_id) {
98	info.set_glyph_index(u32::from(glyph_id.0));
99	}
100	}
101
102	fn output_char(buffer: &mut hb_buffer_t, unichar: u32, glyph: u32) {
103	// This is very confusing indeed.
104	buffer.cur_mut(`0`).set_glyph_index(glyph);
105	buffer.output_glyph(glyph_index:unichar);
106	// TODO: should be _hb_glyph_info_set_unicode_props (&buffer->prev(), buffer);
107	let mut flags: u32 = buffer.scratch_flags;
108	buffer.prev_mut().init_unicode_props(&mut flags);
109	buffer.scratch_flags = flags;
110	}
111
112	fn next_char(buffer: &mut hb_buffer_t, glyph: u32) {
113	buffer.cur_mut(`0`).set_glyph_index(glyph);
114	buffer.next_glyph();
115	}
116
117	fn skip_char(buffer: &mut hb_buffer_t) {
118	buffer.skip_glyph();
119	}
120
121	/// Returns 0 if didn't decompose, number of resulting characters otherwise.
122	fn decompose(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool, ab: hb_codepoint_t) -> u32 {
123	let (a, b) = match (ctx.decompose)(ctx, ab) {
124	Some(decomposed) => decomposed,
125	_ => return `0`,
126	};
127
128	let a_glyph = ctx.face.get_nominal_glyph(u32::from(a));
129	let b_glyph = if b != '`\0`' {
130	match ctx.face.get_nominal_glyph(u32::from(b)) {
131	Some(glyph_id) => Some(glyph_id),
132	None => return `0`,
133	}
134	} else {
135	None
136	};
137
138	if !shortest \|\| a_glyph.is_none() {
139	let ret = decompose(ctx, shortest, a);
140	if ret != `0` {
141	if let Some(b_glyph) = b_glyph {
142	output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0));
143	return ret + `1`;
144	}
145	return ret;
146	}
147	}
148
149	if let Some(a_glyph) = a_glyph {
150	// Output a and b.
151	output_char(ctx.buffer, u32::from(a), u32::from(a_glyph.0));
152	if let Some(b_glyph) = b_glyph {
153	output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0));
154	return `2`;
155	}
156	return `1`;
157	}
158
159	`0`
160	}
161
162	fn decompose_current_character(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool) {
163	let u = ctx.buffer.cur(`0`).as_char();
164	let glyph = ctx.face.get_nominal_glyph(u32::from(u));
165
166	// TODO: different to harfbuzz, sync
167	if !shortest \|\| glyph.is_none() {
168	if decompose(ctx, shortest, u) > `0` {
169	skip_char(ctx.buffer);
170	return;
171	}
172	}
173
174	// TODO: different to harfbuzz, sync
175	if let Some(glyph) = glyph {
176	next_char(ctx.buffer, u32::from(glyph.0));
177	return;
178	}
179
180	if _hb_glyph_info_is_unicode_space(ctx.buffer.cur(`0`)) {
181	let space_type = u.space_fallback();
182	if space_type != hb_unicode_funcs_t::NOT_SPACE {
183	let space_glyph = ctx.face.get_nominal_glyph(`0x0020`).or(ctx.buffer.invisible);
184
185	if let Some(space_glyph) = space_glyph {
186	_hb_glyph_info_set_unicode_space_fallback_type(ctx.buffer.cur_mut(`0`), space_type);
187	next_char(ctx.buffer, u32::from(space_glyph.0));
188	ctx.buffer.scratch_flags \|= HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK;
189	return;
190	}
191	}
192	}
193
194	// U+2011 is the only sensible character that is a no-break version of another character
195	// and not a space. The space ones are handled already. Handle this lone one.
196	if u == '`\u{2011}`' {
197	if let Some(other_glyph) = ctx.face.get_nominal_glyph(`0x2010`) {
198	next_char(ctx.buffer, u32::from(other_glyph.0));
199	return;
200	}
201	}
202
203	// Insert a .notdef glyph if decomposition failed.
204	next_char(ctx.buffer, `0`);
205	}
206
207	fn handle_variation_selector_cluster(
208	ctx: &mut hb_ot_shape_normalize_context_t,
209	end: usize,
210	_: bool,
211	) {
212	let face = ctx.face;
213
214	// Currently if there's a variation-selector we give-up on normalization, it's just too hard.
215	let buffer = &mut ctx.buffer;
216	while buffer.idx < end - `1` && buffer.successful {
217	if buffer.cur(`1`).as_char().is_variation_selector() {
218	if let Some(glyph_id) =
219	face.glyph_variation_index(buffer.cur(`0`).as_char(), buffer.cur(`1`).as_char())
220	{
221	buffer.cur_mut(`0`).set_glyph_index(u32::from(glyph_id.0));
222	let unicode = buffer.cur(`0`).glyph_id;
223	buffer.replace_glyphs(`2`, `1`, &[unicode]);
224	} else {
225	// Just pass on the two characters separately, let GSUB do its magic.
226	set_glyph(buffer.cur_mut(`0`), face);
227	buffer.next_glyph();
228
229	buffer.scratch_flags \|= HB_BUFFER_SCRATCH_FLAG_HAS_VARIATION_SELECTOR_FALLBACK;
230
231	_hb_glyph_info_set_variation_selector(buffer.cur_mut(`0`), `true`);
232
233	if buffer.not_found_variation_selector.is_some() {
234	_hb_glyph_info_clear_default_ignorable(buffer.cur_mut(`0`))
235	}
236
237	set_glyph(buffer.cur_mut(`0`), face);
238	buffer.next_glyph();
239	}
240
241	// Skip any further variation selectors.
242	while buffer.idx < end && buffer.cur(`0`).as_char().is_variation_selector() {
243	set_glyph(buffer.cur_mut(`0`), face);
244	buffer.next_glyph();
245	}
246	} else {
247	set_glyph(buffer.cur_mut(`0`), face);
248	buffer.next_glyph();
249	}
250	}
251
252	if ctx.buffer.idx < end {
253	set_glyph(ctx.buffer.cur_mut(`0`), face);
254	ctx.buffer.next_glyph();
255	}
256	}
257
258	fn decompose_multi_char_cluster(
259	ctx: &mut hb_ot_shape_normalize_context_t,
260	end: usize,
261	short_circuit: bool,
262	) {
263	let mut i: usize = ctx.buffer.idx;
264	while i < end && ctx.buffer.successful {
265	if ctx.buffer.info[i].as_char().is_variation_selector() {
266	handle_variation_selector_cluster(ctx, end, short_circuit);
267	return;
268	}
269	i += `1`;
270	}
271
272	while ctx.buffer.idx < end && ctx.buffer.successful {
273	decompose_current_character(ctx, shortest:short_circuit);
274	}
275	}
276
277	fn compare_combining_class(pa: &hb_glyph_info_t, pb: &hb_glyph_info_t) -> bool {
278	let a: u8 = _hb_glyph_info_get_modified_combining_class(info:pa);
279	let b: u8 = _hb_glyph_info_get_modified_combining_class(info:pb);
280	a > b
281	}
282
283	pub fn _hb_ot_shape_normalize(
284	plan: &hb_ot_shape_plan_t,
285	buffer: &mut hb_buffer_t,
286	face: &hb_font_t,
287	) {
288	if buffer.is_empty() {
289	return;
290	}
291
292	let mut mode = plan.shaper.normalization_preference;
293	if mode == HB_OT_SHAPE_NORMALIZATION_MODE_AUTO {
294	if plan.has_gpos_mark {
295	// https://github.com/harfbuzz/harfbuzz/issues/653#issuecomment-423905920
296	// mode = Some(HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED);
297	mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS;
298	} else {
299	mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS;
300	}
301	}
302
303	let mut ctx = hb_ot_shape_normalize_context_t {
304	plan,
305	buffer,
306	face,
307	decompose: decompose_unicode,
308	compose: compose_unicode,
309	};
310	ctx.override_decompose_and_compose(plan.shaper.decompose, plan.shaper.compose);
311
312	let mut buffer = &mut ctx.buffer;
313
314	let always_short_circuit = mode == HB_OT_SHAPE_NORMALIZATION_MODE_NONE;
315	let might_short_circuit = always_short_circuit
316	\|\| (mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED
317	&& mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT);
318
319	// We do a fairly straightforward yet custom normalization process in three
320	// separate rounds: decompose, reorder, recompose (if desired). Currently
321	// this makes two buffer swaps. We can make it faster by moving the last
322	// two rounds into the inner loop for the first round, but it's more readable
323	// this way.
324
325	// First round, decompose
326	let mut all_simple = `true`;
327	{
328	buffer.clear_output();
329	let count = buffer.len;
330	buffer.idx = `0`;
331	loop {
332	let mut end = buffer.idx + `1`;
333	while end < count && !_hb_glyph_info_is_unicode_mark(&buffer.info[end]) {
334	end += `1`;
335	}
336
337	if end < count {
338	// Leave one base for the marks to cluster with.
339	end -= `1`;
340	}
341
342	// From idx to end are simple clusters.
343	if might_short_circuit {
344	let len = end - buffer.idx;
345	let mut done = `0`;
346	while done < len {
347	let cur = buffer.cur_mut(done);
348	cur.set_glyph_index(match face.get_nominal_glyph(cur.glyph_id) {
349	Some(glyph_id) => u32::from(glyph_id.0),
350	None => break,
351	});
352	done += `1`;
353	}
354	buffer.next_glyphs(done);
355	}
356
357	while buffer.idx < end && buffer.successful {
358	decompose_current_character(&mut ctx, might_short_circuit);
359	buffer = &mut ctx.buffer;
360	}
361
362	if buffer.idx == count \|\| !buffer.successful {
363	break;
364	}
365
366	all_simple = `false`;
367
368	// Find all the marks now.
369	end = buffer.idx + `1`;
370	while end < count && _hb_glyph_info_is_unicode_mark(&buffer.info[end]) {
371	end += `1`;
372	}
373
374	// idx to end is one non-simple cluster.
375	decompose_multi_char_cluster(&mut ctx, end, always_short_circuit);
376	buffer = &mut ctx.buffer;
377
378	if buffer.idx >= count \|\| !buffer.successful {
379	break;
380	}
381	}
382
383	buffer.sync();
384	}
385
386	// Second round, reorder (inplace)
387	if !all_simple {
388	let count = buffer.len;
389	let mut i = `0`;
390	while i < count {
391	if _hb_glyph_info_get_modified_combining_class(&buffer.info[i]) == `0` {
392	i += `1`;
393	continue;
394	}
395
396	let mut end = i + `1`;
397	while end < count && _hb_glyph_info_get_modified_combining_class(&buffer.info[end]) != `0`
398	{
399	end += `1`;
400	}
401
402	// We are going to do a O(n^2). Only do this if the sequence is short.
403	if end - i <= MAX_COMBINING_MARKS {
404	buffer.sort(i, end, compare_combining_class);
405
406	if let Some(reorder_marks) = ctx.plan.shaper.reorder_marks {
407	reorder_marks(ctx.plan, buffer, i, end);
408	}
409	}
410
411	i = end + `1`;
412	}
413	}
414	if buffer.scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ != `0` {
415	// For all CGJ, check if it prevented any reordering at all.
416	// If it did NOT, then make it skippable.
417	// https://github.com/harfbuzz/harfbuzz/issues/554
418	for i in `1`..buffer.len.saturating_sub(`1`) {
419	if buffer.info[i].glyph_id == `0x034F`
420	/ CGJ /
421	{
422	let last = _hb_glyph_info_get_modified_combining_class(&buffer.info[i - `1`]);
423	let next = _hb_glyph_info_get_modified_combining_class(&buffer.info[i + `1`]);
424	if next == `0` \|\| last <= next {
425	buffer.info[i].unhide();
426	}
427	}
428	}
429	}
430
431	// Third round, recompose
432	if !all_simple
433	&& buffer.successful
434	&& (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS
435	\|\| mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT)
436	{
437	// As noted in the comment earlier, we don't try to combine
438	// ccc=0 chars with their previous Starter.
439
440	let count = buffer.len;
441	let mut starter = `0`;
442	buffer.clear_output();
443	buffer.next_glyph();
444	while buffer.idx < count && buffer.successful {
445	// We don't try to compose a non-mark character with it's preceding starter.
446	// This is both an optimization to avoid trying to compose every two neighboring
447	// glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul
448	// fonts are not designed to mix-and-match pre-composed syllables and Jamo.
449	let cur = buffer.cur(`0`);
450	if _hb_glyph_info_is_unicode_mark(cur) &&
451	// If there's anything between the starter and this char, they should have CCC
452	// smaller than this character's.
453	(starter == buffer.out_len - `1`
454	\|\| _hb_glyph_info_get_modified_combining_class(buffer.prev()) < _hb_glyph_info_get_modified_combining_class(cur))
455	{
456	let a = buffer.out_info()[starter].as_char();
457	let b = cur.as_char();
458	if let Some(composed) = (ctx.compose)(&ctx, a, b) {
459	if let Some(glyph_id) = face.get_nominal_glyph(u32::from(composed)) {
460	// Copy to out-buffer.
461	buffer = &mut ctx.buffer;
462	buffer.next_glyph();
463	if !buffer.successful {
464	return;
465	}
466
467	// Merge and remove the second composable.
468	buffer.merge_out_clusters(starter, buffer.out_len);
469	buffer.out_len -= `1`;
470
471	// Modify starter and carry on.
472	let mut flags = buffer.scratch_flags;
473	let info = &mut buffer.out_info_mut()[starter];
474	info.glyph_id = u32::from(composed);
475	info.set_glyph_index(u32::from(glyph_id.0));
476	info.init_unicode_props(&mut flags);
477	buffer.scratch_flags = flags;
478
479	continue;
480	}
481	}
482	}
483
484	// Blocked, or doesn't compose.
485	buffer = &mut ctx.buffer;
486	buffer.next_glyph();
487
488	if _hb_glyph_info_get_modified_combining_class(buffer.prev()) == `0` {
489	starter = buffer.out_len - `1`;
490	}
491	}
492
493	buffer.sync();
494	}
495	}
496