use super::buffer::*;
use super::common::hb_codepoint_t;
use super::hb_font_t;
use super::ot_layout::*;
use super::ot_shape_plan::hb_ot_shape_plan_t;
use super::ot_shaper::{ComposeFn, DecomposeFn, MAX_COMBINING_MARKS};
use super::unicode::{hb_unicode_funcs_t, CharExt};

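/// Shared state for one normalization pass: the shape plan, the buffer being
/// normalized, the font, and the (possibly shaper-overridden) decompose and
/// compose callbacks.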
pub struct hb_ot_shape_normalize_context_t<'a> {
    pub plan: &'a hb_ot_shape_plan_t,
    pub buffer: &'a mut hb_buffer_t,
    pub face: &'a hb_font_t<'a>,
    pub decompose: DecomposeFn,
    pub compose: ComposeFn,
}

impl hb_ot_shape_normalize_context_t<'_> {
    pub(crate) fn override_decompose_and_compose(
        &mut self,
        decompose: Option<DecomposeFn>,
        compose: Option<ComposeFn>,
    ) {
        if let Some(decompose) = decompose {
            self.decompose = decompose;
        }

        if let Some(compose) = compose {
            self.compose = compose;
        }
    }
}

pub type hb_ot_shape_normalization_mode_t = i32;
pub const HB_OT_SHAPE_NORMALIZATION_MODE_NONE: i32 = 0;
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED: i32 = 1;
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS: i32 = 2; /* Never composes base-to-base. */
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT: i32 = 3; /* Always fully decomposes and then recomposes back. */
pub const HB_OT_SHAPE_NORMALIZATION_MODE_AUTO: i32 = 4; /* See hb-ot-shape-normalize.cc for logic. */
#[allow(dead_code)]
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DEFAULT: i32 = HB_OT_SHAPE_NORMALIZATION_MODE_AUTO;

// HIGH-LEVEL DESIGN:
//
// This file exports one main function: normalize().
//
// This function closely reflects the Unicode Normalization Algorithm,
// yet it's different.
//
// Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
// The logic, however, tries to use whatever the font can support.
//
// In general what happens is: each grapheme is decomposed in a chain
// of 1:2 decompositions, marks are reordered, and then everything is
// recomposed if desired. So far it's like Unicode Normalization. However,
// the decomposition and recomposition only happen if the font supports the
// resulting characters.
//
// The goals are:
//
// - Try to render all canonically equivalent strings similarly. To really
//   achieve this we have to always do the full decomposition and then
//   selectively recompose from there. It's kinda too expensive though, so
//   we skip some cases. For example, if composed is desired, we simply
//   don't touch 1-character clusters that are supported by the font, even
//   though their NFC may be different.
//
// - When a font has a precomposed character for a sequence but the 'ccmp'
//   feature in the font is not adequate, use the precomposed character,
//   which typically has better mark positioning.
//
// - When a font does not support a combining mark, but supports it precomposed
//   with the previous base, use that. This needs the itemizer to have this
//   knowledge too. We need to provide assistance to the itemizer.
//
// - When a font does not support a character but supports its canonical
//   decomposition, well, use the decomposition.
//
// - The shapers can customize the compose and decompose functions to
//   offload some of their requirements to the normalizer. For example, the
//   Indic shaper may want to disallow recomposing of two matras.

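/// Default decompose callback: canonical Unicode decomposition of `ab` into a
/// pair of code points, if one exists.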
fn decompose_unicode(
    _: &hb_ot_shape_normalize_context_t,
    ab: hb_codepoint_t,
) -> Option<(hb_codepoint_t, hb_codepoint_t)> {
    super::unicode::decompose(ab)
}

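/// Default compose callback: canonical Unicode composition of `a` and `b`, if
/// one exists.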
fn compose_unicode(
    _: &hb_ot_shape_normalize_context_t,
    a: hb_codepoint_t,
    b: hb_codepoint_t,
) -> Option<hb_codepoint_t> {
    super::unicode::compose(a, b)
}

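/// Looks up the nominal glyph for the info's code point and stores it in the
/// glyph-index slot; leaves the index untouched if the font has no such glyph.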
fn set_glyph(info: &mut hb_glyph_info_t, font: &hb_font_t) {
    if let Some(glyph_id) = font.get_nominal_glyph(info.glyph_id) {
        info.set_glyph_index(u32::from(glyph_id.0));
    }
}

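/// Emits `unichar` (carrying glyph index `glyph`) into the output buffer
/// without consuming the current input position, then refreshes the Unicode
/// properties of the glyph just written.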
fn output_char(buffer: &mut hb_buffer_t, unichar: u32, glyph: u32) {
    // This is very confusing indeed.
    buffer.cur_mut(0).set_glyph_index(glyph);
    buffer.output_glyph(unichar);
    // TODO: should be _hb_glyph_info_set_unicode_props (&buffer->prev(), buffer);
    let mut flags = buffer.scratch_flags;
    buffer.prev_mut().init_unicode_props(&mut flags);
    buffer.scratch_flags = flags;
}

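/// Copies the current character to the output buffer with the given glyph
/// index and advances past it.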
fn next_char(buffer: &mut hb_buffer_t, glyph: u32) {
    buffer.cur_mut(0).set_glyph_index(glyph);
    buffer.next_glyph();
}

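/// Advances past the current character without copying it to the output buffer.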
fn skip_char(buffer: &mut hb_buffer_t) {
    buffer.skip_glyph();
}

/// Returns 0 if `ab` didn't decompose, otherwise the number of characters
/// written to the output buffer.
fn decompose(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool, ab: hb_codepoint_t) -> u32 {
    let (a, b) = match (ctx.decompose)(ctx, ab) {
        Some(decomposed) => decomposed,
        _ => return 0,
    };

    let a_glyph = ctx.face.get_nominal_glyph(u32::from(a));
    let b_glyph = if b != '\0' {
        match ctx.face.get_nominal_glyph(u32::from(b)) {
            Some(glyph_id) => Some(glyph_id),
            None => return 0,
        }
    } else {
        None
    };

    if !shortest || a_glyph.is_none() {
        let ret = decompose(ctx, shortest, a);
        if ret != 0 {
            if let Some(b_glyph) = b_glyph {
                output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0));
                return ret + 1;
            }
            return ret;
        }
    }

    if let Some(a_glyph) = a_glyph {
        // Output a and b.
        output_char(ctx.buffer, u32::from(a), u32::from(a_glyph.0));
        if let Some(b_glyph) = b_glyph {
            output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0));
            return 2;
        }
        return 1;
    }

    0
}

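/// Decomposes (or otherwise maps) the character at the current buffer position
/// and advances past it, falling back to space glyphs, U+2010 for U+2011, and
/// finally .notdef when the font supports nothing better.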
fn decompose_current_character(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool) {
    let u = ctx.buffer.cur(0).as_char();
    let glyph = ctx.face.get_nominal_glyph(u32::from(u));

    // TODO: different to harfbuzz, sync
    if !shortest || glyph.is_none() {
        if decompose(ctx, shortest, u) > 0 {
            skip_char(ctx.buffer);
            return;
        }
    }

    // TODO: different to harfbuzz, sync
    if let Some(glyph) = glyph {
        next_char(ctx.buffer, u32::from(glyph.0));
        return;
    }

    if _hb_glyph_info_is_unicode_space(ctx.buffer.cur(0)) {
        let space_type = u.space_fallback();
        if space_type != hb_unicode_funcs_t::NOT_SPACE {
            let space_glyph = ctx.face.get_nominal_glyph(0x0020).or(ctx.buffer.invisible);

            if let Some(space_glyph) = space_glyph {
                _hb_glyph_info_set_unicode_space_fallback_type(ctx.buffer.cur_mut(0), space_type);
                next_char(ctx.buffer, u32::from(space_glyph.0));
                ctx.buffer.scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK;
                return;
            }
        }
    }

    // U+2011 is the only sensible character that is a no-break version of another character
    // and not a space. The space ones are handled already. Handle this lone one.
    if u == '\u{2011}' {
        if let Some(other_glyph) = ctx.face.get_nominal_glyph(0x2010) {
            next_char(ctx.buffer, u32::from(other_glyph.0));
            return;
        }
    }

    // Insert a .notdef glyph if decomposition failed.
    next_char(ctx.buffer, 0);
}

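/// Handles a cluster that contains variation selectors: tries the font's
/// variation-selector cmap first and otherwise passes the characters through
/// for GSUB to handle, without attempting any normalization.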
fn handle_variation_selector_cluster(
    ctx: &mut hb_ot_shape_normalize_context_t,
    end: usize,
    _: bool,
) {
    let face = ctx.face;

    // Currently, if there's a variation selector we give up on normalization; it's just too hard.
    let buffer = &mut ctx.buffer;
    while buffer.idx < end - 1 && buffer.successful {
        if buffer.cur(1).as_char().is_variation_selector() {
            if let Some(glyph_id) =
                face.glyph_variation_index(buffer.cur(0).as_char(), buffer.cur(1).as_char())
            {
                buffer.cur_mut(0).set_glyph_index(u32::from(glyph_id.0));
                let unicode = buffer.cur(0).glyph_id;
                buffer.replace_glyphs(2, 1, &[unicode]);
            } else {
                // Just pass on the two characters separately, let GSUB do its magic.
                set_glyph(buffer.cur_mut(0), face);
                buffer.next_glyph();

                buffer.scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_VARIATION_SELECTOR_FALLBACK;

                _hb_glyph_info_set_variation_selector(buffer.cur_mut(0), true);

                if buffer.not_found_variation_selector.is_some() {
                    _hb_glyph_info_clear_default_ignorable(buffer.cur_mut(0))
                }

                set_glyph(buffer.cur_mut(0), face);
                buffer.next_glyph();
            }

            // Skip any further variation selectors.
            while buffer.idx < end && buffer.cur(0).as_char().is_variation_selector() {
                set_glyph(buffer.cur_mut(0), face);
                buffer.next_glyph();
            }
        } else {
            set_glyph(buffer.cur_mut(0), face);
            buffer.next_glyph();
        }
    }

    if ctx.buffer.idx < end {
        set_glyph(ctx.buffer.cur_mut(0), face);
        ctx.buffer.next_glyph();
    }
}

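/// Decomposes a cluster of more than one character. Clusters containing a
/// variation selector are handed off to `handle_variation_selector_cluster`;
/// everything else is decomposed character by character.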
fn decompose_multi_char_cluster(
    ctx: &mut hb_ot_shape_normalize_context_t,
    end: usize,
    short_circuit: bool,
) {
    let mut i = ctx.buffer.idx;
    while i < end && ctx.buffer.successful {
        if ctx.buffer.info[i].as_char().is_variation_selector() {
            handle_variation_selector_cluster(ctx, end, short_circuit);
            return;
        }
        i += 1;
    }

    while ctx.buffer.idx < end && ctx.buffer.successful {
        decompose_current_character(ctx, short_circuit);
    }
}

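/// Ordering predicate for mark reordering: true if `pa` has a greater modified
/// combining class than `pb`.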
fn compare_combining_class(pa: &hb_glyph_info_t, pb: &hb_glyph_info_t) -> bool {
    let a = _hb_glyph_info_get_modified_combining_class(pa);
    let b = _hb_glyph_info_get_modified_combining_class(pb);
    a > b
}

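/// Normalizes the buffer for shaping in three rounds: decompose, reorder
/// marks, and (depending on the normalization mode) recompose, always
/// preferring forms the font actually supports.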
pub fn _hb_ot_shape_normalize(
    plan: &hb_ot_shape_plan_t,
    buffer: &mut hb_buffer_t,
    face: &hb_font_t,
) {
    if buffer.is_empty() {
        return;
    }

    let mut mode = plan.shaper.normalization_preference;
    if mode == HB_OT_SHAPE_NORMALIZATION_MODE_AUTO {
        if plan.has_gpos_mark {
            // https://github.com/harfbuzz/harfbuzz/issues/653#issuecomment-423905920
            // mode = Some(HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED);
            mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS;
        } else {
            mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS;
        }
    }

    let mut ctx = hb_ot_shape_normalize_context_t {
        plan,
        buffer,
        face,
        decompose: decompose_unicode,
        compose: compose_unicode,
    };
    ctx.override_decompose_and_compose(plan.shaper.decompose, plan.shaper.compose);

    let mut buffer = &mut ctx.buffer;

    let always_short_circuit = mode == HB_OT_SHAPE_NORMALIZATION_MODE_NONE;
    let might_short_circuit = always_short_circuit
        || (mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED
            && mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT);

    // We do a fairly straightforward yet custom normalization process in three
    // separate rounds: decompose, reorder, recompose (if desired). Currently
    // this makes two buffer swaps. We can make it faster by moving the last
    // two rounds into the inner loop for the first round, but it's more readable
    // this way.

    // First round, decompose
    let mut all_simple = true;
    {
        buffer.clear_output();
        let count = buffer.len;
        buffer.idx = 0;
        loop {
            let mut end = buffer.idx + 1;
            while end < count && !_hb_glyph_info_is_unicode_mark(&buffer.info[end]) {
                end += 1;
            }

            if end < count {
                // Leave one base for the marks to cluster with.
                end -= 1;
            }

            // From idx to end are simple clusters.
            if might_short_circuit {
                let len = end - buffer.idx;
                let mut done = 0;
                while done < len {
                    let cur = buffer.cur_mut(done);
                    cur.set_glyph_index(match face.get_nominal_glyph(cur.glyph_id) {
                        Some(glyph_id) => u32::from(glyph_id.0),
                        None => break,
                    });
                    done += 1;
                }
                buffer.next_glyphs(done);
            }

            while buffer.idx < end && buffer.successful {
                decompose_current_character(&mut ctx, might_short_circuit);
                buffer = &mut ctx.buffer;
            }

            if buffer.idx == count || !buffer.successful {
                break;
            }

            all_simple = false;

            // Find all the marks now.
            end = buffer.idx + 1;
            while end < count && _hb_glyph_info_is_unicode_mark(&buffer.info[end]) {
                end += 1;
            }

            // idx to end is one non-simple cluster.
            decompose_multi_char_cluster(&mut ctx, end, always_short_circuit);
            buffer = &mut ctx.buffer;

            if buffer.idx >= count || !buffer.successful {
                break;
            }
        }

        buffer.sync();
    }

    // Second round, reorder (inplace)
    if !all_simple {
        let count = buffer.len;
        let mut i = 0;
        while i < count {
            if _hb_glyph_info_get_modified_combining_class(&buffer.info[i]) == 0 {
                i += 1;
                continue;
            }

            let mut end = i + 1;
            while end < count && _hb_glyph_info_get_modified_combining_class(&buffer.info[end]) != 0
            {
                end += 1;
            }

            // We are going to do an O(n^2) sort. Only do this if the sequence is short.
            if end - i <= MAX_COMBINING_MARKS {
                buffer.sort(i, end, compare_combining_class);

                if let Some(reorder_marks) = ctx.plan.shaper.reorder_marks {
                    reorder_marks(ctx.plan, buffer, i, end);
                }
            }

            i = end + 1;
        }
    }
    if buffer.scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ != 0 {
        // For all CGJ, check if it prevented any reordering at all.
        // If it did NOT, then make it skippable.
        // https://github.com/harfbuzz/harfbuzz/issues/554
        for i in 1..buffer.len.saturating_sub(1) {
            if buffer.info[i].glyph_id == 0x034F
            /* CGJ */
            {
                let last = _hb_glyph_info_get_modified_combining_class(&buffer.info[i - 1]);
                let next = _hb_glyph_info_get_modified_combining_class(&buffer.info[i + 1]);
                if next == 0 || last <= next {
                    buffer.info[i].unhide();
                }
            }
        }
    }

    // Third round, recompose
    if !all_simple
        && buffer.successful
        && (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS
            || mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT)
    {
        // As noted in the comment earlier, we don't try to combine
        // ccc=0 chars with their previous Starter.

        let count = buffer.len;
        let mut starter = 0;
        buffer.clear_output();
        buffer.next_glyph();
        while buffer.idx < count && buffer.successful {
            // We don't try to compose a non-mark character with its preceding starter.
            // This is both an optimization to avoid trying to compose every two neighboring
            // glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul
            // fonts are not designed to mix-and-match pre-composed syllables and Jamo.
            let cur = buffer.cur(0);
            if _hb_glyph_info_is_unicode_mark(cur) &&
                // If there's anything between the starter and this char, they should have CCC
                // smaller than this character's.
                (starter == buffer.out_len - 1
                    || _hb_glyph_info_get_modified_combining_class(buffer.prev()) < _hb_glyph_info_get_modified_combining_class(cur))
            {
                let a = buffer.out_info()[starter].as_char();
                let b = cur.as_char();
                if let Some(composed) = (ctx.compose)(&ctx, a, b) {
                    if let Some(glyph_id) = face.get_nominal_glyph(u32::from(composed)) {
                        // Copy to out-buffer.
                        buffer = &mut ctx.buffer;
                        buffer.next_glyph();
                        if !buffer.successful {
                            return;
                        }

                        // Merge and remove the second composable.
                        buffer.merge_out_clusters(starter, buffer.out_len);
                        buffer.out_len -= 1;

                        // Modify starter and carry on.
                        let mut flags = buffer.scratch_flags;
                        let info = &mut buffer.out_info_mut()[starter];
                        info.glyph_id = u32::from(composed);
                        info.set_glyph_index(u32::from(glyph_id.0));
                        info.init_unicode_props(&mut flags);
                        buffer.scratch_flags = flags;

                        continue;
                    }
                }
            }

            // Blocked, or doesn't compose.
            buffer = &mut ctx.buffer;
            buffer.next_glyph();

            if _hb_glyph_info_get_modified_combining_class(buffer.prev()) == 0 {
                starter = buffer.out_len - 1;
            }
        }

        buffer.sync();
    }
}