1use crate::buffer::{Buffer, BufferScratchFlags, GlyphInfo};
2use crate::complex::MAX_COMBINING_MARKS;
3use crate::plan::ShapePlan;
4use crate::unicode::{CharExt, GeneralCategory};
5use crate::Face;
6
// HIGH-LEVEL DESIGN:
8//
9// This file exports one main function: normalize().
10//
11// This function closely reflects the Unicode Normalization Algorithm,
12// yet it's different.
13//
14// Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC).
15// The logic however tries to use whatever the font can support.
16//
17// In general what happens is that: each grapheme is decomposed in a chain
18// of 1:2 decompositions, marks reordered, and then recomposed if desired,
19// so far it's like Unicode Normalization. However, the decomposition and
20// recomposition only happens if the font supports the resulting characters.
21//
22// The goals are:
23//
// - Try to render all canonically equivalent strings similarly. To really
//   achieve this we would have to always do the full decomposition and then
//   selectively recompose from there. That is too expensive, though, so
//   we skip some cases. For example, if composed is desired, we simply
//   don't touch 1-character clusters that are supported by the font, even
//   though their NFC may be different.
30//
31// - When a font has a precomposed character for a sequence but the 'ccmp'
32// feature in the font is not adequate, use the precomposed character
33// which typically has better mark positioning.
34//
35// - When a font does not support a combining mark, but supports it precomposed
36// with previous base, use that. This needs the itemizer to have this
37// knowledge too. We need to provide assistance to the itemizer.
38//
39// - When a font does not support a character but supports its canonical
40// decomposition, well, use the decomposition.
41//
42// - The complex shapers can customize the compose and decompose functions to
43// offload some of their requirements to the normalizer. For example, the
44// Indic shaper may want to disallow recomposing of two matras.
45
/// State shared by `normalize` and its decomposition helpers.
pub struct ShapeNormalizeContext<'a> {
    /// Shape plan; `normalize` reads the shaper's normalization mode and its
    /// optional `decompose` / `compose` / `reorder_marks` callbacks from it.
    pub plan: &'a ShapePlan,
    /// The buffer being normalized.
    pub buffer: &'a mut Buffer,
    /// Face used to look up glyphs for characters.
    pub face: &'a Face<'a>,
    /// Splits a character into a pair, or returns `None` when it does not
    /// decompose. A second character of `'\0'` is treated by the caller as
    /// "no second character". `normalize` fills this with the shaper's
    /// callback, falling back to `crate::unicode::decompose`.
    pub decompose: fn(&ShapeNormalizeContext, char) -> Option<(char, char)>,
    /// Combines two characters into one, or returns `None` when they do not
    /// compose. `normalize` fills this with the shaper's callback, falling
    /// back to `crate::unicode::compose`.
    pub compose: fn(&ShapeNormalizeContext, char, char) -> Option<char>,
}
53
/// How a shaper wants `normalize` to treat decomposition and recomposition.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum ShapeNormalizationMode {
    /// Decompose and reorder only; the recompose round is skipped.
    #[allow(dead_code)]
    Decomposed,
    /// Recompose marks onto their starter. Never composes base-to-base.
    ComposedDiacritics,
    /// Always fully decomposes and then recomposes back.
    ComposedDiacriticsNoShortCircuit,
    /// Let `normalize` pick; it currently resolves this to
    /// `ComposedDiacritics`.
    Auto,
}

impl Default for ShapeNormalizationMode {
    /// The default mode is [`ShapeNormalizationMode::Auto`].
    fn default() -> Self {
        ShapeNormalizationMode::Auto
    }
}
70
/// Normalizes the contents of `buffer` in place for shaping with `face`.
///
/// Does nothing for an empty buffer. Otherwise runs up to three rounds:
///
/// 1. **Decompose** — walks the buffer, assigning glyph indices from `face`
///    and decomposing characters via the helpers in this file. The result is
///    built in the output buffer and swapped in.
/// 2. **Reorder** — sorts each run of items with a non-zero modified
///    combining class (only if round one met a non-simple cluster, and only
///    for runs of at most `MAX_COMBINING_MARKS` items), then calls the
///    shaper's `reorder_marks` callback if it has one.
/// 3. **Recompose** — only for the `ComposedDiacritics` and
///    `ComposedDiacriticsNoShortCircuit` modes: tries to compose each mark
///    with the preceding starter, accepting a composition only when `face`
///    has a glyph for it.
///
/// The normalization mode and the compose/decompose callbacks come from
/// `plan.shaper`; `Auto` is resolved to `ComposedDiacritics`.
pub fn normalize(plan: &ShapePlan, face: &Face, buffer: &mut Buffer) {
    if buffer.is_empty() {
        return;
    }

    let mut mode = plan.shaper.normalization_mode;
    if mode == Some(ShapeNormalizationMode::Auto) {
        // https://github.com/harfbuzz/harfbuzz/issues/653#issuecomment-423905920
        // if plan.has_gpos_mark() {
        //     mode = ShapeNormalizationMode::Decomposed;
        // }
        mode = Some(ShapeNormalizationMode::ComposedDiacritics);
    }

    // Use the shaper's callbacks when present, the generic Unicode ones otherwise.
    let decompose = plan
        .shaper
        .decompose
        .unwrap_or(|_, ab| crate::unicode::decompose(ab));
    let compose = plan
        .shaper
        .compose
        .unwrap_or(|_, a, b| crate::unicode::compose(a, b));
    let mut ctx = ShapeNormalizeContext {
        plan,
        buffer,
        face,
        decompose,
        compose,
    };
    // `buffer` is re-borrowed from `ctx` after every call that takes `ctx`.
    let mut buffer = &mut ctx.buffer;

    // No mode at all: always short-circuit. `Decomposed` and
    // `ComposedDiacriticsNoShortCircuit` never short-circuit the simple path.
    let always_short_circuit = mode.is_none();
    let might_short_circuit = always_short_circuit
        || !matches!(
            mode,
            Some(ShapeNormalizationMode::Decomposed)
                | Some(ShapeNormalizationMode::ComposedDiacriticsNoShortCircuit)
        );

    // We do a fairly straightforward yet custom normalization process in three
    // separate rounds: decompose, reorder, recompose (if desired). Currently
    // this makes two buffer swaps. We can make it faster by moving the last
    // two rounds into the inner loop for the first round, but it's more readable
    // this way.

    // First round, decompose
    let mut all_simple = true;
    {
        let count = buffer.len;
        buffer.idx = 0;
        buffer.clear_output();
        loop {
            // Scan forward to the next mark (or the end of the buffer).
            let mut end = buffer.idx + 1;
            while end < count && !buffer.info[end].is_unicode_mark() {
                end += 1;
            }

            if end < count {
                // Leave one base for the marks to cluster with.
                end -= 1;
            }

            // From idx to end are simple clusters.
            if might_short_circuit {
                // Map as many leading characters as the face supports directly,
                // stopping at the first one without a glyph.
                let len = end - buffer.idx;
                let mut done = 0;
                while done < len {
                    let cur = buffer.cur_mut(done);
                    cur.set_glyph_index(match face.glyph_index(cur.glyph_id) {
                        Some(glyph_id) => u32::from(glyph_id.0),
                        None => break,
                    });
                    done += 1;
                }
                buffer.next_glyphs(done);
            }

            // Whatever is left of the simple run goes through per-character handling.
            while buffer.idx < end && buffer.successful {
                decompose_current_character(&mut ctx, might_short_circuit);
                buffer = &mut ctx.buffer;
            }

            if buffer.idx == count || !buffer.successful {
                break;
            }

            // Reaching this point means a mark follows: the input is not all simple.
            all_simple = false;

            // Find all the marks now.
            end = buffer.idx + 1;
            while end < count && buffer.info[end].is_unicode_mark() {
                end += 1;
            }

            // idx to end is one non-simple cluster.
            decompose_multi_char_cluster(&mut ctx, end, always_short_circuit);
            buffer = &mut ctx.buffer;

            if buffer.idx >= count || !buffer.successful {
                break;
            }
        }

        buffer.swap_buffers();
    }

    // Second round, reorder (inplace)
    if !all_simple {
        let count = buffer.len;
        let mut i = 0;
        while i < count {
            if buffer.info[i].modified_combining_class() == 0 {
                i += 1;
                continue;
            }

            // [i, end) is a maximal run of items with non-zero combining class.
            let mut end = i + 1;
            while end < count && buffer.info[end].modified_combining_class() != 0 {
                end += 1;
            }

            // We are going to do a O(n^2). Only do this if the sequence is short.
            if end - i <= MAX_COMBINING_MARKS {
                buffer.sort(i, end, |a, b| {
                    a.modified_combining_class() > b.modified_combining_class()
                });

                if let Some(reorder_marks) = ctx.plan.shaper.reorder_marks {
                    reorder_marks(ctx.plan, buffer, i, end);
                }
            }

            // The item at `end` (if any) has combining class 0, so it can be skipped too.
            i = end + 1;
        }
    }
    if buffer.scratch_flags.contains(BufferScratchFlags::HAS_CGJ) {
        // For all CGJ, check if it prevented any reordering at all.
        // If it did NOT, then make it skippable.
        // https://github.com/harfbuzz/harfbuzz/issues/554
        for i in 1..buffer.len.saturating_sub(1) {
            if buffer.info[i].glyph_id == 0x034F
            /* CGJ */
            {
                let last = buffer.info[i - 1].modified_combining_class();
                let next = buffer.info[i + 1].modified_combining_class();
                if next == 0 || last <= next {
                    buffer.info[i].unhide();
                }
            }
        }
    }

    // Third round, recompose
    if !all_simple
        && buffer.successful
        && matches!(
            mode,
            Some(ShapeNormalizationMode::ComposedDiacritics)
                | Some(ShapeNormalizationMode::ComposedDiacriticsNoShortCircuit)
        )
    {
        // As noted in the comment earlier, we don't try to combine
        // ccc=0 chars with their previous Starter.

        let count = buffer.len;
        // Index, in the output buffer, of the most recent starter.
        let mut starter = 0;
        buffer.clear_output();
        buffer.next_glyph();
        while buffer.idx < count && buffer.successful {
            // We don't try to compose a non-mark character with its preceding starter.
            // This is both an optimization to avoid trying to compose every two neighboring
            // glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul
            // fonts are not designed to mix-and-match pre-composed syllables and Jamo.
            let cur = buffer.cur(0);
            if cur.is_unicode_mark() &&
                // If there's anything between the starter and this char, they should have CCC
                // smaller than this character's.
                (starter == buffer.out_len - 1
                    || buffer.prev().modified_combining_class() < cur.modified_combining_class())
            {
                let a = buffer.out_info()[starter].as_char();
                let b = cur.as_char();
                if let Some(composed) = (ctx.compose)(&ctx, a, b) {
                    // Only accept the composition if the face has a glyph for it.
                    if let Some(glyph_id) = face.glyph_index(u32::from(composed)) {
                        // Copy to out-buffer.
                        buffer = &mut ctx.buffer;
                        buffer.next_glyph();
                        if !buffer.successful {
                            return;
                        }

                        // Merge and remove the second composable.
                        buffer.merge_out_clusters(starter, buffer.out_len);
                        buffer.out_len -= 1;

                        // Modify starter and carry on.
                        // The flags are copied out and written back so they can be
                        // updated while `info` holds a mutable borrow of the buffer.
                        let mut flags = buffer.scratch_flags;
                        let info = &mut buffer.out_info_mut()[starter];
                        info.glyph_id = u32::from(composed);
                        info.set_glyph_index(u32::from(glyph_id.0));
                        info.init_unicode_props(&mut flags);
                        buffer.scratch_flags = flags;

                        continue;
                    }
                }
            }

            // Blocked, or doesn't compose.
            buffer = &mut ctx.buffer;
            buffer.next_glyph();

            // An item with combining class 0 becomes the new starter.
            if buffer.prev().modified_combining_class() == 0 {
                starter = buffer.out_len - 1;
            }
        }

        buffer.swap_buffers();
    }
}
291
292fn decompose_multi_char_cluster(ctx: &mut ShapeNormalizeContext, end: usize, short_circuit: bool) {
293 let mut i: usize = ctx.buffer.idx;
294 while i < end && ctx.buffer.successful {
295 if ctx.buffer.info[i].as_char().is_variation_selector() {
296 handle_variation_selector_cluster(ctx, end, short_circuit);
297 return;
298 }
299 i += 1;
300 }
301
302 while ctx.buffer.idx < end && ctx.buffer.successful {
303 decompose_current_character(ctx, shortest:short_circuit);
304 }
305}
306
307fn handle_variation_selector_cluster(ctx: &mut ShapeNormalizeContext, end: usize, _: bool) {
308 let face = ctx.face;
309 let set_glyph = |info: &mut GlyphInfo| {
310 if let Some(glyph_id) = face.glyph_index(info.glyph_id) {
311 info.set_glyph_index(u32::from(glyph_id.0));
312 }
313 };
314
315 // TODO: Currently if there's a variation-selector we give-up, it's just too hard.
316 let buffer = &mut ctx.buffer;
317 while buffer.idx < end - 1 && buffer.successful {
318 if buffer.cur(1).as_char().is_variation_selector() {
319 if let Some(glyph_id) =
320 face.glyph_variation_index(buffer.cur(0).as_char(), buffer.cur(1).as_char())
321 {
322 buffer.cur_mut(0).set_glyph_index(u32::from(glyph_id.0));
323 let unicode = buffer.cur(0).glyph_id;
324 buffer.replace_glyphs(2, 1, &[unicode]);
325 } else {
326 // Just pass on the two characters separately, let GSUB do its magic.
327 set_glyph(buffer.cur_mut(0));
328 buffer.next_glyph();
329 set_glyph(buffer.cur_mut(0));
330 buffer.next_glyph();
331 }
332
333 // Skip any further variation selectors.
334 while buffer.idx < end && buffer.cur(0).as_char().is_variation_selector() {
335 set_glyph(buffer.cur_mut(0));
336 buffer.next_glyph();
337 }
338 } else {
339 set_glyph(buffer.cur_mut(0));
340 buffer.next_glyph();
341 }
342 }
343
344 if ctx.buffer.idx < end {
345 set_glyph(ctx.buffer.cur_mut(0));
346 ctx.buffer.next_glyph();
347 }
348}
349
350fn decompose_current_character(ctx: &mut ShapeNormalizeContext, shortest: bool) {
351 let u = ctx.buffer.cur(0).as_char();
352 let glyph = ctx.face.glyph_index(u32::from(u));
353
354 if !shortest || glyph.is_none() {
355 if decompose(ctx, shortest, u) > 0 {
356 ctx.buffer.skip_glyph();
357 return;
358 }
359 }
360
361 if let Some(glyph) = glyph {
362 ctx.buffer.next_char(u32::from(glyph.0));
363 return;
364 }
365
366 // Handle space characters.
367 if ctx.buffer.cur(0).general_category() == GeneralCategory::SpaceSeparator {
368 if let Some(space_type) = u.space_fallback() {
369 if let Some(space_glyph) = ctx.face.glyph_index(u32::from(' ')) {
370 ctx.buffer.cur_mut(0).set_space_fallback(space_type);
371 ctx.buffer.next_char(u32::from(space_glyph.0));
372 ctx.buffer.scratch_flags |= BufferScratchFlags::HAS_SPACE_FALLBACK;
373 return;
374 }
375 }
376 }
377
378 // U+2011 is the only sensible character that is a no-break version of another character
379 // and not a space. The space ones are handled already. Handle this lone one.
380 if u == '\u{2011}' {
381 if let Some(other_glyph) = ctx.face.glyph_index(0x2010) {
382 ctx.buffer.next_char(u32::from(other_glyph.0));
383 return;
384 }
385 }
386
387 // Insert a .notdef glyph if decomposition failed.
388 ctx.buffer.next_char(0);
389}
390
391/// Returns 0 if didn't decompose, number of resulting characters otherwise.
392fn decompose(ctx: &mut ShapeNormalizeContext, shortest: bool, ab: char) -> u32 {
393 let (a, b) = match (ctx.decompose)(ctx, ab) {
394 Some(decomposed) => decomposed,
395 _ => return 0,
396 };
397
398 let a_glyph = ctx.face.glyph_index(u32::from(a));
399 let b_glyph = if b != '\0' {
400 match ctx.face.glyph_index(u32::from(b)) {
401 Some(glyph_id) => Some(glyph_id),
402 None => return 0,
403 }
404 } else {
405 None
406 };
407
408 if !shortest || a_glyph.is_none() {
409 let ret = decompose(ctx, shortest, a);
410 if ret != 0 {
411 if let Some(b_glyph) = b_glyph {
412 ctx.buffer.output_char(u32::from(b), u32::from(b_glyph.0));
413 return ret + 1;
414 }
415 return ret;
416 }
417 }
418
419 if let Some(a_glyph) = a_glyph {
420 // Output a and b.
421 ctx.buffer.output_char(u32::from(a), u32::from(a_glyph.0));
422 if let Some(b_glyph) = b_glyph {
423 ctx.buffer.output_char(u32::from(b), u32::from(b_glyph.0));
424 return 2;
425 }
426 return 1;
427 }
428
429 0
430}
431