1 | use super::buffer::*; |
2 | use super::common::hb_codepoint_t; |
3 | use super::hb_font_t; |
4 | use super::ot_layout::*; |
5 | use super::ot_shape_plan::hb_ot_shape_plan_t; |
6 | use super::ot_shaper::{ComposeFn, DecomposeFn, MAX_COMBINING_MARKS}; |
7 | use super::unicode::{hb_unicode_funcs_t, CharExt}; |
8 | |
9 | pub struct hb_ot_shape_normalize_context_t<'a> { |
10 | pub plan: &'a hb_ot_shape_plan_t, |
11 | pub buffer: &'a mut hb_buffer_t, |
12 | pub face: &'a hb_font_t<'a>, |
13 | pub decompose: DecomposeFn, |
14 | pub compose: ComposeFn, |
15 | } |
16 | |
17 | impl hb_ot_shape_normalize_context_t<'_> { |
18 | pub(crate) fn override_decompose_and_compose( |
19 | &mut self, |
20 | decompose: Option<DecomposeFn>, |
21 | compose: Option<ComposeFn>, |
22 | ) { |
23 | if let Some(decompose: fn(&hb_ot_shape_normalize_context_t<'_>, …) -> …) = decompose { |
24 | self.decompose = decompose; |
25 | } |
26 | |
27 | if let Some(compose: fn(&hb_ot_shape_normalize_context_t<'_>, …) -> …) = compose { |
28 | self.compose = compose; |
29 | } |
30 | } |
31 | } |
32 | |
/// Normalization strategy requested by a shaper; holds one of the
/// `HB_OT_SHAPE_NORMALIZATION_MODE_*` values.
pub type hb_ot_shape_normalization_mode_t = i32;
/// No normalization preference.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_NONE: hb_ot_shape_normalization_mode_t = 0;
/// Prefer decomposed output.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED: hb_ot_shape_normalization_mode_t = 1;
/// Never composes base-to-base.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS: hb_ot_shape_normalization_mode_t = 2;
/// Always fully decomposes and then recompose back.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT:
    hb_ot_shape_normalization_mode_t = 3;
/// See hb-ot-shape-normalize.cc for logic.
pub const HB_OT_SHAPE_NORMALIZATION_MODE_AUTO: hb_ot_shape_normalization_mode_t = 4;
/// Default mode; an alias for `HB_OT_SHAPE_NORMALIZATION_MODE_AUTO`.
// NOTE(review): presumably not referenced elsewhere, hence the lint allowance.
#[allow(dead_code)]
pub const HB_OT_SHAPE_NORMALIZATION_MODE_DEFAULT: hb_ot_shape_normalization_mode_t =
    HB_OT_SHAPE_NORMALIZATION_MODE_AUTO;
41 | |
42 | // HIGHLEVEL DESIGN: |
43 | // |
// This file exports one main function: _hb_ot_shape_normalize().
45 | // |
46 | // This function closely reflects the Unicode Normalization Algorithm, |
47 | // yet it's different. |
48 | // |
49 | // Each shaper specifies whether it prefers decomposed (NFD) or composed (NFC). |
50 | // The logic however tries to use whatever the font can support. |
51 | // |
52 | // In general what happens is that: each grapheme is decomposed in a chain |
53 | // of 1:2 decompositions, marks reordered, and then recomposed if desired, |
54 | // so far it's like Unicode Normalization. However, the decomposition and |
55 | // recomposition only happens if the font supports the resulting characters. |
56 | // |
57 | // The goals are: |
58 | // |
59 | // - Try to render all canonically equivalent strings similarly. To really |
60 | // achieve this we have to always do the full decomposition and then |
61 | // selectively recompose from there. It's kinda too expensive though, so |
62 | // we skip some cases. For example, if composed is desired, we simply |
63 | // don't touch 1-character clusters that are supported by the font, even |
64 | // though their NFC may be different. |
65 | // |
66 | // - When a font has a precomposed character for a sequence but the 'ccmp' |
67 | // feature in the font is not adequate, use the precomposed character |
68 | // which typically has better mark positioning. |
69 | // |
70 | // - When a font does not support a combining mark, but supports it precomposed |
71 | // with previous base, use that. This needs the itemizer to have this |
72 | // knowledge too. We need to provide assistance to the itemizer. |
73 | // |
74 | // - When a font does not support a character but supports its canonical |
75 | // decomposition, well, use the decomposition. |
76 | // |
77 | // - The shapers can customize the compose and decompose functions to |
78 | // offload some of their requirements to the normalizer. For example, the |
79 | // Indic shaper may want to disallow recomposing of two matras. |
80 | |
81 | fn decompose_unicode( |
82 | _: &hb_ot_shape_normalize_context_t, |
83 | ab: hb_codepoint_t, |
84 | ) -> Option<(hb_codepoint_t, hb_codepoint_t)> { |
85 | super::unicode::decompose(ab) |
86 | } |
87 | |
88 | fn compose_unicode( |
89 | _: &hb_ot_shape_normalize_context_t, |
90 | a: hb_codepoint_t, |
91 | b: hb_codepoint_t, |
92 | ) -> Option<hb_codepoint_t> { |
93 | super::unicode::compose(a, b) |
94 | } |
95 | |
96 | fn set_glyph(info: &mut hb_glyph_info_t, font: &hb_font_t) { |
97 | if let Some(glyph_id: GlyphId) = font.get_nominal_glyph(info.glyph_id) { |
98 | info.set_glyph_index(u32::from(glyph_id.0)); |
99 | } |
100 | } |
101 | |
102 | fn output_char(buffer: &mut hb_buffer_t, unichar: u32, glyph: u32) { |
103 | // This is very confusing indeed. |
104 | buffer.cur_mut(0).set_glyph_index(glyph); |
105 | buffer.output_glyph(glyph_index:unichar); |
106 | // TODO: should be _hb_glyph_info_set_unicode_props (&buffer->prev(), buffer); |
107 | let mut flags: u32 = buffer.scratch_flags; |
108 | buffer.prev_mut().init_unicode_props(&mut flags); |
109 | buffer.scratch_flags = flags; |
110 | } |
111 | |
112 | fn next_char(buffer: &mut hb_buffer_t, glyph: u32) { |
113 | buffer.cur_mut(0).set_glyph_index(glyph); |
114 | buffer.next_glyph(); |
115 | } |
116 | |
117 | fn skip_char(buffer: &mut hb_buffer_t) { |
118 | buffer.skip_glyph(); |
119 | } |
120 | |
121 | /// Returns 0 if didn't decompose, number of resulting characters otherwise. |
122 | fn decompose(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool, ab: hb_codepoint_t) -> u32 { |
123 | let (a, b) = match (ctx.decompose)(ctx, ab) { |
124 | Some(decomposed) => decomposed, |
125 | _ => return 0, |
126 | }; |
127 | |
128 | let a_glyph = ctx.face.get_nominal_glyph(u32::from(a)); |
129 | let b_glyph = if b != ' \0' { |
130 | match ctx.face.get_nominal_glyph(u32::from(b)) { |
131 | Some(glyph_id) => Some(glyph_id), |
132 | None => return 0, |
133 | } |
134 | } else { |
135 | None |
136 | }; |
137 | |
138 | if !shortest || a_glyph.is_none() { |
139 | let ret = decompose(ctx, shortest, a); |
140 | if ret != 0 { |
141 | if let Some(b_glyph) = b_glyph { |
142 | output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0)); |
143 | return ret + 1; |
144 | } |
145 | return ret; |
146 | } |
147 | } |
148 | |
149 | if let Some(a_glyph) = a_glyph { |
150 | // Output a and b. |
151 | output_char(ctx.buffer, u32::from(a), u32::from(a_glyph.0)); |
152 | if let Some(b_glyph) = b_glyph { |
153 | output_char(ctx.buffer, u32::from(b), u32::from(b_glyph.0)); |
154 | return 2; |
155 | } |
156 | return 1; |
157 | } |
158 | |
159 | 0 |
160 | } |
161 | |
162 | fn decompose_current_character(ctx: &mut hb_ot_shape_normalize_context_t, shortest: bool) { |
163 | let u = ctx.buffer.cur(0).as_char(); |
164 | let glyph = ctx.face.get_nominal_glyph(u32::from(u)); |
165 | |
166 | // TODO: different to harfbuzz, sync |
167 | if !shortest || glyph.is_none() { |
168 | if decompose(ctx, shortest, u) > 0 { |
169 | skip_char(ctx.buffer); |
170 | return; |
171 | } |
172 | } |
173 | |
174 | // TODO: different to harfbuzz, sync |
175 | if let Some(glyph) = glyph { |
176 | next_char(ctx.buffer, u32::from(glyph.0)); |
177 | return; |
178 | } |
179 | |
180 | if _hb_glyph_info_is_unicode_space(ctx.buffer.cur(0)) { |
181 | let space_type = u.space_fallback(); |
182 | if space_type != hb_unicode_funcs_t::NOT_SPACE { |
183 | let space_glyph = ctx.face.get_nominal_glyph(0x0020).or(ctx.buffer.invisible); |
184 | |
185 | if let Some(space_glyph) = space_glyph { |
186 | _hb_glyph_info_set_unicode_space_fallback_type(ctx.buffer.cur_mut(0), space_type); |
187 | next_char(ctx.buffer, u32::from(space_glyph.0)); |
188 | ctx.buffer.scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_SPACE_FALLBACK; |
189 | return; |
190 | } |
191 | } |
192 | } |
193 | |
194 | // U+2011 is the only sensible character that is a no-break version of another character |
195 | // and not a space. The space ones are handled already. Handle this lone one. |
196 | if u == ' \u{2011}' { |
197 | if let Some(other_glyph) = ctx.face.get_nominal_glyph(0x2010) { |
198 | next_char(ctx.buffer, u32::from(other_glyph.0)); |
199 | return; |
200 | } |
201 | } |
202 | |
203 | // Insert a .notdef glyph if decomposition failed. |
204 | next_char(ctx.buffer, 0); |
205 | } |
206 | |
207 | fn handle_variation_selector_cluster( |
208 | ctx: &mut hb_ot_shape_normalize_context_t, |
209 | end: usize, |
210 | _: bool, |
211 | ) { |
212 | let face = ctx.face; |
213 | |
214 | // Currently if there's a variation-selector we give-up on normalization, it's just too hard. |
215 | let buffer = &mut ctx.buffer; |
216 | while buffer.idx < end - 1 && buffer.successful { |
217 | if buffer.cur(1).as_char().is_variation_selector() { |
218 | if let Some(glyph_id) = |
219 | face.glyph_variation_index(buffer.cur(0).as_char(), buffer.cur(1).as_char()) |
220 | { |
221 | buffer.cur_mut(0).set_glyph_index(u32::from(glyph_id.0)); |
222 | let unicode = buffer.cur(0).glyph_id; |
223 | buffer.replace_glyphs(2, 1, &[unicode]); |
224 | } else { |
225 | // Just pass on the two characters separately, let GSUB do its magic. |
226 | set_glyph(buffer.cur_mut(0), face); |
227 | buffer.next_glyph(); |
228 | |
229 | buffer.scratch_flags |= HB_BUFFER_SCRATCH_FLAG_HAS_VARIATION_SELECTOR_FALLBACK; |
230 | |
231 | _hb_glyph_info_set_variation_selector(buffer.cur_mut(0), true); |
232 | |
233 | if buffer.not_found_variation_selector.is_some() { |
234 | _hb_glyph_info_clear_default_ignorable(buffer.cur_mut(0)) |
235 | } |
236 | |
237 | set_glyph(buffer.cur_mut(0), face); |
238 | buffer.next_glyph(); |
239 | } |
240 | |
241 | // Skip any further variation selectors. |
242 | while buffer.idx < end && buffer.cur(0).as_char().is_variation_selector() { |
243 | set_glyph(buffer.cur_mut(0), face); |
244 | buffer.next_glyph(); |
245 | } |
246 | } else { |
247 | set_glyph(buffer.cur_mut(0), face); |
248 | buffer.next_glyph(); |
249 | } |
250 | } |
251 | |
252 | if ctx.buffer.idx < end { |
253 | set_glyph(ctx.buffer.cur_mut(0), face); |
254 | ctx.buffer.next_glyph(); |
255 | } |
256 | } |
257 | |
258 | fn decompose_multi_char_cluster( |
259 | ctx: &mut hb_ot_shape_normalize_context_t, |
260 | end: usize, |
261 | short_circuit: bool, |
262 | ) { |
263 | let mut i: usize = ctx.buffer.idx; |
264 | while i < end && ctx.buffer.successful { |
265 | if ctx.buffer.info[i].as_char().is_variation_selector() { |
266 | handle_variation_selector_cluster(ctx, end, short_circuit); |
267 | return; |
268 | } |
269 | i += 1; |
270 | } |
271 | |
272 | while ctx.buffer.idx < end && ctx.buffer.successful { |
273 | decompose_current_character(ctx, shortest:short_circuit); |
274 | } |
275 | } |
276 | |
277 | fn compare_combining_class(pa: &hb_glyph_info_t, pb: &hb_glyph_info_t) -> bool { |
278 | let a: u8 = _hb_glyph_info_get_modified_combining_class(info:pa); |
279 | let b: u8 = _hb_glyph_info_get_modified_combining_class(info:pb); |
280 | a > b |
281 | } |
282 | |
283 | pub fn _hb_ot_shape_normalize( |
284 | plan: &hb_ot_shape_plan_t, |
285 | buffer: &mut hb_buffer_t, |
286 | face: &hb_font_t, |
287 | ) { |
288 | if buffer.is_empty() { |
289 | return; |
290 | } |
291 | |
292 | let mut mode = plan.shaper.normalization_preference; |
293 | if mode == HB_OT_SHAPE_NORMALIZATION_MODE_AUTO { |
294 | if plan.has_gpos_mark { |
295 | // https://github.com/harfbuzz/harfbuzz/issues/653#issuecomment-423905920 |
296 | // mode = Some(HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED); |
297 | mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS; |
298 | } else { |
299 | mode = HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS; |
300 | } |
301 | } |
302 | |
303 | let mut ctx = hb_ot_shape_normalize_context_t { |
304 | plan, |
305 | buffer, |
306 | face, |
307 | decompose: decompose_unicode, |
308 | compose: compose_unicode, |
309 | }; |
310 | ctx.override_decompose_and_compose(plan.shaper.decompose, plan.shaper.compose); |
311 | |
312 | let mut buffer = &mut ctx.buffer; |
313 | |
314 | let always_short_circuit = mode == HB_OT_SHAPE_NORMALIZATION_MODE_NONE; |
315 | let might_short_circuit = always_short_circuit |
316 | || (mode != HB_OT_SHAPE_NORMALIZATION_MODE_DECOMPOSED |
317 | && mode != HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT); |
318 | |
319 | // We do a fairly straightforward yet custom normalization process in three |
320 | // separate rounds: decompose, reorder, recompose (if desired). Currently |
321 | // this makes two buffer swaps. We can make it faster by moving the last |
322 | // two rounds into the inner loop for the first round, but it's more readable |
323 | // this way. |
324 | |
325 | // First round, decompose |
326 | let mut all_simple = true; |
327 | { |
328 | buffer.clear_output(); |
329 | let count = buffer.len; |
330 | buffer.idx = 0; |
331 | loop { |
332 | let mut end = buffer.idx + 1; |
333 | while end < count && !_hb_glyph_info_is_unicode_mark(&buffer.info[end]) { |
334 | end += 1; |
335 | } |
336 | |
337 | if end < count { |
338 | // Leave one base for the marks to cluster with. |
339 | end -= 1; |
340 | } |
341 | |
342 | // From idx to end are simple clusters. |
343 | if might_short_circuit { |
344 | let len = end - buffer.idx; |
345 | let mut done = 0; |
346 | while done < len { |
347 | let cur = buffer.cur_mut(done); |
348 | cur.set_glyph_index(match face.get_nominal_glyph(cur.glyph_id) { |
349 | Some(glyph_id) => u32::from(glyph_id.0), |
350 | None => break, |
351 | }); |
352 | done += 1; |
353 | } |
354 | buffer.next_glyphs(done); |
355 | } |
356 | |
357 | while buffer.idx < end && buffer.successful { |
358 | decompose_current_character(&mut ctx, might_short_circuit); |
359 | buffer = &mut ctx.buffer; |
360 | } |
361 | |
362 | if buffer.idx == count || !buffer.successful { |
363 | break; |
364 | } |
365 | |
366 | all_simple = false; |
367 | |
368 | // Find all the marks now. |
369 | end = buffer.idx + 1; |
370 | while end < count && _hb_glyph_info_is_unicode_mark(&buffer.info[end]) { |
371 | end += 1; |
372 | } |
373 | |
374 | // idx to end is one non-simple cluster. |
375 | decompose_multi_char_cluster(&mut ctx, end, always_short_circuit); |
376 | buffer = &mut ctx.buffer; |
377 | |
378 | if buffer.idx >= count || !buffer.successful { |
379 | break; |
380 | } |
381 | } |
382 | |
383 | buffer.sync(); |
384 | } |
385 | |
386 | // Second round, reorder (inplace) |
387 | if !all_simple { |
388 | let count = buffer.len; |
389 | let mut i = 0; |
390 | while i < count { |
391 | if _hb_glyph_info_get_modified_combining_class(&buffer.info[i]) == 0 { |
392 | i += 1; |
393 | continue; |
394 | } |
395 | |
396 | let mut end = i + 1; |
397 | while end < count && _hb_glyph_info_get_modified_combining_class(&buffer.info[end]) != 0 |
398 | { |
399 | end += 1; |
400 | } |
401 | |
402 | // We are going to do a O(n^2). Only do this if the sequence is short. |
403 | if end - i <= MAX_COMBINING_MARKS { |
404 | buffer.sort(i, end, compare_combining_class); |
405 | |
406 | if let Some(reorder_marks) = ctx.plan.shaper.reorder_marks { |
407 | reorder_marks(ctx.plan, buffer, i, end); |
408 | } |
409 | } |
410 | |
411 | i = end + 1; |
412 | } |
413 | } |
414 | if buffer.scratch_flags & HB_BUFFER_SCRATCH_FLAG_HAS_CGJ != 0 { |
415 | // For all CGJ, check if it prevented any reordering at all. |
416 | // If it did NOT, then make it skippable. |
417 | // https://github.com/harfbuzz/harfbuzz/issues/554 |
418 | for i in 1..buffer.len.saturating_sub(1) { |
419 | if buffer.info[i].glyph_id == 0x034F |
420 | /* CGJ */ |
421 | { |
422 | let last = _hb_glyph_info_get_modified_combining_class(&buffer.info[i - 1]); |
423 | let next = _hb_glyph_info_get_modified_combining_class(&buffer.info[i + 1]); |
424 | if next == 0 || last <= next { |
425 | buffer.info[i].unhide(); |
426 | } |
427 | } |
428 | } |
429 | } |
430 | |
431 | // Third round, recompose |
432 | if !all_simple |
433 | && buffer.successful |
434 | && (mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS |
435 | || mode == HB_OT_SHAPE_NORMALIZATION_MODE_COMPOSED_DIACRITICS_NO_SHORT_CIRCUIT) |
436 | { |
437 | // As noted in the comment earlier, we don't try to combine |
438 | // ccc=0 chars with their previous Starter. |
439 | |
440 | let count = buffer.len; |
441 | let mut starter = 0; |
442 | buffer.clear_output(); |
443 | buffer.next_glyph(); |
444 | while buffer.idx < count && buffer.successful { |
445 | // We don't try to compose a non-mark character with it's preceding starter. |
446 | // This is both an optimization to avoid trying to compose every two neighboring |
447 | // glyphs in most scripts AND a desired feature for Hangul. Apparently Hangul |
448 | // fonts are not designed to mix-and-match pre-composed syllables and Jamo. |
449 | let cur = buffer.cur(0); |
450 | if _hb_glyph_info_is_unicode_mark(cur) && |
451 | // If there's anything between the starter and this char, they should have CCC |
452 | // smaller than this character's. |
453 | (starter == buffer.out_len - 1 |
454 | || _hb_glyph_info_get_modified_combining_class(buffer.prev()) < _hb_glyph_info_get_modified_combining_class(cur)) |
455 | { |
456 | let a = buffer.out_info()[starter].as_char(); |
457 | let b = cur.as_char(); |
458 | if let Some(composed) = (ctx.compose)(&ctx, a, b) { |
459 | if let Some(glyph_id) = face.get_nominal_glyph(u32::from(composed)) { |
460 | // Copy to out-buffer. |
461 | buffer = &mut ctx.buffer; |
462 | buffer.next_glyph(); |
463 | if !buffer.successful { |
464 | return; |
465 | } |
466 | |
467 | // Merge and remove the second composable. |
468 | buffer.merge_out_clusters(starter, buffer.out_len); |
469 | buffer.out_len -= 1; |
470 | |
471 | // Modify starter and carry on. |
472 | let mut flags = buffer.scratch_flags; |
473 | let info = &mut buffer.out_info_mut()[starter]; |
474 | info.glyph_id = u32::from(composed); |
475 | info.set_glyph_index(u32::from(glyph_id.0)); |
476 | info.init_unicode_props(&mut flags); |
477 | buffer.scratch_flags = flags; |
478 | |
479 | continue; |
480 | } |
481 | } |
482 | } |
483 | |
484 | // Blocked, or doesn't compose. |
485 | buffer = &mut ctx.buffer; |
486 | buffer.next_glyph(); |
487 | |
488 | if _hb_glyph_info_get_modified_combining_class(buffer.prev()) == 0 { |
489 | starter = buffer.out_len - 1; |
490 | } |
491 | } |
492 | |
493 | buffer.sync(); |
494 | } |
495 | } |
496 | |