use alloc::boxed::Box;

use super::buffer::*;
use super::ot_map::*;
use super::ot_shape::*;
use super::ot_shape_normalize::HB_OT_SHAPE_NORMALIZATION_MODE_NONE;
use super::ot_shape_plan::hb_ot_shape_plan_t;
use super::ot_shaper::*;
use super::*;
use crate::BufferFlags;

const LJMO: u8 = 1;
const VJMO: u8 = 2;
const TJMO: u8 = 3;

impl hb_glyph_info_t {
    fn hangul_shaping_feature(&self) -> u8 {
        self.ot_shaper_var_u8_auxiliary()
    }

    fn set_hangul_shaping_feature(&mut self, feature: u8) {
        self.set_ot_shaper_var_u8_auxiliary(feature)
    }
}

fn collect_features_hangul(planner: &mut hb_ot_shape_planner_t) {
    planner
        .ot_map
        .add_feature(hb_tag_t::from_bytes(b"ljmo"), F_NONE, 1);
    planner
        .ot_map
        .add_feature(hb_tag_t::from_bytes(b"vjmo"), F_NONE, 1);
    planner
        .ot_map
        .add_feature(hb_tag_t::from_bytes(b"tjmo"), F_NONE, 1);
}

fn override_features_hangul(planner: &mut hb_ot_shape_planner_t) {
    // Uniscribe does not apply 'calt' for Hangul, and certain fonts
    // (Noto Sans CJK, Source Han Sans, etc.) apply all of the jamo lookups
    // in calt, which is not desirable.
    planner
        .ot_map
        .disable_feature(hb_tag_t::from_bytes(b"calt"));
}

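// Per-plan data: one mask per jamo feature, indexed by the per-glyph
// LJMO/VJMO/TJMO value assigned during preprocessing. Index 0 is a zero mask
// for glyphs that take no jamo feature.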
struct hangul_shape_plan_t {
    mask_array: [hb_mask_t; 4],
}

fn data_create_hangul(map: &hb_ot_map_t) -> hangul_shape_plan_t {
    hangul_shape_plan_t {
        mask_array: [
            0,
            map.get_1_mask(hb_tag_t::from_bytes(b"ljmo")),
            map.get_1_mask(hb_tag_t::from_bytes(b"vjmo")),
            map.get_1_mask(hb_tag_t::from_bytes(b"tjmo")),
        ],
    }
}

const L_BASE: u32 = 0x1100;
const V_BASE: u32 = 0x1161;
const T_BASE: u32 = 0x11A7;
const L_COUNT: u32 = 19;
const V_COUNT: u32 = 21;
const T_COUNT: u32 = 28;
const N_COUNT: u32 = V_COUNT * T_COUNT;
const S_COUNT: u32 = L_COUNT * N_COUNT;
const S_BASE: u32 = 0xAC00;
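
// Standard Hangul syllable arithmetic (The Unicode Standard, "Conjoining Jamo
// Behavior"): a precomposed syllable is
//     S = S_BASE + (l_index * V_COUNT + v_index) * T_COUNT + t_index
// where l_index/v_index/t_index are offsets from L_BASE/V_BASE/T_BASE and
// t_index == 0 means "no trailing consonant". The composition and
// decomposition code below uses this arithmetic in both directions.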

fn is_combining_l(u: u32) -> bool {
    (L_BASE..=L_BASE + L_COUNT - 1).contains(&u)
}

fn is_combining_v(u: u32) -> bool {
    (V_BASE..=V_BASE + V_COUNT - 1).contains(&u)
}

fn is_combining_t(u: u32) -> bool {
    (T_BASE + 1..=T_BASE + T_COUNT - 1).contains(&u)
}

fn is_combined_s(u: u32) -> bool {
    (S_BASE..=S_BASE + S_COUNT - 1).contains(&u)
}

fn is_l(u: u32) -> bool {
    (0x1100..=0x115F).contains(&u) || (0xA960..=0xA97C).contains(&u)
}

fn is_v(u: u32) -> bool {
    (0x1160..=0x11A7).contains(&u) || (0xD7B0..=0xD7C6).contains(&u)
}

fn is_t(u: u32) -> bool {
    (0x11A8..=0x11FF).contains(&u) || (0xD7CB..=0xD7FB).contains(&u)
}

fn is_hangul_tone(u: u32) -> bool {
    (0x302E..=0x302F).contains(&u)
}

fn is_zero_width_char(face: &hb_font_t, c: char) -> bool {
    if let Some(glyph) = face.get_nominal_glyph(c as u32) {
        face.glyph_h_advance(glyph) == 0
    } else {
        false
    }
}

fn preprocess_text_hangul(_: &hb_ot_shape_plan_t, face: &hb_font_t, buffer: &mut hb_buffer_t) {
    // Hangul syllables come in two shapes: LV, and LVT. Of those:
    //
    // - LV can be precomposed, or decomposed. Let's call those
    //   <LV> and <L,V>,
    // - LVT can be fully precomposed, partially precomposed, or
    //   fully decomposed. I.e. <LVT>, <LV,T>, or <L,V,T>.
    //
    // The composition / decomposition is mechanical. However, not
    // all <L,V> sequences compose, and not all <LV,T> sequences
    // compose.
    //
    // Here are the specifics:
    //
    // - <L>: U+1100..115F, U+A960..A97F
    // - <V>: U+1160..11A7, U+D7B0..D7C7
    // - <T>: U+11A8..11FF, U+D7CB..D7FB
    //
    // - Only the <L,V> sequences for some of the U+11xx ranges combine.
    // - Only <LV,T> sequences for some of the Ts in the U+11xx range combine.
    //
    // Here is what we want to accomplish in this shaper:
    //
    // - If the whole syllable can be precomposed, do that,
    // - Otherwise, fully decompose and apply ljmo/vjmo/tjmo features.
    // - If a valid syllable is followed by a Hangul tone mark, reorder the tone
    //   mark to precede the whole syllable - unless it is a zero-width glyph, in
    //   which case we leave it untouched, assuming it's designed to overstrike.
    //
    // That is, of the different possible syllables:
    //
    //   <L>
    //   <L,V>
    //   <L,V,T>
    //   <LV>
    //   <LVT>
    //   <LV, T>
    //
    // - <L> needs no work.
    //
    // - <LV> and <LVT> can stay the way they are if the font supports them; otherwise
    //   we should fully decompose them if the font supports the decomposed sequence.
    //
    // - <L,V> and <L,V,T> we should compose if the whole thing can be composed.
    //
    // - <LV,T> we should compose if the whole thing can be composed, otherwise we should
    //   decompose.

    buffer.clear_output();
    // Extent of most recently seen syllable; valid only if start < end
    let mut start = 0;
    let mut end = 0;
    buffer.idx = 0;
    while buffer.idx < buffer.len {
        let u = buffer.cur(0).glyph_id;
        let c = buffer.cur(0).as_char();

        if is_hangul_tone(u) {
            // We could cache the width of the tone marks and the existence of dotted-circle,
            // but the use of the Hangul tone mark characters seems to be rare enough that
            // I didn't bother for now.
            if start < end && end == buffer.out_len {
                // Tone mark follows a valid syllable; move it in front, unless it's zero width.
                buffer.unsafe_to_break_from_outbuffer(Some(start), Some(buffer.idx));
                buffer.next_glyph();
                if !is_zero_width_char(face, c) {
                    buffer.merge_out_clusters(start, end + 1);
                    let out_info = buffer.out_info_mut();
                    let tone = out_info[end];
                    for i in (0..end - start).rev() {
                        out_info[i + start + 1] = out_info[i + start];
                    }
                    out_info[start] = tone;
                }
            } else {
                // No valid syllable as base for tone mark; try to insert dotted circle.
                if !buffer
                    .flags
                    .contains(BufferFlags::DO_NOT_INSERT_DOTTED_CIRCLE)
                    && face.has_glyph(0x25CC)
                {
                    let mut chars = [0; 2];
                    if !is_zero_width_char(face, c) {
                        chars[0] = u;
                        chars[1] = 0x25CC;
                    } else {
                        chars[0] = 0x25CC;
                        chars[1] = u;
                    }

                    buffer.replace_glyphs(1, 2, &chars);
                } else {
                    // No dotted circle available in the font; just leave tone mark untouched.
                    buffer.next_glyph();
                }
            }

            start = buffer.out_len;
            end = buffer.out_len;
            continue;
        }

        // Remember current position as a potential syllable start;
        // will only be used if we set end to a later position.
        start = buffer.out_len;

        if is_l(u) && buffer.idx + 1 < buffer.len {
            let l = u;
            let v = buffer.cur(1).glyph_id;
            if is_v(v) {
                // Have <L,V> or <L,V,T>.
                let mut t = 0;
                let mut tindex = 0;
                if buffer.idx + 2 < buffer.len {
                    t = buffer.cur(2).glyph_id;
                    if is_t(t) {
                        // Only used if is_combining_t(t); otherwise invalid.
                        tindex = t - T_BASE;
                    } else {
                        // The next character was not a trailing jamo.
                        t = 0;
                    }
                }

                let offset = if t != 0 { 3 } else { 2 };
                buffer.unsafe_to_break(Some(buffer.idx), Some(buffer.idx + offset));

                // We've got a syllable <L,V,T?>; see if it can potentially be composed.
                if is_combining_l(l) && is_combining_v(v) && (t == 0 || is_combining_t(t)) {
                    // Try to compose; if this succeeds, end is set to start+1.
                    let s = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT + tindex;
                    if face.has_glyph(s) {
                        let n = if t != 0 { 3 } else { 2 };
                        buffer.replace_glyphs(n, 1, &[s]);
                        end = start + 1;
                        continue;
                    }
                }

                // We didn't compose, either because it's an Old Hangul syllable without a
                // precomposed character in Unicode, or because the font didn't support the
                // necessary precomposed glyph.
                // Set jamo features on the individual glyphs, and advance past them.
                buffer.cur_mut(0).set_hangul_shaping_feature(LJMO);
                buffer.next_glyph();
                buffer.cur_mut(0).set_hangul_shaping_feature(VJMO);
                buffer.next_glyph();
                if t != 0 {
                    buffer.cur_mut(0).set_hangul_shaping_feature(TJMO);
                    buffer.next_glyph();
                    end = start + 3;
                } else {
                    end = start + 2;
                }

                if buffer.cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES {
                    buffer.merge_out_clusters(start, end);
                }

                continue;
            }
        } else if is_combined_s(u) {
            // Have <LV>, <LVT>, or <LV,T>
            let s = u;
            let has_glyph = face.has_glyph(s);

            let lindex = (s - S_BASE) / N_COUNT;
            let nindex = (s - S_BASE) % N_COUNT;
            let vindex = nindex / T_COUNT;
            let tindex = nindex % T_COUNT;

            if tindex == 0 && buffer.idx + 1 < buffer.len && is_combining_t(buffer.cur(1).glyph_id)
            {
                // <LV,T>, try to combine.
                let new_tindex = buffer.cur(1).glyph_id - T_BASE;
                let new_s = s + new_tindex;

                if face.has_glyph(new_s) {
                    buffer.replace_glyphs(2, 1, &[new_s]);
                    end = start + 1;
                    continue;
                } else {
                    // Mark unsafe between LV and T.
                    buffer.unsafe_to_break(Some(buffer.idx), Some(buffer.idx + 2));
                }
            }

            // Otherwise, decompose if the font doesn't support <LV> or <LVT>,
            // or if we have a non-combining <LV,T>. Note that we already handled
            // combining <LV,T> above.
            if !has_glyph
                || (tindex == 0 && buffer.idx + 1 < buffer.len && is_t(buffer.cur(1).glyph_id))
            {
                let decomposed = [L_BASE + lindex, V_BASE + vindex, T_BASE + tindex];
                if face.has_glyph(decomposed[0])
                    && face.has_glyph(decomposed[1])
                    && (tindex == 0 || face.has_glyph(decomposed[2]))
                {
                    let mut s_len = if tindex != 0 { 3 } else { 2 };
                    buffer.replace_glyphs(1, s_len, &decomposed);

                    // If we decomposed an LV because of a non-combining T following,
                    // we want to include this T in the syllable.
                    if has_glyph && tindex == 0 {
                        buffer.next_glyph();
                        s_len += 1;
                    }

                    // We decomposed S: apply jamo features to the individual glyphs
                    // that are now in `buffer.out_info`.
                    end = start + s_len;

                    buffer.out_info_mut()[start + 0].set_hangul_shaping_feature(LJMO);
                    buffer.out_info_mut()[start + 1].set_hangul_shaping_feature(VJMO);
                    if start + 2 < end {
                        buffer.out_info_mut()[start + 2].set_hangul_shaping_feature(TJMO);
                    }

                    if buffer.cluster_level == HB_BUFFER_CLUSTER_LEVEL_MONOTONE_GRAPHEMES {
                        buffer.merge_out_clusters(start, end);
                    }

                    continue;
                } else if tindex == 0 && buffer.idx + 1 < buffer.len && is_t(buffer.cur(1).glyph_id)
                {
                    // Mark unsafe between LV and T.
                    buffer.unsafe_to_break(Some(buffer.idx), Some(buffer.idx + 2));
                }
            }

            if has_glyph {
                // We didn't decompose the S, so just advance past it.
                end = start + 1;
                buffer.next_glyph();
                continue;
            }
        }

        // Didn't find a recognizable syllable, so we leave end <= start; this
        // will prevent tone-mark reordering from happening.
        buffer.next_glyph();
    }

    buffer.sync();
}

fn setup_masks_hangul(plan: &hb_ot_shape_plan_t, _: &hb_font_t, buffer: &mut hb_buffer_t) {
    let hangul_plan: &hangul_shape_plan_t = plan.data::<hangul_shape_plan_t>();
    for info in buffer.info_slice_mut() {
        info.mask |= hangul_plan.mask_array[info.hangul_shaping_feature() as usize];
    }
}

pub const HANGUL_SHAPER: hb_ot_shaper_t = hb_ot_shaper_t {
    collect_features: Some(collect_features_hangul),
    override_features: Some(override_features_hangul),
    create_data: Some(|plan: &hb_ot_shape_plan_t| Box::new(data_create_hangul(&plan.ot_map))),
    preprocess_text: Some(preprocess_text_hangul),
    postprocess_glyphs: None,
    normalization_preference: HB_OT_SHAPE_NORMALIZATION_MODE_NONE,
    decompose: None,
    compose: None,
    setup_masks: Some(setup_masks_hangul),
    gpos_tag: None,
    reorder_marks: None,
    zero_width_marks: HB_OT_SHAPE_ZERO_WIDTH_MARKS_NONE,
    fallback_position: false,
};
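
// A small, self-contained sanity check of the syllable arithmetic and the
// classification helpers above. This is an illustrative sketch added for
// clarity: it only exercises the constants and free functions defined in this
// module and makes no assumptions about the rest of the shaper.
#[cfg(test)]
mod hangul_arithmetic_tests {
    use super::*;

    #[test]
    fn composes_and_decomposes_gag() {
        // U+AC01 HANGUL SYLLABLE GAG = <U+1100 L, U+1161 V, U+11A8 T>.
        let (l, v, t) = (0x1100u32, 0x1161, 0x11A8);
        let s = S_BASE + (l - L_BASE) * N_COUNT + (v - V_BASE) * T_COUNT + (t - T_BASE);
        assert_eq!(s, 0xAC01);

        // Decomposition inverts the composition arithmetic.
        let (lindex, nindex) = ((s - S_BASE) / N_COUNT, (s - S_BASE) % N_COUNT);
        assert_eq!(L_BASE + lindex, l);
        assert_eq!(V_BASE + nindex / T_COUNT, v);
        assert_eq!(T_BASE + nindex % T_COUNT, t);
    }

    #[test]
    fn classifies_jamo_ranges() {
        // LV syllables have t_index == 0; T_BASE itself is not a combining T.
        assert!(is_combining_t(T_BASE + 1));
        assert!(!is_combining_t(T_BASE));

        // The precomposed syllable block is U+AC00..=U+D7A3.
        assert!(is_combined_s(S_BASE));
        assert!(is_combined_s(S_BASE + S_COUNT - 1));
        assert!(!is_combined_s(S_BASE + S_COUNT));

        // The broad L/V/T classes also cover the Old Hangul extension blocks.
        assert!(is_l(0xA960));
        assert!(is_v(0xD7B0));
        assert!(is_t(0xD7CB));
        assert!(is_hangul_tone(0x302E));
    }
}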