mod.rs source code [crates/fontdue/src/unicode/mod.rs]

1	mod tables;
2
3	use crate::unicode::tables::*;
4	use alloc::string::String;
5
/// Mask that keeps the low six bits of a byte (the payload bits of a UTF-8 continuation byte).
const CONT_MASK: u8 = 0b0011_1111;

/// Shifts `ch` left by six bits and ORs in the low six bits of `byte`.
#[inline(always)]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = u32::from(byte & CONT_MASK);
    (ch << 6) | payload
}
12
13	pub fn decode_utf16(bytes: &[u8]) -> String {
14	let mut output: String = String::new();
15	let mut offset: usize = `0`;
16	while offset < bytes.len() {
17	output.push(ch:read_utf16(bytes, &mut offset));
18	}
19	output
20	}
21
/// Decodes one character from big-endian UTF-16 data starting at `bytes[*offset]`.
///
/// `offset` is advanced by two bytes for a single code unit and by four bytes for a valid
/// surrogate pair.
///
/// Malformed data never panics and never produces an invalid `char`; U+FFFD REPLACEMENT CHARACTER
/// is returned instead:
/// - a lone low surrogate, or a high surrogate that is not followed by a low surrogate, consumes
///   only the offending code unit (two bytes), leaving whatever follows for the next call;
/// - when fewer than two bytes remain, `offset` is moved to `bytes.len()` so that a caller looping
///   on `offset < bytes.len()` terminates.
pub fn read_utf16(bytes: &[u8], offset: &mut usize) -> char {
    const REPLACEMENT: char = core::char::REPLACEMENT_CHARACTER;

    // Reads the big-endian code unit starting at `at`, or `None` if two bytes are not available.
    fn code_unit(bytes: &[u8], at: usize) -> Option<u16> {
        let hi = *bytes.get(at)?;
        let lo = *bytes.get(at.checked_add(1)?)?;
        Some(((hi as u16) << 8) | lo as u16)
    }

    let a = match code_unit(bytes, *offset) {
        Some(unit) => unit,
        None => {
            // Truncated input: swallow the leftover byte (if any) without moving backwards.
            *offset = core::cmp::max(*offset, bytes.len());
            return REPLACEMENT;
        }
    };
    *offset += 2;

    if a < 0xD800 || 0xDFFF < a {
        // Outside the surrogate range a 16-bit value is always a valid scalar value.
        return core::char::from_u32(a as u32).unwrap_or(REPLACEMENT);
    }
    if a >= 0xDC00 {
        // A low surrogate cannot start a pair.
        return REPLACEMENT;
    }

    match code_unit(bytes, *offset) {
        Some(b @ 0xDC00..=0xDFFF) => {
            *offset += 2;
            let c = ((((a - 0xD800) as u32) << 10) | (b - 0xDC00) as u32) + 0x1_0000;
            core::char::from_u32(c).unwrap_or(REPLACEMENT)
        }
        // Missing or non-low-surrogate follower: report the unpaired high surrogate only.
        _ => REPLACEMENT,
    }
}
34
35	/// Returns (length, character). Cannot be run at the end of the string.
36	pub fn read_utf8(bytes: &[u8], byte_offset: &mut usize) -> char {
37	let x: u8 = bytes[*byte_offset];
38	*byte_offset += `1`;
39	if x < `128` {
40	return unsafe { core::char::from_u32_unchecked(x as u32) };
41	}
42	let init: u32 = (x & (`0x7F` >> `2`)) as u32;
43	let y: u8 = bytes[*byte_offset];
44	*byte_offset += `1`;
45	let mut ch: u32 = utf8_acc_cont_byte(ch:init, byte:y);
46	if x >= `0xE0` {
47	let z: u8 = bytes[*byte_offset];
48	*byte_offset += `1`;
49	let y_z: u32 = utf8_acc_cont_byte((y & CONT_MASK) as u32, byte:z);
50	ch = init << `12` \| y_z;
51	if x >= `0xF0` {
52	let w: u8 = bytes[*byte_offset];
53	*byte_offset += `1`;
54	ch = (init & `7`) << `18` \| utf8_acc_cont_byte(ch:y_z, byte:w);
55	}
56	}
57	unsafe { core::char::from_u32_unchecked(ch) }
58	}
59
/// Linebreak classification stored as a small bit set.
///
/// Ordering is based on linebreak priority. Ordering is Hard > Soft > None.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct LinebreakData {
    bits: u8,
}

/// Value with no break bit set.
pub const LINEBREAK_NONE: LinebreakData = LinebreakData::new(LinebreakData::NONE);
/// Value with only the soft break bit set.
pub const LINEBREAK_SOFT: LinebreakData = LinebreakData::new(LinebreakData::SOFT);
/// Value with only the hard break bit set.
pub const LINEBREAK_HARD: LinebreakData = LinebreakData::new(LinebreakData::HARD);

impl LinebreakData {
    const NONE: u8 = 0b0000_0000;
    const SOFT: u8 = 0b0000_0001;
    const HARD: u8 = 0b0000_0010;

    /// Wraps raw bits without any validation.
    const fn new(bits: u8) -> LinebreakData {
        LinebreakData {
            bits,
        }
    }

    /// Builds a value whose bits select which break kinds are enabled.
    ///
    /// The hard bit is set when `wrap_hard_breaks` is true; the soft bit is set only when both
    /// `wrap_soft_breaks` and `has_width` are true.
    pub fn from_mask(
        wrap_soft_breaks: bool,
        wrap_hard_breaks: bool,
        has_width: bool,
    ) -> LinebreakData {
        let hard = if wrap_hard_breaks {
            Self::HARD
        } else {
            Self::NONE
        };
        let soft = if wrap_soft_breaks && has_width {
            Self::SOFT
        } else {
            Self::NONE
        };
        Self::new(hard | soft)
    }

    /// True when the bits are exactly the hard break bit.
    pub fn is_hard(&self) -> bool {
        Self::HARD == self.bits
    }

    /// True when the bits are exactly the soft break bit.
    pub fn is_soft(&self) -> bool {
        Self::SOFT == self.bits
    }

    /// Returns the bitwise AND of `self` and `other`.
    pub fn mask(&self, other: LinebreakData) -> LinebreakData {
        LinebreakData {
            bits: other.bits & self.bits,
        }
    }
}
106
107	#[derive(Debug, Copy, Clone)]
108	pub struct Linebreaker {
109	state: u8,
110	}
111
112	impl Linebreaker {
113	pub fn new() -> Linebreaker {
114	Linebreaker {
115	state: `0`,
116	}
117	}
118
119	pub fn reset(&mut self) {
120	self.state = `0`;
121	}
122
123	// [See license/xi-editor/xi-unicode] Copyright 2016 The xi-editor Authors
124	pub fn next(&mut self, codepoint: char) -> LinebreakData {
125	let cp = codepoint as usize;
126	let lb = if cp < `0x800` {
127	LINEBREAK_1_2[cp]
128	} else if cp < `0x10000` {
129	let child = LINEBREAK_3_ROOT[cp >> `6`];
130	LINEBREAK_3_CHILD[(child as usize) * `0x40` + (cp & `0x3f`)]
131	} else {
132	let mid = LINEBREAK_4_ROOT[cp >> `12`];
133	let leaf = LINEBREAK_4_MID[(mid as usize) * `0x40` + ((cp >> `6`) & `0x3f`)];
134	LINEBREAK_4_LEAVES[(leaf as usize) * `0x40` + (cp & `0x3f`)]
135	};
136	let i = (self.state as usize) * N_LINEBREAK_CATEGORIES + (lb as usize);
137	let new = LINEBREAK_STATE_MACHINE[i];
138	if (new as i8) < `0` {
139	self.state = new & `0x3f`;
140	if new >= `0xc0` {
141	LINEBREAK_HARD
142	} else {
143	LINEBREAK_SOFT
144	}
145	} else {
146	self.state = new;
147	LINEBREAK_NONE
148	}
149	}
150	}
151
/// Miscellaneous metadata associated with a character to assist in layout.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct CharacterData {
    bits: u8,
}

impl CharacterData {
    const WHITESPACE: u8 = 0b0000_0001;
    const CONTROL: u8 = 0b0000_0010;
    const MISSING: u8 = 0b0000_0100;

    /// Classifies a character given its index in the font.
    ///
    /// An `index` of 0 marks the character as missing. Tab, line feed, form feed, carriage return
    /// and space are marked as whitespace. U+0000 through U+001F and U+007F are marked as control
    /// characters. A character can carry several of these flags at once.
    pub fn classify(c: char, index: u16) -> CharacterData {
        let missing = if index == 0 {
            Self::MISSING
        } else {
            0
        };
        let whitespace = match c {
            '\t' | '\n' | '\x0C' | '\r' | ' ' => Self::WHITESPACE,
            _ => 0,
        };
        let control = match c {
            '\0'..='\x1F' | '\x7F' => Self::CONTROL,
            _ => 0,
        };
        CharacterData {
            bits: missing | whitespace | control,
        }
    }

    /// A heuristic for whether the glyph this was classified from should be rasterized. Missing
    /// glyphs, whitespace, and control characters will return false.
    pub fn rasterize(&self) -> bool {
        0 == self.bits
    }

    /// Marks if the character is an ASCII whitespace character.
    pub fn is_whitespace(&self) -> bool {
        (self.bits & Self::WHITESPACE) != 0
    }

    /// Marks if the character is an ASCII control character.
    pub fn is_control(&self) -> bool {
        (self.bits & Self::CONTROL) != 0
    }

    /// Marks if the character is missing from its associated font.
    pub fn is_missing(&self) -> bool {
        (self.bits & Self::MISSING) != 0
    }
}
203