1 | mod tables; |
2 | |
3 | use crate::unicode::tables::*; |
4 | use alloc::string::String; |
5 | |
/// Mask that keeps the low six bits of a byte (the payload of a UTF-8 continuation byte).
const CONT_MASK: u8 = 0b0011_1111;

/// Shifts the accumulator `ch` left by six bits and ORs in the low six bits of `byte`.
#[inline(always)]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = u32::from(byte & CONT_MASK);
    (ch << 6) | payload
}
12 | |
/// Decodes a buffer of big-endian UTF-16 code units into a `String`.
///
/// Every pair of bytes is read as one 16-bit unit with the most significant byte first.
/// Well-formed surrogate pairs are combined into a single character. Input that is not valid
/// UTF-16 never panics and never produces an invalid `char`: each unpaired surrogate becomes
/// U+FFFD (the replacement character), and a dangling trailing byte (odd-length input) is
/// likewise reported as a single U+FFFD at the end of the output.
pub fn decode_utf16(bytes: &[u8]) -> String {
    let pairs = bytes.chunks_exact(2);
    // `chunks_exact` leaves at most one byte over; remember it before the iterator is consumed.
    let has_dangling_byte = !pairs.remainder().is_empty();
    let units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));

    let mut output: String = core::char::decode_utf16(units)
        .map(|decoded| decoded.unwrap_or(core::char::REPLACEMENT_CHARACTER))
        .collect();
    if has_dangling_byte {
        output.push(core::char::REPLACEMENT_CHARACTER);
    }
    output
}
21 | |
/// Reads one character from big-endian UTF-16 data, starting at `*offset`.
///
/// `*offset` is advanced past the bytes that were consumed: two for a character encoded as a
/// single unit, four for a well-formed surrogate pair.
///
/// Malformed input is reported as U+FFFD (the replacement character) instead of constructing an
/// invalid `char`:
/// - a low surrogate with no preceding high surrogate consumes two bytes;
/// - a high surrogate that is not followed by a low surrogate consumes two bytes, leaving the
///   following unit to be decoded by the next call;
/// - a high surrogate with fewer than two bytes after it consumes two bytes.
///
/// # Panics
///
/// Panics if fewer than two bytes are available at `*offset`.
pub fn read_utf16(bytes: &[u8], offset: &mut usize) -> char {
    const REPLACEMENT: char = core::char::REPLACEMENT_CHARACTER;

    let lead = u16::from_be_bytes([bytes[*offset], bytes[*offset + 1]]);
    *offset += 2;
    match lead {
        // High surrogate: a second unit is required, handled below.
        0xD800..=0xDBFF => {}
        // Low surrogate without a preceding high surrogate.
        0xDC00..=0xDFFF => return REPLACEMENT,
        // Every other 16-bit value is a scalar value on its own.
        _ => return core::char::from_u32(u32::from(lead)).unwrap_or(REPLACEMENT),
    }

    let trail = match bytes.get(*offset..*offset + 2) {
        Some(pair) => u16::from_be_bytes([pair[0], pair[1]]),
        None => return REPLACEMENT,
    };
    if trail < 0xDC00 || 0xDFFF < trail {
        // Not a low surrogate: do not consume it, it may start a valid character.
        return REPLACEMENT;
    }
    *offset += 2;

    let scalar = ((u32::from(lead - 0xD800) << 10) | u32::from(trail - 0xDC00)) + 0x1_0000;
    core::char::from_u32(scalar).unwrap_or(REPLACEMENT)
}
34 | |
35 | /// Returns (length, character). Cannot be run at the end of the string. |
36 | pub fn read_utf8(bytes: &[u8], byte_offset: &mut usize) -> char { |
37 | let x: u8 = bytes[*byte_offset]; |
38 | *byte_offset += 1; |
39 | if x < 128 { |
40 | return unsafe { core::char::from_u32_unchecked(x as u32) }; |
41 | } |
42 | let init: u32 = (x & (0x7F >> 2)) as u32; |
43 | let y: u8 = bytes[*byte_offset]; |
44 | *byte_offset += 1; |
45 | let mut ch: u32 = utf8_acc_cont_byte(ch:init, byte:y); |
46 | if x >= 0xE0 { |
47 | let z: u8 = bytes[*byte_offset]; |
48 | *byte_offset += 1; |
49 | let y_z: u32 = utf8_acc_cont_byte((y & CONT_MASK) as u32, byte:z); |
50 | ch = init << 12 | y_z; |
51 | if x >= 0xF0 { |
52 | let w: u8 = bytes[*byte_offset]; |
53 | *byte_offset += 1; |
54 | ch = (init & 7) << 18 | utf8_acc_cont_byte(ch:y_z, byte:w); |
55 | } |
56 | } |
57 | unsafe { core::char::from_u32_unchecked(ch) } |
58 | } |
59 | |
/// Linebreak classification stored as a small bit set.
///
/// Ordering is based on linebreak priority. Ordering is Hard > Soft > None.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct LinebreakData {
    bits: u8,
}

/// No linebreak.
pub const LINEBREAK_NONE: LinebreakData = LinebreakData::new(LinebreakData::NONE);
/// A soft linebreak.
pub const LINEBREAK_SOFT: LinebreakData = LinebreakData::new(LinebreakData::SOFT);
/// A hard linebreak.
pub const LINEBREAK_HARD: LinebreakData = LinebreakData::new(LinebreakData::HARD);

impl LinebreakData {
    const NONE: u8 = 0b0000_0000;
    const SOFT: u8 = 0b0000_0001;
    const HARD: u8 = 0b0000_0010;

    /// Wraps a raw bit pattern.
    const fn new(bits: u8) -> LinebreakData {
        LinebreakData { bits }
    }

    /// Builds a mask of the break kinds that should be honoured.
    ///
    /// The hard bit is set when `wrap_hard_breaks` is true. The soft bit is set only when both
    /// `wrap_soft_breaks` and `has_width` are true.
    pub fn from_mask(wrap_soft_breaks: bool, wrap_hard_breaks: bool, has_width: bool) -> LinebreakData {
        let hard = if wrap_hard_breaks { Self::HARD } else { Self::NONE };
        let soft = if wrap_soft_breaks && has_width { Self::SOFT } else { Self::NONE };
        Self::new(hard | soft)
    }

    /// True when the value is exactly the hard break (a mask with both bits set is not "hard").
    pub fn is_hard(&self) -> bool {
        *self == LINEBREAK_HARD
    }

    /// True when the value is exactly the soft break (a mask with both bits set is not "soft").
    pub fn is_soft(&self) -> bool {
        *self == LINEBREAK_SOFT
    }

    /// Keeps only the bits that are set in both `self` and `other`.
    pub fn mask(&self, other: LinebreakData) -> LinebreakData {
        LinebreakData {
            bits: self.bits & other.bits,
        }
    }
}
106 | |
107 | #[derive (Debug, Copy, Clone)] |
108 | pub struct Linebreaker { |
109 | state: u8, |
110 | } |
111 | |
112 | impl Linebreaker { |
113 | pub fn new() -> Linebreaker { |
114 | Linebreaker { |
115 | state: 0, |
116 | } |
117 | } |
118 | |
119 | pub fn reset(&mut self) { |
120 | self.state = 0; |
121 | } |
122 | |
123 | // [See license/xi-editor/xi-unicode] Copyright 2016 The xi-editor Authors |
124 | pub fn next(&mut self, codepoint: char) -> LinebreakData { |
125 | let cp = codepoint as usize; |
126 | let lb = if cp < 0x800 { |
127 | LINEBREAK_1_2[cp] |
128 | } else if cp < 0x10000 { |
129 | let child = LINEBREAK_3_ROOT[cp >> 6]; |
130 | LINEBREAK_3_CHILD[(child as usize) * 0x40 + (cp & 0x3f)] |
131 | } else { |
132 | let mid = LINEBREAK_4_ROOT[cp >> 12]; |
133 | let leaf = LINEBREAK_4_MID[(mid as usize) * 0x40 + ((cp >> 6) & 0x3f)]; |
134 | LINEBREAK_4_LEAVES[(leaf as usize) * 0x40 + (cp & 0x3f)] |
135 | }; |
136 | let i = (self.state as usize) * N_LINEBREAK_CATEGORIES + (lb as usize); |
137 | let new = LINEBREAK_STATE_MACHINE[i]; |
138 | if (new as i8) < 0 { |
139 | self.state = new & 0x3f; |
140 | if new >= 0xc0 { |
141 | LINEBREAK_HARD |
142 | } else { |
143 | LINEBREAK_SOFT |
144 | } |
145 | } else { |
146 | self.state = new; |
147 | LINEBREAK_NONE |
148 | } |
149 | } |
150 | } |
151 | |
/// Miscellaneous metadata associated with a character to assist in layout.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct CharacterData {
    bits: u8,
}

impl CharacterData {
    const WHITESPACE: u8 = 0b0000_0001;
    const CONTROL: u8 = 0b0000_0010;
    const MISSING: u8 = 0b0000_0100;

    /// Classifies a character given its index in the font.
    ///
    /// An `index` of `0` marks the character as missing. Tab, line feed, form feed, carriage
    /// return and space are marked as whitespace. `U+0000..=U+001F` and `U+007F` are marked as
    /// control characters.
    pub fn classify(c: char, index: u16) -> CharacterData {
        let missing = if index == 0 { Self::MISSING } else { 0 };
        let whitespace = if c.is_ascii_whitespace() { Self::WHITESPACE } else { 0 };
        let control = if c.is_ascii_control() { Self::CONTROL } else { 0 };
        CharacterData {
            bits: missing | whitespace | control,
        }
    }

    /// Reports whether `flag` is set.
    fn has_flag(&self, flag: u8) -> bool {
        self.bits & flag != 0
    }

    /// A heuristic for if the glyph this was classified from should be rasterized. Missing
    /// glyphs, whitespace, and control characters will return false.
    pub fn rasterize(&self) -> bool {
        let any_flag = Self::WHITESPACE | Self::CONTROL | Self::MISSING;
        !self.has_flag(any_flag) && self.bits == 0
    }

    /// Marks if the character is an ASCII whitespace character.
    pub fn is_whitespace(&self) -> bool {
        self.has_flag(Self::WHITESPACE)
    }

    /// Marks if the character is an ASCII control character.
    pub fn is_control(&self) -> bool {
        self.has_flag(Self::CONTROL)
    }

    /// Marks if the character is missing from its associated font.
    pub fn is_missing(&self) -> bool {
        self.has_flag(Self::MISSING)
    }
}
203 | |