| 1 | mod tables; |
| 2 | |
| 3 | use crate::unicode::tables::*; |
| 4 | use alloc::string::String; |
| 5 | |
/// Mask selecting the low six bits of a byte, which is where a UTF-8 continuation byte
/// (`10xx_xxxx`) carries its payload.
const CONT_MASK: u8 = 0x3F;

/// Folds one continuation byte into a partially decoded code point.
///
/// The accumulator `ch` is shifted left by six bits and the low six bits of `byte` are placed in
/// the vacated positions. The top two bits of `byte` are discarded without being checked.
#[inline(always)]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = u32::from(byte & CONT_MASK);
    (ch << 6) | payload
}
| 12 | |
/// Decodes big-endian UTF-16 `bytes` into an owned [`String`].
///
/// Every consecutive pair of bytes forms one 16-bit code unit, most significant byte first.
/// Surrogate pairs are combined into a single character.
///
/// Decoding is lossy rather than panicking: each unpaired surrogate is replaced by U+FFFD
/// REPLACEMENT CHARACTER, and a trailing lone byte (input of odd length) also contributes one
/// U+FFFD at the end of the result.
pub fn decode_utf16(bytes: &[u8]) -> String {
    let pairs = bytes.chunks_exact(2);
    // A leftover byte means the input stops in the middle of a code unit.
    let truncated = !pairs.remainder().is_empty();
    let units = pairs.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));

    let mut output: String = core::char::decode_utf16(units)
        .map(|decoded| decoded.unwrap_or(core::char::REPLACEMENT_CHARACTER))
        .collect();
    if truncated {
        output.push(core::char::REPLACEMENT_CHARACTER);
    }
    output
}
| 21 | |
/// Decodes one character of big-endian UTF-16 from `bytes`, starting at `*offset`.
///
/// `*offset` is advanced past the bytes that were consumed: two for a character encoded as a
/// single code unit, four for a surrogate pair.
///
/// Malformed input never panics and never produces an invalid `char`; it yields U+FFFD
/// REPLACEMENT CHARACTER instead:
///
/// * A low surrogate with no preceding high surrogate consumes its two bytes.
/// * A high surrogate that is not followed by a low surrogate consumes only its own two bytes,
///   so the following code unit is decoded by the next call.
/// * If fewer than two bytes remain at `*offset`, `*offset` is moved to `bytes.len()` (it is left
///   untouched if it already lies at or past the end).
pub fn read_utf16(bytes: &[u8], offset: &mut usize) -> char {
    /// Reads the big-endian code unit starting at `at`, if both of its bytes are present.
    fn unit_at(bytes: &[u8], at: usize) -> Option<u16> {
        let hi = *bytes.get(at)?;
        // `at` is a valid index here, so `at + 1` cannot overflow.
        let lo = *bytes.get(at + 1)?;
        Some(u16::from_be_bytes([hi, lo]))
    }

    let a = match unit_at(bytes, *offset) {
        Some(unit) => unit,
        None => {
            // Skip the dangling byte (if any) so that callers looping on
            // `offset < bytes.len()` terminate.
            *offset = bytes.len().max(*offset);
            return core::char::REPLACEMENT_CHARACTER;
        }
    };
    *offset += 2;

    match a {
        // High surrogate: only valid when immediately followed by a low surrogate.
        0xD800..=0xDBFF => match unit_at(bytes, *offset) {
            Some(b @ 0xDC00..=0xDFFF) => {
                *offset += 2;
                let high = u32::from(a - 0xD800);
                let low = u32::from(b - 0xDC00);
                core::char::from_u32(((high << 10) | low) + 0x1_0000)
                    .unwrap_or(core::char::REPLACEMENT_CHARACTER)
            }
            // Leave the following unit unread; it may be a valid character on its own.
            _ => core::char::REPLACEMENT_CHARACTER,
        },
        // Low surrogate without a preceding high surrogate.
        0xDC00..=0xDFFF => core::char::REPLACEMENT_CHARACTER,
        // Any other code unit is a scalar value by itself.
        _ => core::char::from_u32(u32::from(a)).unwrap_or(core::char::REPLACEMENT_CHARACTER),
    }
}
| 34 | |
| 35 | /// Returns (length, character). Cannot be run at the end of the string. |
| 36 | pub fn read_utf8(bytes: &[u8], byte_offset: &mut usize) -> char { |
| 37 | let x: u8 = bytes[*byte_offset]; |
| 38 | *byte_offset += 1; |
| 39 | if x < 128 { |
| 40 | return unsafe { core::char::from_u32_unchecked(x as u32) }; |
| 41 | } |
| 42 | let init: u32 = (x & (0x7F >> 2)) as u32; |
| 43 | let y: u8 = bytes[*byte_offset]; |
| 44 | *byte_offset += 1; |
| 45 | let mut ch: u32 = utf8_acc_cont_byte(ch:init, byte:y); |
| 46 | if x >= 0xE0 { |
| 47 | let z: u8 = bytes[*byte_offset]; |
| 48 | *byte_offset += 1; |
| 49 | let y_z: u32 = utf8_acc_cont_byte((y & CONT_MASK) as u32, byte:z); |
| 50 | ch = init << 12 | y_z; |
| 51 | if x >= 0xF0 { |
| 52 | let w: u8 = bytes[*byte_offset]; |
| 53 | *byte_offset += 1; |
| 54 | ch = (init & 7) << 18 | utf8_acc_cont_byte(ch:y_z, byte:w); |
| 55 | } |
| 56 | } |
| 57 | unsafe { core::char::from_u32_unchecked(ch) } |
| 58 | } |
| 59 | |
/// The kind of line break reported for a position, stored as a small bit set.
///
/// The derived ordering compares the raw bits, so values sort by linebreak priority:
/// Hard > Soft > None.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct LinebreakData {
    bits: u8,
}

/// No bits set: no line break.
pub const LINEBREAK_NONE: LinebreakData = LinebreakData::new(LinebreakData::NONE);
/// Only the soft-break bit set.
pub const LINEBREAK_SOFT: LinebreakData = LinebreakData::new(LinebreakData::SOFT);
/// Only the hard-break bit set.
pub const LINEBREAK_HARD: LinebreakData = LinebreakData::new(LinebreakData::HARD);

impl LinebreakData {
    const NONE: u8 = 0;
    const SOFT: u8 = 1 << 0;
    const HARD: u8 = 1 << 1;

    /// Wraps raw bits without any validation.
    const fn new(bits: u8) -> LinebreakData {
        LinebreakData { bits }
    }

    /// Builds a mask of the break kinds that should be honoured.
    ///
    /// The hard bit is set when `wrap_hard_breaks` is true. The soft bit is set only when both
    /// `wrap_soft_breaks` and `has_width` are true.
    pub fn from_mask(wrap_soft_breaks: bool, wrap_hard_breaks: bool, has_width: bool) -> LinebreakData {
        let hard = if wrap_hard_breaks { Self::HARD } else { Self::NONE };
        let soft = if wrap_soft_breaks && has_width { Self::SOFT } else { Self::NONE };
        Self::new(hard | soft)
    }

    /// Returns true when the value is exactly the hard break (no other bit set).
    pub fn is_hard(&self) -> bool {
        Self::HARD == self.bits
    }

    /// Returns true when the value is exactly the soft break (no other bit set).
    pub fn is_soft(&self) -> bool {
        Self::SOFT == self.bits
    }

    /// Keeps only the bits that are set in both `self` and `other`.
    pub fn mask(&self, other: LinebreakData) -> LinebreakData {
        LinebreakData {
            bits: other.bits & self.bits,
        }
    }
}
| 106 | |
| 107 | #[derive (Debug, Copy, Clone)] |
| 108 | pub struct Linebreaker { |
| 109 | state: u8, |
| 110 | } |
| 111 | |
| 112 | impl Linebreaker { |
| 113 | pub fn new() -> Linebreaker { |
| 114 | Linebreaker { |
| 115 | state: 0, |
| 116 | } |
| 117 | } |
| 118 | |
| 119 | pub fn reset(&mut self) { |
| 120 | self.state = 0; |
| 121 | } |
| 122 | |
| 123 | // [See license/xi-editor/xi-unicode] Copyright 2016 The xi-editor Authors |
| 124 | pub fn next(&mut self, codepoint: char) -> LinebreakData { |
| 125 | let cp = codepoint as usize; |
| 126 | let lb = if cp < 0x800 { |
| 127 | LINEBREAK_1_2[cp] |
| 128 | } else if cp < 0x10000 { |
| 129 | let child = LINEBREAK_3_ROOT[cp >> 6]; |
| 130 | LINEBREAK_3_CHILD[(child as usize) * 0x40 + (cp & 0x3f)] |
| 131 | } else { |
| 132 | let mid = LINEBREAK_4_ROOT[cp >> 12]; |
| 133 | let leaf = LINEBREAK_4_MID[(mid as usize) * 0x40 + ((cp >> 6) & 0x3f)]; |
| 134 | LINEBREAK_4_LEAVES[(leaf as usize) * 0x40 + (cp & 0x3f)] |
| 135 | }; |
| 136 | let i = (self.state as usize) * N_LINEBREAK_CATEGORIES + (lb as usize); |
| 137 | let new = LINEBREAK_STATE_MACHINE[i]; |
| 138 | if (new as i8) < 0 { |
| 139 | self.state = new & 0x3f; |
| 140 | if new >= 0xc0 { |
| 141 | LINEBREAK_HARD |
| 142 | } else { |
| 143 | LINEBREAK_SOFT |
| 144 | } |
| 145 | } else { |
| 146 | self.state = new; |
| 147 | LINEBREAK_NONE |
| 148 | } |
| 149 | } |
| 150 | } |
| 151 | |
/// Per-character flags that assist layout: whitespace, control, and missing-from-font.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct CharacterData {
    bits: u8,
}

impl CharacterData {
    const WHITESPACE: u8 = 1 << 0;
    const CONTROL: u8 = 1 << 1;
    const MISSING: u8 = 1 << 2;

    /// Classifies a character given its index in the font.
    ///
    /// An `index` of 0 marks the character as missing. Tab, line feed, form feed, carriage return
    /// and space are marked as whitespace. U+0000 through U+001F and U+007F are marked as control
    /// characters. A character may carry several flags at once.
    pub fn classify(c: char, index: u16) -> CharacterData {
        let flag = |set: bool, bit: u8| if set { bit } else { 0 };
        let bits = flag(index == 0, Self::MISSING)
            | flag(c.is_ascii_whitespace(), Self::WHITESPACE)
            | flag(c.is_ascii_control(), Self::CONTROL);
        CharacterData { bits }
    }

    /// Tests whether `flag` is set.
    fn has(&self, flag: u8) -> bool {
        self.bits & flag != 0
    }

    /// A heuristic for whether the glyph this was classified from should be rasterized. Returns
    /// false for missing glyphs, whitespace, and control characters, i.e. whenever any flag is
    /// set.
    pub fn rasterize(&self) -> bool {
        self.bits == 0
    }

    /// Marks if the character is an ASCII whitespace character.
    pub fn is_whitespace(&self) -> bool {
        self.has(Self::WHITESPACE)
    }

    /// Marks if the character is an ASCII control character.
    pub fn is_control(&self) -> bool {
        self.has(Self::CONTROL)
    }

    /// Marks if the character is missing from its associated font.
    pub fn is_missing(&self) -> bool {
        self.has(Self::MISSING)
    }
}
| 203 | |