/*!
Utilities for dealing with UTF-8.

This module provides some UTF-8 related helper routines, including an
incremental decoder.
*/

/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
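///
/// For example, this sketch illustrates the intended behavior (shown as
/// plain text rather than a doctest, since this routine is crate-private):
///
/// ```text
/// assert!(is_word_byte(b'a') && is_word_byte(b'Z') && is_word_byte(b'_'));
/// assert!(!is_word_byte(b'-') && !is_word_byte(0xFF));
/// ```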
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
    const fn mkwordset() -> [bool; 256] {
        // FIXME: Use as_usize() once const functions in traits are stable.
        let mut set = [false; 256];
        set[b'_' as usize] = true;

        let mut byte = b'0';
        while byte <= b'9' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'A';
        while byte <= b'Z' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'a';
        while byte <= b'z' {
            set[byte as usize] = true;
            byte += 1;
        }
        set
    }
    const WORD: [bool; 256] = mkwordset();
    WORD[b as usize]
}

/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the given
/// byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for a
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
/// crate, then please file an issue and discuss your use case.
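///
/// For example, this sketch illustrates the expected results (shown as plain
/// text rather than a doctest, since this routine is crate-private):
///
/// ```text
/// assert_eq!(decode(b""), None);
/// assert_eq!(decode(b"abc"), Some(Ok('a')));
/// assert_eq!(decode("☃snow".as_bytes()), Some(Ok('☃')));
/// // An invalid leading byte is returned as-is.
/// assert_eq!(decode(b"\xFFabc"), Some(Err(0xFF)));
/// // So is the leading byte of a truncated multi-byte sequence.
/// assert_eq!(decode(b"\xE2\x98"), Some(Err(0xE2)));
/// ```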
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    let len = match len(bytes[0]) {
        None => return Some(Err(bytes[0])),
        Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
        Some(1) => return Some(Ok(char::from(bytes[0]))),
        Some(len) => len,
    };
    match core::str::from_utf8(&bytes[..len]) {
        Ok(s) => Some(Ok(s.chars().next().unwrap())),
        Err(_) => Some(Err(bytes[0])),
    }
}

/// Decodes the last UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the end of the given byte
/// slice, then the last byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
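///
/// For example (an illustrative sketch; shown as plain text rather than a
/// doctest, since this routine is crate-private):
///
/// ```text
/// assert_eq!(decode_last(b""), None);
/// assert_eq!(decode_last("ab☃".as_bytes()), Some(Ok('☃')));
/// // A trailing byte that ends no valid encoding is returned as-is.
/// assert_eq!(decode_last(b"ab\xFF"), Some(Err(0xFF)));
/// assert_eq!(decode_last(b"a\xE2\x98"), Some(Err(0x98)));
/// ```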
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> {
    if bytes.is_empty() {
        return None;
    }
    let mut start = bytes.len() - 1;
    let limit = bytes.len().saturating_sub(4);
    while start > limit && !is_leading_or_invalid_byte(bytes[start]) {
        start -= 1;
    }
    match decode(&bytes[start..]) {
        None => None,
        Some(Ok(ch)) => Some(Ok(ch)),
        Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])),
    }
}

/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
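///
/// For example (an illustrative sketch; shown as plain text rather than a
/// doctest, since this routine is private):
///
/// ```text
/// assert_eq!(len(b'a'), Some(1));
/// assert_eq!(len(0xE2), Some(3)); // leading byte of a 3-byte sequence
/// assert_eq!(len(0x80), None);    // continuation byte
/// assert_eq!(len(0xFF), None);    // never valid anywhere in UTF-8
/// ```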
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
    if byte <= 0x7F {
        return Some(1);
    } else if byte & 0b1100_0000 == 0b1000_0000 {
        return None;
    } else if byte <= 0b1101_1111 {
        Some(2)
    } else if byte <= 0b1110_1111 {
        Some(3)
    } else if byte <= 0b1111_0111 {
        Some(4)
    } else {
        None
    }
}

/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
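///
/// For example (an illustrative sketch; shown as plain text rather than a
/// doctest, since this routine is crate-private):
///
/// ```text
/// let snowman = "☃".as_bytes(); // [0xE2, 0x98, 0x83]
/// assert!(is_boundary(snowman, 0));
/// assert!(!is_boundary(snowman, 1));
/// assert!(!is_boundary(snowman, 2));
/// assert!(is_boundary(snowman, 3));  // the empty suffix is a boundary
/// assert!(!is_boundary(snowman, 4)); // past the end is not
/// ```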
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        // The position at the end of the bytes always represents an empty
        // string, which is a valid boundary. But anything after that doesn't
        // make much sense to call a valid boundary.
        None => i == bytes.len(),
        // Other than ASCII (where the most significant bit is never set),
        // valid starting bytes always have their most significant two bits
        // set, whereas continuation bytes never have their second most
        // significant bit set. Therefore, this only returns true when bytes[i]
        // corresponds to a byte that begins a valid UTF-8 encoding of a
        // Unicode scalar value.
        Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
    }
}

/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
    // In the ASCII case, the most significant bit is never set. The leading
    // byte of a 2/3/4-byte sequence always has the top two most significant
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
    // also returns true, since every such byte has its two most significant
    // bits set:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    (b & 0b1100_0000) != 0b1000_0000
}

/*
/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
///
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
/// value is guaranteed to be greater than `i`. (If there is no value greater
/// than `i` that fits in `usize`, then this panics.)
///
/// Generally speaking, this should only be called on `text` when it is
/// permitted to assume that it is valid UTF-8 and where either `i >=
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
///
/// NOTE: This method was used in a previous conception of iterators where we
/// specifically tried to skip over empty matches that split a codepoint by
/// simply requiring that our next search begin at the beginning of a
/// codepoint. But we ended up changing that technique to always advance by 1
/// byte and then filter out matches that split a codepoint after-the-fact.
/// Thus, we no longer use this method. But I've kept it around in case we
/// want to switch back to this approach. Its guarantees are a little subtle,
/// so I'd prefer not to rebuild it from whole cloth.
pub(crate) fn next(text: &[u8], i: usize) -> usize {
    let b = match text.get(i) {
        None => return i.checked_add(1).unwrap(),
        Some(&b) => b,
    };
    // For cases where we see an invalid UTF-8 byte, there isn't much we can
    // do other than just start at the next byte.
    let inc = len(b).unwrap_or(1);
    i.checked_add(inc).unwrap()
}
*/
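
// The module below is a small, illustrative test sketch (not part of the
// original module) that exercises the helpers above on a handful of valid,
// truncated and invalid inputs.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn decode_forward_and_backward() {
        // Multi-byte codepoints decode from either end.
        assert_eq!(decode("☃snow".as_bytes()), Some(Ok('☃')));
        assert_eq!(decode_last("snow☃".as_bytes()), Some(Ok('☃')));
        // Truncated sequences report the relevant raw byte.
        assert_eq!(decode(b"\xE2\x98"), Some(Err(0xE2)));
        assert_eq!(decode_last(b"\xE2\x98"), Some(Err(0x98)));
        // Empty input decodes to nothing.
        assert_eq!(decode(b""), None);
        assert_eq!(decode_last(b""), None);
    }

    #[test]
    fn classify_bytes_and_boundaries() {
        assert!(is_word_byte(b'z') && is_word_byte(b'7') && is_word_byte(b'_'));
        assert!(!is_word_byte(b' '));
        assert_eq!(len(0xF0), Some(4));
        assert_eq!(len(0x80), None);
        assert!(is_leading_or_invalid_byte(0xFF));
        assert!(!is_leading_or_invalid_byte(0x80));
        // Boundaries of "a☃": offsets 0, 1 and 4 are boundaries; 2 and 3 fall
        // inside the snowman's encoding.
        let bytes = "a☃".as_bytes();
        assert!(is_boundary(bytes, 0) && is_boundary(bytes, 1));
        assert!(!is_boundary(bytes, 2) && !is_boundary(bytes, 3));
        assert!(is_boundary(bytes, 4));
    }
}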