/*!
Utilities for dealing with UTF-8.

This module provides some UTF-8 related helper routines, including an
incremental decoder.
*/

/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
///
/// A word byte is `_`, an ASCII digit (`0-9`) or an ASCII letter (`A-Z`,
/// `a-z`). Every other byte, including every byte `>= 0x80`, is not a word
/// byte.
///
/// This was copied from regex-syntax so that we can use it to determine the
/// starting DFA state while searching without depending on regex-syntax. The
/// definition is never going to change, so there's no maintenance/bit-rot
/// hazard here.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_word_byte(b: u8) -> bool {
    // Builds the 256-entry membership table at compile time. `while` loops
    // are used because `for` loops are not permitted in `const fn`.
    const fn mkwordset() -> [bool; 256] {
        // FIXME: Use as_usize() once const functions in traits are stable.
        let mut set = [false; 256];
        set[b'_' as usize] = true;

        let mut byte = b'0';
        while byte <= b'9' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'A';
        while byte <= b'Z' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'a';
        while byte <= b'z' {
            set[byte as usize] = true;
            byte += 1;
        }
        set
    }
    // Evaluated once at compile time, so a call is a single table lookup.
    const WORD: [bool; 256] = mkwordset();
    WORD[b as usize]
}

| 43 | /// Decodes the next UTF-8 encoded codepoint from the given byte slice. |
| 44 | /// |
| 45 | /// If no valid encoding of a codepoint exists at the beginning of the given |
| 46 | /// byte slice, then the first byte is returned instead. |
| 47 | /// |
| 48 | /// This returns `None` if and only if `bytes` is empty. |
| 49 | /// |
| 50 | /// This never panics. |
| 51 | /// |
| 52 | /// *WARNING*: This is not designed for performance. If you're looking for a |
| 53 | /// fast UTF-8 decoder, this is not it. If you feel like you need one in this |
| 54 | /// crate, then please file an issue and discuss your use case. |
| 55 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 56 | pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> { |
| 57 | if bytes.is_empty() { |
| 58 | return None; |
| 59 | } |
| 60 | let len: usize = match len(byte:bytes[0]) { |
| 61 | None => return Some(Err(bytes[0])), |
| 62 | Some(len: usize) if len > bytes.len() => return Some(Err(bytes[0])), |
| 63 | Some(1) => return Some(Ok(char::from(bytes[0]))), |
| 64 | Some(len: usize) => len, |
| 65 | }; |
| 66 | match core::str::from_utf8(&bytes[..len]) { |
| 67 | Ok(s: &str) => Some(Ok(s.chars().next().unwrap())), |
| 68 | Err(_) => Some(Err(bytes[0])), |
| 69 | } |
| 70 | } |
| 71 | |
| 72 | /// Decodes the last UTF-8 encoded codepoint from the given byte slice. |
| 73 | /// |
| 74 | /// If no valid encoding of a codepoint exists at the end of the given byte |
| 75 | /// slice, then the last byte is returned instead. |
| 76 | /// |
| 77 | /// This returns `None` if and only if `bytes` is empty. |
| 78 | #[cfg_attr (feature = "perf-inline" , inline(always))] |
| 79 | pub(crate) fn decode_last(bytes: &[u8]) -> Option<Result<char, u8>> { |
| 80 | if bytes.is_empty() { |
| 81 | return None; |
| 82 | } |
| 83 | let mut start: usize = bytes.len() - 1; |
| 84 | let limit: usize = bytes.len().saturating_sub(4); |
| 85 | while start > limit && !is_leading_or_invalid_byte(bytes[start]) { |
| 86 | start -= 1; |
| 87 | } |
| 88 | match decode(&bytes[start..]) { |
| 89 | None => None, |
| 90 | Some(Ok(ch: char)) => Some(Ok(ch)), |
| 91 | Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), |
| 92 | } |
| 93 | } |
| 94 | |
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte does not have the bit pattern of a UTF-8 leading byte
/// (that is, it is a continuation byte or is `>= 0xF8`), then this returns
/// `None`.
///
/// The classification looks only at the high bits. Bytes such as `0xC0`,
/// `0xC1` and `0xF5..=0xF7` still get a length here even though they can
/// never begin a valid sequence, so a `Some` result does not mean that a
/// valid codepoint follows.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn len(byte: u8) -> Option<usize> {
    match byte {
        // 0xxxxxxx: ASCII.
        0x00..=0x7F => Some(1),
        // 10xxxxxx: continuation byte, never a leading byte.
        0x80..=0xBF => None,
        // 110xxxxx
        0xC0..=0xDF => Some(2),
        // 1110xxxx
        0xE0..=0xEF => Some(3),
        // 11110xxx
        0xF0..=0xF7 => Some(4),
        // 11111xxx: never appears in UTF-8.
        0xF8..=0xFF => None,
    }
}

/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// The offset `bytes.len()` is a boundary. Any offset greater than
/// `bytes.len()` is not.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        // The position at the end of the bytes always represents an empty
        // string, which is a valid boundary. But anything after that doesn't
        // make much sense to call a valid boundary.
        None => i == bytes.len(),
        // Other than ASCII (where the most significant bit is never set),
        // valid starting bytes always have their most significant two bits
        // set, whereas continuation bytes never have their second most
        // significant bit set. Therefore, this only returns true when bytes[i]
        // corresponds to a byte that begins a valid UTF-8 encoding of a
        // Unicode scalar value.
        Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
    }
}

/// Returns true if and only if the given byte is either a valid leading UTF-8
/// byte, or is otherwise an invalid byte that can never appear anywhere in a
/// valid UTF-8 sequence.
///
/// Put differently, this returns false exactly for continuation bytes, i.e.,
/// bytes of the form `10xxxxxx`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn is_leading_or_invalid_byte(b: u8) -> bool {
    // In the ASCII case, the most significant bit is never set. The leading
    // byte of a 2/3/4-byte sequence always has the top two most significant
    // bits set. For bytes that can never appear anywhere in valid UTF-8, this
    // also returns true, since every such byte has its two most significant
    // bits set:
    //
    //     \xC0 :: 11000000
    //     \xC1 :: 11000001
    //     \xF5 :: 11110101
    //     \xF6 :: 11110110
    //     \xF7 :: 11110111
    //     \xF8 :: 11111000
    //     \xF9 :: 11111001
    //     \xFA :: 11111010
    //     \xFB :: 11111011
    //     \xFC :: 11111100
    //     \xFD :: 11111101
    //     \xFE :: 11111110
    //     \xFF :: 11111111
    (b & 0b1100_0000) != 0b1000_0000
}

/*
/// Returns the smallest possible index of the next valid UTF-8 sequence
/// starting after `i`.
///
/// For all inputs, including invalid UTF-8 and any value of `i`, the return
/// value is guaranteed to be greater than `i`. (If there is no value greater
/// than `i` that fits in `usize`, then this panics.)
///
/// Generally speaking, this should only be called on `text` when it is
/// permitted to assume that it is valid UTF-8 and where either `i >=
/// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence.
///
/// NOTE: This method was used in a previous conception of iterators where we
/// specifically tried to skip over empty matches that split a codepoint by
/// simply requiring that our next search begin at the beginning of a
/// codepoint. But we ended up changing that technique to always advance by 1
/// byte and then filter out matches that split a codepoint after-the-fact.
/// Thus, we no longer use this method. But I've kept it around in case we want
/// to switch back to this approach. Its guarantees are a little subtle, so I'd
/// prefer not to rebuild it from whole cloth.
pub(crate) fn next(text: &[u8], i: usize) -> usize {
    let b = match text.get(i) {
        None => return i.checked_add(1).unwrap(),
        Some(&b) => b,
    };
    // For cases where we see an invalid UTF-8 byte, there isn't much we can do
    // other than just start at the next byte.
    let inc = len(b).unwrap_or(1);
    i.checked_add(inc).unwrap()
}
*/