| 1 | /*! |
| 2 | # UTF-8 Width |
| 3 | |
Determine the width, in bytes, of a UTF-8 character from its first byte.
| 5 | |
| 6 | References: https://tools.ietf.org/html/rfc3629 |
| 7 | |
| 8 | ## Examples |
| 9 | |
| 10 | ```rust |
| 11 | assert_eq!(1, utf8_width::get_width(b'1' )); |
assert_eq!(3, utf8_width::get_width("中".as_bytes()[0]));
| 13 | ``` |
| 14 | |
| 15 | ## Benchmark |
| 16 | |
| 17 | ```bash |
| 18 | cargo bench |
| 19 | ``` |
| 20 | */ |
| 21 | |
| 22 | #![no_std ] |
| 23 | |
// First-byte ranges from RFC 3629, grouped by the width they announce.

/// Smallest first byte of a one-byte (ASCII) character.
pub const MIN_1: u8 = 0x00;
/// Largest first byte of a one-byte (ASCII) character.
pub const MAX_1: u8 = 0x7F;

/// Smallest first byte of a two-byte character.
pub const MIN_2: u8 = 0xC2;
/// Largest first byte of a two-byte character.
pub const MAX_2: u8 = 0xDF;

/// Smallest first byte of a three-byte character.
pub const MIN_3: u8 = 0xE0;
/// Largest first byte of a three-byte character.
pub const MAX_3: u8 = 0xEF;

/// Smallest first byte of a four-byte character.
pub const MIN_4: u8 = 0xF0;
/// Largest first byte of a four-byte character.
pub const MAX_4: u8 = 0xF4;

/// Start of the lower range of bytes that can never begin a character
/// (continuation bytes `0x80..=0xBF`, then `0xC0` and `0xC1`, which could only
/// begin an overlong two-byte encoding).
pub const MIN_0_1: u8 = 0x80;
/// End of the lower range of bytes that can never begin a character.
pub const MAX_0_1: u8 = 0xC1;

/// Start of the upper range of bytes that can never begin a character
/// (they would encode code points above U+10FFFF).
pub const MIN_0_2: u8 = 0xF5;
/// End of the upper range of bytes that can never begin a character.
pub const MAX_0_2: u8 = 0xFF;
| 36 | |
| 37 | #[inline ] |
| 38 | pub const fn is_width_1(byte: u8) -> bool { |
| 39 | byte <= MAX_1 // no need to check `MIN_1 <= byte` |
| 40 | } |
| 41 | |
| 42 | #[inline ] |
| 43 | pub const fn is_width_2(byte: u8) -> bool { |
| 44 | byte >= MIN_2 && byte <= MAX_2 |
| 45 | } |
| 46 | |
| 47 | #[inline ] |
| 48 | pub const fn is_width_3(byte: u8) -> bool { |
| 49 | byte >= MIN_3 && byte <= MAX_3 |
| 50 | } |
| 51 | |
| 52 | #[inline ] |
| 53 | pub const fn is_width_4(byte: u8) -> bool { |
| 54 | byte >= MIN_4 && byte <= MAX_4 |
| 55 | } |
| 56 | |
| 57 | #[inline ] |
| 58 | pub const fn is_width_0(byte: u8) -> bool { |
| 59 | byte >= MIN_0_1 && byte <= MAX_0_1 || MIN_0_2 <= byte // no need to check `byte <= MAX_0_2` |
| 60 | } |
| 61 | |
/// Given a first byte, determine how many bytes are in this UTF-8 character.
///
/// Returns `1` to `4` for a valid first byte and `0` for a byte that can never
/// start a character: the continuation bytes `0x80..=0xBF`, the overlong
/// starters `0xC0`/`0xC1`, and `0xF5..=0xFF`.
///
/// Every range is matched with both of its bounds. Testing only the upper bound
/// of the three-byte range would wrongly classify `0x80..=0xC1` as width `3`,
/// because those bytes also fail the one- and two-byte checks.
#[inline]
pub const fn get_width(byte: u8) -> usize {
    // Ranges follow the UTF-8 syntax in RFC 3629; they are the same values as
    // this crate's `MIN_n..=MAX_n` constants.
    match byte {
        0x00..=0x7F => 1,
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        // 0x80..=0xC1 and 0xF5..=0xFF
        _ => 0,
    }
}
| 79 | |
| 80 | /// *Assuming the input first byte is from a valid UTF-8 character*, determine how many bytes are in this UTF-8 character. It returns `1` to `4`. |
| 81 | /// |
| 82 | /// # Safety |
| 83 | /// |
| 84 | /// You must ensure that the input byte is a valid UTF-8 first byte on your own. |
| 85 | #[inline ] |
| 86 | pub const unsafe fn get_width_assume_valid(byte: u8) -> usize { |
| 87 | if byte <= MAX_1 { |
| 88 | 1 |
| 89 | } else if byte <= MAX_2 { |
| 90 | 2 |
| 91 | } else if byte <= MAX_3 { |
| 92 | 3 |
| 93 | } else { |
| 94 | 4 |
| 95 | } |
| 96 | } |
| 97 | |