| 1 | /*! | 
| 2 | # UTF-8 Width | 
|---|
| 3 |  | 
|---|
| 4 | Determines the width (in bytes) of a UTF-8 character from its first byte. | 
|---|
| 5 |  | 
|---|
| 6 | References: https://tools.ietf.org/html/rfc3629 | 
|---|
| 7 |  | 
|---|
| 8 | ## Examples | 
|---|
| 9 |  | 
|---|
| 10 | ```rust | 
|---|
| 11 | assert_eq!(1, utf8_width::get_width( b'1')); | 
|---|
| 12 | assert_eq!(3, utf8_width::get_width( "中".as_bytes()[0])); | 
|---|
| 13 | ``` | 
|---|
| 14 |  | 
|---|
| 15 | ## Benchmark | 
|---|
| 16 |  | 
|---|
| 17 | ```bash | 
|---|
| 18 | cargo bench | 
|---|
| 19 | ``` | 
|---|
| 20 | */ | 
|---|
| 21 |  | 
|---|
| 22 | #![ no_std] | 
|---|
| 23 |  | 
|---|
// Inclusive first-byte ranges, grouped by the sequence width they announce.
// The two `*_0_*` ranges are the bytes that `is_width_0` reports as invalid.

/// Smallest first byte of a 1-byte character.
pub const MIN_1: u8 = 0x00;
/// Largest first byte of a 1-byte character.
pub const MAX_1: u8 = 0x7F;

/// Smallest first byte of a 2-byte character.
pub const MIN_2: u8 = 0xC2;
/// Largest first byte of a 2-byte character.
pub const MAX_2: u8 = 0xDF;

/// Smallest first byte of a 3-byte character.
pub const MIN_3: u8 = 0xE0;
/// Largest first byte of a 3-byte character.
pub const MAX_3: u8 = 0xEF;

/// Smallest first byte of a 4-byte character.
pub const MIN_4: u8 = 0xF0;
/// Largest first byte of a 4-byte character.
pub const MAX_4: u8 = 0xF4;

/// Start of the first invalid first-byte range (`0x80..=0xC1`), which sits
/// between the 1-byte and 2-byte ranges.
pub const MIN_0_1: u8 = 0x80;
/// End of the first invalid first-byte range (`0x80..=0xC1`).
pub const MAX_0_1: u8 = 0xC1;

/// Start of the second invalid first-byte range (`0xF5..=0xFF`), which lies
/// above the 4-byte range.
pub const MIN_0_2: u8 = 0xF5;
/// End of the second invalid first-byte range (`0xF5..=0xFF`).
pub const MAX_0_2: u8 = 0xFF;
|---|
| 36 |  | 
|---|
| 37 | #[ inline] | 
|---|
| 38 | pub const fn is_width_1(byte: u8) -> bool { | 
|---|
| 39 | byte <= MAX_1 // no need to check `MIN_1 <= byte` | 
|---|
| 40 | } | 
|---|
| 41 |  | 
|---|
| 42 | #[ inline] | 
|---|
| 43 | pub const fn is_width_2(byte: u8) -> bool { | 
|---|
| 44 | byte >= MIN_2 && byte <= MAX_2 | 
|---|
| 45 | } | 
|---|
| 46 |  | 
|---|
| 47 | #[ inline] | 
|---|
| 48 | pub const fn is_width_3(byte: u8) -> bool { | 
|---|
| 49 | byte >= MIN_3 && byte <= MAX_3 | 
|---|
| 50 | } | 
|---|
| 51 |  | 
|---|
| 52 | #[ inline] | 
|---|
| 53 | pub const fn is_width_4(byte: u8) -> bool { | 
|---|
| 54 | byte >= MIN_4 && byte <= MAX_4 | 
|---|
| 55 | } | 
|---|
| 56 |  | 
|---|
| 57 | #[ inline] | 
|---|
| 58 | pub const fn is_width_0(byte: u8) -> bool { | 
|---|
| 59 | byte >= MIN_0_1 && byte <= MAX_0_1 || MIN_0_2 <= byte // no need to check `byte <= MAX_0_2` | 
|---|
| 60 | } | 
|---|
| 61 |  | 
|---|
| 62 | /// Given a first byte, determine how many bytes are in this UTF-8 character. If the UTF-8 character is invalid, return `0`; otherwise, return `1` to `4`. | 
|---|
| 63 | #[ inline] | 
|---|
| 64 | pub const fn get_width(byte: u8) -> usize { | 
|---|
| 65 | if is_width_1(byte) { | 
|---|
| 66 | 1 | 
|---|
| 67 | } else if is_width_2(byte) { | 
|---|
| 68 | 2 | 
|---|
| 69 | } else if byte <= MAX_3 { | 
|---|
| 70 | // no need to check `MIN_3 <= byte` | 
|---|
| 71 | 3 | 
|---|
| 72 | } else if byte <= MAX_4 { | 
|---|
| 73 | // no need to check `MIN_4 <= byte` | 
|---|
| 74 | 4 | 
|---|
| 75 | } else { | 
|---|
| 76 | 0 | 
|---|
| 77 | } | 
|---|
| 78 | } | 
|---|
| 79 |  | 
|---|
| 80 | /// *Assuming the input first byte is from a valid UTF-8 character*, determine how many bytes are in this UTF-8 character. It returns `1` to `4`. | 
|---|
| 81 | /// | 
|---|
| 82 | /// # Safety | 
|---|
| 83 | /// | 
|---|
| 84 | /// You must ensure that the input byte is a valid UTF-8 first byte on your own. | 
|---|
| 85 | #[ inline] | 
|---|
| 86 | pub const unsafe fn get_width_assume_valid(byte: u8) -> usize { | 
|---|
| 87 | if byte <= MAX_1 { | 
|---|
| 88 | 1 | 
|---|
| 89 | } else if byte <= MAX_2 { | 
|---|
| 90 | 2 | 
|---|
| 91 | } else if byte <= MAX_3 { | 
|---|
| 92 | 3 | 
|---|
| 93 | } else { | 
|---|
| 94 | 4 | 
|---|
| 95 | } | 
|---|
| 96 | } | 
|---|
| 97 |  | 
|---|