1/*!
2# UTF-8 Width
3
Determines the width (in bytes) of a UTF-8 character from its first byte.
5
6References: https://tools.ietf.org/html/rfc3629
7
8## Examples
9
10```rust
11assert_eq!(1, utf8_width::get_width(b'1'));
assert_eq!(3, utf8_width::get_width("中".as_bytes()[0]));
13```
14
15## Benchmark
16
17```bash
18cargo bench
19```
20*/
21
22#![no_std]
23
// First bytes of well-formed UTF-8 characters, grouped by encoded length (RFC 3629).

/// Lowest first byte of a 1-byte (ASCII) character.
pub const MIN_1: u8 = 0x00;
/// Highest first byte of a 1-byte (ASCII) character.
pub const MAX_1: u8 = 0x7F;

/// Lowest first byte of a 2-byte character.
pub const MIN_2: u8 = 0xC2;
/// Highest first byte of a 2-byte character.
pub const MAX_2: u8 = 0xDF;

/// Lowest first byte of a 3-byte character.
pub const MIN_3: u8 = 0xE0;
/// Highest first byte of a 3-byte character.
pub const MAX_3: u8 = 0xEF;

/// Lowest first byte of a 4-byte character.
pub const MIN_4: u8 = 0xF0;
/// Highest first byte of a 4-byte character.
pub const MAX_4: u8 = 0xF4;

// Bytes that can never begin a well-formed UTF-8 character.

/// Start of the first invalid range: continuation bytes (`0x80..=0xBF`) followed by the
/// overlong 2-byte leads `0xC0` and `0xC1`.
pub const MIN_0_1: u8 = 0x80;
/// End of the first invalid range.
pub const MAX_0_1: u8 = 0xC1;

/// Start of the second invalid range: bytes RFC 3629 never allows, since they would start
/// sequences beyond U+10FFFF or longer than four bytes.
pub const MIN_0_2: u8 = 0xF5;
/// End of the second invalid range.
pub const MAX_0_2: u8 = 0xFF;
36
37#[inline]
38pub const fn is_width_1(byte: u8) -> bool {
39 byte <= MAX_1 // no need to check `MIN_1 <= byte`
40}
41
42#[inline]
43pub const fn is_width_2(byte: u8) -> bool {
44 byte >= MIN_2 && byte <= MAX_2
45}
46
47#[inline]
48pub const fn is_width_3(byte: u8) -> bool {
49 byte >= MIN_3 && byte <= MAX_3
50}
51
52#[inline]
53pub const fn is_width_4(byte: u8) -> bool {
54 byte >= MIN_4 && byte <= MAX_4
55}
56
57#[inline]
58pub const fn is_width_0(byte: u8) -> bool {
59 byte >= MIN_0_1 && byte <= MAX_0_1 || MIN_0_2 <= byte // no need to check `byte <= MAX_0_2`
60}
61
62/// Given a first byte, determine how many bytes are in this UTF-8 character. If the UTF-8 character is invalid, return `0`; otherwise, return `1` to `4`.
63#[inline]
64pub const fn get_width(byte: u8) -> usize {
65 if is_width_1(byte) {
66 1
67 } else if is_width_2(byte) {
68 2
69 } else if byte <= MAX_3 {
70 // no need to check `MIN_3 <= byte`
71 3
72 } else if byte <= MAX_4 {
73 // no need to check `MIN_4 <= byte`
74 4
75 } else {
76 0
77 }
78}
79
80/// *Assuming the input first byte is from a valid UTF-8 character*, determine how many bytes are in this UTF-8 character. It returns `1` to `4`.
81///
82/// # Safety
83///
84/// You must ensure that the input byte is a valid UTF-8 first byte on your own.
85#[inline]
86pub const unsafe fn get_width_assume_valid(byte: u8) -> usize {
87 if byte <= MAX_1 {
88 1
89 } else if byte <= MAX_2 {
90 2
91 } else if byte <= MAX_3 {
92 3
93 } else {
94 4
95 }
96}
97