lib.rs source code [crates/utf8_width/src/lib.rs]

/*!
# UTF-8 Width

Determine the width (in bytes) of a UTF-8 character from its first byte.

References: <https://tools.ietf.org/html/rfc3629>

## Examples

```rust
assert_eq!(1, utf8_width::get_width(b'1'));
assert_eq!(3, utf8_width::get_width("中".as_bytes()[0]));
```

## Benchmark

```bash
cargo bench
```
*/
21
22	#![no_std]
23
// First-byte ranges of UTF-8 as restricted by RFC 3629. Each `MIN_*`/`MAX_*`
// pair is an inclusive range; the digit after the prefix is the character
// width in bytes, and `0` marks bytes that can never start a character.

/// Lower bound of the first invalid range (`0x80..=0xC1`): continuation bytes
/// (`0x80..=0xBF`) and the overlong two-byte lead bytes `0xC0` and `0xC1`.
pub const MIN_0_1: u8 = 0x80;
/// Upper bound of the first invalid range (`0x80..=0xC1`).
pub const MAX_0_1: u8 = 0xC1;
/// Lower bound of the second invalid range (`0xF5..=0xFF`): lead bytes that
/// would encode values beyond U+10FFFF or that UTF-8 never uses.
pub const MIN_0_2: u8 = 0xF5;
/// Upper bound of the second invalid range (`0xF5..=0xFF`).
pub const MAX_0_2: u8 = 0xFF;
/// Smallest first byte of a 1-byte (ASCII) character.
pub const MIN_1: u8 = 0x00;
/// Largest first byte of a 1-byte (ASCII) character.
pub const MAX_1: u8 = 0x7F;
/// Smallest first byte of a 2-byte character.
pub const MIN_2: u8 = 0xC2;
/// Largest first byte of a 2-byte character.
pub const MAX_2: u8 = 0xDF;
/// Smallest first byte of a 3-byte character.
pub const MIN_3: u8 = 0xE0;
/// Largest first byte of a 3-byte character.
pub const MAX_3: u8 = 0xEF;
/// Smallest first byte of a 4-byte character.
pub const MIN_4: u8 = 0xF0;
/// Largest first byte of a 4-byte character.
pub const MAX_4: u8 = 0xF4;
36
37	#[inline]
38	pub const fn is_width_1(byte: u8) -> bool {
39	byte <= MAX_1 // no need to check `MIN_1 <= byte`
40	}
41
42	#[inline]
43	pub const fn is_width_2(byte: u8) -> bool {
44	byte >= MIN_2 && byte <= MAX_2
45	}
46
47	#[inline]
48	pub const fn is_width_3(byte: u8) -> bool {
49	byte >= MIN_3 && byte <= MAX_3
50	}
51
52	#[inline]
53	pub const fn is_width_4(byte: u8) -> bool {
54	byte >= MIN_4 && byte <= MAX_4
55	}
56
57	#[inline]
58	pub const fn is_width_0(byte: u8) -> bool {
59	byte >= MIN_0_1 && byte <= MAX_0_1 \|\| MIN_0_2 <= byte // no need to check `byte <= MAX_0_2`
60	}
61
/// Given a first byte, determine how many bytes are in this UTF-8 character.
///
/// Returns `1` to `4` when `byte` is a valid first byte, and `0` when it is
/// not: continuation bytes and overlong lead bytes (`0x80..=0xC1`) as well as
/// lead bytes beyond the Unicode range (`0xF5..=0xFF`).
#[inline]
pub const fn get_width(byte: u8) -> usize {
    // The arms mirror the `MIN_n..=MAX_n` constants. Each range is matched with
    // both of its bounds so that `0x80..=0xC1` cannot fall through into the
    // 3-byte arm, which an upper-bound-only comparison would allow.
    match byte {
        0x00..=0x7F => 1,
        0xC2..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xF4 => 4,
        // 0x80..=0xC1 and 0xF5..=0xFF never start a character.
        _ => 0,
    }
}
79
80	/// Assuming the input first byte is from a valid UTF-8 character, determine how many bytes are in this UTF-8 character. It returns `1` to `4`.
81	///
82	/// # Safety
83	///
84	/// You must ensure that the input byte is a valid UTF-8 first byte on your own.
85	#[inline]
86	pub const unsafe fn get_width_assume_valid(byte: u8) -> usize {
87	if byte <= MAX_1 {
88	`1`
89	} else if byte <= MAX_2 {
90	`2`
91	} else if byte <= MAX_3 {
92	`3`
93	} else {
94	`4`
95	}
96	}
97