core.rs - Codebrowser

/// Compute the display width of `text`, measured in terminal columns.
///
/// The width is the sum of the widths of the individual `char`s in `text`, with
/// escape sequences and control characters excluded as described below.
///
/// **Note:** When the `unicode` Cargo feature is disabled, every `char` is presumed to take up
/// 1 column. With the feature enabled, the function will correctly deal with
/// [combining characters] in their decomposed form (see [Unicode equivalence]).
///
/// An example of a decomposed character is “é”, which can be decomposed into “e” followed by a
/// combining acute accent: “◌́”. Without the `unicode` Cargo feature, every `char` has a width of
/// 1. This includes the combining accent, so the decomposed form is then measured as 2 columns.
///
/// # Escape Sequences and Control Characters
///
/// ANSI CSI sequences, i.e. ESC (U+001B) followed by `[`, any number of parameter
/// bytes, and a final byte in the range `0x40..=0x7E`, contribute nothing to the
/// width. This covers the color codes such as `ESC[31m` as well as sequences with
/// other final bytes such as `ESC[2K`. A sequence that is never terminated hides
/// the rest of `text`.
///
/// All other ASCII control characters (tab, newline, a lone ESC, ...) are counted
/// as zero columns wide and do not affect the characters that follow them.
///
/// # Emojis and CJK Characters
///
/// Characters such as emojis and [CJK characters] used in the
/// Chinese, Japanese, and Korean languages are seen as double-width
/// when the `unicode` feature is enabled. Without the feature they
/// are counted as 1 column, like every other `char`.
///
/// # Limitations
///
/// The displayed width of a string cannot always be computed from the
/// string alone. This is because the width depends on the rendering
/// engine used. This is particularly visible with [emoji modifier
/// sequences] where a base emoji is modified with, e.g., skin tone or
/// hair color modifiers. It is up to the rendering engine to detect
/// this and to produce a suitable emoji.
///
/// A simple example is “❤️”, which consists of “❤” (U+2764: Black
/// Heart Symbol) followed by U+FE0F (Variation Selector-16). By
/// itself, “❤” is a black heart, but if you follow it with the
/// variant selector, you may get a wider red heart.
///
/// A more complex example would be “👨‍🦰” which should depict a man
/// with red hair. Here the computed width is too large, and the
/// width differs depending on the use of the `unicode` feature.
///
/// This happens because the grapheme consists of three code points
/// that are measured one by one: “👨” (U+1F468: Man), Zero Width
/// Joiner (U+200D), and “🦰” (U+1F9B0: Red Hair). With `unicode`
/// enabled, the ZWJ is correctly seen as having zero width; without
/// it, each of the three code points is counted as 1 column.
///
/// ## Terminal Support
///
/// Modern browsers typically do a great job at combining characters
/// as shown above, but terminals often struggle more. As an example,
/// Gnome Terminal version 3.38.1 shows “❤️” as a big red heart, but
/// shows “👨‍🦰” as “👨🦰”.
///
/// [combining characters]: https://en.wikipedia.org/wiki/Combining_character
/// [Unicode equivalence]: https://en.wikipedia.org/wiki/Unicode_equivalence
/// [CJK characters]: https://en.wikipedia.org/wiki/CJK_characters
/// [emoji modifier sequences]: https://unicode.org/emoji/charts/full-emoji-modifiers.html
#[inline(never)]
pub(crate) fn display_width(text: &str) -> usize {
    const ESC: char = '\u{1b}';

    let mut width = 0;
    let mut chars = text.chars();

    // `while let` rather than `for`: the loop body advances `chars` itself
    // when it has to step over an escape sequence.
    while let Some(ch) = chars.next() {
        if ch == ESC && chars.as_str().starts_with('[') {
            // Consume the `[` first: it lies inside the final-byte range and
            // would otherwise end the sequence immediately.
            chars.next();
            for param in chars.by_ref() {
                if matches!(param, '\u{40}'..='\u{7e}') {
                    break;
                }
            }
        } else if !ch.is_ascii_control() {
            width += ch_width(ch);
        }
        // Remaining case: an ASCII control character outside a CSI sequence.
        // It occupies no column, and must not hide the text that follows.
    }

    width
}

/// Width of a single `char` in columns, as reported by the `unicode-width` crate.
///
/// Characters for which the crate reports no width are counted as zero columns.
#[cfg(feature = "unicode")]
fn ch_width(ch: char) -> usize {
    unicode_width::UnicodeWidthChar::width(ch).unwrap_or(0)
}

/// Fallback used when the `unicode` feature is disabled: every `char` is
/// counted as exactly one column.
#[cfg(not(feature = "unicode"))]
fn ch_width(_: char) -> usize {
    1
}
85
#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(feature = "unicode")]
    use unicode_width::UnicodeWidthChar;

    /// Checks the per-`char` width of every emoji code point up to the end of
    /// the Supplementary Ideographic Plane, under both feature configurations.
    #[test]
    fn emojis_have_correct_width() {
        use unic_emoji_char::is_emoji;

        // Emojis in the Basic Latin (ASCII) and Latin-1 Supplement
        // blocks all have a width of 1 column. This includes
        // characters such as '#' and '©'.
        for ch in '\u{1}'..'\u{FF}' {
            if is_emoji(ch) {
                let desc = format!("{:?} U+{:04X}", ch, ch as u32);

                #[cfg(feature = "unicode")]
                assert_eq!(ch.width().unwrap(), 1, "char: {desc}");

                #[cfg(not(feature = "unicode"))]
                assert_eq!(ch_width(ch), 1, "char: {desc}");
            }
        }

        // Emojis in the remaining blocks of the Basic Multilingual
        // Plane (BMP), in the Supplementary Multilingual Plane (SMP),
        // and in the Supplementary Ideographic Plane (SIP), are all 1
        // or 2 columns wide when the `unicode` feature is enabled.
        // Without the feature, `ch_width` reports 1 column for every
        // `char`, so they are exactly 1 column wide. This includes all
        // of our favorite emojis such as 😊.
        for ch in '\u{FF}'..'\u{2FFFF}' {
            if is_emoji(ch) {
                let desc = format!("{:?} U+{:04X}", ch, ch as u32);

                #[cfg(feature = "unicode")]
                assert!(ch.width().unwrap() <= 2, "char: {desc}");

                #[cfg(not(feature = "unicode"))]
                assert_eq!(ch_width(ch), 1, "char: {desc}");
            }
        }

        // The remaining planes contain almost no assigned code points
        // and thus also no emojis.
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_works() {
        assert_eq!("Café Plain".len(), 11); // “é” is two bytes
        assert_eq!(display_width("Café Plain"), 10);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_narrow_emojis() {
        assert_eq!(display_width("⁉"), 1);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_narrow_emojis_variant_selector() {
        assert_eq!(display_width("⁉\u{fe0f}"), 1);
    }

    #[test]
    #[cfg(feature = "unicode")]
    fn display_width_emojis() {
        assert_eq!(display_width("😂😭🥺🤣✨😍🙏🥰😊🔥"), 20);
    }

    // Color codes wrap the text but must not add to its width. The three
    // ASCII letters are 1 column each with or without the `unicode` feature.
    #[test]
    fn display_width_ignores_color_codes() {
        assert_eq!(display_width("\u{1b}[31mred\u{1b}[0m"), 3);
    }
}
159