//! Code for efficiently counting the number of `char`s in a UTF-8 encoded
//! string.
//!
//! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`,
//! followed by some number (possibly 0) of continuation bytes.
//!
//! The leading byte can have a number of bit-patterns (with the specific
//! pattern indicating how many continuation bytes follow), but the continuation
//! bytes are always in the format `0b10XX_XXXX` (where the `X`s can take any
//! value). That is, the most significant bit is set, and the second most
//! significant bit is unset.
//!
//! To count the number of characters, we can just count the number of bytes in
//! the string which are not continuation bytes, which can be done many bytes at
//! a time fairly easily.
//!
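//! For example, the string "aé€" encodes to the bytes `[0x61, 0xC3, 0xA9,
//! 0xE2, 0x82, 0xAC]`. Only `0x61`, `0xC3`, and `0xE2` are non-continuation
//! bytes, so the string contains three `char`s.
//!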
//! Note: Because the term "leading byte" can sometimes be ambiguous (for
//! example, it could also refer to the first byte of a slice), we'll often use
//! the term "non-continuation byte" to refer to these bytes in the code.

use core::intrinsics::unlikely;

const USIZE_SIZE: usize = size_of::<usize>();
const UNROLL_INNER: usize = 4;

#[inline]
pub(super) fn count_chars(s: &str) -> usize {
    if cfg!(feature = "optimize_for_size") || s.len() < USIZE_SIZE * UNROLL_INNER {
        // Avoid entering the optimized implementation for strings where the
        // difference is not likely to matter, or where it might even be slower.
        // That said, a ton of thought was not spent on the particular threshold
        // here, beyond "this value seems to make sense".
        char_count_general_case(s.as_bytes())
    } else {
        do_count_chars(s)
    }
}

fn do_count_chars(s: &str) -> usize {
    // For correctness, `CHUNK_SIZE` must be:
    //
    // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`.
    // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the
    //   `body.chunks(CHUNK_SIZE)` loop is incorrect.
    //
    // For performance, `CHUNK_SIZE` should be:
    // - Relatively cheap to `/` against (so some simple sum of powers of two).
    // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize`
    //   too often.
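    // The value 192 = 128 + 64 below satisfies all of the above: it is a sum
    // of two powers of two, it is no larger than 255, and it is a multiple of
    // `UNROLL_INNER`.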
    const CHUNK_SIZE: usize = 192;

    // Check the properties of `CHUNK_SIZE` and `UNROLL_INNER` that are required
    // for correctness.
    const _: () = assert!(CHUNK_SIZE < 256);
    const _: () = assert!(CHUNK_SIZE % UNROLL_INNER == 0);

    // SAFETY: transmuting `[u8]` to `[usize]` is safe except for size
    // differences which are handled by `align_to`.
    let (head, body, tail) = unsafe { s.as_bytes().align_to::<usize>() };

    // This should be quite rare, and basically exists to handle the degenerate
    // cases where align_to fails (as well as miri under symbolic alignment
    // mode).
    //
    // The `unlikely` helps discourage LLVM from inlining the body, which is
    // nice, as we would rather not mark the `char_count_general_case` function
    // as cold.
    if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) {
        return char_count_general_case(s.as_bytes());
    }

    let mut total = char_count_general_case(head) + char_count_general_case(tail);
    // Split `body` into chunks of `CHUNK_SIZE` words to reduce the frequency
    // with which we call `sum_bytes_in_usize`.
    for chunk in body.chunks(CHUNK_SIZE) {
        // We accumulate intermediate sums in `counts`, where each byte contains
        // a subset of the sum of this chunk, like a `[u8; size_of::<usize>()]`.
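        // For example, a `counts` value of `0x0302_0104_0203_0102` on a 64-bit
        // target means the individual byte lanes have tallied 3, 2, 1, 4, 2, 3,
        // 1, and 2 non-continuation bytes, for a running total of 18 in this
        // chunk.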
        let mut counts = 0;

        let (unrolled_chunks, remainder) = chunk.as_chunks::<UNROLL_INNER>();
        for unrolled in unrolled_chunks {
            for &word in unrolled {
                // Because `CHUNK_SIZE` is < 256, this addition can't cause the
                // count in any of the bytes to overflow into a subsequent byte.
                counts += contains_non_continuation_byte(word);
            }
        }

        // Sum the values in `counts` (which, again, is conceptually a `[u8;
        // size_of::<usize>()]`), and accumulate the result into `total`.
        total += sum_bytes_in_usize(counts);

        // If there's any data in `remainder`, then handle it. This will only
        // happen for the last `chunk` in `body.chunks()` (because `CHUNK_SIZE`
        // is divisible by `UNROLL_INNER`), so we explicitly break at the end
        // (which seems to help LLVM out).
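        // For example, on a 64-bit target a 70-word `body` produces a single
        // 70-word chunk; its first 68 words are handled by the unrolled loop
        // above, and the final 2 words land in `remainder`.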
        if !remainder.is_empty() {
            // Accumulate all the data in the remainder.
            let mut counts = 0;
            for &word in remainder {
                counts += contains_non_continuation_byte(word);
            }
            total += sum_bytes_in_usize(counts);
            break;
        }
    }
    total
}

// Checks each byte of `w` to see whether it is the first byte of a UTF-8
// sequence. In the result, each byte of `w` that is a continuation byte
// becomes `0x00` (i.e. false), and each non-continuation byte becomes `0x01`
// (i.e. true).
#[inline]
fn contains_non_continuation_byte(w: usize) -> usize {
    const LSB: usize = usize::repeat_u8(0x01);
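    // For each byte of `w`, the low bit of the corresponding result byte is
    // (NOT bit 7) OR (bit 6) of that byte: this is 0 only when bit 7 is set
    // and bit 6 is clear, i.e. only for the `0b10XX_XXXX` continuation-byte
    // pattern. Bits shifted in from neighboring bytes only reach the upper
    // bits of each result byte, which the `& LSB` mask discards.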
    ((!w >> 7) | (w >> 6)) & LSB
}

// Morally equivalent to `values.to_ne_bytes().into_iter().map(usize::from).sum::<usize>()`,
// but more efficient.
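// This works in two steps: adjacent bytes are first summed into 16-bit lanes
// (each lane holds at most 255 + 255 = 510, so a lane cannot overflow), and
// the wrapping multiply by `LSB_SHORTS` then accumulates every 16-bit lane
// into the topmost lane, which the final shift moves down to the low bits.
// The full product can overflow `usize`, hence `wrapping_mul`; the discarded
// high bits do not affect the topmost in-range lane that we keep.
//
// For example, on a 64-bit target, `values = 0x0102_0304_0506_0708` gives
// `pair_sum = 0x0003_0007_000B_000F`, and the multiply-and-shift produces
// 3 + 7 + 11 + 15 = 36, the sum of the original eight bytes.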
#[inline]
fn sum_bytes_in_usize(values: usize) -> usize {
    const LSB_SHORTS: usize = usize::repeat_u16(0x0001);
    const SKIP_BYTES: usize = usize::repeat_u16(0x00ff);

    let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES);
    pair_sum.wrapping_mul(LSB_SHORTS) >> ((USIZE_SIZE - 2) * 8)
}

// This is the most direct implementation of the concept of "count the number of
// bytes in the string which are not continuation bytes", and is used for the
// head and tail of the input string (the first and last item in the tuple
// returned by `slice::align_to`).
fn char_count_general_case(s: &[u8]) -> usize {
    s.iter().filter(|&&byte| !super::validations::utf8_is_cont_byte(byte)).count()
}
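
// An illustrative sanity-check sketch: core's unit tests normally live in a
// separate test crate rather than in modules like this, so the `cfg(test)`
// module below is only an example of how `count_chars` could be exercised.
// It compares against a per-`char` fold rather than `chars().count()`, since
// `Chars::count` may itself be specialized to call into this module.
#[cfg(test)]
mod count_sanity_sketch {
    use super::count_chars;

    #[test]
    fn matches_naive_count() {
        // Samples mixing ASCII with 2-, 3-, and 4-byte sequences; the last one
        // is long enough to take the word-at-a-time path in `do_count_chars`.
        let samples = [
            "",
            "abc",
            "aé€",
            "𝄞 clef",
            "pchnąć w tę łódź jeża lub ośm skrzyń fig, \
             съешь же ещё этих мягких французских булок",
        ];
        for s in samples {
            let naive = s.chars().fold(0, |n, _| n + 1);
            assert_eq!(count_chars(s), naive);
        }
    }
}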