1 | //! Code for efficiently counting the number of `char`s in a UTF-8 encoded |
2 | //! string. |
3 | //! |
4 | //! Broadly, UTF-8 encodes `char`s as a "leading" byte which begins the `char`, |
5 | //! followed by some number (possibly 0) of continuation bytes. |
6 | //! |
7 | //! The leading byte can have a number of bit-patterns (with the specific |
8 | //! pattern indicating how many continuation bytes follow), but the continuation |
9 | //! bytes are always in the format `0b10XX_XXXX` (where the `X`s can take any |
10 | //! value). That is, the most significant bit is set, and the second most |
11 | //! significant bit is unset. |
12 | //! |
13 | //! To count the number of characters, we can just count the number of bytes in |
14 | //! the string which are not continuation bytes, which can be done many bytes at |
15 | //! a time fairly easily. |
16 | //! |
17 | //! Note: Because the term "leading byte" can sometimes be ambiguous (for |
18 | //! example, it could also refer to the first byte of a slice), we'll often use |
19 | //! the term "non-continuation byte" to refer to these bytes in the code. |
20 | |
21 | use core::intrinsics::unlikely; |
22 | |
// Width of a `usize` in bytes. The optimized path reinterprets the string as
// a slice of `usize` words, so this is the number of bytes examined per word.
const USIZE_SIZE: usize = size_of::<usize>();
// Number of words grouped together by the inner loop of `do_count_chars` (via
// `as_chunks::<UNROLL_INNER>()`). Together with `USIZE_SIZE` it also sets the
// minimum string length for which `count_chars` uses the optimized path.
const UNROLL_INNER: usize = 4;
25 | |
26 | #[inline ] |
27 | pub(super) fn count_chars(s: &str) -> usize { |
28 | if cfg!(feature = "optimize_for_size" ) || s.len() < USIZE_SIZE * UNROLL_INNER { |
29 | // Avoid entering the optimized implementation for strings where the |
30 | // difference is not likely to matter, or where it might even be slower. |
31 | // That said, a ton of thought was not spent on the particular threshold |
32 | // here, beyond "this value seems to make sense". |
33 | char_count_general_case(s.as_bytes()) |
34 | } else { |
35 | do_count_chars(s) |
36 | } |
37 | } |
38 | |
39 | fn do_count_chars(s: &str) -> usize { |
40 | // For correctness, `CHUNK_SIZE` must be: |
41 | // |
42 | // - Less than or equal to 255, otherwise we'll overflow bytes in `counts`. |
43 | // - A multiple of `UNROLL_INNER`, otherwise our `break` inside the |
44 | // `body.chunks(CHUNK_SIZE)` loop is incorrect. |
45 | // |
46 | // For performance, `CHUNK_SIZE` should be: |
47 | // - Relatively cheap to `/` against (so some simple sum of powers of two). |
48 | // - Large enough to avoid paying for the cost of the `sum_bytes_in_usize` |
49 | // too often. |
50 | const CHUNK_SIZE: usize = 192; |
51 | |
52 | // Check the properties of `CHUNK_SIZE` and `UNROLL_INNER` that are required |
53 | // for correctness. |
54 | const _: () = assert!(CHUNK_SIZE < 256); |
55 | const _: () = assert!(CHUNK_SIZE % UNROLL_INNER == 0); |
56 | |
57 | // SAFETY: transmuting `[u8]` to `[usize]` is safe except for size |
58 | // differences which are handled by `align_to`. |
59 | let (head, body, tail) = unsafe { s.as_bytes().align_to::<usize>() }; |
60 | |
61 | // This should be quite rare, and basically exists to handle the degenerate |
62 | // cases where align_to fails (as well as miri under symbolic alignment |
63 | // mode). |
64 | // |
65 | // The `unlikely` helps discourage LLVM from inlining the body, which is |
66 | // nice, as we would rather not mark the `char_count_general_case` function |
67 | // as cold. |
68 | if unlikely(body.is_empty() || head.len() > USIZE_SIZE || tail.len() > USIZE_SIZE) { |
69 | return char_count_general_case(s.as_bytes()); |
70 | } |
71 | |
72 | let mut total = char_count_general_case(head) + char_count_general_case(tail); |
73 | // Split `body` into `CHUNK_SIZE` chunks to reduce the frequency with which |
74 | // we call `sum_bytes_in_usize`. |
75 | for chunk in body.chunks(CHUNK_SIZE) { |
76 | // We accumulate intermediate sums in `counts`, where each byte contains |
77 | // a subset of the sum of this chunk, like a `[u8; size_of::<usize>()]`. |
78 | let mut counts = 0; |
79 | |
80 | let (unrolled_chunks, remainder) = chunk.as_chunks::<UNROLL_INNER>(); |
81 | for unrolled in unrolled_chunks { |
82 | for &word in unrolled { |
83 | // Because `CHUNK_SIZE` is < 256, this addition can't cause the |
84 | // count in any of the bytes to overflow into a subsequent byte. |
85 | counts += contains_non_continuation_byte(word); |
86 | } |
87 | } |
88 | |
89 | // Sum the values in `counts` (which, again, is conceptually a `[u8; |
90 | // size_of::<usize>()]`), and accumulate the result into `total`. |
91 | total += sum_bytes_in_usize(counts); |
92 | |
93 | // If there's any data in `remainder`, then handle it. This will only |
94 | // happen for the last `chunk` in `body.chunks()` (because `CHUNK_SIZE` |
95 | // is divisible by `UNROLL_INNER`), so we explicitly break at the end |
96 | // (which seems to help LLVM out). |
97 | if !remainder.is_empty() { |
98 | // Accumulate all the data in the remainder. |
99 | let mut counts = 0; |
100 | for &word in remainder { |
101 | counts += contains_non_continuation_byte(word); |
102 | } |
103 | total += sum_bytes_in_usize(counts); |
104 | break; |
105 | } |
106 | } |
107 | total |
108 | } |
109 | |
110 | // Checks each byte of `w` to see if it contains the first byte in a UTF-8 |
111 | // sequence. Bytes in `w` which are continuation bytes are left as `0x00` (e.g. |
112 | // false), and bytes which are non-continuation bytes are left as `0x01` (e.g. |
113 | // true) |
114 | #[inline ] |
115 | fn contains_non_continuation_byte(w: usize) -> usize { |
116 | const LSB: usize = usize::repeat_u8(0x01); |
117 | ((!w >> 7) | (w >> 6)) & LSB |
118 | } |
119 | |
120 | // Morally equivalent to `values.to_ne_bytes().into_iter().sum::<usize>()`, but |
121 | // more efficient. |
122 | #[inline ] |
123 | fn sum_bytes_in_usize(values: usize) -> usize { |
124 | const LSB_SHORTS: usize = usize::repeat_u16(0x0001); |
125 | const SKIP_BYTES: usize = usize::repeat_u16(0x00ff); |
126 | |
127 | let pair_sum: usize = (values & SKIP_BYTES) + ((values >> 8) & SKIP_BYTES); |
128 | pair_sum.wrapping_mul(LSB_SHORTS) >> ((USIZE_SIZE - 2) * 8) |
129 | } |
130 | |
131 | // This is the most direct implementation of the concept of "count the number of |
132 | // bytes in the string which are not continuation bytes", and is used for the |
133 | // head and tail of the input string (the first and last item in the tuple |
134 | // returned by `slice::align_to`). |
135 | fn char_count_general_case(s: &[u8]) -> usize { |
136 | s.iter().filter(|&&byte: u8| !super::validations::utf8_is_cont_byte(byte)).count() |
137 | } |
138 | |