//! Count occurrences of a given byte, or the number of UTF-8 code points, in a
//! byte slice, fast.
//!
//! This crate has the [`count`](fn.count.html) function to count byte
//! occurrences (for example newlines) in a larger `&[u8]` slice.
6 | //! |
7 | //! For example: |
8 | //! |
9 | //! ```rust |
10 | //! assert_eq!(5, bytecount::count(b"Hello, this is the bytecount crate!" , b' ' )); |
11 | //! ``` |
12 | //! |
//! Also there is a [`num_chars`](fn.num_chars.html) function to count
//! the number of UTF-8 characters in a slice. It will work the same as
//! `str::chars().count()` for byte slices of correct UTF-8 character
//! sequences. The result will likely be off for invalid sequences,
//! although the result is guaranteed to be between `0` and
//! `[_]::len()`, inclusive.
19 | //! |
20 | //! Example: |
21 | //! |
22 | //! ```rust |
23 | //! let sequence = "Wenn ich ein Vöglein wär, flög ich zu Dir!" ; |
24 | //! assert_eq!(sequence.chars().count(), |
25 | //! bytecount::num_chars(sequence.as_bytes())); |
26 | //! ``` |
27 | //! |
//! For completeness and easy comparison, the "naive" versions of both
//! `count` and `num_chars` are provided. Those are also faster if used on
//! predominantly small strings. The
//! [`naive_count_32`](fn.naive_count_32.html) function can be faster
//! still on small strings.
33 | |
34 | #![cfg_attr (feature = "generic-simd" , feature(portable_simd))] |
35 | |
36 | #![deny (missing_docs)] |
37 | #![cfg_attr (not(feature = "runtime-dispatch-simd" ), no_std)] |
38 | |
39 | #[cfg (not(feature = "runtime-dispatch-simd" ))] |
40 | use core::mem; |
41 | #[cfg (feature = "runtime-dispatch-simd" )] |
42 | use std::mem; |
43 | |
44 | mod naive; |
45 | pub use naive::*; |
46 | mod integer_simd; |
47 | |
48 | #[cfg (any( |
49 | all( |
50 | feature = "runtime-dispatch-simd" , |
51 | any(target_arch = "x86" , target_arch = "x86_64" ) |
52 | ), |
53 | target_arch = "aarch64" , |
54 | target_arch = "wasm32" , |
55 | feature = "generic-simd" |
56 | ))] |
57 | mod simd; |
58 | |
59 | /// Count occurrences of a byte in a slice of bytes, fast |
60 | /// |
61 | /// # Examples |
62 | /// |
63 | /// ``` |
64 | /// let s = b"This is a Text with spaces" ; |
65 | /// let number_of_spaces = bytecount::count(s, b' ' ); |
66 | /// assert_eq!(number_of_spaces, 5); |
67 | /// ``` |
68 | pub fn count(haystack: &[u8], needle: u8) -> usize { |
69 | if haystack.len() >= 32 { |
70 | #[cfg (all(feature = "runtime-dispatch-simd" , target_arch = "x86_64" ))] |
71 | { |
72 | if is_x86_feature_detected!("avx2" ) { |
73 | unsafe { |
74 | return simd::x86_avx2::chunk_count(haystack, needle); |
75 | } |
76 | } |
77 | } |
78 | |
79 | #[cfg (feature = "generic-simd" )] |
80 | return simd::generic::chunk_count(haystack, needle); |
81 | } |
82 | |
83 | if haystack.len() >= 16 { |
84 | #[cfg (all( |
85 | feature = "runtime-dispatch-simd" , |
86 | any(target_arch = "x86" , target_arch = "x86_64" ), |
87 | not(feature = "generic-simd" ) |
88 | ))] |
89 | { |
90 | if is_x86_feature_detected!("sse2" ) { |
91 | unsafe { |
92 | return simd::x86_sse2::chunk_count(haystack, needle); |
93 | } |
94 | } |
95 | } |
96 | #[cfg (all(target_arch = "aarch64" , not(feature = "generic_simd" )))] |
97 | { |
98 | unsafe { |
99 | return simd::aarch64::chunk_count(haystack, needle); |
100 | } |
101 | } |
102 | |
103 | #[cfg (target_arch = "wasm32" )] |
104 | { |
105 | unsafe { |
106 | return simd::wasm::chunk_count(haystack, needle); |
107 | } |
108 | } |
109 | } |
110 | |
111 | if haystack.len() >= mem::size_of::<usize>() { |
112 | return integer_simd::chunk_count(haystack, needle); |
113 | } |
114 | |
115 | naive_count(haystack, needle) |
116 | } |
117 | |
118 | /// Count the number of UTF-8 encoded Unicode codepoints in a slice of bytes, fast |
119 | /// |
120 | /// This function is safe to use on any byte array, valid UTF-8 or not, |
121 | /// but the output is only meaningful for well-formed UTF-8. |
122 | /// |
123 | /// # Example |
124 | /// |
125 | /// ``` |
126 | /// let swordfish = "メカジキ" ; |
127 | /// let char_count = bytecount::num_chars(swordfish.as_bytes()); |
128 | /// assert_eq!(char_count, 4); |
129 | /// ``` |
130 | pub fn num_chars(utf8_chars: &[u8]) -> usize { |
131 | if utf8_chars.len() >= 32 { |
132 | #[cfg (all(feature = "runtime-dispatch-simd" , target_arch = "x86_64" ))] |
133 | { |
134 | if is_x86_feature_detected!("avx2" ) { |
135 | unsafe { |
136 | return simd::x86_avx2::chunk_num_chars(utf8_chars); |
137 | } |
138 | } |
139 | } |
140 | |
141 | #[cfg (feature = "generic-simd" )] |
142 | return simd::generic::chunk_num_chars(utf8_chars); |
143 | } |
144 | |
145 | if utf8_chars.len() >= 16 { |
146 | #[cfg (all( |
147 | feature = "runtime-dispatch-simd" , |
148 | any(target_arch = "x86" , target_arch = "x86_64" ), |
149 | not(feature = "generic-simd" ) |
150 | ))] |
151 | { |
152 | if is_x86_feature_detected!("sse2" ) { |
153 | unsafe { |
154 | return simd::x86_sse2::chunk_num_chars(utf8_chars); |
155 | } |
156 | } |
157 | } |
158 | #[cfg (all(target_arch = "aarch64" , not(feature = "generic_simd" )))] |
159 | { |
160 | unsafe { |
161 | return simd::aarch64::chunk_num_chars(utf8_chars); |
162 | } |
163 | } |
164 | |
165 | #[cfg (target_arch = "wasm32" )] |
166 | { |
167 | unsafe { |
168 | return simd::wasm::chunk_num_chars(utf8_chars); |
169 | } |
170 | } |
171 | } |
172 | |
173 | if utf8_chars.len() >= mem::size_of::<usize>() { |
174 | return integer_simd::chunk_num_chars(utf8_chars); |
175 | } |
176 | |
177 | naive_num_chars(utf8_chars) |
178 | } |
179 | |