1 | // To run: |
2 | // |
3 | // cargo criterion --features criterion/real_blackbox |
4 | // |
5 | // This benchmarks each of the different libraries at several ratios of ASCII to |
6 | // non-ASCII content. There is one additional benchmark labeled "baseline" which |
7 | // just iterates over characters in a string, converting UTF-8 to 32-bit chars. |
8 | // |
9 | // Criterion will show a time in milliseconds. The non-baseline bench functions |
10 | // each make one million function calls (2 calls per character, 500K characters |
11 | // in the strings created by gen_string). The "time per call" listed in our |
12 | // readme is computed by subtracting this baseline from the other bench |
13 | // functions' time, then dividing by one million (ms -> ns). |
14 | |
15 | #![allow (clippy::needless_pass_by_value)] |
16 | |
17 | #[path = "../tests/fst/mod.rs" ] |
18 | mod fst; |
19 | #[path = "../tests/roaring/mod.rs" ] |
20 | mod roaring; |
21 | #[path = "../tests/trie/mod.rs" ] |
22 | mod trie; |
23 | |
24 | use criterion::{black_box, criterion_group, criterion_main, Criterion}; |
25 | use rand::distributions::{Bernoulli, Distribution, Uniform}; |
26 | use rand::rngs::SmallRng; |
27 | use rand::SeedableRng; |
28 | use std::time::Duration; |
29 | |
30 | fn gen_string(p_nonascii: u32) -> String { |
31 | let mut rng = SmallRng::from_seed([b'!' ; 32]); |
32 | let pick_nonascii = Bernoulli::from_ratio(p_nonascii, 100).unwrap(); |
33 | let ascii = Uniform::new_inclusive(' \0' , ' \x7f' ); |
34 | let nonascii = Uniform::new_inclusive(0x80 as char, char::MAX); |
35 | |
36 | let mut string = String::new(); |
37 | for _ in 0..500_000 { |
38 | let distribution = if pick_nonascii.sample(&mut rng) { |
39 | nonascii |
40 | } else { |
41 | ascii |
42 | }; |
43 | string.push(distribution.sample(&mut rng)); |
44 | } |
45 | |
46 | string |
47 | } |
48 | |
49 | fn bench(c: &mut Criterion, group_name: &str, string: String) { |
50 | let mut group = c.benchmark_group(group_name); |
51 | group.measurement_time(Duration::from_secs(10)); |
52 | group.bench_function("baseline" , |b| { |
53 | b.iter(|| { |
54 | for ch in string.chars() { |
55 | black_box(ch); |
56 | } |
57 | }); |
58 | }); |
59 | group.bench_function("unicode-ident" , |b| { |
60 | b.iter(|| { |
61 | for ch in string.chars() { |
62 | black_box(unicode_ident::is_xid_start(ch)); |
63 | black_box(unicode_ident::is_xid_continue(ch)); |
64 | } |
65 | }); |
66 | }); |
67 | group.bench_function("unicode-xid" , |b| { |
68 | b.iter(|| { |
69 | for ch in string.chars() { |
70 | black_box(unicode_xid::UnicodeXID::is_xid_start(ch)); |
71 | black_box(unicode_xid::UnicodeXID::is_xid_continue(ch)); |
72 | } |
73 | }); |
74 | }); |
75 | group.bench_function("ucd-trie" , |b| { |
76 | b.iter(|| { |
77 | for ch in string.chars() { |
78 | black_box(trie::XID_START.contains_char(ch)); |
79 | black_box(trie::XID_CONTINUE.contains_char(ch)); |
80 | } |
81 | }); |
82 | }); |
83 | group.bench_function("fst" , |b| { |
84 | let xid_start_fst = fst::xid_start_fst(); |
85 | let xid_continue_fst = fst::xid_continue_fst(); |
86 | b.iter(|| { |
87 | for ch in string.chars() { |
88 | let ch_bytes = (ch as u32).to_be_bytes(); |
89 | black_box(xid_start_fst.contains(ch_bytes)); |
90 | black_box(xid_continue_fst.contains(ch_bytes)); |
91 | } |
92 | }); |
93 | }); |
94 | group.bench_function("roaring" , |b| { |
95 | let xid_start_bitmap = roaring::xid_start_bitmap(); |
96 | let xid_continue_bitmap = roaring::xid_continue_bitmap(); |
97 | b.iter(|| { |
98 | for ch in string.chars() { |
99 | black_box(xid_start_bitmap.contains(ch as u32)); |
100 | black_box(xid_continue_bitmap.contains(ch as u32)); |
101 | } |
102 | }); |
103 | }); |
104 | group.finish(); |
105 | } |
106 | |
107 | fn bench0(c: &mut Criterion) { |
108 | bench(c, "0%-nonascii" , gen_string(0)); |
109 | } |
110 | |
111 | fn bench1(c: &mut Criterion) { |
112 | bench(c, "1%-nonascii" , gen_string(1)); |
113 | } |
114 | |
115 | fn bench10(c: &mut Criterion) { |
116 | bench(c, "10%-nonascii" , gen_string(10)); |
117 | } |
118 | |
119 | fn bench100(c: &mut Criterion) { |
120 | bench(c, "100%-nonascii" , gen_string(100)); |
121 | } |
122 | |
123 | criterion_group!(benches, bench0, bench1, bench10, bench100); |
124 | criterion_main!(benches); |
125 | |