1 | // Mini-benchmark for tsan VTS worst case performance |
2 | // Idea: |
3 | // 1) Spawn M + N threads (M >> N) |
4 | // We'll call the 'M' threads as 'garbage threads'. |
5 | // 2) Make sure all threads have created thus no TIDs were reused |
6 | // 3) Join the garbage threads |
7 | // 4) Do many sync operations on the remaining N threads |
8 | // |
9 | // It turns out that due to O(M+N) VTS complexity the (4) is much slower with |
10 | // when N is large. |
11 | // |
12 | // Some numbers: |
13 | // a) clang++ native O1 with n_iterations=200kk takes |
14 | // 5s regardless of M |
15 | // clang++ tsanv2 O1 with n_iterations=20kk takes |
16 | // 23.5s with M=200 |
17 | // 11.5s with M=1 |
18 | // i.e. tsanv2 is ~23x to ~47x slower than native, depends on M. |
19 | // b) g++ native O1 with n_iterations=200kk takes |
20 | // 5.5s regardless of M |
21 | // g++ tsanv1 O1 with n_iterations=2kk takes |
22 | // 39.5s with M=200 |
23 | // 20.5s with M=1 |
24 | // i.e. tsanv1 is ~370x to ~720x slower than native, depends on M. |
25 | |
26 | #include <assert.h> |
27 | #include <pthread.h> |
28 | #include <stdio.h> |
29 | #include <stdlib.h> |
30 | |
31 | class __attribute__((aligned(64))) Mutex { |
32 | public: |
33 | Mutex() { pthread_mutex_init(mutex: &m_, NULL); } |
34 | ~Mutex() { pthread_mutex_destroy(mutex: &m_); } |
35 | void Lock() { pthread_mutex_lock(mutex: &m_); } |
36 | void Unlock() { pthread_mutex_unlock(mutex: &m_); } |
37 | |
38 | private: |
39 | pthread_mutex_t m_; |
40 | }; |
41 | |
42 | const int kNumMutexes = 1024; |
43 | Mutex mutexes[kNumMutexes]; |
44 | |
45 | int n_threads, n_iterations; |
46 | |
47 | pthread_barrier_t all_threads_ready, main_threads_ready; |
48 | |
49 | void* GarbageThread(void *unused) { |
50 | pthread_barrier_wait(barrier: &all_threads_ready); |
51 | return 0; |
52 | } |
53 | |
54 | void *Thread(void *arg) { |
55 | long idx = (long)arg; |
56 | pthread_barrier_wait(barrier: &all_threads_ready); |
57 | |
58 | // Wait for the main thread to join the garbage threads. |
59 | pthread_barrier_wait(barrier: &main_threads_ready); |
60 | |
61 | printf(format: "Thread %ld go!\n" , idx); |
62 | int offset = idx * kNumMutexes / n_threads; |
63 | for (int i = 0; i < n_iterations; i++) { |
64 | mutexes[(offset + i) % kNumMutexes].Lock(); |
65 | mutexes[(offset + i) % kNumMutexes].Unlock(); |
66 | } |
67 | printf(format: "Thread %ld done\n" , idx); |
68 | return 0; |
69 | } |
70 | |
71 | int main(int argc, char **argv) { |
72 | int n_garbage_threads; |
73 | if (argc == 1) { |
74 | n_threads = 2; |
75 | n_garbage_threads = 200; |
76 | n_iterations = 20000000; |
77 | } else if (argc == 4) { |
78 | n_threads = atoi(nptr: argv[1]); |
79 | assert(n_threads > 0 && n_threads <= 32); |
80 | n_garbage_threads = atoi(nptr: argv[2]); |
81 | assert(n_garbage_threads > 0 && n_garbage_threads <= 16000); |
82 | n_iterations = atoi(nptr: argv[3]); |
83 | } else { |
84 | printf(format: "Usage: %s n_threads n_garbage_threads n_iterations\n" , argv[0]); |
85 | return 1; |
86 | } |
87 | printf(format: "%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n" , |
88 | __FILE__, n_threads, n_garbage_threads, n_iterations); |
89 | |
90 | pthread_barrier_init(barrier: &all_threads_ready, NULL, count: n_garbage_threads + n_threads + 1); |
91 | pthread_barrier_init(barrier: &main_threads_ready, NULL, count: n_threads + 1); |
92 | |
93 | pthread_t *t = new pthread_t[n_threads]; |
94 | { |
95 | pthread_t *g_t = new pthread_t[n_garbage_threads]; |
96 | for (int i = 0; i < n_garbage_threads; i++) { |
97 | int status = pthread_create(newthread: &g_t[i], attr: 0, start_routine: GarbageThread, NULL); |
98 | assert(status == 0); |
99 | } |
100 | for (int i = 0; i < n_threads; i++) { |
101 | int status = pthread_create(newthread: &t[i], attr: 0, start_routine: Thread, arg: (void*)i); |
102 | assert(status == 0); |
103 | } |
104 | pthread_barrier_wait(barrier: &all_threads_ready); |
105 | printf(format: "All threads started! Killing the garbage threads.\n" ); |
106 | for (int i = 0; i < n_garbage_threads; i++) { |
107 | pthread_join(th: g_t[i], thread_return: 0); |
108 | } |
109 | delete [] g_t; |
110 | } |
111 | printf(format: "Resuming the main threads.\n" ); |
112 | pthread_barrier_wait(barrier: &main_threads_ready); |
113 | |
114 | |
115 | for (int i = 0; i < n_threads; i++) { |
116 | pthread_join(th: t[i], thread_return: 0); |
117 | } |
118 | delete [] t; |
119 | return 0; |
120 | } |
121 | |