| 1 | // Mini-benchmark for tsan VTS worst case performance |
| 2 | // Idea: |
| 3 | // 1) Spawn M + N threads (M >> N) |
| 4 | // We'll call the 'M' threads as 'garbage threads'. |
| 5 | // 2) Make sure all threads have created thus no TIDs were reused |
| 6 | // 3) Join the garbage threads |
| 7 | // 4) Do many sync operations on the remaining N threads |
| 8 | // |
| 9 | // It turns out that due to O(M+N) VTS complexity the (4) is much slower with |
| 10 | // when N is large. |
| 11 | // |
| 12 | // Some numbers: |
| 13 | // a) clang++ native O1 with n_iterations=200kk takes |
| 14 | // 5s regardless of M |
| 15 | // clang++ tsanv2 O1 with n_iterations=20kk takes |
| 16 | // 23.5s with M=200 |
| 17 | // 11.5s with M=1 |
| 18 | // i.e. tsanv2 is ~23x to ~47x slower than native, depends on M. |
| 19 | // b) g++ native O1 with n_iterations=200kk takes |
| 20 | // 5.5s regardless of M |
| 21 | // g++ tsanv1 O1 with n_iterations=2kk takes |
| 22 | // 39.5s with M=200 |
| 23 | // 20.5s with M=1 |
| 24 | // i.e. tsanv1 is ~370x to ~720x slower than native, depends on M. |
| 25 | |
| 26 | #include <assert.h> |
| 27 | #include <pthread.h> |
| 28 | #include <stdio.h> |
| 29 | #include <stdlib.h> |
| 30 | |
| 31 | class __attribute__((aligned(64))) Mutex { |
| 32 | public: |
| 33 | Mutex() { pthread_mutex_init(mutex: &m_, NULL); } |
| 34 | ~Mutex() { pthread_mutex_destroy(mutex: &m_); } |
| 35 | void Lock() { pthread_mutex_lock(mutex: &m_); } |
| 36 | void Unlock() { pthread_mutex_unlock(mutex: &m_); } |
| 37 | |
| 38 | private: |
| 39 | pthread_mutex_t m_; |
| 40 | }; |
| 41 | |
| 42 | const int kNumMutexes = 1024; |
| 43 | Mutex mutexes[kNumMutexes]; |
| 44 | |
| 45 | int n_threads, n_iterations; |
| 46 | |
| 47 | pthread_barrier_t all_threads_ready, main_threads_ready; |
| 48 | |
| 49 | void* GarbageThread(void *unused) { |
| 50 | pthread_barrier_wait(barrier: &all_threads_ready); |
| 51 | return 0; |
| 52 | } |
| 53 | |
| 54 | void *Thread(void *arg) { |
| 55 | long idx = (long)arg; |
| 56 | pthread_barrier_wait(barrier: &all_threads_ready); |
| 57 | |
| 58 | // Wait for the main thread to join the garbage threads. |
| 59 | pthread_barrier_wait(barrier: &main_threads_ready); |
| 60 | |
| 61 | printf(format: "Thread %ld go!\n" , idx); |
| 62 | int offset = idx * kNumMutexes / n_threads; |
| 63 | for (int i = 0; i < n_iterations; i++) { |
| 64 | mutexes[(offset + i) % kNumMutexes].Lock(); |
| 65 | mutexes[(offset + i) % kNumMutexes].Unlock(); |
| 66 | } |
| 67 | printf(format: "Thread %ld done\n" , idx); |
| 68 | return 0; |
| 69 | } |
| 70 | |
| 71 | int main(int argc, char **argv) { |
| 72 | int n_garbage_threads; |
| 73 | if (argc == 1) { |
| 74 | n_threads = 2; |
| 75 | n_garbage_threads = 200; |
| 76 | n_iterations = 20000000; |
| 77 | } else if (argc == 4) { |
| 78 | n_threads = atoi(nptr: argv[1]); |
| 79 | assert(n_threads > 0 && n_threads <= 32); |
| 80 | n_garbage_threads = atoi(nptr: argv[2]); |
| 81 | assert(n_garbage_threads > 0 && n_garbage_threads <= 16000); |
| 82 | n_iterations = atoi(nptr: argv[3]); |
| 83 | } else { |
| 84 | printf(format: "Usage: %s n_threads n_garbage_threads n_iterations\n" , argv[0]); |
| 85 | return 1; |
| 86 | } |
| 87 | printf(format: "%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n" , |
| 88 | __FILE__, n_threads, n_garbage_threads, n_iterations); |
| 89 | |
| 90 | pthread_barrier_init(barrier: &all_threads_ready, NULL, count: n_garbage_threads + n_threads + 1); |
| 91 | pthread_barrier_init(barrier: &main_threads_ready, NULL, count: n_threads + 1); |
| 92 | |
| 93 | pthread_t *t = new pthread_t[n_threads]; |
| 94 | { |
| 95 | pthread_t *g_t = new pthread_t[n_garbage_threads]; |
| 96 | for (int i = 0; i < n_garbage_threads; i++) { |
| 97 | int status = pthread_create(newthread: &g_t[i], attr: 0, start_routine: GarbageThread, NULL); |
| 98 | assert(status == 0); |
| 99 | } |
| 100 | for (int i = 0; i < n_threads; i++) { |
| 101 | int status = pthread_create(newthread: &t[i], attr: 0, start_routine: Thread, arg: (void*)i); |
| 102 | assert(status == 0); |
| 103 | } |
| 104 | pthread_barrier_wait(barrier: &all_threads_ready); |
| 105 | printf(format: "All threads started! Killing the garbage threads.\n" ); |
| 106 | for (int i = 0; i < n_garbage_threads; i++) { |
| 107 | pthread_join(th: g_t[i], thread_return: 0); |
| 108 | } |
| 109 | delete [] g_t; |
| 110 | } |
| 111 | printf(format: "Resuming the main threads.\n" ); |
| 112 | pthread_barrier_wait(barrier: &main_threads_ready); |
| 113 | |
| 114 | |
| 115 | for (int i = 0; i < n_threads; i++) { |
| 116 | pthread_join(th: t[i], thread_return: 0); |
| 117 | } |
| 118 | delete [] t; |
| 119 | return 0; |
| 120 | } |
| 121 | |