/*
 * Microbenchmark for math functions.
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 */
8
#undef _GNU_SOURCE
#define _GNU_SOURCE 1
#include <errno.h>
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include "mathlib.h"
18
#ifndef WANT_VMATH
/* Build the vector math benchmarks unless the build overrides this.  */
# define WANT_VMATH 1
#endif

/* Default number of timed measurements; the smallest time is reported.  */
#define MEASURE 60
/* Number of elements in each input array.  */
#define N 8000
/* Default number of passes over the array inside one measurement.  */
#define ITER 125

/* Trace samples read by readtrace, and how many doubles Trace holds.  */
static double *Trace;
static size_t trace_size;
/* Benchmark inputs for double and float functions.  */
static double A[N];
static float Af[N];
/* Run-time values of MEASURE and ITER (set by -m and -c in main).  */
static long measurecount = MEASURE;
static long itercount = ITER;
37
#if __aarch64__ && WANT_VMATH
/* Two doubles or four floats per vector.  */
typedef __f64x2_t v_double;
typedef __f32x4_t v_float;

#define v_double_len() 2
#define v_float_len() 4

/* Build a vector from the two consecutive doubles at P.  */
static inline v_double
v_double_load (const double *p)
{
  v_double v = {p[0], p[1]};
  return v;
}

/* Build a vector with X in both lanes.  */
static inline v_double
v_double_dup (double x)
{
  v_double v = {x, x};
  return v;
}

/* Build a vector from the four consecutive floats at P.  */
static inline v_float
v_float_load (const float *p)
{
  v_float v = {p[0], p[1], p[2], p[3]};
  return v;
}

/* Build a vector with X in all four lanes.  */
static inline v_float
v_float_dup (float x)
{
  v_float v = {x, x, x, x};
  return v;
}
#else
/* Scalar stand-ins so the vector code paths still compile: a "vector" is a
   single element.  */
typedef double v_double;
typedef float v_float;
#define v_double_len(x) 1
#define v_double_load(x) (x)[0]
#define v_double_dup(x) (x)
#define v_float_len(x) 1
#define v_float_load(x) (x)[0]
#define v_float_dup(x) (x)
#endif
81
/* Identity function for double; listed in funtab so the harness can time a
   call that does no work.  */
static double
dummy (double x)
{
  const double unchanged = x;
  return unchanged;
}
87
/* Identity function for float; listed in funtab so the harness can time a
   call that does no work.  */
static float
dummyf (float x)
{
  const float unchanged = x;
  return unchanged;
}
93
#if WANT_VMATH
#if __aarch64__
/* Identity functions with the plain vector signatures.  */
static v_double
__v_dummy (v_double x)
{
  return x;
}

static v_float
__v_dummyf (v_float x)
{
  return x;
}

#ifdef __vpcs
/* Identity functions declared with __vpcs.  */
__vpcs static v_double
__vn_dummy (v_double x)
{
  return x;
}

__vpcs static v_float
__vn_dummyf (v_float x)
{
  return x;
}

/* One-argument adapters for the two-argument __vpcs pow variants: the
   input vector is passed as both base and exponent.  */
__vpcs static v_float
xy__vn_powf (v_float x)
{
  v_float base = x, exponent = x;
  return __vn_powf (base, exponent);
}

__vpcs static v_float
xy_Z_powf (v_float x)
{
  v_float base = x, exponent = x;
  return _ZGVnN4vv_powf (base, exponent);
}

__vpcs static v_double
xy__vn_pow (v_double x)
{
  v_double base = x, exponent = x;
  return __vn_pow (base, exponent);
}

__vpcs static v_double
xy_Z_pow (v_double x)
{
  v_double base = x, exponent = x;
  return _ZGVnN2vv_pow (base, exponent);
}
#endif /* __vpcs */

/* One-argument adapters for the plain vector pow variants.  */
static v_float
xy__v_powf (v_float x)
{
  v_float base = x, exponent = x;
  return __v_powf (base, exponent);
}

static v_double
xy__v_pow (v_double x)
{
  v_double base = x, exponent = x;
  return __v_pow (base, exponent);
}
#endif /* __aarch64__ */

/* One-argument adapters for the scalar __s_pow variants.  */
static float
xy__s_powf (float x)
{
  const float base = x, exponent = x;
  return __s_powf (base, exponent);
}

static double
xy__s_pow (double x)
{
  const double base = x, exponent = x;
  return __s_pow (base, exponent);
}
#endif /* WANT_VMATH */
171
172static double
173xypow (double x)
174{
175 return pow (x: x, y: x);
176}
177
178static float
179xypowf (float x)
180{
181 return powf (x: x, y: x);
182}
183
184static double
185xpow (double x)
186{
187 return pow (x: x, y: 23.4);
188}
189
190static float
191xpowf (float x)
192{
193 return powf (x: x, y: 23.4f);
194}
195
196static double
197ypow (double x)
198{
199 return pow (x: 2.34, y: x);
200}
201
202static float
203ypowf (float x)
204{
205 return powf (x: 2.34f, y: x);
206}
207
208static float
209sincosf_wrap (float x)
210{
211 float s, c;
212 sincosf (x: x, sinx: &s, cosx: &c);
213 return s + c;
214}
215
216static const struct fun
217{
218 const char *name;
219 int prec;
220 int vec;
221 double lo;
222 double hi;
223 union
224 {
225 double (*d) (double);
226 float (*f) (float);
227 v_double (*vd) (v_double);
228 v_float (*vf) (v_float);
229#ifdef __vpcs
230 __vpcs v_double (*vnd) (v_double);
231 __vpcs v_float (*vnf) (v_float);
232#endif
233 } fun;
234} funtab[] = {
235#define D(func, lo, hi) {#func, 'd', 0, lo, hi, {.d = func}},
236#define F(func, lo, hi) {#func, 'f', 0, lo, hi, {.f = func}},
237#define VD(func, lo, hi) {#func, 'd', 'v', lo, hi, {.vd = func}},
238#define VF(func, lo, hi) {#func, 'f', 'v', lo, hi, {.vf = func}},
239#define VND(func, lo, hi) {#func, 'd', 'n', lo, hi, {.vnd = func}},
240#define VNF(func, lo, hi) {#func, 'f', 'n', lo, hi, {.vnf = func}},
241D (dummy, 1.0, 2.0)
242D (exp, -9.9, 9.9)
243D (exp, 0.5, 1.0)
244D (exp2, -9.9, 9.9)
245D (log, 0.01, 11.1)
246D (log, 0.999, 1.001)
247D (log2, 0.01, 11.1)
248D (log2, 0.999, 1.001)
249{"pow", 'd', 0, 0.01, 11.1, {.d = xypow}},
250D (xpow, 0.01, 11.1)
251D (ypow, -9.9, 9.9)
252
253F (dummyf, 1.0, 2.0)
254F (expf, -9.9, 9.9)
255F (exp2f, -9.9, 9.9)
256F (logf, 0.01, 11.1)
257F (log2f, 0.01, 11.1)
258{"powf", 'f', 0, 0.01, 11.1, {.f = xypowf}},
259F (xpowf, 0.01, 11.1)
260F (ypowf, -9.9, 9.9)
261{"sincosf", 'f', 0, 0.1, 0.7, {.f = sincosf_wrap}},
262{"sincosf", 'f', 0, 0.8, 3.1, {.f = sincosf_wrap}},
263{"sincosf", 'f', 0, -3.1, 3.1, {.f = sincosf_wrap}},
264{"sincosf", 'f', 0, 3.3, 33.3, {.f = sincosf_wrap}},
265{"sincosf", 'f', 0, 100, 1000, {.f = sincosf_wrap}},
266{"sincosf", 'f', 0, 1e6, 1e32, {.f = sincosf_wrap}},
267F (sinf, 0.1, 0.7)
268F (sinf, 0.8, 3.1)
269F (sinf, -3.1, 3.1)
270F (sinf, 3.3, 33.3)
271F (sinf, 100, 1000)
272F (sinf, 1e6, 1e32)
273F (cosf, 0.1, 0.7)
274F (cosf, 0.8, 3.1)
275F (cosf, -3.1, 3.1)
276F (cosf, 3.3, 33.3)
277F (cosf, 100, 1000)
278F (cosf, 1e6, 1e32)
279#if WANT_VMATH
280D (__s_sin, -3.1, 3.1)
281D (__s_cos, -3.1, 3.1)
282D (__s_exp, -9.9, 9.9)
283D (__s_log, 0.01, 11.1)
284{"__s_pow", 'd', 0, 0.01, 11.1, {.d = xy__s_pow}},
285F (__s_expf, -9.9, 9.9)
286F (__s_expf_1u, -9.9, 9.9)
287F (__s_exp2f, -9.9, 9.9)
288F (__s_exp2f_1u, -9.9, 9.9)
289F (__s_logf, 0.01, 11.1)
290{"__s_powf", 'f', 0, 0.01, 11.1, {.f = xy__s_powf}},
291F (__s_sinf, -3.1, 3.1)
292F (__s_cosf, -3.1, 3.1)
293#if __aarch64__
294VD (__v_dummy, 1.0, 2.0)
295VD (__v_sin, -3.1, 3.1)
296VD (__v_cos, -3.1, 3.1)
297VD (__v_exp, -9.9, 9.9)
298VD (__v_log, 0.01, 11.1)
299{"__v_pow", 'd', 'v', 0.01, 11.1, {.vd = xy__v_pow}},
300VF (__v_dummyf, 1.0, 2.0)
301VF (__v_expf, -9.9, 9.9)
302VF (__v_expf_1u, -9.9, 9.9)
303VF (__v_exp2f, -9.9, 9.9)
304VF (__v_exp2f_1u, -9.9, 9.9)
305VF (__v_logf, 0.01, 11.1)
306{"__v_powf", 'f', 'v', 0.01, 11.1, {.vf = xy__v_powf}},
307VF (__v_sinf, -3.1, 3.1)
308VF (__v_cosf, -3.1, 3.1)
309#ifdef __vpcs
310VND (__vn_dummy, 1.0, 2.0)
311VND (__vn_exp, -9.9, 9.9)
312VND (_ZGVnN2v_exp, -9.9, 9.9)
313VND (__vn_log, 0.01, 11.1)
314VND (_ZGVnN2v_log, 0.01, 11.1)
315{"__vn_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy__vn_pow}},
316{"_ZGVnN2vv_pow", 'd', 'n', 0.01, 11.1, {.vnd = xy_Z_pow}},
317VND (__vn_sin, -3.1, 3.1)
318VND (_ZGVnN2v_sin, -3.1, 3.1)
319VND (__vn_cos, -3.1, 3.1)
320VND (_ZGVnN2v_cos, -3.1, 3.1)
321VNF (__vn_dummyf, 1.0, 2.0)
322VNF (__vn_expf, -9.9, 9.9)
323VNF (_ZGVnN4v_expf, -9.9, 9.9)
324VNF (__vn_expf_1u, -9.9, 9.9)
325VNF (__vn_exp2f, -9.9, 9.9)
326VNF (_ZGVnN4v_exp2f, -9.9, 9.9)
327VNF (__vn_exp2f_1u, -9.9, 9.9)
328VNF (__vn_logf, 0.01, 11.1)
329VNF (_ZGVnN4v_logf, 0.01, 11.1)
330{"__vn_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy__vn_powf}},
331{"_ZGVnN4vv_powf", 'f', 'n', 0.01, 11.1, {.vnf = xy_Z_powf}},
332VNF (__vn_sinf, -3.1, 3.1)
333VNF (_ZGVnN4v_sinf, -3.1, 3.1)
334VNF (__vn_cosf, -3.1, 3.1)
335VNF (_ZGVnN4v_cosf, -3.1, 3.1)
336#endif
337#endif
338#endif
339{0},
340#undef F
341#undef D
342#undef VF
343#undef VD
344#undef VNF
345#undef VND
346};
347
348static void
349gen_linear (double lo, double hi)
350{
351 for (int i = 0; i < N; i++)
352 A[i] = (lo * (N - i) + hi * i) / N;
353}
354
355static void
356genf_linear (double lo, double hi)
357{
358 for (int i = 0; i < N; i++)
359 Af[i] = (float)(lo * (N - i) + hi * i) / N;
360}
361
/* Reinterpret the 64 bits of I as a double (no numeric conversion).  */
static inline double
asdouble (uint64_t i)
{
  double value;

  memcpy (&value, &i, sizeof value);
  return value;
}
372
/* State of the pseudo-random generator used by frand.  */
static uint64_t seed = 0x0123456789abcdef;

/* Advance the generator and return a pseudo-random double mapped from
   [0, 1) onto the interval starting at LO with width HI - LO.  */
static double
frand (double lo, double hi)
{
  union
  {
    uint64_t bits;
    double value;
  } unit;

  seed = 6364136223846793005ULL * seed + 1;
  /* The top 52 state bits become the mantissa of a double with biased
     exponent 0x3ff, i.e. a value in [1, 2); subtracting 1 gives [0, 1).  */
  unit.bits = seed >> 12 | 0x3ffULL << 52;
  return lo + (hi - lo) * (unit.value - 1.0);
}
381
382static void
383gen_rand (double lo, double hi)
384{
385 for (int i = 0; i < N; i++)
386 A[i] = frand (lo, hi);
387}
388
389static void
390genf_rand (double lo, double hi)
391{
392 for (int i = 0; i < N; i++)
393 Af[i] = (float)frand (lo, hi);
394}
395
396static void
397gen_trace (int index)
398{
399 for (int i = 0; i < N; i++)
400 A[i] = Trace[index + i];
401}
402
403static void
404genf_trace (int index)
405{
406 for (int i = 0; i < N; i++)
407 Af[i] = (float)Trace[index + i];
408}
409
410static void
411run_thruput (double f (double))
412{
413 for (int i = 0; i < N; i++)
414 f (A[i]);
415}
416
417static void
418runf_thruput (float f (float))
419{
420 for (int i = 0; i < N; i++)
421 f (Af[i]);
422}
423
424volatile double zero = 0;
425
426static void
427run_latency (double f (double))
428{
429 double z = zero;
430 double prev = z;
431 for (int i = 0; i < N; i++)
432 prev = f (A[i] + prev * z);
433}
434
435static void
436runf_latency (float f (float))
437{
438 float z = (float)zero;
439 float prev = z;
440 for (int i = 0; i < N; i++)
441 prev = f (Af[i] + prev * z);
442}
443
444static void
445run_v_thruput (v_double f (v_double))
446{
447 for (int i = 0; i < N; i += v_double_len ())
448 f (v_double_load (A+i));
449}
450
451static void
452runf_v_thruput (v_float f (v_float))
453{
454 for (int i = 0; i < N; i += v_float_len ())
455 f (v_float_load (Af+i));
456}
457
458static void
459run_v_latency (v_double f (v_double))
460{
461 v_double z = v_double_dup (zero);
462 v_double prev = z;
463 for (int i = 0; i < N; i += v_double_len ())
464 prev = f (v_double_load (A+i) + prev * z);
465}
466
467static void
468runf_v_latency (v_float f (v_float))
469{
470 v_float z = v_float_dup (zero);
471 v_float prev = z;
472 for (int i = 0; i < N; i += v_float_len ())
473 prev = f (v_float_load (Af+i) + prev * z);
474}
475
#ifdef __vpcs
/* Same four runners as the run_v_* family, for functions declared with
   __vpcs.  */

/* Call F on consecutive groups of A, discarding the results.  */
static void
run_vn_thruput (__vpcs v_double f (v_double))
{
  const int step = v_double_len ();
  int i = 0;

  while (i < N)
    {
      f (v_double_load (A + i));
      i += step;
    }
}

/* Call F on consecutive groups of Af, discarding the results.  */
static void
runf_vn_thruput (__vpcs v_float f (v_float))
{
  const int step = v_float_len ();
  int i = 0;

  while (i < N)
    {
      f (v_float_load (Af + i));
      i += step;
    }
}

/* Call F on consecutive groups of A, chaining each argument to the
   previous result through a multiplication by volatile-sourced zeros.  */
static void
run_vn_latency (__vpcs v_double f (v_double))
{
  const int step = v_double_len ();
  v_double z = v_double_dup (zero);
  v_double last = z;
  int i = 0;

  while (i < N)
    {
      last = f (v_double_load (A + i) + last * z);
      i += step;
    }
}

/* Call F on consecutive groups of Af, chaining each argument to the
   previous result through a multiplication by volatile-sourced zeros.  */
static void
runf_vn_latency (__vpcs v_float f (v_float))
{
  const int step = v_float_len ();
  v_float z = v_float_dup (zero);
  v_float last = z;
  int i = 0;

  while (i < N)
    {
      last = f (v_float_load (Af + i) + last * z);
      i += step;
    }
}
#endif /* __vpcs */
509
510static uint64_t
511tic (void)
512{
513 struct timespec ts;
514 if (clock_gettime (CLOCK_REALTIME, tp: &ts))
515 abort ();
516 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
517}
518
/* Time RUN (F) and leave the best result in the caller's uint64_t `dt'.
   After one untimed warm-up call, measurecount measurements are taken,
   each covering itercount calls of RUN (F); dt ends up as the smallest
   elapsed time in nanoseconds.  dt starts at -1, i.e. the largest uint64_t
   value, so any real measurement replaces it.  */
#define TIMEIT(run, f) do { \
  dt = -1; \
  run (f); /* Warm up.  */ \
  for (int j = 0; j < measurecount; j++) \
    { \
      uint64_t start = tic (); \
      for (int i = 0; i < itercount; i++) \
	run (f); \
      uint64_t elapsed = tic () - start; \
      if (elapsed < dt) \
	dt = elapsed; \
    } \
} while (0)
532
533static void
534bench1 (const struct fun *f, int type, double lo, double hi)
535{
536 uint64_t dt = 0;
537 uint64_t ns100;
538 const char *s = type == 't' ? "rthruput" : "latency";
539 int vlen = 1;
540
541 if (f->vec && f->prec == 'd')
542 vlen = v_double_len();
543 else if (f->vec && f->prec == 'f')
544 vlen = v_float_len();
545
546 if (f->prec == 'd' && type == 't' && f->vec == 0)
547 TIMEIT (run_thruput, f->fun.d);
548 else if (f->prec == 'd' && type == 'l' && f->vec == 0)
549 TIMEIT (run_latency, f->fun.d);
550 else if (f->prec == 'f' && type == 't' && f->vec == 0)
551 TIMEIT (runf_thruput, f->fun.f);
552 else if (f->prec == 'f' && type == 'l' && f->vec == 0)
553 TIMEIT (runf_latency, f->fun.f);
554 else if (f->prec == 'd' && type == 't' && f->vec == 'v')
555 TIMEIT (run_v_thruput, f->fun.vd);
556 else if (f->prec == 'd' && type == 'l' && f->vec == 'v')
557 TIMEIT (run_v_latency, f->fun.vd);
558 else if (f->prec == 'f' && type == 't' && f->vec == 'v')
559 TIMEIT (runf_v_thruput, f->fun.vf);
560 else if (f->prec == 'f' && type == 'l' && f->vec == 'v')
561 TIMEIT (runf_v_latency, f->fun.vf);
562#ifdef __vpcs
563 else if (f->prec == 'd' && type == 't' && f->vec == 'n')
564 TIMEIT (run_vn_thruput, f->fun.vnd);
565 else if (f->prec == 'd' && type == 'l' && f->vec == 'n')
566 TIMEIT (run_vn_latency, f->fun.vnd);
567 else if (f->prec == 'f' && type == 't' && f->vec == 'n')
568 TIMEIT (runf_vn_thruput, f->fun.vnf);
569 else if (f->prec == 'f' && type == 'l' && f->vec == 'n')
570 TIMEIT (runf_vn_latency, f->fun.vnf);
571#endif
572
573 if (type == 't')
574 {
575 ns100 = (100 * dt + itercount * N / 2) / (itercount * N);
576 printf (format: "%9s %8s: %4u.%02u ns/elem %10llu ns in [%g %g]\n", f->name, s,
577 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
578 (unsigned long long) dt, lo, hi);
579 }
580 else if (type == 'l')
581 {
582 ns100 = (100 * dt + itercount * N / vlen / 2) / (itercount * N / vlen);
583 printf (format: "%9s %8s: %4u.%02u ns/call %10llu ns in [%g %g]\n", f->name, s,
584 (unsigned) (ns100 / 100), (unsigned) (ns100 % 100),
585 (unsigned long long) dt, lo, hi);
586 }
587 fflush (stdout);
588}
589
590static void
591bench (const struct fun *f, double lo, double hi, int type, int gen)
592{
593 if (f->prec == 'd' && gen == 'r')
594 gen_rand (lo, hi);
595 else if (f->prec == 'd' && gen == 'l')
596 gen_linear (lo, hi);
597 else if (f->prec == 'd' && gen == 't')
598 gen_trace (index: 0);
599 else if (f->prec == 'f' && gen == 'r')
600 genf_rand (lo, hi);
601 else if (f->prec == 'f' && gen == 'l')
602 genf_linear (lo, hi);
603 else if (f->prec == 'f' && gen == 't')
604 genf_trace (index: 0);
605
606 if (gen == 't')
607 hi = trace_size / N;
608
609 if (type == 'b' || type == 't')
610 bench1 (f, type: 't', lo, hi);
611
612 if (type == 'b' || type == 'l')
613 bench1 (f, type: 'l', lo, hi);
614
615 for (int i = N; i < trace_size; i += N)
616 {
617 if (f->prec == 'd')
618 gen_trace (index: i);
619 else
620 genf_trace (index: i);
621
622 lo = i / N;
623 if (type == 'b' || type == 't')
624 bench1 (f, type: 't', lo, hi);
625
626 if (type == 'b' || type == 'l')
627 bench1 (f, type: 'l', lo, hi);
628 }
629}
630
631static void
632readtrace (const char *name)
633{
634 int n = 0;
635 FILE *f = strcmp (s1: name, s2: "-") == 0 ? stdin : fopen (filename: name, modes: "r");
636 if (!f)
637 {
638 printf (format: "opening \"%s\" failed: %m\n", name);
639 exit (status: 1);
640 }
641 for (;;)
642 {
643 if (n >= trace_size)
644 {
645 trace_size += N;
646 Trace = realloc (ptr: Trace, size: trace_size * sizeof (Trace[0]));
647 if (Trace == NULL)
648 {
649 printf (format: "out of memory\n");
650 exit (status: 1);
651 }
652 }
653 if (fscanf (stream: f, format: "%lf", Trace + n) != 1)
654 break;
655 n++;
656 }
657 if (ferror (stream: f) || n == 0)
658 {
659 printf (format: "reading \"%s\" failed: %m\n", name);
660 exit (status: 1);
661 }
662 fclose (stream: f);
663 if (n % N == 0)
664 trace_size = n;
665 for (int i = 0; n < trace_size; n++, i++)
666 Trace[n] = Trace[i];
667}
668
669static void
670usage (void)
671{
672 printf (format: "usage: ./mathbench [-g rand|linear|trace] [-t latency|thruput|both] "
673 "[-i low high] [-f tracefile] [-m measurements] [-c iterations] func "
674 "[func2 ..]\n");
675 printf (format: "func:\n");
676 printf (format: "%7s [run all benchmarks]\n", "all");
677 for (const struct fun *f = funtab; f->name; f++)
678 printf (format: "%7s [low: %g high: %g]\n", f->name, f->lo, f->hi);
679 exit (status: 1);
680}
681
682int
683main (int argc, char *argv[])
684{
685 int usergen = 0, gen = 'r', type = 'b', all = 0;
686 double lo = 0, hi = 0;
687 const char *tracefile = "-";
688
689 argv++;
690 argc--;
691 for (;;)
692 {
693 if (argc <= 0)
694 usage ();
695 if (argv[0][0] != '-')
696 break;
697 else if (argc >= 3 && strcmp (s1: argv[0], s2: "-i") == 0)
698 {
699 usergen = 1;
700 lo = strtod (nptr: argv[1], endptr: 0);
701 hi = strtod (nptr: argv[2], endptr: 0);
702 argv += 3;
703 argc -= 3;
704 }
705 else if (argc >= 2 && strcmp (s1: argv[0], s2: "-m") == 0)
706 {
707 measurecount = strtol (nptr: argv[1], endptr: 0, base: 0);
708 argv += 2;
709 argc -= 2;
710 }
711 else if (argc >= 2 && strcmp (s1: argv[0], s2: "-c") == 0)
712 {
713 itercount = strtol (nptr: argv[1], endptr: 0, base: 0);
714 argv += 2;
715 argc -= 2;
716 }
717 else if (argc >= 2 && strcmp (s1: argv[0], s2: "-g") == 0)
718 {
719 gen = argv[1][0];
720 if (strchr (s: "rlt", c: gen) == 0)
721 usage ();
722 argv += 2;
723 argc -= 2;
724 }
725 else if (argc >= 2 && strcmp (s1: argv[0], s2: "-f") == 0)
726 {
727 gen = 't'; /* -f implies -g trace. */
728 tracefile = argv[1];
729 argv += 2;
730 argc -= 2;
731 }
732 else if (argc >= 2 && strcmp (s1: argv[0], s2: "-t") == 0)
733 {
734 type = argv[1][0];
735 if (strchr (s: "ltb", c: type) == 0)
736 usage ();
737 argv += 2;
738 argc -= 2;
739 }
740 else
741 usage ();
742 }
743 if (gen == 't')
744 {
745 readtrace (name: tracefile);
746 lo = hi = 0;
747 usergen = 1;
748 }
749 while (argc > 0)
750 {
751 int found = 0;
752 all = strcmp (s1: argv[0], s2: "all") == 0;
753 for (const struct fun *f = funtab; f->name; f++)
754 if (all || strcmp (s1: argv[0], s2: f->name) == 0)
755 {
756 found = 1;
757 if (!usergen)
758 {
759 lo = f->lo;
760 hi = f->hi;
761 }
762 bench (f, lo, hi, type, gen);
763 if (usergen && !all)
764 break;
765 }
766 if (!found)
767 printf (format: "unknown function: %s\n", argv[0]);
768 argv++;
769 argc--;
770 }
771 return 0;
772}
773

/* Source: libc/AOR_v20.02/math/test/mathbench.c */