1/*
2 * Ones' complement checksum test & benchmark
3 *
4 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5 * See https://llvm.org/LICENSE.txt for license information.
6 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 */
8
9#define _GNU_SOURCE
10#include <inttypes.h>
11#include <stdbool.h>
12#include <stdint.h>
13#include <stdio.h>
14#include <stdlib.h>
15#include <string.h>
16#include <sys/mman.h>
17#include <time.h>
18#include <unistd.h>
19#include "../include/networking.h"
20
21#if WANT_ASSERT
22#undef NDEBUG
23#include <assert.h>
24#define Assert(exp) assert(exp)
25#else
26#define Assert(exp) (void) (exp)
27#endif
28
29#ifdef __GNUC__
30#define may_alias __attribute__((__may_alias__))
31#else
32#define may_alias
33#endif
34
35#define CACHE_LINE 64
36#define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1))
37
38/* Reference implementation - do not modify! */
39static uint16_t
40checksum_simple(const void *ptr, uint32_t nbytes)
41{
42 const uint16_t *may_alias hptr = ptr;
43 uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */
44
45 /* Sum all halfwords, assume misaligned accesses are handled in HW */
46 for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--)
47 {
48 sum += *hptr++;
49 }
50
51 /* Add any trailing odd byte */
52 if ((nbytes & 0x01) != 0)
53 {
54 sum += *(uint8_t *) hptr;
55 }
56
57 /* Fold 64-bit sum to 32 bits */
58 sum = (sum & 0xffffffff) + (sum >> 32);
59 sum = (sum & 0xffffffff) + (sum >> 32);
60 Assert(sum == (uint32_t) sum);
61
62 /* Fold 32-bit sum to 16 bits */
63 sum = (sum & 0xffff) + (sum >> 16);
64 sum = (sum & 0xffff) + (sum >> 16);
65 Assert(sum == (uint16_t) sum);
66
67 return (uint16_t) sum;
68}
69
70static struct
71{
72 uint16_t (*cksum_fp)(const void *, uint32_t);
73 const char *name;
74} implementations[] =
75{
76 { checksum_simple, "simple"},
77 { __chksum, "scalar"},
78#if __arm__
79 { __chksum_arm_simd, "simd" },
80#elif __aarch64__
81 { __chksum_aarch64_simd, "simd" },
82#endif
83 { NULL, NULL}
84};
85
86static int
87find_impl(const char *name)
88{
89 for (int i = 0; implementations[i].name != NULL; i++)
90 {
91 if (strcmp(s1: implementations[i].name, s2: name) == 0)
92 {
93 return i;
94 }
95 }
96 return -1;
97}
98
99static uint16_t (*CKSUM_FP)(const void *, uint32_t);
100static volatile uint16_t SINK;
101
102static bool
103verify(const void *data, uint32_t offset, uint32_t size)
104{
105
106 uint16_t csum_expected = checksum_simple(ptr: data, nbytes: size);
107 uint16_t csum_actual = CKSUM_FP(data, size);
108 if (csum_actual != csum_expected)
109 {
110 fprintf(stderr, format: "\nInvalid checksum for offset %u size %u: "
111 "actual %04x expected %04x (valid)",
112 offset, size, csum_actual, csum_expected);
113 if (size < 65536)
114 {
115 /* Fatal error */
116 exit(EXIT_FAILURE);
117 }
118 /* Else some implementations only support sizes up to 2^16 */
119 return false;
120 }
121 return true;
122}
123
124static uint64_t
125clock_get_ns(void)
126{
127 struct timespec ts;
128 clock_gettime(CLOCK_MONOTONIC, tp: &ts);
129 return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec;
130}
131
132static void
133benchmark(const uint8_t *base,
134 size_t poolsize,
135 uint32_t blksize,
136 uint32_t numops,
137 uint64_t cpufreq)
138{
139 printf(format: "%11u ", (unsigned int) blksize); fflush(stdout);
140
141 uint64_t start = clock_get_ns();
142 for (uint32_t i = 0; i < numops; i ++)
143 {
144 /* Read a random value from the pool */
145 uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)];
146 /* Generate a random starting address */
147 const void *data = &base[random % (poolsize - blksize)];
148 SINK = CKSUM_FP(data, blksize);
149 }
150 uint64_t end = clock_get_ns();
151
152#define MEGABYTE 1000000 /* Decimal megabyte (MB) */
153 uint64_t elapsed_ns = end - start;
154 uint64_t elapsed_ms = elapsed_ns / 1000000;
155 uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000);
156 uint64_t accbytes = (uint64_t) numops * blksize;
157 printf(format: "%11ju ", (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE);
158 unsigned int cyc_per_blk = cpufreq / blks_per_s;
159 printf(format: "%11u ", cyc_per_blk);
160 if (blksize != 0)
161 {
162 unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize;
163 printf(format: "%7u.%03u ",
164 cyc_per_byte / 1000, cyc_per_byte % 1000);
165 }
166 printf(format: "\n");
167}
168
169int main(int argc, char *argv[])
170{
171 int c;
172 bool DUMP = false;
173 uint32_t IMPL = 0;/* Simple implementation */
174 uint64_t CPUFREQ = 0;
175 uint32_t BLKSIZE = 0;
176 uint32_t NUMOPS = 1000000;
177 uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */
178
179 setvbuf(stdout, NULL, _IOLBF, n: 160);
180 while ((c = getopt(argc: argc, argv: argv, shortopts: "b:df:i:n:p:")) != -1)
181 {
182 switch (c)
183 {
184 case 'b' :
185 {
186 int blksize = atoi(nptr: optarg);
187 if (blksize < 1 || blksize > POOLSIZE / 2)
188 {
189 fprintf(stderr, format: "Invalid block size %d\n", blksize);
190 exit(EXIT_FAILURE);
191 }
192 BLKSIZE = (unsigned) blksize;
193 break;
194 }
195 case 'd' :
196 DUMP = true;
197 break;
198 case 'f' :
199 {
200 int64_t cpufreq = atoll(nptr: optarg);
201 if (cpufreq < 1)
202 {
203 fprintf(stderr, format: "Invalid CPU frequency %"PRId64"\n",
204 cpufreq);
205 exit(EXIT_FAILURE);
206 }
207 CPUFREQ = cpufreq;
208 break;
209 }
210 case 'i' :
211 {
212 int impl = find_impl(name: optarg);
213 if (impl < 0)
214 {
215 fprintf(stderr, format: "Invalid implementation %s\n", optarg);
216 goto usage;
217 }
218 IMPL = (unsigned) impl;
219 break;
220 }
221 case 'n' :
222 {
223 int numops = atoi(nptr: optarg);
224 if (numops < 1)
225 {
226 fprintf(stderr, format: "Invalid number of operations %d\n", numops);
227 exit(EXIT_FAILURE);
228 }
229 NUMOPS = (unsigned) numops;
230 break;
231 }
232 case 'p' :
233 {
234 int poolsize = atoi(nptr: optarg);
235 if (poolsize < 4096)
236 {
237 fprintf(stderr, format: "Invalid pool size %d\n", poolsize);
238 exit(EXIT_FAILURE);
239 }
240 char c = optarg[strlen(s: optarg) - 1];
241 if (c == 'M')
242 {
243 POOLSIZE = (unsigned) poolsize * 1024 * 1024;
244 }
245 else if (c == 'K')
246 {
247 POOLSIZE = (unsigned) poolsize * 1024;
248 }
249 else
250 {
251 POOLSIZE = (unsigned) poolsize;
252 }
253 break;
254 }
255 default :
256usage :
257 fprintf(stderr, format: "Usage: checksum <options>\n"
258 "-b <blksize> Block size\n"
259 "-d Dump first 96 bytes of data\n"
260 "-f <cpufreq> CPU frequency (Hz)\n"
261 "-i <impl> Implementation\n"
262 "-n <numops> Number of operations\n"
263 "-p <poolsize> Pool size (K or M suffix)\n"
264 );
265 printf(format: "Implementations:");
266 for (int i = 0; implementations[i].name != NULL; i++)
267 {
268 printf(format: " %s", implementations[i].name);
269 }
270 printf(format: "\n");
271 exit(EXIT_FAILURE);
272 }
273 }
274 if (optind > argc)
275 {
276 goto usage;
277 }
278
279 CKSUM_FP = implementations[IMPL].cksum_fp;
280 POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE);
281 uint8_t *base = mmap(addr: 0, len: POOLSIZE, PROT_READ|PROT_WRITE,
282 MAP_PRIVATE|MAP_ANONYMOUS, fd: -1, offset: 0);
283 if (base == MAP_FAILED)
284 {
285 perror(s: "aligned_alloc"), exit(EXIT_FAILURE);
286 }
287 for (size_t i = 0; i < POOLSIZE / 4; i++)
288 {
289 ((uint32_t *) base)[i] = rand();
290 }
291
292 printf(format: "Implementation: %s\n", implementations[IMPL].name);
293 printf(format: "numops %u, poolsize ", NUMOPS);
294 if (POOLSIZE % (1024 * 1024) == 0)
295 {
296 printf(format: "%uMiB", POOLSIZE / (1024 * 1024));
297 }
298 else if (POOLSIZE % 1024 == 0)
299 {
300 printf(format: "%uKiB", POOLSIZE / 1024);
301 }
302 else
303 {
304 printf(format: "%uB", POOLSIZE);
305 }
306 printf(format: ", blocksize %u, CPU frequency %juMHz\n",
307 BLKSIZE, (uintmax_t) (CPUFREQ / 1000000));
308#if WANT_ASSERT
309 printf("Warning: assertions are enabled\n");
310#endif
311
312 if (DUMP)
313 {
314 /* Print out first 96 bytes of data for human debugging */
315 for (int i = 0; i < 96; i++)
316 {
317 if (i % 8 == 0)
318 printf(format: "%2u:", i);
319 printf(format: " %02x", base[i]);
320 if (i % 8 == 7)
321 printf(format: "\n");
322 }
323 }
324
325 /* Verify that chosen algorithm handles all combinations of offsets and sizes */
326 printf(format: "Verifying..."); fflush(stdout);
327 bool success = true;
328 /* Check all (relevant) combinations of size and offset */
329 for (int size = 0; size <= 256; size++)
330 {
331 for (int offset = 0; offset < 255; offset++)
332 {
333 /* Check at start of mapped memory */
334 success &= verify(data: &base[offset], offset, size);
335 /* Check at end of mapped memory */
336 uint8_t *p = base + POOLSIZE - (size + offset);
337 success &= verify(data: p, offset: (uintptr_t) p % 64, size);
338 }
339 }
340 /* Check increasingly larger sizes */
341 for (size_t size = 1; size < POOLSIZE; size *= 2)
342 {
343 success &= verify(data: base, offset: 0, size);
344 }
345 /* Check the full size, this can detect accumulator overflows */
346 success &= verify(data: base, offset: 0, size: POOLSIZE);
347 printf(format: "%s\n", success ? "OK" : "failure");
348
349 /* Print throughput in decimal megabyte (1000000B) per second */
350 if (CPUFREQ != 0)
351 {
352 printf(format: "%11s %11s %11s %11s\n",
353 "block size", "MB/s", "cycles/blk", "cycles/byte");
354 }
355 else
356 {
357 printf(format: "%11s %11s %11s %11s\n",
358 "block size", "MB/s", "ns/blk", "ns/byte");
359 CPUFREQ = 1000000000;
360 }
361 if (BLKSIZE != 0)
362 {
363 benchmark(base, poolsize: POOLSIZE, blksize: BLKSIZE, numops: NUMOPS, cpufreq: CPUFREQ);
364 }
365 else
366 {
367 static const uint16_t sizes[] =
368 { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 };
369 for (int i = 0; sizes[i] != 0; i++)
370 {
371 uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]);
372 benchmark(base, poolsize: POOLSIZE, blksize: sizes[i], numops, cpufreq: CPUFREQ);
373 }
374 }
375
376 if (munmap(addr: base, len: POOLSIZE) != 0)
377 {
378 perror(s: "munmap"), exit(EXIT_FAILURE);
379 }
380
381 return success ? EXIT_SUCCESS : EXIT_FAILURE;
382}
383

source code of libc/AOR_v20.02/networking/test/chksum.c