1 | /* |
2 | * Ones' complement checksum test & benchmark |
3 | * |
4 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
5 | * See https://llvm.org/LICENSE.txt for license information. |
6 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
7 | */ |
8 | |
9 | #define _GNU_SOURCE |
10 | #include <inttypes.h> |
11 | #include <stdbool.h> |
12 | #include <stdint.h> |
13 | #include <stdio.h> |
14 | #include <stdlib.h> |
15 | #include <string.h> |
16 | #include <sys/mman.h> |
17 | #include <time.h> |
18 | #include <unistd.h> |
19 | #include "../include/networking.h" |
20 | |
21 | #if WANT_ASSERT |
22 | #undef NDEBUG |
23 | #include <assert.h> |
24 | #define Assert(exp) assert(exp) |
25 | #else |
26 | #define Assert(exp) (void) (exp) |
27 | #endif |
28 | |
29 | #ifdef __GNUC__ |
30 | #define may_alias __attribute__((__may_alias__)) |
31 | #else |
32 | #define may_alias |
33 | #endif |
34 | |
35 | #define CACHE_LINE 64 |
36 | #define ALIGN(x, y) (((x) + (y) - 1) & ~((y) - 1)) |
37 | |
38 | /* Reference implementation - do not modify! */ |
39 | static uint16_t |
40 | checksum_simple(const void *ptr, uint32_t nbytes) |
41 | { |
42 | const uint16_t *may_alias hptr = ptr; |
43 | uint64_t sum = 0;/* Need 64-bit accumulator when nbytes > 64K */ |
44 | |
45 | /* Sum all halfwords, assume misaligned accesses are handled in HW */ |
46 | for (uint32_t nhalfs = nbytes >> 1; nhalfs != 0; nhalfs--) |
47 | { |
48 | sum += *hptr++; |
49 | } |
50 | |
51 | /* Add any trailing odd byte */ |
52 | if ((nbytes & 0x01) != 0) |
53 | { |
54 | sum += *(uint8_t *) hptr; |
55 | } |
56 | |
57 | /* Fold 64-bit sum to 32 bits */ |
58 | sum = (sum & 0xffffffff) + (sum >> 32); |
59 | sum = (sum & 0xffffffff) + (sum >> 32); |
60 | Assert(sum == (uint32_t) sum); |
61 | |
62 | /* Fold 32-bit sum to 16 bits */ |
63 | sum = (sum & 0xffff) + (sum >> 16); |
64 | sum = (sum & 0xffff) + (sum >> 16); |
65 | Assert(sum == (uint16_t) sum); |
66 | |
67 | return (uint16_t) sum; |
68 | } |
69 | |
70 | static struct |
71 | { |
72 | uint16_t (*cksum_fp)(const void *, uint32_t); |
73 | const char *name; |
74 | } implementations[] = |
75 | { |
76 | { checksum_simple, "simple" }, |
77 | { __chksum, "scalar" }, |
78 | #if __arm__ |
79 | { __chksum_arm_simd, "simd" }, |
80 | #elif __aarch64__ |
81 | { __chksum_aarch64_simd, "simd" }, |
82 | #endif |
83 | { NULL, NULL} |
84 | }; |
85 | |
86 | static int |
87 | find_impl(const char *name) |
88 | { |
89 | for (int i = 0; implementations[i].name != NULL; i++) |
90 | { |
91 | if (strcmp(s1: implementations[i].name, s2: name) == 0) |
92 | { |
93 | return i; |
94 | } |
95 | } |
96 | return -1; |
97 | } |
98 | |
99 | static uint16_t (*CKSUM_FP)(const void *, uint32_t); |
100 | static volatile uint16_t SINK; |
101 | |
102 | static bool |
103 | verify(const void *data, uint32_t offset, uint32_t size) |
104 | { |
105 | |
106 | uint16_t csum_expected = checksum_simple(ptr: data, nbytes: size); |
107 | uint16_t csum_actual = CKSUM_FP(data, size); |
108 | if (csum_actual != csum_expected) |
109 | { |
110 | fprintf(stderr, format: "\nInvalid checksum for offset %u size %u: " |
111 | "actual %04x expected %04x (valid)" , |
112 | offset, size, csum_actual, csum_expected); |
113 | if (size < 65536) |
114 | { |
115 | /* Fatal error */ |
116 | exit(EXIT_FAILURE); |
117 | } |
118 | /* Else some implementations only support sizes up to 2^16 */ |
119 | return false; |
120 | } |
121 | return true; |
122 | } |
123 | |
124 | static uint64_t |
125 | clock_get_ns(void) |
126 | { |
127 | struct timespec ts; |
128 | clock_gettime(CLOCK_MONOTONIC, tp: &ts); |
129 | return ts.tv_sec * (uint64_t) 1000000000 + ts.tv_nsec; |
130 | } |
131 | |
132 | static void |
133 | benchmark(const uint8_t *base, |
134 | size_t poolsize, |
135 | uint32_t blksize, |
136 | uint32_t numops, |
137 | uint64_t cpufreq) |
138 | { |
139 | printf(format: "%11u " , (unsigned int) blksize); fflush(stdout); |
140 | |
141 | uint64_t start = clock_get_ns(); |
142 | for (uint32_t i = 0; i < numops; i ++) |
143 | { |
144 | /* Read a random value from the pool */ |
145 | uint32_t random = ((uint32_t *) base)[i % (poolsize / 4)]; |
146 | /* Generate a random starting address */ |
147 | const void *data = &base[random % (poolsize - blksize)]; |
148 | SINK = CKSUM_FP(data, blksize); |
149 | } |
150 | uint64_t end = clock_get_ns(); |
151 | |
152 | #define MEGABYTE 1000000 /* Decimal megabyte (MB) */ |
153 | uint64_t elapsed_ns = end - start; |
154 | uint64_t elapsed_ms = elapsed_ns / 1000000; |
155 | uint32_t blks_per_s = (uint32_t) ((numops / elapsed_ms) * 1000); |
156 | uint64_t accbytes = (uint64_t) numops * blksize; |
157 | printf(format: "%11ju " , (uintmax_t) ((accbytes / elapsed_ms) * 1000) / MEGABYTE); |
158 | unsigned int cyc_per_blk = cpufreq / blks_per_s; |
159 | printf(format: "%11u " , cyc_per_blk); |
160 | if (blksize != 0) |
161 | { |
162 | unsigned int cyc_per_byte = 1000 * cyc_per_blk / blksize; |
163 | printf(format: "%7u.%03u " , |
164 | cyc_per_byte / 1000, cyc_per_byte % 1000); |
165 | } |
166 | printf(format: "\n" ); |
167 | } |
168 | |
169 | int main(int argc, char *argv[]) |
170 | { |
171 | int c; |
172 | bool DUMP = false; |
173 | uint32_t IMPL = 0;/* Simple implementation */ |
174 | uint64_t CPUFREQ = 0; |
175 | uint32_t BLKSIZE = 0; |
176 | uint32_t NUMOPS = 1000000; |
177 | uint32_t POOLSIZE = 512 * 1024;/* Typical ARM L2 cache size */ |
178 | |
179 | setvbuf(stdout, NULL, _IOLBF, n: 160); |
180 | while ((c = getopt(argc: argc, argv: argv, shortopts: "b:df:i:n:p:" )) != -1) |
181 | { |
182 | switch (c) |
183 | { |
184 | case 'b' : |
185 | { |
186 | int blksize = atoi(nptr: optarg); |
187 | if (blksize < 1 || blksize > POOLSIZE / 2) |
188 | { |
189 | fprintf(stderr, format: "Invalid block size %d\n" , blksize); |
190 | exit(EXIT_FAILURE); |
191 | } |
192 | BLKSIZE = (unsigned) blksize; |
193 | break; |
194 | } |
195 | case 'd' : |
196 | DUMP = true; |
197 | break; |
198 | case 'f' : |
199 | { |
200 | int64_t cpufreq = atoll(nptr: optarg); |
201 | if (cpufreq < 1) |
202 | { |
203 | fprintf(stderr, format: "Invalid CPU frequency %" PRId64"\n" , |
204 | cpufreq); |
205 | exit(EXIT_FAILURE); |
206 | } |
207 | CPUFREQ = cpufreq; |
208 | break; |
209 | } |
210 | case 'i' : |
211 | { |
212 | int impl = find_impl(name: optarg); |
213 | if (impl < 0) |
214 | { |
215 | fprintf(stderr, format: "Invalid implementation %s\n" , optarg); |
216 | goto usage; |
217 | } |
218 | IMPL = (unsigned) impl; |
219 | break; |
220 | } |
221 | case 'n' : |
222 | { |
223 | int numops = atoi(nptr: optarg); |
224 | if (numops < 1) |
225 | { |
226 | fprintf(stderr, format: "Invalid number of operations %d\n" , numops); |
227 | exit(EXIT_FAILURE); |
228 | } |
229 | NUMOPS = (unsigned) numops; |
230 | break; |
231 | } |
232 | case 'p' : |
233 | { |
234 | int poolsize = atoi(nptr: optarg); |
235 | if (poolsize < 4096) |
236 | { |
237 | fprintf(stderr, format: "Invalid pool size %d\n" , poolsize); |
238 | exit(EXIT_FAILURE); |
239 | } |
240 | char c = optarg[strlen(s: optarg) - 1]; |
241 | if (c == 'M') |
242 | { |
243 | POOLSIZE = (unsigned) poolsize * 1024 * 1024; |
244 | } |
245 | else if (c == 'K') |
246 | { |
247 | POOLSIZE = (unsigned) poolsize * 1024; |
248 | } |
249 | else |
250 | { |
251 | POOLSIZE = (unsigned) poolsize; |
252 | } |
253 | break; |
254 | } |
255 | default : |
256 | usage : |
257 | fprintf(stderr, format: "Usage: checksum <options>\n" |
258 | "-b <blksize> Block size\n" |
259 | "-d Dump first 96 bytes of data\n" |
260 | "-f <cpufreq> CPU frequency (Hz)\n" |
261 | "-i <impl> Implementation\n" |
262 | "-n <numops> Number of operations\n" |
263 | "-p <poolsize> Pool size (K or M suffix)\n" |
264 | ); |
265 | printf(format: "Implementations:" ); |
266 | for (int i = 0; implementations[i].name != NULL; i++) |
267 | { |
268 | printf(format: " %s" , implementations[i].name); |
269 | } |
270 | printf(format: "\n" ); |
271 | exit(EXIT_FAILURE); |
272 | } |
273 | } |
274 | if (optind > argc) |
275 | { |
276 | goto usage; |
277 | } |
278 | |
279 | CKSUM_FP = implementations[IMPL].cksum_fp; |
280 | POOLSIZE = ALIGN(POOLSIZE, CACHE_LINE); |
281 | uint8_t *base = mmap(addr: 0, len: POOLSIZE, PROT_READ|PROT_WRITE, |
282 | MAP_PRIVATE|MAP_ANONYMOUS, fd: -1, offset: 0); |
283 | if (base == MAP_FAILED) |
284 | { |
285 | perror(s: "aligned_alloc" ), exit(EXIT_FAILURE); |
286 | } |
287 | for (size_t i = 0; i < POOLSIZE / 4; i++) |
288 | { |
289 | ((uint32_t *) base)[i] = rand(); |
290 | } |
291 | |
292 | printf(format: "Implementation: %s\n" , implementations[IMPL].name); |
293 | printf(format: "numops %u, poolsize " , NUMOPS); |
294 | if (POOLSIZE % (1024 * 1024) == 0) |
295 | { |
296 | printf(format: "%uMiB" , POOLSIZE / (1024 * 1024)); |
297 | } |
298 | else if (POOLSIZE % 1024 == 0) |
299 | { |
300 | printf(format: "%uKiB" , POOLSIZE / 1024); |
301 | } |
302 | else |
303 | { |
304 | printf(format: "%uB" , POOLSIZE); |
305 | } |
306 | printf(format: ", blocksize %u, CPU frequency %juMHz\n" , |
307 | BLKSIZE, (uintmax_t) (CPUFREQ / 1000000)); |
308 | #if WANT_ASSERT |
309 | printf("Warning: assertions are enabled\n" ); |
310 | #endif |
311 | |
312 | if (DUMP) |
313 | { |
314 | /* Print out first 96 bytes of data for human debugging */ |
315 | for (int i = 0; i < 96; i++) |
316 | { |
317 | if (i % 8 == 0) |
318 | printf(format: "%2u:" , i); |
319 | printf(format: " %02x" , base[i]); |
320 | if (i % 8 == 7) |
321 | printf(format: "\n" ); |
322 | } |
323 | } |
324 | |
325 | /* Verify that chosen algorithm handles all combinations of offsets and sizes */ |
326 | printf(format: "Verifying..." ); fflush(stdout); |
327 | bool success = true; |
328 | /* Check all (relevant) combinations of size and offset */ |
329 | for (int size = 0; size <= 256; size++) |
330 | { |
331 | for (int offset = 0; offset < 255; offset++) |
332 | { |
333 | /* Check at start of mapped memory */ |
334 | success &= verify(data: &base[offset], offset, size); |
335 | /* Check at end of mapped memory */ |
336 | uint8_t *p = base + POOLSIZE - (size + offset); |
337 | success &= verify(data: p, offset: (uintptr_t) p % 64, size); |
338 | } |
339 | } |
340 | /* Check increasingly larger sizes */ |
341 | for (size_t size = 1; size < POOLSIZE; size *= 2) |
342 | { |
343 | success &= verify(data: base, offset: 0, size); |
344 | } |
345 | /* Check the full size, this can detect accumulator overflows */ |
346 | success &= verify(data: base, offset: 0, size: POOLSIZE); |
347 | printf(format: "%s\n" , success ? "OK" : "failure" ); |
348 | |
349 | /* Print throughput in decimal megabyte (1000000B) per second */ |
350 | if (CPUFREQ != 0) |
351 | { |
352 | printf(format: "%11s %11s %11s %11s\n" , |
353 | "block size" , "MB/s" , "cycles/blk" , "cycles/byte" ); |
354 | } |
355 | else |
356 | { |
357 | printf(format: "%11s %11s %11s %11s\n" , |
358 | "block size" , "MB/s" , "ns/blk" , "ns/byte" ); |
359 | CPUFREQ = 1000000000; |
360 | } |
361 | if (BLKSIZE != 0) |
362 | { |
363 | benchmark(base, poolsize: POOLSIZE, blksize: BLKSIZE, numops: NUMOPS, cpufreq: CPUFREQ); |
364 | } |
365 | else |
366 | { |
367 | static const uint16_t sizes[] = |
368 | { 20, 42, 102, 250, 612, 1500, 3674, 9000, 0 }; |
369 | for (int i = 0; sizes[i] != 0; i++) |
370 | { |
371 | uint32_t numops = NUMOPS * 10000 / (40 + sizes[i]); |
372 | benchmark(base, poolsize: POOLSIZE, blksize: sizes[i], numops, cpufreq: CPUFREQ); |
373 | } |
374 | } |
375 | |
376 | if (munmap(addr: base, len: POOLSIZE) != 0) |
377 | { |
378 | perror(s: "munmap" ), exit(EXIT_FAILURE); |
379 | } |
380 | |
381 | return success ? EXIT_SUCCESS : EXIT_FAILURE; |
382 | } |
383 | |