dump1090-fa/starch/example/generated/benchmark.c

570 lines
20 KiB
C
Raw Normal View History

Move all converters to starch-based implementations (#97) * Switch all conversion routines to use starch. main user-visible changes: * ensure you check out submodules ('git clone --recurse-submodules") * --version shows the CPU features and DSP implementations in use * --wisdom allows overriding of the built-in architecture wisdom * --dcfilter no longer supported * "starch-benchmark" binary will benchmark all options on the current machine and can produce a wisdom file to feed to the --wisdom option If you have a usecase for --dcfilter, please get in touch and let me know - it's an edge case and for now there's no starch/DSP support for it, but support can be written if needed. In almost all cases the new conversion routines are slightly or substantially faster than the old conversion routines. The only case that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower due to changing from heavily approximated lookup tables to higher quality results (but SC16 is probably already out of reach of a Pi 0) * No need to build with SC16Q11_TABLE_BITS any more * Add oneoff/uc8_capture_stats (reads a UC8 capture; measures min/max/mean I and Q) * Switch UC8 conversion to 127.4 center, 128 range. Looking at actual UC8 captures from a RTL2832, the mean I and Q are actually at 127.4, so use that as the zero point. This means that the resulting I/Q maximum values could be as large as 127.6. Switch to 128 for simplicity. * Switch to the new UC8 zero offset in benchmarks, fix some bugs * Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements * Ditch UC8 approximation path, add a NEON VRQSQRTE path. * Tweak the SC16 exact path, add a new impl that uses a mix of u32 & floats. 
* SC16Q11 impl tweaks: * add a u32->float exact path * ditch the approximation path * add a NEON VRSQRTE path * add a 12-bit table path (using the full signed I/Q value, not absolute value) * Ditch SC16 approximation path, add NEON vrsqrte path * Add oneoff/dsp_error_measurement This runs sample input through the DSP functions that are allowed to be inexact and dumps the results as a TSV suitable for feeding to gnuplot to look at the actual errors. * Update make clean, make wisdom targets * Update wisdom based on benchmarking * Preserve the raw wisdom benchmark data * Update to latest starch * Update .gitignore for new wisdom files * Update starch generated code * Build starch-benchmark as part of the 'all' target * Use wisdom from /etc/dump1090-fa/wisdom.local if present * Package starch-benchmark and a helper script to generate local wisdom data * Remove submodules in preparation for importing them directly * Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0 * Import starch at commit a725c8491dc33a321565d451b385131e589d8490 from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
/* starch generated code. Do not edit. */
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <stdalign.h>
#include <stdbool.h>
#include <string.h>
#include <inttypes.h>
#include <time.h>
#include <unistd.h>
#include <errno.h>
#include "starch.h"
/* Timestamp type used by the timing helpers below; simply an alias for struct timespec. */
typedef struct timespec starch_benchmark_time;
/* Forward declarations for the non-static helpers that are defined further down in this file. */
uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end);
void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size);
void starch_benchmark_aligned_free(void *user_ptr);
void starch_benchmark_get_time(starch_benchmark_time *t);
/* Number of untimed calls made to each implementation before its output is verified and it is timed. */
const unsigned starch_benchmark_warmup_loops = 10;
/* One recorded measurement; main() sorts these by name, then by ns, when writing the wisdom file. */
typedef struct {
/* name of the benchmarked function (set from a string literal such as "subtract_n") */
const char *name;
/* name of the registry entry (implementation) that was timed */
const char *impl;
/* measured time per call, in nanoseconds */
uint64_t ns;
} starch_benchmark_result;
/* Growable array of results: the storage itself (grown with realloc), its allocated
 * capacity in entries, and the number of entries currently in use. */
static starch_benchmark_result *starch_benchmark_results = NULL;
static unsigned starch_benchmark_result_size = 0;
static unsigned starch_benchmark_result_count = 0;
/* Singly linked list of flavor names; used for both the -F whitelist and the -N blacklist. */
typedef struct benchmark_flavor_list_node {
/* flavor name; main() stores the getopt optarg pointer here without copying it */
const char *flavor;
/* next node, or NULL at the end of the list */
struct benchmark_flavor_list_node *next;
} starch_benchmark_flavor_list;
/* Flavors named with -F; when non-NULL, only entries whose flavor is listed are benchmarked. */
static starch_benchmark_flavor_list *starch_benchmark_flavor_whitelist = NULL;
/* Flavors named with -N; entries whose flavor is listed are skipped. */
static starch_benchmark_flavor_list *starch_benchmark_flavor_blacklist = NULL;
/* Set by -l: report which implementations would run, without timing them. */
static bool starch_benchmark_list_only = false;
/* Set by -t: write only the fastest implementation of each function to the wisdom file. */
static bool starch_benchmark_top_only = false;
/* Set by -i: number of timed runs per implementation. */
static unsigned starch_benchmark_iterations = 1;
/* Repeats the starch_benchmark_time typedef that already appears earlier in this file;
 * redeclaring an identical typedef is permitted in C11, so this is redundant but harmless. */
typedef struct timespec starch_benchmark_time;
/*
 * Read the clock used for benchmark timing into *t.
 *
 * The per-thread CPU-time clock is used when the platform headers define it, so that
 * time spent descheduled is not counted; otherwise (or if that clock is rejected at
 * runtime) the monotonic clock is used instead.
 *
 * The result of clock_gettime() is checked: previously a failing call left *t
 * uninitialised and the benchmark silently reported garbage. A clock that is
 * unsupported fails on every call, so start/end timestamps still come from the same
 * clock. If no clock can be read at all the process exits with status 1, matching how
 * this file treats other unrecoverable errors.
 */
void starch_benchmark_get_time(starch_benchmark_time *t)
{
#ifdef CLOCK_THREAD_CPUTIME_ID
    if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, t) == 0)
        return;
    /* thread CPU-time clock unavailable at runtime; fall back to the monotonic clock */
#endif
    if (clock_gettime(CLOCK_MONOTONIC, t) != 0) {
        fprintf(stderr, "clock_gettime: %s\n", strerror(errno));
        exit(1);
    }
}
/*
 * Return the number of nanoseconds between two timestamps (end - start).
 *
 * All arithmetic is done in uint64_t, so a nanosecond field that is smaller in `end`
 * than in `start` is handled by ordinary modular wrap-around and still yields the
 * correct difference as long as `end` is not earlier than `start`.
 */
uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end)
{
    const uint64_t ns_per_second = 1000000000U;

    uint64_t total_ns = ((uint64_t)end->tv_sec - (uint64_t)start->tv_sec) * ns_per_second;
    total_ns += (uint64_t)end->tv_nsec;
    total_ns -= (uint64_t)start->tv_nsec;
    return total_ns;
}
/*
 * Allocate `size` bytes for a benchmark buffer whose address is aligned to exactly the
 * larger of `alignment` and `type_alignment`, and deliberately NOT to twice that value.
 *
 *   alignment       alignment the benchmarked code variant is allowed to assume
 *   type_alignment  natural alignment of the element type
 *   size            number of usable bytes required
 *
 * Returns the user pointer, or NULL (after printing a diagnostic to stderr) if the two
 * alignments are zero or incompatible, the request overflows size_t, or the underlying
 * allocation fails. Release the result with starch_benchmark_aligned_free() only.
 *
 * Layout: [ header ... | stash (void *) | user data ... ]; the original block pointer
 * is stored in the sizeof(void *) bytes immediately before the returned address.
 */
void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size)
{
    /* A zero alignment is meaningless and would divide by zero in the checks below. */
    if (alignment == 0 || type_alignment == 0) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: alignment must be nonzero (%zu and %zu)\n", size, alignment, type_alignment);
        return NULL;
    }

    size_t use_alignment = (type_alignment > alignment ? type_alignment : alignment);
    if ( (use_alignment % type_alignment) || (use_alignment % alignment) ) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: conflicting alignment requirements (%zu versus %zu)\n", size, alignment, type_alignment);
        return NULL;
    }

    /* Over-allocate so we can stash our own pointer before the start, and so that we can adjust
     * the returned alignment so it is only aligned to the requested boundary, and not also
     * aligned to a larger power of two (we don't want to accidentally benchmark the performance
     * of a more restrictive larger alignment)
     */
    size_t header_size = (use_alignment < sizeof(void*) ? sizeof(void*) : use_alignment);

    /* Refuse requests whose padded size (including rounding slack) cannot be represented. */
    if (use_alignment > (SIZE_MAX - header_size) / 2 || size > SIZE_MAX - header_size - 2 * use_alignment) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(ENOMEM));
        return NULL;
    }

    /* C11 aligned_alloc() requires the size to be an integral multiple of the alignment;
     * some C libraries fail the call otherwise, so round the block size up. */
    size_t block_size = header_size + size + use_alignment;
    size_t excess = block_size % use_alignment;
    if (excess)
        block_size += use_alignment - excess;

    char *block_ptr = aligned_alloc(use_alignment, block_size);
    if (!block_ptr) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(errno));
        return NULL;
    }

    char *user_ptr = block_ptr + header_size;
    if ( (uintptr_t)user_ptr % (use_alignment * 2) == 0 ) {
        // user_ptr is aligned to the next power of two, but we don't want that, move it on
        user_ptr += use_alignment;
    }

    /* When use_alignment is smaller than the alignment of void *, the stash slot just
     * before user_ptr may not be suitably aligned for a direct void * store, so copy the
     * bytes instead. The stored bytes are the same ones a direct store would have written. */
    void *stash_value = block_ptr;
    memcpy(user_ptr - sizeof(stash_value), &stash_value, sizeof(stash_value));
    return user_ptr;
}
/*
 * Release a buffer obtained from starch_benchmark_aligned_alloc().
 *
 * The allocator stores the pointer to the real heap block in the sizeof(void *) bytes
 * immediately preceding the user pointer; that value is what gets passed to free().
 * A NULL user_ptr is a no-op.
 *
 * The stash is read with memcpy rather than through a void ** because the user pointer
 * may be aligned to less than a pointer's alignment (e.g. 2 bytes for uint16_t buffers),
 * in which case a direct pointer-typed load would be a misaligned access.
 */
void starch_benchmark_aligned_free(void *user_ptr)
{
    if (!user_ptr)
        return;

    void *block_ptr;
    memcpy(&block_ptr, (const char *)user_ptr - sizeof(block_ptr), sizeof(block_ptr));
    free(block_ptr);
}
/*
 * Report whether `flavor` appears in the linked list starting at `list`.
 * Names are compared with strcmp; an empty (NULL) list never matches.
 */
static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_benchmark_flavor_list *list)
{
    const starch_benchmark_flavor_list *node = list;
    while (node != NULL) {
        if (strcmp(flavor, node->flavor) == 0)
            return true;
        node = node->next;
    }
    return false;
}
/*
 * Hooks for the subtract_n benchmark. Nothing in this file's own text defines them; the
 * STARCH_BENCHMARK / STARCH_BENCHMARK_VERIFY macros set up before the first #include of
 * ../benchmark/subtract_n_benchmark.c expand to exactly these names, so presumably that
 * included file supplies the definitions (confirm there).
 */
/* prototypes for benchmark helpers provided by user code */
void starch_subtract_n_benchmark (void);
bool starch_subtract_n_benchmark_verify ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 );
/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
/* (this repeats the declaration two lines up; the repetition is harmless) */
void starch_subtract_n_benchmark(void);
/*
 * Benchmark one registered subtract_n implementation and record the result.
 *
 * Progress goes to stderr: the entry name, followed by "unsupported", one of the
 * "skipped (...)" reasons, "supported" (list-only mode), or the measured ns/call.
 *
 * Steps, in order:
 *   1. skip the entry if its flavor is unsupported at runtime, not whitelisted, or
 *      blacklisted; stop after reporting "supported" in list-only mode;
 *   2. run the warmup calls, then check the output with the verify hook;
 *   3. double the loop count until one timed batch takes at least 100ms;
 *   4. scale the loop count to roughly one second per batch and time
 *      starch_benchmark_iterations batches;
 *   5. compute the mean time per call (dropping the fastest and slowest batch when
 *      there are more than two) and append it to starch_benchmark_results.
 *
 * arg0..arg3 are passed unchanged to the implementation and to the verify hook.
 *
 * Exits the process with status 1 if the result array cannot be grown.
 */
static void starch_benchmark_one_subtract_n( starch_subtract_n_regentry * _entry, const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 )
{
    fprintf(stderr, " %-40s ", _entry->name);

    /* test for support */
    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
        fprintf(stderr, "unsupported\n");
        return;
    }
    if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
        fprintf(stderr, "skipped (not whitelisted)\n");
        return;
    }
    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
        fprintf(stderr, "skipped (blacklisted)\n");
        return;
    }
    if (starch_benchmark_list_only) {
        fprintf(stderr, "supported\n");
        return;
    }

    /* initial warmup */
    for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop)
        _entry->callable ( arg0, arg1, arg2, arg3 );

    /* verify correctness of the output */
    if (! starch_subtract_n_benchmark_verify ( arg0, arg1, arg2, arg3 )) {
        fprintf(stderr, "skipped (verification failed)\n");
        return;
    }

    /* pre-benchmark, find a loop count that takes at least 100ms */
    starch_benchmark_time _start, _end;
    uint64_t _elapsed = 0;
    uint64_t _loops = 127;
    while (_elapsed < 100000000) {
        _loops *= 2;
        starch_benchmark_get_time(&_start);
        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
            _entry->callable ( arg0, arg1, arg2, arg3 );
        starch_benchmark_get_time(&_end);
        _elapsed = starch_benchmark_elapsed(&_start, &_end);
    }

    /* real benchmark, run for approx 1 second */
    _loops = _loops * 1000000000 / _elapsed;
    /* The scaling truncates to zero if the trial batch took longer than _loops seconds;
     * always make at least one call so the divisions below are well defined. */
    if (_loops == 0)
        _loops = 1;

    /* An iteration count of zero (e.g. "-i 0" on the command line) would time nothing
     * and then divide by zero; treat it as a single iteration. */
    const unsigned _iterations = (starch_benchmark_iterations ? starch_benchmark_iterations : 1);

    _elapsed = 0;
    uint64_t _elapsed_min = UINT64_MAX;
    uint64_t _elapsed_max = 0;
    for (unsigned _iter = 0; _iter < _iterations; ++_iter) {
        starch_benchmark_get_time(&_start);
        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
            _entry->callable ( arg0, arg1, arg2, arg3 );
        starch_benchmark_get_time(&_end);
        uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end);
        if (_elapsed_one < _elapsed_min)
            _elapsed_min = _elapsed_one;
        if (_elapsed_one > _elapsed_max)
            _elapsed_max = _elapsed_one;
        _elapsed += _elapsed_one;
    }

    /* mean time per call; with more than two batches the fastest and slowest are discarded */
    uint64_t _per_loop;
    if (_iterations > 2)
        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (_iterations - 2);
    else
        _per_loop = _elapsed / _loops / _iterations;
    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);

    /* grow the result array when full: 64 entries initially, doubling afterwards */
    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
        if (!starch_benchmark_result_size)
            starch_benchmark_result_size = 64;
        else
            starch_benchmark_result_size *= 2;
        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
        if (!starch_benchmark_results) {
            fprintf(stderr, "realloc: %s\n", strerror(errno));
            exit(1);
        }
    }

    starch_benchmark_results[starch_benchmark_result_count].name = "subtract_n";
    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
    ++starch_benchmark_result_count;
}
/*
 * Benchmark every entry in the subtract_n registry with the given arguments.
 * The registry is walked until an entry with a NULL name is reached.
 */
static void starch_benchmark_run_subtract_n( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 )
{
    starch_subtract_n_regentry *cursor = starch_subtract_n_registry;
    while (cursor->name) {
        starch_benchmark_one_subtract_n( cursor, arg0, arg1, arg2, arg3 );
        ++cursor;
    }
}
/*
 * Hooks for the subtract_n_aligned benchmark. Nothing in this file's own text defines
 * them; the STARCH_BENCHMARK / STARCH_BENCHMARK_VERIFY macros set up before the second
 * #include of ../benchmark/subtract_n_benchmark.c expand to exactly these names, so
 * presumably that included file supplies the definitions (confirm there).
 */
/* prototypes for benchmark helpers provided by user code */
void starch_subtract_n_aligned_benchmark (void);
bool starch_subtract_n_aligned_benchmark_verify ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 );
/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
/* (this repeats the declaration two lines up; the repetition is harmless) */
void starch_subtract_n_aligned_benchmark(void);
/*
 * Benchmark one registered subtract_n_aligned implementation and record the result.
 *
 * Progress goes to stderr: the entry name, followed by "unsupported", one of the
 * "skipped (...)" reasons, "supported" (list-only mode), or the measured ns/call.
 *
 * Steps, in order:
 *   1. skip the entry if its flavor is unsupported at runtime, not whitelisted, or
 *      blacklisted; stop after reporting "supported" in list-only mode;
 *   2. run the warmup calls, then check the output with the verify hook;
 *   3. double the loop count until one timed batch takes at least 100ms;
 *   4. scale the loop count to roughly one second per batch and time
 *      starch_benchmark_iterations batches;
 *   5. compute the mean time per call (dropping the fastest and slowest batch when
 *      there are more than two) and append it to starch_benchmark_results.
 *
 * arg0..arg3 are passed unchanged to the implementation and to the verify hook.
 *
 * Exits the process with status 1 if the result array cannot be grown.
 */
static void starch_benchmark_one_subtract_n_aligned( starch_subtract_n_aligned_regentry * _entry, const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 )
{
    fprintf(stderr, " %-40s ", _entry->name);

    /* test for support */
    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
        fprintf(stderr, "unsupported\n");
        return;
    }
    if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
        fprintf(stderr, "skipped (not whitelisted)\n");
        return;
    }
    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
        fprintf(stderr, "skipped (blacklisted)\n");
        return;
    }
    if (starch_benchmark_list_only) {
        fprintf(stderr, "supported\n");
        return;
    }

    /* initial warmup */
    for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop)
        _entry->callable ( arg0, arg1, arg2, arg3 );

    /* verify correctness of the output */
    if (! starch_subtract_n_aligned_benchmark_verify ( arg0, arg1, arg2, arg3 )) {
        fprintf(stderr, "skipped (verification failed)\n");
        return;
    }

    /* pre-benchmark, find a loop count that takes at least 100ms */
    starch_benchmark_time _start, _end;
    uint64_t _elapsed = 0;
    uint64_t _loops = 127;
    while (_elapsed < 100000000) {
        _loops *= 2;
        starch_benchmark_get_time(&_start);
        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
            _entry->callable ( arg0, arg1, arg2, arg3 );
        starch_benchmark_get_time(&_end);
        _elapsed = starch_benchmark_elapsed(&_start, &_end);
    }

    /* real benchmark, run for approx 1 second */
    _loops = _loops * 1000000000 / _elapsed;
    /* The scaling truncates to zero if the trial batch took longer than _loops seconds;
     * always make at least one call so the divisions below are well defined. */
    if (_loops == 0)
        _loops = 1;

    /* An iteration count of zero (e.g. "-i 0" on the command line) would time nothing
     * and then divide by zero; treat it as a single iteration. */
    const unsigned _iterations = (starch_benchmark_iterations ? starch_benchmark_iterations : 1);

    _elapsed = 0;
    uint64_t _elapsed_min = UINT64_MAX;
    uint64_t _elapsed_max = 0;
    for (unsigned _iter = 0; _iter < _iterations; ++_iter) {
        starch_benchmark_get_time(&_start);
        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
            _entry->callable ( arg0, arg1, arg2, arg3 );
        starch_benchmark_get_time(&_end);
        uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end);
        if (_elapsed_one < _elapsed_min)
            _elapsed_min = _elapsed_one;
        if (_elapsed_one > _elapsed_max)
            _elapsed_max = _elapsed_one;
        _elapsed += _elapsed_one;
    }

    /* mean time per call; with more than two batches the fastest and slowest are discarded */
    uint64_t _per_loop;
    if (_iterations > 2)
        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (_iterations - 2);
    else
        _per_loop = _elapsed / _loops / _iterations;
    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);

    /* grow the result array when full: 64 entries initially, doubling afterwards */
    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
        if (!starch_benchmark_result_size)
            starch_benchmark_result_size = 64;
        else
            starch_benchmark_result_size *= 2;
        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
        if (!starch_benchmark_results) {
            fprintf(stderr, "realloc: %s\n", strerror(errno));
            exit(1);
        }
    }

    starch_benchmark_results[starch_benchmark_result_count].name = "subtract_n_aligned";
    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
    ++starch_benchmark_result_count;
}
/*
 * Benchmark every entry in the subtract_n_aligned registry with the given arguments.
 * The registry is walked until an entry with a NULL name is reached.
 */
static void starch_benchmark_run_subtract_n_aligned( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 )
{
    starch_subtract_n_aligned_regentry *cursor = starch_subtract_n_aligned_registry;
    while (cursor->name) {
        starch_benchmark_one_subtract_n_aligned( cursor, arg0, arg1, arg2, arg3 );
        ++cursor;
    }
}
/*
 * The user's benchmark source (../benchmark/subtract_n_benchmark.c) is included twice,
 * each time with a different set of STARCH_* macro definitions.
 *
 * Pass 1 - unaligned variant:
 *   - STARCH_ALIGNMENT is 1 and STARCH_ALIGNED() is the identity.
 *   - STARCH_BENCHMARK / STARCH_BENCHMARK_VERIFY expand to starch_<fn>_benchmark and
 *     starch_<fn>_benchmark_verify; STARCH_BENCHMARK_RUN calls starch_benchmark_run_<fn>.
 *   - STARCH_SYMBOL / STARCH_IMPL expand to names carrying a _benchmark_sym /
 *     _dummy_benchmark suffix - presumably so that anything else the included file
 *     defines through them cannot clash with the real implementations (confirm in starch).
 *   - STARCH_BENCHMARK_ALLOC requests alignment 1, so only alignof(_type) constrains
 *     the buffer returned by starch_benchmark_aligned_alloc.
 */
#undef STARCH_ALIGNMENT
#define STARCH_ALIGNMENT 1
#define STARCH_ALIGNED(_ptr) (_ptr)
#define STARCH_SYMBOL(_name) starch_ ## _name ## _benchmark_sym
#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _dummy_benchmark
#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl)
#define STARCH_BENCHMARK(_function) starch_ ## _function ## _benchmark
#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _benchmark_verify
#define STARCH_BENCHMARK_RUN(_function, ...) starch_benchmark_run_ ## _function ( __VA_ARGS__ )
#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type))
#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr)
#include "../benchmark/subtract_n_benchmark.c"
/* Drop every pass-1 definition so the macros can be redefined for the aligned pass. */
#undef STARCH_ALIGNMENT
#undef STARCH_ALIGNED
#undef STARCH_SYMBOL
#undef STARCH_IMPL
#undef STARCH_IMPL_REQUIRES
#undef STARCH_BENCHMARK
#undef STARCH_BENCHMARK_VERIFY
#undef STARCH_BENCHMARK_RUN
#undef STARCH_BENCHMARK_ALLOC
#undef STARCH_BENCHMARK_FREE
/*
 * Pass 2 - aligned variant:
 *   - the same macros, but every generated name gains an "_aligned" component;
 *   - STARCH_ALIGNMENT is STARCH_MIX_ALIGNMENT (not defined in this file - presumably it
 *     comes from starch.h) and STARCH_ALIGNED() wraps the pointer in
 *     __builtin_assume_aligned with that alignment;
 *   - STARCH_BENCHMARK_ALLOC requests STARCH_MIX_ALIGNMENT-aligned buffers.
 * These definitions are not #undef'd again after the include.
 */
#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT
#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT))
#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_benchmark_sym
#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _dummy_benchmark
#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl)
#define STARCH_BENCHMARK(_function) starch_ ## _function ## _aligned_benchmark
#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _aligned_benchmark_verify
#define STARCH_BENCHMARK_RUN(_function, ...) starch_benchmark_run_ ## _function ## _aligned ( __VA_ARGS__ )
#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type))
#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr)
#include "../benchmark/subtract_n_benchmark.c"
/* Print the section banner to stderr, then hand over to the subtract_n benchmark hook. */
static void starch_benchmark_all_subtract_n(void)
{
    fputs("==== subtract_n ===\n", stderr);
    starch_subtract_n_benchmark();
}
/* Print the section banner to stderr, then hand over to the subtract_n_aligned benchmark hook. */
static void starch_benchmark_all_subtract_n_aligned(void)
{
    fputs("==== subtract_n_aligned ===\n", stderr);
    starch_subtract_n_aligned_benchmark();
}
/*
 * qsort comparator for starch_benchmark_result entries.
 * Orders by function name first (strcmp order), then by ascending ns within a name,
 * so the fastest implementation of each function comes first in its group.
 */
static int starch_benchmark_compare_result(const void *a, const void *b)
{
    const starch_benchmark_result *lhs = a;
    const starch_benchmark_result *rhs = b;

    const int by_name = strcmp(lhs->name, rhs->name);
    if (by_name != 0)
        return by_name;

    /* yields -1, 0 or 1 without risking overflow from subtracting 64-bit values */
    return (lhs->ns > rhs->ns) - (lhs->ns < rhs->ns);
}
/*
 * Print the command-line help to stderr.
 * argv0 is substituted into the "Usage:" line; the list of supported flavors contains
 * only those whose STARCH_FLAVOR_* macro is defined at compile time.
 */
static void starch_benchmark_usage(const char *argv0)
{
    fprintf(stderr, "Usage: %s [OPTION]... [FUNCTION]...\n", argv0);

    /* fixed option summary; contains no conversion specifiers, so it is written verbatim */
    fputs("Benchmarks starch functions and optionally writes a sorted wisdom file.\n"
          "\n"
          " -r FILE Read initial wisdom from FILE\n"
          " -o FILE Write sorted wisdom to FILE\n"
          " -F FLAVOR Add FLAVOR to whitelist\n"
          " (default: no whitelist, run all runtime-supported flavors)\n"
          " -N FLAVOR Add FLAVOR to blacklist\n"
          " (default: no blacklist, run all runtime-supported flavors)\n"
          " -l List compiled-in implementations but don't benchmark them\n"
          " -t Include only the top candidate per function in wisdom output\n"
          " -i ITERS Run benchmark ITERS times and use the mean. If ITERS > 2, ignore\n"
          " the smallest and largest runs when calculating the mean.\n"
          " (default: 1 iteration)\n"
          " FUNCTION Run benchmarks for these functions only\n"
          " (default: benchmark all functions)\n"
          "\n"
          "Supported flavors: ", stderr);

#ifdef STARCH_FLAVOR_GENERIC
    fputs("generic ", stderr);
#endif
#ifdef STARCH_FLAVOR_ARMV7A_VFPV3
    fputs("armv7a_vfpv3 ", stderr);
#endif
#ifdef STARCH_FLAVOR_ARMV7A_VFPV4
    fputs("armv7a_vfpv4 ", stderr);
#endif
#ifdef STARCH_FLAVOR_X86_64_AVX
    fputs("x86_64_avx ", stderr);
#endif
#ifdef STARCH_FLAVOR_X86_64_AVX2
    fputs("x86_64_avx2 ", stderr);
#endif

    fputs("\n"
          "Supported functions: "
          "subtract_n "
          "subtract_n_aligned "
          "\n", stderr);
}
/*
 * Add `flavor` to the list whose head pointer is `*list`.
 * Despite the name, the new node is linked in at the head of the list. The flavor
 * string is stored by pointer, not copied. Exits with status 1 if allocation fails.
 */
static void starch_benchmark_append_flavor(const char *flavor, starch_benchmark_flavor_list **list)
{
    starch_benchmark_flavor_list *head = malloc(sizeof *head);
    if (head == NULL) {
        fprintf(stderr, "malloc: %s\n", strerror(errno));
        exit(1);
    }

    *head = (starch_benchmark_flavor_list){ .flavor = flavor, .next = *list };
    *list = head;
}
/*
 * Entry point of the benchmark driver.
 *
 * Parses the options described by starch_benchmark_usage(), runs either the functions
 * named on the command line or all of them, and, when -o is given, writes the results
 * as a wisdom file sorted by function name and then by time per call.
 *
 * Exit status: 0 on success (and for -h), 1 for I/O failures (wisdom file unreadable,
 * output file cannot be opened or written), 2 for command-line errors.
 */
int main(int argc, char **argv)
{
    int specific = 0;
    const char *output_path = NULL;

    int opt;
    while ((opt = getopt(argc, argv, "r:o:F:N:i:lht")) != -1) {
        switch (opt) {
        case 'r':
            if (starch_read_wisdom(optarg) < 0) {
                fprintf(stderr, "%s: cannot read %s: %s\n", argv[0], optarg, strerror(errno));
                return 1;
            }
            fprintf(stderr, "%s: loaded wisdom file %s\n", argv[0], optarg);
            break;

        case 'o':
            output_path = optarg;
            break;

        case 'F':
            if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_blacklist)) {
                fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg);
                return 2;
            }
            starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_whitelist);
            break;

        case 'N':
            if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_whitelist)) {
                fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg);
                return 2;
            }
            starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_blacklist);
            break;

        case 'l':
            starch_benchmark_list_only = true;
            break;

        case 't':
            starch_benchmark_top_only = true;
            break;

        case 'i': {
            /* atoi() cannot report errors: non-numeric input became 0 (which makes the
             * benchmark divide by zero) and negative input became a huge unsigned
             * count. Parse strictly and insist on a positive value that fits. */
            char *end = NULL;
            errno = 0;
            long iters = strtol(optarg, &end, 10);
            if (errno != 0 || end == optarg || *end != '\0' || iters < 1 || (unsigned long)iters > (unsigned)-1) {
                fprintf(stderr, "%s: invalid iteration count: %s\n", argv[0], optarg);
                return 2;
            }
            starch_benchmark_iterations = (unsigned)iters;
            break;
        }

        case 'h':
            starch_benchmark_usage(argv[0]);
            return 0;

        case '?':
        default:
            starch_benchmark_usage(argv[0]);
            return 2;
        }
    }

    if (starch_benchmark_list_only && output_path) {
        fprintf(stderr, "%s: -o and -l options cannot be specified together\n", argv[0]);
        return 2;
    }

    /* run the functions named on the command line, in the order given */
    for (int i = optind; i < argc; ++i) {
        if (!strcmp(argv[i], "subtract_n")) {
            specific = 1;
            starch_benchmark_all_subtract_n();
            continue;
        }
        if (!strcmp(argv[i], "subtract_n_aligned")) {
            specific = 1;
            starch_benchmark_all_subtract_n_aligned();
            continue;
        }
        fprintf(stderr, "%s: unrecognized function name: %s\n", argv[0], argv[i]);
        return 2;
    }

    /* no function named: benchmark everything */
    if (!specific) {
        starch_benchmark_all_subtract_n();
        starch_benchmark_all_subtract_n_aligned();
    }

    if (output_path) {
        FILE *out = fopen(output_path, "w");
        if (!out) {
            fprintf(stderr, "%s: cannot open %s: %s\n", argv[0], output_path, strerror(errno));
            return 1;
        }

        /* header comment recording the command line that produced the file */
        fprintf(out, "# generated by ");
        for (int i = 0; i < argc; ++i)
            fprintf(out, "%s ", argv[i]);
        fprintf(out, "\n\n");

        /* The result array is still NULL when nothing was benchmarked; qsort requires a
         * valid base pointer even for zero elements, so skip the call in that case. */
        if (starch_benchmark_result_count > 0)
            qsort(starch_benchmark_results, starch_benchmark_result_count, sizeof(*starch_benchmark_results), starch_benchmark_compare_result);

        const char *last_name = NULL;
        bool first = true;
        for (unsigned i = 0; i < starch_benchmark_result_count; ++i) {
            starch_benchmark_result *result = &starch_benchmark_results[i];
            /* blank line between groups of results for different functions */
            if (last_name && strcmp(last_name, result->name) != 0) {
                fprintf(out, "\n");
                first = true;
            }
            last_name = result->name;
            /* with -t, only the first (fastest) entry of each group is written */
            if (starch_benchmark_top_only && !first)
                continue;
            fprintf(out, "%-40s %-40s # %" PRIu64 " ns/call\n", result->name, result->impl, result->ns);
            first = false;
        }

        /* Buffered write errors may only surface at fclose(); report them instead of
         * claiming success for a truncated wisdom file. */
        int write_failed = ferror(out);
        if (fclose(out) != 0)
            write_failed = 1;
        if (write_failed) {
            fprintf(stderr, "%s: error writing %s\n", argv[0], output_path);
            return 1;
        }

        fprintf(stderr, "%s: wrote sorted wisdom to %s\n", argv[0], output_path);
    }

    return 0;
}