Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules')
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
### Copyright (c) 2020, FlightAware LLC.
|
|
|
|
|
### All rights reserved.
|
|
|
|
|
### See the LICENSE file for licensing terms.
|
|
|
|
|
|
|
|
|
|
/* starch generated code. Do not edit. */
|
|
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdint.h>
|
|
|
|
|
#include <stdalign.h>
|
|
|
|
|
#include <stdbool.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <inttypes.h>
|
|
|
|
|
#include <time.h>
|
|
|
|
|
#include <unistd.h>
|
|
|
|
|
#include <errno.h>
|
|
|
|
|
|
|
|
|
|
#include "${os.path.relpath(gen.generated_include_path, current_dir)}"
|
|
|
|
|
|
|
|
|
|
/* Timestamp type used by the benchmark timing helpers. */
typedef struct timespec starch_benchmark_time;

/*
 * Forward declarations for the helper routines defined further down in
 * this file.
 */

/* Timing: sample the benchmark clock, and compute the nanoseconds between two samples. */
void starch_benchmark_get_time(starch_benchmark_time *t);
uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end);

/* Buffers: allocate and release memory with a controlled alignment. */
void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size);
void starch_benchmark_aligned_free(void *user_ptr);
|
|
|
|
|
|
|
|
|
|
/* Number of untimed calls made to each implementation before it is verified and timed. */
const unsigned starch_benchmark_warmup_loops = 10;
|
|
|
|
|
|
|
|
|
|
/* One recorded measurement: which function, which implementation, and how long a call took. */
typedef struct {
    const char *name;   /* name of the benchmarked function */
    const char *impl;   /* name of the implementation (registry entry) that was timed */
    uint64_t ns;        /* measured time per call, in nanoseconds */
} starch_benchmark_result;
|
|
|
|
|
|
|
|
|
|
/*
 * Growable array holding every measurement taken so far.
 * The array is enlarged with realloc() whenever the count reaches the size.
 */
static starch_benchmark_result *starch_benchmark_results = NULL;
static unsigned starch_benchmark_result_size = 0;   /* allocated capacity, in entries */
static unsigned starch_benchmark_result_count = 0;  /* entries currently in use */
|
|
|
|
|
|
|
|
|
|
/* Singly linked list of flavor names; a NULL pointer is the empty list. */
typedef struct benchmark_flavor_list_node {
    const char *flavor;                        /* flavor name, compared with strcmp() */
    struct benchmark_flavor_list_node *next;   /* following node, or NULL at the end */
} starch_benchmark_flavor_list;
|
|
|
|
|
|
|
|
|
|
/*
 * Flavor filters.  When the whitelist is non-empty only flavors on it are
 * benchmarked; flavors on the blacklist are always skipped.
 */
static starch_benchmark_flavor_list *starch_benchmark_flavor_whitelist = NULL;
static starch_benchmark_flavor_list *starch_benchmark_flavor_blacklist = NULL;

/* Run-mode switches consulted while benchmarking each implementation. */
static bool starch_benchmark_list_only = false;         /* only report whether each implementation is supported */
static bool starch_benchmark_validate_only = false;     /* stop after the verification step, do not time */
static bool starch_benchmark_validation_failed = false; /* set once any implementation fails verification */
static bool starch_benchmark_top_only = false;          /* NOTE(review): not referenced in this part of the file - confirm meaning against its users */
static unsigned starch_benchmark_iterations = 1;        /* number of timed passes per implementation */
|
|
|
|
|
|
|
|
|
|
/* Repeats the identical typedef declared near the top of the file (a redeclaration C11 permits). */
typedef struct timespec starch_benchmark_time;
|
|
|
|
|
/*
 * Sample the benchmark clock into *t.
 *
 * CLOCK_THREAD_CPUTIME_ID is used when the platform headers define it;
 * otherwise the sample comes from CLOCK_MONOTONIC.  The result of
 * clock_gettime() is not checked.
 */
void starch_benchmark_get_time(starch_benchmark_time *t)
{
#ifdef CLOCK_THREAD_CPUTIME_ID
    const clockid_t clock_id = CLOCK_THREAD_CPUTIME_ID;
#else
    const clockid_t clock_id = CLOCK_MONOTONIC;
#endif

    clock_gettime(clock_id, t);
}
|
|
|
|
|
|
|
|
|
|
/*
 * Return the number of nanoseconds from *start to *end.
 *
 * All arithmetic is done in uint64_t, so when end->tv_nsec is smaller than
 * start->tv_nsec the wrapped nanosecond difference cancels against the
 * whole-second term and the sum is still correct.
 */
uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end)
{
    const uint64_t whole_seconds = (uint64_t)end->tv_sec - (uint64_t)start->tv_sec;
    const uint64_t nsec_delta = (uint64_t)end->tv_nsec - (uint64_t)start->tv_nsec;

    return whole_seconds * 1000000000U + nsec_delta;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Allocate a benchmark buffer of `size` bytes.
 *
 * The returned pointer is aligned to the larger of `alignment` and
 * `type_alignment`, and deliberately NOT aligned to twice that value, so a
 * benchmark never accidentally measures a stricter alignment than it asked for.
 *
 * alignment       alignment requested by the benchmark (1 for "unaligned")
 * type_alignment  natural alignment of the element type
 * size            number of usable bytes required
 *
 * Returns the buffer, or NULL after printing a diagnostic to stderr when the
 * alignments are zero or conflict, the size is too large, or the allocation
 * fails.  Release the buffer with starch_benchmark_aligned_free().
 *
 * The address of the underlying allocation is stored in the sizeof(void *)
 * bytes immediately before the returned pointer.
 */
void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size)
{
    /* A zero alignment is meaningless and would divide by zero in the checks below. */
    if (alignment == 0 || type_alignment == 0) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: invalid zero alignment (%zu, %zu)\n", size, alignment, type_alignment);
        return NULL;
    }

    size_t use_alignment = (type_alignment > alignment ? type_alignment : alignment);
    if ( (use_alignment % type_alignment) || (use_alignment % alignment) ) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: conflicting alignment requirements (%zu versus %zu)\n", size, alignment, type_alignment);
        return NULL;
    }

    /* Over-allocate so we can stash our own pointer before the start, and so that we can adjust
     * the returned alignment so it is only aligned to the requested boundary, and not also
     * aligned to a larger power of two (we don't want to accidentally benchmark the performance
     * of a more restrictive larger alignment)
     */
    size_t header_size = (use_alignment < sizeof(void*) ? sizeof(void*) : use_alignment);

    /* header + payload + one alignment step of slack + rounding must not wrap around */
    if (use_alignment > SIZE_MAX / 4 || size > SIZE_MAX - 3 * header_size) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: size too large\n", size);
        return NULL;
    }

    /* C11 requires the size passed to aligned_alloc to be a multiple of the alignment */
    size_t block_size = header_size + size + use_alignment;
    size_t remainder = block_size % use_alignment;
    if (remainder)
        block_size += use_alignment - remainder;

    char *block_ptr = aligned_alloc(use_alignment, block_size);
    if (!block_ptr) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(errno));
        return NULL;
    }

    char *user_ptr = block_ptr + header_size;
    if ( (uintptr_t)user_ptr % (use_alignment * 2) == 0 ) {
        // user_ptr is aligned to the next power of two, but we don't want that, move it on
        user_ptr += use_alignment;
    }

    /* user_ptr may be misaligned for a void* (e.g. alignment 1), so copy the
     * back-pointer bytewise instead of storing through a void** */
    void *stash = block_ptr;
    memcpy(user_ptr - sizeof(stash), &stash, sizeof(stash));

    return user_ptr;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Release a buffer obtained from starch_benchmark_aligned_alloc().
 *
 * The address of the underlying allocation is read from the sizeof(void *)
 * bytes immediately before user_ptr and passed to free().  A NULL user_ptr
 * is ignored.
 */
void starch_benchmark_aligned_free(void *user_ptr)
{
    if (!user_ptr)
        return;

    /* user_ptr is not necessarily aligned for a void*, so fetch the stashed
     * block address bytewise rather than dereferencing a void** */
    void *block_ptr;
    memcpy(&block_ptr, (char *)user_ptr - sizeof(block_ptr), sizeof(block_ptr));
    free(block_ptr);
}
|
|
|
|
|
|
|
|
|
|
static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_benchmark_flavor_list *list)
|
|
|
|
|
{
|
|
|
|
|
for (; list; list = list->next) {
|
|
|
|
|
if (!strcmp(flavor, list->flavor))
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
<% functions_to_benchmark = [f for f in gen.functions.values() if f.benchmark] %>
|
2021-02-09 06:05:05 +00:00
|
|
|
% for function in sorted(functions_to_benchmark):
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules')
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
/* prototypes for benchmark helpers provided by user code */
|
|
|
|
|
void ${function.benchmark_symbol} (void);
|
|
|
|
|
% if function.benchmark_verify:
|
|
|
|
|
bool ${function.benchmark_verify_symbol} ( ${function.declaration_arglist } );
|
|
|
|
|
% endif
|
|
|
|
|
|
|
|
|
|
/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
|
|
|
|
|
void ${gen.symbol_prefix}${function.name}_benchmark(void);
|
|
|
|
|
|
|
|
|
|
/*
 * Benchmark one registry entry of ${function.name}, reporting on stderr.
 *
 * The entry is skipped, with the reason printed, when its flavor_supported()
 * check fails, when a whitelist exists and the flavor is not on it, or when
 * the flavor is blacklisted.  Otherwise the implementation is warmed up,
 * checked by the user-supplied verifier if there is one, timed, and the
 * per-call time is printed and appended to starch_benchmark_results.
 *
 * Locals keep a leading underscore so they cannot clash with the argument
 * names substituted into the parameter list.
 */
static void starch_benchmark_one_${function.name}( ${function.regentry_type} * _entry, ${function.declaration_arglist } )
{
    fprintf(stderr, " %-40s ", _entry->name);

    /* entries whose support check fails cannot be run here */
    if (_entry->flavor_supported != NULL && !_entry->flavor_supported()) {
        fprintf(stderr, "unsupported\n");
        return;
    }

    /* apply the flavor filters */
    if (starch_benchmark_flavor_whitelist != NULL
        && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
        fprintf(stderr, "skipped (not whitelisted)\n");
        return;
    }

    if (starch_benchmark_flavor_blacklist != NULL
        && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
        fprintf(stderr, "skipped (blacklisted)\n");
        return;
    }

    if (starch_benchmark_list_only) {
        fprintf(stderr, "supported\n");
        return;
    }

    /* untimed warmup calls */
    for (unsigned _warmup = 0; _warmup < starch_benchmark_warmup_loops; ++_warmup) {
        _entry->callable ( ${function.named_arglist} );
    }

% if function.benchmark_verify:
    /* check the output produced by the warmup calls */
    if (! ${function.benchmark_verify_symbol} ( ${function.named_arglist} )) {
        fprintf(stderr, "skipped (verification failed)\n");
        starch_benchmark_validation_failed = true;
        return;
    }

    if (starch_benchmark_validate_only) {
        fprintf(stderr, "validation ok\n");
        return;
    }
% else:
    if (starch_benchmark_validate_only) {
        fprintf(stderr, "no validator defined\n");
        return;
    }
% endif

    /* calibration: keep doubling the loop count until one pass takes at least 100ms */
    starch_benchmark_time _start, _end;
    uint64_t _elapsed;
    uint64_t _loops = 127;
    do {
        _loops *= 2;

        starch_benchmark_get_time(&_start);
        for (uint64_t _loop = 0; _loop < _loops; ++_loop) {
            _entry->callable ( ${function.named_arglist} );
        }
        starch_benchmark_get_time(&_end);

        _elapsed = starch_benchmark_elapsed(&_start, &_end);
    } while (_elapsed < 100000000);

    /* scale the loop count so that each timed pass runs for approx 1 second */
    _loops = _loops * 1000000000 / _elapsed;

    /* timed passes: accumulate the total and remember the fastest and slowest pass */
    uint64_t _total = 0;
    uint64_t _fastest = UINT64_MAX;
    uint64_t _slowest = 0;
    for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) {
        starch_benchmark_get_time(&_start);
        for (uint64_t _loop = 0; _loop < _loops; ++_loop) {
            _entry->callable ( ${function.named_arglist} );
        }
        starch_benchmark_get_time(&_end);

        const uint64_t _pass = starch_benchmark_elapsed(&_start, &_end);
        if (_pass < _fastest) {
            _fastest = _pass;
        }
        if (_pass > _slowest) {
            _slowest = _pass;
        }
        _total += _pass;
    }

    /* with more than two passes, the fastest and slowest pass are left out of the average */
    const uint64_t _per_loop = (starch_benchmark_iterations > 2)
        ? (_total - _fastest - _slowest) / _loops / (starch_benchmark_iterations - 2)
        : _total / _loops / starch_benchmark_iterations;

    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);

    /* grow the results array when it is full: 64 entries at first, then doubling */
    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
        starch_benchmark_result_size = (starch_benchmark_result_size == 0)
            ? 64
            : starch_benchmark_result_size * 2;

        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
        if (starch_benchmark_results == NULL) {
            fprintf(stderr, "realloc: %s\n", strerror(errno));
            exit(1);
        }
    }

    starch_benchmark_result *_slot = &starch_benchmark_results[starch_benchmark_result_count];
    _slot->name = "${function.name}";
    _slot->impl = _entry->name;
    _slot->ns = _per_loop;
    ++starch_benchmark_result_count;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Benchmark every registered implementation of ${function.name} with the
 * given arguments.  The registry is walked in order until the terminating
 * entry, which is recognised by its NULL name.
 */
static void starch_benchmark_run_${function.name}( ${function.declaration_arglist } )
{
    ${function.regentry_type} *_entry = ${function.registry_symbol};

    while (_entry->name != NULL) {
        starch_benchmark_one_${function.name}( _entry, ${function.named_arglist} );
        ++_entry;
    }
}
|
|
|
|
|
|
|
|
|
|
% endfor
|
|
|
|
|
|
|
|
|
|
/*
 * Macro set for the unaligned variants: no alignment is assumed
 * (STARCH_ALIGNMENT is 1 and STARCH_ALIGNED leaves the pointer untouched)
 * and generated symbol names carry no extra "_aligned" component.
 */
#undef STARCH_ALIGNMENT
#define STARCH_ALIGNMENT 1
#define STARCH_ALIGNED(_ptr) (_ptr)

/* symbol naming */
#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _benchmark_sym
#define STARCH_IMPL(_function, _impl) ${gen.symbol_prefix} ## _function ## _ ## _impl ## _dummy_benchmark
#define STARCH_IMPL_REQUIRES(_function, _impl, _feature) STARCH_IMPL(_function, _impl)

/* benchmark entry point, verifier, and runner */
#define STARCH_BENCHMARK(_function) ${gen.symbol_prefix} ## _function ## _benchmark
#define STARCH_BENCHMARK_VERIFY(_function) ${gen.symbol_prefix} ## _function ## _benchmark_verify
#define STARCH_BENCHMARK_RUN(_function, ...) starch_benchmark_run_ ## _function ( __VA_ARGS__ )
|
2021-02-02 03:34:37 +00:00
|
|
|
/* Allocate room for _count objects of _type, requesting no alignment beyond that of _type itself (alignment argument 1). */
#define STARCH_BENCHMARK_ALLOC(_count, _type) ((_type *) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type)))
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules')
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
/* Release a buffer obtained from STARCH_BENCHMARK_ALLOC. */
#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr)
|
|
|
|
|
|
2021-02-09 06:05:05 +00:00
|
|
|
% for source in sorted(gen.benchmark_files):
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules')
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
#include "${os.path.relpath(source.path, current_dir)}"
|
|
|
|
|
% endfor
|
|
|
|
|
|
|
|
|
|
/* Drop the previous macro set before redefining everything for the aligned variants. */
#undef STARCH_ALIGNMENT
#undef STARCH_ALIGNED
#undef STARCH_SYMBOL
#undef STARCH_IMPL
#undef STARCH_IMPL_REQUIRES
#undef STARCH_BENCHMARK
#undef STARCH_BENCHMARK_VERIFY
#undef STARCH_BENCHMARK_RUN
#undef STARCH_BENCHMARK_ALLOC
#undef STARCH_BENCHMARK_FREE

/*
 * Macro set for the aligned variants: pointers are promised to the compiler
 * as STARCH_MIX_ALIGNMENT-aligned and generated symbol names gain an
 * "_aligned" component.
 */
#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT
#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT))

/* symbol naming */
#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _aligned_benchmark_sym
#define STARCH_IMPL(_function, _impl) ${gen.symbol_prefix} ## _function ## _aligned_ ## _impl ## _dummy_benchmark
#define STARCH_IMPL_REQUIRES(_function, _impl, _feature) STARCH_IMPL(_function, _impl)

/* benchmark entry point, verifier, and runner */
#define STARCH_BENCHMARK(_function) ${gen.symbol_prefix} ## _function ## _aligned_benchmark
#define STARCH_BENCHMARK_VERIFY(_function) ${gen.symbol_prefix} ## _function ## _aligned_benchmark_verify
#define STARCH_BENCHMARK_RUN(_function, ...) starch_benchmark_run_ ## _function ## _aligned ( __VA_ARGS__ )
|
2021-02-02 03:34:37 +00:00
|
|
|
/* Allocate room for _count objects of _type, requesting STARCH_MIX_ALIGNMENT alignment for the buffer. */
#define STARCH_BENCHMARK_ALLOC(_count, _type) ((_type *) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type)))
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules')
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
/* Release a buffer obtained from STARCH_BENCHMARK_ALLOC. */
#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr)
|
|
|
|
|
|
2021-02-09 06:05:05 +00:00
|
|
|
% for source in sorted(gen.benchmark_files):
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules')
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
% if any( (function.aligned and function.benchmark == source) for function in gen.functions.values() ):
|
|
|
|
|
#include "${os.path.relpath(source.path, current_dir)}"
|
|
|
|
|
% endif
|
|
|
|
|
% endfor
|
|
|
|
|
|
2021-02-09 06:05:05 +00:00
|
|
|
% for function in sorted(functions_to_benchmark):
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules')
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
/*
 * Per-function benchmark wrapper (one copy is generated for each function the
 * enclosing template loop visits).  Prints a banner naming the function on
 * stderr, then hands off to the generated *_benchmark entry point for it.
 *
 * The banner uses the same "=== name ===" shape as the "no benchmark support
 * defined" path in main(), so all section headers in the output look alike.
 */
static void starch_benchmark_all_${function.name}(void)
{
    fprintf(stderr, "=== ${function.name} ===\n");
    ${gen.symbol_prefix}${function.name}_benchmark ();
}
|
|
|
|
|
% endfor
|
|
|
|
|
|
|
|
|
|
static int starch_benchmark_compare_result(const void *a, const void *b)
|
|
|
|
|
{
|
|
|
|
|
const starch_benchmark_result *left = (const starch_benchmark_result *) a;
|
|
|
|
|
const starch_benchmark_result *right = (const starch_benchmark_result *) b;
|
|
|
|
|
|
|
|
|
|
int name_cmp = strcmp(left->name, right->name);
|
|
|
|
|
if (name_cmp)
|
|
|
|
|
return name_cmp;
|
|
|
|
|
|
|
|
|
|
if (left->ns < right->ns)
|
|
|
|
|
return -1;
|
|
|
|
|
if (left->ns > right->ns)
|
|
|
|
|
return 1;
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void starch_benchmark_usage(const char *argv0)
|
|
|
|
|
{
|
|
|
|
|
fprintf(stderr,
|
|
|
|
|
"Usage: %s [OPTION]... [FUNCTION]...\n"
|
|
|
|
|
"Benchmarks starch functions and optionally writes a sorted wisdom file.\n"
|
|
|
|
|
"\n"
|
|
|
|
|
" -r FILE Read initial wisdom from FILE\n"
|
|
|
|
|
" -o FILE Write sorted wisdom to FILE\n"
|
|
|
|
|
" -F FLAVOR Add FLAVOR to whitelist\n"
|
|
|
|
|
" (default: no whitelist, run all runtime-supported flavors)\n"
|
|
|
|
|
" -N FLAVOR Add FLAVOR to blacklist\n"
|
|
|
|
|
" (default: no blacklist, run all runtime-supported flavors)\n"
|
|
|
|
|
" -l List compiled-in implementations but don't benchmark them\n"
|
|
|
|
|
" -V Run validation tests, but don't run benchmarks\n"
|
|
|
|
|
" -t Include only the top candidate per function in wisdom output\n"
|
|
|
|
|
" -i ITERS Run benchmark ITERS times and use the mean. If ITERS > 2, ignore\n"
|
|
|
|
|
" the smallest and largest runs when calculating the mean.\n"
|
|
|
|
|
" (default: 1 iteration)\n"
|
|
|
|
|
" FUNCTION Run benchmarks for these functions only\n"
|
|
|
|
|
" (default: benchmark all functions)\n"
|
|
|
|
|
"\n"
|
|
|
|
|
"Supported flavors: "
|
2021-02-09 06:05:05 +00:00
|
|
|
% for flavor in sorted(gen.flavors.values()):
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules")
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRQSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
#ifdef ${flavor.macro}
|
|
|
|
|
"${flavor.name} "
|
|
|
|
|
#endif
|
|
|
|
|
% endfor
|
|
|
|
|
"\n"
|
|
|
|
|
"Supported functions: "
|
2021-02-09 06:05:05 +00:00
|
|
|
% for function in sorted(gen.functions.values()):
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules")
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRQSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
"${function.name} "
|
|
|
|
|
% endfor
|
|
|
|
|
"\n", argv0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Add a flavor name to a flavor list.
 *
 * Allocates a new node and links it in at the HEAD of *list (despite the
 * "append" in the name, the newest entry ends up first).  Only the pointer
 * `flavor` is stored; the string itself is not copied, so it must stay valid
 * for as long as the list is used.
 *
 * Allocation failure is treated as fatal: a message is printed to stderr and
 * the process exits with status 1.
 */
static void starch_benchmark_append_flavor(const char *flavor, starch_benchmark_flavor_list **list)
{
    starch_benchmark_flavor_list *node = malloc(sizeof *node);

    if (node == NULL) {
        fprintf(stderr, "malloc: %s\n", strerror(errno));
        exit(1);
    }

    node->next = *list;
    node->flavor = flavor;
    *list = node;
}
|
|
|
|
|
|
|
|
|
|
int main(int argc, char **argv)
|
|
|
|
|
{
|
|
|
|
|
int specific = 0;
|
|
|
|
|
const char *output_path = NULL;
|
|
|
|
|
|
|
|
|
|
int opt;
|
|
|
|
|
while ((opt = getopt(argc, argv, "r:o:F:N:i:lhtV")) != -1) {
|
|
|
|
|
switch (opt) {
|
|
|
|
|
case 'r':
|
|
|
|
|
if (${gen.sym("read_wisdom")}(optarg) < 0) {
|
|
|
|
|
fprintf(stderr, "%s: cannot read %s: %s\n", argv[0], optarg, strerror(errno));
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
fprintf(stderr, "%s: loaded wisdom file %s\n", argv[0], optarg);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 'o':
|
|
|
|
|
output_path = optarg;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 'F':
|
|
|
|
|
if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_blacklist)) {
|
|
|
|
|
fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg);
|
|
|
|
|
return 2;
|
|
|
|
|
}
|
|
|
|
|
starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_whitelist);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 'N':
|
|
|
|
|
if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_whitelist)) {
|
|
|
|
|
fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg);
|
|
|
|
|
return 2;
|
|
|
|
|
}
|
|
|
|
|
starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_blacklist);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 'l':
|
|
|
|
|
starch_benchmark_list_only = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 't':
|
|
|
|
|
starch_benchmark_top_only = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 'i':
|
|
|
|
|
starch_benchmark_iterations = atoi(optarg);
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 'V':
|
|
|
|
|
starch_benchmark_validate_only = true;
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
case 'h':
|
|
|
|
|
starch_benchmark_usage(argv[0]);
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
case '?':
|
|
|
|
|
default:
|
|
|
|
|
starch_benchmark_usage(argv[0]);
|
|
|
|
|
return 2;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (starch_benchmark_list_only && output_path) {
|
|
|
|
|
fprintf(stderr, "%s: -o and -l options cannot be specified together\n", argv[0]);
|
|
|
|
|
return 2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
for (int i = optind; i < argc; ++i) {
|
2021-02-09 06:05:05 +00:00
|
|
|
% for function in sorted(gen.functions.values()):
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules")
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRQSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
if (!strcmp(argv[i], "${function.name}")) {
|
|
|
|
|
specific = 1;
|
|
|
|
|
% if function.benchmark:
|
|
|
|
|
starch_benchmark_all_${function.name}();
|
|
|
|
|
% else:
|
|
|
|
|
fprintf(stderr, "=== ${function.name} ===\n");
|
|
|
|
|
fprintf(stderr, " (no benchmark support defined)\n");
|
|
|
|
|
% endif
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
% endfor
|
|
|
|
|
|
|
|
|
|
fprintf(stderr, "%s: unrecognized function name: %s\n", argv[0], argv[i]);
|
|
|
|
|
return 2;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!specific) {
|
2021-02-09 06:05:05 +00:00
|
|
|
% for function in sorted(gen.functions.values()):
|
Move all converters to starch-based implementations (#97)
* Switch all conversion routines to use starch.
main user-visible changes:
* ensure you check out submodules ('git clone --recurse-submodules")
* --version shows the CPU features and DSP implementations in use
* --wisdom allows overriding of the built-in architecture wisdom
* --dcfilter no longer supported
* "starch-benchmark" binary will benchmark all options on the
current machine and can produce a wisdom file to feed to
the --wisdom option
If you have a usecase for --dcfilter, please get in touch and
let me know - it's an edge case and for now there's no starch/DSP
support for it, but support can be written if needed.
In almost all cases the new conversion routines are slightly or
substantially faster than the old conversion routines. The only case
that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower
due to changing from heavily approximated lookup tables to higher
quality results (but SC16 is probably already out of reach of a Pi 0)
* No need to build with SC16Q11_TABLE_BITS any more
* Add oneoff/uc8_capture_stats
(reads a UC8 capture; measures min/max/mean I and Q)
* Switch UC8 conversion to 127.4 center, 128 range.
Looking at actual UC8 captures from a RTL2832, the mean I and Q
are actually at 127.4, so use that as the zero point.
This means that the resulting I/Q maximum values could be as large as
127.6. Switch to 128 for simplicity.
* Switch to the new UC8 zero offset in benchmarks, fix some bugs
* Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements
* Ditch UC8 approximation path, add a NEON VRQSQRTE path.
* Tweak the SC16 exact path, add a new impl that uses a mix of
u32 & floats.
* SC16Q11 impl tweaks:
* add a u32->float exact path
* ditch the approximation path
* add a NEON VRSQRTE path
* add a 12-bit table path (using the full signed I/Q value, not absolute value)
* Ditch SC16 approximation path, add NEON vrsqrte path
* Add oneoff/dsp_error_measurement
This runs sample input through the DSP functions that are
allowed to be inexact and dumps the results as a TSV suitable for
feeding to gnuplot to look at the actual errors.
* Update make clean, make wisdom targets
* Update wisdom based on benchmarking
* Preserve the raw wisdom benchmark data
* Update to latest starch
* Update .gitignore for new wisdom files
* Update starch generated code
* Build starch-benchmark as part of the 'all' target
* Use wisdom from /etc/dump1090-fa/wisdom.local if present
* Package starch-benchmark and a helper script to generate local wisdom data
* Remove submodules in preparation for importing them directly
* Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0
* Import starch at commit a725c8491dc33a321565d451b385131e589d8490
from https://github.com/flightaware/starch
2021-01-21 11:45:00 +00:00
|
|
|
% if function.benchmark:
|
|
|
|
|
starch_benchmark_all_${function.name}();
|
|
|
|
|
% else:
|
|
|
|
|
fprintf(stderr, "=== ${function.name} ===\n");
|
|
|
|
|
fprintf(stderr, " (no benchmark support defined)\n");
|
|
|
|
|
% endif
|
|
|
|
|
% endfor
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (output_path) {
|
|
|
|
|
FILE *out = fopen(output_path, "w");
|
|
|
|
|
if (!out) {
|
|
|
|
|
fprintf(stderr, "%s: cannot open %s: %s\n", argv[0], output_path, strerror(errno));
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fprintf(out, "# generated by ");
|
|
|
|
|
for (int i = 0; i < argc; ++i)
|
|
|
|
|
fprintf(out, "%s ", argv[i]);
|
|
|
|
|
fprintf(out, "\n\n");
|
|
|
|
|
|
|
|
|
|
qsort(starch_benchmark_results, starch_benchmark_result_count, sizeof(*starch_benchmark_results), starch_benchmark_compare_result);
|
|
|
|
|
|
|
|
|
|
const char *last_name = NULL;
|
|
|
|
|
bool first = true;
|
|
|
|
|
for (unsigned i = 0; i < starch_benchmark_result_count; ++i) {
|
|
|
|
|
starch_benchmark_result *result = &starch_benchmark_results[i];
|
|
|
|
|
if (last_name && strcmp(last_name, result->name) != 0) {
|
|
|
|
|
fprintf(out, "\n");
|
|
|
|
|
first = true;
|
|
|
|
|
}
|
|
|
|
|
last_name = result->name;
|
|
|
|
|
if (starch_benchmark_top_only && !first)
|
|
|
|
|
continue;
|
|
|
|
|
fprintf(out, "%-40s %-40s # %" PRIu64 " ns/call\n", result->name, result->impl, result->ns);
|
|
|
|
|
first = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fclose(out);
|
|
|
|
|
fprintf(stderr, "%s: wrote sorted wisdom to %s\n", argv[0], output_path);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return starch_benchmark_validation_failed ? 1 : 0;
|
|
|
|
|
}
|