From 2480e5169c1f70c421b906fb653237e47e0c7f10 Mon Sep 17 00:00:00 2001 From: Oliver Jowett Date: Thu, 8 Jul 2021 18:53:02 +0800 Subject: [PATCH] Use a starch implementation for the burst-detection sample counting loop. --- adaptive.c | 8 +- dsp/benchmark/count_above_u16_benchmark.c | 39 ++++ dsp/generated/benchmark.c | 250 ++++++++++++++++++++++ dsp/generated/dispatcher.c | 207 ++++++++++++++++++ dsp/generated/flavor.armv7a_neon_vfpv4.c | 2 + dsp/generated/flavor.armv8_neon_simd.c | 2 + dsp/generated/flavor.generic.c | 1 + dsp/generated/flavor.x86_avx2.c | 2 + dsp/generated/makefile.aarch64 | 8 +- dsp/generated/makefile.arm | 8 +- dsp/generated/makefile.generic | 6 +- dsp/generated/makefile.x86 | 8 +- dsp/generated/starch.h | 41 ++++ dsp/impl/count_above_u16.c | 58 +++++ dsp/starchgen.py | 1 + dump1090.c | 1 + wisdom.arm | 6 + wisdom.generic | 3 + wisdom.x86 | 6 + wisdom/wisdom.i7-6500u | 7 + wisdom/wisdom.pi4b | 10 + 21 files changed, 653 insertions(+), 21 deletions(-) create mode 100644 dsp/benchmark/count_above_u16_benchmark.c create mode 100644 dsp/impl/count_above_u16.c diff --git a/adaptive.c b/adaptive.c index cb609e6..f96450f 100644 --- a/adaptive.c +++ b/adaptive.c @@ -292,12 +292,8 @@ static void adaptive_burst_scan_windows(uint16_t *buf, unsigned windows) // return the number of loud samples seen static inline unsigned adaptive_burst_count_samples(uint16_t *buf, unsigned n) { - unsigned counter = 0; - while (n--) { - if (buf[0] > 46395) // -3dBFS - ++counter; - ++buf; - } + unsigned counter; + starch_count_above_u16(buf, n, 46395 /* -3dBFS */, &counter); return counter; } diff --git a/dsp/benchmark/count_above_u16_benchmark.c b/dsp/benchmark/count_above_u16_benchmark.c new file mode 100644 index 0000000..74b51db --- /dev/null +++ b/dsp/benchmark/count_above_u16_benchmark.c @@ -0,0 +1,39 @@ +#include + +void STARCH_BENCHMARK(count_above_u16) (void) +{ + uint16_t *in = NULL; + const unsigned len = 96; /* Typical use is with short burst windows (40us) */ + const unsigned threshold = 46395; /* -3dBFS */ + + if (!(in = STARCH_BENCHMARK_ALLOC(len, uint16_t))) { + goto done; + } + + srand(1); + for (unsigned i = 0; i < len; ++i) { + in[i] = rand() % 65536; + } + + unsigned count; + STARCH_BENCHMARK_RUN( count_above_u16, in, len, threshold, &count ); + + done: + STARCH_BENCHMARK_FREE(in); +} + +bool STARCH_BENCHMARK_VERIFY(count_above_u16) (const uint16_t *in, unsigned len, uint16_t threshold, unsigned *out_count) +{ + unsigned expected = 0; + for (unsigned i = 0; i < len; ++i) { + if (in[i] >= threshold) + ++expected; + } + + if (expected != *out_count) { + fprintf(stderr, "verification failed: expected count %u, got count %u\n", expected, *out_count); + return false; + } + + return true; +} diff --git a/dsp/generated/benchmark.c b/dsp/generated/benchmark.c index 7a15724..582d09f 100644 --- a/dsp/generated/benchmark.c +++ b/dsp/generated/benchmark.c @@ -112,6 +112,230 @@ static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_ben } +/* prototypes for benchmark helpers provided by user code */ +void starch_count_above_u16_benchmark (void); +bool starch_count_above_u16_benchmark_verify ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_count_above_u16_benchmark(void); + +static void starch_benchmark_one_count_above_u16( starch_count_above_u16_regentry * _entry, const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + + /* verify correctness of the output */ + if (! starch_count_above_u16_benchmark_verify ( arg0, arg1, arg2, arg3 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "count_above_u16"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_count_above_u16( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ) +{ + for (starch_count_above_u16_regentry *_entry = starch_count_above_u16_registry; _entry->name; ++_entry) { + starch_benchmark_one_count_above_u16( _entry, arg0, arg1, arg2, arg3 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_count_above_u16_aligned_benchmark (void); +bool starch_count_above_u16_aligned_benchmark_verify ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_count_above_u16_aligned_benchmark(void); + +static void starch_benchmark_one_count_above_u16_aligned( starch_count_above_u16_aligned_regentry * _entry, const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + + /* verify correctness of the output */ + if (! starch_count_above_u16_aligned_benchmark_verify ( arg0, arg1, arg2, arg3 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "count_above_u16_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_count_above_u16_aligned( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ) +{ + for (starch_count_above_u16_aligned_regentry *_entry = starch_count_above_u16_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_count_above_u16_aligned( _entry, arg0, arg1, arg2, arg3 ); + } +} + /* prototypes for benchmark helpers provided by user code */ void starch_magnitude_power_uc8_benchmark (void); bool starch_magnitude_power_uc8_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); @@ -1246,6 +1470,7 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #define STARCH_BENCHMARK_ALLOC(_count, _type) ((_type *) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type))) #define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) +#include "../benchmark/count_above_u16_benchmark.c" #include "../benchmark/magnitude_power_uc8_benchmark.c" #include "../benchmark/magnitude_sc16_benchmark.c" #include "../benchmark/magnitude_sc16q11_benchmark.c" @@ -1274,12 +1499,23 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #define STARCH_BENCHMARK_ALLOC(_count, _type) ((_type *) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type))) #define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) +#include "../benchmark/count_above_u16_benchmark.c" #include "../benchmark/magnitude_power_uc8_benchmark.c" #include "../benchmark/magnitude_sc16_benchmark.c" #include "../benchmark/magnitude_sc16q11_benchmark.c" #include "../benchmark/magnitude_uc8_benchmark.c" #include "../benchmark/mean_power_u16_benchmark.c" +static void starch_benchmark_all_count_above_u16(void) +{ + fprintf(stderr, "==== count_above_u16 ===\n"); + starch_count_above_u16_benchmark (); +} +static void starch_benchmark_all_count_above_u16_aligned(void) +{ + fprintf(stderr, "==== count_above_u16_aligned ===\n"); + starch_count_above_u16_aligned_benchmark (); +} static void starch_benchmark_all_magnitude_power_uc8(void) { fprintf(stderr, "==== magnitude_power_uc8 ===\n"); @@ -1383,6 +1619,8 @@ static void starch_benchmark_usage(const char *argv0) #endif "\n" "Supported functions: " + "count_above_u16 " + "count_above_u16_aligned " "magnitude_power_uc8 " "magnitude_power_uc8_aligned " "magnitude_sc16 " @@ -1478,6 +1716,16 @@ int main(int argc, char **argv) } for (int i = optind; i < argc; ++i) { + if (!strcmp(argv[i], "count_above_u16")) { + specific = 1; + starch_benchmark_all_count_above_u16(); + continue; + } + if (!strcmp(argv[i], "count_above_u16_aligned")) { + specific = 1; + starch_benchmark_all_count_above_u16_aligned(); + continue; + } if (!strcmp(argv[i], "magnitude_power_uc8")) { specific = 1; starch_benchmark_all_magnitude_power_uc8(); @@ -1534,6 +1782,8 @@ int main(int argc, char **argv) } if (!specific) { + starch_benchmark_all_count_above_u16(); + starch_benchmark_all_count_above_u16_aligned(); starch_benchmark_all_magnitude_power_uc8(); starch_benchmark_all_magnitude_power_uc8_aligned(); starch_benchmark_all_magnitude_sc16(); diff --git a/dsp/generated/dispatcher.c b/dsp/generated/dispatcher.c index 09bba10..0c7fed1 100644 --- a/dsp/generated/dispatcher.c +++ b/dsp/generated/dispatcher.c @@ -19,6 +19,165 @@ static int starch_regentry_rank_compare (const void *l, const void *r) return left->rank - right->rank; } +/* dispatcher / registry for count_above_u16 */ + +starch_count_above_u16_regentry * starch_count_above_u16_select() { + for (starch_count_above_u16_regentry *entry = starch_count_above_u16_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_count_above_u16_dispatch ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ) { + starch_count_above_u16_regentry *entry = starch_count_above_u16_select(); + if (!entry) + abort(); + + starch_count_above_u16 = entry->callable; + starch_count_above_u16 ( arg0, arg1, arg2, arg3 ); +} + +starch_count_above_u16_ptr starch_count_above_u16 = starch_count_above_u16_dispatch; + +void starch_count_above_u16_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_count_above_u16_regentry *entry; + for (entry = starch_count_above_u16_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_count_above_u16_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_count_above_u16_registry, entry - starch_count_above_u16_registry, sizeof(starch_count_above_u16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_count_above_u16 = starch_count_above_u16_dispatch; +} + +starch_count_above_u16_regentry starch_count_above_u16_registry[] = { + +#ifdef STARCH_MIX_AARCH64 + { 0, "generic_armv8_neon_simd", "armv8_neon_simd", starch_count_above_u16_generic_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "neon_armv8_neon_simd", "armv8_neon_simd", starch_count_above_u16_neon_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "generic_generic", "generic", starch_count_above_u16_generic_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_count_above_u16_neon_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "generic_generic", "generic", starch_count_above_u16_generic_generic, NULL }, + { 2, "generic_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_count_above_u16_generic_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_GENERIC + { 0, "generic_generic", "generic", starch_count_above_u16_generic_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_X86 + { 0, "generic_x86_avx2", "x86_avx2", starch_count_above_u16_generic_x86_avx2, cpu_supports_avx2 }, + { 1, "generic_generic", "generic", starch_count_above_u16_generic_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for count_above_u16_aligned */ + +starch_count_above_u16_aligned_regentry * starch_count_above_u16_aligned_select() { + for (starch_count_above_u16_aligned_regentry *entry = starch_count_above_u16_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_count_above_u16_aligned_dispatch ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ) { + starch_count_above_u16_aligned_regentry *entry = starch_count_above_u16_aligned_select(); + if (!entry) + abort(); + + starch_count_above_u16_aligned = entry->callable; + starch_count_above_u16_aligned ( arg0, arg1, arg2, arg3 ); +} + +starch_count_above_u16_aligned_ptr starch_count_above_u16_aligned = starch_count_above_u16_aligned_dispatch; + +void starch_count_above_u16_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_count_above_u16_aligned_regentry *entry; + for (entry = starch_count_above_u16_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_count_above_u16_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_count_above_u16_aligned_registry, entry - starch_count_above_u16_aligned_registry, sizeof(starch_count_above_u16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_count_above_u16_aligned = starch_count_above_u16_aligned_dispatch; +} + +starch_count_above_u16_aligned_regentry starch_count_above_u16_aligned_registry[] = { + +#ifdef STARCH_MIX_AARCH64 + { 0, "generic_armv8_neon_simd_aligned", "armv8_neon_simd", starch_count_above_u16_aligned_generic_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "neon_armv8_neon_simd_aligned", "armv8_neon_simd", starch_count_above_u16_aligned_neon_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "generic_armv8_neon_simd", "armv8_neon_simd", starch_count_above_u16_generic_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "neon_armv8_neon_simd", "armv8_neon_simd", starch_count_above_u16_neon_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "generic_generic", "generic", starch_count_above_u16_generic_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_count_above_u16_neon_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "generic_generic", "generic", starch_count_above_u16_generic_generic, NULL }, + { 2, "generic_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_count_above_u16_aligned_generic_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "neon_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_count_above_u16_aligned_neon_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "generic_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_count_above_u16_generic_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_GENERIC + { 0, "generic_generic", "generic", starch_count_above_u16_generic_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_X86 + { 0, "generic_x86_avx2_aligned", "x86_avx2", starch_count_above_u16_aligned_generic_x86_avx2, cpu_supports_avx2 }, + { 1, "generic_generic", "generic", starch_count_above_u16_generic_generic, NULL }, + { 2, "generic_x86_avx2", "x86_avx2", starch_count_above_u16_generic_x86_avx2, cpu_supports_avx2 }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + /* dispatcher / registry for magnitude_power_uc8 */ starch_magnitude_power_uc8_regentry * starch_magnitude_power_uc8_select() { @@ -992,6 +1151,14 @@ int starch_read_wisdom (const char * path) return -1; /* reset all ranks to identify entries not listed in the wisdom file; we'll assign ranks at the end to produce a stable sort */ + int rank_count_above_u16 = 0; + for (starch_count_above_u16_regentry *entry = starch_count_above_u16_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_count_above_u16_aligned = 0; + for (starch_count_above_u16_aligned_regentry *entry = starch_count_above_u16_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } int rank_magnitude_power_uc8 = 0; for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { entry->rank = 0; @@ -1065,6 +1232,24 @@ int starch_read_wisdom (const char * path) *end = 0; /* try to find a matching registry entry */ + if (!strcmp(name, "count_above_u16")) { + for (starch_count_above_u16_regentry *entry = starch_count_above_u16_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_count_above_u16; + break; + } + } + continue; + } + if (!strcmp(name, "count_above_u16_aligned")) { + for (starch_count_above_u16_aligned_regentry *entry = starch_count_above_u16_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_count_above_u16_aligned; + break; + } + } + continue; + } if (!strcmp(name, "magnitude_power_uc8")) { for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { if (!strcmp(impl, entry->name)) { @@ -1165,6 +1350,28 @@ int starch_read_wisdom (const char * path) fclose(fp); /* assign ranks to unmatched items to (stable) sort them last; re-sort everything */ + { + starch_count_above_u16_regentry *entry; + for (entry = starch_count_above_u16_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_count_above_u16; + } + qsort(starch_count_above_u16_registry, entry - starch_count_above_u16_registry, sizeof(starch_count_above_u16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_count_above_u16 = starch_count_above_u16_dispatch; + } + { + starch_count_above_u16_aligned_regentry *entry; + for (entry = starch_count_above_u16_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_count_above_u16_aligned; + } + qsort(starch_count_above_u16_aligned_registry, entry - starch_count_above_u16_aligned_registry, sizeof(starch_count_above_u16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_count_above_u16_aligned = starch_count_above_u16_aligned_dispatch; + } { starch_magnitude_power_uc8_regentry *entry; for (entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { diff --git a/dsp/generated/flavor.armv7a_neon_vfpv4.c b/dsp/generated/flavor.armv7a_neon_vfpv4.c index cf8b5cc..0d00348 100644 --- a/dsp/generated/flavor.armv7a_neon_vfpv4.c +++ b/dsp/generated/flavor.armv7a_neon_vfpv4.c @@ -14,6 +14,7 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_neon_vfpv4 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#include "../impl/count_above_u16.c" #include "../impl/magnitude_power_uc8.c" #include "../impl/magnitude_sc16.c" #include "../impl/magnitude_sc16q11.c" @@ -33,6 +34,7 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_neon_vfpv4 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#include "../impl/count_above_u16.c" #include "../impl/magnitude_power_uc8.c" #include "../impl/magnitude_sc16.c" #include "../impl/magnitude_sc16q11.c" diff --git a/dsp/generated/flavor.armv8_neon_simd.c b/dsp/generated/flavor.armv8_neon_simd.c index 276e47e..1115d34 100644 --- a/dsp/generated/flavor.armv8_neon_simd.c +++ b/dsp/generated/flavor.armv8_neon_simd.c @@ -14,6 +14,7 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv8_neon_simd #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#include "../impl/count_above_u16.c" #include "../impl/magnitude_power_uc8.c" #include "../impl/magnitude_sc16.c" #include "../impl/magnitude_sc16q11.c" @@ -33,6 +34,7 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv8_neon_simd #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#include "../impl/count_above_u16.c" #include "../impl/magnitude_power_uc8.c" #include "../impl/magnitude_sc16.c" #include "../impl/magnitude_sc16q11.c" diff --git a/dsp/generated/flavor.generic.c b/dsp/generated/flavor.generic.c index 8b8fa0b..f0260b2 100644 --- a/dsp/generated/flavor.generic.c +++ b/dsp/generated/flavor.generic.c @@ -13,6 +13,7 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## generic #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#include "../impl/count_above_u16.c" #include "../impl/magnitude_power_uc8.c" #include "../impl/magnitude_sc16.c" #include "../impl/magnitude_sc16q11.c" diff --git a/dsp/generated/flavor.x86_avx2.c b/dsp/generated/flavor.x86_avx2.c index de56b0d..c0eec7e 100644 --- a/dsp/generated/flavor.x86_avx2.c +++ b/dsp/generated/flavor.x86_avx2.c @@ -13,6 +13,7 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_avx2 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#include "../impl/count_above_u16.c" #include "../impl/magnitude_power_uc8.c" #include "../impl/magnitude_sc16.c" #include "../impl/magnitude_sc16q11.c" @@ -32,6 +33,7 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_avx2 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#include "../impl/count_above_u16.c" #include "../impl/magnitude_power_uc8.c" #include "../impl/magnitude_sc16.c" #include "../impl/magnitude_sc16q11.c" diff --git a/dsp/generated/makefile.aarch64 b/dsp/generated/makefile.aarch64 index d9bed4c..082f081 100644 --- a/dsp/generated/makefile.aarch64 +++ b/dsp/generated/makefile.aarch64 @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_AARCH64 -dsp/generated/flavor.armv8_neon_simd.o: dsp/generated/flavor.armv8_neon_simd.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.armv8_neon_simd.o: dsp/generated/flavor.armv8_neon_simd.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv8-a+simd -ffast-math dsp/generated/flavor.armv8_neon_simd.c -o dsp/generated/flavor.armv8_neon_simd.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.armv8_neon_simd.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/count_above_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.arm b/dsp/generated/makefile.arm index 58eaf5b..e79332d 100644 --- a/dsp/generated/makefile.arm +++ b/dsp/generated/makefile.arm @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_ARM -dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math dsp/generated/flavor.armv7a_neon_vfpv4.c -o dsp/generated/flavor.armv7a_neon_vfpv4.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.armv7a_neon_vfpv4.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/count_above_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.generic b/dsp/generated/makefile.generic index 7f261d9..39ea24d 100644 --- a/dsp/generated/makefile.generic +++ b/dsp/generated/makefile.generic @@ -21,16 +21,16 @@ STARCH_CFLAGS := -DSTARCH_MIX_GENERIC -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/count_above_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.x86 b/dsp/generated/makefile.x86 index e88d3e1..990a6eb 100644 --- a/dsp/generated/makefile.x86 +++ b/dsp/generated/makefile.x86 @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_X86 -dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math dsp/generated/flavor.x86_avx2.c -o dsp/generated/flavor.x86_avx2.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/count_above_u16.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.x86_avx2.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/count_above_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/starch.h b/dsp/generated/starch.h index a2b62e9..7fe97e9 100644 --- a/dsp/generated/starch.h +++ b/dsp/generated/starch.h @@ -195,6 +195,36 @@ extern starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_regi starch_mean_power_u16_aligned_regentry * starch_mean_power_u16_aligned_select(); void starch_mean_power_u16_aligned_set_wisdom( const char * const * received_wisdom ); +typedef void (* starch_count_above_u16_ptr) ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +extern starch_count_above_u16_ptr starch_count_above_u16; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_count_above_u16_ptr callable; + int (*flavor_supported)(); +} starch_count_above_u16_regentry; + +extern starch_count_above_u16_regentry starch_count_above_u16_registry[]; +starch_count_above_u16_regentry * starch_count_above_u16_select(); +void starch_count_above_u16_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_count_above_u16_aligned_ptr) ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +extern starch_count_above_u16_aligned_ptr starch_count_above_u16_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_count_above_u16_aligned_ptr callable; + int (*flavor_supported)(); +} starch_count_above_u16_aligned_regentry; + +extern starch_count_above_u16_aligned_regentry starch_count_above_u16_aligned_registry[]; +starch_count_above_u16_aligned_regentry * starch_count_above_u16_aligned_select(); +void starch_count_above_u16_aligned_set_wisdom( const char * const * received_wisdom ); + /* flavors and prototypes */ #ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 @@ -233,6 +263,10 @@ void starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg void starch_magnitude_sc16q11_aligned_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_count_above_u16_generic_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +void starch_count_above_u16_aligned_generic_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +void starch_count_above_u16_neon_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +void starch_count_above_u16_aligned_neon_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -279,6 +313,10 @@ void starch_magnitude_sc16q11_12bit_table_armv8_neon_simd ( const sc16_t * arg0, void starch_magnitude_sc16q11_aligned_12bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_count_above_u16_generic_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +void starch_count_above_u16_aligned_generic_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +void starch_count_above_u16_neon_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +void starch_count_above_u16_aligned_neon_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); void starch_magnitude_sc16_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -303,6 +341,7 @@ void starch_magnitude_sc16q11_exact_u32_generic ( const sc16_t * arg0, uint16_t void starch_magnitude_sc16q11_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_11bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_12bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_count_above_u16_generic_generic ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); #endif /* STARCH_FLAVOR_GENERIC */ @@ -337,6 +376,8 @@ void starch_magnitude_sc16q11_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16 void starch_magnitude_sc16q11_aligned_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_count_above_u16_generic_x86_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); +void starch_count_above_u16_aligned_generic_x86_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, unsigned * arg3 ); void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); diff --git a/dsp/impl/count_above_u16.c b/dsp/impl/count_above_u16.c new file mode 100644 index 0000000..485b1ff --- /dev/null +++ b/dsp/impl/count_above_u16.c @@ -0,0 +1,58 @@ +/* + * Count the number of samples in a uint16_t buffer that are >= a threshold. + */ +void STARCH_IMPL(count_above_u16, generic) (const uint16_t *in, unsigned len, uint16_t threshold, unsigned *out_count) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + unsigned count = 0; + while (len--) { + if (in_align[0] >= threshold) + ++count; + ++in_align; + } + + *out_count = count; +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(count_above_u16, neon, STARCH_FEATURE_NEON) (const uint16_t *in, unsigned len, uint16_t threshold, unsigned *out_count) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + const uint16x8_t threshold_x8 = vdupq_n_u16(threshold); + + int32x4_t accumulator0 = vdupq_n_s32(0); + int32x4_t accumulator1 = vdupq_n_s32(0); + + unsigned len8 = len >> 3; + while (len8--) { + uint16x8_t mag = vld1q_u16(in_align); + int16x8_t compare = vreinterpretq_s16_u16(vcgeq_u16(mag, threshold_x8)); + accumulator0 = vsubw_s16(accumulator0, vget_low_s16(compare)); + accumulator1 = vsubw_s16(accumulator1, vget_high_s16(compare)); + + in_align += 8; + } + + // sum accumulators across all lanes + int32x4_t sum2 = vaddq_s32(accumulator0, accumulator1); + int32x2_t sum4 = vadd_s32(vget_low_s32(sum2), vget_high_s32(sum2)); + int32x2_t sum8 = vpadd_s32(sum4, sum4); + int32x4_t sum8_x2 = vcombine_s32(sum8, sum8); + + unsigned len1 = len & 7; + while (len1--) { + uint16x4_t mag = vld1_dup_u16(in_align); + int16x4_t compare = vreinterpret_s16_u16(vcge_u16(mag, vget_low_u16(threshold_x8))); + sum8_x2 = vsubw_s16(sum8_x2, compare); + + in_align += 1; + } + + *out_count = vgetq_lane_s32(sum8_x2, 0); +} + +#endif diff --git a/dsp/starchgen.py b/dsp/starchgen.py index 81e5818..7c5c918 100755 --- a/dsp/starchgen.py +++ b/dsp/starchgen.py @@ -20,6 +20,7 @@ gen.add_function(name = 'magnitude_power_uc8', argtypes = ['const uc8_t *', 'uin gen.add_function(name = 'magnitude_sc16', argtypes = ['const sc16_t *', 'uint16_t *', 'unsigned'], aligned = True) gen.add_function(name = 'magnitude_sc16q11', argtypes = ['const sc16_t *', 'uint16_t *', 'unsigned'], aligned = True) gen.add_function(name = 'mean_power_u16', argtypes = ['const uint16_t *', 'unsigned', 'double *', 'double *'], aligned = True) +gen.add_function(name = 'count_above_u16', argtypes = ['const uint16_t *', 'unsigned', 'uint16_t', 'unsigned *'], aligned = True) gen.add_feature(name='neon', description='ARM NEON') diff --git a/dump1090.c b/dump1090.c index 6f2b1c5..611abf1 100644 --- a/dump1090.c +++ b/dump1090.c @@ -305,6 +305,7 @@ static void showDSP() SHOW(magnitude_sc16); SHOW(magnitude_sc16q11); SHOW(mean_power_u16); + SHOW(count_above_u16); #undef SHOW diff --git a/wisdom.arm b/wisdom.arm index 96dae84..ee045e1 100644 --- a/wisdom.arm +++ b/wisdom.arm @@ -29,3 +29,9 @@ mean_power_u16 u64_generic mean_power_u16_aligned u32_armv7a_neon_vfpv4_aligned # 44929 ns/call mean_power_u16_aligned u64_generic # 934445 ns/call + +count_above_u16 neon_armv7a_neon_vfpv4 # 35 ns/call +count_above_u16 generic_generic # 178 ns/call + +count_above_u16_aligned neon_armv7a_neon_vfpv4 # 34 ns/call +count_above_u16_aligned generic_generic # 179 ns/call diff --git a/wisdom.generic b/wisdom.generic index 2c924d3..df854bc 100644 --- a/wisdom.generic +++ b/wisdom.generic @@ -14,3 +14,6 @@ magnitude_uc8_aligned lookup_unroll_4_generic mean_power_u16 u32_generic mean_power_u16_aligned u32_generic + +count_above_u16 generic_generic +count_above_u16_aligned generic_generic diff --git a/wisdom.x86 b/wisdom.x86 index 28a7719..35a11f3 100644 --- a/wisdom.x86 +++ b/wisdom.x86 @@ -29,3 +29,9 @@ mean_power_u16 u32_generic mean_power_u16_aligned u32_x86_avx2_aligned # 11572 ns/call mean_power_u16_aligned u32_generic # 18207 ns/call + +count_above_u16 generic_x86_avx2 # 20 ns/call +count_above_u16 generic_generic # 30 ns/call + +count_above_u16_aligned generic_x86_avx2_aligned # 15 ns/call +count_above_u16_aligned generic_generic # 31 ns/call diff --git a/wisdom/wisdom.i7-6500u b/wisdom/wisdom.i7-6500u index 78b93bf..763bb15 100644 --- a/wisdom/wisdom.i7-6500u +++ b/wisdom/wisdom.i7-6500u @@ -88,3 +88,10 @@ mean_power_u16_aligned u64_x86_avx2_aligned mean_power_u16_aligned u64_x86_avx2 # 31283 ns/call mean_power_u16_aligned u64_generic # 39639 ns/call mean_power_u16_aligned float_generic # 105615 ns/call + +count_above_u16 generic_x86_avx2 # 20 ns/call +count_above_u16 generic_generic # 30 ns/call + +count_above_u16_aligned generic_x86_avx2_aligned # 15 ns/call +count_above_u16_aligned generic_x86_avx2 # 19 ns/call +count_above_u16_aligned generic_generic # 31 ns/call diff --git a/wisdom/wisdom.pi4b b/wisdom/wisdom.pi4b index 09a053c..f4ada3f 100644 --- a/wisdom/wisdom.pi4b +++ b/wisdom/wisdom.pi4b @@ -105,3 +105,13 @@ mean_power_u16_aligned float_armv7a_neon_vfpv4 mean_power_u16_aligned u64_generic # 131637 ns/call mean_power_u16_aligned u32_generic # 132092 ns/call mean_power_u16_aligned float_generic # 187127 ns/call + +count_above_u16 neon_armv7a_neon_vfpv4 # 35 ns/call +count_above_u16 generic_armv7a_neon_vfpv4 # 56 ns/call +count_above_u16 generic_generic # 178 ns/call + +count_above_u16_aligned neon_armv7a_neon_vfpv4_aligned # 34 ns/call +count_above_u16_aligned neon_armv7a_neon_vfpv4 # 34 ns/call +count_above_u16_aligned generic_armv7a_neon_vfpv4_aligned # 53 ns/call +count_above_u16_aligned generic_armv7a_neon_vfpv4 # 53 ns/call +count_above_u16_aligned generic_generic # 179 ns/call