From 132702cfa7d009b979d7c8a01b6e63ad904ecbc1 Mon Sep 17 00:00:00 2001 From: Oliver Jowett Date: Tue, 9 Feb 2021 14:15:33 +0800 Subject: [PATCH] Regenerate starch-generated code for starch update & aarch64 wisdom --- dsp/generated/benchmark.c | 502 +++++++++++----------- dsp/generated/dispatcher.c | 734 ++++++++++++++++----------------- dsp/generated/makefile.aarch64 | 39 ++ dsp/generated/makefile.arm | 8 +- dsp/generated/makefile.generic | 6 +- dsp/generated/makefile.x86 | 8 +- dsp/generated/starch.h | 178 ++++---- 7 files changed, 757 insertions(+), 718 deletions(-) create mode 100644 dsp/generated/makefile.aarch64 diff --git a/dsp/generated/benchmark.c b/dsp/generated/benchmark.c index ce38e01..7a15724 100644 --- a/dsp/generated/benchmark.c +++ b/dsp/generated/benchmark.c @@ -112,230 +112,6 @@ static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_ben } -/* prototypes for benchmark helpers provided by user code */ -void starch_magnitude_uc8_benchmark (void); -bool starch_magnitude_uc8_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); - -/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ -void starch_magnitude_uc8_benchmark(void); - -static void starch_benchmark_one_magnitude_uc8( starch_magnitude_uc8_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) -{ - fprintf(stderr, " %-40s ", _entry->name); - - /* test for support */ - if (_entry->flavor_supported && !(_entry->flavor_supported())) { - fprintf(stderr, "unsupported\n"); - return; - } - - if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { - fprintf(stderr, "skipped (not whitelisted)\n"); - return; - } - - if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { - fprintf(stderr, "skipped (blacklisted)\n"); - return; - } - - if (starch_benchmark_list_only) { - fprintf(stderr, "supported\n"); - return; - } - - /* initial warmup */ - for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) - _entry->callable ( arg0, arg1, arg2 ); - - /* verify correctness of the output */ - if (! starch_magnitude_uc8_benchmark_verify ( arg0, arg1, arg2 )) { - fprintf(stderr, "skipped (verification failed)\n"); - starch_benchmark_validation_failed = true; - return; - } - if (starch_benchmark_validate_only) { - fprintf(stderr, "validation ok\n"); - return; - } - - /* pre-benchmark, find a loop count that takes at least 100ms */ - starch_benchmark_time _start, _end; - uint64_t _elapsed = 0; - uint64_t _loops = 127; - while (_elapsed < 100000000) { - _loops *= 2; - starch_benchmark_get_time(&_start); - for (uint64_t _loop = 0; _loop < _loops; ++_loop) - _entry->callable ( arg0, arg1, arg2 ); - starch_benchmark_get_time(&_end); - _elapsed = starch_benchmark_elapsed(&_start, &_end); - } - - /* real benchmark, run for approx 1 second */ - _loops = _loops * 1000000000 / _elapsed; - - _elapsed = 0; - uint64_t _elapsed_min = UINT64_MAX; - uint64_t _elapsed_max = 0; - for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { - starch_benchmark_get_time(&_start); - for (uint64_t _loop = 0; _loop < _loops; ++_loop) - _entry->callable ( arg0, arg1, arg2 ); - starch_benchmark_get_time(&_end); - uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); - if (_elapsed_one < _elapsed_min) - _elapsed_min = _elapsed_one; - if (_elapsed_one > _elapsed_max) - _elapsed_max = _elapsed_one; - _elapsed += _elapsed_one; - } - - uint64_t _per_loop; - if (starch_benchmark_iterations > 2) - _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); - else - _per_loop = _elapsed / _loops / starch_benchmark_iterations; - - fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); - - if (starch_benchmark_result_count >= starch_benchmark_result_size) { - if (!starch_benchmark_result_size) - starch_benchmark_result_size = 64; - else - starch_benchmark_result_size *= 2; - starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); - if (!starch_benchmark_results) { - fprintf(stderr, "realloc: %s\n", strerror(errno)); - exit(1); - } - } - - starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_uc8"; - starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; - starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; - ++starch_benchmark_result_count; -} - -static void starch_benchmark_run_magnitude_uc8( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) -{ - for (starch_magnitude_uc8_regentry *_entry = starch_magnitude_uc8_registry; _entry->name; ++_entry) { - starch_benchmark_one_magnitude_uc8( _entry, arg0, arg1, arg2 ); - } -} - -/* prototypes for benchmark helpers provided by user code */ -void starch_magnitude_uc8_aligned_benchmark (void); -bool starch_magnitude_uc8_aligned_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); - -/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ -void starch_magnitude_uc8_aligned_benchmark(void); - -static void starch_benchmark_one_magnitude_uc8_aligned( starch_magnitude_uc8_aligned_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) -{ - fprintf(stderr, " %-40s ", _entry->name); - - /* test for support */ - if (_entry->flavor_supported && !(_entry->flavor_supported())) { - fprintf(stderr, "unsupported\n"); - return; - } - - if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { - fprintf(stderr, "skipped (not whitelisted)\n"); - return; - } - - if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { - fprintf(stderr, "skipped (blacklisted)\n"); - return; - } - - if (starch_benchmark_list_only) { - fprintf(stderr, "supported\n"); - return; - } - - /* initial warmup */ - for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) - _entry->callable ( arg0, arg1, arg2 ); - - /* verify correctness of the output */ - if (! starch_magnitude_uc8_aligned_benchmark_verify ( arg0, arg1, arg2 )) { - fprintf(stderr, "skipped (verification failed)\n"); - starch_benchmark_validation_failed = true; - return; - } - if (starch_benchmark_validate_only) { - fprintf(stderr, "validation ok\n"); - return; - } - - /* pre-benchmark, find a loop count that takes at least 100ms */ - starch_benchmark_time _start, _end; - uint64_t _elapsed = 0; - uint64_t _loops = 127; - while (_elapsed < 100000000) { - _loops *= 2; - starch_benchmark_get_time(&_start); - for (uint64_t _loop = 0; _loop < _loops; ++_loop) - _entry->callable ( arg0, arg1, arg2 ); - starch_benchmark_get_time(&_end); - _elapsed = starch_benchmark_elapsed(&_start, &_end); - } - - /* real benchmark, run for approx 1 second */ - _loops = _loops * 1000000000 / _elapsed; - - _elapsed = 0; - uint64_t _elapsed_min = UINT64_MAX; - uint64_t _elapsed_max = 0; - for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { - starch_benchmark_get_time(&_start); - for (uint64_t _loop = 0; _loop < _loops; ++_loop) - _entry->callable ( arg0, arg1, arg2 ); - starch_benchmark_get_time(&_end); - uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); - if (_elapsed_one < _elapsed_min) - _elapsed_min = _elapsed_one; - if (_elapsed_one > _elapsed_max) - _elapsed_max = _elapsed_one; - _elapsed += _elapsed_one; - } - - uint64_t _per_loop; - if (starch_benchmark_iterations > 2) - _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); - else - _per_loop = _elapsed / _loops / starch_benchmark_iterations; - - fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); - - if (starch_benchmark_result_count >= starch_benchmark_result_size) { - if (!starch_benchmark_result_size) - starch_benchmark_result_size = 64; - else - starch_benchmark_result_size *= 2; - starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); - if (!starch_benchmark_results) { - fprintf(stderr, "realloc: %s\n", strerror(errno)); - exit(1); - } - } - - starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_uc8_aligned"; - starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; - starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; - ++starch_benchmark_result_count; -} - -static void starch_benchmark_run_magnitude_uc8_aligned( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) -{ - for (starch_magnitude_uc8_aligned_regentry *_entry = starch_magnitude_uc8_aligned_registry; _entry->name; ++_entry) { - starch_benchmark_one_magnitude_uc8_aligned( _entry, arg0, arg1, arg2 ); - } -} - /* prototypes for benchmark helpers provided by user code */ void starch_magnitude_power_uc8_benchmark (void); bool starch_magnitude_power_uc8_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); @@ -1008,6 +784,230 @@ static void starch_benchmark_run_magnitude_sc16q11_aligned( const sc16_t * arg0, } } +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_uc8_benchmark (void); +bool starch_magnitude_uc8_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_uc8_benchmark(void); + +static void starch_benchmark_one_magnitude_uc8( starch_magnitude_uc8_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_uc8_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_uc8"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_uc8( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_uc8_regentry *_entry = starch_magnitude_uc8_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_uc8( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_uc8_aligned_benchmark (void); +bool starch_magnitude_uc8_aligned_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_uc8_aligned_benchmark(void); + +static void starch_benchmark_one_magnitude_uc8_aligned( starch_magnitude_uc8_aligned_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_uc8_aligned_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_uc8_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_uc8_aligned( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_uc8_aligned_regentry *_entry = starch_magnitude_uc8_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_uc8_aligned( _entry, arg0, arg1, arg2 ); + } +} + /* prototypes for benchmark helpers provided by user code */ void starch_mean_power_u16_benchmark (void); bool starch_mean_power_u16_benchmark_verify ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); @@ -1280,16 +1280,6 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #include "../benchmark/magnitude_uc8_benchmark.c" #include "../benchmark/mean_power_u16_benchmark.c" -static void starch_benchmark_all_magnitude_uc8(void) -{ - fprintf(stderr, "==== magnitude_uc8 ===\n"); - starch_magnitude_uc8_benchmark (); -} -static void starch_benchmark_all_magnitude_uc8_aligned(void) -{ - fprintf(stderr, "==== magnitude_uc8_aligned ===\n"); - starch_magnitude_uc8_aligned_benchmark (); -} static void starch_benchmark_all_magnitude_power_uc8(void) { fprintf(stderr, "==== magnitude_power_uc8 ===\n"); @@ -1320,6 +1310,16 @@ static void starch_benchmark_all_magnitude_sc16q11_aligned(void) fprintf(stderr, "==== magnitude_sc16q11_aligned ===\n"); starch_magnitude_sc16q11_aligned_benchmark (); } +static void starch_benchmark_all_magnitude_uc8(void) +{ + fprintf(stderr, "==== magnitude_uc8 ===\n"); + starch_magnitude_uc8_benchmark (); +} +static void starch_benchmark_all_magnitude_uc8_aligned(void) +{ + fprintf(stderr, "==== magnitude_uc8_aligned ===\n"); + starch_magnitude_uc8_aligned_benchmark (); +} static void starch_benchmark_all_mean_power_u16(void) { fprintf(stderr, "==== mean_power_u16 ===\n"); @@ -1369,28 +1369,28 @@ static void starch_benchmark_usage(const char *argv0) " (default: benchmark all functions)\n" "\n" "Supported flavors: " -#ifdef STARCH_FLAVOR_GENERIC - "generic " -#endif #ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 "armv7a_neon_vfpv4 " #endif #ifdef STARCH_FLAVOR_ARMV8_NEON_SIMD "armv8_neon_simd " #endif +#ifdef STARCH_FLAVOR_GENERIC + "generic " +#endif #ifdef STARCH_FLAVOR_X86_AVX2 "x86_avx2 " #endif "\n" "Supported functions: " - "magnitude_uc8 " - "magnitude_uc8_aligned " "magnitude_power_uc8 " "magnitude_power_uc8_aligned " "magnitude_sc16 " "magnitude_sc16_aligned " "magnitude_sc16q11 " "magnitude_sc16q11_aligned " + "magnitude_uc8 " + "magnitude_uc8_aligned " "mean_power_u16 " "mean_power_u16_aligned " "\n", argv0); @@ -1478,16 +1478,6 @@ int main(int argc, char **argv) } for (int i = optind; i < argc; ++i) { - if (!strcmp(argv[i], "magnitude_uc8")) { - specific = 1; - starch_benchmark_all_magnitude_uc8(); - continue; - } - if (!strcmp(argv[i], "magnitude_uc8_aligned")) { - specific = 1; - starch_benchmark_all_magnitude_uc8_aligned(); - continue; - } if (!strcmp(argv[i], "magnitude_power_uc8")) { specific = 1; starch_benchmark_all_magnitude_power_uc8(); @@ -1518,6 +1508,16 @@ int main(int argc, char **argv) starch_benchmark_all_magnitude_sc16q11_aligned(); continue; } + if (!strcmp(argv[i], "magnitude_uc8")) { + specific = 1; + starch_benchmark_all_magnitude_uc8(); + continue; + } + if (!strcmp(argv[i], "magnitude_uc8_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_uc8_aligned(); + continue; + } if (!strcmp(argv[i], "mean_power_u16")) { specific = 1; starch_benchmark_all_mean_power_u16(); @@ -1534,14 +1534,14 @@ int main(int argc, char **argv) } if (!specific) { - starch_benchmark_all_magnitude_uc8(); - starch_benchmark_all_magnitude_uc8_aligned(); starch_benchmark_all_magnitude_power_uc8(); starch_benchmark_all_magnitude_power_uc8_aligned(); starch_benchmark_all_magnitude_sc16(); starch_benchmark_all_magnitude_sc16_aligned(); starch_benchmark_all_magnitude_sc16q11(); starch_benchmark_all_magnitude_sc16q11_aligned(); + starch_benchmark_all_magnitude_uc8(); + starch_benchmark_all_magnitude_uc8_aligned(); starch_benchmark_all_mean_power_u16(); starch_benchmark_all_mean_power_u16_aligned(); } diff --git a/dsp/generated/dispatcher.c b/dsp/generated/dispatcher.c index 7a0bce4..09bba10 100644 --- a/dsp/generated/dispatcher.c +++ b/dsp/generated/dispatcher.c @@ -19,199 +19,6 @@ static int starch_regentry_rank_compare (const void *l, const void *r) return left->rank - right->rank; } -/* dispatcher / registry for magnitude_uc8 */ - -starch_magnitude_uc8_regentry * starch_magnitude_uc8_select() { - for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; - entry->name; - ++entry) - { - if (entry->flavor_supported && !(entry->flavor_supported())) - continue; - return entry; - } - return NULL; -} - -static void starch_magnitude_uc8_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) { - starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_select(); - if (!entry) - abort(); - - starch_magnitude_uc8 = entry->callable; - starch_magnitude_uc8 ( arg0, arg1, arg2 ); -} - -starch_magnitude_uc8_ptr starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; - -void starch_magnitude_uc8_set_wisdom (const char * const * received_wisdom) -{ - /* re-rank the registry based on received wisdom */ - starch_magnitude_uc8_regentry *entry; - for (entry = starch_magnitude_uc8_registry; entry->name; ++entry) { - const char * const *search; - for (search = received_wisdom; *search; ++search) { - if (!strcmp(*search, entry->name)) { - break; - } - } - if (*search) { - /* matches an entry in the wisdom list, order by position in the list */ - entry->rank = search - received_wisdom; - } else { - /* no match, rank after all possible matches, retaining existing order */ - entry->rank = (search - received_wisdom) + (entry - starch_magnitude_uc8_registry); - } - } - - /* re-sort based on the new ranking */ - qsort(starch_magnitude_uc8_registry, entry - starch_magnitude_uc8_registry, sizeof(starch_magnitude_uc8_regentry), starch_regentry_rank_compare); - - /* reset the implementation pointer so the next call will re-select */ - starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; -} - -starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[] = { - -#ifdef STARCH_MIX_GENERIC - { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ - -#ifdef STARCH_MIX_ARM - { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 2, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 3, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 4, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 5, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 6, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, -#endif /* STARCH_MIX_ARM */ - -#ifdef STARCH_MIX_AARCH64 - { 0, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, - { 3, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, - { 5, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_exact_armv8_neon_simd, cpu_supports_armv8_simd }, - { 6, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, -#endif /* STARCH_MIX_AARCH64 */ - -#ifdef STARCH_MIX_X86 - { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, - { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 2, "lookup_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_x86_avx2, cpu_supports_avx2 }, - { 3, "exact_x86_avx2", "x86_avx2", starch_magnitude_uc8_exact_x86_avx2, cpu_supports_avx2 }, - { 4, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 5, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, -#endif /* STARCH_MIX_X86 */ - { 0, NULL, NULL, NULL, NULL } -}; - -/* dispatcher / registry for magnitude_uc8_aligned */ - -starch_magnitude_uc8_aligned_regentry * starch_magnitude_uc8_aligned_select() { - for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; - entry->name; - ++entry) - { - if (entry->flavor_supported && !(entry->flavor_supported())) - continue; - return entry; - } - return NULL; -} - -static void starch_magnitude_uc8_aligned_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) { - starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_select(); - if (!entry) - abort(); - - starch_magnitude_uc8_aligned = entry->callable; - starch_magnitude_uc8_aligned ( arg0, arg1, arg2 ); -} - -starch_magnitude_uc8_aligned_ptr starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; - -void starch_magnitude_uc8_aligned_set_wisdom (const char * const * received_wisdom) -{ - /* re-rank the registry based on received wisdom */ - starch_magnitude_uc8_aligned_regentry *entry; - for (entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { - const char * const *search; - for (search = received_wisdom; *search; ++search) { - if (!strcmp(*search, entry->name)) { - break; - } - } - if (*search) { - /* matches an entry in the wisdom list, order by position in the list */ - entry->rank = search - received_wisdom; - } else { - /* no match, rank after all possible matches, retaining existing order */ - entry->rank = (search - received_wisdom) + (entry - starch_magnitude_uc8_aligned_registry); - } - } - - /* re-sort based on the new ranking */ - qsort(starch_magnitude_uc8_aligned_registry, entry - starch_magnitude_uc8_aligned_registry, sizeof(starch_magnitude_uc8_aligned_regentry), starch_regentry_rank_compare); - - /* reset the implementation pointer so the next call will re-select */ - starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; -} - -starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[] = { - -#ifdef STARCH_MIX_GENERIC - { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ - -#ifdef STARCH_MIX_ARM - { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 2, "lookup_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 3, "lookup_unroll_4_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 4, "exact_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 5, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 6, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 7, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 8, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, - { 9, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 10, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, -#endif /* STARCH_MIX_ARM */ - -#ifdef STARCH_MIX_AARCH64 - { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, - { 3, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, - { 5, "exact_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_exact_armv8_neon_simd, cpu_supports_armv8_simd }, - { 6, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 7, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, - { 8, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, - { 9, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_exact_armv8_neon_simd, cpu_supports_armv8_simd }, - { 10, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, -#endif /* STARCH_MIX_AARCH64 */ - -#ifdef STARCH_MIX_X86 - { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, - { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 2, "lookup_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_lookup_x86_avx2, cpu_supports_avx2 }, - { 3, "lookup_unroll_4_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, - { 4, "exact_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_exact_x86_avx2, cpu_supports_avx2 }, - { 5, "lookup_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_x86_avx2, cpu_supports_avx2 }, - { 6, "exact_x86_avx2", "x86_avx2", starch_magnitude_uc8_exact_x86_avx2, cpu_supports_avx2 }, - { 7, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 8, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, -#endif /* STARCH_MIX_X86 */ - { 0, NULL, NULL, NULL, NULL } -}; - /* dispatcher / registry for magnitude_power_uc8 */ starch_magnitude_power_uc8_regentry * starch_magnitude_power_uc8_select() { @@ -266,11 +73,15 @@ void starch_magnitude_power_uc8_set_wisdom (const char * const * received_wisdom starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[] = { -#ifdef STARCH_MIX_GENERIC - { 0, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, - { 1, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, - { 2, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ +#ifdef STARCH_MIX_AARCH64 + { 0, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, + { 2, "twopass_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 6, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_ARM { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, @@ -282,15 +93,11 @@ starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[] = { { 6, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, #endif /* STARCH_MIX_ARM */ -#ifdef STARCH_MIX_AARCH64 - { 0, "twopass_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, - { 1, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, - { 2, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, - { 3, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, - { 5, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, - { 6, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, -#endif /* STARCH_MIX_AARCH64 */ +#ifdef STARCH_MIX_GENERIC + { 0, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 2, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ #ifdef STARCH_MIX_X86 { 0, "twopass_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_twopass_x86_avx2, cpu_supports_avx2 }, @@ -357,11 +164,19 @@ void starch_magnitude_power_uc8_aligned_set_wisdom (const char * const * receive starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_registry[] = { -#ifdef STARCH_MIX_GENERIC - { 0, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, - { 1, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, - { 2, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ +#ifdef STARCH_MIX_AARCH64 + { 0, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, + { 2, "twopass_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "twopass_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 10, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_ARM { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, @@ -377,19 +192,11 @@ starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_r { 10, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, #endif /* STARCH_MIX_ARM */ -#ifdef STARCH_MIX_AARCH64 - { 0, "twopass_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, - { 1, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, - { 2, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, - { 3, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "twopass_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, - { 5, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, - { 6, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, - { 7, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 8, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, - { 9, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, - { 10, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, -#endif /* STARCH_MIX_AARCH64 */ +#ifdef STARCH_MIX_GENERIC + { 0, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 2, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ #ifdef STARCH_MIX_X86 { 0, "twopass_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_twopass_x86_avx2, cpu_supports_avx2 }, @@ -459,10 +266,13 @@ void starch_magnitude_sc16_set_wisdom (const char * const * received_wisdom) starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[] = { -#ifdef STARCH_MIX_GENERIC - { 0, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, - { 1, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ +#ifdef STARCH_MIX_AARCH64 + { 0, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_ARM { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, @@ -472,13 +282,10 @@ starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[] = { { 4, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ -#ifdef STARCH_MIX_AARCH64 - { 0, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 1, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 2, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 3, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, - { 4, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, -#endif /* STARCH_MIX_AARCH64 */ +#ifdef STARCH_MIX_GENERIC + { 0, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_float_x86_avx2, cpu_supports_avx2 }, @@ -543,10 +350,16 @@ void starch_magnitude_sc16_aligned_set_wisdom (const char * const * received_wis starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[] = { -#ifdef STARCH_MIX_GENERIC - { 0, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, - { 1, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ +#ifdef STARCH_MIX_AARCH64 + { 0, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "exact_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_ARM { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, @@ -559,16 +372,10 @@ starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[] { 7, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ -#ifdef STARCH_MIX_AARCH64 - { 0, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 1, "exact_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 2, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 3, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 5, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 6, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, - { 7, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, -#endif /* STARCH_MIX_AARCH64 */ +#ifdef STARCH_MIX_GENERIC + { 0, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16_aligned_exact_float_x86_avx2, cpu_supports_avx2 }, @@ -635,12 +442,17 @@ void starch_magnitude_sc16q11_set_wisdom (const char * const * received_wisdom) starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[] = { -#ifdef STARCH_MIX_GENERIC - { 0, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, - { 1, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, - { 2, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, - { 3, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ +#ifdef STARCH_MIX_AARCH64 + { 0, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 2, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "11bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "12bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 7, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_ARM { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, @@ -654,17 +466,12 @@ starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[] = { { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, #endif /* STARCH_MIX_ARM */ -#ifdef STARCH_MIX_AARCH64 - { 0, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 1, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 2, "11bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, - { 3, "12bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 5, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, - { 6, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, - { 7, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, - { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, -#endif /* STARCH_MIX_AARCH64 */ +#ifdef STARCH_MIX_GENERIC + { 0, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 2, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 3, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_float_x86_avx2, cpu_supports_avx2 }, @@ -733,12 +540,22 @@ void starch_magnitude_sc16q11_aligned_set_wisdom (const char * const * received_ starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_registry[] = { -#ifdef STARCH_MIX_GENERIC - { 0, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, - { 1, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, - { 2, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, - { 3, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ +#ifdef STARCH_MIX_AARCH64 + { 0, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 2, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "exact_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "11bit_table_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "12bit_table_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "11bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 10, "12bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 11, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 12, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_ARM { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, @@ -757,22 +574,12 @@ starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_regis { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, #endif /* STARCH_MIX_ARM */ -#ifdef STARCH_MIX_AARCH64 - { 0, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 1, "exact_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 2, "11bit_table_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, - { 3, "12bit_table_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 5, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 6, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 7, "11bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, - { 8, "12bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, - { 9, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, - { 10, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, - { 11, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, - { 12, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, - { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, -#endif /* STARCH_MIX_AARCH64 */ +#ifdef STARCH_MIX_GENERIC + { 0, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 2, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 3, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_exact_float_x86_avx2, cpu_supports_avx2 }, @@ -791,6 +598,199 @@ starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_regis { 0, NULL, NULL, NULL, NULL } }; +/* dispatcher / registry for magnitude_uc8 */ + +starch_magnitude_uc8_regentry * starch_magnitude_uc8_select() { + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_uc8_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_select(); + if (!entry) + abort(); + + starch_magnitude_uc8 = entry->callable; + starch_magnitude_uc8 ( arg0, arg1, arg2 ); +} + +starch_magnitude_uc8_ptr starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; + +void starch_magnitude_uc8_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_uc8_regentry *entry; + for (entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_uc8_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_uc8_registry, entry - starch_magnitude_uc8_registry, sizeof(starch_magnitude_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; +} + +starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[] = { + +#ifdef STARCH_MIX_AARCH64 + { 0, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 6, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 6, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_GENERIC + { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_X86 + { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "exact_x86_avx2", "x86_avx2", starch_magnitude_uc8_exact_x86_avx2, cpu_supports_avx2 }, + { 4, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 5, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_uc8_aligned */ + +starch_magnitude_uc8_aligned_regentry * starch_magnitude_uc8_aligned_select() { + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_uc8_aligned_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_select(); + if (!entry) + abort(); + + starch_magnitude_uc8_aligned = entry->callable; + starch_magnitude_uc8_aligned ( arg0, arg1, arg2 ); +} + +starch_magnitude_uc8_aligned_ptr starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; + +void starch_magnitude_uc8_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_uc8_aligned_regentry *entry; + for (entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_uc8_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_uc8_aligned_registry, entry - starch_magnitude_uc8_aligned_registry, sizeof(starch_magnitude_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; +} + +starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[] = { + +#ifdef STARCH_MIX_AARCH64 + { 0, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "exact_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 10, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_unroll_4_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 10, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_GENERIC + { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_X86 + { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_unroll_4_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 4, "exact_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_exact_x86_avx2, cpu_supports_avx2 }, + { 5, "lookup_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 6, "exact_x86_avx2", "x86_avx2", starch_magnitude_uc8_exact_x86_avx2, cpu_supports_avx2 }, + { 7, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 8, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + /* dispatcher / registry for mean_power_u16 */ starch_mean_power_u16_regentry * starch_mean_power_u16_select() { @@ -845,11 +845,15 @@ void starch_mean_power_u16_set_wisdom (const char * const * received_wisdom) starch_mean_power_u16_regentry starch_mean_power_u16_registry[] = { -#ifdef STARCH_MIX_GENERIC - { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, - { 1, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, - { 2, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ +#ifdef STARCH_MIX_AARCH64 + { 0, "u32_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "u64_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "neon_float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 6, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_ARM { 0, "u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, @@ -861,15 +865,11 @@ starch_mean_power_u16_regentry starch_mean_power_u16_registry[] = { { 6, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ -#ifdef STARCH_MIX_AARCH64 +#ifdef STARCH_MIX_GENERIC { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, - { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, - { 2, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, - { 3, "float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "u32_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 5, "u64_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u64_armv8_neon_simd, cpu_supports_armv8_simd }, - { 6, "neon_float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, -#endif /* STARCH_MIX_AARCH64 */ + { 1, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 2, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ #ifdef STARCH_MIX_X86 { 0, "u32_x86_avx2", "x86_avx2", starch_mean_power_u16_u32_x86_avx2, cpu_supports_avx2 }, @@ -936,11 +936,19 @@ void starch_mean_power_u16_aligned_set_wisdom (const char * const * received_wis starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[] = { -#ifdef STARCH_MIX_GENERIC - { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, - { 1, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, - { 2, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, -#endif /* STARCH_MIX_GENERIC */ +#ifdef STARCH_MIX_AARCH64 + { 0, "u32_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "u64_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "neon_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "u64_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "neon_float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 10, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_ARM { 0, "u32_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, @@ -956,19 +964,11 @@ starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[] { 10, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ -#ifdef STARCH_MIX_AARCH64 +#ifdef STARCH_MIX_GENERIC { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, - { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, - { 2, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, - { 3, "float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 4, "u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 5, "u64_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_u64_armv8_neon_simd, cpu_supports_armv8_simd }, - { 6, "neon_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 7, "float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_float_armv8_neon_simd, cpu_supports_armv8_simd }, - { 8, "u32_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u32_armv8_neon_simd, cpu_supports_armv8_simd }, - { 9, "u64_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u64_armv8_neon_simd, cpu_supports_armv8_simd }, - { 10, "neon_float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, -#endif /* STARCH_MIX_AARCH64 */ + { 1, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 2, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ #ifdef STARCH_MIX_X86 { 0, "u32_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_u32_x86_avx2, cpu_supports_avx2 }, @@ -992,14 +992,6 @@ int starch_read_wisdom (const char * path) return -1; /* reset all ranks to identify entries not listed in the wisdom file; we'll assign ranks at the end to produce a stable sort */ - int rank_magnitude_uc8 = 0; - for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; entry->name; ++entry) { - entry->rank = 0; - } - int rank_magnitude_uc8_aligned = 0; - for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { - entry->rank = 0; - } int rank_magnitude_power_uc8 = 0; for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { entry->rank = 0; @@ -1024,6 +1016,14 @@ int starch_read_wisdom (const char * path) for (starch_magnitude_sc16q11_aligned_regentry *entry = starch_magnitude_sc16q11_aligned_registry; entry->name; ++entry) { entry->rank = 0; } + int rank_magnitude_uc8 = 0; + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_uc8_aligned = 0; + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } int rank_mean_power_u16 = 0; for (starch_mean_power_u16_regentry *entry = starch_mean_power_u16_registry; entry->name; ++entry) { entry->rank = 0; @@ -1065,24 +1065,6 @@ int starch_read_wisdom (const char * path) *end = 0; /* try to find a matching registry entry */ - if (!strcmp(name, "magnitude_uc8")) { - for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; entry->name; ++entry) { - if (!strcmp(impl, entry->name)) { - entry->rank = ++rank_magnitude_uc8; - break; - } - } - continue; - } - if (!strcmp(name, "magnitude_uc8_aligned")) { - for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { - if (!strcmp(impl, entry->name)) { - entry->rank = ++rank_magnitude_uc8_aligned; - break; - } - } - continue; - } if (!strcmp(name, "magnitude_power_uc8")) { for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { if (!strcmp(impl, entry->name)) { @@ -1137,6 +1119,24 @@ int starch_read_wisdom (const char * path) } continue; } + if (!strcmp(name, "magnitude_uc8")) { + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_uc8; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_uc8_aligned")) { + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_uc8_aligned; + break; + } + } + continue; + } if (!strcmp(name, "mean_power_u16")) { for (starch_mean_power_u16_regentry *entry = starch_mean_power_u16_registry; entry->name; ++entry) { if (!strcmp(impl, entry->name)) { @@ -1165,28 +1165,6 @@ int starch_read_wisdom (const char * path) fclose(fp); /* assign ranks to unmatched items to (stable) sort them last; re-sort everything */ - { - starch_magnitude_uc8_regentry *entry; - for (entry = starch_magnitude_uc8_registry; entry->name; ++entry) { - if (!entry->rank) - entry->rank = ++rank_magnitude_uc8; - } - qsort(starch_magnitude_uc8_registry, entry - starch_magnitude_uc8_registry, sizeof(starch_magnitude_uc8_regentry), starch_regentry_rank_compare); - - /* reset the implementation pointer so the next call will re-select */ - starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; - } - { - starch_magnitude_uc8_aligned_regentry *entry; - for (entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { - if (!entry->rank) - entry->rank = ++rank_magnitude_uc8_aligned; - } - qsort(starch_magnitude_uc8_aligned_registry, entry - starch_magnitude_uc8_aligned_registry, sizeof(starch_magnitude_uc8_aligned_regentry), starch_regentry_rank_compare); - - /* reset the implementation pointer so the next call will re-select */ - starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; - } { starch_magnitude_power_uc8_regentry *entry; for (entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { @@ -1253,6 +1231,28 @@ int starch_read_wisdom (const char * path) /* reset the implementation pointer so the next call will re-select */ starch_magnitude_sc16q11_aligned = starch_magnitude_sc16q11_aligned_dispatch; } + { + starch_magnitude_uc8_regentry *entry; + for (entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_uc8; + } + qsort(starch_magnitude_uc8_registry, entry - starch_magnitude_uc8_registry, sizeof(starch_magnitude_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; + } + { + starch_magnitude_uc8_aligned_regentry *entry; + for (entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_uc8_aligned; + } + qsort(starch_magnitude_uc8_aligned_registry, entry - starch_magnitude_uc8_aligned_registry, sizeof(starch_magnitude_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; + } { starch_mean_power_u16_regentry *entry; for (entry = starch_mean_power_u16_registry; entry->name; ++entry) { diff --git a/dsp/generated/makefile.aarch64 b/dsp/generated/makefile.aarch64 new file mode 100644 index 0000000..d9bed4c --- /dev/null +++ b/dsp/generated/makefile.aarch64 @@ -0,0 +1,39 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_AARCH64 + + +dsp/generated/flavor.armv8_neon_simd.o: dsp/generated/flavor.armv8_neon_simd.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv8-a+simd -ffast-math dsp/generated/flavor.armv8_neon_simd.c -o dsp/generated/flavor.armv8_neon_simd.o + +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o + +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o + +STARCH_OBJS := dsp/generated/flavor.armv8_neon_simd.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o + + +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o + +STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.arm b/dsp/generated/makefile.arm index 96c0044..58eaf5b 100644 --- a/dsp/generated/makefile.arm +++ b/dsp/generated/makefile.arm @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_ARM -dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math dsp/generated/flavor.armv7a_neon_vfpv4.c -o dsp/generated/flavor.armv7a_neon_vfpv4.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.armv7a_neon_vfpv4.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.generic b/dsp/generated/makefile.generic index 18c6787..7f261d9 100644 --- a/dsp/generated/makefile.generic +++ b/dsp/generated/makefile.generic @@ -21,16 +21,16 @@ STARCH_CFLAGS := -DSTARCH_MIX_GENERIC -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.x86 b/dsp/generated/makefile.x86 index 8d21e85..e88d3e1 100644 --- a/dsp/generated/makefile.x86 +++ b/dsp/generated/makefile.x86 @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_X86 -dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math dsp/generated/flavor.x86_avx2.c -o dsp/generated/flavor.x86_avx2.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.x86_avx2.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/starch.h b/dsp/generated/starch.h index dabf950..a2b62e9 100644 --- a/dsp/generated/starch.h +++ b/dsp/generated/starch.h @@ -6,11 +6,12 @@ /* mixes */ -/* Generic build, compiler defaults only */ -#ifdef STARCH_MIX_GENERIC +/* AARCH64 */ +#ifdef STARCH_MIX_AARCH64 +#define STARCH_FLAVOR_ARMV8_NEON_SIMD #define STARCH_FLAVOR_GENERIC -#define STARCH_MIX_ALIGNMENT 1 -#endif /* STARCH_MIX_GENERIC */ +#define STARCH_MIX_ALIGNMENT 32 +#endif /* STARCH_MIX_AARCH64 */ /* ARM */ #ifdef STARCH_MIX_ARM @@ -19,12 +20,11 @@ #define STARCH_MIX_ALIGNMENT 16 #endif /* STARCH_MIX_ARM */ -/* AARCH64 */ -#ifdef STARCH_MIX_AARCH64 -#define STARCH_FLAVOR_ARMV8_NEON_SIMD +/* Generic build, compiler defaults only */ +#ifdef STARCH_MIX_GENERIC #define STARCH_FLAVOR_GENERIC -#define STARCH_MIX_ALIGNMENT 32 -#endif /* STARCH_MIX_AARCH64 */ +#define STARCH_MIX_ALIGNMENT 1 +#endif /* STARCH_MIX_GENERIC */ /* x64 */ #ifdef STARCH_MIX_X86 @@ -197,28 +197,16 @@ void starch_mean_power_u16_aligned_set_wisdom( const char * const * received_wis /* flavors and prototypes */ -#ifdef STARCH_FLAVOR_GENERIC -void starch_magnitude_power_uc8_twopass_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_power_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_power_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_11bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_12bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -#endif /* STARCH_FLAVOR_GENERIC */ - -int starch_read_wisdom (const char * path); - #ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 int cpu_supports_armv7_neon_vfpv4 (void); +void starch_mean_power_u16_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); @@ -227,12 +215,14 @@ void starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -243,28 +233,26 @@ void starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg void starch_magnitude_sc16q11_aligned_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_mean_power_u16_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); #endif /* STARCH_FLAVOR_ARMV7A_NEON_VFPV4 */ int starch_read_wisdom (const char * path); #ifdef STARCH_FLAVOR_ARMV8_NEON_SIMD int cpu_supports_armv8_simd (void); +void starch_mean_power_u16_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_twopass_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); @@ -273,12 +261,14 @@ void starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd ( const uc8_t * void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_sc16_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -289,38 +279,56 @@ void starch_magnitude_sc16q11_12bit_table_armv8_neon_simd ( const sc16_t * arg0, void starch_magnitude_sc16q11_aligned_12bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_exact_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_mean_power_u16_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u32_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_sc16_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); #endif /* STARCH_FLAVOR_ARMV8_NEON_SIMD */ int starch_read_wisdom (const char * path); +#ifdef STARCH_FLAVOR_GENERIC +void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_power_uc8_twopass_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_11bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_12bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +#endif /* STARCH_FLAVOR_GENERIC */ + +int starch_read_wisdom (const char * path); + #ifdef STARCH_FLAVOR_X86_AVX2 int cpu_supports_avx2 (void); +void starch_mean_power_u16_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -329,18 +337,10 @@ void starch_magnitude_sc16q11_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16 void starch_magnitude_sc16q11_aligned_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_mean_power_u16_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); #endif /* STARCH_FLAVOR_X86_AVX2 */ int starch_read_wisdom (const char * path);