From 1b0bcefae6f49e7d40d13426a4ddb139469708dc Mon Sep 17 00:00:00 2001 From: George Joseph Date: Sun, 7 Feb 2021 17:07:17 -0700 Subject: [PATCH] Starch config: Add aarch64 * Added aarch64 to dsp/starchgen.py and Makefile. * Regenerated files --- Makefile | 26 +++-- dsp/generated/benchmark.c | 15 ++- dsp/generated/dispatcher.c | 105 ++++++++++++++++++ dsp/generated/flavor.armv7a_neon_vfpv4.c | 12 +- dsp/generated/flavor.armv8_a.c | 40 +++++++ dsp/generated/flavor.generic.c | 6 +- dsp/generated/flavor.x86_avx2.c | 12 +- dsp/generated/makefile.arm | 8 +- dsp/generated/makefile.generic | 6 +- dsp/generated/makefile.x86 | 8 +- dsp/generated/starch.h | 134 +++++++++++++++-------- dsp/starchgen.py | 10 ++ 12 files changed, 293 insertions(+), 89 deletions(-) create mode 100644 dsp/generated/flavor.armv8_a.c diff --git a/Makefile b/Makefile index 75195b4..2059746 100644 --- a/Makefile +++ b/Makefile @@ -142,19 +142,23 @@ ifneq ($(CPUFEATURES),yes) # need to be able to detect CPU features at runtime to enable any non-standard compiler flags STARCH_MIX := generic CPPFLAGS += -DSTARCH_MIX_GENERIC -else ifeq ($(ARCH),x86_64) - # AVX, AVX2 - STARCH_MIX := x86 - CPPFLAGS += -DSTARCH_MIX_X86 -else ifneq (,$(findstring arm,$(ARCH))) - # ARMv7 NEON - STARCH_MIX := arm - CPPFLAGS += -DSTARCH_MIX_ARM else - STARCH_MIX := generic - CPPFLAGS += -DSTARCH_MIX_GENERIC + ifeq ($(ARCH),x86_64) + # AVX, AVX2 + STARCH_MIX := x86 + CPPFLAGS += -DSTARCH_MIX_X86 + else ifeq ($(findstring arm,$(ARCH)),arm) + # ARMv7 NEON + STARCH_MIX := arm + CPPFLAGS += -DSTARCH_MIX_ARM + else ifeq ($(findstring aarch,$(ARCH)),aarch) + STARCH_MIX := aarch64 + CPPFLAGS += -DSTARCH_MIX_AARCH64 + else + STARCH_MIX := generic + CPPFLAGS += -DSTARCH_MIX_GENERIC + endif endif - all: showconfig dump1090 view1090 starch-benchmark STARCH_COMPILE := $(CC) $(CPPFLAGS) $(CFLAGS) -c diff --git a/dsp/generated/benchmark.c b/dsp/generated/benchmark.c index 1fd014a..dc52682 100644 --- a/dsp/generated/benchmark.c +++ b/dsp/generated/benchmark.c @@ -1247,10 +1247,10 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) #include "../benchmark/magnitude_sc16_benchmark.c" -#include "../benchmark/magnitude_uc8_benchmark.c" -#include "../benchmark/magnitude_power_uc8_benchmark.c" -#include "../benchmark/mean_power_u16_benchmark.c" #include "../benchmark/magnitude_sc16q11_benchmark.c" +#include "../benchmark/magnitude_uc8_benchmark.c" +#include "../benchmark/mean_power_u16_benchmark.c" +#include "../benchmark/magnitude_power_uc8_benchmark.c" #undef STARCH_ALIGNMENT #undef STARCH_ALIGNED @@ -1275,10 +1275,10 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) #include "../benchmark/magnitude_sc16_benchmark.c" -#include "../benchmark/magnitude_uc8_benchmark.c" -#include "../benchmark/magnitude_power_uc8_benchmark.c" -#include "../benchmark/mean_power_u16_benchmark.c" #include "../benchmark/magnitude_sc16q11_benchmark.c" +#include "../benchmark/magnitude_uc8_benchmark.c" +#include "../benchmark/mean_power_u16_benchmark.c" +#include "../benchmark/magnitude_power_uc8_benchmark.c" static void starch_benchmark_all_magnitude_uc8(void) { @@ -1375,6 +1375,9 @@ static void starch_benchmark_usage(const char *argv0) #ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 "armv7a_neon_vfpv4 " #endif +#ifdef STARCH_FLAVOR_ARMV8_A + "armv8_a " +#endif #ifdef STARCH_FLAVOR_X86_AVX2 "x86_avx2 " #endif diff --git a/dsp/generated/dispatcher.c b/dsp/generated/dispatcher.c index 565ed76..54072dd 100644 --- a/dsp/generated/dispatcher.c +++ b/dsp/generated/dispatcher.c @@ -89,6 +89,15 @@ starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[] = { { 6, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "lookup_armv8_a", "armv8_a", starch_magnitude_uc8_lookup_armv8_a, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "lookup_unroll_4_armv8_a", "armv8_a", starch_magnitude_uc8_lookup_unroll_4_armv8_a, NULL }, + { 3, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 4, "exact_armv8_a", "armv8_a", starch_magnitude_uc8_exact_armv8_a, NULL }, + { 5, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, @@ -174,6 +183,18 @@ starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[] = { 10, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "lookup_armv8_a", "armv8_a", starch_magnitude_uc8_lookup_armv8_a, NULL }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 3, "lookup_armv8_a_aligned", "armv8_a", starch_magnitude_uc8_aligned_lookup_armv8_a, NULL }, + { 4, "lookup_unroll_4_armv8_a_aligned", "armv8_a", starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_a, NULL }, + { 5, "lookup_unroll_4_armv8_a", "armv8_a", starch_magnitude_uc8_lookup_unroll_4_armv8_a, NULL }, + { 6, "exact_armv8_a_aligned", "armv8_a", starch_magnitude_uc8_aligned_exact_armv8_a, NULL }, + { 7, "exact_armv8_a", "armv8_a", starch_magnitude_uc8_exact_armv8_a, NULL }, + { 8, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, @@ -258,6 +279,15 @@ starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[] = { { 6, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "twopass_armv8_a", "armv8_a", starch_magnitude_power_uc8_twopass_armv8_a, NULL }, + { 1, "lookup_armv8_a", "armv8_a", starch_magnitude_power_uc8_lookup_armv8_a, NULL }, + { 2, "lookup_unroll_4_armv8_a", "armv8_a", starch_magnitude_power_uc8_lookup_unroll_4_armv8_a, NULL }, + { 3, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 4, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 5, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "twopass_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_twopass_x86_avx2, cpu_supports_avx2 }, { 1, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, @@ -343,6 +373,18 @@ starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_r { 10, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "twopass_armv8_a_aligned", "armv8_a", starch_magnitude_power_uc8_aligned_twopass_armv8_a, NULL }, + { 1, "lookup_armv8_a_aligned", "armv8_a", starch_magnitude_power_uc8_aligned_lookup_armv8_a, NULL }, + { 2, "lookup_unroll_4_armv8_a_aligned", "armv8_a", starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_a, NULL }, + { 3, "twopass_armv8_a", "armv8_a", starch_magnitude_power_uc8_twopass_armv8_a, NULL }, + { 4, "lookup_armv8_a", "armv8_a", starch_magnitude_power_uc8_lookup_armv8_a, NULL }, + { 5, "lookup_unroll_4_armv8_a", "armv8_a", starch_magnitude_power_uc8_lookup_unroll_4_armv8_a, NULL }, + { 6, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 7, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 8, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "twopass_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_twopass_x86_avx2, cpu_supports_avx2 }, { 1, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, @@ -424,6 +466,13 @@ starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[] = { { 4, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_a", "armv8_a", starch_magnitude_sc16_exact_u32_armv8_a, NULL }, + { 1, "exact_float_armv8_a", "armv8_a", starch_magnitude_sc16_exact_float_armv8_a, NULL }, + { 2, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, + { 3, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_float_x86_avx2, cpu_supports_avx2 }, { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, @@ -503,6 +552,15 @@ starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[] { 7, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_a_aligned", "armv8_a", starch_magnitude_sc16_aligned_exact_u32_armv8_a, NULL }, + { 1, "exact_float_armv8_a_aligned", "armv8_a", starch_magnitude_sc16_aligned_exact_float_armv8_a, NULL }, + { 2, "exact_u32_armv8_a", "armv8_a", starch_magnitude_sc16_exact_u32_armv8_a, NULL }, + { 3, "exact_float_armv8_a", "armv8_a", starch_magnitude_sc16_exact_float_armv8_a, NULL }, + { 4, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, + { 5, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16_aligned_exact_float_x86_avx2, cpu_supports_avx2 }, { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, @@ -587,6 +645,17 @@ starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[] = { { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_a", "armv8_a", starch_magnitude_sc16q11_exact_u32_armv8_a, NULL }, + { 1, "exact_float_armv8_a", "armv8_a", starch_magnitude_sc16q11_exact_float_armv8_a, NULL }, + { 2, "11bit_table_armv8_a", "armv8_a", starch_magnitude_sc16q11_11bit_table_armv8_a, NULL }, + { 3, "12bit_table_armv8_a", "armv8_a", starch_magnitude_sc16q11_12bit_table_armv8_a, NULL }, + { 4, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 5, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 6, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 7, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_float_x86_avx2, cpu_supports_avx2 }, { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, @@ -678,6 +747,21 @@ starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_regis { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_a_aligned", "armv8_a", starch_magnitude_sc16q11_aligned_exact_u32_armv8_a, NULL }, + { 1, "exact_float_armv8_a_aligned", "armv8_a", starch_magnitude_sc16q11_aligned_exact_float_armv8_a, NULL }, + { 2, "11bit_table_armv8_a_aligned", "armv8_a", starch_magnitude_sc16q11_aligned_11bit_table_armv8_a, NULL }, + { 3, "12bit_table_armv8_a_aligned", "armv8_a", starch_magnitude_sc16q11_aligned_12bit_table_armv8_a, NULL }, + { 4, "exact_u32_armv8_a", "armv8_a", starch_magnitude_sc16q11_exact_u32_armv8_a, NULL }, + { 5, "exact_float_armv8_a", "armv8_a", starch_magnitude_sc16q11_exact_float_armv8_a, NULL }, + { 6, "11bit_table_armv8_a", "armv8_a", starch_magnitude_sc16q11_11bit_table_armv8_a, NULL }, + { 7, "12bit_table_armv8_a", "armv8_a", starch_magnitude_sc16q11_12bit_table_armv8_a, NULL }, + { 8, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 9, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 10, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 11, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_exact_float_x86_avx2, cpu_supports_avx2 }, { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, @@ -765,6 +849,15 @@ starch_mean_power_u16_regentry starch_mean_power_u16_registry[] = { { 6, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "float_armv8_a", "armv8_a", starch_mean_power_u16_float_armv8_a, NULL }, + { 1, "u32_armv8_a", "armv8_a", starch_mean_power_u16_u32_armv8_a, NULL }, + { 2, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 3, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 4, "u64_armv8_a", "armv8_a", starch_mean_power_u16_u64_armv8_a, NULL }, + { 5, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "u32_x86_avx2", "x86_avx2", starch_mean_power_u16_u32_x86_avx2, cpu_supports_avx2 }, { 1, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, @@ -850,6 +943,18 @@ starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[] { 10, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "u32_armv8_a", "armv8_a", starch_mean_power_u16_u32_armv8_a, NULL }, + { 1, "u32_armv8_a_aligned", "armv8_a", starch_mean_power_u16_aligned_u32_armv8_a, NULL }, + { 2, "u64_armv8_a", "armv8_a", starch_mean_power_u16_u64_armv8_a, NULL }, + { 3, "u64_armv8_a_aligned", "armv8_a", starch_mean_power_u16_aligned_u64_armv8_a, NULL }, + { 4, "float_armv8_a", "armv8_a", starch_mean_power_u16_float_armv8_a, NULL }, + { 5, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 6, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 7, "float_armv8_a_aligned", "armv8_a", starch_mean_power_u16_aligned_float_armv8_a, NULL }, + { 8, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "u32_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_u32_x86_avx2, cpu_supports_avx2 }, { 1, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, diff --git a/dsp/generated/flavor.armv7a_neon_vfpv4.c b/dsp/generated/flavor.armv7a_neon_vfpv4.c index acb84e7..cf8b5cc 100644 --- a/dsp/generated/flavor.armv7a_neon_vfpv4.c +++ b/dsp/generated/flavor.armv7a_neon_vfpv4.c @@ -14,11 +14,11 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_neon_vfpv4 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" #undef STARCH_ALIGNMENT @@ -33,9 +33,9 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_neon_vfpv4 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" diff --git a/dsp/generated/flavor.armv8_a.c b/dsp/generated/flavor.armv8_a.c new file mode 100644 index 0000000..2c7a9c2 --- /dev/null +++ b/dsp/generated/flavor.armv8_a.c @@ -0,0 +1,40 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_ARMV8_A + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv8_a +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv8_a +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv8_a +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv8_a +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" + diff --git a/dsp/generated/flavor.generic.c b/dsp/generated/flavor.generic.c index d869946..8b8fa0b 100644 --- a/dsp/generated/flavor.generic.c +++ b/dsp/generated/flavor.generic.c @@ -13,9 +13,9 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## generic #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" diff --git a/dsp/generated/flavor.x86_avx2.c b/dsp/generated/flavor.x86_avx2.c index 5b9f88e..de56b0d 100644 --- a/dsp/generated/flavor.x86_avx2.c +++ b/dsp/generated/flavor.x86_avx2.c @@ -13,11 +13,11 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_avx2 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" #undef STARCH_ALIGNMENT @@ -32,9 +32,9 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_avx2 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" diff --git a/dsp/generated/makefile.arm b/dsp/generated/makefile.arm index 58eaf5b..5a1e34a 100644 --- a/dsp/generated/makefile.arm +++ b/dsp/generated/makefile.arm @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_ARM -dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math dsp/generated/flavor.armv7a_neon_vfpv4.c -o dsp/generated/flavor.armv7a_neon_vfpv4.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.armv7a_neon_vfpv4.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.generic b/dsp/generated/makefile.generic index 7f261d9..eec19f3 100644 --- a/dsp/generated/makefile.generic +++ b/dsp/generated/makefile.generic @@ -21,16 +21,16 @@ STARCH_CFLAGS := -DSTARCH_MIX_GENERIC -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.x86 b/dsp/generated/makefile.x86 index e88d3e1..f3d8d54 100644 --- a/dsp/generated/makefile.x86 +++ b/dsp/generated/makefile.x86 @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_X86 -dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math dsp/generated/flavor.x86_avx2.c -o dsp/generated/flavor.x86_avx2.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.x86_avx2.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/starch.h b/dsp/generated/starch.h index 063ac04..052cf18 100644 --- a/dsp/generated/starch.h +++ b/dsp/generated/starch.h @@ -19,6 +19,13 @@ #define STARCH_MIX_ALIGNMENT 16 #endif /* STARCH_MIX_ARM */ +/* AARCH64 */ +#ifdef STARCH_MIX_AARCH64 +#define STARCH_FLAVOR_ARMV8_A +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 32 +#endif /* STARCH_MIX_AARCH64 */ + /* x64 */ #ifdef STARCH_MIX_X86 #define STARCH_FLAVOR_X86_AVX2 @@ -191,35 +198,27 @@ void starch_mean_power_u16_aligned_set_wisdom( const char * const * received_wis /* flavors and prototypes */ #ifdef STARCH_FLAVOR_GENERIC -void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_11bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_12bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); #endif /* STARCH_FLAVOR_GENERIC */ int starch_read_wisdom (const char * path); #ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 int cpu_supports_armv7_neon_vfpv4 (void); -void starch_mean_power_u16_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); @@ -228,14 +227,12 @@ void starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -246,36 +243,73 @@ void starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg void starch_magnitude_sc16q11_aligned_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); #endif /* STARCH_FLAVOR_ARMV7A_NEON_VFPV4 */ int starch_read_wisdom (const char * path); +#ifdef STARCH_FLAVOR_ARMV8_A +void starch_magnitude_power_uc8_twopass_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_twopass_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_sc16_exact_u32_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_u32_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_float_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_11bit_table_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_11bit_table_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_12bit_table_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_12bit_table_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +#endif /* STARCH_FLAVOR_ARMV8_A */ + +int starch_read_wisdom (const char * path); + #ifdef STARCH_FLAVOR_X86_AVX2 int cpu_supports_avx2 (void); -void starch_mean_power_u16_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -284,10 +318,18 @@ void starch_magnitude_sc16q11_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16 void starch_magnitude_sc16q11_aligned_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); #endif /* STARCH_FLAVOR_X86_AVX2 */ int starch_read_wisdom (const char * path); diff --git a/dsp/starchgen.py b/dsp/starchgen.py index ae963c8..1a3da9a 100755 --- a/dsp/starchgen.py +++ b/dsp/starchgen.py @@ -32,6 +32,11 @@ gen.add_flavor(name = 'armv7a_neon_vfpv4', features = ['neon'], test_function = 'cpu_supports_armv7_neon_vfpv4', alignment = 16) +gen.add_flavor(name = 'armv8_a', + description = 'ARMv8-A', + compile_flags = ['-ffast-math'], + features = [], + alignment = 32) gen.add_flavor(name = 'x86_avx2', description = 'x86 with AVX2', compile_flags = ['-mavx2', '-ffast-math'], @@ -48,6 +53,11 @@ gen.add_mix(name = 'arm', flavors = ['armv7a_neon_vfpv4', 'generic'], wisdom_file = 'wisdom.arm') +gen.add_mix(name = 'aarch64', + description = 'AARCH64', + flavors = ['armv8_a', 'generic'], + wisdom_file = 'wisdom.aarch64') + gen.add_mix(name = 'x86', description = 'x64', flavors = ['x86_avx2', 'generic'],