diff --git a/Makefile b/Makefile index 0ba6bf1..26785a1 100644 --- a/Makefile +++ b/Makefile @@ -141,19 +141,23 @@ ifneq ($(CPUFEATURES),yes) # need to be able to detect CPU features at runtime to enable any non-standard compiler flags STARCH_MIX := generic CPPFLAGS += -DSTARCH_MIX_GENERIC -else ifeq ($(ARCH),x86_64) - # AVX, AVX2 - STARCH_MIX := x86 - CPPFLAGS += -DSTARCH_MIX_X86 -else ifneq (,$(findstring arm,$(ARCH))) - # ARMv7 NEON - STARCH_MIX := arm - CPPFLAGS += -DSTARCH_MIX_ARM else - STARCH_MIX := generic - CPPFLAGS += -DSTARCH_MIX_GENERIC + ifeq ($(ARCH),x86_64) + # AVX, AVX2 + STARCH_MIX := x86 + CPPFLAGS += -DSTARCH_MIX_X86 + else ifeq ($(findstring arm,$(ARCH)),arm) + # ARMv7 NEON + STARCH_MIX := arm + CPPFLAGS += -DSTARCH_MIX_ARM + else ifeq ($(findstring aarch,$(ARCH)),aarch) + STARCH_MIX := aarch64 + CPPFLAGS += -DSTARCH_MIX_AARCH64 + else + STARCH_MIX := generic + CPPFLAGS += -DSTARCH_MIX_GENERIC + endif endif - all: showconfig dump1090 view1090 starch-benchmark STARCH_COMPILE := $(CC) $(CPPFLAGS) $(CFLAGS) -c diff --git a/Makefile.cpufeatures b/Makefile.cpufeatures index 3e34cb4..099714e 100644 --- a/Makefile.cpufeatures +++ b/Makefile.cpufeatures @@ -25,5 +25,9 @@ ifneq (,$(findstring arm,$(CPUFEATURES_ARCH))) CPUFEATURES_OBJS += cpu_features/src/cpuinfo_arm.o endif +ifneq (,$(findstring aarch64,$(CPUFEATURES_ARCH))) + CPUFEATURES_OBJS += cpu_features/src/cpuinfo_aarch64.o +endif + $(CPUFEATURES_OBJS): override CFLAGS := $(CPUFEATURES_CFLAGS) $(CPUFEATURES_OBJS): override CPPFLAGS := -Icpu_features/include diff --git a/cpu.c b/cpu.c index 831ab4f..e17e352 100644 --- a/cpu.c +++ b/cpu.c @@ -76,3 +76,34 @@ int cpu_supports_armv7_neon_vfpv4(void) return 0; #endif } + +// +// AARCH64 +// + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#include "cpuinfo_aarch64.h" + +static Aarch64Info *aarch64_info() +{ + static bool valid = false; + static Aarch64Info cache; + + if (!valid) { + cache = GetAarch64Info(); + valid = true; + } + + return &cache; +} + +#endif + +int cpu_supports_armv8_simd(void) +{ +#ifdef CPU_FEATURES_ARCH_AARCH64 + return aarch64_info()->features.asimd; +#else + return 0; +#endif +} diff --git a/cpu.h b/cpu.h index 0cf88bf..0492679 100644 --- a/cpu.h +++ b/cpu.h @@ -8,4 +8,8 @@ int cpu_supports_avx2(void); // ARM int cpu_supports_armv7_neon_vfpv4(void); +// AARCH64 +int cpu_supports_armv8_simd(void); +int cpu_supports_armv8_simd_sve(void); + #endif diff --git a/dsp/generated/benchmark.c b/dsp/generated/benchmark.c index 1fd014a..ce38e01 100644 --- a/dsp/generated/benchmark.c +++ b/dsp/generated/benchmark.c @@ -1246,11 +1246,11 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #define STARCH_BENCHMARK_ALLOC(_count, _type) ((_type *) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type))) #define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) -#include "../benchmark/magnitude_sc16_benchmark.c" -#include "../benchmark/magnitude_uc8_benchmark.c" #include "../benchmark/magnitude_power_uc8_benchmark.c" -#include "../benchmark/mean_power_u16_benchmark.c" +#include "../benchmark/magnitude_sc16_benchmark.c" #include "../benchmark/magnitude_sc16q11_benchmark.c" +#include "../benchmark/magnitude_uc8_benchmark.c" +#include "../benchmark/mean_power_u16_benchmark.c" #undef STARCH_ALIGNMENT #undef STARCH_ALIGNED @@ -1274,11 +1274,11 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #define STARCH_BENCHMARK_ALLOC(_count, _type) ((_type *) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type))) #define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) -#include "../benchmark/magnitude_sc16_benchmark.c" -#include "../benchmark/magnitude_uc8_benchmark.c" #include "../benchmark/magnitude_power_uc8_benchmark.c" -#include "../benchmark/mean_power_u16_benchmark.c" +#include "../benchmark/magnitude_sc16_benchmark.c" #include "../benchmark/magnitude_sc16q11_benchmark.c" +#include "../benchmark/magnitude_uc8_benchmark.c" +#include "../benchmark/mean_power_u16_benchmark.c" static void starch_benchmark_all_magnitude_uc8(void) { @@ -1375,6 +1375,9 @@ static void starch_benchmark_usage(const char *argv0) #ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 "armv7a_neon_vfpv4 " #endif +#ifdef STARCH_FLAVOR_ARMV8_NEON_SIMD + "armv8_neon_simd " +#endif #ifdef STARCH_FLAVOR_X86_AVX2 "x86_avx2 " #endif diff --git a/dsp/generated/dispatcher.c b/dsp/generated/dispatcher.c index 565ed76..7a0bce4 100644 --- a/dsp/generated/dispatcher.c +++ b/dsp/generated/dispatcher.c @@ -89,6 +89,16 @@ starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[] = { { 6, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, + { 3, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, @@ -174,6 +184,20 @@ starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[] = { 10, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, + { 3, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 10, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, @@ -258,6 +282,16 @@ starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[] = { { 6, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "twopass_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 5, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 6, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "twopass_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_twopass_x86_avx2, cpu_supports_avx2 }, { 1, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, @@ -343,6 +377,20 @@ starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_r { 10, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "twopass_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "twopass_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 9, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 10, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "twopass_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_twopass_x86_avx2, cpu_supports_avx2 }, { 1, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, @@ -424,6 +472,14 @@ starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[] = { { 4, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, + { 4, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_float_x86_avx2, cpu_supports_avx2 }, { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, @@ -503,6 +559,17 @@ starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[] { 7, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, + { 7, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16_aligned_exact_float_x86_avx2, cpu_supports_avx2 }, { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, @@ -587,6 +654,18 @@ starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[] = { { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "11bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "12bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 6, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 7, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_float_x86_avx2, cpu_supports_avx2 }, { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, @@ -678,6 +757,23 @@ starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_regis { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "11bit_table_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "12bit_table_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "11bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "12bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 10, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 11, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 12, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_exact_float_x86_avx2, cpu_supports_avx2 }, { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, @@ -765,6 +861,16 @@ starch_mean_power_u16_regentry starch_mean_power_u16_registry[] = { { 6, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 3, "float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "u32_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "u64_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "u32_x86_avx2", "x86_avx2", starch_mean_power_u16_u32_x86_avx2, cpu_supports_avx2 }, { 1, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, @@ -850,6 +956,20 @@ starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[] { 10, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, #endif /* STARCH_MIX_ARM */ +#ifdef STARCH_MIX_AARCH64 + { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 3, "float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "u64_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "u32_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "u64_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 10, "neon_float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, +#endif /* STARCH_MIX_AARCH64 */ + #ifdef STARCH_MIX_X86 { 0, "u32_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_u32_x86_avx2, cpu_supports_avx2 }, { 1, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, diff --git a/dsp/generated/flavor.armv7a_neon_vfpv4.c b/dsp/generated/flavor.armv7a_neon_vfpv4.c index acb84e7..cf8b5cc 100644 --- a/dsp/generated/flavor.armv7a_neon_vfpv4.c +++ b/dsp/generated/flavor.armv7a_neon_vfpv4.c @@ -14,11 +14,11 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_neon_vfpv4 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" #undef STARCH_ALIGNMENT @@ -33,9 +33,9 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_neon_vfpv4 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" diff --git a/dsp/generated/flavor.armv8_a.c b/dsp/generated/flavor.armv8_a.c new file mode 100644 index 0000000..2c7a9c2 --- /dev/null +++ b/dsp/generated/flavor.armv8_a.c @@ -0,0 +1,40 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_ARMV8_A + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv8_a +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv8_a +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv8_a +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv8_a +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" + diff --git a/dsp/generated/flavor.armv8_neon_simd.c b/dsp/generated/flavor.armv8_neon_simd.c new file mode 100644 index 0000000..276e47e --- /dev/null +++ b/dsp/generated/flavor.armv8_neon_simd.c @@ -0,0 +1,41 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_ARMV8_NEON_SIMD +#define STARCH_FEATURE_NEON + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv8_neon_simd +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv8_neon_simd +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv8_neon_simd +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv8_neon_simd +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" + diff --git a/dsp/generated/flavor.generic.c b/dsp/generated/flavor.generic.c index d869946..8b8fa0b 100644 --- a/dsp/generated/flavor.generic.c +++ b/dsp/generated/flavor.generic.c @@ -13,9 +13,9 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## generic #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" diff --git a/dsp/generated/flavor.x86_avx2.c b/dsp/generated/flavor.x86_avx2.c index 5b9f88e..de56b0d 100644 --- a/dsp/generated/flavor.x86_avx2.c +++ b/dsp/generated/flavor.x86_avx2.c @@ -13,11 +13,11 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_avx2 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" #undef STARCH_ALIGNMENT @@ -32,9 +32,9 @@ #define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_avx2 #define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) -#include "../impl/mean_power_u16.c" #include "../impl/magnitude_power_uc8.c" -#include "../impl/magnitude_uc8.c" -#include "../impl/magnitude_sc16q11.c" #include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" diff --git a/dsp/generated/makefile.arm b/dsp/generated/makefile.arm index 58eaf5b..96c0044 100644 --- a/dsp/generated/makefile.arm +++ b/dsp/generated/makefile.arm @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_ARM -dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math dsp/generated/flavor.armv7a_neon_vfpv4.c -o dsp/generated/flavor.armv7a_neon_vfpv4.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.armv7a_neon_vfpv4.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.generic b/dsp/generated/makefile.generic index 7f261d9..18c6787 100644 --- a/dsp/generated/makefile.generic +++ b/dsp/generated/makefile.generic @@ -21,16 +21,16 @@ STARCH_CFLAGS := -DSTARCH_MIX_GENERIC -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.x86 b/dsp/generated/makefile.x86 index e88d3e1..8d21e85 100644 --- a/dsp/generated/makefile.x86 +++ b/dsp/generated/makefile.x86 @@ -21,19 +21,19 @@ STARCH_CFLAGS := -DSTARCH_MIX_X86 -dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math dsp/generated/flavor.x86_avx2.c -o dsp/generated/flavor.x86_avx2.o -dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o -dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_sc16.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_uc8.c dsp/impl/mean_power_u16.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o STARCH_OBJS := dsp/generated/flavor.x86_avx2.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/starch.h b/dsp/generated/starch.h index 063ac04..dabf950 100644 --- a/dsp/generated/starch.h +++ b/dsp/generated/starch.h @@ -19,6 +19,13 @@ #define STARCH_MIX_ALIGNMENT 16 #endif /* STARCH_MIX_ARM */ +/* AARCH64 */ +#ifdef STARCH_MIX_AARCH64 +#define STARCH_FLAVOR_ARMV8_NEON_SIMD +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 32 +#endif /* STARCH_MIX_AARCH64 */ + /* x64 */ #ifdef STARCH_MIX_X86 #define STARCH_FLAVOR_X86_AVX2 @@ -191,35 +198,27 @@ void starch_mean_power_u16_aligned_set_wisdom( const char * const * received_wis /* flavors and prototypes */ #ifdef STARCH_FLAVOR_GENERIC -void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_11bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_12bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); #endif /* STARCH_FLAVOR_GENERIC */ int starch_read_wisdom (const char * path); #ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 int cpu_supports_armv7_neon_vfpv4 (void); -void starch_mean_power_u16_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); @@ -228,14 +227,12 @@ void starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -246,36 +243,84 @@ void starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg void starch_magnitude_sc16q11_aligned_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); #endif /* STARCH_FLAVOR_ARMV7A_NEON_VFPV4 */ int starch_read_wisdom (const char * path); +#ifdef STARCH_FLAVOR_ARMV8_NEON_SIMD +int cpu_supports_armv8_simd (void); +void starch_magnitude_power_uc8_twopass_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_twopass_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_sc16_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_11bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_11bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_12bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_12bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +#endif /* STARCH_FLAVOR_ARMV8_NEON_SIMD */ + +int starch_read_wisdom (const char * path); + #ifdef STARCH_FLAVOR_X86_AVX2 int cpu_supports_avx2 (void); -void starch_mean_power_u16_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); void starch_magnitude_power_uc8_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); void starch_magnitude_power_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); @@ -284,10 +329,18 @@ void starch_magnitude_sc16q11_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16 void starch_magnitude_sc16q11_aligned_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); void starch_magnitude_sc16q11_aligned_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); #endif /* STARCH_FLAVOR_X86_AVX2 */ int starch_read_wisdom (const char * path); diff --git a/dsp/starchgen.py b/dsp/starchgen.py index ae963c8..81e5818 100755 --- a/dsp/starchgen.py +++ b/dsp/starchgen.py @@ -32,6 +32,12 @@ gen.add_flavor(name = 'armv7a_neon_vfpv4', features = ['neon'], test_function = 'cpu_supports_armv7_neon_vfpv4', alignment = 16) +gen.add_flavor(name = 'armv8_neon_simd', + description = 'ARMv8-A, NEON, SIMD', + compile_flags = ['-march=armv8-a+simd', '-ffast-math'], + features = ['neon'], + test_function = 'cpu_supports_armv8_simd', + alignment = 32) gen.add_flavor(name = 'x86_avx2', description = 'x86 with AVX2', compile_flags = ['-mavx2', '-ffast-math'], @@ -48,6 +54,11 @@ gen.add_mix(name = 'arm', flavors = ['armv7a_neon_vfpv4', 'generic'], wisdom_file = 'wisdom.arm') +gen.add_mix(name = 'aarch64', + description = 'AARCH64', + flavors = ['armv8_neon_simd', 'generic'], + wisdom_file = 'wisdom.aarch64') + gen.add_mix(name = 'x86', description = 'x64', flavors = ['x86_avx2', 'generic'], diff --git a/wisdom/wisdom.aarch64.pi4b b/wisdom/wisdom.aarch64.pi4b new file mode 100644 index 0000000..093c70e --- /dev/null +++ b/wisdom/wisdom.aarch64.pi4b @@ -0,0 +1,101 @@ +# generated by ./starch-benchmark -i 15 -o wisdom.aarch64.pi4b + +magnitude_power_uc8 neon_vrsqrte_armv8_neon_simd # 242171 ns/call +magnitude_power_uc8 lookup_unroll_4_armv8_neon_simd # 309918 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 310083 ns/call +magnitude_power_uc8 twopass_armv8_neon_simd # 331999 ns/call +magnitude_power_uc8 twopass_generic # 332283 ns/call +magnitude_power_uc8 lookup_armv8_neon_simd # 354725 ns/call +magnitude_power_uc8 lookup_generic # 354993 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd # 231223 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 231231 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd # 317120 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 317202 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 317261 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd # 326316 ns/call +magnitude_power_uc8_aligned twopass_generic # 326441 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd_aligned # 339548 ns/call +magnitude_power_uc8_aligned lookup_generic # 353854 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd_aligned # 353897 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd # 354025 ns/call + +magnitude_sc16 neon_vrsqrte_armv8_neon_simd # 687064 ns/call +magnitude_sc16 exact_u32_armv8_neon_simd # 1337885 ns/call +magnitude_sc16 exact_float_armv8_neon_simd # 1409773 ns/call +magnitude_sc16 exact_u32_generic # 3331842 ns/call +magnitude_sc16 exact_float_generic # 3414790 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd # 669434 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd_aligned # 770926 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd # 1336333 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd # 1397618 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd_aligned # 1808644 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd_aligned # 1927454 ns/call +magnitude_sc16_aligned exact_u32_generic # 2750034 ns/call +magnitude_sc16_aligned exact_float_generic # 3167265 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv8_neon_simd # 166265 ns/call +magnitude_sc16q11 exact_float_armv8_neon_simd # 347400 ns/call +magnitude_sc16q11 exact_u32_armv8_neon_simd # 350422 ns/call +magnitude_sc16q11 exact_u32_generic # 951466 ns/call +magnitude_sc16q11 exact_float_generic # 1041727 ns/call +magnitude_sc16q11 12bit_table_generic # 2008901 ns/call +magnitude_sc16q11 12bit_table_armv8_neon_simd # 2117606 ns/call +magnitude_sc16q11 11bit_table_generic # 2315294 ns/call +magnitude_sc16q11 11bit_table_armv8_neon_simd # 2317090 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd # 155062 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd_aligned # 212453 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd_aligned # 329287 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd_aligned # 345611 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd # 426742 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd # 493451 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 993016 ns/call +magnitude_sc16q11_aligned exact_float_generic # 1041225 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd_aligned # 2008440 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 2010237 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd # 2010954 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd_aligned # 2314544 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 2317709 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd # 2672466 ns/call + +magnitude_uc8 neon_vrsqrte_armv8_neon_simd # 213353 ns/call +magnitude_uc8 lookup_generic # 285617 ns/call +magnitude_uc8 lookup_armv8_neon_simd # 285723 ns/call +magnitude_uc8 lookup_unroll_4_generic # 288439 ns/call +magnitude_uc8 lookup_unroll_4_armv8_neon_simd # 288520 ns/call +magnitude_uc8 exact_armv8_neon_simd # 533721 ns/call +magnitude_uc8 exact_generic # 1703775 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd # 214464 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd_aligned # 280649 ns/call +magnitude_uc8_aligned lookup_generic # 280742 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd # 293121 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 293163 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 294461 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 313567 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd # 340192 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd # 533623 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd_aligned # 731823 ns/call +magnitude_uc8_aligned exact_generic # 1705445 ns/call + +mean_power_u16 u32_armv8_neon_simd # 45663 ns/call +mean_power_u16 u32_generic # 45672 ns/call +mean_power_u16 neon_float_armv8_neon_simd # 72283 ns/call +mean_power_u16 u64_armv8_neon_simd # 89187 ns/call +mean_power_u16 u64_generic # 89199 ns/call +mean_power_u16 float_armv8_neon_simd # 94634 ns/call +mean_power_u16 float_generic # 176676 ns/call + +mean_power_u16_aligned u32_armv8_neon_simd # 44865 ns/call +mean_power_u16_aligned u32_generic # 52958 ns/call +mean_power_u16_aligned u32_armv8_neon_simd_aligned # 60579 ns/call +mean_power_u16_aligned neon_float_armv8_neon_simd # 77277 ns/call +mean_power_u16_aligned u64_armv8_neon_simd # 86287 ns/call +mean_power_u16_aligned u64_generic # 86295 ns/call +mean_power_u16_aligned float_armv8_neon_simd_aligned # 87501 ns/call +mean_power_u16_aligned neon_float_armv8_neon_simd_aligned # 94315 ns/call +mean_power_u16_aligned float_armv8_neon_simd # 104800 ns/call +mean_power_u16_aligned u64_armv8_neon_simd_aligned # 119504 ns/call +mean_power_u16_aligned float_generic # 176475 ns/call diff --git a/wisdom/wisdom.aarch64.tegra b/wisdom/wisdom.aarch64.tegra new file mode 100644 index 0000000..48c66e0 --- /dev/null +++ b/wisdom/wisdom.aarch64.tegra @@ -0,0 +1,101 @@ +# generated by ./starch-benchmark -i 15 -o wisdom.aarch64.tegra + +magnitude_power_uc8 neon_vrsqrte_armv8_neon_simd # 94796 ns/call +magnitude_power_uc8 lookup_armv8_neon_simd # 192167 ns/call +magnitude_power_uc8 lookup_generic # 192384 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 201674 ns/call +magnitude_power_uc8 lookup_unroll_4_armv8_neon_simd # 202605 ns/call +magnitude_power_uc8 twopass_armv8_neon_simd # 211684 ns/call +magnitude_power_uc8 twopass_generic # 212405 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd # 94539 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 96537 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd # 194018 ns/call +magnitude_power_uc8_aligned lookup_generic # 194129 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd_aligned # 194586 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd # 202656 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 203133 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 203492 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd # 218867 ns/call +magnitude_power_uc8_aligned twopass_generic # 219683 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd_aligned # 232710 ns/call + +magnitude_sc16 neon_vrsqrte_armv8_neon_simd # 248412 ns/call +magnitude_sc16 exact_u32_armv8_neon_simd # 497100 ns/call +magnitude_sc16 exact_float_armv8_neon_simd # 499026 ns/call +magnitude_sc16 exact_u32_generic # 2498651 ns/call +magnitude_sc16 exact_float_generic # 2630913 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd_aligned # 251091 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd # 251917 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd # 495168 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd # 496604 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd_aligned # 497295 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd_aligned # 497677 ns/call +magnitude_sc16_aligned exact_u32_generic # 2502639 ns/call +magnitude_sc16_aligned exact_float_generic # 2508165 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv8_neon_simd # 61889 ns/call +magnitude_sc16q11 exact_u32_armv8_neon_simd # 121180 ns/call +magnitude_sc16q11 exact_float_armv8_neon_simd # 122913 ns/call +magnitude_sc16q11 12bit_table_generic # 600092 ns/call +magnitude_sc16q11 12bit_table_armv8_neon_simd # 602741 ns/call +magnitude_sc16q11 11bit_table_armv8_neon_simd # 713333 ns/call +magnitude_sc16q11 11bit_table_generic # 747792 ns/call +magnitude_sc16q11 exact_float_generic # 819436 ns/call +magnitude_sc16q11 exact_u32_generic # 830130 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd_aligned # 62013 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd # 62417 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd # 121349 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd_aligned # 121531 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd # 122073 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd_aligned # 122670 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd # 589282 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 590574 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd_aligned # 591626 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd # 708434 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd_aligned # 712503 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 739828 ns/call +magnitude_sc16q11_aligned exact_float_generic # 822781 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 831139 ns/call + +magnitude_uc8 neon_vrsqrte_armv8_neon_simd # 75259 ns/call +magnitude_uc8 lookup_armv8_neon_simd # 185908 ns/call +magnitude_uc8 lookup_generic # 187426 ns/call +magnitude_uc8 lookup_unroll_4_armv8_neon_simd # 203217 ns/call +magnitude_uc8 lookup_unroll_4_generic # 205435 ns/call +magnitude_uc8 exact_armv8_neon_simd # 211685 ns/call +magnitude_uc8 exact_generic # 1143963 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd # 74829 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 75205 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd_aligned # 176228 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd # 176801 ns/call +magnitude_uc8_aligned lookup_generic # 177103 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 196536 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd # 197343 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 198190 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd # 210215 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd_aligned # 211766 ns/call +magnitude_uc8_aligned exact_generic # 1129546 ns/call + +mean_power_u16 neon_float_armv8_neon_simd # 39477 ns/call +mean_power_u16 u32_generic # 42560 ns/call +mean_power_u16 u32_armv8_neon_simd # 44544 ns/call +mean_power_u16 float_armv8_neon_simd # 52529 ns/call +mean_power_u16 u64_generic # 85141 ns/call +mean_power_u16 u64_armv8_neon_simd # 85219 ns/call +mean_power_u16 float_generic # 155312 ns/call + +mean_power_u16_aligned neon_float_armv8_neon_simd # 39385 ns/call +mean_power_u16_aligned neon_float_armv8_neon_simd_aligned # 39524 ns/call +mean_power_u16_aligned u32_generic # 42604 ns/call +mean_power_u16_aligned u32_armv8_neon_simd_aligned # 42712 ns/call +mean_power_u16_aligned u32_armv8_neon_simd # 44513 ns/call +mean_power_u16_aligned float_armv8_neon_simd # 52471 ns/call +mean_power_u16_aligned float_armv8_neon_simd_aligned # 52593 ns/call +mean_power_u16_aligned u64_armv8_neon_simd # 85041 ns/call +mean_power_u16_aligned u64_generic # 85056 ns/call +mean_power_u16_aligned u64_armv8_neon_simd_aligned # 85239 ns/call +mean_power_u16_aligned float_generic # 153697 ns/call