diff --git a/Makefile.cpufeatures b/Makefile.cpufeatures
index 3e34cb4..099714e 100644
--- a/Makefile.cpufeatures
+++ b/Makefile.cpufeatures
@@ -25,5 +25,9 @@ ifneq (,$(findstring arm,$(CPUFEATURES_ARCH)))
   CPUFEATURES_OBJS += cpu_features/src/cpuinfo_arm.o
 endif
 
+ifneq (,$(findstring aarch64,$(CPUFEATURES_ARCH)))
+  CPUFEATURES_OBJS += cpu_features/src/cpuinfo_aarch64.o
+endif
+
 $(CPUFEATURES_OBJS): override CFLAGS := $(CPUFEATURES_CFLAGS)
 $(CPUFEATURES_OBJS): override CPPFLAGS := -Icpu_features/include
diff --git a/cpu.c b/cpu.c
index 831ab4f..e17e352 100644
--- a/cpu.c
+++ b/cpu.c
@@ -76,3 +76,39 @@ int cpu_supports_armv7_neon_vfpv4(void)
     return 0;
 #endif
 }
+
+//
+// AARCH64
+//
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#include "cpuinfo_aarch64.h"
+
+// Returns a pointer to a lazily-filled, function-local static copy of
+// GetAarch64Info(), so the query runs only on the first call. The
+// first-call initialization takes no lock.
+static Aarch64Info *aarch64_info(void)
+{
+    static bool valid = false;
+    static Aarch64Info cache;
+
+    if (!valid) {
+        cache = GetAarch64Info();
+        valid = true;
+    }
+
+    return &cache;
+}
+
+#endif
+
+// Returns the asimd feature flag from the cached Aarch64Info when
+// CPU_FEATURES_ARCH_AARCH64 is defined, and 0 otherwise.
+int cpu_supports_armv8_simd(void)
+{
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    return aarch64_info()->features.asimd;
+#else
+    return 0;
+#endif
+}
diff --git a/cpu.h b/cpu.h
index 0cf88bf..0492679 100644
--- a/cpu.h
+++ b/cpu.h
@@ -8,4 +8,7 @@ int cpu_supports_avx2(void);
 // ARM
 int cpu_supports_armv7_neon_vfpv4(void);
 
+// AARCH64
+int cpu_supports_armv8_simd(void);
+
 #endif
diff --git a/dsp/generated/benchmark.c b/dsp/generated/benchmark.c
index dc52682..ce38e01 100644
--- a/dsp/generated/benchmark.c
+++ b/dsp/generated/benchmark.c
@@ -1246,11 +1246,11 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0,
 #define STARCH_BENCHMARK_ALLOC(_count, _type) ((_type *) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type)))
 #define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr)
 
+#include "../benchmark/magnitude_power_uc8_benchmark.c"
 #include "../benchmark/magnitude_sc16_benchmark.c"
 #include "../benchmark/magnitude_sc16q11_benchmark.c"
#include "../benchmark/magnitude_uc8_benchmark.c" #include "../benchmark/mean_power_u16_benchmark.c" -#include "../benchmark/magnitude_power_uc8_benchmark.c" #undef STARCH_ALIGNMENT #undef STARCH_ALIGNED @@ -1274,11 +1274,11 @@ static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, #define STARCH_BENCHMARK_ALLOC(_count, _type) ((_type *) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type))) #define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) +#include "../benchmark/magnitude_power_uc8_benchmark.c" #include "../benchmark/magnitude_sc16_benchmark.c" #include "../benchmark/magnitude_sc16q11_benchmark.c" #include "../benchmark/magnitude_uc8_benchmark.c" #include "../benchmark/mean_power_u16_benchmark.c" -#include "../benchmark/magnitude_power_uc8_benchmark.c" static void starch_benchmark_all_magnitude_uc8(void) { @@ -1375,8 +1375,8 @@ static void starch_benchmark_usage(const char *argv0) #ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 "armv7a_neon_vfpv4 " #endif -#ifdef STARCH_FLAVOR_ARMV8_A - "armv8_a " +#ifdef STARCH_FLAVOR_ARMV8_NEON_SIMD + "armv8_neon_simd " #endif #ifdef STARCH_FLAVOR_X86_AVX2 "x86_avx2 " diff --git a/dsp/generated/dispatcher.c b/dsp/generated/dispatcher.c index 54072dd..7a0bce4 100644 --- a/dsp/generated/dispatcher.c +++ b/dsp/generated/dispatcher.c @@ -90,12 +90,13 @@ starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[] = { #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "lookup_armv8_a", "armv8_a", starch_magnitude_uc8_lookup_armv8_a, NULL }, - { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 2, "lookup_unroll_4_armv8_a", "armv8_a", starch_magnitude_uc8_lookup_unroll_4_armv8_a, NULL }, - { 3, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 4, "exact_armv8_a", "armv8_a", starch_magnitude_uc8_exact_armv8_a, NULL }, - { 5, "exact_generic", "generic", 
starch_magnitude_uc8_exact_generic, NULL }, + { 0, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, + { 3, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -184,15 +185,17 @@ starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[] = #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "lookup_armv8_a", "armv8_a", starch_magnitude_uc8_lookup_armv8_a, NULL }, - { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, - { 2, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, - { 3, "lookup_armv8_a_aligned", "armv8_a", starch_magnitude_uc8_aligned_lookup_armv8_a, NULL }, - { 4, "lookup_unroll_4_armv8_a_aligned", "armv8_a", starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_a, NULL }, - { 5, "lookup_unroll_4_armv8_a", "armv8_a", starch_magnitude_uc8_lookup_unroll_4_armv8_a, NULL }, - { 6, "exact_armv8_a_aligned", "armv8_a", starch_magnitude_uc8_aligned_exact_armv8_a, NULL }, - { 7, "exact_armv8_a", "armv8_a", starch_magnitude_uc8_exact_armv8_a, NULL }, - { 8, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, + { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 1, "lookup_generic", "generic", 
starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, + { 3, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_uc8_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "exact_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_exact_armv8_neon_simd, cpu_supports_armv8_simd }, + { 10, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -280,12 +283,13 @@ starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[] = { #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "twopass_armv8_a", "armv8_a", starch_magnitude_power_uc8_twopass_armv8_a, NULL }, - { 1, "lookup_armv8_a", "armv8_a", starch_magnitude_power_uc8_lookup_armv8_a, NULL }, - { 2, "lookup_unroll_4_armv8_a", "armv8_a", starch_magnitude_power_uc8_lookup_unroll_4_armv8_a, NULL }, - { 3, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, - { 4, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, - { 5, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, 
NULL }, + { 0, "twopass_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 5, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 6, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -374,15 +378,17 @@ starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_r #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "twopass_armv8_a_aligned", "armv8_a", starch_magnitude_power_uc8_aligned_twopass_armv8_a, NULL }, - { 1, "lookup_armv8_a_aligned", "armv8_a", starch_magnitude_power_uc8_aligned_lookup_armv8_a, NULL }, - { 2, "lookup_unroll_4_armv8_a_aligned", "armv8_a", starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_a, NULL }, - { 3, "twopass_armv8_a", "armv8_a", starch_magnitude_power_uc8_twopass_armv8_a, NULL }, - { 4, "lookup_armv8_a", "armv8_a", starch_magnitude_power_uc8_lookup_armv8_a, NULL }, - { 5, "lookup_unroll_4_armv8_a", "armv8_a", starch_magnitude_power_uc8_lookup_unroll_4_armv8_a, NULL }, - { 6, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, - { 7, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, - { 8, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, + { 0, "twopass_armv8_neon_simd_aligned", 
"armv8_neon_simd", starch_magnitude_power_uc8_aligned_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "lookup_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "lookup_unroll_4_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "twopass_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_twopass_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "lookup_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "lookup_unroll_4_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 9, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 10, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -467,10 +473,11 @@ starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[] = { #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "exact_u32_armv8_a", "armv8_a", starch_magnitude_sc16_exact_u32_armv8_a, NULL }, - { 1, "exact_float_armv8_a", "armv8_a", starch_magnitude_sc16_exact_float_armv8_a, NULL }, - { 2, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, - { 3, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 0, 
"exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, + { 4, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -553,12 +560,14 @@ starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[] #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "exact_u32_armv8_a_aligned", "armv8_a", starch_magnitude_sc16_aligned_exact_u32_armv8_a, NULL }, - { 1, "exact_float_armv8_a_aligned", "armv8_a", starch_magnitude_sc16_aligned_exact_float_armv8_a, NULL }, - { 2, "exact_u32_armv8_a", "armv8_a", starch_magnitude_sc16_exact_u32_armv8_a, NULL }, - { 3, "exact_float_armv8_a", "armv8_a", starch_magnitude_sc16_exact_float_armv8_a, NULL }, - { 4, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, - { 5, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 0, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "exact_float_armv8_neon_simd", "armv8_neon_simd", 
starch_magnitude_sc16_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, + { 7, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -646,14 +655,15 @@ starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[] = { #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "exact_u32_armv8_a", "armv8_a", starch_magnitude_sc16q11_exact_u32_armv8_a, NULL }, - { 1, "exact_float_armv8_a", "armv8_a", starch_magnitude_sc16q11_exact_float_armv8_a, NULL }, - { 2, "11bit_table_armv8_a", "armv8_a", starch_magnitude_sc16q11_11bit_table_armv8_a, NULL }, - { 3, "12bit_table_armv8_a", "armv8_a", starch_magnitude_sc16q11_12bit_table_armv8_a, NULL }, - { 4, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, - { 5, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, - { 6, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, - { 7, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, + { 0, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 1, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "11bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "12bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", 
starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 6, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 7, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -748,18 +758,20 @@ starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_regis #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "exact_u32_armv8_a_aligned", "armv8_a", starch_magnitude_sc16q11_aligned_exact_u32_armv8_a, NULL }, - { 1, "exact_float_armv8_a_aligned", "armv8_a", starch_magnitude_sc16q11_aligned_exact_float_armv8_a, NULL }, - { 2, "11bit_table_armv8_a_aligned", "armv8_a", starch_magnitude_sc16q11_aligned_11bit_table_armv8_a, NULL }, - { 3, "12bit_table_armv8_a_aligned", "armv8_a", starch_magnitude_sc16q11_aligned_12bit_table_armv8_a, NULL }, - { 4, "exact_u32_armv8_a", "armv8_a", starch_magnitude_sc16q11_exact_u32_armv8_a, NULL }, - { 5, "exact_float_armv8_a", "armv8_a", starch_magnitude_sc16q11_exact_float_armv8_a, NULL }, - { 6, "11bit_table_armv8_a", "armv8_a", starch_magnitude_sc16q11_11bit_table_armv8_a, NULL }, - { 7, "12bit_table_armv8_a", "armv8_a", starch_magnitude_sc16q11_12bit_table_armv8_a, NULL }, - { 8, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, - { 9, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, - { 10, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, - { 11, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, + { 0, "exact_u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_exact_u32_armv8_neon_simd, 
cpu_supports_armv8_simd }, + { 1, "exact_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 2, "11bit_table_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 3, "12bit_table_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "neon_vrsqrte_armv8_neon_simd_aligned", "armv8_neon_simd", starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "exact_u32_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "exact_float_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_exact_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "11bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_11bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "12bit_table_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_12bit_table_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "neon_vrsqrte_armv8_neon_simd", "armv8_neon_simd", starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd, cpu_supports_armv8_simd }, + { 10, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 11, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 12, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -850,12 +862,13 @@ starch_mean_power_u16_regentry starch_mean_power_u16_registry[] = { #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "float_armv8_a", "armv8_a", starch_mean_power_u16_float_armv8_a, NULL }, - { 1, 
"u32_armv8_a", "armv8_a", starch_mean_power_u16_u32_armv8_a, NULL }, - { 2, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, - { 3, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, - { 4, "u64_armv8_a", "armv8_a", starch_mean_power_u16_u64_armv8_a, NULL }, - { 5, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 3, "float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "u32_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "u64_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 @@ -944,15 +957,17 @@ starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[] #endif /* STARCH_MIX_ARM */ #ifdef STARCH_MIX_AARCH64 - { 0, "u32_armv8_a", "armv8_a", starch_mean_power_u16_u32_armv8_a, NULL }, - { 1, "u32_armv8_a_aligned", "armv8_a", starch_mean_power_u16_aligned_u32_armv8_a, NULL }, - { 2, "u64_armv8_a", "armv8_a", starch_mean_power_u16_u64_armv8_a, NULL }, - { 3, "u64_armv8_a_aligned", "armv8_a", starch_mean_power_u16_aligned_u64_armv8_a, NULL }, - { 4, "float_armv8_a", "armv8_a", starch_mean_power_u16_float_armv8_a, NULL }, - { 5, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, - { 6, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, - { 7, "float_armv8_a_aligned", "armv8_a", starch_mean_power_u16_aligned_float_armv8_a, NULL }, - { 8, 
"float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 3, "float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 4, "u32_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 5, "u64_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 6, "neon_float_armv8_neon_simd_aligned", "armv8_neon_simd", starch_mean_power_u16_aligned_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 7, "float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_float_armv8_neon_simd, cpu_supports_armv8_simd }, + { 8, "u32_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u32_armv8_neon_simd, cpu_supports_armv8_simd }, + { 9, "u64_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_u64_armv8_neon_simd, cpu_supports_armv8_simd }, + { 10, "neon_float_armv8_neon_simd", "armv8_neon_simd", starch_mean_power_u16_neon_float_armv8_neon_simd, cpu_supports_armv8_simd }, #endif /* STARCH_MIX_AARCH64 */ #ifdef STARCH_MIX_X86 diff --git a/dsp/generated/flavor.armv8_neon_simd.c b/dsp/generated/flavor.armv8_neon_simd.c new file mode 100644 index 0000000..276e47e --- /dev/null +++ b/dsp/generated/flavor.armv8_neon_simd.c @@ -0,0 +1,41 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_ARMV8_NEON_SIMD +#define STARCH_FEATURE_NEON + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv8_neon_simd +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv8_neon_simd +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv8_neon_simd +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv8_neon_simd +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_sc16.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/mean_power_u16.c" + diff --git a/dsp/generated/makefile.arm b/dsp/generated/makefile.arm index 5a1e34a..96c0044 100644 --- a/dsp/generated/makefile.arm +++ b/dsp/generated/makefile.arm @@ -33,7 +33,7 @@ dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_ STARCH_OBJS := dsp/generated/flavor.armv7a_neon_vfpv4.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c 
dsp/benchmark/magnitude_power_uc8_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.generic b/dsp/generated/makefile.generic index eec19f3..18c6787 100644 --- a/dsp/generated/makefile.generic +++ b/dsp/generated/makefile.generic @@ -30,7 +30,7 @@ dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_ STARCH_OBJS := dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.x86 b/dsp/generated/makefile.x86 index f3d8d54..8d21e85 100644 --- a/dsp/generated/makefile.x86 +++ b/dsp/generated/makefile.x86 @@ -33,7 +33,7 @@ dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/magnitude_power_ STARCH_OBJS := dsp/generated/flavor.x86_avx2.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o -dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c 
dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/starch.h b/dsp/generated/starch.h index 052cf18..dabf950 100644 --- a/dsp/generated/starch.h +++ b/dsp/generated/starch.h @@ -21,7 +21,7 @@ /* AARCH64 */ #ifdef STARCH_MIX_AARCH64 -#define STARCH_FLAVOR_ARMV8_A +#define STARCH_FLAVOR_ARMV8_NEON_SIMD #define STARCH_FLAVOR_GENERIC #define STARCH_MIX_ALIGNMENT 32 #endif /* STARCH_MIX_AARCH64 */ @@ -263,38 +263,49 @@ void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t int starch_read_wisdom (const char * path); -#ifdef STARCH_FLAVOR_ARMV8_A -void starch_magnitude_power_uc8_twopass_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_power_uc8_aligned_twopass_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_power_uc8_lookup_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_power_uc8_aligned_lookup_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_power_uc8_lookup_unroll_4_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); -void starch_magnitude_sc16_exact_u32_armv8_a ( const sc16_t * arg0, 
uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_u32_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_exact_float_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16_aligned_exact_float_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_exact_u32_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_aligned_exact_u32_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_exact_float_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_aligned_exact_float_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_11bit_table_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_aligned_11bit_table_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_12bit_table_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_sc16q11_aligned_12bit_table_armv8_a ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_lookup_unroll_4_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_exact_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_magnitude_uc8_aligned_exact_armv8_a ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); -void starch_mean_power_u16_float_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); 
-void starch_mean_power_u16_aligned_float_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u32_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u32_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_u64_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -void starch_mean_power_u16_aligned_u64_armv8_a ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); -#endif /* STARCH_FLAVOR_ARMV8_A */ +#ifdef STARCH_FLAVOR_ARMV8_NEON_SIMD +int cpu_supports_armv8_simd (void); +void starch_magnitude_power_uc8_twopass_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_twopass_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_sc16_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned 
arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_u32_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_float_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_11bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_11bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_12bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_12bit_table_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv8_neon_simd ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, 
unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_neon_vrsqrte_armv8_neon_simd ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_mean_power_u16_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_neon_float_armv8_neon_simd ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +#endif /* STARCH_FLAVOR_ARMV8_NEON_SIMD */ int starch_read_wisdom (const char * path); diff --git a/dsp/starchgen.py b/dsp/starchgen.py index 1a3da9a..81e5818 100755 --- a/dsp/starchgen.py +++ b/dsp/starchgen.py @@ -32,10 +32,11 @@ gen.add_flavor(name = 'armv7a_neon_vfpv4', features 
= ['neon'], test_function = 'cpu_supports_armv7_neon_vfpv4', alignment = 16) -gen.add_flavor(name = 'armv8_a', - description = 'ARMv8-A', - compile_flags = ['-ffast-math'], - features = [], +gen.add_flavor(name = 'armv8_neon_simd', + description = 'ARMv8-A, NEON, SIMD', + compile_flags = ['-march=armv8-a+simd', '-ffast-math'], + features = ['neon'], + test_function = 'cpu_supports_armv8_simd', alignment = 32) gen.add_flavor(name = 'x86_avx2', description = 'x86 with AVX2', @@ -55,7 +56,7 @@ gen.add_mix(name = 'arm', gen.add_mix(name = 'aarch64', description = 'AARCH64', - flavors = ['armv8_a', 'generic'], + flavors = ['armv8_neon_simd', 'generic'], wisdom_file = 'wisdom.aarch64') gen.add_mix(name = 'x86', diff --git a/wisdom/wisdom.aarch64.pi4b b/wisdom/wisdom.aarch64.pi4b new file mode 100644 index 0000000..093c70e --- /dev/null +++ b/wisdom/wisdom.aarch64.pi4b @@ -0,0 +1,101 @@ +# generated by ./starch-benchmark -i 15 -o wisdom.aarch64.pi4b + +magnitude_power_uc8 neon_vrsqrte_armv8_neon_simd # 242171 ns/call +magnitude_power_uc8 lookup_unroll_4_armv8_neon_simd # 309918 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 310083 ns/call +magnitude_power_uc8 twopass_armv8_neon_simd # 331999 ns/call +magnitude_power_uc8 twopass_generic # 332283 ns/call +magnitude_power_uc8 lookup_armv8_neon_simd # 354725 ns/call +magnitude_power_uc8 lookup_generic # 354993 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd # 231223 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 231231 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd # 317120 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 317202 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 317261 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd # 326316 ns/call +magnitude_power_uc8_aligned twopass_generic # 326441 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd_aligned # 339548 ns/call 
+magnitude_power_uc8_aligned lookup_generic # 353854 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd_aligned # 353897 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd # 354025 ns/call + +magnitude_sc16 neon_vrsqrte_armv8_neon_simd # 687064 ns/call +magnitude_sc16 exact_u32_armv8_neon_simd # 1337885 ns/call +magnitude_sc16 exact_float_armv8_neon_simd # 1409773 ns/call +magnitude_sc16 exact_u32_generic # 3331842 ns/call +magnitude_sc16 exact_float_generic # 3414790 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd # 669434 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd_aligned # 770926 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd # 1336333 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd # 1397618 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd_aligned # 1808644 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd_aligned # 1927454 ns/call +magnitude_sc16_aligned exact_u32_generic # 2750034 ns/call +magnitude_sc16_aligned exact_float_generic # 3167265 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv8_neon_simd # 166265 ns/call +magnitude_sc16q11 exact_float_armv8_neon_simd # 347400 ns/call +magnitude_sc16q11 exact_u32_armv8_neon_simd # 350422 ns/call +magnitude_sc16q11 exact_u32_generic # 951466 ns/call +magnitude_sc16q11 exact_float_generic # 1041727 ns/call +magnitude_sc16q11 12bit_table_generic # 2008901 ns/call +magnitude_sc16q11 12bit_table_armv8_neon_simd # 2117606 ns/call +magnitude_sc16q11 11bit_table_generic # 2315294 ns/call +magnitude_sc16q11 11bit_table_armv8_neon_simd # 2317090 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd # 155062 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd_aligned # 212453 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd_aligned # 329287 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd_aligned # 345611 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd # 426742 ns/call 
+magnitude_sc16q11_aligned exact_u32_armv8_neon_simd # 493451 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 993016 ns/call +magnitude_sc16q11_aligned exact_float_generic # 1041225 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd_aligned # 2008440 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 2010237 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd # 2010954 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd_aligned # 2314544 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 2317709 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd # 2672466 ns/call + +magnitude_uc8 neon_vrsqrte_armv8_neon_simd # 213353 ns/call +magnitude_uc8 lookup_generic # 285617 ns/call +magnitude_uc8 lookup_armv8_neon_simd # 285723 ns/call +magnitude_uc8 lookup_unroll_4_generic # 288439 ns/call +magnitude_uc8 lookup_unroll_4_armv8_neon_simd # 288520 ns/call +magnitude_uc8 exact_armv8_neon_simd # 533721 ns/call +magnitude_uc8 exact_generic # 1703775 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd # 214464 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd_aligned # 280649 ns/call +magnitude_uc8_aligned lookup_generic # 280742 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd # 293121 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 293163 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 294461 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 313567 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd # 340192 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd # 533623 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd_aligned # 731823 ns/call +magnitude_uc8_aligned exact_generic # 1705445 ns/call + +mean_power_u16 u32_armv8_neon_simd # 45663 ns/call +mean_power_u16 u32_generic # 45672 ns/call +mean_power_u16 neon_float_armv8_neon_simd # 72283 ns/call +mean_power_u16 u64_armv8_neon_simd # 89187 ns/call +mean_power_u16 u64_generic # 
89199 ns/call +mean_power_u16 float_armv8_neon_simd # 94634 ns/call +mean_power_u16 float_generic # 176676 ns/call + +mean_power_u16_aligned u32_armv8_neon_simd # 44865 ns/call +mean_power_u16_aligned u32_generic # 52958 ns/call +mean_power_u16_aligned u32_armv8_neon_simd_aligned # 60579 ns/call +mean_power_u16_aligned neon_float_armv8_neon_simd # 77277 ns/call +mean_power_u16_aligned u64_armv8_neon_simd # 86287 ns/call +mean_power_u16_aligned u64_generic # 86295 ns/call +mean_power_u16_aligned float_armv8_neon_simd_aligned # 87501 ns/call +mean_power_u16_aligned neon_float_armv8_neon_simd_aligned # 94315 ns/call +mean_power_u16_aligned float_armv8_neon_simd # 104800 ns/call +mean_power_u16_aligned u64_armv8_neon_simd_aligned # 119504 ns/call +mean_power_u16_aligned float_generic # 176475 ns/call diff --git a/wisdom/wisdom.aarch64.tegra b/wisdom/wisdom.aarch64.tegra new file mode 100644 index 0000000..48c66e0 --- /dev/null +++ b/wisdom/wisdom.aarch64.tegra @@ -0,0 +1,101 @@ +# generated by ./starch-benchmark -i 15 -o wisdom.aarch64.tegra + +magnitude_power_uc8 neon_vrsqrte_armv8_neon_simd # 94796 ns/call +magnitude_power_uc8 lookup_armv8_neon_simd # 192167 ns/call +magnitude_power_uc8 lookup_generic # 192384 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 201674 ns/call +magnitude_power_uc8 lookup_unroll_4_armv8_neon_simd # 202605 ns/call +magnitude_power_uc8 twopass_armv8_neon_simd # 211684 ns/call +magnitude_power_uc8 twopass_generic # 212405 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd # 94539 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 96537 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd # 194018 ns/call +magnitude_power_uc8_aligned lookup_generic # 194129 ns/call +magnitude_power_uc8_aligned lookup_armv8_neon_simd_aligned # 194586 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd # 202656 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 203133 ns/call 
+magnitude_power_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 203492 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd # 218867 ns/call +magnitude_power_uc8_aligned twopass_generic # 219683 ns/call +magnitude_power_uc8_aligned twopass_armv8_neon_simd_aligned # 232710 ns/call + +magnitude_sc16 neon_vrsqrte_armv8_neon_simd # 248412 ns/call +magnitude_sc16 exact_u32_armv8_neon_simd # 497100 ns/call +magnitude_sc16 exact_float_armv8_neon_simd # 499026 ns/call +magnitude_sc16 exact_u32_generic # 2498651 ns/call +magnitude_sc16 exact_float_generic # 2630913 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd_aligned # 251091 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv8_neon_simd # 251917 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd # 495168 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd # 496604 ns/call +magnitude_sc16_aligned exact_u32_armv8_neon_simd_aligned # 497295 ns/call +magnitude_sc16_aligned exact_float_armv8_neon_simd_aligned # 497677 ns/call +magnitude_sc16_aligned exact_u32_generic # 2502639 ns/call +magnitude_sc16_aligned exact_float_generic # 2508165 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv8_neon_simd # 61889 ns/call +magnitude_sc16q11 exact_u32_armv8_neon_simd # 121180 ns/call +magnitude_sc16q11 exact_float_armv8_neon_simd # 122913 ns/call +magnitude_sc16q11 12bit_table_generic # 600092 ns/call +magnitude_sc16q11 12bit_table_armv8_neon_simd # 602741 ns/call +magnitude_sc16q11 11bit_table_armv8_neon_simd # 713333 ns/call +magnitude_sc16q11 11bit_table_generic # 747792 ns/call +magnitude_sc16q11 exact_float_generic # 819436 ns/call +magnitude_sc16q11 exact_u32_generic # 830130 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd_aligned # 62013 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv8_neon_simd # 62417 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd # 121349 ns/call +magnitude_sc16q11_aligned exact_u32_armv8_neon_simd_aligned # 121531 ns/call 
+magnitude_sc16q11_aligned exact_float_armv8_neon_simd # 122073 ns/call +magnitude_sc16q11_aligned exact_float_armv8_neon_simd_aligned # 122670 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd # 589282 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 590574 ns/call +magnitude_sc16q11_aligned 12bit_table_armv8_neon_simd_aligned # 591626 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd # 708434 ns/call +magnitude_sc16q11_aligned 11bit_table_armv8_neon_simd_aligned # 712503 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 739828 ns/call +magnitude_sc16q11_aligned exact_float_generic # 822781 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 831139 ns/call + +magnitude_uc8 neon_vrsqrte_armv8_neon_simd # 75259 ns/call +magnitude_uc8 lookup_armv8_neon_simd # 185908 ns/call +magnitude_uc8 lookup_generic # 187426 ns/call +magnitude_uc8 lookup_unroll_4_armv8_neon_simd # 203217 ns/call +magnitude_uc8 lookup_unroll_4_generic # 205435 ns/call +magnitude_uc8 exact_armv8_neon_simd # 211685 ns/call +magnitude_uc8 exact_generic # 1143963 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd # 74829 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv8_neon_simd_aligned # 75205 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd_aligned # 176228 ns/call +magnitude_uc8_aligned lookup_armv8_neon_simd # 176801 ns/call +magnitude_uc8_aligned lookup_generic # 177103 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd_aligned # 196536 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv8_neon_simd # 197343 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 198190 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd # 210215 ns/call +magnitude_uc8_aligned exact_armv8_neon_simd_aligned # 211766 ns/call +magnitude_uc8_aligned exact_generic # 1129546 ns/call + +mean_power_u16 neon_float_armv8_neon_simd # 39477 ns/call +mean_power_u16 u32_generic # 42560 ns/call +mean_power_u16 u32_armv8_neon_simd # 44544 ns/call 
+mean_power_u16 float_armv8_neon_simd # 52529 ns/call +mean_power_u16 u64_generic # 85141 ns/call +mean_power_u16 u64_armv8_neon_simd # 85219 ns/call +mean_power_u16 float_generic # 155312 ns/call + +mean_power_u16_aligned neon_float_armv8_neon_simd # 39385 ns/call +mean_power_u16_aligned neon_float_armv8_neon_simd_aligned # 39524 ns/call +mean_power_u16_aligned u32_generic # 42604 ns/call +mean_power_u16_aligned u32_armv8_neon_simd_aligned # 42712 ns/call +mean_power_u16_aligned u32_armv8_neon_simd # 44513 ns/call +mean_power_u16_aligned float_armv8_neon_simd # 52471 ns/call +mean_power_u16_aligned float_armv8_neon_simd_aligned # 52593 ns/call +mean_power_u16_aligned u64_armv8_neon_simd # 85041 ns/call +mean_power_u16_aligned u64_generic # 85056 ns/call +mean_power_u16_aligned u64_armv8_neon_simd_aligned # 85239 ns/call +mean_power_u16_aligned float_generic # 153697 ns/call