From bff71dc82031d8eecf9fb2fa8053796a26cd84fc Mon Sep 17 00:00:00 2001 From: Oliver Jowett Date: Thu, 21 Jan 2021 05:45:00 -0600 Subject: [PATCH] Move all converters to starch-based implementations (#97) * Switch all conversion routines to use starch. main user-visible changes: * ensure you check out submodules ('git clone --recurse-submodules') * --version shows the CPU features and DSP implementations in use * --wisdom allows overriding of the built-in architecture wisdom * --dcfilter no longer supported * "starch-benchmark" binary will benchmark all options on the current machine and can produce a wisdom file to feed to the --wisdom option If you have a use case for --dcfilter, please get in touch and let me know - it's an edge case and for now there's no starch/DSP support for it, but support can be written if needed. In almost all cases the new conversion routines are slightly or substantially faster than the old conversion routines. The only case that is slower is SC16/SC16Q11 on a Pi 0, which is around 10% slower due to changing from heavily approximated lookup tables to higher quality results (but SC16 is probably already out of reach of a Pi 0) * No need to build with SC16Q11_TABLE_BITS any more * Add oneoff/uc8_capture_stats (reads a UC8 capture; measures min/max/mean I and Q) * Switch UC8 conversion to 127.4 center, 128 range. Looking at actual UC8 captures from an RTL2832, the mean I and Q are actually at 127.4, so use that as the zero point. This means that the resulting I/Q maximum values could be as large as 127.6. Switch to 128 for simplicity. * Switch to the new UC8 zero offset in benchmarks, fix some bugs * Fix some bugs in SC16/SC16Q11 validation, tighten the max error requirements * Ditch UC8 approximation path, add a NEON VRSQRTE path. * Tweak the SC16 exact path, add a new impl that uses a mix of u32 & floats. 
* SC16Q11 impl tweaks: * add a u32->float exact path * ditch the approximation path * add a NEON VRSQRTE path * add a 12-bit table path (using the full signed I/Q value, not absolute value) * Ditch SC16 approximation path, add NEON vrsqrte path * Add oneoff/dsp_error_measurement This runs sample input through the DSP functions that are allowed to be inexact and dumps the results as a TSV suitable for feeding to gnuplot to look at the actual errors. * Update make clean, make wisdom targets * Update wisdom based on benchmarking * Preserve the raw wisdom benchmark data * Update to latest starch * Update .gitignore for new wisdom files * Update starch generated code * Build starch-benchmark as part of the 'all' target * Use wisdom from /etc/dump1090-fa/wisdom.local if present * Package starch-benchmark and a helper script to generate local wisdom data * Remove submodules in preparation for importing them directly * Import cpu_features v0.6.0 from https://github.com/google/cpu_features/releases/tag/v0.6.0 * Import starch at commit a725c8491dc33a321565d451b385131e589d8490 from https://github.com/flightaware/starch --- .gitignore | 4 + Makefile | 78 +- Makefile.cpufeatures | 29 + convert.c | 510 +----- cpu.c | 78 + cpu.h | 11 + cpu_features/.clang-format | 4 + cpu_features/.gitignore | 4 + cpu_features/.travis.yml | 121 ++ cpu_features/CMakeLists.txt | 259 +++ cpu_features/CONTRIBUTING.md | 23 + cpu_features/LICENSE | 230 +++ cpu_features/README.md | 199 ++ cpu_features/WORKSPACE | 7 + cpu_features/appveyor.yml | 24 + cpu_features/cmake/CpuFeaturesConfig.cmake.in | 3 + .../cmake/CpuFeaturesNdkCompatConfig.cmake.in | 3 + cpu_features/cmake/README.md | 28 + .../cmake/googletest.CMakeLists.txt.in | 15 + .../include/cpu_features_cache_info.h | 54 + cpu_features/include/cpu_features_macros.h | 216 +++ cpu_features/include/cpuinfo_aarch64.h | 156 ++ cpu_features/include/cpuinfo_arm.h | 121 ++ cpu_features/include/cpuinfo_mips.h | 60 + cpu_features/include/cpuinfo_ppc.h | 146 ++ 
cpu_features/include/cpuinfo_x86.h | 231 +++ cpu_features/include/internal/bit_utils.h | 40 + cpu_features/include/internal/cpuid_x86.h | 37 + cpu_features/include/internal/filesystem.h | 39 + cpu_features/include/internal/hwcaps.h | 186 ++ .../include/internal/stack_line_reader.h | 49 + cpu_features/include/internal/string_view.h | 109 ++ cpu_features/ndk_compat/CMakeLists.txt | 60 + cpu_features/ndk_compat/README.md | 4 + cpu_features/ndk_compat/cpu-features.c | 205 +++ cpu_features/ndk_compat/cpu-features.h | 320 ++++ cpu_features/ndk_compat/ndk-compat-test.c | 12 + cpu_features/scripts/run_integration.sh | 209 +++ cpu_features/scripts/test_integration.sh | 106 ++ cpu_features/src/cpuinfo_aarch64.c | 150 ++ cpu_features/src/cpuinfo_arm.c | 212 +++ cpu_features/src/cpuinfo_mips.c | 92 + cpu_features/src/cpuinfo_ppc.c | 154 ++ cpu_features/src/cpuinfo_x86.c | 1622 +++++++++++++++++ cpu_features/src/define_tables.h | 67 + cpu_features/src/filesystem.c | 62 + cpu_features/src/hwcaps.c | 182 ++ cpu_features/src/stack_line_reader.c | 132 ++ cpu_features/src/string_view.c | 182 ++ cpu_features/src/utils/list_cpu_features.c | 438 +++++ cpu_features/test/CMakeLists.txt | 85 + cpu_features/test/bit_utils_test.cc | 53 + cpu_features/test/cpuinfo_aarch64_test.cc | 171 ++ cpu_features/test/cpuinfo_arm_test.cc | 354 ++++ cpu_features/test/cpuinfo_mips_test.cc | 126 ++ cpu_features/test/cpuinfo_ppc_test.cc | 119 ++ cpu_features/test/cpuinfo_x86_test.cc | 533 ++++++ cpu_features/test/filesystem_for_testing.cc | 103 ++ cpu_features/test/filesystem_for_testing.h | 61 + cpu_features/test/hwcaps_for_testing.cc | 46 + cpu_features/test/hwcaps_for_testing.h | 27 + cpu_features/test/stack_line_reader_test.cc | 132 ++ cpu_features/test/string_view_test.cc | 192 ++ debian-jessie/rules | 5 - debian-stretch/rules | 5 - debian/dump1090-fa.default | 6 + debian/dump1090-fa.install | 2 + debian/generate-wisdom | 20 + debian/rules | 5 - dsp-types.h | 21 + 
dsp/benchmark/magnitude_power_uc8_benchmark.c | 102 ++ dsp/benchmark/magnitude_sc16_benchmark.c | 79 + dsp/benchmark/magnitude_sc16q11_benchmark.c | 79 + dsp/benchmark/magnitude_uc8_benchmark.c | 79 + dsp/benchmark/mean_power_u16_benchmark.c | 57 + dsp/generated/benchmark.c | 1590 ++++++++++++++++ dsp/generated/dispatcher.c | 1160 ++++++++++++ dsp/generated/flavor.armv7a_neon_vfpv4.c | 41 + dsp/generated/flavor.generic.c | 21 + dsp/generated/flavor.x86_avx2.c | 40 + dsp/generated/makefile.arm | 39 + dsp/generated/makefile.generic | 36 + dsp/generated/makefile.x86 | 39 + dsp/generated/starch.h | 294 +++ dsp/helpers/tables.c | 105 ++ dsp/helpers/tables.h | 10 + dsp/impl/magnitude_power_uc8.c | 201 ++ dsp/impl/magnitude_sc16.c | 100 + dsp/impl/magnitude_sc16q11.c | 137 ++ dsp/impl/magnitude_uc8.c | 164 ++ dsp/impl/mean_power_u16.c | 122 ++ dsp/starchgen.py | 60 + dump1090.c | 52 +- dump1090.h | 1 + oneoff/convert_benchmark.c | 11 +- oneoff/dsp_error_measurement.c | 229 +++ oneoff/uc8_capture_stats.c | 106 ++ starch/.gitignore | 6 + starch/LICENSE | 23 + starch/Makefile | 4 + starch/README.md | 182 ++ starch/example/Makefile | 28 + .../example/benchmark/subtract_n_benchmark.c | 33 + starch/example/generated/.keep | 0 starch/example/generated/benchmark.c | 569 ++++++ starch/example/generated/dispatcher.c | 313 ++++ .../example/generated/flavor.armv7a_vfpv3.c | 33 + .../example/generated/flavor.armv7a_vfpv4.c | 33 + starch/example/generated/flavor.generic.c | 17 + starch/example/generated/flavor.x86_64_avx.c | 32 + starch/example/generated/flavor.x86_64_avx2.c | 32 + starch/example/generated/makefile.arm | 42 + starch/example/generated/makefile.generic | 36 + starch/example/generated/makefile.x86_64 | 42 + starch/example/generated/starch.h | 133 ++ starch/example/impl/subtract_n.c | 94 + starch/example/starchgen.py | 71 + starch/example/support.c | 53 + starch/starch.py | 583 ++++++ starch/stubs/mako/__init__.pyi | 4 + starch/stubs/mako/lookup.pyi | 16 + 
starch/stubs/mako/template.pyi | 6 + starch/templates/benchmark.c.template | 490 +++++ starch/templates/dispatcher.c.template | 206 +++ starch/templates/flavor.c.template | 48 + starch/templates/makefile.template | 57 + starch/templates/starch.h.template | 68 + wisdom.arm | 31 + wisdom.generic | 16 + wisdom.x86 | 31 + wisdom/wisdom.i7-6500u | 90 + wisdom/wisdom.i7-6500u.generic | 43 + wisdom/wisdom.pi0w | 47 + wisdom/wisdom.pi4b | 107 ++ 134 files changed, 17439 insertions(+), 483 deletions(-) create mode 100644 Makefile.cpufeatures create mode 100644 cpu.c create mode 100644 cpu.h create mode 100644 cpu_features/.clang-format create mode 100644 cpu_features/.gitignore create mode 100644 cpu_features/.travis.yml create mode 100644 cpu_features/CMakeLists.txt create mode 100644 cpu_features/CONTRIBUTING.md create mode 100644 cpu_features/LICENSE create mode 100644 cpu_features/README.md create mode 100644 cpu_features/WORKSPACE create mode 100644 cpu_features/appveyor.yml create mode 100644 cpu_features/cmake/CpuFeaturesConfig.cmake.in create mode 100644 cpu_features/cmake/CpuFeaturesNdkCompatConfig.cmake.in create mode 100644 cpu_features/cmake/README.md create mode 100644 cpu_features/cmake/googletest.CMakeLists.txt.in create mode 100644 cpu_features/include/cpu_features_cache_info.h create mode 100644 cpu_features/include/cpu_features_macros.h create mode 100644 cpu_features/include/cpuinfo_aarch64.h create mode 100644 cpu_features/include/cpuinfo_arm.h create mode 100644 cpu_features/include/cpuinfo_mips.h create mode 100644 cpu_features/include/cpuinfo_ppc.h create mode 100644 cpu_features/include/cpuinfo_x86.h create mode 100644 cpu_features/include/internal/bit_utils.h create mode 100644 cpu_features/include/internal/cpuid_x86.h create mode 100644 cpu_features/include/internal/filesystem.h create mode 100644 cpu_features/include/internal/hwcaps.h create mode 100644 cpu_features/include/internal/stack_line_reader.h create mode 100644 
cpu_features/include/internal/string_view.h create mode 100644 cpu_features/ndk_compat/CMakeLists.txt create mode 100644 cpu_features/ndk_compat/README.md create mode 100644 cpu_features/ndk_compat/cpu-features.c create mode 100644 cpu_features/ndk_compat/cpu-features.h create mode 100644 cpu_features/ndk_compat/ndk-compat-test.c create mode 100755 cpu_features/scripts/run_integration.sh create mode 100755 cpu_features/scripts/test_integration.sh create mode 100644 cpu_features/src/cpuinfo_aarch64.c create mode 100644 cpu_features/src/cpuinfo_arm.c create mode 100644 cpu_features/src/cpuinfo_mips.c create mode 100644 cpu_features/src/cpuinfo_ppc.c create mode 100644 cpu_features/src/cpuinfo_x86.c create mode 100644 cpu_features/src/define_tables.h create mode 100644 cpu_features/src/filesystem.c create mode 100644 cpu_features/src/hwcaps.c create mode 100644 cpu_features/src/stack_line_reader.c create mode 100644 cpu_features/src/string_view.c create mode 100644 cpu_features/src/utils/list_cpu_features.c create mode 100644 cpu_features/test/CMakeLists.txt create mode 100644 cpu_features/test/bit_utils_test.cc create mode 100644 cpu_features/test/cpuinfo_aarch64_test.cc create mode 100644 cpu_features/test/cpuinfo_arm_test.cc create mode 100644 cpu_features/test/cpuinfo_mips_test.cc create mode 100644 cpu_features/test/cpuinfo_ppc_test.cc create mode 100644 cpu_features/test/cpuinfo_x86_test.cc create mode 100644 cpu_features/test/filesystem_for_testing.cc create mode 100644 cpu_features/test/filesystem_for_testing.h create mode 100644 cpu_features/test/hwcaps_for_testing.cc create mode 100644 cpu_features/test/hwcaps_for_testing.h create mode 100644 cpu_features/test/stack_line_reader_test.cc create mode 100644 cpu_features/test/string_view_test.cc create mode 100755 debian/generate-wisdom create mode 100644 dsp-types.h create mode 100644 dsp/benchmark/magnitude_power_uc8_benchmark.c create mode 100644 dsp/benchmark/magnitude_sc16_benchmark.c create mode 100644 
dsp/benchmark/magnitude_sc16q11_benchmark.c create mode 100644 dsp/benchmark/magnitude_uc8_benchmark.c create mode 100644 dsp/benchmark/mean_power_u16_benchmark.c create mode 100644 dsp/generated/benchmark.c create mode 100644 dsp/generated/dispatcher.c create mode 100644 dsp/generated/flavor.armv7a_neon_vfpv4.c create mode 100644 dsp/generated/flavor.generic.c create mode 100644 dsp/generated/flavor.x86_avx2.c create mode 100644 dsp/generated/makefile.arm create mode 100644 dsp/generated/makefile.generic create mode 100644 dsp/generated/makefile.x86 create mode 100644 dsp/generated/starch.h create mode 100644 dsp/helpers/tables.c create mode 100644 dsp/helpers/tables.h create mode 100644 dsp/impl/magnitude_power_uc8.c create mode 100644 dsp/impl/magnitude_sc16.c create mode 100644 dsp/impl/magnitude_sc16q11.c create mode 100644 dsp/impl/magnitude_uc8.c create mode 100644 dsp/impl/mean_power_u16.c create mode 100755 dsp/starchgen.py create mode 100644 oneoff/dsp_error_measurement.c create mode 100644 oneoff/uc8_capture_stats.c create mode 100644 starch/.gitignore create mode 100644 starch/LICENSE create mode 100644 starch/Makefile create mode 100644 starch/README.md create mode 100644 starch/example/Makefile create mode 100644 starch/example/benchmark/subtract_n_benchmark.c create mode 100644 starch/example/generated/.keep create mode 100644 starch/example/generated/benchmark.c create mode 100644 starch/example/generated/dispatcher.c create mode 100644 starch/example/generated/flavor.armv7a_vfpv3.c create mode 100644 starch/example/generated/flavor.armv7a_vfpv4.c create mode 100644 starch/example/generated/flavor.generic.c create mode 100644 starch/example/generated/flavor.x86_64_avx.c create mode 100644 starch/example/generated/flavor.x86_64_avx2.c create mode 100644 starch/example/generated/makefile.arm create mode 100644 starch/example/generated/makefile.generic create mode 100644 starch/example/generated/makefile.x86_64 create mode 100644 
starch/example/generated/starch.h create mode 100644 starch/example/impl/subtract_n.c create mode 100755 starch/example/starchgen.py create mode 100644 starch/example/support.c create mode 100644 starch/starch.py create mode 100644 starch/stubs/mako/__init__.pyi create mode 100644 starch/stubs/mako/lookup.pyi create mode 100644 starch/stubs/mako/template.pyi create mode 100644 starch/templates/benchmark.c.template create mode 100644 starch/templates/dispatcher.c.template create mode 100644 starch/templates/flavor.c.template create mode 100644 starch/templates/makefile.template create mode 100644 starch/templates/starch.h.template create mode 100644 wisdom.arm create mode 100644 wisdom.generic create mode 100644 wisdom.x86 create mode 100644 wisdom/wisdom.i7-6500u create mode 100644 wisdom/wisdom.i7-6500u.generic create mode 100644 wisdom/wisdom.pi0w create mode 100644 wisdom/wisdom.pi4b diff --git a/.gitignore b/.gitignore index 28200a3..816e363 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,8 @@ view1090 faup1090 package-wheezy oneoff/convert_benchmark +oneoff/uc8_capture_stats +oneoff/dsp_error_measurement oneoff/decode_comm_b +starch-benchmark +wisdom.local diff --git a/Makefile b/Makefile index 6c99b3f..75195b4 100644 --- a/Makefile +++ b/Makefile @@ -2,12 +2,12 @@ PROGNAME=dump1090 DUMP1090_VERSION ?= unknown -CPPFLAGS += -DMODES_DUMP1090_VERSION=\"$(DUMP1090_VERSION)\" -DMODES_DUMP1090_VARIANT=\"dump1090-fa\" +CPPFLAGS += -I. 
-DMODES_DUMP1090_VERSION=\"$(DUMP1090_VERSION)\" -DMODES_DUMP1090_VARIANT=\"dump1090-fa\" DIALECT = -std=c11 CFLAGS += $(DIALECT) -O3 -g -Wall -Wmissing-declarations -Werror -W -D_DEFAULT_SOURCE -fno-common LIBS = -lpthread -lm -SDR_OBJ = sdr.o fifo.o sdr_ifile.o +SDR_OBJ = cpu.o sdr.o fifo.o sdr_ifile.o dsp/helpers/tables.o # Try to autodetect available libraries via pkg-config if no explicit setting was used PKGCONFIG=$(shell pkg-config --version >/dev/null 2>&1 && echo "yes" || echo "no") @@ -42,33 +42,43 @@ endif UNAME := $(shell uname) ifeq ($(UNAME), Linux) - CFLAGS += -D_DEFAULT_SOURCE + include Makefile.cpufeatures + CPPFLAGS += -D_DEFAULT_SOURCE LIBS += -lrt LIBS_USB += -lusb-1.0 + CPUFEATURES ?= yes endif ifeq ($(UNAME), Darwin) ifneq ($(shell sw_vers -productVersion | egrep '^10\.([0-9]|1[01])\.'),) # Mac OS X ver <= 10.11 - CFLAGS += -DMISSING_GETTIME + CPPFLAGS += -DMISSING_GETTIME COMPAT += compat/clock_gettime/clock_gettime.o endif - CFLAGS += -DMISSING_NANOSLEEP + CPPFLAGS += -DMISSING_NANOSLEEP COMPAT += compat/clock_nanosleep/clock_nanosleep.o LIBS_USB += -lusb-1.0 + CPUFEATURES ?= yes endif ifeq ($(UNAME), OpenBSD) - CFLAGS += -DMISSING_NANOSLEEP + CPPFLAGS += -DMISSING_NANOSLEEP COMPAT += compat/clock_nanosleep/clock_nanosleep.o LIBS_USB += -lusb-1.0 endif ifeq ($(UNAME), FreeBSD) - CFLAGS += -D_DEFAULT_SOURCE + CPPFLAGS += -D_DEFAULT_SOURCE LIBS += -lrt LIBS_USB += -lusb endif +CPUFEATURES ?= no + +ifeq ($(CPUFEATURES),yes) + include Makefile.cpufeatures + CPPFLAGS += -DENABLE_CPUFEATURES -Icpu_features/include +endif + RTLSDR ?= yes BLADERF ?= yes @@ -122,22 +132,47 @@ ifeq ($(LIMESDR), yes) LIBS_SDR += $(shell pkg-config --libs LimeSuite) endif -all: showconfig dump1090 view1090 + +## +## starch (runtime DSP code selection) mix, architecture-specific +## + +ARCH ?= $(shell uname -m) +ifneq ($(CPUFEATURES),yes) + # need to be able to detect CPU features at runtime to enable any non-standard compiler flags + STARCH_MIX := generic + CPPFLAGS += 
-DSTARCH_MIX_GENERIC +else ifeq ($(ARCH),x86_64) + # AVX, AVX2 + STARCH_MIX := x86 + CPPFLAGS += -DSTARCH_MIX_X86 +else ifneq (,$(findstring arm,$(ARCH))) + # ARMv7 NEON + STARCH_MIX := arm + CPPFLAGS += -DSTARCH_MIX_ARM +else + STARCH_MIX := generic + CPPFLAGS += -DSTARCH_MIX_GENERIC +endif + +all: showconfig dump1090 view1090 starch-benchmark + +STARCH_COMPILE := $(CC) $(CPPFLAGS) $(CFLAGS) -c +include dsp/generated/makefile.$(STARCH_MIX) showconfig: @echo "Building with:" >&2 @echo " Version string: $(DUMP1090_VERSION)" >&2 + @echo " DSP mix: $(STARCH_MIX)" >&2 @echo " RTLSDR support: $(RTLSDR)" >&2 @echo " BladeRF support: $(BLADERF)" >&2 @echo " HackRF support: $(HACKRF)" >&2 @echo " LimeSDR support: $(LIMESDR)" >&2 -all: dump1090 view1090 - %.o: %.c *.h $(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@ -dump1090: dump1090.o anet.o interactive.o mode_ac.o mode_s.o comm_b.o net_io.o crc.o demod_2400.o stats.o cpr.o icao_filter.o track.o util.o convert.o ais_charset.o $(SDR_OBJ) $(COMPAT) +dump1090: dump1090.o anet.o interactive.o mode_ac.o mode_s.o comm_b.o net_io.o crc.o demod_2400.o stats.o cpr.o icao_filter.o track.o util.o convert.o ais_charset.o $(SDR_OBJ) $(COMPAT) $(CPUFEATURES_OBJS) $(STARCH_OBJS) $(CC) -g -o $@ $^ $(LDFLAGS) $(LIBS) $(LIBS_SDR) -lncurses view1090: view1090.o anet.o interactive.o mode_ac.o mode_s.o comm_b.o net_io.o crc.o stats.o cpr.o icao_filter.o track.o util.o ais_charset.o $(COMPAT) @@ -146,8 +181,11 @@ view1090: view1090.o anet.o interactive.o mode_ac.o mode_s.o comm_b.o net_io.o c faup1090: faup1090.o anet.o mode_ac.o mode_s.o comm_b.o net_io.o crc.o stats.o cpr.o icao_filter.o track.o util.o ais_charset.o $(COMPAT) $(CC) -g -o $@ $^ $(LDFLAGS) $(LIBS) +starch-benchmark: cpu.o dsp/helpers/tables.o $(CPUFEATURES_OBJS) $(STARCH_OBJS) $(STARCH_BENCHMARK_OBJ) + $(CC) -g -o $@ $^ $(LDFLAGS) $(LIBS) + clean: - rm -f *.o oneoff/*.o compat/clock_gettime/*.o compat/clock_nanosleep/*.o dump1090 view1090 faup1090 cprtests crctests convert_benchmark + 
rm -f *.o oneoff/*.o compat/clock_gettime/*.o compat/clock_nanosleep/*.o cpu_features/src/*.o dsp/generated/*.o dsp/helpers/*.o $(CPUFEATURES_OBJS) dump1090 view1090 faup1090 cprtests crctests oneoff/convert_benchmark oneoff/decode_comm_b oneoff/dsp_error_measurement oneoff/uc8_capture_stats starch-benchmark test: cprtests ./cprtests @@ -161,8 +199,22 @@ crctests: crc.c crc.h benchmarks: oneoff/convert_benchmark oneoff/convert_benchmark -oneoff/convert_benchmark: oneoff/convert_benchmark.o convert.o util.o +oneoff/convert_benchmark: oneoff/convert_benchmark.o convert.o util.o dsp/helpers/tables.o cpu.o $(CPUFEATURES_OBJS) $(STARCH_OBJS) $(CC) $(CPPFLAGS) $(CFLAGS) -g -o $@ $^ -lm -lpthread oneoff/decode_comm_b: oneoff/decode_comm_b.o comm_b.o ais_charset.o $(CC) $(CPPFLAGS) $(CFLAGS) -g -o $@ $^ -lm + +oneoff/dsp_error_measurement: oneoff/dsp_error_measurement.o dsp/helpers/tables.o cpu.o $(CPUFEATURES_OBJS) $(STARCH_OBJS) + $(CC) $(CPPFLAGS) $(CFLAGS) -g -o $@ $^ -lm + +oneoff/uc8_capture_stats: oneoff/uc8_capture_stats.o + $(CC) $(CPPFLAGS) $(CFLAGS) -g -o $@ $^ -lm + +starchgen: + dsp/starchgen.py . 
+ +.PHONY: wisdom.local +wisdom.local: starch-benchmark + ./starch-benchmark -i 15 -o wisdom.local mean_power_u16 mean_power_u16_aligned magnitude_uc8 magnitude_uc8_aligned + ./starch-benchmark -i 15 -r wisdom.local -o wisdom.local diff --git a/Makefile.cpufeatures b/Makefile.cpufeatures new file mode 100644 index 0000000..3e34cb4 --- /dev/null +++ b/Makefile.cpufeatures @@ -0,0 +1,29 @@ +# -*- makefile -*- + +# cmake integration is a little tricky, so let's do this by hand for now + +CPUFEATURES_UNAME := $(shell uname) +CPUFEATURES_ARCH := $(shell uname -m) + +CPUFEATURES_OBJS := cpu_features/src/filesystem.o cpu_features/src/stack_line_reader.o cpu_features/src/string_view.o +CPUFEATURES_CFLAGS := -std=c99 -O -g -DSTACK_LINE_READER_BUFFER_SIZE=1024 -DNDEBUG + +ifeq ($(CPUFEATURES_UNAME),Linux) + CPUFEATURES_OBJS += cpu_features/src/hwcaps.o + CPUFEATURES_CFLAGS += -DHAVE_STRONG_GETAUXVAL +endif + +ifeq ($(CPUFEATURES_UNAME),Darwin) + CPUFEATURES_CFLAGS += -DHAVE_SYSCTLBYNAME +endif + +ifeq ($(CPUFEATURES_ARCH), x86_64) + CPUFEATURES_OBJS += cpu_features/src/cpuinfo_x86.o +endif + +ifneq (,$(findstring arm,$(CPUFEATURES_ARCH))) + CPUFEATURES_OBJS += cpu_features/src/cpuinfo_arm.o +endif + +$(CPUFEATURES_OBJS): override CFLAGS := $(CPUFEATURES_CFLAGS) +$(CPUFEATURES_OBJS): override CPPFLAGS := -Icpu_features/include diff --git a/convert.c b/convert.c index 3f34a38..e1d1d60 100644 --- a/convert.c +++ b/convert.c @@ -19,483 +19,105 @@ #include "dump1090.h" -struct converter_state { - float dc_a; - float dc_b; - float z1_I; - float z1_Q; -}; - -static uint16_t *uc8_lookup; -static bool init_uc8_lookup() -{ - if (uc8_lookup) - return true; - - uc8_lookup = malloc(sizeof(uint16_t) * 256 * 256); - if (!uc8_lookup) { - fprintf(stderr, "can't allocate UC8 conversion lookup table\n"); - return false; - } - - for (int i = 0; i <= 255; i++) { - for (int q = 0; q <= 255; q++) { - float fI, fQ, magsq; - - fI = (i - 127.5) / 127.5; - fQ = (q - 127.5) / 127.5; - magsq = fI * fI + 
fQ * fQ; - if (magsq > 1) - magsq = 1; - float mag = sqrtf(magsq); - - uc8_lookup[le16toh((i*256)+q)] = (uint16_t) (mag * 65535.0f + 0.5f); - } - } - - return true; -} - -static void convert_uc8_nodc(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) -{ - uint16_t *in = iq_data; - unsigned i; - uint64_t sum_level = 0; - uint64_t sum_power = 0; - uint16_t mag; - - MODES_NOTUSED(state); - - // unroll this a bit - -#define DO_ONE_SAMPLE \ - do { \ - mag = uc8_lookup[*in++]; \ - *mag_data++ = mag; \ - sum_level += mag; \ - sum_power += (uint32_t)mag * (uint32_t)mag; \ - } while(0) - - // unroll this a bit - for (i = 0; i < (nsamples>>3); ++i) { - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - } - - for (i = 0; i < (nsamples&7); ++i) { - DO_ONE_SAMPLE; - } - -#undef DO_ONE_SAMPLE - - if (out_mean_level) { - *out_mean_level = sum_level / 65536.0 / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / 65535.0 / 65535.0 / nsamples; - } -} - -static void convert_uc8_generic(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) -{ - uint8_t *in = iq_data; - float z1_I = state->z1_I; - float z1_Q = state->z1_Q; - const float dc_a = state->dc_a; - const float dc_b = state->dc_b; - - unsigned i; - uint8_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; - - for (i = 0; i < nsamples; ++i) { - I = *in++; - Q = *in++; - fI = (I - 127.5f) / 127.5f; - fQ = (Q - 127.5f) / 127.5f; - - // DC block - z1_I = fI * dc_a + z1_I * dc_b; - z1_Q = fQ * dc_a + z1_Q * dc_b; - fI -= z1_I; - fQ -= z1_Q; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - 
state->z1_I = z1_I; - state->z1_Q = z1_Q; - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; - } -} - -static void convert_sc16_generic(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) -{ - uint16_t *in = iq_data; - float z1_I = state->z1_I; - float z1_Q = state->z1_Q; - const float dc_a = state->dc_a; - const float dc_b = state->dc_b; - - unsigned i; - int16_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; - - for (i = 0; i < nsamples; ++i) { - I = (int16_t)le16toh(*in++); - Q = (int16_t)le16toh(*in++); - fI = I / 32768.0f; - fQ = Q / 32768.0f; - - // DC block - z1_I = fI * dc_a + z1_I * dc_b; - z1_Q = fQ * dc_a + z1_Q * dc_b; - fI -= z1_I; - fQ -= z1_Q; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - state->z1_I = z1_I; - state->z1_Q = z1_Q; - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; - } -} - -static void convert_sc16_nodc(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) +static void convert_uc8(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) { MODES_NOTUSED(state); - uint16_t *in = iq_data; + const uc8_t *in = (const uc8_t *) iq_data; - unsigned i; - int16_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; - - for (i = 0; i < nsamples; ++i) { - I = (int16_t)le16toh(*in++); - Q = (int16_t)le16toh(*in++); - fI = I / 32768.0f; - fQ = Q / 32768.0f; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = 
sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; + if (out_mean_level && out_mean_power) { + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) + starch_magnitude_power_uc8_aligned(in, mag_data, nsamples, out_mean_level, out_mean_power); + else + starch_magnitude_power_uc8(in, mag_data, nsamples, out_mean_level, out_mean_power); + } else { + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) + starch_magnitude_uc8_aligned(in, mag_data, nsamples); + else + starch_magnitude_uc8(in, mag_data, nsamples); } } -// SC16Q11_TABLE_BITS controls the size of the lookup table -// for SC16Q11 data. The size of the table is 2 * (1 << (2*BITS)) -// bytes. Reducing the number of bits reduces precision but -// can run substantially faster by staying in cache. -// See convert_benchmark.c for some numbers. 
- -// Leaving SC16QQ_TABLE_BITS undefined will disable the table lookup and always use -// the floating-point path, which may be faster on some systems - -#if defined(SC16Q11_TABLE_BITS) - -#define USE_BITS SC16Q11_TABLE_BITS -#define LOSE_BITS (11 - SC16Q11_TABLE_BITS) - -static uint16_t *sc16q11_lookup; -static bool init_sc16q11_lookup() -{ - if (sc16q11_lookup) - return true; - - sc16q11_lookup = malloc(sizeof(uint16_t) * (1 << (USE_BITS * 2))); - if (!sc16q11_lookup) { - fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n"); - return false; - } - - for (int i = 0; i < 2048; i += (1 << LOSE_BITS)) { - for (int q = 0; q < 2048; q += (1 << LOSE_BITS)) { - float fI = i / 2048.0, fQ = q / 2048.0; - float magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - float mag = sqrtf(magsq); - - unsigned index = ((i >> LOSE_BITS) << USE_BITS) | (q >> LOSE_BITS); - sc16q11_lookup[index] = (uint16_t)(mag * 65535.0f + 0.5f); - } - } - - return true; -} - -static void convert_sc16q11_table(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) -{ - uint16_t *in = iq_data; - unsigned i; - uint16_t I, Q; - uint64_t sum_level = 0; - uint64_t sum_power = 0; - uint16_t mag; - - MODES_NOTUSED(state); - - for (i = 0; i < nsamples; ++i) { - I = abs((int16_t)le16toh(*in++)) & 2047; - Q = abs((int16_t)le16toh(*in++)) & 2047; - mag = sc16q11_lookup[((I >> LOSE_BITS) << USE_BITS) | (Q >> LOSE_BITS)]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - } - - if (out_mean_level) { - *out_mean_level = sum_level / 65536.0 / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / 65535.0 / 65535.0 / nsamples; - } -} - -#else /* ! 
defined(SC16Q11_TABLE_BITS) */ - -static void convert_sc16q11_nodc(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) +static void convert_sc16(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) { MODES_NOTUSED(state); - uint16_t *in = iq_data; + const sc16_t *in = (const sc16_t *) iq_data; - unsigned i; - int16_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) + starch_magnitude_sc16_aligned(in, mag_data, nsamples); + else + starch_magnitude_sc16(in, mag_data, nsamples); - for (i = 0; i < nsamples; ++i) { - I = (int16_t)le16toh(*in++); - Q = (int16_t)le16toh(*in++); - fI = I / 2048.0f; - fQ = Q / 2048.0f; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; + if (out_mean_level && out_mean_power) { + if (STARCH_IS_ALIGNED(mag_data)) + starch_mean_power_u16_aligned(mag_data, nsamples, out_mean_level, out_mean_power); + else + starch_mean_power_u16(mag_data, nsamples, out_mean_level, out_mean_power); } } -#endif /* defined(SC16Q11_TABLE_BITS) */ - -static void convert_sc16q11_generic(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) +static void convert_sc16q11(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) { - uint16_t *in = iq_data; - float z1_I = state->z1_I; - float z1_Q = state->z1_Q; - const float dc_a = state->dc_a; - const float dc_b = 
state->dc_b; + MODES_NOTUSED(state); - unsigned i; - int16_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; + const sc16_t *in = (const sc16_t *) iq_data; - for (i = 0; i < nsamples; ++i) { - I = (int16_t)le16toh(*in++); - Q = (int16_t)le16toh(*in++); - fI = I / 2048.0f; - fQ = Q / 2048.0f; + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) + starch_magnitude_sc16q11_aligned(in, mag_data, nsamples); + else + starch_magnitude_sc16q11(in, mag_data, nsamples); - // DC block - z1_I = fI * dc_a + z1_I * dc_b; - z1_Q = fQ * dc_a + z1_Q * dc_b; - fI -= z1_I; - fQ -= z1_Q; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - state->z1_I = z1_I; - state->z1_Q = z1_Q; - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; + if (out_mean_level && out_mean_power) { + if (STARCH_IS_ALIGNED(mag_data)) + starch_mean_power_u16_aligned(mag_data, nsamples, out_mean_level, out_mean_power); + else + starch_mean_power_u16(mag_data, nsamples, out_mean_level, out_mean_power); } } -static struct { - input_format_t format; - int can_filter_dc; - iq_convert_fn fn; - const char *description; - bool (*init)(); -} converters_table[] = { - // In order of preference - { INPUT_UC8, 0, convert_uc8_nodc, "UC8, integer/table path", init_uc8_lookup }, - { INPUT_UC8, 1, convert_uc8_generic, "UC8, float path", NULL }, - { INPUT_SC16, 0, convert_sc16_nodc, "SC16, float path, no DC", NULL }, - { INPUT_SC16, 1, convert_sc16_generic, "SC16, float path", NULL }, -#if defined(SC16Q11_TABLE_BITS) - { INPUT_SC16Q11, 0, convert_sc16q11_table, "SC16Q11, integer/table path", init_sc16q11_lookup }, -#else - { INPUT_SC16Q11, 0, convert_sc16q11_nodc, "SC16Q11, float path, no DC", NULL }, -#endif - { INPUT_SC16Q11, 1, convert_sc16q11_generic, "SC16Q11, float path", 
NULL }, - { 0, 0, NULL, NULL, NULL } -}; - iq_convert_fn init_converter(input_format_t format, double sample_rate, int filter_dc, struct converter_state **out_state) { - int i; - - for (i = 0; converters_table[i].fn; ++i) { - if (converters_table[i].format != format) - continue; - if (filter_dc && !converters_table[i].can_filter_dc) - continue; - break; - } - - if (!converters_table[i].fn) { - fprintf(stderr, "no suitable converter for format=%d dc=%d\n", - format, filter_dc); - return NULL; - } - - if (converters_table[i].init) { - if (!converters_table[i].init()) - return NULL; - } - - *out_state = malloc(sizeof(struct converter_state)); - if (! *out_state) { - fprintf(stderr, "can't allocate converter state\n"); - return NULL; - } - - (*out_state)->z1_I = 0; - (*out_state)->z1_Q = 0; + MODES_NOTUSED(sample_rate); + MODES_NOTUSED(out_state); if (filter_dc) { - // init DC block @ 1Hz - (*out_state)->dc_b = exp(-2.0 * M_PI * 1.0 / sample_rate); - (*out_state)->dc_a = 1.0 - (*out_state)->dc_b; - } else { - // if the converter does filtering, make sure it has no effect - (*out_state)->dc_b = 1.0; - (*out_state)->dc_a = 0.0; + fprintf(stderr, "DC filtering not supported (yet)\n"); + return NULL; } - return converters_table[i].fn; + switch (format) { + case INPUT_UC8: + return convert_uc8; + case INPUT_SC16: + return convert_sc16; + case INPUT_SC16Q11: + return convert_sc16q11; + default: + fprintf(stderr, "no suitable converter for format=%d\n", format); + return NULL; + } } void cleanup_converter(struct converter_state *state) { - free(state); + MODES_NOTUSED(state); } diff --git a/cpu.c b/cpu.c new file mode 100644 index 0000000..831ab4f --- /dev/null +++ b/cpu.c @@ -0,0 +1,78 @@ +#include "cpu.h" + +#include + +#ifdef ENABLE_CPUFEATURES +#include "cpu_features_macros.h" +#endif + +// +// x86 +// + +#ifdef CPU_FEATURES_ARCH_X86 +#include "cpuinfo_x86.h" + +static X86Info *x86_info() +{ + static bool valid = false; + static X86Info cache; + + if (!valid) { + cache = 
GetX86Info(); + valid = true; + } + + return &cache; +} + +#endif + +int cpu_supports_avx(void) +{ +#ifdef CPU_FEATURES_ARCH_X86 + return x86_info()->features.avx; +#else + return 0; +#endif +} + +int cpu_supports_avx2(void) +{ +#ifdef CPU_FEATURES_ARCH_X86 + return x86_info()->features.avx2; +#else + return 0; +#endif +} + +// +// ARM +// + +#ifdef CPU_FEATURES_ARCH_ARM +#include "cpuinfo_arm.h" + +static ArmInfo *arm_info() +{ + static bool valid = false; + static ArmInfo cache; + + if (!valid) { + cache = GetArmInfo(); + valid = true; + } + + return &cache; +} + +#endif + +int cpu_supports_armv7_neon_vfpv4(void) +{ +#ifdef CPU_FEATURES_ARCH_ARM + return arm_info()->architecture >= 7 && arm_info()->features.neon && arm_info()->features.vfpv4 && arm_info()->features.vfpd32; +#else + return 0; +#endif +} diff --git a/cpu.h b/cpu.h new file mode 100644 index 0000000..0cf88bf --- /dev/null +++ b/cpu.h @@ -0,0 +1,11 @@ +#ifndef DUMP1090_CPU_H +#define DUMP1090_CPU_H + +// x86 +int cpu_supports_avx(void); +int cpu_supports_avx2(void); + +// ARM +int cpu_supports_armv7_neon_vfpv4(void); + +#endif diff --git a/cpu_features/.clang-format b/cpu_features/.clang-format new file mode 100644 index 0000000..06ea346 --- /dev/null +++ b/cpu_features/.clang-format @@ -0,0 +1,4 @@ +--- +Language: Cpp +BasedOnStyle: Google +... 
diff --git a/cpu_features/.gitignore b/cpu_features/.gitignore new file mode 100644 index 0000000..6285424 --- /dev/null +++ b/cpu_features/.gitignore @@ -0,0 +1,4 @@ +cmake_build/ +build/ + +*.swp diff --git a/cpu_features/.travis.yml b/cpu_features/.travis.yml new file mode 100644 index 0000000..b5845be --- /dev/null +++ b/cpu_features/.travis.yml @@ -0,0 +1,121 @@ +language: c + +sudo: false + +cache: + timeout: 1000 + directories: + - $HOME/cpu_features_archives + +addons: + apt_packages: + - ninja-build + +env: + global: + TOOLCHAIN=NATIVE + CMAKE_GENERATOR=Ninja + +matrix: + include: + - os: linux + compiler: gcc + env: + TARGET=x86_64-linux-gnu + - os: linux + compiler: clang + env: + TARGET=x86_64-linux-gnu + - os: osx + compiler: gcc + env: + TARGET=x86_64-osx + CMAKE_GENERATOR="Unix Makefiles" + - os: osx + compiler: clang + env: + TARGET=x86_64-osx + CMAKE_GENERATOR="Unix Makefiles" + - os: windows + env: + TARGET=x86_64-windows + CMAKE_GENERATOR="Visual Studio 15 2017 Win64" + + # see: https://docs.travis-ci.com/user/multi-cpu-architectures/ + - os: linux + arch: ppc64le + compiler: gcc + env: + TARGET=ppc64le-linux-gnu + - os: linux + arch: ppc64le + compiler: clang + env: + TARGET=ppc64le-linux-gnu + + # Toolchains for little-endian, 64-bit ARMv8 for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=aarch64-linux-gnu + QEMU_ARCH=aarch64 + # Toolchains for little-endian, hard-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=arm-linux-gnueabihf + QEMU_ARCH=arm + # Toolchains for little-endian, 32-bit ARMv8 for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=armv8l-linux-gnueabihf + QEMU_ARCH=arm + # Toolchains for little-endian, soft-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=arm-linux-gnueabi + QEMU_ARCH=arm + # Toolchains for big-endian, 64-bit ARMv8 for GNU/Linux systems + - os: linux + env: + 
TOOLCHAIN=LINARO + TARGET=aarch64_be-linux-gnu + QEMU_ARCH=DISABLED + # Toolchains for big-endian, hard-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=armeb-linux-gnueabihf + QEMU_ARCH=DISABLED + # Toolchains for big-endian, soft-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=armeb-linux-gnueabi + QEMU_ARCH=DISABLED + - os: linux + env: + TOOLCHAIN=CODESCAPE + TARGET=mips32 + QEMU_ARCH=mips + - os: linux + env: + TOOLCHAIN=CODESCAPE + TARGET=mips32el + QEMU_ARCH=mipsel + - os: linux + env: + TOOLCHAIN=CODESCAPE + TARGET=mips64 + QEMU_ARCH=mips64 + - os: linux + env: + TOOLCHAIN=CODESCAPE + TARGET=mips64el + QEMU_ARCH=mips64el + +script: + - cmake --version + - bash -e -x ./scripts/run_integration.sh diff --git a/cpu_features/CMakeLists.txt b/cpu_features/CMakeLists.txt new file mode 100644 index 0000000..f9daeac --- /dev/null +++ b/cpu_features/CMakeLists.txt @@ -0,0 +1,259 @@ +cmake_minimum_required(VERSION 3.0) + +# option() honors normal variables. +# see: https://cmake.org/cmake/help/git-stage/policy/CMP0077.html +if(POLICY CMP0077) + cmake_policy(SET CMP0077 NEW) +endif() + +project(CpuFeatures VERSION 0.6.0 LANGUAGES C) + +set(CMAKE_C_STANDARD 99) + +# Default Build Type to be Release +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." + FORCE) +endif(NOT CMAKE_BUILD_TYPE) + +# BUILD_TESTING is a standard CMake variable, but we declare it here to make it +# prominent in the GUI. +option(BUILD_TESTING "Enable test (depends on googletest)." OFF) +# BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to make +# it prominent in the GUI. +# cpu_features uses bit-fields which are - to some extends - implementation-defined (see https://en.cppreference.com/w/c/language/bit_field). 
+# As a consequence it is discouraged to use cpu_features as a shared library because different compilers may interpret the code in different ways. +# Prefer static linking from source whenever possible. +option(BUILD_SHARED_LIBS "Build library as shared." OFF) +# PIC +option(BUILD_PIC "Build with Position Independant Code." OFF) # Default is off at least for GCC + +# Force PIC on unix when building shared libs +# see: https://en.wikipedia.org/wiki/Position-independent_code +if(BUILD_SHARED_LIBS AND UNIX) + set(BUILD_PIC ON) +endif() + +include(CheckIncludeFile) +include(CheckSymbolExists) +include(GNUInstallDirs) + +macro(setup_include_and_definitions TARGET_NAME) + target_include_directories(${TARGET_NAME} + PUBLIC $ + PRIVATE $ + ) + target_compile_definitions(${TARGET_NAME} + PUBLIC STACK_LINE_READER_BUFFER_SIZE=1024 + ) +endmacro() + +set(PROCESSOR_IS_MIPS FALSE) +set(PROCESSOR_IS_ARM FALSE) +set(PROCESSOR_IS_AARCH64 FALSE) +set(PROCESSOR_IS_X86 FALSE) +set(PROCESSOR_IS_POWER FALSE) + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^mips") + set(PROCESSOR_IS_MIPS TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") + set(PROCESSOR_IS_ARM TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64") + set(PROCESSOR_IS_AARCH64 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") + set(PROCESSOR_IS_X86 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") + set(PROCESSOR_IS_POWER TRUE) +endif() + +macro(add_cpu_features_headers_and_sources HDRS_LIST_NAME SRCS_LIST_NAME) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpu_features_macros.h) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpu_features_cache_info.h) + if(PROCESSOR_IS_MIPS) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_mips.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_mips.c) + elseif(PROCESSOR_IS_ARM) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_arm.h) + list(APPEND 
${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_arm.c) + elseif(PROCESSOR_IS_AARCH64) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_aarch64.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_aarch64.c) + elseif(PROCESSOR_IS_X86) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_x86.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/internal/cpuid_x86.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_x86.c) + elseif(PROCESSOR_IS_POWER) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_ppc.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_ppc.c) + else() + message(FATAL_ERROR "Unsupported architectures ${CMAKE_SYSTEM_PROCESSOR}") + endif() +endmacro() + +# +# library : utils +# + +add_library(utils OBJECT + ${PROJECT_SOURCE_DIR}/include/internal/bit_utils.h + ${PROJECT_SOURCE_DIR}/include/internal/filesystem.h + ${PROJECT_SOURCE_DIR}/include/internal/stack_line_reader.h + ${PROJECT_SOURCE_DIR}/include/internal/string_view.h + ${PROJECT_SOURCE_DIR}/src/filesystem.c + ${PROJECT_SOURCE_DIR}/src/stack_line_reader.c + ${PROJECT_SOURCE_DIR}/src/string_view.c +) +set_property(TARGET utils PROPERTY POSITION_INDEPENDENT_CODE ${BUILD_PIC}) +setup_include_and_definitions(utils) + +# +# library : unix_based_hardware_detection +# + +if(UNIX) + add_library(unix_based_hardware_detection OBJECT + ${PROJECT_SOURCE_DIR}/include/internal/hwcaps.h + ${PROJECT_SOURCE_DIR}/src/hwcaps.c + ) + setup_include_and_definitions(unix_based_hardware_detection) + check_include_file(dlfcn.h HAVE_DLFCN_H) + if(HAVE_DLFCN_H) + target_compile_definitions(unix_based_hardware_detection PRIVATE HAVE_DLFCN_H) + endif() + check_symbol_exists(getauxval "sys/auxv.h" HAVE_STRONG_GETAUXVAL) + if(HAVE_STRONG_GETAUXVAL) + target_compile_definitions(unix_based_hardware_detection PRIVATE HAVE_STRONG_GETAUXVAL) + endif() + set_property(TARGET unix_based_hardware_detection PROPERTY 
POSITION_INDEPENDENT_CODE ${BUILD_PIC}) +endif() + +# +# library : cpu_features +# +set (CPU_FEATURES_HDRS) +set (CPU_FEATURES_SRCS) +add_cpu_features_headers_and_sources(CPU_FEATURES_HDRS CPU_FEATURES_SRCS) +list(APPEND CPU_FEATURES_SRCS $) +if(NOT PROCESSOR_IS_X86 AND UNIX) + list(APPEND CPU_FEATURES_SRCS $) +endif() +add_library(cpu_features ${CPU_FEATURES_HDRS} ${CPU_FEATURES_SRCS}) +set_target_properties(cpu_features PROPERTIES PUBLIC_HEADER "${CPU_FEATURES_HDRS}") +setup_include_and_definitions(cpu_features) +target_link_libraries(cpu_features PUBLIC ${CMAKE_DL_LIBS}) +set_property(TARGET cpu_features PROPERTY POSITION_INDEPENDENT_CODE ${BUILD_PIC}) +target_include_directories(cpu_features + PUBLIC $ +) +if(PROCESSOR_IS_X86) + if(APPLE) + target_compile_definitions(cpu_features PRIVATE HAVE_SYSCTLBYNAME) + endif() +endif() +add_library(CpuFeature::cpu_features ALIAS cpu_features) + +# +# program : list_cpu_features +# + +add_executable(list_cpu_features ${PROJECT_SOURCE_DIR}/src/utils/list_cpu_features.c) +target_link_libraries(list_cpu_features PRIVATE cpu_features) +add_executable(CpuFeature::list_cpu_features ALIAS list_cpu_features) + +# +# ndk_compat +# + +if(ANDROID) +add_subdirectory(ndk_compat) +endif() + +# +# tests +# + +include(CTest) +if(BUILD_TESTING) + # Automatically incorporate googletest into the CMake Project if target not + # found. + enable_language(CXX) + + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_CXX_EXTENSIONS OFF) # prefer use of -std11 instead of -gnustd11 + + if(NOT TARGET gtest OR NOT TARGET gmock_main) + # Download and unpack googletest at configure time. + configure_file( + cmake/googletest.CMakeLists.txt.in + googletest-download/CMakeLists.txt + ) + + execute_process( + COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . 
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download) + + if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") + endif() + + execute_process( + COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download) + + if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") + endif() + + # Prevent overriding the parent project's compiler/linker settings on + # Windows. + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + + # Add googletest directly to our build. This defines the gtest and + # gtest_main targets. + add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src + ${CMAKE_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) + endif() + + add_subdirectory(test) +endif() + +# +# Install cpu_features and list_cpu_features +# + +include(GNUInstallDirs) +install(TARGETS cpu_features list_cpu_features + EXPORT CpuFeaturesTargets + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cpu_features + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) +install(EXPORT CpuFeaturesTargets + NAMESPACE CpuFeatures:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeatures + COMPONENT Devel +) +include(CMakePackageConfigHelpers) +configure_package_config_file(cmake/CpuFeaturesConfig.cmake.in + "${PROJECT_BINARY_DIR}/CpuFeaturesConfig.cmake" + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeatures" + NO_SET_AND_CHECK_MACRO + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) +write_basic_package_version_file( + "${PROJECT_BINARY_DIR}/CpuFeaturesConfigVersion.cmake" + COMPATIBILITY SameMajorVersion +) +install( + FILES + "${PROJECT_BINARY_DIR}/CpuFeaturesConfig.cmake" + "${PROJECT_BINARY_DIR}/CpuFeaturesConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeatures" + COMPONENT Devel +) diff --git a/cpu_features/CONTRIBUTING.md 
b/cpu_features/CONTRIBUTING.md new file mode 100644 index 0000000..c980350 --- /dev/null +++ b/cpu_features/CONTRIBUTING.md @@ -0,0 +1,23 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to <https://cla.developers.google.com/> to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. diff --git a/cpu_features/LICENSE b/cpu_features/LICENSE new file mode 100644 index 0000000..a7043c6 --- /dev/null +++ b/cpu_features/LICENSE @@ -0,0 +1,230 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +-------------------------------------------------------------------------------- +For files in the `ndk_compat` folder: +-------------------------------------------------------------------------------- + +Copyright (C) 2010 The Android Open Source Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. diff --git a/cpu_features/README.md b/cpu_features/README.md new file mode 100644 index 0000000..8a34168 --- /dev/null +++ b/cpu_features/README.md @@ -0,0 +1,199 @@ +# cpu_features [![Build Status](https://travis-ci.org/google/cpu_features.svg?branch=master)](https://travis-ci.org/google/cpu_features) [![Build status](https://ci.appveyor.com/api/projects/status/46d1owsj7n8dsylq/branch/master?svg=true)](https://ci.appveyor.com/project/gchatelet/cpu-features/branch/master) + +A cross-platform C library to retrieve CPU features (such as available +instructions) at runtime. + +## Table of Contents + +- [Design Rationale](#rationale) +- [Code samples](#codesample) +- [Running sample code](#usagesample) +- [What's supported](#support) +- [Android NDK's drop in replacement](#ndk) +- [License](#license) +- [Build with cmake](#cmake) + +<a name="rationale"></a> +## Design Rationale + +- **Simple to use.** See the snippets below for examples. +- **Extensible.** Easy to add missing features or architectures. +- **Compatible with old compilers** and available on many architectures so it + can be used widely. To ensure that cpu_features works on as many platforms + as possible, we implemented it in a highly portable version of C: C99. +- **Sandbox-compatible.** The library uses a variety of strategies to cope + with sandboxed environments or when `cpuid` is unavailable. This is useful + when running integration tests in hermetic environments.
+- **Thread safe, no memory allocation, and raises no exceptions.** + cpu_features is suitable for implementing fundamental libc functions like + `malloc`, `memcpy`, and `memcmp`. +- **Unit tested.** + +<a name="codesample"></a> +## Code samples + +**Note:** For C++ code, the library functions are defined in the `CpuFeatures` namespace. + +### Checking features at runtime + +Here's a simple example that executes a codepath if the CPU supports both the +AES and the SSE4.2 instruction sets: + +```c +#include "cpuinfo_x86.h" + +// For C++, add `using namespace CpuFeatures;` +static const X86Features features = GetX86Info().features; + +void Compute(void) { + if (features.aes && features.sse4_2) { + // Run optimized code. + } else { + // Run standard code. + } +} +``` + +### Caching for faster evaluation of complex checks + +If you wish, you can read all the features at once into a global variable, and +then query for the specific features you care about. Below, we store all the ARM +features and then check whether AES and NEON are supported. + +```c +#include <stdbool.h> +#include "cpuinfo_arm.h" + +// For C++, add `using namespace CpuFeatures;` +static const ArmFeatures features = GetArmInfo().features; +static const bool has_aes_and_neon = features.aes && features.neon; + +// use has_aes_and_neon. +``` + +This is a good approach to take if you're checking for combinations of features +when using a compiler that is slow to extract individual bits from bit-packed +structures. + +### Checking compile time flags + +The following code determines whether the compiler was told to use the AVX +instruction set (e.g., `g++ -mavx`) and sets `has_avx` accordingly. + +```c +#include <stdbool.h> +#include "cpuinfo_x86.h" + +// For C++, add `using namespace CpuFeatures;` +static const X86Features features = GetX86Info().features; +static const bool has_avx = CPU_FEATURES_COMPILED_X86_AVX || features.avx; + +// use has_avx.
+``` + +`CPU_FEATURES_COMPILED_X86_AVX` is set to 1 if the compiler was instructed to +use AVX and 0 otherwise, combining compile time and runtime knowledge. + +### Rejecting poor hardware implementations based on microarchitecture + +On x86, the first incarnation of a feature in a microarchitecture might not be +the most efficient (e.g. AVX on Sandy Bridge). We provide a function to retrieve +the underlying microarchitecture so you can decide whether to use it. + +Below, `has_fast_avx` is set to 1 if the CPU supports the AVX instruction +set—but only if it's not Sandy Bridge. + +```c +#include <stdbool.h> +#include "cpuinfo_x86.h" + +// For C++, add `using namespace CpuFeatures;` +static const X86Info info = GetX86Info(); +static const X86Microarchitecture uarch = GetX86Microarchitecture(&info); +static const bool has_fast_avx = info.features.avx && uarch != INTEL_SNB; + +// use has_fast_avx. +``` + +This feature is currently available only for x86 microarchitectures. + +<a name="usagesample"></a> +### Running sample code + +Building `cpu_features` (check [quickstart](#quickstart) below) brings a small executable to test the library. + +```shell + % ./build/list_cpu_features +arch : x86 +brand : Intel(R) Xeon(R) CPU E5-1650 0 @ 3.20GHz +family : 6 (0x06) +model : 45 (0x2D) +stepping : 7 (0x07) +uarch : INTEL_SNB +flags : aes,avx,cx16,smx,sse4_1,sse4_2,ssse3 +``` + +```shell +% ./build/list_cpu_features --json +{"arch":"x86","brand":" Intel(R) Xeon(R) CPU E5-1650 0 @ 3.20GHz","family":6,"model":45,"stepping":7,"uarch":"INTEL_SNB","flags":["aes","avx","cx16","smx","sse4_1","sse4_2","ssse3"]} +``` + +<a name="support"></a> +## What's supported + +| | x86³ | ARM | AArch64 | MIPS⁴ | POWER | +|---------|:----:|:-------:|:-------:|:------:|:-------:| +| Android | yes² | yes¹ | yes¹ | yes¹ | N/A | +| iOS | N/A | not yet | not yet | N/A | N/A | +| Linux | yes² | yes¹ | yes¹ | yes¹ | yes¹ | +| MacOs | yes² | N/A | not yet | N/A | no | +| Windows | yes² | not yet | not yet | N/A | N/A | + +1.
**Features revealed from Linux.** We gather data from several sources + depending on availability: + + from glibc's + [getauxval](https://www.gnu.org/software/libc/manual/html_node/Auxiliary-Vector.html) + + by parsing `/proc/self/auxv` + + by parsing `/proc/cpuinfo` +2. **Features revealed from CPU.** Features are retrieved by using the `cpuid` + instruction. +3. **Microarchitecture detection.** On x86 some features are not always + implemented efficiently in hardware (e.g. AVX on Sandy Bridge). Exposing the + microarchitecture allows the client to reject particular microarchitectures. +4. All flavors of MIPS are supported, little and big endian as well as 32/64 + bits. + + +## Android NDK's drop in replacement + +[cpu_features](https://github.com/google/cpu_features) is now officially +supporting Android and offers a drop-in replacement for the NDK's [cpu-features.h](https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h); +see the [ndk_compat](ndk_compat) folder for details. + + +## License + +The cpu_features library is licensed under the terms of the Apache license. +See [LICENSE](LICENSE) for more information. + + +## Build with CMake + +Please check the [CMake build instructions](cmake/README.md). + + +### Quickstart with `Ninja` + + - build `list_cpu_features` +``` + cmake -B/tmp/cpu_features -H. -GNinja -DCMAKE_BUILD_TYPE=Release + ninja -C/tmp/cpu_features + /tmp/cpu_features/list_cpu_features --json +``` + + - run tests +``` + cmake -B/tmp/cpu_features -H.
-GNinja -DBUILD_TESTING=ON + ninja -C/tmp/cpu_features + ninja -C/tmp/cpu_features test +``` diff --git a/cpu_features/WORKSPACE b/cpu_features/WORKSPACE new file mode 100644 index 0000000..8ea8a8b --- /dev/null +++ b/cpu_features/WORKSPACE @@ -0,0 +1,7 @@ +# ===== googletest ===== + +git_repository( + name = "com_google_googletest", + remote = "https://github.com/google/googletest.git", + commit = "c3f65335b79f47b05629e79a54685d899bc53b93", +) diff --git a/cpu_features/appveyor.yml b/cpu_features/appveyor.yml new file mode 100644 index 0000000..f18635a --- /dev/null +++ b/cpu_features/appveyor.yml @@ -0,0 +1,24 @@ +version: '{build}' +shallow_clone: true + +platform: x64 + +environment: + matrix: + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + CMAKE_GENERATOR: "Visual Studio 15 2017 Win64" + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + CMAKE_GENERATOR: "Visual Studio 14 2015 Win64" + +matrix: + fast_finish: true + +before_build: + - cmake --version + - cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -H. 
-Bcmake_build -G "%CMAKE_GENERATOR%" + +build_script: + - cmake --build cmake_build --config Debug --target ALL_BUILD + +test_script: + - cmake --build cmake_build --config Debug --target RUN_TESTS diff --git a/cpu_features/cmake/CpuFeaturesConfig.cmake.in b/cpu_features/cmake/CpuFeaturesConfig.cmake.in new file mode 100644 index 0000000..e0bf10e --- /dev/null +++ b/cpu_features/cmake/CpuFeaturesConfig.cmake.in @@ -0,0 +1,3 @@ +# CpuFeatures CMake configuration file + +include("${CMAKE_CURRENT_LIST_DIR}/CpuFeaturesTargets.cmake") diff --git a/cpu_features/cmake/CpuFeaturesNdkCompatConfig.cmake.in b/cpu_features/cmake/CpuFeaturesNdkCompatConfig.cmake.in new file mode 100644 index 0000000..5a53ffd --- /dev/null +++ b/cpu_features/cmake/CpuFeaturesNdkCompatConfig.cmake.in @@ -0,0 +1,3 @@ +# CpuFeaturesNdkCompat CMake configuration file + +include("${CMAKE_CURRENT_LIST_DIR}/CpuFeaturesNdkCompatTargets.cmake") diff --git a/cpu_features/cmake/README.md b/cpu_features/cmake/README.md new file mode 100644 index 0000000..b6baeaa --- /dev/null +++ b/cpu_features/cmake/README.md @@ -0,0 +1,28 @@ +# CMake build instructions + +## Recommended usage : Incorporating cpu_features into a CMake project + + For API / ABI compatibility reasons, it is recommended to build and use + cpu_features in a subdirectory of your project or as an embedded dependency. + + This is similar to the recommended usage of the googletest framework + ( https://github.com/google/googletest/blob/master/googletest/README.md ) + + Build and use step-by-step + + + 1- Download cpu_features and copy it in a sub-directory in your project. + or add cpu_features as a git-submodule in your project + + 2- You can then use the cmake command `add_subdirectory()` to include + cpu_features directly and use the `cpu_features` target in your project. + + 3- Add the `cpu_features` target to the `target_link_libraries()` section of + your executable or of your library. 
+ +## Enabling tests + + The default CMake configuration for cpu_features is the Release build type with tests + disabled. To enable testing, set the CMake `BUILD_TESTING` variable to `ON`; + [.travis.yml](../.travis.yml) and [appveyor.yml](../appveyor.yml) have up to + date examples. diff --git a/cpu_features/cmake/googletest.CMakeLists.txt.in b/cpu_features/cmake/googletest.CMakeLists.txt.in new file mode 100644 index 0000000..d60a33e --- /dev/null +++ b/cpu_features/cmake/googletest.CMakeLists.txt.in @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 2.8.2) + +project(googletest-download NONE) + +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG master + SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) \ No newline at end of file diff --git a/cpu_features/include/cpu_features_cache_info.h b/cpu_features/include/cpu_features_cache_info.h new file mode 100644 index 0000000..1a61ee1 --- /dev/null +++ b/cpu_features/include/cpu_features_cache_info.h @@ -0,0 +1,54 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_COMMON_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_COMMON_H_ + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef enum { + CPU_FEATURE_CACHE_NULL = 0, + CPU_FEATURE_CACHE_DATA = 1, + CPU_FEATURE_CACHE_INSTRUCTION = 2, + CPU_FEATURE_CACHE_UNIFIED = 3, + CPU_FEATURE_CACHE_TLB = 4, + CPU_FEATURE_CACHE_DTLB = 5, + CPU_FEATURE_CACHE_STLB = 6, + CPU_FEATURE_CACHE_PREFETCH = 7 +} CacheType; + +typedef struct { + int level; + CacheType cache_type; + int cache_size; // Cache size in bytes + int ways; // Associativity, 0 undefined, 0xFF fully associative + int line_size; // Cache line size in bytes + int tlb_entries; // number of entries for TLB + int partitioning; // number of lines per sector +} CacheLevelInfo; + +// Increase this value if more cache levels are needed. +#ifndef CPU_FEATURES_MAX_CACHE_LEVEL +#define CPU_FEATURES_MAX_CACHE_LEVEL 10 +#endif +typedef struct { + int size; + CacheLevelInfo levels[CPU_FEATURES_MAX_CACHE_LEVEL]; +} CacheInfo; + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_COMMON_H_ diff --git a/cpu_features/include/cpu_features_macros.h b/cpu_features/include/cpu_features_macros.h new file mode 100644 index 0000000..4b231a1 --- /dev/null +++ b/cpu_features/include/cpu_features_macros.h @@ -0,0 +1,216 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CPU_FEATURES_INCLUDE_CPU_FEATURES_MACROS_H_ +#define CPU_FEATURES_INCLUDE_CPU_FEATURES_MACROS_H_ + +//////////////////////////////////////////////////////////////////////////////// +// Architectures +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__pnacl__) || defined(__CLR_VER) +#define CPU_FEATURES_ARCH_VM +#endif + +#if (defined(_M_IX86) || defined(__i386__)) && !defined(CPU_FEATURES_ARCH_VM) +#define CPU_FEATURES_ARCH_X86_32 +#endif + +#if (defined(_M_X64) || defined(__x86_64__)) && !defined(CPU_FEATURES_ARCH_VM) +#define CPU_FEATURES_ARCH_X86_64 +#endif + +#if defined(CPU_FEATURES_ARCH_X86_32) || defined(CPU_FEATURES_ARCH_X86_64) +#define CPU_FEATURES_ARCH_X86 +#endif + +#if (defined(__arm__) || defined(_M_ARM)) +#define CPU_FEATURES_ARCH_ARM +#endif + +#if defined(__aarch64__) +#define CPU_FEATURES_ARCH_AARCH64 +#endif + +#if (defined(CPU_FEATURES_ARCH_AARCH64) || defined(CPU_FEATURES_ARCH_ARM)) +#define CPU_FEATURES_ARCH_ANY_ARM +#endif + +#if defined(__mips64) +#define CPU_FEATURES_ARCH_MIPS64 +#endif + +#if defined(__mips__) && !defined(__mips64) // mips64 also declares __mips__ +#define CPU_FEATURES_ARCH_MIPS32 +#endif + +#if defined(CPU_FEATURES_ARCH_MIPS32) || defined(CPU_FEATURES_ARCH_MIPS64) +#define CPU_FEATURES_ARCH_MIPS +#endif + +#if defined(__powerpc__) +#define CPU_FEATURES_ARCH_PPC +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Os +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__linux__) +#define CPU_FEATURES_OS_LINUX_OR_ANDROID +#endif + +#if defined(__ANDROID__) +#define CPU_FEATURES_OS_ANDROID +#endif + +#if (defined(_WIN64) || defined(_WIN32)) +#define CPU_FEATURES_OS_WINDOWS +#endif + +#if (defined(__apple__) || defined(__APPLE__) || defined(__MACH__)) +#define CPU_FEATURES_OS_DARWIN +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Compilers +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__clang__) +#define CPU_FEATURES_COMPILER_CLANG +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#define CPU_FEATURES_COMPILER_GCC +#endif + +#if defined(_MSC_VER) +#define CPU_FEATURES_COMPILER_MSC +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Cpp +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__cplusplus) +#define CPU_FEATURES_START_CPP_NAMESPACE \ + namespace cpu_features { \ + extern "C" { +#define CPU_FEATURES_END_CPP_NAMESPACE \ + } \ + } +#else +#define CPU_FEATURES_START_CPP_NAMESPACE +#define CPU_FEATURES_END_CPP_NAMESPACE +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Compiler flags +//////////////////////////////////////////////////////////////////////////////// + +// Use the following to check if a feature is known to be available at +// compile time. See README.md for an example. 
+#if defined(CPU_FEATURES_ARCH_X86) + +#if defined(__AES__) +#define CPU_FEATURES_COMPILED_X86_AES 1 +#else +#define CPU_FEATURES_COMPILED_X86_AES 0 +#endif // defined(__AES__) + +#if defined(__F16C__) +#define CPU_FEATURES_COMPILED_X86_F16C 1 +#else +#define CPU_FEATURES_COMPILED_X86_F16C 0 +#endif // defined(__F16C__) + +#if defined(__BMI__) +#define CPU_FEATURES_COMPILED_X86_BMI 1 +#else +#define CPU_FEATURES_COMPILED_X86_BMI 0 +#endif // defined(__BMI__) + +#if defined(__BMI2__) +#define CPU_FEATURES_COMPILED_X86_BMI2 1 +#else +#define CPU_FEATURES_COMPILED_X86_BMI2 0 +#endif // defined(__BMI2__) + +#if (defined(__SSE__) || (_M_IX86_FP >= 1)) +#define CPU_FEATURES_COMPILED_X86_SSE 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE 0 +#endif + +#if (defined(__SSE2__) || (_M_IX86_FP >= 2)) +#define CPU_FEATURES_COMPILED_X86_SSE2 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE2 0 +#endif + +#if defined(__SSE3__) +#define CPU_FEATURES_COMPILED_X86_SSE3 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE3 0 +#endif // defined(__SSE3__) + +#if defined(__SSSE3__) +#define CPU_FEATURES_COMPILED_X86_SSSE3 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSSE3 0 +#endif // defined(__SSSE3__) + +#if defined(__SSE4_1__) +#define CPU_FEATURES_COMPILED_X86_SSE4_1 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE4_1 0 +#endif // defined(__SSE4_1__) + +#if defined(__SSE4_2__) +#define CPU_FEATURES_COMPILED_X86_SSE4_2 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE4_2 0 +#endif // defined(__SSE4_2__) + +#if defined(__AVX__) +#define CPU_FEATURES_COMPILED_X86_AVX 1 +#else +#define CPU_FEATURES_COMPILED_X86_AVX 0 +#endif // defined(__AVX__) + +#if defined(__AVX2__) +#define CPU_FEATURES_COMPILED_X86_AVX2 1 +#else +#define CPU_FEATURES_COMPILED_X86_AVX2 0 +#endif // defined(__AVX2__) + +#endif // defined(CPU_FEATURES_ARCH_X86) + +#if defined(CPU_FEATURES_ARCH_ANY_ARM) +#if defined(__ARM_NEON__) +#define CPU_FEATURES_COMPILED_ANY_ARM_NEON 1 +#else +#define CPU_FEATURES_COMPILED_ANY_ARM_NEON 0 
+#endif // defined(__ARM_NEON__) +#endif // defined(CPU_FEATURES_ARCH_ANY_ARM) + +#if defined(CPU_FEATURES_ARCH_MIPS) +#if defined(__mips_msa) +#define CPU_FEATURES_COMPILED_MIPS_MSA 1 +#else +#define CPU_FEATURES_COMPILED_MIPS_MSA 0 +#endif // defined(__mips_msa) +#endif // defined(CPU_FEATURES_ARCH_MIPS) + +#endif // CPU_FEATURES_INCLUDE_CPU_FEATURES_MACROS_H_ diff --git a/cpu_features/include/cpuinfo_aarch64.h b/cpu_features/include/cpuinfo_aarch64.h new file mode 100644 index 0000000..d85d46d --- /dev/null +++ b/cpu_features/include/cpuinfo_aarch64.h @@ -0,0 +1,156 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_AARCH64_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_AARCH64_H_ + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + int fp : 1; // Floating-point. + int asimd : 1; // Advanced SIMD. + int evtstrm : 1; // Generic timer generated events. + int aes : 1; // Hardware-accelerated Advanced Encryption Standard. + int pmull : 1; // Polynomial multiply long. + int sha1 : 1; // Hardware-accelerated SHA1. + int sha2 : 1; // Hardware-accelerated SHA2-256. + int crc32 : 1; // Hardware-accelerated CRC-32. + int atomics : 1; // Armv8.1 atomic instructions. + int fphp : 1; // Half-precision floating point support. + int asimdhp : 1; // Advanced SIMD half-precision support. 
+ int cpuid : 1; // Access to certain ID registers. + int asimdrdm : 1; // Rounding Double Multiply Accumulate/Subtract. + int jscvt : 1; // Support for JavaScript conversion. + int fcma : 1; // Floating point complex numbers. + int lrcpc : 1; // Support for weaker release consistency. + int dcpop : 1; // Data persistence writeback. + int sha3 : 1; // Hardware-accelerated SHA3. + int sm3 : 1; // Hardware-accelerated SM3. + int sm4 : 1; // Hardware-accelerated SM4. + int asimddp : 1; // Dot product instruction. + int sha512 : 1; // Hardware-accelerated SHA512. + int sve : 1; // Scalable Vector Extension. + int asimdfhm : 1; // Additional half-precision instructions. + int dit : 1; // Data independent timing. + int uscat : 1; // Unaligned atomics support. + int ilrcpc : 1; // Additional support for weaker release consistency. + int flagm : 1; // Flag manipulation instructions. + int ssbs : 1; // Speculative Store Bypass Safe PSTATE bit. + int sb : 1; // Speculation barrier. + int paca : 1; // Address authentication. + int pacg : 1; // Generic authentication. + int dcpodp : 1; // Data cache clean to point of persistence. + int sve2 : 1; // Scalable Vector Extension (version 2). + int sveaes : 1; // SVE AES instructions. + int svepmull : 1; // SVE polynomial multiply long instructions. + int svebitperm : 1; // SVE bit permute instructions. + int svesha3 : 1; // SVE SHA3 instructions. + int svesm4 : 1; // SVE SM4 instructions. + int flagm2 : 1; // Additional flag manipulation instructions. + int frint : 1; // Floating point to integer rounding. + int svei8mm : 1; // SVE Int8 matrix multiplication instructions. + int svef32mm : 1; // SVE FP32 matrix multiplication instruction. + int svef64mm : 1; // SVE FP64 matrix multiplication instructions. + int svebf16 : 1; // SVE BFloat16 instructions. + int i8mm : 1; // Int8 matrix multiplication instructions. + int bf16 : 1; // BFloat16 instructions. + int dgh : 1; // Data Gathering Hint instruction. 
+ int rng : 1; // True random number generator support. + int bti : 1; // Branch target identification. + + // Make sure to update Aarch64FeaturesEnum below if you add a field here. +} Aarch64Features; + +typedef struct { + Aarch64Features features; + int implementer; + int variant; + int part; + int revision; +} Aarch64Info; + +Aarch64Info GetAarch64Info(void); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + AARCH64_FP, + AARCH64_ASIMD, + AARCH64_EVTSTRM, + AARCH64_AES, + AARCH64_PMULL, + AARCH64_SHA1, + AARCH64_SHA2, + AARCH64_CRC32, + AARCH64_ATOMICS, + AARCH64_FPHP, + AARCH64_ASIMDHP, + AARCH64_CPUID, + AARCH64_ASIMDRDM, + AARCH64_JSCVT, + AARCH64_FCMA, + AARCH64_LRCPC, + AARCH64_DCPOP, + AARCH64_SHA3, + AARCH64_SM3, + AARCH64_SM4, + AARCH64_ASIMDDP, + AARCH64_SHA512, + AARCH64_SVE, + AARCH64_ASIMDFHM, + AARCH64_DIT, + AARCH64_USCAT, + AARCH64_ILRCPC, + AARCH64_FLAGM, + AARCH64_SSBS, + AARCH64_SB, + AARCH64_PACA, + AARCH64_PACG, + AARCH64_DCPODP, + AARCH64_SVE2, + AARCH64_SVEAES, + AARCH64_SVEPMULL, + AARCH64_SVEBITPERM, + AARCH64_SVESHA3, + AARCH64_SVESM4, + AARCH64_FLAGM2, + AARCH64_FRINT, + AARCH64_SVEI8MM, + AARCH64_SVEF32MM, + AARCH64_SVEF64MM, + AARCH64_SVEBF16, + AARCH64_I8MM, + AARCH64_BF16, + AARCH64_DGH, + AARCH64_RNG, + AARCH64_BTI, + AARCH64_LAST_, +} Aarch64FeaturesEnum; + +int GetAarch64FeaturesEnumValue(const Aarch64Features* features, + Aarch64FeaturesEnum value); + +const char* GetAarch64FeaturesEnumName(Aarch64FeaturesEnum); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_AARCH64) +#error "Including cpuinfo_aarch64.h from a non-aarch64 target." 
+#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_AARCH64_H_ diff --git a/cpu_features/include/cpuinfo_arm.h b/cpu_features/include/cpuinfo_arm.h new file mode 100644 index 0000000..0952d7c --- /dev/null +++ b/cpu_features/include/cpuinfo_arm.h @@ -0,0 +1,121 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_ARM_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_ARM_H_ + +#include <stdint.h> // uint32_t + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + int swp : 1; // SWP instruction (atomic read-modify-write) + int half : 1; // Half-word loads and stores + int thumb : 1; // Thumb (16-bit instruction set) + int _26bit : 1; // "26 Bit" Model (Processor status register folded into + // program counter) + int fastmult : 1; // 32x32->64-bit multiplication + int fpa : 1; // Floating point accelerator + int vfp : 1; // Vector Floating Point. + int edsp : 1; // DSP extensions (the 'e' variant of the ARM9 CPUs, and all + // others above) + int java : 1; // Jazelle (Java bytecode accelerator) + int iwmmxt : 1; // Intel Wireless MMX Technology. + int crunch : 1; // MaverickCrunch coprocessor + int thumbee : 1; // ThumbEE + int neon : 1; // Advanced SIMD.
+ int vfpv3 : 1; // VFP version 3 + int vfpv3d16 : 1; // VFP version 3 with 16 D-registers + int tls : 1; // TLS register + int vfpv4 : 1; // VFP version 4 with fast context switching + int idiva : 1; // SDIV and UDIV hardware division in ARM mode. + int idivt : 1; // SDIV and UDIV hardware division in Thumb mode. + int vfpd32 : 1; // VFP with 32 D-registers + int lpae : 1; // Large Physical Address Extension (>4GB physical memory on + // 32-bit architecture) + int evtstrm : 1; // kernel event stream using generic architected timer + int aes : 1; // Hardware-accelerated Advanced Encryption Standard. + int pmull : 1; // Polynomial multiply long. + int sha1 : 1; // Hardware-accelerated SHA1. + int sha2 : 1; // Hardware-accelerated SHA2-256. + int crc32 : 1; // Hardware-accelerated CRC-32. + + // Make sure to update ArmFeaturesEnum below if you add a field here. +} ArmFeatures; + +typedef struct { + ArmFeatures features; + int implementer; + int architecture; + int variant; + int part; + int revision; +} ArmInfo; + +// TODO(user): Add macros to know which features are present at compile +// time. + +ArmInfo GetArmInfo(void); + +// Compute CpuId from ArmInfo. 
+uint32_t GetArmCpuId(const ArmInfo* const info); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + ARM_SWP, + ARM_HALF, + ARM_THUMB, + ARM_26BIT, + ARM_FASTMULT, + ARM_FPA, + ARM_VFP, + ARM_EDSP, + ARM_JAVA, + ARM_IWMMXT, + ARM_CRUNCH, + ARM_THUMBEE, + ARM_NEON, + ARM_VFPV3, + ARM_VFPV3D16, + ARM_TLS, + ARM_VFPV4, + ARM_IDIVA, + ARM_IDIVT, + ARM_VFPD32, + ARM_LPAE, + ARM_EVTSTRM, + ARM_AES, + ARM_PMULL, + ARM_SHA1, + ARM_SHA2, + ARM_CRC32, + ARM_LAST_, +} ArmFeaturesEnum; + +int GetArmFeaturesEnumValue(const ArmFeatures* features, ArmFeaturesEnum value); + +const char* GetArmFeaturesEnumName(ArmFeaturesEnum); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_ARM) +#error "Including cpuinfo_arm.h from a non-arm target." +#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_ARM_H_ diff --git a/cpu_features/include/cpuinfo_mips.h b/cpu_features/include/cpuinfo_mips.h new file mode 100644 index 0000000..9e5e7fc --- /dev/null +++ b/cpu_features/include/cpuinfo_mips.h @@ -0,0 +1,60 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_MIPS_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_MIPS_H_ + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + int msa : 1; // MIPS SIMD Architecture + // https://www.mips.com/products/architectures/ase/simd/ + int eva : 1; // Enhanced Virtual Addressing + // https://www.mips.com/products/architectures/mips64/ + int r6 : 1; // True if this is release 6 of the processor. + + // Make sure to update MipsFeaturesEnum below if you add a field here. +} MipsFeatures; + +typedef struct { + MipsFeatures features; +} MipsInfo; + +MipsInfo GetMipsInfo(void); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + MIPS_MSA, + MIPS_EVA, + MIPS_R6, + MIPS_LAST_, +} MipsFeaturesEnum; + +int GetMipsFeaturesEnumValue(const MipsFeatures* features, + MipsFeaturesEnum value); + +const char* GetMipsFeaturesEnumName(MipsFeaturesEnum); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_MIPS) +#error "Including cpuinfo_mips.h from a non-mips target." +#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_MIPS_H_ diff --git a/cpu_features/include/cpuinfo_ppc.h b/cpu_features/include/cpuinfo_ppc.h new file mode 100644 index 0000000..f691194 --- /dev/null +++ b/cpu_features/include/cpuinfo_ppc.h @@ -0,0 +1,146 @@ +// Copyright 2018 IBM +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_PPC_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_PPC_H_ + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" +#include "internal/hwcaps.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + int ppc32 : 1; + int ppc64 : 1; + int ppc601 : 1; + int altivec : 1; + int fpu : 1; + int mmu : 1; + int mac_4xx : 1; + int unifiedcache : 1; + int spe : 1; + int efpsingle : 1; + int efpdouble : 1; + int no_tb : 1; + int power4 : 1; + int power5 : 1; + int power5plus : 1; + int cell : 1; + int booke : 1; + int smt : 1; + int icachesnoop : 1; + int arch205 : 1; + int pa6t : 1; + int dfp : 1; + int power6ext : 1; + int arch206 : 1; + int vsx : 1; + int pseries_perfmon_compat : 1; + int truele : 1; + int ppcle : 1; + int arch207 : 1; + int htm : 1; + int dscr : 1; + int ebb : 1; + int isel : 1; + int tar : 1; + int vcrypto : 1; + int htm_nosc : 1; + int arch300 : 1; + int ieee128 : 1; + int darn : 1; + int scv : 1; + int htm_no_suspend : 1; + + // Make sure to update PPCFeaturesEnum below if you add a field here. +} PPCFeatures; + +typedef struct { + PPCFeatures features; +} PPCInfo; + +// This function is guaranteed to be malloc, memset and memcpy free. 
+PPCInfo GetPPCInfo(void); + +typedef struct { + char platform[64]; // 0 terminated string + char model[64]; // 0 terminated string + char machine[64]; // 0 terminated string + char cpu[64]; // 0 terminated string + PlatformType type; +} PPCPlatformStrings; + +PPCPlatformStrings GetPPCPlatformStrings(void); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + PPC_32, /* 32 bit mode execution */ + PPC_64, /* 64 bit mode execution */ + PPC_601_INSTR, /* Old POWER ISA */ + PPC_HAS_ALTIVEC, /* SIMD Unit */ + PPC_HAS_FPU, /* Floating Point Unit */ + PPC_HAS_MMU, /* Memory management unit */ + PPC_HAS_4xxMAC, + PPC_UNIFIED_CACHE, /* Unified instruction and data cache */ + PPC_HAS_SPE, /* Signal processing extension unit */ + PPC_HAS_EFP_SINGLE, /* SPE single precision FPU */ + PPC_HAS_EFP_DOUBLE, /* SPE double precision FPU */ + PPC_NO_TB, /* No timebase */ + PPC_POWER4, + PPC_POWER5, + PPC_POWER5_PLUS, + PPC_CELL, /* Cell broadband engine */ + PPC_BOOKE, /* Embedded ISA */ + PPC_SMT, /* Simultaneous multi-threading */ + PPC_ICACHE_SNOOP, + PPC_ARCH_2_05, /* ISA 2.05 - POWER6 */ + PPC_PA6T, /* PA Semi 6T core ISA */ + PPC_HAS_DFP, /* Decimal floating point unit */ + PPC_POWER6_EXT, + PPC_ARCH_2_06, /* ISA 2.06 - POWER7 */ + PPC_HAS_VSX, /* Vector-scalar extension */ + PPC_PSERIES_PERFMON_COMPAT, /* Set of backwards compatible performance + monitoring events */ + PPC_TRUE_LE, + PPC_PPC_LE, + PPC_ARCH_2_07, /* ISA 2.07 - POWER8 */ + PPC_HTM, /* Hardware Transactional Memory */ + PPC_DSCR, /* Data stream control register */ + PPC_EBB, /* Event base branching */ + PPC_ISEL, /* Integer select instructions */ + PPC_TAR, /* Target address register */ + PPC_VEC_CRYPTO, /* Vector cryptography instructions */ + PPC_HTM_NOSC, /* Transactions aborted when syscall made */ + PPC_ARCH_3_00, /* ISA 3.00 - POWER9 */ + PPC_HAS_IEEE128, /* VSX IEEE Binary Float 128-bit */ + PPC_DARN, /* Deliver a random number
instruction */ + PPC_SCV, /* scv syscall */ + PPC_HTM_NO_SUSPEND, /* TM without suspended state */ + PPC_LAST_, +} PPCFeaturesEnum; + +int GetPPCFeaturesEnumValue(const PPCFeatures* features, PPCFeaturesEnum value); + +const char* GetPPCFeaturesEnumName(PPCFeaturesEnum); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_PPC) +#error "Including cpuinfo_ppc.h from a non-ppc target." +#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_PPC_H_ diff --git a/cpu_features/include/cpuinfo_x86.h b/cpu_features/include/cpuinfo_x86.h new file mode 100644 index 0000000..8d40f71 --- /dev/null +++ b/cpu_features/include/cpuinfo_x86.h @@ -0,0 +1,231 @@ +// Copyright 2017 Google LLC +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_X86_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_X86_H_ + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +// See https://en.wikipedia.org/wiki/CPUID for a list of x86 CPU features. +// The field names are based on the short names provided in the Wikipedia tables.
+typedef struct { + int fpu : 1; + int tsc : 1; + int cx8 : 1; + int clfsh : 1; + int mmx : 1; + int aes : 1; + int erms : 1; + int f16c : 1; + int fma4 : 1; + int fma3 : 1; + int vaes : 1; + int vpclmulqdq : 1; + int bmi1 : 1; + int hle : 1; + int bmi2 : 1; + int rtm : 1; + int rdseed : 1; + int clflushopt : 1; + int clwb : 1; + + int sse : 1; + int sse2 : 1; + int sse3 : 1; + int ssse3 : 1; + int sse4_1 : 1; + int sse4_2 : 1; + int sse4a : 1; + + int avx : 1; + int avx2 : 1; + + int avx512f : 1; + int avx512cd : 1; + int avx512er : 1; + int avx512pf : 1; + int avx512bw : 1; + int avx512dq : 1; + int avx512vl : 1; + int avx512ifma : 1; + int avx512vbmi : 1; + int avx512vbmi2 : 1; + int avx512vnni : 1; + int avx512bitalg : 1; + int avx512vpopcntdq : 1; + int avx512_4vnniw : 1; + int avx512_4vbmi2 : 1; + int avx512_second_fma : 1; + int avx512_4fmaps : 1; + int avx512_bf16 : 1; + int avx512_vp2intersect : 1; + int amx_bf16 : 1; + int amx_tile : 1; + int amx_int8 : 1; + + int pclmulqdq : 1; + int smx : 1; + int sgx : 1; + int cx16 : 1; // aka. CMPXCHG16B + int sha : 1; + int popcnt : 1; + int movbe : 1; + int rdrnd : 1; + + int dca : 1; + int ss : 1; + // Make sure to update X86FeaturesEnum below if you add a field here. +} X86Features; + +typedef struct { + X86Features features; + int family; + int model; + int stepping; + char vendor[13]; // 0 terminated string +} X86Info; + +// Calls cpuid and returns an initialized X86Info. +// This function is guaranteed to be malloc, memset and memcpy free. +X86Info GetX86Info(void); + +// Returns cache hierarchy information. +// Can call cpuid multiple times. +// Only works on Intel CPUs at the moment. +// This function is guaranteed to be malloc, memset and memcpy free.
+CacheInfo GetX86CacheInfo(void); + +typedef enum { + X86_UNKNOWN, + INTEL_CORE, // CORE + INTEL_PNR, // PENRYN + INTEL_NHM, // NEHALEM + INTEL_ATOM_BNL, // BONNELL + INTEL_WSM, // WESTMERE + INTEL_SNB, // SANDYBRIDGE + INTEL_IVB, // IVYBRIDGE + INTEL_ATOM_SMT, // SILVERMONT + INTEL_HSW, // HASWELL + INTEL_BDW, // BROADWELL + INTEL_SKL, // SKYLAKE + INTEL_ATOM_GMT, // GOLDMONT + INTEL_KBL, // KABY LAKE + INTEL_CFL, // COFFEE LAKE + INTEL_WHL, // WHISKEY LAKE + INTEL_CNL, // CANNON LAKE + INTEL_ICL, // ICE LAKE + INTEL_TGL, // TIGER LAKE + INTEL_SPR, // SAPPHIRE RAPIDS + AMD_HAMMER, // K8 + AMD_K10, // K10 + AMD_BOBCAT, // K14 + AMD_BULLDOZER, // K15 + AMD_JAGUAR, // K16 + AMD_ZEN, // K17 +} X86Microarchitecture; + +// Returns the underlying microarchitecture by looking at X86Info's vendor, +// family and model. +X86Microarchitecture GetX86Microarchitecture(const X86Info* info); + +// Calls cpuid and fills the brand_string. +// - brand_string *must* be of size 49 (beware of array decaying). +// - brand_string will be zero terminated. +// - This function calls memcpy. 
+void FillX86BrandString(char brand_string[49]); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + X86_FPU, + X86_TSC, + X86_CX8, + X86_CLFSH, + X86_MMX, + X86_AES, + X86_ERMS, + X86_F16C, + X86_FMA4, + X86_FMA3, + X86_VAES, + X86_VPCLMULQDQ, + X86_BMI1, + X86_HLE, + X86_BMI2, + X86_RTM, + X86_RDSEED, + X86_CLFLUSHOPT, + X86_CLWB, + X86_SSE, + X86_SSE2, + X86_SSE3, + X86_SSSE3, + X86_SSE4_1, + X86_SSE4_2, + X86_SSE4A, + X86_AVX, + X86_AVX2, + X86_AVX512F, + X86_AVX512CD, + X86_AVX512ER, + X86_AVX512PF, + X86_AVX512BW, + X86_AVX512DQ, + X86_AVX512VL, + X86_AVX512IFMA, + X86_AVX512VBMI, + X86_AVX512VBMI2, + X86_AVX512VNNI, + X86_AVX512BITALG, + X86_AVX512VPOPCNTDQ, + X86_AVX512_4VNNIW, + X86_AVX512_4VBMI2, + X86_AVX512_SECOND_FMA, + X86_AVX512_4FMAPS, + X86_AVX512_BF16, + X86_AVX512_VP2INTERSECT, + X86_AMX_BF16, + X86_AMX_TILE, + X86_AMX_INT8, + X86_PCLMULQDQ, + X86_SMX, + X86_SGX, + X86_CX16, + X86_SHA, + X86_POPCNT, + X86_MOVBE, + X86_RDRND, + X86_DCA, + X86_SS, + X86_LAST_, +} X86FeaturesEnum; + +int GetX86FeaturesEnumValue(const X86Features* features, X86FeaturesEnum value); + +const char* GetX86FeaturesEnumName(X86FeaturesEnum); + +const char* GetX86MicroarchitectureName(X86Microarchitecture); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_X86) +#error "Including cpuinfo_x86.h from a non-x86 target." +#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_X86_H_ diff --git a/cpu_features/include/internal/bit_utils.h b/cpu_features/include/internal/bit_utils.h new file mode 100644 index 0000000..3467ff9 --- /dev/null +++ b/cpu_features/include/internal/bit_utils.h @@ -0,0 +1,40 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef CPU_FEATURES_INCLUDE_INTERNAL_BIT_UTILS_H_
+#define CPU_FEATURES_INCLUDE_INTERNAL_BIT_UTILS_H_
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "cpu_features_macros.h"
+
+CPU_FEATURES_START_CPP_NAMESPACE
+// Returns true when bit number `bit` of `reg` is set.
+inline static bool IsBitSet(uint32_t reg, uint32_t bit) {
+  return (reg >> bit) & 0x1;
+}
+// Returns bits lsb..msb (inclusive) of `reg`, shifted down to bit 0.
+inline static uint32_t ExtractBitRange(uint32_t reg, uint32_t msb,
+                                       uint32_t lsb) {
+  assert(msb >= lsb);  // Check first: msb < lsb makes the shift below invalid.
+  const uint64_t bits = msb - lsb + 1ULL;
+  const uint64_t mask = (1ULL << bits) - 1ULL;
+  return (reg >> lsb) & mask;
+}
+
+CPU_FEATURES_END_CPP_NAMESPACE
+
+#endif  // CPU_FEATURES_INCLUDE_INTERNAL_BIT_UTILS_H_
diff --git a/cpu_features/include/internal/cpuid_x86.h b/cpu_features/include/internal/cpuid_x86.h
new file mode 100644
index 0000000..33327a4
--- /dev/null
+++ b/cpu_features/include/internal/cpuid_x86.h
@@ -0,0 +1,37 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_CPUID_X86_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_CPUID_X86_H_ + +#include <stdint.h> + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +// A struct to hold the result of a call to cpuid. +typedef struct { + uint32_t eax, ebx, ecx, edx; +} Leaf; + +// Returns the result of a call to the cpuid instruction. +Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx); + +// Returns the eax value of the XCR0 register. +uint32_t GetXCR0Eax(void); + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_CPUID_X86_H_ diff --git a/cpu_features/include/internal/filesystem.h b/cpu_features/include/internal/filesystem.h new file mode 100644 index 0000000..d8f2f6a --- /dev/null +++ b/cpu_features/include/internal/filesystem.h @@ -0,0 +1,39 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// An interface for the filesystem that allows mocking the filesystem in +// unittests. +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_FILESYSTEM_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_FILESYSTEM_H_ + +#include <stddef.h> +#include <stdint.h> + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +// Same as linux "open(filename, O_RDONLY)", retries automatically on EINTR. +int CpuFeatures_OpenFile(const char* filename); + +// Same as linux "read(file_descriptor, buffer, buffer_size)", retries +// automatically on EINTR.
+int CpuFeatures_ReadFile(int file_descriptor, void* buffer, size_t buffer_size); + +// Same as linux "close(file_descriptor)". +void CpuFeatures_CloseFile(int file_descriptor); + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_FILESYSTEM_H_ diff --git a/cpu_features/include/internal/hwcaps.h b/cpu_features/include/internal/hwcaps.h new file mode 100644 index 0000000..62037c8 --- /dev/null +++ b/cpu_features/include/internal/hwcaps.h @@ -0,0 +1,186 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Interface to retrieve hardware capabilities. It relies on Linux's getauxval +// or `/proc/self/auxv` under the hood. +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_HWCAPS_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_HWCAPS_H_ + +#include <stdbool.h> +#include <stdint.h> + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +// To avoid depending on the linux kernel we reproduce the architecture specific +// constants here.
+ +// http://elixir.free-electrons.com/linux/latest/source/arch/arm64/include/uapi/asm/hwcap.h +#define AARCH64_HWCAP_FP (1UL << 0) +#define AARCH64_HWCAP_ASIMD (1UL << 1) +#define AARCH64_HWCAP_EVTSTRM (1UL << 2) +#define AARCH64_HWCAP_AES (1UL << 3) +#define AARCH64_HWCAP_PMULL (1UL << 4) +#define AARCH64_HWCAP_SHA1 (1UL << 5) +#define AARCH64_HWCAP_SHA2 (1UL << 6) +#define AARCH64_HWCAP_CRC32 (1UL << 7) +#define AARCH64_HWCAP_ATOMICS (1UL << 8) +#define AARCH64_HWCAP_FPHP (1UL << 9) +#define AARCH64_HWCAP_ASIMDHP (1UL << 10) +#define AARCH64_HWCAP_CPUID (1UL << 11) +#define AARCH64_HWCAP_ASIMDRDM (1UL << 12) +#define AARCH64_HWCAP_JSCVT (1UL << 13) +#define AARCH64_HWCAP_FCMA (1UL << 14) +#define AARCH64_HWCAP_LRCPC (1UL << 15) +#define AARCH64_HWCAP_DCPOP (1UL << 16) +#define AARCH64_HWCAP_SHA3 (1UL << 17) +#define AARCH64_HWCAP_SM3 (1UL << 18) +#define AARCH64_HWCAP_SM4 (1UL << 19) +#define AARCH64_HWCAP_ASIMDDP (1UL << 20) +#define AARCH64_HWCAP_SHA512 (1UL << 21) +#define AARCH64_HWCAP_SVE (1UL << 22) +#define AARCH64_HWCAP_ASIMDFHM (1UL << 23) +#define AARCH64_HWCAP_DIT (1UL << 24) +#define AARCH64_HWCAP_USCAT (1UL << 25) +#define AARCH64_HWCAP_ILRCPC (1UL << 26) +#define AARCH64_HWCAP_FLAGM (1UL << 27) +#define AARCH64_HWCAP_SSBS (1UL << 28) +#define AARCH64_HWCAP_SB (1UL << 29) +#define AARCH64_HWCAP_PACA (1UL << 30) +#define AARCH64_HWCAP_PACG (1UL << 31) + +#define AARCH64_HWCAP2_DCPODP (1UL << 0) +#define AARCH64_HWCAP2_SVE2 (1UL << 1) +#define AARCH64_HWCAP2_SVEAES (1UL << 2) +#define AARCH64_HWCAP2_SVEPMULL (1UL << 3) +#define AARCH64_HWCAP2_SVEBITPERM (1UL << 4) +#define AARCH64_HWCAP2_SVESHA3 (1UL << 5) +#define AARCH64_HWCAP2_SVESM4 (1UL << 6) +#define AARCH64_HWCAP2_FLAGM2 (1UL << 7) +#define AARCH64_HWCAP2_FRINT (1UL << 8) +#define AARCH64_HWCAP2_SVEI8MM (1UL << 9) +#define AARCH64_HWCAP2_SVEF32MM (1UL << 10) +#define AARCH64_HWCAP2_SVEF64MM (1UL << 11) +#define AARCH64_HWCAP2_SVEBF16 (1UL << 12) +#define AARCH64_HWCAP2_I8MM (1UL << 13) +#define 
AARCH64_HWCAP2_BF16 (1UL << 14) +#define AARCH64_HWCAP2_DGH (1UL << 15) +#define AARCH64_HWCAP2_RNG (1UL << 16) +#define AARCH64_HWCAP2_BTI (1UL << 17) + +// http://elixir.free-electrons.com/linux/latest/source/arch/arm/include/uapi/asm/hwcap.h +#define ARM_HWCAP_SWP (1UL << 0) +#define ARM_HWCAP_HALF (1UL << 1) +#define ARM_HWCAP_THUMB (1UL << 2) +#define ARM_HWCAP_26BIT (1UL << 3) +#define ARM_HWCAP_FAST_MULT (1UL << 4) +#define ARM_HWCAP_FPA (1UL << 5) +#define ARM_HWCAP_VFP (1UL << 6) +#define ARM_HWCAP_EDSP (1UL << 7) +#define ARM_HWCAP_JAVA (1UL << 8) +#define ARM_HWCAP_IWMMXT (1UL << 9) +#define ARM_HWCAP_CRUNCH (1UL << 10) +#define ARM_HWCAP_THUMBEE (1UL << 11) +#define ARM_HWCAP_NEON (1UL << 12) +#define ARM_HWCAP_VFPV3 (1UL << 13) +#define ARM_HWCAP_VFPV3D16 (1UL << 14) +#define ARM_HWCAP_TLS (1UL << 15) +#define ARM_HWCAP_VFPV4 (1UL << 16) +#define ARM_HWCAP_IDIVA (1UL << 17) +#define ARM_HWCAP_IDIVT (1UL << 18) +#define ARM_HWCAP_VFPD32 (1UL << 19) +#define ARM_HWCAP_LPAE (1UL << 20) +#define ARM_HWCAP_EVTSTRM (1UL << 21) +#define ARM_HWCAP2_AES (1UL << 0) +#define ARM_HWCAP2_PMULL (1UL << 1) +#define ARM_HWCAP2_SHA1 (1UL << 2) +#define ARM_HWCAP2_SHA2 (1UL << 3) +#define ARM_HWCAP2_CRC32 (1UL << 4) + +// http://elixir.free-electrons.com/linux/latest/source/arch/mips/include/uapi/asm/hwcap.h +#define MIPS_HWCAP_R6 (1UL << 0) +#define MIPS_HWCAP_MSA (1UL << 1) +#define MIPS_HWCAP_CRC32 (1UL << 2) + +// http://elixir.free-electrons.com/linux/latest/source/arch/powerpc/include/uapi/asm/cputable.h +#ifndef _UAPI__ASM_POWERPC_CPUTABLE_H +/* in AT_HWCAP */ +#define PPC_FEATURE_32 0x80000000 +#define PPC_FEATURE_64 0x40000000 +#define PPC_FEATURE_601_INSTR 0x20000000 +#define PPC_FEATURE_HAS_ALTIVEC 0x10000000 +#define PPC_FEATURE_HAS_FPU 0x08000000 +#define PPC_FEATURE_HAS_MMU 0x04000000 +#define PPC_FEATURE_HAS_4xxMAC 0x02000000 +#define PPC_FEATURE_UNIFIED_CACHE 0x01000000 +#define PPC_FEATURE_HAS_SPE 0x00800000 +#define PPC_FEATURE_HAS_EFP_SINGLE 
0x00400000 +#define PPC_FEATURE_HAS_EFP_DOUBLE 0x00200000 +#define PPC_FEATURE_NO_TB 0x00100000 +#define PPC_FEATURE_POWER4 0x00080000 +#define PPC_FEATURE_POWER5 0x00040000 +#define PPC_FEATURE_POWER5_PLUS 0x00020000 +#define PPC_FEATURE_CELL 0x00010000 +#define PPC_FEATURE_BOOKE 0x00008000 +#define PPC_FEATURE_SMT 0x00004000 +#define PPC_FEATURE_ICACHE_SNOOP 0x00002000 +#define PPC_FEATURE_ARCH_2_05 0x00001000 +#define PPC_FEATURE_PA6T 0x00000800 +#define PPC_FEATURE_HAS_DFP 0x00000400 +#define PPC_FEATURE_POWER6_EXT 0x00000200 +#define PPC_FEATURE_ARCH_2_06 0x00000100 +#define PPC_FEATURE_HAS_VSX 0x00000080 + +#define PPC_FEATURE_PSERIES_PERFMON_COMPAT 0x00000040 + +/* Reserved - do not use 0x00000004 */ +#define PPC_FEATURE_TRUE_LE 0x00000002 +#define PPC_FEATURE_PPC_LE 0x00000001 + +/* in AT_HWCAP2 */ +#define PPC_FEATURE2_ARCH_2_07 0x80000000 +#define PPC_FEATURE2_HTM 0x40000000 +#define PPC_FEATURE2_DSCR 0x20000000 +#define PPC_FEATURE2_EBB 0x10000000 +#define PPC_FEATURE2_ISEL 0x08000000 +#define PPC_FEATURE2_TAR 0x04000000 +#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#define PPC_FEATURE2_HTM_NOSC 0x01000000 +#define PPC_FEATURE2_ARCH_3_00 0x00800000 +#define PPC_FEATURE2_HAS_IEEE128 0x00400000 +#define PPC_FEATURE2_DARN 0x00200000 +#define PPC_FEATURE2_SCV 0x00100000 +#define PPC_FEATURE2_HTM_NO_SUSPEND 0x00080000 +#endif + +typedef struct { + unsigned long hwcaps; + unsigned long hwcaps2; +} HardwareCapabilities; + +HardwareCapabilities CpuFeatures_GetHardwareCapabilities(void); +bool CpuFeatures_IsHwCapsSet(const HardwareCapabilities hwcaps_mask, + const HardwareCapabilities hwcaps); + +typedef struct { + char platform[64]; // 0 terminated string + char base_platform[64]; // 0 terminated string +} PlatformType; + +PlatformType CpuFeatures_GetPlatformType(void); + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_HWCAPS_H_ diff --git a/cpu_features/include/internal/stack_line_reader.h 
b/cpu_features/include/internal/stack_line_reader.h new file mode 100644 index 0000000..39c1b8b --- /dev/null +++ b/cpu_features/include/internal/stack_line_reader.h @@ -0,0 +1,49 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Reads a file line by line and stores the data on the stack. This allows +// parsing files in one go without allocating. +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_STACK_LINE_READER_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_STACK_LINE_READER_H_ + +#include <stdbool.h> + +#include "cpu_features_macros.h" +#include "internal/string_view.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + char buffer[STACK_LINE_READER_BUFFER_SIZE]; + StringView view; + int fd; + bool skip_mode; +} StackLineReader; + +// Initializes a StackLineReader. +void StackLineReader_Initialize(StackLineReader* reader, int fd); + +typedef struct { + StringView line; // A view of the line. + bool eof; // Nothing more to read, we reached EOF. + bool full_line; // If false the line was truncated to + // STACK_LINE_READER_BUFFER_SIZE. +} LineResult; + +// Reads the file pointed to by fd and tries to read a full line.
+LineResult StackLineReader_NextLine(StackLineReader* reader); + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_STACK_LINE_READER_H_ diff --git a/cpu_features/include/internal/string_view.h b/cpu_features/include/internal/string_view.h new file mode 100644 index 0000000..64fed40 --- /dev/null +++ b/cpu_features/include/internal/string_view.h @@ -0,0 +1,109 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// A view over a piece of string. The view is not 0 terminated. +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_STRING_VIEW_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_STRING_VIEW_H_ + +#include <stdbool.h> +#include <stddef.h> +#include <string.h> + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + const char* ptr; + size_t size; +} StringView; + +#ifdef __cplusplus +static const StringView kEmptyStringView = {NULL, 0}; +#else +static const StringView kEmptyStringView; +#endif + +// Returns a StringView from the provided string. +// Passing NULL is valid only if size is 0. +static inline StringView view(const char* str, const size_t size) { + StringView view; + view.ptr = str; + view.size = size; + return view; +} + +static inline StringView str(const char* str) { return view(str, strlen(str)); } + +// Returns the index of the first occurrence of c in view or -1 if not found.
+int CpuFeatures_StringView_IndexOfChar(const StringView view, char c); + +// Returns the index of the first occurrence of sub_view in view or -1 if not +// found. +int CpuFeatures_StringView_IndexOf(const StringView view, + const StringView sub_view); + +// Returns whether a is equal to b (same content). +bool CpuFeatures_StringView_IsEquals(const StringView a, const StringView b); + +// Returns whether a starts with b. +bool CpuFeatures_StringView_StartsWith(const StringView a, const StringView b); + +// Removes count characters from the beginning of view or kEmptyStringView if +// count is greater than view.size. +StringView CpuFeatures_StringView_PopFront(const StringView str_view, + size_t count); + +// Removes count characters from the end of view or kEmptyStringView if count is +// greater than view.size. +StringView CpuFeatures_StringView_PopBack(const StringView str_view, + size_t count); + +// Keeps the count first characters of view or view if count is greater than +// view.size. +StringView CpuFeatures_StringView_KeepFront(const StringView str_view, + size_t count); + +// Retrieves the first character of view. If view is empty the behavior is +// undefined. +char CpuFeatures_StringView_Front(const StringView view); + +// Retrieves the last character of view. If view is empty the behavior is +// undefined. +char CpuFeatures_StringView_Back(const StringView view); + +// Removes leading and trailing space characters. +StringView CpuFeatures_StringView_TrimWhitespace(StringView view); + +// Convert StringView to positive integer. e.g. "42", "0x2a". +// Returns -1 on error. +int CpuFeatures_StringView_ParsePositiveNumber(const StringView view); + +// Copies src StringView to dst buffer. +void CpuFeatures_StringView_CopyString(const StringView src, char* dst, + size_t dst_size); + +// Checks if line contains the specified whitespace separated word.
+bool CpuFeatures_StringView_HasWord(const StringView line, + const char* const word); + +// Get key/value from line. key and value are separated by ": ". +// key and value are cleaned up from leading and trailing whitespaces. +bool CpuFeatures_StringView_GetAttributeKeyValue(const StringView line, + StringView* key, + StringView* value); + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_STRING_VIEW_H_ diff --git a/cpu_features/ndk_compat/CMakeLists.txt b/cpu_features/ndk_compat/CMakeLists.txt new file mode 100644 index 0000000..186708a --- /dev/null +++ b/cpu_features/ndk_compat/CMakeLists.txt @@ -0,0 +1,60 @@ + +# +# library : NDK compat +# +find_package(Threads REQUIRED) +set (NDK_COMPAT_HDRS cpu-features.h) +set (NDK_COMPAT_SRCS + cpu-features.c + $<TARGET_OBJECTS:utils> + $<TARGET_OBJECTS:unix_based_hardware_detection> +) +# Note that following `add_cpu_features_headers_and_sources` will use +# NDK_COMPAT_SRCS in lieu of NDK_COMPAT_HDRS because we don't want cpu_features +# headers to be installed alongside ndk_compat. +add_cpu_features_headers_and_sources(NDK_COMPAT_SRCS NDK_COMPAT_SRCS) +add_library(ndk_compat ${NDK_COMPAT_HDRS} ${NDK_COMPAT_SRCS}) +setup_include_and_definitions(ndk_compat) +target_include_directories(ndk_compat PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>) +target_link_libraries(ndk_compat PUBLIC ${CMAKE_DL_LIBS} ${CMAKE_THREAD_LIBS_INIT}) +set_target_properties(ndk_compat PROPERTIES PUBLIC_HEADER "${NDK_COMPAT_HDRS}") + +include(GNUInstallDirs) +install(TARGETS ndk_compat + EXPORT CpuFeaturesNdkCompatTargets + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ndk_compat + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) +install(EXPORT CpuFeaturesNdkCompatTargets + NAMESPACE CpuFeatures:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeaturesNdkCompat + COMPONENT Devel +) +include(CMakePackageConfigHelpers) +configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/CpuFeaturesNdkCompatConfig.cmake.in +
"${PROJECT_BINARY_DIR}/CpuFeaturesNdkCompatConfig.cmake" + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeaturesNdkCompat" + NO_SET_AND_CHECK_MACRO + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) +write_basic_package_version_file( + "${PROJECT_BINARY_DIR}/CpuFeaturesNdkCompatConfigVersion.cmake" + COMPATIBILITY SameMajorVersion +) +install( + FILES + "${PROJECT_BINARY_DIR}/CpuFeaturesNdkCompatConfig.cmake" + "${PROJECT_BINARY_DIR}/CpuFeaturesNdkCompatConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeaturesNdkCompat" + COMPONENT Devel +) + +# +# program : NDK compat test program +# +if(ENABLE_TESTING) + add_executable(ndk-compat-test ndk-compat-test.c) + target_link_libraries(ndk-compat-test PRIVATE ndk_compat) +endif() diff --git a/cpu_features/ndk_compat/README.md b/cpu_features/ndk_compat/README.md new file mode 100644 index 0000000..38c8393 --- /dev/null +++ b/cpu_features/ndk_compat/README.md @@ -0,0 +1,4 @@ +Provides a header compatible with [android's NDK cpu-features.h](https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h). + +It is intended to be a drop in replacement for this header and help users +transition from the NDK to [Google's cpu_features library](https://github.com/google/cpu_features). 
diff --git a/cpu_features/ndk_compat/cpu-features.c b/cpu_features/ndk_compat/cpu-features.c
new file mode 100644
index 0000000..27ff7bb
--- /dev/null
+++ b/cpu_features/ndk_compat/cpu-features.c
@@ -0,0 +1,230 @@
+#include "cpu-features.h"
+
+#include <pthread.h>
+
+#include "cpu_features_macros.h"
+#include "internal/filesystem.h"
+#include "internal/stack_line_reader.h"
+#include "internal/string_view.h"
+
+#if defined(CPU_FEATURES_ARCH_ARM)
+#include "cpuinfo_arm.h"
+#elif defined(CPU_FEATURES_ARCH_X86)
+#include "cpuinfo_x86.h"
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+#include "cpuinfo_mips.h"
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+#include "cpuinfo_aarch64.h"
+#endif
+
+// POSIX requires a static pthread_once_t to be set to PTHREAD_ONCE_INIT;
+// relying on zero-initialization is not portable.
+static pthread_once_t g_once = PTHREAD_ONCE_INIT;
+static int g_inited;
+static uint64_t g_cpuFeatures;
+static int g_cpuCount;
+
+#ifdef CPU_FEATURES_ARCH_ARM
+static uint32_t g_cpuIdArm;
+#endif
+
+// Sets bit `index` of *cpu_mask. Indices that do not fit the 32-bit mask are
+// ignored, because shifting by the type width or more is undefined behavior.
+static void set_cpu_mask_bit(uint32_t index, uint32_t* cpu_mask) {
+  if (index < 32) *cpu_mask |= (uint32_t)1 << index;
+}
+
+// Parses one cpulist entry, either a single index or an inclusive "a-b"
+// range, and sets the matching bits of *cpu_mask. Bad entries are ignored.
+// Examples of valid inputs: "31", "4-31"
+static void parse_cpu_mask(const StringView text, uint32_t* cpu_mask) {
+  int separator_index = CpuFeatures_StringView_IndexOfChar(text, '-');
+  if (separator_index < 0) {  // A single cpu index
+    int cpu_index = CpuFeatures_StringView_ParsePositiveNumber(text);
+    if (cpu_index < 0) return;
+    set_cpu_mask_bit(cpu_index, cpu_mask);
+  } else {
+    int cpu_index_a = CpuFeatures_StringView_ParsePositiveNumber(
+        CpuFeatures_StringView_KeepFront(text, separator_index));
+    int cpu_index_b = CpuFeatures_StringView_ParsePositiveNumber(
+        CpuFeatures_StringView_PopFront(text, separator_index + 1));
+    int i;
+    if (cpu_index_a < 0 || cpu_index_b < 0) return;
+    // Stop at 32: higher indices cannot be stored, and an unbounded loop
+    // would overflow `i` when cpu_index_b is INT_MAX.
+    for (i = cpu_index_a; i <= cpu_index_b && i < 32; ++i) {
+      set_cpu_mask_bit(i, cpu_mask);
+    }
+  }
+}
+
+// Parses a comma separated cpulist line into *cpu_mask. Nothing is parsed
+// when the reader reports a truncated line or EOF.
+// Format specification from
+// https://www.kernel.org/doc/Documentation/cputopology.txt
+// Examples of valid inputs: "31", "2,4-31,32-63", "0-1,3"
+static void parse_cpu_mask_line(const LineResult result, uint32_t* cpu_mask) {
+  if (!result.full_line || result.eof) return;
+  StringView line = result.line;
+  for (; line.size > 0;) {
+    int next_entry_index = CpuFeatures_StringView_IndexOfChar(line, ',');
+    if (next_entry_index < 0) {
+      parse_cpu_mask(line, cpu_mask);
+      break;
+    }
+    StringView entry = CpuFeatures_StringView_KeepFront(line, next_entry_index);
+    parse_cpu_mask(entry, cpu_mask);
+    line = CpuFeatures_StringView_PopFront(line, next_entry_index + 1);
+  }
+}
+
+// Reads the first line of `filename` and merges the cpus it lists into
+// *cpu_mask. The mask is left untouched when the file cannot be opened.
+static void update_cpu_mask_from_file(const char* filename,
+                                      uint32_t* cpu_mask) {
+  const int fd = CpuFeatures_OpenFile(filename);
+  if (fd >= 0) {
+    StackLineReader reader;
+    StackLineReader_Initialize(&reader, fd);
+    parse_cpu_mask_line(StackLineReader_NextLine(&reader), cpu_mask);
+    CpuFeatures_CloseFile(fd);
+  }
+}
+
+// Counts the cpus (indices 0 to 31 only) listed in the "present" or the
+// "possible" sysfs cpulist. Returns 0 when neither file yields an entry.
+static int get_cpu_count(void) {
+  uint32_t cpu_mask = 0;
+  update_cpu_mask_from_file("/sys/devices/system/cpu/present", &cpu_mask);
+  update_cpu_mask_from_file("/sys/devices/system/cpu/possible", &cpu_mask);
+  return __builtin_popcount(cpu_mask);
+}
+
+// One-time initializer run through pthread_once: detects the cpu count and
+// translates the cpu_features flags into ANDROID_CPU_*_FEATURE_* bits.
+static void android_cpuInit(void) {
+  g_cpuFeatures = 0;
+  g_cpuCount = 1;
+  g_inited = 1;
+
+  g_cpuCount = get_cpu_count();
+  if (g_cpuCount == 0) {
+    g_cpuCount = 1;
+  }
+#if defined(CPU_FEATURES_ARCH_ARM)
+  ArmInfo info = GetArmInfo();
+  if (info.architecture == 7) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_ARMv7;
+  if (info.features.vfpv3) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+  if (info.features.neon) {
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON;
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFP_D32;
+  }
+  // NOTE(review): vfpv3d16 is reported as VFP_FP16 here; confirm that this
+  // mapping is intended.
+  if (info.features.vfpv3d16) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFP_FP16;
+  if (info.features.idiva) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM;
+  if (info.features.idivt) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2;
+  if (info.features.iwmmxt) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_iWMMXt;
+  if (info.features.aes) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_AES;
+  if (info.features.pmull) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_PMULL;
+  if (info.features.sha1) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_SHA1;
+  if (info.features.sha2) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_SHA2;
+  if (info.features.crc32) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_CRC32;
+  if (info.architecture >= 6)
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_LDREX_STREX;
+  if (info.features.vfp) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2;
+  if (info.features.vfpv4) {
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFP_FMA;
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON_FMA;
+  }
+  g_cpuIdArm = GetArmCpuId(&info);
+#elif defined(CPU_FEATURES_ARCH_X86)
+  X86Info info = GetX86Info();
+  if (info.features.ssse3) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSSE3;
+  if (info.features.popcnt) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_POPCNT;
+  if (info.features.movbe) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_MOVBE;
+  if (info.features.sse4_1) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSE4_1;
+  if (info.features.sse4_2) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSE4_2;
+  if (info.features.aes) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AES_NI;
+  if (info.features.avx) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AVX;
+  if (info.features.rdrnd) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_RDRAND;
+  if (info.features.avx2) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AVX2;
+  if (info.features.sha) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SHA_NI;
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+  MipsInfo info = GetMipsInfo();
+  if (info.features.r6) g_cpuFeatures |= ANDROID_CPU_MIPS_FEATURE_R6;
+  if (info.features.msa) g_cpuFeatures |= ANDROID_CPU_MIPS_FEATURE_MSA;
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+  Aarch64Info info = GetAarch64Info();
+  if (info.features.fp) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_FP;
+  if (info.features.asimd) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_ASIMD;
+  if (info.features.aes) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_AES;
+  if (info.features.pmull) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_PMULL;
+  if (info.features.sha1) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_SHA1;
+  if (info.features.sha2) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_SHA2;
+  if (info.features.crc32) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_CRC32;
+#endif
+}
+
+// Returns the cpu family selected at compile time by the CPU_FEATURES_ARCH_*
+// macros.
+AndroidCpuFamily android_getCpuFamily(void) {
+#if defined(CPU_FEATURES_ARCH_ARM)
+  return ANDROID_CPU_FAMILY_ARM;
+#elif defined(CPU_FEATURES_ARCH_X86_32)
+  return ANDROID_CPU_FAMILY_X86;
+#elif defined(CPU_FEATURES_ARCH_MIPS64)
+  return ANDROID_CPU_FAMILY_MIPS64;
+#elif defined(CPU_FEATURES_ARCH_MIPS32)
+  return ANDROID_CPU_FAMILY_MIPS;
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+  return ANDROID_CPU_FAMILY_ARM64;
+#elif defined(CPU_FEATURES_ARCH_X86_64)
+  return ANDROID_CPU_FAMILY_X86_64;
+#else
+  return ANDROID_CPU_FAMILY_UNKNOWN;
+#endif
+}
+
+// Returns the feature bitmap, running the detection on first use.
+uint64_t android_getCpuFeatures(void) {
+  pthread_once(&g_once, android_cpuInit);
+  return g_cpuFeatures;
+}
+
+// Returns the cpu count, running the detection on first use.
+int android_getCpuCount(void) {
+  pthread_once(&g_once, android_cpuInit);
+  return g_cpuCount;
+}
+
+// Initializer used by android_setCpu to consume g_once without detection.
+static void android_cpuInitDummy(void) { g_inited = 1; }
+
+// Overrides the cpu count and feature bitmap. Returns 1 on success and 0
+// when the library was already initialized.
+int android_setCpu(int cpu_count, uint64_t cpu_features) {
+  /* Fail if the library was already initialized. */
+  if (g_inited) return 0;
+  g_cpuCount = (cpu_count <= 0 ? 1 : cpu_count);
+  g_cpuFeatures = cpu_features;
+  pthread_once(&g_once, android_cpuInitDummy);
+  return 1;
+}
+
+#ifdef CPU_FEATURES_ARCH_ARM
+
+// Returns the ARM cpu id, running the detection on first use.
+uint32_t android_getCpuIdArm(void) {
+  pthread_once(&g_once, android_cpuInit);
+  return g_cpuIdArm;
+}
+
+// Same as android_setCpu, additionally overriding the ARM cpu id.
+int android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id) {
+  if (!android_setCpu(cpu_count, cpu_features)) return 0;
+  g_cpuIdArm = cpu_id;
+  return 1;
+}
+
+#endif  // CPU_FEATURES_ARCH_ARM
diff --git a/cpu_features/ndk_compat/cpu-features.h b/cpu_features/ndk_compat/cpu-features.h
new file mode 100644
index 0000000..51bea53
--- /dev/null
+++ b/cpu_features/ndk_compat/cpu-features.h
@@ -0,0 +1,320 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef GOOGLE_CPU_FEATURES_H +#define GOOGLE_CPU_FEATURES_H +#include <sys/cdefs.h> +#include <stdint.h> + +__BEGIN_DECLS + +/* A list of valid values returned by android_getCpuFamily(). + * They describe the CPU Architecture of the current process. + */ +typedef enum { + ANDROID_CPU_FAMILY_UNKNOWN = 0, + ANDROID_CPU_FAMILY_ARM, + ANDROID_CPU_FAMILY_X86, + ANDROID_CPU_FAMILY_MIPS, + ANDROID_CPU_FAMILY_ARM64, + ANDROID_CPU_FAMILY_X86_64, + ANDROID_CPU_FAMILY_MIPS64, + ANDROID_CPU_FAMILY_MAX /* do not remove */ +} AndroidCpuFamily; + +/* Return the CPU family of the current process. + * + * Note that this matches the bitness of the current process. I.e.
when + * running a 32-bit binary on a 64-bit capable CPU, this will return the + * 32-bit CPU family value. + */ +extern AndroidCpuFamily android_getCpuFamily(void); + +/* Return a bitmap describing a set of optional CPU features that are + * supported by the current device's CPU. The exact bit-flags returned + * depend on the value returned by android_getCpuFamily(). See the + * documentation for the ANDROID_CPU_*_FEATURE_* flags below for details. + */ +extern uint64_t android_getCpuFeatures(void); + +/* The list of feature flags for ANDROID_CPU_FAMILY_ARM that can be + * recognized by the library (see note below for 64-bit ARM). Value details + * are: + * + * VFPv2: + * CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs + * support these instructions. VFPv2 is a subset of VFPv3 so this will + * be set whenever VFPv3 is set too. + * + * ARMv7: + * CPU supports the ARMv7-A basic instruction set. + * This feature is mandated by the 'armeabi-v7a' ABI. + * + * VFPv3: + * CPU supports the VFPv3-D16 instruction set, providing hardware FPU + * support for single and double precision floating point registers. + * Note that only 16 FPU registers are available by default, unless + * the D32 bit is set too. This feature is also mandated by the + * 'armeabi-v7a' ABI. + * + * VFP_D32: + * CPU VFP optional extension that provides 32 FPU registers, + * instead of 16. Note that ARM mandates this feature if the 'NEON' + * feature is implemented by the CPU. + * + * NEON: + * CPU FPU supports "ARM Advanced SIMD" instructions, also known as + * NEON. Note that this mandates the VFP_D32 feature as well, per the + * ARM Architecture specification. + * + * VFP_FP16: + * Half-width floating precision VFP extension. If set, the CPU + * supports instructions to perform floating-point operations on + * 16-bit registers. This is part of the VFPv4 specification, but + * not mandated by any Android ABI.
+ * + * VFP_FMA: + * Fused multiply-accumulate VFP instructions extension. Also part of + * the VFPv4 specification, but not mandated by any Android ABI. + * + * NEON_FMA: + * Fused multiply-accumulate NEON instructions extension. Optional + * extension from the VFPv4 specification, but not mandated by any + * Android ABI. + * + * IDIV_ARM: + * Integer division available in ARM mode. Only available + * on recent CPUs (e.g. Cortex-A15). + * + * IDIV_THUMB2: + * Integer division available in Thumb-2 mode. Only available + * on recent CPUs (e.g. Cortex-A15). + * + * iWMMXt: + * Optional extension that adds MMX registers and operations to an + * ARM CPU. This is only available on a few XScale-based CPU designs + * sold by Marvell. Pretty rare in practice. + * + * AES: + * CPU supports AES instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * CRC32: + * CPU supports CRC32 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * SHA2: + * CPU supports SHA2 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * SHA1: + * CPU supports SHA1 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * PMULL: + * CPU supports 64-bit PMULL and PMULL2 instructions. These + * instructions are only available for 32-bit applications + * running on ARMv8 CPU. + * + * If you want to tell the compiler to generate code that targets one of + * the feature set above, you should probably use one of the following + * flags (for more details, see technical note at the end of this file): + * + * -mfpu=vfp + * -mfpu=vfpv2 + * These are equivalent and tell GCC to use VFPv2 instructions for + * floating-point operations. Use this if you want your code to + * run on *some* ARMv6 devices, and any ARMv7-A device supported + * by Android. + * + * Generated code requires VFPv2 feature. 
+ * + * -mfpu=vfpv3-d16 + * Tell GCC to use VFPv3 instructions (using only 16 FPU registers). + * This should be generic code that runs on any CPU that supports the + * 'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this. + * + * Generated code requires VFPv3 feature. + * + * -mfpu=vfpv3 + * Tell GCC to use VFPv3 instructions with 32 FPU registers. + * Generated code requires VFPv3|VFP_D32 features. + * + * -mfpu=neon + * Tell GCC to use VFPv3 instructions with 32 FPU registers, and + * also support NEON intrinsics (see <arm_neon.h>). + * Generated code requires VFPv3|VFP_D32|NEON features. + * + * -mfpu=vfpv4-d16 + * Generated code requires VFPv3|VFP_FP16|VFP_FMA features. + * + * -mfpu=vfpv4 + * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features. + * + * -mfpu=neon-vfpv4 + * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA + * features. + * + * -mcpu=cortex-a7 + * -mcpu=cortex-a15 + * Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32| + * NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2 + * This flag implies -mfpu=neon-vfpv4. + * + * -mcpu=iwmmxt + * Allows the use of iWMMXt intrinsics with GCC. + * + * IMPORTANT NOTE: These flags should only be tested when + * android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM, i.e. this is a + * 32-bit process.
+ * + * When running a 64-bit ARM process on an ARMv8 CPU, + * android_getCpuFeatures() will return a different set of bitflags + */ +enum { + ANDROID_CPU_ARM_FEATURE_ARMv7 = (1 << 0), + ANDROID_CPU_ARM_FEATURE_VFPv3 = (1 << 1), + ANDROID_CPU_ARM_FEATURE_NEON = (1 << 2), + ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3), + ANDROID_CPU_ARM_FEATURE_VFPv2 = (1 << 4), + ANDROID_CPU_ARM_FEATURE_VFP_D32 = (1 << 5), + ANDROID_CPU_ARM_FEATURE_VFP_FP16 = (1 << 6), + ANDROID_CPU_ARM_FEATURE_VFP_FMA = (1 << 7), + ANDROID_CPU_ARM_FEATURE_NEON_FMA = (1 << 8), + ANDROID_CPU_ARM_FEATURE_IDIV_ARM = (1 << 9), + ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10), + ANDROID_CPU_ARM_FEATURE_iWMMXt = (1 << 11), + ANDROID_CPU_ARM_FEATURE_AES = (1 << 12), + ANDROID_CPU_ARM_FEATURE_PMULL = (1 << 13), + ANDROID_CPU_ARM_FEATURE_SHA1 = (1 << 14), + ANDROID_CPU_ARM_FEATURE_SHA2 = (1 << 15), + ANDROID_CPU_ARM_FEATURE_CRC32 = (1 << 16), +}; + +/* The bit flags corresponding to the output of android_getCpuFeatures() + * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM64. Value details + * are: + * + * FP: + * CPU has Floating-point unit. + * + * ASIMD: + * CPU has Advanced SIMD unit. + * + * AES: + * CPU supports AES instructions. + * + * CRC32: + * CPU supports CRC32 instructions. + * + * SHA2: + * CPU supports SHA2 instructions. + * + * SHA1: + * CPU supports SHA1 instructions. + * + * PMULL: + * CPU supports 64-bit PMULL and PMULL2 instructions. + */ +enum { + ANDROID_CPU_ARM64_FEATURE_FP = (1 << 0), + ANDROID_CPU_ARM64_FEATURE_ASIMD = (1 << 1), + ANDROID_CPU_ARM64_FEATURE_AES = (1 << 2), + ANDROID_CPU_ARM64_FEATURE_PMULL = (1 << 3), + ANDROID_CPU_ARM64_FEATURE_SHA1 = (1 << 4), + ANDROID_CPU_ARM64_FEATURE_SHA2 = (1 << 5), + ANDROID_CPU_ARM64_FEATURE_CRC32 = (1 << 6), +}; + +/* The bit flags corresponding to the output of android_getCpuFeatures() + * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_X86 or + * ANDROID_CPU_FAMILY_X86_64. 
+ */ +enum { + ANDROID_CPU_X86_FEATURE_SSSE3 = (1 << 0), + ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1), + ANDROID_CPU_X86_FEATURE_MOVBE = (1 << 2), + ANDROID_CPU_X86_FEATURE_SSE4_1 = (1 << 3), + ANDROID_CPU_X86_FEATURE_SSE4_2 = (1 << 4), + ANDROID_CPU_X86_FEATURE_AES_NI = (1 << 5), + ANDROID_CPU_X86_FEATURE_AVX = (1 << 6), + ANDROID_CPU_X86_FEATURE_RDRAND = (1 << 7), + ANDROID_CPU_X86_FEATURE_AVX2 = (1 << 8), + ANDROID_CPU_X86_FEATURE_SHA_NI = (1 << 9), +}; + +/* The bit flags corresponding to the output of android_getCpuFeatures() + * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_MIPS + * or ANDROID_CPU_FAMILY_MIPS64. Values are: + * + * R6: + * CPU executes MIPS Release 6 instructions natively, and + * supports obsoleted R1..R5 instructions only via kernel traps. + * + * MSA: + * CPU supports Mips SIMD Architecture instructions. + */ +enum { + ANDROID_CPU_MIPS_FEATURE_R6 = (1 << 0), + ANDROID_CPU_MIPS_FEATURE_MSA = (1 << 1), +}; + +/* Return the number of CPU cores detected on this device. + * Please note the current implementation supports up to 32 cpus. + */ +extern int android_getCpuCount(void); + +/* The following is used to force the CPU count and features + * mask in sandboxed processes. Under 4.1 and higher, these processes + * cannot access /proc, which is the only way to get information from + * the kernel about the current hardware (at least on ARM). + * + * It _must_ be called only once, and before any android_getCpuXXX + * function, any other case will fail. + * + * This function returns 1 on success, and 0 on failure. + */ +extern int android_setCpu(int cpu_count, uint64_t cpu_features); + +#ifdef __arm__ + +/* Retrieve the ARM 32-bit CPUID value from the kernel. + * Note that this cannot work on sandboxed processes under 4.1 and + * higher, unless you called android_setCpuArm() before. + */ +extern uint32_t android_getCpuIdArm(void); + +/* An ARM-specific variant of android_setCpu() that also allows you + * to set the ARM CPUID field.
+ */ +extern int android_setCpuArm(int cpu_count, uint64_t cpu_features, + uint32_t cpu_id); + +#endif + +__END_DECLS +#endif /* GOOGLE_CPU_FEATURES_H */ diff --git a/cpu_features/ndk_compat/ndk-compat-test.c b/cpu_features/ndk_compat/ndk-compat-test.c new file mode 100644 index 0000000..e4005d4 --- /dev/null +++ b/cpu_features/ndk_compat/ndk-compat-test.c @@ -0,0 +1,12 @@ +#include <stdio.h> + +#include "cpu-features.h" + +int main() { + printf("android_getCpuFamily()=%d\n", android_getCpuFamily()); + printf("android_getCpuFeatures()=0x%08llx\n", (unsigned long long)android_getCpuFeatures()); + printf("android_getCpuCount()=%d\n", android_getCpuCount()); +#ifdef __arm__ + printf("android_getCpuIdArm()=0x%04x\n", android_getCpuIdArm()); +#endif //__arm__ +} diff --git a/cpu_features/scripts/run_integration.sh b/cpu_features/scripts/run_integration.sh new file mode 100755 index 0000000..fd88d60 --- /dev/null +++ b/cpu_features/scripts/run_integration.sh @@ -0,0 +1,209 @@ +#!/usr/bin/env bash + +readonly SCRIPT_FOLDER=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +readonly PROJECT_FOLDER="${SCRIPT_FOLDER}/.." +readonly ARCHIVE_FOLDER=~/cpu_features_archives +readonly QEMU_INSTALL=${ARCHIVE_FOLDER}/qemu +readonly DEFAULT_CMAKE_ARGS=" -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON" + +function extract() { + case $1 in + *.tar.bz2) tar xjf "$1" ;; + *.tar.xz) tar xJf "$1" ;; + *.tar.gz) tar xzf "$1" ;; + *) + echo "don't know how to extract '$1'..." + exit 1 + esac +} + +function unpackifnotexists() { + mkdir -p "${ARCHIVE_FOLDER}" + cd "${ARCHIVE_FOLDER}" || exit + local URL=$1 + local RELATIVE_FOLDER=$2 + local DESTINATION="${ARCHIVE_FOLDER}/${RELATIVE_FOLDER}" + if [[ !
-d "${DESTINATION}" ]] ; then + local ARCHIVE_NAME=$(echo ${URL} | sed 's/.*\///') + test -f "${ARCHIVE_NAME}" || wget -q "${URL}" + extract "${ARCHIVE_NAME}" + rm -f "${ARCHIVE_NAME}" + fi +} + +function installqemuifneeded() { + local VERSION=${QEMU_VERSION:=2.11.1} + local ARCHES=${QEMU_ARCHES:=arm aarch64 i386 x86_64 mips mipsel mips64 mips64el} + local TARGETS=${QEMU_TARGETS:=$(echo "$ARCHES" | sed 's#$# #;s#\([^ ]*\) #\1-linux-user #g')} + + if echo "${VERSION} ${TARGETS}" | cmp --silent ${QEMU_INSTALL}/.build -; then + echo "qemu ${VERSION} up to date!" + return 0 + fi + + echo "VERSION: ${VERSION}" + echo "TARGETS: ${TARGETS}" + + rm -rf ${QEMU_INSTALL} + + # Checking for a tarball before downloading makes testing easier :-) + local QEMU_URL="http://wiki.qemu-project.org/download/qemu-${VERSION}.tar.xz" + local QEMU_FOLDER="qemu-${VERSION}" + unpackifnotexists ${QEMU_URL} ${QEMU_FOLDER} + cd ${QEMU_FOLDER} || exit + + ./configure \ + --prefix="${QEMU_INSTALL}" \ + --target-list="${TARGETS}" \ + --disable-docs \ + --disable-sdl \ + --disable-gtk \ + --disable-gnutls \ + --disable-gcrypt \ + --disable-nettle \ + --disable-curses \ + --static + + make -j4 + make install + + echo "$VERSION $TARGETS" > ${QEMU_INSTALL}/.build +} + +function assert_defined(){ + local VALUE=${1} + : "${!VALUE?"${1} needs to be defined"}" +} + +function integrate() { + cd "${PROJECT_FOLDER}" + case "${OS}" in + "Windows_NT") CMAKE_BUILD_ARGS="--config Debug --target ALL_BUILD" + CMAKE_TEST_FILES="${BUILD_DIR}/test/Debug/*_test.exe" + DEMO=${BUILD_DIR}/Debug/list_cpu_features.exe + ;; + *) CMAKE_BUILD_ARGS="--target all" + CMAKE_TEST_FILES="${BUILD_DIR}/test/*_test" + DEMO=${BUILD_DIR}/list_cpu_features + ;; + esac + + # Generating CMake configuration + cmake -H.
-B"${BUILD_DIR}" ${DEFAULT_CMAKE_ARGS} "${CMAKE_ADDITIONAL_ARGS[@]}" -G"${CMAKE_GENERATOR:-Unix Makefiles}" + + # Building + cmake --build "${BUILD_DIR}" ${CMAKE_BUILD_ARGS} + + # Running tests if needed + if [[ "${QEMU_ARCH}" == "DISABLED" ]]; then + return + fi + RUN_CMD="" + if [[ -n "${QEMU_ARCH}" ]]; then + installqemuifneeded + RUN_CMD="${QEMU_INSTALL}/bin/qemu-${QEMU_ARCH} ${QEMU_ARGS[@]}" + fi + for test_binary in ${CMAKE_TEST_FILES}; do + ${RUN_CMD} ${test_binary} + done + ${RUN_CMD} ${DEMO} +} + +function expand_linaro_config() { + assert_defined TARGET + local LINARO_ROOT_URL=https://releases.linaro.org/components/toolchain/binaries/7.2-2017.11 + + local GCC_URL=${LINARO_ROOT_URL}/${TARGET}/gcc-linaro-7.2.1-2017.11-x86_64_${TARGET}.tar.xz + local GCC_RELATIVE_FOLDER="gcc-linaro-7.2.1-2017.11-x86_64_${TARGET}" + unpackifnotexists "${GCC_URL}" "${GCC_RELATIVE_FOLDER}" + + local SYSROOT_URL=${LINARO_ROOT_URL}/${TARGET}/sysroot-glibc-linaro-2.25-2017.11-${TARGET}.tar.xz + local SYSROOT_RELATIVE_FOLDER=sysroot-glibc-linaro-2.25-2017.11-${TARGET} + unpackifnotexists "${SYSROOT_URL}" "${SYSROOT_RELATIVE_FOLDER}" + + local SYSROOT_FOLDER=${ARCHIVE_FOLDER}/${SYSROOT_RELATIVE_FOLDER} + local GCC_FOLDER=${ARCHIVE_FOLDER}/${GCC_RELATIVE_FOLDER} + + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSTEM_NAME=Linux) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSTEM_PROCESSOR=${TARGET}) + + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSROOT=${SYSROOT_FOLDER}) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_C_COMPILER=${GCC_FOLDER}/bin/${TARGET}-gcc) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_CXX_COMPILER=${GCC_FOLDER}/bin/${TARGET}-g++) + + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=ONLY) + + QEMU_ARGS+=(-L ${SYSROOT_FOLDER}) + QEMU_ARGS+=(-E LD_LIBRARY_PATH=/lib) +} + +function expand_codescape_config() { + assert_defined TARGET + local DATE=2017.10-08 + local 
CODESCAPE_URL=https://codescape.mips.com/components/toolchain/${DATE}/Codescape.GNU.Tools.Package.${DATE}.for.MIPS.MTI.Linux.CentOS-5.x86_64.tar.gz + local GCC_URL=${CODESCAPE_URL} + local GCC_RELATIVE_FOLDER="mips-mti-linux-gnu/${DATE}" + unpackifnotexists "${GCC_URL}" "${GCC_RELATIVE_FOLDER}" + + local GCC_FOLDER=${ARCHIVE_FOLDER}/${GCC_RELATIVE_FOLDER} + local MIPS_FLAGS="" + local LIBC_FOLDER_SUFFIX="" + local FLAVOUR="" + case "${TARGET}" in + "mips32") MIPS_FLAGS="-EB -mabi=32"; FLAVOUR="mips-r2-hard"; LIBC_FOLDER_SUFFIX="lib" ;; + "mips32el") MIPS_FLAGS="-EL -mabi=32"; FLAVOUR="mipsel-r2-hard"; LIBC_FOLDER_SUFFIX="lib" ;; + "mips64") MIPS_FLAGS="-EB -mabi=64"; FLAVOUR="mips-r2-hard"; LIBC_FOLDER_SUFFIX="lib64" ;; + "mips64el") MIPS_FLAGS="-EL -mabi=64"; FLAVOUR="mipsel-r2-hard"; LIBC_FOLDER_SUFFIX="lib64" ;; + *) echo 'unknown mips platform'; exit 1;; + esac + + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_FIND_ROOT_PATH=${GCC_FOLDER}) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSTEM_NAME=Linux) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSTEM_PROCESSOR=${TARGET}) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_C_COMPILER=mips-mti-linux-gnu-gcc) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_CXX_COMPILER=mips-mti-linux-gnu-g++) + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_C_COMPILER_ARG1="${MIPS_FLAGS}") + CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_CXX_COMPILER_ARG1="${MIPS_FLAGS}") + + local SYSROOT_FOLDER=${GCC_FOLDER}/sysroot/${FLAVOUR} + + # Keeping only the sysroot of interest to save on travis cache. 
+ if [[ "${CONTINUOUS_INTEGRATION}" = "true" ]]; then + for folder in ${GCC_FOLDER}/sysroot/*; do + if [[ "${folder}" != "${SYSROOT_FOLDER}" ]]; then + rm -rf ${folder} + fi + done + fi + + local LIBC_FOLDER=${GCC_FOLDER}/mips-mti-linux-gnu/lib/${FLAVOUR}/${LIBC_FOLDER_SUFFIX} + QEMU_ARGS+=(-L ${SYSROOT_FOLDER}) + QEMU_ARGS+=(-E LD_PRELOAD=${LIBC_FOLDER}/libstdc++.so.6:${LIBC_FOLDER}/libgcc_s.so.1) +} + +function expand_environment_and_integrate() { + assert_defined PROJECT_FOLDER + assert_defined TARGET + + BUILD_DIR="${PROJECT_FOLDER}/cmake_build/${TARGET}" + mkdir -p "${BUILD_DIR}" + + declare -a CONFIG_NAMES=() + declare -a QEMU_ARGS=() + declare -a CMAKE_ADDITIONAL_ARGS=() + + case ${TOOLCHAIN} in + LINARO) expand_linaro_config ;; + CODESCAPE) expand_codescape_config ;; + NATIVE) QEMU_ARCH="" ;; + *) echo "Unknown toolchain '${TOOLCHAIN}'..."; exit 1;; + esac + integrate +} + +if [ "${CONTINUOUS_INTEGRATION}" = "true" ]; then + QEMU_ARCHES=${QEMU_ARCH} + expand_environment_and_integrate +fi diff --git a/cpu_features/scripts/test_integration.sh b/cpu_features/scripts/test_integration.sh new file mode 100755 index 0000000..d1c61b0 --- /dev/null +++ b/cpu_features/scripts/test_integration.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash + +source "$(dirname -- "$0")"/run_integration.sh + +# Toolchains for little-endian, 64-bit ARMv8 for GNU/Linux systems +function set_aarch64-linux-gnu() { + TOOLCHAIN=LINARO + TARGET=aarch64-linux-gnu + QEMU_ARCH=aarch64 +} + +# Toolchains for little-endian, hard-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems +function set_arm-linux-gnueabihf() { + TOOLCHAIN=LINARO + TARGET=arm-linux-gnueabihf + QEMU_ARCH=arm +} + +# Toolchains for little-endian, 32-bit ARMv8 for GNU/Linux systems +function set_armv8l-linux-gnueabihf() { + TOOLCHAIN=LINARO + TARGET=armv8l-linux-gnueabihf + QEMU_ARCH=arm +} + +# Toolchains for little-endian, soft-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems +function set_arm-linux-gnueabi() { + 
TOOLCHAIN=LINARO + TARGET=arm-linux-gnueabi + QEMU_ARCH=arm +} + +# Toolchains for big-endian, 64-bit ARMv8 for GNU/Linux systems +function set_aarch64_be-linux-gnu() { + TOOLCHAIN=LINARO + TARGET=aarch64_be-linux-gnu + QEMU_ARCH=DISABLED +} + +# Toolchains for big-endian, hard-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems +function set_armeb-linux-gnueabihf() { + TOOLCHAIN=LINARO + TARGET=armeb-linux-gnueabihf + QEMU_ARCH=DISABLED +} + +# Toolchains for big-endian, soft-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems +function set_armeb-linux-gnueabi() { + TOOLCHAIN=LINARO + TARGET=armeb-linux-gnueabi + QEMU_ARCH=DISABLED +} + +function set_mips32() { + TOOLCHAIN=CODESCAPE + TARGET=mips32 + QEMU_ARCH=mips +} + +function set_mips32el() { + TOOLCHAIN=CODESCAPE + TARGET=mips32el + QEMU_ARCH=mipsel +} + +function set_mips64() { + TOOLCHAIN=CODESCAPE + TARGET=mips64 + QEMU_ARCH=mips64 +} + +function set_mips64el() { + TOOLCHAIN=CODESCAPE + TARGET=mips64el + QEMU_ARCH=mips64el +} + +function set_native() { + TOOLCHAIN=NATIVE + TARGET=native + QEMU_ARCH="" +} + +ENVIRONMENTS=" + set_aarch64-linux-gnu + set_arm-linux-gnueabihf + set_armv8l-linux-gnueabihf + set_arm-linux-gnueabi + set_aarch64_be-linux-gnu + set_armeb-linux-gnueabihf + set_armeb-linux-gnueabi + set_mips32 + set_mips32el + set_mips64 + set_mips64el + set_native +" + +set -e + +CMAKE_GENERATOR="Ninja" + +for SET_ENVIRONMENT in ${ENVIRONMENTS}; do + ${SET_ENVIRONMENT} + expand_environment_and_integrate +done diff --git a/cpu_features/src/cpuinfo_aarch64.c b/cpu_features/src/cpuinfo_aarch64.c new file mode 100644 index 0000000..0a52718 --- /dev/null +++ b/cpu_features/src/cpuinfo_aarch64.c @@ -0,0 +1,150 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_aarch64.h" + +#include +#include + +#include "internal/filesystem.h" +#include "internal/hwcaps.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags and kHardwareCapabilities global tables. +#define DEFINE_TABLE_FEATURES \ + FEATURE(AARCH64_FP, fp, "fp", AARCH64_HWCAP_FP, 0) \ + FEATURE(AARCH64_ASIMD, asimd, "asimd", AARCH64_HWCAP_ASIMD, 0) \ + FEATURE(AARCH64_EVTSTRM, evtstrm, "evtstrm", AARCH64_HWCAP_EVTSTRM, 0) \ + FEATURE(AARCH64_AES, aes, "aes", AARCH64_HWCAP_AES, 0) \ + FEATURE(AARCH64_PMULL, pmull, "pmull", AARCH64_HWCAP_PMULL, 0) \ + FEATURE(AARCH64_SHA1, sha1, "sha1", AARCH64_HWCAP_SHA1, 0) \ + FEATURE(AARCH64_SHA2, sha2, "sha2", AARCH64_HWCAP_SHA2, 0) \ + FEATURE(AARCH64_CRC32, crc32, "crc32", AARCH64_HWCAP_CRC32, 0) \ + FEATURE(AARCH64_ATOMICS, atomics, "atomics", AARCH64_HWCAP_ATOMICS, 0) \ + FEATURE(AARCH64_FPHP, fphp, "fphp", AARCH64_HWCAP_FPHP, 0) \ + FEATURE(AARCH64_ASIMDHP, asimdhp, "asimdhp", AARCH64_HWCAP_ASIMDHP, 0) \ + FEATURE(AARCH64_CPUID, cpuid, "cpuid", AARCH64_HWCAP_CPUID, 0) \ + FEATURE(AARCH64_ASIMDRDM, asimdrdm, "asimdrdm", AARCH64_HWCAP_ASIMDRDM, 0) \ + FEATURE(AARCH64_JSCVT, jscvt, "jscvt", AARCH64_HWCAP_JSCVT, 0) \ + FEATURE(AARCH64_FCMA, fcma, "fcma", AARCH64_HWCAP_FCMA, 0) \ + FEATURE(AARCH64_LRCPC, lrcpc, "lrcpc", AARCH64_HWCAP_LRCPC, 0) \ + FEATURE(AARCH64_DCPOP, dcpop, "dcpop", AARCH64_HWCAP_DCPOP, 0) \ + FEATURE(AARCH64_SHA3, sha3, "sha3", AARCH64_HWCAP_SHA3, 0) \ + 
FEATURE(AARCH64_SM3, sm3, "sm3", AARCH64_HWCAP_SM3, 0) \ + FEATURE(AARCH64_SM4, sm4, "sm4", AARCH64_HWCAP_SM4, 0) \ + FEATURE(AARCH64_ASIMDDP, asimddp, "asimddp", AARCH64_HWCAP_ASIMDDP, 0) \ + FEATURE(AARCH64_SHA512, sha512, "sha512", AARCH64_HWCAP_SHA512, 0) \ + FEATURE(AARCH64_SVE, sve, "sve", AARCH64_HWCAP_SVE, 0) \ + FEATURE(AARCH64_ASIMDFHM, asimdfhm, "asimdfhm", AARCH64_HWCAP_ASIMDFHM, 0) \ + FEATURE(AARCH64_DIT, dit, "dit", AARCH64_HWCAP_DIT, 0) \ + FEATURE(AARCH64_USCAT, uscat, "uscat", AARCH64_HWCAP_USCAT, 0) \ + FEATURE(AARCH64_ILRCPC, ilrcpc, "ilrcpc", AARCH64_HWCAP_ILRCPC, 0) \ + FEATURE(AARCH64_FLAGM, flagm, "flagm", AARCH64_HWCAP_FLAGM, 0) \ + FEATURE(AARCH64_SSBS, ssbs, "ssbs", AARCH64_HWCAP_SSBS, 0) \ + FEATURE(AARCH64_SB, sb, "sb", AARCH64_HWCAP_SB, 0) \ + FEATURE(AARCH64_PACA, paca, "paca", AARCH64_HWCAP_PACA, 0) \ + FEATURE(AARCH64_PACG, pacg, "pacg", AARCH64_HWCAP_PACG, 0) \ + FEATURE(AARCH64_DCPODP, dcpodp, "dcpodp", 0, AARCH64_HWCAP2_DCPODP) \ + FEATURE(AARCH64_SVE2, sve2, "sve2", 0, AARCH64_HWCAP2_SVE2) \ + FEATURE(AARCH64_SVEAES, sveaes, "sveaes", 0, AARCH64_HWCAP2_SVEAES) \ + FEATURE(AARCH64_SVEPMULL, svepmull, "svepmull", 0, AARCH64_HWCAP2_SVEPMULL) \ + FEATURE(AARCH64_SVEBITPERM, svebitperm, "svebitperm", 0, \ + AARCH64_HWCAP2_SVEBITPERM) \ + FEATURE(AARCH64_SVESHA3, svesha3, "svesha3", 0, AARCH64_HWCAP2_SVESHA3) \ + FEATURE(AARCH64_SVESM4, svesm4, "svesm4", 0, AARCH64_HWCAP2_SVESM4) \ + FEATURE(AARCH64_FLAGM2, flagm2, "flagm2", 0, AARCH64_HWCAP2_FLAGM2) \ + FEATURE(AARCH64_FRINT, frint, "frint", 0, AARCH64_HWCAP2_FRINT) \ + FEATURE(AARCH64_SVEI8MM, svei8mm, "svei8mm", 0, AARCH64_HWCAP2_SVEI8MM) \ + FEATURE(AARCH64_SVEF32MM, svef32mm, "svef32mm", 0, AARCH64_HWCAP2_SVEF32MM) \ + FEATURE(AARCH64_SVEF64MM, svef64mm, "svef64mm", 0, AARCH64_HWCAP2_SVEF64MM) \ + FEATURE(AARCH64_SVEBF16, svebf16, "svebf16", 0, AARCH64_HWCAP2_SVEBF16) \ + FEATURE(AARCH64_I8MM, i8mm, "i8mm", 0, AARCH64_HWCAP2_I8MM) \ + FEATURE(AARCH64_BF16, bf16, "bf16", 0, 
AARCH64_HWCAP2_BF16) \ + FEATURE(AARCH64_DGH, dgh, "dgh", 0, AARCH64_HWCAP2_DGH) \ + FEATURE(AARCH64_RNG, rng, "rng", 0, AARCH64_HWCAP2_RNG) \ + FEATURE(AARCH64_BTI, bti, "bti", 0, AARCH64_HWCAP2_BTI) +#define DEFINE_TABLE_FEATURE_TYPE Aarch64Features +#include "define_tables.h" + +static bool HandleAarch64Line(const LineResult result, + Aarch64Info* const info) { + StringView line = result.line; + StringView key, value; + if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) { + if (CpuFeatures_StringView_IsEquals(key, str("Features"))) { + for (size_t i = 0; i < AARCH64_LAST_; ++i) { + kSetters[i](&info->features, + CpuFeatures_StringView_HasWord(value, kCpuInfoFlags[i])); + } + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU implementer"))) { + info->implementer = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU variant"))) { + info->variant = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU part"))) { + info->part = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU revision"))) { + info->revision = CpuFeatures_StringView_ParsePositiveNumber(value); + } + } + return !result.eof; +} + +static void FillProcCpuInfoData(Aarch64Info* const info) { + const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; + StackLineReader_Initialize(&reader, fd); + for (;;) { + if (!HandleAarch64Line(StackLineReader_NextLine(&reader), info)) { + break; + } + } + CpuFeatures_CloseFile(fd); + } +} + +static const Aarch64Info kEmptyAarch64Info; + +Aarch64Info GetAarch64Info(void) { + // capabilities are fetched from both getauxval and /proc/cpuinfo so we can + // have some information if the executable is sandboxed (aka no access to + // /proc/cpuinfo). 
+ Aarch64Info info = kEmptyAarch64Info; + + FillProcCpuInfoData(&info); + const HardwareCapabilities hwcaps = CpuFeatures_GetHardwareCapabilities(); + for (size_t i = 0; i < AARCH64_LAST_; ++i) { + if (CpuFeatures_IsHwCapsSet(kHardwareCapabilities[i], hwcaps)) { + kSetters[i](&info.features, true); + } + } + + return info; +} + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +int GetAarch64FeaturesEnumValue(const Aarch64Features* features, + Aarch64FeaturesEnum value) { + if (value >= AARCH64_LAST_) return false; + return kGetters[value](features); +} + +const char* GetAarch64FeaturesEnumName(Aarch64FeaturesEnum value) { + if (value >= AARCH64_LAST_) return "unknown feature"; + return kCpuInfoFlags[value]; +} diff --git a/cpu_features/src/cpuinfo_arm.c b/cpu_features/src/cpuinfo_arm.c new file mode 100644 index 0000000..0f216bf --- /dev/null +++ b/cpu_features/src/cpuinfo_arm.c @@ -0,0 +1,212 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_arm.h" + +#include +#include + +#include "internal/bit_utils.h" +#include "internal/filesystem.h" +#include "internal/hwcaps.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags and kHardwareCapabilities global tables. 
+#define DEFINE_TABLE_FEATURES \ + FEATURE(ARM_SWP, swp, "swp", ARM_HWCAP_SWP, 0) \ + FEATURE(ARM_HALF, half, "half", ARM_HWCAP_HALF, 0) \ + FEATURE(ARM_THUMB, thumb, "thumb", ARM_HWCAP_THUMB, 0) \ + FEATURE(ARM_26BIT, _26bit, "26bit", ARM_HWCAP_26BIT, 0) \ + FEATURE(ARM_FASTMULT, fastmult, "fastmult", ARM_HWCAP_FAST_MULT, 0) \ + FEATURE(ARM_FPA, fpa, "fpa", ARM_HWCAP_FPA, 0) \ + FEATURE(ARM_VFP, vfp, "vfp", ARM_HWCAP_VFP, 0) \ + FEATURE(ARM_EDSP, edsp, "edsp", ARM_HWCAP_EDSP, 0) \ + FEATURE(ARM_JAVA, java, "java", ARM_HWCAP_JAVA, 0) \ + FEATURE(ARM_IWMMXT, iwmmxt, "iwmmxt", ARM_HWCAP_IWMMXT, 0) \ + FEATURE(ARM_CRUNCH, crunch, "crunch", ARM_HWCAP_CRUNCH, 0) \ + FEATURE(ARM_THUMBEE, thumbee, "thumbee", ARM_HWCAP_THUMBEE, 0) \ + FEATURE(ARM_NEON, neon, "neon", ARM_HWCAP_NEON, 0) \ + FEATURE(ARM_VFPV3, vfpv3, "vfpv3", ARM_HWCAP_VFPV3, 0) \ + FEATURE(ARM_VFPV3D16, vfpv3d16, "vfpv3d16", ARM_HWCAP_VFPV3D16, 0) \ + FEATURE(ARM_TLS, tls, "tls", ARM_HWCAP_TLS, 0) \ + FEATURE(ARM_VFPV4, vfpv4, "vfpv4", ARM_HWCAP_VFPV4, 0) \ + FEATURE(ARM_IDIVA, idiva, "idiva", ARM_HWCAP_IDIVA, 0) \ + FEATURE(ARM_IDIVT, idivt, "idivt", ARM_HWCAP_IDIVT, 0) \ + FEATURE(ARM_VFPD32, vfpd32, "vfpd32", ARM_HWCAP_VFPD32, 0) \ + FEATURE(ARM_LPAE, lpae, "lpae", ARM_HWCAP_LPAE, 0) \ + FEATURE(ARM_EVTSTRM, evtstrm, "evtstrm", ARM_HWCAP_EVTSTRM, 0) \ + FEATURE(ARM_AES, aes, "aes", 0, ARM_HWCAP2_AES) \ + FEATURE(ARM_PMULL, pmull, "pmull", 0, ARM_HWCAP2_PMULL) \ + FEATURE(ARM_SHA1, sha1, "sha1", 0, ARM_HWCAP2_SHA1) \ + FEATURE(ARM_SHA2, sha2, "sha2", 0, ARM_HWCAP2_SHA2) \ + FEATURE(ARM_CRC32, crc32, "crc32", 0, ARM_HWCAP2_CRC32) +#define DEFINE_TABLE_FEATURE_TYPE ArmFeatures +#include "define_tables.h" + +typedef struct { + bool processor_reports_armv6; + bool hardware_reports_goldfish; +} ProcCpuInfoData; + +static int IndexOfNonDigit(StringView str) { + size_t index = 0; + while (str.size && isdigit((unsigned char)CpuFeatures_StringView_Front(str))) { + str = CpuFeatures_StringView_PopFront(str, 1); + ++index; + } +
return index; +} + +static bool HandleArmLine(const LineResult result, ArmInfo* const info, + ProcCpuInfoData* const proc_info) { + StringView line = result.line; + StringView key, value; + if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) { + if (CpuFeatures_StringView_IsEquals(key, str("Features"))) { + for (size_t i = 0; i < ARM_LAST_; ++i) { + kSetters[i](&info->features, + CpuFeatures_StringView_HasWord(value, kCpuInfoFlags[i])); + } + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU implementer"))) { + info->implementer = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU variant"))) { + info->variant = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU part"))) { + info->part = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU revision"))) { + info->revision = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU architecture"))) { + // CPU architecture is a number that may be followed by letters. e.g. + // "6TEJ", "7".
+ const StringView digits = + CpuFeatures_StringView_KeepFront(value, IndexOfNonDigit(value)); + info->architecture = CpuFeatures_StringView_ParsePositiveNumber(digits); + } else if (CpuFeatures_StringView_IsEquals(key, str("Processor")) || + CpuFeatures_StringView_IsEquals(key, str("model name"))) { + // Android reports this in a non-Linux standard "Processor" but sometimes + // also in "model name", Linux reports it only in "model name" + // see RaspberryPiZero (Linux) vs InvalidArmv7 (Android) test-cases + proc_info->processor_reports_armv6 = + CpuFeatures_StringView_IndexOf(value, str("(v6l)")) >= 0; + } else if (CpuFeatures_StringView_IsEquals(key, str("Hardware"))) { + proc_info->hardware_reports_goldfish = + CpuFeatures_StringView_IsEquals(value, str("Goldfish")); + } + } + return !result.eof; +} + +uint32_t GetArmCpuId(const ArmInfo* const info) { + return (ExtractBitRange(info->implementer, 7, 0) << 24) | + (ExtractBitRange(info->variant, 3, 0) << 20) | + (ExtractBitRange(info->part, 11, 0) << 4) | + (ExtractBitRange(info->revision, 3, 0) << 0); +} + +static void FixErrors(ArmInfo* const info, + ProcCpuInfoData* const proc_cpu_info_data) { + // Fixing Samsung kernel reporting invalid cpu architecture. + // http://code.google.com/p/android/issues/detail?id=10812 + if (proc_cpu_info_data->processor_reports_armv6 && info->architecture >= 7) { + info->architecture = 6; + } + + // Handle kernel configuration bugs that prevent the correct reporting of CPU + // features. + switch (GetArmCpuId(info)) { + case 0x4100C080: + // Special case: The emulator-specific Android 4.2 kernel fails to report + // support for the 32-bit ARM IDIV instruction. Technically, this is a + // feature of the virtual CPU implemented by the emulator. Note that it + // could also support Thumb IDIV in the future, and this will have to be + // slightly updated. 
+ if (info->architecture >= 7 && + proc_cpu_info_data->hardware_reports_goldfish) { + info->features.idiva = true; + } + break; + case 0x511004D0: + // https://crbug.com/341598. + info->features.neon = false; + break; + case 0x510006F2: + case 0x510006F3: + // The Nexus 4 (Qualcomm Krait) kernel configuration forgets to report + // IDIV support. + info->features.idiva = true; + info->features.idivt = true; + break; + } + + // Propagate cpu features. + if (info->features.vfpv4) info->features.vfpv3 = true; + if (info->features.neon) info->features.vfpv3 = true; + if (info->features.vfpv3) info->features.vfp = true; +} + +static void FillProcCpuInfoData(ArmInfo* const info, + ProcCpuInfoData* proc_cpu_info_data) { + const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; + StackLineReader_Initialize(&reader, fd); + for (;;) { + if (!HandleArmLine(StackLineReader_NextLine(&reader), info, + proc_cpu_info_data)) { + break; + } + } + CpuFeatures_CloseFile(fd); + } +} + +static const ArmInfo kEmptyArmInfo; + +static const ProcCpuInfoData kEmptyProcCpuInfoData; + +ArmInfo GetArmInfo(void) { + // capabilities are fetched from both getauxval and /proc/cpuinfo so we can + // have some information if the executable is sandboxed (aka no access to + // /proc/cpuinfo). 
+ ArmInfo info = kEmptyArmInfo; + ProcCpuInfoData proc_cpu_info_data = kEmptyProcCpuInfoData; + + FillProcCpuInfoData(&info, &proc_cpu_info_data); + const HardwareCapabilities hwcaps = CpuFeatures_GetHardwareCapabilities(); + for (size_t i = 0; i < ARM_LAST_; ++i) { + if (CpuFeatures_IsHwCapsSet(kHardwareCapabilities[i], hwcaps)) { + kSetters[i](&info.features, true); + } + } + + FixErrors(&info, &proc_cpu_info_data); + + return info; +} + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +int GetArmFeaturesEnumValue(const ArmFeatures* features, + ArmFeaturesEnum value) { + if (value >= ARM_LAST_) return false; + return kGetters[value](features); +} + +const char* GetArmFeaturesEnumName(ArmFeaturesEnum value) { + if (value >= ARM_LAST_) return "unknown feature"; + return kCpuInfoFlags[value]; +} diff --git a/cpu_features/src/cpuinfo_mips.c b/cpu_features/src/cpuinfo_mips.c new file mode 100644 index 0000000..83e959f --- /dev/null +++ b/cpu_features/src/cpuinfo_mips.c @@ -0,0 +1,92 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_mips.h" + +#include + +#include "internal/filesystem.h" +#include "internal/hwcaps.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags and kHardwareCapabilities global tables. 
+#define DEFINE_TABLE_FEATURES \ + FEATURE(MIPS_MSA, msa, "msa", MIPS_HWCAP_MSA, 0) \ + FEATURE(MIPS_EVA, eva, "eva", 0, 0) \ + FEATURE(MIPS_R6, r6, "r6", MIPS_HWCAP_R6, 0) +#define DEFINE_TABLE_FEATURE_TYPE MipsFeatures +#include "define_tables.h" + +static bool HandleMipsLine(const LineResult result, + MipsFeatures* const features) { + StringView key, value; + // See tests for an example. + if (CpuFeatures_StringView_GetAttributeKeyValue(result.line, &key, &value)) { + if (CpuFeatures_StringView_IsEquals(key, str("ASEs implemented"))) { + for (size_t i = 0; i < MIPS_LAST_; ++i) { + kSetters[i](features, + CpuFeatures_StringView_HasWord(value, kCpuInfoFlags[i])); + } + } + } + return !result.eof; +} + +static void FillProcCpuInfoData(MipsFeatures* const features) { + const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; + StackLineReader_Initialize(&reader, fd); + for (;;) { + if (!HandleMipsLine(StackLineReader_NextLine(&reader), features)) { + break; + } + } + CpuFeatures_CloseFile(fd); + } +} + +static const MipsInfo kEmptyMipsInfo; + +MipsInfo GetMipsInfo(void) { + // capabilities are fetched from both getauxval and /proc/cpuinfo so we can + // have some information if the executable is sandboxed (aka no access to + // /proc/cpuinfo). 
+ MipsInfo info = kEmptyMipsInfo; + + FillProcCpuInfoData(&info.features); + const HardwareCapabilities hwcaps = CpuFeatures_GetHardwareCapabilities(); + for (size_t i = 0; i < MIPS_LAST_; ++i) { + if (CpuFeatures_IsHwCapsSet(kHardwareCapabilities[i], hwcaps)) { + kSetters[i](&info.features, true); + } + } + return info; +} + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +int GetMipsFeaturesEnumValue(const MipsFeatures* features, + MipsFeaturesEnum value) { + if (value >= MIPS_LAST_) return false; + return kGetters[value](features); +} + +const char* GetMipsFeaturesEnumName(MipsFeaturesEnum value) { + if (value >= MIPS_LAST_) return "unknown feature"; + return kCpuInfoFlags[value]; +} diff --git a/cpu_features/src/cpuinfo_ppc.c b/cpu_features/src/cpuinfo_ppc.c new file mode 100644 index 0000000..24401f9 --- /dev/null +++ b/cpu_features/src/cpuinfo_ppc.c @@ -0,0 +1,154 @@ +// Copyright 2018 IBM. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_ppc.h" + +#include +#include +#include + +#include "internal/bit_utils.h" +#include "internal/filesystem.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags and kHardwareCapabilities global tables. 
+#define DEFINE_TABLE_FEATURES \ + FEATURE(PPC_32, ppc32, "ppc32", PPC_FEATURE_32, 0) \ + FEATURE(PPC_64, ppc64, "ppc64", PPC_FEATURE_64, 0) \ + FEATURE(PPC_601_INSTR, ppc601, "ppc601", PPC_FEATURE_601_INSTR, 0) \ + FEATURE(PPC_HAS_ALTIVEC, altivec, "altivec", PPC_FEATURE_HAS_ALTIVEC, 0) \ + FEATURE(PPC_HAS_FPU, fpu, "fpu", PPC_FEATURE_HAS_FPU, 0) \ + FEATURE(PPC_HAS_MMU, mmu, "mmu", PPC_FEATURE_HAS_MMU, 0) \ + FEATURE(PPC_HAS_4xxMAC, mac_4xx, "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0) \ + FEATURE(PPC_UNIFIED_CACHE, unifiedcache, "ucache", \ + PPC_FEATURE_UNIFIED_CACHE, 0) \ + FEATURE(PPC_HAS_SPE, spe, "spe", PPC_FEATURE_HAS_SPE, 0) \ + FEATURE(PPC_HAS_EFP_SINGLE, efpsingle, "efpsingle", \ + PPC_FEATURE_HAS_EFP_SINGLE, 0) \ + FEATURE(PPC_HAS_EFP_DOUBLE, efpdouble, "efpdouble", \ + PPC_FEATURE_HAS_EFP_DOUBLE, 0) \ + FEATURE(PPC_NO_TB, no_tb, "notb", PPC_FEATURE_NO_TB, 0) \ + FEATURE(PPC_POWER4, power4, "power4", PPC_FEATURE_POWER4, 0) \ + FEATURE(PPC_POWER5, power5, "power5", PPC_FEATURE_POWER5, 0) \ + FEATURE(PPC_POWER5_PLUS, power5plus, "power5+", PPC_FEATURE_POWER5_PLUS, 0) \ + FEATURE(PPC_CELL, cell, "cellbe", PPC_FEATURE_CELL, 0) \ + FEATURE(PPC_BOOKE, booke, "booke", PPC_FEATURE_BOOKE, 0) \ + FEATURE(PPC_SMT, smt, "smt", PPC_FEATURE_SMT, 0) \ + FEATURE(PPC_ICACHE_SNOOP, icachesnoop, "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, \ + 0) \ + FEATURE(PPC_ARCH_2_05, arch205, "arch_2_05", PPC_FEATURE_ARCH_2_05, 0) \ + FEATURE(PPC_PA6T, pa6t, "pa6t", PPC_FEATURE_PA6T, 0) \ + FEATURE(PPC_HAS_DFP, dfp, "dfp", PPC_FEATURE_HAS_DFP, 0) \ + FEATURE(PPC_POWER6_EXT, power6ext, "power6x", PPC_FEATURE_POWER6_EXT, 0) \ + FEATURE(PPC_ARCH_2_06, arch206, "arch_2_06", PPC_FEATURE_ARCH_2_06, 0) \ + FEATURE(PPC_HAS_VSX, vsx, "vsx", PPC_FEATURE_HAS_VSX, 0) \ + FEATURE(PPC_PSERIES_PERFMON_COMPAT, pseries_perfmon_compat, "archpmu", \ + PPC_FEATURE_PSERIES_PERFMON_COMPAT, 0) \ + FEATURE(PPC_TRUE_LE, truele, "true_le", PPC_FEATURE_TRUE_LE, 0) \ + FEATURE(PPC_PPC_LE, ppcle, "ppcle", PPC_FEATURE_PPC_LE, 
0) \ + FEATURE(PPC_ARCH_2_07, arch207, "arch_2_07", 0, PPC_FEATURE2_ARCH_2_07) \ + FEATURE(PPC_HTM, htm, "htm", 0, PPC_FEATURE2_HTM) \ + FEATURE(PPC_DSCR, dscr, "dscr", 0, PPC_FEATURE2_DSCR) \ + FEATURE(PPC_EBB, ebb, "ebb", 0, PPC_FEATURE2_EBB) \ + FEATURE(PPC_ISEL, isel, "isel", 0, PPC_FEATURE2_ISEL) \ + FEATURE(PPC_TAR, tar, "tar", 0, PPC_FEATURE2_TAR) \ + FEATURE(PPC_VEC_CRYPTO, vcrypto, "vcrypto", 0, PPC_FEATURE2_VEC_CRYPTO) \ + FEATURE(PPC_HTM_NOSC, htm_nosc, "htm-nosc", 0, PPC_FEATURE2_HTM_NOSC) \ + FEATURE(PPC_ARCH_3_00, arch300, "arch_3_00", 0, PPC_FEATURE2_ARCH_3_00) \ + FEATURE(PPC_HAS_IEEE128, ieee128, "ieee128", 0, PPC_FEATURE2_HAS_IEEE128) \ + FEATURE(PPC_DARN, darn, "darn", 0, PPC_FEATURE2_DARN) \ + FEATURE(PPC_SCV, scv, "scv", 0, PPC_FEATURE2_SCV) \ + FEATURE(PPC_HTM_NO_SUSPEND, htm_no_suspend, "htm-no-suspend", 0, \ + PPC_FEATURE2_HTM_NO_SUSPEND) +#define DEFINE_TABLE_FEATURE_TYPE PPCFeatures +#include "define_tables.h" + +static bool HandlePPCLine(const LineResult result, + PPCPlatformStrings* const strings) { + StringView line = result.line; + StringView key, value; + if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) { + if (CpuFeatures_StringView_HasWord(key, "platform")) { + CpuFeatures_StringView_CopyString(value, strings->platform, + sizeof(strings->platform)); + } else if (CpuFeatures_StringView_IsEquals(key, str("model"))) { + CpuFeatures_StringView_CopyString(value, strings->model, + sizeof(strings->platform)); + } else if (CpuFeatures_StringView_IsEquals(key, str("machine"))) { + CpuFeatures_StringView_CopyString(value, strings->machine, + sizeof(strings->platform)); + } else if (CpuFeatures_StringView_IsEquals(key, str("cpu"))) { + CpuFeatures_StringView_CopyString(value, strings->cpu, + sizeof(strings->platform)); + } + } + return !result.eof; +} + +static void FillProcCpuInfoData(PPCPlatformStrings* const strings) { + const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; + 
StackLineReader_Initialize(&reader, fd); + for (;;) { + if (!HandlePPCLine(StackLineReader_NextLine(&reader), strings)) { + break; + } + } + CpuFeatures_CloseFile(fd); + } +} + +static const PPCInfo kEmptyPPCInfo; + +PPCInfo GetPPCInfo(void) { + /* + * On Power feature flags aren't currently in cpuinfo so we only look at + * the auxilary vector. + */ + PPCInfo info = kEmptyPPCInfo; + const HardwareCapabilities hwcaps = CpuFeatures_GetHardwareCapabilities(); + for (size_t i = 0; i < PPC_LAST_; ++i) { + if (CpuFeatures_IsHwCapsSet(kHardwareCapabilities[i], hwcaps)) { + kSetters[i](&info.features, true); + } + } + return info; +} + +static const PPCPlatformStrings kEmptyPPCPlatformStrings; + +PPCPlatformStrings GetPPCPlatformStrings(void) { + PPCPlatformStrings strings = kEmptyPPCPlatformStrings; + + FillProcCpuInfoData(&strings); + strings.type = CpuFeatures_GetPlatformType(); + return strings; +} + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +int GetPPCFeaturesEnumValue(const PPCFeatures* features, + PPCFeaturesEnum value) { + if (value >= PPC_LAST_) return false; + return kGetters[value](features); +} + +const char* GetPPCFeaturesEnumName(PPCFeaturesEnum value) { + if (value >= PPC_LAST_) return "unknown feature"; + return kCpuInfoFlags[value]; +} diff --git a/cpu_features/src/cpuinfo_x86.c b/cpu_features/src/cpuinfo_x86.c new file mode 100644 index 0000000..378ed05 --- /dev/null +++ b/cpu_features/src/cpuinfo_x86.c @@ -0,0 +1,1622 @@ +// Copyright 2017 Google LLC +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_x86.h" + +#include +#include + +#include "internal/bit_utils.h" +#include "internal/cpuid_x86.h" + +#if !defined(CPU_FEATURES_ARCH_X86) +#error "Cannot compile cpuinfo_x86 on a non x86 platform." +#endif + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags global tables. +#define DEFINE_TABLE_FEATURES \ + FEATURE(X86_FPU, fpu, "fpu", 0, 0) \ + FEATURE(X86_TSC, tsc, "tsc", 0, 0) \ + FEATURE(X86_CX8, cx8, "cx8", 0, 0) \ + FEATURE(X86_CLFSH, clfsh, "clfsh", 0, 0) \ + FEATURE(X86_MMX, mmx, "mmx", 0, 0) \ + FEATURE(X86_AES, aes, "aes", 0, 0) \ + FEATURE(X86_ERMS, erms, "erms", 0, 0) \ + FEATURE(X86_F16C, f16c, "f16c", 0, 0) \ + FEATURE(X86_FMA4, fma4, "fma4", 0, 0) \ + FEATURE(X86_FMA3, fma3, "fma3", 0, 0) \ + FEATURE(X86_VAES, vaes, "vaes", 0, 0) \ + FEATURE(X86_VPCLMULQDQ, vpclmulqdq, "vpclmulqdq", 0, 0) \ + FEATURE(X86_BMI1, bmi1, "bmi1", 0, 0) \ + FEATURE(X86_HLE, hle, "hle", 0, 0) \ + FEATURE(X86_BMI2, bmi2, "bmi2", 0, 0) \ + FEATURE(X86_RTM, rtm, "rtm", 0, 0) \ + FEATURE(X86_RDSEED, rdseed, "rdseed", 0, 0) \ + FEATURE(X86_CLFLUSHOPT, clflushopt, "clflushopt", 0, 0) \ + FEATURE(X86_CLWB, clwb, "clwb", 0, 0) \ + FEATURE(X86_SSE, sse, "sse", 0, 0) \ + FEATURE(X86_SSE2, sse2, "sse2", 0, 0) \ + FEATURE(X86_SSE3, sse3, "sse3", 0, 0) \ + FEATURE(X86_SSSE3, ssse3, "ssse3", 0, 0) \ + FEATURE(X86_SSE4_1, sse4_1, "sse4_1", 0, 0) \ + FEATURE(X86_SSE4_2, sse4_2, "sse4_2", 0, 0) \ + FEATURE(X86_SSE4A, sse4a, "sse4a", 0, 0) \ + FEATURE(X86_AVX, avx, "avx", 0, 0) \ + FEATURE(X86_AVX2, avx2, "avx2", 
0, 0) \ + FEATURE(X86_AVX512F, avx512f, "avx512f", 0, 0) \ + FEATURE(X86_AVX512CD, avx512cd, "avx512cd", 0, 0) \ + FEATURE(X86_AVX512ER, avx512er, "avx512er", 0, 0) \ + FEATURE(X86_AVX512PF, avx512pf, "avx512pf", 0, 0) \ + FEATURE(X86_AVX512BW, avx512bw, "avx512bw", 0, 0) \ + FEATURE(X86_AVX512DQ, avx512dq, "avx512dq", 0, 0) \ + FEATURE(X86_AVX512VL, avx512vl, "avx512vl", 0, 0) \ + FEATURE(X86_AVX512IFMA, avx512ifma, "avx512ifma", 0, 0) \ + FEATURE(X86_AVX512VBMI, avx512vbmi, "avx512vbmi", 0, 0) \ + FEATURE(X86_AVX512VBMI2, avx512vbmi2, "avx512vbmi2", 0, 0) \ + FEATURE(X86_AVX512VNNI, avx512vnni, "avx512vnni", 0, 0) \ + FEATURE(X86_AVX512BITALG, avx512bitalg, "avx512bitalg", 0, 0) \ + FEATURE(X86_AVX512VPOPCNTDQ, avx512vpopcntdq, "avx512vpopcntdq", 0, 0) \ + FEATURE(X86_AVX512_4VNNIW, avx512_4vnniw, "avx512_4vnniw", 0, 0) \ + FEATURE(X86_AVX512_4VBMI2, avx512_4vbmi2, "avx512_4vbmi2", 0, 0) \ + FEATURE(X86_AVX512_SECOND_FMA, avx512_second_fma, "avx512_second_fma", 0, 0) \ + FEATURE(X86_AVX512_4FMAPS, avx512_4fmaps, "avx512_4fmaps", 0, 0) \ + FEATURE(X86_AVX512_BF16, avx512_bf16, "avx512_bf16", 0, 0) \ + FEATURE(X86_AVX512_VP2INTERSECT, avx512_vp2intersect, "avx512_vp2intersect", \ + 0, 0) \ + FEATURE(X86_AMX_BF16, amx_bf16, "amx_bf16", 0, 0) \ + FEATURE(X86_AMX_TILE, amx_tile, "amx_tile", 0, 0) \ + FEATURE(X86_AMX_INT8, amx_int8, "amx_int8", 0, 0) \ + FEATURE(X86_PCLMULQDQ, pclmulqdq, "pclmulqdq", 0, 0) \ + FEATURE(X86_SMX, smx, "smx", 0, 0) \ + FEATURE(X86_SGX, sgx, "sgx", 0, 0) \ + FEATURE(X86_CX16, cx16, "cx16", 0, 0) \ + FEATURE(X86_SHA, sha, "sha", 0, 0) \ + FEATURE(X86_POPCNT, popcnt, "popcnt", 0, 0) \ + FEATURE(X86_MOVBE, movbe, "movbe", 0, 0) \ + FEATURE(X86_RDRND, rdrnd, "rdrnd", 0, 0) \ + FEATURE(X86_DCA, dca, "dca", 0, 0) \ + FEATURE(X86_SS, ss, "ss", 0, 0) +#define DEFINE_TABLE_FEATURE_TYPE X86Features +#define DEFINE_TABLE_DONT_GENERATE_HWCAPS +#include "define_tables.h" + +// The following includes are necessary to provide SSE detections on pre-AVX +// 
microarchitectures. +#if defined(CPU_FEATURES_OS_WINDOWS) +#include // IsProcessorFeaturePresent +#elif defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) +#include "internal/filesystem.h" // Needed to parse /proc/cpuinfo +#include "internal/stack_line_reader.h" // Needed to parse /proc/cpuinfo +#include "internal/string_view.h" // Needed to parse /proc/cpuinfo +#elif defined(CPU_FEATURES_OS_DARWIN) +#if !defined(HAVE_SYSCTLBYNAME) +#error "Darwin needs support for sysctlbyname" +#endif +#include +#else +#error "Unsupported OS" +#endif // CPU_FEATURES_OS + +//////////////////////////////////////////////////////////////////////////////// +// Definitions for CpuId and GetXCR0Eax. +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CPU_FEATURES_MOCK_CPUID_X86) +// Implementation will be provided by test/cpuinfo_x86_test.cc. +#elif defined(CPU_FEATURES_COMPILER_CLANG) || defined(CPU_FEATURES_COMPILER_GCC) + +#include + +Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) { + Leaf leaf; + __cpuid_count(leaf_id, ecx, leaf.eax, leaf.ebx, leaf.ecx, leaf.edx); + return leaf; +} + +uint32_t GetXCR0Eax(void) { + uint32_t eax, edx; + /* named form of xgetbv not supported on OSX, so must use byte form, see: + https://github.com/asmjit/asmjit/issues/78 + */ + __asm(".byte 0x0F, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); + return eax; +} + +#elif defined(CPU_FEATURES_COMPILER_MSC) + +#include +#include // For __cpuidex() + +Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) { + Leaf leaf; + int data[4]; + __cpuidex(data, leaf_id, ecx); + leaf.eax = data[0]; + leaf.ebx = data[1]; + leaf.ecx = data[2]; + leaf.edx = data[3]; + return leaf; +} + +uint32_t GetXCR0Eax(void) { return (uint32_t)_xgetbv(0); } + +#else +#error "Unsupported compiler, x86 cpuid requires either GCC, Clang or MSVC." 
+#endif + +static Leaf CpuId(uint32_t leaf_id) { return GetCpuidLeaf(leaf_id, 0); } + +static const Leaf kEmptyLeaf; + +static Leaf SafeCpuIdEx(uint32_t max_cpuid_leaf, uint32_t leaf_id, int ecx) { + if (leaf_id <= max_cpuid_leaf) { + return GetCpuidLeaf(leaf_id, ecx); + } else { + return kEmptyLeaf; + } +} + +static Leaf SafeCpuId(uint32_t max_cpuid_leaf, uint32_t leaf_id) { + return SafeCpuIdEx(max_cpuid_leaf, leaf_id, 0); +} + +#define MASK_XMM 0x2 +#define MASK_YMM 0x4 +#define MASK_MASKREG 0x20 +#define MASK_ZMM0_15 0x40 +#define MASK_ZMM16_31 0x80 +#define MASK_XTILECFG 0x20000 +#define MASK_XTILEDATA 0x40000 + +static bool HasMask(uint32_t value, uint32_t mask) { + return (value & mask) == mask; +} + +// Checks that operating system saves and restores xmm registers during context +// switches. +static bool HasXmmOsXSave(uint32_t xcr0_eax) { + return HasMask(xcr0_eax, MASK_XMM); +} + +// Checks that operating system saves and restores ymm registers during context +// switches. +static bool HasYmmOsXSave(uint32_t xcr0_eax) { + return HasMask(xcr0_eax, MASK_XMM | MASK_YMM); +} + +// Checks that operating system saves and restores zmm registers during context +// switches. +static bool HasZmmOsXSave(uint32_t xcr0_eax) { + return HasMask(xcr0_eax, MASK_XMM | MASK_YMM | MASK_MASKREG | MASK_ZMM0_15 | + MASK_ZMM16_31); +} + +// Checks that operating system saves and restores AMX/TMUL state during context +// switches. 
+static bool HasTmmOsXSave(uint32_t xcr0_eax) { + return HasMask(xcr0_eax, MASK_XMM | MASK_YMM | MASK_MASKREG | MASK_ZMM0_15 | + MASK_ZMM16_31 | MASK_XTILECFG | MASK_XTILEDATA); +} + +static bool HasSecondFMA(uint32_t model) { + // Skylake server + if (model == 0x55) { + char proc_name[49] = {0}; + FillX86BrandString(proc_name); + // detect Xeon + if (proc_name[9] == 'X') { + // detect Silver or Bronze + if (proc_name[17] == 'S' || proc_name[17] == 'B') return false; + // detect Gold 5_20 and below, except for Gold 53__ + if (proc_name[17] == 'G' && proc_name[22] == '5') + return ((proc_name[23] == '3') || + (proc_name[24] == '2' && proc_name[25] == '2')); + // detect Xeon W 210x + if (proc_name[17] == 'W' && proc_name[21] == '0') return false; + // detect Xeon D 2xxx + if (proc_name[17] == 'D' && proc_name[19] == '2' && proc_name[20] == '1') + return false; + } + return true; + } + // Cannon Lake client + if (model == 0x66) return false; + // Ice Lake client + if (model == 0x7d || model == 0x7e) return false; + // This is the right default... 
+ return true; +} + +static void SetVendor(const Leaf leaf, char* const vendor) { + *(uint32_t*)(vendor) = leaf.ebx; + *(uint32_t*)(vendor + 4) = leaf.edx; + *(uint32_t*)(vendor + 8) = leaf.ecx; + vendor[12] = '\0'; +} + +static int IsVendor(const Leaf leaf, const char* const name) { + const uint32_t ebx = *(const uint32_t*)(name); + const uint32_t edx = *(const uint32_t*)(name + 4); + const uint32_t ecx = *(const uint32_t*)(name + 8); + return leaf.ebx == ebx && leaf.ecx == ecx && leaf.edx == edx; +} + +static const CacheLevelInfo kEmptyCacheLevelInfo; + +static CacheLevelInfo GetCacheLevelInfo(const uint32_t reg) { + const int UNDEF = -1; + const int KiB = 1024; + const int MiB = 1024 * KiB; + switch (reg) { + case 0x01: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 32, + .partitioning = 0}; + case 0x02: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 2, + .partitioning = 0}; + case 0x03: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0x04: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 8, + .partitioning = 0}; + case 0x05: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 32, + .partitioning = 0}; + case 0x06: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 8 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x08: + return (CacheLevelInfo){.level = 1, + .cache_type = 
CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 16 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x09: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 32 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x0A: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * KiB, + .ways = 2, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x0B: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 4, + .partitioning = 0}; + case 0x0C: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x0D: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x0E: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 24 * KiB, + .ways = 6, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x1D: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 128 * KiB, + .ways = 2, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x21: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 256 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x22: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x23: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + 
.cache_size = 1 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x24: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x25: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x29: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x2C: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 32 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x30: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 32 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x40: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = UNDEF, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x41: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 128 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x42: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 256 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x43: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x44: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 
4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x45: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x46: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x47: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x48: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 3 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x49: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case (0x49 | (1 << 8)): + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4A: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 6 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4B: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4C: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 12 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4D: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries 
= UNDEF, + .partitioning = 0}; + case 0x4E: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 6 * MiB, + .ways = 24, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4F: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 32, + .partitioning = 0}; + case 0x50: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0x51: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0x52: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 256, + .partitioning = 0}; + case 0x55: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 7, + .partitioning = 0}; + case 0x56: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 16, + .partitioning = 0}; + case 0x57: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 16, + .partitioning = 0}; + case 0x59: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 16, + .partitioning = 0}; + case 0x5A: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 32, 
+ .partitioning = 0}; + case 0x5B: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0x5C: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0x5D: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 256, + .partitioning = 0}; + case 0x60: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x61: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 48, + .partitioning = 0}; + case 0x63: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 4, + .partitioning = 0}; + case 0x66: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x67: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x68: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 32 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x70: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 12 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 
0}; + case 0x71: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 16 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x72: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 32 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x76: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 8, + .partitioning = 0}; + case 0x78: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x79: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 128 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x7A: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 256 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x7B: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x7C: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x7D: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x7F: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 2, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x80: + 
return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x82: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 256 * KiB, + .ways = 8, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x83: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 8, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x84: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 8, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x85: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 8, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x86: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x87: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xA0: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_DTLB, + .cache_size = 4 * KiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 32, + .partitioning = 0}; + case 0xB0: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0xB1: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 8, + .partitioning = 0}; + case 0xB2: + return (CacheLevelInfo){.level = 
UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0xB3: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0xB4: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 256, + .partitioning = 0}; + case 0xB5: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0xB6: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0xBA: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0xC0: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 8, + .partitioning = 0}; + case 0xC1: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_STLB, + .cache_size = 4 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = 1024, + .partitioning = 0}; + case 0xC2: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_DTLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 16, + .partitioning = 0}; + case 0xC3: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_STLB, + .cache_size = 4 * KiB, + .ways = 6, + .line_size = UNDEF, + .tlb_entries = 1536, + .partitioning = 0}; + case 0xCA: + return (CacheLevelInfo){.level = UNDEF, + .cache_type 
= CPU_FEATURE_CACHE_STLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 512, + .partitioning = 0}; + case 0xD0: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD1: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD2: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD6: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD7: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD8: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xDC: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * 1536 * KiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xDD: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 3 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xDE: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 6 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xE2: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * 
MiB,
+                              .ways = 16,
+                              .line_size = 64,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    case 0xE3:
+      return (CacheLevelInfo){.level = 3,
+                              .cache_type = CPU_FEATURE_CACHE_DATA,
+                              .cache_size = 4 * MiB,
+                              .ways = 16,
+                              .line_size = 64,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    case 0xE4:
+      return (CacheLevelInfo){.level = 3,
+                              .cache_type = CPU_FEATURE_CACHE_DATA,
+                              .cache_size = 8 * MiB,
+                              .ways = 16,
+                              .line_size = 64,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    case 0xEA:
+      return (CacheLevelInfo){.level = 3,
+                              .cache_type = CPU_FEATURE_CACHE_DATA,
+                              .cache_size = 12 * MiB,
+                              .ways = 24,
+                              .line_size = 64,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    case 0xEB:
+      return (CacheLevelInfo){.level = 3,
+                              .cache_type = CPU_FEATURE_CACHE_DATA,
+                              .cache_size = 18 * MiB,
+                              .ways = 24,
+                              .line_size = 64,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    case 0xEC:
+      return (CacheLevelInfo){.level = 3,
+                              .cache_type = CPU_FEATURE_CACHE_DATA,
+                              .cache_size = 24 * MiB,
+                              .ways = 24,
+                              .line_size = 64,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    case 0xF0:
+      return (CacheLevelInfo){.level = UNDEF,
+                              .cache_type = CPU_FEATURE_CACHE_PREFETCH,
+                              .cache_size = 64 * KiB,
+                              .ways = UNDEF,
+                              .line_size = UNDEF,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    case 0xF1:
+      return (CacheLevelInfo){.level = UNDEF,
+                              .cache_type = CPU_FEATURE_CACHE_PREFETCH,
+                              .cache_size = 128 * KiB,
+                              .ways = UNDEF,
+                              .line_size = UNDEF,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    case 0xFF:
+      return (CacheLevelInfo){.level = UNDEF,
+                              .cache_type = CPU_FEATURE_CACHE_NULL,
+                              .cache_size = UNDEF,
+                              .ways = UNDEF,
+                              .line_size = UNDEF,
+                              .tlb_entries = UNDEF,
+                              .partitioning = 0};
+    default:
+      return kEmptyCacheLevelInfo;
+  }
+}
+
+// Splits `reg` into its four bytes, least-significant byte first.
+// ExtractBitRange is called with an inclusive (msb, lsb) pair everywhere else
+// in this file (e.g. 11, 8 for the 4-bit family field), so byte i must be
+// requested as bits 8*i+7 .. 8*i. Asking for (i + 1) * 8 would pull in a ninth
+// bit from the next byte and reference bit 32 for the last byte.
+static void GetByteArrayFromRegister(uint32_t result[4], const uint32_t reg) {
+  for (int i = 0; i < 4; ++i) {
+    result[i] = ExtractBitRange(reg, (i + 1) * 8 - 1, i * 8);
+  }
+}
+
+// Decodes the one-byte descriptors returned by CPUID leaf 2 and appends one
+// CacheLevelInfo per descriptor to `info`.
+//
+// - A register whose bit 31 is set carries no valid descriptors.
+// - The low byte of EAX is not a descriptor (the Intel SDM states it always
+//   reads 01H and must be ignored), so it is skipped.
+// - 0x00 is the null descriptor and carries no information.
+// - 0xFF means cache information must be fetched through leaf 4; the rest of
+//   that register is skipped, as before.
+// Each decoded descriptor gets its own slot; decoding stops once
+// CPU_FEATURES_MAX_CACHE_LEVEL entries have been stored so `levels` cannot be
+// overrun.
+static void ParseLeaf2(const int max_cpuid_leaf, CacheInfo* info) {
+  Leaf leaf = SafeCpuId(max_cpuid_leaf, 2);
+  uint32_t registers[] = {leaf.eax, leaf.ebx, leaf.ecx, leaf.edx};
+  for (int i = 0; i < 4; ++i) {
+    if (registers[i] & (1U << 31)) {
+      continue;  // register does not contain valid information
+    }
+    uint32_t bytes[4];
+    GetByteArrayFromRegister(bytes, registers[i]);
+    // registers[0] is EAX: its byte 0 is not a descriptor.
+    for (int j = (i == 0) ? 1 : 0; j < 4; ++j) {
+      if (bytes[j] == 0xFF)
+        break;  // leaf 4 should be used to fetch cache information
+      if (bytes[j] == 0x00) continue;  // null descriptor
+      if (info->size >= CPU_FEATURES_MAX_CACHE_LEVEL) return;  // table full
+      info->levels[info->size] = GetCacheLevelInfo(bytes[j]);
+      info->size++;
+    }
+  }
+}
+
+// Fills `info` from CPUID leaf 4, querying one sub-leaf per cache_id.
+// Resets info->size first, so anything stored earlier (e.g. by ParseLeaf2) is
+// no longer counted. A sub-leaf whose type is CPU_FEATURE_CACHE_NULL stores an
+// empty entry and is not counted in info->size.
+static void ParseLeaf4(const int max_cpuid_leaf, CacheInfo* info) {
+  info->size = 0;
+  for (int cache_id = 0; cache_id < CPU_FEATURES_MAX_CACHE_LEVEL; cache_id++) {
+    const Leaf leaf = SafeCpuIdEx(max_cpuid_leaf, 4, cache_id);
+    CacheType cache_type = ExtractBitRange(leaf.eax, 4, 0);
+    if (cache_type == CPU_FEATURE_CACHE_NULL) {
+      info->levels[cache_id] = kEmptyCacheLevelInfo;
+      continue;
+    }
+    // Every geometry field is read as (raw value + 1).
+    int level = ExtractBitRange(leaf.eax, 7, 5);
+    int line_size = ExtractBitRange(leaf.ebx, 11, 0) + 1;
+    int partitioning = ExtractBitRange(leaf.ebx, 21, 12) + 1;
+    int ways = ExtractBitRange(leaf.ebx, 31, 22) + 1;
+    // NOTE(review): ECX + 1 is presumably the number of sets, stored in the
+    // tlb_entries field - confirm against the CPUID leaf 4 layout.
+    int tlb_entries = leaf.ecx + 1;
+    int cache_size = (ways * partitioning * line_size * (tlb_entries));
+    info->levels[cache_id] = (CacheLevelInfo){.level = level,
+                                              .cache_type = cache_type,
+                                              .cache_size = cache_size,
+                                              .ways = ways,
+                                              .line_size = line_size,
+                                              .tlb_entries = tlb_entries,
+                                              .partitioning = partitioning};
+    info->size++;
+  }
+}
+
+// Internal structure to hold the OS support for vector operations.
+// Avoid to recompute them since each call to cpuid is ~100 cycles.
+typedef struct {
+  bool have_sse_via_os;     // SSE support must be asked from the OS.
+  bool have_sse_via_cpuid;  // SSE feature bits from cpuid can be trusted.
+  bool have_avx;
+  bool have_avx512;
+  bool have_amx;
+} OsSupport;
+
+// File-scope static, hence zero-initialized: every flag starts out false.
+static const OsSupport kEmptyOsSupport;
+
+// Works out which vector register sets the OS has enabled.
+// When cpuid leaf 1 reports both XSAVE (ECX bit 26) and OSXSAVE (ECX bit 27),
+// the flags are derived from the low half of XCR0; otherwise only
+// have_sse_via_os is set and SSE detection is deferred to DetectSseViaOs.
+static OsSupport CheckOsSupport(const uint32_t max_cpuid_leaf) {
+  const Leaf leaf_1 = SafeCpuId(max_cpuid_leaf, 1);
+  const bool have_xsave = IsBitSet(leaf_1.ecx, 26);
+  const bool have_osxsave = IsBitSet(leaf_1.ecx, 27);
+  const bool have_xcr0 = have_xsave && have_osxsave;
+
+  OsSupport os_support = kEmptyOsSupport;
+
+  if (have_xcr0) {
+    // AVX capable cpu will expose XCR0.
+    const uint32_t xcr0_eax = GetXCR0Eax();
+    os_support.have_sse_via_cpuid = HasXmmOsXSave(xcr0_eax);
+    os_support.have_avx = HasYmmOsXSave(xcr0_eax);
+    os_support.have_avx512 = HasZmmOsXSave(xcr0_eax);
+    os_support.have_amx = HasTmmOsXSave(xcr0_eax);
+  } else {
+    // Atom based or older cpus need to ask the OS for sse support.
+    os_support.have_sse_via_os = true;
+  }
+
+  return os_support;
+}
+
+#if defined(CPU_FEATURES_OS_WINDOWS)
+#if defined(CPU_FEATURES_MOCK_CPUID_X86)
+// Provided by the test harness when cpuid is mocked.
+extern bool GetWindowsIsProcessorFeaturePresent(DWORD);
+#else  // CPU_FEATURES_MOCK_CPUID_X86
+// Thin wrapper so tests can substitute their own implementation.
+static bool GetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) {
+  return IsProcessorFeaturePresent(ProcessorFeature);
+}
+#endif
+#endif  // CPU_FEATURES_OS_WINDOWS
+
+#if defined(CPU_FEATURES_OS_DARWIN)
+#if defined(CPU_FEATURES_MOCK_CPUID_X86)
+// Provided by the test harness when cpuid is mocked.
+extern bool GetDarwinSysCtlByName(const char*);
+#else  // CPU_FEATURES_MOCK_CPUID_X86
+// Reads the integer sysctl `name`; returns false when the lookup fails,
+// otherwise whether the value is non-zero.
+static bool GetDarwinSysCtlByName(const char* name) {
+  int enabled;
+  size_t enabled_len = sizeof(enabled);
+  const int failure = sysctlbyname(name, &enabled, &enabled_len, NULL, 0);
+  return failure ? false : enabled;
+}
+#endif
+#endif  // CPU_FEATURES_OS_DARWIN
+
+// Fills the SSE family flags of `features` by asking the operating system.
+// Used when XCR0 is not available (see CheckOsSupport). Fields that a given
+// platform branch does not assign are left untouched.
+static void DetectSseViaOs(X86Features* features) {
+#if defined(CPU_FEATURES_OS_WINDOWS)
+  // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent
+  features->sse =
+      GetWindowsIsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE);
+  features->sse2 =
+      GetWindowsIsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE);
+  features->sse3 =
+      GetWindowsIsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE);
+#elif defined(CPU_FEATURES_OS_DARWIN)
+  // Handling Darwin platform through sysctlbyname.
+  features->sse = GetDarwinSysCtlByName("hw.optional.sse");
+  features->sse2 = GetDarwinSysCtlByName("hw.optional.sse2");
+  features->sse3 = GetDarwinSysCtlByName("hw.optional.sse3");
+  features->ssse3 = GetDarwinSysCtlByName("hw.optional.supplementalsse3");
+  features->sse4_1 = GetDarwinSysCtlByName("hw.optional.sse4_1");
+  features->sse4_2 = GetDarwinSysCtlByName("hw.optional.sse4_2");
+#elif defined(CPU_FEATURES_OS_LINUX_OR_ANDROID)
+  // Handling Linux platform through /proc/cpuinfo.
+  const int fd = CpuFeatures_OpenFile("/proc/cpuinfo");
+  if (fd >= 0) {
+    StackLineReader reader;
+    StackLineReader_Initialize(&reader, fd);
+    for (;;) {
+      const LineResult result = StackLineReader_NextLine(&reader);
+      const StringView line = result.line;
+      StringView key, value;
+      if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) {
+        if (CpuFeatures_StringView_IsEquals(key, str("flags"))) {
+          features->sse = CpuFeatures_StringView_HasWord(value, "sse");
+          features->sse2 = CpuFeatures_StringView_HasWord(value, "sse2");
+          // The Linux kernel reports SSE3 under the name "pni" (Prescott New
+          // Instructions); "sse3" is still accepted for compatibility.
+          features->sse3 = CpuFeatures_StringView_HasWord(value, "sse3") ||
+                           CpuFeatures_StringView_HasWord(value, "pni");
+          features->ssse3 = CpuFeatures_StringView_HasWord(value, "ssse3");
+          features->sse4_1 = CpuFeatures_StringView_HasWord(value, "sse4_1");
+          features->sse4_2 = CpuFeatures_StringView_HasWord(value, "sse4_2");
+          break;  // only the first "flags" line is inspected
+        }
+      }
+      if (result.eof) break;
+    }
+    CpuFeatures_CloseFile(fd);
+  }
+#else
+#error "Unsupported fallback detection of SSE OS support."
+#endif
+}
+
+// Reference https://en.wikipedia.org/wiki/CPUID.
+// Fills family/model/stepping and the feature flags of `info` from cpuid
+// leaves 1, 7 and 7 sub-leaf 1.
+//   max_cpuid_leaf: highest supported leaf, passed on to SafeCpuId/SafeCpuIdEx.
+//   os_support:     result of CheckOsSupport; the SSE, AVX, AVX-512 and AMX
+//                   groups are only filled in when the matching flag is set.
+//   info:           output; fields not assigned here keep their prior value.
+static void ParseCpuId(const uint32_t max_cpuid_leaf,
+                       const OsSupport os_support, X86Info* info) {
+  const Leaf leaf_1 = SafeCpuId(max_cpuid_leaf, 1);
+  const Leaf leaf_7 = SafeCpuId(max_cpuid_leaf, 7);
+  const Leaf leaf_7_1 = SafeCpuIdEx(max_cpuid_leaf, 7, 1);
+
+  const uint32_t family = ExtractBitRange(leaf_1.eax, 11, 8);
+  const uint32_t extended_family = ExtractBitRange(leaf_1.eax, 27, 20);
+  const uint32_t model = ExtractBitRange(leaf_1.eax, 7, 4);
+  const uint32_t extended_model = ExtractBitRange(leaf_1.eax, 19, 16);
+
+  X86Features* const features = &info->features;
+
+  // The extended fields are combined unconditionally with the base fields.
+  info->family = extended_family + family;
+  info->model = (extended_model << 4) + model;
+  info->stepping = ExtractBitRange(leaf_1.eax, 3, 0);
+
+  // Flags that do not depend on OS-enabled register state.
+  features->fpu = IsBitSet(leaf_1.edx, 0);
+  features->tsc = IsBitSet(leaf_1.edx, 4);
+  features->cx8 = IsBitSet(leaf_1.edx, 8);
+  features->clfsh = IsBitSet(leaf_1.edx, 19);
+  features->mmx = IsBitSet(leaf_1.edx, 23);
+  features->ss = IsBitSet(leaf_1.edx, 27);
+  features->pclmulqdq = IsBitSet(leaf_1.ecx, 1);
+  features->smx = IsBitSet(leaf_1.ecx, 6);
+  features->cx16 = IsBitSet(leaf_1.ecx, 13);
+  features->dca = IsBitSet(leaf_1.ecx, 18);
+  features->movbe = IsBitSet(leaf_1.ecx, 22);
+  features->popcnt = IsBitSet(leaf_1.ecx, 23);
+  features->aes = IsBitSet(leaf_1.ecx, 25);
+  features->f16c = IsBitSet(leaf_1.ecx, 29);
+  features->rdrnd = IsBitSet(leaf_1.ecx, 30);
+  features->sgx = IsBitSet(leaf_7.ebx, 2);
+  features->bmi1 = IsBitSet(leaf_7.ebx, 3);
+  features->hle = IsBitSet(leaf_7.ebx, 4);
+  features->bmi2 = IsBitSet(leaf_7.ebx, 8);
+  features->erms = IsBitSet(leaf_7.ebx, 9);
+  features->rtm = IsBitSet(leaf_7.ebx, 11);
+  features->rdseed = IsBitSet(leaf_7.ebx, 18);
+  features->clflushopt = IsBitSet(leaf_7.ebx, 23);
+  features->clwb = IsBitSet(leaf_7.ebx, 24);
+  features->sha = IsBitSet(leaf_7.ebx, 29);
+  features->vaes = IsBitSet(leaf_7.ecx, 9);
+  features->vpclmulqdq = IsBitSet(leaf_7.ecx, 10);
+
+  // SSE family: either ask the OS, or trust cpuid when XCR0 says XMM state is
+  // enabled. If neither flag is set the SSE fields are left untouched.
+  if (os_support.have_sse_via_os) {
+    DetectSseViaOs(features);
+  } else if (os_support.have_sse_via_cpuid) {
+    features->sse = IsBitSet(leaf_1.edx, 25);
+    features->sse2 = IsBitSet(leaf_1.edx, 26);
+    features->sse3 = IsBitSet(leaf_1.ecx, 0);
+    features->ssse3 = IsBitSet(leaf_1.ecx, 9);
+    features->sse4_1 = IsBitSet(leaf_1.ecx, 19);
+    features->sse4_2 = IsBitSet(leaf_1.ecx, 20);
+  }
+
+  if (os_support.have_avx) {
+    features->fma3 = IsBitSet(leaf_1.ecx, 12);
+    features->avx = IsBitSet(leaf_1.ecx, 28);
+    features->avx2 = IsBitSet(leaf_7.ebx, 5);
+  }
+
+  if (os_support.have_avx512) {
+    features->avx512f = IsBitSet(leaf_7.ebx, 16);
+    features->avx512cd = IsBitSet(leaf_7.ebx, 28);
+    features->avx512er = IsBitSet(leaf_7.ebx, 27);
+    features->avx512pf = IsBitSet(leaf_7.ebx, 26);
+    features->avx512bw = IsBitSet(leaf_7.ebx, 30);
+    features->avx512dq = IsBitSet(leaf_7.ebx, 17);
+    features->avx512vl = IsBitSet(leaf_7.ebx, 31);
+    features->avx512ifma = IsBitSet(leaf_7.ebx, 21);
+    features->avx512vbmi = IsBitSet(leaf_7.ecx, 1);
+    features->avx512vbmi2 = IsBitSet(leaf_7.ecx, 6);
+    features->avx512vnni = IsBitSet(leaf_7.ecx, 11);
+    features->avx512bitalg = IsBitSet(leaf_7.ecx, 12);
+    features->avx512vpopcntdq = IsBitSet(leaf_7.ecx, 14);
+    features->avx512_4vnniw = IsBitSet(leaf_7.edx, 2);
+    // NOTE(review): avx512_4vbmi2 reads the same bit (EDX bit 3) as
+    // avx512_4fmaps two lines below - confirm which feature owns this bit.
+    features->avx512_4vbmi2 = IsBitSet(leaf_7.edx, 3);
+    features->avx512_second_fma = HasSecondFMA(info->model);
+    features->avx512_4fmaps = IsBitSet(leaf_7.edx, 3);
+    features->avx512_bf16 = IsBitSet(leaf_7_1.eax, 5);
+    features->avx512_vp2intersect = IsBitSet(leaf_7.edx, 8);
+  }
+
+  if (os_support.have_amx) {
+    features->amx_bf16 = IsBitSet(leaf_7.edx, 22);
+    features->amx_tile = IsBitSet(leaf_7.edx, 24);
+    features->amx_int8 = IsBitSet(leaf_7.edx, 25);
+  }
+}
+
+// Reference
+// https://en.wikipedia.org/wiki/CPUID#EAX=80000000h:_Get_Highest_Extended_Function_Implemented.
+// Fills the AMD-only flags (sse4a, fma4) from extended leaf 0x80000001.
+// Each flag is only read when the OS enables the matching register state.
+static void ParseExtraAMDCpuId(X86Info* info, OsSupport os_support) {
+  const Leaf leaf_80000000 = CpuId(0x80000000);
+  const uint32_t max_extended_cpuid_leaf = leaf_80000000.eax;
+  const Leaf leaf_80000001 = SafeCpuId(max_extended_cpuid_leaf, 0x80000001);
+
+  X86Features* const features = &info->features;
+
+  if (os_support.have_sse_via_cpuid) {
+    features->sse4a = IsBitSet(leaf_80000001.ecx, 6);
+  }
+
+  if (os_support.have_avx) {
+    features->fma4 = IsBitSet(leaf_80000001.ecx, 16);
+  }
+}
+
+// File-scope statics, hence zero-initialized templates.
+static const X86Info kEmptyX86Info;
+static const CacheInfo kEmptyCacheInfo;
+
+// Returns the vendor string plus, for "GenuineIntel" and "AuthenticAMD" only,
+// family/model/stepping and feature flags. For any other vendor everything but
+// the vendor string stays zero.
+X86Info GetX86Info(void) {
+  X86Info info = kEmptyX86Info;
+  const Leaf leaf_0 = CpuId(0);
+  const bool is_intel = IsVendor(leaf_0, "GenuineIntel");
+  const bool is_amd = IsVendor(leaf_0, "AuthenticAMD");
+  SetVendor(leaf_0, info.vendor);
+  if (is_intel || is_amd) {
+    const uint32_t max_cpuid_leaf = leaf_0.eax;
+    const OsSupport os_support = CheckOsSupport(max_cpuid_leaf);
+    ParseCpuId(max_cpuid_leaf, os_support, &info);
+    if (is_amd) {
+      ParseExtraAMDCpuId(&info, os_support);
+    }
+  }
+  return info;
+}
+
+// Returns cache information for "GenuineIntel" processors; for any other
+// vendor the result is the zero-initialized kEmptyCacheInfo.
+CacheInfo GetX86CacheInfo(void) {
+  CacheInfo info = kEmptyCacheInfo;
+  const Leaf leaf_0 = CpuId(0);
+  const uint32_t max_cpuid_leaf = leaf_0.eax;
+  if (IsVendor(leaf_0, "GenuineIntel")) {
+    ParseLeaf2(max_cpuid_leaf, &info);
+    ParseLeaf4(max_cpuid_leaf, &info);
+  }
+  return info;
+}
+
+// Packs the low 8 bits of family and model into one switch key.
+#define CPUID(FAMILY, MODEL) ((((FAMILY)&0xFF) << 8) | ((MODEL)&0xFF))
+
+// Maps vendor + family/model (and stepping for two Intel models) to a
+// microarchitecture; returns X86_UNKNOWN for anything not listed.
+X86Microarchitecture GetX86Microarchitecture(const X86Info* info) {
+  if (memcmp(info->vendor, "GenuineIntel", sizeof(info->vendor)) == 0) {
+    switch (CPUID(info->family, info->model)) {
+      case CPUID(0x06, 0x35):
+      case CPUID(0x06, 0x36):
+        // https://en.wikipedia.org/wiki/Bonnell_(microarchitecture)
+        return INTEL_ATOM_BNL;
+      case CPUID(0x06, 0x37):
+      case CPUID(0x06, 0x4C):
+        // https://en.wikipedia.org/wiki/Silvermont
+        return INTEL_ATOM_SMT;
+      case CPUID(0x06, 0x5C):
+        // https://en.wikipedia.org/wiki/Goldmont
+        return INTEL_ATOM_GMT;
+      case CPUID(0x06, 0x0F):
+      case CPUID(0x06, 0x16):
+        // https://en.wikipedia.org/wiki/Intel_Core_(microarchitecture)
+        return INTEL_CORE;
+      case CPUID(0x06, 0x17):
+      case CPUID(0x06, 0x1D):
+        // https://en.wikipedia.org/wiki/Penryn_(microarchitecture)
+        return INTEL_PNR;
+      case CPUID(0x06, 0x1A):
+      case CPUID(0x06, 0x1E):
+      case CPUID(0x06, 0x1F):
+      case CPUID(0x06, 0x2E):
+        // https://en.wikipedia.org/wiki/Nehalem_(microarchitecture)
+        return INTEL_NHM;
+      case CPUID(0x06, 0x25):
+      case CPUID(0x06, 0x2C):
+      case CPUID(0x06, 0x2F):
+        // https://en.wikipedia.org/wiki/Westmere_(microarchitecture)
+        return INTEL_WSM;
+      case CPUID(0x06, 0x2A):
+      case CPUID(0x06, 0x2D):
+        // https://en.wikipedia.org/wiki/Sandy_Bridge#Models_and_steppings
+        return INTEL_SNB;
+      case CPUID(0x06, 0x3A):
+      case CPUID(0x06, 0x3E):
+        // https://en.wikipedia.org/wiki/Ivy_Bridge_(microarchitecture)#Models_and_steppings
+        return INTEL_IVB;
+      case CPUID(0x06, 0x3C):
+      case CPUID(0x06, 0x3F):
+      case CPUID(0x06, 0x45):
+      case CPUID(0x06, 0x46):
+        // https://en.wikipedia.org/wiki/Haswell_(microarchitecture)
+        return INTEL_HSW;
+      case CPUID(0x06, 0x3D):
+      case CPUID(0x06, 0x47):
+      case CPUID(0x06, 0x4F):
+      case CPUID(0x06, 0x56):
+        // https://en.wikipedia.org/wiki/Broadwell_(microarchitecture)
+        return INTEL_BDW;
+      case CPUID(0x06, 0x4E):
+      case CPUID(0x06, 0x55):
+      case CPUID(0x06, 0x5E):
+        // https://en.wikipedia.org/wiki/Skylake_(microarchitecture)
+        return INTEL_SKL;
+      case CPUID(0x06, 0x66):
+        // https://en.wikipedia.org/wiki/Cannon_Lake_(microarchitecture)
+        return INTEL_CNL;
+      case CPUID(0x06, 0x7D):  // client
+      case CPUID(0x06, 0x7E):  // client
+      case CPUID(0x06, 0x9D):  // NNP-I
+      case CPUID(0x06, 0x6A):  // server
+      case CPUID(0x06, 0x6C):  // server
+        // https://en.wikipedia.org/wiki/Ice_Lake_(microprocessor)
+        return INTEL_ICL;
+      case CPUID(0x06, 0x8C):
+      case CPUID(0x06, 0x8D):
+        // https://en.wikipedia.org/wiki/Tiger_Lake_(microarchitecture)
+        return INTEL_TGL;
+      case CPUID(0x06, 0x8F):
+        // https://en.wikipedia.org/wiki/Sapphire_Rapids
+        return INTEL_SPR;
+      case CPUID(0x06, 0x8E):
+        // Model 0x8E is shared by several generations; stepping tells them
+        // apart.
+        switch (info->stepping) {
+          case 9:
+            return INTEL_KBL;  // https://en.wikipedia.org/wiki/Kaby_Lake
+          case 10:
+            return INTEL_CFL;  // https://en.wikipedia.org/wiki/Coffee_Lake
+          case 11:
+            return INTEL_WHL;  // https://en.wikipedia.org/wiki/Whiskey_Lake_(microarchitecture)
+          default:
+            return X86_UNKNOWN;
+        }
+      case CPUID(0x06, 0x9E):
+        if (info->stepping > 9) {
+          // https://en.wikipedia.org/wiki/Coffee_Lake
+          return INTEL_CFL;
+        } else {
+          // https://en.wikipedia.org/wiki/Kaby_Lake
+          return INTEL_KBL;
+        }
+      default:
+        return X86_UNKNOWN;
+    }
+  }
+  if (memcmp(info->vendor, "AuthenticAMD", sizeof(info->vendor)) == 0) {
+    // AMD parts are classified by family alone.
+    switch (info->family) {
+      // https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
+      case 0x0F:
+        return AMD_HAMMER;
+      case 0x10:
+        return AMD_K10;
+      case 0x14:
+        return AMD_BOBCAT;
+      case 0x15:
+        return AMD_BULLDOZER;
+      case 0x16:
+        return AMD_JAGUAR;
+      case 0x17:
+        return AMD_ZEN;
+      default:
+        return X86_UNKNOWN;
+    }
+  }
+  return X86_UNKNOWN;
+}
+
+// Copies the raw registers of cpuid leaf `leaf_id` (sizeof(Leaf) bytes) into
+// `buffer`.
+static void SetString(const uint32_t max_cpuid_ext_leaf, const uint32_t leaf_id,
+                      char* buffer) {
+  const Leaf leaf = SafeCpuId(max_cpuid_ext_leaf, leaf_id);
+  // We allow calling memcpy from SetString which is only called when requesting
+  // X86BrandString.
+  memcpy(buffer, &leaf, sizeof(Leaf));
+}
+
+// Writes the processor brand string, read from extended leaves
+// 0x80000002..0x80000004 at offsets 0, 16 and 32, and NUL-terminates it at
+// index 48.
+void FillX86BrandString(char brand_string[49]) {
+  const Leaf leaf_ext_0 = CpuId(0x80000000);
+  const uint32_t max_cpuid_leaf_ext = leaf_ext_0.eax;
+  SetString(max_cpuid_leaf_ext, 0x80000002, brand_string);
+  SetString(max_cpuid_leaf_ext, 0x80000003, brand_string + 16);
+  SetString(max_cpuid_leaf_ext, 0x80000004, brand_string + 32);
+  brand_string[48] = '\0';
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Introspection functions
+
+// Returns the value of feature `value` in `features`, or false (0) when
+// `value` is not below X86_LAST_.
+int GetX86FeaturesEnumValue(const X86Features* features,
+                            X86FeaturesEnum value) {
+  if (value >= X86_LAST_) return false;
+  return kGetters[value](features);
+}
+
+// Returns the kCpuInfoFlags entry for `value`, or "unknown_feature" when
+// `value` is not below X86_LAST_.
+const char* GetX86FeaturesEnumName(X86FeaturesEnum value) {
+  if (value >= X86_LAST_) return "unknown_feature";
+  return kCpuInfoFlags[value];
+}
+
+// Returns the enumerator name of `uarch` as a string, or
+// "unknown microarchitecture" for a value not handled by the switch.
+const char* GetX86MicroarchitectureName(X86Microarchitecture uarch) {
+  switch (uarch) {
+    case X86_UNKNOWN:
+      return "X86_UNKNOWN";
+    case INTEL_CORE:
+      return "INTEL_CORE";
+    case INTEL_PNR:
+      return "INTEL_PNR";
+    case INTEL_NHM:
+      return "INTEL_NHM";
+    case INTEL_ATOM_BNL:
+      return "INTEL_ATOM_BNL";
+    case INTEL_WSM:
+      return "INTEL_WSM";
+    case INTEL_SNB:
+      return "INTEL_SNB";
+    case INTEL_IVB:
+      return "INTEL_IVB";
+    case INTEL_ATOM_SMT:
+      return "INTEL_ATOM_SMT";
+    case INTEL_HSW:
+      return "INTEL_HSW";
+    case INTEL_BDW:
+      return "INTEL_BDW";
+    case INTEL_SKL:
+      return "INTEL_SKL";
+    case INTEL_ATOM_GMT:
+      return "INTEL_ATOM_GMT";
+    case INTEL_KBL:
+      return "INTEL_KBL";
+    case INTEL_CFL:
+      return "INTEL_CFL";
+    case INTEL_WHL:
+      return "INTEL_WHL";
+    case INTEL_CNL:
+      return "INTEL_CNL";
+    case INTEL_ICL:
+      return "INTEL_ICL";
+    case INTEL_TGL:
+      return "INTEL_TGL";
+    case INTEL_SPR:
+      return "INTEL_SPR";
+    case AMD_HAMMER:
+      return "AMD_HAMMER";
+    case AMD_K10:
+      return "AMD_K10";
+    case AMD_BOBCAT:
+      return "AMD_BOBCAT";
+    case AMD_BULLDOZER:
+      return "AMD_BULLDOZER";
+    case AMD_JAGUAR:
+      return "AMD_JAGUAR";
+    case AMD_ZEN:
+      return "AMD_ZEN";
+  }
+  return "unknown microarchitecture";
+}
diff --git a/cpu_features/src/define_tables.h b/cpu_features/src/define_tables.h
new file mode 100644
index 0000000..dc1485c
--- /dev/null
+++ b/cpu_features/src/define_tables.h
@@ -0,0 +1,67 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The following preprocessor constants must be defined before including this
+// file:
+//  - DEFINE_TABLE_FEATURE_TYPE, the underlying type (e.g. X86Features)
+//  - DEFINE_TABLE_FEATURES, the list of FEATURE macros to be inserted.
+
+// This file is to be included once per `cpuinfo_XXX.c` in order to construct
+// feature getters and setters functions as well as several enum indexed tables
+// from the db file.
+// - `kGetters` a table of getters function pointers from feature enum to
+// retrieve a feature,
+// - `kSetters` a table of setters function pointers from feature enum to set a
+// feature,
+// - `kCpuInfoFlags` a table of strings from feature enum to /proc/cpuinfo
+// flags,
+// - `kHardwareCapabilities` a table of HardwareCapabilities structs indexed by
+// their feature enum.
+
+#ifndef SRC_DEFINE_TABLES_H_
+#define SRC_DEFINE_TABLES_H_
+
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2) [ENUM] = CPUINFO_FLAG,
+static const char* kCpuInfoFlags[] = {DEFINE_TABLE_FEATURES};
+#undef FEATURE
+
+#ifndef DEFINE_TABLE_DONT_GENERATE_HWCAPS
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2) \
+  [ENUM] = (HardwareCapabilities){HWCAP, HWCAP2},
+static const HardwareCapabilities kHardwareCapabilities[] = {
+    DEFINE_TABLE_FEATURES};
+#undef FEATURE
+#endif  // DEFINE_TABLE_DONT_GENERATE_HWCAPS
+
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2)                       \
+  static void set_##ENUM(DEFINE_TABLE_FEATURE_TYPE* features, bool value) {   \
+    features->NAME = value;                                                   \
+  }                                                                           \
+  static int get_##ENUM(const DEFINE_TABLE_FEATURE_TYPE* features) {          \
+    return features->NAME;                                                    \
+  }
+DEFINE_TABLE_FEATURES
+#undef FEATURE
+
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2) [ENUM] = set_##ENUM,
+static void (*const kSetters[])(DEFINE_TABLE_FEATURE_TYPE*,
+                                bool) = {DEFINE_TABLE_FEATURES};
+#undef FEATURE
+
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2) [ENUM] = get_##ENUM,
+static int (*const kGetters[])(const DEFINE_TABLE_FEATURE_TYPE*) = {
+    DEFINE_TABLE_FEATURES};
+#undef FEATURE
+
+#endif  // SRC_DEFINE_TABLES_H_
diff --git a/cpu_features/src/filesystem.c b/cpu_features/src/filesystem.c
new file mode 100644
index 0000000..46c9906
--- /dev/null
+++ b/cpu_features/src/filesystem.c
@@ -0,0 +1,62 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal/filesystem.h"
+
+// errno/EINTR, open()/O_RDONLY, and the stat/type definitions used below.
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#if defined(CPU_FEATURES_MOCK_FILESYSTEM)
+// Implementation will be provided by test/filesystem_for_testing.cc.
+#elif defined(_MSC_VER)
+#include <io.h>
+// Opens `filename` read-only; returns the descriptor, or -1 when the open
+// fails (fd keeps its initial value).
+int CpuFeatures_OpenFile(const char* filename) {
+  int fd = -1;
+  _sopen_s(&fd, filename, _O_RDONLY, _SH_DENYWR, _S_IREAD);
+  return fd;
+}
+
+void CpuFeatures_CloseFile(int file_descriptor) { _close(file_descriptor); }
+
+// Reads up to `buffer_size` bytes into `buffer`; returns what _read returns.
+int CpuFeatures_ReadFile(int file_descriptor, void* buffer,
+                         size_t buffer_size) {
+  return _read(file_descriptor, buffer, (unsigned int)buffer_size);
+}
+
+#else
+#include <unistd.h>
+
+// Opens `filename` read-only, retrying while the call is interrupted by a
+// signal (EINTR). Returns the descriptor, or -1 on failure.
+int CpuFeatures_OpenFile(const char* filename) {
+  int result;
+  do {
+    result = open(filename, O_RDONLY);
+  } while (result == -1L && errno == EINTR);
+  return result;
+}
+
+void CpuFeatures_CloseFile(int file_descriptor) { close(file_descriptor); }
+
+// Reads up to `buffer_size` bytes into `buffer`, retrying while the call is
+// interrupted by a signal (EINTR). Returns the byte count, or -1 on failure.
+int CpuFeatures_ReadFile(int file_descriptor, void* buffer,
+                         size_t buffer_size) {
+  int result;
+  do {
+    result = read(file_descriptor, buffer, buffer_size);
+  } while (result == -1L && errno == EINTR);
+  return result;
+}
+
+#endif
diff --git a/cpu_features/src/hwcaps.c b/cpu_features/src/hwcaps.c
new file mode 100644
index 0000000..dd17e3b
--- /dev/null
+++ b/cpu_features/src/hwcaps.c
@@ -0,0 +1,182 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal/hwcaps.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "cpu_features_macros.h"
+#include "internal/filesystem.h"
+#include "internal/string_view.h"
+
+// True when `mask` is non-zero and every bit of `mask` is set in `value`.
+static bool IsSet(const uint32_t mask, const uint32_t value) {
+  if (mask == 0) return false;
+  return (value & mask) == mask;
+}
+
+// True when either the hwcaps or the hwcaps2 mask is fully present in
+// `hwcaps`. A zero mask never matches.
+bool CpuFeatures_IsHwCapsSet(const HardwareCapabilities hwcaps_mask,
+                             const HardwareCapabilities hwcaps) {
+  return IsSet(hwcaps_mask.hwcaps, hwcaps.hwcaps) ||
+         IsSet(hwcaps_mask.hwcaps2, hwcaps.hwcaps2);
+}
+
+#ifdef CPU_FEATURES_TEST
+// In test mode, hwcaps_for_testing will define the following functions.
+HardwareCapabilities CpuFeatures_GetHardwareCapabilities(void);
+PlatformType CpuFeatures_GetPlatformType(void);
+#else
+
+// Debug facilities
+#if defined(NDEBUG)
+#define D(...)
+#else
+#include <stdio.h>
+#define D(...)           \
+  do {                   \
+    printf(__VA_ARGS__); \
+    fflush(stdout);      \
+  } while (0)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Implementation of GetElfHwcapFromGetauxval
+////////////////////////////////////////////////////////////////////////////////
+
+#define AT_HWCAP 16
+#define AT_HWCAP2 26
+#define AT_PLATFORM 15
+#define AT_BASE_PLATFORM 24
+
+#if defined(HAVE_STRONG_GETAUXVAL)
+#include <sys/auxv.h>
+static unsigned long GetElfHwcapFromGetauxval(uint32_t hwcap_type) {
+  return getauxval(hwcap_type);
+}
+#elif defined(HAVE_DLFCN_H)
+// On Android we probe the system's C library for a 'getauxval' function and
+// call it if it exists, or return 0 for failure. This function is available
+// since API level 20.
+//
+// This code does *NOT* check for '__ANDROID_API__ >= 20' to support the edge
+// case where some NDK developers use headers for a platform that is newer than
+// the one really targeted by their application. This is typically done to use
+// newer native APIs only when running on more recent Android versions, and
+// requires careful symbol management.
+//
+// Note that getauxval() can't really be re-implemented here, because its
+// implementation does not parse /proc/self/auxv. Instead it depends on values
+// that are passed by the kernel at process-init time to the C runtime
+// initialization layer.
+
+#include <dlfcn.h>
+
+typedef unsigned long getauxval_func_t(unsigned long);
+
+// Returns the full unsigned long produced by getauxval(). AT_PLATFORM and
+// AT_BASE_PLATFORM values are cast to pointers by CpuFeatures_GetPlatformType,
+// so narrowing the result to 32 bits would corrupt them on 64-bit targets.
+static unsigned long GetElfHwcapFromGetauxval(uint32_t hwcap_type) {
+  unsigned long ret = 0;
+  void *libc_handle = NULL;
+  getauxval_func_t *func = NULL;
+
+  dlerror();  // Cleaning error state before calling dlopen.
+  libc_handle = dlopen("libc.so", RTLD_NOW);
+  if (!libc_handle) {
+    D("Could not dlopen() C library: %s\n", dlerror());
+    return 0;
+  }
+  func = (getauxval_func_t *)dlsym(libc_handle, "getauxval");
+  if (!func) {
+    D("Could not find getauxval() in C library\n");
+  } else {
+    // Note: getauxval() returns 0 on failure. Doesn't touch errno.
+    ret = (*func)(hwcap_type);
+  }
+  dlclose(libc_handle);
+  return ret;
+}
+#else
+#error "This platform does not provide hardware capabilities."
+#endif
+
+// Implementation of GetHardwareCapabilities for OS that provide
+// GetElfHwcapFromGetauxval().
+
+// Fallback when getauxval is not available, retrieves hwcaps from
+// "/proc/self/auxv". Returns 0 when the file cannot be read or the requested
+// tag is absent.
+static unsigned long GetElfHwcapFromProcSelfAuxv(uint32_t hwcap_type) {
+  // Each auxv record is a (type, value) pair of native words, i.e. 2 x 32 bits
+  // on 32-bit ABIs and 2 x 64 bits on 64-bit ABIs. `unsigned long` matches
+  // both on the Linux/Android targets this code is built for; a fixed
+  // uint32_t pair misparses every record on 64-bit systems.
+  struct {
+    unsigned long tag;
+    unsigned long value;
+  } entry;
+  unsigned long result = 0;
+  const char filepath[] = "/proc/self/auxv";
+  const int fd = CpuFeatures_OpenFile(filepath);
+  if (fd < 0) {
+    D("Could not open %s\n", filepath);
+    return 0;
+  }
+  for (;;) {
+    const int ret = CpuFeatures_ReadFile(fd, (char *)&entry, sizeof entry);
+    if (ret < 0) {
+      D("Error while reading %s\n", filepath);
+      break;
+    }
+    // Detect end of list. A short read (including 0 at end of file) leaves
+    // `entry` only partially filled, so it is treated as end of data rather
+    // than compared against stale bytes.
+    if (ret != (int)sizeof entry || (entry.tag == 0 && entry.value == 0)) {
+      break;
+    }
+    if (entry.tag == hwcap_type) {
+      result = entry.value;
+      break;
+    }
+  }
+  CpuFeatures_CloseFile(fd);
+  return result;
+}
+
+// Retrieves hardware capabilities by first trying to call getauxval, if not
+// available falls back to reading "/proc/self/auxv".
+static unsigned long GetHardwareCapabilitiesFor(uint32_t type) {
+  unsigned long hwcaps = GetElfHwcapFromGetauxval(type);
+  if (!hwcaps) {
+    D("Parsing /proc/self/auxv to extract ELF hwcaps!\n");
+    hwcaps = GetElfHwcapFromProcSelfAuxv(type);
+  }
+  return hwcaps;
+}
+
+// Collects the AT_HWCAP and AT_HWCAP2 auxiliary-vector values.
+HardwareCapabilities CpuFeatures_GetHardwareCapabilities(void) {
+  HardwareCapabilities capabilities;
+  capabilities.hwcaps = GetHardwareCapabilitiesFor(AT_HWCAP);
+  capabilities.hwcaps2 = GetHardwareCapabilitiesFor(AT_HWCAP2);
+  return capabilities;
+}
+
+PlatformType kEmptyPlatformType;
+
+// Copies the AT_PLATFORM and AT_BASE_PLATFORM strings into a PlatformType.
+// The auxv values are addresses of NUL-terminated strings in this process; a
+// field is left empty when its entry is missing.
+PlatformType CpuFeatures_GetPlatformType(void) {
+  PlatformType type = kEmptyPlatformType;
+  char *platform = (char *)GetHardwareCapabilitiesFor(AT_PLATFORM);
+  char *base_platform = (char *)GetHardwareCapabilitiesFor(AT_BASE_PLATFORM);
+
+  if (platform != NULL)
+    CpuFeatures_StringView_CopyString(str(platform), type.platform,
+                                      sizeof(type.platform));
+  if (base_platform != NULL)
+    CpuFeatures_StringView_CopyString(str(base_platform), type.base_platform,
+                                      sizeof(type.base_platform));
+  return type;
+}
+
+#endif  // CPU_FEATURES_TEST
diff --git a/cpu_features/src/stack_line_reader.c b/cpu_features/src/stack_line_reader.c
new file mode 100644
index 0000000..ffc778d
--- /dev/null
+++ b/cpu_features/src/stack_line_reader.c
@@ -0,0 +1,132 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal/stack_line_reader.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include "internal/filesystem.h"
+
+// Binds `reader` to `fd` and resets it to an empty view over its own buffer.
+void StackLineReader_Initialize(StackLineReader* reader, int fd) {
+  reader->view.ptr = reader->buffer;
+  reader->view.size = 0;
+  reader->skip_mode = false;
+  reader->fd = fd;
+}
+
+// Replaces the content of buffer with bytes from the file.
+// Returns the number of bytes now in the view; 0 means end of file.
+static int LoadFullBuffer(StackLineReader* reader) {
+  int bytes_read = CpuFeatures_ReadFile(reader->fd, reader->buffer,
+                                        STACK_LINE_READER_BUFFER_SIZE);
+  assert(bytes_read >= 0);
+  // With NDEBUG the assert vanishes; a negative count stored into the view
+  // size would wrap to a huge length, so a read error is reported as EOF.
+  if (bytes_read < 0) bytes_read = 0;
+  reader->view.ptr = reader->buffer;
+  reader->view.size = bytes_read;
+  return bytes_read;
+}
+
+// Appends with bytes from the file to buffer, filling the remaining space.
+// Returns the number of bytes appended; 0 means end of file.
+static int LoadMore(StackLineReader* reader) {
+  char* const ptr = reader->buffer + reader->view.size;
+  const size_t size_to_read = STACK_LINE_READER_BUFFER_SIZE - reader->view.size;
+  int bytes_read = CpuFeatures_ReadFile(reader->fd, ptr, size_to_read);
+  assert(bytes_read >= 0);
+  assert(bytes_read <= (int)size_to_read);
+  // Same release-build guard as LoadFullBuffer: never add a negative count.
+  if (bytes_read < 0) bytes_read = 0;
+  reader->view.size += bytes_read;
+  return bytes_read;
+}
+
+// Index of the first '\n' in the pending view, or -1 when there is none.
+static int IndexOfEol(StackLineReader* reader) {
+  return CpuFeatures_StringView_IndexOfChar(reader->view, '\n');
+}
+
+// Relocate buffer's pending bytes at the beginning of the array and fills the
+// remaining space with bytes from the file.
+static int BringToFrontAndLoadMore(StackLineReader* reader) {
+  if (reader->view.size && reader->view.ptr != reader->buffer) {
+    memmove(reader->buffer, reader->view.ptr, reader->view.size);
+  }
+  reader->view.ptr = reader->buffer;
+  return LoadMore(reader);
+}
+
+// Loads chunks of buffer size from disks until it contains a newline character
+// or end of file.
+static void SkipToNextLine(StackLineReader* reader) {
+  for (;;) {
+    const int read = LoadFullBuffer(reader);
+    if (read == 0) {
+      break;
+    } else {
+      const int eol_index = IndexOfEol(reader);
+      if (eol_index >= 0) {
+        reader->view =
+            CpuFeatures_StringView_PopFront(reader->view, eol_index + 1);
+        break;
+      }
+    }
+  }
+}
+
+static LineResult CreateLineResult(bool eof, bool full_line, StringView view) {
+  LineResult result;
+  result.eof = eof;
+  result.full_line = full_line;
+  result.line = view;
+  return result;
+}
+
+// Helper methods to provide clearer semantic in StackLineReader_NextLine.
+static LineResult CreateEOFLineResult(StringView view) {
+  return CreateLineResult(true, true, view);
+}
+
+static LineResult CreateTruncatedLineResult(StringView view) {
+  return CreateLineResult(false, false, view);
+}
+
+static LineResult CreateValidLineResult(StringView view) {
+  return CreateLineResult(false, true, view);
+}
+
+// Returns the next line without its '\n'. When no newline fits in the buffer
+// the pending bytes are returned as a truncated line and the rest of that line
+// is skipped on the following call. At end of file the remaining bytes are
+// returned with `eof` set.
+LineResult StackLineReader_NextLine(StackLineReader* reader) {
+  if (reader->skip_mode) {
+    SkipToNextLine(reader);
+    reader->skip_mode = false;
+  }
+  {
+    const bool can_load_more =
+        reader->view.size < STACK_LINE_READER_BUFFER_SIZE;
+    int eol_index = IndexOfEol(reader);
+    if (eol_index < 0 && can_load_more) {
+      const int read = BringToFrontAndLoadMore(reader);
+      if (read == 0) {
+        return CreateEOFLineResult(reader->view);
+      }
+      eol_index = IndexOfEol(reader);
+    }
+    if (eol_index < 0) {
+      reader->skip_mode = true;
+      return CreateTruncatedLineResult(reader->view);
+    }
+    {
+      StringView line =
+          CpuFeatures_StringView_KeepFront(reader->view, eol_index);
+      reader->view =
+          CpuFeatures_StringView_PopFront(reader->view, eol_index + 1);
+      return CreateValidLineResult(line);
+    }
+  }
+}
diff --git a/cpu_features/src/string_view.c b/cpu_features/src/string_view.c
new file mode 100644
index 0000000..dc3158f
--- /dev/null
+++ b/cpu_features/src/string_view.c
@@ -0,0 +1,182 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal/string_view.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <limits.h>
+#include <string.h>
+
+// Index of the first occurrence of `c` in `view`, or -1 when absent or when
+// the view is empty.
+int CpuFeatures_StringView_IndexOfChar(const StringView view, char c) {
+  if (view.ptr && view.size) {
+    const char* const found = (const char*)memchr(view.ptr, c, view.size);
+    if (found) {
+      return (int)(found - view.ptr);
+    }
+  }
+  return -1;
+}
+
+// Index of the first occurrence of `sub_view` in `view`, or -1 when absent.
+// An empty `sub_view` is never found.
+int CpuFeatures_StringView_IndexOf(const StringView view,
+                                   const StringView sub_view) {
+  if (sub_view.size) {
+    StringView remainder = view;
+    while (remainder.size >= sub_view.size) {
+      const int found_index =
+          CpuFeatures_StringView_IndexOfChar(remainder, sub_view.ptr[0]);
+      if (found_index < 0) break;
+      remainder = CpuFeatures_StringView_PopFront(remainder, found_index);
+      if (CpuFeatures_StringView_StartsWith(remainder, sub_view)) {
+        return (int)(remainder.ptr - view.ptr);
+      }
+      remainder = CpuFeatures_StringView_PopFront(remainder, 1);
+    }
+  }
+  return -1;
+}
+
+// True when both views have the same length and the same bytes.
+bool CpuFeatures_StringView_IsEquals(const StringView a, const StringView b) {
+  if (a.size == b.size) {
+    // Two empty views are equal whatever their pointers; checking the size
+    // first keeps a possibly null pointer away from memcmp.
+    return a.ptr == b.ptr || a.size == 0 || memcmp(a.ptr, b.ptr, b.size) == 0;
+  }
+  return false;
+}
+
+// True when `a` begins with the non-empty view `b`.
+bool CpuFeatures_StringView_StartsWith(const StringView a, const StringView b) {
+  return a.ptr && b.ptr && b.size && a.size >= b.size
+             ? memcmp(a.ptr, b.ptr, b.size) == 0
+             : false;
+}
+
+// Drops the first `count` bytes; yields the empty view when `count` exceeds
+// the size.
+StringView CpuFeatures_StringView_PopFront(const StringView str_view,
+                                           size_t count) {
+  if (count > str_view.size) {
+    return kEmptyStringView;
+  }
+  return view(str_view.ptr + count, str_view.size - count);
+}
+
+// Drops the last `count` bytes; yields the empty view when `count` exceeds
+// the size.
+StringView CpuFeatures_StringView_PopBack(const StringView str_view,
+                                          size_t count) {
+  if (count > str_view.size) {
+    return kEmptyStringView;
+  }
+  return view(str_view.ptr, str_view.size - count);
+}
+
+// Keeps the first `count` bytes; returns the view unchanged when `count`
+// exceeds the size.
+StringView CpuFeatures_StringView_KeepFront(const StringView str_view,
+                                            size_t count) {
+  return count <= str_view.size ? view(str_view.ptr, count) : str_view;
+}
+
+// First byte of a non-empty view.
+char CpuFeatures_StringView_Front(const StringView view) {
+  assert(view.size);
+  assert(view.ptr);
+  return view.ptr[0];
+}
+
+// Last byte of a non-empty view.
+char CpuFeatures_StringView_Back(const StringView view) {
+  assert(view.size);
+  assert(view.ptr);
+  return view.ptr[view.size - 1];
+}
+
+// Strips leading and trailing whitespace.
+StringView CpuFeatures_StringView_TrimWhitespace(StringView view) {
+  // <ctype.h> functions require a value representable as unsigned char (or
+  // EOF); a plain char holding a byte >= 0x80 may be negative.
+  while (view.size &&
+         isspace((unsigned char)CpuFeatures_StringView_Front(view)))
+    view = CpuFeatures_StringView_PopFront(view, 1);
+  while (view.size && isspace((unsigned char)CpuFeatures_StringView_Back(view)))
+    view = CpuFeatures_StringView_PopBack(view, 1);
+  return view;
+}
+
+// Value of a hexadecimal digit, or -1 when `c` is not one.
+static int HexValue(const char c) {
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  return -1;
+}
+
+// Returns -1 if view contains non digits or if the value does not fit an int.
+static int ParsePositiveNumberWithBase(const StringView view, int base) {
+  int result = 0;
+  StringView remainder = view;
+  for (; remainder.size;
+       remainder = CpuFeatures_StringView_PopFront(remainder, 1)) {
+    const int value = HexValue(CpuFeatures_StringView_Front(remainder));
+    if (value < 0 || value >= base) return -1;
+    // Signed overflow is undefined behavior: reject before multiplying.
+    if (result > (INT_MAX - value) / base) return -1;
+    result = (result * base) + value;
+  }
+  return result;
+}
+
+// Parses a decimal number, or a hexadecimal one when prefixed with "0x".
+// Returns -1 for an empty view or invalid digits.
+int CpuFeatures_StringView_ParsePositiveNumber(const StringView view) {
+  if (view.size) {
+    const StringView hex_prefix = str("0x");
+    if (CpuFeatures_StringView_StartsWith(view, hex_prefix)) {
+      const StringView span_no_prefix =
+          CpuFeatures_StringView_PopFront(view, hex_prefix.size);
+      return ParsePositiveNumberWithBase(span_no_prefix, 16);
+    }
+    return ParsePositiveNumberWithBase(view, 10);
+  }
+  return -1;
+}
+
+// Copies `src` into `dst` as a NUL-terminated string, truncating to
+// `dst_size - 1` bytes. Does nothing when `dst_size` is 0.
+void CpuFeatures_StringView_CopyString(const StringView src, char* dst,
+                                       size_t dst_size) {
+  if (dst_size > 0) {
+    const size_t max_copy_size = dst_size - 1;
+    const size_t copy_size =
+        src.size > max_copy_size ? max_copy_size : src.size;
+    // An empty view may carry a null pointer, which memcpy must not receive
+    // even for a zero-length copy.
+    if (copy_size > 0) memcpy(dst, src.ptr, copy_size);
+    dst[copy_size] = '\0';
+  }
+}
+
+// True when `word_str` appears in `line` delimited by a space or by the start
+// or end of the line.
+bool CpuFeatures_StringView_HasWord(const StringView line,
+                                    const char* const word_str) {
+  const StringView word = str(word_str);
+  StringView remainder = line;
+  for (;;) {
+    const int index_in_remainder =
+        CpuFeatures_StringView_IndexOf(remainder, word);
+    if (index_in_remainder < 0) {
+      return false;
+    } else {
+      // IndexOf reports a position inside `remainder`. Rebase it onto `line`
+      // before inspecting the neighbouring characters; otherwise every match
+      // after the first one is checked at the wrong place.
+      const size_t index_in_line =
+          (size_t)(remainder.ptr - line.ptr) + (size_t)index_in_remainder;
+      const StringView before =
+          CpuFeatures_StringView_KeepFront(line, index_in_line);
+      const StringView after =
+          CpuFeatures_StringView_PopFront(line, index_in_line + word.size);
+      const bool valid_before =
+          before.size == 0 || CpuFeatures_StringView_Back(before) == ' ';
+      const bool valid_after =
+          after.size == 0 || CpuFeatures_StringView_Front(after) == ' ';
+      if (valid_before && valid_after) return true;
+      remainder = CpuFeatures_StringView_PopFront(
+          remainder, index_in_remainder + word.size);
+    }
+  }
+}
+
+// Splits "key: value" at the first ": " and stores the trimmed halves in
+// `key` and `value`. Returns false when the separator is absent.
+bool CpuFeatures_StringView_GetAttributeKeyValue(const StringView line,
+                                                 StringView* key,
+                                                 StringView* value) {
+  const StringView sep = str(": ");
+  const int index_of_separator = CpuFeatures_StringView_IndexOf(line, sep);
+  if (index_of_separator < 0) return false;
+  *value = CpuFeatures_StringView_TrimWhitespace(
+      CpuFeatures_StringView_PopFront(line, index_of_separator + sep.size));
+  *key = CpuFeatures_StringView_TrimWhitespace(
+      CpuFeatures_StringView_KeepFront(line, index_of_separator));
+  return true;
+}
diff --git a/cpu_features/src/utils/list_cpu_features.c b/cpu_features/src/utils/list_cpu_features.c
new file mode 100644
index 0000000..c80ffc5
--- /dev/null
+++ b/cpu_features/src/utils/list_cpu_features.c
@@ -0,0 +1,438 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This program dumps current host data to the standard output.
+// Output can be text or json if the `--json` flag is passed.
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cpu_features_macros.h"
+
+#if defined(CPU_FEATURES_ARCH_X86)
+#include "cpuinfo_x86.h"
+#elif defined(CPU_FEATURES_ARCH_ARM)
+#include "cpuinfo_arm.h"
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+#include "cpuinfo_aarch64.h"
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+#include "cpuinfo_mips.h"
+#elif defined(CPU_FEATURES_ARCH_PPC)
+#include "cpuinfo_ppc.h"
+#endif
+
+// Design principles
+// -----------------
+// We build a tree structure containing all the data to be displayed.
+// Then depending on the output type (text or json) we walk the tree and display
+// the data accordingly.
+
+// We use a bump allocator to allocate strings and nodes of the tree,
+// Memory is not intended to be reclaimed.
+typedef struct {
+  char* ptr;    // Next free byte.
+  size_t size;  // Bytes still available from `ptr`.
+} BumpAllocator;
+
+char gGlobalBuffer[64 * 1024];
+BumpAllocator gBumpAllocator = {.ptr = gGlobalBuffer,
+                                .size = sizeof(gGlobalBuffer)};
+
+// Reports an unrecoverable allocator error on stderr and exits with failure.
+static void internal_error(void) {
+  fputs("internal error\n", stderr);
+  exit(EXIT_FAILURE);
+}
+
+#define ALIGN 8
+
+// Aborts through internal_error() when the allocator cursor is not a multiple
+// of ALIGN.
+static void assertAligned(void) {
+  if ((uintptr_t)(gBumpAllocator.ptr) % ALIGN) internal_error();
+}
+
+// Advances the allocator cursor to the next ALIGN boundary, giving up the
+// skipped bytes.
+static void BA_Align(void) {
+  while (gBumpAllocator.size && (uintptr_t)(gBumpAllocator.ptr) % ALIGN) {
+    --gBumpAllocator.size;
+    ++gBumpAllocator.ptr;
+  }
+  assertAligned();
+}
+
+// Update the available memory left in the BumpAllocator.
+static void* BA_Bump(size_t size) {
+  assertAligned();
+  // Align size to next 8B boundary.
+  size = (size + ALIGN - 1) / ALIGN * ALIGN;
+  if (gBumpAllocator.size < size) internal_error();
+  void* ptr = gBumpAllocator.ptr;
+  gBumpAllocator.size -= size;
+  gBumpAllocator.ptr += size;
+  return ptr;
+}
+
+// The type of the nodes in the tree.
+typedef enum {
+  NT_INVALID,
+  NT_INT,
+  NT_MAP,
+  NT_MAP_ENTRY,
+  NT_ARRAY,
+  NT_ARRAY_ELEMENT,
+  NT_STRING,
+} NodeType;
+
+// The node in the tree.
+typedef struct Node {
+  NodeType type;
+  unsigned integer;
+  const char* string;
+  struct Node* value;
+  struct Node* next;
+} Node;
+
+// Creates an initialized Node.
+static Node* BA_CreateNode(NodeType type) {
+  Node* tv = (Node*)BA_Bump(sizeof(Node));
+  assert(tv);
+  *tv = (Node){.type = type};
+  return tv;
+}
+
+// Adds an integer node.
+static Node* CreateInt(int value) {
+  Node* tv = BA_CreateNode(NT_INT);
+  tv->integer = value;
+  return tv;
+}
+
+// Adds a string node.
+// `value` must outlive the tree.
+static Node* CreateConstantString(const char* value) {
+  Node* tv = BA_CreateNode(NT_STRING);
+  tv->string = value;
+  return tv;
+}
+
+// Adds a map node.
+static Node* CreateMap(void) { return BA_CreateNode(NT_MAP); }
+
+// Adds an array node.
+static Node* CreateArray(void) { return BA_CreateNode(NT_ARRAY); }
+
+// Adds a formatted string node.
+// The text is formatted directly into the allocator's free space and then
+// claimed with BA_Bump. Formatting errors or lack of space end the program
+// through internal_error().
+static Node* CreatePrintfString(const char* format, ...) {
+  va_list arglist;
+  va_start(arglist, format);
+  char* const ptr = gBumpAllocator.ptr;
+  const int written = vsnprintf(ptr, gBumpAllocator.size, format, arglist);
+  va_end(arglist);
+  if (written < 0 || written >= (int)gBumpAllocator.size) internal_error();
+  // vsnprintf's return value excludes the terminating NUL. Reserve that byte
+  // too: when `written` is a multiple of ALIGN the terminator would otherwise
+  // sit in the next allocation and be overwritten by it.
+  return CreateConstantString((char*)BA_Bump((size_t)written + 1));
+}
+
+// Adds a string node.
+static Node* CreateString(const char* value) {
+  return CreatePrintfString("%s", value);
+}
+
+// Adds a map entry node.
+static void AddMapEntry(Node* map, const char* key, Node* value) {
+  assert(map && map->type == NT_MAP);
+  Node* current = map;
+  while (current->next) current = current->next;
+  current->next = (Node*)BA_Bump(sizeof(Node));
+  *current->next = (Node){.type = NT_MAP_ENTRY, .string = key, .value = value};
+}
+
+// Adds an array element node.
+static void AddArrayElement(Node* array, Node* value) {
+  assert(array && array->type == NT_ARRAY);
+  Node* current = array;
+  while (current->next) current = current->next;
+  current->next = (Node*)BA_Bump(sizeof(Node));
+  *current->next = (Node){.type = NT_ARRAY_ELEMENT, .value = value};
+}
+
+// qsort comparator ordering `const char*` elements with strcmp.
+static int cmp(const void* p1, const void* p2) {
+  return strcmp(*(const char* const*)p1, *(const char* const*)p2);
+}
+
+// Defines AddFlags(map, features): collects the names of all enabled features,
+// sorts them and stores them in `map` as a "flags" array.
+#define DEFINE_ADD_FLAGS(HasFeature, FeatureName, FeatureType, LastEnum) \
+  static void AddFlags(Node* map, const FeatureType* features) {         \
+    size_t i;                                                            \
+    const char* ptrs[LastEnum] = {0};                                    \
+    size_t count = 0;                                                    \
+    for (i = 0; i < LastEnum; ++i) {                                     \
+      if (HasFeature(features, i)) {                                     \
+        ptrs[count] = FeatureName(i);                                    \
+        ++count;                                                         \
+      }                                                                  \
+    }                                                                    \
+    qsort((void*)ptrs, count, sizeof(char*), cmp);                       \
+    Node* const array = CreateArray();                                   \
+    for (i = 0; i < count; ++i)                                          \
+      AddArrayElement(array, CreateConstantString(ptrs[i]));             \
+    AddMapEntry(map, "flags", array);                                    \
+  }
+
+#if defined(CPU_FEATURES_ARCH_X86)
+DEFINE_ADD_FLAGS(GetX86FeaturesEnumValue, GetX86FeaturesEnumName, X86Features,
+                 X86_LAST_)
+#elif defined(CPU_FEATURES_ARCH_ARM)
+DEFINE_ADD_FLAGS(GetArmFeaturesEnumValue, GetArmFeaturesEnumName, ArmFeatures,
+                 ARM_LAST_)
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+DEFINE_ADD_FLAGS(GetAarch64FeaturesEnumValue, GetAarch64FeaturesEnumName,
+                 Aarch64Features, AARCH64_LAST_)
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+DEFINE_ADD_FLAGS(GetMipsFeaturesEnumValue, GetMipsFeaturesEnumName,
+                 MipsFeatures, MIPS_LAST_)
+#elif defined(CPU_FEATURES_ARCH_PPC)
+DEFINE_ADD_FLAGS(GetPPCFeaturesEnumValue, GetPPCFeaturesEnumName, PPCFeatures,
+                 PPC_LAST_)
+#endif
+
+// Prints a json string with characters escaping.
+// A NULL `str` prints as an empty string.
+static void printJsonString(const char* str) {
+  putchar('"');
+  for (; str && *str; ++str) {
+    // JSON forbids raw control characters inside strings: each one must be
+    // replaced by its escape sequence, not merely preceded by a backslash.
+    switch (*str) {
+      case '\"':
+        fputs("\\\"", stdout);
+        break;
+      case '\\':
+        fputs("\\\\", stdout);
+        break;
+      case '/':
+        fputs("\\/", stdout);
+        break;
+      case '\b':
+        fputs("\\b", stdout);
+        break;
+      case '\f':
+        fputs("\\f", stdout);
+        break;
+      case '\n':
+        fputs("\\n", stdout);
+        break;
+      case '\r':
+        fputs("\\r", stdout);
+        break;
+      case '\t':
+        fputs("\\t", stdout);
+        break;
+      default:
+        if ((unsigned char)*str < 0x20) {
+          // Remaining control characters have no short form.
+          printf("\\u%04x", (unsigned)(unsigned char)*str);
+        } else {
+          putchar(*str);
+        }
+        break;
+    }
+  }
+  putchar('"');
+}
+
+// Walks a Node and print it as json.
+// Entries and elements are chained through `next`, so each one prints itself
+// and then the rest of its list.
+static void printJson(const Node* current) {
+  assert(current);
+  switch (current->type) {
+    case NT_INVALID:
+      break;
+    case NT_INT:
+      // `integer` is unsigned; convert explicitly to match the %d conversion.
+      printf("%d", (int)current->integer);
+      break;
+    case NT_STRING:
+      printJsonString(current->string);
+      break;
+    case NT_ARRAY:
+      putchar('[');
+      if (current->next) printJson(current->next);
+      putchar(']');
+      break;
+    case NT_MAP:
+      putchar('{');
+      if (current->next) printJson(current->next);
+      putchar('}');
+      break;
+    case NT_MAP_ENTRY:
+      printf("\"%s\":", current->string);
+      printJson(current->value);
+      if (current->next) {
+        putchar(',');
+        printJson(current->next);
+      }
+      break;
+    case NT_ARRAY_ELEMENT:
+      printJson(current->value);
+      if (current->next) {
+        putchar(',');
+        printJson(current->next);
+      }
+      break;
+  }
+}
+
+// Walks a Node and print it as text.
+static void printTextField(const Node* current) {
+  switch (current->type) {
+    case NT_INVALID:
+      break;
+    case NT_INT:
+      // `integer` is unsigned; %3d needs an int, %02X takes it as is.
+      printf("%3d (0x%02X)", (int)current->integer, current->integer);
+      break;
+    case NT_STRING:
+      fputs(current->string, stdout);
+      break;
+    case NT_ARRAY:
+      if (current->next) printTextField(current->next);
+      break;
+    case NT_MAP:
+      // Nested maps are printed in json form, even in text mode.
+      if (current->next) {
+        printf("{");
+        printJson(current->next);
+        printf("}");
+      }
+      break;
+    case NT_MAP_ENTRY:
+      printf("%-15s : ", current->string);
+      printTextField(current->value);
+      if (current->next) {
+        putchar('\n');
+        printTextField(current->next);
+      }
+      break;
+    case NT_ARRAY_ELEMENT:
+      printTextField(current->value);
+      if (current->next) {
+        putchar(',');
+        printTextField(current->next);
+      }
+      break;
+  }
+}
+
+// Prints the entries of the root map as text; other node types print nothing.
+static void printTextRoot(const Node* current) {
+  if (current->type == NT_MAP && current->next) printTextField(current->next);
+}
+
+// Prints the command line help, using `name` as the program name.
+static void showUsage(const char* name) {
+  printf(
+      "\n"
+      "Usage: %s [options]\n"
+      " Options:\n"
+      " -h | --help Show help message.\n"
+      " -j | --json Format output as json instead of plain text.\n"
+      "\n",
+      name);
+}
+
+// Returns a string node naming `cache_type`.
+static Node* GetCacheTypeString(CacheType cache_type) {
+  switch (cache_type) {
+    case CPU_FEATURE_CACHE_NULL:
+      return CreateConstantString("null");
+    case CPU_FEATURE_CACHE_DATA:
+      return CreateConstantString("data");
+    case CPU_FEATURE_CACHE_INSTRUCTION:
+      return CreateConstantString("instruction");
+    case CPU_FEATURE_CACHE_UNIFIED:
+      return CreateConstantString("unified");
+    case CPU_FEATURE_CACHE_TLB:
+      return CreateConstantString("tlb");
+    case CPU_FEATURE_CACHE_DTLB:
+      return CreateConstantString("dtlb");
+    case CPU_FEATURE_CACHE_STLB:
+      return CreateConstantString("stlb");
+    case CPU_FEATURE_CACHE_PREFETCH:
+      return CreateConstantString("prefetch");
+  }
+  // A value outside the cases above would otherwise run off the end of a
+  // non-void function, which is undefined behavior once the result is used.
+  return CreateConstantString("unknown");
+}
+
+// Adds a "cache_info" array to `root`, one map per cache level.
+static void AddCacheInfo(Node* root, const CacheInfo* cache_info) {
+  Node* array = CreateArray();
+  for (int i = 0; i < cache_info->size; ++i) {
+    CacheLevelInfo info = cache_info->levels[i];
+    Node* map = CreateMap();
+    AddMapEntry(map, "level", CreateInt(info.level));
+    AddMapEntry(map, "cache_type", GetCacheTypeString(info.cache_type));
+    AddMapEntry(map, "cache_size", CreateInt(info.cache_size));
+    AddMapEntry(map, "ways", CreateInt(info.ways));
+    AddMapEntry(map, "line_size", CreateInt(info.line_size));
+    AddMapEntry(map, "tlb_entries", CreateInt(info.tlb_entries));
+    AddMapEntry(map, "partitioning", CreateInt(info.partitioning));
+    AddArrayElement(array, map);
+  }
+  AddMapEntry(root, "cache_info", array);
+}
+
+// Builds the tree describing the host CPU for the architecture selected at
+// compile time. The root map is empty when no architecture macro is defined.
+static Node* CreateTree(void) {
+  Node* root = CreateMap();
+#if defined(CPU_FEATURES_ARCH_X86)
+  char brand_string[49];
+  const X86Info info = GetX86Info();
+  const CacheInfo cache_info = GetX86CacheInfo();
+  FillX86BrandString(brand_string);
+  AddMapEntry(root, "arch", CreateString("x86"));
+  AddMapEntry(root, "brand", CreateString(brand_string));
+  AddMapEntry(root, "family", CreateInt(info.family));
+  AddMapEntry(root, "model", CreateInt(info.model));
+  AddMapEntry(root, "stepping", CreateInt(info.stepping));
+  AddMapEntry(root, "uarch",
+              CreateString(
+                  GetX86MicroarchitectureName(GetX86Microarchitecture(&info))));
+  AddFlags(root, &info.features);
+  AddCacheInfo(root, &cache_info);
+#elif defined(CPU_FEATURES_ARCH_ARM)
+  const ArmInfo info = GetArmInfo();
+  AddMapEntry(root, "arch", CreateString("ARM"));
+  AddMapEntry(root, "implementer", CreateInt(info.implementer));
+  AddMapEntry(root, "architecture", CreateInt(info.architecture));
+  AddMapEntry(root, "variant", CreateInt(info.variant));
+  AddMapEntry(root, "part", CreateInt(info.part));
+  AddMapEntry(root, "revision", CreateInt(info.revision));
+  AddFlags(root, &info.features);
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+  const Aarch64Info info = GetAarch64Info();
+  AddMapEntry(root, "arch", CreateString("aarch64"));
+  AddMapEntry(root, "implementer", CreateInt(info.implementer));
+  AddMapEntry(root, "variant", CreateInt(info.variant));
+  AddMapEntry(root, "part", CreateInt(info.part));
+  AddMapEntry(root, "revision", CreateInt(info.revision));
+  AddFlags(root, &info.features);
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+  const MipsInfo info = GetMipsInfo();
+  AddMapEntry(root, "arch", CreateString("mips"));
+  AddFlags(root, &info.features);
+#elif defined(CPU_FEATURES_ARCH_PPC)
+  const PPCInfo info = GetPPCInfo();
+  const PPCPlatformStrings strings = GetPPCPlatformStrings();
+  AddMapEntry(root, "arch", CreateString("ppc"));
+  AddMapEntry(root, "platform", CreateString(strings.platform));
+  AddMapEntry(root, "model", CreateString(strings.model));
+  AddMapEntry(root, "machine", CreateString(strings.machine));
+  AddMapEntry(root, "cpu", CreateString(strings.cpu));
+  AddMapEntry(root, "instruction", CreateString(strings.type.platform));
+  AddMapEntry(root, "microarchitecture",
+              CreateString(strings.type.base_platform));
+  AddFlags(root, &info.features);
+#endif
+  return root;
+}
+
+// Entry point: builds the tree, parses the options and prints the tree as
+// text or json. Any unknown option prints the usage and fails; -h/--help
+// prints the usage and succeeds.
+int main(int argc, char** argv) {
+  BA_Align();
+  const Node* const root = CreateTree();
+  bool outputJson = false;
+  int i = 1;
+  for (; i < argc; ++i) {
+    const char* arg = argv[i];
+    if (strcmp(arg, "-j") == 0 || strcmp(arg, "--json") == 0) {
+      outputJson = true;
+    } else {
+      showUsage(argv[0]);
+      if (strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0)
+        return EXIT_SUCCESS;
+      return EXIT_FAILURE;
+    }
+  }
+  if (outputJson)
+    printJson(root);
+  else
+    printTextRoot(root);
+  putchar('\n');
+  return EXIT_SUCCESS;
+}
diff --git a/cpu_features/test/CMakeLists.txt b/cpu_features/test/CMakeLists.txt
new file mode 100644
index 0000000..c10e617
--- /dev/null
+++ b/cpu_features/test/CMakeLists.txt
@@ -0,0 +1,85 @@
+#
+# libraries for tests
+#
+
+include_directories(../include)
+add_definitions(-DCPU_FEATURES_TEST)
+
+##------------------------------------------------------------------------------
+add_library(string_view ../src/string_view.c)
+##------------------------------------------------------------------------------
+# Mock filesystem; consumers are compiled with CPU_FEATURES_MOCK_FILESYSTEM.
+add_library(filesystem_for_testing filesystem_for_testing.cc)
+target_compile_definitions(filesystem_for_testing PUBLIC CPU_FEATURES_MOCK_FILESYSTEM)
+##------------------------------------------------------------------------------
+add_library(hwcaps_for_testing hwcaps_for_testing.cc)
+target_link_libraries(hwcaps_for_testing filesystem_for_testing)
+##------------------------------------------------------------------------------
+# Line reader built with a 1024-byte buffer.
+add_library(stack_line_reader ../src/stack_line_reader.c)
+target_compile_definitions(stack_line_reader PUBLIC STACK_LINE_READER_BUFFER_SIZE=1024)
+target_link_libraries(stack_line_reader string_view)
+##------------------------------------------------------------------------------
+# Same source built with a 16-byte buffer and the mock filesystem, used by
+# stack_line_reader_test.
+add_library(stack_line_reader_for_test ../src/stack_line_reader.c)
+target_compile_definitions(stack_line_reader_for_test PUBLIC STACK_LINE_READER_BUFFER_SIZE=16)
+target_link_libraries(stack_line_reader_for_test string_view filesystem_for_testing)
+##------------------------------------------------------------------------------
+# Bundle linked by every cpuinfo_*_test below.
+add_library(all_libraries ../src/hwcaps.c ../src/stack_line_reader.c)
+target_link_libraries(all_libraries hwcaps_for_testing stack_line_reader string_view)
+
+#
+# tests
+#
+# Every target declared after this point links gtest and gmock_main.
+link_libraries(gtest gmock_main)
+
+## bit_utils_test
+add_executable(bit_utils_test bit_utils_test.cc)
+target_link_libraries(bit_utils_test)
+add_test(NAME bit_utils_test COMMAND bit_utils_test)
+##------------------------------------------------------------------------------
+## string_view_test
+add_executable(string_view_test string_view_test.cc ../src/string_view.c)
+target_link_libraries(string_view_test string_view)
+add_test(NAME string_view_test COMMAND string_view_test)
+##------------------------------------------------------------------------------
+## stack_line_reader_test
+add_executable(stack_line_reader_test stack_line_reader_test.cc)
+target_link_libraries(stack_line_reader_test stack_line_reader_for_test)
+add_test(NAME stack_line_reader_test COMMAND stack_line_reader_test)
+##------------------------------------------------------------------------------
+## cpuinfo_x86_test
+if(PROCESSOR_IS_X86)
+  add_executable(cpuinfo_x86_test cpuinfo_x86_test.cc ../src/cpuinfo_x86.c)
+  target_compile_definitions(cpuinfo_x86_test PUBLIC CPU_FEATURES_MOCK_CPUID_X86)
+  if(APPLE)
+    target_compile_definitions(cpuinfo_x86_test PRIVATE HAVE_SYSCTLBYNAME)
+  endif()
+  target_link_libraries(cpuinfo_x86_test all_libraries)
+  add_test(NAME cpuinfo_x86_test COMMAND cpuinfo_x86_test)
+endif()
+##------------------------------------------------------------------------------
+## cpuinfo_arm_test
+if(PROCESSOR_IS_ARM)
+  add_executable(cpuinfo_arm_test cpuinfo_arm_test.cc ../src/cpuinfo_arm.c)
+  target_link_libraries(cpuinfo_arm_test all_libraries)
+  add_test(NAME cpuinfo_arm_test COMMAND cpuinfo_arm_test)
+endif()
+##------------------------------------------------------------------------------
+## cpuinfo_aarch64_test
+if(PROCESSOR_IS_AARCH64)
+  add_executable(cpuinfo_aarch64_test cpuinfo_aarch64_test.cc ../src/cpuinfo_aarch64.c)
+  target_link_libraries(cpuinfo_aarch64_test all_libraries)
+  add_test(NAME cpuinfo_aarch64_test COMMAND cpuinfo_aarch64_test)
+endif()
+##------------------------------------------------------------------------------
+## cpuinfo_mips_test
+if(PROCESSOR_IS_MIPS)
+  add_executable(cpuinfo_mips_test cpuinfo_mips_test.cc ../src/cpuinfo_mips.c)
+  target_link_libraries(cpuinfo_mips_test all_libraries)
+  add_test(NAME cpuinfo_mips_test COMMAND cpuinfo_mips_test)
+endif()
+##------------------------------------------------------------------------------
+## cpuinfo_ppc_test
+if(PROCESSOR_IS_POWER)
+  add_executable(cpuinfo_ppc_test cpuinfo_ppc_test.cc ../src/cpuinfo_ppc.c)
+  target_link_libraries(cpuinfo_ppc_test all_libraries)
+  add_test(NAME cpuinfo_ppc_test COMMAND cpuinfo_ppc_test)
+endif()
diff --git a/cpu_features/test/bit_utils_test.cc b/cpu_features/test/bit_utils_test.cc
new file mode
100644 index 0000000..3874e13 --- /dev/null +++ b/cpu_features/test/bit_utils_test.cc @@ -0,0 +1,53 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal/bit_utils.h" + +#include "gtest/gtest.h" + +namespace cpu_features { +namespace { + +TEST(UtilsTest, IsBitSet) { + for (size_t bit_set = 0; bit_set < 32; ++bit_set) { + const uint32_t value = 1UL << bit_set; + for (uint32_t i = 0; i < 32; ++i) { + EXPECT_EQ(IsBitSet(value, i), i == bit_set); + } + } + + // testing 0, all bits should be 0. + for (uint32_t i = 0; i < 32; ++i) { + EXPECT_FALSE(IsBitSet(0, i)); + } + + // testing ~0, all bits should be 1. + for (uint32_t i = 0; i < 32; ++i) { + EXPECT_TRUE(IsBitSet(-1, i)); + } +} + +TEST(UtilsTest, ExtractBitRange) { + // Extracting all bits gives the same number. + EXPECT_EQ(ExtractBitRange(123, 31, 0), 123); + // Extracting 1 bit gives parity. 
+ EXPECT_EQ(ExtractBitRange(123, 0, 0), 1); + EXPECT_EQ(ExtractBitRange(122, 0, 0), 0); + + EXPECT_EQ(ExtractBitRange(0xF0, 7, 4), 0xF); + EXPECT_EQ(ExtractBitRange(0x42 << 2, 10, 2), 0x42); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_aarch64_test.cc b/cpu_features/test/cpuinfo_aarch64_test.cc new file mode 100644 index 0000000..5afaaa8 --- /dev/null +++ b/cpu_features/test/cpuinfo_aarch64_test.cc @@ -0,0 +1,171 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_aarch64.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "hwcaps_for_testing.h" + +namespace cpu_features { +namespace { + +void DisableHardwareCapabilities() { SetHardwareCapabilities(0, 0); } + +TEST(CpuinfoAarch64Test, FromHardwareCap) { + SetHardwareCapabilities(AARCH64_HWCAP_FP | AARCH64_HWCAP_AES, 0); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetAarch64Info(); + EXPECT_TRUE(info.features.fp); + EXPECT_FALSE(info.features.asimd); + EXPECT_FALSE(info.features.evtstrm); + EXPECT_TRUE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); + EXPECT_FALSE(info.features.atomics); + EXPECT_FALSE(info.features.fphp); + EXPECT_FALSE(info.features.asimdhp); + EXPECT_FALSE(info.features.cpuid); + EXPECT_FALSE(info.features.asimdrdm); + EXPECT_FALSE(info.features.jscvt); + EXPECT_FALSE(info.features.fcma); + EXPECT_FALSE(info.features.lrcpc); + EXPECT_FALSE(info.features.dcpop); + EXPECT_FALSE(info.features.sha3); + EXPECT_FALSE(info.features.sm3); + EXPECT_FALSE(info.features.sm4); + EXPECT_FALSE(info.features.asimddp); + EXPECT_FALSE(info.features.sha512); + EXPECT_FALSE(info.features.sve); + EXPECT_FALSE(info.features.asimdfhm); + EXPECT_FALSE(info.features.dit); + EXPECT_FALSE(info.features.uscat); + EXPECT_FALSE(info.features.ilrcpc); + EXPECT_FALSE(info.features.flagm); + EXPECT_FALSE(info.features.ssbs); + EXPECT_FALSE(info.features.sb); + EXPECT_FALSE(info.features.paca); + EXPECT_FALSE(info.features.pacg); +} + +TEST(CpuinfoAarch64Test, FromHardwareCap2) { + SetHardwareCapabilities(AARCH64_HWCAP_FP, + AARCH64_HWCAP2_SVE2 | AARCH64_HWCAP2_BTI); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetAarch64Info(); + EXPECT_TRUE(info.features.fp); + + EXPECT_TRUE(info.features.sve2); + EXPECT_TRUE(info.features.bti); + + EXPECT_FALSE(info.features.dcpodp); + 
EXPECT_FALSE(info.features.sveaes); + EXPECT_FALSE(info.features.svepmull); + EXPECT_FALSE(info.features.svebitperm); + EXPECT_FALSE(info.features.svesha3); + EXPECT_FALSE(info.features.svesm4); + EXPECT_FALSE(info.features.flagm2); + EXPECT_FALSE(info.features.frint); + EXPECT_FALSE(info.features.svei8mm); + EXPECT_FALSE(info.features.svef32mm); + EXPECT_FALSE(info.features.svef64mm); + EXPECT_FALSE(info.features.svebf16); + EXPECT_FALSE(info.features.i8mm); + EXPECT_FALSE(info.features.bf16); + EXPECT_FALSE(info.features.dgh); + EXPECT_FALSE(info.features.rng); +} + +TEST(CpuinfoAarch64Test, ARMCortexA53) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(Processor : AArch64 Processor rev 3 (aarch64) +processor : 0 +processor : 1 +processor : 2 +processor : 3 +processor : 4 +processor : 5 +processor : 6 +processor : 7 +Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 +CPU implementer : 0x41 +CPU architecture: AArch64 +CPU variant : 0x0 +CPU part : 0xd03 +CPU revision : 3)"); + const auto info = GetAarch64Info(); + EXPECT_EQ(info.implementer, 0x41); + EXPECT_EQ(info.variant, 0x0); + EXPECT_EQ(info.part, 0xd03); + EXPECT_EQ(info.revision, 3); + + EXPECT_TRUE(info.features.fp); + EXPECT_TRUE(info.features.asimd); + EXPECT_TRUE(info.features.evtstrm); + EXPECT_TRUE(info.features.aes); + EXPECT_TRUE(info.features.pmull); + EXPECT_TRUE(info.features.sha1); + EXPECT_TRUE(info.features.sha2); + EXPECT_TRUE(info.features.crc32); + + EXPECT_FALSE(info.features.atomics); + EXPECT_FALSE(info.features.fphp); + EXPECT_FALSE(info.features.asimdhp); + EXPECT_FALSE(info.features.cpuid); + EXPECT_FALSE(info.features.asimdrdm); + EXPECT_FALSE(info.features.jscvt); + EXPECT_FALSE(info.features.fcma); + EXPECT_FALSE(info.features.lrcpc); + EXPECT_FALSE(info.features.dcpop); + EXPECT_FALSE(info.features.sha3); + EXPECT_FALSE(info.features.sm3); + EXPECT_FALSE(info.features.sm4); + EXPECT_FALSE(info.features.asimddp); + 
EXPECT_FALSE(info.features.sha512); + EXPECT_FALSE(info.features.sve); + EXPECT_FALSE(info.features.asimdfhm); + EXPECT_FALSE(info.features.dit); + EXPECT_FALSE(info.features.uscat); + EXPECT_FALSE(info.features.ilrcpc); + EXPECT_FALSE(info.features.flagm); + EXPECT_FALSE(info.features.ssbs); + EXPECT_FALSE(info.features.sb); + EXPECT_FALSE(info.features.paca); + EXPECT_FALSE(info.features.pacg); + EXPECT_FALSE(info.features.dcpodp); + EXPECT_FALSE(info.features.sve2); + EXPECT_FALSE(info.features.sveaes); + EXPECT_FALSE(info.features.svepmull); + EXPECT_FALSE(info.features.svebitperm); + EXPECT_FALSE(info.features.svesha3); + EXPECT_FALSE(info.features.svesm4); + EXPECT_FALSE(info.features.flagm2); + EXPECT_FALSE(info.features.frint); + EXPECT_FALSE(info.features.svei8mm); + EXPECT_FALSE(info.features.svef32mm); + EXPECT_FALSE(info.features.svef64mm); + EXPECT_FALSE(info.features.svebf16); + EXPECT_FALSE(info.features.i8mm); + EXPECT_FALSE(info.features.bf16); + EXPECT_FALSE(info.features.dgh); + EXPECT_FALSE(info.features.rng); + EXPECT_FALSE(info.features.bti); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_arm_test.cc b/cpu_features/test/cpuinfo_arm_test.cc new file mode 100644 index 0000000..e0b08a4 --- /dev/null +++ b/cpu_features/test/cpuinfo_arm_test.cc @@ -0,0 +1,354 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_arm.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "hwcaps_for_testing.h" + +namespace cpu_features { +namespace { + +void DisableHardwareCapabilities() { SetHardwareCapabilities(0, 0); } + +TEST(CpuinfoArmTest, FromHardwareCap) { + SetHardwareCapabilities(ARM_HWCAP_NEON, ARM_HWCAP2_AES | ARM_HWCAP2_CRC32); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.vfp); // triggered by vfpv3 + EXPECT_TRUE(info.features.vfpv3); // triggered by neon + EXPECT_TRUE(info.features.neon); + EXPECT_TRUE(info.features.aes); + EXPECT_TRUE(info.features.crc32); + + EXPECT_FALSE(info.features.vfpv4); + EXPECT_FALSE(info.features.iwmmxt); + EXPECT_FALSE(info.features.crunch); + EXPECT_FALSE(info.features.thumbee); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_FALSE(info.features.idiva); + EXPECT_FALSE(info.features.idivt); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + + // check some random features with EnumValue(): + EXPECT_TRUE(GetArmFeaturesEnumValue(&info.features, ARM_VFP)); + EXPECT_FALSE(GetArmFeaturesEnumValue(&info.features, ARM_VFPV4)); + // out of bound EnumValue() check + EXPECT_FALSE(GetArmFeaturesEnumValue(&info.features, (ArmFeaturesEnum)~0x0)); +} + +TEST(CpuinfoArmTest, ODroidFromCpuInfo) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(processor : 0 +model name : ARMv7 Processor rev 3 (v71) +BogoMIPS : 120.00 +Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x2 +CPU part : 0xc0f +CPU revision : 3)"); + const auto info = GetArmInfo(); + EXPECT_EQ(info.implementer, 0x41); + EXPECT_EQ(info.variant, 0x2); + EXPECT_EQ(info.part, 0xc0f); + EXPECT_EQ(info.revision, 3); + EXPECT_EQ(info.architecture, 7); + + EXPECT_FALSE(info.features.swp); + 
EXPECT_TRUE(info.features.half); + EXPECT_TRUE(info.features.thumb); + EXPECT_FALSE(info.features._26bit); + EXPECT_TRUE(info.features.fastmult); + EXPECT_FALSE(info.features.fpa); + EXPECT_TRUE(info.features.vfp); + EXPECT_TRUE(info.features.edsp); + EXPECT_FALSE(info.features.java); + EXPECT_FALSE(info.features.iwmmxt); + EXPECT_FALSE(info.features.crunch); + EXPECT_FALSE(info.features.thumbee); + EXPECT_TRUE(info.features.neon); + EXPECT_TRUE(info.features.vfpv3); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_TRUE(info.features.tls); + EXPECT_TRUE(info.features.vfpv4); + EXPECT_TRUE(info.features.idiva); + EXPECT_TRUE(info.features.idivt); + EXPECT_TRUE(info.features.vfpd32); + EXPECT_TRUE(info.features.lpae); + EXPECT_FALSE(info.features.evtstrm); + EXPECT_FALSE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); +} + +// Linux test-case +TEST(CpuinfoArmTest, RaspberryPiZeroFromCpuInfo) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(processor : 0 +model name : ARMv6-compatible processor rev 7 (v6l) +BogoMIPS : 697.95 +Features : half thumb fastmult vfp edsp java tls +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0xb76 +CPU revision : 7 + +Hardware : BCM2835 +Revision : 9000c1 +Serial : 000000006cd946f3)"); + const auto info = GetArmInfo(); + EXPECT_EQ(info.implementer, 0x41); + EXPECT_EQ(info.variant, 0x0); + EXPECT_EQ(info.part, 0xb76); + EXPECT_EQ(info.revision, 7); + EXPECT_EQ(info.architecture, 6); + + EXPECT_FALSE(info.features.swp); + EXPECT_TRUE(info.features.half); + EXPECT_TRUE(info.features.thumb); + EXPECT_FALSE(info.features._26bit); + EXPECT_TRUE(info.features.fastmult); + EXPECT_FALSE(info.features.fpa); + EXPECT_TRUE(info.features.vfp); + EXPECT_TRUE(info.features.edsp); + EXPECT_TRUE(info.features.java); + EXPECT_FALSE(info.features.iwmmxt); + 
EXPECT_FALSE(info.features.crunch); + EXPECT_FALSE(info.features.thumbee); + EXPECT_FALSE(info.features.neon); + EXPECT_FALSE(info.features.vfpv3); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_TRUE(info.features.tls); + EXPECT_FALSE(info.features.vfpv4); + EXPECT_FALSE(info.features.idiva); + EXPECT_FALSE(info.features.idivt); + EXPECT_FALSE(info.features.vfpd32); + EXPECT_FALSE(info.features.lpae); + EXPECT_FALSE(info.features.evtstrm); + EXPECT_FALSE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); +} + +TEST(CpuinfoArmTest, MarvellArmadaFromCpuInfo) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(processor : 0 +model name : ARMv7 Processor rev 1 (v7l) +BogoMIPS : 50.00 +Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpd32 +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x4 +CPU part : 0xc09 +CPU revision : 1 + +processor : 1 +model name : ARMv7 Processor rev 1 (v7l) +BogoMIPS : 50.00 +Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpd32 +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x4 +CPU part : 0xc09 +CPU revision : 1 + +Hardware : Marvell Armada 380/385 (Device Tree) +Revision : 0000 +Serial : 0000000000000000)"); + const auto info = GetArmInfo(); + EXPECT_EQ(info.implementer, 0x41); + EXPECT_EQ(info.variant, 0x4); + EXPECT_EQ(info.part, 0xc09); + EXPECT_EQ(info.revision, 1); + EXPECT_EQ(info.architecture, 7); + + EXPECT_FALSE(info.features.swp); + EXPECT_TRUE(info.features.half); + EXPECT_TRUE(info.features.thumb); + EXPECT_FALSE(info.features._26bit); + EXPECT_TRUE(info.features.fastmult); + EXPECT_FALSE(info.features.fpa); + EXPECT_TRUE(info.features.vfp); + EXPECT_TRUE(info.features.edsp); + EXPECT_FALSE(info.features.java); + EXPECT_FALSE(info.features.iwmmxt); + EXPECT_FALSE(info.features.crunch); + 
EXPECT_FALSE(info.features.thumbee); + EXPECT_TRUE(info.features.neon); + EXPECT_TRUE(info.features.vfpv3); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_TRUE(info.features.tls); + EXPECT_FALSE(info.features.vfpv4); + EXPECT_FALSE(info.features.idiva); + EXPECT_FALSE(info.features.idivt); + EXPECT_TRUE(info.features.vfpd32); + EXPECT_FALSE(info.features.lpae); + EXPECT_FALSE(info.features.evtstrm); + EXPECT_FALSE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); +} + +// Android test-case +// http://code.google.com/p/android/issues/detail?id=10812 +TEST(CpuinfoArmTest, InvalidArmv7) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(Processor : ARMv6-compatible processor rev 6 (v6l) +BogoMIPS : 199.47 +Features : swp half thumb fastmult vfp edsp java +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0xb76 +CPU revision : 6 + +Hardware : SPICA +Revision : 0020 +Serial : 33323613546d00ec )"); + const auto info = GetArmInfo(); + EXPECT_EQ(info.architecture, 6); + + EXPECT_TRUE(info.features.swp); + EXPECT_TRUE(info.features.half); + EXPECT_TRUE(info.features.thumb); + EXPECT_FALSE(info.features._26bit); + EXPECT_TRUE(info.features.fastmult); + EXPECT_FALSE(info.features.fpa); + EXPECT_TRUE(info.features.vfp); + EXPECT_TRUE(info.features.edsp); + EXPECT_TRUE(info.features.java); + EXPECT_FALSE(info.features.iwmmxt); + EXPECT_FALSE(info.features.crunch); + EXPECT_FALSE(info.features.thumbee); + EXPECT_FALSE(info.features.neon); + EXPECT_FALSE(info.features.vfpv3); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_FALSE(info.features.tls); + EXPECT_FALSE(info.features.vfpv4); + EXPECT_FALSE(info.features.idiva); + EXPECT_FALSE(info.features.idivt); + EXPECT_FALSE(info.features.vfpd32); + EXPECT_FALSE(info.features.lpae); + EXPECT_FALSE(info.features.evtstrm); + 
EXPECT_FALSE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); +} + +// Android test-case +// https://crbug.com/341598. +TEST(CpuinfoArmTest, InvalidNeon) { + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(Processor: ARMv7 Processory rev 0 (v71) +processor: 0 +BogoMIPS: 13.50 + +Processor: 1 +BogoMIPS: 13.50 + +Features: swp half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt +CPU implementer : 0x51 +CPU architecture: 7 +CPU variant: 0x1 +CPU part: 0x04d +CPU revision: 0 + +Hardware: SAMSUNG M2 +Revision: 0010 +Serial: 00001e030000354e)"); + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.swp); + EXPECT_FALSE(info.features.neon); +} + +// The Nexus 4 (Qualcomm Krait) kernel configuration forgets to report IDIV +// support. +TEST(CpuinfoArmTest, Nexus4_0x510006f2) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(CPU implementer : 0x51 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0x6f +CPU revision : 2)"); + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.idiva); + EXPECT_TRUE(info.features.idivt); + + EXPECT_EQ(GetArmCpuId(&info), 0x510006f2); +} + +// The Nexus 4 (Qualcomm Krait) kernel configuration forgets to report IDIV +// support. +TEST(CpuinfoArmTest, Nexus4_0x510006f3) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(CPU implementer : 0x51 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0x6f +CPU revision : 3)"); + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.idiva); + EXPECT_TRUE(info.features.idivt); + + EXPECT_EQ(GetArmCpuId(&info), 0x510006f3); +} + +// The emulator-specific Android 4.2 kernel fails to report support for the +// 32-bit ARM IDIV instruction. Technically, this is a feature of the virtual +// CPU implemented by the emulator. 
+TEST(CpuinfoArmTest, EmulatorSpecificIdiv) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(Processor : ARMv7 Processor rev 0 (v7l) +BogoMIPS : 629.14 +Features : swp half thumb fastmult vfp edsp neon vfpv3 +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0xc08 +CPU revision : 0 + +Hardware : Goldfish +Revision : 0000 +Serial : 0000000000000000)"); + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.idiva); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_mips_test.cc b/cpu_features/test/cpuinfo_mips_test.cc new file mode 100644 index 0000000..d734058 --- /dev/null +++ b/cpu_features/test/cpuinfo_mips_test.cc @@ -0,0 +1,126 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_mips.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "hwcaps_for_testing.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +namespace cpu_features { + +namespace { + +void DisableHardwareCapabilities() { SetHardwareCapabilities(0, 0); } + +TEST(CpuinfoMipsTest, FromHardwareCapBoth) { + SetHardwareCapabilities(MIPS_HWCAP_MSA | MIPS_HWCAP_R6, 0); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetMipsInfo(); + EXPECT_TRUE(info.features.msa); + EXPECT_FALSE(info.features.eva); + EXPECT_TRUE(info.features.r6); +} + +TEST(CpuinfoMipsTest, FromHardwareCapOnlyOne) { + SetHardwareCapabilities(MIPS_HWCAP_MSA, 0); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetMipsInfo(); + EXPECT_TRUE(info.features.msa); + EXPECT_FALSE(info.features.eva); +} + +TEST(CpuinfoMipsTest, Ci40) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(system type : IMG Pistachio SoC (B0) +machine : IMG Marduk – Ci40 with cc2520 +processor : 0 +cpu model : MIPS interAptiv (multi) V2.0 FPU V0.0 +BogoMIPS : 363.72 +wait instruction : yes +microsecond timers : yes +tlb_entries : 64 +extra interrupt vector : yes +hardware watchpoint : yes, count: 4, address/irw mask: [0x0ffc, 0x0ffc, 0x0ffb, 0x0ffb] +isa : mips1 mips2 mips32r1 mips32r2 +ASEs implemented : mips16 dsp mt eva +shadow register sets : 1 +kscratch registers : 0 +package : 0 +core : 0 +VCED exceptions : not available +VCEI exceptions : not available +VPE : 0 +)"); + const auto info = GetMipsInfo(); + EXPECT_FALSE(info.features.msa); + EXPECT_TRUE(info.features.eva); +} + +TEST(CpuinfoMipsTest, AR7161) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(system type : Atheros AR7161 rev 2 +machine : NETGEAR WNDR3700/WNDR3800/WNDRMAC +processor : 0 +cpu model : MIPS 24Kc V7.4 +BogoMIPS : 452.19 +wait instruction 
: yes +microsecond timers : yes +tlb_entries : 16 +extra interrupt vector : yes +hardware watchpoint : yes, count: 4, address/irw mask: [0x0000, 0x0f98, 0x0f78, 0x0df8] +ASEs implemented : mips16 +shadow register sets : 1 +kscratch registers : 0 +core : 0 +VCED exceptions : not available +VCEI exceptions : not available +)"); + const auto info = GetMipsInfo(); + EXPECT_FALSE(info.features.msa); + EXPECT_FALSE(info.features.eva); +} + +TEST(CpuinfoMipsTest, Goldfish) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(system type : MIPS-Goldfish +Hardware : goldfish +Revison : 1 +processor : 0 +cpu model : MIPS 24Kc V0.0 FPU V0.0 +BogoMIPS : 1042.02 +wait instruction : yes +microsecond timers : yes +tlb_entries : 16 +extra interrupt vector : yes +hardware watchpoint : yes, count: 1, address/irw mask: [0x0ff8] +ASEs implemented : +shadow register sets : 1 +core : 0 +VCED exceptions : not available +VCEI exceptions : not available +)"); + const auto info = GetMipsInfo(); + EXPECT_FALSE(info.features.msa); + EXPECT_FALSE(info.features.eva); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_ppc_test.cc b/cpu_features/test/cpuinfo_ppc_test.cc new file mode 100644 index 0000000..8f0cb65 --- /dev/null +++ b/cpu_features/test/cpuinfo_ppc_test.cc @@ -0,0 +1,119 @@ +// Copyright 2018 IBM. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_ppc.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "hwcaps_for_testing.h" +#include "internal/string_view.h" + +namespace cpu_features { +namespace { + +void DisableHardwareCapabilities() { SetHardwareCapabilities(0, 0); } + +TEST(CpustringsPPCTest, FromHardwareCap) { + SetHardwareCapabilities(PPC_FEATURE_HAS_FPU | PPC_FEATURE_HAS_VSX, + PPC_FEATURE2_ARCH_3_00); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetPPCInfo(); + EXPECT_TRUE(info.features.fpu); + EXPECT_FALSE(info.features.mmu); + EXPECT_TRUE(info.features.vsx); + EXPECT_TRUE(info.features.arch300); + EXPECT_FALSE(info.features.power4); + EXPECT_FALSE(info.features.altivec); + EXPECT_FALSE(info.features.vcrypto); + EXPECT_FALSE(info.features.htm); +} + +TEST(CpustringsPPCTest, Blade) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(processor : 14 +cpu : POWER7 (architected), altivec supported +clock : 3000.000000MHz +revision : 2.1 (pvr 003f 0201) + +processor : 15 +cpu : POWER7 (architected), altivec supported +clock : 3000.000000MHz +revision : 2.1 (pvr 003f 0201) + +timebase : 512000000 +platform : pSeries +model : IBM,8406-70Y +machine : CHRP IBM,8406-70Y)"); + SetPlatformTypes("power7", "power8"); + const auto strings = GetPPCPlatformStrings(); + ASSERT_STREQ(strings.platform, "pSeries"); + ASSERT_STREQ(strings.model, "IBM,8406-70Y"); + ASSERT_STREQ(strings.machine, "CHRP IBM,8406-70Y"); + ASSERT_STREQ(strings.cpu, "POWER7 (architected), altivec supported"); + ASSERT_STREQ(strings.type.platform, "power7"); + ASSERT_STREQ(strings.type.base_platform, "power8"); +} + +TEST(CpustringsPPCTest, Firestone) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(processor : 126 +cpu : POWER8 (raw), altivec supported +clock : 2061.000000MHz +revision : 2.0 (pvr 004d 0200) + +processor : 127 +cpu : POWER8 (raw), altivec 
supported +clock : 2061.000000MHz +revision : 2.0 (pvr 004d 0200) + +timebase : 512000000 +platform : PowerNV +model : 8335-GTA +machine : PowerNV 8335-GTA +firmware : OPAL v3)"); + const auto strings = GetPPCPlatformStrings(); + ASSERT_STREQ(strings.platform, "PowerNV"); + ASSERT_STREQ(strings.model, "8335-GTA"); + ASSERT_STREQ(strings.machine, "PowerNV 8335-GTA"); + ASSERT_STREQ(strings.cpu, "POWER8 (raw), altivec supported"); +} + +TEST(CpustringsPPCTest, w8) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(processor : 143 +cpu : POWER9, altivec supported +clock : 2300.000000MHz +revision : 2.2 (pvr 004e 1202) + +timebase : 512000000 +platform : PowerNV +model : 0000000000000000 +machine : PowerNV 0000000000000000 +firmware : OPAL +MMU : Radix)"); + const auto strings = GetPPCPlatformStrings(); + ASSERT_STREQ(strings.platform, "PowerNV"); + ASSERT_STREQ(strings.model, "0000000000000000"); + ASSERT_STREQ(strings.machine, "PowerNV 0000000000000000"); + ASSERT_STREQ(strings.cpu, "POWER9, altivec supported"); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_x86_test.cc b/cpu_features/test/cpuinfo_x86_test.cc new file mode 100644 index 0000000..636d0f9 --- /dev/null +++ b/cpu_features/test/cpuinfo_x86_test.cc @@ -0,0 +1,533 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_x86.h" + +#include +#include +#include +#include +#if defined(CPU_FEATURES_OS_WINDOWS) +#include // IsProcessorFeaturePresent +#endif // CPU_FEATURES_OS_WINDOWS + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "internal/cpuid_x86.h" + +namespace cpu_features { + +class FakeCpu { + public: + Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) const { + const auto itr = cpuid_leaves_.find(std::make_pair(leaf_id, ecx)); + if (itr != cpuid_leaves_.end()) { + return itr->second; + } + return {0, 0, 0, 0}; + } + + uint32_t GetXCR0Eax() const { return xcr0_eax_; } + + void SetLeaves(std::map, Leaf> configuration) { + cpuid_leaves_ = std::move(configuration); + } + + void SetOsBackupsExtendedRegisters(bool os_backups_extended_registers) { + xcr0_eax_ = os_backups_extended_registers ? -1 : 0; + } + +#if defined(CPU_FEATURES_OS_DARWIN) + bool GetDarwinSysCtlByName(std::string name) const { + return darwin_sysctlbyname_.count(name); + } + + void SetDarwinSysCtlByName(std::string name) { + darwin_sysctlbyname_.insert(name); + } +#endif // CPU_FEATURES_OS_DARWIN + +#if defined(CPU_FEATURES_OS_WINDOWS) + bool GetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) { + return windows_isprocessorfeaturepresent_.count(ProcessorFeature); + } + + void SetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) { + windows_isprocessorfeaturepresent_.insert(ProcessorFeature); + } +#endif // CPU_FEATURES_OS_WINDOWS + + private: + std::map, Leaf> cpuid_leaves_; +#if defined(CPU_FEATURES_OS_DARWIN) + std::set darwin_sysctlbyname_; +#endif // CPU_FEATURES_OS_DARWIN +#if defined(CPU_FEATURES_OS_WINDOWS) + std::set windows_isprocessorfeaturepresent_; +#endif // CPU_FEATURES_OS_WINDOWS + uint32_t xcr0_eax_; +}; + +FakeCpu* g_fake_cpu = nullptr; + +extern "C" Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) { + return g_fake_cpu->GetCpuidLeaf(leaf_id, ecx); +} + +extern "C" uint32_t GetXCR0Eax(void) { return g_fake_cpu->GetXCR0Eax(); } + +#if 
defined(CPU_FEATURES_OS_DARWIN) +extern "C" bool GetDarwinSysCtlByName(const char* name) { + return g_fake_cpu->GetDarwinSysCtlByName(name); +} +#endif // CPU_FEATURES_OS_DARWIN + +#if defined(CPU_FEATURES_OS_WINDOWS) +extern "C" bool GetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) { + return g_fake_cpu->GetWindowsIsProcessorFeaturePresent(ProcessorFeature); +} +#endif // CPU_FEATURES_OS_WINDOWS + +namespace { + +class CpuidX86Test : public ::testing::Test { + protected: + void SetUp() override { g_fake_cpu = new FakeCpu(); } + void TearDown() override { delete g_fake_cpu; } +}; + +TEST_F(CpuidX86Test, SandyBridge) { + g_fake_cpu->SetOsBackupsExtendedRegisters(true); + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000D, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000206A6, 0x00100800, 0x1F9AE3BF, 0xBFEBFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + }); + const auto info = GetX86Info(); + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x02A); + EXPECT_EQ(info.stepping, 0x06); + // Leaf 7 is zeroed out so none of the Leaf 7 flags are set. + const auto features = info.features; + EXPECT_FALSE(features.erms); + EXPECT_FALSE(features.avx2); + EXPECT_FALSE(features.avx512f); + EXPECT_FALSE(features.avx512cd); + EXPECT_FALSE(features.avx512er); + EXPECT_FALSE(features.avx512pf); + EXPECT_FALSE(features.avx512bw); + EXPECT_FALSE(features.avx512dq); + EXPECT_FALSE(features.avx512vl); + EXPECT_FALSE(features.avx512ifma); + EXPECT_FALSE(features.avx512vbmi); + EXPECT_FALSE(features.avx512vbmi2); + EXPECT_FALSE(features.avx512vnni); + EXPECT_FALSE(features.avx512bitalg); + EXPECT_FALSE(features.avx512vpopcntdq); + EXPECT_FALSE(features.avx512_4vnniw); + EXPECT_FALSE(features.avx512_4fmaps); + // All old cpu features should be set. 
+ EXPECT_TRUE(features.aes); + EXPECT_TRUE(features.ssse3); + EXPECT_TRUE(features.sse4_1); + EXPECT_TRUE(features.sse4_2); + EXPECT_TRUE(features.avx); + EXPECT_FALSE(features.sha); + EXPECT_TRUE(features.popcnt); + EXPECT_FALSE(features.movbe); + EXPECT_FALSE(features.rdrnd); +} + +const int KiB = 1024; +const int MiB = 1024 * KiB; + +TEST_F(CpuidX86Test, SandyBridgeTestOsSupport) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000D, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000206A6, 0x00100800, 0x1F9AE3BF, 0xBFEBFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + }); + // avx is disabled if os does not support backing up ymm registers. + g_fake_cpu->SetOsBackupsExtendedRegisters(false); + EXPECT_FALSE(GetX86Info().features.avx); + // avx is enabled once the os does support backing up ymm registers. + g_fake_cpu->SetOsBackupsExtendedRegisters(true); + EXPECT_TRUE(GetX86Info().features.avx); +} + +TEST_F(CpuidX86Test, SkyLake) { + g_fake_cpu->SetOsBackupsExtendedRegisters(true); + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000016, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000406E3, 0x00100800, 0x7FFAFBBF, 0xBFEBFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x029C67AF, 0x00000000, 0x00000000}}, + }); + const auto info = GetX86Info(); + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x04E); + EXPECT_EQ(info.stepping, 0x03); + EXPECT_EQ(GetX86Microarchitecture(&info), X86Microarchitecture::INTEL_SKL); +} + +TEST_F(CpuidX86Test, Branding) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000016, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000406E3, 0x00100800, 0x7FFAFBBF, 0xBFEBFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x029C67AF, 0x00000000, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000121,
0x2C100000}}, + {{0x80000002, 0}, Leaf{0x65746E49, 0x2952286C, 0x726F4320, 0x4D542865}}, + {{0x80000003, 0}, Leaf{0x37692029, 0x3035362D, 0x43205530, 0x40205550}}, + {{0x80000004, 0}, Leaf{0x352E3220, 0x7A484730, 0x00000000, 0x00000000}}, + }); + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, "Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz"); +} + +TEST_F(CpuidX86Test, KabyLakeCache) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000016, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000406E3, 0x00100800, 0x7FFAFBBF, 0xBFEBFBFF}}, + {{0x00000004, 0}, Leaf{0x1C004121, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 1}, Leaf{0x1C004122, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 2}, Leaf{0x1C004143, 0x00C0003F, 0x000003FF, 0x00000000}}, + {{0x00000004, 3}, Leaf{0x1C03C163, 0x02C0003F, 0x00001FFF, 0x00000002}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x029C67AF, 0x00000000, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000121, 0x2C100000}}, + {{0x80000002, 0}, Leaf{0x65746E49, 0x2952286C, 0x726F4320, 0x4D542865}}, + {{0x80000003, 0}, Leaf{0x37692029, 0x3035362D, 0x43205530, 0x40205550}}, + }); + const auto info = GetX86CacheInfo(); + EXPECT_EQ(info.size, 4); + EXPECT_EQ(info.levels[0].level, 1); + EXPECT_EQ(info.levels[0].cache_type, 1); + EXPECT_EQ(info.levels[0].cache_size, 32 * KiB); + EXPECT_EQ(info.levels[0].ways, 8); + EXPECT_EQ(info.levels[0].line_size, 64); + EXPECT_EQ(info.levels[0].tlb_entries, 64); + EXPECT_EQ(info.levels[0].partitioning, 1); + + EXPECT_EQ(info.levels[1].level, 1); + EXPECT_EQ(info.levels[1].cache_type, 2); + EXPECT_EQ(info.levels[1].cache_size, 32 * KiB); + EXPECT_EQ(info.levels[1].ways, 8); + EXPECT_EQ(info.levels[1].line_size, 64); + EXPECT_EQ(info.levels[1].tlb_entries, 64); + EXPECT_EQ(info.levels[1].partitioning, 1); + + EXPECT_EQ(info.levels[2].level, 2); + 
EXPECT_EQ(info.levels[2].cache_type, 3); + EXPECT_EQ(info.levels[2].cache_size, 256 * KiB); + EXPECT_EQ(info.levels[2].ways, 4); + EXPECT_EQ(info.levels[2].line_size, 64); + EXPECT_EQ(info.levels[2].tlb_entries, 1024); + EXPECT_EQ(info.levels[2].partitioning, 1); + + EXPECT_EQ(info.levels[3].level, 3); + EXPECT_EQ(info.levels[3].cache_type, 3); + EXPECT_EQ(info.levels[3].cache_size, 6 * MiB); + EXPECT_EQ(info.levels[3].ways, 12); + EXPECT_EQ(info.levels[3].line_size, 64); + EXPECT_EQ(info.levels[3].tlb_entries, 8192); + EXPECT_EQ(info.levels[3].partitioning, 1); +} + +TEST_F(CpuidX86Test, HSWCache) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000016, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000406E3, 0x00100800, 0x7FFAFBBF, 0xBFEBFBFF}}, + {{0x00000004, 0}, Leaf{0x1C004121, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 1}, Leaf{0x1C004122, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 2}, Leaf{0x1C004143, 0x01C0003F, 0x000001FF, 0x00000000}}, + {{0x00000004, 3}, Leaf{0x1C03C163, 0x02C0003F, 0x00001FFF, 0x00000006}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x029C67AF, 0x00000000, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000121, 0x2C100000}}, + {{0x80000002, 0}, Leaf{0x65746E49, 0x2952286C, 0x726F4320, 0x4D542865}}, + {{0x80000003, 0}, Leaf{0x37692029, 0x3035362D, 0x43205530, 0x40205550}}, + }); + const auto info = GetX86CacheInfo(); + EXPECT_EQ(info.size, 4); + EXPECT_EQ(info.levels[0].level, 1); + EXPECT_EQ(info.levels[0].cache_type, 1); + EXPECT_EQ(info.levels[0].cache_size, 32 * KiB); + EXPECT_EQ(info.levels[0].ways, 8); + EXPECT_EQ(info.levels[0].line_size, 64); + EXPECT_EQ(info.levels[0].tlb_entries, 64); + EXPECT_EQ(info.levels[0].partitioning, 1); + + EXPECT_EQ(info.levels[1].level, 1); + EXPECT_EQ(info.levels[1].cache_type, 2); + EXPECT_EQ(info.levels[1].cache_size, 32 * KiB); + EXPECT_EQ(info.levels[1].ways, 
8); + EXPECT_EQ(info.levels[1].line_size, 64); + EXPECT_EQ(info.levels[1].tlb_entries, 64); + EXPECT_EQ(info.levels[1].partitioning, 1); + + EXPECT_EQ(info.levels[2].level, 2); + EXPECT_EQ(info.levels[2].cache_type, 3); + EXPECT_EQ(info.levels[2].cache_size, 256 * KiB); + EXPECT_EQ(info.levels[2].ways, 8); + EXPECT_EQ(info.levels[2].line_size, 64); + EXPECT_EQ(info.levels[2].tlb_entries, 512); + EXPECT_EQ(info.levels[2].partitioning, 1); + + EXPECT_EQ(info.levels[3].level, 3); + EXPECT_EQ(info.levels[3].cache_type, 3); + EXPECT_EQ(info.levels[3].cache_size, 6 * MiB); + EXPECT_EQ(info.levels[3].ways, 12); + EXPECT_EQ(info.levels[3].line_size, 64); + EXPECT_EQ(info.levels[3].tlb_entries, 8192); + EXPECT_EQ(info.levels[3].partitioning, 1); +} + +// http://users.atw.hu/instlatx64/AuthenticAMD0630F81_K15_Godavari_CPUID.txt +TEST_F(CpuidX86Test, AMD_K15) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000D, 0x68747541, 0x444D4163, 0x69746E65}}, + {{0x00000001, 0}, Leaf{0x00630F81, 0x00040800, 0x3E98320B, 0x178BFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x8000001E, 0x68747541, 0x444D4163, 0x69746E65}}, + {{0x80000001, 0}, Leaf{0x00630F81, 0x10000000, 0x0FEBBFFF, 0x2FD3FBFF}}, + {{0x80000002, 0}, Leaf{0x20444D41, 0x372D3841, 0x4B303736, 0x64615220}}, + {{0x80000003, 0}, Leaf{0x206E6F65, 0x202C3752, 0x43203031, 0x75706D6F}}, + {{0x80000004, 0}, Leaf{0x43206574, 0x7365726F, 0x2B433420, 0x00204736}}, + {{0x80000005, 0}, Leaf{0xFF40FF18, 0xFF40FF30, 0x10040140, 0x60030140}}, + }); + const auto info = GetX86Info(); + + EXPECT_STREQ(info.vendor, "AuthenticAMD"); + EXPECT_EQ(info.family, 0x15); + EXPECT_EQ(info.model, 0x38); + EXPECT_EQ(info.stepping, 0x01); + EXPECT_EQ(GetX86Microarchitecture(&info), + X86Microarchitecture::AMD_BULLDOZER); + + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, "AMD A8-7670K Radeon R7, 10 Compute Cores 4C+6G "); +} + +// 
https://github.com/InstLatx64/InstLatx64/blob/master/GenuineIntel/GenuineIntel00106A1_Nehalem_CPUID.txt +TEST_F(CpuidX86Test, Nehalem) { + // Pre AVX cpus don't have xsave + g_fake_cpu->SetOsBackupsExtendedRegisters(false); +#if defined(CPU_FEATURES_OS_WINDOWS) + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI_INSTRUCTIONS_AVAILABLE); + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI64_INSTRUCTIONS_AVAILABLE); + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_SSE3_INSTRUCTIONS_AVAILABLE); +#endif // CPU_FEATURES_OS_WINDOWS +#if defined(CPU_FEATURES_OS_DARWIN) + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse2"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse3"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.supplementalsse3"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse4_1"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse4_2"); +#endif // CPU_FEATURES_OS_DARWIN +#if defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(processor : +flags : fpu mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 +)"); +#endif // CPU_FEATURES_OS_LINUX_OR_ANDROID + g_fake_cpu->SetLeaves({ // Entries are keyed by {leaf, subleaf}; leaves 0x4 and 0xB get one entry per subleaf, numbered as in the Atom test. + {{0x00000000, 0}, Leaf{0x0000000B, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000106A2, 0x00100800, 0x00BCE3BD, 0xBFEBFBFF}}, + {{0x00000002, 0}, Leaf{0x55035A01, 0x00F0B0E3, 0x00000000, 0x09CA212C}}, + {{0x00000003, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000004, 0}, Leaf{0x1C004121, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 1}, Leaf{0x1C004122, 0x00C0003F, 0x0000007F, 0x00000000}}, + {{0x00000004, 2}, Leaf{0x1C004143, 0x01C0003F, 0x000001FF, 0x00000000}}, + {{0x00000004, 3}, Leaf{0x1C03C163, 0x03C0003F, 0x00000FFF, 0x00000002}}, + {{0x00000005, 0}, Leaf{0x00000040, 0x00000040, 0x00000003, 0x00021120}}, + {{0x00000006, 0}, Leaf{0x00000001, 0x00000002, 0x00000001, 0x00000000}}, + 
{{0x00000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000008, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000009, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x0000000A, 0}, Leaf{0x07300403, 0x00000000, 0x00000000, 0x00000603}}, + {{0x0000000B, 0}, Leaf{0x00000001, 0x00000001, 0x00000100, 0x00000000}}, + {{0x0000000B, 1}, Leaf{0x00000004, 0x00000002, 0x00000201, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000001, 0x28100000}}, + {{0x80000002, 0}, Leaf{0x756E6547, 0x20656E69, 0x65746E49, 0x2952286C}}, + {{0x80000003, 0}, Leaf{0x55504320, 0x20202020, 0x20202020, 0x40202020}}, + {{0x80000004, 0}, Leaf{0x30303020, 0x20402030, 0x37382E31, 0x007A4847}}, + {{0x80000005, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000006, 0}, Leaf{0x00000000, 0x00000000, 0x01006040, 0x00000000}}, + {{0x80000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000100}}, + {{0x80000008, 0}, Leaf{0x00003028, 0x00000000, 0x00000000, 0x00000000}}, + }); + const auto info = GetX86Info(); + + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x1A); + EXPECT_EQ(info.stepping, 0x02); + EXPECT_EQ(GetX86Microarchitecture(&info), X86Microarchitecture::INTEL_NHM); + + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, "Genuine Intel(R) CPU @ 0000 @ 1.87GHz"); + + EXPECT_TRUE(info.features.sse); + EXPECT_TRUE(info.features.sse2); + EXPECT_TRUE(info.features.sse3); +#ifndef CPU_FEATURES_OS_WINDOWS + // Currently disabled on Windows as IsProcessorFeaturePresent does not support + // feature detection > sse3. 
+ EXPECT_TRUE(info.features.ssse3); + EXPECT_TRUE(info.features.sse4_1); + EXPECT_TRUE(info.features.sse4_2); +#endif // CPU_FEATURES_OS_WINDOWS +} + +// https://github.com/InstLatx64/InstLatx64/blob/master/GenuineIntel/GenuineIntel0030673_Silvermont3_CPUID.txt +TEST_F(CpuidX86Test, Atom) { + // Pre AVX cpus don't have xsave + g_fake_cpu->SetOsBackupsExtendedRegisters(false); +#if defined(CPU_FEATURES_OS_WINDOWS) + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI_INSTRUCTIONS_AVAILABLE); + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI64_INSTRUCTIONS_AVAILABLE); + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_SSE3_INSTRUCTIONS_AVAILABLE); +#endif // CPU_FEATURES_OS_WINDOWS +#if defined(CPU_FEATURES_OS_DARWIN) + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse2"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse3"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.supplementalsse3"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse4_1"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse4_2"); +#endif // CPU_FEATURES_OS_DARWIN +#if defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"( +flags : fpu mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 +)"); +#endif // CPU_FEATURES_OS_LINUX_OR_ANDROID + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000B, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x00030673, 0x00100800, 0x41D8E3BF, 0xBFEBFBFF}}, + {{0x00000002, 0}, Leaf{0x61B3A001, 0x0000FFC2, 0x00000000, 0x00000000}}, + {{0x00000003, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000004, 0}, Leaf{0x1C000121, 0x0140003F, 0x0000003F, 0x00000001}}, + {{0x00000004, 1}, Leaf{0x1C000122, 0x01C0003F, 0x0000003F, 0x00000001}}, + {{0x00000004, 2}, Leaf{0x1C00C143, 0x03C0003F, 0x000003FF, 0x00000001}}, + {{0x00000005, 0}, Leaf{0x00000040, 0x00000040, 0x00000003, 0x33000020}}, + 
{{0x00000006, 0}, Leaf{0x00000005, 0x00000002, 0x00000009, 0x00000000}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x00002282, 0x00000000, 0x00000000}}, + {{0x00000008, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000009, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x0000000A, 0}, Leaf{0x07280203, 0x00000000, 0x00000000, 0x00004503}}, + {{0x0000000B, 0}, Leaf{0x00000001, 0x00000001, 0x00000100, 0x00000000}}, + {{0x0000000B, 1}, Leaf{0x00000004, 0x00000004, 0x00000201, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000101, 0x28100000}}, + {{0x80000002, 0}, Leaf{0x20202020, 0x6E492020, 0x286C6574, 0x43202952}}, + {{0x80000003, 0}, Leaf{0x72656C65, 0x52286E6F, 0x50432029, 0x4A202055}}, + {{0x80000004, 0}, Leaf{0x30303931, 0x20402020, 0x39392E31, 0x007A4847}}, + {{0x80000005, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000006, 0}, Leaf{0x00000000, 0x00000000, 0x04008040, 0x00000000}}, + {{0x80000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000100}}, + {{0x80000008, 0}, Leaf{0x00003024, 0x00000000, 0x00000000, 0x00000000}}, + }); + const auto info = GetX86Info(); + + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x37); + EXPECT_EQ(info.stepping, 0x03); + EXPECT_EQ(GetX86Microarchitecture(&info), + X86Microarchitecture::INTEL_ATOM_SMT); + + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, " Intel(R) Celeron(R) CPU J1900 @ 1.99GHz"); + + EXPECT_TRUE(info.features.sse); + EXPECT_TRUE(info.features.sse2); + EXPECT_TRUE(info.features.sse3); +#ifndef CPU_FEATURES_OS_WINDOWS + // Currently disabled on Windows as IsProcessorFeaturePresent do not support + // feature detection > sse3. 
+ EXPECT_TRUE(info.features.ssse3); + EXPECT_TRUE(info.features.sse4_1); + EXPECT_TRUE(info.features.sse4_2); +#endif // CPU_FEATURES_OS_WINDOWS +} + +// https://github.com/InstLatx64/InstLatx64/blob/master/GenuineIntel/GenuineIntel0000673_P3_KatmaiDP_CPUID.txt +TEST_F(CpuidX86Test, P3) { + // Pre AVX cpus don't have xsave + g_fake_cpu->SetOsBackupsExtendedRegisters(false); +#if defined(CPU_FEATURES_OS_WINDOWS) + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI_INSTRUCTIONS_AVAILABLE); +#endif // CPU_FEATURES_OS_WINDOWS +#if defined(CPU_FEATURES_OS_DARWIN) + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse"); +#endif // CPU_FEATURES_OS_DARWIN +#if defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"( +flags : fpu mmx sse +)"); +#endif // CPU_FEATURES_OS_LINUX_OR_ANDROID + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000003, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x00000673, 0x00000000, 0x00000000, 0x0387FBFF}}, + {{0x00000002, 0}, Leaf{0x03020101, 0x00000000, 0x00000000, 0x0C040843}}, + {{0x00000003, 0}, Leaf{0x00000000, 0x00000000, 0x4CECC782, 0x00006778}}, + }); + const auto info = GetX86Info(); + + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x07); + EXPECT_EQ(info.stepping, 0x03); + EXPECT_EQ(GetX86Microarchitecture(&info), X86Microarchitecture::X86_UNKNOWN); + + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, ""); + + EXPECT_TRUE(info.features.mmx); + EXPECT_TRUE(info.features.sse); + EXPECT_FALSE(info.features.sse2); + EXPECT_FALSE(info.features.sse3); +#ifndef CPU_FEATURES_OS_WINDOWS + // Currently disabled on Windows as IsProcessorFeaturePresent do not support + // feature detection > sse3. 
+ EXPECT_FALSE(info.features.ssse3); + EXPECT_FALSE(info.features.sse4_1); + EXPECT_FALSE(info.features.sse4_2); +#endif // CPU_FEATURES_OS_WINDOWS +} + +// TODO(user): test what happens when xsave/osxsave are not present. +// TODO(user): test what happens when xmm/ymm/zmm os support are not +// present. + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/filesystem_for_testing.cc b/cpu_features/test/filesystem_for_testing.cc new file mode 100644 index 0000000..648a53e --- /dev/null +++ b/cpu_features/test/filesystem_for_testing.cc @@ -0,0 +1,103 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "filesystem_for_testing.h" + +#include <cassert> +#include <climits> +#include <cstring> +#include <memory> +#include <string> + +namespace cpu_features { + +FakeFile::FakeFile(int file_descriptor, const char* content) + : file_descriptor_(file_descriptor), content_(content) {} + +FakeFile::~FakeFile() { assert(!opened_); } + +void FakeFile::Open() { + assert(!opened_); + opened_ = true; +} + +void FakeFile::Close() { + assert(opened_); + opened_ = false; +} + +int FakeFile::Read(int fd, void* buf, size_t count) { // Copies at most count unread bytes into buf, advances the read position and returns the number copied. + assert(count < INT_MAX); + assert(fd == file_descriptor_); + const size_t remainder = content_.size() - head_index_; + const size_t read = count > remainder ? 
remainder : count; + memcpy(buf, content_.data() + head_index_, read); + head_index_ += read; + assert(read < INT_MAX); + return (int)read; +} + +void FakeFilesystem::Reset() { files_.clear(); } + +FakeFile* FakeFilesystem::CreateFile(const std::string& filename, + const char* content) { + auto& file = files_[filename]; + file = + std::unique_ptr<FakeFile>(new FakeFile(next_file_descriptor_++, content)); // replaces any file already stored under filename + return file.get(); +} + +FakeFile* FakeFilesystem::FindFileOrNull(const std::string& filename) const { + const auto itr = files_.find(filename); + return itr == files_.end() ? nullptr : itr->second.get(); +} + +FakeFile* FakeFilesystem::FindFileOrDie(const int file_descriptor) const { + for (const auto& filename_file_pair : files_) { + FakeFile* const file_ptr = filename_file_pair.second.get(); + if (file_ptr->GetFileDescriptor() == file_descriptor) { + return file_ptr; + } + } + assert(false); // no file owns this descriptor + return nullptr; +} + +static FakeFilesystem* kFilesystem = new FakeFilesystem(); + +FakeFilesystem& GetEmptyFilesystem() { + kFilesystem->Reset(); + return *kFilesystem; +} + +extern "C" int CpuFeatures_OpenFile(const char* filename) { + auto* const file = kFilesystem->FindFileOrNull(filename); + if (file) { + file->Open(); + return file->GetFileDescriptor(); + } + return -1; // no fake file was created under this name +} + +extern "C" void CpuFeatures_CloseFile(int file_descriptor) { + kFilesystem->FindFileOrDie(file_descriptor)->Close(); +} + +extern "C" int CpuFeatures_ReadFile(int file_descriptor, void* buffer, + size_t buffer_size) { + return kFilesystem->FindFileOrDie(file_descriptor) + ->Read(file_descriptor, buffer, buffer_size); +} + +} // namespace cpu_features diff --git a/cpu_features/test/filesystem_for_testing.h b/cpu_features/test/filesystem_for_testing.h new file mode 100644 index 0000000..ef717fd --- /dev/null +++ b/cpu_features/test/filesystem_for_testing.h @@ -0,0 +1,61 @@ +// 
Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements a fake filesystem, useful for tests. +#ifndef CPU_FEATURES_TEST_FILESYSTEM_FOR_TESTING_H_ +#define CPU_FEATURES_TEST_FILESYSTEM_FOR_TESTING_H_ + +#include <memory> +#include <string> +#include <unordered_map> + +#include "internal/filesystem.h" + +namespace cpu_features { + +class FakeFile { // In-memory file: fixed content, an opened flag and a read position. + public: + explicit FakeFile(int file_descriptor, const char* content); + ~FakeFile(); + + void Open(); + void Close(); + int Read(int fd, void* buf, size_t count); // returns the number of bytes copied into buf + + int GetFileDescriptor() const { return file_descriptor_; } + + private: + const int file_descriptor_; + const std::string content_; + bool opened_ = false; + size_t head_index_ = 0; +}; + +class FakeFilesystem { // Owns FakeFile objects, looked up by name or by descriptor. + public: + void Reset(); + FakeFile* CreateFile(const std::string& filename, const char* content); + FakeFile* FindFileOrDie(const int file_descriptor) const; + FakeFile* FindFileOrNull(const std::string& filename) const; + + private: + int next_file_descriptor_ = 0; + std::unordered_map<std::string, std::unique_ptr<FakeFile>> files_; +}; + +FakeFilesystem& GetEmptyFilesystem(); + +} // namespace cpu_features + +#endif // CPU_FEATURES_TEST_FILESYSTEM_FOR_TESTING_H_ diff --git a/cpu_features/test/hwcaps_for_testing.cc b/cpu_features/test/hwcaps_for_testing.cc new file mode 100644 index 0000000..a8086a0 --- /dev/null +++ b/cpu_features/test/hwcaps_for_testing.cc @@ -0,0 +1,46 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwcaps_for_testing.h" + +#include <stdint.h> + +#include "internal/string_view.h" + +namespace cpu_features { + +namespace { +static auto* const g_hardware_capabilities = new HardwareCapabilities(); +static auto* const g_platform_types = new PlatformType(); +} // namespace + +void SetHardwareCapabilities(uint32_t hwcaps, uint32_t hwcaps2) { // Stores the values later returned by CpuFeatures_GetHardwareCapabilities(). + g_hardware_capabilities->hwcaps = hwcaps; + g_hardware_capabilities->hwcaps2 = hwcaps2; +} + +HardwareCapabilities CpuFeatures_GetHardwareCapabilities(void) { + return *g_hardware_capabilities; +} + +void SetPlatformTypes(const char* platform, const char* base_platform) { // Copies both names into the fixed-size buffers returned by CpuFeatures_GetPlatformType(). + CpuFeatures_StringView_CopyString(str(platform), g_platform_types->platform, + sizeof(g_platform_types->platform)); + CpuFeatures_StringView_CopyString(str(base_platform), + g_platform_types->base_platform, + sizeof(g_platform_types->base_platform)); +} + +PlatformType CpuFeatures_GetPlatformType(void) { return *g_platform_types; } +} // namespace cpu_features diff --git a/cpu_features/test/hwcaps_for_testing.h b/cpu_features/test/hwcaps_for_testing.h new file mode 100644 index 0000000..bcab82e --- /dev/null +++ b/cpu_features/test/hwcaps_for_testing.h @@ -0,0 +1,27 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_TEST_HWCAPS_FOR_TESTING_H_ +#define CPU_FEATURES_TEST_HWCAPS_FOR_TESTING_H_ + +#include "internal/hwcaps.h" + +namespace cpu_features { + +void SetHardwareCapabilities(uint32_t hwcaps, uint32_t hwcaps2); +void SetPlatformTypes(const char *platform, const char *base_platform); + +} // namespace cpu_features + +#endif // CPU_FEATURES_TEST_HWCAPS_FOR_TESTING_H_ diff --git a/cpu_features/test/stack_line_reader_test.cc b/cpu_features/test/stack_line_reader_test.cc new file mode 100644 index 0000000..9ac5388 --- /dev/null +++ b/cpu_features/test/stack_line_reader_test.cc @@ -0,0 +1,132 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "internal/stack_line_reader.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" + +namespace cpu_features { + +bool operator==(const StringView& a, const StringView& b) { + return CpuFeatures_StringView_IsEquals(a, b); +} + +namespace { + +std::string ToString(StringView view) { return {view.ptr, view.size}; } + +TEST(StackLineReaderTest, Empty) { + auto& fs = GetEmptyFilesystem(); + auto* file = fs.CreateFile("/proc/cpuinfo", ""); + StackLineReader reader; + StackLineReader_Initialize(&reader, file->GetFileDescriptor()); + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_TRUE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("")); + } +} + +TEST(StackLineReaderTest, ManySmallLines) { + auto& fs = GetEmptyFilesystem(); + auto* file = fs.CreateFile("/proc/cpuinfo", "a\nb\nc"); + + StackLineReader reader; + StackLineReader_Initialize(&reader, file->GetFileDescriptor()); + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("a")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("b")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_TRUE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("c")); + } +} + +TEST(StackLineReaderTest, TruncatedLine) { + auto& fs = GetEmptyFilesystem(); + auto* file = fs.CreateFile("/proc/cpuinfo", R"(First +Second +More than 16 characters, this will be truncated. 
+last)"); + + StackLineReader reader; + StackLineReader_Initialize(&reader, file->GetFileDescriptor()); + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("First")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("Second")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_FALSE(result.full_line); + EXPECT_EQ(result.line, str("More than 16 cha")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_TRUE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("last")); + } +} + +TEST(StackLineReaderTest, TruncatedLines) { + auto& fs = GetEmptyFilesystem(); + auto* file = fs.CreateFile("/proc/cpuinfo", R"(More than 16 characters +Another line that is too long)"); + + StackLineReader reader; + StackLineReader_Initialize(&reader, file->GetFileDescriptor()); + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_FALSE(result.full_line); + EXPECT_EQ(result.line, str("More than 16 cha")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_FALSE(result.full_line); + EXPECT_EQ(result.line, str("Another line tha")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_TRUE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("")); + } +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/string_view_test.cc b/cpu_features/test/string_view_test.cc new file mode 100644 index 0000000..ca3e023 --- /dev/null +++ b/cpu_features/test/string_view_test.cc @@ -0,0 +1,192 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance 
with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal/string_view.h" + +#include "gtest/gtest.h" + +namespace cpu_features { + +bool operator==(const StringView& a, const StringView& b) { + return CpuFeatures_StringView_IsEquals(a, b); +} + +namespace { + +TEST(StringViewTest, Empty) { + EXPECT_EQ(kEmptyStringView.ptr, nullptr); + EXPECT_EQ(kEmptyStringView.size, 0); +} + +TEST(StringViewTest, Build) { + const auto view = str("test"); + EXPECT_EQ(view.ptr[0], 't'); + EXPECT_EQ(view.size, 4); +} + +TEST(StringViewTest, CpuFeatures_StringView_IndexOfChar) { + // Found. + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(str("test"), 'e'), 1); + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(str("test"), 't'), 0); + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(str("beef"), 'e'), 1); + // Not found. + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(str("test"), 'z'), -1); + // Empty. + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(kEmptyStringView, 'z'), -1); +} + +TEST(StringViewTest, CpuFeatures_StringView_IndexOf) { + // Found. + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("test"), str("es")), 1); + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("test"), str("test")), 0); + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("tesstest"), str("test")), 4); + // Not found. + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("test"), str("aa")), -1); + // Empty. 
+ EXPECT_EQ(CpuFeatures_StringView_IndexOf(kEmptyStringView, str("aa")), -1); + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("aa"), kEmptyStringView), -1); +} + +TEST(StringViewTest, CpuFeatures_StringView_StartsWith) { + EXPECT_TRUE(CpuFeatures_StringView_StartsWith(str("test"), str("te"))); + EXPECT_TRUE(CpuFeatures_StringView_StartsWith(str("test"), str("test"))); + EXPECT_FALSE(CpuFeatures_StringView_StartsWith(str("test"), str("st"))); + EXPECT_FALSE(CpuFeatures_StringView_StartsWith(str("test"), str("est"))); + EXPECT_FALSE(CpuFeatures_StringView_StartsWith(str("test"), str(""))); + EXPECT_FALSE( + CpuFeatures_StringView_StartsWith(str("test"), kEmptyStringView)); + EXPECT_FALSE( + CpuFeatures_StringView_StartsWith(kEmptyStringView, str("test"))); +} + +TEST(StringViewTest, CpuFeatures_StringView_IsEquals) { + EXPECT_TRUE( + CpuFeatures_StringView_IsEquals(kEmptyStringView, kEmptyStringView)); + EXPECT_TRUE(CpuFeatures_StringView_IsEquals(kEmptyStringView, str(""))); + EXPECT_TRUE(CpuFeatures_StringView_IsEquals(str(""), kEmptyStringView)); + EXPECT_TRUE(CpuFeatures_StringView_IsEquals(str("test"), str("test"))); + EXPECT_TRUE(CpuFeatures_StringView_IsEquals(str("a"), str("a"))); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(str("a"), str("b"))); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(str("aa"), str("a"))); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(str("a"), str("aa"))); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(str("a"), kEmptyStringView)); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(kEmptyStringView, str("a"))); +} + +TEST(StringViewTest, CpuFeatures_StringView_PopFront) { + EXPECT_EQ(CpuFeatures_StringView_PopFront(str("test"), 2), str("st")); + EXPECT_EQ(CpuFeatures_StringView_PopFront(str("test"), 0), str("test")); + EXPECT_EQ(CpuFeatures_StringView_PopFront(str("test"), 4), str("")); + EXPECT_EQ(CpuFeatures_StringView_PopFront(str("test"), 100), str("")); +} + +TEST(StringViewTest, CpuFeatures_StringView_PopBack) { + 
EXPECT_EQ(CpuFeatures_StringView_PopBack(str("test"), 2), str("te")); + EXPECT_EQ(CpuFeatures_StringView_PopBack(str("test"), 0), str("test")); + EXPECT_EQ(CpuFeatures_StringView_PopBack(str("test"), 4), str("")); + EXPECT_EQ(CpuFeatures_StringView_PopBack(str("test"), 100), str("")); +} + +TEST(StringViewTest, CpuFeatures_StringView_KeepFront) { + EXPECT_EQ(CpuFeatures_StringView_KeepFront(str("test"), 2), str("te")); + EXPECT_EQ(CpuFeatures_StringView_KeepFront(str("test"), 0), str("")); + EXPECT_EQ(CpuFeatures_StringView_KeepFront(str("test"), 4), str("test")); + EXPECT_EQ(CpuFeatures_StringView_KeepFront(str("test"), 6), str("test")); +} + +TEST(StringViewTest, CpuFeatures_StringView_Front) { + EXPECT_EQ(CpuFeatures_StringView_Front(str("apple")), 'a'); + EXPECT_EQ(CpuFeatures_StringView_Front(str("a")), 'a'); +} + +TEST(StringViewTest, CpuFeatures_StringView_Back) { + EXPECT_EQ(CpuFeatures_StringView_Back(str("apple")), 'e'); + EXPECT_EQ(CpuFeatures_StringView_Back(str("a")), 'a'); +} + +TEST(StringViewTest, CpuFeatures_StringView_TrimWhitespace) { + EXPECT_EQ(CpuFeatures_StringView_TrimWhitespace(str(" first middle last ")), + str("first middle last")); + EXPECT_EQ(CpuFeatures_StringView_TrimWhitespace(str("first middle last ")), + str("first middle last")); + EXPECT_EQ(CpuFeatures_StringView_TrimWhitespace(str(" first middle last")), + str("first middle last")); + EXPECT_EQ(CpuFeatures_StringView_TrimWhitespace(str("first middle last")), + str("first middle last")); +} + +TEST(StringViewTest, CpuFeatures_StringView_ParsePositiveNumber) { + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("42")), 42); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("0x2a")), 42); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("0x2A")), 42); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("0x2A2a")), 10794); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("0x2a2A")), 10794); + + 
EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("-10")), -1); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("-0x2A")), -1); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("abc")), -1); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("")), -1); +} + +TEST(StringViewTest, CpuFeatures_StringView_CopyString) { + char buf[4]; + buf[0] = 'X'; + + // Empty + CpuFeatures_StringView_CopyString(str(""), buf, sizeof(buf)); + EXPECT_STREQ(buf, ""); + + // Less + CpuFeatures_StringView_CopyString(str("a"), buf, sizeof(buf)); + EXPECT_STREQ(buf, "a"); + + // exact + CpuFeatures_StringView_CopyString(str("abc"), buf, sizeof(buf)); + EXPECT_STREQ(buf, "abc"); + + // More + CpuFeatures_StringView_CopyString(str("abcd"), buf, sizeof(buf)); + EXPECT_STREQ(buf, "abc"); +} + +TEST(StringViewTest, CpuFeatures_StringView_HasWord) { + // Find flags at beginning, middle and end. + EXPECT_TRUE( + CpuFeatures_StringView_HasWord(str("first middle last"), "first")); + EXPECT_TRUE( + CpuFeatures_StringView_HasWord(str("first middle last"), "middle")); + EXPECT_TRUE(CpuFeatures_StringView_HasWord(str("first middle last"), "last")); + // Do not match partial flags + EXPECT_FALSE( + CpuFeatures_StringView_HasWord(str("first middle last"), "irst")); + EXPECT_FALSE(CpuFeatures_StringView_HasWord(str("first middle last"), "mid")); + EXPECT_FALSE(CpuFeatures_StringView_HasWord(str("first middle last"), "las")); +} + +TEST(StringViewTest, CpuFeatures_StringView_GetAttributeKeyValue) { + const StringView line = str(" key : first middle last "); + StringView key, value; + EXPECT_TRUE(CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)); + EXPECT_EQ(key, str("key")); + EXPECT_EQ(value, str("first middle last")); +} + +TEST(StringViewTest, FailingGetAttributeKeyValue) { + const StringView line = str("key first middle last"); + StringView key, value; + EXPECT_FALSE(CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)); +} + +} // namespace 
+} // namespace cpu_features diff --git a/debian-jessie/rules b/debian-jessie/rules index ee18e9e..f03be8a 100755 --- a/debian-jessie/rules +++ b/debian-jessie/rules @@ -14,11 +14,6 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/default.mk -ifeq ($(DEB_HOST_ARCH),armhf) - # Assume a Pi-like target, where using an 8-bit table is a fairly big win over the float path - CPPFLAGS += -DSC16Q11_TABLE_BITS=8 -endif - override_dh_auto_build: dh_auto_build -- RTLSDR=yes BLADERF=yes HACKRF=no LIMESDR=no DUMP1090_VERSION=$(DEB_VERSION) diff --git a/debian-stretch/rules b/debian-stretch/rules index ee18e9e..f03be8a 100755 --- a/debian-stretch/rules +++ b/debian-stretch/rules @@ -14,11 +14,6 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/default.mk -ifeq ($(DEB_HOST_ARCH),armhf) - # Assume a Pi-like target, where using an 8-bit table is a fairly big win over the float path - CPPFLAGS += -DSC16Q11_TABLE_BITS=8 -endif - override_dh_auto_build: dh_auto_build -- RTLSDR=yes BLADERF=yes HACKRF=no LIMESDR=no DUMP1090_VERSION=$(DEB_VERSION) diff --git a/debian/dump1090-fa.default b/debian/dump1090-fa.default index 3e4bb73..12c3e31 100644 --- a/debian/dump1090-fa.default +++ b/debian/dump1090-fa.default @@ -13,3 +13,9 @@ RECEIVER_OPTIONS="--device-index 0 --gain -10 --ppm 0" DECODER_OPTIONS="--max-range 360 --fix" NET_OPTIONS="--net --net-heartbeat 60 --net-ro-size 1300 --net-ro-interval 0.2 --net-ri-port 0 --net-ro-port 30002 --net-sbs-port 30003 --net-bi-port 30004,30104 --net-bo-port 30005" JSON_OPTIONS="--json-location-accuracy 1" + +# Use a machine-specific wisdom file if it exists +if [ -f /etc/dump1090-fa/wisdom.local ] +then + RECEIVER_OPTIONS="${RECEIVER_OPTIONS} --wisdom /etc/dump1090-fa/wisdom.local" +fi diff --git a/debian/dump1090-fa.install b/debian/dump1090-fa.install index 67292d1..220826c 100644 --- a/debian/dump1090-fa.install +++ b/debian/dump1090-fa.install @@ 
-2,3 +2,5 @@ public_html/* usr/share/dump1090-fa/html debian/lighttpd/* etc/lighttpd/conf-available bladerf/* usr/share/dump1090-fa/bladerf debian/start-dump1090-fa usr/share/dump1090-fa/ +debian/generate-wisdom usr/share/dump1090-fa/ +starch-benchmark /usr/lib/dump1090-fa/ diff --git a/debian/generate-wisdom b/debian/generate-wisdom new file mode 100755 index 0000000..dc98d72 --- /dev/null +++ b/debian/generate-wisdom @@ -0,0 +1,20 @@ +#!/bin/sh -e + +# This script generates a machine-specific wisdom file for dump1090-fa +# (containing information about which DSP implementations are fastest) + +WORKDIR=$(mktemp -t -d wisdom.XXXXXX) + +echo "Benchmarking .. this will take a while." >&2 + +# generate initial wisdom so that twopass implementations have something to work with +echo "First pass: generating $WORKDIR/wisdom.initial" >&2 +/usr/lib/dump1090-fa/starch-benchmark -i 5 -o "$WORKDIR/wisdom.initial" magnitude_uc8 magnitude_uc8_aligned mean_power_u16 mean_power_u16_aligned + +# generate the real wisdom (paths are quoted so a TMPDIR containing spaces still works) +echo "Second pass: generating $WORKDIR/wisdom.local" >&2 +/usr/lib/dump1090-fa/starch-benchmark -i 5 -r "$WORKDIR/wisdom.initial" -o "$WORKDIR/wisdom.local" + +echo "Wisdom written to $WORKDIR/wisdom.local" >&2 +echo "Copy this file to /etc/dump1090-fa/wisdom.local" >&2 +echo "(and restart dump1090-fa) to start using it." 
>&2 diff --git a/debian/rules b/debian/rules index 93f94f8..5aa76c1 100755 --- a/debian/rules +++ b/debian/rules @@ -14,11 +14,6 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/default.mk -ifeq ($(DEB_HOST_ARCH),armhf) - # Assume a Pi-like target, where using an 8-bit table is a fairly big win over the float path - CPPFLAGS += -DSC16Q11_TABLE_BITS=8 -endif - ifeq (,$(filter custom,$(DEB_BUILD_PROFILES))) # Standard build RTLSDR = yes diff --git a/dsp-types.h b/dsp-types.h new file mode 100644 index 0000000..6eaaa8d --- /dev/null +++ b/dsp-types.h @@ -0,0 +1,21 @@ +#ifndef DUMP1090_DSP_TYPES_H +#define DUMP1090_DSP_TYPES_H + +#include + +typedef struct { + uint8_t I; + uint8_t Q; +} __attribute__((packed)) uc8_t; + +typedef union { + uc8_t uc8; + uint16_t u16; +} uc8_u16_t; + +typedef struct { + int16_t I; + int16_t Q; +} __attribute__((packed)) sc16_t; + +#endif diff --git a/dsp/benchmark/magnitude_power_uc8_benchmark.c b/dsp/benchmark/magnitude_power_uc8_benchmark.c new file mode 100644 index 0000000..1c1c105 --- /dev/null +++ b/dsp/benchmark/magnitude_power_uc8_benchmark.c @@ -0,0 +1,102 @@ +#include +#include + +void STARCH_BENCHMARK(magnitude_power_uc8) (void) +{ + uc8_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 65536; + double out_level, out_power; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, *out_mag))) { + goto done; + } + + unsigned i = 0; + + // 0.9 magnitude, varying phase + double degrees = 0; + for (; i < len && degrees < 360; i += 1, degrees += 1) { + in[i].I = (uint8_t) (0.9 * cos(degrees * M_PI / 180.0) * 128 + 127.4); + in[i].Q = (uint8_t) (0.9 * sin(degrees * M_PI / 180.0) * 128 + 127.4); + } + + // 0, 45, 90 degree phase, full input range + unsigned sequence = 0; + for (; (i+3) <= len && sequence < 256; i += 3, sequence += 1) { + in[i + 0].I = sequence; + in[i + 0].Q = 0; + + in[i + 1].I = sequence; + in[i + 1].Q = sequence; + 
+ in[i + 2].I = 0; + in[i + 2].Q = sequence; + } + + // Fill the rest with random values + srand(1); + for (; i < len; ++i) { + in[i].I = rand() % 256; + in[i].Q = rand() % 256; + } + + STARCH_BENCHMARK_RUN( magnitude_power_uc8, in, out_mag, len, &out_level, &out_power ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +bool STARCH_BENCHMARK_VERIFY(magnitude_power_uc8) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power) +{ + const double max_error = 0.015; // tolerate 1.5% error + const double epsilon = 1.0; + bool okay = true; + + double sum_level = 0, sum_power = 0; + + for (unsigned i = 0; i < len; ++i) { + double I = (in[i].I - 127.4) / 128; + double Q = (in[i].Q - 127.4) / 128; + double magsq = I * I + Q * Q; + double expected = round(sqrt(magsq) * 65536.0); + if (expected > 65535.0) + expected = 65535.0; + double actual = out[i]; + + double error = fabs(expected - actual); + double error_fraction = error / (expected > epsilon ? 
expected : epsilon); + if (error > epsilon && error_fraction > max_error) { + fprintf(stderr, "verification failed: in[%u].I=%u in[%u].Q=%u out[%u]=%u, expected=%.0f, error=%.2f%%\n", + i, in[i].I, + i, in[i].Q, + i, out[i], + expected, + error_fraction * 100.0); + okay = false; + } + + sum_level += expected; + sum_power += expected * expected; + } + + sum_level = sum_level / len / 65536.0; + sum_power = sum_power / len / (65536.0 * 65536.0); + + double level_error = sum_level - *out_level; + if (fabs(level_error / sum_level) > max_error) { + fprintf(stderr, "verification failed: expected mean level %.5f, got mean level %.5f, error=%.2f%%\n", + sum_level, *out_level, 100.0 * level_error / sum_level); + okay = false; + } + + double power_error = sum_power - *out_power; + if (fabs(power_error / sum_power) > max_error) { + fprintf(stderr, "verification failed: expected mean power %.5f, got mean power %.5f, error=%.2f%%\n", + sum_power, *out_power, 100.0 * power_error / sum_power); + okay = false; + } + + return okay; +} diff --git a/dsp/benchmark/magnitude_sc16_benchmark.c b/dsp/benchmark/magnitude_sc16_benchmark.c new file mode 100644 index 0000000..8c1edee --- /dev/null +++ b/dsp/benchmark/magnitude_sc16_benchmark.c @@ -0,0 +1,79 @@ +#include +#include +#include + +void STARCH_BENCHMARK(magnitude_sc16) (void) +{ + sc16_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 262144; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, *out_mag))) { + goto done; + } + + unsigned i = 0; + + // 0.9 magnitude, varying phase + double degrees = 0; + for (; i < len && degrees < 360; i += 1, degrees += 1) { + in[i].I = (int16_t) (0.9 * cos(degrees * M_PI / 180.0) * 32768.0); + in[i].Q = (int16_t) (0.9 * sin(degrees * M_PI / 180.0) * 32768.0); + } + + // 0, 45, 90 degree phase, full input range + unsigned sequence = 0; + for (; (i+3) <= len && sequence < 65536; i += 3, sequence += 1) { + in[i + 0].I = (int16_t) (sequence - 
32768); + in[i + 0].Q = 0; + + in[i + 1].I = (int16_t) (sequence - 32768); + in[i + 1].Q = (int16_t) (sequence - 32768); + + in[i + 2].I = 0; + in[i + 2].Q = (int16_t) (sequence - 32768); + } + + // Fill the rest with random values + srand(1); + for (; i < len; ++i) { + in[i].I = rand() % 65536 - 32768; + in[i].Q = rand() % 65536 - 32768; + } + + STARCH_BENCHMARK_RUN( magnitude_sc16, in, out_mag, len ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +bool STARCH_BENCHMARK_VERIFY(magnitude_sc16) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const double max_error = 0.015; // tolerate 1.5% error + const double epsilon = 3.0; + bool okay = true; + + for (unsigned i = 0; i < len; ++i) { + double I = in[i].I / 32768.0; + double Q = in[i].Q / 32768.0; + double expected = round(sqrt(I * I + Q * Q) * 65536.0); + if (expected > 65535.0) + expected = 65535.0; + double actual = out[i]; + + double error = fabs(expected - actual); + double error_fraction = error / (expected > epsilon ? 
expected : epsilon); + if (error > epsilon && error_fraction > max_error) { + fprintf(stderr, "verification failed: in[%u].I=%d in[%u].Q=%d out[%u]=%u, expected=%.0f, error=%.2f%%\n", + i, in[i].I, + i, in[i].Q, + i, out[i], + expected, + error_fraction * 100.0); + okay = false; + } + } + + return okay; +} diff --git a/dsp/benchmark/magnitude_sc16q11_benchmark.c b/dsp/benchmark/magnitude_sc16q11_benchmark.c new file mode 100644 index 0000000..a08b96e --- /dev/null +++ b/dsp/benchmark/magnitude_sc16q11_benchmark.c @@ -0,0 +1,79 @@ +#include +#include +#include + +void STARCH_BENCHMARK(magnitude_sc16q11) (void) +{ + sc16_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 65536; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, *out_mag))) { + goto done; + } + + unsigned i = 0; + + // 0.9 magnitude, varying phase + double degrees = 0; + for (; i < len && degrees < 360; i += 1, degrees += 1) { + in[i].I = (int16_t) (0.9 * cos(degrees * M_PI / 180.0) * 2048.0); + in[i].Q = (int16_t) (0.9 * sin(degrees * M_PI / 180.0) * 2048.0); + } + + // 0, 45, 90 degree phase, full input range + unsigned sequence = 0; + for (; (i+3) <= len && sequence < 4096; i += 3, sequence += 1) { + in[i + 0].I = (int16_t) (sequence - 2048); + in[i + 0].Q = 0; + + in[i + 1].I = (int16_t) (sequence - 2048); + in[i + 1].Q = (int16_t) (sequence - 2048); + + in[i + 2].I = 0; + in[i + 2].Q = (int16_t) (sequence - 2048); + } + + // Fill the rest with random values + srand(1); + for (; i < len; ++i) { + in[i].I = rand() % 4096 - 2048; + in[i].Q = rand() % 4096 - 2048; + } + + STARCH_BENCHMARK_RUN( magnitude_sc16q11, in, out_mag, len ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +bool STARCH_BENCHMARK_VERIFY(magnitude_sc16q11) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const double max_error = 0.015; // tolerate 1.5% error + const double epsilon = 3.0; + bool okay = true; + + for (unsigned i = 0; i < len; 
++i) { + double I = in[i].I / 2048.0; + double Q = in[i].Q / 2048.0; + double expected = round(sqrt(I * I + Q * Q) * 65536.0); + if (expected > 65535.0) + expected = 65535.0; + double actual = out[i]; + + double error = fabs(expected - actual); + double error_fraction = error / (expected > epsilon ? expected : epsilon); + if (error > epsilon && error_fraction > max_error) { + fprintf(stderr, "verification failed: in[%u].I=%d in[%u].Q=%d out[%u]=%u, expected=%.0f, error=%.2f%%\n", + i, in[i].I, + i, in[i].Q, + i, out[i], + expected, + error_fraction * 100.0); + okay = false; + } + } + + return okay; +} diff --git a/dsp/benchmark/magnitude_uc8_benchmark.c b/dsp/benchmark/magnitude_uc8_benchmark.c new file mode 100644 index 0000000..e03fc0c --- /dev/null +++ b/dsp/benchmark/magnitude_uc8_benchmark.c @@ -0,0 +1,79 @@ +#include +#include + +void STARCH_BENCHMARK(magnitude_uc8) (void) +{ + uc8_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 65536; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, *out_mag))) { + goto done; + } + + unsigned i = 0; + + // 0.9 magnitude, varying phase + double degrees = 0; + for (; i < len && degrees < 360; i += 1, degrees += 1) { + in[i].I = (uint8_t) (0.9 * cos(degrees * M_PI / 180.0) * 128 + 127.4); + in[i].Q = (uint8_t) (0.9 * sin(degrees * M_PI / 180.0) * 128 + 127.4); + } + + // 0, 45, 90 degree phase, full input range + unsigned sequence = 0; + for (; (i+3) <= len && sequence < 256; i += 3, sequence += 1) { + in[i + 0].I = sequence; + in[i + 0].Q = 0; + + in[i + 1].I = sequence; + in[i + 1].Q = sequence; + + in[i + 2].I = 0; + in[i + 2].Q = sequence; + } + + // Fill the rest with random values + srand(1); + for (; i < len; ++i) { + in[i].I = rand() % 256; + in[i].Q = rand() % 256; + } + + STARCH_BENCHMARK_RUN( magnitude_uc8, in, out_mag, len ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +bool STARCH_BENCHMARK_VERIFY(magnitude_uc8) (const 
uc8_t *in, uint16_t *out, unsigned len) +{ + const double max_error = 0.015; // tolerate 1.5% error + const double epsilon = 3.0; + bool okay = true; + + for (unsigned i = 0; i < len; ++i) { + double I = (in[i].I - 127.4) / 128; + double Q = (in[i].Q - 127.4) / 128; + double magsq = I * I + Q * Q; + double expected = round(sqrt(magsq) * 65536.0); + if (expected > 65535.0) + expected = 65535.0; + double actual = out[i]; + + double error = fabs(expected - actual); + double error_fraction = error / (expected > epsilon ? expected : epsilon); + if (error > epsilon && error_fraction > max_error) { + fprintf(stderr, "verification failed: in[%u].I=%u in[%u].Q=%u out[%u]=%u, expected=%.0f, error=%.2f%%\n", + i, in[i].I, + i, in[i].Q, + i, out[i], + expected, + error_fraction * 100.0); + okay = false; + } + } + + return okay; +} diff --git a/dsp/benchmark/mean_power_u16_benchmark.c b/dsp/benchmark/mean_power_u16_benchmark.c new file mode 100644 index 0000000..16c60fd --- /dev/null +++ b/dsp/benchmark/mean_power_u16_benchmark.c @@ -0,0 +1,57 @@ +#include + +void STARCH_BENCHMARK(mean_power_u16) (void) +{ + uint16_t *in = NULL; + double mean_mag, mean_magsq; + const unsigned len = 65536; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in))) { + goto done; + } + + for (unsigned i = 0; i < len; ++i) { + in[i] = i; + } + + STARCH_BENCHMARK_RUN( mean_power_u16, in, len, &mean_mag, &mean_magsq ); + + done: + STARCH_BENCHMARK_FREE(in); +} + +bool STARCH_BENCHMARK_VERIFY(mean_power_u16) (const uint16_t *in, unsigned len, double *out_mag, double *out_magsq) +{ + const double max_error = 0.01; // tolerate 1% error + + double sum_mag = 0; + double sum_magsq = 0; + + for (unsigned i = 0; i < len; ++i) { + double mag = in[i] / 65536.0; + sum_mag += mag; + sum_magsq += mag * mag; + } + + sum_mag /= len; + sum_magsq /= len; + + bool okay = true; + + double mag_error = sum_mag - *out_mag; + if (fabs(mag_error / sum_mag) > max_error) { + fprintf(stderr, "verification failed: expected mean 
magnitude %.5f, got %.5f, error=%.2f%%\n", + sum_mag, *out_mag, 100.0 * mag_error / sum_mag); + okay = false; + } + + + double magsq_error = sum_magsq - *out_magsq; + if (fabs(magsq_error / sum_magsq) > max_error) { + fprintf(stderr, "verification failed: expected mean magnitude-squared %.5f, got %.5f, error=%.2f%%\n", + sum_magsq, *out_magsq, 100.0 * magsq_error / sum_magsq); + okay = false; + } + + return okay; +} diff --git a/dsp/generated/benchmark.c b/dsp/generated/benchmark.c new file mode 100644 index 0000000..56a590c --- /dev/null +++ b/dsp/generated/benchmark.c @@ -0,0 +1,1590 @@ + +/* starch generated code. Do not edit. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "starch.h" + +typedef struct timespec starch_benchmark_time; + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end); +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size); +void starch_benchmark_aligned_free(void *user_ptr); +void starch_benchmark_get_time(starch_benchmark_time *t); + +const unsigned starch_benchmark_warmup_loops = 10; + +typedef struct { + const char *name; + const char *impl; + uint64_t ns; +} starch_benchmark_result; + +static starch_benchmark_result *starch_benchmark_results = NULL; +static unsigned starch_benchmark_result_size = 0; +static unsigned starch_benchmark_result_count = 0; + +typedef struct benchmark_flavor_list_node { + const char *flavor; + struct benchmark_flavor_list_node *next; +} starch_benchmark_flavor_list; + +static starch_benchmark_flavor_list *starch_benchmark_flavor_whitelist = NULL; +static starch_benchmark_flavor_list *starch_benchmark_flavor_blacklist = NULL; + +static bool starch_benchmark_list_only = false; +static bool starch_benchmark_validate_only = false; +static bool starch_benchmark_validation_failed = false; +static bool starch_benchmark_top_only = false; +static unsigned 
starch_benchmark_iterations = 1; + +typedef struct timespec starch_benchmark_time; +void starch_benchmark_get_time(starch_benchmark_time *t) +{ +#ifdef CLOCK_THREAD_CPUTIME_ID + clock_gettime(CLOCK_THREAD_CPUTIME_ID, t); +#else + clock_gettime(CLOCK_MONOTONIC, t); +#endif +} + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end) +{ + return ((uint64_t)end->tv_sec - (uint64_t)start->tv_sec) * 1000000000U + (uint64_t)end->tv_nsec - (uint64_t)start->tv_nsec; +} + +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size) +{ + size_t use_alignment = (type_alignment > alignment ? type_alignment : alignment); + if ( (use_alignment % type_alignment) || (use_alignment % alignment) ) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: conflicting alignment requirements (%zu versus %zu)\n", size, alignment, type_alignment); + return NULL; + } + + /* Over-allocate so we can stash our own pointer before the start, and so that we can adjust + * the returned alignment so it is only aligned to the requested boundary, and not also + * aligned to a larger power of two (we don't want to accidentally benchmark the performance + * of a more restrictive larger alignment) + */ + size_t header_size = (use_alignment < sizeof(void*) ? 
sizeof(void*) : use_alignment); + + /* aligned_alloc() is only portable when the size is a multiple of the alignment, and some + * implementations reject alignments smaller than a pointer. header_size is at least + * sizeof(void*) and (alignments are assumed to be powers of two) a multiple of use_alignment, + * so allocate on that boundary and round the size up to it. + */ + size_t block_size = header_size + size + use_alignment; + block_size += (header_size - block_size % header_size) % header_size; + char *block_ptr = aligned_alloc(header_size, block_size); + if (!block_ptr) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(errno)); + return NULL; + } + + char *user_ptr = block_ptr + header_size; + if ( (uintptr_t)user_ptr % (use_alignment * 2) == 0 ) { + // user_ptr is aligned to the next power of two, but we don't want that, move it on + user_ptr += use_alignment; + } + + /* Stash the block pointer in the bytes just below user_ptr. user_ptr may be deliberately + * misaligned for a void*, so copy bytewise instead of storing through a void** */ + memcpy(user_ptr - sizeof(block_ptr), &block_ptr, sizeof(block_ptr)); + + return user_ptr; +} + +void starch_benchmark_aligned_free(void *user_ptr) +{ + if (!user_ptr) + return; + + /* recover the block pointer stashed by starch_benchmark_aligned_alloc (bytewise, see there) */ + void *block_ptr; + memcpy(&block_ptr, (char*)user_ptr - sizeof(block_ptr), sizeof(block_ptr)); + free(block_ptr); +} + +static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_benchmark_flavor_list *list) +{ + for (; list; list = list->next) { + if (!strcmp(flavor, list->flavor)) + return true; + } + return false; +} + + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_uc8_benchmark (void); +bool starch_magnitude_uc8_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_uc8_benchmark(void); + +static void starch_benchmark_one_magnitude_uc8( starch_magnitude_uc8_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + 
+ if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_uc8_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if 
(!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_uc8"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_uc8( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_uc8_regentry *_entry = starch_magnitude_uc8_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_uc8( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_uc8_aligned_benchmark (void); +bool starch_magnitude_uc8_aligned_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_uc8_aligned_benchmark(void); + +static void starch_benchmark_one_magnitude_uc8_aligned( starch_magnitude_uc8_aligned_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped 
(blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_uc8_aligned_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count 
>= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_uc8_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_uc8_aligned( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_uc8_aligned_regentry *_entry = starch_magnitude_uc8_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_uc8_aligned( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_power_uc8_benchmark (void); +bool starch_magnitude_power_uc8_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_power_uc8_benchmark(void); + +static void starch_benchmark_one_magnitude_power_uc8( starch_magnitude_power_uc8_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist 
&& starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + + /* verify correctness of the output */ + if (! starch_magnitude_power_uc8_benchmark_verify ( arg0, arg1, arg2, arg3, arg4 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + 
else
+        _per_loop = _elapsed / _loops / starch_benchmark_iterations;
+
+    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);
+
+    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
+        if (!starch_benchmark_result_size)
+            starch_benchmark_result_size = 64;
+        else
+            starch_benchmark_result_size *= 2;
+        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
+        if (!starch_benchmark_results) {
+            fprintf(stderr, "realloc: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+
+    starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_power_uc8";
+    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
+    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
+    ++starch_benchmark_result_count;
+}
+ /* Benchmark every entry of the magnitude_power_uc8 registry with the same arguments; iteration stops at the first entry whose name is NULL. */
+static void starch_benchmark_run_magnitude_power_uc8( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 )
+{
+    for (starch_magnitude_power_uc8_regentry *_entry = starch_magnitude_power_uc8_registry; _entry->name; ++_entry) {
+        starch_benchmark_one_magnitude_power_uc8( _entry, arg0, arg1, arg2, arg3, arg4 );
+    }
+}
+
+/* prototypes for benchmark helpers provided by user code */
+void starch_magnitude_power_uc8_aligned_benchmark (void);
+bool starch_magnitude_power_uc8_aligned_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 );
+
+/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
+void starch_magnitude_power_uc8_aligned_benchmark(void);
+ /* Benchmark one magnitude_power_uc8_aligned registry entry: prints the entry name, then a status or a ns/call figure, all on stderr. */
+static void starch_benchmark_one_magnitude_power_uc8_aligned( starch_magnitude_power_uc8_aligned_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 )
+{
+    fprintf(stderr, " %-40s ", _entry->name);
+
+    /* test for support: an entry with no flavor_supported hook is never reported as unsupported */
+    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
+        fprintf(stderr, "unsupported\n");
+        return;
+    }
+
+
if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
+        fprintf(stderr, "skipped (not whitelisted)\n");
+        return;
+    }
+ /* a blacklist, when present, excludes the flavors it names */
+    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
+        fprintf(stderr, "skipped (blacklisted)\n");
+        return;
+    }
+ /* list-only mode stops here, before the implementation is ever called */
+    if (starch_benchmark_list_only) {
+        fprintf(stderr, "supported\n");
+        return;
+    }
+
+    /* initial warmup: untimed calls made before verification and measurement */
+    for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop)
+        _entry->callable ( arg0, arg1, arg2, arg3, arg4 );
+
+    /* verify correctness of the output; a failure is also latched in starch_benchmark_validation_failed */
+    if (! starch_magnitude_power_uc8_aligned_benchmark_verify ( arg0, arg1, arg2, arg3, arg4 )) {
+        fprintf(stderr, "skipped (verification failed)\n");
+        starch_benchmark_validation_failed = true;
+        return;
+    }
+
+    if (starch_benchmark_validate_only) {
+        fprintf(stderr, "validation ok\n");
+        return;
+    }
+
+    /* pre-benchmark, find a loop count that takes at least 100ms */
+    starch_benchmark_time _start, _end;
+    uint64_t _elapsed = 0;
+    uint64_t _loops = 127; /* doubled before each timed pass, so the first pass makes 254 calls */
+    while (_elapsed < 100000000) {
+        _loops *= 2;
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2, arg3, arg4 );
+        starch_benchmark_get_time(&_end);
+        _elapsed = starch_benchmark_elapsed(&_start, &_end);
+    }
+
+    /* real benchmark, run for approx 1 second: rescale the calibrated loop count by 1000000000 / elapsed */
+    _loops = _loops * 1000000000 / _elapsed;
+
+    _elapsed = 0;
+    uint64_t _elapsed_min = UINT64_MAX;
+    uint64_t _elapsed_max = 0;
+    for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) {
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2, arg3, arg4 );
+        starch_benchmark_get_time(&_end);
+        uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end);
+        if (_elapsed_one < _elapsed_min)
+            _elapsed_min = _elapsed_one;
+        if (_elapsed_one >
_elapsed_max)
+            _elapsed_max = _elapsed_one;
+        _elapsed += _elapsed_one;
+    }
+
+    uint64_t _per_loop; /* mean time per call; with more than 2 iterations the fastest and slowest passes are excluded */
+    if (starch_benchmark_iterations > 2)
+        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2);
+    else
+        _per_loop = _elapsed / _loops / starch_benchmark_iterations;
+
+    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);
+ /* grow the results array when it is full: 64 entries at first, doubling afterwards */
+    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
+        if (!starch_benchmark_result_size)
+            starch_benchmark_result_size = 64;
+        else
+            starch_benchmark_result_size *= 2;
+        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
+        if (!starch_benchmark_results) {
+            fprintf(stderr, "realloc: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+ /* append this measurement to the results array */
+    starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_power_uc8_aligned";
+    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
+    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
+    ++starch_benchmark_result_count;
+}
+ /* Benchmark every entry of the magnitude_power_uc8_aligned registry with the same arguments; iteration stops at the first entry whose name is NULL. */
+static void starch_benchmark_run_magnitude_power_uc8_aligned( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 )
+{
+    for (starch_magnitude_power_uc8_aligned_regentry *_entry = starch_magnitude_power_uc8_aligned_registry; _entry->name; ++_entry) {
+        starch_benchmark_one_magnitude_power_uc8_aligned( _entry, arg0, arg1, arg2, arg3, arg4 );
+    }
+}
+
+/* prototypes for benchmark helpers provided by user code */
+void starch_magnitude_sc16_benchmark (void);
+bool starch_magnitude_sc16_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 );
+
+/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
+void starch_magnitude_sc16_benchmark(void);
+ /* Benchmark one magnitude_sc16 registry entry: prints the entry name, then a status or a ns/call figure, all on stderr. */
+static void starch_benchmark_one_magnitude_sc16( starch_magnitude_sc16_regentry * _entry, const sc16_t * arg0, uint16_t * arg1, unsigned arg2 )
+{
+    fprintf(stderr, " %-40s ",
_entry->name);
+
+    /* test for support: an entry with no flavor_supported hook is never reported as unsupported */
+    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
+        fprintf(stderr, "unsupported\n");
+        return;
+    }
+ /* a whitelist, when present, restricts the run to the flavors it names */
+    if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
+        fprintf(stderr, "skipped (not whitelisted)\n");
+        return;
+    }
+ /* a blacklist, when present, excludes the flavors it names */
+    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
+        fprintf(stderr, "skipped (blacklisted)\n");
+        return;
+    }
+ /* list-only mode stops here, before the implementation is ever called */
+    if (starch_benchmark_list_only) {
+        fprintf(stderr, "supported\n");
+        return;
+    }
+
+    /* initial warmup: untimed calls made before verification and measurement */
+    for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop)
+        _entry->callable ( arg0, arg1, arg2 );
+
+    /* verify correctness of the output; a failure is also latched in starch_benchmark_validation_failed */
+    if (! starch_magnitude_sc16_benchmark_verify ( arg0, arg1, arg2 )) {
+        fprintf(stderr, "skipped (verification failed)\n");
+        starch_benchmark_validation_failed = true;
+        return;
+    }
+
+    if (starch_benchmark_validate_only) {
+        fprintf(stderr, "validation ok\n");
+        return;
+    }
+
+    /* pre-benchmark, find a loop count that takes at least 100ms */
+    starch_benchmark_time _start, _end;
+    uint64_t _elapsed = 0;
+    uint64_t _loops = 127; /* doubled before each timed pass, so the first pass makes 254 calls */
+    while (_elapsed < 100000000) {
+        _loops *= 2;
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2 );
+        starch_benchmark_get_time(&_end);
+        _elapsed = starch_benchmark_elapsed(&_start, &_end);
+    }
+
+    /* real benchmark, run for approx 1 second: rescale the calibrated loop count by 1000000000 / elapsed */
+    _loops = _loops * 1000000000 / _elapsed;
+
+    _elapsed = 0;
+    uint64_t _elapsed_min = UINT64_MAX;
+    uint64_t _elapsed_max = 0;
+    for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) {
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2 );
+        starch_benchmark_get_time(&_end);
+        uint64_t _elapsed_one =
starch_benchmark_elapsed(&_start, &_end);
+        if (_elapsed_one < _elapsed_min)
+            _elapsed_min = _elapsed_one;
+        if (_elapsed_one > _elapsed_max)
+            _elapsed_max = _elapsed_one;
+        _elapsed += _elapsed_one;
+    }
+
+    uint64_t _per_loop; /* mean time per call; with more than 2 iterations the fastest and slowest passes are excluded */
+    if (starch_benchmark_iterations > 2)
+        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2);
+    else
+        _per_loop = _elapsed / _loops / starch_benchmark_iterations;
+
+    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);
+ /* grow the results array when it is full: 64 entries at first, doubling afterwards */
+    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
+        if (!starch_benchmark_result_size)
+            starch_benchmark_result_size = 64;
+        else
+            starch_benchmark_result_size *= 2;
+        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
+        if (!starch_benchmark_results) {
+            fprintf(stderr, "realloc: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+ /* append this measurement to the results array */
+    starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_sc16";
+    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
+    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
+    ++starch_benchmark_result_count;
+}
+ /* Benchmark every entry of the magnitude_sc16 registry with the same arguments; iteration stops at the first entry whose name is NULL. */
+static void starch_benchmark_run_magnitude_sc16( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 )
+{
+    for (starch_magnitude_sc16_regentry *_entry = starch_magnitude_sc16_registry; _entry->name; ++_entry) {
+        starch_benchmark_one_magnitude_sc16( _entry, arg0, arg1, arg2 );
+    }
+}
+
+/* prototypes for benchmark helpers provided by user code */
+void starch_magnitude_sc16_aligned_benchmark (void);
+bool starch_magnitude_sc16_aligned_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 );
+
+/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
+void starch_magnitude_sc16_aligned_benchmark(void);
+ /* Benchmark one magnitude_sc16_aligned registry entry: prints the entry name, then a status or a ns/call figure, all on stderr. */
+static void starch_benchmark_one_magnitude_sc16_aligned( starch_magnitude_sc16_aligned_regentry * _entry, const sc16_t * arg0,
uint16_t * arg1, unsigned arg2 )
+{
+    fprintf(stderr, " %-40s ", _entry->name);
+
+    /* test for support: an entry with no flavor_supported hook is never reported as unsupported */
+    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
+        fprintf(stderr, "unsupported\n");
+        return;
+    }
+ /* a whitelist, when present, restricts the run to the flavors it names */
+    if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
+        fprintf(stderr, "skipped (not whitelisted)\n");
+        return;
+    }
+ /* a blacklist, when present, excludes the flavors it names */
+    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
+        fprintf(stderr, "skipped (blacklisted)\n");
+        return;
+    }
+ /* list-only mode stops here, before the implementation is ever called */
+    if (starch_benchmark_list_only) {
+        fprintf(stderr, "supported\n");
+        return;
+    }
+
+    /* initial warmup: untimed calls made before verification and measurement */
+    for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop)
+        _entry->callable ( arg0, arg1, arg2 );
+
+    /* verify correctness of the output; a failure is also latched in starch_benchmark_validation_failed */
+    if (! starch_magnitude_sc16_aligned_benchmark_verify ( arg0, arg1, arg2 )) {
+        fprintf(stderr, "skipped (verification failed)\n");
+        starch_benchmark_validation_failed = true;
+        return;
+    }
+
+    if (starch_benchmark_validate_only) {
+        fprintf(stderr, "validation ok\n");
+        return;
+    }
+
+    /* pre-benchmark, find a loop count that takes at least 100ms */
+    starch_benchmark_time _start, _end;
+    uint64_t _elapsed = 0;
+    uint64_t _loops = 127; /* doubled before each timed pass, so the first pass makes 254 calls */
+    while (_elapsed < 100000000) {
+        _loops *= 2;
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2 );
+        starch_benchmark_get_time(&_end);
+        _elapsed = starch_benchmark_elapsed(&_start, &_end);
+    }
+
+    /* real benchmark, run for approx 1 second: rescale the calibrated loop count by 1000000000 / elapsed */
+    _loops = _loops * 1000000000 / _elapsed;
+
+    _elapsed = 0;
+    uint64_t _elapsed_min = UINT64_MAX;
+    uint64_t _elapsed_max = 0;
+    for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) {
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2 );
+
starch_benchmark_get_time(&_end);
+        uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end);
+        if (_elapsed_one < _elapsed_min)
+            _elapsed_min = _elapsed_one;
+        if (_elapsed_one > _elapsed_max)
+            _elapsed_max = _elapsed_one;
+        _elapsed += _elapsed_one;
+    }
+
+    uint64_t _per_loop; /* mean time per call; with more than 2 iterations the fastest and slowest passes are excluded */
+    if (starch_benchmark_iterations > 2)
+        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2);
+    else
+        _per_loop = _elapsed / _loops / starch_benchmark_iterations;
+
+    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);
+ /* grow the results array when it is full: 64 entries at first, doubling afterwards */
+    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
+        if (!starch_benchmark_result_size)
+            starch_benchmark_result_size = 64;
+        else
+            starch_benchmark_result_size *= 2;
+        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
+        if (!starch_benchmark_results) {
+            fprintf(stderr, "realloc: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+ /* append this measurement to the results array */
+    starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_sc16_aligned";
+    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
+    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
+    ++starch_benchmark_result_count;
+}
+ /* Benchmark every entry of the magnitude_sc16_aligned registry with the same arguments; iteration stops at the first entry whose name is NULL. */
+static void starch_benchmark_run_magnitude_sc16_aligned( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 )
+{
+    for (starch_magnitude_sc16_aligned_regentry *_entry = starch_magnitude_sc16_aligned_registry; _entry->name; ++_entry) {
+        starch_benchmark_one_magnitude_sc16_aligned( _entry, arg0, arg1, arg2 );
+    }
+}
+
+/* prototypes for benchmark helpers provided by user code */
+void starch_magnitude_sc16q11_benchmark (void);
+bool starch_magnitude_sc16q11_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 );
+
+/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
+void starch_magnitude_sc16q11_benchmark(void);
+
+static void
starch_benchmark_one_magnitude_sc16q11( starch_magnitude_sc16q11_regentry * _entry, const sc16_t * arg0, uint16_t * arg1, unsigned arg2 )
+{ /* benchmarks one magnitude_sc16q11 registry entry; all progress and status output goes to stderr */
+    fprintf(stderr, " %-40s ", _entry->name);
+
+    /* test for support: an entry with no flavor_supported hook is never reported as unsupported */
+    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
+        fprintf(stderr, "unsupported\n");
+        return;
+    }
+ /* a whitelist, when present, restricts the run to the flavors it names */
+    if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
+        fprintf(stderr, "skipped (not whitelisted)\n");
+        return;
+    }
+ /* a blacklist, when present, excludes the flavors it names */
+    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
+        fprintf(stderr, "skipped (blacklisted)\n");
+        return;
+    }
+ /* list-only mode stops here, before the implementation is ever called */
+    if (starch_benchmark_list_only) {
+        fprintf(stderr, "supported\n");
+        return;
+    }
+
+    /* initial warmup: untimed calls made before verification and measurement */
+    for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop)
+        _entry->callable ( arg0, arg1, arg2 );
+
+    /* verify correctness of the output; a failure is also latched in starch_benchmark_validation_failed */
+    if (! starch_magnitude_sc16q11_benchmark_verify ( arg0, arg1, arg2 )) {
+        fprintf(stderr, "skipped (verification failed)\n");
+        starch_benchmark_validation_failed = true;
+        return;
+    }
+
+    if (starch_benchmark_validate_only) {
+        fprintf(stderr, "validation ok\n");
+        return;
+    }
+
+    /* pre-benchmark, find a loop count that takes at least 100ms */
+    starch_benchmark_time _start, _end;
+    uint64_t _elapsed = 0;
+    uint64_t _loops = 127; /* doubled before each timed pass, so the first pass makes 254 calls */
+    while (_elapsed < 100000000) {
+        _loops *= 2;
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2 );
+        starch_benchmark_get_time(&_end);
+        _elapsed = starch_benchmark_elapsed(&_start, &_end);
+    }
+
+    /* real benchmark, run for approx 1 second: rescale the calibrated loop count by 1000000000 / elapsed */
+    _loops = _loops * 1000000000 / _elapsed;
+
+    _elapsed = 0;
+    uint64_t _elapsed_min = UINT64_MAX;
+    uint64_t _elapsed_max = 0;
+    for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) {
+        starch_benchmark_get_time(&_start);
+        for
(uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2 );
+        starch_benchmark_get_time(&_end);
+        uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end);
+        if (_elapsed_one < _elapsed_min)
+            _elapsed_min = _elapsed_one;
+        if (_elapsed_one > _elapsed_max)
+            _elapsed_max = _elapsed_one;
+        _elapsed += _elapsed_one;
+    }
+
+    uint64_t _per_loop; /* mean time per call; with more than 2 iterations the fastest and slowest passes are excluded */
+    if (starch_benchmark_iterations > 2)
+        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2);
+    else
+        _per_loop = _elapsed / _loops / starch_benchmark_iterations;
+
+    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);
+ /* grow the results array when it is full: 64 entries at first, doubling afterwards */
+    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
+        if (!starch_benchmark_result_size)
+            starch_benchmark_result_size = 64;
+        else
+            starch_benchmark_result_size *= 2;
+        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
+        if (!starch_benchmark_results) {
+            fprintf(stderr, "realloc: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+ /* append this measurement to the results array */
+    starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_sc16q11";
+    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
+    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
+    ++starch_benchmark_result_count;
+}
+ /* Benchmark every entry of the magnitude_sc16q11 registry with the same arguments; iteration stops at the first entry whose name is NULL. */
+static void starch_benchmark_run_magnitude_sc16q11( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 )
+{
+    for (starch_magnitude_sc16q11_regentry *_entry = starch_magnitude_sc16q11_registry; _entry->name; ++_entry) {
+        starch_benchmark_one_magnitude_sc16q11( _entry, arg0, arg1, arg2 );
+    }
+}
+
+/* prototypes for benchmark helpers provided by user code */
+void starch_magnitude_sc16q11_aligned_benchmark (void);
+bool starch_magnitude_sc16q11_aligned_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 );
+
+/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
+void
starch_magnitude_sc16q11_aligned_benchmark(void);
+ /* Benchmark one magnitude_sc16q11_aligned registry entry: prints the entry name, then a status or a ns/call figure, all on stderr. */
+static void starch_benchmark_one_magnitude_sc16q11_aligned( starch_magnitude_sc16q11_aligned_regentry * _entry, const sc16_t * arg0, uint16_t * arg1, unsigned arg2 )
+{
+    fprintf(stderr, " %-40s ", _entry->name);
+
+    /* test for support: an entry with no flavor_supported hook is never reported as unsupported */
+    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
+        fprintf(stderr, "unsupported\n");
+        return;
+    }
+ /* a whitelist, when present, restricts the run to the flavors it names */
+    if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
+        fprintf(stderr, "skipped (not whitelisted)\n");
+        return;
+    }
+ /* a blacklist, when present, excludes the flavors it names */
+    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
+        fprintf(stderr, "skipped (blacklisted)\n");
+        return;
+    }
+ /* list-only mode stops here, before the implementation is ever called */
+    if (starch_benchmark_list_only) {
+        fprintf(stderr, "supported\n");
+        return;
+    }
+
+    /* initial warmup: untimed calls made before verification and measurement */
+    for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop)
+        _entry->callable ( arg0, arg1, arg2 );
+
+    /* verify correctness of the output */
+    if (!
starch_magnitude_sc16q11_aligned_benchmark_verify ( arg0, arg1, arg2 )) {
+        fprintf(stderr, "skipped (verification failed)\n");
+        starch_benchmark_validation_failed = true; /* latch the failure in the file-level flag */
+        return;
+    }
+ /* validate-only mode stops here, before any timing is done */
+    if (starch_benchmark_validate_only) {
+        fprintf(stderr, "validation ok\n");
+        return;
+    }
+
+    /* pre-benchmark, find a loop count that takes at least 100ms */
+    starch_benchmark_time _start, _end;
+    uint64_t _elapsed = 0;
+    uint64_t _loops = 127; /* doubled before each timed pass, so the first pass makes 254 calls */
+    while (_elapsed < 100000000) {
+        _loops *= 2;
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2 );
+        starch_benchmark_get_time(&_end);
+        _elapsed = starch_benchmark_elapsed(&_start, &_end);
+    }
+
+    /* real benchmark, run for approx 1 second: rescale the calibrated loop count by 1000000000 / elapsed */
+    _loops = _loops * 1000000000 / _elapsed;
+
+    _elapsed = 0;
+    uint64_t _elapsed_min = UINT64_MAX;
+    uint64_t _elapsed_max = 0;
+    for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) {
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2 );
+        starch_benchmark_get_time(&_end);
+        uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end);
+        if (_elapsed_one < _elapsed_min)
+            _elapsed_min = _elapsed_one;
+        if (_elapsed_one > _elapsed_max)
+            _elapsed_max = _elapsed_one;
+        _elapsed += _elapsed_one;
+    }
+
+    uint64_t _per_loop; /* mean time per call; with more than 2 iterations the fastest and slowest passes are excluded */
+    if (starch_benchmark_iterations > 2)
+        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2);
+    else
+        _per_loop = _elapsed / _loops / starch_benchmark_iterations;
+
+    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);
+ /* grow the results array when it is full: 64 entries at first, doubling afterwards */
+    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
+        if (!starch_benchmark_result_size)
+            starch_benchmark_result_size = 64;
+        else
+            starch_benchmark_result_size *= 2;
+        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
+        if
(!starch_benchmark_results) {
+            fprintf(stderr, "realloc: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+ /* append this measurement to the results array */
+    starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_sc16q11_aligned";
+    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
+    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
+    ++starch_benchmark_result_count;
+}
+ /* Benchmark every entry of the magnitude_sc16q11_aligned registry with the same arguments; iteration stops at the first entry whose name is NULL. */
+static void starch_benchmark_run_magnitude_sc16q11_aligned( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 )
+{
+    for (starch_magnitude_sc16q11_aligned_regentry *_entry = starch_magnitude_sc16q11_aligned_registry; _entry->name; ++_entry) {
+        starch_benchmark_one_magnitude_sc16q11_aligned( _entry, arg0, arg1, arg2 );
+    }
+}
+
+/* prototypes for benchmark helpers provided by user code */
+void starch_mean_power_u16_benchmark (void);
+bool starch_mean_power_u16_benchmark_verify ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 );
+
+/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
+void starch_mean_power_u16_benchmark(void);
+ /* Benchmark one mean_power_u16 registry entry: prints the entry name, then a status or a ns/call figure, all on stderr. */
+static void starch_benchmark_one_mean_power_u16( starch_mean_power_u16_regentry * _entry, const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 )
+{
+    fprintf(stderr, " %-40s ", _entry->name);
+
+    /* test for support: an entry with no flavor_supported hook is never reported as unsupported */
+    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
+        fprintf(stderr, "unsupported\n");
+        return;
+    }
+ /* a whitelist, when present, restricts the run to the flavors it names */
+    if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
+        fprintf(stderr, "skipped (not whitelisted)\n");
+        return;
+    }
+ /* a blacklist, when present, excludes the flavors it names */
+    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
+        fprintf(stderr, "skipped (blacklisted)\n");
+        return;
+    }
+ /* list-only mode stops here, before the implementation is ever called */
+    if (starch_benchmark_list_only) {
+        fprintf(stderr, "supported\n");
+        return;
+    }
+
+    /* initial warmup */
+    for (unsigned _loop = 0; _loop <
starch_benchmark_warmup_loops; ++_loop)
+        _entry->callable ( arg0, arg1, arg2, arg3 );
+
+    /* verify correctness of the output; a failure is also latched in starch_benchmark_validation_failed */
+    if (! starch_mean_power_u16_benchmark_verify ( arg0, arg1, arg2, arg3 )) {
+        fprintf(stderr, "skipped (verification failed)\n");
+        starch_benchmark_validation_failed = true;
+        return;
+    }
+ /* validate-only mode stops here, before any timing is done */
+    if (starch_benchmark_validate_only) {
+        fprintf(stderr, "validation ok\n");
+        return;
+    }
+
+    /* pre-benchmark, find a loop count that takes at least 100ms */
+    starch_benchmark_time _start, _end;
+    uint64_t _elapsed = 0;
+    uint64_t _loops = 127; /* doubled before each timed pass, so the first pass makes 254 calls */
+    while (_elapsed < 100000000) {
+        _loops *= 2;
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2, arg3 );
+        starch_benchmark_get_time(&_end);
+        _elapsed = starch_benchmark_elapsed(&_start, &_end);
+    }
+
+    /* real benchmark, run for approx 1 second: rescale the calibrated loop count by 1000000000 / elapsed */
+    _loops = _loops * 1000000000 / _elapsed;
+
+    _elapsed = 0;
+    uint64_t _elapsed_min = UINT64_MAX;
+    uint64_t _elapsed_max = 0;
+    for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) {
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2, arg3 );
+        starch_benchmark_get_time(&_end);
+        uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end);
+        if (_elapsed_one < _elapsed_min)
+            _elapsed_min = _elapsed_one;
+        if (_elapsed_one > _elapsed_max)
+            _elapsed_max = _elapsed_one;
+        _elapsed += _elapsed_one;
+    }
+
+    uint64_t _per_loop; /* mean time per call; with more than 2 iterations the fastest and slowest passes are excluded */
+    if (starch_benchmark_iterations > 2)
+        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2);
+    else
+        _per_loop = _elapsed / _loops / starch_benchmark_iterations;
+
+    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);
+ /* grow the results array when it is full: 64 entries at first, doubling afterwards */
+    if (starch_benchmark_result_count >= starch_benchmark_result_size) {
+        if (!starch_benchmark_result_size)
+            starch_benchmark_result_size = 64;
+        else
+            starch_benchmark_result_size *= 2;
+
starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
+        if (!starch_benchmark_results) {
+            fprintf(stderr, "realloc: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+ /* append this measurement to the results array */
+    starch_benchmark_results[starch_benchmark_result_count].name = "mean_power_u16";
+    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
+    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
+    ++starch_benchmark_result_count;
+}
+ /* Benchmark every entry of the mean_power_u16 registry with the same arguments; iteration stops at the first entry whose name is NULL. */
+static void starch_benchmark_run_mean_power_u16( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 )
+{
+    for (starch_mean_power_u16_regentry *_entry = starch_mean_power_u16_registry; _entry->name; ++_entry) {
+        starch_benchmark_one_mean_power_u16( _entry, arg0, arg1, arg2, arg3 );
+    }
+}
+
+/* prototypes for benchmark helpers provided by user code */
+void starch_mean_power_u16_aligned_benchmark (void);
+bool starch_mean_power_u16_aligned_benchmark_verify ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 );
+
+/* prototype the benchmarking function so that we can build with -Wmissing-declarations */
+void starch_mean_power_u16_aligned_benchmark(void);
+ /* Benchmark one mean_power_u16_aligned registry entry: prints the entry name, then a status or a ns/call figure, all on stderr. */
+static void starch_benchmark_one_mean_power_u16_aligned( starch_mean_power_u16_aligned_regentry * _entry, const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 )
+{
+    fprintf(stderr, " %-40s ", _entry->name);
+
+    /* test for support: an entry with no flavor_supported hook is never reported as unsupported */
+    if (_entry->flavor_supported && !(_entry->flavor_supported())) {
+        fprintf(stderr, "unsupported\n");
+        return;
+    }
+ /* a whitelist, when present, restricts the run to the flavors it names */
+    if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) {
+        fprintf(stderr, "skipped (not whitelisted)\n");
+        return;
+    }
+ /* a blacklist, when present, excludes the flavors it names */
+    if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) {
+        fprintf(stderr, "skipped (blacklisted)\n");
+        return;
+    }
+
+    if
(starch_benchmark_list_only) {
+        fprintf(stderr, "supported\n");
+        return;
+    }
+
+    /* initial warmup: untimed calls made before verification and measurement */
+    for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop)
+        _entry->callable ( arg0, arg1, arg2, arg3 );
+
+    /* verify correctness of the output; a failure is also latched in starch_benchmark_validation_failed */
+    if (! starch_mean_power_u16_aligned_benchmark_verify ( arg0, arg1, arg2, arg3 )) {
+        fprintf(stderr, "skipped (verification failed)\n");
+        starch_benchmark_validation_failed = true;
+        return;
+    }
+ /* validate-only mode stops here, before any timing is done */
+    if (starch_benchmark_validate_only) {
+        fprintf(stderr, "validation ok\n");
+        return;
+    }
+
+    /* pre-benchmark, find a loop count that takes at least 100ms */
+    starch_benchmark_time _start, _end;
+    uint64_t _elapsed = 0;
+    uint64_t _loops = 127; /* doubled before each timed pass, so the first pass makes 254 calls */
+    while (_elapsed < 100000000) {
+        _loops *= 2;
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2, arg3 );
+        starch_benchmark_get_time(&_end);
+        _elapsed = starch_benchmark_elapsed(&_start, &_end);
+    }
+
+    /* real benchmark, run for approx 1 second: rescale the calibrated loop count by 1000000000 / elapsed */
+    _loops = _loops * 1000000000 / _elapsed;
+
+    _elapsed = 0;
+    uint64_t _elapsed_min = UINT64_MAX;
+    uint64_t _elapsed_max = 0;
+    for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) {
+        starch_benchmark_get_time(&_start);
+        for (uint64_t _loop = 0; _loop < _loops; ++_loop)
+            _entry->callable ( arg0, arg1, arg2, arg3 );
+        starch_benchmark_get_time(&_end);
+        uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end);
+        if (_elapsed_one < _elapsed_min)
+            _elapsed_min = _elapsed_one;
+        if (_elapsed_one > _elapsed_max)
+            _elapsed_max = _elapsed_one;
+        _elapsed += _elapsed_one;
+    }
+
+    uint64_t _per_loop; /* mean time per call; with more than 2 iterations the fastest and slowest passes are excluded */
+    if (starch_benchmark_iterations > 2)
+        _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2);
+    else
+        _per_loop = _elapsed / _loops / starch_benchmark_iterations;
+
+    fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop);
+
+    if (starch_benchmark_result_count >=
starch_benchmark_result_size) {
+        if (!starch_benchmark_result_size)
+            starch_benchmark_result_size = 64;
+        else
+            starch_benchmark_result_size *= 2;
+        starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results));
+        if (!starch_benchmark_results) {
+            fprintf(stderr, "realloc: %s\n", strerror(errno));
+            exit(1);
+        }
+    }
+ /* append this measurement to the results array */
+    starch_benchmark_results[starch_benchmark_result_count].name = "mean_power_u16_aligned";
+    starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name;
+    starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop;
+    ++starch_benchmark_result_count;
+}
+ /* Benchmark every entry of the mean_power_u16_aligned registry with the same arguments; iteration stops at the first entry whose name is NULL. */
+static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 )
+{
+    for (starch_mean_power_u16_aligned_regentry *_entry = starch_mean_power_u16_aligned_registry; _entry->name; ++_entry) {
+        starch_benchmark_one_mean_power_u16_aligned( _entry, arg0, arg1, arg2, arg3 );
+    }
+}
+
+
+#undef STARCH_ALIGNMENT
+ /* Macro set for the unaligned variants: alignment 1, STARCH_ALIGNED is the identity, and generated names end in _benchmark / _benchmark_sym. */
+#define STARCH_ALIGNMENT 1
+#define STARCH_ALIGNED(_ptr) (_ptr)
+#define STARCH_SYMBOL(_name) starch_ ## _name ## _benchmark_sym
+#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _dummy_benchmark
+#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl)
+#define STARCH_BENCHMARK(_function) starch_ ## _function ## _benchmark
+#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _benchmark_verify
+#define STARCH_BENCHMARK_RUN(_function, ...)
starch_benchmark_run_ ## _function ( __VA_ARGS__ )
+#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type))
+#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr)
+ /* first inclusion of the benchmark sources, expanded with the macro definitions currently in effect */
+#include "../benchmark/magnitude_sc16_benchmark.c"
+#include "../benchmark/magnitude_uc8_benchmark.c"
+#include "../benchmark/magnitude_power_uc8_benchmark.c"
+#include "../benchmark/mean_power_u16_benchmark.c"
+#include "../benchmark/magnitude_sc16q11_benchmark.c"
+ /* drop every helper macro so that each one can be redefined below */
+#undef STARCH_ALIGNMENT
+#undef STARCH_ALIGNED
+#undef STARCH_SYMBOL
+#undef STARCH_IMPL
+#undef STARCH_IMPL_REQUIRES
+#undef STARCH_BENCHMARK
+#undef STARCH_BENCHMARK_VERIFY
+#undef STARCH_BENCHMARK_RUN
+#undef STARCH_BENCHMARK_ALLOC
+#undef STARCH_BENCHMARK_FREE
+ /* Macro set for the aligned variants: STARCH_MIX_ALIGNMENT, __builtin_assume_aligned on pointers, and an _aligned infix in generated names. */
+#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT
+#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT))
+#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_benchmark_sym
+#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _dummy_benchmark
+#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl)
+#define STARCH_BENCHMARK(_function) starch_ ## _function ## _aligned_benchmark
+#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _aligned_benchmark_verify
+#define STARCH_BENCHMARK_RUN(_function, ...)
starch_benchmark_run_ ## _function ## _aligned ( __VA_ARGS__ )
+#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type))
+#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr)
+ /* the same five benchmark sources, expanded here with the aligned macro definitions */
+#include "../benchmark/magnitude_sc16_benchmark.c"
+#include "../benchmark/magnitude_uc8_benchmark.c"
+#include "../benchmark/magnitude_power_uc8_benchmark.c"
+#include "../benchmark/mean_power_u16_benchmark.c"
+#include "../benchmark/magnitude_sc16q11_benchmark.c"
+ /* Per-function entry points: each prints a banner naming the function on stderr, then calls that function's benchmark driver. */
+static void starch_benchmark_all_magnitude_uc8(void)
+{
+    fprintf(stderr, "==== magnitude_uc8 ===\n");
+    starch_magnitude_uc8_benchmark ();
+}
+static void starch_benchmark_all_magnitude_uc8_aligned(void)
+{
+    fprintf(stderr, "==== magnitude_uc8_aligned ===\n");
+    starch_magnitude_uc8_aligned_benchmark ();
+}
+static void starch_benchmark_all_magnitude_power_uc8(void)
+{
+    fprintf(stderr, "==== magnitude_power_uc8 ===\n");
+    starch_magnitude_power_uc8_benchmark ();
+}
+static void starch_benchmark_all_magnitude_power_uc8_aligned(void)
+{
+    fprintf(stderr, "==== magnitude_power_uc8_aligned ===\n");
+    starch_magnitude_power_uc8_aligned_benchmark ();
+}
+static void starch_benchmark_all_magnitude_sc16(void)
+{
+    fprintf(stderr, "==== magnitude_sc16 ===\n");
+    starch_magnitude_sc16_benchmark ();
+}
+static void starch_benchmark_all_magnitude_sc16_aligned(void)
+{
+    fprintf(stderr, "==== magnitude_sc16_aligned ===\n");
+    starch_magnitude_sc16_aligned_benchmark ();
+}
+static void starch_benchmark_all_magnitude_sc16q11(void)
+{
+    fprintf(stderr, "==== magnitude_sc16q11 ===\n");
+    starch_magnitude_sc16q11_benchmark ();
+}
+static void starch_benchmark_all_magnitude_sc16q11_aligned(void)
+{
+    fprintf(stderr, "==== magnitude_sc16q11_aligned ===\n");
+    starch_magnitude_sc16q11_aligned_benchmark ();
+}
+static void starch_benchmark_all_mean_power_u16(void)
+{
+    fprintf(stderr, "==== mean_power_u16 ===\n");
+
starch_mean_power_u16_benchmark (); +} +static void starch_benchmark_all_mean_power_u16_aligned(void) +{ + fprintf(stderr, "==== mean_power_u16_aligned ===\n"); + starch_mean_power_u16_aligned_benchmark (); +} + +static int starch_benchmark_compare_result(const void *a, const void *b) +{ + const starch_benchmark_result *left = (const starch_benchmark_result *) a; + const starch_benchmark_result *right = (const starch_benchmark_result *) b; + + int name_cmp = strcmp(left->name, right->name); + if (name_cmp) + return name_cmp; + + if (left->ns < right->ns) + return -1; + if (left->ns > right->ns) + return 1; + return 0; +} + +static void starch_benchmark_usage(const char *argv0) +{ + fprintf(stderr, + "Usage: %s [OPTION]... [FUNCTION]...\n" + "Benchmarks starch functions and optionally writes a sorted wisdom file.\n" + "\n" + " -r FILE Read initial wisdom from FILE\n" + " -o FILE Write sorted wisdom to FILE\n" + " -F FLAVOR Add FLAVOR to whitelist\n" + " (default: no whitelist, run all runtime-supported flavors)\n" + " -N FLAVOR Add FLAVOR to blacklist\n" + " (default: no blacklist, run all runtime-supported flavors)\n" + " -l List compiled-in implementations but don't benchmark them\n" + " -V Run validation tests, but don't run benchmarks\n" + " -t Include only the top candidate per function in wisdom output\n" + " -i ITERS Run benchmark ITERS times and use the mean. 
If ITERS > 2, ignore\n" + " the smallest and largest runs when calculating the mean.\n" + " (default: 1 iteration)\n" + " FUNCTION Run benchmarks for these functions only\n" + " (default: benchmark all functions)\n" + "\n" + "Supported flavors: " +#ifdef STARCH_FLAVOR_GENERIC + "generic " +#endif +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 + "armv7a_neon_vfpv4 " +#endif +#ifdef STARCH_FLAVOR_X86_AVX2 + "x86_avx2 " +#endif + "\n" + "Supported functions: " + "magnitude_uc8 " + "magnitude_uc8_aligned " + "magnitude_power_uc8 " + "magnitude_power_uc8_aligned " + "magnitude_sc16 " + "magnitude_sc16_aligned " + "magnitude_sc16q11 " + "magnitude_sc16q11_aligned " + "mean_power_u16 " + "mean_power_u16_aligned " + "\n", argv0); +} + +static void starch_benchmark_append_flavor(const char *flavor, starch_benchmark_flavor_list **list) +{ + starch_benchmark_flavor_list *newnode = malloc(sizeof(*newnode)); + if (!newnode) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + exit(1); + } + + newnode->flavor = flavor; + newnode->next = *list; + *list = newnode; +} + +int main(int argc, char **argv) +{ + int specific = 0; + const char *output_path = NULL; + + int opt; + while ((opt = getopt(argc, argv, "r:o:F:N:i:lhtV")) != -1) { + switch (opt) { + case 'r': + if (starch_read_wisdom(optarg) < 0) { + fprintf(stderr, "%s: cannot read %s: %s\n", argv[0], optarg, strerror(errno)); + return 1; + } + fprintf(stderr, "%s: loaded wisdom file %s\n", argv[0], optarg); + break; + + case 'o': + output_path = optarg; + break; + + case 'F': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_whitelist); + break; + + case 'N': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + 
return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_blacklist); + break; + + case 'l': + starch_benchmark_list_only = true; + break; + + case 't': + starch_benchmark_top_only = true; + break; + + case 'i': + starch_benchmark_iterations = atoi(optarg); + break; + + case 'V': + starch_benchmark_validate_only = true; + break; + + case 'h': + starch_benchmark_usage(argv[0]); + return 0; + + case '?': + default: + starch_benchmark_usage(argv[0]); + return 2; + } + } + + if (starch_benchmark_list_only && output_path) { + fprintf(stderr, "%s: -o and -l options cannot be specified together\n", argv[0]); + return 2; + } + + for (int i = optind; i < argc; ++i) { + if (!strcmp(argv[i], "magnitude_uc8")) { + specific = 1; + starch_benchmark_all_magnitude_uc8(); + continue; + } + if (!strcmp(argv[i], "magnitude_uc8_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_uc8_aligned(); + continue; + } + if (!strcmp(argv[i], "magnitude_power_uc8")) { + specific = 1; + starch_benchmark_all_magnitude_power_uc8(); + continue; + } + if (!strcmp(argv[i], "magnitude_power_uc8_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_power_uc8_aligned(); + continue; + } + if (!strcmp(argv[i], "magnitude_sc16")) { + specific = 1; + starch_benchmark_all_magnitude_sc16(); + continue; + } + if (!strcmp(argv[i], "magnitude_sc16_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_sc16_aligned(); + continue; + } + if (!strcmp(argv[i], "magnitude_sc16q11")) { + specific = 1; + starch_benchmark_all_magnitude_sc16q11(); + continue; + } + if (!strcmp(argv[i], "magnitude_sc16q11_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_sc16q11_aligned(); + continue; + } + if (!strcmp(argv[i], "mean_power_u16")) { + specific = 1; + starch_benchmark_all_mean_power_u16(); + continue; + } + if (!strcmp(argv[i], "mean_power_u16_aligned")) { + specific = 1; + starch_benchmark_all_mean_power_u16_aligned(); + continue; + } + + fprintf(stderr, "%s: 
unrecognized function name: %s\n", argv[0], argv[i]); + return 2; + } + + if (!specific) { + starch_benchmark_all_magnitude_uc8(); + starch_benchmark_all_magnitude_uc8_aligned(); + starch_benchmark_all_magnitude_power_uc8(); + starch_benchmark_all_magnitude_power_uc8_aligned(); + starch_benchmark_all_magnitude_sc16(); + starch_benchmark_all_magnitude_sc16_aligned(); + starch_benchmark_all_magnitude_sc16q11(); + starch_benchmark_all_magnitude_sc16q11_aligned(); + starch_benchmark_all_mean_power_u16(); + starch_benchmark_all_mean_power_u16_aligned(); + } + + if (output_path) { + FILE *out = fopen(output_path, "w"); + if (!out) { + fprintf(stderr, "%s: cannot open %s: %s\n", argv[0], output_path, strerror(errno)); + return 1; + } + + fprintf(out, "# generated by "); + for (int i = 0; i < argc; ++i) + fprintf(out, "%s ", argv[i]); + fprintf(out, "\n\n"); + + qsort(starch_benchmark_results, starch_benchmark_result_count, sizeof(*starch_benchmark_results), starch_benchmark_compare_result); + + const char *last_name = NULL; + bool first = true; + for (unsigned i = 0; i < starch_benchmark_result_count; ++i) { + starch_benchmark_result *result = &starch_benchmark_results[i]; + if (last_name && strcmp(last_name, result->name) != 0) { + fprintf(out, "\n"); + first = true; + } + last_name = result->name; + if (starch_benchmark_top_only && !first) + continue; + fprintf(out, "%-40s %-40s # %" PRIu64 " ns/call\n", result->name, result->impl, result->ns); + first = false; + } + + fclose(out); + fprintf(stderr, "%s: wrote sorted wisdom to %s\n", argv[0], output_path); + } + + return starch_benchmark_validation_failed ? 1 : 0; +} diff --git a/dsp/generated/dispatcher.c b/dsp/generated/dispatcher.c new file mode 100644 index 0000000..565ed76 --- /dev/null +++ b/dsp/generated/dispatcher.c @@ -0,0 +1,1160 @@ + +/* starch generated code. Do not edit. 
*/ + +#include +#include +#include +#include + +#include "starch.h" + +/* helper for re-sorting registries */ +struct starch_regentry_prefix { + int rank; +}; + +static int starch_regentry_rank_compare (const void *l, const void *r) +{ + const struct starch_regentry_prefix *left = l, *right = r; + return left->rank - right->rank; +} + +/* dispatcher / registry for magnitude_uc8 */ + +starch_magnitude_uc8_regentry * starch_magnitude_uc8_select() { + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_uc8_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_select(); + if (!entry) + abort(); + + starch_magnitude_uc8 = entry->callable; + starch_magnitude_uc8 ( arg0, arg1, arg2 ); +} + +starch_magnitude_uc8_ptr starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; + +void starch_magnitude_uc8_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_uc8_regentry *entry; + for (entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_uc8_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_uc8_registry, entry - starch_magnitude_uc8_registry, sizeof(starch_magnitude_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will 
re-select */ + starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; +} + +starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 6, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "exact_x86_avx2", "x86_avx2", starch_magnitude_uc8_exact_x86_avx2, cpu_supports_avx2 }, + { 4, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 5, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } 
+}; + +/* dispatcher / registry for magnitude_uc8_aligned */ + +starch_magnitude_uc8_aligned_regentry * starch_magnitude_uc8_aligned_select() { + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_uc8_aligned_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_select(); + if (!entry) + abort(); + + starch_magnitude_uc8_aligned = entry->callable; + starch_magnitude_uc8_aligned ( arg0, arg1, arg2 ); +} + +starch_magnitude_uc8_aligned_ptr starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; + +void starch_magnitude_uc8_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_uc8_aligned_regentry *entry; + for (entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_uc8_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_uc8_aligned_registry, entry - starch_magnitude_uc8_aligned_registry, sizeof(starch_magnitude_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; +} + +starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[] = { 
+ +#ifdef STARCH_MIX_GENERIC + { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_unroll_4_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 10, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 1, 
"lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_unroll_4_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 4, "exact_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_exact_x86_avx2, cpu_supports_avx2 }, + { 5, "lookup_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 6, "exact_x86_avx2", "x86_avx2", starch_magnitude_uc8_exact_x86_avx2, cpu_supports_avx2 }, + { 7, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 8, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_power_uc8 */ + +starch_magnitude_power_uc8_regentry * starch_magnitude_power_uc8_select() { + for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_power_uc8_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) { + starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_select(); + if (!entry) + abort(); + + starch_magnitude_power_uc8 = entry->callable; + starch_magnitude_power_uc8 ( arg0, arg1, arg2, arg3, arg4 ); +} + +starch_magnitude_power_uc8_ptr starch_magnitude_power_uc8 = starch_magnitude_power_uc8_dispatch; + +void starch_magnitude_power_uc8_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_power_uc8_regentry *entry; + for (entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { + const char * const *search; + for 
(search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_power_uc8_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_power_uc8_registry, entry - starch_magnitude_power_uc8_registry, sizeof(starch_magnitude_power_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_power_uc8 = starch_magnitude_power_uc8_dispatch; +} + +starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 2, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, + { 2, "twopass_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "twopass_generic", "generic", 
starch_magnitude_power_uc8_twopass_generic, NULL }, + { 6, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "twopass_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_twopass_x86_avx2, cpu_supports_avx2 }, + { 1, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 2, "lookup_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 4, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 5, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_power_uc8_aligned */ + +starch_magnitude_power_uc8_aligned_regentry * starch_magnitude_power_uc8_aligned_select() { + for (starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_power_uc8_aligned_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) { + starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_select(); + if (!entry) + abort(); + + starch_magnitude_power_uc8_aligned = entry->callable; + starch_magnitude_power_uc8_aligned ( arg0, arg1, arg2, arg3, arg4 ); +} + +starch_magnitude_power_uc8_aligned_ptr starch_magnitude_power_uc8_aligned = starch_magnitude_power_uc8_aligned_dispatch; + +void starch_magnitude_power_uc8_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_power_uc8_aligned_regentry 
*entry; + for (entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_power_uc8_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_power_uc8_aligned_registry, entry - starch_magnitude_power_uc8_aligned_registry, sizeof(starch_magnitude_power_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_power_uc8_aligned = starch_magnitude_power_uc8_aligned_dispatch; +} + +starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 2, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, + { 2, "twopass_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_twopass_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_lookup_armv7a_neon_vfpv4, 
cpu_supports_armv7_neon_vfpv4 }, + { 4, "lookup_unroll_4_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "twopass_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 10, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "twopass_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_twopass_x86_avx2, cpu_supports_avx2 }, + { 1, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 2, "lookup_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_unroll_4_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 4, "twopass_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_twopass_x86_avx2, cpu_supports_avx2 }, + { 5, "lookup_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 6, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 7, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 8, "lookup_unroll_4_generic", "generic", 
starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_sc16 */ + +starch_magnitude_sc16_regentry * starch_magnitude_sc16_select() { + for (starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_sc16_dispatch ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_select(); + if (!entry) + abort(); + + starch_magnitude_sc16 = entry->callable; + starch_magnitude_sc16 ( arg0, arg1, arg2 ); +} + +starch_magnitude_sc16_ptr starch_magnitude_sc16 = starch_magnitude_sc16_dispatch; + +void starch_magnitude_sc16_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_sc16_regentry *entry; + for (entry = starch_magnitude_sc16_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_sc16_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_sc16_registry, entry - starch_magnitude_sc16_registry, sizeof(starch_magnitude_sc16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16 = starch_magnitude_sc16_dispatch; +} + +starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, 
"exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "exact_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_float_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 3, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_sc16_aligned */ + +starch_magnitude_sc16_aligned_regentry * starch_magnitude_sc16_aligned_select() { + for (starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_sc16_aligned_dispatch ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_select(); + if (!entry) + abort(); + + 
starch_magnitude_sc16_aligned = entry->callable; + starch_magnitude_sc16_aligned ( arg0, arg1, arg2 ); +} + +starch_magnitude_sc16_aligned_ptr starch_magnitude_sc16_aligned = starch_magnitude_sc16_aligned_dispatch; + +void starch_magnitude_sc16_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_sc16_aligned_regentry *entry; + for (entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_sc16_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_sc16_aligned_registry, entry - starch_magnitude_sc16_aligned_registry, sizeof(starch_magnitude_sc16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16_aligned = starch_magnitude_sc16_aligned_dispatch; +} + +starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", 
starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 3, "exact_float_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 4, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 5, "exact_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 6, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 7, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL },
+#endif /* STARCH_MIX_ARM */
+
+#ifdef STARCH_MIX_X86
+    { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16_aligned_exact_float_x86_avx2, cpu_supports_avx2 },
+    { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL },
+    { 2, "exact_u32_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16_aligned_exact_u32_x86_avx2, cpu_supports_avx2 },
+    { 3, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_u32_x86_avx2, cpu_supports_avx2 },
+    { 4, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_float_x86_avx2, cpu_supports_avx2 },
+    { 5, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL },
+#endif /* STARCH_MIX_X86 */
+    { 0, NULL, NULL, NULL, NULL }
+};
+
+/* dispatcher / registry for magnitude_sc16q11 */
+/*
+ * Choose the implementation to use for magnitude_sc16q11.
+ *
+ * Walks starch_magnitude_sc16q11_registry in array order and returns the first
+ * entry that either has no flavor_supported hook (NULL) or whose hook returns
+ * non-zero. Returns NULL when the NULL-name sentinel is reached without finding
+ * a usable entry.
+ */
+starch_magnitude_sc16q11_regentry * starch_magnitude_sc16q11_select() {
+    for (starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_registry;
+         entry->name;
+         ++entry)
+    {
+        if (entry->flavor_supported && !(entry->flavor_supported()))
+            continue; /* a support check exists and it failed: try the next entry */
+        return entry;
+    }
+    return NULL; /* hit the sentinel: nothing in the registry is usable */
+}
+/*
+ * Self-replacing trampoline: the starch_magnitude_sc16q11 pointer targets this
+ * function initially and again after every wisdom update.
+ *
+ * It selects an implementation, stores that implementation's callable in the
+ * starch_magnitude_sc16q11 pointer, and then forwards the current call through
+ * the freshly updated pointer. Calls abort() when select() returns NULL.
+ */
+static void starch_magnitude_sc16q11_dispatch ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) {
+    starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_select();
+    if (!entry)
+        abort();
+
+    starch_magnitude_sc16q11 = entry->callable;
+    starch_magnitude_sc16q11 ( arg0, arg1, arg2 );
+}
+
+starch_magnitude_sc16q11_ptr starch_magnitude_sc16q11 = starch_magnitude_sc16q11_dispatch; /* public entry point; starts at the trampoline */
+/*
+ * Re-order the magnitude_sc16q11 registry from a caller-supplied preference list.
+ *
+ * received_wisdom: array of implementation names terminated by a NULL element
+ *   (the search loop stops at the first NULL). The array pointer itself is
+ *   dereferenced unconditionally, so it must not be NULL.
+ *
+ * Entries named in the list get a rank equal to their list index; all other
+ * entries get (list length + current registry index), which places them after
+ * every listed entry in their current relative order. The registry is then
+ * sorted with starch_regentry_rank_compare and the function pointer is pointed
+ * back at the dispatcher so that the next call selects again.
+ */
+void starch_magnitude_sc16q11_set_wisdom (const char * const * received_wisdom)
+{
+    /* re-rank the registry based on received wisdom */
+    starch_magnitude_sc16q11_regentry *entry;
+    for (entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) {
+        const char * const *search;
+        for (search = received_wisdom; *search; ++search) {
+            if (!strcmp(*search, entry->name)) {
+                break;
+            }
+        }
+        if (*search) {
+            /* matches an entry in the wisdom list, order by position in the list */
+            entry->rank = search - received_wisdom;
+        } else {
+            /* no match, rank after all possible matches, retaining existing order
+             * (search stopped on the terminator, so search - received_wisdom is the list length) */
+            entry->rank = (search - received_wisdom) + (entry - starch_magnitude_sc16q11_registry);
+        }
+    }
+
+    /* re-sort based on the new ranking; entry now points at the sentinel, so
+     * entry - registry is the number of real entries */
+    qsort(starch_magnitude_sc16q11_registry, entry - starch_magnitude_sc16q11_registry, sizeof(starch_magnitude_sc16q11_regentry), starch_regentry_rank_compare);
+
+    /* reset the implementation pointer so the next call will re-select */
+    starch_magnitude_sc16q11 = starch_magnitude_sc16q11_dispatch;
+}
+
+starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[] = {
+    /*
+     * Candidate implementations, scanned front to back by
+     * starch_magnitude_sc16q11_select(), so array order is preference order
+     * until a wisdom update re-sorts it. One section per STARCH_MIX_* build mix.
+     * Initializer columns are presumably { rank, name, flavor, callable,
+     * flavor_supported } - TODO confirm against the regentry struct declaration.
+     * A NULL in the last column means no runtime CPU check is performed.
+     * The all-NULL row terminates the table (every loop tests entry->name).
+     */
+#ifdef STARCH_MIX_GENERIC
+    { 0, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL },
+    { 1, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL },
+    { 2, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL },
+    { 3, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL },
+#endif /* STARCH_MIX_GENERIC */
+
+#ifdef STARCH_MIX_ARM
+    { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL },
+    { 2, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 3, "exact_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 4, "11bit_table_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_11bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 5, "12bit_table_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 6, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL },
+    { 7, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL },
+    { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL },
+#endif /* STARCH_MIX_ARM */
+
+#ifdef STARCH_MIX_X86
+    { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_float_x86_avx2, cpu_supports_avx2 },
+    { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL },
+    { 2, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_u32_x86_avx2, cpu_supports_avx2 },
+    { 3, "11bit_table_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_11bit_table_x86_avx2, cpu_supports_avx2 },
+    { 4, "12bit_table_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_12bit_table_x86_avx2, cpu_supports_avx2 },
+    { 5, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL },
+    { 6, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL },
+    { 7, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL },
+#endif /* STARCH_MIX_X86 */
+    { 0, NULL, NULL, NULL, NULL }
+};
+
+/* dispatcher / registry for magnitude_sc16q11_aligned */
+
+/*
+ * Scan the registry from the top and hand back the first entry this CPU can
+ * run: an entry qualifies when it has no flavor_supported hook or when the
+ * hook returns non-zero. Yields NULL if the sentinel is reached first.
+ */
+starch_magnitude_sc16q11_aligned_regentry * starch_magnitude_sc16q11_aligned_select() {
+    starch_magnitude_sc16q11_aligned_regentry *candidate = starch_magnitude_sc16q11_aligned_registry;
+
+    while (candidate->name) {
+        if (!candidate->flavor_supported || candidate->flavor_supported())
+            return candidate;
+        ++candidate;
+    }
+    return NULL;
+}
+
+/*
+ * Trampoline that the public pointer targets until an implementation has been
+ * chosen. It binds the pointer to the selected implementation and then
+ * forwards the call through the pointer; abort()s if nothing is selectable.
+ */
+static void starch_magnitude_sc16q11_aligned_dispatch ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) {
+    starch_magnitude_sc16q11_aligned_regentry *chosen = starch_magnitude_sc16q11_aligned_select();
+
+    if (chosen == NULL)
+        abort();
+
+    starch_magnitude_sc16q11_aligned = chosen->callable;
+    (*starch_magnitude_sc16q11_aligned) ( arg0, arg1, arg2 );
+}
+
+starch_magnitude_sc16q11_aligned_ptr starch_magnitude_sc16q11_aligned = starch_magnitude_sc16q11_aligned_dispatch;
+
+/*
+ * Apply a NULL-terminated list of preferred implementation names.
+ *
+ * Listed entries are ranked by their index in received_wisdom; unlisted
+ * entries are ranked after all of them, keeping their current relative order.
+ * The registry is re-sorted by rank and the public pointer is sent back to the
+ * trampoline so that the next call picks again.
+ */
+void starch_magnitude_sc16q11_aligned_set_wisdom (const char * const * received_wisdom)
+{
+    starch_magnitude_sc16q11_aligned_regentry * const table = starch_magnitude_sc16q11_aligned_registry;
+    starch_magnitude_sc16q11_aligned_regentry *slot = table;
+
+    while (slot->name) {
+        const char * const *hit = received_wisdom;
+
+        /* stop on the matching name, or on the list terminator if there is none */
+        while (*hit && strcmp(*hit, slot->name) != 0)
+            ++hit;
+
+        /* on a miss, hit - received_wisdom is the list length; add the registry index */
+        slot->rank = *hit ? (hit - received_wisdom)
+                          : (hit - received_wisdom) + (slot - table);
+        ++slot;
+    }
+
+    /* slot has reached the sentinel, so slot - table counts the real entries */
+    qsort(table, slot - table, sizeof(*table), starch_regentry_rank_compare);
+
+    /* force re-selection on the next call */
+    starch_magnitude_sc16q11_aligned = starch_magnitude_sc16q11_aligned_dispatch;
+}
+
+starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_registry[] = {
+    /*
+     * Candidate implementations for magnitude_sc16q11_aligned, scanned front to
+     * back by starch_magnitude_sc16q11_aligned_select(), so array order is
+     * preference order until a wisdom update re-sorts it. Each non-generic mix
+     * lists the "_aligned" builds as well as the plain builds of the same
+     * flavor. Initializer columns are presumably { rank, name, flavor,
+     * callable, flavor_supported } - TODO confirm against the regentry struct
+     * declaration. A NULL in the last column means no runtime CPU check.
+     * The all-NULL row terminates the table (every loop tests entry->name).
+     */
+#ifdef STARCH_MIX_GENERIC
+    { 0, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL },
+    { 1, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL },
+    { 2, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL },
+    { 3, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL },
+#endif /* STARCH_MIX_GENERIC */
+
+#ifdef STARCH_MIX_ARM
+    { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL },
+    { 2, "exact_u32_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 3, "exact_float_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 4, "11bit_table_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_11bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 5, "12bit_table_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_12bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 6, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 7, "exact_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 8, "11bit_table_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_11bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 9, "12bit_table_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 10, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 11, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL },
+    { 12, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL },
+    { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL },
+#endif /* STARCH_MIX_ARM */
+
+#ifdef STARCH_MIX_X86
+    { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_exact_float_x86_avx2, cpu_supports_avx2 },
+    { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL },
+    { 2, "exact_u32_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_exact_u32_x86_avx2, cpu_supports_avx2 },
+    { 3, "11bit_table_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_11bit_table_x86_avx2, cpu_supports_avx2 },
+    { 4, "12bit_table_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_12bit_table_x86_avx2, cpu_supports_avx2 },
+    { 5, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_u32_x86_avx2, cpu_supports_avx2 },
+    { 6, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_float_x86_avx2, cpu_supports_avx2 },
+    { 7, "11bit_table_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_11bit_table_x86_avx2, cpu_supports_avx2 },
+    { 8, "12bit_table_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_12bit_table_x86_avx2, cpu_supports_avx2 },
+    { 9, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL },
+    { 10, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL },
+    { 11, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL },
+#endif /* STARCH_MIX_X86 */
+    { 0, NULL, NULL, NULL, NULL }
+};
+
+/* dispatcher / registry for
mean_power_u16 */
+/*
+ * Choose the implementation to use for mean_power_u16.
+ *
+ * Walks starch_mean_power_u16_registry in array order and returns the first
+ * entry that either has no flavor_supported hook (NULL) or whose hook returns
+ * non-zero. Returns NULL when the NULL-name sentinel is reached first.
+ */
+starch_mean_power_u16_regentry * starch_mean_power_u16_select() {
+    for (starch_mean_power_u16_regentry *entry = starch_mean_power_u16_registry;
+         entry->name;
+         ++entry)
+    {
+        if (entry->flavor_supported && !(entry->flavor_supported()))
+            continue; /* a support check exists and it failed: try the next entry */
+        return entry;
+    }
+    return NULL; /* hit the sentinel: nothing in the registry is usable */
+}
+/*
+ * Self-replacing trampoline: the starch_mean_power_u16 pointer targets this
+ * function initially and again after every wisdom update. It selects an
+ * implementation, stores its callable in the pointer, and forwards the current
+ * call through the updated pointer. Calls abort() when select() returns NULL.
+ */
+static void starch_mean_power_u16_dispatch ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ) {
+    starch_mean_power_u16_regentry *entry = starch_mean_power_u16_select();
+    if (!entry)
+        abort();
+
+    starch_mean_power_u16 = entry->callable;
+    starch_mean_power_u16 ( arg0, arg1, arg2, arg3 );
+}
+
+starch_mean_power_u16_ptr starch_mean_power_u16 = starch_mean_power_u16_dispatch; /* public entry point; starts at the trampoline */
+/*
+ * Re-order the mean_power_u16 registry from a caller-supplied preference list.
+ *
+ * received_wisdom: array of implementation names terminated by a NULL element;
+ *   the array pointer itself is dereferenced unconditionally, so it must not
+ *   be NULL.
+ *
+ * Listed entries are ranked by list index; all others get (list length +
+ * current registry index), i.e. after every listed entry in their current
+ * relative order. The registry is sorted with starch_regentry_rank_compare and
+ * the function pointer is pointed back at the dispatcher.
+ */
+void starch_mean_power_u16_set_wisdom (const char * const * received_wisdom)
+{
+    /* re-rank the registry based on received wisdom */
+    starch_mean_power_u16_regentry *entry;
+    for (entry = starch_mean_power_u16_registry; entry->name; ++entry) {
+        const char * const *search;
+        for (search = received_wisdom; *search; ++search) {
+            if (!strcmp(*search, entry->name)) {
+                break;
+            }
+        }
+        if (*search) {
+            /* matches an entry in the wisdom list, order by position in the list */
+            entry->rank = search - received_wisdom;
+        } else {
+            /* no match, rank after all possible matches, retaining existing order
+             * (search stopped on the terminator, so search - received_wisdom is the list length) */
+            entry->rank = (search - received_wisdom) + (entry - starch_mean_power_u16_registry);
+        }
+    }
+
+    /* re-sort based on the new ranking; entry now points at the sentinel */
+    qsort(starch_mean_power_u16_registry, entry - starch_mean_power_u16_registry, sizeof(starch_mean_power_u16_regentry), starch_regentry_rank_compare);
+
+    /* reset the implementation pointer so the next call will re-select */
+    starch_mean_power_u16 = starch_mean_power_u16_dispatch;
+}
+
+starch_mean_power_u16_regentry starch_mean_power_u16_registry[] = {
+    /*
+     * Candidate implementations, scanned front to back by
+     * starch_mean_power_u16_select(), so array order is preference order until
+     * a wisdom update re-sorts it. Initializer columns are presumably
+     * { rank, name, flavor, callable, flavor_supported } - TODO confirm against
+     * the regentry struct declaration. The all-NULL row terminates the table.
+     */
+#ifdef STARCH_MIX_GENERIC
+    { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL },
+    { 1, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL },
+    { 2, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL },
+#endif /* STARCH_MIX_GENERIC */
+
+#ifdef STARCH_MIX_ARM
+    { 0, "u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL },
+    { 2, "float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 3, "u64_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u64_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 4, "neon_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_neon_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 },
+    { 5, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL },
+    { 6, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL },
+#endif /* STARCH_MIX_ARM */
+
+#ifdef STARCH_MIX_X86
+    { 0, "u32_x86_avx2", "x86_avx2", starch_mean_power_u16_u32_x86_avx2, cpu_supports_avx2 },
+    { 1, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL },
+    { 2, "float_x86_avx2", "x86_avx2", starch_mean_power_u16_float_x86_avx2, cpu_supports_avx2 },
+    { 3, "u64_x86_avx2", "x86_avx2", starch_mean_power_u16_u64_x86_avx2, cpu_supports_avx2 },
+    { 4, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL },
+    { 5, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL },
+#endif /* STARCH_MIX_X86 */
+    { 0, NULL, NULL, NULL, NULL }
+};
+
+/* dispatcher / registry for mean_power_u16_aligned */
+/*
+ * Choose the implementation to use for mean_power_u16_aligned: the first entry
+ * of starch_mean_power_u16_aligned_registry, in array order, that has no
+ * flavor_supported hook or whose hook returns non-zero. Returns NULL when the
+ * NULL-name sentinel is reached first.
+ */
+starch_mean_power_u16_aligned_regentry * starch_mean_power_u16_aligned_select() {
+    for (starch_mean_power_u16_aligned_regentry *entry = starch_mean_power_u16_aligned_registry;
+         entry->name;
+         ++entry)
+    {
+        if (entry->flavor_supported && !(entry->flavor_supported()))
+            continue; /* a support check exists and it failed: try the next entry */
+        return entry;
+    }
+    return NULL; /* hit the sentinel: nothing in the registry is usable */
+}
+/*
+ * Self-replacing trampoline for the starch_mean_power_u16_aligned pointer:
+ * selects an implementation, stores its callable in the pointer, and forwards
+ * the current call through the updated pointer. Calls abort() when select()
+ * returns NULL.
+ */
+static void starch_mean_power_u16_aligned_dispatch ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ) {
+    starch_mean_power_u16_aligned_regentry *entry = starch_mean_power_u16_aligned_select();
+    if (!entry)
+        abort();
+
+    starch_mean_power_u16_aligned = entry->callable;
+    starch_mean_power_u16_aligned ( arg0, arg1, arg2, arg3 );
+}
+
+starch_mean_power_u16_aligned_ptr starch_mean_power_u16_aligned = starch_mean_power_u16_aligned_dispatch; /* public entry point; starts at the trampoline */
+/*
+ * Re-order the mean_power_u16_aligned registry from a caller-supplied,
+ * NULL-terminated list of implementation names (the list pointer itself must
+ * not be NULL). Listed entries are ranked by list index; all others are ranked
+ * after them in their current relative order. The registry is then sorted with
+ * starch_regentry_rank_compare and the function pointer is pointed back at the
+ * dispatcher.
+ */
+void starch_mean_power_u16_aligned_set_wisdom (const char * const * received_wisdom)
+{
+    /* re-rank the registry based on received wisdom */
+    starch_mean_power_u16_aligned_regentry *entry;
+    for (entry = starch_mean_power_u16_aligned_registry; entry->name; ++entry) {
+        const char * const *search;
+        for (search = received_wisdom; *search; ++search) {
+            if (!strcmp(*search, entry->name)) {
+                break;
+            }
+        }
+        if (*search) {
+            /* matches an entry in the wisdom list, order by position in the list */
+            entry->rank = search - received_wisdom;
+        } else {
+            /* no match, rank after all possible matches, retaining existing order
+             * (search stopped on the terminator, so search - received_wisdom is the list length) */
+            entry->rank = (search - received_wisdom) + (entry - starch_mean_power_u16_aligned_registry);
+        }
+    }
+
+    /* re-sort based on the new ranking; entry now points at the sentinel */
+    qsort(starch_mean_power_u16_aligned_registry, entry - starch_mean_power_u16_aligned_registry, sizeof(starch_mean_power_u16_aligned_regentry), starch_regentry_rank_compare);
+
+    /* reset the implementation pointer so the next call will re-select */
+    starch_mean_power_u16_aligned = starch_mean_power_u16_aligned_dispatch;
+}
+
+starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[] = {
+    /*
+     * Candidate implementations, scanned front to back by
+     * starch_mean_power_u16_aligned_select(), so array order is preference
+     * order until a wisdom update re-sorts it. Initializer columns are
+     * presumably { rank, name, flavor, callable, flavor_supported } - TODO
+     * confirm against the regentry struct declaration.
+     */
+#ifdef STARCH_MIX_GENERIC
+    { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL },
+    { 1, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL },
+    { 2, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL },
+#endif /* STARCH_MIX_GENERIC */
+
+#ifdef STARCH_MIX_ARM
+    { 0, "u32_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4",
starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "u64_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "neon_float_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "u64_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u64_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "neon_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_neon_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 10, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "u32_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_u32_x86_avx2, cpu_supports_avx2 }, + { 1, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 2, "float_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_float_x86_avx2, cpu_supports_avx2 }, + { 3, "u64_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_u64_x86_avx2, cpu_supports_avx2 }, + { 4, "float_x86_avx2", "x86_avx2", starch_mean_power_u16_float_x86_avx2, cpu_supports_avx2 }, + { 5, "u32_x86_avx2", "x86_avx2", starch_mean_power_u16_u32_x86_avx2, cpu_supports_avx2 }, + { 6, "u64_x86_avx2", 
"x86_avx2", starch_mean_power_u16_u64_x86_avx2, cpu_supports_avx2 }, + { 7, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 8, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + + +int starch_read_wisdom (const char * path) +{ + FILE *fp = fopen(path, "r"); + if (!fp) + return -1; + + /* reset all ranks to identify entries not listed in the wisdom file; we'll assign ranks at the end to produce a stable sort */ + int rank_magnitude_uc8 = 0; + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_uc8_aligned = 0; + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_power_uc8 = 0; + for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_power_uc8_aligned = 0; + for (starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_sc16 = 0; + for (starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_sc16_aligned = 0; + for (starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_sc16q11 = 0; + for (starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_sc16q11_aligned = 0; + for (starch_magnitude_sc16q11_aligned_regentry *entry = starch_magnitude_sc16q11_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_mean_power_u16 = 0; + for (starch_mean_power_u16_regentry *entry = 
starch_mean_power_u16_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_mean_power_u16_aligned = 0; + for (starch_mean_power_u16_aligned_regentry *entry = starch_mean_power_u16_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + + char linebuf[512]; + while (fgets(linebuf, sizeof(linebuf), fp)) { + /* split name and impl on whitespace, handle comments etc */ + char *name = linebuf; + while (*name && isspace(*name)) + ++name; + + if (!*name || *name == '#') + continue; + + char *end = name; + while (*end && !isspace(*end)) + ++end; + + if (!*end) + continue; + *end = 0; + + char *impl = end + 1; + while (*impl && isspace(*impl)) + ++impl; + + if (!*impl) + continue; + + end = impl; + while (*end && !isspace(*end)) + ++end; + + *end = 0; + + /* try to find a matching registry entry */ + if (!strcmp(name, "magnitude_uc8")) { + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_uc8; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_uc8_aligned")) { + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_uc8_aligned; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_power_uc8")) { + for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_power_uc8; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_power_uc8_aligned")) { + for (starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_power_uc8_aligned; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_sc16")) { + for 
(starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_sc16; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_sc16_aligned")) { + for (starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_sc16_aligned; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_sc16q11")) { + for (starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_sc16q11; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_sc16q11_aligned")) { + for (starch_magnitude_sc16q11_aligned_regentry *entry = starch_magnitude_sc16q11_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_sc16q11_aligned; + break; + } + } + continue; + } + if (!strcmp(name, "mean_power_u16")) { + for (starch_mean_power_u16_regentry *entry = starch_mean_power_u16_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_mean_power_u16; + break; + } + } + continue; + } + if (!strcmp(name, "mean_power_u16_aligned")) { + for (starch_mean_power_u16_aligned_regentry *entry = starch_mean_power_u16_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_mean_power_u16_aligned; + break; + } + } + continue; + } + } + + if (ferror(fp)) { + fclose(fp); + return -1; + } + + fclose(fp); + + /* assign ranks to unmatched items to (stable) sort them last; re-sort everything */ + { + starch_magnitude_uc8_regentry *entry; + for (entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_uc8; + } + qsort(starch_magnitude_uc8_registry, entry - 
starch_magnitude_uc8_registry, sizeof(starch_magnitude_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; + } + { + starch_magnitude_uc8_aligned_regentry *entry; + for (entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_uc8_aligned; + } + qsort(starch_magnitude_uc8_aligned_registry, entry - starch_magnitude_uc8_aligned_registry, sizeof(starch_magnitude_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; + } + { + starch_magnitude_power_uc8_regentry *entry; + for (entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_power_uc8; + } + qsort(starch_magnitude_power_uc8_registry, entry - starch_magnitude_power_uc8_registry, sizeof(starch_magnitude_power_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_power_uc8 = starch_magnitude_power_uc8_dispatch; + } + { + starch_magnitude_power_uc8_aligned_regentry *entry; + for (entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_power_uc8_aligned; + } + qsort(starch_magnitude_power_uc8_aligned_registry, entry - starch_magnitude_power_uc8_aligned_registry, sizeof(starch_magnitude_power_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_power_uc8_aligned = starch_magnitude_power_uc8_aligned_dispatch; + } + { + starch_magnitude_sc16_regentry *entry; + for (entry = starch_magnitude_sc16_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = 
++rank_magnitude_sc16; + } + qsort(starch_magnitude_sc16_registry, entry - starch_magnitude_sc16_registry, sizeof(starch_magnitude_sc16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16 = starch_magnitude_sc16_dispatch; + } + { + starch_magnitude_sc16_aligned_regentry *entry; + for (entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_sc16_aligned; + } + qsort(starch_magnitude_sc16_aligned_registry, entry - starch_magnitude_sc16_aligned_registry, sizeof(starch_magnitude_sc16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16_aligned = starch_magnitude_sc16_aligned_dispatch; + } + { + starch_magnitude_sc16q11_regentry *entry; + for (entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_sc16q11; + } + qsort(starch_magnitude_sc16q11_registry, entry - starch_magnitude_sc16q11_registry, sizeof(starch_magnitude_sc16q11_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16q11 = starch_magnitude_sc16q11_dispatch; + } + { + starch_magnitude_sc16q11_aligned_regentry *entry; + for (entry = starch_magnitude_sc16q11_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_sc16q11_aligned; + } + qsort(starch_magnitude_sc16q11_aligned_registry, entry - starch_magnitude_sc16q11_aligned_registry, sizeof(starch_magnitude_sc16q11_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16q11_aligned = starch_magnitude_sc16q11_aligned_dispatch; + } + { + starch_mean_power_u16_regentry *entry; + for (entry = starch_mean_power_u16_registry; entry->name; 
++entry) { + if (!entry->rank) + entry->rank = ++rank_mean_power_u16; + } + qsort(starch_mean_power_u16_registry, entry - starch_mean_power_u16_registry, sizeof(starch_mean_power_u16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_mean_power_u16 = starch_mean_power_u16_dispatch; + } + { + starch_mean_power_u16_aligned_regentry *entry; + for (entry = starch_mean_power_u16_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_mean_power_u16_aligned; + } + qsort(starch_mean_power_u16_aligned_registry, entry - starch_mean_power_u16_aligned_registry, sizeof(starch_mean_power_u16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_mean_power_u16_aligned = starch_mean_power_u16_aligned_dispatch; + } + + return 0; +} diff --git a/dsp/generated/flavor.armv7a_neon_vfpv4.c b/dsp/generated/flavor.armv7a_neon_vfpv4.c new file mode 100644 index 0000000..acb84e7 --- /dev/null +++ b/dsp/generated/flavor.armv7a_neon_vfpv4.c @@ -0,0 +1,41 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_ARMV7A_NEON_VFPV4 +#define STARCH_FEATURE_NEON + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv7a_neon_vfpv4 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_neon_vfpv4 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv7a_neon_vfpv4 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_neon_vfpv4 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + diff --git a/dsp/generated/flavor.generic.c b/dsp/generated/flavor.generic.c new file mode 100644 index 0000000..d869946 --- /dev/null +++ b/dsp/generated/flavor.generic.c @@ -0,0 +1,21 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_GENERIC + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## generic +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## generic +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + diff --git a/dsp/generated/flavor.x86_avx2.c b/dsp/generated/flavor.x86_avx2.c new file mode 100644 index 0000000..5b9f88e --- /dev/null +++ b/dsp/generated/flavor.x86_avx2.c @@ -0,0 +1,40 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_X86_AVX2 + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## x86_avx2 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_avx2 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## x86_avx2 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_avx2 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include 
"../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + diff --git a/dsp/generated/makefile.arm b/dsp/generated/makefile.arm new file mode 100644 index 0000000..58eaf5b --- /dev/null +++ b/dsp/generated/makefile.arm @@ -0,0 +1,39 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_ARM + + +dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math dsp/generated/flavor.armv7a_neon_vfpv4.c -o dsp/generated/flavor.armv7a_neon_vfpv4.o + +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c 
dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o + +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o + +STARCH_OBJS := dsp/generated/flavor.armv7a_neon_vfpv4.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o + + +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o + +STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.generic b/dsp/generated/makefile.generic new file mode 100644 index 0000000..7f261d9 --- /dev/null +++ b/dsp/generated/makefile.generic @@ -0,0 +1,36 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. 
The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_GENERIC + + +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o + +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o + +STARCH_OBJS := dsp/generated/flavor.generic.o dsp/generated/dispatcher.o + + +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o + +STARCH_BENCHMARK_OBJ := 
dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.x86 b/dsp/generated/makefile.x86 new file mode 100644 index 0000000..e88d3e1 --- /dev/null +++ b/dsp/generated/makefile.x86 @@ -0,0 +1,39 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_X86 + + +dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math dsp/generated/flavor.x86_avx2.c -o dsp/generated/flavor.x86_avx2.o + +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o + +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c 
dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o + +STARCH_OBJS := dsp/generated/flavor.x86_avx2.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o + + +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o + +STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/starch.h b/dsp/generated/starch.h new file mode 100644 index 0000000..063ac04 --- /dev/null +++ b/dsp/generated/starch.h @@ -0,0 +1,294 @@ + +/* starch generated code. Do not edit. */ + +#include "dsp-types.h" +#include "cpu.h" + +/* mixes */ + +/* Generic build, compiler defaults only */ +#ifdef STARCH_MIX_GENERIC +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 1 +#endif /* STARCH_MIX_GENERIC */ + +/* ARM */ +#ifdef STARCH_MIX_ARM +#define STARCH_FLAVOR_ARMV7A_NEON_VFPV4 +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 16 +#endif /* STARCH_MIX_ARM */ + +/* x64 */ +#ifdef STARCH_MIX_X86 +#define STARCH_FLAVOR_X86_AVX2 +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 32 +#endif /* STARCH_MIX_X86 */ + + +#ifdef STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_IS_ALIGNED(_ptr) (((uintptr_t)(_ptr) & (STARCH_MIX_ALIGNMENT-1)) == 0) +#else +/* mix not defined, alignment is unknown, treat everything as unaligned */ +#define STARCH_IS_ALIGNED(_ptr) (0) +#endif + + +/* entry points and registries */ + +typedef void (* starch_magnitude_uc8_ptr) ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_uc8_ptr starch_magnitude_uc8; + +typedef 
struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_uc8_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_uc8_regentry; + +extern starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[]; +starch_magnitude_uc8_regentry * starch_magnitude_uc8_select(); +void starch_magnitude_uc8_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_uc8_aligned_ptr) ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_uc8_aligned_ptr starch_magnitude_uc8_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_uc8_aligned_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_uc8_aligned_regentry; + +extern starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[]; +starch_magnitude_uc8_aligned_regentry * starch_magnitude_uc8_aligned_select(); +void starch_magnitude_uc8_aligned_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_power_uc8_ptr) ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +extern starch_magnitude_power_uc8_ptr starch_magnitude_power_uc8; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_power_uc8_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_power_uc8_regentry; + +extern starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[]; +starch_magnitude_power_uc8_regentry * starch_magnitude_power_uc8_select(); +void starch_magnitude_power_uc8_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_power_uc8_aligned_ptr) ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +extern starch_magnitude_power_uc8_aligned_ptr starch_magnitude_power_uc8_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_power_uc8_aligned_ptr callable; + int 
(*flavor_supported)(); +} starch_magnitude_power_uc8_aligned_regentry; + +extern starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_registry[]; +starch_magnitude_power_uc8_aligned_regentry * starch_magnitude_power_uc8_aligned_select(); +void starch_magnitude_power_uc8_aligned_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_sc16_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_sc16_ptr starch_magnitude_sc16; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_sc16_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_sc16_regentry; + +extern starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[]; +starch_magnitude_sc16_regentry * starch_magnitude_sc16_select(); +void starch_magnitude_sc16_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_sc16_aligned_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_sc16_aligned_ptr starch_magnitude_sc16_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_sc16_aligned_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_sc16_aligned_regentry; + +extern starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[]; +starch_magnitude_sc16_aligned_regentry * starch_magnitude_sc16_aligned_select(); +void starch_magnitude_sc16_aligned_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_sc16q11_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_sc16q11_ptr starch_magnitude_sc16q11; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_sc16q11_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_sc16q11_regentry; + +extern starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[]; 
+starch_magnitude_sc16q11_regentry * starch_magnitude_sc16q11_select(); +void starch_magnitude_sc16q11_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_sc16q11_aligned_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_sc16q11_aligned_ptr starch_magnitude_sc16q11_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_sc16q11_aligned_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_sc16q11_aligned_regentry; + +extern starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_registry[]; +starch_magnitude_sc16q11_aligned_regentry * starch_magnitude_sc16q11_aligned_select(); +void starch_magnitude_sc16q11_aligned_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_mean_power_u16_ptr) ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +extern starch_mean_power_u16_ptr starch_mean_power_u16; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_mean_power_u16_ptr callable; + int (*flavor_supported)(); +} starch_mean_power_u16_regentry; + +extern starch_mean_power_u16_regentry starch_mean_power_u16_registry[]; +starch_mean_power_u16_regentry * starch_mean_power_u16_select(); +void starch_mean_power_u16_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_mean_power_u16_aligned_ptr) ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +extern starch_mean_power_u16_aligned_ptr starch_mean_power_u16_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_mean_power_u16_aligned_ptr callable; + int (*flavor_supported)(); +} starch_mean_power_u16_aligned_regentry; + +extern starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[]; +starch_mean_power_u16_aligned_regentry * starch_mean_power_u16_aligned_select(); +void 
starch_mean_power_u16_aligned_set_wisdom( const char * const * received_wisdom ); + +/* flavors and prototypes */ + +#ifdef STARCH_FLAVOR_GENERIC +void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_power_uc8_twopass_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_11bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_12bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +#endif /* STARCH_FLAVOR_GENERIC */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 +int cpu_supports_armv7_neon_vfpv4 
(void); +void starch_mean_power_u16_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void 
starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_11bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_11bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void 
starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +#endif /* STARCH_FLAVOR_ARMV7A_NEON_VFPV4 */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_X86_AVX2 +int cpu_supports_avx2 (void); +void starch_mean_power_u16_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_x86_avx2 ( const 
uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_power_uc8_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); 
+void starch_magnitude_sc16q11_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +#endif /* STARCH_FLAVOR_X86_AVX2 */ + +int starch_read_wisdom (const char * path); + diff --git a/dsp/helpers/tables.c b/dsp/helpers/tables.c new file mode 100644 index 0000000..fe0bc8f --- /dev/null +++ b/dsp/helpers/tables.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include + +#include "dsp-types.h" +#include "dsp/helpers/tables.h" + +const uint16_t * get_uc8_mag_table() +{ + static uint16_t *table = NULL; + + if (!table) { + table = malloc(sizeof(uint16_t) * 256 * 256); + if (!table) { + fprintf(stderr, "can't allocate UC8 conversion lookup table\n"); + abort(); + } + + for (int i = 0; i <= 255; i++) { + for (int q = 0; q <= 255; q++) { + float fI, fQ, magsq; + + fI = (i - 127.4) / 128; + fQ = (q - 127.4) / 128; + magsq = fI * fI + fQ * fQ; + + float mag = round(sqrtf(magsq) * 65536.0f); + if (mag > 65535) + mag = 65535; + + uc8_u16_t u; + u.uc8.I = i; + u.uc8.Q = q; + table[u.u16] = mag; + } + } + } + + return table; +} + +const uint16_t * get_sc16q11_mag_11bit_table() +{ + static uint16_t *table = NULL; + + if (!table) { + table = malloc(sizeof(uint16_t) * 2048 * 2048); + if 
(!table) { + fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n"); + abort(); + } + + for (int i = 0; i <= 2047; i++) { + for (int q = 0; q <= 2047; q++) { + float fI, fQ, magsq; + + fI = i / 2048.0; + fQ = q / 2048.0; + magsq = fI * fI + fQ * fQ; + + float mag = round(sqrtf(magsq) * 65536.0f); + if (mag > 65535) + mag = 65535; + + table[(q << 11) | i] = mag; + } + } + } + + return table; +} + +const uint16_t * get_sc16q11_mag_12bit_table() +{ + static uint16_t *table = NULL; + + if (!table) { + table = malloc(sizeof(uint16_t) * 4096 * 4096); + if (!table) { + fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n"); + abort(); + } + + for (int i = -2048; i <= 2047; i++) { + for (int q = -2048; q <= 2047; q++) { + float fI, fQ, magsq; + + fI = fabs(i) / 2048.0; + fQ = fabs(q) / 2048.0; + magsq = fI * fI + fQ * fQ; + + float mag = round(sqrtf(magsq) * 65536.0f); + if (mag > 65535) + mag = 65535; + + unsigned index = ((i & 4095) << 12) | (q & 4095); + table[index] = mag; + } + } + } + + return table; +} + diff --git a/dsp/helpers/tables.h b/dsp/helpers/tables.h new file mode 100644 index 0000000..cfb86d3 --- /dev/null +++ b/dsp/helpers/tables.h @@ -0,0 +1,10 @@ +#ifndef DSP_TABLES_H +#define DSP_TABLES_H + +#include + +const uint16_t * get_uc8_mag_table(); +const uint16_t * get_sc16q11_mag_11bit_table(); +const uint16_t * get_sc16q11_mag_12bit_table(); + +#endif diff --git a/dsp/impl/magnitude_power_uc8.c b/dsp/impl/magnitude_power_uc8.c new file mode 100644 index 0000000..eca5988 --- /dev/null +++ b/dsp/impl/magnitude_power_uc8.c @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include + +#include "dsp/helpers/tables.h" + +/* Convert UC8 values to unsigned 16-bit magnitudes */ + +void STARCH_IMPL(magnitude_power_uc8, twopass) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power) +{ +#if STARCH_ALIGNMENT > 1 + starch_magnitude_uc8_aligned(in, out, len); + starch_mean_power_u16_aligned(out, 
len, out_level, out_power); +#else + starch_magnitude_uc8(in, out, len); + starch_mean_power_u16(out, len, out_level, out_power); +#endif +} + +void STARCH_IMPL(magnitude_power_uc8, lookup) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power) +{ + const uint16_t * const restrict mag_table = get_uc8_mag_table(); + + const uc8_u16_t * restrict in_align = (const uc8_u16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + uint64_t sum_level = 0; + uint64_t sum_power = 0; + + unsigned len1 = len; + while (len1--) { + uint16_t mag = mag_table[in_align[0].u16]; + out_align[0] = mag; + sum_level += mag; + sum_power += (uint32_t)mag * mag; + + out_align += 1; + in_align += 1; + } + + *out_level = sum_level / 65536.0 / len; + *out_power = sum_power / 65536.0 / 65536.0 / len; +} + +void STARCH_IMPL(magnitude_power_uc8, lookup_unroll_4) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power) +{ + const uint16_t * const restrict mag_table = get_uc8_mag_table(); + + const uc8_u16_t * restrict in_align = (const uc8_u16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + uint64_t sum_level = 0; + uint64_t sum_power = 0; + + unsigned len4 = len >> 2; + unsigned len1 = len & 3; + + while (len4--) { + uint16_t mag0 = mag_table[in_align[0].u16]; + uint16_t mag1 = mag_table[in_align[1].u16]; + uint16_t mag2 = mag_table[in_align[2].u16]; + uint16_t mag3 = mag_table[in_align[3].u16]; + + out_align[0] = mag0; + out_align[1] = mag1; + out_align[2] = mag2; + out_align[3] = mag3; + + sum_level = sum_level + mag0 + mag1 + mag2 + mag3; + sum_power = sum_power + (uint32_t)mag0 * mag0 + (uint32_t)mag1 * mag1 + (uint32_t)mag2 * mag2 + (uint32_t)mag3 * mag3; + + out_align += 4; + in_align += 4; + } + + while (len1--) { + uint16_t mag = mag_table[in_align[0].u16]; + + out_align[0] = mag; + + sum_level = sum_level + mag; + sum_power = sum_power + mag * mag; + + out_align += 1; 
+ in_align += 1; + } + + *out_level = sum_level / 65536.0 / len; + *out_power = sum_power / 65536.0 / 65536.0 / len; +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(magnitude_power_uc8, neon_vrsqrte, STARCH_FEATURE_NEON) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power) +{ + const uint8_t * restrict in_align = (const uint8_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + const uint16x8_t offset = vdupq_n_u16((uint16_t) (127.4 * 256)); + const float32x4_t almost_one = vdupq_n_f32(65535.0 / 65536.0); + + float32x4_t sum_level = vdupq_n_f32(0); + float32x4_t sum_power = vdupq_n_f32(0); + + unsigned len8 = len >> 3; + while (len8--) { + uint8x8x2_t iq = vld2_u8(in_align); + + // widen to 16 bits, convert to signed + uint16x8_t i_u16 = vshll_n_u8(iq.val[0], 8); + uint16x8_t q_u16 = vshll_n_u8(iq.val[1], 8); + int16x8_t i_s16 = vreinterpretq_s16_u16(vsubq_u16(i_u16, offset)); + int16x8_t q_s16 = vreinterpretq_s16_u16(vsubq_u16(q_u16, offset)); + + // low half + int16x4_t i_s16_low = vget_low_s16(i_s16); + int16x4_t q_s16_low = vget_low_s16(q_s16); + uint32x4_t isq_low = vreinterpretq_u32_s32(vmull_s16(i_s16_low, i_s16_low)); + uint32x4_t qsq_low = vreinterpretq_u32_s32(vmull_s16(q_s16_low, q_s16_low)); + uint32x4_t magsq_low = vqaddq_u32(isq_low, qsq_low); + float32x4_t magsq_f32_low = vcvtq_n_f32_u32(magsq_low, 30); /* input values are Q15, magsq is Q30 */ + float32x4_t mag_f32_low = vmulq_f32(magsq_f32_low, vrsqrteq_f32(magsq_f32_low)); + uint16x4_t mag_u16_low = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_low, 16)); + + sum_level = vaddq_f32(sum_level, vminq_f32(mag_f32_low, almost_one)); + sum_power = vaddq_f32(sum_power, vminq_f32(magsq_f32_low, almost_one)); + + // high half + int16x4_t i_s16_high = vget_high_s16(i_s16); + int16x4_t q_s16_high = vget_high_s16(q_s16); + uint32x4_t isq_high = vreinterpretq_u32_s32(vmull_s16(i_s16_high, i_s16_high)); + uint32x4_t qsq_high = 
vreinterpretq_u32_s32(vmull_s16(q_s16_high, q_s16_high)); + uint32x4_t magsq_high = vqaddq_u32(isq_high, qsq_high); + float32x4_t magsq_f32_high = vcvtq_n_f32_u32(magsq_high, 30); + float32x4_t mag_f32_high = vmulq_f32(magsq_f32_high, vrsqrteq_f32(magsq_f32_high)); + uint16x4_t mag_u16_high = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_high, 16)); + + sum_level = vaddq_f32(sum_level, vminq_f32(mag_f32_high, almost_one)); + sum_power = vaddq_f32(sum_power, vminq_f32(magsq_f32_high, almost_one)); + + // store + uint16x8_t result = vcombine_u16(mag_u16_low, mag_u16_high); + vst1q_u16(out_align, result); + + in_align += 16; + out_align += 8; + } + + const int16x8_t lane0_mask = { -1, 0, 0, 0, 0, 0, 0, 0 }; /* lane 0 keeps all 16 bits (a 0xFF mask cut the sample down to its low byte); NOTE(review): the zeroed lanes then compute 0 * vrsqrte(0) - confirm that cannot put NaN into the sums */ + + unsigned len1 = len & 7; + while (len1--) { + uint8x8x2_t iq = vld2_dup_u8(in_align); + + // widen to 16 bits, convert to signed + uint16x8_t i_u16 = vshll_n_u8(iq.val[0], 8); + uint16x8_t q_u16 = vshll_n_u8(iq.val[1], 8); + int16x8_t i_s16 = vreinterpretq_s16_u16(vsubq_u16(i_u16, offset)); + int16x8_t q_s16 = vreinterpretq_s16_u16(vsubq_u16(q_u16, offset)); + + // mask so only lane 0 has a non-zero value + // (important for sum_level / sum_power later) + i_s16 = vandq_s16(i_s16, lane0_mask); + q_s16 = vandq_s16(q_s16, lane0_mask); + + // low half (don't care about high half) + int16x4_t i_s16_low = vget_low_s16(i_s16); + int16x4_t q_s16_low = vget_low_s16(q_s16); + uint32x4_t isq_low = vreinterpretq_u32_s32(vmull_s16(i_s16_low, i_s16_low)); + uint32x4_t qsq_low = vreinterpretq_u32_s32(vmull_s16(q_s16_low, q_s16_low)); + uint32x4_t magsq_low = vqaddq_u32(isq_low, qsq_low); + float32x4_t magsq_f32_low = vcvtq_n_f32_u32(magsq_low, 30); /* input values are Q15, magsq is Q30 */ + float32x4_t mag_f32_low = vmulq_f32(magsq_f32_low, vrsqrteq_f32(magsq_f32_low)); + uint16x4_t mag_u16_low = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_low, 16)); + + sum_level = vaddq_f32(sum_level, vminq_f32(mag_f32_low, almost_one)); + sum_power = vaddq_f32(sum_power, vminq_f32(magsq_f32_low, 
almost_one)); + + // store 1 lane only + vst1_lane_u16(out_align, mag_u16_low, 0); + + in_align += 2; + out_align += 1; + } + + // add sums across vector + float32x2_t sum2_level = vadd_f32(vget_low_f32(sum_level), vget_high_f32(sum_level)); + float32x2_t sum4_level = vpadd_f32(sum2_level, sum2_level); + *out_level = vget_lane_f32(sum4_level, 0) / len; + + float32x2_t sum2_power = vadd_f32(vget_low_f32(sum_power), vget_high_f32(sum_power)); + float32x2_t sum4_power = vpadd_f32(sum2_power, sum2_power); + *out_power = vget_lane_f32(sum4_power, 0) / len; +} + +#endif diff --git a/dsp/impl/magnitude_sc16.c b/dsp/impl/magnitude_sc16.c new file mode 100644 index 0000000..1f45bde --- /dev/null +++ b/dsp/impl/magnitude_sc16.c @@ -0,0 +1,100 @@ +#include +#include + +/* Convert (little-endian) SC16 values to unsigned 16-bit magnitudes */ + +void STARCH_IMPL(magnitude_sc16, exact_u32) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const sc16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + uint32_t I = abs((int16_t) le16toh(in_align[0].I)); + uint32_t Q = abs((int16_t) le16toh(in_align[0].Q)); + + uint32_t magsq = I * I + Q * Q; + float mag = sqrtf(magsq) * 2; + if (mag > 65535.0) + mag = 65535.0; + out_align[0] = (uint16_t)mag; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_sc16, exact_float) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const sc16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + float I = abs((int16_t) le16toh(in_align[0].I)) * 2; + float Q = abs((int16_t) le16toh(in_align[0].Q)) * 2; + + float magsq = I * I + Q * Q; + float mag = sqrtf(magsq); + if (mag > 65535.0) + mag = 65535.0; + out_align[0] = (uint16_t)mag; + + out_align += 1; + in_align += 1; + } +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(magnitude_sc16, neon_vrsqrte, STARCH_FEATURE_NEON) 
(const sc16_t *in, uint16_t *out, unsigned len) +{ + const int16_t * restrict in_align = (const int16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + /* This uses NEON's floating-point reciprocal square root estimate (vrsqrte instruction). + * The estimate is accurate to about 9 bits of mantissa, which is good enough for our purposes. + */ + + unsigned len4 = len >> 2; + while (len4--) { + int16x4x2_t iq = vld2_s16(in_align); + int16x4_t i16 = iq.val[0]; /* Q15 */ + int16x4_t q16 = iq.val[1]; /* Q15 */ + + uint32x4_t isq = vreinterpretq_u32_s32(vmull_s16(i16, i16)); /* Q30, unsigned */ + uint32x4_t qsq = vreinterpretq_u32_s32(vmull_s16(q16, q16)); /* Q30, unsigned */ + uint32x4_t magsq = vqaddq_u32(isq, qsq); /* Q30, unsigned */ + + float32x4_t magsq_f32 = vcvtq_n_f32_u32(magsq, 30); + float32x4_t mag_f32 = vmulq_f32(magsq_f32, vrsqrteq_f32(magsq_f32)); /* sqrt(x) = x * (1/sqrt(x)) */ + uint16x4_t mag_u16 = vqmovn_u32(vcvtq_n_u32_f32(mag_f32, 16)); + + vst1_u16(out_align, mag_u16); + + in_align += 8; + out_align += 4; + } + + unsigned len1 = len & 3; + while (len1--) { + int16x4x2_t iq = vld2_dup_s16(in_align); + int16x4_t i16 = iq.val[0]; + int16x4_t q16 = iq.val[1]; + + uint32x4_t isq = vreinterpretq_u32_s32(vmull_s16(i16, i16)); + uint32x4_t qsq = vreinterpretq_u32_s32(vmull_s16(q16, q16)); + uint32x4_t magsq = vqaddq_u32(isq, qsq); + + float32x4_t magsq_f32 = vcvtq_n_f32_u32(magsq, 30); + float32x4_t mag_f32 = vmulq_f32(magsq_f32, vrsqrteq_f32(magsq_f32)); + uint16x4_t mag_u16 = vqmovn_u32(vcvtq_n_u32_f32(mag_f32, 16)); + + vst1_lane_u16(out_align, mag_u16, 0); + + in_align += 2; + out_align += 1; + } +} + +#endif /* STARCH_FEATURE_NEON */ diff --git a/dsp/impl/magnitude_sc16q11.c b/dsp/impl/magnitude_sc16q11.c new file mode 100644 index 0000000..bd9a0f5 --- /dev/null +++ b/dsp/impl/magnitude_sc16q11.c @@ -0,0 +1,137 @@ +#include +#include + +#include "dsp/helpers/tables.h" + +/* Convert (little-endian) SC16 values with a 
range of -2048..+2047 to unsigned 16-bit magnitudes */ + +void STARCH_IMPL(magnitude_sc16q11, exact_u32) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const sc16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + uint32_t I = abs((int16_t) le16toh(in_align[0].I)); + uint32_t Q = abs((int16_t) le16toh(in_align[0].Q)); + + uint32_t magsq = I * I + Q * Q; + float mag = sqrtf(magsq) * 32; + if (mag > 65535.0) + mag = 65535.0; + out_align[0] = (uint16_t)mag; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_sc16q11, exact_float) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const sc16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + float I = abs((int16_t) le16toh(in_align[0].I)) * 32; + float Q = abs((int16_t) le16toh(in_align[0].Q)) * 32; + + float magsq = I * I + Q * Q; + float mag = sqrtf(magsq); + if (mag > 65535.0) + mag = 65535.0; + out_align[0] = (uint16_t)mag; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_sc16q11, 11bit_table) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * restrict table = get_sc16q11_mag_11bit_table(); + const sc16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + uint16_t I = abs((int16_t)le16toh(in_align[0].I)); + if (I >= 2048) + I = 2047; + uint16_t Q = abs((int16_t)le16toh(in_align[0].Q)); + if (Q >= 2048) + Q = 2047; + out_align[0] = table[(Q << 11) | I]; + + in_align += 1; + out_align += 1; + } +} + +void STARCH_IMPL(magnitude_sc16q11, 12bit_table) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * restrict table = get_sc16q11_mag_12bit_table(); + const sc16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + unsigned index = ((in_align[0].I & 4095) << 
12) | (in_align[0].Q & 4095); + out_align[0] = table[index]; + + in_align += 1; + out_align += 1; + } +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(magnitude_sc16q11, neon_vrsqrte, STARCH_FEATURE_NEON) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const int16_t * restrict in_align = (const int16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + /* This uses NEON's floating-point reciprocal square root estimate instruction (vrsqrte). + * The estimate is accurate to about 9 bits of mantissa, which is good enough for our purposes. + */ + + unsigned len4 = len >> 2; + while (len4--) { + int16x4x2_t iq = vld2_s16(in_align); + int16x4_t i16 = iq.val[0]; /* Q11 */ + int16x4_t q16 = iq.val[1]; /* Q11 */ + + uint32x4_t isq = vreinterpretq_u32_s32(vmull_s16(i16, i16)); /* Q22, unsigned */ + uint32x4_t qsq = vreinterpretq_u32_s32(vmull_s16(q16, q16)); /* Q22, unsigned */ + uint32x4_t magsq = vqaddq_u32(isq, qsq); /* Q22, unsigned */ + + float32x4_t magsq_f32 = vcvtq_n_f32_u32(magsq, 22); + float32x4_t mag_f32 = vmulq_f32(magsq_f32, vrsqrteq_f32(magsq_f32)); /* sqrt(x) = x * (1/sqrt(x)) */ + uint16x4_t mag_u16 = vqmovn_u32(vcvtq_n_u32_f32(mag_f32, 16)); + + vst1_u16(out_align, mag_u16); + + in_align += 8; + out_align += 4; + } + + unsigned len1 = len & 3; + while (len1--) { + int16x4x2_t iq = vld2_dup_s16(in_align); + int16x4_t i16 = iq.val[0]; + int16x4_t q16 = iq.val[1]; + + uint32x4_t isq = vreinterpretq_u32_s32(vmull_s16(i16, i16)); + uint32x4_t qsq = vreinterpretq_u32_s32(vmull_s16(q16, q16)); + uint32x4_t magsq = vqaddq_u32(isq, qsq); + + float32x4_t magsq_f32 = vcvtq_n_f32_u32(magsq, 22); + float32x4_t mag_f32 = vmulq_f32(magsq_f32, vrsqrteq_f32(magsq_f32)); + uint16x4_t mag_u16 = vqmovn_u32(vcvtq_n_u32_f32(mag_f32, 16)); + + vst1_lane_u16(out_align, mag_u16, 0); + + in_align += 2; + out_align += 1; + } +} + +#endif /* STARCH_FEATURE_NEON */ diff --git a/dsp/impl/magnitude_uc8.c 
b/dsp/impl/magnitude_uc8.c new file mode 100644 index 0000000..71279c6 --- /dev/null +++ b/dsp/impl/magnitude_uc8.c @@ -0,0 +1,164 @@ +#include +#include +#include +#include +#include +#include + +#include "dsp/helpers/tables.h" + +/* Convert UC8 values to unsigned 16-bit magnitudes */ + +void STARCH_IMPL(magnitude_uc8, lookup) (const uc8_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * const mag_table = get_uc8_mag_table(); + + const uc8_u16_t * restrict in_align = (const uc8_u16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len1 = len; + while (len1--) { + uint16_t mag = mag_table[in_align[0].u16]; + out_align[0] = mag; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_uc8, lookup_unroll_4) (const uc8_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * const mag_table = get_uc8_mag_table(); + + const uc8_u16_t * restrict in_align = (const uc8_u16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len4 = len >> 2; + unsigned len1 = len & 3; + + while (len4--) { + uint16_t mag0 = mag_table[in_align[0].u16]; + uint16_t mag1 = mag_table[in_align[1].u16]; + uint16_t mag2 = mag_table[in_align[2].u16]; + uint16_t mag3 = mag_table[in_align[3].u16]; + + out_align[0] = mag0; + out_align[1] = mag1; + out_align[2] = mag2; + out_align[3] = mag3; + + out_align += 4; + in_align += 4; + } + + while (len1--) { + uint16_t mag = mag_table[in_align[0].u16]; + + out_align[0] = mag; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_uc8, exact) (const uc8_t *in, uint16_t *out, unsigned len) +{ + const uc8_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len1 = len; + + while (len1--) { + float I = (in_align[0].I - 127.4); + float Q = (in_align[0].Q - 127.4); + + float magsq = I * I + Q * Q; + float mag = sqrtf(magsq) * 65536.0 / 128.0; + if (mag > 65535.0) + mag = 65535.0; + 
+ out_align[0] = (uint16_t)mag; + + in_align += 1; + out_align += 1; + } +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(magnitude_uc8, neon_vrsqrte, STARCH_FEATURE_NEON) (const uc8_t *in, uint16_t *out, unsigned len) +{ + const uint8_t * restrict in_align = (const uint8_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + const uint16x8_t offset = vdupq_n_u16((uint16_t) (127.4 * 256)); + + unsigned len8 = len >> 3; + while (len8--) { + uint8x8x2_t iq = vld2_u8(in_align); + + // widen to 16 bits, convert to signed + uint16x8_t i_u16 = vshll_n_u8(iq.val[0], 8); + uint16x8_t q_u16 = vshll_n_u8(iq.val[1], 8); + int16x8_t i_s16 = vreinterpretq_s16_u16(vsubq_u16(i_u16, offset)); + int16x8_t q_s16 = vreinterpretq_s16_u16(vsubq_u16(q_u16, offset)); + + // low half + int16x4_t i_s16_low = vget_low_s16(i_s16); + int16x4_t q_s16_low = vget_low_s16(q_s16); + uint32x4_t isq_low = vreinterpretq_u32_s32(vmull_s16(i_s16_low, i_s16_low)); + uint32x4_t qsq_low = vreinterpretq_u32_s32(vmull_s16(q_s16_low, q_s16_low)); + uint32x4_t magsq_low = vqaddq_u32(isq_low, qsq_low); + float32x4_t magsq_f32_low = vcvtq_n_f32_u32(magsq_low, 30); /* input values are Q15, magsq is Q30 */ + float32x4_t mag_f32_low = vmulq_f32(vrsqrteq_f32(magsq_f32_low), magsq_f32_low); /* sqrt(x) = x * (1/sqrt(x)) */ + uint16x4_t mag_u16_low = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_low, 16)); + + // high half + int16x4_t i_s16_high = vget_high_s16(i_s16); + int16x4_t q_s16_high = vget_high_s16(q_s16); + uint32x4_t isq_high = vreinterpretq_u32_s32(vmull_s16(i_s16_high, i_s16_high)); + uint32x4_t qsq_high = vreinterpretq_u32_s32(vmull_s16(q_s16_high, q_s16_high)); + uint32x4_t magsq_high = vqaddq_u32(isq_high, qsq_high); + float32x4_t magsq_f32_high = vcvtq_n_f32_u32(magsq_high, 30); + float32x4_t mag_f32_high = vmulq_f32(vrsqrteq_f32(magsq_f32_high), magsq_f32_high); + uint16x4_t mag_u16_high = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_high, 16)); + + // store + 
uint16x8_t result = vcombine_u16(mag_u16_low, mag_u16_high); + vst1q_u16(out_align, result); + + in_align += 16; + out_align += 8; + } + + unsigned len1 = len & 7; + while (len1--) { + uint8x8x2_t iq = vld2_dup_u8(in_align); + + // widen to 16 bits, convert to signed + uint16x8_t i_u16 = vshll_n_u8(iq.val[0], 8); + uint16x8_t q_u16 = vshll_n_u8(iq.val[1], 8); + int16x8_t i_s16 = vreinterpretq_s16_u16(vsubq_u16(i_u16, offset)); + int16x8_t q_s16 = vreinterpretq_s16_u16(vsubq_u16(q_u16, offset)); + + // low half (don't care about high half) + int16x4_t i_s16_low = vget_low_s16(i_s16); + int16x4_t q_s16_low = vget_low_s16(q_s16); + uint32x4_t isq_low = vreinterpretq_u32_s32(vmull_s16(i_s16_low, i_s16_low)); + uint32x4_t qsq_low = vreinterpretq_u32_s32(vmull_s16(q_s16_low, q_s16_low)); + uint32x4_t magsq_low = vqaddq_u32(isq_low, qsq_low); + float32x4_t magsq_f32_low = vcvtq_n_f32_u32(magsq_low, 30); /* input values are Q15, magsq is Q30 */ + float32x4_t mag_f32_low = vmulq_f32(vrsqrteq_f32(magsq_f32_low), magsq_f32_low); /* sqrt(x) = x * (1/sqrt(x)) */ + uint16x4_t mag_u16_low = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_low, 16)); + + // store 1 lane only + vst1_lane_u16(out_align, mag_u16_low, 0); + + in_align += 2; + out_align += 1; + } +} + +#endif /* STARCH_FEATURE_NEON */ diff --git a/dsp/impl/mean_power_u16.c b/dsp/impl/mean_power_u16.c new file mode 100644 index 0000000..b236baa --- /dev/null +++ b/dsp/impl/mean_power_u16.c @@ -0,0 +1,122 @@ +/* + * Given a buffer of uint16_t Q16 magnitude values + * return the mean magnitude and mean squared magnitude + * (normalized to 0..1) + */ + +void STARCH_IMPL(mean_power_u16, float) (const uint16_t *in, unsigned len, double *out_mean_mag, double *out_mean_magsq) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + float sum = 0, sumsq = 0; + unsigned n = len; + while (n--) { + uint16_t mag = in_align[0]; + sum += mag; + sumsq += (uint32_t)mag * mag; + in_align += 1; + } + + *out_mean_mag = sum / len / 65536.0; + 
*out_mean_magsq = sumsq / len / 65536.0 / 65536.0; +} + +void STARCH_IMPL(mean_power_u16, u32) (const uint16_t *in, unsigned len, double *out_mean_mag, double *out_mean_magsq) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + double sum = 0, sumsq = 0; + + unsigned remaining = len; + while (remaining > 0) { + uint32_t sum32 = 0, sumsq32 = 0; + unsigned blocklen = (remaining > 65536 ? 65536 : remaining); + remaining -= blocklen; + + while (blocklen--) { + uint16_t mag = in_align[0]; + sum32 += mag; + sumsq32 += ((uint32_t)mag * mag) >> 16; + in_align += 1; + } + + sum += sum32; + sumsq += sumsq32; + } + + *out_mean_mag = (double)sum / len / 65536.0; + *out_mean_magsq = (double)sumsq / len / 65536.0; +} + +void STARCH_IMPL(mean_power_u16, u64) (const uint16_t *in, unsigned len, double *out_mean_mag, double *out_mean_magsq) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + uint64_t sum = 0, sumsq = 0; + unsigned n = len; + while (n--) { + uint16_t mag = in_align[0]; + sum += mag; + sumsq += (uint32_t)mag * mag; + in_align += 1; + } + + *out_mean_mag = (double)sum / len / 65536.0; + *out_mean_magsq = (double)sumsq / len / 65536.0 / 65536.0; +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(mean_power_u16, neon_float, STARCH_FEATURE_NEON) (const uint16_t *in, unsigned len, double *out_mean_mag, double *out_mean_magsq) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + float32x4_t mag_sum_0 = vdupq_n_f32(0); + float32x4_t magsq_sum_0 = vdupq_n_f32(0); + float32x4_t mag_sum_1 = vdupq_n_f32(0); + float32x4_t magsq_sum_1 = vdupq_n_f32(0); + + unsigned len8 = len >> 3; + while (len8--) { + uint16x8_t mag_u16 = vld1q_u16(in_align); + uint16x4_t mag_u16_0 = vget_low_u16(mag_u16); + uint16x4_t mag_u16_1 = vget_high_u16(mag_u16); + + float32x4_t mag_float32_0 = vcvtq_n_f32_u32(vmovl_u16(mag_u16_0), 16); + float32x4_t mag_float32_1 = vcvtq_n_f32_u32(vmovl_u16(mag_u16_1), 16); + + mag_sum_0 = 
vaddq_f32(mag_sum_0, mag_float32_0); + mag_sum_1 = vaddq_f32(mag_sum_1, mag_float32_1); + + magsq_sum_0 = vfmaq_f32(magsq_sum_0, mag_float32_0, mag_float32_0); + magsq_sum_1 = vfmaq_f32(magsq_sum_1, mag_float32_1, mag_float32_1); + + in_align += 8; + } + + // reduce sums to lane 0 + float32x4_t mag_sum_q = vaddq_f32(mag_sum_0, mag_sum_1); + float32x2_t mag_sum = vadd_f32(vget_low_f32(mag_sum_q), vget_high_f32(mag_sum_q)); + mag_sum = vpadd_f32(mag_sum, mag_sum); + + float32x4_t magsq_sum_q = vaddq_f32(magsq_sum_0, magsq_sum_1); + float32x2_t magsq_sum = vadd_f32(vget_low_f32(magsq_sum_q), vget_high_f32(magsq_sum_q)); + magsq_sum = vpadd_f32(magsq_sum, magsq_sum); + + unsigned len1 = len & 7; + while (len1--) { + uint16x4_t mag_u16 = vld1_dup_u16(in_align); + // we process both lanes here, but lane 1's sums are ignored + float32x2_t mag_float32 = vcvt_n_f32_u32(vget_low_u32(vmovl_u16(mag_u16)), 16); + mag_sum = vadd_f32(mag_sum, mag_float32); + magsq_sum = vfma_f32(magsq_sum, mag_float32, mag_float32); + in_align += 1; + } + + *out_mean_mag = vget_lane_f32(mag_sum, 0) / len; + *out_mean_magsq = vget_lane_f32(magsq_sum, 0) / len; +} + +#endif /* STARCH_FEATURE_NEON */ diff --git a/dsp/starchgen.py b/dsp/starchgen.py new file mode 100755 index 0000000..ae963c8 --- /dev/null +++ b/dsp/starchgen.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +import os +import sys +import glob + +top_dir = sys.argv[1] +starch_dir = os.path.join(top_dir, 'starch') +sys.path.append(starch_dir) +import starch + +gen = starch.Generator(runtime_dir = top_dir, + output_dir = os.path.join(top_dir, 'dsp', 'generated')) + +gen.add_include('"dsp-types.h"') +gen.add_include('"cpu.h"') + +gen.add_function(name = 'magnitude_uc8', argtypes = ['const uc8_t *', 'uint16_t *', 'unsigned'], aligned = True) +gen.add_function(name = 'magnitude_power_uc8', argtypes = ['const uc8_t *', 'uint16_t *', 'unsigned', 'double *', 'double *'], aligned = True) +gen.add_function(name = 'magnitude_sc16', argtypes = 
['const sc16_t *', 'uint16_t *', 'unsigned'], aligned = True) +gen.add_function(name = 'magnitude_sc16q11', argtypes = ['const sc16_t *', 'uint16_t *', 'unsigned'], aligned = True) +gen.add_function(name = 'mean_power_u16', argtypes = ['const uint16_t *', 'unsigned', 'double *', 'double *'], aligned = True) + +gen.add_feature(name='neon', description='ARM NEON') + +gen.add_flavor(name = 'generic', + description = 'Generic build, default compiler options', + compile_flags = []) +gen.add_flavor(name = 'armv7a_neon_vfpv4', + description = 'ARMv7-A, NEON, VFPv4', + compile_flags = ['-march=armv7-a+neon-vfpv4', '-mfpu=neon-vfpv4', '-ffast-math'], + features = ['neon'], + test_function = 'cpu_supports_armv7_neon_vfpv4', + alignment = 16) +gen.add_flavor(name = 'x86_avx2', + description = 'x86 with AVX2', + compile_flags = ['-mavx2', '-ffast-math'], + test_function = 'cpu_supports_avx2', + alignment = 32) + +gen.add_mix(name = 'generic', + description = 'Generic build, compiler defaults only', + flavors = ['generic'], + wisdom_file = 'wisdom.generic') + +gen.add_mix(name = 'arm', + description = 'ARM', + flavors = ['armv7a_neon_vfpv4', 'generic'], + wisdom_file = 'wisdom.arm') + +gen.add_mix(name = 'x86', + description = 'x64', + flavors = ['x86_avx2', 'generic'], + wisdom_file = 'wisdom.x86') + +for pattern in ['dsp/impl/*.c', 'dsp/benchmark/*.c']: + for c_file in glob.glob(pattern): + gen.scan_file(c_file) + +gen.generate() diff --git a/dump1090.c b/dump1090.c index b15d45a..07c7042 100644 --- a/dump1090.c +++ b/dump1090.c @@ -48,6 +48,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dump1090.h" +#include "cpu.h" #include @@ -257,17 +258,36 @@ static void showVersion() #endif #ifdef ENABLE_LIMESDR "ENABLE_LIMESDR " -#endif -#ifdef SC16Q11_TABLE_BITS - // This is a little silly, but that's how the preprocessor works.. 
-#define _stringize(x) #x -#define stringize(x) _stringize(x) - "SC16Q11_TABLE_BITS=" stringize(SC16Q11_TABLE_BITS) -#undef stringize -#undef _stringize #endif ); printf("-----------------------------------------------------------------------------\n"); +} + +static void showDSP() +{ + printf(" detected runtime CPU features: "); + if (cpu_supports_avx()) + printf("AVX "); + if (cpu_supports_avx2()) + printf("AVX2 "); + if (cpu_supports_armv7_neon_vfpv4()) + printf("ARMv7+NEON+VFPv4 "); + printf("\n"); + + printf(" selected DSP implementations: \n"); +#define SHOW(x) do { \ + printf(" %-40s %s\n", #x , starch_ ## x ## _select()->name); \ + printf(" %-40s %s\n", #x "_aligned", starch_ ## x ## _aligned_select()->name); \ + } while(0) + + SHOW(magnitude_uc8); + SHOW(magnitude_power_uc8); + SHOW(magnitude_sc16); + SHOW(magnitude_sc16q11); + SHOW(mean_power_u16); + +#undef SHOW + printf("\n"); } @@ -327,8 +347,11 @@ static void showHelp(void) "--write-json Periodically write json output to (for serving by a separate webserver)\n" "--write-json-every Write json output every t seconds (default 1)\n" "--json-location-accuracy Accuracy of receiver location in json metadata: 0=no location, 1=approximate, 2=exact\n" +#if 0 "--dcfilter Apply a 1Hz DC filter to input data (requires more CPU)\n" -"--version Show version and build options\n" +#endif +"--wisdom Read DSP wisdom from given path\n" +"--version Show version, build and DSP options\n" "--help Show this help\n" ); } @@ -488,7 +511,11 @@ int main(int argc, char **argv) { } else if (!strcmp(argv[j],"--gain") && more) { Modes.gain = (int) (atof(argv[++j])*10); // Gain is in tens of DBs } else if (!strcmp(argv[j],"--dcfilter")) { +#if 0 Modes.dc_filter = 1; +#else + fprintf(stderr, "--dcfilter option ignored (please raise an issue on github if you have a usecase that needs this)\n"); +#endif } else if (!strcmp(argv[j],"--measure-noise")) { // Ignored } else if (!strcmp(argv[j],"--fix")) { @@ -612,6 +639,7 @@ int main(int 
argc, char **argv) { exit(0); } else if (!strcmp(argv[j],"--version")) { showVersion(); + showDSP(); exit(0); } else if (!strcmp(argv[j],"--quiet")) { Modes.quiet = 1; @@ -629,6 +657,12 @@ int main(int argc, char **argv) { Modes.json_interval = 100; } else if (!strcmp(argv[j], "--json-location-accuracy") && more) { Modes.json_location_accuracy = atoi(argv[++j]); + } else if (!strcmp(argv[j], "--wisdom") && more) { + if (starch_read_wisdom (argv[++j]) < 0) { + fprintf(stderr, + "Failed to read wisdom file %s: %s\n", argv[j], strerror(errno)); + exit(1); + } } else if (sdrHandleOption(argc, argv, &j)) { /* handled */ } else { diff --git a/dump1090.h b/dump1090.h index 3068ea0..c00c513 100644 --- a/dump1090.h +++ b/dump1090.h @@ -81,6 +81,7 @@ #include #include "compat/compat.h" +#include "dsp/generated/starch.h" // ============================= #defines =============================== diff --git a/oneoff/convert_benchmark.c b/oneoff/convert_benchmark.c index 6778cde..7ee2d0a 100644 --- a/oneoff/convert_benchmark.c +++ b/oneoff/convert_benchmark.c @@ -102,8 +102,12 @@ static void test(const char *what, input_format_t format, void **data, double sa struct timespec total = { 0, 0 }; int iterations = 0; + double level, power; + // Run it once to force init. 
- converter(data[0], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL); + for (int i = 0; i < 10; ++i) { + converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, &level, &power); + } while (total.tv_sec < 5) { fprintf(stderr, "."); @@ -112,7 +116,7 @@ static void test(const char *what, input_format_t format, void **data, double sa start_cpu_timing(&start); for (int i = 0; i < 10; ++i) { - converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL); + converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, &level, &power); } end_cpu_timing(&start, &total); @@ -135,6 +139,9 @@ int main(int argc, char **argv) MODES_NOTUSED(argc); MODES_NOTUSED(argv); + if (argc > 1) + starch_read_wisdom(argv[1]); + prepare(); test("SC16Q11, DC", INPUT_SC16Q11, testdata_sc16q11, 2400000, true); diff --git a/oneoff/dsp_error_measurement.c b/oneoff/dsp_error_measurement.c new file mode 100644 index 0000000..c46870c --- /dev/null +++ b/oneoff/dsp_error_measurement.c @@ -0,0 +1,229 @@ +/* measures actual vs expected magnitude values for various magnitude_* + * implementations + */ + +#include +#include +#include +#include +#include + +#include "dsp-types.h" +#include "dsp/generated/starch.h" + +static void write_results_uc8(const uc8_t *in, uint16_t *out, unsigned len, char *path) +{ + FILE *fp = fopen(path, "w"); + if (!fp) { + fprintf(stderr, "fopen(%s): %s\n", path, strerror(errno)); + return; + } + + while (len--) { /* one row per entry: --len dropped the last entry and wrapped around when len was 0 */ + float I = (in[0].I - 127.4) / 128; + float Q = (in[0].Q - 127.4) / 128; + + float phase = atan2(Q, I) * 180.0 / M_PI; + float expected = round(sqrtf(I * I + Q * Q) * 65536); + if (expected > 65535) + expected = 65535; + fprintf(fp, "%u %u %.3f %.0f %u\n", in[0].I, in[0].Q, phase, expected, out[0]); + + ++in; + ++out; + } + + fclose(fp); + fprintf(stderr, "wrote %s\n", path); +} + +static void write_results_sc16(const sc16_t *in, uint16_t *out, unsigned len, char *path) +{ + FILE *fp = fopen(path, "w"); + if (!fp) { + fprintf(stderr, "fopen(%s): %s\n", 
path, strerror(errno)); + return; + } + + while (len--) { /* one row per entry: --len dropped the last entry and wrapped around when len was 0 */ + float I = in[0].I / 32768.0; + float Q = in[0].Q / 32768.0; + + float phase = atan2(Q, I) * 180.0 / M_PI; + float expected = round(sqrtf(I * I + Q * Q) * 65536); + if (expected > 65535) + expected = 65535; + fprintf(fp, "%d %d %.3f %.0f %u\n", in[0].I, in[0].Q, phase, expected, out[0]); + + ++in; + ++out; + } + + fclose(fp); + fprintf(stderr, "wrote %s\n", path); +} + +static void write_results_sc16q11(const sc16_t *in, uint16_t *out, unsigned len, char *path) +{ + FILE *fp = fopen(path, "w"); + if (!fp) { + fprintf(stderr, "fopen(%s): %s\n", path, strerror(errno)); + return; + } + + while (len--) { /* one row per entry: --len dropped the last entry and wrapped around when len was 0 */ + float I = in[0].I / 2048.0; + float Q = in[0].Q / 2048.0; + + float phase = atan2(Q, I) * 180.0 / M_PI; + float expected = round(sqrtf(I * I + Q * Q) * 65536); + if (expected > 65535) + expected = 65535; + fprintf(fp, "%d %d %.3f %.0f %u\n", in[0].I, in[0].Q, phase, expected, out[0]); + + ++in; + ++out; + } + + fclose(fp); + fprintf(stderr, "wrote %s\n", path); +} + +static void process_uc8() +{ + const float mag_min = 0.05; + const float mag_max = 0.95; + + const unsigned mag_steps = 5; + const unsigned phase_steps = 256; + + const unsigned len = mag_steps * phase_steps; + + uc8_t *in = malloc(len * sizeof(*in)); + uint16_t *out = malloc(len * sizeof(*out)); + uc8_t *fill = in; + + for (unsigned mag_step = 0; mag_step < mag_steps; ++mag_step) { + float mag = mag_min + mag_step * (mag_max - mag_min) / (mag_steps - 1); + for (unsigned phase_step = 0; phase_step < phase_steps; ++phase_step) { + float phase = 360.0 * phase_step / phase_steps; + fill->I = 128 * mag * cos(phase * M_PI / 180.0) + 127.4; + fill->Q = 128 * mag * sin(phase * M_PI / 180.0) + 127.4; + + if (fill == in || fill[-1].I != fill[0].I || fill[-1].Q != fill[0].Q) + ++fill; + } + } + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_uc8_exact_generic(in, out, len); + write_results_uc8(in, out, fill - in, "uc8-exact.tsv"); +#endif + +#ifdef 
STARCH_FLAVOR_GENERIC + starch_magnitude_uc8_lookup_generic(in, out, len); + write_results_uc8(in, out, fill - in, "uc8-lookup.tsv"); +#endif + +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 + starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4(in, out, len); + write_results_uc8(in, out, fill - in, "uc8-neon-vrsqrte.tsv"); +#endif +} + +static void process_sc16() +{ + const float mag_min = 0.05; + const float mag_max = 0.95; + + const unsigned mag_steps = 5; + const unsigned phase_steps = 65536; + + const unsigned len = mag_steps * phase_steps; + + sc16_t *in = malloc(len * sizeof(*in)); + uint16_t *out = malloc(len * sizeof(*out)); + sc16_t *fill = in; + + for (unsigned mag_step = 0; mag_step < mag_steps; ++mag_step) { + float mag = mag_min + mag_step * (mag_max - mag_min) / (mag_steps - 1); + for (unsigned phase_step = 0; phase_step < phase_steps; ++phase_step) { + float phase = 360.0 * phase_step / phase_steps; + fill->I = 32768.0f * mag * cos(phase * M_PI / 180.0); + fill->Q = 32768.0f * mag * sin(phase * M_PI / 180.0); + + if (fill == in || fill[-1].I != fill[0].I || fill[-1].Q != fill[0].Q) + ++fill; + } + } + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_sc16_exact_generic(in, out, len); + write_results_sc16(in, out, fill - in, "sc16-exact.tsv"); +#endif + +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 + starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4(in, out, len); + write_results_sc16(in, out, fill - in, "sc16-neon-vrsqrte.tsv"); +#endif +} + +static void process_sc16q11() +{ + const float mag_min = 0.05; + const float mag_max = 0.95; + + const unsigned mag_steps = 5; + const unsigned phase_steps = 2048; + + const unsigned len = mag_steps * phase_steps; + + sc16_t *in = malloc(len * sizeof(*in)); + uint16_t *out = malloc(len * sizeof(*out)); + sc16_t *fill = in; + + for (unsigned mag_step = 0; mag_step < mag_steps; ++mag_step) { + float mag = mag_min + mag_step * (mag_max - mag_min) / (mag_steps - 1); + for (unsigned phase_step = 0; phase_step < phase_steps; 
++phase_step) { + float phase = 360.0 * phase_step / phase_steps; + fill->I = 2048.0f * mag * cos(phase * M_PI / 180.0); + fill->Q = 2048.0f * mag * sin(phase * M_PI / 180.0); + if (fill == in || fill[-1].I != fill[0].I || fill[-1].Q != fill[0].Q) + ++fill; + } + } + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_sc16q11_exact_generic(in, out, len); + write_results_sc16q11(in, out, fill - in, "sc16q11-exact.tsv"); +#endif + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_sc16q11_11bit_table_generic(in, out, len); + write_results_sc16q11(in, out, fill - in, "sc16q11-11bit-table.tsv"); /* distinct file per table size: both used to write sc16q11-lookup.tsv, so the 12-bit run overwrote this one */ +#endif + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_sc16q11_12bit_table_generic(in, out, len); + write_results_sc16q11(in, out, fill - in, "sc16q11-12bit-table.tsv"); +#endif + +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 + starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4(in, out, len); + write_results_sc16q11(in, out, fill - in, "sc16q11-neon-vrsqrte.tsv"); +#endif +} + +int main(int argc, char **argv) +{ + (void) argc; + (void) argv; + + process_uc8(); + process_sc16(); + process_sc16q11(); + + return 0; +} + + diff --git a/oneoff/uc8_capture_stats.c b/oneoff/uc8_capture_stats.c new file mode 100644 index 0000000..340d75b --- /dev/null +++ b/oneoff/uc8_capture_stats.c @@ -0,0 +1,106 @@ +/* measures min, max, mean I and Q values in a UC8-format capture */ + +#include +#include + +#include +#include +#include +#include + +#include + +#include "dsp-types.h" + +int main(int argc, char **argv) +{ + if (argc < 2) { + fprintf(stderr, "need a capture filename\n"); + return 1; + } + + const unsigned len = 1<<24; + uc8_t *buffer = malloc(len * sizeof(uc8_t)); + + int fd = open(argv[1], O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + unsigned all_samples = 0; + double all_I = 0, all_Q = 0; + + for (;;) { + ssize_t count = read(fd, buffer, len * sizeof(uc8_t)); + if (count < 0) { + perror("read"); + close(fd); + return 1; + } + + if (count <= 0) + break; + + unsigned 
actual_len = count / sizeof(uc8_t); + + int min_I = INT_MAX, max_I = INT_MIN; + unsigned min_I_count = 0, max_I_count = 0; + int min_Q = INT_MAX, max_Q = INT_MIN; + unsigned min_Q_count = 0, max_Q_count = 0; + double sum_I = 0, sum_Q = 0; + + for (unsigned i = 0; i < actual_len; ++i) { + int I = buffer[i].I; + int Q = buffer[i].Q; + + if (I < min_I) { + min_I = I; + min_I_count = 0; + } + if (I == min_I) { + ++min_I_count; + } + + if (Q < min_Q) { + min_Q = Q; + min_Q_count = 0; + } + if (Q == min_Q) { + ++min_Q_count; + } + + if (I > max_I) { + max_I = I; + max_I_count = 0; + } + if (I == max_I) { + ++max_I_count; + } + + if (Q > max_Q) { + max_Q = Q; + max_Q_count = 0; + } + if (Q == max_Q) { + ++max_Q_count; + } + + sum_I += I; + sum_Q += Q; + } + + all_I += sum_I; + all_Q += sum_Q; + all_samples += actual_len; + + fprintf(stderr, + "%u samples; I: min %4d (%5u); max %4d (%5u); mean %7.2f; overall mean %7.2f; Q: min %4d (%5u); max %4d (%5u); mean %7.2f; overall mean %7.2f\n", + actual_len, + min_I, min_I_count, max_I, max_I_count, sum_I / actual_len, all_I / all_samples, + min_Q, min_Q_count, max_Q, max_Q_count, sum_Q / actual_len, all_Q / all_samples); + } + + close(fd); + return 0; +} diff --git a/starch/.gitignore b/starch/.gitignore new file mode 100644 index 0000000..9f7e9fd --- /dev/null +++ b/starch/.gitignore @@ -0,0 +1,6 @@ +__pycache__ +example/mako/ +example/starch-benchmark +*.o +*~ +.mypy_cache diff --git a/starch/LICENSE b/starch/LICENSE new file mode 100644 index 0000000..0443d73 --- /dev/null +++ b/starch/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) 2020, FlightAware LLC. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/starch/Makefile b/starch/Makefile new file mode 100644 index 0000000..155a675 --- /dev/null +++ b/starch/Makefile @@ -0,0 +1,4 @@ +all: mypy + +mypy: + MYPYPATH=$(CURDIR)/stubs mypy --show-error-codes starch.py diff --git a/starch/README.md b/starch/README.md new file mode 100644 index 0000000..86f5e59 --- /dev/null +++ b/starch/README.md @@ -0,0 +1,182 @@ +# starch - a framework for selecting architecture-specific code at runtime + +`starch` helps generate glue code to *s*elec*t* *arch*itecture-specific +versions of code depending on the hardware detected at runtime. + +It arranges for code to be built multiple times with different compiler +options. At runtime, user code calls a dispatcher entry point which +selects the best compiled version of the versions that can safely run +on the hardware used at runtime. + +It tries to be agnostic about the details of the code being generated +and the details of the hardware. 
+ +## Caution caution work in progress + +This documentation isn't very complete. You'll need to look at the example +and the code itself. + +## Design notes + + * Architecture-independent generated output; the generated outputs can + be generated during development and committed as part of the main + source code, and at build time starch does not need to be re-run. + + * Doesn't care about the details of the functions you call; they can + have any signature. + + * Can automatically generate benchmarking code given a benchmarking + helper that sets up inputs to the function. + + * Does not do any hardware detection itself, and does not care about + the hardware details; for each combination of compiler flags, the user + code provides a test function to be called at runtime to determine if + it is safe to run code compiled with those flags. + + * Allows the same generic code to be compiled multiple times with different + compile flags to take advantage of compiler auto-vectorization that + requires additional instruction set features (AVX, NEON, ...) being enabled. + + * Emits makefile fragments to be included into a larger makefile structure + +## License + +The generator script and templates are licensed under a BSD 2-clause license, +see the LICENSE file. + +No copyright claim is made on generated code. + +## Prerequisites + +At generation time (results can be committed to version control): + + * Python 3 + * [Mako](https://www.makotemplates.org/) + +At build time: + + * a C compiler + * make + +## Quickstart + +Look in example/ for a full example. + +## Concepts + +A *function* is the user-visible API to starch-generated code. It just looks +like a C function pointer. Initially, this pointer points to a dispatcher +routine which will select an appropriate implementation at runtime and call +it. For subsequent calls, the dispatcher updates the function pointer to +point directly to the selected implementation. 
+ +A *function impl* is one particular way of implementing a function. All +impls should produce the same results given the same inputs to avoid confusing +user code. There may be different impls with different performance +characteristics - for example, different degrees of manual loop unrolling, or +an impl that takes advantage of a particular instruction set (NEON, AVX, etc). +Each impl has a unique-within-the-function "variant" name that identifies it. + +Function impls may be conditionally compiled depending on build features +(see below). This is useful for impls that cannot always be compiled e.g. +they depend on the availability of a particular instruction set. + +A *build flavor* is a particular way of building the function impl. It +consists of a set of compiler flags to use, plus an associated test function +that determines at runtime if it is safe to run the code. For example, +a flavor may enable use of specific instructions that may or may not be +available at runtime via `-mavx`, `-march=...`, and similar flags. Each +flavor declares that it provides zero or more *features*. + +A *feature* is a characteristic of the build flavor compiler flags that +allows certain impls to be compiled. For example, an impl that uses NEON +intrinsics can only be compiled if the compiler is building for an ARM +instruction set that supports NEON. Features are defined in the build flavor, +and are advertised at compile time by the presence of a `STARCH_FEATURE_x` +macro; implementations may conditionally compile on this macro and should use +`STARCH_IMPL_REQUIRES` to indicate they will only be emitted when a given +feature is present. + +A *build mix* is a combination of build flavors that can coexist in the same +binary. For example, an "x86" mix might include build flavors that build +for generic x86, x86-with-AVX, and x86-with-AVX2; but it would not include +a build flavor for ARM, because ARM and x86 object code can't be linked +together into a single binary. 
+ +## Alignment + +A function can optionally include an aligned version; this is a version of the +function with an independent call point and wisdom, which assumes that +data passed to the function is already aligned. Each flavor has an associated +alignment in bytes, but otherwise it is up to the implementations to decide +what exactly is aligned. Implementations for an aligned function on a flavor +that specifies an alignment (>1 byte) will be compiled twice, once with an +alignment of 1 and once with the flavor's alignment, to generate two different +compiled versions. + +starch provides macros to help with alignment: + + * `STARCH_ALIGNMENT`, in implementations, is the alignment (in bytes) that + implementations can assume. + * `STARCH_MIX_ALIGNMENT`, defined in the generated header file, is the required + alignment (in bytes) for callers of the _aligned version of a function. + It is the largest alignment of all flavors in the mix. + * `STARCH_ALIGNED(ptr)` in implementations evaluates to `ptr` while hinting to + the compiler that the data is aligned according to STARCH_ALIGNMENT. This + maps to gcc's `__builtin_assume_aligned` builtin. + +## Benchmarks + +Functions can optionally provide a benchmark helper by defining a +(no args, void return type) function using the STARCH_BENCHMARK macro. This +macro is only present when benchmark code is being compiled. + +The benchmark helper should set up function inputs for benchmarking and then +use the `STARCH_BENCHMARK_RUN` macro. This macro expands to code that will +benchmark each possible impl in turn with the provided arguments. + +If the benchmark needs to allocate possibly-aligned buffers, +two macros `STARCH_BENCHMARK_ALLOC` and `STARCH_BENCHMARK_FREE` +will allocate suitably aligned buffers for the current `STARCH_ALIGNMENT` +value. 
`STARCH_BENCHMARK_ALLOC(count,type)` will allocate `count` elements of +type `type`, aligned to either `STARCH_ALIGNMENT` or the required alignment +for `type`, whichever is larger. `STARCH_BENCHMARK_FREE(ptr)` will free a +buffer previously allocated by `STARCH_BENCHMARK_ALLOC`. + +See `example/benchmark/subtract_n_benchmark.c` for examples. + +## Gotchas + +Files added by `scan_file` are `#include`-d into surrounding support files. +Multiple files may be included into the same compilation unit. You should +ensure that you don't pollute the global namespace (macros, static function +names, etc.) for subsequent files that will follow. + +Files added by `scan_file` will be compiled multiple times. You should ensure +that any symbols other than those handled by STARCH_IMPL / STARCH_IMPL_REQUIRES +are either static or use the STARCH_SYMBOL macro to get a unique name for +this compilation pass. + +You probably want to separate out benchmark-support code into separate files +to avoid an extra version of any impls present in the same file from being +emitted. + +## Wisdom + +There is partial support for a wisdom implementation. Wisdom is a priori +information about the preferred code to use for a given function, for example +as the result of benchmarking to find the fastest version. It is simply the +order in which compiled impls are tried until one that is supported is found. + +To set wisdom, there are two options: + +1) Provide a wisdom ordering for the function when defining a build mix. This +controls the order in which the compiled impls are included in the generated +registry that is searched at runtime. + +2) Call `starch_<function>_set_wisdom` at runtime. This accepts an array of +function variants, terminated by NULL. When called, the registry is re-sorted +to prefer the listed variants in the order provided (and the function pointer +is reset to the dispatcher so the chosen code will be re-selected on the next +call). 
This could be used to load install-specific wisdom during program +startup. diff --git a/starch/example/Makefile b/starch/example/Makefile new file mode 100644 index 0000000..767deff --- /dev/null +++ b/starch/example/Makefile @@ -0,0 +1,28 @@ +CC ?= gcc +CFLAGS = -O3 -Wall -g + +STARCH_COMPILE := $(CC) $(CFLAGS) -c + +ARCH := $(shell uname -m) + +all: generate starch-benchmark + +ifneq (,$(findstring arm,$(ARCH))) + -include generated/makefile.arm +else ifneq (,$(findstring x86_64,$(ARCH))) + -include generated/makefile.x86_64 +else + -include generated/makefile.generic +endif + +support.o: support.c + $(CC) $(CFLAGS) -c -o $@ $^ + +starch-benchmark: $(STARCH_OBJS) $(STARCH_BENCHMARK_OBJ) support.o + $(CC) $(CFLAGS) -o $@ $^ + +generate: + ./starchgen.py + +clean: + rm -f $(STARCH_OBJS) $(STARCH_BENCHMARK_OBJ) support.o starch-benchmark diff --git a/starch/example/benchmark/subtract_n_benchmark.c b/starch/example/benchmark/subtract_n_benchmark.c new file mode 100644 index 0000000..b083088 --- /dev/null +++ b/starch/example/benchmark/subtract_n_benchmark.c @@ -0,0 +1,33 @@ +#include + +void STARCH_BENCHMARK(subtract_n) (void) +{ + uint16_t *in = NULL, *out = NULL; + const unsigned len = 65536; + const unsigned n = 42; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, uint16_t)) || !(out = STARCH_BENCHMARK_ALLOC(len, uint16_t))) { + goto done; + } + + STARCH_BENCHMARK_RUN( subtract_n, in, len, n, out ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out); +} + +bool STARCH_BENCHMARK_VERIFY(subtract_n)(const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + bool okay = true; + + for (unsigned i = 0; i < len; ++i) { + uint16_t expected = in[i] - n; + if (out[i] != expected) { + fprintf(stderr, "verification failed: in[%u]=%u n=%u out[%u]=%u expected=%u\n", i, in[i], n, i, out[i], expected); + okay = false; + } + } + + return okay; +} diff --git a/starch/example/generated/.keep b/starch/example/generated/.keep new file mode 100644 index 
0000000..e69de29 diff --git a/starch/example/generated/benchmark.c b/starch/example/generated/benchmark.c new file mode 100644 index 0000000..8e1908e --- /dev/null +++ b/starch/example/generated/benchmark.c @@ -0,0 +1,569 @@ + +/* starch generated code. Do not edit. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "starch.h" + +typedef struct timespec starch_benchmark_time; + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end); +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size); +void starch_benchmark_aligned_free(void *user_ptr); +void starch_benchmark_get_time(starch_benchmark_time *t); + +const unsigned starch_benchmark_warmup_loops = 10; + +typedef struct { + const char *name; + const char *impl; + uint64_t ns; +} starch_benchmark_result; + +static starch_benchmark_result *starch_benchmark_results = NULL; +static unsigned starch_benchmark_result_size = 0; +static unsigned starch_benchmark_result_count = 0; + +typedef struct benchmark_flavor_list_node { + const char *flavor; + struct benchmark_flavor_list_node *next; +} starch_benchmark_flavor_list; + +static starch_benchmark_flavor_list *starch_benchmark_flavor_whitelist = NULL; +static starch_benchmark_flavor_list *starch_benchmark_flavor_blacklist = NULL; + +static bool starch_benchmark_list_only = false; +static bool starch_benchmark_top_only = false; +static unsigned starch_benchmark_iterations = 1; + +typedef struct timespec starch_benchmark_time; +void starch_benchmark_get_time(starch_benchmark_time *t) +{ +#ifdef CLOCK_THREAD_CPUTIME_ID + clock_gettime(CLOCK_THREAD_CPUTIME_ID, t); +#else + clock_gettime(CLOCK_MONOTONIC, t); +#endif +} + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end) +{ + return ((uint64_t)end->tv_sec - (uint64_t)start->tv_sec) * 1000000000U + (uint64_t)end->tv_nsec - (uint64_t)start->tv_nsec; +} 
+ +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size) +{ + size_t use_alignment = (type_alignment > alignment ? type_alignment : alignment); + if ( (use_alignment % type_alignment) || (use_alignment % alignment) ) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: conflicting alignment requirements (%zu versus %zu)\n", size, alignment, type_alignment); + return NULL; + } + + /* Over-allocate so we can stash our own pointer before the start, and so that we can adjust + * the returned alignment so it is only aligned to the requested boundary, and not also + * aligned to a larger power of two (we don't want to accidentally benchmark the performance + * of a more restrictive larger alignment) + */ + size_t header_size = (use_alignment < sizeof(void*) ? sizeof(void*) : use_alignment); + char *block_ptr = aligned_alloc(use_alignment, header_size + size + use_alignment); + if (!block_ptr) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(errno)); + return NULL; + } + + char *user_ptr = block_ptr + header_size; + if ( (uintptr_t)user_ptr % (use_alignment * 2) == 0 ) { + // user_ptr is aligned to the next power of two, but we don't want that, move it on + user_ptr += use_alignment; + } + + void **stash = (void**)user_ptr - 1; + *stash = block_ptr; + + return user_ptr; +} + +void starch_benchmark_aligned_free(void *user_ptr) +{ + if (!user_ptr) + return; + void **stash = (void**)user_ptr - 1; + free(*stash); +} + +static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_benchmark_flavor_list *list) +{ + for (; list; list = list->next) { + if (!strcmp(flavor, list->flavor)) + return true; + } + return false; +} + + +/* prototypes for benchmark helpers provided by user code */ +void starch_subtract_n_benchmark (void); +bool starch_subtract_n_benchmark_verify ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); + +/* prototype the benchmarking 
function so that we can build with -Wmissing-declarations */ +void starch_subtract_n_benchmark(void); + +static void starch_benchmark_one_subtract_n( starch_subtract_n_regentry * _entry, const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + + /* verify correctness of the output */ + if (! 
starch_subtract_n_benchmark_verify ( arg0, arg1, arg2, arg3 )) { + fprintf(stderr, "skipped (verification failed)\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + 
starch_benchmark_results[starch_benchmark_result_count].name = "subtract_n"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_subtract_n( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) +{ + for (starch_subtract_n_regentry *_entry = starch_subtract_n_registry; _entry->name; ++_entry) { + starch_benchmark_one_subtract_n( _entry, arg0, arg1, arg2, arg3 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_subtract_n_aligned_benchmark (void); +bool starch_subtract_n_aligned_benchmark_verify ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_subtract_n_aligned_benchmark(void); + +static void starch_benchmark_one_subtract_n_aligned( starch_subtract_n_aligned_regentry * _entry, const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + + /* verify correctness of the output */ + if (! 
starch_subtract_n_aligned_benchmark_verify ( arg0, arg1, arg2, arg3 )) { + fprintf(stderr, "skipped (verification failed)\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + 
starch_benchmark_results[starch_benchmark_result_count].name = "subtract_n_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_subtract_n_aligned( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) +{ + for (starch_subtract_n_aligned_regentry *_entry = starch_subtract_n_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_subtract_n_aligned( _entry, arg0, arg1, arg2, arg3 ); + } +} + + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _benchmark_sym +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) starch_ ## _function ## _benchmark +#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + +#include "../benchmark/subtract_n_benchmark.c" + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES +#undef STARCH_BENCHMARK +#undef STARCH_BENCHMARK_VERIFY +#undef STARCH_BENCHMARK_RUN +#undef STARCH_BENCHMARK_ALLOC +#undef STARCH_BENCHMARK_FREE + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_benchmark_sym +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) starch_ ## _function ## _aligned_benchmark +#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _aligned_benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ## _aligned ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + +#include "../benchmark/subtract_n_benchmark.c" + +static void starch_benchmark_all_subtract_n(void) +{ + fprintf(stderr, "==== subtract_n ===\n"); + starch_subtract_n_benchmark (); +} +static void starch_benchmark_all_subtract_n_aligned(void) +{ + fprintf(stderr, "==== subtract_n_aligned ===\n"); + starch_subtract_n_aligned_benchmark (); +} + +static int starch_benchmark_compare_result(const void *a, const void *b) +{ + const starch_benchmark_result *left = (const starch_benchmark_result *) a; + const starch_benchmark_result *right = (const starch_benchmark_result *) b; + + int name_cmp = strcmp(left->name, right->name); + if (name_cmp) + return name_cmp; + + if (left->ns < right->ns) + return -1; + if (left->ns > right->ns) + return 1; + return 0; +} + +static void starch_benchmark_usage(const char *argv0) +{ + fprintf(stderr, + "Usage: %s [OPTION]... [FUNCTION]...\n" + "Benchmarks starch functions and optionally writes a sorted wisdom file.\n" + "\n" + " -r FILE Read initial wisdom from FILE\n" + " -o FILE Write sorted wisdom to FILE\n" + " -F FLAVOR Add FLAVOR to whitelist\n" + " (default: no whitelist, run all runtime-supported flavors)\n" + " -N FLAVOR Add FLAVOR to blacklist\n" + " (default: no blacklist, run all runtime-supported flavors)\n" + " -l List compiled-in implementations but don't benchmark them\n" + " -t Include only the top candidate per function in wisdom output\n" + " -i ITERS Run benchmark ITERS times and use the mean. 
If ITERS > 2, ignore\n" + " the smallest and largest runs when calculating the mean.\n" + " (default: 1 iteration)\n" + " FUNCTION Run benchmarks for these functions only\n" + " (default: benchmark all functions)\n" + "\n" + "Supported flavors: " +#ifdef STARCH_FLAVOR_GENERIC + "generic " +#endif +#ifdef STARCH_FLAVOR_ARMV7A_VFPV3 + "armv7a_vfpv3 " +#endif +#ifdef STARCH_FLAVOR_ARMV7A_VFPV4 + "armv7a_vfpv4 " +#endif +#ifdef STARCH_FLAVOR_X86_64_AVX + "x86_64_avx " +#endif +#ifdef STARCH_FLAVOR_X86_64_AVX2 + "x86_64_avx2 " +#endif + "\n" + "Supported functions: " + "subtract_n " + "subtract_n_aligned " + "\n", argv0); +} + +static void starch_benchmark_append_flavor(const char *flavor, starch_benchmark_flavor_list **list) +{ + starch_benchmark_flavor_list *newnode = malloc(sizeof(*newnode)); + if (!newnode) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + exit(1); + } + + newnode->flavor = flavor; + newnode->next = *list; + *list = newnode; +} + +int main(int argc, char **argv) +{ + int specific = 0; + const char *output_path = NULL; + + int opt; + while ((opt = getopt(argc, argv, "r:o:F:N:i:lht")) != -1) { + switch (opt) { + case 'r': + if (starch_read_wisdom(optarg) < 0) { + fprintf(stderr, "%s: cannot read %s: %s\n", argv[0], optarg, strerror(errno)); + return 1; + } + fprintf(stderr, "%s: loaded wisdom file %s\n", argv[0], optarg); + break; + + case 'o': + output_path = optarg; + break; + + case 'F': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_whitelist); + break; + + case 'N': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_blacklist); + break; 
+ + case 'l': + starch_benchmark_list_only = true; + break; + + case 't': + starch_benchmark_top_only = true; + break; + + case 'i': + starch_benchmark_iterations = (atoi(optarg) > 0) ? (unsigned) atoi(optarg) : 1; /* clamp to >= 1: zero divides by zero when the per-call mean is computed, negatives wrap to a huge unsigned count */ + break; + + case 'h': + starch_benchmark_usage(argv[0]); + return 0; + + case '?': + default: + starch_benchmark_usage(argv[0]); + return 2; + } + } + + if (starch_benchmark_list_only && output_path) { + fprintf(stderr, "%s: -o and -l options cannot be specified together\n", argv[0]); + return 2; + } + + for (int i = optind; i < argc; ++i) { + if (!strcmp(argv[i], "subtract_n")) { + specific = 1; + starch_benchmark_all_subtract_n(); + continue; + } + if (!strcmp(argv[i], "subtract_n_aligned")) { + specific = 1; + starch_benchmark_all_subtract_n_aligned(); + continue; + } + + fprintf(stderr, "%s: unrecognized function name: %s\n", argv[0], argv[i]); + return 2; + } + + if (!specific) { + starch_benchmark_all_subtract_n(); + starch_benchmark_all_subtract_n_aligned(); + } + + if (output_path) { + FILE *out = fopen(output_path, "w"); + if (!out) { + fprintf(stderr, "%s: cannot open %s: %s\n", argv[0], output_path, strerror(errno)); + return 1; + } + + fprintf(out, "# generated by "); + for (int i = 0; i < argc; ++i) + fprintf(out, "%s ", argv[i]); + fprintf(out, "\n\n"); + + qsort(starch_benchmark_results, starch_benchmark_result_count, sizeof(*starch_benchmark_results), starch_benchmark_compare_result); /* groups results by function name, fastest first within each group */ + + const char *last_name = NULL; + bool first = true; + for (unsigned i = 0; i < starch_benchmark_result_count; ++i) { + starch_benchmark_result *result = &starch_benchmark_results[i]; + if (last_name && strcmp(last_name, result->name) != 0) { + fprintf(out, "\n"); + first = true; + } + last_name = result->name; + if (starch_benchmark_top_only && !first) + continue; + fprintf(out, "%-40s %-40s # %" PRIu64 " ns/call\n", result->name, result->impl, result->ns); + first = false; + } + + fclose(out); + fprintf(stderr, "%s: wrote sorted wisdom to %s\n", argv[0], output_path); + } + + return 0; +} 
diff --git a/starch/example/generated/dispatcher.c b/starch/example/generated/dispatcher.c new file mode 100644 index 0000000..0b6a061 --- /dev/null +++ b/starch/example/generated/dispatcher.c @@ -0,0 +1,313 @@ + +/* starch generated code. Do not edit. */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> +#include <ctype.h> + +#include "starch.h" + +/* helper for re-sorting registries */ +struct starch_regentry_prefix { + int rank; +}; + +static int starch_regentry_rank_compare (const void *l, const void *r) +{ + const struct starch_regentry_prefix *left = l, *right = r; + return left->rank - right->rank; +} + +/* dispatcher / registry for subtract_n */ + +starch_subtract_n_regentry * starch_subtract_n_select() { + for (starch_subtract_n_regentry *entry = starch_subtract_n_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_subtract_n_dispatch ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) { + starch_subtract_n_regentry *entry = starch_subtract_n_select(); + if (!entry) + abort(); + + starch_subtract_n = entry->callable; + starch_subtract_n ( arg0, arg1, arg2, arg3 ); +} + +starch_subtract_n_ptr starch_subtract_n = starch_subtract_n_dispatch; + +void starch_subtract_n_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_subtract_n_regentry *entry; + for (entry = starch_subtract_n_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_subtract_n_registry); + } + } + + /* re-sort based on the 
new ranking */ + qsort(starch_subtract_n_registry, entry - starch_subtract_n_registry, sizeof(starch_subtract_n_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_subtract_n = starch_subtract_n_dispatch; +} + +starch_subtract_n_regentry starch_subtract_n_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 1, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 2, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_intrinsics_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_neon_intrinsics_armv7a_vfpv4, supports_neon_vfpv4 }, + { 1, "neon_intrinsics_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_neon_intrinsics_armv7a_vfpv3, supports_neon_vfpv3 }, + { 2, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 3, "generic_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_generic_armv7a_vfpv4, supports_neon_vfpv4 }, + { 4, "unroll_4_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_unroll_4_armv7a_vfpv4, supports_neon_vfpv4 }, + { 5, "bad_implementation_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_bad_implementation_armv7a_vfpv4, supports_neon_vfpv4 }, + { 6, "generic_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_generic_armv7a_vfpv3, supports_neon_vfpv3 }, + { 7, "unroll_4_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_unroll_4_armv7a_vfpv3, supports_neon_vfpv3 }, + { 8, "bad_implementation_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_bad_implementation_armv7a_vfpv3, supports_neon_vfpv3 }, + { 9, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 10, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86_64 + { 0, 
"generic_x86_64_avx2", "x86_64_avx2", starch_subtract_n_generic_x86_64_avx2, supports_x86_avx2 }, + { 1, "generic_x86_64_avx", "x86_64_avx", starch_subtract_n_generic_x86_64_avx, supports_x86_avx }, + { 2, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 3, "unroll_4_x86_64_avx2", "x86_64_avx2", starch_subtract_n_unroll_4_x86_64_avx2, supports_x86_avx2 }, + { 4, "bad_implementation_x86_64_avx2", "x86_64_avx2", starch_subtract_n_bad_implementation_x86_64_avx2, supports_x86_avx2 }, + { 5, "unroll_4_x86_64_avx", "x86_64_avx", starch_subtract_n_unroll_4_x86_64_avx, supports_x86_avx }, + { 6, "bad_implementation_x86_64_avx", "x86_64_avx", starch_subtract_n_bad_implementation_x86_64_avx, supports_x86_avx }, + { 7, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 8, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_X86_64 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for subtract_n_aligned */ + +starch_subtract_n_aligned_regentry * starch_subtract_n_aligned_select() { + for (starch_subtract_n_aligned_regentry *entry = starch_subtract_n_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_subtract_n_aligned_dispatch ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) { + starch_subtract_n_aligned_regentry *entry = starch_subtract_n_aligned_select(); + if (!entry) + abort(); + + starch_subtract_n_aligned = entry->callable; + starch_subtract_n_aligned ( arg0, arg1, arg2, arg3 ); +} + +starch_subtract_n_aligned_ptr starch_subtract_n_aligned = starch_subtract_n_aligned_dispatch; + +void starch_subtract_n_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_subtract_n_aligned_regentry *entry; + for (entry = 
starch_subtract_n_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_subtract_n_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_subtract_n_aligned_registry, entry - starch_subtract_n_aligned_registry, sizeof(starch_subtract_n_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_subtract_n_aligned = starch_subtract_n_aligned_dispatch; +} + +starch_subtract_n_aligned_regentry starch_subtract_n_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 1, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 2, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "generic_armv7a_vfpv4_aligned", "armv7a_vfpv4", starch_subtract_n_aligned_generic_armv7a_vfpv4, supports_neon_vfpv4 }, + { 1, "unroll_4_armv7a_vfpv4_aligned", "armv7a_vfpv4", starch_subtract_n_aligned_unroll_4_armv7a_vfpv4, supports_neon_vfpv4 }, + { 2, "bad_implementation_armv7a_vfpv4_aligned", "armv7a_vfpv4", starch_subtract_n_aligned_bad_implementation_armv7a_vfpv4, supports_neon_vfpv4 }, + { 3, "neon_intrinsics_armv7a_vfpv4_aligned", "armv7a_vfpv4", starch_subtract_n_aligned_neon_intrinsics_armv7a_vfpv4, supports_neon_vfpv4 }, + { 4, "generic_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_generic_armv7a_vfpv4, supports_neon_vfpv4 }, + { 5, "unroll_4_armv7a_vfpv4", 
"armv7a_vfpv4", starch_subtract_n_unroll_4_armv7a_vfpv4, supports_neon_vfpv4 }, + { 6, "bad_implementation_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_bad_implementation_armv7a_vfpv4, supports_neon_vfpv4 }, + { 7, "neon_intrinsics_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_neon_intrinsics_armv7a_vfpv4, supports_neon_vfpv4 }, + { 8, "generic_armv7a_vfpv3_aligned", "armv7a_vfpv3", starch_subtract_n_aligned_generic_armv7a_vfpv3, supports_neon_vfpv3 }, + { 9, "unroll_4_armv7a_vfpv3_aligned", "armv7a_vfpv3", starch_subtract_n_aligned_unroll_4_armv7a_vfpv3, supports_neon_vfpv3 }, + { 10, "bad_implementation_armv7a_vfpv3_aligned", "armv7a_vfpv3", starch_subtract_n_aligned_bad_implementation_armv7a_vfpv3, supports_neon_vfpv3 }, + { 11, "neon_intrinsics_armv7a_vfpv3_aligned", "armv7a_vfpv3", starch_subtract_n_aligned_neon_intrinsics_armv7a_vfpv3, supports_neon_vfpv3 }, + { 12, "generic_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_generic_armv7a_vfpv3, supports_neon_vfpv3 }, + { 13, "unroll_4_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_unroll_4_armv7a_vfpv3, supports_neon_vfpv3 }, + { 14, "bad_implementation_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_bad_implementation_armv7a_vfpv3, supports_neon_vfpv3 }, + { 15, "neon_intrinsics_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_neon_intrinsics_armv7a_vfpv3, supports_neon_vfpv3 }, + { 16, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 17, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 18, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86_64 + { 0, "generic_x86_64_avx2_aligned", "x86_64_avx2", starch_subtract_n_aligned_generic_x86_64_avx2, supports_x86_avx2 }, + { 1, "unroll_4_x86_64_avx2_aligned", "x86_64_avx2", starch_subtract_n_aligned_unroll_4_x86_64_avx2, supports_x86_avx2 }, + { 2, "bad_implementation_x86_64_avx2_aligned", "x86_64_avx2", 
starch_subtract_n_aligned_bad_implementation_x86_64_avx2, supports_x86_avx2 }, + { 3, "generic_x86_64_avx2", "x86_64_avx2", starch_subtract_n_generic_x86_64_avx2, supports_x86_avx2 }, + { 4, "unroll_4_x86_64_avx2", "x86_64_avx2", starch_subtract_n_unroll_4_x86_64_avx2, supports_x86_avx2 }, + { 5, "bad_implementation_x86_64_avx2", "x86_64_avx2", starch_subtract_n_bad_implementation_x86_64_avx2, supports_x86_avx2 }, + { 6, "generic_x86_64_avx_aligned", "x86_64_avx", starch_subtract_n_aligned_generic_x86_64_avx, supports_x86_avx }, + { 7, "unroll_4_x86_64_avx_aligned", "x86_64_avx", starch_subtract_n_aligned_unroll_4_x86_64_avx, supports_x86_avx }, + { 8, "bad_implementation_x86_64_avx_aligned", "x86_64_avx", starch_subtract_n_aligned_bad_implementation_x86_64_avx, supports_x86_avx }, + { 9, "generic_x86_64_avx", "x86_64_avx", starch_subtract_n_generic_x86_64_avx, supports_x86_avx }, + { 10, "unroll_4_x86_64_avx", "x86_64_avx", starch_subtract_n_unroll_4_x86_64_avx, supports_x86_avx }, + { 11, "bad_implementation_x86_64_avx", "x86_64_avx", starch_subtract_n_bad_implementation_x86_64_avx, supports_x86_avx }, + { 12, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 13, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 14, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_X86_64 */ + { 0, NULL, NULL, NULL, NULL } +}; + + +int starch_read_wisdom (const char * path) +{ + FILE *fp = fopen(path, "r"); + if (!fp) + return -1; + + /* reset all ranks to identify entries not listed in the wisdom file; we'll assign ranks at the end to produce a stable sort */ + int rank_subtract_n = 0; + for (starch_subtract_n_regentry *entry = starch_subtract_n_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_subtract_n_aligned = 0; + for (starch_subtract_n_aligned_regentry *entry = starch_subtract_n_aligned_registry; entry->name; ++entry) { + entry->rank 
= 0; + } + + char linebuf[512]; + while (fgets(linebuf, sizeof(linebuf), fp)) { + /* split name and impl on whitespace, handle comments etc; isspace() requires an unsigned char value, so cast each char before the call */ + char *name = linebuf; + while (*name && isspace((unsigned char)*name)) + ++name; + + if (!*name || *name == '#') + continue; + + char *end = name; + while (*end && !isspace((unsigned char)*end)) + ++end; + + if (!*end) + continue; + *end = 0; + + char *impl = end + 1; + while (*impl && isspace((unsigned char)*impl)) + ++impl; + + if (!*impl) + continue; + + end = impl; + while (*end && !isspace((unsigned char)*end)) + ++end; + + *end = 0; + + /* try to find a matching registry entry */ + if (!strcmp(name, "subtract_n")) { + for (starch_subtract_n_regentry *entry = starch_subtract_n_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_subtract_n; + break; + } + } + continue; + } + if (!strcmp(name, "subtract_n_aligned")) { + for (starch_subtract_n_aligned_regentry *entry = starch_subtract_n_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_subtract_n_aligned; + break; + } + } + continue; + } + } + + if (ferror(fp)) { + fclose(fp); + return -1; + } + + fclose(fp); + + /* assign ranks to unmatched items to (stable) sort them last; re-sort everything */ + { + starch_subtract_n_regentry *entry; + for (entry = starch_subtract_n_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_subtract_n; + } + qsort(starch_subtract_n_registry, entry - starch_subtract_n_registry, sizeof(starch_subtract_n_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_subtract_n = starch_subtract_n_dispatch; + } + { + starch_subtract_n_aligned_regentry *entry; + for (entry = starch_subtract_n_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_subtract_n_aligned; + } + qsort(starch_subtract_n_aligned_registry, entry - starch_subtract_n_aligned_registry, sizeof(starch_subtract_n_aligned_regentry), 
starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_subtract_n_aligned = starch_subtract_n_aligned_dispatch; + } + + return 0; +} diff --git a/starch/example/generated/flavor.armv7a_vfpv3.c b/starch/example/generated/flavor.armv7a_vfpv3.c new file mode 100644 index 0000000..ad81cf2 --- /dev/null +++ b/starch/example/generated/flavor.armv7a_vfpv3.c @@ -0,0 +1,33 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_ARMV7A_VFPV3 +#define STARCH_FEATURE_NEON + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv7a_vfpv3 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_vfpv3 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv7a_vfpv3 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_vfpv3 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/flavor.armv7a_vfpv4.c b/starch/example/generated/flavor.armv7a_vfpv4.c new file mode 100644 index 0000000..e6c117c --- /dev/null +++ b/starch/example/generated/flavor.armv7a_vfpv4.c @@ -0,0 +1,33 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_ARMV7A_VFPV4 +#define STARCH_FEATURE_NEON + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv7a_vfpv4 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_vfpv4 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv7a_vfpv4 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_vfpv4 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/flavor.generic.c b/starch/example/generated/flavor.generic.c new file mode 100644 index 0000000..02d52b4 --- /dev/null +++ b/starch/example/generated/flavor.generic.c @@ -0,0 +1,17 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_GENERIC + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## generic +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## generic +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/flavor.x86_64_avx.c b/starch/example/generated/flavor.x86_64_avx.c new file mode 100644 index 0000000..8eea708 --- /dev/null +++ b/starch/example/generated/flavor.x86_64_avx.c @@ -0,0 +1,32 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_X86_64_AVX + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## x86_64_avx +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_64_avx +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## x86_64_avx +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_64_avx +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/flavor.x86_64_avx2.c b/starch/example/generated/flavor.x86_64_avx2.c new file mode 100644 index 0000000..50f2c5f --- /dev/null +++ b/starch/example/generated/flavor.x86_64_avx2.c @@ -0,0 +1,32 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_X86_64_AVX2 + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## x86_64_avx2 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_64_avx2 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## x86_64_avx2 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_64_avx2 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/makefile.arm b/starch/example/generated/makefile.arm new file mode 100644 index 0000000..d312adc --- /dev/null +++ b/starch/example/generated/makefile.arm @@ -0,0 +1,42 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. 
The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_ARM + + +generated/flavor.armv7a_vfpv4.o: generated/flavor.armv7a_vfpv4.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math generated/flavor.armv7a_vfpv4.c -o generated/flavor.armv7a_vfpv4.o + +generated/flavor.armv7a_vfpv3.o: generated/flavor.armv7a_vfpv3.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv3 -mfpu=neon-vfpv3 -ffast-math generated/flavor.armv7a_vfpv3.c -o generated/flavor.armv7a_vfpv3.o + +generated/flavor.generic.o: generated/flavor.generic.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/flavor.generic.c -o generated/flavor.generic.o + +generated/dispatcher.o: generated/dispatcher.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/dispatcher.c -o generated/dispatcher.o + +STARCH_OBJS := generated/flavor.armv7a_vfpv4.o generated/flavor.armv7a_vfpv3.o generated/flavor.generic.o generated/dispatcher.o + + +generated/benchmark.o: generated/benchmark.c benchmark/subtract_n_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) 
generated/benchmark.c -o generated/benchmark.o + +STARCH_BENCHMARK_OBJ := generated/benchmark.o diff --git a/starch/example/generated/makefile.generic b/starch/example/generated/makefile.generic new file mode 100644 index 0000000..a98971f --- /dev/null +++ b/starch/example/generated/makefile.generic @@ -0,0 +1,36 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_GENERIC + + +generated/flavor.generic.o: generated/flavor.generic.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/flavor.generic.c -o generated/flavor.generic.o + +generated/dispatcher.o: generated/dispatcher.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/dispatcher.c -o generated/dispatcher.o + +STARCH_OBJS := generated/flavor.generic.o generated/dispatcher.o + + +generated/benchmark.o: generated/benchmark.c benchmark/subtract_n_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/benchmark.c -o generated/benchmark.o + +STARCH_BENCHMARK_OBJ := 
generated/benchmark.o diff --git a/starch/example/generated/makefile.x86_64 b/starch/example/generated/makefile.x86_64 new file mode 100644 index 0000000..8cd9d6d --- /dev/null +++ b/starch/example/generated/makefile.x86_64 @@ -0,0 +1,42 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_X86_64 + + +generated/flavor.x86_64_avx2.o: generated/flavor.x86_64_avx2.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math generated/flavor.x86_64_avx2.c -o generated/flavor.x86_64_avx2.o + +generated/flavor.x86_64_avx.o: generated/flavor.x86_64_avx.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx -ffast-math generated/flavor.x86_64_avx.c -o generated/flavor.x86_64_avx.o + +generated/flavor.generic.o: generated/flavor.generic.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/flavor.generic.c -o generated/flavor.generic.o + +generated/dispatcher.o: generated/dispatcher.c impl/subtract_n.c + $(STARCH_COMPILE) 
$(STARCH_CFLAGS) generated/dispatcher.c -o generated/dispatcher.o + +STARCH_OBJS := generated/flavor.x86_64_avx2.o generated/flavor.x86_64_avx.o generated/flavor.generic.o generated/dispatcher.o + + +generated/benchmark.o: generated/benchmark.c benchmark/subtract_n_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/benchmark.c -o generated/benchmark.o + +STARCH_BENCHMARK_OBJ := generated/benchmark.o diff --git a/starch/example/generated/starch.h b/starch/example/generated/starch.h new file mode 100644 index 0000000..407b705 --- /dev/null +++ b/starch/example/generated/starch.h @@ -0,0 +1,133 @@ + +/* starch generated code. Do not edit. */ + +#include <stdint.h> + +/* mixes */ + +/* Generic build, compiler defaults only */ +#ifdef STARCH_MIX_GENERIC +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 1 +#endif /* STARCH_MIX_GENERIC */ + +/* ARM */ +#ifdef STARCH_MIX_ARM +#define STARCH_FLAVOR_ARMV7A_VFPV4 +#define STARCH_FLAVOR_ARMV7A_VFPV3 +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 16 +#endif /* STARCH_MIX_ARM */ + +/* x86-64 */ +#ifdef STARCH_MIX_X86_64 +#define STARCH_FLAVOR_X86_64_AVX2 +#define STARCH_FLAVOR_X86_64_AVX +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 32 +#endif /* STARCH_MIX_X86_64 */ + + +#ifdef STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_IS_ALIGNED(_ptr) (((uintptr_t)(_ptr) & (STARCH_MIX_ALIGNMENT-1)) == 0) +#else +/* mix not defined, alignment is unknown, treat everything as unaligned */ +#define STARCH_IS_ALIGNED(_ptr) (0) +#endif + + +/* entry points and registries */ + +typedef void (* starch_subtract_n_ptr) ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +extern starch_subtract_n_ptr starch_subtract_n; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_subtract_n_ptr callable; + int (*flavor_supported)(); +} starch_subtract_n_regentry; + +extern starch_subtract_n_regentry starch_subtract_n_registry[]; 
+starch_subtract_n_regentry * starch_subtract_n_select(); +void starch_subtract_n_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_subtract_n_aligned_ptr) ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +extern starch_subtract_n_aligned_ptr starch_subtract_n_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_subtract_n_aligned_ptr callable; + int (*flavor_supported)(); +} starch_subtract_n_aligned_regentry; + +extern starch_subtract_n_aligned_regentry starch_subtract_n_aligned_registry[]; +starch_subtract_n_aligned_regentry * starch_subtract_n_aligned_select(); +void starch_subtract_n_aligned_set_wisdom( const char * const * received_wisdom ); + +/* flavors and prototypes */ + +#ifdef STARCH_FLAVOR_GENERIC +void starch_subtract_n_generic_generic ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_unroll_4_generic ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_generic ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_GENERIC */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_ARMV7A_VFPV3 +int supports_neon_vfpv3 (void); +void starch_subtract_n_generic_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_generic_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_unroll_4_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_unroll_4_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void 
starch_subtract_n_aligned_bad_implementation_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_neon_intrinsics_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_neon_intrinsics_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_ARMV7A_VFPV3 */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_ARMV7A_VFPV4 +int supports_neon_vfpv4 (void); +void starch_subtract_n_generic_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_generic_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_unroll_4_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_unroll_4_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_bad_implementation_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_neon_intrinsics_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_neon_intrinsics_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_ARMV7A_VFPV4 */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_X86_64_AVX +int supports_x86_avx (void); +void starch_subtract_n_generic_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_generic_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void 
starch_subtract_n_unroll_4_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_unroll_4_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_bad_implementation_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_X86_64_AVX */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_X86_64_AVX2 +int supports_x86_avx2 (void); +void starch_subtract_n_generic_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_generic_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_unroll_4_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_unroll_4_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_bad_implementation_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_X86_64_AVX2 */ + +int starch_read_wisdom (const char * path); + diff --git a/starch/example/impl/subtract_n.c b/starch/example/impl/subtract_n.c new file mode 100644 index 0000000..830ee0e --- /dev/null +++ b/starch/example/impl/subtract_n.c @@ -0,0 +1,94 @@ +void STARCH_IMPL(subtract_n, generic) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + out_align[0] = in_align[0] - n; + in_align++; + out_align++; + } 
+} + +void STARCH_IMPL(subtract_n, unroll_4) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len4 = len >> 2; + unsigned len1 = len & 3; + + while (len4--) { + out_align[0] = in_align[0] - n; + out_align[1] = in_align[1] - n; + out_align[2] = in_align[2] - n; + out_align[3] = in_align[3] - n; + in_align += 4; + out_align += 4; + } + + while (len1--) { + out_align[0] = in_align[0] - n; + in_align++; + out_align++; + } +} + +void STARCH_IMPL(subtract_n, bad_implementation) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + // This is a deliberately bad implementation that produces + // incorrect results. The error should be caught during + // benchmarking via STARCH_BENCHMARK_VERIFY. + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + out_align[0] = in_align[0] - n; + if (len % 10000 == 0) + out_align[0] += 1; + in_align++; + out_align++; + } +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(subtract_n, neon_intrinsics, STARCH_FEATURE_NEON) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + uint16x8_t subtractor = vdupq_n_u16(n); + + unsigned len8 = len >> 3; + unsigned len4 = len & 4; + unsigned len1 = len & 3; + + while (len8--) { + uint16x8_t value = vld1q_u16(in_align); + uint16x8_t result = vsubq_u16(value, subtractor); + vst1q_u16(out_align, result); + in_align += 8; + out_align += 8; + } + + if (len4) { + uint16x4_t value = vld1_u16(in_align); + uint16x4_t result = vsub_u16(value, vget_low_u16(subtractor)); + vst1_u16(out_align, result); + in_align += 4; + out_align += 4; + } + + while (len1--) { + uint16x4_t value = vld1_dup_u16(in_align); + uint16x4_t result = 
vsub_u16(value, vget_low_u16(subtractor)); + vst1_lane_u16(out_align, result, 0); + in_align++; + out_align++; + } +} + +#endif diff --git a/starch/example/starchgen.py b/starch/example/starchgen.py new file mode 100755 index 0000000..993fa57 --- /dev/null +++ b/starch/example/starchgen.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +import sys +import os +import glob + +example_dir = os.path.dirname(sys.argv[0]) +starch_dir = os.path.join(example_dir, '..') +sys.path.append(starch_dir) +import starch + +gen = starch.Generator(runtime_dir = example_dir, + output_dir = os.path.join(example_dir, 'generated')) + +gen.add_include('') + +gen.add_function(name = 'subtract_n', argtypes = ['const uint16_t *', 'unsigned', 'uint16_t', 'uint16_t *'], aligned=True) + +gen.add_feature(name='neon', + description='ARM NEON v2') + +gen.add_flavor(name = 'generic', + description = 'Generic build, default compiler options', + compile_flags = []) +gen.add_flavor(name = 'armv7a_vfpv3', + description = 'ARMv7-A, NEON, VFPv3', + compile_flags = ['-march=armv7-a+neon-vfpv3', '-mfpu=neon-vfpv3', '-ffast-math'], + features = ['neon'], + test_function = 'supports_neon_vfpv3', + alignment=16) +gen.add_flavor(name = 'armv7a_vfpv4', + description = 'ARMv7-A, NEON, VFPv4', + compile_flags = ['-march=armv7-a+neon-vfpv4', '-mfpu=neon-vfpv4', '-ffast-math'], + features = ['neon'], + test_function = 'supports_neon_vfpv4', + alignment=16) +gen.add_flavor(name = 'x86_64_avx', + description = 'x86-64 with AVX', + compile_flags = ['-mavx', '-ffast-math'], + features = [], + test_function = 'supports_x86_avx', + alignment=32) +gen.add_flavor(name = 'x86_64_avx2', + description = 'x86-64 with AVX2', + compile_flags = ['-mavx2', '-ffast-math'], + features = [], + test_function = 'supports_x86_avx2', + alignment=32) + +gen.add_mix(name = 'generic', + description = 'Generic build, compiler defaults only', + flavors = ['generic']) + +gen.add_mix(name = 'arm', + description = 'ARM', + flavors = 
#ifdef __x86_64__

#include <cpuid.h>
#include <stdint.h>

/*
 * Returns 1 if the operating system has enabled saving and restoring of
 * both the XMM (SSE) and YMM (AVX) register state, 0 otherwise.
 *
 * cpuid1_ecx is the ECX value returned by CPUID leaf 1. XGETBV may only be
 * executed when the OSXSAVE bit is set there, so that bit is checked first.
 * XCR0 bit 1 covers the XMM state and bit 2 covers the YMM state; both
 * must be set before AVX instructions can be executed safely.
 */
static int x86_os_saves_ymm_state(unsigned cpuid1_ecx)
{
    if (!(cpuid1_ecx & bit_OSXSAVE))
        return 0;

    uint32_t xcr0_lo, xcr0_hi;
    __asm__ volatile ("xgetbv" : "=a" (xcr0_lo), "=d" (xcr0_hi) : "c" (0));
    (void) xcr0_hi;

    return (xcr0_lo & 0x6) == 0x6;
}

/*
 * Flavor test function: returns 1 if code built with -mavx can run here.
 *
 * The CPU must advertise AVX *and* the OS must have enabled the YMM state;
 * the CPUID feature bit alone is not sufficient, as AVX instructions fault
 * when the OS has not enabled the extended state.
 */
int supports_x86_avx(void)
{
    unsigned int maxlevel = __get_cpuid_max (0, 0);
    if (maxlevel < 1)
        return 0;

    unsigned eax, ebx, ecx, edx;
    __cpuid(1, eax, ebx, ecx, edx);
    if (!(ecx & bit_AVX))
        return 0;

    return x86_os_saves_ymm_state(ecx);
}

/*
 * Flavor test function: returns 1 if code built with -mavx2 can run here.
 *
 * AVX2 uses the same YMM registers as AVX, so everything required for AVX
 * (including OS support) is required here too, plus the AVX2 feature bit
 * from CPUID leaf 7, subleaf 0.
 */
int supports_x86_avx2(void)
{
    if (!supports_x86_avx())
        return 0;

    unsigned int maxlevel = __get_cpuid_max (0, 0);
    if (maxlevel < 7)
        return 0;

    unsigned eax, ebx, ecx, edx;
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    if (!(ebx & bit_AVX2))
        return 0;

    return 1;
}

#endif
class BuildFlavor(object):
    """BuildFlavor models code built with specific compiler flags.
Shared implementation code will be built multiple times, once per flavor.

Each flavor has an associated test function that is called at runtime to
check if the current hardware supports the code emitted by the flavor. If
the test function returns false, no code built with the flavor will be executed.

Each flavor has a (possibly empty) list of optional Features that may
be present at runtime. This list controls which feature-dependent code is
compiled for this flavor (e.g. an x86 flavor might try to build code that
depends on SSE, but should not try to build code that depends on ARM NEON
intrinsics)"""

    gen: 'Generator'
    name: str
    description: str
    compile_flags: Sequence[str]
    features: FrozenSet[Feature]
    test_function: Optional[str]
    alignment: int

    def __init__(self,
                 gen: 'Generator',
                 name: str,
                 description: str,
                 compile_flags: Iterable[str] = (),
                 features: Iterable[Feature] = (),
                 test_function: Optional[str] = None,
                 alignment: int = 1):
        """Create a flavor.

        compile_flags and features are copied into immutable containers, so
        any iterable (including a one-shot iterator) may be passed.
        test_function is the name of the runtime support-test function, or
        None if the flavor has no test function.
        """

        self.gen = gen
        self.name = name
        # description is a declared attribute of the class; store it so that
        # reading flavor.description does not raise AttributeError
        self.description = description
        self.compile_flags = tuple(compile_flags)
        self.features = frozenset(features)
        self.test_function = test_function
        self.alignment = alignment

    @property
    def macro(self) -> str:
        """Name of the preprocessor macro identifying this flavor."""
        return 'STARCH_FLAVOR_' + self.name.upper()

    @property
    def test_function_expr(self) -> str:
        """The test function name, or the literal NULL if there is none."""
        if self.test_function is None:
            return "NULL"
        else:
            return self.test_function

    @property
    def cflags(self) -> str:
        """The compile flags joined into a single space-separated string."""
        return ' '.join(self.compile_flags)
argnames must match length of argtypes') + + @property + def declaration_arglist(self) -> str: + return ', '.join([f'{typename} {argname}' for typename, argname in zip(self.argtypes, self.argnames)]) + + @property + def named_arglist(self) -> str: + return ', '.join(self.argnames) + + @property + def callable_symbol(self) -> str: + if self.gen.prefix_function_symbols: + return self.gen.sym(self.name) + else: + return self.name + + @property + def select_symbol(self) -> str: + return self.gen.sym(self.name + '_select') + + @property + def dispatcher_symbol(self) -> str: + return self.gen.sym(self.name + '_dispatch') + + @property + def pointer_type(self) -> str: + return self.gen.sym(self.name + '_ptr') + + @property + def regentry_type(self) -> str: + return self.gen.sym(self.name + '_regentry') + + @property + def registry_symbol(self) -> str: + return self.gen.sym(self.name + '_registry') + + @property + def set_wisdom_symbol(self) -> str: + return self.gen.sym(self.name + '_set_wisdom') + + @property + def benchmark_symbol(self) -> str: + return self.gen.sym(self.name + '_benchmark') + + @property + def benchmark_verify_symbol(self) -> str: + return self.gen.sym(self.name + '_benchmark_verify') + + +class FunctionImpl(object): + """A possible implementation of a function, not built in any particular way yet.""" + + gen: 'Generator' + function: Function + name: str + feature: Optional[Feature] + source: 'SourceFile' + lineno: int + + def __init__(self, + gen: 'Generator', + function: Function, + name: str, + feature: Optional[Feature], + source: 'SourceFile', + lineno: int): + self.gen = gen + self.function = function + self.name = name + self.feature = feature + self.source = source + self.lineno = lineno + + def wisdom_name(self, flavor) -> str: + if self.function.aligned: + return self.name + '_' + flavor.name + '_aligned' + else: + return self.name + '_' + flavor.name + + def impl_symbol(self, flavor) -> str: + return self.gen.sym(self.function.name + '_' + 
self.name + '_' + flavor.name) + + +class SourceFile(object): + """A scanned source file that contains implementation code.""" + + path: str + impls: Sequence[FunctionImpl] + + def __init__(self, path): + self.path = path + self.impls = [] + + +class BuildMix(object): + """A combination of build flavors that make up one possible way of building all +the code. The output of a mix is a library that dispatches functions within the +mixed flavors. For example, when building a binary that is intended to run on +generic ARM systems, a mix could be used that includes flavors for ARMv6, ARMv7, +and ARMv8. + +The order of flavors within a mix is significant. At runtime, flavors will be tried +in order until a supported flavor is found; so more efficient flavors should be +specified first.""" + + name: str + description: str + flavors: Sequence[BuildFlavor] + wisdom: Mapping[Function,Sequence[str]] + + def __init__(self, + name: str, + description: str, + flavors: Iterable[BuildFlavor], + wisdom: Mapping[Function,Iterable[str]] = {}): + self.name = name + self.description = description + self.flavors = tuple(flavors) + self.wisdom = dict( (k,tuple(v)) for k, v in wisdom.items() ) + + @property + def macro(self): + return 'STARCH_MIX_' + self.name.upper() + + def function_wisdom(self, function) -> Sequence[str]: + return self.wisdom.get(function, []) + +class Generator(object): + functions: MutableMapping[str, Function] + features: MutableMapping[str, Feature] + features_by_macro: MutableMapping[str, Feature] + flavors: MutableMapping[str, BuildFlavor] + function_impls: MutableMapping[str, FunctionImpl] + impl_files: MutableSequence[SourceFile] + benchmark_files: MutableSequence[SourceFile] + mixes: MutableMapping[str, BuildMix] + symbol_prefix: str + templates: mako.lookup.TemplateLookup + generated_include_path: str + generated_flavor_pattern: str + generated_dispatcher_path: str + generated_benchmark_path: str + generated_makefile_pattern: str + includes: 
MutableSequence[str] = [] + + def __init__(self, + runtime_dir: str, + output_dir: str, + template_dir: Optional[str] = None, + mako_dir: Optional[str] = None, + generated_include_path: str = 'starch.h', + generated_flavor_pattern: str = 'flavor.{0}.c', + generated_dispatcher_path: str = 'dispatcher.c', + generated_benchmark_path: str = 'benchmark.c', + generated_makefile_pattern: str = 'makefile.{0}', + symbol_prefix: str = 'starch_', + prefix_function_symbols: bool = True): + self.runtime_dir = runtime_dir + self.output_dir = output_dir + self.generated_include_path = os.path.join(output_dir, generated_include_path) + self.generated_flavor_pattern = generated_flavor_pattern + self.generated_dispatcher_path = os.path.join(output_dir, generated_dispatcher_path) + self.generated_benchmark_path = os.path.join(output_dir, generated_benchmark_path) + self.generated_makefile_pattern = generated_makefile_pattern + self.symbol_prefix = symbol_prefix + self.prefix_function_symbols = prefix_function_symbols + + if template_dir is None and '__file__' in globals(): + template_dir = os.path.join(os.path.dirname(__file__), 'templates') + if template_dir is None: + raise RuntimeError('cannot determine template directory location, please specify template_dir') + self.templates = mako.lookup.TemplateLookup(directories = [template_dir], module_directory = mako_dir, imports=['import os']) + + self.functions = {} + self.features = {} + self.features_by_macro = {} + self.flavors = {} + self.function_impls = {} + self.impl_files = [] + self.benchmark_files = [] + self.mixes = {} + self.includes = [] + + def generated_flavor_path(self, flavor: BuildFlavor) -> str: + return os.path.join(self.output_dir, self.generated_flavor_pattern.format(flavor.name)) + + def generated_makefile_path(self, mix: BuildMix) -> str: + return os.path.join(self.output_dir, self.generated_makefile_pattern.format(mix.name)) + + def add_include(self, what): + if what[0] == '<' or what[0] == '"': + 
self.includes.append(what) + else: + self.includes.append('"' + what + '"') + + def add_feature(self, + name: str, + description: str): + if name in self.features: + raise RuntimeError('duplicated flavor: ' + name) + feature = Feature(self, name, description) + self.features[name] = self.features_by_macro[feature.macro] = feature + + def get_feature(self, key: Union[str, Feature]) -> Feature: + if isinstance(key, Feature): + return key + return self.features[key] + + def get_feature_macro(self, key: str) -> Optional[Feature]: + return self.features_by_macro.get(key, None) + + def add_function(self, + name: str, + argtypes: Iterable[str], + returntype: str = 'void', + argnames: Optional[Iterable[str]] = None, + aligned: bool = False): + if name in self.functions: + raise RuntimeError('duplicated function: ' + name) + + base_function = Function(self, name, argtypes, returntype, argnames, aligned = False) + aligned_function: Optional[Function] = None + if aligned: + aligned_function = Function(self, name + '_aligned', argtypes, returntype, argnames, aligned = True) + base_function.aligned_pair = aligned_function + aligned_function.aligned_pair = base_function + + self.functions[base_function.name] = base_function + if aligned_function: + self.functions[aligned_function.name] = aligned_function + + def get_function(self, key: Union[str, Function]) -> Function: + if isinstance(key, Function): + return key + return self.functions[key] + + def add_flavor(self, + name: str, + description: str, + compile_flags: Iterable[str] = (), + features: Iterable[Union[Feature,str]] = (), + test_function: Optional[str] = None, + alignment: int = 1): + if name in self.flavors: + raise RuntimeError('duplicated flavor: ' + name) + resolved_features = map(self.get_feature, features) + self.flavors[name] = BuildFlavor(self, name, description, compile_flags, resolved_features, test_function, alignment) + + def get_flavor(self, key: Union[str, BuildFlavor]) -> BuildFlavor: + if 
isinstance(key, BuildFlavor): + return key + return self.flavors[key] + + def load_wisdom(self, path: str) -> Mapping[str,Sequence[str]]: + results: Mapping[Function,Sequence[str]] = {} + + try: + f = open(path, 'r') + except IOError: + self.warning(None, None, f"ignoring missing wisdom file {path}") + return results + + with f: + for line in f: + line = line.strip() + if line == '' or line.startswith('#'): + continue + + parts = re.split('\s+', line) + if len(parts) < 2: + continue + + func, impl = parts[:2] + if func in self.functions: + results.setdefault(self.functions[func], []).append(impl) + else: + self.warning(None, None, f"ignoring unknown function {func} in wisdom file {path}") + + return results + + def add_mix(self, + name: str, + description: str, + flavors: Iterable[Union[BuildFlavor,str]], + wisdom: Mapping[Union[Function,str],Iterable[str]] = {}, + wisdom_file: Optional[str] = None): + if name in self.mixes: + raise RuntimeError('duplicated mix: ' + name) + + resolved_flavors = map(self.get_flavor, flavors) + if wisdom_file: + resolved_wisdom = self.load_wisdom(wisdom_file) + else: + resolved_wisdom = dict( (self.get_function(name), values) for name,values in wisdom.items() ) + self.mixes[name] = BuildMix(name, description, resolved_flavors, resolved_wisdom) + + def sym(self, symbol: str) -> str: + return self.symbol_prefix + symbol + + def build_impls(self, source: SourceFile, lineno: int, function_name: str, impl_name: str, feature_name: Optional[str] = None) -> Sequence[FunctionImpl]: + if function_name not in self.functions: + self.warning(source, lineno, f"implementation defined for unknown function '{function_name}', skipped") + return [] + + function = self.functions[function_name] + + feature: Optional[Feature] = None + if feature_name is not None: + if feature_name not in self.features_by_macro: + self.warning(source, lineno, f"implementation {function_name} ({impl_name}) requires unknown feature '{feature_name}', skipped") + return [] + 
feature = self.features_by_macro.get(feature_name) + + result = [FunctionImpl(gen = self, + function = function, + name = impl_name, + source = source, + lineno = lineno, + feature = feature)] + + if function.aligned_pair: + result.append(FunctionImpl(gen = self, + function = function.aligned_pair, + name = impl_name, + source = source, + lineno = lineno, + feature = feature)) + + return result + + def add_impl(self, impl): + key = (impl.function, impl.name) + old = self.function_impls.get(key) + if old: + self.warning('duplicate definition of {impl.function.name} / {impl.name}, previously defined at {old.location[0]}:{old.location[1]}') + return + self.function_impls[key] = impl + impl.function.impls.append(impl) + impl.source.impls.append(impl) + + def warning(self, source: Optional[SourceFile], lineno: Optional[int], message): + if source is not None: + if lineno is not None: + print(f'{source.path}:{lineno}: warning: {message}', file=sys.stderr) + else: + print(f'{source.path}: warning: {message}', file=sys.stderr) + else: + print(f'warning: {message}', file=sys.stderr) + + def scan_file(self, path: str): + source = SourceFile(path) + + match_impl = re.compile(r'''[^a-zA-Z0-9_]+ STARCH_IMPL \s* \( \s* # macro call + ([a-zA-Z0-9_]+) \s* , \s* # function name + ([a-zA-Z0-9_]+) \s* \) # implementation name + ''', re.VERBOSE) + match_impl_requires = re.compile(r'''[^a-zA-Z0-9_]+ STARCH_IMPL_REQUIRES \s* \( \s* # macro call + ([a-zA-Z0-9_]+) \s* , \s* # function name + ([a-zA-Z0-9_]+) \s* , \s* # implementation name + ([a-zA-Z0-9_]+) \s* \) # feature name + ''', re.VERBOSE) + + match_benchmark = re.compile(r'''[^a-zA-Z0-9_]+ STARCH_BENCHMARK \s* \( \s* # macro call + ([a-zA-Z0-9_]+) \s* \) # function name + ''', re.VERBOSE) + + match_verify = re.compile(r'''[^a-zA-Z0-9_]+ STARCH_BENCHMARK_VERIFY \s* \( \s* # macro call + ([a-zA-Z0-9_]+) \s* \) # function name + ''', re.VERBOSE) + + has_benchmark = has_impl = has_benchmark_verify = False + with open(path, 'r') as f: 
+ for lineno, line in enumerate(f): + if line[0] == '#': + continue # ignore preprocessor lines + + for match in match_impl.finditer(line): + for impl in self.build_impls(source, lineno, match.group(1), match.group(2)): + has_impl = True + self.add_impl(impl) + + for match in match_impl_requires.finditer(line): + for impl in self.build_impls(source, lineno, match.group(1), match.group(2), match.group(3)): + has_impl = True + self.add_impl(impl) + + for match in match_benchmark.finditer(line): + function_name = match.group(1) + if function_name in self.functions: + function = self.functions[function_name] + if function.benchmark: + self.warning(source, lineno, f"duplicate benchmark defined for unknown function {function_name}") + function.benchmark = source + if function.aligned_pair: + function.aligned_pair.benchmark = source + has_benchmark = True + else: + self.warning(source, lineno, f"benchmark defined for unknown function {function_name}, ignored") + + for match in match_verify.finditer(line): + function_name = match.group(1) + if function_name in self.functions: + function = self.functions[function_name] + if function.benchmark_verify: + self.warning(source, lineno, f"duplicate benchmark verifier defined for unknown function {function_name}") + function.benchmark_verify = source + if function.aligned_pair: + function.aligned_pair.benchmark_verify = source + has_benchmark_verify = True + else: + self.warning(source, lineno, f"benchmark verifier defined for unknown function {function_name}, ignored") + + if has_impl: + self.impl_files.append(source) + if has_benchmark or has_benchmark_verify: + self.benchmark_files.append(source) + + def render(self, template_path, output_path, **kwargs): + t = self.templates.get_template(template_path) + result = t.render(gen=self, current_dir=os.path.dirname(output_path), **kwargs) + + if os.path.exists(output_path): + with open(output_path, 'r') as f: + contents = f.read() + if contents == result: + print(f'unchanged: 
{output_path}', file=sys.stderr) + return + + with open(output_path, 'w') as f: + f.write(result) + print(f' wrote: {output_path}', file=sys.stderr) + + def generate(self): + if not self.functions: + self.warning(None, None, 'no functions defined') + if not self.flavors: + self.warning(None, None, 'no flavors defined') + if not self.mixes: + self.warning(None, None, 'no mixes defined') + for function in self.functions.values(): + if not function.impls: + self.warning(None, None, f'no implementations of function {function.name} provided') + + self.render('/starch.h.template', self.generated_include_path) + + for name, flavor in self.flavors.items(): + self.render('/flavor.c.template', self.generated_flavor_path(flavor), flavor=flavor) + + self.render('/dispatcher.c.template', self.generated_dispatcher_path) + self.render('/benchmark.c.template', self.generated_benchmark_path) + + for name, mix in self.mixes.items(): + self.render('/makefile.template', self.generated_makefile_path(mix), mix=mix) + diff --git a/starch/stubs/mako/__init__.pyi b/starch/stubs/mako/__init__.pyi new file mode 100644 index 0000000..792da1d --- /dev/null +++ b/starch/stubs/mako/__init__.pyi @@ -0,0 +1,4 @@ +# -*- python -*- + +# typing stubs for mako + diff --git a/starch/stubs/mako/lookup.pyi b/starch/stubs/mako/lookup.pyi new file mode 100644 index 0000000..792b046 --- /dev/null +++ b/starch/stubs/mako/lookup.pyi @@ -0,0 +1,16 @@ +# -*- python -*- + +# typing stubs for mako + +from mako.template import Template +from typing import List, Optional + +class TemplateCollection(object): + def get_template(self, uri: str, relativeto: Optional[str] = None) -> Template: ... + +class TemplateLookup(TemplateCollection): + def __init__(self,directories: Optional[List[str]] = None, + module_directory: Optional[str] = None, + imports: Optional[List[str]] = None): ... 
+ + diff --git a/starch/stubs/mako/template.pyi b/starch/stubs/mako/template.pyi new file mode 100644 index 0000000..2076b1a --- /dev/null +++ b/starch/stubs/mako/template.pyi @@ -0,0 +1,6 @@ +# -*- python -*- + +# typing stubs for mako + +class Template(object): + def render(self, *args, **kwargs) -> str: ... diff --git a/starch/templates/benchmark.c.template b/starch/templates/benchmark.c.template new file mode 100644 index 0000000..c77bd59 --- /dev/null +++ b/starch/templates/benchmark.c.template @@ -0,0 +1,490 @@ +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +/* starch generated code. Do not edit. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "${os.path.relpath(gen.generated_include_path, current_dir)}" + +typedef struct timespec starch_benchmark_time; + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end); +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size); +void starch_benchmark_aligned_free(void *user_ptr); +void starch_benchmark_get_time(starch_benchmark_time *t); + +const unsigned starch_benchmark_warmup_loops = 10; + +typedef struct { + const char *name; + const char *impl; + uint64_t ns; +} starch_benchmark_result; + +static starch_benchmark_result *starch_benchmark_results = NULL; +static unsigned starch_benchmark_result_size = 0; +static unsigned starch_benchmark_result_count = 0; + +typedef struct benchmark_flavor_list_node { + const char *flavor; + struct benchmark_flavor_list_node *next; +} starch_benchmark_flavor_list; + +static starch_benchmark_flavor_list *starch_benchmark_flavor_whitelist = NULL; +static starch_benchmark_flavor_list *starch_benchmark_flavor_blacklist = NULL; + +static bool starch_benchmark_list_only = false; +static bool starch_benchmark_validate_only = false; +static bool starch_benchmark_validation_failed = 
/*
 * Allocate a benchmark buffer of `size` bytes.
 *
 * The returned pointer is aligned to the larger of `alignment` (the
 * alignment promised to the code under test) and `type_alignment` (the
 * natural alignment of the element type), and is deliberately NOT aligned
 * to twice that value, so a benchmark never accidentally measures the
 * performance of a stricter alignment than requested.
 *
 * Returns NULL, after printing a message to stderr, if either alignment is
 * zero, if the two alignments conflict, if the request is too large, or if
 * the underlying allocation fails.
 *
 * The result must be released with starch_benchmark_aligned_free().
 */
void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size)
{
    /* a zero alignment is meaningless and would divide by zero below */
    if (alignment == 0 || type_alignment == 0) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: alignment must be nonzero (%zu versus %zu)\n", size, alignment, type_alignment);
        return NULL;
    }

    size_t use_alignment = (type_alignment > alignment ? type_alignment : alignment);
    if ( (use_alignment % type_alignment) || (use_alignment % alignment) ) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: conflicting alignment requirements (%zu versus %zu)\n", size, alignment, type_alignment);
        return NULL;
    }

    /* Over-allocate so we can stash our own pointer before the start, and so that we can adjust
     * the returned alignment so it is only aligned to the requested boundary, and not also
     * aligned to a larger power of two (we don't want to accidentally benchmark the performance
     * of a more restrictive larger alignment)
     */
    size_t header_size = (use_alignment < sizeof(void*) ? sizeof(void*) : use_alignment);

    /* reject requests where header + size + padding would overflow size_t */
    if (use_alignment > SIZE_MAX / 4 || size > SIZE_MAX - header_size - 2 * use_alignment) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: request too large\n", size);
        return NULL;
    }

    /* C11 aligned_alloc wants the size to be a multiple of the alignment */
    size_t total_size = header_size + size + use_alignment;
    size_t excess = total_size % use_alignment;
    if (excess)
        total_size += use_alignment - excess;

    char *block_ptr = aligned_alloc(use_alignment, total_size);
    if (!block_ptr) {
        fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(errno));
        return NULL;
    }

    char *user_ptr = block_ptr + header_size;
    if ( (uintptr_t)user_ptr % (use_alignment * 2) == 0 ) {
        // user_ptr is aligned to the next power of two, but we don't want that, move it on
        user_ptr += use_alignment;
    }

    /* Stash the real block pointer immediately before the user pointer.
     * When use_alignment is smaller than sizeof(void*), that location is not
     * suitably aligned for a pointer object, so copy the bytes rather than
     * storing through a void ** */
    memcpy(user_ptr - sizeof(block_ptr), &block_ptr, sizeof(block_ptr));

    return user_ptr;
}

/*
 * Release a buffer returned by starch_benchmark_aligned_alloc().
 * Passing NULL is a no-op.
 */
void starch_benchmark_aligned_free(void *user_ptr)
{
    if (!user_ptr)
        return;

    /* recover the stashed block pointer; see starch_benchmark_aligned_alloc */
    void *block_ptr;
    memcpy(&block_ptr, (char *)user_ptr - sizeof(block_ptr), sizeof(block_ptr));
    free(block_ptr);
}
starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( ${function.named_arglist} ); + + % if function.benchmark_verify: + /* verify correctness of the output */ + if (! ${function.benchmark_verify_symbol} ( ${function.named_arglist} )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + % else: + if (starch_benchmark_validate_only) { + fprintf(stderr, "no validator defined\n"); + return; + } + % endif + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( ${function.named_arglist} ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( ${function.named_arglist} ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if 
(starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "${function.name}"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_${function.name}( ${function.declaration_arglist } ) +{ + for (${function.regentry_type} *_entry = ${function.registry_symbol}; _entry->name; ++_entry) { + starch_benchmark_one_${function.name}( _entry, ${function.named_arglist} ); + } +} + +% endfor + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _benchmark_sym +#define STARCH_IMPL(_function,_impl) ${gen.symbol_prefix} ## _function ## _ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) ${gen.symbol_prefix} ## _function ## _benchmark +#define STARCH_BENCHMARK_VERIFY(_function) ${gen.symbol_prefix} ## _function ## _benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + +% for source in gen.benchmark_files: +#include "${os.path.relpath(source.path, current_dir)}" +% endfor + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES +#undef STARCH_BENCHMARK +#undef STARCH_BENCHMARK_VERIFY +#undef STARCH_BENCHMARK_RUN +#undef STARCH_BENCHMARK_ALLOC +#undef STARCH_BENCHMARK_FREE + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _aligned_benchmark_sym +#define STARCH_IMPL(_function,_impl) ${gen.symbol_prefix} ## _function ## _aligned_ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) ${gen.symbol_prefix} ## _function ## _aligned_benchmark +#define STARCH_BENCHMARK_VERIFY(_function) ${gen.symbol_prefix} ## _function ## _aligned_benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ## _aligned ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + +% for source in gen.benchmark_files: + % if any( (function.aligned and function.benchmark == source) for function in gen.functions.values() ): +#include "${os.path.relpath(source.path, current_dir)}" + % endif +% endfor + +% for function in functions_to_benchmark: +static void starch_benchmark_all_${function.name}(void) +{ + fprintf(stderr, "==== ${function.name} ===\n"); + ${gen.symbol_prefix}${function.name}_benchmark (); +} +% endfor + +static int starch_benchmark_compare_result(const void *a, const void *b) +{ + const starch_benchmark_result *left = (const starch_benchmark_result *) a; + const starch_benchmark_result *right = (const starch_benchmark_result *) b; + + int name_cmp = strcmp(left->name, right->name); + if (name_cmp) + return name_cmp; + + if (left->ns < right->ns) + return -1; + if (left->ns > right->ns) + return 1; + return 0; +} + +static void starch_benchmark_usage(const char *argv0) +{ + fprintf(stderr, + "Usage: %s [OPTION]... [FUNCTION]...\n" + "Benchmarks starch functions and optionally writes a sorted wisdom file.\n" + "\n" + " -r FILE Read initial wisdom from FILE\n" + " -o FILE Write sorted wisdom to FILE\n" + " -F FLAVOR Add FLAVOR to whitelist\n" + " (default: no whitelist, run all runtime-supported flavors)\n" + " -N FLAVOR Add FLAVOR to blacklist\n" + " (default: no blacklist, run all runtime-supported flavors)\n" + " -l List compiled-in implementations but don't benchmark them\n" + " -V Run validation tests, but don't run benchmarks\n" + " -t Include only the top candidate per function in wisdom output\n" + " -i ITERS Run benchmark ITERS times and use the mean. 
If ITERS > 2, ignore\n" + " the smallest and largest runs when calculating the mean.\n" + " (default: 1 iteration)\n" + " FUNCTION Run benchmarks for these functions only\n" + " (default: benchmark all functions)\n" + "\n" + "Supported flavors: " +% for flavor in gen.flavors.values(): +#ifdef ${flavor.macro} + "${flavor.name} " +#endif +% endfor + "\n" + "Supported functions: " +% for function in gen.functions.values(): + "${function.name} " +% endfor + "\n", argv0); +} + +static void starch_benchmark_append_flavor(const char *flavor, starch_benchmark_flavor_list **list) +{ + starch_benchmark_flavor_list *newnode = malloc(sizeof(*newnode)); + if (!newnode) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + exit(1); + } + + newnode->flavor = flavor; + newnode->next = *list; + *list = newnode; +} + +int main(int argc, char **argv) +{ + int specific = 0; + const char *output_path = NULL; + + int opt; + while ((opt = getopt(argc, argv, "r:o:F:N:i:lhtV")) != -1) { + switch (opt) { + case 'r': + if (${gen.sym("read_wisdom")}(optarg) < 0) { + fprintf(stderr, "%s: cannot read %s: %s\n", argv[0], optarg, strerror(errno)); + return 1; + } + fprintf(stderr, "%s: loaded wisdom file %s\n", argv[0], optarg); + break; + + case 'o': + output_path = optarg; + break; + + case 'F': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_whitelist); + break; + + case 'N': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_blacklist); + break; + + case 'l': + starch_benchmark_list_only = true; + break; + + case 't': + starch_benchmark_top_only = true; + break; + + case 'i': + 
starch_benchmark_iterations = atoi(optarg); + break; + + case 'V': + starch_benchmark_validate_only = true; + break; + + case 'h': + starch_benchmark_usage(argv[0]); + return 0; + + case '?': + default: + starch_benchmark_usage(argv[0]); + return 2; + } + } + + if (starch_benchmark_list_only && output_path) { + fprintf(stderr, "%s: -o and -l options cannot be specified together\n", argv[0]); + return 2; + } + + for (int i = optind; i < argc; ++i) { +% for function in gen.functions.values(): + if (!strcmp(argv[i], "${function.name}")) { + specific = 1; +% if function.benchmark: + starch_benchmark_all_${function.name}(); +% else: + fprintf(stderr, "=== ${function.name} ===\n"); + fprintf(stderr, " (no benchmark support defined)\n"); +% endif + continue; + } +% endfor + + fprintf(stderr, "%s: unrecognized function name: %s\n", argv[0], argv[i]); + return 2; + } + + if (!specific) { +% for function in gen.functions.values(): + % if function.benchmark: + starch_benchmark_all_${function.name}(); + % else: + fprintf(stderr, "=== ${function.name} ===\n"); + fprintf(stderr, " (no benchmark support defined)\n"); + % endif +% endfor + } + + if (output_path) { + FILE *out = fopen(output_path, "w"); + if (!out) { + fprintf(stderr, "%s: cannot open %s: %s\n", argv[0], output_path, strerror(errno)); + return 1; + } + + fprintf(out, "# generated by "); + for (int i = 0; i < argc; ++i) + fprintf(out, "%s ", argv[i]); + fprintf(out, "\n\n"); + + qsort(starch_benchmark_results, starch_benchmark_result_count, sizeof(*starch_benchmark_results), starch_benchmark_compare_result); + + const char *last_name = NULL; + bool first = true; + for (unsigned i = 0; i < starch_benchmark_result_count; ++i) { + starch_benchmark_result *result = &starch_benchmark_results[i]; + if (last_name && strcmp(last_name, result->name) != 0) { + fprintf(out, "\n"); + first = true; + } + last_name = result->name; + if (starch_benchmark_top_only && !first) + continue; + fprintf(out, "%-40s %-40s # %" PRIu64 " 
ns/call\n", result->name, result->impl, result->ns); + first = false; + } + + fclose(out); + fprintf(stderr, "%s: wrote sorted wisdom to %s\n", argv[0], output_path); + } + + return starch_benchmark_validation_failed ? 1 : 0; +} diff --git a/starch/templates/dispatcher.c.template b/starch/templates/dispatcher.c.template new file mode 100644 index 0000000..cb3dbc3 --- /dev/null +++ b/starch/templates/dispatcher.c.template @@ -0,0 +1,206 @@ +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +/* starch generated code. Do not edit. */ + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "${os.path.relpath(gen.generated_include_path, current_dir)}" + +/* helper for re-sorting registries */ +struct starch_regentry_prefix { + int rank; +}; + +static int starch_regentry_rank_compare (const void *l, const void *r) +{ + const struct starch_regentry_prefix *left = l, *right = r; + return left->rank - right->rank; +} + +% for function in gen.functions.values(): +/* dispatcher / registry for ${function.name} */ + +${function.regentry_type} * ${function.select_symbol}() { + for (${function.regentry_type} *entry = ${function.registry_symbol}; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static ${function.returntype} ${function.dispatcher_symbol} ( ${function.declaration_arglist} ) { + ${function.regentry_type} *entry = ${function.select_symbol}(); + if (!entry) + abort(); + + ${function.callable_symbol} = entry->callable; +% if function.returntype == 'void': + ${function.callable_symbol} ( ${function.named_arglist} ); +% else: + return ${function.callable_symbol} ( ${function.named_arglist} ); +% endif +} + +${function.pointer_type} ${function.callable_symbol} = ${function.dispatcher_symbol}; + +void ${function.set_wisdom_symbol} (const char * const * received_wisdom) +{ + /* re-rank the registry based on received 
wisdom */ + ${function.regentry_type} *entry; + for (entry = ${function.registry_symbol}; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - ${function.registry_symbol}); + } + } + + /* re-sort based on the new ranking */ + qsort(${function.registry_symbol}, entry - ${function.registry_symbol}, sizeof(${function.regentry_type}), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + ${function.callable_symbol} = ${function.dispatcher_symbol}; +} + +${function.regentry_type} ${function.registry_symbol}[] = { +% for mix in gen.mixes.values(): + <% + # gather all implementations for this mix, sort by wisdom + def rank_key(value, wisdom=mix.function_wisdom(function)): + impl, flavor = value + try: + return wisdom.index(impl.wisdom_name(flavor)) + except ValueError: + return len(wisdom) + + mix_impls = [] + for flavor in mix.flavors: + if function.aligned and function.aligned_pair: + if flavor.alignment > 1: + # add aligned impls + for impl in function.impls: + if impl.feature is None or impl.feature in flavor.features: + mix_impls.append( (impl, flavor) ) + # add unaligned impls + for impl in function.aligned_pair.impls: + if impl.feature is None or impl.feature in flavor.features: + mix_impls.append( (impl, flavor) ) + else: + # no alignment specialization + for impl in function.impls: + if impl.feature is None or impl.feature in flavor.features: + mix_impls.append( (impl, flavor) ) + + mix_impls.sort(key=rank_key) + %> +#ifdef ${mix.macro} + % for rank, (impl, flavor) in enumerate(mix_impls): + { ${rank}, 
"${impl.wisdom_name(flavor)}", "${flavor.name}", ${impl.impl_symbol(flavor)}, ${flavor.test_function_expr} }, + % endfor +#endif /* ${mix.macro} */ +% endfor + { 0, NULL, NULL, NULL, NULL } +}; + +% endfor + +int ${gen.sym("read_wisdom")} (const char * path) +{ + FILE *fp = fopen(path, "r"); + if (!fp) + return -1; + + /* reset all ranks to identify entries not listed in the wisdom file; we'll assign ranks at the end to produce a stable sort */ +% for function in gen.functions.values(): + int rank_${function.name} = 0; + for (${function.regentry_type} *entry = ${function.registry_symbol}; entry->name; ++entry) { + entry->rank = 0; + } +% endfor + + char linebuf[512]; + while (fgets(linebuf, sizeof(linebuf), fp)) { + /* split name and impl on whitespace, handle comments etc */ + char *name = linebuf; + while (*name && isspace(*name)) + ++name; + + if (!*name || *name == '#') + continue; + + char *end = name; + while (*end && !isspace(*end)) + ++end; + + if (!*end) + continue; + *end = 0; + + char *impl = end + 1; + while (*impl && isspace(*impl)) + ++impl; + + if (!*impl) + continue; + + end = impl; + while (*end && !isspace(*end)) + ++end; + + *end = 0; + + /* try to find a matching registry entry */ +% for function in gen.functions.values(): + if (!strcmp(name, "${function.name}")) { + for (${function.regentry_type} *entry = ${function.registry_symbol}; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_${function.name}; + break; + } + } + continue; + } +% endfor + } + + if (ferror(fp)) { + fclose(fp); + return -1; + } + + fclose(fp); + + /* assign ranks to unmatched items to (stable) sort them last; re-sort everything */ +% for function in gen.functions.values(): + { + ${function.regentry_type} *entry; + for (entry = ${function.registry_symbol}; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_${function.name}; + } + qsort(${function.registry_symbol}, entry - ${function.registry_symbol}, 
sizeof(${function.regentry_type}), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + ${function.callable_symbol} = ${function.dispatcher_symbol}; + } +% endfor + + return 0; +} diff --git a/starch/templates/flavor.c.template b/starch/templates/flavor.c.template new file mode 100644 index 0000000..227ffa9 --- /dev/null +++ b/starch/templates/flavor.c.template @@ -0,0 +1,48 @@ +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +/* starch generated code. Do not edit. */ + +#define ${flavor.macro} +% for feature in flavor.features: +#define ${feature.macro} +% endfor + +#include "${os.path.relpath(gen.generated_include_path, current_dir)}" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _ ## ${flavor.name} +#define STARCH_IMPL(_function,_impl) ${gen.symbol_prefix} ## _function ## _ ## _impl ## _ ## ${flavor.name} +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +% for source in gen.impl_files: + % if any( ((impl.feature is None or impl.feature in flavor.features) and not impl.function.aligned) for impl in source.impls): +#include "${os.path.relpath(source.path, current_dir)}" + % endif +% endfor + +% if flavor.alignment > 1: + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _aligned_ ## ${flavor.name} +#define STARCH_IMPL(_function,_impl) ${gen.symbol_prefix} ## _function ## _aligned_ ## _impl ## _ ## ${flavor.name} +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +% for source in gen.impl_files: + % 
if any( ((impl.feature is None or impl.feature in flavor.features) and impl.function.aligned) for impl in source.impls): +#include "${os.path.relpath(source.path, current_dir)}" + % endif +% endfor + +% endif diff --git a/starch/templates/makefile.template b/starch/templates/makefile.template new file mode 100644 index 0000000..3fae92c --- /dev/null +++ b/starch/templates/makefile.template @@ -0,0 +1,57 @@ +# -*- makefile -*- + +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -D${mix.macro} +<% + import os + o_files = [] + impl_c_files = ' '.join( map(lambda x, gen=gen: os.path.relpath(x.path, gen.runtime_dir), gen.impl_files) ) + benchmark_c_files = ' '.join( map(lambda x, gen=gen: os.path.relpath(x.path, gen.runtime_dir), gen.benchmark_files) ) +%> +% for flavor in mix.flavors: +<% + c_file = os.path.relpath(gen.generated_flavor_path(flavor), gen.runtime_dir) + o_file = 
os.path.splitext(c_file)[0] + '.o' + o_files.append(o_file) +%> +${o_file}: ${c_file} ${impl_c_files} + $(STARCH_COMPILE) $(STARCH_CFLAGS) ${flavor.cflags} ${c_file} -o ${o_file} +% endfor +<% + c_file = os.path.relpath(gen.generated_dispatcher_path, gen.runtime_dir) + o_file = os.path.splitext(c_file)[0] + '.o' + o_files.append(o_file) + %> +${o_file}: ${c_file} ${impl_c_files} + $(STARCH_COMPILE) $(STARCH_CFLAGS) ${c_file} -o ${o_file} + +STARCH_OBJS := ${' '.join(o_files)} + +<% + c_file = os.path.relpath(gen.generated_benchmark_path, gen.runtime_dir) + o_file = os.path.splitext(c_file)[0] + '.o' + %> +${o_file}: ${c_file} ${benchmark_c_files} + $(STARCH_COMPILE) $(STARCH_CFLAGS) ${c_file} -o ${o_file} + +STARCH_BENCHMARK_OBJ := ${o_file} diff --git a/starch/templates/starch.h.template b/starch/templates/starch.h.template new file mode 100644 index 0000000..ae77b59 --- /dev/null +++ b/starch/templates/starch.h.template @@ -0,0 +1,68 @@ +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +/* starch generated code. Do not edit. 
*/ + +% for include in gen.includes: +#include ${include} +% endfor + +/* mixes */ + +% for mix in gen.mixes.values(): +/* ${mix.description} */ +#ifdef ${mix.macro} + % for flavor in mix.flavors: +#define ${flavor.macro} + % endfor +#define STARCH_MIX_ALIGNMENT ${max((flavor.alignment) for flavor in mix.flavors)} +#endif /* ${mix.macro} */ + +% endfor + +#ifdef STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_IS_ALIGNED(_ptr) (((uintptr_t)(_ptr) & (STARCH_MIX_ALIGNMENT-1)) == 0) +#else +/* mix not defined, alignment is unknown, treat everything as unaligned */ +#define STARCH_IS_ALIGNED(_ptr) (0) +#endif + + +/* entry points and registries */ + +% for function in gen.functions.values(): +typedef ${function.returntype} (* ${function.pointer_type}) ( ${function.declaration_arglist} ); +extern ${function.pointer_type} ${function.callable_symbol}; + +typedef struct { + int rank; + const char *name; + const char *flavor; + ${function.pointer_type} callable; + int (*flavor_supported)(); +} ${function.regentry_type}; + +extern ${function.regentry_type} ${function.registry_symbol}[]; +${function.regentry_type} * ${function.select_symbol}(); +void ${function.set_wisdom_symbol}( const char * const * received_wisdom ); + +% endfor +/* flavors and prototypes */ + +% for flavor in gen.flavors.values(): +#ifdef ${flavor.macro} + % if flavor.test_function is not None: +int ${flavor.test_function} (void); + % endif + % for impl in gen.function_impls.values(): + % if (flavor.alignment > 1 or not impl.function.aligned) and (impl.feature is None or impl.feature in flavor.features): +${impl.function.returntype} ${impl.impl_symbol(flavor)} ( ${impl.function.declaration_arglist} ); + % endif + % endfor +#endif /* ${flavor.macro} */ + +% endfor +/* wisdom file loader; declared once, outside the per-flavor loop */ +int ${gen.sym("read_wisdom")} (const char * path); diff --git a/wisdom.arm b/wisdom.arm new file mode 100644 index 0000000..96dae84 --- /dev/null +++ b/wisdom.arm @@ -0,0 +1,31 @@ +# derived from wisdom.pi4b 
and wisdom.pi0w + +magnitude_power_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 225511 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 5464685 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 212204 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 5516196 ns/call + +magnitude_sc16 neon_vrsqrte_armv7a_neon_vfpv4 # 684978 ns/call +magnitude_sc16 exact_float_generic # 28623479 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 639779 ns/call +magnitude_sc16_aligned exact_float_generic # 28613950 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv7a_neon_vfpv4 # 166113 ns/call +magnitude_sc16q11 exact_float_generic # 7131190 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 155221 ns/call +magnitude_sc16q11_aligned exact_float_generic # 7124159 ns/call + +magnitude_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 188746 ns/call +magnitude_uc8 lookup_unroll_4_generic # 4179036 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 187209 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 4445877 ns/call + +mean_power_u16 u32_armv7a_neon_vfpv4 # 45484 ns/call +mean_power_u16 u64_generic # 990367 ns/call + +mean_power_u16_aligned u32_armv7a_neon_vfpv4_aligned # 44929 ns/call +mean_power_u16_aligned u64_generic # 934445 ns/call diff --git a/wisdom.generic b/wisdom.generic new file mode 100644 index 0000000..2c924d3 --- /dev/null +++ b/wisdom.generic @@ -0,0 +1,16 @@ +# some fairly arbitrary defaults for when we don't know the target architecture + +magnitude_power_uc8 twopass_generic +magnitude_power_uc8_aligned twopass_generic + +magnitude_sc16 exact_float_generic +magnitude_sc16_aligned exact_float_generic + +magnitude_sc16q11 exact_float_generic +magnitude_sc16q11_aligned exact_float_generic + +magnitude_uc8 lookup_unroll_4_generic +magnitude_uc8_aligned lookup_unroll_4_generic + +mean_power_u16 u32_generic +mean_power_u16_aligned u32_generic diff --git a/wisdom.x86 
b/wisdom.x86 new file mode 100644 index 0000000..28a7719 --- /dev/null +++ b/wisdom.x86 @@ -0,0 +1,31 @@ +# derived from wisdom.i7-6500u / wisdom.i7-6500u.generic + +magnitude_power_uc8 twopass_x86_avx2 # 65331 ns/call +magnitude_power_uc8 twopass_generic # 72679 ns/call + +magnitude_power_uc8_aligned twopass_x86_avx2_aligned # 66294 ns/call +magnitude_power_uc8_aligned twopass_generic # 68415 ns/call + +magnitude_sc16 exact_float_x86_avx2 # 238602 ns/call +magnitude_sc16 exact_float_generic # 1359997 ns/call + +magnitude_sc16_aligned exact_float_x86_avx2_aligned # 202484 ns/call +magnitude_sc16_aligned exact_float_generic # 1351564 ns/call + +magnitude_sc16q11 exact_float_x86_avx2 # 65311 ns/call +magnitude_sc16q11 exact_float_generic # 513012 ns/call + +magnitude_sc16q11_aligned exact_float_x86_avx2_aligned # 56217 ns/call +magnitude_sc16q11_aligned exact_float_generic # 510226 ns/call + +magnitude_uc8 lookup_unroll_4_x86_avx2 # 53581 ns/call +magnitude_uc8 lookup_unroll_4_generic # 52709 ns/call + +magnitude_uc8_aligned lookup_unroll_4_x86_avx2 # 53870 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 54033 ns/call + +mean_power_u16 u32_x86_avx2 # 11627 ns/call +mean_power_u16 u32_generic # 18252 ns/call + +mean_power_u16_aligned u32_x86_avx2_aligned # 11572 ns/call +mean_power_u16_aligned u32_generic # 18207 ns/call diff --git a/wisdom/wisdom.i7-6500u b/wisdom/wisdom.i7-6500u new file mode 100644 index 0000000..78b93bf --- /dev/null +++ b/wisdom/wisdom.i7-6500u @@ -0,0 +1,90 @@ +# model name : Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz +# +# "performance" cpufreq governor @ 2.50 GHz + +# generated by ./starch-benchmark -i 15 -r wisdom.local -o wisdom.local + +magnitude_power_uc8 twopass_x86_avx2 # 65331 ns/call +magnitude_power_uc8 twopass_generic # 65363 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 67147 ns/call +magnitude_power_uc8 lookup_unroll_4_x86_avx2 # 67202 ns/call +magnitude_power_uc8 lookup_generic # 74612 ns/call +magnitude_power_uc8 
lookup_x86_avx2 # 74801 ns/call + +magnitude_power_uc8_aligned twopass_generic # 66243 ns/call +magnitude_power_uc8_aligned twopass_x86_avx2 # 66258 ns/call +magnitude_power_uc8_aligned twopass_x86_avx2_aligned # 66294 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_x86_avx2_aligned # 67621 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_x86_avx2 # 67657 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 67684 ns/call +magnitude_power_uc8_aligned lookup_generic # 75036 ns/call +magnitude_power_uc8_aligned lookup_x86_avx2_aligned # 75191 ns/call +magnitude_power_uc8_aligned lookup_x86_avx2 # 75335 ns/call + +magnitude_sc16 exact_float_x86_avx2 # 256796 ns/call +magnitude_sc16 exact_u32_x86_avx2 # 300270 ns/call +magnitude_sc16 exact_float_generic # 1357410 ns/call +magnitude_sc16 exact_u32_generic # 2039745 ns/call + +magnitude_sc16_aligned exact_float_x86_avx2_aligned # 225583 ns/call +magnitude_sc16_aligned exact_float_x86_avx2 # 245087 ns/call +magnitude_sc16_aligned exact_u32_x86_avx2_aligned # 265908 ns/call +magnitude_sc16_aligned exact_u32_x86_avx2 # 289047 ns/call +magnitude_sc16_aligned exact_float_generic # 1345505 ns/call +magnitude_sc16_aligned exact_u32_generic # 2037905 ns/call + +magnitude_sc16q11 exact_float_x86_avx2 # 63530 ns/call +magnitude_sc16q11 exact_u32_x86_avx2 # 74567 ns/call +magnitude_sc16q11 exact_float_generic # 524297 ns/call +magnitude_sc16q11 12bit_table_x86_avx2 # 549772 ns/call +magnitude_sc16q11 12bit_table_generic # 551318 ns/call +magnitude_sc16q11 11bit_table_generic # 612628 ns/call +magnitude_sc16q11 11bit_table_x86_avx2 # 612833 ns/call +magnitude_sc16q11 exact_u32_generic # 652008 ns/call + +magnitude_sc16q11_aligned exact_float_x86_avx2_aligned # 56413 ns/call +magnitude_sc16q11_aligned exact_float_x86_avx2 # 61285 ns/call +magnitude_sc16q11_aligned exact_u32_x86_avx2_aligned # 66331 ns/call +magnitude_sc16q11_aligned exact_u32_x86_avx2 # 72272 ns/call +magnitude_sc16q11_aligned exact_float_generic # 
521575 ns/call +magnitude_sc16q11_aligned 12bit_table_x86_avx2 # 549193 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 549588 ns/call +magnitude_sc16q11_aligned 12bit_table_x86_avx2_aligned # 570064 ns/call +magnitude_sc16q11_aligned 11bit_table_x86_avx2 # 616504 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 616961 ns/call +magnitude_sc16q11_aligned 11bit_table_x86_avx2_aligned # 618931 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 650346 ns/call + +magnitude_uc8 lookup_unroll_4_x86_avx2 # 53027 ns/call +magnitude_uc8 lookup_unroll_4_generic # 53081 ns/call +magnitude_uc8 lookup_x86_avx2 # 53482 ns/call +magnitude_uc8 lookup_generic # 53489 ns/call +magnitude_uc8 exact_x86_avx2 # 91623 ns/call +magnitude_uc8 exact_generic # 801481 ns/call + +magnitude_uc8_aligned lookup_unroll_4_x86_avx2 # 53313 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 53329 ns/call +magnitude_uc8_aligned lookup_unroll_4_x86_avx2_aligned # 53358 ns/call +magnitude_uc8_aligned lookup_x86_avx2 # 53692 ns/call +magnitude_uc8_aligned lookup_x86_avx2_aligned # 53790 ns/call +magnitude_uc8_aligned lookup_generic # 55871 ns/call +magnitude_uc8_aligned exact_x86_avx2_aligned # 86939 ns/call +magnitude_uc8_aligned exact_x86_avx2 # 89688 ns/call +magnitude_uc8_aligned exact_generic # 802054 ns/call + +mean_power_u16 u32_x86_avx2 # 11601 ns/call +mean_power_u16 u32_generic # 18249 ns/call +mean_power_u16 float_x86_avx2 # 18556 ns/call +mean_power_u16 u64_x86_avx2 # 31297 ns/call +mean_power_u16 u64_generic # 39618 ns/call +mean_power_u16 float_generic # 105649 ns/call + +mean_power_u16_aligned u32_x86_avx2 # 11606 ns/call +mean_power_u16_aligned u32_x86_avx2_aligned # 11609 ns/call +mean_power_u16_aligned float_x86_avx2 # 18231 ns/call +mean_power_u16_aligned float_x86_avx2_aligned # 18253 ns/call +mean_power_u16_aligned u32_generic # 18254 ns/call +mean_power_u16_aligned u64_x86_avx2_aligned # 31282 ns/call +mean_power_u16_aligned u64_x86_avx2 # 31283 ns/call 
+mean_power_u16_aligned u64_generic # 39639 ns/call +mean_power_u16_aligned float_generic # 105615 ns/call diff --git a/wisdom/wisdom.i7-6500u.generic b/wisdom/wisdom.i7-6500u.generic new file mode 100644 index 0000000..96fbb6b --- /dev/null +++ b/wisdom/wisdom.i7-6500u.generic @@ -0,0 +1,43 @@ +# model name : Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz +# +# "performance" cpufreq governor @ 2.50 GHz + +magnitude_power_uc8 twopass_generic # 72679 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 84247 ns/call +magnitude_power_uc8 lookup_generic # 87929 ns/call + +magnitude_power_uc8_aligned twopass_generic # 68415 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 71632 ns/call +magnitude_power_uc8_aligned lookup_generic # 79056 ns/call + +magnitude_sc16 exact_float_generic # 1350012 ns/call +magnitude_sc16 exact_u32_generic # 2036183 ns/call + +magnitude_sc16_aligned exact_float_generic # 1340202 ns/call +magnitude_sc16_aligned exact_u32_generic # 2035257 ns/call + +magnitude_sc16q11 exact_float_generic # 523422 ns/call +magnitude_sc16q11 12bit_table_generic # 539142 ns/call +magnitude_sc16q11 11bit_table_generic # 613256 ns/call +magnitude_sc16q11 exact_u32_generic # 651178 ns/call + +magnitude_sc16q11_aligned exact_float_generic # 520001 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 539652 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 616597 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 649809 ns/call + +magnitude_uc8 lookup_unroll_4_generic # 56626 ns/call +magnitude_uc8 lookup_generic # 57064 ns/call +magnitude_uc8 exact_generic # 809893 ns/call + +magnitude_uc8_aligned lookup_unroll_4_generic # 58632 ns/call +magnitude_uc8_aligned lookup_generic # 62214 ns/call +magnitude_uc8_aligned exact_generic # 808622 ns/call + +mean_power_u16 u32_generic # 18135 ns/call +mean_power_u16 u64_generic # 39496 ns/call +mean_power_u16 float_generic # 105266 ns/call + +mean_power_u16_aligned u32_generic # 18155 ns/call 
+mean_power_u16_aligned u64_generic # 39493 ns/call +mean_power_u16_aligned float_generic # 105261 ns/call diff --git a/wisdom/wisdom.pi0w b/wisdom/wisdom.pi0w new file mode 100644 index 0000000..0f01ff3 --- /dev/null +++ b/wisdom/wisdom.pi0w @@ -0,0 +1,47 @@ +# Hardware : BCM2835 +# Revision : 9000c1 +# Model : Raspberry Pi Zero W Rev 1.1 +# +# "performance" cpufreq governor @ 1GHz + +# generated by ./starch-benchmark -i 15 -r wisdom.local -o wisdom.local + +magnitude_power_uc8 lookup_unroll_4_generic # 5711147 ns/call +magnitude_power_uc8 twopass_generic # 6205338 ns/call +magnitude_power_uc8 lookup_generic # 6880126 ns/call + +magnitude_power_uc8_aligned lookup_unroll_4_generic # 5750495 ns/call +magnitude_power_uc8_aligned twopass_generic # 6209062 ns/call +magnitude_power_uc8_aligned lookup_generic # 6941100 ns/call + +magnitude_sc16 exact_float_generic # 28623479 ns/call +magnitude_sc16 exact_u32_generic # 28660776 ns/call + +magnitude_sc16_aligned exact_float_generic # 28613950 ns/call +magnitude_sc16_aligned exact_u32_generic # 28671952 ns/call + +magnitude_sc16q11 exact_float_generic # 7142819 ns/call +magnitude_sc16q11 exact_u32_generic # 7146487 ns/call +magnitude_sc16q11 11bit_table_generic # 17820638 ns/call +magnitude_sc16q11 12bit_table_generic # 19280398 ns/call + +magnitude_sc16q11_aligned exact_float_generic # 7130689 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 7152986 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 17872904 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 19332280 ns/call + +magnitude_uc8 lookup_unroll_4_generic # 4420819 ns/call +magnitude_uc8 lookup_generic # 6346327 ns/call +magnitude_uc8 exact_generic # 9264172 ns/call + +magnitude_uc8_aligned lookup_unroll_4_generic # 4699258 ns/call +magnitude_uc8_aligned lookup_generic # 6600661 ns/call +magnitude_uc8_aligned exact_generic # 9308206 ns/call + +mean_power_u16 u64_generic # 976416 ns/call +mean_power_u16 u32_generic # 1040812 ns/call 
+mean_power_u16 float_generic # 1794994 ns/call + +mean_power_u16_aligned u64_generic # 961388 ns/call +mean_power_u16_aligned u32_generic # 1024339 ns/call +mean_power_u16_aligned float_generic # 1778085 ns/call diff --git a/wisdom/wisdom.pi4b b/wisdom/wisdom.pi4b new file mode 100644 index 0000000..09a053c --- /dev/null +++ b/wisdom/wisdom.pi4b @@ -0,0 +1,107 @@ +# Hardware : BCM2711 +# Revision : a03111 +# Model : Raspberry Pi 4 Model B Rev 1.1 +# +# "performance" cpufreq governor @ 1.5GHz + +# generated by ./starch-benchmark -i 15 -r wisdom.local -o wisdom.local + +magnitude_power_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 225494 ns/call +magnitude_power_uc8 twopass_generic # 232985 ns/call +magnitude_power_uc8 twopass_armv7a_neon_vfpv4 # 233043 ns/call +magnitude_power_uc8 lookup_generic # 312890 ns/call +magnitude_power_uc8 lookup_armv7a_neon_vfpv4 # 313395 ns/call +magnitude_power_uc8 lookup_unroll_4_armv7a_neon_vfpv4 # 351108 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 392295 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 212203 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 212204 ns/call +magnitude_power_uc8_aligned twopass_armv7a_neon_vfpv4_aligned # 232057 ns/call +magnitude_power_uc8_aligned twopass_armv7a_neon_vfpv4 # 232072 ns/call +magnitude_power_uc8_aligned twopass_generic # 232141 ns/call +magnitude_power_uc8_aligned lookup_generic # 304510 ns/call +magnitude_power_uc8_aligned lookup_armv7a_neon_vfpv4_aligned # 304855 ns/call +magnitude_power_uc8_aligned lookup_armv7a_neon_vfpv4 # 304863 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4 # 332848 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4_aligned # 333134 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 377063 ns/call + +magnitude_sc16 neon_vrsqrte_armv7a_neon_vfpv4 # 685671 ns/call +magnitude_sc16 exact_u32_armv7a_neon_vfpv4 # 2471841 ns/call +magnitude_sc16 
exact_float_armv7a_neon_vfpv4 # 2488725 ns/call +magnitude_sc16 exact_u32_generic # 3475780 ns/call +magnitude_sc16 exact_float_generic # 3627016 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 645434 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 646233 ns/call +magnitude_sc16_aligned exact_u32_armv7a_neon_vfpv4 # 2464487 ns/call +magnitude_sc16_aligned exact_u32_armv7a_neon_vfpv4_aligned # 2464639 ns/call +magnitude_sc16_aligned exact_float_armv7a_neon_vfpv4_aligned # 2489450 ns/call +magnitude_sc16_aligned exact_float_armv7a_neon_vfpv4 # 2495798 ns/call +magnitude_sc16_aligned exact_u32_generic # 3473976 ns/call +magnitude_sc16_aligned exact_float_generic # 3629034 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv7a_neon_vfpv4 # 166102 ns/call +magnitude_sc16q11 exact_u32_armv7a_neon_vfpv4 # 615312 ns/call +magnitude_sc16q11 exact_float_armv7a_neon_vfpv4 # 822023 ns/call +magnitude_sc16q11 exact_u32_generic # 1151805 ns/call +magnitude_sc16q11 exact_float_generic # 1218908 ns/call +magnitude_sc16q11 11bit_table_armv7a_neon_vfpv4 # 1940816 ns/call +magnitude_sc16q11 12bit_table_armv7a_neon_vfpv4 # 2035932 ns/call +magnitude_sc16q11 12bit_table_generic # 2401932 ns/call +magnitude_sc16q11 11bit_table_generic # 2656593 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 155218 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 155242 ns/call +magnitude_sc16q11_aligned exact_u32_armv7a_neon_vfpv4 # 612259 ns/call +magnitude_sc16q11_aligned exact_u32_armv7a_neon_vfpv4_aligned # 612269 ns/call +magnitude_sc16q11_aligned exact_float_armv7a_neon_vfpv4_aligned # 815733 ns/call +magnitude_sc16q11_aligned exact_float_armv7a_neon_vfpv4 # 821729 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 1154414 ns/call +magnitude_sc16q11_aligned exact_float_generic # 1224252 ns/call +magnitude_sc16q11_aligned 11bit_table_armv7a_neon_vfpv4 # 1940788 ns/call +magnitude_sc16q11_aligned 
12bit_table_armv7a_neon_vfpv4_aligned # 2035889 ns/call +magnitude_sc16q11_aligned 12bit_table_armv7a_neon_vfpv4 # 2036579 ns/call +magnitude_sc16q11_aligned 11bit_table_armv7a_neon_vfpv4_aligned # 2077521 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 2405119 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 2657152 ns/call + +magnitude_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 188739 ns/call +magnitude_uc8 lookup_unroll_4_generic # 284930 ns/call +magnitude_uc8 lookup_armv7a_neon_vfpv4 # 291956 ns/call +magnitude_uc8 lookup_generic # 292047 ns/call +magnitude_uc8 lookup_unroll_4_armv7a_neon_vfpv4 # 298012 ns/call +magnitude_uc8 exact_armv7a_neon_vfpv4 # 921119 ns/call +magnitude_uc8 exact_generic # 1676587 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 187202 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 187203 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 280048 ns/call +magnitude_uc8_aligned lookup_armv7a_neon_vfpv4_aligned # 282247 ns/call +magnitude_uc8_aligned lookup_generic # 282254 ns/call +magnitude_uc8_aligned lookup_armv7a_neon_vfpv4 # 282262 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4_aligned # 292923 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4 # 292985 ns/call +magnitude_uc8_aligned exact_armv7a_neon_vfpv4 # 921141 ns/call +magnitude_uc8_aligned exact_armv7a_neon_vfpv4_aligned # 921149 ns/call +magnitude_uc8_aligned exact_generic # 1676551 ns/call + +mean_power_u16 u32_armv7a_neon_vfpv4 # 45483 ns/call +mean_power_u16 neon_float_armv7a_neon_vfpv4 # 58654 ns/call +mean_power_u16 u64_armv7a_neon_vfpv4 # 79486 ns/call +mean_power_u16 float_armv7a_neon_vfpv4 # 94322 ns/call +mean_power_u16 u64_generic # 131666 ns/call +mean_power_u16 u32_generic # 132124 ns/call +mean_power_u16 float_generic # 187161 ns/call + +mean_power_u16_aligned u32_armv7a_neon_vfpv4_aligned # 44929 ns/call +mean_power_u16_aligned u32_armv7a_neon_vfpv4 # 44933 ns/call 
+mean_power_u16_aligned neon_float_armv7a_neon_vfpv4 # 58485 ns/call +mean_power_u16_aligned neon_float_armv7a_neon_vfpv4_aligned # 58488 ns/call +mean_power_u16_aligned u64_armv7a_neon_vfpv4 # 80349 ns/call +mean_power_u16_aligned u64_armv7a_neon_vfpv4_aligned # 80669 ns/call +mean_power_u16_aligned float_armv7a_neon_vfpv4_aligned # 86325 ns/call +mean_power_u16_aligned float_armv7a_neon_vfpv4 # 86326 ns/call +mean_power_u16_aligned u64_generic # 131637 ns/call +mean_power_u16_aligned u32_generic # 132092 ns/call +mean_power_u16_aligned float_generic # 187127 ns/call