diff --git a/.gitignore b/.gitignore index 28200a3..816e363 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,8 @@ view1090 faup1090 package-wheezy oneoff/convert_benchmark +oneoff/uc8_capture_stats +oneoff/dsp_error_measurement oneoff/decode_comm_b +starch-benchmark +wisdom.local diff --git a/Makefile b/Makefile index 6c99b3f..75195b4 100644 --- a/Makefile +++ b/Makefile @@ -2,12 +2,12 @@ PROGNAME=dump1090 DUMP1090_VERSION ?= unknown -CPPFLAGS += -DMODES_DUMP1090_VERSION=\"$(DUMP1090_VERSION)\" -DMODES_DUMP1090_VARIANT=\"dump1090-fa\" +CPPFLAGS += -I. -DMODES_DUMP1090_VERSION=\"$(DUMP1090_VERSION)\" -DMODES_DUMP1090_VARIANT=\"dump1090-fa\" DIALECT = -std=c11 CFLAGS += $(DIALECT) -O3 -g -Wall -Wmissing-declarations -Werror -W -D_DEFAULT_SOURCE -fno-common LIBS = -lpthread -lm -SDR_OBJ = sdr.o fifo.o sdr_ifile.o +SDR_OBJ = cpu.o sdr.o fifo.o sdr_ifile.o dsp/helpers/tables.o # Try to autodetect available libraries via pkg-config if no explicit setting was used PKGCONFIG=$(shell pkg-config --version >/dev/null 2>&1 && echo "yes" || echo "no") @@ -42,33 +42,43 @@ endif UNAME := $(shell uname) ifeq ($(UNAME), Linux) - CFLAGS += -D_DEFAULT_SOURCE + CPPFLAGS += -D_DEFAULT_SOURCE LIBS += -lrt LIBS_USB += -lusb-1.0 + CPUFEATURES ?= yes endif ifeq ($(UNAME), Darwin) ifneq ($(shell sw_vers -productVersion | egrep '^10\.([0-9]|1[01])\.'),) # Mac OS X ver <= 10.11 - CFLAGS += -DMISSING_GETTIME + CPPFLAGS += -DMISSING_GETTIME COMPAT += compat/clock_gettime/clock_gettime.o endif - CFLAGS += -DMISSING_NANOSLEEP + CPPFLAGS += -DMISSING_NANOSLEEP COMPAT += compat/clock_nanosleep/clock_nanosleep.o LIBS_USB += -lusb-1.0 + CPUFEATURES ?= yes endif ifeq ($(UNAME), OpenBSD) - CFLAGS += -DMISSING_NANOSLEEP + CPPFLAGS += -DMISSING_NANOSLEEP COMPAT += compat/clock_nanosleep/clock_nanosleep.o LIBS_USB += -lusb-1.0 endif ifeq ($(UNAME), FreeBSD) - CFLAGS += -D_DEFAULT_SOURCE + CPPFLAGS += -D_DEFAULT_SOURCE LIBS += -lrt LIBS_USB += -lusb endif +# Makefile.cpufeatures is included only by the CPUFEATURES guard below, so CPUFEATURES=no leaves CPUFEATURES_OBJS empty +CPUFEATURES ?= 
no + +ifeq ($(CPUFEATURES),yes) + include Makefile.cpufeatures + CPPFLAGS += -DENABLE_CPUFEATURES -Icpu_features/include +endif + RTLSDR ?= yes BLADERF ?= yes @@ -122,22 +132,47 @@ ifeq ($(LIMESDR), yes) LIBS_SDR += $(shell pkg-config --libs LimeSuite) endif -all: showconfig dump1090 view1090 + +## +## starch (runtime DSP code selection) mix, architecture-specific +## + +ARCH ?= $(shell uname -m) +ifneq ($(CPUFEATURES),yes) + # need to be able to detect CPU features at runtime to enable any non-standard compiler flags + STARCH_MIX := generic + CPPFLAGS += -DSTARCH_MIX_GENERIC +else ifeq ($(ARCH),x86_64) + # AVX, AVX2 + STARCH_MIX := x86 + CPPFLAGS += -DSTARCH_MIX_X86 +else ifneq (,$(findstring arm,$(ARCH))) + # ARMv7 NEON + STARCH_MIX := arm + CPPFLAGS += -DSTARCH_MIX_ARM +else + STARCH_MIX := generic + CPPFLAGS += -DSTARCH_MIX_GENERIC +endif + +all: showconfig dump1090 view1090 starch-benchmark + +STARCH_COMPILE := $(CC) $(CPPFLAGS) $(CFLAGS) -c +include dsp/generated/makefile.$(STARCH_MIX) showconfig: @echo "Building with:" >&2 @echo " Version string: $(DUMP1090_VERSION)" >&2 + @echo " DSP mix: $(STARCH_MIX)" >&2 @echo " RTLSDR support: $(RTLSDR)" >&2 @echo " BladeRF support: $(BLADERF)" >&2 @echo " HackRF support: $(HACKRF)" >&2 @echo " LimeSDR support: $(LIMESDR)" >&2 -all: dump1090 view1090 - %.o: %.c *.h $(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@ -dump1090: dump1090.o anet.o interactive.o mode_ac.o mode_s.o comm_b.o net_io.o crc.o demod_2400.o stats.o cpr.o icao_filter.o track.o util.o convert.o ais_charset.o $(SDR_OBJ) $(COMPAT) +dump1090: dump1090.o anet.o interactive.o mode_ac.o mode_s.o comm_b.o net_io.o crc.o demod_2400.o stats.o cpr.o icao_filter.o track.o util.o convert.o ais_charset.o $(SDR_OBJ) $(COMPAT) $(CPUFEATURES_OBJS) $(STARCH_OBJS) $(CC) -g -o $@ $^ $(LDFLAGS) $(LIBS) $(LIBS_SDR) -lncurses view1090: view1090.o anet.o interactive.o mode_ac.o mode_s.o comm_b.o net_io.o crc.o stats.o cpr.o icao_filter.o track.o util.o ais_charset.o $(COMPAT) @@ 
-146,8 +181,11 @@ view1090: view1090.o anet.o interactive.o mode_ac.o mode_s.o comm_b.o net_io.o c faup1090: faup1090.o anet.o mode_ac.o mode_s.o comm_b.o net_io.o crc.o stats.o cpr.o icao_filter.o track.o util.o ais_charset.o $(COMPAT) $(CC) -g -o $@ $^ $(LDFLAGS) $(LIBS) +starch-benchmark: cpu.o dsp/helpers/tables.o $(CPUFEATURES_OBJS) $(STARCH_OBJS) $(STARCH_BENCHMARK_OBJ) + $(CC) -g -o $@ $^ $(LDFLAGS) $(LIBS) + clean: - rm -f *.o oneoff/*.o compat/clock_gettime/*.o compat/clock_nanosleep/*.o dump1090 view1090 faup1090 cprtests crctests convert_benchmark + rm -f *.o oneoff/*.o compat/clock_gettime/*.o compat/clock_nanosleep/*.o cpu_features/src/*.o dsp/generated/*.o dsp/helpers/*.o $(CPUFEATURES_OBJS) dump1090 view1090 faup1090 cprtests crctests oneoff/convert_benchmark oneoff/decode_comm_b oneoff/dsp_error_measurement oneoff/uc8_capture_stats starch-benchmark test: cprtests ./cprtests @@ -161,8 +199,22 @@ crctests: crc.c crc.h benchmarks: oneoff/convert_benchmark oneoff/convert_benchmark -oneoff/convert_benchmark: oneoff/convert_benchmark.o convert.o util.o +oneoff/convert_benchmark: oneoff/convert_benchmark.o convert.o util.o dsp/helpers/tables.o cpu.o $(CPUFEATURES_OBJS) $(STARCH_OBJS) $(CC) $(CPPFLAGS) $(CFLAGS) -g -o $@ $^ -lm -lpthread oneoff/decode_comm_b: oneoff/decode_comm_b.o comm_b.o ais_charset.o $(CC) $(CPPFLAGS) $(CFLAGS) -g -o $@ $^ -lm + +oneoff/dsp_error_measurement: oneoff/dsp_error_measurement.o dsp/helpers/tables.o cpu.o $(CPUFEATURES_OBJS) $(STARCH_OBJS) + $(CC) $(CPPFLAGS) $(CFLAGS) -g -o $@ $^ -lm + +oneoff/uc8_capture_stats: oneoff/uc8_capture_stats.o + $(CC) $(CPPFLAGS) $(CFLAGS) -g -o $@ $^ -lm + +starchgen: + dsp/starchgen.py . 
+ +.PHONY: wisdom.local +wisdom.local: starch-benchmark + ./starch-benchmark -i 15 -o wisdom.local mean_power_u16 mean_power_u16_aligned magnitude_uc8 magnitude_uc8_aligned + ./starch-benchmark -i 15 -r wisdom.local -o wisdom.local diff --git a/Makefile.cpufeatures b/Makefile.cpufeatures new file mode 100644 index 0000000..3e34cb4 --- /dev/null +++ b/Makefile.cpufeatures @@ -0,0 +1,29 @@ +# -*- makefile -*- + +# cmake integration is a little tricky, so let's do this by hand for now + +CPUFEATURES_UNAME := $(shell uname) +CPUFEATURES_ARCH := $(shell uname -m) + +CPUFEATURES_OBJS := cpu_features/src/filesystem.o cpu_features/src/stack_line_reader.o cpu_features/src/string_view.o +CPUFEATURES_CFLAGS := -std=c99 -O -g -DSTACK_LINE_READER_BUFFER_SIZE=1024 -DNDEBUG + +ifeq ($(CPUFEATURES_UNAME),Linux) + CPUFEATURES_OBJS += cpu_features/src/hwcaps.o + CPUFEATURES_CFLAGS += -DHAVE_STRONG_GETAUXVAL +endif + +ifeq ($(CPUFEATURES_UNAME),Darwin) + CPUFEATURES_CFLAGS += -DHAVE_SYSCTLBYNAME +endif + +ifeq ($(CPUFEATURES_ARCH), x86_64) + CPUFEATURES_OBJS += cpu_features/src/cpuinfo_x86.o +endif + +ifneq (,$(findstring arm,$(CPUFEATURES_ARCH))) + CPUFEATURES_OBJS += cpu_features/src/cpuinfo_arm.o +endif + +$(CPUFEATURES_OBJS): override CFLAGS := $(CPUFEATURES_CFLAGS) +$(CPUFEATURES_OBJS): override CPPFLAGS := -Icpu_features/include diff --git a/convert.c b/convert.c index 3f34a38..e1d1d60 100644 --- a/convert.c +++ b/convert.c @@ -19,483 +19,105 @@ #include "dump1090.h" -struct converter_state { - float dc_a; - float dc_b; - float z1_I; - float z1_Q; -}; - -static uint16_t *uc8_lookup; -static bool init_uc8_lookup() -{ - if (uc8_lookup) - return true; - - uc8_lookup = malloc(sizeof(uint16_t) * 256 * 256); - if (!uc8_lookup) { - fprintf(stderr, "can't allocate UC8 conversion lookup table\n"); - return false; - } - - for (int i = 0; i <= 255; i++) { - for (int q = 0; q <= 255; q++) { - float fI, fQ, magsq; - - fI = (i - 127.5) / 127.5; - fQ = (q - 127.5) / 127.5; - magsq = fI * fI + 
fQ * fQ; - if (magsq > 1) - magsq = 1; - float mag = sqrtf(magsq); - - uc8_lookup[le16toh((i*256)+q)] = (uint16_t) (mag * 65535.0f + 0.5f); - } - } - - return true; -} - -static void convert_uc8_nodc(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) -{ - uint16_t *in = iq_data; - unsigned i; - uint64_t sum_level = 0; - uint64_t sum_power = 0; - uint16_t mag; - - MODES_NOTUSED(state); - - // unroll this a bit - -#define DO_ONE_SAMPLE \ - do { \ - mag = uc8_lookup[*in++]; \ - *mag_data++ = mag; \ - sum_level += mag; \ - sum_power += (uint32_t)mag * (uint32_t)mag; \ - } while(0) - - // unroll this a bit - for (i = 0; i < (nsamples>>3); ++i) { - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - DO_ONE_SAMPLE; - } - - for (i = 0; i < (nsamples&7); ++i) { - DO_ONE_SAMPLE; - } - -#undef DO_ONE_SAMPLE - - if (out_mean_level) { - *out_mean_level = sum_level / 65536.0 / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / 65535.0 / 65535.0 / nsamples; - } -} - -static void convert_uc8_generic(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) -{ - uint8_t *in = iq_data; - float z1_I = state->z1_I; - float z1_Q = state->z1_Q; - const float dc_a = state->dc_a; - const float dc_b = state->dc_b; - - unsigned i; - uint8_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; - - for (i = 0; i < nsamples; ++i) { - I = *in++; - Q = *in++; - fI = (I - 127.5f) / 127.5f; - fQ = (Q - 127.5f) / 127.5f; - - // DC block - z1_I = fI * dc_a + z1_I * dc_b; - z1_Q = fQ * dc_a + z1_Q * dc_b; - fI -= z1_I; - fQ -= z1_Q; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - 
state->z1_I = z1_I; - state->z1_Q = z1_Q; - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; - } -} - -static void convert_sc16_generic(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) -{ - uint16_t *in = iq_data; - float z1_I = state->z1_I; - float z1_Q = state->z1_Q; - const float dc_a = state->dc_a; - const float dc_b = state->dc_b; - - unsigned i; - int16_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; - - for (i = 0; i < nsamples; ++i) { - I = (int16_t)le16toh(*in++); - Q = (int16_t)le16toh(*in++); - fI = I / 32768.0f; - fQ = Q / 32768.0f; - - // DC block - z1_I = fI * dc_a + z1_I * dc_b; - z1_Q = fQ * dc_a + z1_Q * dc_b; - fI -= z1_I; - fQ -= z1_Q; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - state->z1_I = z1_I; - state->z1_Q = z1_Q; - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; - } -} - -static void convert_sc16_nodc(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) +static void convert_uc8(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) { MODES_NOTUSED(state); - uint16_t *in = iq_data; + const uc8_t *in = (const uc8_t *) iq_data; - unsigned i; - int16_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; - - for (i = 0; i < nsamples; ++i) { - I = (int16_t)le16toh(*in++); - Q = (int16_t)le16toh(*in++); - fI = I / 32768.0f; - fQ = Q / 32768.0f; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = 
sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; + if (out_mean_level && out_mean_power) { + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) + starch_magnitude_power_uc8_aligned(in, mag_data, nsamples, out_mean_level, out_mean_power); + else + starch_magnitude_power_uc8(in, mag_data, nsamples, out_mean_level, out_mean_power); + } else { + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) + starch_magnitude_uc8_aligned(in, mag_data, nsamples); + else + starch_magnitude_uc8(in, mag_data, nsamples); } } -// SC16Q11_TABLE_BITS controls the size of the lookup table -// for SC16Q11 data. The size of the table is 2 * (1 << (2*BITS)) -// bytes. Reducing the number of bits reduces precision but -// can run substantially faster by staying in cache. -// See convert_benchmark.c for some numbers. 
- -// Leaving SC16QQ_TABLE_BITS undefined will disable the table lookup and always use -// the floating-point path, which may be faster on some systems - -#if defined(SC16Q11_TABLE_BITS) - -#define USE_BITS SC16Q11_TABLE_BITS -#define LOSE_BITS (11 - SC16Q11_TABLE_BITS) - -static uint16_t *sc16q11_lookup; -static bool init_sc16q11_lookup() -{ - if (sc16q11_lookup) - return true; - - sc16q11_lookup = malloc(sizeof(uint16_t) * (1 << (USE_BITS * 2))); - if (!sc16q11_lookup) { - fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n"); - return false; - } - - for (int i = 0; i < 2048; i += (1 << LOSE_BITS)) { - for (int q = 0; q < 2048; q += (1 << LOSE_BITS)) { - float fI = i / 2048.0, fQ = q / 2048.0; - float magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - float mag = sqrtf(magsq); - - unsigned index = ((i >> LOSE_BITS) << USE_BITS) | (q >> LOSE_BITS); - sc16q11_lookup[index] = (uint16_t)(mag * 65535.0f + 0.5f); - } - } - - return true; -} - -static void convert_sc16q11_table(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) -{ - uint16_t *in = iq_data; - unsigned i; - uint16_t I, Q; - uint64_t sum_level = 0; - uint64_t sum_power = 0; - uint16_t mag; - - MODES_NOTUSED(state); - - for (i = 0; i < nsamples; ++i) { - I = abs((int16_t)le16toh(*in++)) & 2047; - Q = abs((int16_t)le16toh(*in++)) & 2047; - mag = sc16q11_lookup[((I >> LOSE_BITS) << USE_BITS) | (Q >> LOSE_BITS)]; - *mag_data++ = mag; - sum_level += mag; - sum_power += (uint32_t)mag * (uint32_t)mag; - } - - if (out_mean_level) { - *out_mean_level = sum_level / 65536.0 / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / 65535.0 / 65535.0 / nsamples; - } -} - -#else /* ! 
defined(SC16Q11_TABLE_BITS) */ - -static void convert_sc16q11_nodc(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) +static void convert_sc16(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) { MODES_NOTUSED(state); - uint16_t *in = iq_data; + const sc16_t *in = (const sc16_t *) iq_data; - unsigned i; - int16_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) + starch_magnitude_sc16_aligned(in, mag_data, nsamples); + else + starch_magnitude_sc16(in, mag_data, nsamples); - for (i = 0; i < nsamples; ++i) { - I = (int16_t)le16toh(*in++); - Q = (int16_t)le16toh(*in++); - fI = I / 2048.0f; - fQ = Q / 2048.0f; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; + if (out_mean_level && out_mean_power) { + if (STARCH_IS_ALIGNED(mag_data)) + starch_mean_power_u16_aligned(mag_data, nsamples, out_mean_level, out_mean_power); + else + starch_mean_power_u16(mag_data, nsamples, out_mean_level, out_mean_power); } } -#endif /* defined(SC16Q11_TABLE_BITS) */ - -static void convert_sc16q11_generic(void *iq_data, - uint16_t *mag_data, - unsigned nsamples, - struct converter_state *state, - double *out_mean_level, - double *out_mean_power) +static void convert_sc16q11(void *iq_data, + uint16_t *mag_data, + unsigned nsamples, + struct converter_state *state, + double *out_mean_level, + double *out_mean_power) { - uint16_t *in = iq_data; - float z1_I = state->z1_I; - float z1_Q = state->z1_Q; - const float dc_a = state->dc_a; - const float dc_b = 
state->dc_b; + MODES_NOTUSED(state); - unsigned i; - int16_t I, Q; - float fI, fQ, magsq; - float sum_level = 0, sum_power = 0; + const sc16_t *in = (const sc16_t *) iq_data; - for (i = 0; i < nsamples; ++i) { - I = (int16_t)le16toh(*in++); - Q = (int16_t)le16toh(*in++); - fI = I / 2048.0f; - fQ = Q / 2048.0f; + if (STARCH_IS_ALIGNED(in) && STARCH_IS_ALIGNED(mag_data)) + starch_magnitude_sc16q11_aligned(in, mag_data, nsamples); + else + starch_magnitude_sc16q11(in, mag_data, nsamples); - // DC block - z1_I = fI * dc_a + z1_I * dc_b; - z1_Q = fQ * dc_a + z1_Q * dc_b; - fI -= z1_I; - fQ -= z1_Q; - - magsq = fI * fI + fQ * fQ; - if (magsq > 1) - magsq = 1; - - float mag = sqrtf(magsq); - sum_power += magsq; - sum_level += mag; - *mag_data++ = (uint16_t)(mag * 65535.0f + 0.5f); - } - - state->z1_I = z1_I; - state->z1_Q = z1_Q; - - if (out_mean_level) { - *out_mean_level = sum_level / nsamples; - } - - if (out_mean_power) { - *out_mean_power = sum_power / nsamples; + if (out_mean_level && out_mean_power) { + if (STARCH_IS_ALIGNED(mag_data)) + starch_mean_power_u16_aligned(mag_data, nsamples, out_mean_level, out_mean_power); + else + starch_mean_power_u16(mag_data, nsamples, out_mean_level, out_mean_power); } } -static struct { - input_format_t format; - int can_filter_dc; - iq_convert_fn fn; - const char *description; - bool (*init)(); -} converters_table[] = { - // In order of preference - { INPUT_UC8, 0, convert_uc8_nodc, "UC8, integer/table path", init_uc8_lookup }, - { INPUT_UC8, 1, convert_uc8_generic, "UC8, float path", NULL }, - { INPUT_SC16, 0, convert_sc16_nodc, "SC16, float path, no DC", NULL }, - { INPUT_SC16, 1, convert_sc16_generic, "SC16, float path", NULL }, -#if defined(SC16Q11_TABLE_BITS) - { INPUT_SC16Q11, 0, convert_sc16q11_table, "SC16Q11, integer/table path", init_sc16q11_lookup }, -#else - { INPUT_SC16Q11, 0, convert_sc16q11_nodc, "SC16Q11, float path, no DC", NULL }, -#endif - { INPUT_SC16Q11, 1, convert_sc16q11_generic, "SC16Q11, float path", 
NULL }, - { 0, 0, NULL, NULL, NULL } -}; - iq_convert_fn init_converter(input_format_t format, double sample_rate, int filter_dc, struct converter_state **out_state) { - int i; - - for (i = 0; converters_table[i].fn; ++i) { - if (converters_table[i].format != format) - continue; - if (filter_dc && !converters_table[i].can_filter_dc) - continue; - break; - } - - if (!converters_table[i].fn) { - fprintf(stderr, "no suitable converter for format=%d dc=%d\n", - format, filter_dc); - return NULL; - } - - if (converters_table[i].init) { - if (!converters_table[i].init()) - return NULL; - } - - *out_state = malloc(sizeof(struct converter_state)); - if (! *out_state) { - fprintf(stderr, "can't allocate converter state\n"); - return NULL; - } - - (*out_state)->z1_I = 0; - (*out_state)->z1_Q = 0; + MODES_NOTUSED(sample_rate); + MODES_NOTUSED(out_state); if (filter_dc) { - // init DC block @ 1Hz - (*out_state)->dc_b = exp(-2.0 * M_PI * 1.0 / sample_rate); - (*out_state)->dc_a = 1.0 - (*out_state)->dc_b; - } else { - // if the converter does filtering, make sure it has no effect - (*out_state)->dc_b = 1.0; - (*out_state)->dc_a = 0.0; + fprintf(stderr, "DC filtering not supported (yet)\n"); + return NULL; } - return converters_table[i].fn; + switch (format) { + case INPUT_UC8: + return convert_uc8; + case INPUT_SC16: + return convert_sc16; + case INPUT_SC16Q11: + return convert_sc16q11; + default: + fprintf(stderr, "no suitable converter for format=%d\n", format); + return NULL; + } } void cleanup_converter(struct converter_state *state) { - free(state); + MODES_NOTUSED(state); } diff --git a/cpu.c b/cpu.c new file mode 100644 index 0000000..831ab4f --- /dev/null +++ b/cpu.c @@ -0,0 +1,78 @@ +#include "cpu.h" + +#include <stdbool.h> /* bool, true, false for the cached-info helpers */ + +#ifdef ENABLE_CPUFEATURES +#include "cpu_features_macros.h" +#endif + +// +// x86 +// + +#ifdef CPU_FEATURES_ARCH_X86 +#include "cpuinfo_x86.h" + +static X86Info *x86_info() +{ + static bool valid = false; + static X86Info cache; + + if (!valid) { + cache = 
GetX86Info(); + valid = true; + } + + return &cache; +} + +#endif + +int cpu_supports_avx(void) +{ +#ifdef CPU_FEATURES_ARCH_X86 + return x86_info()->features.avx; +#else + return 0; +#endif +} + +int cpu_supports_avx2(void) +{ +#ifdef CPU_FEATURES_ARCH_X86 + return x86_info()->features.avx2; +#else + return 0; +#endif +} + +// +// ARM +// + +#ifdef CPU_FEATURES_ARCH_ARM +#include "cpuinfo_arm.h" + +static ArmInfo *arm_info() +{ + static bool valid = false; + static ArmInfo cache; + + if (!valid) { + cache = GetArmInfo(); + valid = true; + } + + return &cache; +} + +#endif + +int cpu_supports_armv7_neon_vfpv4(void) +{ +#ifdef CPU_FEATURES_ARCH_ARM + return arm_info()->architecture >= 7 && arm_info()->features.neon && arm_info()->features.vfpv4 && arm_info()->features.vfpd32; +#else + return 0; +#endif +} diff --git a/cpu.h b/cpu.h new file mode 100644 index 0000000..0cf88bf --- /dev/null +++ b/cpu.h @@ -0,0 +1,11 @@ +#ifndef DUMP1090_CPU_H +#define DUMP1090_CPU_H + +// x86 +int cpu_supports_avx(void); +int cpu_supports_avx2(void); + +// ARM +int cpu_supports_armv7_neon_vfpv4(void); + +#endif diff --git a/cpu_features/.clang-format b/cpu_features/.clang-format new file mode 100644 index 0000000..06ea346 --- /dev/null +++ b/cpu_features/.clang-format @@ -0,0 +1,4 @@ +--- +Language: Cpp +BasedOnStyle: Google +... 
diff --git a/cpu_features/.gitignore b/cpu_features/.gitignore new file mode 100644 index 0000000..6285424 --- /dev/null +++ b/cpu_features/.gitignore @@ -0,0 +1,4 @@ +cmake_build/ +build/ + +*.swp diff --git a/cpu_features/.travis.yml b/cpu_features/.travis.yml new file mode 100644 index 0000000..b5845be --- /dev/null +++ b/cpu_features/.travis.yml @@ -0,0 +1,121 @@ +language: c + +sudo: false + +cache: + timeout: 1000 + directories: + - $HOME/cpu_features_archives + +addons: + apt_packages: + - ninja-build + +env: + global: + TOOLCHAIN=NATIVE + CMAKE_GENERATOR=Ninja + +matrix: + include: + - os: linux + compiler: gcc + env: + TARGET=x86_64-linux-gnu + - os: linux + compiler: clang + env: + TARGET=x86_64-linux-gnu + - os: osx + compiler: gcc + env: + TARGET=x86_64-osx + CMAKE_GENERATOR="Unix Makefiles" + - os: osx + compiler: clang + env: + TARGET=x86_64-osx + CMAKE_GENERATOR="Unix Makefiles" + - os: windows + env: + TARGET=x86_64-windows + CMAKE_GENERATOR="Visual Studio 15 2017 Win64" + + # see: https://docs.travis-ci.com/user/multi-cpu-architectures/ + - os: linux + arch: ppc64le + compiler: gcc + env: + TARGET=ppc64le-linux-gnu + - os: linux + arch: ppc64le + compiler: clang + env: + TARGET=ppc64le-linux-gnu + + # Toolchains for little-endian, 64-bit ARMv8 for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=aarch64-linux-gnu + QEMU_ARCH=aarch64 + # Toolchains for little-endian, hard-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=arm-linux-gnueabihf + QEMU_ARCH=arm + # Toolchains for little-endian, 32-bit ARMv8 for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=armv8l-linux-gnueabihf + QEMU_ARCH=arm + # Toolchains for little-endian, soft-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=arm-linux-gnueabi + QEMU_ARCH=arm + # Toolchains for big-endian, 64-bit ARMv8 for GNU/Linux systems + - os: linux + env: + 
TOOLCHAIN=LINARO + TARGET=aarch64_be-linux-gnu + QEMU_ARCH=DISABLED + # Toolchains for big-endian, hard-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=armeb-linux-gnueabihf + QEMU_ARCH=DISABLED + # Toolchains for big-endian, soft-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems + - os: linux + env: + TOOLCHAIN=LINARO + TARGET=armeb-linux-gnueabi + QEMU_ARCH=DISABLED + - os: linux + env: + TOOLCHAIN=CODESCAPE + TARGET=mips32 + QEMU_ARCH=mips + - os: linux + env: + TOOLCHAIN=CODESCAPE + TARGET=mips32el + QEMU_ARCH=mipsel + - os: linux + env: + TOOLCHAIN=CODESCAPE + TARGET=mips64 + QEMU_ARCH=mips64 + - os: linux + env: + TOOLCHAIN=CODESCAPE + TARGET=mips64el + QEMU_ARCH=mips64el + +script: + - cmake --version + - bash -e -x ./scripts/run_integration.sh diff --git a/cpu_features/CMakeLists.txt b/cpu_features/CMakeLists.txt new file mode 100644 index 0000000..f9daeac --- /dev/null +++ b/cpu_features/CMakeLists.txt @@ -0,0 +1,259 @@ +cmake_minimum_required(VERSION 3.0) + +# option() honors normal variables. +# see: https://cmake.org/cmake/help/git-stage/policy/CMP0077.html +if(POLICY CMP0077) + cmake_policy(SET CMP0077 NEW) +endif() + +project(CpuFeatures VERSION 0.6.0 LANGUAGES C) + +set(CMAKE_C_STANDARD 99) + +# Default Build Type to be Release +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel." + FORCE) +endif(NOT CMAKE_BUILD_TYPE) + +# BUILD_TESTING is a standard CMake variable, but we declare it here to make it +# prominent in the GUI. +option(BUILD_TESTING "Enable test (depends on googletest)." OFF) +# BUILD_SHARED_LIBS is a standard CMake variable, but we declare it here to make +# it prominent in the GUI. +# cpu_features uses bit-fields which are - to some extends - implementation-defined (see https://en.cppreference.com/w/c/language/bit_field). 
+# As a consequence it is discouraged to use cpu_features as a shared library because different compilers may interpret the code in different ways. +# Prefer static linking from source whenever possible. +option(BUILD_SHARED_LIBS "Build library as shared." OFF) +# PIC +option(BUILD_PIC "Build with Position Independant Code." OFF) # Default is off at least for GCC + +# Force PIC on unix when building shared libs +# see: https://en.wikipedia.org/wiki/Position-independent_code +if(BUILD_SHARED_LIBS AND UNIX) + set(BUILD_PIC ON) +endif() + +include(CheckIncludeFile) +include(CheckSymbolExists) +include(GNUInstallDirs) + +macro(setup_include_and_definitions TARGET_NAME) + target_include_directories(${TARGET_NAME} + PUBLIC $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include> + PRIVATE $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include/internal> + ) + target_compile_definitions(${TARGET_NAME} + PUBLIC STACK_LINE_READER_BUFFER_SIZE=1024 + ) +endmacro() + +set(PROCESSOR_IS_MIPS FALSE) +set(PROCESSOR_IS_ARM FALSE) +set(PROCESSOR_IS_AARCH64 FALSE) +set(PROCESSOR_IS_X86 FALSE) +set(PROCESSOR_IS_POWER FALSE) + +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^mips") + set(PROCESSOR_IS_MIPS TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") + set(PROCESSOR_IS_ARM TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64") + set(PROCESSOR_IS_AARCH64 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") + set(PROCESSOR_IS_X86 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)") + set(PROCESSOR_IS_POWER TRUE) +endif() + +macro(add_cpu_features_headers_and_sources HDRS_LIST_NAME SRCS_LIST_NAME) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpu_features_macros.h) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpu_features_cache_info.h) + if(PROCESSOR_IS_MIPS) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_mips.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_mips.c) + elseif(PROCESSOR_IS_ARM) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_arm.h) + list(APPEND 
${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_arm.c) + elseif(PROCESSOR_IS_AARCH64) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_aarch64.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_aarch64.c) + elseif(PROCESSOR_IS_X86) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_x86.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/internal/cpuid_x86.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_x86.c) + elseif(PROCESSOR_IS_POWER) + list(APPEND ${HDRS_LIST_NAME} ${PROJECT_SOURCE_DIR}/include/cpuinfo_ppc.h) + list(APPEND ${SRCS_LIST_NAME} ${PROJECT_SOURCE_DIR}/src/cpuinfo_ppc.c) + else() + message(FATAL_ERROR "Unsupported architectures ${CMAKE_SYSTEM_PROCESSOR}") + endif() +endmacro() + +# +# library : utils +# + +add_library(utils OBJECT + ${PROJECT_SOURCE_DIR}/include/internal/bit_utils.h + ${PROJECT_SOURCE_DIR}/include/internal/filesystem.h + ${PROJECT_SOURCE_DIR}/include/internal/stack_line_reader.h + ${PROJECT_SOURCE_DIR}/include/internal/string_view.h + ${PROJECT_SOURCE_DIR}/src/filesystem.c + ${PROJECT_SOURCE_DIR}/src/stack_line_reader.c + ${PROJECT_SOURCE_DIR}/src/string_view.c +) +set_property(TARGET utils PROPERTY POSITION_INDEPENDENT_CODE ${BUILD_PIC}) +setup_include_and_definitions(utils) + +# +# library : unix_based_hardware_detection +# + +if(UNIX) + add_library(unix_based_hardware_detection OBJECT + ${PROJECT_SOURCE_DIR}/include/internal/hwcaps.h + ${PROJECT_SOURCE_DIR}/src/hwcaps.c + ) + setup_include_and_definitions(unix_based_hardware_detection) + check_include_file(dlfcn.h HAVE_DLFCN_H) + if(HAVE_DLFCN_H) + target_compile_definitions(unix_based_hardware_detection PRIVATE HAVE_DLFCN_H) + endif() + check_symbol_exists(getauxval "sys/auxv.h" HAVE_STRONG_GETAUXVAL) + if(HAVE_STRONG_GETAUXVAL) + target_compile_definitions(unix_based_hardware_detection PRIVATE HAVE_STRONG_GETAUXVAL) + endif() + set_property(TARGET unix_based_hardware_detection PROPERTY 
POSITION_INDEPENDENT_CODE ${BUILD_PIC}) +endif() + +# +# library : cpu_features +# +set (CPU_FEATURES_HDRS) +set (CPU_FEATURES_SRCS) +add_cpu_features_headers_and_sources(CPU_FEATURES_HDRS CPU_FEATURES_SRCS) +list(APPEND CPU_FEATURES_SRCS $<TARGET_OBJECTS:utils>) +if(NOT PROCESSOR_IS_X86 AND UNIX) + list(APPEND CPU_FEATURES_SRCS $<TARGET_OBJECTS:unix_based_hardware_detection>) +endif() +add_library(cpu_features ${CPU_FEATURES_HDRS} ${CPU_FEATURES_SRCS}) +set_target_properties(cpu_features PROPERTIES PUBLIC_HEADER "${CPU_FEATURES_HDRS}") +setup_include_and_definitions(cpu_features) +target_link_libraries(cpu_features PUBLIC ${CMAKE_DL_LIBS}) +set_property(TARGET cpu_features PROPERTY POSITION_INDEPENDENT_CODE ${BUILD_PIC}) +target_include_directories(cpu_features + PUBLIC $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/cpu_features> +) +if(PROCESSOR_IS_X86) + if(APPLE) + target_compile_definitions(cpu_features PRIVATE HAVE_SYSCTLBYNAME) + endif() +endif() +add_library(CpuFeature::cpu_features ALIAS cpu_features) + +# +# program : list_cpu_features +# + +add_executable(list_cpu_features ${PROJECT_SOURCE_DIR}/src/utils/list_cpu_features.c) +target_link_libraries(list_cpu_features PRIVATE cpu_features) +add_executable(CpuFeature::list_cpu_features ALIAS list_cpu_features) + +# +# ndk_compat +# + +if(ANDROID) +add_subdirectory(ndk_compat) +endif() + +# +# tests +# + +include(CTest) +if(BUILD_TESTING) + # Automatically incorporate googletest into the CMake Project if target not + # found. + enable_language(CXX) + + set(CMAKE_CXX_STANDARD 11) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_CXX_EXTENSIONS OFF) # prefer use of -std11 instead of -gnustd11 + + if(NOT TARGET gtest OR NOT TARGET gmock_main) + # Download and unpack googletest at configure time. + configure_file( + cmake/googletest.CMakeLists.txt.in + googletest-download/CMakeLists.txt + ) + + execute_process( + COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . 
+ RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download) + + if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") + endif() + + execute_process( + COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/googletest-download) + + if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") + endif() + + # Prevent overriding the parent project's compiler/linker settings on + # Windows. + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + + # Add googletest directly to our build. This defines the gtest and + # gtest_main targets. + add_subdirectory(${CMAKE_BINARY_DIR}/googletest-src + ${CMAKE_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) + endif() + + add_subdirectory(test) +endif() + +# +# Install cpu_features and list_cpu_features +# + +include(GNUInstallDirs) +install(TARGETS cpu_features list_cpu_features + EXPORT CpuFeaturesTargets + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cpu_features + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) +install(EXPORT CpuFeaturesTargets + NAMESPACE CpuFeatures:: + DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeatures + COMPONENT Devel +) +include(CMakePackageConfigHelpers) +configure_package_config_file(cmake/CpuFeaturesConfig.cmake.in + "${PROJECT_BINARY_DIR}/CpuFeaturesConfig.cmake" + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeatures" + NO_SET_AND_CHECK_MACRO + NO_CHECK_REQUIRED_COMPONENTS_MACRO +) +write_basic_package_version_file( + "${PROJECT_BINARY_DIR}/CpuFeaturesConfigVersion.cmake" + COMPATIBILITY SameMajorVersion +) +install( + FILES + "${PROJECT_BINARY_DIR}/CpuFeaturesConfig.cmake" + "${PROJECT_BINARY_DIR}/CpuFeaturesConfigVersion.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeatures" + COMPONENT Devel +) diff --git a/cpu_features/CONTRIBUTING.md 
b/cpu_features/CONTRIBUTING.md new file mode 100644 index 0000000..c980350 --- /dev/null +++ b/cpu_features/CONTRIBUTING.md @@ -0,0 +1,23 @@ +# How to Contribute + +We'd love to accept your patches and contributions to this project. There are +just a few small guidelines you need to follow. + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to <https://cla.developers.google.com/> to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult +[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. diff --git a/cpu_features/LICENSE b/cpu_features/LICENSE new file mode 100644 index 0000000..a7043c6 --- /dev/null +++ b/cpu_features/LICENSE @@ -0,0 +1,230 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +-------------------------------------------------------------------------------- +For files in the `ndk_compat` folder: +-------------------------------------------------------------------------------- + +Copyright (C) 2010 The Android Open Source Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT +OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. diff --git a/cpu_features/README.md b/cpu_features/README.md new file mode 100644 index 0000000..8a34168 --- /dev/null +++ b/cpu_features/README.md @@ -0,0 +1,199 @@ +# cpu_features [![Build Status](https://travis-ci.org/google/cpu_features.svg?branch=master)](https://travis-ci.org/google/cpu_features) [![Build status](https://ci.appveyor.com/api/projects/status/46d1owsj7n8dsylq/branch/master?svg=true)](https://ci.appveyor.com/project/gchatelet/cpu-features/branch/master) + +A cross-platform C library to retrieve CPU features (such as available +instructions) at runtime. + +## Table of Contents + +- [Design Rationale](#rationale) +- [Code samples](#codesample) +- [Running sample code](#usagesample) +- [What's supported](#support) +- [Android NDK's drop in replacement](#ndk) +- [License](#license) +- [Build with cmake](#cmake) + +<a name="rationale"></a> +## Design Rationale + +- **Simple to use.** See the snippets below for examples. +- **Extensible.** Easy to add missing features or architectures. +- **Compatible with old compilers** and available on many architectures so it + can be used widely. To ensure that cpu_features works on as many platforms + as possible, we implemented it in a highly portable version of C: C99. +- **Sandbox-compatible.** The library uses a variety of strategies to cope + with sandboxed environments or when `cpuid` is unavailable. This is useful + when running integration tests in hermetic environments. 
+- **Thread safe, no memory allocation, and raises no exceptions.** + cpu_features is suitable for implementing fundamental libc functions like + `malloc`, `memcpy`, and `memcmp`. +- **Unit tested.** + +<a name="codesample"></a> +## Code samples + +**Note:** For C++ code, the library functions are defined in the `cpu_features` namespace. + +### Checking features at runtime + +Here's a simple example that executes a codepath if the CPU supports both the +AES and the SSE4.2 instruction sets: + +```c +#include "cpuinfo_x86.h" + +// For C++, add `using namespace cpu_features;` +static const X86Features features = GetX86Info().features; + +void Compute(void) { + if (features.aes && features.sse4_2) { + // Run optimized code. + } else { + // Run standard code. + } +} +``` + +### Caching for faster evaluation of complex checks + +If you wish, you can read all the features at once into a global variable, and +then query for the specific features you care about. Below, we store all the ARM +features and then check whether AES and NEON are supported. + +```c +#include <stdbool.h> +#include "cpuinfo_arm.h" + +// For C++, add `using namespace cpu_features;` +static const ArmFeatures features = GetArmInfo().features; +static const bool has_aes_and_neon = features.aes && features.neon; + +// use has_aes_and_neon. +``` + +This is a good approach to take if you're checking for combinations of features +when using a compiler that is slow to extract individual bits from bit-packed +structures. + +### Checking compile time flags + +The following code determines whether the compiler was told to use the AVX +instruction set (e.g., `g++ -mavx`) and sets `has_avx` accordingly. + +```c +#include <stdbool.h> +#include "cpuinfo_x86.h" + +// For C++, add `using namespace cpu_features;` +static const X86Features features = GetX86Info().features; +static const bool has_avx = CPU_FEATURES_COMPILED_X86_AVX || features.avx; + +// use has_avx. 
+``` + +`CPU_FEATURES_COMPILED_X86_AVX` is set to 1 if the compiler was instructed to +use AVX and 0 otherwise, combining compile time and runtime knowledge. + +### Rejecting poor hardware implementations based on microarchitecture + +On x86, the first incarnation of a feature in a microarchitecture might not be +the most efficient (e.g. AVX on Sandy Bridge). We provide a function to retrieve +the underlying microarchitecture so you can decide whether to use it. + +Below, `has_fast_avx` is set to 1 if the CPU supports the AVX instruction +set—but only if it's not Sandy Bridge. + +```c +#include <stdbool.h> +#include "cpuinfo_x86.h" + +// For C++, add `using namespace cpu_features;` +static const X86Info info = GetX86Info(); +static const X86Microarchitecture uarch = GetX86Microarchitecture(&info); +static const bool has_fast_avx = info.features.avx && uarch != INTEL_SNB; + +// use has_fast_avx. +``` + +This feature is currently available only for x86 microarchitectures. + +<a name="usagesample"></a> +### Running sample code + +Building `cpu_features` (check [quickstart](#quickstart) below) brings a small executable to test the library. + +```shell + % ./build/list_cpu_features +arch : x86 +brand : Intel(R) Xeon(R) CPU E5-1650 0 @ 3.20GHz +family : 6 (0x06) +model : 45 (0x2D) +stepping : 7 (0x07) +uarch : INTEL_SNB +flags : aes,avx,cx16,smx,sse4_1,sse4_2,ssse3 +``` + +```shell +% ./build/list_cpu_features --json +{"arch":"x86","brand":" Intel(R) Xeon(R) CPU E5-1650 0 @ 3.20GHz","family":6,"model":45,"stepping":7,"uarch":"INTEL_SNB","flags":["aes","avx","cx16","smx","sse4_1","sse4_2","ssse3"]} +``` + +<a name="support"></a> +## What's supported + +| | x86³ | ARM | AArch64 | MIPS⁴ | POWER | +|---------|:----:|:-------:|:-------:|:------:|:-------:| +| Android | yes² | yes¹ | yes¹ | yes¹ | N/A | +| iOS | N/A | not yet | not yet | N/A | N/A | +| Linux | yes² | yes¹ | yes¹ | yes¹ | yes¹ | +| macOS | yes² | N/A | not yet | N/A | no | +| Windows | yes² | not yet | not yet | N/A | N/A | + +1. 
**Features revealed from Linux.** We gather data from several sources + depending on availability: + + from glibc's + [getauxval](https://www.gnu.org/software/libc/manual/html_node/Auxiliary-Vector.html) + + by parsing `/proc/self/auxv` + + by parsing `/proc/cpuinfo` +2. **Features revealed from CPU.** Features are retrieved by using the `cpuid` + instruction. +3. **Microarchitecture detection.** On x86 some features are not always + implemented efficiently in hardware (e.g. AVX on Sandy Bridge). Exposing the + microarchitecture allows the client to reject particular microarchitectures. +4. All flavors of MIPS are supported, little and big endian as well as 32/64 + bits. + +<a name="ndk"></a> +## Android NDK's drop in replacement + +[cpu_features](https://github.com/google/cpu_features) now officially +supports Android and offers a drop-in replacement for the NDK's +[cpu-features.h](https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h); see the [ndk_compat](ndk_compat) folder for details. + +<a name="license"></a> +## License + +The cpu_features library is licensed under the terms of the Apache license. +See [LICENSE](LICENSE) for more information. + +<a name="cmake"></a> +## Build with CMake + +Please check the [CMake build instructions](cmake/README.md). + +<a name="quickstart"></a> +### Quickstart with `Ninja` + + - build `list_cpu_features` +``` + cmake -B/tmp/cpu_features -H. -GNinja -DCMAKE_BUILD_TYPE=Release + ninja -C/tmp/cpu_features + /tmp/cpu_features/list_cpu_features --json +``` + + - run tests +``` + cmake -B/tmp/cpu_features -H. 
-GNinja -DBUILD_TESTING=ON + ninja -C/tmp/cpu_features + ninja -C/tmp/cpu_features test +``` diff --git a/cpu_features/WORKSPACE b/cpu_features/WORKSPACE new file mode 100644 index 0000000..8ea8a8b --- /dev/null +++ b/cpu_features/WORKSPACE @@ -0,0 +1,7 @@ +# ===== googletest ===== + +git_repository( + name = "com_google_googletest", + remote = "https://github.com/google/googletest.git", + commit = "c3f65335b79f47b05629e79a54685d899bc53b93", +) diff --git a/cpu_features/appveyor.yml b/cpu_features/appveyor.yml new file mode 100644 index 0000000..f18635a --- /dev/null +++ b/cpu_features/appveyor.yml @@ -0,0 +1,24 @@ +version: '{build}' +shallow_clone: true + +platform: x64 + +environment: + matrix: + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 + CMAKE_GENERATOR: "Visual Studio 15 2017 Win64" + - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 + CMAKE_GENERATOR: "Visual Studio 14 2015 Win64" + +matrix: + fast_finish: true + +before_build: + - cmake --version + - cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -H. 
-Bcmake_build -G "%CMAKE_GENERATOR%" + +build_script: + - cmake --build cmake_build --config Debug --target ALL_BUILD + +test_script: + - cmake --build cmake_build --config Debug --target RUN_TESTS diff --git a/cpu_features/cmake/CpuFeaturesConfig.cmake.in b/cpu_features/cmake/CpuFeaturesConfig.cmake.in new file mode 100644 index 0000000..e0bf10e --- /dev/null +++ b/cpu_features/cmake/CpuFeaturesConfig.cmake.in @@ -0,0 +1,3 @@ +# CpuFeatures CMake configuration file + +include("${CMAKE_CURRENT_LIST_DIR}/CpuFeaturesTargets.cmake") diff --git a/cpu_features/cmake/CpuFeaturesNdkCompatConfig.cmake.in b/cpu_features/cmake/CpuFeaturesNdkCompatConfig.cmake.in new file mode 100644 index 0000000..5a53ffd --- /dev/null +++ b/cpu_features/cmake/CpuFeaturesNdkCompatConfig.cmake.in @@ -0,0 +1,3 @@ +# CpuFeaturesNdkCompat CMake configuration file + +include("${CMAKE_CURRENT_LIST_DIR}/CpuFeaturesNdkCompatTargets.cmake") diff --git a/cpu_features/cmake/README.md b/cpu_features/cmake/README.md new file mode 100644 index 0000000..b6baeaa --- /dev/null +++ b/cpu_features/cmake/README.md @@ -0,0 +1,28 @@ +# CMake build instructions + +## Recommended usage : Incorporating cpu_features into a CMake project + + For API / ABI compatibility reasons, it is recommended to build and use + cpu_features in a subdirectory of your project or as an embedded dependency. + + This is similar to the recommended usage of the googletest framework + ( https://github.com/google/googletest/blob/master/googletest/README.md ) + + Build and use step-by-step + + + 1- Download cpu_features and copy it in a sub-directory in your project. + or add cpu_features as a git-submodule in your project + + 2- You can then use the cmake command `add_subdirectory()` to include + cpu_features directly and use the `cpu_features` target in your project. + + 3- Add the `cpu_features` target to the `target_link_libraries()` section of + your executable or of your library. 
+ +## Enabling tests + + The default CMake configuration for cpu_features is a Release build with tests + disabled. To enable testing, set the CMake `BUILD_TESTING` variable to `ON`; + [.travis.yml](../.travis.yml) and [appveyor.yml](../appveyor.yml) have up to + date examples. diff --git a/cpu_features/cmake/googletest.CMakeLists.txt.in b/cpu_features/cmake/googletest.CMakeLists.txt.in new file mode 100644 index 0000000..d60a33e --- /dev/null +++ b/cpu_features/cmake/googletest.CMakeLists.txt.in @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 2.8.2) + +project(googletest-download NONE) + +include(ExternalProject) +ExternalProject_Add(googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG release-1.10.0 # pinned: the parent build forces C++11, which newer googletest revisions no longer support; an unpinned branch also makes builds non-reproducible + SOURCE_DIR "${CMAKE_BINARY_DIR}/googletest-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/googletest-build" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) \ No newline at end of file diff --git a/cpu_features/include/cpu_features_cache_info.h b/cpu_features/include/cpu_features_cache_info.h new file mode 100644 index 0000000..1a61ee1 --- /dev/null +++ b/cpu_features/include/cpu_features_cache_info.h @@ -0,0 +1,54 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_COMMON_H_ // NOTE(review): guard name does not match this file's name (cpu_features_cache_info.h) - confirm no other header reuses it +#define CPU_FEATURES_INCLUDE_CPUINFO_COMMON_H_ + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef enum { + CPU_FEATURE_CACHE_NULL = 0, + CPU_FEATURE_CACHE_DATA = 1, + CPU_FEATURE_CACHE_INSTRUCTION = 2, + CPU_FEATURE_CACHE_UNIFIED = 3, + CPU_FEATURE_CACHE_TLB = 4, + CPU_FEATURE_CACHE_DTLB = 5, + CPU_FEATURE_CACHE_STLB = 6, + CPU_FEATURE_CACHE_PREFETCH = 7 +} CacheType; + +typedef struct { + int level; // presumably 1 for L1, 2 for L2, ... - TODO confirm against the producer + CacheType cache_type; // One of the CacheType values above + int cache_size; // Cache size in bytes + int ways; // Associativity, 0 undefined, 0xFF fully associative + int line_size; // Cache line size in bytes + int tlb_entries; // number of entries for TLB + int partitioning; // number of lines per sector +} CacheLevelInfo; + +// Increase this value if more cache levels are needed. +#ifndef CPU_FEATURES_MAX_CACHE_LEVEL +#define CPU_FEATURES_MAX_CACHE_LEVEL 10 +#endif +typedef struct { + int size; // NOTE(review): presumably the number of populated entries in levels[] - verify against callers + CacheLevelInfo levels[CPU_FEATURES_MAX_CACHE_LEVEL]; // Fixed capacity: at most CPU_FEATURES_MAX_CACHE_LEVEL entries +} CacheInfo; + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_COMMON_H_ diff --git a/cpu_features/include/cpu_features_macros.h b/cpu_features/include/cpu_features_macros.h new file mode 100644 index 0000000..4b231a1 --- /dev/null +++ b/cpu_features/include/cpu_features_macros.h @@ -0,0 +1,216 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CPU_FEATURES_INCLUDE_CPU_FEATURES_MACROS_H_ +#define CPU_FEATURES_INCLUDE_CPU_FEATURES_MACROS_H_ + +//////////////////////////////////////////////////////////////////////////////// +// Architectures +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__pnacl__) || defined(__CLR_VER) +#define CPU_FEATURES_ARCH_VM +#endif + +#if (defined(_M_IX86) || defined(__i386__)) && !defined(CPU_FEATURES_ARCH_VM) +#define CPU_FEATURES_ARCH_X86_32 +#endif + +#if (defined(_M_X64) || defined(__x86_64__)) && !defined(CPU_FEATURES_ARCH_VM) +#define CPU_FEATURES_ARCH_X86_64 +#endif + +#if defined(CPU_FEATURES_ARCH_X86_32) || defined(CPU_FEATURES_ARCH_X86_64) +#define CPU_FEATURES_ARCH_X86 +#endif + +#if (defined(__arm__) || defined(_M_ARM)) +#define CPU_FEATURES_ARCH_ARM +#endif + +#if defined(__aarch64__) +#define CPU_FEATURES_ARCH_AARCH64 +#endif + +#if (defined(CPU_FEATURES_ARCH_AARCH64) || defined(CPU_FEATURES_ARCH_ARM)) +#define CPU_FEATURES_ARCH_ANY_ARM +#endif + +#if defined(__mips64) +#define CPU_FEATURES_ARCH_MIPS64 +#endif + +#if defined(__mips__) && !defined(__mips64) // mips64 also declares __mips__ +#define CPU_FEATURES_ARCH_MIPS32 +#endif + +#if defined(CPU_FEATURES_ARCH_MIPS32) || defined(CPU_FEATURES_ARCH_MIPS64) +#define CPU_FEATURES_ARCH_MIPS +#endif + +#if defined(__powerpc__) +#define CPU_FEATURES_ARCH_PPC +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Os +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__linux__) +#define CPU_FEATURES_OS_LINUX_OR_ANDROID +#endif + +#if defined(__ANDROID__) +#define CPU_FEATURES_OS_ANDROID +#endif + +#if (defined(_WIN64) || defined(_WIN32)) +#define CPU_FEATURES_OS_WINDOWS +#endif + +#if (defined(__apple__) || defined(__APPLE__) || defined(__MACH__)) +#define CPU_FEATURES_OS_DARWIN +#endif + +//////////////////////////////////////////////////////////////////////////////// +// 
Compilers +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__clang__) +#define CPU_FEATURES_COMPILER_CLANG +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#define CPU_FEATURES_COMPILER_GCC +#endif + +#if defined(_MSC_VER) +#define CPU_FEATURES_COMPILER_MSC +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Cpp +//////////////////////////////////////////////////////////////////////////////// + +#if defined(__cplusplus) +#define CPU_FEATURES_START_CPP_NAMESPACE \ + namespace cpu_features { \ + extern "C" { +#define CPU_FEATURES_END_CPP_NAMESPACE \ + } \ + } +#else +#define CPU_FEATURES_START_CPP_NAMESPACE +#define CPU_FEATURES_END_CPP_NAMESPACE +#endif + +//////////////////////////////////////////////////////////////////////////////// +// Compiler flags +//////////////////////////////////////////////////////////////////////////////// + +// Use the following to check if a feature is known to be available at +// compile time. See README.md for an example. 
+#if defined(CPU_FEATURES_ARCH_X86) + +#if defined(__AES__) +#define CPU_FEATURES_COMPILED_X86_AES 1 +#else +#define CPU_FEATURES_COMPILED_X86_AES 0 +#endif // defined(__AES__) + +#if defined(__F16C__) +#define CPU_FEATURES_COMPILED_X86_F16C 1 +#else +#define CPU_FEATURES_COMPILED_X86_F16C 0 +#endif // defined(__F16C__) + +#if defined(__BMI__) +#define CPU_FEATURES_COMPILED_X86_BMI 1 +#else +#define CPU_FEATURES_COMPILED_X86_BMI 0 +#endif // defined(__BMI__) + +#if defined(__BMI2__) +#define CPU_FEATURES_COMPILED_X86_BMI2 1 +#else +#define CPU_FEATURES_COMPILED_X86_BMI2 0 +#endif // defined(__BMI2__) + +#if (defined(__SSE__) || (_M_IX86_FP >= 1)) +#define CPU_FEATURES_COMPILED_X86_SSE 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE 0 +#endif + +#if (defined(__SSE2__) || (_M_IX86_FP >= 2)) +#define CPU_FEATURES_COMPILED_X86_SSE2 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE2 0 +#endif + +#if defined(__SSE3__) +#define CPU_FEATURES_COMPILED_X86_SSE3 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE3 0 +#endif // defined(__SSE3__) + +#if defined(__SSSE3__) +#define CPU_FEATURES_COMPILED_X86_SSSE3 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSSE3 0 +#endif // defined(__SSSE3__) + +#if defined(__SSE4_1__) +#define CPU_FEATURES_COMPILED_X86_SSE4_1 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE4_1 0 +#endif // defined(__SSE4_1__) + +#if defined(__SSE4_2__) +#define CPU_FEATURES_COMPILED_X86_SSE4_2 1 +#else +#define CPU_FEATURES_COMPILED_X86_SSE4_2 0 +#endif // defined(__SSE4_2__) + +#if defined(__AVX__) +#define CPU_FEATURES_COMPILED_X86_AVX 1 +#else +#define CPU_FEATURES_COMPILED_X86_AVX 0 +#endif // defined(__AVX__) + +#if defined(__AVX2__) +#define CPU_FEATURES_COMPILED_X86_AVX2 1 +#else +#define CPU_FEATURES_COMPILED_X86_AVX2 0 +#endif // defined(__AVX2__) + +#endif // defined(CPU_FEATURES_ARCH_X86) + +#if defined(CPU_FEATURES_ARCH_ANY_ARM) +#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#define CPU_FEATURES_COMPILED_ANY_ARM_NEON 1 +#else +#define CPU_FEATURES_COMPILED_ANY_ARM_NEON 0 
+#endif // defined(__ARM_NEON__) || defined(__ARM_NEON) +#endif // defined(CPU_FEATURES_ARCH_ANY_ARM) + +#if defined(CPU_FEATURES_ARCH_MIPS) +#if defined(__mips_msa) +#define CPU_FEATURES_COMPILED_MIPS_MSA 1 +#else +#define CPU_FEATURES_COMPILED_MIPS_MSA 0 +#endif // defined(__mips_msa) +#endif // defined(CPU_FEATURES_ARCH_MIPS) + +#endif // CPU_FEATURES_INCLUDE_CPU_FEATURES_MACROS_H_ diff --git a/cpu_features/include/cpuinfo_aarch64.h b/cpu_features/include/cpuinfo_aarch64.h new file mode 100644 index 0000000..d85d46d --- /dev/null +++ b/cpu_features/include/cpuinfo_aarch64.h @@ -0,0 +1,156 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_AARCH64_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_AARCH64_H_ + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + int fp : 1; // Floating-point. + int asimd : 1; // Advanced SIMD. + int evtstrm : 1; // Generic timer generated events. + int aes : 1; // Hardware-accelerated Advanced Encryption Standard. + int pmull : 1; // Polynomial multiply long. + int sha1 : 1; // Hardware-accelerated SHA1. + int sha2 : 1; // Hardware-accelerated SHA2-256. + int crc32 : 1; // Hardware-accelerated CRC-32. + int atomics : 1; // Armv8.1 atomic instructions. + int fphp : 1; // Half-precision floating point support. + int asimdhp : 1; // Advanced SIMD half-precision support. 
+ int cpuid : 1; // Access to certain ID registers. + int asimdrdm : 1; // Rounding Double Multiply Accumulate/Subtract. + int jscvt : 1; // Support for JavaScript conversion. + int fcma : 1; // Floating point complex numbers. + int lrcpc : 1; // Support for weaker release consistency. + int dcpop : 1; // Data persistence writeback. + int sha3 : 1; // Hardware-accelerated SHA3. + int sm3 : 1; // Hardware-accelerated SM3. + int sm4 : 1; // Hardware-accelerated SM4. + int asimddp : 1; // Dot product instruction. + int sha512 : 1; // Hardware-accelerated SHA512. + int sve : 1; // Scalable Vector Extension. + int asimdfhm : 1; // Additional half-precision instructions. + int dit : 1; // Data independent timing. + int uscat : 1; // Unaligned atomics support. + int ilrcpc : 1; // Additional support for weaker release consistency. + int flagm : 1; // Flag manipulation instructions. + int ssbs : 1; // Speculative Store Bypass Safe PSTATE bit. + int sb : 1; // Speculation barrier. + int paca : 1; // Address authentication. + int pacg : 1; // Generic authentication. + int dcpodp : 1; // Data cache clean to point of persistence. + int sve2 : 1; // Scalable Vector Extension (version 2). + int sveaes : 1; // SVE AES instructions. + int svepmull : 1; // SVE polynomial multiply long instructions. + int svebitperm : 1; // SVE bit permute instructions. + int svesha3 : 1; // SVE SHA3 instructions. + int svesm4 : 1; // SVE SM4 instructions. + int flagm2 : 1; // Additional flag manipulation instructions. + int frint : 1; // Floating point to integer rounding. + int svei8mm : 1; // SVE Int8 matrix multiplication instructions. + int svef32mm : 1; // SVE FP32 matrix multiplication instruction. + int svef64mm : 1; // SVE FP64 matrix multiplication instructions. + int svebf16 : 1; // SVE BFloat16 instructions. + int i8mm : 1; // Int8 matrix multiplication instructions. + int bf16 : 1; // BFloat16 instructions. + int dgh : 1; // Data Gathering Hint instruction. 
+ int rng : 1; // True random number generator support. + int bti : 1; // Branch target identification. + + // Make sure to update Aarch64FeaturesEnum below if you add a field here. +} Aarch64Features; + +typedef struct { + Aarch64Features features; + int implementer; + int variant; + int part; + int revision; +} Aarch64Info; + +Aarch64Info GetAarch64Info(void); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + AARCH64_FP, + AARCH64_ASIMD, + AARCH64_EVTSTRM, + AARCH64_AES, + AARCH64_PMULL, + AARCH64_SHA1, + AARCH64_SHA2, + AARCH64_CRC32, + AARCH64_ATOMICS, + AARCH64_FPHP, + AARCH64_ASIMDHP, + AARCH64_CPUID, + AARCH64_ASIMDRDM, + AARCH64_JSCVT, + AARCH64_FCMA, + AARCH64_LRCPC, + AARCH64_DCPOP, + AARCH64_SHA3, + AARCH64_SM3, + AARCH64_SM4, + AARCH64_ASIMDDP, + AARCH64_SHA512, + AARCH64_SVE, + AARCH64_ASIMDFHM, + AARCH64_DIT, + AARCH64_USCAT, + AARCH64_ILRCPC, + AARCH64_FLAGM, + AARCH64_SSBS, + AARCH64_SB, + AARCH64_PACA, + AARCH64_PACG, + AARCH64_DCPODP, + AARCH64_SVE2, + AARCH64_SVEAES, + AARCH64_SVEPMULL, + AARCH64_SVEBITPERM, + AARCH64_SVESHA3, + AARCH64_SVESM4, + AARCH64_FLAGM2, + AARCH64_FRINT, + AARCH64_SVEI8MM, + AARCH64_SVEF32MM, + AARCH64_SVEF64MM, + AARCH64_SVEBF16, + AARCH64_I8MM, + AARCH64_BF16, + AARCH64_DGH, + AARCH64_RNG, + AARCH64_BTI, + AARCH64_LAST_, +} Aarch64FeaturesEnum; + +int GetAarch64FeaturesEnumValue(const Aarch64Features* features, + Aarch64FeaturesEnum value); + +const char* GetAarch64FeaturesEnumName(Aarch64FeaturesEnum); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_AARCH64) +#error "Including cpuinfo_aarch64.h from a non-aarch64 target." 
+#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_AARCH64_H_ diff --git a/cpu_features/include/cpuinfo_arm.h b/cpu_features/include/cpuinfo_arm.h new file mode 100644 index 0000000..0952d7c --- /dev/null +++ b/cpu_features/include/cpuinfo_arm.h @@ -0,0 +1,121 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_ARM_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_ARM_H_ + +#include <stdint.h> // uint32_t + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + int swp : 1; // SWP instruction (atomic read-modify-write) + int half : 1; // Half-word loads and stores + int thumb : 1; // Thumb (16-bit instruction set) + int _26bit : 1; // "26 Bit" Model (Processor status register folded into + // program counter) + int fastmult : 1; // 32x32->64-bit multiplication + int fpa : 1; // Floating point accelerator + int vfp : 1; // Vector Floating Point. + int edsp : 1; // DSP extensions (the 'e' variant of the ARM9 CPUs, and all + // others above) + int java : 1; // Jazelle (Java bytecode accelerator) + int iwmmxt : 1; // Intel Wireless MMX Technology. + int crunch : 1; // MaverickCrunch coprocessor + int thumbee : 1; // ThumbEE + int neon : 1; // Advanced SIMD. 
+ int vfpv3 : 1; // VFP version 3 + int vfpv3d16 : 1; // VFP version 3 with 16 D-registers + int tls : 1; // TLS register + int vfpv4 : 1; // VFP version 4 with fast context switching + int idiva : 1; // SDIV and UDIV hardware division in ARM mode. + int idivt : 1; // SDIV and UDIV hardware division in Thumb mode. + int vfpd32 : 1; // VFP with 32 D-registers + int lpae : 1; // Large Physical Address Extension (>4GB physical memory on + // 32-bit architecture) + int evtstrm : 1; // kernel event stream using generic architected timer + int aes : 1; // Hardware-accelerated Advanced Encryption Standard. + int pmull : 1; // Polynomial multiply long. + int sha1 : 1; // Hardware-accelerated SHA1. + int sha2 : 1; // Hardware-accelerated SHA2-256. + int crc32 : 1; // Hardware-accelerated CRC-32. + + // Make sure to update ArmFeaturesEnum below if you add a field here. +} ArmFeatures; + +typedef struct { + ArmFeatures features; + int implementer; + int architecture; + int variant; + int part; + int revision; +} ArmInfo; + +// TODO(user): Add macros to know which features are present at compile +// time. + +ArmInfo GetArmInfo(void); + +// Compute CpuId from ArmInfo. 
+uint32_t GetArmCpuId(const ArmInfo* const info); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + ARM_SWP, + ARM_HALF, + ARM_THUMB, + ARM_26BIT, + ARM_FASTMULT, + ARM_FPA, + ARM_VFP, + ARM_EDSP, + ARM_JAVA, + ARM_IWMMXT, + ARM_CRUNCH, + ARM_THUMBEE, + ARM_NEON, + ARM_VFPV3, + ARM_VFPV3D16, + ARM_TLS, + ARM_VFPV4, + ARM_IDIVA, + ARM_IDIVT, + ARM_VFPD32, + ARM_LPAE, + ARM_EVTSTRM, + ARM_AES, + ARM_PMULL, + ARM_SHA1, + ARM_SHA2, + ARM_CRC32, + ARM_LAST_, +} ArmFeaturesEnum; + +int GetArmFeaturesEnumValue(const ArmFeatures* features, ArmFeaturesEnum value); + +const char* GetArmFeaturesEnumName(ArmFeaturesEnum); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_ARM) +#error "Including cpuinfo_arm.h from a non-arm target." +#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_ARM_H_ diff --git a/cpu_features/include/cpuinfo_mips.h b/cpu_features/include/cpuinfo_mips.h new file mode 100644 index 0000000..9e5e7fc --- /dev/null +++ b/cpu_features/include/cpuinfo_mips.h @@ -0,0 +1,60 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_MIPS_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_MIPS_H_ + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + int msa : 1; // MIPS SIMD Architecture + // https://www.mips.com/products/architectures/ase/simd/ + int eva : 1; // Enhanced Virtual Addressing + // https://www.mips.com/products/architectures/mips64/ + int r6 : 1; // True if is release 6 of the processor. + + // Make sure to update MipsFeaturesEnum below if you add a field here. +} MipsFeatures; + +typedef struct { + MipsFeatures features; +} MipsInfo; + +MipsInfo GetMipsInfo(void); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + MIPS_MSA, + MIPS_EVA, + MIPS_R6, + MIPS_LAST_, +} MipsFeaturesEnum; + +int GetMipsFeaturesEnumValue(const MipsFeatures* features, + MipsFeaturesEnum value); + +const char* GetMipsFeaturesEnumName(MipsFeaturesEnum); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_MIPS) +#error "Including cpuinfo_mips.h from a non-mips target." +#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_MIPS_H_ diff --git a/cpu_features/include/cpuinfo_ppc.h b/cpu_features/include/cpuinfo_ppc.h new file mode 100644 index 0000000..f691194 --- /dev/null +++ b/cpu_features/include/cpuinfo_ppc.h @@ -0,0 +1,146 @@ +// Copyright 2018 IBM +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_PPC_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_PPC_H_ + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" +#include "internal/hwcaps.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + int ppc32 : 1; + int ppc64 : 1; + int ppc601 : 1; + int altivec : 1; + int fpu : 1; + int mmu : 1; + int mac_4xx : 1; + int unifiedcache : 1; + int spe : 1; + int efpsingle : 1; + int efpdouble : 1; + int no_tb : 1; + int power4 : 1; + int power5 : 1; + int power5plus : 1; + int cell : 1; + int booke : 1; + int smt : 1; + int icachesnoop : 1; + int arch205 : 1; + int pa6t : 1; + int dfp : 1; + int power6ext : 1; + int arch206 : 1; + int vsx : 1; + int pseries_perfmon_compat : 1; + int truele : 1; + int ppcle : 1; + int arch207 : 1; + int htm : 1; + int dscr : 1; + int ebb : 1; + int isel : 1; + int tar : 1; + int vcrypto : 1; + int htm_nosc : 1; + int arch300 : 1; + int ieee128 : 1; + int darn : 1; + int scv : 1; + int htm_no_suspend : 1; + + // Make sure to update PPCFeaturesEnum below if you add a field here. +} PPCFeatures; + +typedef struct { + PPCFeatures features; +} PPCInfo; + +// This function is guaranteed to be malloc, memset and memcpy free. 
+PPCInfo GetPPCInfo(void); + +typedef struct { + char platform[64]; // 0 terminated string + char model[64]; // 0 terminated string + char machine[64]; // 0 terminated string + char cpu[64]; // 0 terminated string + PlatformType type; +} PPCPlatformStrings; + +PPCPlatformStrings GetPPCPlatformStrings(void); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + PPC_32, /* 32 bit mode execution */ + PPC_64, /* 64 bit mode execution */ + PPC_601_INSTR, /* Old POWER ISA */ + PPC_HAS_ALTIVEC, /* SIMD Unit */ + PPC_HAS_FPU, /* Floating Point Unit */ + PPC_HAS_MMU, /* Memory management unit */ + PPC_HAS_4xxMAC, + PPC_UNIFIED_CACHE, /* Unified instruction and data cache */ + PPC_HAS_SPE, /* Signal processing extension unit */ + PPC_HAS_EFP_SINGLE, /* SPE single precision fpu */ + PPC_HAS_EFP_DOUBLE, /* SPE double precision fpu */ + PPC_NO_TB, /* No timebase */ + PPC_POWER4, + PPC_POWER5, + PPC_POWER5_PLUS, + PPC_CELL, /* Cell broadband engine */ + PPC_BOOKE, /* Embedded ISA */ + PPC_SMT, /* Simultaneous multi-threading */ + PPC_ICACHE_SNOOP, + PPC_ARCH_2_05, /* ISA 2.05 - POWER6 */ + PPC_PA6T, /* PA Semi 6T core ISA */ + PPC_HAS_DFP, /* Decimal floating point unit */ + PPC_POWER6_EXT, + PPC_ARCH_2_06, /* ISA 2.06 - POWER7 */ + PPC_HAS_VSX, /* Vector-scalar extension */ + PPC_PSERIES_PERFMON_COMPAT, /* Set of backwards compatible performance + monitoring events */ + PPC_TRUE_LE, + PPC_PPC_LE, + PPC_ARCH_2_07, /* ISA 2.07 - POWER8 */ + PPC_HTM, /* Hardware Transactional Memory */ + PPC_DSCR, /* Data stream control register */ + PPC_EBB, /* Event-based branching */ + PPC_ISEL, /* Integer select instructions */ + PPC_TAR, /* Target address register */ + PPC_VEC_CRYPTO, /* Vector cryptography instructions */ + PPC_HTM_NOSC, /* Transactions aborted when syscall made */ + PPC_ARCH_3_00, /* ISA 3.00 - POWER9 */ + PPC_HAS_IEEE128, /* VSX IEEE Binary Float 128-bit */ + PPC_DARN, /* Deliver a random number 
instruction */ + PPC_SCV, /* scv syscall */ + PPC_HTM_NO_SUSPEND, /* TM w/out suspended state */ + PPC_LAST_, +} PPCFeaturesEnum; + +int GetPPCFeaturesEnumValue(const PPCFeatures* features, PPCFeaturesEnum value); + +const char* GetPPCFeaturesEnumName(PPCFeaturesEnum); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_PPC) +#error "Including cpuinfo_ppc.h from a non-ppc target." +#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_PPC_H_ diff --git a/cpu_features/include/cpuinfo_x86.h b/cpu_features/include/cpuinfo_x86.h new file mode 100644 index 0000000..8d40f71 --- /dev/null +++ b/cpu_features/include/cpuinfo_x86.h @@ -0,0 +1,231 @@ +// Copyright 2017 Google LLC +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_INCLUDE_CPUINFO_X86_H_ +#define CPU_FEATURES_INCLUDE_CPUINFO_X86_H_ + +#include "cpu_features_cache_info.h" +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +// See https://en.wikipedia.org/wiki/CPUID for a list of x86 cpu features. +// The field names are based on the short name provided in the wikipedia tables. 
+typedef struct { + int fpu : 1; + int tsc : 1; + int cx8 : 1; + int clfsh : 1; + int mmx : 1; + int aes : 1; + int erms : 1; + int f16c : 1; + int fma4 : 1; + int fma3 : 1; + int vaes : 1; + int vpclmulqdq : 1; + int bmi1 : 1; + int hle : 1; + int bmi2 : 1; + int rtm : 1; + int rdseed : 1; + int clflushopt : 1; + int clwb : 1; + + int sse : 1; + int sse2 : 1; + int sse3 : 1; + int ssse3 : 1; + int sse4_1 : 1; + int sse4_2 : 1; + int sse4a : 1; + + int avx : 1; + int avx2 : 1; + + int avx512f : 1; + int avx512cd : 1; + int avx512er : 1; + int avx512pf : 1; + int avx512bw : 1; + int avx512dq : 1; + int avx512vl : 1; + int avx512ifma : 1; + int avx512vbmi : 1; + int avx512vbmi2 : 1; + int avx512vnni : 1; + int avx512bitalg : 1; + int avx512vpopcntdq : 1; + int avx512_4vnniw : 1; + int avx512_4vbmi2 : 1; + int avx512_second_fma : 1; + int avx512_4fmaps : 1; + int avx512_bf16 : 1; + int avx512_vp2intersect : 1; + int amx_bf16 : 1; + int amx_tile : 1; + int amx_int8 : 1; + + int pclmulqdq : 1; + int smx : 1; + int sgx : 1; + int cx16 : 1; // aka. CMPXCHG16B + int sha : 1; + int popcnt : 1; + int movbe : 1; + int rdrnd : 1; + + int dca : 1; + int ss : 1; + // Make sure to update X86FeaturesEnum below if you add a field here. +} X86Features; + +typedef struct { + X86Features features; + int family; + int model; + int stepping; + char vendor[13]; // 0 terminated string +} X86Info; + +// Calls cpuid and returns an initialized X86Info. +// This function is guaranteed to be malloc, memset and memcpy free. +X86Info GetX86Info(void); + +// Returns cache hierarchy information. +// Can call cpuid multiple times. +// Only works on Intel CPUs at the moment. +// This function is guaranteed to be malloc, memset and memcpy free. 
+CacheInfo GetX86CacheInfo(void); + +typedef enum { + X86_UNKNOWN, + INTEL_CORE, // CORE + INTEL_PNR, // PENRYN + INTEL_NHM, // NEHALEM + INTEL_ATOM_BNL, // BONNELL + INTEL_WSM, // WESTMERE + INTEL_SNB, // SANDYBRIDGE + INTEL_IVB, // IVYBRIDGE + INTEL_ATOM_SMT, // SILVERMONT + INTEL_HSW, // HASWELL + INTEL_BDW, // BROADWELL + INTEL_SKL, // SKYLAKE + INTEL_ATOM_GMT, // GOLDMONT + INTEL_KBL, // KABY LAKE + INTEL_CFL, // COFFEE LAKE + INTEL_WHL, // WHISKEY LAKE + INTEL_CNL, // CANNON LAKE + INTEL_ICL, // ICE LAKE + INTEL_TGL, // TIGER LAKE + INTEL_SPR, // SAPPHIRE RAPIDS + AMD_HAMMER, // K8 + AMD_K10, // K10 + AMD_BOBCAT, // K14 + AMD_BULLDOZER, // K15 + AMD_JAGUAR, // K16 + AMD_ZEN, // K17 +} X86Microarchitecture; + +// Returns the underlying microarchitecture by looking at X86Info's vendor, +// family and model. +X86Microarchitecture GetX86Microarchitecture(const X86Info* info); + +// Calls cpuid and fills the brand_string. +// - brand_string *must* be of size 49 (beware of array decaying). +// - brand_string will be zero terminated. +// - This function calls memcpy. 
+void FillX86BrandString(char brand_string[49]); + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +typedef enum { + X86_FPU, + X86_TSC, + X86_CX8, + X86_CLFSH, + X86_MMX, + X86_AES, + X86_ERMS, + X86_F16C, + X86_FMA4, + X86_FMA3, + X86_VAES, + X86_VPCLMULQDQ, + X86_BMI1, + X86_HLE, + X86_BMI2, + X86_RTM, + X86_RDSEED, + X86_CLFLUSHOPT, + X86_CLWB, + X86_SSE, + X86_SSE2, + X86_SSE3, + X86_SSSE3, + X86_SSE4_1, + X86_SSE4_2, + X86_SSE4A, + X86_AVX, + X86_AVX2, + X86_AVX512F, + X86_AVX512CD, + X86_AVX512ER, + X86_AVX512PF, + X86_AVX512BW, + X86_AVX512DQ, + X86_AVX512VL, + X86_AVX512IFMA, + X86_AVX512VBMI, + X86_AVX512VBMI2, + X86_AVX512VNNI, + X86_AVX512BITALG, + X86_AVX512VPOPCNTDQ, + X86_AVX512_4VNNIW, + X86_AVX512_4VBMI2, + X86_AVX512_SECOND_FMA, + X86_AVX512_4FMAPS, + X86_AVX512_BF16, + X86_AVX512_VP2INTERSECT, + X86_AMX_BF16, + X86_AMX_TILE, + X86_AMX_INT8, + X86_PCLMULQDQ, + X86_SMX, + X86_SGX, + X86_CX16, + X86_SHA, + X86_POPCNT, + X86_MOVBE, + X86_RDRND, + X86_DCA, + X86_SS, + X86_LAST_, +} X86FeaturesEnum; + +int GetX86FeaturesEnumValue(const X86Features* features, X86FeaturesEnum value); + +const char* GetX86FeaturesEnumName(X86FeaturesEnum); + +const char* GetX86MicroarchitectureName(X86Microarchitecture); + +CPU_FEATURES_END_CPP_NAMESPACE + +#if !defined(CPU_FEATURES_ARCH_X86) +#error "Including cpuinfo_x86.h from a non-x86 target." +#endif + +#endif // CPU_FEATURES_INCLUDE_CPUINFO_X86_H_ diff --git a/cpu_features/include/internal/bit_utils.h b/cpu_features/include/internal/bit_utils.h new file mode 100644 index 0000000..3467ff9 --- /dev/null +++ b/cpu_features/include/internal/bit_utils.h @@ -0,0 +1,40 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_BIT_UTILS_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_BIT_UTILS_H_ + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +inline static bool IsBitSet(uint32_t reg, uint32_t bit) { + return (reg >> bit) & 0x1; +} + +inline static uint32_t ExtractBitRange(uint32_t reg, uint32_t msb, + uint32_t lsb) { + const uint64_t bits = msb - lsb + 1ULL; + const uint64_t mask = (1ULL << bits) - 1ULL; + assert(msb >= lsb); + return (reg >> lsb) & mask; +} + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_BIT_UTILS_H_ diff --git a/cpu_features/include/internal/cpuid_x86.h b/cpu_features/include/internal/cpuid_x86.h new file mode 100644 index 0000000..33327a4 --- /dev/null +++ b/cpu_features/include/internal/cpuid_x86.h @@ -0,0 +1,37 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_CPUID_X86_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_CPUID_X86_H_ + +#include <stdint.h> + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +// A struct to hold the result of a call to cpuid. +typedef struct { + uint32_t eax, ebx, ecx, edx; +} Leaf; + +// Returns the result of a call to the cpuid instruction. +Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx); + +// Returns the eax value of the XCR0 register. +uint32_t GetXCR0Eax(void); + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_CPUID_X86_H_ diff --git a/cpu_features/include/internal/filesystem.h b/cpu_features/include/internal/filesystem.h new file mode 100644 index 0000000..d8f2f6a --- /dev/null +++ b/cpu_features/include/internal/filesystem.h @@ -0,0 +1,39 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// An interface for the filesystem that allows mocking the filesystem in +// unittests. +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_FILESYSTEM_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_FILESYSTEM_H_ + +#include <stddef.h> +#include <stdint.h> + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +// Same as linux "open(filename, O_RDONLY)", retries automatically on EINTR. +int CpuFeatures_OpenFile(const char* filename); + +// Same as linux "read(file_descriptor, buffer, buffer_size)", retries +// automatically on EINTR. 
+int CpuFeatures_ReadFile(int file_descriptor, void* buffer, size_t buffer_size); + +// Same as linux "close(file_descriptor)". +void CpuFeatures_CloseFile(int file_descriptor); + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_FILESYSTEM_H_ diff --git a/cpu_features/include/internal/hwcaps.h b/cpu_features/include/internal/hwcaps.h new file mode 100644 index 0000000..62037c8 --- /dev/null +++ b/cpu_features/include/internal/hwcaps.h @@ -0,0 +1,186 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Interface to retrieve hardware capabilities. It relies on Linux's getauxval +// or `/proc/self/auxv` under the hood. +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_HWCAPS_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_HWCAPS_H_ + +#include <stdbool.h> +#include <stdint.h> + +#include "cpu_features_macros.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +// To avoid depending on the linux kernel we reproduce the architecture specific +// constants here. 
+ +// http://elixir.free-electrons.com/linux/latest/source/arch/arm64/include/uapi/asm/hwcap.h +#define AARCH64_HWCAP_FP (1UL << 0) +#define AARCH64_HWCAP_ASIMD (1UL << 1) +#define AARCH64_HWCAP_EVTSTRM (1UL << 2) +#define AARCH64_HWCAP_AES (1UL << 3) +#define AARCH64_HWCAP_PMULL (1UL << 4) +#define AARCH64_HWCAP_SHA1 (1UL << 5) +#define AARCH64_HWCAP_SHA2 (1UL << 6) +#define AARCH64_HWCAP_CRC32 (1UL << 7) +#define AARCH64_HWCAP_ATOMICS (1UL << 8) +#define AARCH64_HWCAP_FPHP (1UL << 9) +#define AARCH64_HWCAP_ASIMDHP (1UL << 10) +#define AARCH64_HWCAP_CPUID (1UL << 11) +#define AARCH64_HWCAP_ASIMDRDM (1UL << 12) +#define AARCH64_HWCAP_JSCVT (1UL << 13) +#define AARCH64_HWCAP_FCMA (1UL << 14) +#define AARCH64_HWCAP_LRCPC (1UL << 15) +#define AARCH64_HWCAP_DCPOP (1UL << 16) +#define AARCH64_HWCAP_SHA3 (1UL << 17) +#define AARCH64_HWCAP_SM3 (1UL << 18) +#define AARCH64_HWCAP_SM4 (1UL << 19) +#define AARCH64_HWCAP_ASIMDDP (1UL << 20) +#define AARCH64_HWCAP_SHA512 (1UL << 21) +#define AARCH64_HWCAP_SVE (1UL << 22) +#define AARCH64_HWCAP_ASIMDFHM (1UL << 23) +#define AARCH64_HWCAP_DIT (1UL << 24) +#define AARCH64_HWCAP_USCAT (1UL << 25) +#define AARCH64_HWCAP_ILRCPC (1UL << 26) +#define AARCH64_HWCAP_FLAGM (1UL << 27) +#define AARCH64_HWCAP_SSBS (1UL << 28) +#define AARCH64_HWCAP_SB (1UL << 29) +#define AARCH64_HWCAP_PACA (1UL << 30) +#define AARCH64_HWCAP_PACG (1UL << 31) + +#define AARCH64_HWCAP2_DCPODP (1UL << 0) +#define AARCH64_HWCAP2_SVE2 (1UL << 1) +#define AARCH64_HWCAP2_SVEAES (1UL << 2) +#define AARCH64_HWCAP2_SVEPMULL (1UL << 3) +#define AARCH64_HWCAP2_SVEBITPERM (1UL << 4) +#define AARCH64_HWCAP2_SVESHA3 (1UL << 5) +#define AARCH64_HWCAP2_SVESM4 (1UL << 6) +#define AARCH64_HWCAP2_FLAGM2 (1UL << 7) +#define AARCH64_HWCAP2_FRINT (1UL << 8) +#define AARCH64_HWCAP2_SVEI8MM (1UL << 9) +#define AARCH64_HWCAP2_SVEF32MM (1UL << 10) +#define AARCH64_HWCAP2_SVEF64MM (1UL << 11) +#define AARCH64_HWCAP2_SVEBF16 (1UL << 12) +#define AARCH64_HWCAP2_I8MM (1UL << 13) +#define 
AARCH64_HWCAP2_BF16 (1UL << 14) +#define AARCH64_HWCAP2_DGH (1UL << 15) +#define AARCH64_HWCAP2_RNG (1UL << 16) +#define AARCH64_HWCAP2_BTI (1UL << 17) + +// http://elixir.free-electrons.com/linux/latest/source/arch/arm/include/uapi/asm/hwcap.h +#define ARM_HWCAP_SWP (1UL << 0) +#define ARM_HWCAP_HALF (1UL << 1) +#define ARM_HWCAP_THUMB (1UL << 2) +#define ARM_HWCAP_26BIT (1UL << 3) +#define ARM_HWCAP_FAST_MULT (1UL << 4) +#define ARM_HWCAP_FPA (1UL << 5) +#define ARM_HWCAP_VFP (1UL << 6) +#define ARM_HWCAP_EDSP (1UL << 7) +#define ARM_HWCAP_JAVA (1UL << 8) +#define ARM_HWCAP_IWMMXT (1UL << 9) +#define ARM_HWCAP_CRUNCH (1UL << 10) +#define ARM_HWCAP_THUMBEE (1UL << 11) +#define ARM_HWCAP_NEON (1UL << 12) +#define ARM_HWCAP_VFPV3 (1UL << 13) +#define ARM_HWCAP_VFPV3D16 (1UL << 14) +#define ARM_HWCAP_TLS (1UL << 15) +#define ARM_HWCAP_VFPV4 (1UL << 16) +#define ARM_HWCAP_IDIVA (1UL << 17) +#define ARM_HWCAP_IDIVT (1UL << 18) +#define ARM_HWCAP_VFPD32 (1UL << 19) +#define ARM_HWCAP_LPAE (1UL << 20) +#define ARM_HWCAP_EVTSTRM (1UL << 21) +#define ARM_HWCAP2_AES (1UL << 0) +#define ARM_HWCAP2_PMULL (1UL << 1) +#define ARM_HWCAP2_SHA1 (1UL << 2) +#define ARM_HWCAP2_SHA2 (1UL << 3) +#define ARM_HWCAP2_CRC32 (1UL << 4) + +// http://elixir.free-electrons.com/linux/latest/source/arch/mips/include/uapi/asm/hwcap.h +#define MIPS_HWCAP_R6 (1UL << 0) +#define MIPS_HWCAP_MSA (1UL << 1) +#define MIPS_HWCAP_CRC32 (1UL << 2) + +// http://elixir.free-electrons.com/linux/latest/source/arch/powerpc/include/uapi/asm/cputable.h +#ifndef _UAPI__ASM_POWERPC_CPUTABLE_H +/* in AT_HWCAP */ +#define PPC_FEATURE_32 0x80000000 +#define PPC_FEATURE_64 0x40000000 +#define PPC_FEATURE_601_INSTR 0x20000000 +#define PPC_FEATURE_HAS_ALTIVEC 0x10000000 +#define PPC_FEATURE_HAS_FPU 0x08000000 +#define PPC_FEATURE_HAS_MMU 0x04000000 +#define PPC_FEATURE_HAS_4xxMAC 0x02000000 +#define PPC_FEATURE_UNIFIED_CACHE 0x01000000 +#define PPC_FEATURE_HAS_SPE 0x00800000 +#define PPC_FEATURE_HAS_EFP_SINGLE 
0x00400000 +#define PPC_FEATURE_HAS_EFP_DOUBLE 0x00200000 +#define PPC_FEATURE_NO_TB 0x00100000 +#define PPC_FEATURE_POWER4 0x00080000 +#define PPC_FEATURE_POWER5 0x00040000 +#define PPC_FEATURE_POWER5_PLUS 0x00020000 +#define PPC_FEATURE_CELL 0x00010000 +#define PPC_FEATURE_BOOKE 0x00008000 +#define PPC_FEATURE_SMT 0x00004000 +#define PPC_FEATURE_ICACHE_SNOOP 0x00002000 +#define PPC_FEATURE_ARCH_2_05 0x00001000 +#define PPC_FEATURE_PA6T 0x00000800 +#define PPC_FEATURE_HAS_DFP 0x00000400 +#define PPC_FEATURE_POWER6_EXT 0x00000200 +#define PPC_FEATURE_ARCH_2_06 0x00000100 +#define PPC_FEATURE_HAS_VSX 0x00000080 + +#define PPC_FEATURE_PSERIES_PERFMON_COMPAT 0x00000040 + +/* Reserved - do not use 0x00000004 */ +#define PPC_FEATURE_TRUE_LE 0x00000002 +#define PPC_FEATURE_PPC_LE 0x00000001 + +/* in AT_HWCAP2 */ +#define PPC_FEATURE2_ARCH_2_07 0x80000000 +#define PPC_FEATURE2_HTM 0x40000000 +#define PPC_FEATURE2_DSCR 0x20000000 +#define PPC_FEATURE2_EBB 0x10000000 +#define PPC_FEATURE2_ISEL 0x08000000 +#define PPC_FEATURE2_TAR 0x04000000 +#define PPC_FEATURE2_VEC_CRYPTO 0x02000000 +#define PPC_FEATURE2_HTM_NOSC 0x01000000 +#define PPC_FEATURE2_ARCH_3_00 0x00800000 +#define PPC_FEATURE2_HAS_IEEE128 0x00400000 +#define PPC_FEATURE2_DARN 0x00200000 +#define PPC_FEATURE2_SCV 0x00100000 +#define PPC_FEATURE2_HTM_NO_SUSPEND 0x00080000 +#endif + +typedef struct { + unsigned long hwcaps; + unsigned long hwcaps2; +} HardwareCapabilities; + +HardwareCapabilities CpuFeatures_GetHardwareCapabilities(void); +bool CpuFeatures_IsHwCapsSet(const HardwareCapabilities hwcaps_mask, + const HardwareCapabilities hwcaps); + +typedef struct { + char platform[64]; // 0 terminated string + char base_platform[64]; // 0 terminated string +} PlatformType; + +PlatformType CpuFeatures_GetPlatformType(void); + +CPU_FEATURES_END_CPP_NAMESPACE + +#endif // CPU_FEATURES_INCLUDE_INTERNAL_HWCAPS_H_ diff --git a/cpu_features/include/internal/stack_line_reader.h 
b/cpu_features/include/internal/stack_line_reader.h new file mode 100644 index 0000000..39c1b8b --- /dev/null +++ b/cpu_features/include/internal/stack_line_reader.h @@ -0,0 +1,49 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Reads a file line by line and stores the data on the stack. This allows +// parsing files in one go without allocating. +#ifndef CPU_FEATURES_INCLUDE_INTERNAL_STACK_LINE_READER_H_ +#define CPU_FEATURES_INCLUDE_INTERNAL_STACK_LINE_READER_H_ + +#include <stdbool.h> + +#include "cpu_features_macros.h" +#include "internal/string_view.h" + +CPU_FEATURES_START_CPP_NAMESPACE + +typedef struct { + char buffer[STACK_LINE_READER_BUFFER_SIZE]; + StringView view; + int fd; + bool skip_mode; +} StackLineReader; + +// Initializes a StackLineReader. +void StackLineReader_Initialize(StackLineReader* reader, int fd); + +typedef struct { + StringView line; // A view of the line. + bool eof; // Nothing more to read, we reached EOF. + bool full_line; // If false the line was truncated to + // STACK_LINE_READER_BUFFER_SIZE. +} LineResult; + +// Reads the file pointed to by fd and tries to read a full line. 
+LineResult StackLineReader_NextLine(StackLineReader* reader);
+
+CPU_FEATURES_END_CPP_NAMESPACE
+
+#endif  // CPU_FEATURES_INCLUDE_INTERNAL_STACK_LINE_READER_H_
diff --git a/cpu_features/include/internal/string_view.h b/cpu_features/include/internal/string_view.h
new file mode 100644
index 0000000..64fed40
--- /dev/null
+++ b/cpu_features/include/internal/string_view.h
@@ -0,0 +1,109 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A view over a piece of string. The view is not 0 terminated.
+#ifndef CPU_FEATURES_INCLUDE_INTERNAL_STRING_VIEW_H_
+#define CPU_FEATURES_INCLUDE_INTERNAL_STRING_VIEW_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "cpu_features_macros.h"
+
+CPU_FEATURES_START_CPP_NAMESPACE
+
+typedef struct {
+  const char* ptr;  // First character of the view; not 0 terminated.
+  size_t size;      // Number of characters in the view.
+} StringView;
+
+#ifdef __cplusplus
+static const StringView kEmptyStringView = {NULL, 0};
+#else
+static const StringView kEmptyStringView;
+#endif
+
+// Returns a StringView over the size characters starting at str.
+// Passing NULL is valid only if size is 0.
+static inline StringView view(const char* str, const size_t size) {
+  StringView view;
+  view.ptr = str;
+  view.size = size;
+  return view;
+}
+
+static inline StringView str(const char* str) { return view(str, strlen(str)); }
+
+// Returns the index of the first occurrence of c in view or -1 if not found.
+int CpuFeatures_StringView_IndexOfChar(const StringView view, char c);
+
+// Returns the index of the first occurrence of sub_view in view or -1 if not
+// found.
+int CpuFeatures_StringView_IndexOf(const StringView view,
+                                   const StringView sub_view);
+
+// Returns whether a is equal to b (same content).
+bool CpuFeatures_StringView_IsEquals(const StringView a, const StringView b);
+
+// Returns whether a starts with b.
+bool CpuFeatures_StringView_StartsWith(const StringView a, const StringView b);
+
+// Removes count characters from the beginning of view, or returns
+// kEmptyStringView if count is greater than view.size.
+StringView CpuFeatures_StringView_PopFront(const StringView str_view,
+                                           size_t count);
+
+// Removes count characters from the end of view, or returns kEmptyStringView
+// if count is greater than view.size.
+StringView CpuFeatures_StringView_PopBack(const StringView str_view,
+                                          size_t count);
+
+// Keeps the first count characters of view, or returns view unchanged if
+// count is greater than view.size.
+StringView CpuFeatures_StringView_KeepFront(const StringView str_view,
+                                            size_t count);
+
+// Retrieves the first character of view. If view is empty the behavior is
+// undefined.
+char CpuFeatures_StringView_Front(const StringView view);
+
+// Retrieves the last character of view. If view is empty the behavior is
+// undefined.
+char CpuFeatures_StringView_Back(const StringView view);
+
+// Removes leading and trailing space characters.
+StringView CpuFeatures_StringView_TrimWhitespace(StringView view);
+
+// Converts StringView to a positive integer, e.g. "42", "0x2a".
+// Returns -1 on error.
+int CpuFeatures_StringView_ParsePositiveNumber(const StringView view);
+
+// Copies src StringView to dst buffer.
+void CpuFeatures_StringView_CopyString(const StringView src, char* dst,
+                                       size_t dst_size);
+
+// Checks if line contains the specified whitespace separated word.
+bool CpuFeatures_StringView_HasWord(const StringView line,
+                                    const char* const word);
+
+// Get key/value from line. key and value are separated by ": ".
+// key and value are cleaned up from leading and trailing whitespaces.
+bool CpuFeatures_StringView_GetAttributeKeyValue(const StringView line,
+                                                 StringView* key,
+                                                 StringView* value);
+
+CPU_FEATURES_END_CPP_NAMESPACE
+
+#endif  // CPU_FEATURES_INCLUDE_INTERNAL_STRING_VIEW_H_
diff --git a/cpu_features/ndk_compat/CMakeLists.txt b/cpu_features/ndk_compat/CMakeLists.txt
new file mode 100644
index 0000000..186708a
--- /dev/null
+++ b/cpu_features/ndk_compat/CMakeLists.txt
@@ -0,0 +1,60 @@
+
+#
+# library : NDK compat
+#
+find_package(Threads REQUIRED)
+set (NDK_COMPAT_HDRS cpu-features.h)
+set (NDK_COMPAT_SRCS
+    cpu-features.c
+    $<TARGET_OBJECTS:utils>  # NOTE(review): object-library name must match the parent CMakeLists.txt - confirm
+    $<TARGET_OBJECTS:unix_based_hardware_detection>  # NOTE(review): same - confirm
+)
+# Note that following `add_cpu_features_headers_and_sources` will use
+# NDK_COMPAT_SRCS in lieu of NDK_COMPAT_HDRS because we don't want cpu_features
+# headers to be installed alongside ndk_compat.
+add_cpu_features_headers_and_sources(NDK_COMPAT_SRCS NDK_COMPAT_SRCS)
+add_library(ndk_compat ${NDK_COMPAT_HDRS} ${NDK_COMPAT_SRCS})
+setup_include_and_definitions(ndk_compat)
+target_include_directories(ndk_compat PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)
+target_link_libraries(ndk_compat PUBLIC ${CMAKE_DL_LIBS} ${CMAKE_THREAD_LIBS_INIT})
+set_target_properties(ndk_compat PROPERTIES PUBLIC_HEADER "${NDK_COMPAT_HDRS}")
+
+include(GNUInstallDirs)
+install(TARGETS ndk_compat
+  EXPORT CpuFeaturesNdkCompatTargets
+  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ndk_compat
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+)
+install(EXPORT CpuFeaturesNdkCompatTargets
+  NAMESPACE CpuFeatures::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeaturesNdkCompat
+  COMPONENT Devel
+)
+include(CMakePackageConfigHelpers)
+configure_package_config_file(${PROJECT_SOURCE_DIR}/cmake/CpuFeaturesNdkCompatConfig.cmake.in
+  "${PROJECT_BINARY_DIR}/CpuFeaturesNdkCompatConfig.cmake"
+  INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeaturesNdkCompat"
+  NO_SET_AND_CHECK_MACRO
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+write_basic_package_version_file(
+  "${PROJECT_BINARY_DIR}/CpuFeaturesNdkCompatConfigVersion.cmake"
+  COMPATIBILITY SameMajorVersion
+)
+install(
+  FILES
+    "${PROJECT_BINARY_DIR}/CpuFeaturesNdkCompatConfig.cmake"
+    "${PROJECT_BINARY_DIR}/CpuFeaturesNdkCompatConfigVersion.cmake"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/CpuFeaturesNdkCompat"
+  COMPONENT Devel
+)
+
+#
+# program : NDK compat test program
+#
+if(ENABLE_TESTING)
+  add_executable(ndk-compat-test ndk-compat-test.c)
+  target_link_libraries(ndk-compat-test PRIVATE ndk_compat)
+endif()
diff --git a/cpu_features/ndk_compat/README.md b/cpu_features/ndk_compat/README.md
new file mode 100644
index 0000000..38c8393
--- /dev/null
+++ b/cpu_features/ndk_compat/README.md
@@ -0,0 +1,4 @@
+Provides a header compatible with [android's NDK cpu-features.h](https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h).
+
+It is intended to be a drop in replacement for this header and help users
+transition from the NDK to [Google's cpu_features library](https://github.com/google/cpu_features).
diff --git a/cpu_features/ndk_compat/cpu-features.c b/cpu_features/ndk_compat/cpu-features.c
new file mode 100644
index 0000000..27ff7bb
--- /dev/null
+++ b/cpu_features/ndk_compat/cpu-features.c
@@ -0,0 +1,205 @@
+#include "cpu-features.h"
+
+#include <pthread.h>
+
+#include "cpu_features_macros.h"
+#include "internal/filesystem.h"
+#include "internal/stack_line_reader.h"
+#include "internal/string_view.h"
+
+#if defined(CPU_FEATURES_ARCH_ARM)
+#include "cpuinfo_arm.h"
+#elif defined(CPU_FEATURES_ARCH_X86)
+#include "cpuinfo_x86.h"
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+#include "cpuinfo_mips.h"
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+#include "cpuinfo_aarch64.h"
+#endif
+
+static pthread_once_t g_once = PTHREAD_ONCE_INIT;  // POSIX requires this initializer
+static int g_inited;
+static uint64_t g_cpuFeatures;
+static int g_cpuCount;
+
+#ifdef CPU_FEATURES_ARCH_ARM
+static uint32_t g_cpuIdArm;
+#endif
+
+static void set_cpu_mask_bit(uint32_t index, uint32_t* cpu_mask) {
+  if (index < 32) *cpu_mask |= (uint32_t)1 << index;  // cpus >= 32 do not fit in the mask
+}
+
+// Sets the bits for one entry of a cpu list. Valid inputs: "31", "4-31"
+static void parse_cpu_mask(const StringView text, uint32_t* cpu_mask) {
+  int separator_index = CpuFeatures_StringView_IndexOfChar(text, '-');
+  if (separator_index < 0) {  // A single cpu index
+    int cpu_index = CpuFeatures_StringView_ParsePositiveNumber(text);
+    if (cpu_index < 0) return;  // not a number: ignore the entry
+    set_cpu_mask_bit(cpu_index, cpu_mask);
+  } else {  // An inclusive range "a-b"
+    int cpu_index_a = CpuFeatures_StringView_ParsePositiveNumber(
+        CpuFeatures_StringView_KeepFront(text, separator_index));
+    int cpu_index_b = CpuFeatures_StringView_ParsePositiveNumber(
+        CpuFeatures_StringView_PopFront(text, separator_index + 1));
+    int i;
+    if (cpu_index_a < 0 || cpu_index_b < 0) return;
+    for (i = cpu_index_a; i <= cpu_index_b; ++i) {
+      if (i < 32) {
+        set_cpu_mask_bit(i, cpu_mask);
+      }
+    }
+  }
+}
+
+// Format specification from
+// https://www.kernel.org/doc/Documentation/cputopology.txt
+// Examples of valid inputs: "31", "2,4-31,32-63", "0-1,3"
+static void parse_cpu_mask_line(const LineResult result, uint32_t* cpu_mask) {
+  if (!result.full_line || result.eof) return;  // skip truncated lines and EOF results
+  StringView line = result.line;
+  for (; line.size > 0;) {  // one comma separated entry per iteration
+    int next_entry_index = CpuFeatures_StringView_IndexOfChar(line, ',');
+    if (next_entry_index < 0) {  // last entry of the line
+      parse_cpu_mask(line, cpu_mask);
+      break;
+    }
+    StringView entry = CpuFeatures_StringView_KeepFront(line, next_entry_index);
+    parse_cpu_mask(entry, cpu_mask);
+    line = CpuFeatures_StringView_PopFront(line, next_entry_index + 1);
+  }
+}
+
+static void update_cpu_mask_from_file(const char* filename,
+                                      uint32_t* cpu_mask) {
+  const int fd = CpuFeatures_OpenFile(filename);
+  if (fd >= 0) {  // a file that cannot be opened leaves cpu_mask untouched
+    StackLineReader reader;
+    StackLineReader_Initialize(&reader, fd);
+    parse_cpu_mask_line(StackLineReader_NextLine(&reader), cpu_mask);  // only the first line is parsed
+    CpuFeatures_CloseFile(fd);
+  }
+}
+
+static int get_cpu_count(void) {
+  uint32_t cpu_mask = 0;  // shared by both files below; the count is the number of bits set
+  update_cpu_mask_from_file("/sys/devices/system/cpu/present", &cpu_mask);
+  update_cpu_mask_from_file("/sys/devices/system/cpu/possible", &cpu_mask);
+  return __builtin_popcount(cpu_mask);
+}
+
+static void android_cpuInit(void) {
+  g_cpuFeatures = 0;
+  g_cpuCount = 1;
+  g_inited = 1;  // from now on android_setCpu() returns 0
+
+  g_cpuCount = get_cpu_count();
+  if (g_cpuCount == 0) {  // nothing detected: report a single cpu
+    g_cpuCount = 1;
+  }
+#if defined(CPU_FEATURES_ARCH_ARM)
+  ArmInfo info = GetArmInfo();
+  if (info.architecture == 7) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_ARMv7;
+  if (info.features.vfpv3) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv3;
+  if (info.features.neon) {  // NEON is always reported together with VFP_D32
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON;
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFP_D32;
+  }
+  if (info.features.vfpv3d16) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFP_FP16;  // NOTE(review): vfpv3d16 -> VFP_FP16, confirm this mapping is intended
+  if (info.features.idiva) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_ARM;
+  if (info.features.idivt) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2;
+  if (info.features.iwmmxt) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_iWMMXt;
+  if (info.features.aes) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_AES;
+  if (info.features.pmull) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_PMULL;
+  if (info.features.sha1) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_SHA1;
+  if (info.features.sha2) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_SHA2;
+  if (info.features.crc32) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_CRC32;
+  if (info.architecture >= 6)
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_LDREX_STREX;
+  if (info.features.vfp) g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFPv2;
+  if (info.features.vfpv4) {  // vfpv4 sets both FMA flags
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_VFP_FMA;
+    g_cpuFeatures |= ANDROID_CPU_ARM_FEATURE_NEON_FMA;
+  }
+  g_cpuIdArm = GetArmCpuId(&info);
+#elif defined(CPU_FEATURES_ARCH_X86)
+  X86Info info = GetX86Info();
+  if (info.features.ssse3) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSSE3;
+  if (info.features.popcnt) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_POPCNT;
+  if (info.features.movbe) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_MOVBE;
+  if (info.features.sse4_1) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSE4_1;
+  if (info.features.sse4_2) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SSE4_2;
+  if (info.features.aes) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AES_NI;
+  if (info.features.avx) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AVX;
+  if (info.features.rdrnd) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_RDRAND;
+  if (info.features.avx2) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_AVX2;
+  if (info.features.sha) g_cpuFeatures |= ANDROID_CPU_X86_FEATURE_SHA_NI;
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+  MipsInfo info = GetMipsInfo();
+  if (info.features.r6) g_cpuFeatures |= ANDROID_CPU_MIPS_FEATURE_R6;
+  if (info.features.msa) g_cpuFeatures |= ANDROID_CPU_MIPS_FEATURE_MSA;
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+  Aarch64Info info = GetAarch64Info();
+  if (info.features.fp) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_FP;
+  if (info.features.asimd) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_ASIMD;
+  if (info.features.aes) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_AES;
+  if (info.features.pmull) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_PMULL;
+  if (info.features.sha1) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_SHA1;
+  if (info.features.sha2) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_SHA2;
+  if (info.features.crc32) g_cpuFeatures |= ANDROID_CPU_ARM64_FEATURE_CRC32;
+#endif
+}
+
+AndroidCpuFamily android_getCpuFamily(void) {  // chosen at compile time from CPU_FEATURES_ARCH_*
+#if defined(CPU_FEATURES_ARCH_ARM)
+  return ANDROID_CPU_FAMILY_ARM;
+#elif defined(CPU_FEATURES_ARCH_X86_32)
+  return ANDROID_CPU_FAMILY_X86;
+#elif defined(CPU_FEATURES_ARCH_MIPS64)
+  return ANDROID_CPU_FAMILY_MIPS64;
+#elif defined(CPU_FEATURES_ARCH_MIPS32)
+  return ANDROID_CPU_FAMILY_MIPS;
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+  return ANDROID_CPU_FAMILY_ARM64;
+#elif defined(CPU_FEATURES_ARCH_X86_64)
+  return ANDROID_CPU_FAMILY_X86_64;
+#else
+  return ANDROID_CPU_FAMILY_UNKNOWN;
+#endif
+}
+
+uint64_t android_getCpuFeatures(void) {
+  pthread_once(&g_once, android_cpuInit);  // detection runs on first use only
+  return g_cpuFeatures;
+}
+
+int android_getCpuCount(void) {
+  pthread_once(&g_once, android_cpuInit);  // detection runs on first use only
+  return g_cpuCount;
+}
+
+static void android_cpuInitDummy(void) { g_inited = 1; }
+
+int android_setCpu(int cpu_count, uint64_t cpu_features) {
+  /* Fail if the library was already initialized. */
+  if (g_inited) return 0;  // NOTE(review): g_inited is read outside pthread_once - confirm callers never race this with the getters
+  g_cpuCount = (cpu_count <= 0 ? 1 : cpu_count);
+  g_cpuFeatures = cpu_features;
+  pthread_once(&g_once, android_cpuInitDummy);  // uses up g_once so android_cpuInit is not run later
+  return 1;
+}
+
+#ifdef CPU_FEATURES_ARCH_ARM
+
+uint32_t android_getCpuIdArm(void) {
+  pthread_once(&g_once, android_cpuInit);
+  return g_cpuIdArm;
+}
+
+int android_setCpuArm(int cpu_count, uint64_t cpu_features, uint32_t cpu_id) {
+  if (!android_setCpu(cpu_count, cpu_features)) return 0;
+  g_cpuIdArm = cpu_id;
+  return 1;
+}
+
+#endif  // CPU_FEATURES_ARCH_ARM
diff --git a/cpu_features/ndk_compat/cpu-features.h b/cpu_features/ndk_compat/cpu-features.h
new file mode 100644
index 0000000..51bea53
--- /dev/null
+++ b/cpu_features/ndk_compat/cpu-features.h
@@ -0,0 +1,320 @@
+/*
+ * Copyright (C) 2010 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef GOOGLE_CPU_FEATURES_H
+#define GOOGLE_CPU_FEATURES_H
+#include <sys/cdefs.h>
+#include <stdint.h>
+
+__BEGIN_DECLS
+
+/* A list of valid values returned by android_getCpuFamily().
+ * They describe the CPU Architecture of the current process.
+ */
+typedef enum {
+  ANDROID_CPU_FAMILY_UNKNOWN = 0,
+  ANDROID_CPU_FAMILY_ARM,
+  ANDROID_CPU_FAMILY_X86,
+  ANDROID_CPU_FAMILY_MIPS,
+  ANDROID_CPU_FAMILY_ARM64,
+  ANDROID_CPU_FAMILY_X86_64,
+  ANDROID_CPU_FAMILY_MIPS64,
+  ANDROID_CPU_FAMILY_MAX /* do not remove */
+} AndroidCpuFamily;
+
+/* Return the CPU family of the current process.
+ *
+ * Note that this matches the bitness of the current process. I.e.
when
+ * running a 32-bit binary on a 64-bit capable CPU, this will return the
+ * 32-bit CPU family value.
+ */
+extern AndroidCpuFamily android_getCpuFamily(void);
+
+/* Return a bitmap describing a set of optional CPU features that are
+ * supported by the current device's CPU. The exact bit-flags returned
+ * depend on the value returned by android_getCpuFamily(). See the
+ * documentation for the ANDROID_CPU_*_FEATURE_* flags below for details.
+ */
+extern uint64_t android_getCpuFeatures(void);
+
+/* The list of feature flags for ANDROID_CPU_FAMILY_ARM that can be
+ * recognized by the library (see note below for 64-bit ARM). Value details
+ * are:
+ *
+ *   VFPv2:
+ *     CPU supports the VFPv2 instruction set. Many, but not all, ARMv6 CPUs
+ *     support these instructions. VFPv2 is a subset of VFPv3 so this will
+ *     be set whenever VFPv3 is set too.
+ *
+ *   ARMv7:
+ *     CPU supports the ARMv7-A basic instruction set.
+ *     This feature is mandated by the 'armeabi-v7a' ABI.
+ *
+ *   VFPv3:
+ *     CPU supports the VFPv3-D16 instruction set, providing hardware FPU
+ *     support for single and double precision floating point registers.
+ *     Note that only 16 FPU registers are available by default, unless
+ *     the D32 bit is set too. This feature is also mandated by the
+ *     'armeabi-v7a' ABI.
+ *
+ *   VFP_D32:
+ *     CPU VFP optional extension that provides 32 FPU registers,
+ *     instead of 16. Note that ARM mandates this feature if the 'NEON'
+ *     feature is implemented by the CPU.
+ *
+ *   NEON:
+ *     CPU FPU supports "ARM Advanced SIMD" instructions, also known as
+ *     NEON. Note that this mandates the VFP_D32 feature as well, per the
+ *     ARM Architecture specification.
+ *
+ *   VFP_FP16:
+ *     Half-width floating precision VFP extension. If set, the CPU
+ *     supports instructions to perform floating-point operations on
+ *     16-bit registers. This is part of the VFPv4 specification, but
+ *     not mandated by any Android ABI.
+ * + * VFP_FMA: + * Fused multiply-accumulate VFP instructions extension. Also part of + * the VFPv4 specification, but not mandated by any Android ABI. + * + * NEON_FMA: + * Fused multiply-accumulate NEON instructions extension. Optional + * extension from the VFPv4 specification, but not mandated by any + * Android ABI. + * + * IDIV_ARM: + * Integer division available in ARM mode. Only available + * on recent CPUs (e.g. Cortex-A15). + * + * IDIV_THUMB2: + * Integer division available in Thumb-2 mode. Only available + * on recent CPUs (e.g. Cortex-A15). + * + * iWMMXt: + * Optional extension that adds MMX registers and operations to an + * ARM CPU. This is only available on a few XScale-based CPU designs + * sold by Marvell. Pretty rare in practice. + * + * AES: + * CPU supports AES instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * CRC32: + * CPU supports CRC32 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * SHA2: + * CPU supports SHA2 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * SHA1: + * CPU supports SHA1 instructions. These instructions are only + * available for 32-bit applications running on ARMv8 CPU. + * + * PMULL: + * CPU supports 64-bit PMULL and PMULL2 instructions. These + * instructions are only available for 32-bit applications + * running on ARMv8 CPU. + * + * If you want to tell the compiler to generate code that targets one of + * the feature set above, you should probably use one of the following + * flags (for more details, see technical note at the end of this file): + * + * -mfpu=vfp + * -mfpu=vfpv2 + * These are equivalent and tell GCC to use VFPv2 instructions for + * floating-point operations. Use this if you want your code to + * run on *some* ARMv6 devices, and any ARMv7-A device supported + * by Android. + * + * Generated code requires VFPv2 feature. 
+ *
+ *   -mfpu=vfpv3-d16
+ *     Tell GCC to use VFPv3 instructions (using only 16 FPU registers).
+ *     This should be generic code that runs on any CPU that supports the
+ *     'armeabi-v7a' Android ABI. Note that no ARMv6 CPU supports this.
+ *
+ *     Generated code requires VFPv3 feature.
+ *
+ *   -mfpu=vfpv3
+ *     Tell GCC to use VFPv3 instructions with 32 FPU registers.
+ *     Generated code requires VFPv3|VFP_D32 features.
+ *
+ *   -mfpu=neon
+ *     Tell GCC to use VFPv3 instructions with 32 FPU registers, and
+ *     also support NEON intrinsics (see <arm_neon.h>).
+ *     Generated code requires VFPv3|VFP_D32|NEON features.
+ *
+ *   -mfpu=vfpv4-d16
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA features.
+ *
+ *   -mfpu=vfpv4
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32 features.
+ *
+ *   -mfpu=neon-vfpv4
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|NEON|NEON_FMA
+ *     features.
+ *
+ *   -mcpu=cortex-a7
+ *   -mcpu=cortex-a15
+ *     Generated code requires VFPv3|VFP_FP16|VFP_FMA|VFP_D32|
+ *                             NEON|NEON_FMA|IDIV_ARM|IDIV_THUMB2
+ *     This flag implies -mfpu=neon-vfpv4.
+ *
+ *   -mcpu=iwmmxt
+ *     Allows the use of iWMMXt intrinsics with GCC.
+ *
+ * IMPORTANT NOTE: These flags should only be tested when
+ * android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM, i.e. this is a
+ * 32-bit process.
+ *
+ * When running a 64-bit ARM process on an ARMv8 CPU,
+ * android_getCpuFeatures() returns ANDROID_CPU_ARM64_FEATURE_* flags instead.
+ */
+enum {
+  ANDROID_CPU_ARM_FEATURE_ARMv7 = (1 << 0),
+  ANDROID_CPU_ARM_FEATURE_VFPv3 = (1 << 1),
+  ANDROID_CPU_ARM_FEATURE_NEON = (1 << 2),
+  ANDROID_CPU_ARM_FEATURE_LDREX_STREX = (1 << 3),
+  ANDROID_CPU_ARM_FEATURE_VFPv2 = (1 << 4),
+  ANDROID_CPU_ARM_FEATURE_VFP_D32 = (1 << 5),
+  ANDROID_CPU_ARM_FEATURE_VFP_FP16 = (1 << 6),
+  ANDROID_CPU_ARM_FEATURE_VFP_FMA = (1 << 7),
+  ANDROID_CPU_ARM_FEATURE_NEON_FMA = (1 << 8),
+  ANDROID_CPU_ARM_FEATURE_IDIV_ARM = (1 << 9),
+  ANDROID_CPU_ARM_FEATURE_IDIV_THUMB2 = (1 << 10),
+  ANDROID_CPU_ARM_FEATURE_iWMMXt = (1 << 11),
+  ANDROID_CPU_ARM_FEATURE_AES = (1 << 12),
+  ANDROID_CPU_ARM_FEATURE_PMULL = (1 << 13),
+  ANDROID_CPU_ARM_FEATURE_SHA1 = (1 << 14),
+  ANDROID_CPU_ARM_FEATURE_SHA2 = (1 << 15),
+  ANDROID_CPU_ARM_FEATURE_CRC32 = (1 << 16),
+};
+
+/* The bit flags corresponding to the output of android_getCpuFeatures()
+ * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_ARM64. Value details
+ * are:
+ *
+ *   FP:
+ *     CPU has Floating-point unit.
+ *
+ *   ASIMD:
+ *     CPU has Advanced SIMD unit.
+ *
+ *   AES:
+ *     CPU supports AES instructions.
+ *
+ *   CRC32:
+ *     CPU supports CRC32 instructions.
+ *
+ *   SHA2:
+ *     CPU supports SHA2 instructions.
+ *
+ *   SHA1:
+ *     CPU supports SHA1 instructions.
+ *
+ *   PMULL:
+ *     CPU supports 64-bit PMULL and PMULL2 instructions.
+ */
+enum {
+  ANDROID_CPU_ARM64_FEATURE_FP = (1 << 0),
+  ANDROID_CPU_ARM64_FEATURE_ASIMD = (1 << 1),
+  ANDROID_CPU_ARM64_FEATURE_AES = (1 << 2),
+  ANDROID_CPU_ARM64_FEATURE_PMULL = (1 << 3),
+  ANDROID_CPU_ARM64_FEATURE_SHA1 = (1 << 4),
+  ANDROID_CPU_ARM64_FEATURE_SHA2 = (1 << 5),
+  ANDROID_CPU_ARM64_FEATURE_CRC32 = (1 << 6),
+};
+
+/* The bit flags corresponding to the output of android_getCpuFeatures()
+ * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_X86 or
+ * ANDROID_CPU_FAMILY_X86_64.
+ */
+enum {
+  ANDROID_CPU_X86_FEATURE_SSSE3 = (1 << 0),
+  ANDROID_CPU_X86_FEATURE_POPCNT = (1 << 1),
+  ANDROID_CPU_X86_FEATURE_MOVBE = (1 << 2),
+  ANDROID_CPU_X86_FEATURE_SSE4_1 = (1 << 3),
+  ANDROID_CPU_X86_FEATURE_SSE4_2 = (1 << 4),
+  ANDROID_CPU_X86_FEATURE_AES_NI = (1 << 5),
+  ANDROID_CPU_X86_FEATURE_AVX = (1 << 6),
+  ANDROID_CPU_X86_FEATURE_RDRAND = (1 << 7),
+  ANDROID_CPU_X86_FEATURE_AVX2 = (1 << 8),
+  ANDROID_CPU_X86_FEATURE_SHA_NI = (1 << 9),
+};
+
+/* The bit flags corresponding to the output of android_getCpuFeatures()
+ * when android_getCpuFamily() returns ANDROID_CPU_FAMILY_MIPS
+ * or ANDROID_CPU_FAMILY_MIPS64. Values are:
+ *
+ *   R6:
+ *     CPU executes MIPS Release 6 instructions natively, and
+ *     supports obsoleted R1..R5 instructions only via kernel traps.
+ *
+ *   MSA:
+ *     CPU supports Mips SIMD Architecture instructions.
+ */
+enum {
+  ANDROID_CPU_MIPS_FEATURE_R6 = (1 << 0),
+  ANDROID_CPU_MIPS_FEATURE_MSA = (1 << 1),
+};
+
+/* Return the number of CPU cores detected on this device.
+ * Please note the current implementation supports up to 32 cpus.
+ */
+extern int android_getCpuCount(void);
+
+/* The following is used to force the CPU count and features
+ * mask in sandboxed processes. Under 4.1 and higher, these processes
+ * cannot access /proc, which is the only way to get information from
+ * the kernel about the current hardware (at least on ARM).
+ *
+ * It _must_ be called only once, and before any android_getCpuXXX
+ * function, any other case will fail.
+ *
+ * This function returns 1 on success, and 0 on failure.
+ */
+extern int android_setCpu(int cpu_count, uint64_t cpu_features);
+
+#ifdef __arm__
+
+/* Retrieve the ARM 32-bit CPUID value from the kernel.
+ * Note that this cannot work on sandboxed processes under 4.1 and
+ * higher, unless you called android_setCpuArm() before.
+ */
+extern uint32_t android_getCpuIdArm(void);
+
+/* An ARM-specific variant of android_setCpu() that also allows you
+ * to set the ARM CPUID field.
+ */
+extern int android_setCpuArm(int cpu_count, uint64_t cpu_features,
+                             uint32_t cpu_id);
+
+#endif
+
+__END_DECLS
+#endif /* GOOGLE_CPU_FEATURES_H */
diff --git a/cpu_features/ndk_compat/ndk-compat-test.c b/cpu_features/ndk_compat/ndk-compat-test.c
new file mode 100644
index 0000000..e4005d4
--- /dev/null
+++ b/cpu_features/ndk_compat/ndk-compat-test.c
@@ -0,0 +1,12 @@
+#include <stdio.h>
+
+#include "cpu-features.h"
+
+int main(void) {  // prints what the NDK compatibility layer reports for this machine
+  printf("android_getCpuFamily()=%d\n", android_getCpuFamily());
+  printf("android_getCpuFeatures()=0x%08llx\n", (unsigned long long)android_getCpuFeatures());  // uint64_t is not always unsigned long long
+  printf("android_getCpuCount()=%d\n", android_getCpuCount());
+#ifdef __arm__
+  printf("android_getCpuIdArm()=0x%04x\n", android_getCpuIdArm());
+#endif  //__arm__
+}
diff --git a/cpu_features/scripts/run_integration.sh b/cpu_features/scripts/run_integration.sh
new file mode 100755
index 0000000..fd88d60
--- /dev/null
+++ b/cpu_features/scripts/run_integration.sh
@@ -0,0 +1,209 @@
+#!/usr/bin/env bash
+
+readonly SCRIPT_FOLDER=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+readonly PROJECT_FOLDER="${SCRIPT_FOLDER}/.."
+readonly ARCHIVE_FOLDER=~/cpu_features_archives
+readonly QEMU_INSTALL=${ARCHIVE_FOLDER}/qemu
+readonly DEFAULT_CMAKE_ARGS=" -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON"
+
+function extract() {
+  case $1 in
+    *.tar.bz2)   tar xjf "$1"    ;;
+    *.tar.xz)    tar xJf "$1"    ;;
+    *.tar.gz)    tar xzf "$1"    ;;
+    *)
+      echo "don't know how to extract '$1'..."
+      exit 1
+  esac
+}
+
+function unpackifnotexists() {
+  mkdir -p "${ARCHIVE_FOLDER}"
+  cd "${ARCHIVE_FOLDER}" || exit
+  local URL=$1
+  local RELATIVE_FOLDER=$2
+  local DESTINATION="${ARCHIVE_FOLDER}/${RELATIVE_FOLDER}"
+  if [[ !
-d "${DESTINATION}" ]] ; then
+    local ARCHIVE_NAME=$(echo ${URL} | sed 's/.*\///')
+    test -f "${ARCHIVE_NAME}" || wget -q "${URL}"
+    extract "${ARCHIVE_NAME}"
+    rm -f "${ARCHIVE_NAME}"
+  fi
+}
+
+function installqemuifneeded() {
+  local VERSION=${QEMU_VERSION:=2.11.1}
+  local ARCHES=${QEMU_ARCHES:=arm aarch64 i386 x86_64 mips mipsel mips64 mips64el}
+  local TARGETS=${QEMU_TARGETS:=$(echo "$ARCHES" | sed 's#$# #;s#\([^ ]*\) #\1-linux-user #g')}
+
+  if echo "${VERSION} ${TARGETS}" | cmp --silent "${QEMU_INSTALL}/.build" -; then
+    echo "qemu ${VERSION} up to date!"
+    return 0
+  fi
+
+  echo "VERSION: ${VERSION}"
+  echo "TARGETS: ${TARGETS}"
+
+  rm -rf "${QEMU_INSTALL}"
+
+  # Checking for a tarball before downloading makes testing easier :-)
+  local QEMU_URL="http://wiki.qemu-project.org/download/qemu-${VERSION}.tar.xz"
+  local QEMU_FOLDER="qemu-${VERSION}"
+  unpackifnotexists ${QEMU_URL} ${QEMU_FOLDER}
+  cd ${QEMU_FOLDER} || exit
+
+  ./configure \
+    --prefix="${QEMU_INSTALL}" \
+    --target-list="${TARGETS}" \
+    --disable-docs \
+    --disable-sdl \
+    --disable-gtk \
+    --disable-gnutls \
+    --disable-gcrypt \
+    --disable-nettle \
+    --disable-curses \
+    --static
+
+  make -j4
+  make install
+
+  echo "$VERSION $TARGETS" > "${QEMU_INSTALL}/.build"
+}
+
+function assert_defined(){  # $1 is the NAME of the variable that must be set
+  local assert_defined_name=${1}
+  : "${!assert_defined_name?"${1} needs to be defined"}"  # indirect expansion: checks the named variable, not the local
+}
+
+function integrate() {
+  cd "${PROJECT_FOLDER}" || exit
+  case "${OS}" in
+    "Windows_NT") CMAKE_BUILD_ARGS="--config Debug --target ALL_BUILD"
+                  CMAKE_TEST_FILES="${BUILD_DIR}/test/Debug/*_test.exe"
+                  DEMO=${BUILD_DIR}/Debug/list_cpu_features.exe
+                  ;;
+    *)            CMAKE_BUILD_ARGS="--target all"
+                  CMAKE_TEST_FILES="${BUILD_DIR}/test/*_test"
+                  DEMO=${BUILD_DIR}/list_cpu_features
+                  ;;
+  esac
+
+  # Generating CMake configuration
+  cmake -H.
-B"${BUILD_DIR}" ${DEFAULT_CMAKE_ARGS} "${CMAKE_ADDITIONAL_ARGS[@]}" -G"${CMAKE_GENERATOR:-Unix Makefiles}"
+
+  # Building
+  cmake --build "${BUILD_DIR}" ${CMAKE_BUILD_ARGS}
+
+  # Running tests if needed
+  if [[ "${QEMU_ARCH}" == "DISABLED" ]]; then
+    return
+  fi
+  RUN_CMD=""
+  if [[ -n "${QEMU_ARCH}" ]]; then
+    installqemuifneeded
+    RUN_CMD="${QEMU_INSTALL}/bin/qemu-${QEMU_ARCH} ${QEMU_ARGS[*]}"  # joined into one string; split again when RUN_CMD is expanded unquoted
+  fi
+  for test_binary in ${CMAKE_TEST_FILES}; do  # unquoted on purpose: the value is a glob
+    ${RUN_CMD} ${test_binary}
+  done
+  ${RUN_CMD} ${DEMO}
+}
+
+function expand_linaro_config() {  # appends cross-compile settings to CMAKE_ADDITIONAL_ARGS and QEMU_ARGS
+  assert_defined TARGET
+  local LINARO_ROOT_URL=https://releases.linaro.org/components/toolchain/binaries/7.2-2017.11
+
+  local GCC_URL=${LINARO_ROOT_URL}/${TARGET}/gcc-linaro-7.2.1-2017.11-x86_64_${TARGET}.tar.xz
+  local GCC_RELATIVE_FOLDER="gcc-linaro-7.2.1-2017.11-x86_64_${TARGET}"
+  unpackifnotexists "${GCC_URL}" "${GCC_RELATIVE_FOLDER}"
+
+  local SYSROOT_URL=${LINARO_ROOT_URL}/${TARGET}/sysroot-glibc-linaro-2.25-2017.11-${TARGET}.tar.xz
+  local SYSROOT_RELATIVE_FOLDER=sysroot-glibc-linaro-2.25-2017.11-${TARGET}
+  unpackifnotexists "${SYSROOT_URL}" "${SYSROOT_RELATIVE_FOLDER}"
+
+  local SYSROOT_FOLDER=${ARCHIVE_FOLDER}/${SYSROOT_RELATIVE_FOLDER}
+  local GCC_FOLDER=${ARCHIVE_FOLDER}/${GCC_RELATIVE_FOLDER}
+
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSTEM_NAME=Linux)
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSTEM_PROCESSOR=${TARGET})
+
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSROOT=${SYSROOT_FOLDER})
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_C_COMPILER=${GCC_FOLDER}/bin/${TARGET}-gcc)
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_CXX_COMPILER=${GCC_FOLDER}/bin/${TARGET}-g++)
+
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER)
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY)
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=ONLY)
+
+  QEMU_ARGS+=(-L ${SYSROOT_FOLDER})
+  QEMU_ARGS+=(-E LD_LIBRARY_PATH=/lib)
+}
+
+function expand_codescape_config() {
+  assert_defined TARGET
+  local DATE=2017.10-08
+  local
CODESCAPE_URL=https://codescape.mips.com/components/toolchain/${DATE}/Codescape.GNU.Tools.Package.${DATE}.for.MIPS.MTI.Linux.CentOS-5.x86_64.tar.gz
+  local GCC_URL=${CODESCAPE_URL}
+  local GCC_RELATIVE_FOLDER="mips-mti-linux-gnu/${DATE}"
+  unpackifnotexists "${GCC_URL}" "${GCC_RELATIVE_FOLDER}"
+
+  local GCC_FOLDER=${ARCHIVE_FOLDER}/${GCC_RELATIVE_FOLDER}
+  local MIPS_FLAGS=""
+  local LIBC_FOLDER_SUFFIX=""
+  local FLAVOUR=""
+  case "${TARGET}" in  # per target: compiler flags, sysroot flavour and libc folder suffix
+    "mips32")    MIPS_FLAGS="-EB -mabi=32"; FLAVOUR="mips-r2-hard"; LIBC_FOLDER_SUFFIX="lib" ;;
+    "mips32el")  MIPS_FLAGS="-EL -mabi=32"; FLAVOUR="mipsel-r2-hard"; LIBC_FOLDER_SUFFIX="lib" ;;
+    "mips64")    MIPS_FLAGS="-EB -mabi=64"; FLAVOUR="mips-r2-hard"; LIBC_FOLDER_SUFFIX="lib64" ;;
+    "mips64el")  MIPS_FLAGS="-EL -mabi=64"; FLAVOUR="mipsel-r2-hard"; LIBC_FOLDER_SUFFIX="lib64" ;;
+    *)           echo 'unknown mips platform'; exit 1;;
+  esac
+
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_FIND_ROOT_PATH=${GCC_FOLDER})
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSTEM_NAME=Linux)
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_SYSTEM_PROCESSOR=${TARGET})
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_C_COMPILER=mips-mti-linux-gnu-gcc)
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_CXX_COMPILER=mips-mti-linux-gnu-g++)
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_C_COMPILER_ARG1="${MIPS_FLAGS}")
+  CMAKE_ADDITIONAL_ARGS+=(-DCMAKE_CXX_COMPILER_ARG1="${MIPS_FLAGS}")
+
+  local SYSROOT_FOLDER=${GCC_FOLDER}/sysroot/${FLAVOUR}
+
+  # Keeping only the sysroot of interest to save on travis cache.
+ if [[ "${CONTINUOUS_INTEGRATION}" = "true" ]]; then + for folder in "${GCC_FOLDER}"/sysroot/*; do + if [[ "${folder}" != "${SYSROOT_FOLDER}" ]]; then + rm -rf -- "${folder}" # quoted so a path with spaces/globs is never re-split before deletion + fi + done + fi + + local LIBC_FOLDER=${GCC_FOLDER}/mips-mti-linux-gnu/lib/${FLAVOUR}/${LIBC_FOLDER_SUFFIX} + QEMU_ARGS+=(-L ${SYSROOT_FOLDER}) + QEMU_ARGS+=(-E LD_PRELOAD=${LIBC_FOLDER}/libstdc++.so.6:${LIBC_FOLDER}/libgcc_s.so.1) +} + +function expand_environment_and_integrate() { + assert_defined PROJECT_FOLDER + assert_defined TARGET + + BUILD_DIR="${PROJECT_FOLDER}/cmake_build/${TARGET}" + mkdir -p "${BUILD_DIR}" + + declare -a CONFIG_NAMES=() + declare -a QEMU_ARGS=() + declare -a CMAKE_ADDITIONAL_ARGS=() + + case ${TOOLCHAIN} in + LINARO) expand_linaro_config ;; + CODESCAPE) expand_codescape_config ;; + NATIVE) QEMU_ARCH="" ;; + *) echo "Unknown toolchain '${TOOLCHAIN}'..."; exit 1;; + esac + integrate +} + +if [ "${CONTINUOUS_INTEGRATION}" = "true" ]; then + QEMU_ARCHES=${QEMU_ARCH} + expand_environment_and_integrate +fi diff --git a/cpu_features/scripts/test_integration.sh b/cpu_features/scripts/test_integration.sh new file mode 100755 index 0000000..d1c61b0 --- /dev/null +++ b/cpu_features/scripts/test_integration.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash + +source "$(dirname -- "$0")"/run_integration.sh + +# Toolchains for little-endian, 64-bit ARMv8 for GNU/Linux systems +function set_aarch64-linux-gnu() { + TOOLCHAIN=LINARO + TARGET=aarch64-linux-gnu + QEMU_ARCH=aarch64 +} + +# Toolchains for little-endian, hard-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems +function set_arm-linux-gnueabihf() { + TOOLCHAIN=LINARO + TARGET=arm-linux-gnueabihf + QEMU_ARCH=arm +} + +# Toolchains for little-endian, 32-bit ARMv8 for GNU/Linux systems +function set_armv8l-linux-gnueabihf() { + TOOLCHAIN=LINARO + TARGET=armv8l-linux-gnueabihf + QEMU_ARCH=arm +} + +# Toolchains for little-endian, soft-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems +function set_arm-linux-gnueabi() { +
TOOLCHAIN=LINARO + TARGET=arm-linux-gnueabi + QEMU_ARCH=arm +} + +# Toolchains for big-endian, 64-bit ARMv8 for GNU/Linux systems +function set_aarch64_be-linux-gnu() { + TOOLCHAIN=LINARO + TARGET=aarch64_be-linux-gnu + QEMU_ARCH=DISABLED +} + +# Toolchains for big-endian, hard-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems +function set_armeb-linux-gnueabihf() { + TOOLCHAIN=LINARO + TARGET=armeb-linux-gnueabihf + QEMU_ARCH=DISABLED +} + +# Toolchains for big-endian, soft-float, 32-bit ARMv7 (and earlier) for GNU/Linux systems +function set_armeb-linux-gnueabi() { + TOOLCHAIN=LINARO + TARGET=armeb-linux-gnueabi + QEMU_ARCH=DISABLED +} + +function set_mips32() { + TOOLCHAIN=CODESCAPE + TARGET=mips32 + QEMU_ARCH=mips +} + +function set_mips32el() { + TOOLCHAIN=CODESCAPE + TARGET=mips32el + QEMU_ARCH=mipsel +} + +function set_mips64() { + TOOLCHAIN=CODESCAPE + TARGET=mips64 + QEMU_ARCH=mips64 +} + +function set_mips64el() { + TOOLCHAIN=CODESCAPE + TARGET=mips64el + QEMU_ARCH=mips64el +} + +function set_native() { + TOOLCHAIN=NATIVE + TARGET=native + QEMU_ARCH="" +} + +ENVIRONMENTS=" + set_aarch64-linux-gnu + set_arm-linux-gnueabihf + set_armv8l-linux-gnueabihf + set_arm-linux-gnueabi + set_aarch64_be-linux-gnu + set_armeb-linux-gnueabihf + set_armeb-linux-gnueabi + set_mips32 + set_mips32el + set_mips64 + set_mips64el + set_native +" + +set -e + +CMAKE_GENERATOR="Ninja" + +for SET_ENVIRONMENT in ${ENVIRONMENTS}; do + ${SET_ENVIRONMENT} + expand_environment_and_integrate +done diff --git a/cpu_features/src/cpuinfo_aarch64.c b/cpu_features/src/cpuinfo_aarch64.c new file mode 100644 index 0000000..0a52718 --- /dev/null +++ b/cpu_features/src/cpuinfo_aarch64.c @@ -0,0 +1,150 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_aarch64.h" + +#include +#include + +#include "internal/filesystem.h" +#include "internal/hwcaps.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags and kHardwareCapabilities global tables. +#define DEFINE_TABLE_FEATURES \ + FEATURE(AARCH64_FP, fp, "fp", AARCH64_HWCAP_FP, 0) \ + FEATURE(AARCH64_ASIMD, asimd, "asimd", AARCH64_HWCAP_ASIMD, 0) \ + FEATURE(AARCH64_EVTSTRM, evtstrm, "evtstrm", AARCH64_HWCAP_EVTSTRM, 0) \ + FEATURE(AARCH64_AES, aes, "aes", AARCH64_HWCAP_AES, 0) \ + FEATURE(AARCH64_PMULL, pmull, "pmull", AARCH64_HWCAP_PMULL, 0) \ + FEATURE(AARCH64_SHA1, sha1, "sha1", AARCH64_HWCAP_SHA1, 0) \ + FEATURE(AARCH64_SHA2, sha2, "sha2", AARCH64_HWCAP_SHA2, 0) \ + FEATURE(AARCH64_CRC32, crc32, "crc32", AARCH64_HWCAP_CRC32, 0) \ + FEATURE(AARCH64_ATOMICS, atomics, "atomics", AARCH64_HWCAP_ATOMICS, 0) \ + FEATURE(AARCH64_FPHP, fphp, "fphp", AARCH64_HWCAP_FPHP, 0) \ + FEATURE(AARCH64_ASIMDHP, asimdhp, "asimdhp", AARCH64_HWCAP_ASIMDHP, 0) \ + FEATURE(AARCH64_CPUID, cpuid, "cpuid", AARCH64_HWCAP_CPUID, 0) \ + FEATURE(AARCH64_ASIMDRDM, asimdrdm, "asimdrdm", AARCH64_HWCAP_ASIMDRDM, 0) \ + FEATURE(AARCH64_JSCVT, jscvt, "jscvt", AARCH64_HWCAP_JSCVT, 0) \ + FEATURE(AARCH64_FCMA, fcma, "fcma", AARCH64_HWCAP_FCMA, 0) \ + FEATURE(AARCH64_LRCPC, lrcpc, "lrcpc", AARCH64_HWCAP_LRCPC, 0) \ + FEATURE(AARCH64_DCPOP, dcpop, "dcpop", AARCH64_HWCAP_DCPOP, 0) \ + FEATURE(AARCH64_SHA3, sha3, "sha3", AARCH64_HWCAP_SHA3, 0) \ + 
FEATURE(AARCH64_SM3, sm3, "sm3", AARCH64_HWCAP_SM3, 0) \ + FEATURE(AARCH64_SM4, sm4, "sm4", AARCH64_HWCAP_SM4, 0) \ + FEATURE(AARCH64_ASIMDDP, asimddp, "asimddp", AARCH64_HWCAP_ASIMDDP, 0) \ + FEATURE(AARCH64_SHA512, sha512, "sha512", AARCH64_HWCAP_SHA512, 0) \ + FEATURE(AARCH64_SVE, sve, "sve", AARCH64_HWCAP_SVE, 0) \ + FEATURE(AARCH64_ASIMDFHM, asimdfhm, "asimdfhm", AARCH64_HWCAP_ASIMDFHM, 0) \ + FEATURE(AARCH64_DIT, dit, "dit", AARCH64_HWCAP_DIT, 0) \ + FEATURE(AARCH64_USCAT, uscat, "uscat", AARCH64_HWCAP_USCAT, 0) \ + FEATURE(AARCH64_ILRCPC, ilrcpc, "ilrcpc", AARCH64_HWCAP_ILRCPC, 0) \ + FEATURE(AARCH64_FLAGM, flagm, "flagm", AARCH64_HWCAP_FLAGM, 0) \ + FEATURE(AARCH64_SSBS, ssbs, "ssbs", AARCH64_HWCAP_SSBS, 0) \ + FEATURE(AARCH64_SB, sb, "sb", AARCH64_HWCAP_SB, 0) \ + FEATURE(AARCH64_PACA, paca, "paca", AARCH64_HWCAP_PACA, 0) \ + FEATURE(AARCH64_PACG, pacg, "pacg", AARCH64_HWCAP_PACG, 0) \ + FEATURE(AARCH64_DCPODP, dcpodp, "dcpodp", 0, AARCH64_HWCAP2_DCPODP) \ + FEATURE(AARCH64_SVE2, sve2, "sve2", 0, AARCH64_HWCAP2_SVE2) \ + FEATURE(AARCH64_SVEAES, sveaes, "sveaes", 0, AARCH64_HWCAP2_SVEAES) \ + FEATURE(AARCH64_SVEPMULL, svepmull, "svepmull", 0, AARCH64_HWCAP2_SVEPMULL) \ + FEATURE(AARCH64_SVEBITPERM, svebitperm, "svebitperm", 0, \ + AARCH64_HWCAP2_SVEBITPERM) \ + FEATURE(AARCH64_SVESHA3, svesha3, "svesha3", 0, AARCH64_HWCAP2_SVESHA3) \ + FEATURE(AARCH64_SVESM4, svesm4, "svesm4", 0, AARCH64_HWCAP2_SVESM4) \ + FEATURE(AARCH64_FLAGM2, flagm2, "flagm2", 0, AARCH64_HWCAP2_FLAGM2) \ + FEATURE(AARCH64_FRINT, frint, "frint", 0, AARCH64_HWCAP2_FRINT) \ + FEATURE(AARCH64_SVEI8MM, svei8mm, "svei8mm", 0, AARCH64_HWCAP2_SVEI8MM) \ + FEATURE(AARCH64_SVEF32MM, svef32mm, "svef32mm", 0, AARCH64_HWCAP2_SVEF32MM) \ + FEATURE(AARCH64_SVEF64MM, svef64mm, "svef64mm", 0, AARCH64_HWCAP2_SVEF64MM) \ + FEATURE(AARCH64_SVEBF16, svebf16, "svebf16", 0, AARCH64_HWCAP2_SVEBF16) \ + FEATURE(AARCH64_I8MM, i8mm, "i8mm", 0, AARCH64_HWCAP2_I8MM) \ + FEATURE(AARCH64_BF16, bf16, "bf16", 0, 
AARCH64_HWCAP2_BF16) \ + FEATURE(AARCH64_DGH, dgh, "dgh", 0, AARCH64_HWCAP2_DGH) \ + FEATURE(AARCH64_RNG, rng, "rng", 0, AARCH64_HWCAP2_RNG) \ + FEATURE(AARCH64_BTI, bti, "bti", 0, AARCH64_HWCAP2_BTI) +#define DEFINE_TABLE_FEATURE_TYPE Aarch64Features +#include "define_tables.h" + +static bool HandleAarch64Line(const LineResult result, + Aarch64Info* const info) { + StringView line = result.line; + StringView key, value; + if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) { + if (CpuFeatures_StringView_IsEquals(key, str("Features"))) { + for (size_t i = 0; i < AARCH64_LAST_; ++i) { + kSetters[i](&info->features, + CpuFeatures_StringView_HasWord(value, kCpuInfoFlags[i])); + } + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU implementer"))) { + info->implementer = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU variant"))) { + info->variant = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU part"))) { + info->part = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU revision"))) { + info->revision = CpuFeatures_StringView_ParsePositiveNumber(value); + } + } + return !result.eof; +} + +static void FillProcCpuInfoData(Aarch64Info* const info) { + const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; + StackLineReader_Initialize(&reader, fd); + for (;;) { + if (!HandleAarch64Line(StackLineReader_NextLine(&reader), info)) { + break; + } + } + CpuFeatures_CloseFile(fd); + } +} + +static const Aarch64Info kEmptyAarch64Info; + +Aarch64Info GetAarch64Info(void) { + // capabilities are fetched from both getauxval and /proc/cpuinfo so we can + // have some information if the executable is sandboxed (aka no access to + // /proc/cpuinfo). 
+ Aarch64Info info = kEmptyAarch64Info; + + FillProcCpuInfoData(&info); + const HardwareCapabilities hwcaps = CpuFeatures_GetHardwareCapabilities(); + for (size_t i = 0; i < AARCH64_LAST_; ++i) { + if (CpuFeatures_IsHwCapsSet(kHardwareCapabilities[i], hwcaps)) { + kSetters[i](&info.features, true); + } + } + + return info; +} + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +int GetAarch64FeaturesEnumValue(const Aarch64Features* features, + Aarch64FeaturesEnum value) { + if (value >= AARCH64_LAST_) return false; + return kGetters[value](features); +} + +const char* GetAarch64FeaturesEnumName(Aarch64FeaturesEnum value) { + if (value >= AARCH64_LAST_) return "unknown feature"; + return kCpuInfoFlags[value]; +} diff --git a/cpu_features/src/cpuinfo_arm.c b/cpu_features/src/cpuinfo_arm.c new file mode 100644 index 0000000..0f216bf --- /dev/null +++ b/cpu_features/src/cpuinfo_arm.c @@ -0,0 +1,212 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_arm.h" + +#include +#include + +#include "internal/bit_utils.h" +#include "internal/filesystem.h" +#include "internal/hwcaps.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags and kHardwareCapabilities global tables. 
+#define DEFINE_TABLE_FEATURES \ + FEATURE(ARM_SWP, swp, "swp", ARM_HWCAP_SWP, 0) \ + FEATURE(ARM_HALF, half, "half", ARM_HWCAP_HALF, 0) \ + FEATURE(ARM_THUMB, thumb, "thumb", ARM_HWCAP_THUMB, 0) \ + FEATURE(ARM_26BIT, _26bit, "26bit", ARM_HWCAP_26BIT, 0) \ + FEATURE(ARM_FASTMULT, fastmult, "fastmult", ARM_HWCAP_FAST_MULT, 0) \ + FEATURE(ARM_FPA, fpa, "fpa", ARM_HWCAP_FPA, 0) \ + FEATURE(ARM_VFP, vfp, "vfp", ARM_HWCAP_VFP, 0) \ + FEATURE(ARM_EDSP, edsp, "edsp", ARM_HWCAP_EDSP, 0) \ + FEATURE(ARM_JAVA, java, "java", ARM_HWCAP_JAVA, 0) \ + FEATURE(ARM_IWMMXT, iwmmxt, "iwmmxt", ARM_HWCAP_IWMMXT, 0) \ + FEATURE(ARM_CRUNCH, crunch, "crunch", ARM_HWCAP_CRUNCH, 0) \ + FEATURE(ARM_THUMBEE, thumbee, "thumbee", ARM_HWCAP_THUMBEE, 0) \ + FEATURE(ARM_NEON, neon, "neon", ARM_HWCAP_NEON, 0) \ + FEATURE(ARM_VFPV3, vfpv3, "vfpv3", ARM_HWCAP_VFPV3, 0) \ + FEATURE(ARM_VFPV3D16, vfpv3d16, "vfpv3d16", ARM_HWCAP_VFPV3D16, 0) \ + FEATURE(ARM_TLS, tls, "tls", ARM_HWCAP_TLS, 0) \ + FEATURE(ARM_VFPV4, vfpv4, "vfpv4", ARM_HWCAP_VFPV4, 0) \ + FEATURE(ARM_IDIVA, idiva, "idiva", ARM_HWCAP_IDIVA, 0) \ + FEATURE(ARM_IDIVT, idivt, "idivt", ARM_HWCAP_IDIVT, 0) \ + FEATURE(ARM_VFPD32, vfpd32, "vfpd32", ARM_HWCAP_VFPD32, 0) \ + FEATURE(ARM_LPAE, lpae, "lpae", ARM_HWCAP_LPAE, 0) \ + FEATURE(ARM_EVTSTRM, evtstrm, "evtstrm", ARM_HWCAP_EVTSTRM, 0) \ + FEATURE(ARM_AES, aes, "aes", 0, ARM_HWCAP2_AES) \ + FEATURE(ARM_PMULL, pmull, "pmull", 0, ARM_HWCAP2_PMULL) \ + FEATURE(ARM_SHA1, sha1, "sha1", 0, ARM_HWCAP2_SHA1) \ + FEATURE(ARM_SHA2, sha2, "sha2", 0, ARM_HWCAP2_SHA2) \ + FEATURE(ARM_CRC32, crc32, "crc32", 0, ARM_HWCAP2_CRC32) +#define DEFINE_TABLE_FEATURE_TYPE ArmFeatures +#include "define_tables.h" + +typedef struct { + bool processor_reports_armv6; + bool hardware_reports_goldfish; +} ProcCpuInfoData; + +static int IndexOfNonDigit(StringView str) { + size_t index = 0; + while (str.size && isdigit(CpuFeatures_StringView_Front(str))) { + str = CpuFeatures_StringView_PopFront(str, 1); + ++index; + } + 
return index; +} + +static bool HandleArmLine(const LineResult result, ArmInfo* const info, + ProcCpuInfoData* const proc_info) { + StringView line = result.line; + StringView key, value; + if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) { + if (CpuFeatures_StringView_IsEquals(key, str("Features"))) { + for (size_t i = 0; i < ARM_LAST_; ++i) { + kSetters[i](&info->features, + CpuFeatures_StringView_HasWord(value, kCpuInfoFlags[i])); + } + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU implementer"))) { + info->implementer = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU variant"))) { + info->variant = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU part"))) { + info->part = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU revision"))) { + info->revision = CpuFeatures_StringView_ParsePositiveNumber(value); + } else if (CpuFeatures_StringView_IsEquals(key, str("CPU architecture"))) { + // CPU architecture is a number that may be followed by letters. e.g. + // "6TEJ", "7". 
+ const StringView digits = + CpuFeatures_StringView_KeepFront(value, IndexOfNonDigit(value)); + info->architecture = CpuFeatures_StringView_ParsePositiveNumber(digits); + } else if (CpuFeatures_StringView_IsEquals(key, str("Processor")) || + CpuFeatures_StringView_IsEquals(key, str("model name"))) { + // Android reports this in a non-Linux standard "Processor" but sometimes + // also in "model name", Linux reports it only in "model name" + // see RaspberryPiZero (Linux) vs InvalidArmv7 (Android) test-cases + proc_info->processor_reports_armv6 = + CpuFeatures_StringView_IndexOf(value, str("(v6l)")) >= 0; + } else if (CpuFeatures_StringView_IsEquals(key, str("Hardware"))) { + proc_info->hardware_reports_goldfish = + CpuFeatures_StringView_IsEquals(value, str("Goldfish")); + } + } + return !result.eof; +} + +uint32_t GetArmCpuId(const ArmInfo* const info) { + return (ExtractBitRange(info->implementer, 7, 0) << 24) | + (ExtractBitRange(info->variant, 3, 0) << 20) | + (ExtractBitRange(info->part, 11, 0) << 4) | + (ExtractBitRange(info->revision, 3, 0) << 0); +} + +static void FixErrors(ArmInfo* const info, + ProcCpuInfoData* const proc_cpu_info_data) { + // Fixing Samsung kernel reporting invalid cpu architecture. + // http://code.google.com/p/android/issues/detail?id=10812 + if (proc_cpu_info_data->processor_reports_armv6 && info->architecture >= 7) { + info->architecture = 6; + } + + // Handle kernel configuration bugs that prevent the correct reporting of CPU + // features. + switch (GetArmCpuId(info)) { + case 0x4100C080: + // Special case: The emulator-specific Android 4.2 kernel fails to report + // support for the 32-bit ARM IDIV instruction. Technically, this is a + // feature of the virtual CPU implemented by the emulator. Note that it + // could also support Thumb IDIV in the future, and this will have to be + // slightly updated. 
+ if (info->architecture >= 7 && + proc_cpu_info_data->hardware_reports_goldfish) { + info->features.idiva = true; + } + break; + case 0x511004D0: + // https://crbug.com/341598. + info->features.neon = false; + break; + case 0x510006F2: + case 0x510006F3: + // The Nexus 4 (Qualcomm Krait) kernel configuration forgets to report + // IDIV support. + info->features.idiva = true; + info->features.idivt = true; + break; + } + + // Propagate cpu features. + if (info->features.vfpv4) info->features.vfpv3 = true; + if (info->features.neon) info->features.vfpv3 = true; + if (info->features.vfpv3) info->features.vfp = true; +} + +static void FillProcCpuInfoData(ArmInfo* const info, + ProcCpuInfoData* proc_cpu_info_data) { + const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; + StackLineReader_Initialize(&reader, fd); + for (;;) { + if (!HandleArmLine(StackLineReader_NextLine(&reader), info, + proc_cpu_info_data)) { + break; + } + } + CpuFeatures_CloseFile(fd); + } +} + +static const ArmInfo kEmptyArmInfo; + +static const ProcCpuInfoData kEmptyProcCpuInfoData; + +ArmInfo GetArmInfo(void) { + // capabilities are fetched from both getauxval and /proc/cpuinfo so we can + // have some information if the executable is sandboxed (aka no access to + // /proc/cpuinfo). 
+ ArmInfo info = kEmptyArmInfo; + ProcCpuInfoData proc_cpu_info_data = kEmptyProcCpuInfoData; + + FillProcCpuInfoData(&info, &proc_cpu_info_data); + const HardwareCapabilities hwcaps = CpuFeatures_GetHardwareCapabilities(); + for (size_t i = 0; i < ARM_LAST_; ++i) { + if (CpuFeatures_IsHwCapsSet(kHardwareCapabilities[i], hwcaps)) { + kSetters[i](&info.features, true); + } + } + + FixErrors(&info, &proc_cpu_info_data); + + return info; +} + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +int GetArmFeaturesEnumValue(const ArmFeatures* features, + ArmFeaturesEnum value) { + if (value >= ARM_LAST_) return false; + return kGetters[value](features); +} + +const char* GetArmFeaturesEnumName(ArmFeaturesEnum value) { + if (value >= ARM_LAST_) return "unknown feature"; + return kCpuInfoFlags[value]; +} diff --git a/cpu_features/src/cpuinfo_mips.c b/cpu_features/src/cpuinfo_mips.c new file mode 100644 index 0000000..83e959f --- /dev/null +++ b/cpu_features/src/cpuinfo_mips.c @@ -0,0 +1,92 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_mips.h" + +#include + +#include "internal/filesystem.h" +#include "internal/hwcaps.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags and kHardwareCapabilities global tables. 
+#define DEFINE_TABLE_FEATURES \ + FEATURE(MIPS_MSA, msa, "msa", MIPS_HWCAP_MSA, 0) \ + FEATURE(MIPS_EVA, eva, "eva", 0, 0) \ + FEATURE(MIPS_R6, r6, "r6", MIPS_HWCAP_R6, 0) +#define DEFINE_TABLE_FEATURE_TYPE MipsFeatures +#include "define_tables.h" + +static bool HandleMipsLine(const LineResult result, + MipsFeatures* const features) { + StringView key, value; + // See tests for an example. + if (CpuFeatures_StringView_GetAttributeKeyValue(result.line, &key, &value)) { + if (CpuFeatures_StringView_IsEquals(key, str("ASEs implemented"))) { + for (size_t i = 0; i < MIPS_LAST_; ++i) { + kSetters[i](features, + CpuFeatures_StringView_HasWord(value, kCpuInfoFlags[i])); + } + } + } + return !result.eof; +} + +static void FillProcCpuInfoData(MipsFeatures* const features) { + const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; + StackLineReader_Initialize(&reader, fd); + for (;;) { + if (!HandleMipsLine(StackLineReader_NextLine(&reader), features)) { + break; + } + } + CpuFeatures_CloseFile(fd); + } +} + +static const MipsInfo kEmptyMipsInfo; + +MipsInfo GetMipsInfo(void) { + // capabilities are fetched from both getauxval and /proc/cpuinfo so we can + // have some information if the executable is sandboxed (aka no access to + // /proc/cpuinfo). 
+ MipsInfo info = kEmptyMipsInfo; + + FillProcCpuInfoData(&info.features); + const HardwareCapabilities hwcaps = CpuFeatures_GetHardwareCapabilities(); + for (size_t i = 0; i < MIPS_LAST_; ++i) { + if (CpuFeatures_IsHwCapsSet(kHardwareCapabilities[i], hwcaps)) { + kSetters[i](&info.features, true); + } + } + return info; +} + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +int GetMipsFeaturesEnumValue(const MipsFeatures* features, + MipsFeaturesEnum value) { + if (value >= MIPS_LAST_) return false; + return kGetters[value](features); +} + +const char* GetMipsFeaturesEnumName(MipsFeaturesEnum value) { + if (value >= MIPS_LAST_) return "unknown feature"; + return kCpuInfoFlags[value]; +} diff --git a/cpu_features/src/cpuinfo_ppc.c b/cpu_features/src/cpuinfo_ppc.c new file mode 100644 index 0000000..24401f9 --- /dev/null +++ b/cpu_features/src/cpuinfo_ppc.c @@ -0,0 +1,154 @@ +// Copyright 2018 IBM. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_ppc.h" + +#include +#include +#include + +#include "internal/bit_utils.h" +#include "internal/filesystem.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags and kHardwareCapabilities global tables. 
+#define DEFINE_TABLE_FEATURES \ + FEATURE(PPC_32, ppc32, "ppc32", PPC_FEATURE_32, 0) \ + FEATURE(PPC_64, ppc64, "ppc64", PPC_FEATURE_64, 0) \ + FEATURE(PPC_601_INSTR, ppc601, "ppc601", PPC_FEATURE_601_INSTR, 0) \ + FEATURE(PPC_HAS_ALTIVEC, altivec, "altivec", PPC_FEATURE_HAS_ALTIVEC, 0) \ + FEATURE(PPC_HAS_FPU, fpu, "fpu", PPC_FEATURE_HAS_FPU, 0) \ + FEATURE(PPC_HAS_MMU, mmu, "mmu", PPC_FEATURE_HAS_MMU, 0) \ + FEATURE(PPC_HAS_4xxMAC, mac_4xx, "4xxmac", PPC_FEATURE_HAS_4xxMAC, 0) \ + FEATURE(PPC_UNIFIED_CACHE, unifiedcache, "ucache", \ + PPC_FEATURE_UNIFIED_CACHE, 0) \ + FEATURE(PPC_HAS_SPE, spe, "spe", PPC_FEATURE_HAS_SPE, 0) \ + FEATURE(PPC_HAS_EFP_SINGLE, efpsingle, "efpsingle", \ + PPC_FEATURE_HAS_EFP_SINGLE, 0) \ + FEATURE(PPC_HAS_EFP_DOUBLE, efpdouble, "efpdouble", \ + PPC_FEATURE_HAS_EFP_DOUBLE, 0) \ + FEATURE(PPC_NO_TB, no_tb, "notb", PPC_FEATURE_NO_TB, 0) \ + FEATURE(PPC_POWER4, power4, "power4", PPC_FEATURE_POWER4, 0) \ + FEATURE(PPC_POWER5, power5, "power5", PPC_FEATURE_POWER5, 0) \ + FEATURE(PPC_POWER5_PLUS, power5plus, "power5+", PPC_FEATURE_POWER5_PLUS, 0) \ + FEATURE(PPC_CELL, cell, "cellbe", PPC_FEATURE_CELL, 0) \ + FEATURE(PPC_BOOKE, booke, "booke", PPC_FEATURE_BOOKE, 0) \ + FEATURE(PPC_SMT, smt, "smt", PPC_FEATURE_SMT, 0) \ + FEATURE(PPC_ICACHE_SNOOP, icachesnoop, "ic_snoop", PPC_FEATURE_ICACHE_SNOOP, \ + 0) \ + FEATURE(PPC_ARCH_2_05, arch205, "arch_2_05", PPC_FEATURE_ARCH_2_05, 0) \ + FEATURE(PPC_PA6T, pa6t, "pa6t", PPC_FEATURE_PA6T, 0) \ + FEATURE(PPC_HAS_DFP, dfp, "dfp", PPC_FEATURE_HAS_DFP, 0) \ + FEATURE(PPC_POWER6_EXT, power6ext, "power6x", PPC_FEATURE_POWER6_EXT, 0) \ + FEATURE(PPC_ARCH_2_06, arch206, "arch_2_06", PPC_FEATURE_ARCH_2_06, 0) \ + FEATURE(PPC_HAS_VSX, vsx, "vsx", PPC_FEATURE_HAS_VSX, 0) \ + FEATURE(PPC_PSERIES_PERFMON_COMPAT, pseries_perfmon_compat, "archpmu", \ + PPC_FEATURE_PSERIES_PERFMON_COMPAT, 0) \ + FEATURE(PPC_TRUE_LE, truele, "true_le", PPC_FEATURE_TRUE_LE, 0) \ + FEATURE(PPC_PPC_LE, ppcle, "ppcle", PPC_FEATURE_PPC_LE, 
0) \ + FEATURE(PPC_ARCH_2_07, arch207, "arch_2_07", 0, PPC_FEATURE2_ARCH_2_07) \ + FEATURE(PPC_HTM, htm, "htm", 0, PPC_FEATURE2_HTM) \ + FEATURE(PPC_DSCR, dscr, "dscr", 0, PPC_FEATURE2_DSCR) \ + FEATURE(PPC_EBB, ebb, "ebb", 0, PPC_FEATURE2_EBB) \ + FEATURE(PPC_ISEL, isel, "isel", 0, PPC_FEATURE2_ISEL) \ + FEATURE(PPC_TAR, tar, "tar", 0, PPC_FEATURE2_TAR) \ + FEATURE(PPC_VEC_CRYPTO, vcrypto, "vcrypto", 0, PPC_FEATURE2_VEC_CRYPTO) \ + FEATURE(PPC_HTM_NOSC, htm_nosc, "htm-nosc", 0, PPC_FEATURE2_HTM_NOSC) \ + FEATURE(PPC_ARCH_3_00, arch300, "arch_3_00", 0, PPC_FEATURE2_ARCH_3_00) \ + FEATURE(PPC_HAS_IEEE128, ieee128, "ieee128", 0, PPC_FEATURE2_HAS_IEEE128) \ + FEATURE(PPC_DARN, darn, "darn", 0, PPC_FEATURE2_DARN) \ + FEATURE(PPC_SCV, scv, "scv", 0, PPC_FEATURE2_SCV) \ + FEATURE(PPC_HTM_NO_SUSPEND, htm_no_suspend, "htm-no-suspend", 0, \ + PPC_FEATURE2_HTM_NO_SUSPEND) +#define DEFINE_TABLE_FEATURE_TYPE PPCFeatures +#include "define_tables.h" + +static bool HandlePPCLine(const LineResult result, + PPCPlatformStrings* const strings) { + StringView line = result.line; + StringView key, value; + if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) { + if (CpuFeatures_StringView_HasWord(key, "platform")) { + CpuFeatures_StringView_CopyString(value, strings->platform, + sizeof(strings->platform)); + } else if (CpuFeatures_StringView_IsEquals(key, str("model"))) { + CpuFeatures_StringView_CopyString(value, strings->model, + sizeof(strings->model)); // each copy is bounded by its own destination buffer + } else if (CpuFeatures_StringView_IsEquals(key, str("machine"))) { + CpuFeatures_StringView_CopyString(value, strings->machine, + sizeof(strings->machine)); + } else if (CpuFeatures_StringView_IsEquals(key, str("cpu"))) { + CpuFeatures_StringView_CopyString(value, strings->cpu, + sizeof(strings->cpu)); + } + } + return !result.eof; +} + +static void FillProcCpuInfoData(PPCPlatformStrings* const strings) { + const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; +
StackLineReader_Initialize(&reader, fd); + for (;;) { + if (!HandlePPCLine(StackLineReader_NextLine(&reader), strings)) { + break; + } + } + CpuFeatures_CloseFile(fd); + } +} + +static const PPCInfo kEmptyPPCInfo; + +PPCInfo GetPPCInfo(void) { + /* + * On Power feature flags aren't currently in cpuinfo so we only look at + * the auxilary vector. + */ + PPCInfo info = kEmptyPPCInfo; + const HardwareCapabilities hwcaps = CpuFeatures_GetHardwareCapabilities(); + for (size_t i = 0; i < PPC_LAST_; ++i) { + if (CpuFeatures_IsHwCapsSet(kHardwareCapabilities[i], hwcaps)) { + kSetters[i](&info.features, true); + } + } + return info; +} + +static const PPCPlatformStrings kEmptyPPCPlatformStrings; + +PPCPlatformStrings GetPPCPlatformStrings(void) { + PPCPlatformStrings strings = kEmptyPPCPlatformStrings; + + FillProcCpuInfoData(&strings); + strings.type = CpuFeatures_GetPlatformType(); + return strings; +} + +//////////////////////////////////////////////////////////////////////////////// +// Introspection functions + +int GetPPCFeaturesEnumValue(const PPCFeatures* features, + PPCFeaturesEnum value) { + if (value >= PPC_LAST_) return false; + return kGetters[value](features); +} + +const char* GetPPCFeaturesEnumName(PPCFeaturesEnum value) { + if (value >= PPC_LAST_) return "unknown feature"; + return kCpuInfoFlags[value]; +} diff --git a/cpu_features/src/cpuinfo_x86.c b/cpu_features/src/cpuinfo_x86.c new file mode 100644 index 0000000..378ed05 --- /dev/null +++ b/cpu_features/src/cpuinfo_x86.c @@ -0,0 +1,1622 @@ +// Copyright 2017 Google LLC +// Copyright 2020 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "cpuinfo_x86.h" + +#include +#include + +#include "internal/bit_utils.h" +#include "internal/cpuid_x86.h" + +#if !defined(CPU_FEATURES_ARCH_X86) +#error "Cannot compile cpuinfo_x86 on a non x86 platform." +#endif + +// Generation of feature's getters/setters functions and kGetters, kSetters, +// kCpuInfoFlags global tables. +#define DEFINE_TABLE_FEATURES \ + FEATURE(X86_FPU, fpu, "fpu", 0, 0) \ + FEATURE(X86_TSC, tsc, "tsc", 0, 0) \ + FEATURE(X86_CX8, cx8, "cx8", 0, 0) \ + FEATURE(X86_CLFSH, clfsh, "clfsh", 0, 0) \ + FEATURE(X86_MMX, mmx, "mmx", 0, 0) \ + FEATURE(X86_AES, aes, "aes", 0, 0) \ + FEATURE(X86_ERMS, erms, "erms", 0, 0) \ + FEATURE(X86_F16C, f16c, "f16c", 0, 0) \ + FEATURE(X86_FMA4, fma4, "fma4", 0, 0) \ + FEATURE(X86_FMA3, fma3, "fma3", 0, 0) \ + FEATURE(X86_VAES, vaes, "vaes", 0, 0) \ + FEATURE(X86_VPCLMULQDQ, vpclmulqdq, "vpclmulqdq", 0, 0) \ + FEATURE(X86_BMI1, bmi1, "bmi1", 0, 0) \ + FEATURE(X86_HLE, hle, "hle", 0, 0) \ + FEATURE(X86_BMI2, bmi2, "bmi2", 0, 0) \ + FEATURE(X86_RTM, rtm, "rtm", 0, 0) \ + FEATURE(X86_RDSEED, rdseed, "rdseed", 0, 0) \ + FEATURE(X86_CLFLUSHOPT, clflushopt, "clflushopt", 0, 0) \ + FEATURE(X86_CLWB, clwb, "clwb", 0, 0) \ + FEATURE(X86_SSE, sse, "sse", 0, 0) \ + FEATURE(X86_SSE2, sse2, "sse2", 0, 0) \ + FEATURE(X86_SSE3, sse3, "sse3", 0, 0) \ + FEATURE(X86_SSSE3, ssse3, "ssse3", 0, 0) \ + FEATURE(X86_SSE4_1, sse4_1, "sse4_1", 0, 0) \ + FEATURE(X86_SSE4_2, sse4_2, "sse4_2", 0, 0) \ + FEATURE(X86_SSE4A, sse4a, "sse4a", 0, 0) \ + FEATURE(X86_AVX, avx, "avx", 0, 0) \ + FEATURE(X86_AVX2, avx2, "avx2", 
0, 0) \ + FEATURE(X86_AVX512F, avx512f, "avx512f", 0, 0) \ + FEATURE(X86_AVX512CD, avx512cd, "avx512cd", 0, 0) \ + FEATURE(X86_AVX512ER, avx512er, "avx512er", 0, 0) \ + FEATURE(X86_AVX512PF, avx512pf, "avx512pf", 0, 0) \ + FEATURE(X86_AVX512BW, avx512bw, "avx512bw", 0, 0) \ + FEATURE(X86_AVX512DQ, avx512dq, "avx512dq", 0, 0) \ + FEATURE(X86_AVX512VL, avx512vl, "avx512vl", 0, 0) \ + FEATURE(X86_AVX512IFMA, avx512ifma, "avx512ifma", 0, 0) \ + FEATURE(X86_AVX512VBMI, avx512vbmi, "avx512vbmi", 0, 0) \ + FEATURE(X86_AVX512VBMI2, avx512vbmi2, "avx512vbmi2", 0, 0) \ + FEATURE(X86_AVX512VNNI, avx512vnni, "avx512vnni", 0, 0) \ + FEATURE(X86_AVX512BITALG, avx512bitalg, "avx512bitalg", 0, 0) \ + FEATURE(X86_AVX512VPOPCNTDQ, avx512vpopcntdq, "avx512vpopcntdq", 0, 0) \ + FEATURE(X86_AVX512_4VNNIW, avx512_4vnniw, "avx512_4vnniw", 0, 0) \ + FEATURE(X86_AVX512_4VBMI2, avx512_4vbmi2, "avx512_4vbmi2", 0, 0) \ + FEATURE(X86_AVX512_SECOND_FMA, avx512_second_fma, "avx512_second_fma", 0, 0) \ + FEATURE(X86_AVX512_4FMAPS, avx512_4fmaps, "avx512_4fmaps", 0, 0) \ + FEATURE(X86_AVX512_BF16, avx512_bf16, "avx512_bf16", 0, 0) \ + FEATURE(X86_AVX512_VP2INTERSECT, avx512_vp2intersect, "avx512_vp2intersect", \ + 0, 0) \ + FEATURE(X86_AMX_BF16, amx_bf16, "amx_bf16", 0, 0) \ + FEATURE(X86_AMX_TILE, amx_tile, "amx_tile", 0, 0) \ + FEATURE(X86_AMX_INT8, amx_int8, "amx_int8", 0, 0) \ + FEATURE(X86_PCLMULQDQ, pclmulqdq, "pclmulqdq", 0, 0) \ + FEATURE(X86_SMX, smx, "smx", 0, 0) \ + FEATURE(X86_SGX, sgx, "sgx", 0, 0) \ + FEATURE(X86_CX16, cx16, "cx16", 0, 0) \ + FEATURE(X86_SHA, sha, "sha", 0, 0) \ + FEATURE(X86_POPCNT, popcnt, "popcnt", 0, 0) \ + FEATURE(X86_MOVBE, movbe, "movbe", 0, 0) \ + FEATURE(X86_RDRND, rdrnd, "rdrnd", 0, 0) \ + FEATURE(X86_DCA, dca, "dca", 0, 0) \ + FEATURE(X86_SS, ss, "ss", 0, 0) +#define DEFINE_TABLE_FEATURE_TYPE X86Features +#define DEFINE_TABLE_DONT_GENERATE_HWCAPS +#include "define_tables.h" + +// The following includes are necessary to provide SSE detections on pre-AVX +// 
microarchitectures. +#if defined(CPU_FEATURES_OS_WINDOWS) +#include // IsProcessorFeaturePresent +#elif defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) +#include "internal/filesystem.h" // Needed to parse /proc/cpuinfo +#include "internal/stack_line_reader.h" // Needed to parse /proc/cpuinfo +#include "internal/string_view.h" // Needed to parse /proc/cpuinfo +#elif defined(CPU_FEATURES_OS_DARWIN) +#if !defined(HAVE_SYSCTLBYNAME) +#error "Darwin needs support for sysctlbyname" +#endif +#include +#else +#error "Unsupported OS" +#endif // CPU_FEATURES_OS + +//////////////////////////////////////////////////////////////////////////////// +// Definitions for CpuId and GetXCR0Eax. +//////////////////////////////////////////////////////////////////////////////// + +#if defined(CPU_FEATURES_MOCK_CPUID_X86) +// Implementation will be provided by test/cpuinfo_x86_test.cc. +#elif defined(CPU_FEATURES_COMPILER_CLANG) || defined(CPU_FEATURES_COMPILER_GCC) + +#include + +Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) { + Leaf leaf; + __cpuid_count(leaf_id, ecx, leaf.eax, leaf.ebx, leaf.ecx, leaf.edx); + return leaf; +} + +uint32_t GetXCR0Eax(void) { + uint32_t eax, edx; + /* named form of xgetbv not supported on OSX, so must use byte form, see: + https://github.com/asmjit/asmjit/issues/78 + */ + __asm(".byte 0x0F, 0x01, 0xd0" : "=a"(eax), "=d"(edx) : "c"(0)); + return eax; +} + +#elif defined(CPU_FEATURES_COMPILER_MSC) + +#include +#include // For __cpuidex() + +Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) { + Leaf leaf; + int data[4]; + __cpuidex(data, leaf_id, ecx); + leaf.eax = data[0]; + leaf.ebx = data[1]; + leaf.ecx = data[2]; + leaf.edx = data[3]; + return leaf; +} + +uint32_t GetXCR0Eax(void) { return (uint32_t)_xgetbv(0); } + +#else +#error "Unsupported compiler, x86 cpuid requires either GCC, Clang or MSVC." 
+#endif + +static Leaf CpuId(uint32_t leaf_id) { return GetCpuidLeaf(leaf_id, 0); } + +static const Leaf kEmptyLeaf; + +static Leaf SafeCpuIdEx(uint32_t max_cpuid_leaf, uint32_t leaf_id, int ecx) { + if (leaf_id <= max_cpuid_leaf) { + return GetCpuidLeaf(leaf_id, ecx); + } else { + return kEmptyLeaf; + } +} + +static Leaf SafeCpuId(uint32_t max_cpuid_leaf, uint32_t leaf_id) { + return SafeCpuIdEx(max_cpuid_leaf, leaf_id, 0); +} + +#define MASK_XMM 0x2 +#define MASK_YMM 0x4 +#define MASK_MASKREG 0x20 +#define MASK_ZMM0_15 0x40 +#define MASK_ZMM16_31 0x80 +#define MASK_XTILECFG 0x20000 +#define MASK_XTILEDATA 0x40000 + +static bool HasMask(uint32_t value, uint32_t mask) { + return (value & mask) == mask; +} + +// Checks that operating system saves and restores xmm registers during context +// switches. +static bool HasXmmOsXSave(uint32_t xcr0_eax) { + return HasMask(xcr0_eax, MASK_XMM); +} + +// Checks that operating system saves and restores ymm registers during context +// switches. +static bool HasYmmOsXSave(uint32_t xcr0_eax) { + return HasMask(xcr0_eax, MASK_XMM | MASK_YMM); +} + +// Checks that operating system saves and restores zmm registers during context +// switches. +static bool HasZmmOsXSave(uint32_t xcr0_eax) { + return HasMask(xcr0_eax, MASK_XMM | MASK_YMM | MASK_MASKREG | MASK_ZMM0_15 | + MASK_ZMM16_31); +} + +// Checks that operating system saves and restores AMX/TMUL state during context +// switches. 
+static bool HasTmmOsXSave(uint32_t xcr0_eax) { + return HasMask(xcr0_eax, MASK_XMM | MASK_YMM | MASK_MASKREG | MASK_ZMM0_15 | + MASK_ZMM16_31 | MASK_XTILECFG | MASK_XTILEDATA); +} + +static bool HasSecondFMA(uint32_t model) { + // Skylake server + if (model == 0x55) { + char proc_name[49] = {0}; + FillX86BrandString(proc_name); + // detect Xeon + if (proc_name[9] == 'X') { + // detect Silver or Bronze + if (proc_name[17] == 'S' || proc_name[17] == 'B') return false; + // detect Gold 5_20 and below, except for Gold 53__ + if (proc_name[17] == 'G' && proc_name[22] == '5') + return ((proc_name[23] == '3') || + (proc_name[24] == '2' && proc_name[25] == '2')); + // detect Xeon W 210x + if (proc_name[17] == 'W' && proc_name[21] == '0') return false; + // detect Xeon D 2xxx + if (proc_name[17] == 'D' && proc_name[19] == '2' && proc_name[20] == '1') + return false; + } + return true; + } + // Cannon Lake client + if (model == 0x66) return false; + // Ice Lake client + if (model == 0x7d || model == 0x7e) return false; + // This is the right default... 
+ return true; +} + +static void SetVendor(const Leaf leaf, char* const vendor) { + *(uint32_t*)(vendor) = leaf.ebx; + *(uint32_t*)(vendor + 4) = leaf.edx; + *(uint32_t*)(vendor + 8) = leaf.ecx; + vendor[12] = '\0'; +} + +static int IsVendor(const Leaf leaf, const char* const name) { + const uint32_t ebx = *(const uint32_t*)(name); + const uint32_t edx = *(const uint32_t*)(name + 4); + const uint32_t ecx = *(const uint32_t*)(name + 8); + return leaf.ebx == ebx && leaf.ecx == ecx && leaf.edx == edx; +} + +static const CacheLevelInfo kEmptyCacheLevelInfo; + +static CacheLevelInfo GetCacheLevelInfo(const uint32_t reg) { + const int UNDEF = -1; + const int KiB = 1024; + const int MiB = 1024 * KiB; + switch (reg) { + case 0x01: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 32, + .partitioning = 0}; + case 0x02: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 2, + .partitioning = 0}; + case 0x03: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0x04: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 8, + .partitioning = 0}; + case 0x05: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 32, + .partitioning = 0}; + case 0x06: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 8 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x08: + return (CacheLevelInfo){.level = 1, + .cache_type = 
CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 16 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x09: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 32 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x0A: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * KiB, + .ways = 2, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x0B: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 4, + .partitioning = 0}; + case 0x0C: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x0D: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x0E: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 24 * KiB, + .ways = 6, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x1D: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 128 * KiB, + .ways = 2, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x21: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 256 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x22: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x23: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + 
.cache_size = 1 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x24: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x25: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x29: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x2C: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 32 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x30: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 32 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x40: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = UNDEF, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x41: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 128 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x42: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 256 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x43: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x44: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 
4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x45: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x46: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x47: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x48: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 3 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x49: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case (0x49 | (1 << 8)): + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4A: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 6 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4B: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4C: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 12 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4D: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries 
= UNDEF, + .partitioning = 0}; + case 0x4E: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 6 * MiB, + .ways = 24, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x4F: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 32, + .partitioning = 0}; + case 0x50: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0x51: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0x52: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 256, + .partitioning = 0}; + case 0x55: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 7, + .partitioning = 0}; + case 0x56: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 16, + .partitioning = 0}; + case 0x57: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 16, + .partitioning = 0}; + case 0x59: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 16, + .partitioning = 0}; + case 0x5A: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 32, 
+ .partitioning = 0}; + case 0x5B: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0x5C: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0x5D: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = 256, + .partitioning = 0}; + case 0x60: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x61: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 48, + .partitioning = 0}; + case 0x63: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 4, + .partitioning = 0}; + case 0x66: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x67: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 16 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x68: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 32 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x70: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 12 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 
0}; + case 0x71: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 16 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x72: + return (CacheLevelInfo){.level = 1, + .cache_type = CPU_FEATURE_CACHE_INSTRUCTION, + .cache_size = 32 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x76: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 8, + .partitioning = 0}; + case 0x78: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x79: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 128 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x7A: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 256 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x7B: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x7C: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 2}; + case 0x7D: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x7F: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 2, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x80: + 
return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x82: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 256 * KiB, + .ways = 8, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x83: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 8, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x84: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 8, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x85: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 8, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x86: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 4, + .line_size = 32, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0x87: + return (CacheLevelInfo){.level = 2, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xA0: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_DTLB, + .cache_size = 4 * KiB, + .ways = 0xFF, + .line_size = UNDEF, + .tlb_entries = 32, + .partitioning = 0}; + case 0xB0: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0xB1: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 8, + .partitioning = 0}; + case 0xB2: + return (CacheLevelInfo){.level = 
UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0xB3: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0xB4: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 256, + .partitioning = 0}; + case 0xB5: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0xB6: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = 128, + .partitioning = 0}; + case 0xBA: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 64, + .partitioning = 0}; + case 0xC0: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_TLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 8, + .partitioning = 0}; + case 0xC1: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_STLB, + .cache_size = 4 * KiB, + .ways = 8, + .line_size = UNDEF, + .tlb_entries = 1024, + .partitioning = 0}; + case 0xC2: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_DTLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 16, + .partitioning = 0}; + case 0xC3: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_STLB, + .cache_size = 4 * KiB, + .ways = 6, + .line_size = UNDEF, + .tlb_entries = 1536, + .partitioning = 0}; + case 0xCA: + return (CacheLevelInfo){.level = UNDEF, + .cache_type 
= CPU_FEATURE_CACHE_STLB, + .cache_size = 4 * KiB, + .ways = 4, + .line_size = UNDEF, + .tlb_entries = 512, + .partitioning = 0}; + case 0xD0: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 512 * KiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD1: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD2: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 4, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD6: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD7: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xD8: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 8, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xDC: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 1 * 1536 * KiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xDD: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 3 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xDE: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 6 * MiB, + .ways = 12, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xE2: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 2 * 
MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xE3: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 4 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xE4: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 8 * MiB, + .ways = 16, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xEA: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 12 * MiB, + .ways = 24, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xEB: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 18 * MiB, + .ways = 24, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xEC: + return (CacheLevelInfo){.level = 3, + .cache_type = CPU_FEATURE_CACHE_DATA, + .cache_size = 24 * MiB, + .ways = 24, + .line_size = 64, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xF0: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_PREFETCH, + .cache_size = 64 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xF1: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_PREFETCH, + .cache_size = 128 * KiB, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + case 0xFF: + return (CacheLevelInfo){.level = UNDEF, + .cache_type = CPU_FEATURE_CACHE_NULL, + .cache_size = UNDEF, + .ways = UNDEF, + .line_size = UNDEF, + .tlb_entries = UNDEF, + .partitioning = 0}; + default: + return kEmptyCacheLevelInfo; + } +} + +static void GetByteArrayFromRegister(uint32_t result[4], const uint32_t reg) { + for (int i = 0; i < 4; ++i) { + result[i] = ExtractBitRange(reg, (i + 1) * 8, i * 8); + } +} + +static void ParseLeaf2(const int max_cpuid_leaf, CacheInfo* info) { + Leaf leaf = 
SafeCpuId(max_cpuid_leaf, 2); + uint32_t registers[] = {leaf.eax, leaf.ebx, leaf.ecx, leaf.edx}; + for (int i = 0; i < 4; ++i) { + if (registers[i] & (1U << 31)) { + continue; // register does not contains valid information + } + uint32_t bytes[4]; + GetByteArrayFromRegister(bytes, registers[i]); + for (int j = 0; j < 4; ++j) { + if (bytes[j] == 0xFF) + break; // leaf 4 should be used to fetch cache information + info->levels[info->size] = GetCacheLevelInfo(bytes[j]); + } + info->size++; + } +} + +static void ParseLeaf4(const int max_cpuid_leaf, CacheInfo* info) { + info->size = 0; + for (int cache_id = 0; cache_id < CPU_FEATURES_MAX_CACHE_LEVEL; cache_id++) { + const Leaf leaf = SafeCpuIdEx(max_cpuid_leaf, 4, cache_id); + CacheType cache_type = ExtractBitRange(leaf.eax, 4, 0); + if (cache_type == CPU_FEATURE_CACHE_NULL) { + info->levels[cache_id] = kEmptyCacheLevelInfo; + continue; + } + int level = ExtractBitRange(leaf.eax, 7, 5); + int line_size = ExtractBitRange(leaf.ebx, 11, 0) + 1; + int partitioning = ExtractBitRange(leaf.ebx, 21, 12) + 1; + int ways = ExtractBitRange(leaf.ebx, 31, 22) + 1; + int tlb_entries = leaf.ecx + 1; + int cache_size = (ways * partitioning * line_size * (tlb_entries)); + info->levels[cache_id] = (CacheLevelInfo){.level = level, + .cache_type = cache_type, + .cache_size = cache_size, + .ways = ways, + .line_size = line_size, + .tlb_entries = tlb_entries, + .partitioning = partitioning}; + info->size++; + } +} + +// Internal structure to hold the OS support for vector operations. +// Avoid to recompute them since each call to cpuid is ~100 cycles. 
+typedef struct { + bool have_sse_via_os; + bool have_sse_via_cpuid; + bool have_avx; + bool have_avx512; + bool have_amx; +} OsSupport; + +static const OsSupport kEmptyOsSupport; + +static OsSupport CheckOsSupport(const uint32_t max_cpuid_leaf) { + const Leaf leaf_1 = SafeCpuId(max_cpuid_leaf, 1); + const bool have_xsave = IsBitSet(leaf_1.ecx, 26); + const bool have_osxsave = IsBitSet(leaf_1.ecx, 27); + const bool have_xcr0 = have_xsave && have_osxsave; + + OsSupport os_support = kEmptyOsSupport; + + if (have_xcr0) { + // AVX capable cpu will expose XCR0. + const uint32_t xcr0_eax = GetXCR0Eax(); + os_support.have_sse_via_cpuid = HasXmmOsXSave(xcr0_eax); + os_support.have_avx = HasYmmOsXSave(xcr0_eax); + os_support.have_avx512 = HasZmmOsXSave(xcr0_eax); + os_support.have_amx = HasTmmOsXSave(xcr0_eax); + } else { + // Atom based or older cpus need to ask the OS for sse support. + os_support.have_sse_via_os = true; + } + + return os_support; +} + +#if defined(CPU_FEATURES_OS_WINDOWS) +#if defined(CPU_FEATURES_MOCK_CPUID_X86) +extern bool GetWindowsIsProcessorFeaturePresent(DWORD); +#else // CPU_FEATURES_MOCK_CPUID_X86 +static bool GetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) { + return IsProcessorFeaturePresent(ProcessorFeature); +} +#endif +#endif // CPU_FEATURES_OS_WINDOWS + +#if defined(CPU_FEATURES_OS_DARWIN) +#if defined(CPU_FEATURES_MOCK_CPUID_X86) +extern bool GetDarwinSysCtlByName(const char*); +#else // CPU_FEATURES_MOCK_CPUID_X86 +static bool GetDarwinSysCtlByName(const char* name) { + int enabled; + size_t enabled_len = sizeof(enabled); + const int failure = sysctlbyname(name, &enabled, &enabled_len, NULL, 0); + return failure ? 
false : enabled; +} +#endif +#endif // CPU_FEATURES_OS_DARWIN + +static void DetectSseViaOs(X86Features* features) { +#if defined(CPU_FEATURES_OS_WINDOWS) + // https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent + features->sse = + GetWindowsIsProcessorFeaturePresent(PF_XMMI_INSTRUCTIONS_AVAILABLE); + features->sse2 = + GetWindowsIsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE); + features->sse3 = + GetWindowsIsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE); +#elif defined(CPU_FEATURES_OS_DARWIN) + // Handling Darwin platform through sysctlbyname. + features->sse = GetDarwinSysCtlByName("hw.optional.sse"); + features->sse2 = GetDarwinSysCtlByName("hw.optional.sse2"); + features->sse3 = GetDarwinSysCtlByName("hw.optional.sse3"); + features->ssse3 = GetDarwinSysCtlByName("hw.optional.supplementalsse3"); + features->sse4_1 = GetDarwinSysCtlByName("hw.optional.sse4_1"); + features->sse4_2 = GetDarwinSysCtlByName("hw.optional.sse4_2"); +#elif defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) + // Handling Linux platform through /proc/cpuinfo. 
+ const int fd = CpuFeatures_OpenFile("/proc/cpuinfo"); + if (fd >= 0) { + StackLineReader reader; + StackLineReader_Initialize(&reader, fd); + for (;;) { + const LineResult result = StackLineReader_NextLine(&reader); + const StringView line = result.line; + StringView key, value; + if (CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)) { + if (CpuFeatures_StringView_IsEquals(key, str("flags"))) { + features->sse = CpuFeatures_StringView_HasWord(value, "sse"); + features->sse2 = CpuFeatures_StringView_HasWord(value, "sse2"); + features->sse3 = CpuFeatures_StringView_HasWord(value, "sse3"); + features->ssse3 = CpuFeatures_StringView_HasWord(value, "ssse3"); + features->sse4_1 = CpuFeatures_StringView_HasWord(value, "sse4_1"); + features->sse4_2 = CpuFeatures_StringView_HasWord(value, "sse4_2"); + break; + } + } + if (result.eof) break; + } + CpuFeatures_CloseFile(fd); + } +#else +#error "Unsupported fallback detection of SSE OS support." +#endif +} + +// Reference https://en.wikipedia.org/wiki/CPUID. 
+// Fills `info` (family, model, stepping and feature bits) from CPUID leaves 1
+// and 7 (including sub-leaf 7.1).
+//   max_cpuid_leaf: highest basic leaf, forwarded to SafeCpuId/SafeCpuIdEx.
+//   os_support:     gates which SIMD feature groups are reported.
+//   info:           output structure, updated in place.
+static void ParseCpuId(const uint32_t max_cpuid_leaf,
+                       const OsSupport os_support, X86Info* info) {
+  const Leaf leaf_1 = SafeCpuId(max_cpuid_leaf, 1);
+  const Leaf leaf_7 = SafeCpuId(max_cpuid_leaf, 7);
+  const Leaf leaf_7_1 = SafeCpuIdEx(max_cpuid_leaf, 7, 1);
+
+  const uint32_t family = ExtractBitRange(leaf_1.eax, 11, 8);
+  const uint32_t extended_family = ExtractBitRange(leaf_1.eax, 27, 20);
+  const uint32_t model = ExtractBitRange(leaf_1.eax, 7, 4);
+  const uint32_t extended_model = ExtractBitRange(leaf_1.eax, 19, 16);
+
+  X86Features* const features = &info->features;
+
+  // Per the CPUID specification the extended family field only contributes
+  // when the base family is 0xF, and the extended model field only when the
+  // base family is 0x6 or 0xF; for other families those fields are reserved
+  // and must not be folded into the displayed value.
+  info->family = (family == 0x0F) ? extended_family + family : family;
+  info->model = (family == 0x06 || family == 0x0F)
+                    ? (extended_model << 4) + model
+                    : model;
+  info->stepping = ExtractBitRange(leaf_1.eax, 3, 0);
+
+  // Features reported regardless of OS state-saving support.
+  features->fpu = IsBitSet(leaf_1.edx, 0);
+  features->tsc = IsBitSet(leaf_1.edx, 4);
+  features->cx8 = IsBitSet(leaf_1.edx, 8);
+  features->clfsh = IsBitSet(leaf_1.edx, 19);
+  features->mmx = IsBitSet(leaf_1.edx, 23);
+  features->ss = IsBitSet(leaf_1.edx, 27);
+  features->pclmulqdq = IsBitSet(leaf_1.ecx, 1);
+  features->smx = IsBitSet(leaf_1.ecx, 6);
+  features->cx16 = IsBitSet(leaf_1.ecx, 13);
+  features->dca = IsBitSet(leaf_1.ecx, 18);
+  features->movbe = IsBitSet(leaf_1.ecx, 22);
+  features->popcnt = IsBitSet(leaf_1.ecx, 23);
+  features->aes = IsBitSet(leaf_1.ecx, 25);
+  features->f16c = IsBitSet(leaf_1.ecx, 29);
+  features->rdrnd = IsBitSet(leaf_1.ecx, 30);
+  features->sgx = IsBitSet(leaf_7.ebx, 2);
+  features->bmi1 = IsBitSet(leaf_7.ebx, 3);
+  features->hle = IsBitSet(leaf_7.ebx, 4);
+  features->bmi2 = IsBitSet(leaf_7.ebx, 8);
+  features->erms = IsBitSet(leaf_7.ebx, 9);
+  features->rtm = IsBitSet(leaf_7.ebx, 11);
+  features->rdseed = IsBitSet(leaf_7.ebx, 18);
+  features->clflushopt = IsBitSet(leaf_7.ebx, 23);
+  features->clwb = IsBitSet(leaf_7.ebx, 24);
+  features->sha = IsBitSet(leaf_7.ebx, 29);
+  features->vaes = IsBitSet(leaf_7.ecx, 9);
+  features->vpclmulqdq = IsBitSet(leaf_7.ecx, 10);
+
+  // SSE family: prefer the OS-provided answer, otherwise trust CPUID.
+  if (os_support.have_sse_via_os) {
+    DetectSseViaOs(features);
+  } else if (os_support.have_sse_via_cpuid) {
+    features->sse = IsBitSet(leaf_1.edx, 25);
+    features->sse2 = IsBitSet(leaf_1.edx, 26);
+    features->sse3 = IsBitSet(leaf_1.ecx, 0);
+    features->ssse3 = IsBitSet(leaf_1.ecx, 9);
+    features->sse4_1 = IsBitSet(leaf_1.ecx, 19);
+    features->sse4_2 = IsBitSet(leaf_1.ecx, 20);
+  }
+
+  if (os_support.have_avx) {
+    features->fma3 = IsBitSet(leaf_1.ecx, 12);
+    features->avx = IsBitSet(leaf_1.ecx, 28);
+    features->avx2 = IsBitSet(leaf_7.ebx, 5);
+  }
+
+  if (os_support.have_avx512) {
+    features->avx512f = IsBitSet(leaf_7.ebx, 16);
+    features->avx512cd = IsBitSet(leaf_7.ebx, 28);
+    features->avx512er = IsBitSet(leaf_7.ebx, 27);
+    features->avx512pf = IsBitSet(leaf_7.ebx, 26);
+    features->avx512bw = IsBitSet(leaf_7.ebx, 30);
+    features->avx512dq = IsBitSet(leaf_7.ebx, 17);
+    features->avx512vl = IsBitSet(leaf_7.ebx, 31);
+    features->avx512ifma = IsBitSet(leaf_7.ebx, 21);
+    features->avx512vbmi = IsBitSet(leaf_7.ecx, 1);
+    features->avx512vbmi2 = IsBitSet(leaf_7.ecx, 6);
+    features->avx512vnni = IsBitSet(leaf_7.ecx, 11);
+    features->avx512bitalg = IsBitSet(leaf_7.ecx, 12);
+    features->avx512vpopcntdq = IsBitSet(leaf_7.ecx, 14);
+    features->avx512_4vnniw = IsBitSet(leaf_7.edx, 2);
+    // NOTE(review): avx512_4vbmi2 and avx512_4fmaps below both read EDX bit 3,
+    // so the two flags are always identical; confirm whether avx512_4vbmi2 is
+    // a real feature before relying on it.
+    features->avx512_4vbmi2 = IsBitSet(leaf_7.edx, 3);
+    features->avx512_second_fma = HasSecondFMA(info->model);
+    features->avx512_4fmaps = IsBitSet(leaf_7.edx, 3);
+    features->avx512_bf16 = IsBitSet(leaf_7_1.eax, 5);
+    features->avx512_vp2intersect = IsBitSet(leaf_7.edx, 8);
+  }
+
+  if (os_support.have_amx) {
+    features->amx_bf16 = IsBitSet(leaf_7.edx, 22);
+    features->amx_tile = IsBitSet(leaf_7.edx, 24);
+    features->amx_int8 = IsBitSet(leaf_7.edx, 25);
+  }
+}
+
+// Reference
+// https://en.wikipedia.org/wiki/CPUID#EAX=80000000h:_Get_Highest_Extended_Function_Implemented.
+// Reads extended leaf 0x80000001 and records the AMD-only features sse4a and
+// fma4, each gated by the matching OS-support flag.
+static void ParseExtraAMDCpuId(X86Info* info, OsSupport os_support) {
+  const Leaf leaf_80000000 = CpuId(0x80000000);
+  const uint32_t max_extended_cpuid_leaf = leaf_80000000.eax;
+  const Leaf leaf_80000001 = SafeCpuId(max_extended_cpuid_leaf, 0x80000001);
+
+  X86Features* const features = &info->features;
+
+  if (os_support.have_sse_via_cpuid) {
+    features->sse4a = IsBitSet(leaf_80000001.ecx, 6);
+  }
+
+  if (os_support.have_avx) {
+    features->fma4 = IsBitSet(leaf_80000001.ecx, 16);
+  }
+}
+
+// Zero-initialized templates (static storage) used as starting values below.
+static const X86Info kEmptyX86Info;
+static const CacheInfo kEmptyCacheInfo;
+
+// Returns the vendor string and, for GenuineIntel / AuthenticAMD only, the
+// family/model/stepping and feature flags. Other vendors get an otherwise
+// empty X86Info.
+X86Info GetX86Info(void) {
+  X86Info info = kEmptyX86Info;
+  const Leaf leaf_0 = CpuId(0);
+  const bool is_intel = IsVendor(leaf_0, "GenuineIntel");
+  const bool is_amd = IsVendor(leaf_0, "AuthenticAMD");
+  SetVendor(leaf_0, info.vendor);
+  if (is_intel || is_amd) {
+    const uint32_t max_cpuid_leaf = leaf_0.eax;
+    const OsSupport os_support = CheckOsSupport(max_cpuid_leaf);
+    ParseCpuId(max_cpuid_leaf, os_support, &info);
+    if (is_amd) {
+      ParseExtraAMDCpuId(&info, os_support);
+    }
+  }
+  return info;
+}
+
+// Returns cache information parsed from leaves 2 and 4. Only GenuineIntel is
+// handled; any other vendor yields an empty CacheInfo.
+CacheInfo GetX86CacheInfo(void) {
+  CacheInfo info = kEmptyCacheInfo;
+  const Leaf leaf_0 = CpuId(0);
+  const uint32_t max_cpuid_leaf = leaf_0.eax;
+  if (IsVendor(leaf_0, "GenuineIntel")) {
+    ParseLeaf2(max_cpuid_leaf, &info);
+    ParseLeaf4(max_cpuid_leaf, &info);
+  }
+  return info;
+}
+
+// Packs (family, model) into one integer so the pair can be used as a switch
+// case label: family in bits 15..8, model in bits 7..0.
+#define CPUID(FAMILY, MODEL) ((((FAMILY)&0xFF) << 8) | ((MODEL)&0xFF))
+
+// Maps vendor + family/model (and stepping for 0x8E / 0x9E) to a
+// microarchitecture enum. Returns X86_UNKNOWN for unlisted combinations and
+// for vendors other than GenuineIntel / AuthenticAMD.
+X86Microarchitecture GetX86Microarchitecture(const X86Info* info) {
+  if (memcmp(info->vendor, "GenuineIntel", sizeof(info->vendor)) == 0) {
+    switch (CPUID(info->family, info->model)) {
+      case CPUID(0x06, 0x35):
+      case CPUID(0x06, 0x36):
+        // https://en.wikipedia.org/wiki/Bonnell_(microarchitecture)
+        return INTEL_ATOM_BNL;
+      case CPUID(0x06, 0x37):
+      case CPUID(0x06, 0x4C):
+        // https://en.wikipedia.org/wiki/Silvermont
+        return INTEL_ATOM_SMT;
+      case CPUID(0x06, 0x5C):
+        // https://en.wikipedia.org/wiki/Goldmont
+        return INTEL_ATOM_GMT;
+      case CPUID(0x06, 0x0F):
+      case CPUID(0x06, 0x16):
+        // https://en.wikipedia.org/wiki/Intel_Core_(microarchitecture)
+        return INTEL_CORE;
+      case CPUID(0x06, 0x17):
+      case CPUID(0x06, 0x1D):
+        // https://en.wikipedia.org/wiki/Penryn_(microarchitecture)
+        return INTEL_PNR;
+      case CPUID(0x06, 0x1A):
+      case CPUID(0x06, 0x1E):
+      case CPUID(0x06, 0x1F):
+      case CPUID(0x06, 0x2E):
+        // https://en.wikipedia.org/wiki/Nehalem_(microarchitecture)
+        return INTEL_NHM;
+      case CPUID(0x06, 0x25):
+      case CPUID(0x06, 0x2C):
+      case CPUID(0x06, 0x2F):
+        // https://en.wikipedia.org/wiki/Westmere_(microarchitecture)
+        return INTEL_WSM;
+      case CPUID(0x06, 0x2A):
+      case CPUID(0x06, 0x2D):
+        // https://en.wikipedia.org/wiki/Sandy_Bridge#Models_and_steppings
+        return INTEL_SNB;
+      case CPUID(0x06, 0x3A):
+      case CPUID(0x06, 0x3E):
+        // https://en.wikipedia.org/wiki/Ivy_Bridge_(microarchitecture)#Models_and_steppings
+        return INTEL_IVB;
+      case CPUID(0x06, 0x3C):
+      case CPUID(0x06, 0x3F):
+      case CPUID(0x06, 0x45):
+      case CPUID(0x06, 0x46):
+        // https://en.wikipedia.org/wiki/Haswell_(microarchitecture)
+        return INTEL_HSW;
+      case CPUID(0x06, 0x3D):
+      case CPUID(0x06, 0x47):
+      case CPUID(0x06, 0x4F):
+      case CPUID(0x06, 0x56):
+        // https://en.wikipedia.org/wiki/Broadwell_(microarchitecture)
+        return INTEL_BDW;
+      case CPUID(0x06, 0x4E):
+      case CPUID(0x06, 0x55):
+      case CPUID(0x06, 0x5E):
+        // https://en.wikipedia.org/wiki/Skylake_(microarchitecture)
+        return INTEL_SKL;
+      case CPUID(0x06, 0x66):
+        // https://en.wikipedia.org/wiki/Cannon_Lake_(microarchitecture)
+        return INTEL_CNL;
+      case CPUID(0x06, 0x7D):  // client
+      case CPUID(0x06, 0x7E):  // client
+      case CPUID(0x06, 0x9D):  // NNP-I
+      case CPUID(0x06, 0x6A):  // server
+      case CPUID(0x06, 0x6C):  // server
+        // https://en.wikipedia.org/wiki/Ice_Lake_(microprocessor)
+        return INTEL_ICL;
+      case CPUID(0x06, 0x8C):
+      case CPUID(0x06, 0x8D):
+        // https://en.wikipedia.org/wiki/Tiger_Lake_(microarchitecture)
+        return INTEL_TGL;
+      case CPUID(0x06, 0x8F):
+        // https://en.wikipedia.org/wiki/Sapphire_Rapids
+        return INTEL_SPR;
+      case CPUID(0x06, 0x8E):
+        // Model 0x8E is shared by several generations; the stepping
+        // disambiguates them.
+        switch (info->stepping) {
+          case 9:
+            return INTEL_KBL;  // https://en.wikipedia.org/wiki/Kaby_Lake
+          case 10:
+            return INTEL_CFL;  // https://en.wikipedia.org/wiki/Coffee_Lake
+          case 11:
+            return INTEL_WHL;  // https://en.wikipedia.org/wiki/Whiskey_Lake_(microarchitecture)
+          default:
+            return X86_UNKNOWN;
+        }
+      case CPUID(0x06, 0x9E):
+        if (info->stepping > 9) {
+          // https://en.wikipedia.org/wiki/Coffee_Lake
+          return INTEL_CFL;
+        } else {
+          // https://en.wikipedia.org/wiki/Kaby_Lake
+          return INTEL_KBL;
+        }
+      default:
+        return X86_UNKNOWN;
+    }
+  }
+  if (memcmp(info->vendor, "AuthenticAMD", sizeof(info->vendor)) == 0) {
+    // AMD parts are classified by family alone.
+    switch (info->family) {
+      // https://en.wikipedia.org/wiki/List_of_AMD_CPU_microarchitectures
+      case 0x0F:
+        return AMD_HAMMER;
+      case 0x10:
+        return AMD_K10;
+      case 0x14:
+        return AMD_BOBCAT;
+      case 0x15:
+        return AMD_BULLDOZER;
+      case 0x16:
+        return AMD_JAGUAR;
+      case 0x17:
+        return AMD_ZEN;
+      default:
+        return X86_UNKNOWN;
+    }
+  }
+  return X86_UNKNOWN;
+}
+
+// Copies the raw bytes of one CPUID leaf (sizeof(Leaf) bytes) into `buffer`.
+static void SetString(const uint32_t max_cpuid_ext_leaf, const uint32_t leaf_id,
+                      char* buffer) {
+  const Leaf leaf = SafeCpuId(max_cpuid_ext_leaf, leaf_id);
+  // We allow calling memcpy from SetString which is only called when requesting
+  // X86BrandString.
+  memcpy(buffer, &leaf, sizeof(Leaf));
+}
+
+// Writes the brand string from extended leaves 0x80000002..0x80000004 into
+// `brand_string` at offsets 0, 16 and 32, then NUL-terminates at index 48.
+void FillX86BrandString(char brand_string[49]) {
+  const Leaf leaf_ext_0 = CpuId(0x80000000);
+  const uint32_t max_cpuid_leaf_ext = leaf_ext_0.eax;
+  SetString(max_cpuid_leaf_ext, 0x80000002, brand_string);
+  SetString(max_cpuid_leaf_ext, 0x80000003, brand_string + 16);
+  SetString(max_cpuid_leaf_ext, 0x80000004, brand_string + 32);
+  brand_string[48] = '\0';
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Introspection functions
+
+// Returns the value of feature `value` via the kGetters table, or false (0)
+// when `value` is not below X86_LAST_.
+int GetX86FeaturesEnumValue(const X86Features* features,
+                            X86FeaturesEnum value) {
+  if (value >= X86_LAST_) return false;
+  return kGetters[value](features);
+}
+
+// Returns the kCpuInfoFlags entry for `value`, or "unknown_feature" when
+// `value` is not below X86_LAST_.
+const char* GetX86FeaturesEnumName(X86FeaturesEnum value) {
+  if (value >= X86_LAST_) return "unknown_feature";
+  return kCpuInfoFlags[value];
+}
+
+// Returns the enumerator name of `uarch` as a string literal, or
+// "unknown microarchitecture" for a value not listed in the switch.
+const char* GetX86MicroarchitectureName(X86Microarchitecture uarch) {
+  switch (uarch) {
+    case X86_UNKNOWN:
+      return "X86_UNKNOWN";
+    case INTEL_CORE:
+      return "INTEL_CORE";
+    case INTEL_PNR:
+      return "INTEL_PNR";
+    case INTEL_NHM:
+      return "INTEL_NHM";
+    case INTEL_ATOM_BNL:
+      return "INTEL_ATOM_BNL";
+    case INTEL_WSM:
+      return "INTEL_WSM";
+    case INTEL_SNB:
+      return "INTEL_SNB";
+    case INTEL_IVB:
+      return "INTEL_IVB";
+    case INTEL_ATOM_SMT:
+      return "INTEL_ATOM_SMT";
+    case INTEL_HSW:
+      return "INTEL_HSW";
+    case INTEL_BDW:
+      return "INTEL_BDW";
+    case INTEL_SKL:
+      return "INTEL_SKL";
+    case INTEL_ATOM_GMT:
+      return "INTEL_ATOM_GMT";
+    case INTEL_KBL:
+      return "INTEL_KBL";
+    case INTEL_CFL:
+      return "INTEL_CFL";
+    case INTEL_WHL:
+      return "INTEL_WHL";
+    case INTEL_CNL:
+      return "INTEL_CNL";
+    case INTEL_ICL:
+      return "INTEL_ICL";
+    case INTEL_TGL:
+      return "INTEL_TGL";
+    case INTEL_SPR:
+      return "INTEL_SPR";
+    case AMD_HAMMER:
+      return "AMD_HAMMER";
+    case AMD_K10:
+      return "AMD_K10";
+    case AMD_BOBCAT:
+      return "AMD_BOBCAT";
+    case AMD_BULLDOZER:
+      return "AMD_BULLDOZER";
+    case AMD_JAGUAR:
+      return "AMD_JAGUAR";
+    case AMD_ZEN:
+      return "AMD_ZEN";
+  }
+  return "unknown microarchitecture";
+}
diff --git a/cpu_features/src/define_tables.h b/cpu_features/src/define_tables.h
new file mode 100644
index 0000000..dc1485c
--- /dev/null
+++ b/cpu_features/src/define_tables.h
@@ -0,0 +1,67 @@
+// Copyright 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// The following preprocessor constants must be defined before including this
+// file:
+//  - DEFINE_TABLE_FEATURE_TYPE, the underlying type (e.g. X86Features)
+//  - DEFINE_TABLE_FEATURES, the list of FEATURE macros to be inserted.
+
+// This file is to be included once per `cpuinfo_XXX.c` in order to construct
+// feature getters and setters functions as well as several enum indexed tables
+// from the db file.
+// - `kGetters` a table of getters function pointers from feature enum to
+// retrieve a feature,
+// - `kSetters` a table of setters function pointers from feature enum to set a
+// feature,
+// - `kCpuInfoFlags` a table of strings from feature enum to /proc/cpuinfo
+// flags,
+// - `kHardwareCapabilities` a table of HardwareCapabilities structs indexed by
+// their feature enum.
+
+#ifndef SRC_DEFINE_TABLES_H_
+#define SRC_DEFINE_TABLES_H_
+
+// Each section below redefines FEATURE(...) and then expands
+// DEFINE_TABLE_FEATURES once, so a single feature list yields several tables.
+
+// kCpuInfoFlags: feature enum -> CPUINFO_FLAG string.
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2) [ENUM] = CPUINFO_FLAG,
+static const char* kCpuInfoFlags[] = {DEFINE_TABLE_FEATURES};
+#undef FEATURE
+
+// kHardwareCapabilities: feature enum -> {HWCAP, HWCAP2}; skipped when the
+// includer defines DEFINE_TABLE_DONT_GENERATE_HWCAPS.
+#ifndef DEFINE_TABLE_DONT_GENERATE_HWCAPS
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2) \
+  [ENUM] = (HardwareCapabilities){HWCAP, HWCAP2},
+static const HardwareCapabilities kHardwareCapabilities[] = {
+    DEFINE_TABLE_FEATURES};
+#undef FEATURE
+#endif  // DEFINE_TABLE_DONT_GENERATE_HWCAPS
+
+// One static set_<ENUM>/get_<ENUM> accessor pair per feature, each touching
+// the struct member NAME.
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2)                    \
+  static void set_##ENUM(DEFINE_TABLE_FEATURE_TYPE* features, bool value) { \
+    features->NAME = value;                                                 \
+  }                                                                         \
+  static int get_##ENUM(const DEFINE_TABLE_FEATURE_TYPE* features) {        \
+    return features->NAME;                                                  \
+  }
+DEFINE_TABLE_FEATURES
+#undef FEATURE
+
+// kSetters: feature enum -> set_<ENUM>.
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2) [ENUM] = set_##ENUM,
+static void (*const kSetters[])(DEFINE_TABLE_FEATURE_TYPE*,
+                                bool) = {DEFINE_TABLE_FEATURES};
+#undef FEATURE
+
+// kGetters: feature enum -> get_<ENUM>.
+#define FEATURE(ENUM, NAME, CPUINFO_FLAG, HWCAP, HWCAP2) [ENUM] = get_##ENUM,
+static int (*const kGetters[])(const DEFINE_TABLE_FEATURE_TYPE*) = {
+    DEFINE_TABLE_FEATURES};
+#undef FEATURE
+
+#endif  // SRC_DEFINE_TABLES_H_
diff --git a/cpu_features/src/filesystem.c b/cpu_features/src/filesystem.c
new file mode 100644
index 0000000..46c9906
--- /dev/null
+++ b/cpu_features/src/filesystem.c
@@ -0,0 +1,62 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal/filesystem.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#if defined(CPU_FEATURES_MOCK_FILESYSTEM)
+// Implementation will be provided by test/filesystem_for_testing.cc.
+#elif defined(_MSC_VER)
+#include <io.h>
+// Opens `filename` read-only; returns the descriptor, or -1 when _sopen_s
+// leaves `fd` untouched on failure.
+int CpuFeatures_OpenFile(const char* filename) {
+  int fd = -1;
+  _sopen_s(&fd, filename, _O_RDONLY, _SH_DENYWR, _S_IREAD);
+  return fd;
+}
+
+// Closes a descriptor returned by CpuFeatures_OpenFile.
+void CpuFeatures_CloseFile(int file_descriptor) { _close(file_descriptor); }
+
+// Reads up to `buffer_size` bytes into `buffer`; returns the result of _read.
+int CpuFeatures_ReadFile(int file_descriptor, void* buffer,
+                         size_t buffer_size) {
+  return _read(file_descriptor, buffer, (unsigned int)buffer_size);
+}
+
+#else
+#include <unistd.h>
+
+// Opens `filename` read-only, retrying while open() is interrupted (EINTR).
+// Returns the descriptor, or -1 on any other failure.
+int CpuFeatures_OpenFile(const char* filename) {
+  int result;
+  do {
+    result = open(filename, O_RDONLY);
+  } while (result == -1L && errno == EINTR);
+  return result;
+}
+
+// Closes a descriptor returned by CpuFeatures_OpenFile.
+void CpuFeatures_CloseFile(int file_descriptor) { close(file_descriptor); }
+
+// Reads up to `buffer_size` bytes into `buffer`, retrying while read() is
+// interrupted (EINTR). Returns the byte count, 0 at end of file, or -1 on
+// error.
+int CpuFeatures_ReadFile(int file_descriptor, void* buffer,
+                         size_t buffer_size) {
+  int result;
+  do {
+    result = read(file_descriptor, buffer, buffer_size);
+  } while (result == -1L && errno == EINTR);
+  return result;
+}
+
+#endif
diff --git a/cpu_features/src/hwcaps.c b/cpu_features/src/hwcaps.c
new file mode 100644
index 0000000..dd17e3b
--- /dev/null
+++ b/cpu_features/src/hwcaps.c
@@ -0,0 +1,182 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal/hwcaps.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "cpu_features_macros.h"
+#include "internal/filesystem.h"
+#include "internal/string_view.h"
+
+// True when every bit of a non-zero `mask` is set in `value`; a zero mask
+// never matches.
+static bool IsSet(const uint32_t mask, const uint32_t value) {
+  if (mask == 0) return false;
+  return (value & mask) == mask;
+}
+
+// True when `hwcaps` satisfies either the hwcaps or the hwcaps2 part of
+// `hwcaps_mask`.
+bool CpuFeatures_IsHwCapsSet(const HardwareCapabilities hwcaps_mask,
+                             const HardwareCapabilities hwcaps) {
+  return IsSet(hwcaps_mask.hwcaps, hwcaps.hwcaps) ||
+         IsSet(hwcaps_mask.hwcaps2, hwcaps.hwcaps2);
+}
+
+#ifdef CPU_FEATURES_TEST
+// In test mode, hwcaps_for_testing will define the following functions.
+HardwareCapabilities CpuFeatures_GetHardwareCapabilities(void);
+PlatformType CpuFeatures_GetPlatformType(void);
+#else
+
+// Debug facilities
+#if defined(NDEBUG)
+#define D(...)
+#else
+#include <stdio.h>
+#define D(...)           \
+  do {                   \
+    printf(__VA_ARGS__); \
+    fflush(stdout);      \
+  } while (0)
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+// Implementation of GetElfHwcapFromGetauxval
+////////////////////////////////////////////////////////////////////////////////
+
+#define AT_HWCAP 16
+#define AT_HWCAP2 26
+#define AT_PLATFORM 15
+#define AT_BASE_PLATFORM 24
+
+#if defined(HAVE_STRONG_GETAUXVAL)
+#include <sys/auxv.h>
+static unsigned long GetElfHwcapFromGetauxval(uint32_t hwcap_type) {
+  return getauxval(hwcap_type);
+}
+#elif defined(HAVE_DLFCN_H)
+// On Android we probe the system's C library for a 'getauxval' function and
+// call it if it exists, or return 0 for failure. This function is available
+// since API level 20.
+//
+// This code does *NOT* check for '__ANDROID_API__ >= 20' to support the edge
+// case where some NDK developers use headers for a platform that is newer than
+// the one really targeted by their application. This is typically done to use
+// newer native APIs only when running on more recent Android versions, and
+// requires careful symbol management.
+//
+// Note that getauxval() can't really be re-implemented here, because its
+// implementation does not parse /proc/self/auxv. Instead it depends on values
+// that are passed by the kernel at process-init time to the C runtime
+// initialization layer.
+
+#include <dlfcn.h>
+
+typedef unsigned long getauxval_func_t(unsigned long);
+
+// Returns the full-width getauxval() result. The value is kept as
+// `unsigned long` because AT_PLATFORM / AT_BASE_PLATFORM entries are pointers
+// and would be truncated by a 32-bit return type on 64-bit targets.
+static unsigned long GetElfHwcapFromGetauxval(uint32_t hwcap_type) {
+  unsigned long ret = 0;
+  void *libc_handle = NULL;
+  getauxval_func_t *func = NULL;
+
+  dlerror();  // Cleaning error state before calling dlopen.
+  libc_handle = dlopen("libc.so", RTLD_NOW);
+  if (!libc_handle) {
+    D("Could not dlopen() C library: %s\n", dlerror());
+    return 0;
+  }
+  func = (getauxval_func_t *)dlsym(libc_handle, "getauxval");
+  if (!func) {
+    D("Could not find getauxval() in C library\n");
+  } else {
+    // Note: getauxval() returns 0 on failure. Doesn't touch errno.
+    ret = (*func)(hwcap_type);
+  }
+  dlclose(libc_handle);
+  return ret;
+}
+#else
+#error "This platform does not provide hardware capabilities."
+#endif
+
+// Implementation of GetHardwareCapabilities for OS that provide
+// GetElfHwcapFromGetauxval().
+
+// Fallback when getauxval is not available, retrieves hwcaps from
+// "/proc/self/auxv". Returns 0 when the file cannot be opened or the entry is
+// not found.
+static unsigned long GetElfHwcapFromProcSelfAuxv(uint32_t hwcap_type) {
+  // Auxiliary vector records are pairs of native words, so the fields must be
+  // `unsigned long` rather than fixed 32-bit integers; otherwise 64-bit
+  // processes would misparse every record.
+  struct {
+    unsigned long tag;
+    unsigned long value;
+  } entry;
+  unsigned long result = 0;
+  const char filepath[] = "/proc/self/auxv";
+  const int fd = CpuFeatures_OpenFile(filepath);
+  if (fd < 0) {
+    D("Could not open %s\n", filepath);
+    return 0;
+  }
+  for (;;) {
+    const int ret = CpuFeatures_ReadFile(fd, (char *)&entry, sizeof entry);
+    if (ret < 0) {
+      D("Error while reading %s\n", filepath);
+      break;
+    }
+    // Detect end of list. A short read (including 0 at end of file) leaves
+    // `entry` incomplete, so it also ends the scan.
+    if (ret < (int)sizeof entry || (entry.tag == 0 && entry.value == 0)) {
+      break;
+    }
+    if (entry.tag == hwcap_type) {
+      result = entry.value;
+      break;
+    }
+  }
+  CpuFeatures_CloseFile(fd);
+  return result;
+}
+
+// Retrieves hardware capabilities by first trying to call getauxval, if not
+// available falls back to reading "/proc/self/auxv".
+static unsigned long GetHardwareCapabilitiesFor(uint32_t type) {
+  unsigned long hwcaps = GetElfHwcapFromGetauxval(type);
+  if (!hwcaps) {
+    D("Parsing /proc/self/auxv to extract ELF hwcaps!\n");
+    hwcaps = GetElfHwcapFromProcSelfAuxv(type);
+  }
+  return hwcaps;
+}
+
+// Returns the AT_HWCAP and AT_HWCAP2 values of the current process.
+HardwareCapabilities CpuFeatures_GetHardwareCapabilities(void) {
+  HardwareCapabilities capabilities;
+  capabilities.hwcaps = GetHardwareCapabilitiesFor(AT_HWCAP);
+  capabilities.hwcaps2 = GetHardwareCapabilitiesFor(AT_HWCAP2);
+  return capabilities;
+}
+
+PlatformType kEmptyPlatformType;
+
+// Returns copies of the AT_PLATFORM and AT_BASE_PLATFORM strings; a field is
+// left empty when the corresponding entry is unavailable.
+PlatformType CpuFeatures_GetPlatformType(void) {
+  PlatformType type = kEmptyPlatformType;
+  char *platform = (char *)GetHardwareCapabilitiesFor(AT_PLATFORM);
+  char *base_platform = (char *)GetHardwareCapabilitiesFor(AT_BASE_PLATFORM);
+
+  if (platform != NULL)
+    CpuFeatures_StringView_CopyString(str(platform), type.platform,
+                                      sizeof(type.platform));
+  if (base_platform != NULL)
+    CpuFeatures_StringView_CopyString(str(base_platform), type.base_platform,
+                                      sizeof(type.base_platform));
+  return type;
+}
+
+#endif  // CPU_FEATURES_TEST
diff --git a/cpu_features/src/stack_line_reader.c b/cpu_features/src/stack_line_reader.c
new file mode 100644
index 0000000..ffc778d
--- /dev/null
+++ b/cpu_features/src/stack_line_reader.c
@@ -0,0 +1,132 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal/stack_line_reader.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+
+#include "internal/filesystem.h"
+
+// Resets `reader` to an empty view over its own buffer and binds it to `fd`.
+void StackLineReader_Initialize(StackLineReader* reader, int fd) {
+  reader->view.ptr = reader->buffer;
+  reader->view.size = 0;
+  reader->skip_mode = false;
+  reader->fd = fd;
+}
+
+// Replaces the content of buffer with bytes from the file.
+// Returns the number of bytes now held in the view (0 at end of file).
+static int LoadFullBuffer(StackLineReader* reader) {
+  const int read = CpuFeatures_ReadFile(reader->fd, reader->buffer,
+                                        STACK_LINE_READER_BUFFER_SIZE);
+  assert(read >= 0);
+  // The assert disappears under NDEBUG; treat a failed read as end of file so
+  // a negative count never ends up in the view size.
+  const int loaded = read < 0 ? 0 : read;
+  reader->view.ptr = reader->buffer;
+  reader->view.size = loaded;
+  return loaded;
+}
+
+// Appends with bytes from the file to buffer, filling the remaining space.
+// Returns the number of bytes appended (0 at end of file).
+static int LoadMore(StackLineReader* reader) {
+  char* const ptr = reader->buffer + reader->view.size;
+  const size_t size_to_read = STACK_LINE_READER_BUFFER_SIZE - reader->view.size;
+  const int read = CpuFeatures_ReadFile(reader->fd, ptr, size_to_read);
+  assert(read >= 0);
+  assert(read <= (int)size_to_read);
+  // Same NDEBUG guard as in LoadFullBuffer: a read error counts as 0 bytes.
+  const int loaded = read < 0 ? 0 : read;
+  reader->view.size += loaded;
+  return loaded;
+}
+
+// Index of the first '\n' in the current view, or a negative value if none.
+static int IndexOfEol(StackLineReader* reader) {
+  return CpuFeatures_StringView_IndexOfChar(reader->view, '\n');
+}
+
+// Relocate buffer's pending bytes at the beginning of the array and fills the
+// remaining space with bytes from the file.
+static int BringToFrontAndLoadMore(StackLineReader* reader) {
+  if (reader->view.size && reader->view.ptr != reader->buffer) {
+    memmove(reader->buffer, reader->view.ptr, reader->view.size);
+  }
+  reader->view.ptr = reader->buffer;
+  return LoadMore(reader);
+}
+
+// Loads chunks of buffer size from disks until it contains a newline character
+// or end of file.
+static void SkipToNextLine(StackLineReader* reader) {
+  for (;;) {
+    const int read = LoadFullBuffer(reader);
+    if (read == 0) {
+      break;
+    } else {
+      const int eol_index = IndexOfEol(reader);
+      if (eol_index >= 0) {
+        // Drop everything up to and including the newline.
+        reader->view =
+            CpuFeatures_StringView_PopFront(reader->view, eol_index + 1);
+        break;
+      }
+    }
+  }
+}
+
+// Bundles the three fields of a LineResult.
+static LineResult CreateLineResult(bool eof, bool full_line, StringView view) {
+  LineResult result;
+  result.eof = eof;
+  result.full_line = full_line;
+  result.line = view;
+  return result;
+}
+
+// Helper methods to provide clearer semantic in StackLineReader_NextLine.
+static LineResult CreateEOFLineResult(StringView view) {
+  return CreateLineResult(true, true, view);
+}
+
+static LineResult CreateTruncatedLineResult(StringView view) {
+  return CreateLineResult(false, false, view);
+}
+
+static LineResult CreateValidLineResult(StringView view) {
+  return CreateLineResult(false, true, view);
+}
+
+// Returns the next line of the file without its trailing '\n'.
+//  - eof is set when no more bytes could be loaded; `line` then holds whatever
+//    was pending in the buffer.
+//  - full_line is false when no newline fits in the buffer; the rest of that
+//    line is skipped on the following call.
+LineResult StackLineReader_NextLine(StackLineReader* reader) {
+  if (reader->skip_mode) {
+    SkipToNextLine(reader);
+    reader->skip_mode = false;
+  }
+  {
+    const bool can_load_more =
+        reader->view.size < STACK_LINE_READER_BUFFER_SIZE;
+    int eol_index = IndexOfEol(reader);
+    if (eol_index < 0 && can_load_more) {
+      const int read = BringToFrontAndLoadMore(reader);
+      if (read == 0) {
+        return CreateEOFLineResult(reader->view);
+      }
+      eol_index = IndexOfEol(reader);
+    }
+    if (eol_index < 0) {
+      reader->skip_mode = true;
+      return CreateTruncatedLineResult(reader->view);
+    }
+    {
+      StringView line =
+          CpuFeatures_StringView_KeepFront(reader->view, eol_index);
+      reader->view =
+          CpuFeatures_StringView_PopFront(reader->view, eol_index + 1);
+      return CreateValidLineResult(line);
+    }
+  }
+}
diff --git a/cpu_features/src/string_view.c b/cpu_features/src/string_view.c
new file mode 100644
index 0000000..dc3158f
--- /dev/null
+++ b/cpu_features/src/string_view.c
@@ -0,0 +1,182 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "internal/string_view.h"
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+
+// Index of the first occurrence of `c` in `view`, or -1 when absent or when
+// the view is empty.
+int CpuFeatures_StringView_IndexOfChar(const StringView view, char c) {
+  if (view.ptr && view.size) {
+    const char* const found = (const char*)memchr(view.ptr, c, view.size);
+    if (found) {
+      return (int)(found - view.ptr);
+    }
+  }
+  return -1;
+}
+
+// Index of the first occurrence of `sub_view` in `view`, or -1 when absent or
+// when `sub_view` is empty.
+int CpuFeatures_StringView_IndexOf(const StringView view,
+                                   const StringView sub_view) {
+  if (sub_view.size) {
+    StringView remainder = view;
+    while (remainder.size >= sub_view.size) {
+      // Jump to the next candidate first character, then compare in full.
+      const int found_index =
+          CpuFeatures_StringView_IndexOfChar(remainder, sub_view.ptr[0]);
+      if (found_index < 0) break;
+      remainder = CpuFeatures_StringView_PopFront(remainder, found_index);
+      if (CpuFeatures_StringView_StartsWith(remainder, sub_view)) {
+        return (int)(remainder.ptr - view.ptr);
+      }
+      remainder = CpuFeatures_StringView_PopFront(remainder, 1);
+    }
+  }
+  return -1;
+}
+
+// True when both views have the same length and the same bytes.
+bool CpuFeatures_StringView_IsEquals(const StringView a, const StringView b) {
+  if (a.size == b.size) {
+    // Two empty views are equal; checking the size first keeps memcmp from
+    // being handed a null pointer.
+    return a.size == 0 || a.ptr == b.ptr || memcmp(a.ptr, b.ptr, b.size) == 0;
+  }
+  return false;
+}
+
+// True when non-empty `b` is a prefix of `a`.
+bool CpuFeatures_StringView_StartsWith(const StringView a, const StringView b) {
+  return a.ptr && b.ptr && b.size && a.size >= b.size
+             ? memcmp(a.ptr, b.ptr, b.size) == 0
+             : false;
+}
+
+// Drops the first `count` characters; yields the empty view when `count`
+// exceeds the size.
+StringView CpuFeatures_StringView_PopFront(const StringView str_view,
+                                           size_t count) {
+  if (count > str_view.size) {
+    return kEmptyStringView;
+  }
+  return view(str_view.ptr + count, str_view.size - count);
+}
+
+// Drops the last `count` characters; yields the empty view when `count`
+// exceeds the size.
+StringView CpuFeatures_StringView_PopBack(const StringView str_view,
+                                          size_t count) {
+  if (count > str_view.size) {
+    return kEmptyStringView;
+  }
+  return view(str_view.ptr, str_view.size - count);
+}
+
+// Keeps the first `count` characters; returns the view unchanged when `count`
+// exceeds the size.
+StringView CpuFeatures_StringView_KeepFront(const StringView str_view,
+                                            size_t count) {
+  return count <= str_view.size ? view(str_view.ptr, count) : str_view;
+}
+
+// First character of a non-empty view.
+char CpuFeatures_StringView_Front(const StringView view) {
+  assert(view.size);
+  assert(view.ptr);
+  return view.ptr[0];
+}
+
+// Last character of a non-empty view.
+char CpuFeatures_StringView_Back(const StringView view) {
+  assert(view.size);
+  return view.ptr[view.size - 1];
+}
+
+// Strips leading and trailing whitespace (as classified by isspace).
+StringView CpuFeatures_StringView_TrimWhitespace(StringView view) {
+  // isspace() requires a value representable as unsigned char (or EOF);
+  // passing a negative plain char is undefined behavior, hence the casts.
+  while (view.size &&
+         isspace((unsigned char)CpuFeatures_StringView_Front(view)))
+    view = CpuFeatures_StringView_PopFront(view, 1);
+  while (view.size && isspace((unsigned char)CpuFeatures_StringView_Back(view)))
+    view = CpuFeatures_StringView_PopBack(view, 1);
+  return view;
+}
+
+// Numeric value of a hexadecimal digit, or -1 for any other character.
+static int HexValue(const char c) {
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  return -1;
+}
+
+// Returns -1 if view contains non digits.
+static int ParsePositiveNumberWithBase(const StringView view, int base) {
+  int result = 0;
+  StringView remainder = view;
+  for (; remainder.size;
+       remainder = CpuFeatures_StringView_PopFront(remainder, 1)) {
+    const int value = HexValue(CpuFeatures_StringView_Front(remainder));
+    if (value < 0 || value >= base) return -1;
+    result = (result * base) + value;
+  }
+  return result;
+}
+
+// Parses a non-negative integer: base 16 when prefixed by "0x", base 10
+// otherwise. Returns -1 for an empty view or when a non-digit is found.
+int CpuFeatures_StringView_ParsePositiveNumber(const StringView view) {
+  if (view.size) {
+    const StringView hex_prefix = str("0x");
+    if (CpuFeatures_StringView_StartsWith(view, hex_prefix)) {
+      const StringView span_no_prefix =
+          CpuFeatures_StringView_PopFront(view, hex_prefix.size);
+      return ParsePositiveNumberWithBase(span_no_prefix, 16);
+    }
+    return ParsePositiveNumberWithBase(view, 10);
+  }
+  return -1;
+}
+
+// Copies at most `dst_size - 1` characters of `src` into `dst` and always
+// NUL-terminates. Does nothing when `dst_size` is 0.
+void CpuFeatures_StringView_CopyString(const StringView src, char* dst,
+                                       size_t dst_size) {
+  if (dst_size > 0) {
+    const size_t max_copy_size = dst_size - 1;
+    const size_t copy_size =
+        src.size > max_copy_size ? max_copy_size : src.size;
+    memcpy(dst, src.ptr, copy_size);
+    dst[copy_size] = '\0';
+  }
+}
+
+// Returns true when `word_str` appears in `line` as a whole word, i.e. bounded
+// on each side by a space or by the start/end of the line.
+bool CpuFeatures_StringView_HasWord(const StringView line,
+                                    const char* const word_str) {
+  const StringView word = str(word_str);
+  // Offset into `line` at which the next search starts. Keeping an absolute
+  // offset matters: the boundary characters must be looked up in `line`, so
+  // the match index (relative to the searched remainder) has to be rebased
+  // before use.
+  size_t search_from = 0;
+  for (;;) {
+    const StringView remainder =
+        CpuFeatures_StringView_PopFront(line, search_from);
+    const int index_in_remainder =
+        CpuFeatures_StringView_IndexOf(remainder, word);
+    if (index_in_remainder < 0) {
+      return false;
+    } else {
+      const size_t index_of_word = search_from + (size_t)index_in_remainder;
+      const StringView before =
+          CpuFeatures_StringView_KeepFront(line, index_of_word);
+      const StringView after =
+          CpuFeatures_StringView_PopFront(line, index_of_word + word.size);
+      const bool valid_before =
+          before.size == 0 || CpuFeatures_StringView_Back(before) == ' ';
+      const bool valid_after =
+          after.size == 0 || CpuFeatures_StringView_Front(after) == ' ';
+      if (valid_before && valid_after) return true;
+      // Partial match inside a longer token: resume right after it.
+      search_from = index_of_word + word.size;
+    }
+  }
+  return false;
+}
+
+// Splits "key: value" on the first ": " and stores the whitespace-trimmed
+// parts in `key` and `value`. Returns false (outputs untouched) when the
+// separator is absent.
+bool CpuFeatures_StringView_GetAttributeKeyValue(const StringView line,
+                                                 StringView* key,
+                                                 StringView* value) {
+  const StringView sep = str(": ");
+  const int index_of_separator = CpuFeatures_StringView_IndexOf(line, sep);
+  if (index_of_separator < 0) return false;
+  *value = CpuFeatures_StringView_TrimWhitespace(
+      CpuFeatures_StringView_PopFront(line, index_of_separator + sep.size));
+  *key = CpuFeatures_StringView_TrimWhitespace(
+      CpuFeatures_StringView_KeepFront(line, index_of_separator));
+  return true;
+}
diff --git a/cpu_features/src/utils/list_cpu_features.c b/cpu_features/src/utils/list_cpu_features.c
new file mode 100644
index 0000000..c80ffc5
--- /dev/null
+++ b/cpu_features/src/utils/list_cpu_features.c
@@ -0,0 +1,438 @@
+// Copyright 2017 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This program dumps current host data to the standard output.
+// Output can be text or json if the `--json` flag is passed.
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cpu_features_macros.h"
+
+#if defined(CPU_FEATURES_ARCH_X86)
+#include "cpuinfo_x86.h"
+#elif defined(CPU_FEATURES_ARCH_ARM)
+#include "cpuinfo_arm.h"
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+#include "cpuinfo_aarch64.h"
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+#include "cpuinfo_mips.h"
+#elif defined(CPU_FEATURES_ARCH_PPC)
+#include "cpuinfo_ppc.h"
+#endif
+
+// Design principles
+// -----------------
+// We build a tree structure containing all the data to be displayed.
+// Then depending on the output type (text or json) we walk the tree and display
+// the data accordingly.
+
+// We use a bump allocator to allocate strings and nodes of the tree,
+// Memory is not intended to be reclaimed.
+typedef struct {
+  char* ptr;    // Next free byte.
+  size_t size;  // Bytes still available from `ptr`.
+} BumpAllocator;
+
+char gGlobalBuffer[64 * 1024];
+BumpAllocator gBumpAllocator = {.ptr = gGlobalBuffer,
+                                .size = sizeof(gGlobalBuffer)};
+
+// Reports an unrecoverable error on stderr and terminates the process.
+static void internal_error() {
+  fputs("internal error\n", stderr);
+  exit(EXIT_FAILURE);
+}
+
+#define ALIGN 8
+
+// Aborts through internal_error() when the allocator cursor is not
+// ALIGN-byte aligned.
+static void assertAligned() {
+  if ((uintptr_t)(gBumpAllocator.ptr) % ALIGN) internal_error();
+}
+
+// Advances the allocator cursor, one byte at a time, to the next ALIGN-byte
+// boundary.
+static void BA_Align() {
+  while (gBumpAllocator.size && (uintptr_t)(gBumpAllocator.ptr) % ALIGN) {
+    --gBumpAllocator.size;
+    ++gBumpAllocator.ptr;
+  }
+  assertAligned();
+}
+
+// Update the available memory left in the BumpAllocator.
+static void* BA_Bump(size_t size) {
+  assertAligned();
+  // Align size to next 8B boundary.
+  size = (size + ALIGN - 1) / ALIGN * ALIGN;
+  if (gBumpAllocator.size < size) internal_error();
+  void* ptr = gBumpAllocator.ptr;
+  gBumpAllocator.size -= size;
+  gBumpAllocator.ptr += size;
+  return ptr;
+}
+
+// The type of the nodes in the tree.
+typedef enum {
+  NT_INVALID,
+  NT_INT,
+  NT_MAP,
+  NT_MAP_ENTRY,
+  NT_ARRAY,
+  NT_ARRAY_ELEMENT,
+  NT_STRING,
+} NodeType;
+
+// The node in the tree.
+typedef struct Node {
+  NodeType type;
+  unsigned integer;
+  const char* string;
+  struct Node* value;
+  struct Node* next;
+} Node;
+
+// Creates an initialized Node.
+static Node* BA_CreateNode(NodeType type) {
+  Node* tv = (Node*)BA_Bump(sizeof(Node));
+  assert(tv);
+  *tv = (Node){.type = type};
+  return tv;
+}
+
+// Adds an integer node.
+static Node* CreateInt(int value) {
+  Node* tv = BA_CreateNode(NT_INT);
+  tv->integer = value;
+  return tv;
+}
+
+// Adds a string node.
+// `value` must outlive the tree.
+static Node* CreateConstantString(const char* value) {
+  Node* tv = BA_CreateNode(NT_STRING);
+  tv->string = value;
+  return tv;
+}
+
+// Adds a map node.
+static Node* CreateMap(void) { return BA_CreateNode(NT_MAP); }
+
+// Adds an array node.
+static Node* CreateArray(void) { return BA_CreateNode(NT_ARRAY); }
+
+// Adds a formatted string node.
+// The text is formatted directly into the free space of the bump allocator and
+// that space is then claimed, so the string lives as long as the tree.
+static Node* CreatePrintfString(const char* format, ...) {
+  va_list arglist;
+  va_start(arglist, format);
+  char* const ptr = gBumpAllocator.ptr;
+  const int written = vsnprintf(ptr, gBumpAllocator.size, format, arglist);
+  va_end(arglist);
+  if (written < 0 || written >= (int)gBumpAllocator.size) internal_error();
+  // `written` does not count the terminating NUL. Reserve it as well,
+  // otherwise a string whose length is a multiple of ALIGN would have its
+  // terminator overwritten by the next allocation.
+  return CreateConstantString((char*)BA_Bump((size_t)written + 1));
+}
+
+// Adds a string node.
+static Node* CreateString(const char* value) {
+  return CreatePrintfString("%s", value);
+}
+
+// Adds a map entry node.
+static void AddMapEntry(Node* map, const char* key, Node* value) {
+  assert(map && map->type == NT_MAP);
+  Node* current = map;
+  while (current->next) current = current->next;
+  current->next = (Node*)BA_Bump(sizeof(Node));
+  *current->next = (Node){.type = NT_MAP_ENTRY, .string = key, .value = value};
+}
+
+// Adds an array element node.
+static void AddArrayElement(Node* array, Node* value) {
+  assert(array && array->type == NT_ARRAY);
+  Node* current = array;
+  while (current->next) current = current->next;
+  current->next = (Node*)BA_Bump(sizeof(Node));
+  *current->next = (Node){.type = NT_ARRAY_ELEMENT, .value = value};
+}
+
+// qsort() comparator ordering `const char*` elements with strcmp().
+static int cmp(const void* p1, const void* p2) {
+  return strcmp(*(const char* const*)p1, *(const char* const*)p2);
+}
+
+// Defines AddFlags(), which collects the names of all features reported as
+// present, sorts them and stores them in `map` as an array under "flags".
+#define DEFINE_ADD_FLAGS(HasFeature, FeatureName, FeatureType, LastEnum) \
+  static void AddFlags(Node* map, const FeatureType* features) { \
+    size_t i; \
+    const char* ptrs[LastEnum] = {0}; \
+    size_t count = 0; \
+    for (i = 0; i < LastEnum; ++i) { \
+      if (HasFeature(features, i)) { \
+        ptrs[count] = FeatureName(i); \
+        ++count; \
+      } \
+    } \
+    qsort((void*)ptrs, count, sizeof(char*), cmp); \
+    Node* const array = CreateArray(); \
+    for (i = 0; i < count; ++i) \
+      AddArrayElement(array, CreateConstantString(ptrs[i])); \
+    AddMapEntry(map, "flags", array); \
+  }
+
+#if defined(CPU_FEATURES_ARCH_X86)
+DEFINE_ADD_FLAGS(GetX86FeaturesEnumValue, GetX86FeaturesEnumName, X86Features,
+                 X86_LAST_)
+#elif defined(CPU_FEATURES_ARCH_ARM)
+DEFINE_ADD_FLAGS(GetArmFeaturesEnumValue, GetArmFeaturesEnumName, ArmFeatures,
+                 ARM_LAST_)
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+DEFINE_ADD_FLAGS(GetAarch64FeaturesEnumValue, GetAarch64FeaturesEnumName,
+                 Aarch64Features, AARCH64_LAST_)
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+DEFINE_ADD_FLAGS(GetMipsFeaturesEnumValue, GetMipsFeaturesEnumName,
+                 MipsFeatures, MIPS_LAST_)
+#elif defined(CPU_FEATURES_ARCH_PPC)
+DEFINE_ADD_FLAGS(GetPPCFeaturesEnumValue, GetPPCFeaturesEnumName, PPCFeatures,
+                 PPC_LAST_)
+#endif
+
+// Prints `str` as a JSON string literal, escaping characters as needed.
+// A NULL `str` is printed as the empty string.
+static void printJsonString(const char* str) {
+  putchar('"');
+  for (; str && *str; ++str) {
+    switch (*str) {
+      case '\"':
+      case '\\':
+      case '/':
+        // These are escaped by prefixing the character itself.
+        putchar('\\');
+        putchar(*str);
+        break;
+      // Control characters must not appear raw inside a JSON string; emit
+      // their two-character escape instead.
+      case '\b':
+        fputs("\\b", stdout);
+        break;
+      case '\f':
+        fputs("\\f", stdout);
+        break;
+      case '\n':
+        fputs("\\n", stdout);
+        break;
+      case '\r':
+        fputs("\\r", stdout);
+        break;
+      case '\t':
+        fputs("\\t", stdout);
+        break;
+      default:
+        if ((unsigned char)*str < 0x20) {
+          // Remaining control characters only have the \uXXXX form.
+          printf("\\u%04x", (unsigned)(unsigned char)*str);
+        } else {
+          putchar(*str);
+        }
+        break;
+    }
+  }
+  putchar('"');
+}
+
+// Walks a Node and prints it as json.
+// Map entries and array elements are chained through `next`; each one prints
+// its own value and then, comma separated, the rest of the chain.
+static void printJson(const Node* current) {
+  assert(current);
+  switch (current->type) {
+    case NT_INVALID:
+      break;
+    case NT_INT:
+      // `integer` is stored as unsigned but was created from an int.
+      printf("%d", (int)current->integer);
+      break;
+    case NT_STRING:
+      printJsonString(current->string);
+      break;
+    case NT_ARRAY:
+      putchar('[');
+      if (current->next) printJson(current->next);
+      putchar(']');
+      break;
+    case NT_MAP:
+      putchar('{');
+      if (current->next) printJson(current->next);
+      putchar('}');
+      break;
+    case NT_MAP_ENTRY:
+      printf("\"%s\":", current->string);
+      printJson(current->value);
+      if (current->next) {
+        putchar(',');
+        printJson(current->next);
+      }
+      break;
+    case NT_ARRAY_ELEMENT:
+      printJson(current->value);
+      if (current->next) {
+        putchar(',');
+        printJson(current->next);
+      }
+      break;
+  }
+}
+
+// Walks a Node and prints it as text.
+static void printTextField(const Node* current) {
+  switch (current->type) {
+    case NT_INVALID:
+      break;
+    case NT_INT:
+      // `integer` is stored as unsigned but was created from an int.
+      printf("%3d (0x%02X)", (int)current->integer, current->integer);
+      break;
+    case NT_STRING:
+      fputs(current->string, stdout);
+      break;
+    case NT_ARRAY:
+      if (current->next) printTextField(current->next);
+      break;
+    case NT_MAP:
+      // Nested maps are rendered inline using the json printer.
+      if (current->next) {
+        printf("{");
+        printJson(current->next);
+        printf("}");
+      }
+      break;
+    case NT_MAP_ENTRY:
+      printf("%-15s : ", current->string);
+      printTextField(current->value);
+      if (current->next) {
+        putchar('\n');
+        printTextField(current->next);
+      }
+      break;
+    case NT_ARRAY_ELEMENT:
+      printTextField(current->value);
+      if (current->next) {
+        putchar(',');
+        printTextField(current->next);
+      }
+      break;
+  }
+}
+
+// Prints the entries of the root map as text, one entry per line.
+static void printTextRoot(const Node* current) {
+  if (current->type == NT_MAP && current->next) printTextField(current->next);
+}
+
+// Prints the command line help for program `name`.
+static void showUsage(const char* name) {
+  printf(
+      "\n"
+      "Usage: %s [options]\n"
+      " Options:\n"
+      " -h | --help Show help message.\n"
+      " -j | --json Format output as json instead of plain text.\n"
+      "\n",
+      name);
+}
+
+// Returns a string node naming `cache_type`.
+static Node* GetCacheTypeString(CacheType cache_type) {
+  switch (cache_type) {
+    case CPU_FEATURE_CACHE_NULL:
+      return CreateConstantString("null");
+    case CPU_FEATURE_CACHE_DATA:
+      return CreateConstantString("data");
+    case CPU_FEATURE_CACHE_INSTRUCTION:
+      return CreateConstantString("instruction");
+    case CPU_FEATURE_CACHE_UNIFIED:
+      return CreateConstantString("unified");
+    case CPU_FEATURE_CACHE_TLB:
+      return CreateConstantString("tlb");
+    case CPU_FEATURE_CACHE_DTLB:
+      return CreateConstantString("dtlb");
+    case CPU_FEATURE_CACHE_STLB:
+      return CreateConstantString("stlb");
+    case CPU_FEATURE_CACHE_PREFETCH:
+      return CreateConstantString("prefetch");
+  }
+  // A value outside the cases above must still yield a node: flowing off the
+  // end of a non-void function whose result is used is undefined behavior.
+  return CreateConstantString("unknown");
+}
+
+// Adds a "cache_info" array to `root` with one map per cache level.
+static void AddCacheInfo(Node* root, const CacheInfo* cache_info) {
+  Node* array = CreateArray();
+  for (int i = 0; i < cache_info->size; ++i) {
+    CacheLevelInfo info = cache_info->levels[i];
+    Node* map = CreateMap();
+    AddMapEntry(map, "level", CreateInt(info.level));
+    AddMapEntry(map, "cache_type", GetCacheTypeString(info.cache_type));
+    AddMapEntry(map, "cache_size", CreateInt(info.cache_size));
+    AddMapEntry(map, "ways", CreateInt(info.ways));
+    AddMapEntry(map, "line_size", CreateInt(info.line_size));
+    AddMapEntry(map, "tlb_entries", CreateInt(info.tlb_entries));
+    AddMapEntry(map, "partitioning", CreateInt(info.partitioning));
+    AddArrayElement(array, map);
+  }
+  AddMapEntry(root, "cache_info", array);
+}
+
+// Builds the tree describing the host CPU for the architecture this program
+// was compiled for. On an unrecognized architecture the map is left empty.
+static Node* CreateTree(void) {
+  Node* root = CreateMap();
+#if defined(CPU_FEATURES_ARCH_X86)
+  char brand_string[49];
+  const X86Info info = GetX86Info();
+  const CacheInfo cache_info = GetX86CacheInfo();
+  FillX86BrandString(brand_string);
+  AddMapEntry(root, "arch", CreateString("x86"));
+  AddMapEntry(root, "brand", CreateString(brand_string));
+  AddMapEntry(root, "family", CreateInt(info.family));
+  AddMapEntry(root, "model", CreateInt(info.model));
+  AddMapEntry(root, "stepping", CreateInt(info.stepping));
+  AddMapEntry(root, "uarch",
+              CreateString(
+                  GetX86MicroarchitectureName(GetX86Microarchitecture(&info))));
+  AddFlags(root, &info.features);
+  AddCacheInfo(root, &cache_info);
+#elif defined(CPU_FEATURES_ARCH_ARM)
+  const ArmInfo info = GetArmInfo();
+  AddMapEntry(root, "arch", CreateString("ARM"));
+  AddMapEntry(root, "implementer", CreateInt(info.implementer));
+  AddMapEntry(root, "architecture", CreateInt(info.architecture));
+  AddMapEntry(root, "variant", CreateInt(info.variant));
+  AddMapEntry(root, "part", CreateInt(info.part));
+  AddMapEntry(root, "revision", CreateInt(info.revision));
+  AddFlags(root, &info.features);
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+  const Aarch64Info info = GetAarch64Info();
+  AddMapEntry(root, "arch", CreateString("aarch64"));
+  AddMapEntry(root, "implementer", CreateInt(info.implementer));
+  AddMapEntry(root, "variant", CreateInt(info.variant));
+  AddMapEntry(root, "part", CreateInt(info.part));
+  AddMapEntry(root, "revision", CreateInt(info.revision));
+  AddFlags(root, &info.features);
+#elif defined(CPU_FEATURES_ARCH_MIPS)
+  const MipsInfo info = GetMipsInfo();
+  AddMapEntry(root, "arch", CreateString("mips"));
+  AddFlags(root, &info.features);
+#elif defined(CPU_FEATURES_ARCH_PPC)
+  const PPCInfo info = GetPPCInfo();
+  const PPCPlatformStrings strings = GetPPCPlatformStrings();
+  AddMapEntry(root, "arch", CreateString("ppc"));
+  AddMapEntry(root, "platform", CreateString(strings.platform));
+  AddMapEntry(root, "model", CreateString(strings.model));
+  AddMapEntry(root, "machine", CreateString(strings.machine));
+  AddMapEntry(root, "cpu", CreateString(strings.cpu));
+  AddMapEntry(root, "instruction", CreateString(strings.type.platform));
+  AddMapEntry(root, "microarchitecture",
+              CreateString(strings.type.base_platform));
+  AddFlags(root, &info.features);
+#endif
+  return root;
+}
+
+// Entry point: builds the tree, then prints it as json when -j/--json is
+// given and as text otherwise. Any other argument prints the usage; only
+// -h/--help exits successfully in that case.
+int main(int argc, char** argv) {
+  BA_Align();
+  const Node* const root = CreateTree();
+  bool outputJson = false;
+  int i = 1;
+  for (; i < argc; ++i) {
+    const char* arg = argv[i];
+    if (strcmp(arg, "-j") == 0 || strcmp(arg, "--json") == 0) {
+      outputJson = true;
+    } else {
+      showUsage(argv[0]);
+      if (strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0)
+        return EXIT_SUCCESS;
+      return EXIT_FAILURE;
+    }
+  }
+  if (outputJson)
+    printJson(root);
+  else
+    printTextRoot(root);
+  putchar('\n');
+  return EXIT_SUCCESS;
+}
diff --git a/cpu_features/test/CMakeLists.txt b/cpu_features/test/CMakeLists.txt
new file mode 100644
index 0000000..c10e617
--- /dev/null
+++ b/cpu_features/test/CMakeLists.txt
@@ -0,0 +1,85 @@
+#
+# libraries for tests
+#
+
+include_directories(../include)
+add_definitions(-DCPU_FEATURES_TEST)
+
+##------------------------------------------------------------------------------
+add_library(string_view ../src/string_view.c)
+##------------------------------------------------------------------------------
+add_library(filesystem_for_testing filesystem_for_testing.cc) +target_compile_definitions(filesystem_for_testing PUBLIC CPU_FEATURES_MOCK_FILESYSTEM) +##------------------------------------------------------------------------------ +add_library(hwcaps_for_testing hwcaps_for_testing.cc) +target_link_libraries(hwcaps_for_testing filesystem_for_testing) +##------------------------------------------------------------------------------ +add_library(stack_line_reader ../src/stack_line_reader.c) +target_compile_definitions(stack_line_reader PUBLIC STACK_LINE_READER_BUFFER_SIZE=1024) +target_link_libraries(stack_line_reader string_view) +##------------------------------------------------------------------------------ +add_library(stack_line_reader_for_test ../src/stack_line_reader.c) +target_compile_definitions(stack_line_reader_for_test PUBLIC STACK_LINE_READER_BUFFER_SIZE=16) +target_link_libraries(stack_line_reader_for_test string_view filesystem_for_testing) +##------------------------------------------------------------------------------ +add_library(all_libraries ../src/hwcaps.c ../src/stack_line_reader.c) +target_link_libraries(all_libraries hwcaps_for_testing stack_line_reader string_view) + +# +# tests +# +link_libraries(gtest gmock_main) + +## bit_utils_test +add_executable(bit_utils_test bit_utils_test.cc) +target_link_libraries(bit_utils_test) +add_test(NAME bit_utils_test COMMAND bit_utils_test) +##------------------------------------------------------------------------------ +## string_view_test +add_executable(string_view_test string_view_test.cc ../src/string_view.c) +target_link_libraries(string_view_test string_view) +add_test(NAME string_view_test COMMAND string_view_test) +##------------------------------------------------------------------------------ +## stack_line_reader_test +add_executable(stack_line_reader_test stack_line_reader_test.cc) +target_link_libraries(stack_line_reader_test stack_line_reader_for_test) +add_test(NAME 
stack_line_reader_test COMMAND stack_line_reader_test) +##------------------------------------------------------------------------------ +## cpuinfo_x86_test +if(PROCESSOR_IS_X86) + add_executable(cpuinfo_x86_test cpuinfo_x86_test.cc ../src/cpuinfo_x86.c) + target_compile_definitions(cpuinfo_x86_test PUBLIC CPU_FEATURES_MOCK_CPUID_X86) + if(APPLE) + target_compile_definitions(cpuinfo_x86_test PRIVATE HAVE_SYSCTLBYNAME) + endif() + target_link_libraries(cpuinfo_x86_test all_libraries) + add_test(NAME cpuinfo_x86_test COMMAND cpuinfo_x86_test) +endif() +##------------------------------------------------------------------------------ +## cpuinfo_arm_test +if(PROCESSOR_IS_ARM) + add_executable(cpuinfo_arm_test cpuinfo_arm_test.cc ../src/cpuinfo_arm.c) + target_link_libraries(cpuinfo_arm_test all_libraries) + add_test(NAME cpuinfo_arm_test COMMAND cpuinfo_arm_test) +endif() +##------------------------------------------------------------------------------ +## cpuinfo_aarch64_test +if(PROCESSOR_IS_AARCH64) + add_executable(cpuinfo_aarch64_test cpuinfo_aarch64_test.cc ../src/cpuinfo_aarch64.c) + target_link_libraries(cpuinfo_aarch64_test all_libraries) + add_test(NAME cpuinfo_aarch64_test COMMAND cpuinfo_aarch64_test) +endif() +##------------------------------------------------------------------------------ +## cpuinfo_mips_test +if(PROCESSOR_IS_MIPS) + add_executable(cpuinfo_mips_test cpuinfo_mips_test.cc ../src/cpuinfo_mips.c) + target_link_libraries(cpuinfo_mips_test all_libraries) + add_test(NAME cpuinfo_mips_test COMMAND cpuinfo_mips_test) +endif() +##------------------------------------------------------------------------------ +## cpuinfo_ppc_test +if(PROCESSOR_IS_POWER) + add_executable(cpuinfo_ppc_test cpuinfo_ppc_test.cc ../src/cpuinfo_ppc.c) + target_link_libraries(cpuinfo_ppc_test all_libraries) + add_test(NAME cpuinfo_ppc_test COMMAND cpuinfo_ppc_test) +endif() diff --git a/cpu_features/test/bit_utils_test.cc b/cpu_features/test/bit_utils_test.cc new file mode 
100644 index 0000000..3874e13 --- /dev/null +++ b/cpu_features/test/bit_utils_test.cc @@ -0,0 +1,53 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal/bit_utils.h" + +#include "gtest/gtest.h" + +namespace cpu_features { +namespace { + +TEST(UtilsTest, IsBitSet) { + for (size_t bit_set = 0; bit_set < 32; ++bit_set) { + const uint32_t value = 1UL << bit_set; + for (uint32_t i = 0; i < 32; ++i) { + EXPECT_EQ(IsBitSet(value, i), i == bit_set); + } + } + + // testing 0, all bits should be 0. + for (uint32_t i = 0; i < 32; ++i) { + EXPECT_FALSE(IsBitSet(0, i)); + } + + // testing ~0, all bits should be 1. + for (uint32_t i = 0; i < 32; ++i) { + EXPECT_TRUE(IsBitSet(-1, i)); + } +} + +TEST(UtilsTest, ExtractBitRange) { + // Extracting all bits gives the same number. + EXPECT_EQ(ExtractBitRange(123, 31, 0), 123); + // Extracting 1 bit gives parity. 
+ EXPECT_EQ(ExtractBitRange(123, 0, 0), 1); + EXPECT_EQ(ExtractBitRange(122, 0, 0), 0); + + EXPECT_EQ(ExtractBitRange(0xF0, 7, 4), 0xF); + EXPECT_EQ(ExtractBitRange(0x42 << 2, 10, 2), 0x42); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_aarch64_test.cc b/cpu_features/test/cpuinfo_aarch64_test.cc new file mode 100644 index 0000000..5afaaa8 --- /dev/null +++ b/cpu_features/test/cpuinfo_aarch64_test.cc @@ -0,0 +1,171 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_aarch64.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "hwcaps_for_testing.h" + +namespace cpu_features { +namespace { + +void DisableHardwareCapabilities() { SetHardwareCapabilities(0, 0); } + +TEST(CpuinfoAarch64Test, FromHardwareCap) { + SetHardwareCapabilities(AARCH64_HWCAP_FP | AARCH64_HWCAP_AES, 0); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetAarch64Info(); + EXPECT_TRUE(info.features.fp); + EXPECT_FALSE(info.features.asimd); + EXPECT_FALSE(info.features.evtstrm); + EXPECT_TRUE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); + EXPECT_FALSE(info.features.atomics); + EXPECT_FALSE(info.features.fphp); + EXPECT_FALSE(info.features.asimdhp); + EXPECT_FALSE(info.features.cpuid); + EXPECT_FALSE(info.features.asimdrdm); + EXPECT_FALSE(info.features.jscvt); + EXPECT_FALSE(info.features.fcma); + EXPECT_FALSE(info.features.lrcpc); + EXPECT_FALSE(info.features.dcpop); + EXPECT_FALSE(info.features.sha3); + EXPECT_FALSE(info.features.sm3); + EXPECT_FALSE(info.features.sm4); + EXPECT_FALSE(info.features.asimddp); + EXPECT_FALSE(info.features.sha512); + EXPECT_FALSE(info.features.sve); + EXPECT_FALSE(info.features.asimdfhm); + EXPECT_FALSE(info.features.dit); + EXPECT_FALSE(info.features.uscat); + EXPECT_FALSE(info.features.ilrcpc); + EXPECT_FALSE(info.features.flagm); + EXPECT_FALSE(info.features.ssbs); + EXPECT_FALSE(info.features.sb); + EXPECT_FALSE(info.features.paca); + EXPECT_FALSE(info.features.pacg); +} + +TEST(CpuinfoAarch64Test, FromHardwareCap2) { + SetHardwareCapabilities(AARCH64_HWCAP_FP, + AARCH64_HWCAP2_SVE2 | AARCH64_HWCAP2_BTI); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetAarch64Info(); + EXPECT_TRUE(info.features.fp); + + EXPECT_TRUE(info.features.sve2); + EXPECT_TRUE(info.features.bti); + + EXPECT_FALSE(info.features.dcpodp); + 
EXPECT_FALSE(info.features.sveaes); + EXPECT_FALSE(info.features.svepmull); + EXPECT_FALSE(info.features.svebitperm); + EXPECT_FALSE(info.features.svesha3); + EXPECT_FALSE(info.features.svesm4); + EXPECT_FALSE(info.features.flagm2); + EXPECT_FALSE(info.features.frint); + EXPECT_FALSE(info.features.svei8mm); + EXPECT_FALSE(info.features.svef32mm); + EXPECT_FALSE(info.features.svef64mm); + EXPECT_FALSE(info.features.svebf16); + EXPECT_FALSE(info.features.i8mm); + EXPECT_FALSE(info.features.bf16); + EXPECT_FALSE(info.features.dgh); + EXPECT_FALSE(info.features.rng); +} + +TEST(CpuinfoAarch64Test, ARMCortexA53) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(Processor : AArch64 Processor rev 3 (aarch64) +processor : 0 +processor : 1 +processor : 2 +processor : 3 +processor : 4 +processor : 5 +processor : 6 +processor : 7 +Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 +CPU implementer : 0x41 +CPU architecture: AArch64 +CPU variant : 0x0 +CPU part : 0xd03 +CPU revision : 3)"); + const auto info = GetAarch64Info(); + EXPECT_EQ(info.implementer, 0x41); + EXPECT_EQ(info.variant, 0x0); + EXPECT_EQ(info.part, 0xd03); + EXPECT_EQ(info.revision, 3); + + EXPECT_TRUE(info.features.fp); + EXPECT_TRUE(info.features.asimd); + EXPECT_TRUE(info.features.evtstrm); + EXPECT_TRUE(info.features.aes); + EXPECT_TRUE(info.features.pmull); + EXPECT_TRUE(info.features.sha1); + EXPECT_TRUE(info.features.sha2); + EXPECT_TRUE(info.features.crc32); + + EXPECT_FALSE(info.features.atomics); + EXPECT_FALSE(info.features.fphp); + EXPECT_FALSE(info.features.asimdhp); + EXPECT_FALSE(info.features.cpuid); + EXPECT_FALSE(info.features.asimdrdm); + EXPECT_FALSE(info.features.jscvt); + EXPECT_FALSE(info.features.fcma); + EXPECT_FALSE(info.features.lrcpc); + EXPECT_FALSE(info.features.dcpop); + EXPECT_FALSE(info.features.sha3); + EXPECT_FALSE(info.features.sm3); + EXPECT_FALSE(info.features.sm4); + EXPECT_FALSE(info.features.asimddp); + 
EXPECT_FALSE(info.features.sha512); + EXPECT_FALSE(info.features.sve); + EXPECT_FALSE(info.features.asimdfhm); + EXPECT_FALSE(info.features.dit); + EXPECT_FALSE(info.features.uscat); + EXPECT_FALSE(info.features.ilrcpc); + EXPECT_FALSE(info.features.flagm); + EXPECT_FALSE(info.features.ssbs); + EXPECT_FALSE(info.features.sb); + EXPECT_FALSE(info.features.paca); + EXPECT_FALSE(info.features.pacg); + EXPECT_FALSE(info.features.dcpodp); + EXPECT_FALSE(info.features.sve2); + EXPECT_FALSE(info.features.sveaes); + EXPECT_FALSE(info.features.svepmull); + EXPECT_FALSE(info.features.svebitperm); + EXPECT_FALSE(info.features.svesha3); + EXPECT_FALSE(info.features.svesm4); + EXPECT_FALSE(info.features.flagm2); + EXPECT_FALSE(info.features.frint); + EXPECT_FALSE(info.features.svei8mm); + EXPECT_FALSE(info.features.svef32mm); + EXPECT_FALSE(info.features.svef64mm); + EXPECT_FALSE(info.features.svebf16); + EXPECT_FALSE(info.features.i8mm); + EXPECT_FALSE(info.features.bf16); + EXPECT_FALSE(info.features.dgh); + EXPECT_FALSE(info.features.rng); + EXPECT_FALSE(info.features.bti); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_arm_test.cc b/cpu_features/test/cpuinfo_arm_test.cc new file mode 100644 index 0000000..e0b08a4 --- /dev/null +++ b/cpu_features/test/cpuinfo_arm_test.cc @@ -0,0 +1,354 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_arm.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "hwcaps_for_testing.h" + +namespace cpu_features { +namespace { + +void DisableHardwareCapabilities() { SetHardwareCapabilities(0, 0); } + +TEST(CpuinfoArmTest, FromHardwareCap) { + SetHardwareCapabilities(ARM_HWCAP_NEON, ARM_HWCAP2_AES | ARM_HWCAP2_CRC32); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.vfp); // triggered by vfpv3 + EXPECT_TRUE(info.features.vfpv3); // triggered by neon + EXPECT_TRUE(info.features.neon); + EXPECT_TRUE(info.features.aes); + EXPECT_TRUE(info.features.crc32); + + EXPECT_FALSE(info.features.vfpv4); + EXPECT_FALSE(info.features.iwmmxt); + EXPECT_FALSE(info.features.crunch); + EXPECT_FALSE(info.features.thumbee); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_FALSE(info.features.idiva); + EXPECT_FALSE(info.features.idivt); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + + // check some random features with EnumValue(): + EXPECT_TRUE(GetArmFeaturesEnumValue(&info.features, ARM_VFP)); + EXPECT_FALSE(GetArmFeaturesEnumValue(&info.features, ARM_VFPV4)); + // out of bound EnumValue() check + EXPECT_FALSE(GetArmFeaturesEnumValue(&info.features, (ArmFeaturesEnum)~0x0)); +} + +TEST(CpuinfoArmTest, ODroidFromCpuInfo) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(processor : 0 +model name : ARMv7 Processor rev 3 (v71) +BogoMIPS : 120.00 +Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt vfpd32 lpae +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x2 +CPU part : 0xc0f +CPU revision : 3)"); + const auto info = GetArmInfo(); + EXPECT_EQ(info.implementer, 0x41); + EXPECT_EQ(info.variant, 0x2); + EXPECT_EQ(info.part, 0xc0f); + EXPECT_EQ(info.revision, 3); + EXPECT_EQ(info.architecture, 7); + + EXPECT_FALSE(info.features.swp); + 
EXPECT_TRUE(info.features.half); + EXPECT_TRUE(info.features.thumb); + EXPECT_FALSE(info.features._26bit); + EXPECT_TRUE(info.features.fastmult); + EXPECT_FALSE(info.features.fpa); + EXPECT_TRUE(info.features.vfp); + EXPECT_TRUE(info.features.edsp); + EXPECT_FALSE(info.features.java); + EXPECT_FALSE(info.features.iwmmxt); + EXPECT_FALSE(info.features.crunch); + EXPECT_FALSE(info.features.thumbee); + EXPECT_TRUE(info.features.neon); + EXPECT_TRUE(info.features.vfpv3); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_TRUE(info.features.tls); + EXPECT_TRUE(info.features.vfpv4); + EXPECT_TRUE(info.features.idiva); + EXPECT_TRUE(info.features.idivt); + EXPECT_TRUE(info.features.vfpd32); + EXPECT_TRUE(info.features.lpae); + EXPECT_FALSE(info.features.evtstrm); + EXPECT_FALSE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); +} + +// Linux test-case +TEST(CpuinfoArmTest, RaspberryPiZeroFromCpuInfo) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(processor : 0 +model name : ARMv6-compatible processor rev 7 (v6l) +BogoMIPS : 697.95 +Features : half thumb fastmult vfp edsp java tls +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0xb76 +CPU revision : 7 + +Hardware : BCM2835 +Revision : 9000c1 +Serial : 000000006cd946f3)"); + const auto info = GetArmInfo(); + EXPECT_EQ(info.implementer, 0x41); + EXPECT_EQ(info.variant, 0x0); + EXPECT_EQ(info.part, 0xb76); + EXPECT_EQ(info.revision, 7); + EXPECT_EQ(info.architecture, 6); + + EXPECT_FALSE(info.features.swp); + EXPECT_TRUE(info.features.half); + EXPECT_TRUE(info.features.thumb); + EXPECT_FALSE(info.features._26bit); + EXPECT_TRUE(info.features.fastmult); + EXPECT_FALSE(info.features.fpa); + EXPECT_TRUE(info.features.vfp); + EXPECT_TRUE(info.features.edsp); + EXPECT_TRUE(info.features.java); + EXPECT_FALSE(info.features.iwmmxt); + 
EXPECT_FALSE(info.features.crunch); + EXPECT_FALSE(info.features.thumbee); + EXPECT_FALSE(info.features.neon); + EXPECT_FALSE(info.features.vfpv3); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_TRUE(info.features.tls); + EXPECT_FALSE(info.features.vfpv4); + EXPECT_FALSE(info.features.idiva); + EXPECT_FALSE(info.features.idivt); + EXPECT_FALSE(info.features.vfpd32); + EXPECT_FALSE(info.features.lpae); + EXPECT_FALSE(info.features.evtstrm); + EXPECT_FALSE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); +} + +TEST(CpuinfoArmTest, MarvellArmadaFromCpuInfo) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(processor : 0 +model name : ARMv7 Processor rev 1 (v7l) +BogoMIPS : 50.00 +Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpd32 +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x4 +CPU part : 0xc09 +CPU revision : 1 + +processor : 1 +model name : ARMv7 Processor rev 1 (v7l) +BogoMIPS : 50.00 +Features : half thumb fastmult vfp edsp neon vfpv3 tls vfpd32 +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x4 +CPU part : 0xc09 +CPU revision : 1 + +Hardware : Marvell Armada 380/385 (Device Tree) +Revision : 0000 +Serial : 0000000000000000)"); + const auto info = GetArmInfo(); + EXPECT_EQ(info.implementer, 0x41); + EXPECT_EQ(info.variant, 0x4); + EXPECT_EQ(info.part, 0xc09); + EXPECT_EQ(info.revision, 1); + EXPECT_EQ(info.architecture, 7); + + EXPECT_FALSE(info.features.swp); + EXPECT_TRUE(info.features.half); + EXPECT_TRUE(info.features.thumb); + EXPECT_FALSE(info.features._26bit); + EXPECT_TRUE(info.features.fastmult); + EXPECT_FALSE(info.features.fpa); + EXPECT_TRUE(info.features.vfp); + EXPECT_TRUE(info.features.edsp); + EXPECT_FALSE(info.features.java); + EXPECT_FALSE(info.features.iwmmxt); + EXPECT_FALSE(info.features.crunch); + 
EXPECT_FALSE(info.features.thumbee); + EXPECT_TRUE(info.features.neon); + EXPECT_TRUE(info.features.vfpv3); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_TRUE(info.features.tls); + EXPECT_FALSE(info.features.vfpv4); + EXPECT_FALSE(info.features.idiva); + EXPECT_FALSE(info.features.idivt); + EXPECT_TRUE(info.features.vfpd32); + EXPECT_FALSE(info.features.lpae); + EXPECT_FALSE(info.features.evtstrm); + EXPECT_FALSE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); +} + +// Android test-case +// http://code.google.com/p/android/issues/detail?id=10812 +TEST(CpuinfoArmTest, InvalidArmv7) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(Processor : ARMv6-compatible processor rev 6 (v6l) +BogoMIPS : 199.47 +Features : swp half thumb fastmult vfp edsp java +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0xb76 +CPU revision : 6 + +Hardware : SPICA +Revision : 0020 +Serial : 33323613546d00ec )"); + const auto info = GetArmInfo(); + EXPECT_EQ(info.architecture, 6); + + EXPECT_TRUE(info.features.swp); + EXPECT_TRUE(info.features.half); + EXPECT_TRUE(info.features.thumb); + EXPECT_FALSE(info.features._26bit); + EXPECT_TRUE(info.features.fastmult); + EXPECT_FALSE(info.features.fpa); + EXPECT_TRUE(info.features.vfp); + EXPECT_TRUE(info.features.edsp); + EXPECT_TRUE(info.features.java); + EXPECT_FALSE(info.features.iwmmxt); + EXPECT_FALSE(info.features.crunch); + EXPECT_FALSE(info.features.thumbee); + EXPECT_FALSE(info.features.neon); + EXPECT_FALSE(info.features.vfpv3); + EXPECT_FALSE(info.features.vfpv3d16); + EXPECT_FALSE(info.features.tls); + EXPECT_FALSE(info.features.vfpv4); + EXPECT_FALSE(info.features.idiva); + EXPECT_FALSE(info.features.idivt); + EXPECT_FALSE(info.features.vfpd32); + EXPECT_FALSE(info.features.lpae); + EXPECT_FALSE(info.features.evtstrm); + 
EXPECT_FALSE(info.features.aes); + EXPECT_FALSE(info.features.pmull); + EXPECT_FALSE(info.features.sha1); + EXPECT_FALSE(info.features.sha2); + EXPECT_FALSE(info.features.crc32); +} + +// Android test-case +// https://crbug.com/341598. +TEST(CpuinfoArmTest, InvalidNeon) { + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(Processor: ARMv7 Processory rev 0 (v71) +processor: 0 +BogoMIPS: 13.50 + +Processor: 1 +BogoMIPS: 13.50 + +Features: swp half thumb fastmult vfp edsp neon vfpv3 tls vfpv4 idiva idivt +CPU implementer : 0x51 +CPU architecture: 7 +CPU variant: 0x1 +CPU part: 0x04d +CPU revision: 0 + +Hardware: SAMSUNG M2 +Revision: 0010 +Serial: 00001e030000354e)"); + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.swp); + EXPECT_FALSE(info.features.neon); +} + +// The Nexus 4 (Qualcomm Krait) kernel configuration forgets to report IDIV +// support. +TEST(CpuinfoArmTest, Nexus4_0x510006f2) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(CPU implementer : 0x51 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0x6f +CPU revision : 2)"); + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.idiva); + EXPECT_TRUE(info.features.idivt); + + EXPECT_EQ(GetArmCpuId(&info), 0x510006f2); +} + +// The Nexus 4 (Qualcomm Krait) kernel configuration forgets to report IDIV +// support. +TEST(CpuinfoArmTest, Nexus4_0x510006f3) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(CPU implementer : 0x51 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0x6f +CPU revision : 3)"); + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.idiva); + EXPECT_TRUE(info.features.idivt); + + EXPECT_EQ(GetArmCpuId(&info), 0x510006f3); +} + +// The emulator-specific Android 4.2 kernel fails to report support for the +// 32-bit ARM IDIV instruction. Technically, this is a feature of the virtual +// CPU implemented by the emulator. 
+TEST(CpuinfoArmTest, EmulatorSpecificIdiv) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(Processor : ARMv7 Processor rev 0 (v7l) +BogoMIPS : 629.14 +Features : swp half thumb fastmult vfp edsp neon vfpv3 +CPU implementer : 0x41 +CPU architecture: 7 +CPU variant : 0x0 +CPU part : 0xc08 +CPU revision : 0 + +Hardware : Goldfish +Revision : 0000 +Serial : 0000000000000000)"); + const auto info = GetArmInfo(); + EXPECT_TRUE(info.features.idiva); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_mips_test.cc b/cpu_features/test/cpuinfo_mips_test.cc new file mode 100644 index 0000000..d734058 --- /dev/null +++ b/cpu_features/test/cpuinfo_mips_test.cc @@ -0,0 +1,126 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_mips.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "hwcaps_for_testing.h" +#include "internal/stack_line_reader.h" +#include "internal/string_view.h" + +namespace cpu_features { + +namespace { + +void DisableHardwareCapabilities() { SetHardwareCapabilities(0, 0); } + +TEST(CpuinfoMipsTest, FromHardwareCapBoth) { + SetHardwareCapabilities(MIPS_HWCAP_MSA | MIPS_HWCAP_R6, 0); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetMipsInfo(); + EXPECT_TRUE(info.features.msa); + EXPECT_FALSE(info.features.eva); + EXPECT_TRUE(info.features.r6); +} + +TEST(CpuinfoMipsTest, FromHardwareCapOnlyOne) { + SetHardwareCapabilities(MIPS_HWCAP_MSA, 0); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetMipsInfo(); + EXPECT_TRUE(info.features.msa); + EXPECT_FALSE(info.features.eva); +} + +TEST(CpuinfoMipsTest, Ci40) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(system type : IMG Pistachio SoC (B0) +machine : IMG Marduk – Ci40 with cc2520 +processor : 0 +cpu model : MIPS interAptiv (multi) V2.0 FPU V0.0 +BogoMIPS : 363.72 +wait instruction : yes +microsecond timers : yes +tlb_entries : 64 +extra interrupt vector : yes +hardware watchpoint : yes, count: 4, address/irw mask: [0x0ffc, 0x0ffc, 0x0ffb, 0x0ffb] +isa : mips1 mips2 mips32r1 mips32r2 +ASEs implemented : mips16 dsp mt eva +shadow register sets : 1 +kscratch registers : 0 +package : 0 +core : 0 +VCED exceptions : not available +VCEI exceptions : not available +VPE : 0 +)"); + const auto info = GetMipsInfo(); + EXPECT_FALSE(info.features.msa); + EXPECT_TRUE(info.features.eva); +} + +TEST(CpuinfoMipsTest, AR7161) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(system type : Atheros AR7161 rev 2 +machine : NETGEAR WNDR3700/WNDR3800/WNDRMAC +processor : 0 +cpu model : MIPS 24Kc V7.4 +BogoMIPS : 452.19 +wait instruction 
: yes +microsecond timers : yes +tlb_entries : 16 +extra interrupt vector : yes +hardware watchpoint : yes, count: 4, address/irw mask: [0x0000, 0x0f98, 0x0f78, 0x0df8] +ASEs implemented : mips16 +shadow register sets : 1 +kscratch registers : 0 +core : 0 +VCED exceptions : not available +VCEI exceptions : not available +)"); + const auto info = GetMipsInfo(); + EXPECT_FALSE(info.features.msa); + EXPECT_FALSE(info.features.eva); +} + +TEST(CpuinfoMipsTest, Goldfish) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(system type : MIPS-Goldfish +Hardware : goldfish +Revison : 1 +processor : 0 +cpu model : MIPS 24Kc V0.0 FPU V0.0 +BogoMIPS : 1042.02 +wait instruction : yes +microsecond timers : yes +tlb_entries : 16 +extra interrupt vector : yes +hardware watchpoint : yes, count: 1, address/irw mask: [0x0ff8] +ASEs implemented : +shadow register sets : 1 +core : 0 +VCED exceptions : not available +VCEI exceptions : not available +)"); + const auto info = GetMipsInfo(); + EXPECT_FALSE(info.features.msa); + EXPECT_FALSE(info.features.eva); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_ppc_test.cc b/cpu_features/test/cpuinfo_ppc_test.cc new file mode 100644 index 0000000..8f0cb65 --- /dev/null +++ b/cpu_features/test/cpuinfo_ppc_test.cc @@ -0,0 +1,119 @@ +// Copyright 2018 IBM. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_ppc.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "hwcaps_for_testing.h" +#include "internal/string_view.h" + +namespace cpu_features { +namespace { + +void DisableHardwareCapabilities() { SetHardwareCapabilities(0, 0); } + +TEST(CpustringsPPCTest, FromHardwareCap) { + SetHardwareCapabilities(PPC_FEATURE_HAS_FPU | PPC_FEATURE_HAS_VSX, + PPC_FEATURE2_ARCH_3_00); + GetEmptyFilesystem(); // disabling /proc/cpuinfo + const auto info = GetPPCInfo(); + EXPECT_TRUE(info.features.fpu); + EXPECT_FALSE(info.features.mmu); + EXPECT_TRUE(info.features.vsx); + EXPECT_TRUE(info.features.arch300); + EXPECT_FALSE(info.features.power4); + EXPECT_FALSE(info.features.altivec); + EXPECT_FALSE(info.features.vcrypto); + EXPECT_FALSE(info.features.htm); +} + +TEST(CpustringsPPCTest, Blade) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(processor : 14 +cpu : POWER7 (architected), altivec supported +clock : 3000.000000MHz +revision : 2.1 (pvr 003f 0201) + +processor : 15 +cpu : POWER7 (architected), altivec supported +clock : 3000.000000MHz +revision : 2.1 (pvr 003f 0201) + +timebase : 512000000 +platform : pSeries +model : IBM,8406-70Y +machine : CHRP IBM,8406-70Y)"); + SetPlatformTypes("power7", "power8"); + const auto strings = GetPPCPlatformStrings(); + ASSERT_STREQ(strings.platform, "pSeries"); + ASSERT_STREQ(strings.model, "IBM,8406-70Y"); + ASSERT_STREQ(strings.machine, "CHRP IBM,8406-70Y"); + ASSERT_STREQ(strings.cpu, "POWER7 (architected), altivec supported"); + ASSERT_STREQ(strings.type.platform, "power7"); + ASSERT_STREQ(strings.type.base_platform, "power8"); +} + +TEST(CpustringsPPCTest, Firestone) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(processor : 126 +cpu : POWER8 (raw), altivec supported +clock : 2061.000000MHz +revision : 2.0 (pvr 004d 0200) + +processor : 127 +cpu : POWER8 (raw), altivec 
supported +clock : 2061.000000MHz +revision : 2.0 (pvr 004d 0200) + +timebase : 512000000 +platform : PowerNV +model : 8335-GTA +machine : PowerNV 8335-GTA +firmware : OPAL v3)"); + const auto strings = GetPPCPlatformStrings(); + ASSERT_STREQ(strings.platform, "PowerNV"); + ASSERT_STREQ(strings.model, "8335-GTA"); + ASSERT_STREQ(strings.machine, "PowerNV 8335-GTA"); + ASSERT_STREQ(strings.cpu, "POWER8 (raw), altivec supported"); +} + +TEST(CpustringsPPCTest, w8) { + DisableHardwareCapabilities(); + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", + R"(processor : 143 +cpu : POWER9, altivec supported +clock : 2300.000000MHz +revision : 2.2 (pvr 004e 1202) + +timebase : 512000000 +platform : PowerNV +model : 0000000000000000 +machine : PowerNV 0000000000000000 +firmware : OPAL +MMU : Radix)"); + const auto strings = GetPPCPlatformStrings(); + ASSERT_STREQ(strings.platform, "PowerNV"); + ASSERT_STREQ(strings.model, "0000000000000000"); + ASSERT_STREQ(strings.machine, "PowerNV 0000000000000000"); + ASSERT_STREQ(strings.cpu, "POWER9, altivec supported"); +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/cpuinfo_x86_test.cc b/cpu_features/test/cpuinfo_x86_test.cc new file mode 100644 index 0000000..636d0f9 --- /dev/null +++ b/cpu_features/test/cpuinfo_x86_test.cc @@ -0,0 +1,533 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "cpuinfo_x86.h" + +#include <map> +#include <set> +#include <string> +#include <utility> +#if defined(CPU_FEATURES_OS_WINDOWS) +#include <windows.h> // IsProcessorFeaturePresent +#endif // CPU_FEATURES_OS_WINDOWS + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" +#include "internal/cpuid_x86.h" + +namespace cpu_features { + +class FakeCpu { // Test double serving canned CPUID leaves, XCR0 and OS feature flags. + public: + Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) const { + const auto itr = cpuid_leaves_.find(std::make_pair(leaf_id, ecx)); + if (itr != cpuid_leaves_.end()) { + return itr->second; + } + return {0, 0, 0, 0}; // Unconfigured (leaf, ecx) pairs read back as all zeros. + } + + uint32_t GetXCR0Eax() const { return xcr0_eax_; } + + void SetLeaves(std::map<std::pair<uint32_t, int>, Leaf> configuration) { + cpuid_leaves_ = std::move(configuration); + } + + void SetOsBackupsExtendedRegisters(bool os_backups_extended_registers) { + xcr0_eax_ = os_backups_extended_registers ? -1 : 0; + } + +#if defined(CPU_FEATURES_OS_DARWIN) + bool GetDarwinSysCtlByName(std::string name) const { + return darwin_sysctlbyname_.count(name); + } + + void SetDarwinSysCtlByName(std::string name) { + darwin_sysctlbyname_.insert(name); + } +#endif // CPU_FEATURES_OS_DARWIN + +#if defined(CPU_FEATURES_OS_WINDOWS) + bool GetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) { + return windows_isprocessorfeaturepresent_.count(ProcessorFeature); + } + + void SetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) { + windows_isprocessorfeaturepresent_.insert(ProcessorFeature); + } +#endif // CPU_FEATURES_OS_WINDOWS + + private: + std::map<std::pair<uint32_t, int>, Leaf> cpuid_leaves_; +#if defined(CPU_FEATURES_OS_DARWIN) + std::set<std::string> darwin_sysctlbyname_; +#endif // CPU_FEATURES_OS_DARWIN +#if defined(CPU_FEATURES_OS_WINDOWS) + std::set<DWORD> windows_isprocessorfeaturepresent_; +#endif // CPU_FEATURES_OS_WINDOWS + uint32_t xcr0_eax_ = 0; // Same value SetOsBackupsExtendedRegisters(false) stores. +}; + +FakeCpu* g_fake_cpu = nullptr; + +extern "C" Leaf GetCpuidLeaf(uint32_t leaf_id, int ecx) { + return g_fake_cpu->GetCpuidLeaf(leaf_id, ecx); +} + +extern "C" uint32_t GetXCR0Eax(void) { return g_fake_cpu->GetXCR0Eax(); } + +#if 
defined(CPU_FEATURES_OS_DARWIN) +extern "C" bool GetDarwinSysCtlByName(const char* name) { + return g_fake_cpu->GetDarwinSysCtlByName(name); +} +#endif // CPU_FEATURES_OS_DARWIN + +#if defined(CPU_FEATURES_OS_WINDOWS) +extern "C" bool GetWindowsIsProcessorFeaturePresent(DWORD ProcessorFeature) { + return g_fake_cpu->GetWindowsIsProcessorFeaturePresent(ProcessorFeature); +} +#endif // CPU_FEATURES_OS_WINDOWS + +namespace { + +class CpuidX86Test : public ::testing::Test { + protected: + void SetUp() override { g_fake_cpu = new FakeCpu(); } + void TearDown() override { delete g_fake_cpu; } +}; + +TEST_F(CpuidX86Test, SandyBridge) { + g_fake_cpu->SetOsBackupsExtendedRegisters(true); + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000D, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000206A6, 0x00100800, 0x1F9AE3BF, 0xBFEBFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + }); + const auto info = GetX86Info(); + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x02A); + EXPECT_EQ(info.stepping, 0x06); + // Leaf 7 is zeroed out so none of the Leaf 7 flags are set. + const auto features = info.features; + EXPECT_FALSE(features.erms); + EXPECT_FALSE(features.avx2); + EXPECT_FALSE(features.avx512f); + EXPECT_FALSE(features.avx512cd); + EXPECT_FALSE(features.avx512er); + EXPECT_FALSE(features.avx512pf); + EXPECT_FALSE(features.avx512bw); + EXPECT_FALSE(features.avx512dq); + EXPECT_FALSE(features.avx512vl); + EXPECT_FALSE(features.avx512ifma); + EXPECT_FALSE(features.avx512vbmi); + EXPECT_FALSE(features.avx512vbmi2); + EXPECT_FALSE(features.avx512vnni); + EXPECT_FALSE(features.avx512bitalg); + EXPECT_FALSE(features.avx512vpopcntdq); + EXPECT_FALSE(features.avx512_4vnniw); + EXPECT_FALSE(features.avx512_4fmaps); + // All old cpu features should be set. 
+ EXPECT_TRUE(features.aes); + EXPECT_TRUE(features.ssse3); + EXPECT_TRUE(features.sse4_1); + EXPECT_TRUE(features.sse4_2); + EXPECT_TRUE(features.avx); + EXPECT_FALSE(features.sha); + EXPECT_TRUE(features.popcnt); + EXPECT_FALSE(features.movbe); + EXPECT_FALSE(features.rdrnd); +} + +const int KiB = 1024; +const int MiB = 1024 * KiB; + +TEST_F(CpuidX86Test, SandyBridgeTestOsSupport) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000D, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000206A6, 0x00100800, 0x1F9AE3BF, 0xBFEBFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + }); + // avx is disabled if os does not support backing up ymm registers. + g_fake_cpu->SetOsBackupsExtendedRegisters(false); + EXPECT_FALSE(GetX86Info().features.avx); + // avx is enabled once the os supports backing up ymm registers. + g_fake_cpu->SetOsBackupsExtendedRegisters(true); + EXPECT_TRUE(GetX86Info().features.avx); +} + +TEST_F(CpuidX86Test, SkyLake) { + g_fake_cpu->SetOsBackupsExtendedRegisters(true); + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000016, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000406E3, 0x00100800, 0x7FFAFBBF, 0xBFEBFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x029C67AF, 0x00000000, 0x00000000}}, + }); + const auto info = GetX86Info(); + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x04E); + EXPECT_EQ(info.stepping, 0x03); + EXPECT_EQ(GetX86Microarchitecture(&info), X86Microarchitecture::INTEL_SKL); +} + +TEST_F(CpuidX86Test, Branding) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000016, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000406E3, 0x00100800, 0x7FFAFBBF, 0xBFEBFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x029C67AF, 0x00000000, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000121, 
0x2C100000}}, + {{0x80000002, 0}, Leaf{0x65746E49, 0x2952286C, 0x726F4320, 0x4D542865}}, + {{0x80000003, 0}, Leaf{0x37692029, 0x3035362D, 0x43205530, 0x40205550}}, + {{0x80000004, 0}, Leaf{0x352E3220, 0x7A484730, 0x00000000, 0x00000000}}, + }); + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, "Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz"); +} + +TEST_F(CpuidX86Test, KabyLakeCache) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000016, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000406E3, 0x00100800, 0x7FFAFBBF, 0xBFEBFBFF}}, + {{0x00000004, 0}, Leaf{0x1C004121, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 1}, Leaf{0x1C004122, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 2}, Leaf{0x1C004143, 0x00C0003F, 0x000003FF, 0x00000000}}, + {{0x00000004, 3}, Leaf{0x1C03C163, 0x02C0003F, 0x00001FFF, 0x00000002}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x029C67AF, 0x00000000, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000121, 0x2C100000}}, + {{0x80000002, 0}, Leaf{0x65746E49, 0x2952286C, 0x726F4320, 0x4D542865}}, + {{0x80000003, 0}, Leaf{0x37692029, 0x3035362D, 0x43205530, 0x40205550}}, + }); + const auto info = GetX86CacheInfo(); + EXPECT_EQ(info.size, 4); + EXPECT_EQ(info.levels[0].level, 1); + EXPECT_EQ(info.levels[0].cache_type, 1); + EXPECT_EQ(info.levels[0].cache_size, 32 * KiB); + EXPECT_EQ(info.levels[0].ways, 8); + EXPECT_EQ(info.levels[0].line_size, 64); + EXPECT_EQ(info.levels[0].tlb_entries, 64); + EXPECT_EQ(info.levels[0].partitioning, 1); + + EXPECT_EQ(info.levels[1].level, 1); + EXPECT_EQ(info.levels[1].cache_type, 2); + EXPECT_EQ(info.levels[1].cache_size, 32 * KiB); + EXPECT_EQ(info.levels[1].ways, 8); + EXPECT_EQ(info.levels[1].line_size, 64); + EXPECT_EQ(info.levels[1].tlb_entries, 64); + EXPECT_EQ(info.levels[1].partitioning, 1); + + EXPECT_EQ(info.levels[2].level, 2); + 
EXPECT_EQ(info.levels[2].cache_type, 3); + EXPECT_EQ(info.levels[2].cache_size, 256 * KiB); + EXPECT_EQ(info.levels[2].ways, 4); + EXPECT_EQ(info.levels[2].line_size, 64); + EXPECT_EQ(info.levels[2].tlb_entries, 1024); + EXPECT_EQ(info.levels[2].partitioning, 1); + + EXPECT_EQ(info.levels[3].level, 3); + EXPECT_EQ(info.levels[3].cache_type, 3); + EXPECT_EQ(info.levels[3].cache_size, 6 * MiB); + EXPECT_EQ(info.levels[3].ways, 12); + EXPECT_EQ(info.levels[3].line_size, 64); + EXPECT_EQ(info.levels[3].tlb_entries, 8192); + EXPECT_EQ(info.levels[3].partitioning, 1); +} + +TEST_F(CpuidX86Test, HSWCache) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000016, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000406E3, 0x00100800, 0x7FFAFBBF, 0xBFEBFBFF}}, + {{0x00000004, 0}, Leaf{0x1C004121, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 1}, Leaf{0x1C004122, 0x01C0003F, 0x0000003F, 0x00000000}}, + {{0x00000004, 2}, Leaf{0x1C004143, 0x01C0003F, 0x000001FF, 0x00000000}}, + {{0x00000004, 3}, Leaf{0x1C03C163, 0x02C0003F, 0x00001FFF, 0x00000006}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x029C67AF, 0x00000000, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000121, 0x2C100000}}, + {{0x80000002, 0}, Leaf{0x65746E49, 0x2952286C, 0x726F4320, 0x4D542865}}, + {{0x80000003, 0}, Leaf{0x37692029, 0x3035362D, 0x43205530, 0x40205550}}, + }); + const auto info = GetX86CacheInfo(); + EXPECT_EQ(info.size, 4); + EXPECT_EQ(info.levels[0].level, 1); + EXPECT_EQ(info.levels[0].cache_type, 1); + EXPECT_EQ(info.levels[0].cache_size, 32 * KiB); + EXPECT_EQ(info.levels[0].ways, 8); + EXPECT_EQ(info.levels[0].line_size, 64); + EXPECT_EQ(info.levels[0].tlb_entries, 64); + EXPECT_EQ(info.levels[0].partitioning, 1); + + EXPECT_EQ(info.levels[1].level, 1); + EXPECT_EQ(info.levels[1].cache_type, 2); + EXPECT_EQ(info.levels[1].cache_size, 32 * KiB); + EXPECT_EQ(info.levels[1].ways, 
8); + EXPECT_EQ(info.levels[1].line_size, 64); + EXPECT_EQ(info.levels[1].tlb_entries, 64); + EXPECT_EQ(info.levels[1].partitioning, 1); + + EXPECT_EQ(info.levels[2].level, 2); + EXPECT_EQ(info.levels[2].cache_type, 3); + EXPECT_EQ(info.levels[2].cache_size, 256 * KiB); + EXPECT_EQ(info.levels[2].ways, 8); + EXPECT_EQ(info.levels[2].line_size, 64); + EXPECT_EQ(info.levels[2].tlb_entries, 512); + EXPECT_EQ(info.levels[2].partitioning, 1); + + EXPECT_EQ(info.levels[3].level, 3); + EXPECT_EQ(info.levels[3].cache_type, 3); + EXPECT_EQ(info.levels[3].cache_size, 6 * MiB); + EXPECT_EQ(info.levels[3].ways, 12); + EXPECT_EQ(info.levels[3].line_size, 64); + EXPECT_EQ(info.levels[3].tlb_entries, 8192); + EXPECT_EQ(info.levels[3].partitioning, 1); +} + +// http://users.atw.hu/instlatx64/AuthenticAMD0630F81_K15_Godavari_CPUID.txt +TEST_F(CpuidX86Test, AMD_K15) { + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000D, 0x68747541, 0x444D4163, 0x69746E65}}, + {{0x00000001, 0}, Leaf{0x00630F81, 0x00040800, 0x3E98320B, 0x178BFBFF}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x8000001E, 0x68747541, 0x444D4163, 0x69746E65}}, + {{0x80000001, 0}, Leaf{0x00630F81, 0x10000000, 0x0FEBBFFF, 0x2FD3FBFF}}, + {{0x80000002, 0}, Leaf{0x20444D41, 0x372D3841, 0x4B303736, 0x64615220}}, + {{0x80000003, 0}, Leaf{0x206E6F65, 0x202C3752, 0x43203031, 0x75706D6F}}, + {{0x80000004, 0}, Leaf{0x43206574, 0x7365726F, 0x2B433420, 0x00204736}}, + {{0x80000005, 0}, Leaf{0xFF40FF18, 0xFF40FF30, 0x10040140, 0x60030140}}, + }); + const auto info = GetX86Info(); + + EXPECT_STREQ(info.vendor, "AuthenticAMD"); + EXPECT_EQ(info.family, 0x15); + EXPECT_EQ(info.model, 0x38); + EXPECT_EQ(info.stepping, 0x01); + EXPECT_EQ(GetX86Microarchitecture(&info), + X86Microarchitecture::AMD_BULLDOZER); + + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, "AMD A8-7670K Radeon R7, 10 Compute Cores 4C+6G "); +} + +// 
https://github.com/InstLatx64/InstLatx64/blob/master/GenuineIntel/GenuineIntel00106A1_Nehalem_CPUID.txt +TEST_F(CpuidX86Test, Nehalem) { + // Pre AVX cpus don't have xsave + g_fake_cpu->SetOsBackupsExtendedRegisters(false); +#if defined(CPU_FEATURES_OS_WINDOWS) + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI_INSTRUCTIONS_AVAILABLE); + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI64_INSTRUCTIONS_AVAILABLE); + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_SSE3_INSTRUCTIONS_AVAILABLE); +#endif // CPU_FEATURES_OS_WINDOWS +#if defined(CPU_FEATURES_OS_DARWIN) + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse2"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse3"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.supplementalsse3"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse4_1"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse4_2"); +#endif // CPU_FEATURES_OS_DARWIN +#if defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"(processor : +flags : fpu mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 +)"); +#endif // CPU_FEATURES_OS_LINUX_OR_ANDROID + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000B, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x000106A2, 0x00100800, 0x00BCE3BD, 0xBFEBFBFF}}, + {{0x00000002, 0}, Leaf{0x55035A01, 0x00F0B0E3, 0x00000000, 0x09CA212C}}, + {{0x00000003, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000004, 0}, Leaf{0x1C004121, 0x01C0003F, 0x0000003F, 0x00000000}}, // Keys are (leaf, ecx): one entry per subleaf. + {{0x00000004, 1}, Leaf{0x1C004122, 0x00C0003F, 0x0000007F, 0x00000000}}, + {{0x00000004, 2}, Leaf{0x1C004143, 0x01C0003F, 0x000001FF, 0x00000000}}, + {{0x00000004, 3}, Leaf{0x1C03C163, 0x03C0003F, 0x00000FFF, 0x00000002}}, + {{0x00000005, 0}, Leaf{0x00000040, 0x00000040, 0x00000003, 0x00021120}}, + {{0x00000006, 0}, Leaf{0x00000001, 0x00000002, 0x00000001, 0x00000000}}, + 
{{0x00000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000008, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000009, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x0000000A, 0}, Leaf{0x07300403, 0x00000000, 0x00000000, 0x00000603}}, + {{0x0000000B, 0}, Leaf{0x00000001, 0x00000001, 0x00000100, 0x00000000}}, + {{0x0000000B, 1}, Leaf{0x00000004, 0x00000002, 0x00000201, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000001, 0x28100000}}, + {{0x80000002, 0}, Leaf{0x756E6547, 0x20656E69, 0x65746E49, 0x2952286C}}, + {{0x80000003, 0}, Leaf{0x55504320, 0x20202020, 0x20202020, 0x40202020}}, + {{0x80000004, 0}, Leaf{0x30303020, 0x20402030, 0x37382E31, 0x007A4847}}, + {{0x80000005, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000006, 0}, Leaf{0x00000000, 0x00000000, 0x01006040, 0x00000000}}, + {{0x80000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000100}}, + {{0x80000008, 0}, Leaf{0x00003028, 0x00000000, 0x00000000, 0x00000000}}, + }); + const auto info = GetX86Info(); + + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x1A); + EXPECT_EQ(info.stepping, 0x02); + EXPECT_EQ(GetX86Microarchitecture(&info), X86Microarchitecture::INTEL_NHM); + + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, "Genuine Intel(R) CPU @ 0000 @ 1.87GHz"); + + EXPECT_TRUE(info.features.sse); + EXPECT_TRUE(info.features.sse2); + EXPECT_TRUE(info.features.sse3); +#ifndef CPU_FEATURES_OS_WINDOWS + // Currently disabled on Windows as IsProcessorFeaturePresent does not support + // feature detection > sse3. 
+ EXPECT_TRUE(info.features.ssse3); + EXPECT_TRUE(info.features.sse4_1); + EXPECT_TRUE(info.features.sse4_2); +#endif // CPU_FEATURES_OS_WINDOWS +} + +// https://github.com/InstLatx64/InstLatx64/blob/master/GenuineIntel/GenuineIntel0030673_Silvermont3_CPUID.txt +TEST_F(CpuidX86Test, Atom) { + // Pre AVX cpus don't have xsave + g_fake_cpu->SetOsBackupsExtendedRegisters(false); +#if defined(CPU_FEATURES_OS_WINDOWS) + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI_INSTRUCTIONS_AVAILABLE); + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI64_INSTRUCTIONS_AVAILABLE); + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_SSE3_INSTRUCTIONS_AVAILABLE); +#endif // CPU_FEATURES_OS_WINDOWS +#if defined(CPU_FEATURES_OS_DARWIN) + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse2"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse3"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.supplementalsse3"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse4_1"); + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse4_2"); +#endif // CPU_FEATURES_OS_DARWIN +#if defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"( +flags : fpu mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 +)"); +#endif // CPU_FEATURES_OS_LINUX_OR_ANDROID + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x0000000B, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x00030673, 0x00100800, 0x41D8E3BF, 0xBFEBFBFF}}, + {{0x00000002, 0}, Leaf{0x61B3A001, 0x0000FFC2, 0x00000000, 0x00000000}}, + {{0x00000003, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000004, 0}, Leaf{0x1C000121, 0x0140003F, 0x0000003F, 0x00000001}}, + {{0x00000004, 1}, Leaf{0x1C000122, 0x01C0003F, 0x0000003F, 0x00000001}}, + {{0x00000004, 2}, Leaf{0x1C00C143, 0x03C0003F, 0x000003FF, 0x00000001}}, + {{0x00000005, 0}, Leaf{0x00000040, 0x00000040, 0x00000003, 0x33000020}}, + 
{{0x00000006, 0}, Leaf{0x00000005, 0x00000002, 0x00000009, 0x00000000}}, + {{0x00000007, 0}, Leaf{0x00000000, 0x00002282, 0x00000000, 0x00000000}}, + {{0x00000008, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x00000009, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x0000000A, 0}, Leaf{0x07280203, 0x00000000, 0x00000000, 0x00004503}}, + {{0x0000000B, 0}, Leaf{0x00000001, 0x00000001, 0x00000100, 0x00000000}}, + {{0x0000000B, 1}, Leaf{0x00000004, 0x00000004, 0x00000201, 0x00000000}}, + {{0x80000000, 0}, Leaf{0x80000008, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000001, 0}, Leaf{0x00000000, 0x00000000, 0x00000101, 0x28100000}}, + {{0x80000002, 0}, Leaf{0x20202020, 0x6E492020, 0x286C6574, 0x43202952}}, + {{0x80000003, 0}, Leaf{0x72656C65, 0x52286E6F, 0x50432029, 0x4A202055}}, + {{0x80000004, 0}, Leaf{0x30303931, 0x20402020, 0x39392E31, 0x007A4847}}, + {{0x80000005, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000000}}, + {{0x80000006, 0}, Leaf{0x00000000, 0x00000000, 0x04008040, 0x00000000}}, + {{0x80000007, 0}, Leaf{0x00000000, 0x00000000, 0x00000000, 0x00000100}}, + {{0x80000008, 0}, Leaf{0x00003024, 0x00000000, 0x00000000, 0x00000000}}, + }); + const auto info = GetX86Info(); + + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x37); + EXPECT_EQ(info.stepping, 0x03); + EXPECT_EQ(GetX86Microarchitecture(&info), + X86Microarchitecture::INTEL_ATOM_SMT); + + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, " Intel(R) Celeron(R) CPU J1900 @ 1.99GHz"); + + EXPECT_TRUE(info.features.sse); + EXPECT_TRUE(info.features.sse2); + EXPECT_TRUE(info.features.sse3); +#ifndef CPU_FEATURES_OS_WINDOWS + // Currently disabled on Windows as IsProcessorFeaturePresent do not support + // feature detection > sse3. 
+ EXPECT_TRUE(info.features.ssse3); + EXPECT_TRUE(info.features.sse4_1); + EXPECT_TRUE(info.features.sse4_2); +#endif // CPU_FEATURES_OS_WINDOWS +} + +// https://github.com/InstLatx64/InstLatx64/blob/master/GenuineIntel/GenuineIntel0000673_P3_KatmaiDP_CPUID.txt +TEST_F(CpuidX86Test, P3) { + // Pre AVX cpus don't have xsave + g_fake_cpu->SetOsBackupsExtendedRegisters(false); +#if defined(CPU_FEATURES_OS_WINDOWS) + g_fake_cpu->SetWindowsIsProcessorFeaturePresent( + PF_XMMI_INSTRUCTIONS_AVAILABLE); +#endif // CPU_FEATURES_OS_WINDOWS +#if defined(CPU_FEATURES_OS_DARWIN) + g_fake_cpu->SetDarwinSysCtlByName("hw.optional.sse"); +#endif // CPU_FEATURES_OS_DARWIN +#if defined(CPU_FEATURES_OS_LINUX_OR_ANDROID) + auto& fs = GetEmptyFilesystem(); + fs.CreateFile("/proc/cpuinfo", R"( +flags : fpu mmx sse +)"); +#endif // CPU_FEATURES_OS_LINUX_OR_ANDROID + g_fake_cpu->SetLeaves({ + {{0x00000000, 0}, Leaf{0x00000003, 0x756E6547, 0x6C65746E, 0x49656E69}}, + {{0x00000001, 0}, Leaf{0x00000673, 0x00000000, 0x00000000, 0x0387FBFF}}, + {{0x00000002, 0}, Leaf{0x03020101, 0x00000000, 0x00000000, 0x0C040843}}, + {{0x00000003, 0}, Leaf{0x00000000, 0x00000000, 0x4CECC782, 0x00006778}}, + }); + const auto info = GetX86Info(); + + EXPECT_STREQ(info.vendor, "GenuineIntel"); + EXPECT_EQ(info.family, 0x06); + EXPECT_EQ(info.model, 0x07); + EXPECT_EQ(info.stepping, 0x03); + EXPECT_EQ(GetX86Microarchitecture(&info), X86Microarchitecture::X86_UNKNOWN); + + char brand_string[49]; + FillX86BrandString(brand_string); + EXPECT_STREQ(brand_string, ""); + + EXPECT_TRUE(info.features.mmx); + EXPECT_TRUE(info.features.sse); + EXPECT_FALSE(info.features.sse2); + EXPECT_FALSE(info.features.sse3); +#ifndef CPU_FEATURES_OS_WINDOWS + // Currently disabled on Windows as IsProcessorFeaturePresent do not support + // feature detection > sse3. 
+ EXPECT_FALSE(info.features.ssse3); + EXPECT_FALSE(info.features.sse4_1); + EXPECT_FALSE(info.features.sse4_2); +#endif // CPU_FEATURES_OS_WINDOWS +} + +// TODO(user): test what happens when xsave/osxsave are not present. +// TODO(user): test what happens when xmm/ymm/zmm os support are not +// present. + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/filesystem_for_testing.cc b/cpu_features/test/filesystem_for_testing.cc new file mode 100644 index 0000000..648a53e --- /dev/null +++ b/cpu_features/test/filesystem_for_testing.cc @@ -0,0 +1,103 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "filesystem_for_testing.h" + +#include <cassert> +#include <climits> +#include <cstring> +#include <memory> +#include <string> + +namespace cpu_features { + +FakeFile::FakeFile(int file_descriptor, const char* content) + : file_descriptor_(file_descriptor), content_(content) {} + +FakeFile::~FakeFile() { assert(!opened_); } // A file must be closed before it is destroyed. + +void FakeFile::Open() { + assert(!opened_); + opened_ = true; +} + +void FakeFile::Close() { + assert(opened_); + opened_ = false; +} + +int FakeFile::Read(int fd, void* buf, size_t count) { + assert(count < INT_MAX); + assert(fd == file_descriptor_); + const size_t remainder = content_.size() - head_index_; + const size_t read = count > remainder ? 
remainder : count; + memcpy(buf, content_.data() + head_index_, read); + head_index_ += read; + assert(read < INT_MAX); + return (int)read; +} + +void FakeFilesystem::Reset() { files_.clear(); } + +FakeFile* FakeFilesystem::CreateFile(const std::string& filename, + const char* content) { + auto& file = files_[filename]; + file = + std::unique_ptr(new FakeFile(next_file_descriptor_++, content)); + return file.get(); +} + +FakeFile* FakeFilesystem::FindFileOrNull(const std::string& filename) const { + const auto itr = files_.find(filename); + return itr == files_.end() ? nullptr : itr->second.get(); +} + +FakeFile* FakeFilesystem::FindFileOrDie(const int file_descriptor) const { + for (const auto& filename_file_pair : files_) { + FakeFile* const file_ptr = filename_file_pair.second.get(); + if (file_ptr->GetFileDescriptor() == file_descriptor) { + return file_ptr; + } + } + assert(false); + return nullptr; +} + +static FakeFilesystem* kFilesystem = new FakeFilesystem(); + +FakeFilesystem& GetEmptyFilesystem() { + kFilesystem->Reset(); + return *kFilesystem; +} + +extern "C" int CpuFeatures_OpenFile(const char* filename) { + auto* const file = kFilesystem->FindFileOrNull(filename); + if (file) { + file->Open(); + return file->GetFileDescriptor(); + } + return -1; +} + +extern "C" void CpuFeatures_CloseFile(int file_descriptor) { + kFilesystem->FindFileOrDie(file_descriptor)->Close(); +} + +extern "C" int CpuFeatures_ReadFile(int file_descriptor, void* buffer, + size_t buffer_size) { + return kFilesystem->FindFileOrDie(file_descriptor) + ->Read(file_descriptor, buffer, buffer_size); +} + +} // namespace cpu_features diff --git a/cpu_features/test/filesystem_for_testing.h b/cpu_features/test/filesystem_for_testing.h new file mode 100644 index 0000000..ef717fd --- /dev/null +++ b/cpu_features/test/filesystem_for_testing.h @@ -0,0 +1,61 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Implements a fake filesystem, useful for tests. +#ifndef CPU_FEATURES_TEST_FILESYSTEM_FOR_TESTING_H_ +#define CPU_FEATURES_TEST_FILESYSTEM_FOR_TESTING_H_ + +#include +#include +#include + +#include "internal/filesystem.h" + +namespace cpu_features { + +class FakeFile { + public: + explicit FakeFile(int file_descriptor, const char* content); + ~FakeFile(); + + void Open(); + void Close(); + int Read(int fd, void* buf, size_t count); + + int GetFileDescriptor() const { return file_descriptor_; } + + private: + const int file_descriptor_; + const std::string content_; + bool opened_ = false; + size_t head_index_ = 0; +}; + +class FakeFilesystem { + public: + void Reset(); + FakeFile* CreateFile(const std::string& filename, const char* content); + FakeFile* FindFileOrDie(const int file_descriptor) const; + FakeFile* FindFileOrNull(const std::string& filename) const; + + private: + int next_file_descriptor_ = 0; + std::unordered_map> files_; +}; + +FakeFilesystem& GetEmptyFilesystem(); + +} // namespace cpu_features + +#endif // CPU_FEATURES_TEST_FILESYSTEM_FOR_TESTING_H_ diff --git a/cpu_features/test/hwcaps_for_testing.cc b/cpu_features/test/hwcaps_for_testing.cc new file mode 100644 index 0000000..a8086a0 --- /dev/null +++ b/cpu_features/test/hwcaps_for_testing.cc @@ -0,0 +1,46 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "hwcaps_for_testing.h" + +#include + +#include "internal/string_view.h" + +namespace cpu_features { + +namespace { +static auto* const g_hardware_capabilities = new HardwareCapabilities(); +static auto* const g_platform_types = new PlatformType(); +} // namespace + +void SetHardwareCapabilities(uint32_t hwcaps, uint32_t hwcaps2) { + g_hardware_capabilities->hwcaps = hwcaps; + g_hardware_capabilities->hwcaps2 = hwcaps2; +} + +HardwareCapabilities CpuFeatures_GetHardwareCapabilities(void) { + return *g_hardware_capabilities; +} + +void SetPlatformTypes(const char* platform, const char* base_platform) { + CpuFeatures_StringView_CopyString(str(platform), g_platform_types->platform, + sizeof(g_platform_types->platform)); + CpuFeatures_StringView_CopyString(str(base_platform), + g_platform_types->base_platform, + sizeof(g_platform_types->base_platform)); +} + +PlatformType CpuFeatures_GetPlatformType(void) { return *g_platform_types; } +} // namespace cpu_features diff --git a/cpu_features/test/hwcaps_for_testing.h b/cpu_features/test/hwcaps_for_testing.h new file mode 100644 index 0000000..bcab82e --- /dev/null +++ b/cpu_features/test/hwcaps_for_testing.h @@ -0,0 +1,27 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef CPU_FEATURES_TEST_HWCAPS_FOR_TESTING_H_ +#define CPU_FEATURES_TEST_HWCAPS_FOR_TESTING_H_ + +#include "internal/hwcaps.h" + +namespace cpu_features { + +void SetHardwareCapabilities(uint32_t hwcaps, uint32_t hwcaps2); +void SetPlatformTypes(const char *platform, const char *base_platform); + +} // namespace cpu_features + +#endif // CPU_FEATURES_TEST_HWCAPS_FOR_TESTING_H_ diff --git a/cpu_features/test/stack_line_reader_test.cc b/cpu_features/test/stack_line_reader_test.cc new file mode 100644 index 0000000..9ac5388 --- /dev/null +++ b/cpu_features/test/stack_line_reader_test.cc @@ -0,0 +1,132 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "internal/stack_line_reader.h" + +#include "filesystem_for_testing.h" +#include "gtest/gtest.h" + +namespace cpu_features { + +bool operator==(const StringView& a, const StringView& b) { + return CpuFeatures_StringView_IsEquals(a, b); +} + +namespace { + +std::string ToString(StringView view) { return {view.ptr, view.size}; } + +TEST(StackLineReaderTest, Empty) { + auto& fs = GetEmptyFilesystem(); + auto* file = fs.CreateFile("/proc/cpuinfo", ""); + StackLineReader reader; + StackLineReader_Initialize(&reader, file->GetFileDescriptor()); + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_TRUE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("")); + } +} + +TEST(StackLineReaderTest, ManySmallLines) { + auto& fs = GetEmptyFilesystem(); + auto* file = fs.CreateFile("/proc/cpuinfo", "a\nb\nc"); + + StackLineReader reader; + StackLineReader_Initialize(&reader, file->GetFileDescriptor()); + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("a")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("b")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_TRUE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("c")); + } +} + +TEST(StackLineReaderTest, TruncatedLine) { + auto& fs = GetEmptyFilesystem(); + auto* file = fs.CreateFile("/proc/cpuinfo", R"(First +Second +More than 16 characters, this will be truncated. 
+last)"); + + StackLineReader reader; + StackLineReader_Initialize(&reader, file->GetFileDescriptor()); + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("First")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("Second")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_FALSE(result.full_line); + EXPECT_EQ(result.line, str("More than 16 cha")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_TRUE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("last")); + } +} + +TEST(StackLineReaderTest, TruncatedLines) { + auto& fs = GetEmptyFilesystem(); + auto* file = fs.CreateFile("/proc/cpuinfo", R"(More than 16 characters +Another line that is too long)"); + + StackLineReader reader; + StackLineReader_Initialize(&reader, file->GetFileDescriptor()); + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_FALSE(result.full_line); + EXPECT_EQ(result.line, str("More than 16 cha")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_FALSE(result.eof); + EXPECT_FALSE(result.full_line); + EXPECT_EQ(result.line, str("Another line tha")); + } + { + const auto result = StackLineReader_NextLine(&reader); + EXPECT_TRUE(result.eof); + EXPECT_TRUE(result.full_line); + EXPECT_EQ(result.line, str("")); + } +} + +} // namespace +} // namespace cpu_features diff --git a/cpu_features/test/string_view_test.cc b/cpu_features/test/string_view_test.cc new file mode 100644 index 0000000..ca3e023 --- /dev/null +++ b/cpu_features/test/string_view_test.cc @@ -0,0 +1,192 @@ +// Copyright 2017 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance 
with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal/string_view.h" + +#include "gtest/gtest.h" + +namespace cpu_features { + +bool operator==(const StringView& a, const StringView& b) { + return CpuFeatures_StringView_IsEquals(a, b); +} + +namespace { + +TEST(StringViewTest, Empty) { + EXPECT_EQ(kEmptyStringView.ptr, nullptr); + EXPECT_EQ(kEmptyStringView.size, 0); +} + +TEST(StringViewTest, Build) { + const auto view = str("test"); + EXPECT_EQ(view.ptr[0], 't'); + EXPECT_EQ(view.size, 4); +} + +TEST(StringViewTest, CpuFeatures_StringView_IndexOfChar) { + // Found. + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(str("test"), 'e'), 1); + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(str("test"), 't'), 0); + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(str("beef"), 'e'), 1); + // Not found. + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(str("test"), 'z'), -1); + // Empty. + EXPECT_EQ(CpuFeatures_StringView_IndexOfChar(kEmptyStringView, 'z'), -1); +} + +TEST(StringViewTest, CpuFeatures_StringView_IndexOf) { + // Found. + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("test"), str("es")), 1); + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("test"), str("test")), 0); + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("tesstest"), str("test")), 4); + // Not found. + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("test"), str("aa")), -1); + // Empty. 
+ EXPECT_EQ(CpuFeatures_StringView_IndexOf(kEmptyStringView, str("aa")), -1); + EXPECT_EQ(CpuFeatures_StringView_IndexOf(str("aa"), kEmptyStringView), -1); +} + +TEST(StringViewTest, CpuFeatures_StringView_StartsWith) { + EXPECT_TRUE(CpuFeatures_StringView_StartsWith(str("test"), str("te"))); + EXPECT_TRUE(CpuFeatures_StringView_StartsWith(str("test"), str("test"))); + EXPECT_FALSE(CpuFeatures_StringView_StartsWith(str("test"), str("st"))); + EXPECT_FALSE(CpuFeatures_StringView_StartsWith(str("test"), str("est"))); + EXPECT_FALSE(CpuFeatures_StringView_StartsWith(str("test"), str(""))); + EXPECT_FALSE( + CpuFeatures_StringView_StartsWith(str("test"), kEmptyStringView)); + EXPECT_FALSE( + CpuFeatures_StringView_StartsWith(kEmptyStringView, str("test"))); +} + +TEST(StringViewTest, CpuFeatures_StringView_IsEquals) { + EXPECT_TRUE( + CpuFeatures_StringView_IsEquals(kEmptyStringView, kEmptyStringView)); + EXPECT_TRUE(CpuFeatures_StringView_IsEquals(kEmptyStringView, str(""))); + EXPECT_TRUE(CpuFeatures_StringView_IsEquals(str(""), kEmptyStringView)); + EXPECT_TRUE(CpuFeatures_StringView_IsEquals(str("test"), str("test"))); + EXPECT_TRUE(CpuFeatures_StringView_IsEquals(str("a"), str("a"))); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(str("a"), str("b"))); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(str("aa"), str("a"))); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(str("a"), str("aa"))); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(str("a"), kEmptyStringView)); + EXPECT_FALSE(CpuFeatures_StringView_IsEquals(kEmptyStringView, str("a"))); +} + +TEST(StringViewTest, CpuFeatures_StringView_PopFront) { + EXPECT_EQ(CpuFeatures_StringView_PopFront(str("test"), 2), str("st")); + EXPECT_EQ(CpuFeatures_StringView_PopFront(str("test"), 0), str("test")); + EXPECT_EQ(CpuFeatures_StringView_PopFront(str("test"), 4), str("")); + EXPECT_EQ(CpuFeatures_StringView_PopFront(str("test"), 100), str("")); +} + +TEST(StringViewTest, CpuFeatures_StringView_PopBack) { + 
EXPECT_EQ(CpuFeatures_StringView_PopBack(str("test"), 2), str("te")); + EXPECT_EQ(CpuFeatures_StringView_PopBack(str("test"), 0), str("test")); + EXPECT_EQ(CpuFeatures_StringView_PopBack(str("test"), 4), str("")); + EXPECT_EQ(CpuFeatures_StringView_PopBack(str("test"), 100), str("")); +} + +TEST(StringViewTest, CpuFeatures_StringView_KeepFront) { + EXPECT_EQ(CpuFeatures_StringView_KeepFront(str("test"), 2), str("te")); + EXPECT_EQ(CpuFeatures_StringView_KeepFront(str("test"), 0), str("")); + EXPECT_EQ(CpuFeatures_StringView_KeepFront(str("test"), 4), str("test")); + EXPECT_EQ(CpuFeatures_StringView_KeepFront(str("test"), 6), str("test")); +} + +TEST(StringViewTest, CpuFeatures_StringView_Front) { + EXPECT_EQ(CpuFeatures_StringView_Front(str("apple")), 'a'); + EXPECT_EQ(CpuFeatures_StringView_Front(str("a")), 'a'); +} + +TEST(StringViewTest, CpuFeatures_StringView_Back) { + EXPECT_EQ(CpuFeatures_StringView_Back(str("apple")), 'e'); + EXPECT_EQ(CpuFeatures_StringView_Back(str("a")), 'a'); +} + +TEST(StringViewTest, CpuFeatures_StringView_TrimWhitespace) { + EXPECT_EQ(CpuFeatures_StringView_TrimWhitespace(str(" first middle last ")), + str("first middle last")); + EXPECT_EQ(CpuFeatures_StringView_TrimWhitespace(str("first middle last ")), + str("first middle last")); + EXPECT_EQ(CpuFeatures_StringView_TrimWhitespace(str(" first middle last")), + str("first middle last")); + EXPECT_EQ(CpuFeatures_StringView_TrimWhitespace(str("first middle last")), + str("first middle last")); +} + +TEST(StringViewTest, CpuFeatures_StringView_ParsePositiveNumber) { + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("42")), 42); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("0x2a")), 42); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("0x2A")), 42); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("0x2A2a")), 10794); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("0x2a2A")), 10794); + + 
EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("-10")), -1); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("-0x2A")), -1); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("abc")), -1); + EXPECT_EQ(CpuFeatures_StringView_ParsePositiveNumber(str("")), -1); +} + +TEST(StringViewTest, CpuFeatures_StringView_CopyString) { + char buf[4]; + buf[0] = 'X'; + + // Empty + CpuFeatures_StringView_CopyString(str(""), buf, sizeof(buf)); + EXPECT_STREQ(buf, ""); + + // Less + CpuFeatures_StringView_CopyString(str("a"), buf, sizeof(buf)); + EXPECT_STREQ(buf, "a"); + + // exact + CpuFeatures_StringView_CopyString(str("abc"), buf, sizeof(buf)); + EXPECT_STREQ(buf, "abc"); + + // More + CpuFeatures_StringView_CopyString(str("abcd"), buf, sizeof(buf)); + EXPECT_STREQ(buf, "abc"); +} + +TEST(StringViewTest, CpuFeatures_StringView_HasWord) { + // Find flags at beginning, middle and end. + EXPECT_TRUE( + CpuFeatures_StringView_HasWord(str("first middle last"), "first")); + EXPECT_TRUE( + CpuFeatures_StringView_HasWord(str("first middle last"), "middle")); + EXPECT_TRUE(CpuFeatures_StringView_HasWord(str("first middle last"), "last")); + // Do not match partial flags + EXPECT_FALSE( + CpuFeatures_StringView_HasWord(str("first middle last"), "irst")); + EXPECT_FALSE(CpuFeatures_StringView_HasWord(str("first middle last"), "mid")); + EXPECT_FALSE(CpuFeatures_StringView_HasWord(str("first middle last"), "las")); +} + +TEST(StringViewTest, CpuFeatures_StringView_GetAttributeKeyValue) { + const StringView line = str(" key : first middle last "); + StringView key, value; + EXPECT_TRUE(CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)); + EXPECT_EQ(key, str("key")); + EXPECT_EQ(value, str("first middle last")); +} + +TEST(StringViewTest, FailingGetAttributeKeyValue) { + const StringView line = str("key first middle last"); + StringView key, value; + EXPECT_FALSE(CpuFeatures_StringView_GetAttributeKeyValue(line, &key, &value)); +} + +} // namespace 
+} // namespace cpu_features diff --git a/debian-jessie/rules b/debian-jessie/rules index ee18e9e..f03be8a 100755 --- a/debian-jessie/rules +++ b/debian-jessie/rules @@ -14,11 +14,6 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/default.mk -ifeq ($(DEB_HOST_ARCH),armhf) - # Assume a Pi-like target, where using an 8-bit table is a fairly big win over the float path - CPPFLAGS += -DSC16Q11_TABLE_BITS=8 -endif - override_dh_auto_build: dh_auto_build -- RTLSDR=yes BLADERF=yes HACKRF=no LIMESDR=no DUMP1090_VERSION=$(DEB_VERSION) diff --git a/debian-stretch/rules b/debian-stretch/rules index ee18e9e..f03be8a 100755 --- a/debian-stretch/rules +++ b/debian-stretch/rules @@ -14,11 +14,6 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/default.mk -ifeq ($(DEB_HOST_ARCH),armhf) - # Assume a Pi-like target, where using an 8-bit table is a fairly big win over the float path - CPPFLAGS += -DSC16Q11_TABLE_BITS=8 -endif - override_dh_auto_build: dh_auto_build -- RTLSDR=yes BLADERF=yes HACKRF=no LIMESDR=no DUMP1090_VERSION=$(DEB_VERSION) diff --git a/debian/dump1090-fa.default b/debian/dump1090-fa.default index 3e4bb73..12c3e31 100644 --- a/debian/dump1090-fa.default +++ b/debian/dump1090-fa.default @@ -13,3 +13,9 @@ RECEIVER_OPTIONS="--device-index 0 --gain -10 --ppm 0" DECODER_OPTIONS="--max-range 360 --fix" NET_OPTIONS="--net --net-heartbeat 60 --net-ro-size 1300 --net-ro-interval 0.2 --net-ri-port 0 --net-ro-port 30002 --net-sbs-port 30003 --net-bi-port 30004,30104 --net-bo-port 30005" JSON_OPTIONS="--json-location-accuracy 1" + +# Use a machine-specific wisdom file if it exists +if [ -f /etc/dump1090-fa/wisdom.local ] +then + RECEIVER_OPTIONS="${RECEIVER_OPTIONS} --wisdom /etc/dump1090-fa/wisdom.local" +fi diff --git a/debian/dump1090-fa.install b/debian/dump1090-fa.install index 67292d1..220826c 100644 --- a/debian/dump1090-fa.install +++ b/debian/dump1090-fa.install @@ 
-2,3 +2,5 @@ public_html/* usr/share/dump1090-fa/html debian/lighttpd/* etc/lighttpd/conf-available bladerf/* usr/share/dump1090-fa/bladerf debian/start-dump1090-fa usr/share/dump1090-fa/ +debian/generate-wisdom usr/share/dump1090-fa/ +starch-benchmark /usr/lib/dump1090-fa/ diff --git a/debian/generate-wisdom b/debian/generate-wisdom new file mode 100755 index 0000000..dc98d72 --- /dev/null +++ b/debian/generate-wisdom @@ -0,0 +1,20 @@ +#!/bin/sh -e + +# This script generates a machine-specific wisdom file for dump1090-fa +# (containing information about which DSP implementations are fastest) + +WORKDIR=$(mktemp -t -d wisdom.XXXXXX) + +echo "Benchmarking .. this will take a while." >&2 + +# generate initial wisdom so that twopass implementations have something to work with +echo "First pass: generating $WORKDIR/wisdom.initial" >&2 +/usr/lib/dump1090-fa/starch-benchmark -i 5 -o $WORKDIR/wisdom.initial magnitude_uc8 magnitude_uc8_aligned mean_power_u16 mean_power_u16_aligned + +# generate the real wisdom +echo "Second pass: generating $WORKDIR/wisdom.local" >&2 +/usr/lib/dump1090-fa/starch-benchmark -i 5 -r $WORKDIR/wisdom.initial -o $WORKDIR/wisdom.local + +echo "Wisdom written to $WORKDIR/wisdom.local" >&2 +echo "Copy this file to /etc/dump1090-fa/wisdom.local" >&2 +echo "(and restart dump1090-fa) to start using it." 
>&2 diff --git a/debian/rules b/debian/rules index 93f94f8..5aa76c1 100755 --- a/debian/rules +++ b/debian/rules @@ -14,11 +14,6 @@ export DEB_BUILD_MAINT_OPTIONS = hardening=+all DPKG_EXPORT_BUILDFLAGS = 1 include /usr/share/dpkg/default.mk -ifeq ($(DEB_HOST_ARCH),armhf) - # Assume a Pi-like target, where using an 8-bit table is a fairly big win over the float path - CPPFLAGS += -DSC16Q11_TABLE_BITS=8 -endif - ifeq (,$(filter custom,$(DEB_BUILD_PROFILES))) # Standard build RTLSDR = yes diff --git a/dsp-types.h b/dsp-types.h new file mode 100644 index 0000000..6eaaa8d --- /dev/null +++ b/dsp-types.h @@ -0,0 +1,21 @@ +#ifndef DUMP1090_DSP_TYPES_H +#define DUMP1090_DSP_TYPES_H + +#include + +typedef struct { + uint8_t I; + uint8_t Q; +} __attribute__((packed)) uc8_t; + +typedef union { + uc8_t uc8; + uint16_t u16; +} uc8_u16_t; + +typedef struct { + int16_t I; + int16_t Q; +} __attribute__((packed)) sc16_t; + +#endif diff --git a/dsp/benchmark/magnitude_power_uc8_benchmark.c b/dsp/benchmark/magnitude_power_uc8_benchmark.c new file mode 100644 index 0000000..1c1c105 --- /dev/null +++ b/dsp/benchmark/magnitude_power_uc8_benchmark.c @@ -0,0 +1,102 @@ +#include +#include + +void STARCH_BENCHMARK(magnitude_power_uc8) (void) +{ + uc8_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 65536; + double out_level, out_power; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, *out_mag))) { + goto done; + } + + unsigned i = 0; + + // 0.9 magnitude, varying phase + double degrees = 0; + for (; i < len && degrees < 360; i += 1, degrees += 1) { + in[i].I = (uint8_t) (0.9 * cos(degrees * M_PI / 180.0) * 128 + 127.4); + in[i].Q = (uint8_t) (0.9 * sin(degrees * M_PI / 180.0) * 128 + 127.4); + } + + // 0, 45, 90 degree phase, full input range + unsigned sequence = 0; + for (; (i+3) <= len && sequence < 256; i += 3, sequence += 1) { + in[i + 0].I = sequence; + in[i + 0].Q = 0; + + in[i + 1].I = sequence; + in[i + 1].Q = sequence; + 
+ in[i + 2].I = 0; + in[i + 2].Q = sequence; + } + + // Fill the rest with random values + srand(1); + for (; i < len; ++i) { + in[i].I = rand() % 256; + in[i].Q = rand() % 256; + } + + STARCH_BENCHMARK_RUN( magnitude_power_uc8, in, out_mag, len, &out_level, &out_power ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +bool STARCH_BENCHMARK_VERIFY(magnitude_power_uc8) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power) +{ + const double max_error = 0.015; // tolerate 1.5% error + const double epsilon = 1.0; + bool okay = true; + + double sum_level = 0, sum_power = 0; + + for (unsigned i = 0; i < len; ++i) { + double I = (in[i].I - 127.4) / 128; + double Q = (in[i].Q - 127.4) / 128; + double magsq = I * I + Q * Q; + double expected = round(sqrt(magsq) * 65536.0); + if (expected > 65535.0) + expected = 65535.0; + double actual = out[i]; + + double error = fabs(expected - actual); + double error_fraction = error / (expected > epsilon ? 
expected : epsilon); + if (error > epsilon && error_fraction > max_error) { + fprintf(stderr, "verification failed: in[%u].I=%u in[%u].Q=%u out[%u]=%u, expected=%.0f, error=%.2f%%\n", + i, in[i].I, + i, in[i].Q, + i, out[i], + expected, + error_fraction * 100.0); + okay = false; + } + + sum_level += expected; + sum_power += expected * expected; + } + + sum_level = sum_level / len / 65536.0; + sum_power = sum_power / len / (65536.0 * 65536.0); + + double level_error = sum_level - *out_level; + if (fabs(level_error / sum_level) > max_error) { + fprintf(stderr, "verification failed: expected mean level %.5f, got mean level %.5f, error=%.2f%%\n", + sum_level, *out_level, 100.0 * level_error / sum_level); + okay = false; + } + + double power_error = sum_power - *out_power; + if (fabs(power_error / sum_power) > max_error) { + fprintf(stderr, "verification failed: expected mean power %.5f, got mean power %.5f, error=%.2f%%\n", + sum_power, *out_power, 100.0 * power_error / sum_power); + okay = false; + } + + return okay; +} diff --git a/dsp/benchmark/magnitude_sc16_benchmark.c b/dsp/benchmark/magnitude_sc16_benchmark.c new file mode 100644 index 0000000..8c1edee --- /dev/null +++ b/dsp/benchmark/magnitude_sc16_benchmark.c @@ -0,0 +1,79 @@ +#include +#include +#include + +void STARCH_BENCHMARK(magnitude_sc16) (void) +{ + sc16_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 262144; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, *out_mag))) { + goto done; + } + + unsigned i = 0; + + // 0.9 magnitude, varying phase + double degrees = 0; + for (; i < len && degrees < 360; i += 1, degrees += 1) { + in[i].I = (int16_t) (0.9 * cos(degrees * M_PI / 180.0) * 32768.0); + in[i].Q = (int16_t) (0.9 * sin(degrees * M_PI / 180.0) * 32768.0); + } + + // 0, 45, 90 degree phase, full input range + unsigned sequence = 0; + for (; (i+3) <= len && sequence < 65536; i += 3, sequence += 1) { + in[i + 0].I = (int16_t) (sequence - 
32768); + in[i + 0].Q = 0; + + in[i + 1].I = (int16_t) (sequence - 32768); + in[i + 1].Q = (int16_t) (sequence - 32768); + + in[i + 2].I = 0; + in[i + 2].Q = (int16_t) (sequence - 32768); + } + + // Fill the rest with random values + srand(1); + for (; i < len; ++i) { + in[i].I = rand() % 65536 - 32768; + in[i].Q = rand() % 65536 - 32768; + } + + STARCH_BENCHMARK_RUN( magnitude_sc16, in, out_mag, len ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +bool STARCH_BENCHMARK_VERIFY(magnitude_sc16) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const double max_error = 0.015; // tolerate 1.5% error + const double epsilon = 3.0; + bool okay = true; + + for (unsigned i = 0; i < len; ++i) { + double I = in[i].I / 32768.0; + double Q = in[i].Q / 32768.0; + double expected = round(sqrt(I * I + Q * Q) * 65536.0); + if (expected > 65535.0) + expected = 65535.0; + double actual = out[i]; + + double error = fabs(expected - actual); + double error_fraction = error / (expected > epsilon ? 
expected : epsilon); + if (error > epsilon && error_fraction > max_error) { + fprintf(stderr, "verification failed: in[%u].I=%d in[%u].Q=%d out[%u]=%u, expected=%.0f, error=%.2f%%\n", + i, in[i].I, + i, in[i].Q, + i, out[i], + expected, + error_fraction * 100.0); + okay = false; + } + } + + return okay; +} diff --git a/dsp/benchmark/magnitude_sc16q11_benchmark.c b/dsp/benchmark/magnitude_sc16q11_benchmark.c new file mode 100644 index 0000000..a08b96e --- /dev/null +++ b/dsp/benchmark/magnitude_sc16q11_benchmark.c @@ -0,0 +1,79 @@ +#include +#include +#include + +void STARCH_BENCHMARK(magnitude_sc16q11) (void) +{ + sc16_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 65536; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, *out_mag))) { + goto done; + } + + unsigned i = 0; + + // 0.9 magnitude, varying phase + double degrees = 0; + for (; i < len && degrees < 360; i += 1, degrees += 1) { + in[i].I = (int16_t) (0.9 * cos(degrees * M_PI / 180.0) * 2048.0); + in[i].Q = (int16_t) (0.9 * sin(degrees * M_PI / 180.0) * 2048.0); + } + + // 0, 45, 90 degree phase, full input range + unsigned sequence = 0; + for (; (i+3) <= len && sequence < 4096; i += 3, sequence += 1) { + in[i + 0].I = (int16_t) (sequence - 2048); + in[i + 0].Q = 0; + + in[i + 1].I = (int16_t) (sequence - 2048); + in[i + 1].Q = (int16_t) (sequence - 2048); + + in[i + 2].I = 0; + in[i + 2].Q = (int16_t) (sequence - 2048); + } + + // Fill the rest with random values + srand(1); + for (; i < len; ++i) { + in[i].I = rand() % 4096 - 2048; + in[i].Q = rand() % 4096 - 2048; + } + + STARCH_BENCHMARK_RUN( magnitude_sc16q11, in, out_mag, len ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +bool STARCH_BENCHMARK_VERIFY(magnitude_sc16q11) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const double max_error = 0.015; // tolerate 1.5% error + const double epsilon = 3.0; + bool okay = true; + + for (unsigned i = 0; i < len; 
++i) { + double I = in[i].I / 2048.0; + double Q = in[i].Q / 2048.0; + double expected = round(sqrt(I * I + Q * Q) * 65536.0); + if (expected > 65535.0) + expected = 65535.0; + double actual = out[i]; + + double error = fabs(expected - actual); + double error_fraction = error / (expected > epsilon ? expected : epsilon); + if (error > epsilon && error_fraction > max_error) { + fprintf(stderr, "verification failed: in[%u].I=%d in[%u].Q=%d out[%u]=%u, expected=%.0f, error=%.2f%%\n", + i, in[i].I, + i, in[i].Q, + i, out[i], + expected, + error_fraction * 100.0); + okay = false; + } + } + + return okay; +} diff --git a/dsp/benchmark/magnitude_uc8_benchmark.c b/dsp/benchmark/magnitude_uc8_benchmark.c new file mode 100644 index 0000000..e03fc0c --- /dev/null +++ b/dsp/benchmark/magnitude_uc8_benchmark.c @@ -0,0 +1,79 @@ +#include +#include + +void STARCH_BENCHMARK(magnitude_uc8) (void) +{ + uc8_t *in = NULL; + uint16_t *out_mag = NULL; + const unsigned len = 65536; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in)) || !(out_mag = STARCH_BENCHMARK_ALLOC(len, *out_mag))) { + goto done; + } + + unsigned i = 0; + + // 0.9 magnitude, varying phase + double degrees = 0; + for (; i < len && degrees < 360; i += 1, degrees += 1) { + in[i].I = (uint8_t) (0.9 * cos(degrees * M_PI / 180.0) * 128 + 127.4); + in[i].Q = (uint8_t) (0.9 * sin(degrees * M_PI / 180.0) * 128 + 127.4); + } + + // 0, 45, 90 degree phase, full input range + unsigned sequence = 0; + for (; (i+3) <= len && sequence < 256; i += 3, sequence += 1) { + in[i + 0].I = sequence; + in[i + 0].Q = 0; + + in[i + 1].I = sequence; + in[i + 1].Q = sequence; + + in[i + 2].I = 0; + in[i + 2].Q = sequence; + } + + // Fill the rest with random values + srand(1); + for (; i < len; ++i) { + in[i].I = rand() % 256; + in[i].Q = rand() % 256; + } + + STARCH_BENCHMARK_RUN( magnitude_uc8, in, out_mag, len ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out_mag); +} + +bool STARCH_BENCHMARK_VERIFY(magnitude_uc8) (const 
uc8_t *in, uint16_t *out, unsigned len) +{ + const double max_error = 0.015; // tolerate 1.5% error + const double epsilon = 3.0; + bool okay = true; + + for (unsigned i = 0; i < len; ++i) { + double I = (in[i].I - 127.4) / 128; + double Q = (in[i].Q - 127.4) / 128; + double magsq = I * I + Q * Q; + double expected = round(sqrt(magsq) * 65536.0); + if (expected > 65535.0) + expected = 65535.0; + double actual = out[i]; + + double error = fabs(expected - actual); + double error_fraction = error / (expected > epsilon ? expected : epsilon); + if (error > epsilon && error_fraction > max_error) { + fprintf(stderr, "verification failed: in[%u].I=%u in[%u].Q=%u out[%u]=%u, expected=%.0f, error=%.2f%%\n", + i, in[i].I, + i, in[i].Q, + i, out[i], + expected, + error_fraction * 100.0); + okay = false; + } + } + + return okay; +} diff --git a/dsp/benchmark/mean_power_u16_benchmark.c b/dsp/benchmark/mean_power_u16_benchmark.c new file mode 100644 index 0000000..16c60fd --- /dev/null +++ b/dsp/benchmark/mean_power_u16_benchmark.c @@ -0,0 +1,57 @@ +#include + +void STARCH_BENCHMARK(mean_power_u16) (void) +{ + uint16_t *in = NULL; + double mean_mag, mean_magsq; + const unsigned len = 65536; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, *in))) { + goto done; + } + + for (unsigned i = 0; i < len; ++i) { + in[i] = i; + } + + STARCH_BENCHMARK_RUN( mean_power_u16, in, len, &mean_mag, &mean_magsq ); + + done: + STARCH_BENCHMARK_FREE(in); +} + +bool STARCH_BENCHMARK_VERIFY(mean_power_u16) (const uint16_t *in, unsigned len, double *out_mag, double *out_magsq) +{ + const double max_error = 0.01; // tolerate 1% error + + double sum_mag = 0; + double sum_magsq = 0; + + for (unsigned i = 0; i < len; ++i) { + double mag = in[i] / 65536.0; + sum_mag += mag; + sum_magsq += mag * mag; + } + + sum_mag /= len; + sum_magsq /= len; + + bool okay = true; + + double mag_error = sum_mag - *out_mag; + if (fabs(mag_error / sum_mag) > max_error) { + fprintf(stderr, "verification failed: expected mean 
magnitude %.5f, got %.5f, error=%.2f%%\n", + sum_mag, *out_mag, 100.0 * mag_error / sum_mag); + okay = false; + } + + + double magsq_error = sum_magsq - *out_magsq; + if (fabs(magsq_error / sum_magsq) > max_error) { + fprintf(stderr, "verification failed: expected mean magnitude-squared %.5f, got %.5f, error=%.2f%%\n", + sum_magsq, *out_magsq, 100.0 * magsq_error / sum_magsq); + okay = false; + } + + return okay; +} diff --git a/dsp/generated/benchmark.c b/dsp/generated/benchmark.c new file mode 100644 index 0000000..56a590c --- /dev/null +++ b/dsp/generated/benchmark.c @@ -0,0 +1,1590 @@ + +/* starch generated code. Do not edit. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "starch.h" + +typedef struct timespec starch_benchmark_time; + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end); +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size); +void starch_benchmark_aligned_free(void *user_ptr); +void starch_benchmark_get_time(starch_benchmark_time *t); + +const unsigned starch_benchmark_warmup_loops = 10; + +typedef struct { + const char *name; + const char *impl; + uint64_t ns; +} starch_benchmark_result; + +static starch_benchmark_result *starch_benchmark_results = NULL; +static unsigned starch_benchmark_result_size = 0; +static unsigned starch_benchmark_result_count = 0; + +typedef struct benchmark_flavor_list_node { + const char *flavor; + struct benchmark_flavor_list_node *next; +} starch_benchmark_flavor_list; + +static starch_benchmark_flavor_list *starch_benchmark_flavor_whitelist = NULL; +static starch_benchmark_flavor_list *starch_benchmark_flavor_blacklist = NULL; + +static bool starch_benchmark_list_only = false; +static bool starch_benchmark_validate_only = false; +static bool starch_benchmark_validation_failed = false; +static bool starch_benchmark_top_only = false; +static unsigned 
starch_benchmark_iterations = 1; + +typedef struct timespec starch_benchmark_time; +void starch_benchmark_get_time(starch_benchmark_time *t) +{ +#ifdef CLOCK_THREAD_CPUTIME_ID + clock_gettime(CLOCK_THREAD_CPUTIME_ID, t); +#else + clock_gettime(CLOCK_MONOTONIC, t); +#endif +} + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end) +{ + return ((uint64_t)end->tv_sec - (uint64_t)start->tv_sec) * 1000000000U + (uint64_t)end->tv_nsec - (uint64_t)start->tv_nsec; +} + +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size) +{ + size_t use_alignment = (type_alignment > alignment ? type_alignment : alignment); + if ( (use_alignment % type_alignment) || (use_alignment % alignment) ) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: conflicting alignment requirements (%zu versus %zu)\n", size, alignment, type_alignment); + return NULL; + } + + /* Over-allocate so we can stash our own pointer before the start, and so that we can adjust + * the returned alignment so it is only aligned to the requested boundary, and not also + * aligned to a larger power of two (we don't want to accidentally benchmark the performance + * of a more restrictive larger alignment) + */ + size_t header_size = (use_alignment < sizeof(void*) ? 
sizeof(void*) : use_alignment); + char *block_ptr = aligned_alloc(use_alignment, header_size + size + use_alignment); + if (!block_ptr) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(errno)); + return NULL; + } + + char *user_ptr = block_ptr + header_size; + if ( (uintptr_t)user_ptr % (use_alignment * 2) == 0 ) { + // user_ptr is aligned to the next power of two, but we don't want that, move it on + user_ptr += use_alignment; + } + + void **stash = (void**)user_ptr - 1; + *stash = block_ptr; + + return user_ptr; +} + +void starch_benchmark_aligned_free(void *user_ptr) +{ + if (!user_ptr) + return; + void **stash = (void**)user_ptr - 1; + free(*stash); +} + +static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_benchmark_flavor_list *list) +{ + for (; list; list = list->next) { + if (!strcmp(flavor, list->flavor)) + return true; + } + return false; +} + + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_uc8_benchmark (void); +bool starch_magnitude_uc8_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_uc8_benchmark(void); + +static void starch_benchmark_one_magnitude_uc8( starch_magnitude_uc8_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + 
+ if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_uc8_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if 
(!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_uc8"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_uc8( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_uc8_regentry *_entry = starch_magnitude_uc8_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_uc8( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_uc8_aligned_benchmark (void); +bool starch_magnitude_uc8_aligned_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_uc8_aligned_benchmark(void); + +static void starch_benchmark_one_magnitude_uc8_aligned( starch_magnitude_uc8_aligned_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped 
(blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_uc8_aligned_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count 
>= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_uc8_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_uc8_aligned( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_uc8_aligned_regentry *_entry = starch_magnitude_uc8_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_uc8_aligned( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_power_uc8_benchmark (void); +bool starch_magnitude_power_uc8_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_power_uc8_benchmark(void); + +static void starch_benchmark_one_magnitude_power_uc8( starch_magnitude_power_uc8_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist 
&& starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + + /* verify correctness of the output */ + if (! starch_magnitude_power_uc8_benchmark_verify ( arg0, arg1, arg2, arg3, arg4 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + 
else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_power_uc8"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_power_uc8( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) +{ + for (starch_magnitude_power_uc8_regentry *_entry = starch_magnitude_power_uc8_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_power_uc8( _entry, arg0, arg1, arg2, arg3, arg4 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_power_uc8_aligned_benchmark (void); +bool starch_magnitude_power_uc8_aligned_benchmark_verify ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_power_uc8_aligned_benchmark(void); + +static void starch_benchmark_one_magnitude_power_uc8_aligned( starch_magnitude_power_uc8_aligned_regentry * _entry, const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + 
if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + + /* verify correctness of the output */ + if (! starch_magnitude_power_uc8_aligned_benchmark_verify ( arg0, arg1, arg2, arg3, arg4 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3, arg4 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > 
_elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_power_uc8_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_power_uc8_aligned( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) +{ + for (starch_magnitude_power_uc8_aligned_regentry *_entry = starch_magnitude_power_uc8_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_power_uc8_aligned( _entry, arg0, arg1, arg2, arg3, arg4 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_sc16_benchmark (void); +bool starch_magnitude_sc16_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_sc16_benchmark(void); + +static void starch_benchmark_one_magnitude_sc16( starch_magnitude_sc16_regentry * _entry, const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", 
_entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_sc16_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = 
starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_sc16"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_sc16( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_sc16_regentry *_entry = starch_magnitude_sc16_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_sc16( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_sc16_aligned_benchmark (void); +bool starch_magnitude_sc16_aligned_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_sc16_aligned_benchmark(void); + +static void starch_benchmark_one_magnitude_sc16_aligned( starch_magnitude_sc16_aligned_regentry * _entry, const sc16_t * arg0, 
uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_sc16_aligned_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + 
starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_sc16_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_magnitude_sc16_aligned( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_sc16_aligned_regentry *_entry = starch_magnitude_sc16_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_sc16_aligned( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_sc16q11_benchmark (void); +bool starch_magnitude_sc16q11_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_magnitude_sc16q11_benchmark(void); + +static void 
starch_benchmark_one_magnitude_sc16q11( starch_magnitude_sc16q11_regentry * _entry, const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! starch_magnitude_sc16q11_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for 
(uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + /* mean ns per call; with more than two iterations the fastest and slowest iteration are dropped first */ + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + /* grow the result array when full: 64 entries initially, doubling afterwards */ + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + /* record this measurement */ + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_sc16q11"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + /* benchmark each magnitude_sc16q11 implementation in registry order, stopping at the entry whose name is NULL */ +static void starch_benchmark_run_magnitude_sc16q11( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_sc16q11_regentry *_entry = starch_magnitude_sc16q11_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_sc16q11( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_magnitude_sc16q11_aligned_benchmark (void); +bool starch_magnitude_sc16q11_aligned_benchmark_verify ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void 
starch_magnitude_sc16q11_aligned_benchmark(void); + /* check, warm up, verify and time one magnitude_sc16q11_aligned implementation, printing a one-line status to stderr */ +static void starch_benchmark_one_magnitude_sc16q11_aligned( starch_magnitude_sc16q11_aligned_regentry * _entry, const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + /* apply the flavor whitelist, then the blacklist */ + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + /* list-only mode reports the entry without executing it */ + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + + /* verify correctness of the output */ + if (! 
starch_magnitude_sc16q11_aligned_benchmark_verify ( arg0, arg1, arg2 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + /* validate-only mode stops once verification has passed */ + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + /* timed iterations: accumulate the total and track the fastest and slowest */ + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + /* mean ns per call; with more than two iterations the fastest and slowest iteration are dropped first */ + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + /* grow the result array when full: 64 entries initially, doubling afterwards */ + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if 
(!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + /* record this measurement */ + starch_benchmark_results[starch_benchmark_result_count].name = "magnitude_sc16q11_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + /* benchmark each magnitude_sc16q11_aligned implementation in registry order, stopping at the entry whose name is NULL */ +static void starch_benchmark_run_magnitude_sc16q11_aligned( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) +{ + for (starch_magnitude_sc16q11_aligned_regentry *_entry = starch_magnitude_sc16q11_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_magnitude_sc16q11_aligned( _entry, arg0, arg1, arg2 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_mean_power_u16_benchmark (void); +bool starch_mean_power_u16_benchmark_verify ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_mean_power_u16_benchmark(void); + /* check, warm up, verify and time one mean_power_u16 implementation, printing a one-line status to stderr */ +static void starch_benchmark_one_mean_power_u16( starch_mean_power_u16_regentry * _entry, const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + /* apply the flavor whitelist, then the blacklist */ + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + /* list-only mode reports the entry without executing it */ + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < 
starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + + /* verify correctness of the output */ + if (! starch_mean_power_u16_benchmark_verify ( arg0, arg1, arg2, arg3 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + /* validate-only mode stops once verification has passed */ + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + /* timed iterations: accumulate the total and track the fastest and slowest */ + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + /* mean ns per call; with more than two iterations the fastest and slowest iteration are dropped first */ + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + /* grow the result array when full: 64 entries initially, doubling afterwards */ + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + 
starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + /* record this measurement */ + starch_benchmark_results[starch_benchmark_result_count].name = "mean_power_u16"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + /* benchmark each mean_power_u16 implementation in registry order, stopping at the entry whose name is NULL */ +static void starch_benchmark_run_mean_power_u16( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ) +{ + for (starch_mean_power_u16_regentry *_entry = starch_mean_power_u16_registry; _entry->name; ++_entry) { + starch_benchmark_one_mean_power_u16( _entry, arg0, arg1, arg2, arg3 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_mean_power_u16_aligned_benchmark (void); +bool starch_mean_power_u16_aligned_benchmark_verify ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_mean_power_u16_aligned_benchmark(void); + /* check, warm up, verify and time one mean_power_u16_aligned implementation, printing a one-line status to stderr */ +static void starch_benchmark_one_mean_power_u16_aligned( starch_mean_power_u16_aligned_regentry * _entry, const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + /* apply the flavor whitelist, then the blacklist */ + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if 
(starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + + /* verify correctness of the output */ + if (! starch_mean_power_u16_aligned_benchmark_verify ( arg0, arg1, arg2, arg3 )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + /* validate-only mode stops once verification has passed */ + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + /* timed iterations: accumulate the total and track the fastest and slowest */ + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + /* mean ns per call; with more than two iterations the fastest and slowest iteration are dropped first */ + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + /* grow the result array when full: 64 entries initially, doubling afterwards */ + if (starch_benchmark_result_count >= 
starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + /* record this measurement */ + starch_benchmark_results[starch_benchmark_result_count].name = "mean_power_u16_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + /* benchmark each mean_power_u16_aligned implementation in registry order, stopping at the entry whose name is NULL */ +static void starch_benchmark_run_mean_power_u16_aligned( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ) +{ + for (starch_mean_power_u16_aligned_regentry *_entry = starch_mean_power_u16_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_mean_power_u16_aligned( _entry, arg0, arg1, arg2, arg3 ); + } +} + + /* macro set for the unaligned build of the benchmark sources: alignment 1, names without an _aligned infix */ +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _benchmark_sym +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) starch_ ## _function ## _benchmark +#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + /* first inclusion of the user-supplied benchmark sources, expanded with the alignment-1 macro set */ +#include "../benchmark/magnitude_sc16_benchmark.c" +#include "../benchmark/magnitude_uc8_benchmark.c" +#include "../benchmark/magnitude_power_uc8_benchmark.c" +#include "../benchmark/mean_power_u16_benchmark.c" +#include "../benchmark/magnitude_sc16q11_benchmark.c" + /* drop the macro set so it can be redefined for the aligned variants */ +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES +#undef STARCH_BENCHMARK +#undef STARCH_BENCHMARK_VERIFY +#undef STARCH_BENCHMARK_RUN +#undef STARCH_BENCHMARK_ALLOC +#undef STARCH_BENCHMARK_FREE + /* aligned macro set: STARCH_MIX_ALIGNMENT alignment, generated names carry an _aligned infix */ +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_benchmark_sym +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) starch_ ## _function ## _aligned_benchmark +#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _aligned_benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ## _aligned ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + /* second inclusion of the same benchmark sources, now expanded with the aligned macro set */ +#include "../benchmark/magnitude_sc16_benchmark.c" +#include "../benchmark/magnitude_uc8_benchmark.c" +#include "../benchmark/magnitude_power_uc8_benchmark.c" +#include "../benchmark/mean_power_u16_benchmark.c" +#include "../benchmark/magnitude_sc16q11_benchmark.c" + /* one wrapper per function: print a banner to stderr, then call that function's benchmark entry point */ +static void starch_benchmark_all_magnitude_uc8(void) +{ + fprintf(stderr, "==== magnitude_uc8 ===\n"); + starch_magnitude_uc8_benchmark (); +} +static void starch_benchmark_all_magnitude_uc8_aligned(void) +{ + fprintf(stderr, "==== magnitude_uc8_aligned ===\n"); + starch_magnitude_uc8_aligned_benchmark (); +} +static void starch_benchmark_all_magnitude_power_uc8(void) +{ + fprintf(stderr, "==== magnitude_power_uc8 ===\n"); + starch_magnitude_power_uc8_benchmark (); +} +static void starch_benchmark_all_magnitude_power_uc8_aligned(void) +{ + fprintf(stderr, "==== magnitude_power_uc8_aligned ===\n"); + starch_magnitude_power_uc8_aligned_benchmark (); +} +static void starch_benchmark_all_magnitude_sc16(void) +{ + fprintf(stderr, "==== magnitude_sc16 ===\n"); + starch_magnitude_sc16_benchmark (); +} +static void starch_benchmark_all_magnitude_sc16_aligned(void) +{ + fprintf(stderr, "==== magnitude_sc16_aligned ===\n"); + starch_magnitude_sc16_aligned_benchmark (); +} +static void starch_benchmark_all_magnitude_sc16q11(void) +{ + fprintf(stderr, "==== magnitude_sc16q11 ===\n"); + starch_magnitude_sc16q11_benchmark (); +} +static void starch_benchmark_all_magnitude_sc16q11_aligned(void) +{ + fprintf(stderr, "==== magnitude_sc16q11_aligned ===\n"); + starch_magnitude_sc16q11_aligned_benchmark (); +} +static void starch_benchmark_all_mean_power_u16(void) +{ + fprintf(stderr, "==== mean_power_u16 ===\n"); + 
starch_mean_power_u16_benchmark (); +} +static void starch_benchmark_all_mean_power_u16_aligned(void) +{ + fprintf(stderr, "==== mean_power_u16_aligned ===\n"); + starch_mean_power_u16_aligned_benchmark (); +} + /* qsort comparator for results: order by function name, then by ascending ns per call */ +static int starch_benchmark_compare_result(const void *a, const void *b) +{ + const starch_benchmark_result *left = (const starch_benchmark_result *) a; + const starch_benchmark_result *right = (const starch_benchmark_result *) b; + + int name_cmp = strcmp(left->name, right->name); + if (name_cmp) + return name_cmp; + /* same function: the faster implementation sorts first */ + if (left->ns < right->ns) + return -1; + if (left->ns > right->ns) + return 1; + return 0; +} + /* print the command-line help, including the compiled-in flavors and the known function names, to stderr */ +static void starch_benchmark_usage(const char *argv0) +{ + fprintf(stderr, + "Usage: %s [OPTION]... [FUNCTION]...\n" + "Benchmarks starch functions and optionally writes a sorted wisdom file.\n" + "\n" + " -r FILE Read initial wisdom from FILE\n" + " -o FILE Write sorted wisdom to FILE\n" + " -F FLAVOR Add FLAVOR to whitelist\n" + " (default: no whitelist, run all runtime-supported flavors)\n" + " -N FLAVOR Add FLAVOR to blacklist\n" + " (default: no blacklist, run all runtime-supported flavors)\n" + " -l List compiled-in implementations but don't benchmark them\n" + " -V Run validation tests, but don't run benchmarks\n" + " -t Include only the top candidate per function in wisdom output\n" + " -i ITERS Run benchmark ITERS times and use the mean. 
If ITERS > 2, ignore\n" + " the smallest and largest runs when calculating the mean.\n" + " (default: 1 iteration)\n" + " FUNCTION Run benchmarks for these functions only\n" + " (default: benchmark all functions)\n" + "\n" + "Supported flavors: " +#ifdef STARCH_FLAVOR_GENERIC + "generic " +#endif +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 + "armv7a_neon_vfpv4 " +#endif +#ifdef STARCH_FLAVOR_X86_AVX2 + "x86_avx2 " +#endif + "\n" + "Supported functions: " + "magnitude_uc8 " + "magnitude_uc8_aligned " + "magnitude_power_uc8 " + "magnitude_power_uc8_aligned " + "magnitude_sc16 " + "magnitude_sc16_aligned " + "magnitude_sc16q11 " + "magnitude_sc16q11_aligned " + "mean_power_u16 " + "mean_power_u16_aligned " + "\n", argv0); +} + /* push flavor onto the front of *list; the string pointer is stored, not copied; exits on allocation failure */ +static void starch_benchmark_append_flavor(const char *flavor, starch_benchmark_flavor_list **list) +{ + starch_benchmark_flavor_list *newnode = malloc(sizeof(*newnode)); + if (!newnode) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + exit(1); + } + + newnode->flavor = flavor; + newnode->next = *list; + *list = newnode; +} + /* entry point: parse options, run the selected benchmarks, optionally write a sorted wisdom file */ +int main(int argc, char **argv) +{ + int specific = 0; + const char *output_path = NULL; + + int opt; + while ((opt = getopt(argc, argv, "r:o:F:N:i:lhtV")) != -1) { + switch (opt) { + case 'r': + if (starch_read_wisdom(optarg) < 0) { + fprintf(stderr, "%s: cannot read %s: %s\n", argv[0], optarg, strerror(errno)); + return 1; + } + fprintf(stderr, "%s: loaded wisdom file %s\n", argv[0], optarg); + break; + + case 'o': + output_path = optarg; + break; + + case 'F': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_whitelist); + break; + + case 'N': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + 
return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_blacklist); + break; + + case 'l': + starch_benchmark_list_only = true; + break; + + case 't': + starch_benchmark_top_only = true; + break; + + case 'i': /* a count of 0 (also what atoi() yields for non-numeric input) would divide by zero when the per-call mean is computed; negative counts are rejected too */ + if (atoi(optarg) < 1) { fprintf(stderr, "%s: -i requires an iteration count of at least 1\n", argv[0]); return 2; } + starch_benchmark_iterations = atoi(optarg); + break; + case 'V': + starch_benchmark_validate_only = true; + break; + + case 'h': + starch_benchmark_usage(argv[0]); + return 0; + + case '?': + default: + starch_benchmark_usage(argv[0]); + return 2; + } + } + /* list-only mode records no timings, so there would be nothing to write */ + if (starch_benchmark_list_only && output_path) { + fprintf(stderr, "%s: -o and -l options cannot be specified together\n", argv[0]); + return 2; + } + /* remaining arguments name functions to benchmark; each one is run as soon as it is recognised */ + for (int i = optind; i < argc; ++i) { + if (!strcmp(argv[i], "magnitude_uc8")) { + specific = 1; + starch_benchmark_all_magnitude_uc8(); + continue; + } + if (!strcmp(argv[i], "magnitude_uc8_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_uc8_aligned(); + continue; + } + if (!strcmp(argv[i], "magnitude_power_uc8")) { + specific = 1; + starch_benchmark_all_magnitude_power_uc8(); + continue; + } + if (!strcmp(argv[i], "magnitude_power_uc8_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_power_uc8_aligned(); + continue; + } + if (!strcmp(argv[i], "magnitude_sc16")) { + specific = 1; + starch_benchmark_all_magnitude_sc16(); + continue; + } + if (!strcmp(argv[i], "magnitude_sc16_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_sc16_aligned(); + continue; + } + if (!strcmp(argv[i], "magnitude_sc16q11")) { + specific = 1; + starch_benchmark_all_magnitude_sc16q11(); + continue; + } + if (!strcmp(argv[i], "magnitude_sc16q11_aligned")) { + specific = 1; + starch_benchmark_all_magnitude_sc16q11_aligned(); + continue; + } + if (!strcmp(argv[i], "mean_power_u16")) { + specific = 1; + starch_benchmark_all_mean_power_u16(); + continue; + } + if (!strcmp(argv[i], "mean_power_u16_aligned")) { + specific = 1; + starch_benchmark_all_mean_power_u16_aligned(); + continue; + } + + fprintf(stderr, "%s: 
unrecognized function name: %s\n", argv[0], argv[i]); + return 2; + } + /* no function named on the command line: benchmark everything */ + if (!specific) { + starch_benchmark_all_magnitude_uc8(); + starch_benchmark_all_magnitude_uc8_aligned(); + starch_benchmark_all_magnitude_power_uc8(); + starch_benchmark_all_magnitude_power_uc8_aligned(); + starch_benchmark_all_magnitude_sc16(); + starch_benchmark_all_magnitude_sc16_aligned(); + starch_benchmark_all_magnitude_sc16q11(); + starch_benchmark_all_magnitude_sc16q11_aligned(); + starch_benchmark_all_mean_power_u16(); + starch_benchmark_all_mean_power_u16_aligned(); + } + + if (output_path) { + FILE *out = fopen(output_path, "w"); + if (!out) { + fprintf(stderr, "%s: cannot open %s: %s\n", argv[0], output_path, strerror(errno)); + return 1; + } + /* header line recording the command line that produced the file */ + fprintf(out, "# generated by "); + for (int i = 0; i < argc; ++i) + fprintf(out, "%s ", argv[i]); + fprintf(out, "\n\n"); + /* group by function name, fastest first; skipped when nothing was recorded because the result array may then still be NULL */ + if (starch_benchmark_result_count > 0) qsort(starch_benchmark_results, starch_benchmark_result_count, sizeof(*starch_benchmark_results), starch_benchmark_compare_result); + + const char *last_name = NULL; + bool first = true; + for (unsigned i = 0; i < starch_benchmark_result_count; ++i) { + starch_benchmark_result *result = &starch_benchmark_results[i]; + if (last_name && strcmp(last_name, result->name) != 0) { + fprintf(out, "\n"); + first = true; + } + last_name = result->name; + if (starch_benchmark_top_only && !first) + continue; + fprintf(out, "%-40s %-40s # %" PRIu64 " ns/call\n", result->name, result->impl, result->ns); + first = false; + } + /* capture the stream error flag before closing so a failed write is not reported as success */ bool write_failed = ferror(out) != 0; + if (fclose(out) != 0 || write_failed) { fprintf(stderr, "%s: error writing %s\n", argv[0], output_path); return 1; } + fprintf(stderr, "%s: wrote sorted wisdom to %s\n", argv[0], output_path); + } + + return starch_benchmark_validation_failed ? 1 : 0; +} diff --git a/dsp/generated/dispatcher.c b/dsp/generated/dispatcher.c new file mode 100644 index 0000000..565ed76 --- /dev/null +++ b/dsp/generated/dispatcher.c @@ -0,0 +1,1160 @@ + +/* starch generated code. Do not edit. 
*/ + +#include +#include +#include +#include + +#include "starch.h" + +/* helper for re-sorting registries */ +struct starch_regentry_prefix { + int rank; +}; + /* qsort comparator giving ascending rank; reads each entry through the prefix struct, so it relies on rank being the leading int of every regentry type - TODO confirm against starch.h */ +static int starch_regentry_rank_compare (const void *l, const void *r) +{ + const struct starch_regentry_prefix *left = l, *right = r; + return left->rank - right->rank; +} + +/* dispatcher / registry for magnitude_uc8 */ + /* return the first registry entry that has no support check or whose check passes; NULL if there is none */ +starch_magnitude_uc8_regentry * starch_magnitude_uc8_select() { + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + /* initial target of the starch_magnitude_uc8 pointer: select an implementation, store it in the pointer, then forward this call; aborts if nothing is usable */ +static void starch_magnitude_uc8_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_select(); + if (!entry) + abort(); + + starch_magnitude_uc8 = entry->callable; + starch_magnitude_uc8 ( arg0, arg1, arg2 ); +} + +starch_magnitude_uc8_ptr starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; + /* received_wisdom is walked as a NULL-terminated array of implementation names; earlier names get lower (preferred) ranks */ +void starch_magnitude_uc8_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_uc8_regentry *entry; + for (entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_uc8_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_uc8_registry, entry - starch_magnitude_uc8_registry, sizeof(starch_magnitude_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will 
re-select */ + starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; +} + /* candidate implementations for magnitude_uc8, one list per build mix; each initializer appears to be { rank, name, flavor, callable, flavor_supported } - TODO confirm field order against starch.h */ +starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 6, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "exact_x86_avx2", "x86_avx2", starch_magnitude_uc8_exact_x86_avx2, cpu_supports_avx2 }, + { 4, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 5, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } /* NULL-name entry terminating the registry */ 
+}; + +/* dispatcher / registry for magnitude_uc8_aligned */ + /* return the first registry entry that has no support check or whose check passes; NULL if there is none */ +starch_magnitude_uc8_aligned_regentry * starch_magnitude_uc8_aligned_select() { + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + /* initial target of the starch_magnitude_uc8_aligned pointer: select an implementation, store it in the pointer, then forward this call; aborts if nothing is usable */ +static void starch_magnitude_uc8_aligned_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_select(); + if (!entry) + abort(); + + starch_magnitude_uc8_aligned = entry->callable; + starch_magnitude_uc8_aligned ( arg0, arg1, arg2 ); +} + +starch_magnitude_uc8_aligned_ptr starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; + /* received_wisdom is walked as a NULL-terminated array of implementation names; earlier names get lower (preferred) ranks */ +void starch_magnitude_uc8_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_uc8_aligned_regentry *entry; + for (entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_uc8_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_uc8_aligned_registry, entry - starch_magnitude_uc8_aligned_registry, sizeof(starch_magnitude_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; +} + +starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[] = { 
+ /* entries whose symbol lacks the _aligned_ infix are the unaligned magnitude_uc8 implementations, registered here as additional candidates */ +#ifdef STARCH_MIX_GENERIC + { 0, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 2, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_unroll_4_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "exact_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_exact_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 10, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 1, 
"lookup_unroll_4_generic", "generic", starch_magnitude_uc8_lookup_unroll_4_generic, NULL }, + { 2, "lookup_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_unroll_4_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 4, "exact_x86_avx2_aligned", "x86_avx2", starch_magnitude_uc8_aligned_exact_x86_avx2, cpu_supports_avx2 }, + { 5, "lookup_x86_avx2", "x86_avx2", starch_magnitude_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 6, "exact_x86_avx2", "x86_avx2", starch_magnitude_uc8_exact_x86_avx2, cpu_supports_avx2 }, + { 7, "lookup_generic", "generic", starch_magnitude_uc8_lookup_generic, NULL }, + { 8, "exact_generic", "generic", starch_magnitude_uc8_exact_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_power_uc8 */ + /* return the first registry entry that has no support check or whose check passes; NULL if there is none */ +starch_magnitude_power_uc8_regentry * starch_magnitude_power_uc8_select() { + for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + /* initial target of the starch_magnitude_power_uc8 pointer: select an implementation, store it in the pointer, then forward this call; aborts if nothing is usable */ +static void starch_magnitude_power_uc8_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) { + starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_select(); + if (!entry) + abort(); + + starch_magnitude_power_uc8 = entry->callable; + starch_magnitude_power_uc8 ( arg0, arg1, arg2, arg3, arg4 ); +} + +starch_magnitude_power_uc8_ptr starch_magnitude_power_uc8 = starch_magnitude_power_uc8_dispatch; + /* received_wisdom is walked as a NULL-terminated array of implementation names; earlier names get lower (preferred) ranks */ +void starch_magnitude_power_uc8_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_power_uc8_regentry *entry; + for (entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { + const char * const *search; + for 
(search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_power_uc8_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_power_uc8_registry, entry - starch_magnitude_power_uc8_registry, sizeof(starch_magnitude_power_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_power_uc8 = starch_magnitude_power_uc8_dispatch; +} + +starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 2, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, + { 2, "twopass_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "twopass_generic", "generic", 
starch_magnitude_power_uc8_twopass_generic, NULL }, + { 6, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "twopass_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_twopass_x86_avx2, cpu_supports_avx2 }, + { 1, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 2, "lookup_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 4, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 5, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_power_uc8_aligned */ + +starch_magnitude_power_uc8_aligned_regentry * starch_magnitude_power_uc8_aligned_select() { + for (starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_power_uc8_aligned_dispatch ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ) { + starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_select(); + if (!entry) + abort(); + + starch_magnitude_power_uc8_aligned = entry->callable; + starch_magnitude_power_uc8_aligned ( arg0, arg1, arg2, arg3, arg4 ); +} + +starch_magnitude_power_uc8_aligned_ptr starch_magnitude_power_uc8_aligned = starch_magnitude_power_uc8_aligned_dispatch; + +void starch_magnitude_power_uc8_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_power_uc8_aligned_regentry 
*entry; + for (entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_power_uc8_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_power_uc8_aligned_registry, entry - starch_magnitude_power_uc8_aligned_registry, sizeof(starch_magnitude_power_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_power_uc8_aligned = starch_magnitude_power_uc8_aligned_dispatch; +} + +starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 1, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 2, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "lookup_unroll_4_generic", "generic", starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, + { 2, "twopass_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_twopass_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "lookup_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_lookup_armv7a_neon_vfpv4, 
cpu_supports_armv7_neon_vfpv4 }, + { 4, "lookup_unroll_4_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "twopass_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "lookup_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "lookup_unroll_4_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 10, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "twopass_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_twopass_x86_avx2, cpu_supports_avx2 }, + { 1, "twopass_generic", "generic", starch_magnitude_power_uc8_twopass_generic, NULL }, + { 2, "lookup_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_lookup_x86_avx2, cpu_supports_avx2 }, + { 3, "lookup_unroll_4_x86_avx2_aligned", "x86_avx2", starch_magnitude_power_uc8_aligned_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 4, "twopass_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_twopass_x86_avx2, cpu_supports_avx2 }, + { 5, "lookup_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_lookup_x86_avx2, cpu_supports_avx2 }, + { 6, "lookup_unroll_4_x86_avx2", "x86_avx2", starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2, cpu_supports_avx2 }, + { 7, "lookup_generic", "generic", starch_magnitude_power_uc8_lookup_generic, NULL }, + { 8, "lookup_unroll_4_generic", "generic", 
starch_magnitude_power_uc8_lookup_unroll_4_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_sc16 */ + +starch_magnitude_sc16_regentry * starch_magnitude_sc16_select() { + for (starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_sc16_dispatch ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_select(); + if (!entry) + abort(); + + starch_magnitude_sc16 = entry->callable; + starch_magnitude_sc16 ( arg0, arg1, arg2 ); +} + +starch_magnitude_sc16_ptr starch_magnitude_sc16 = starch_magnitude_sc16_dispatch; + +void starch_magnitude_sc16_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_sc16_regentry *entry; + for (entry = starch_magnitude_sc16_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_sc16_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_sc16_registry, entry - starch_magnitude_sc16_registry, sizeof(starch_magnitude_sc16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16 = starch_magnitude_sc16_dispatch; +} + +starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, 
"exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "exact_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_float_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 3, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_sc16_aligned */ + +starch_magnitude_sc16_aligned_regentry * starch_magnitude_sc16_aligned_select() { + for (starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_sc16_aligned_dispatch ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) { + starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_select(); + if (!entry) + abort(); + + 
starch_magnitude_sc16_aligned = entry->callable; + starch_magnitude_sc16_aligned ( arg0, arg1, arg2 ); +} + +starch_magnitude_sc16_aligned_ptr starch_magnitude_sc16_aligned = starch_magnitude_sc16_aligned_dispatch; + +void starch_magnitude_sc16_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_sc16_aligned_regentry *entry; + for (entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_sc16_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_sc16_aligned_registry, entry - starch_magnitude_sc16_aligned_registry, sizeof(starch_magnitude_sc16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16_aligned = starch_magnitude_sc16_aligned_dispatch; +} + +starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", 
starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "exact_float_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "exact_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16_aligned_exact_float_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16_exact_float_generic, NULL }, + { 2, "exact_u32_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16_aligned_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 3, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 4, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16_exact_float_x86_avx2, cpu_supports_avx2 }, + { 5, "exact_u32_generic", "generic", starch_magnitude_sc16_exact_u32_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for magnitude_sc16q11 */ + +starch_magnitude_sc16q11_regentry * starch_magnitude_sc16q11_select() { + for (starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_magnitude_sc16q11_dispatch ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) { 
+ starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_select(); + if (!entry) + abort(); + + starch_magnitude_sc16q11 = entry->callable; + starch_magnitude_sc16q11 ( arg0, arg1, arg2 ); +} + +starch_magnitude_sc16q11_ptr starch_magnitude_sc16q11 = starch_magnitude_sc16q11_dispatch; + +void starch_magnitude_sc16q11_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_magnitude_sc16q11_regentry *entry; + for (entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_magnitude_sc16q11_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_magnitude_sc16q11_registry, entry - starch_magnitude_sc16q11_registry, sizeof(starch_magnitude_sc16q11_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16q11 = starch_magnitude_sc16q11_dispatch; +} + +starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 2, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 3, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", 
starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 2, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "exact_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "11bit_table_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_11bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "12bit_table_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 7, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 8, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_float_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 2, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 3, "11bit_table_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_11bit_table_x86_avx2, cpu_supports_avx2 }, + { 4, "12bit_table_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_12bit_table_x86_avx2, cpu_supports_avx2 }, + { 5, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 6, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 7, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + 
+/*
+ * dispatcher / registry for magnitude_sc16q11_aligned
+ *
+ * Callers invoke the starch_magnitude_sc16q11_aligned function pointer.
+ * It initially points at the dispatch stub below, which picks an entry
+ * from starch_magnitude_sc16q11_aligned_registry on first use and then
+ * replaces itself with that entry's callable.
+ */
+
+/*
+ * Walk the registry in its current order and return the first entry that
+ * either has no flavor_supported check (NULL) or whose check returns
+ * non-zero. Returns NULL if the terminating entry (name == NULL) is
+ * reached without finding one.
+ *
+ * The parameter list is spelled (void) so that this definition is a real
+ * prototype; an empty () declarator is an obsolescent form in C11.
+ */
+starch_magnitude_sc16q11_aligned_regentry * starch_magnitude_sc16q11_aligned_select(void) {
+    for (starch_magnitude_sc16q11_aligned_regentry *entry = starch_magnitude_sc16q11_aligned_registry;
+         entry->name;
+         ++entry)
+    {
+        if (entry->flavor_supported && !(entry->flavor_supported()))
+            continue;
+        return entry;
+    }
+    return NULL;
+}
+
+/*
+ * Initial value of the implementation pointer. Selects an entry, stores
+ * its callable in starch_magnitude_sc16q11_aligned so later calls go
+ * straight to it, then forwards the current call with the same arguments.
+ * Calls abort() if no registry entry is selectable.
+ */
+static void starch_magnitude_sc16q11_aligned_dispatch ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ) {
+    starch_magnitude_sc16q11_aligned_regentry *entry = starch_magnitude_sc16q11_aligned_select();
+    if (!entry)
+        abort();
+
+    starch_magnitude_sc16q11_aligned = entry->callable;
+    starch_magnitude_sc16q11_aligned ( arg0, arg1, arg2 );
+}
+
+starch_magnitude_sc16q11_aligned_ptr starch_magnitude_sc16q11_aligned = starch_magnitude_sc16q11_aligned_dispatch;
+
+/*
+ * Re-order the registry according to a NULL-terminated list of
+ * implementation names, most preferred first, then reset the
+ * implementation pointer so the next call re-selects.
+ *
+ * received_wisdom may be NULL, which is handled like an empty list:
+ * every entry keeps its current relative position.
+ */
+void starch_magnitude_sc16q11_aligned_set_wisdom (const char * const * received_wisdom)
+{
+    /* substitute an empty list for NULL so the search loop below never dereferences a null pointer */
+    static const char * const no_wisdom[] = { NULL };
+    if (!received_wisdom)
+        received_wisdom = no_wisdom;
+
+    /* re-rank the registry based on received wisdom */
+    starch_magnitude_sc16q11_aligned_regentry *entry;
+    for (entry = starch_magnitude_sc16q11_aligned_registry; entry->name; ++entry) {
+        const char * const *search;
+        for (search = received_wisdom; *search; ++search) {
+            if (!strcmp(*search, entry->name)) {
+                break;
+            }
+        }
+        if (*search) {
+            /* matches an entry in the wisdom list, order by position in the list */
+            entry->rank = search - received_wisdom;
+        } else {
+            /* no match: search now sits on the list terminator, so (search - received_wisdom)
+             * is the list length; adding the entry's current index ranks it after all
+             * possible matches while retaining existing order */
+            entry->rank = (search - received_wisdom) + (entry - starch_magnitude_sc16q11_aligned_registry);
+        }
+    }
+
+    /* re-sort based on the new ranking; entry now points at the terminator, so
+     * (entry - registry) is the number of real entries and the terminator stays last */
+    qsort(starch_magnitude_sc16q11_aligned_registry, entry - starch_magnitude_sc16q11_aligned_registry, sizeof(starch_magnitude_sc16q11_aligned_regentry), starch_regentry_rank_compare);
+
+    /* reset the implementation pointer so the next call will re-select */
+    starch_magnitude_sc16q11_aligned = starch_magnitude_sc16q11_aligned_dispatch;
+}
+
+starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 1, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 2, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 3, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_vrsqrte_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 2, "exact_u32_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "exact_float_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "11bit_table_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_11bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "12bit_table_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_aligned_12bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "exact_u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "exact_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "11bit_table_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_11bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "12bit_table_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", 
starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 10, "neon_vrsqrte_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 11, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 12, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 13, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "exact_float_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_exact_float_x86_avx2, cpu_supports_avx2 }, + { 1, "exact_float_generic", "generic", starch_magnitude_sc16q11_exact_float_generic, NULL }, + { 2, "exact_u32_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 3, "11bit_table_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_11bit_table_x86_avx2, cpu_supports_avx2 }, + { 4, "12bit_table_x86_avx2_aligned", "x86_avx2", starch_magnitude_sc16q11_aligned_12bit_table_x86_avx2, cpu_supports_avx2 }, + { 5, "exact_u32_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_u32_x86_avx2, cpu_supports_avx2 }, + { 6, "exact_float_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_exact_float_x86_avx2, cpu_supports_avx2 }, + { 7, "11bit_table_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_11bit_table_x86_avx2, cpu_supports_avx2 }, + { 8, "12bit_table_x86_avx2", "x86_avx2", starch_magnitude_sc16q11_12bit_table_x86_avx2, cpu_supports_avx2 }, + { 9, "exact_u32_generic", "generic", starch_magnitude_sc16q11_exact_u32_generic, NULL }, + { 10, "11bit_table_generic", "generic", starch_magnitude_sc16q11_11bit_table_generic, NULL }, + { 11, "12bit_table_generic", "generic", starch_magnitude_sc16q11_12bit_table_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for 
mean_power_u16 */ + +starch_mean_power_u16_regentry * starch_mean_power_u16_select() { + for (starch_mean_power_u16_regentry *entry = starch_mean_power_u16_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_mean_power_u16_dispatch ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ) { + starch_mean_power_u16_regentry *entry = starch_mean_power_u16_select(); + if (!entry) + abort(); + + starch_mean_power_u16 = entry->callable; + starch_mean_power_u16 ( arg0, arg1, arg2, arg3 ); +} + +starch_mean_power_u16_ptr starch_mean_power_u16 = starch_mean_power_u16_dispatch; + +void starch_mean_power_u16_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_mean_power_u16_regentry *entry; + for (entry = starch_mean_power_u16_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_mean_power_u16_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_mean_power_u16_registry, entry - starch_mean_power_u16_registry, sizeof(starch_mean_power_u16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_mean_power_u16 = starch_mean_power_u16_dispatch; +} + +starch_mean_power_u16_regentry starch_mean_power_u16_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 1, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL 
}, + { 2, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "u64_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u64_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "neon_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_neon_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 6, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "u32_x86_avx2", "x86_avx2", starch_mean_power_u16_u32_x86_avx2, cpu_supports_avx2 }, + { 1, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 2, "float_x86_avx2", "x86_avx2", starch_mean_power_u16_float_x86_avx2, cpu_supports_avx2 }, + { 3, "u64_x86_avx2", "x86_avx2", starch_mean_power_u16_u64_x86_avx2, cpu_supports_avx2 }, + { 4, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 5, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for mean_power_u16_aligned */ + +starch_mean_power_u16_aligned_regentry * starch_mean_power_u16_aligned_select() { + for (starch_mean_power_u16_aligned_regentry *entry = starch_mean_power_u16_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_mean_power_u16_aligned_dispatch ( const uint16_t * 
arg0, unsigned arg1, double * arg2, double * arg3 ) { + starch_mean_power_u16_aligned_regentry *entry = starch_mean_power_u16_aligned_select(); + if (!entry) + abort(); + + starch_mean_power_u16_aligned = entry->callable; + starch_mean_power_u16_aligned ( arg0, arg1, arg2, arg3 ); +} + +starch_mean_power_u16_aligned_ptr starch_mean_power_u16_aligned = starch_mean_power_u16_aligned_dispatch; + +void starch_mean_power_u16_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_mean_power_u16_aligned_regentry *entry; + for (entry = starch_mean_power_u16_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_mean_power_u16_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_mean_power_u16_aligned_registry, entry - starch_mean_power_u16_aligned_registry, sizeof(starch_mean_power_u16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_mean_power_u16_aligned = starch_mean_power_u16_aligned_dispatch; +} + +starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 1, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 2, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "u32_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", 
starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 1, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, + { 2, "float_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 3, "u64_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 4, "neon_float_armv7a_neon_vfpv4_aligned", "armv7a_neon_vfpv4", starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 5, "float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 6, "u32_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u32_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 7, "u64_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_u64_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 8, "neon_float_armv7a_neon_vfpv4", "armv7a_neon_vfpv4", starch_mean_power_u16_neon_float_armv7a_neon_vfpv4, cpu_supports_armv7_neon_vfpv4 }, + { 9, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 10, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86 + { 0, "u32_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_u32_x86_avx2, cpu_supports_avx2 }, + { 1, "u32_generic", "generic", starch_mean_power_u16_u32_generic, NULL }, + { 2, "float_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_float_x86_avx2, cpu_supports_avx2 }, + { 3, "u64_x86_avx2_aligned", "x86_avx2", starch_mean_power_u16_aligned_u64_x86_avx2, cpu_supports_avx2 }, + { 4, "float_x86_avx2", "x86_avx2", starch_mean_power_u16_float_x86_avx2, cpu_supports_avx2 }, + { 5, "u32_x86_avx2", "x86_avx2", starch_mean_power_u16_u32_x86_avx2, cpu_supports_avx2 }, + { 6, "u64_x86_avx2", 
"x86_avx2", starch_mean_power_u16_u64_x86_avx2, cpu_supports_avx2 }, + { 7, "float_generic", "generic", starch_mean_power_u16_float_generic, NULL }, + { 8, "u64_generic", "generic", starch_mean_power_u16_u64_generic, NULL }, +#endif /* STARCH_MIX_X86 */ + { 0, NULL, NULL, NULL, NULL } +}; + + +int starch_read_wisdom (const char * path) +{ + FILE *fp = fopen(path, "r"); + if (!fp) + return -1; + + /* reset all ranks to identify entries not listed in the wisdom file; we'll assign ranks at the end to produce a stable sort */ + int rank_magnitude_uc8 = 0; + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_uc8_aligned = 0; + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_power_uc8 = 0; + for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_power_uc8_aligned = 0; + for (starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_sc16 = 0; + for (starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_sc16_aligned = 0; + for (starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_sc16q11 = 0; + for (starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_magnitude_sc16q11_aligned = 0; + for (starch_magnitude_sc16q11_aligned_regentry *entry = starch_magnitude_sc16q11_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_mean_power_u16 = 0; + for (starch_mean_power_u16_regentry *entry = 
starch_mean_power_u16_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_mean_power_u16_aligned = 0; + for (starch_mean_power_u16_aligned_regentry *entry = starch_mean_power_u16_aligned_registry; entry->name; ++entry) { + entry->rank = 0; + } + + char linebuf[512]; + while (fgets(linebuf, sizeof(linebuf), fp)) { + /* split name and impl on whitespace, handle comments etc; bytes are cast to unsigned char because passing a negative char to isspace() is undefined */ + char *name = linebuf; + while (*name && isspace((unsigned char)*name)) + ++name; + + if (!*name || *name == '#') + continue; + + char *end = name; + while (*end && !isspace((unsigned char)*end)) + ++end; + + if (!*end) + continue; + *end = 0; + + char *impl = end + 1; + while (*impl && isspace((unsigned char)*impl)) + ++impl; + + if (!*impl) + continue; + + end = impl; + while (*end && !isspace((unsigned char)*end)) + ++end; + + *end = 0; + + /* try to find a matching registry entry */ + if (!strcmp(name, "magnitude_uc8")) { + for (starch_magnitude_uc8_regentry *entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_uc8; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_uc8_aligned")) { + for (starch_magnitude_uc8_aligned_regentry *entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_uc8_aligned; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_power_uc8")) { + for (starch_magnitude_power_uc8_regentry *entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_power_uc8; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_power_uc8_aligned")) { + for (starch_magnitude_power_uc8_aligned_regentry *entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_power_uc8_aligned; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_sc16")) { + for 
(starch_magnitude_sc16_regentry *entry = starch_magnitude_sc16_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_sc16; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_sc16_aligned")) { + for (starch_magnitude_sc16_aligned_regentry *entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_sc16_aligned; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_sc16q11")) { + for (starch_magnitude_sc16q11_regentry *entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_sc16q11; + break; + } + } + continue; + } + if (!strcmp(name, "magnitude_sc16q11_aligned")) { + for (starch_magnitude_sc16q11_aligned_regentry *entry = starch_magnitude_sc16q11_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_magnitude_sc16q11_aligned; + break; + } + } + continue; + } + if (!strcmp(name, "mean_power_u16")) { + for (starch_mean_power_u16_regentry *entry = starch_mean_power_u16_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_mean_power_u16; + break; + } + } + continue; + } + if (!strcmp(name, "mean_power_u16_aligned")) { + for (starch_mean_power_u16_aligned_regentry *entry = starch_mean_power_u16_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_mean_power_u16_aligned; + break; + } + } + continue; + } + } + + if (ferror(fp)) { + fclose(fp); + return -1; + } + + fclose(fp); + + /* assign ranks to unmatched items to (stable) sort them last; re-sort everything */ + { + starch_magnitude_uc8_regentry *entry; + for (entry = starch_magnitude_uc8_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_uc8; + } + qsort(starch_magnitude_uc8_registry, entry - 
starch_magnitude_uc8_registry, sizeof(starch_magnitude_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8 = starch_magnitude_uc8_dispatch; + } + { + starch_magnitude_uc8_aligned_regentry *entry; + for (entry = starch_magnitude_uc8_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_uc8_aligned; + } + qsort(starch_magnitude_uc8_aligned_registry, entry - starch_magnitude_uc8_aligned_registry, sizeof(starch_magnitude_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_uc8_aligned = starch_magnitude_uc8_aligned_dispatch; + } + { + starch_magnitude_power_uc8_regentry *entry; + for (entry = starch_magnitude_power_uc8_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_power_uc8; + } + qsort(starch_magnitude_power_uc8_registry, entry - starch_magnitude_power_uc8_registry, sizeof(starch_magnitude_power_uc8_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_power_uc8 = starch_magnitude_power_uc8_dispatch; + } + { + starch_magnitude_power_uc8_aligned_regentry *entry; + for (entry = starch_magnitude_power_uc8_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_power_uc8_aligned; + } + qsort(starch_magnitude_power_uc8_aligned_registry, entry - starch_magnitude_power_uc8_aligned_registry, sizeof(starch_magnitude_power_uc8_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_power_uc8_aligned = starch_magnitude_power_uc8_aligned_dispatch; + } + { + starch_magnitude_sc16_regentry *entry; + for (entry = starch_magnitude_sc16_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = 
++rank_magnitude_sc16; + } + qsort(starch_magnitude_sc16_registry, entry - starch_magnitude_sc16_registry, sizeof(starch_magnitude_sc16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16 = starch_magnitude_sc16_dispatch; + } + { + starch_magnitude_sc16_aligned_regentry *entry; + for (entry = starch_magnitude_sc16_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_sc16_aligned; + } + qsort(starch_magnitude_sc16_aligned_registry, entry - starch_magnitude_sc16_aligned_registry, sizeof(starch_magnitude_sc16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16_aligned = starch_magnitude_sc16_aligned_dispatch; + } + { + starch_magnitude_sc16q11_regentry *entry; + for (entry = starch_magnitude_sc16q11_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_sc16q11; + } + qsort(starch_magnitude_sc16q11_registry, entry - starch_magnitude_sc16q11_registry, sizeof(starch_magnitude_sc16q11_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16q11 = starch_magnitude_sc16q11_dispatch; + } + { + starch_magnitude_sc16q11_aligned_regentry *entry; + for (entry = starch_magnitude_sc16q11_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_magnitude_sc16q11_aligned; + } + qsort(starch_magnitude_sc16q11_aligned_registry, entry - starch_magnitude_sc16q11_aligned_registry, sizeof(starch_magnitude_sc16q11_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_magnitude_sc16q11_aligned = starch_magnitude_sc16q11_aligned_dispatch; + } + { + starch_mean_power_u16_regentry *entry; + for (entry = starch_mean_power_u16_registry; entry->name; 
++entry) { + if (!entry->rank) + entry->rank = ++rank_mean_power_u16; + } + qsort(starch_mean_power_u16_registry, entry - starch_mean_power_u16_registry, sizeof(starch_mean_power_u16_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_mean_power_u16 = starch_mean_power_u16_dispatch; + } + { + starch_mean_power_u16_aligned_regentry *entry; + for (entry = starch_mean_power_u16_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_mean_power_u16_aligned; + } + qsort(starch_mean_power_u16_aligned_registry, entry - starch_mean_power_u16_aligned_registry, sizeof(starch_mean_power_u16_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_mean_power_u16_aligned = starch_mean_power_u16_aligned_dispatch; + } + + return 0; +} diff --git a/dsp/generated/flavor.armv7a_neon_vfpv4.c b/dsp/generated/flavor.armv7a_neon_vfpv4.c new file mode 100644 index 0000000..acb84e7 --- /dev/null +++ b/dsp/generated/flavor.armv7a_neon_vfpv4.c @@ -0,0 +1,41 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_ARMV7A_NEON_VFPV4 +#define STARCH_FEATURE_NEON + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv7a_neon_vfpv4 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_neon_vfpv4 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv7a_neon_vfpv4 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_neon_vfpv4 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + diff --git a/dsp/generated/flavor.generic.c b/dsp/generated/flavor.generic.c new file mode 100644 index 0000000..d869946 --- /dev/null +++ b/dsp/generated/flavor.generic.c @@ -0,0 +1,21 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_GENERIC + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## generic +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## generic +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + diff --git a/dsp/generated/flavor.x86_avx2.c b/dsp/generated/flavor.x86_avx2.c new file mode 100644 index 0000000..5b9f88e --- /dev/null +++ b/dsp/generated/flavor.x86_avx2.c @@ -0,0 +1,40 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_X86_AVX2 + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## x86_avx2 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_avx2 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include "../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## x86_avx2 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_avx2 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/mean_power_u16.c" +#include 
"../impl/magnitude_power_uc8.c" +#include "../impl/magnitude_uc8.c" +#include "../impl/magnitude_sc16q11.c" +#include "../impl/magnitude_sc16.c" + diff --git a/dsp/generated/makefile.arm b/dsp/generated/makefile.arm new file mode 100644 index 0000000..58eaf5b --- /dev/null +++ b/dsp/generated/makefile.arm @@ -0,0 +1,39 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_ARM + + +dsp/generated/flavor.armv7a_neon_vfpv4.o: dsp/generated/flavor.armv7a_neon_vfpv4.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math dsp/generated/flavor.armv7a_neon_vfpv4.c -o dsp/generated/flavor.armv7a_neon_vfpv4.o + +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c 
dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o + +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o + +STARCH_OBJS := dsp/generated/flavor.armv7a_neon_vfpv4.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o + + +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o + +STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.generic b/dsp/generated/makefile.generic new file mode 100644 index 0000000..7f261d9 --- /dev/null +++ b/dsp/generated/makefile.generic @@ -0,0 +1,36 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. 
The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_GENERIC + + +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o + +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o + +STARCH_OBJS := dsp/generated/flavor.generic.o dsp/generated/dispatcher.o + + +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o + +STARCH_BENCHMARK_OBJ := 
dsp/generated/benchmark.o diff --git a/dsp/generated/makefile.x86 b/dsp/generated/makefile.x86 new file mode 100644 index 0000000..e88d3e1 --- /dev/null +++ b/dsp/generated/makefile.x86 @@ -0,0 +1,39 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_X86 + + +dsp/generated/flavor.x86_avx2.o: dsp/generated/flavor.x86_avx2.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math dsp/generated/flavor.x86_avx2.c -o dsp/generated/flavor.x86_avx2.o + +dsp/generated/flavor.generic.o: dsp/generated/flavor.generic.c dsp/impl/mean_power_u16.c dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/flavor.generic.c -o dsp/generated/flavor.generic.o + +dsp/generated/dispatcher.o: dsp/generated/dispatcher.c dsp/impl/mean_power_u16.c 
dsp/impl/magnitude_power_uc8.c dsp/impl/magnitude_uc8.c dsp/impl/magnitude_sc16q11.c dsp/impl/magnitude_sc16.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/dispatcher.c -o dsp/generated/dispatcher.o + +STARCH_OBJS := dsp/generated/flavor.x86_avx2.o dsp/generated/flavor.generic.o dsp/generated/dispatcher.o + + +dsp/generated/benchmark.o: dsp/generated/benchmark.c dsp/benchmark/magnitude_sc16_benchmark.c dsp/benchmark/magnitude_uc8_benchmark.c dsp/benchmark/magnitude_power_uc8_benchmark.c dsp/benchmark/mean_power_u16_benchmark.c dsp/benchmark/magnitude_sc16q11_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) dsp/generated/benchmark.c -o dsp/generated/benchmark.o + +STARCH_BENCHMARK_OBJ := dsp/generated/benchmark.o diff --git a/dsp/generated/starch.h b/dsp/generated/starch.h new file mode 100644 index 0000000..063ac04 --- /dev/null +++ b/dsp/generated/starch.h @@ -0,0 +1,294 @@ + +/* starch generated code. Do not edit. */ + +#include "dsp-types.h" +#include "cpu.h" + +/* mixes */ + +/* Generic build, compiler defaults only */ +#ifdef STARCH_MIX_GENERIC +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 1 +#endif /* STARCH_MIX_GENERIC */ + +/* ARM */ +#ifdef STARCH_MIX_ARM +#define STARCH_FLAVOR_ARMV7A_NEON_VFPV4 +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 16 +#endif /* STARCH_MIX_ARM */ + +/* x64 */ +#ifdef STARCH_MIX_X86 +#define STARCH_FLAVOR_X86_AVX2 +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 32 +#endif /* STARCH_MIX_X86 */ + + +#ifdef STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_IS_ALIGNED(_ptr) (((uintptr_t)(_ptr) & (STARCH_MIX_ALIGNMENT-1)) == 0) +#else +/* mix not defined, alignment is unknown, treat everything as unaligned */ +#define STARCH_IS_ALIGNED(_ptr) (0) +#endif + + +/* entry points and registries */ + +typedef void (* starch_magnitude_uc8_ptr) ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_uc8_ptr starch_magnitude_uc8; + +typedef 
struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_uc8_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_uc8_regentry; + +extern starch_magnitude_uc8_regentry starch_magnitude_uc8_registry[]; +starch_magnitude_uc8_regentry * starch_magnitude_uc8_select(); +void starch_magnitude_uc8_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_uc8_aligned_ptr) ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_uc8_aligned_ptr starch_magnitude_uc8_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_uc8_aligned_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_uc8_aligned_regentry; + +extern starch_magnitude_uc8_aligned_regentry starch_magnitude_uc8_aligned_registry[]; +starch_magnitude_uc8_aligned_regentry * starch_magnitude_uc8_aligned_select(); +void starch_magnitude_uc8_aligned_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_power_uc8_ptr) ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +extern starch_magnitude_power_uc8_ptr starch_magnitude_power_uc8; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_power_uc8_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_power_uc8_regentry; + +extern starch_magnitude_power_uc8_regentry starch_magnitude_power_uc8_registry[]; +starch_magnitude_power_uc8_regentry * starch_magnitude_power_uc8_select(); +void starch_magnitude_power_uc8_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_power_uc8_aligned_ptr) ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +extern starch_magnitude_power_uc8_aligned_ptr starch_magnitude_power_uc8_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_power_uc8_aligned_ptr callable; + int 
(*flavor_supported)(); +} starch_magnitude_power_uc8_aligned_regentry; + +extern starch_magnitude_power_uc8_aligned_regentry starch_magnitude_power_uc8_aligned_registry[]; +starch_magnitude_power_uc8_aligned_regentry * starch_magnitude_power_uc8_aligned_select(); +void starch_magnitude_power_uc8_aligned_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_sc16_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_sc16_ptr starch_magnitude_sc16; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_sc16_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_sc16_regentry; + +extern starch_magnitude_sc16_regentry starch_magnitude_sc16_registry[]; +starch_magnitude_sc16_regentry * starch_magnitude_sc16_select(); +void starch_magnitude_sc16_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_sc16_aligned_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_sc16_aligned_ptr starch_magnitude_sc16_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_sc16_aligned_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_sc16_aligned_regentry; + +extern starch_magnitude_sc16_aligned_regentry starch_magnitude_sc16_aligned_registry[]; +starch_magnitude_sc16_aligned_regentry * starch_magnitude_sc16_aligned_select(); +void starch_magnitude_sc16_aligned_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_sc16q11_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_sc16q11_ptr starch_magnitude_sc16q11; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_sc16q11_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_sc16q11_regentry; + +extern starch_magnitude_sc16q11_regentry starch_magnitude_sc16q11_registry[]; 
+starch_magnitude_sc16q11_regentry * starch_magnitude_sc16q11_select(); +void starch_magnitude_sc16q11_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_magnitude_sc16q11_aligned_ptr) ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +extern starch_magnitude_sc16q11_aligned_ptr starch_magnitude_sc16q11_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_magnitude_sc16q11_aligned_ptr callable; + int (*flavor_supported)(); +} starch_magnitude_sc16q11_aligned_regentry; + +extern starch_magnitude_sc16q11_aligned_regentry starch_magnitude_sc16q11_aligned_registry[]; +starch_magnitude_sc16q11_aligned_regentry * starch_magnitude_sc16q11_aligned_select(); +void starch_magnitude_sc16q11_aligned_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_mean_power_u16_ptr) ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +extern starch_mean_power_u16_ptr starch_mean_power_u16; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_mean_power_u16_ptr callable; + int (*flavor_supported)(); +} starch_mean_power_u16_regentry; + +extern starch_mean_power_u16_regentry starch_mean_power_u16_registry[]; +starch_mean_power_u16_regentry * starch_mean_power_u16_select(); +void starch_mean_power_u16_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_mean_power_u16_aligned_ptr) ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +extern starch_mean_power_u16_aligned_ptr starch_mean_power_u16_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_mean_power_u16_aligned_ptr callable; + int (*flavor_supported)(); +} starch_mean_power_u16_aligned_regentry; + +extern starch_mean_power_u16_aligned_regentry starch_mean_power_u16_aligned_registry[]; +starch_mean_power_u16_aligned_regentry * starch_mean_power_u16_aligned_select(); +void 
starch_mean_power_u16_aligned_set_wisdom( const char * const * received_wisdom ); + +/* flavors and prototypes */ + +#ifdef STARCH_FLAVOR_GENERIC +void starch_mean_power_u16_float_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_generic ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_power_uc8_twopass_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_uc8_lookup_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_generic ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_11bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_12bit_table_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_generic ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +#endif /* STARCH_FLAVOR_GENERIC */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 +int cpu_supports_armv7_neon_vfpv4 
(void); +void starch_mean_power_u16_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_neon_float_armv7a_neon_vfpv4 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_power_uc8_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_twopass_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void 
starch_magnitude_power_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_uc8_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_11bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_11bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void 
starch_magnitude_sc16q11_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_12bit_table_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_neon_vrsqrte_armv7a_neon_vfpv4 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +#endif /* STARCH_FLAVOR_ARMV7A_NEON_VFPV4 */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_X86_AVX2 +int cpu_supports_avx2 (void); +void starch_mean_power_u16_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_float_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u32_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_u64_x86_avx2 ( const uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_mean_power_u16_aligned_u64_x86_avx2 ( const 
uint16_t * arg0, unsigned arg1, double * arg2, double * arg3 ); +void starch_magnitude_power_uc8_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_twopass_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_power_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2, double * arg3, double * arg4 ); +void starch_magnitude_uc8_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_lookup_unroll_4_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_uc8_aligned_exact_x86_avx2 ( const uc8_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); 
+void starch_magnitude_sc16q11_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_11bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16q11_aligned_12bit_table_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_u32_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +void starch_magnitude_sc16_aligned_exact_float_x86_avx2 ( const sc16_t * arg0, uint16_t * arg1, unsigned arg2 ); +#endif /* STARCH_FLAVOR_X86_AVX2 */ + +int starch_read_wisdom (const char * path); + diff --git a/dsp/helpers/tables.c b/dsp/helpers/tables.c new file mode 100644 index 0000000..fe0bc8f --- /dev/null +++ b/dsp/helpers/tables.c @@ -0,0 +1,105 @@ +#include +#include +#include +#include + +#include "dsp-types.h" +#include "dsp/helpers/tables.h" + +const uint16_t * get_uc8_mag_table() +{ + static uint16_t *table = NULL; + + if (!table) { + table = malloc(sizeof(uint16_t) * 256 * 256); + if (!table) { + fprintf(stderr, "can't allocate UC8 conversion lookup table\n"); + abort(); + } + + for (int i = 0; i <= 255; i++) { + for (int q = 0; q <= 255; q++) { + float fI, fQ, magsq; + + fI = (i - 127.4) / 128; + fQ = (q - 127.4) / 128; + magsq = fI * fI + fQ * fQ; + + float mag = round(sqrtf(magsq) * 65536.0f); + if (mag > 65535) + mag = 65535; + + uc8_u16_t u; + u.uc8.I = i; + u.uc8.Q = q; + table[u.u16] = mag; + } + } + } + + return table; +} + +const uint16_t * get_sc16q11_mag_11bit_table() +{ + static uint16_t *table = NULL; + + if (!table) { + table = malloc(sizeof(uint16_t) * 2048 * 2048); + if 
(!table) { + fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n"); + abort(); + } + + for (int i = 0; i <= 2047; i++) { + for (int q = 0; q <= 2047; q++) { + float fI, fQ, magsq; + + fI = i / 2048.0; + fQ = q / 2048.0; + magsq = fI * fI + fQ * fQ; + + float mag = round(sqrtf(magsq) * 65536.0f); + if (mag > 65535) + mag = 65535; + + table[(q << 11) | i] = mag; + } + } + } + + return table; +} + +const uint16_t * get_sc16q11_mag_12bit_table() +{ + static uint16_t *table = NULL; + + if (!table) { + table = malloc(sizeof(uint16_t) * 4096 * 4096); + if (!table) { + fprintf(stderr, "can't allocate SC16Q11 conversion lookup table\n"); + abort(); + } + + for (int i = -2048; i <= 2047; i++) { + for (int q = -2048; q <= 2047; q++) { + float fI, fQ, magsq; + + fI = fabs(i) / 2048.0; + fQ = fabs(q) / 2048.0; + magsq = fI * fI + fQ * fQ; + + float mag = round(sqrtf(magsq) * 65536.0f); + if (mag > 65535) + mag = 65535; + + unsigned index = ((i & 4095) << 12) | (q & 4095); + table[index] = mag; + } + } + } + + return table; +} + diff --git a/dsp/helpers/tables.h b/dsp/helpers/tables.h new file mode 100644 index 0000000..cfb86d3 --- /dev/null +++ b/dsp/helpers/tables.h @@ -0,0 +1,10 @@ +#ifndef DSP_TABLES_H +#define DSP_TABLES_H + +#include + +const uint16_t * get_uc8_mag_table(); +const uint16_t * get_sc16q11_mag_11bit_table(); +const uint16_t * get_sc16q11_mag_12bit_table(); + +#endif diff --git a/dsp/impl/magnitude_power_uc8.c b/dsp/impl/magnitude_power_uc8.c new file mode 100644 index 0000000..eca5988 --- /dev/null +++ b/dsp/impl/magnitude_power_uc8.c @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include +#include + +#include "dsp/helpers/tables.h" + +/* Convert UC8 values to unsigned 16-bit magnitudes */ + +void STARCH_IMPL(magnitude_power_uc8, twopass) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power) +{ +#if STARCH_ALIGNMENT > 1 + starch_magnitude_uc8_aligned(in, out, len); + starch_mean_power_u16_aligned(out, 
len, out_level, out_power);
+#else
+    starch_magnitude_uc8(in, out, len);
+    starch_mean_power_u16(out, len, out_level, out_power);
+#endif
+}
+
+void STARCH_IMPL(magnitude_power_uc8, lookup) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power)
+{
+    const uint16_t * const restrict mag_table = get_uc8_mag_table();
+
+    const uc8_u16_t * restrict in_align = (const uc8_u16_t *) STARCH_ALIGNED(in);
+    uint16_t * restrict out_align = STARCH_ALIGNED(out);
+
+    uint64_t sum_level = 0; /* running sum of the 16-bit magnitudes written to out */
+    uint64_t sum_power = 0; /* running sum of the squared magnitudes */
+
+    unsigned len1 = len;
+    while (len1--) {
+        uint16_t mag = mag_table[in_align[0].u16]; /* one table lookup per I/Q pair, indexed by the pair viewed as a u16 */
+        out_align[0] = mag;
+        sum_level += mag;
+        sum_power += (uint32_t)mag * mag; /* widen first: 65535 * 65535 does not fit in a signed int */
+
+        out_align += 1;
+        in_align += 1;
+    }
+
+    *out_level = sum_level / 65536.0 / len; /* mean magnitude, rescaled by 1/65536 */
+    *out_power = sum_power / 65536.0 / 65536.0 / len; /* mean squared magnitude, rescaled by 1/65536^2 */
+}
+
+void STARCH_IMPL(magnitude_power_uc8, lookup_unroll_4) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power)
+{
+    const uint16_t * const restrict mag_table = get_uc8_mag_table();
+
+    const uc8_u16_t * restrict in_align = (const uc8_u16_t *) STARCH_ALIGNED(in);
+    uint16_t * restrict out_align = STARCH_ALIGNED(out);
+
+    uint64_t sum_level = 0; /* running sum of the 16-bit magnitudes written to out */
+    uint64_t sum_power = 0; /* running sum of the squared magnitudes */
+
+    unsigned len4 = len >> 2; /* number of full groups of four samples */
+    unsigned len1 = len & 3; /* leftover samples handled one at a time below */
+
+    while (len4--) {
+        uint16_t mag0 = mag_table[in_align[0].u16];
+        uint16_t mag1 = mag_table[in_align[1].u16];
+        uint16_t mag2 = mag_table[in_align[2].u16];
+        uint16_t mag3 = mag_table[in_align[3].u16];
+
+        out_align[0] = mag0;
+        out_align[1] = mag1;
+        out_align[2] = mag2;
+        out_align[3] = mag3;
+
+        sum_level = sum_level + mag0 + mag1 + mag2 + mag3;
+        sum_power = sum_power + (uint32_t)mag0 * mag0 + (uint32_t)mag1 * mag1 + (uint32_t)mag2 * mag2 + (uint32_t)mag3 * mag3;
+
+        out_align += 4;
+        in_align += 4;
+    }
+
+    while (len1--) {
+        uint16_t mag = mag_table[in_align[0].u16];
+
+        out_align[0] = mag;
+
+        sum_level = sum_level + mag;
+        sum_power = sum_power + (uint32_t)mag * mag; /* cast as in the unrolled loop: uint16_t * uint16_t is a signed int product and overflows for mag >= 46341 */
+
+        out_align += 1;
+ in_align += 1; + } + + *out_level = sum_level / 65536.0 / len; + *out_power = sum_power / 65536.0 / 65536.0 / len; +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(magnitude_power_uc8, neon_vrsqrte, STARCH_FEATURE_NEON) (const uc8_t *in, uint16_t *out, unsigned len, double *out_level, double *out_power) +{ + const uint8_t * restrict in_align = (const uint8_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + const uint16x8_t offset = vdupq_n_u16((uint16_t) (127.4 * 256)); + const float32x4_t almost_one = vdupq_n_f32(65535.0 / 65536.0); + + float32x4_t sum_level = vdupq_n_f32(0); + float32x4_t sum_power = vdupq_n_f32(0); + + unsigned len8 = len >> 3; + while (len8--) { + uint8x8x2_t iq = vld2_u8(in_align); + + // widen to 16 bits, convert to signed + uint16x8_t i_u16 = vshll_n_u8(iq.val[0], 8); + uint16x8_t q_u16 = vshll_n_u8(iq.val[1], 8); + int16x8_t i_s16 = vreinterpretq_s16_u16(vsubq_u16(i_u16, offset)); + int16x8_t q_s16 = vreinterpretq_s16_u16(vsubq_u16(q_u16, offset)); + + // low half + int16x4_t i_s16_low = vget_low_s16(i_s16); + int16x4_t q_s16_low = vget_low_s16(q_s16); + uint32x4_t isq_low = vreinterpretq_u32_s32(vmull_s16(i_s16_low, i_s16_low)); + uint32x4_t qsq_low = vreinterpretq_u32_s32(vmull_s16(q_s16_low, q_s16_low)); + uint32x4_t magsq_low = vqaddq_u32(isq_low, qsq_low); + float32x4_t magsq_f32_low = vcvtq_n_f32_u32(magsq_low, 30); /* input values are Q15, magsq is Q30 */ + float32x4_t mag_f32_low = vmulq_f32(magsq_f32_low, vrsqrteq_f32(magsq_f32_low)); + uint16x4_t mag_u16_low = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_low, 16)); + + sum_level = vaddq_f32(sum_level, vminq_f32(mag_f32_low, almost_one)); + sum_power = vaddq_f32(sum_power, vminq_f32(magsq_f32_low, almost_one)); + + // high half + int16x4_t i_s16_high = vget_high_s16(i_s16); + int16x4_t q_s16_high = vget_high_s16(q_s16); + uint32x4_t isq_high = vreinterpretq_u32_s32(vmull_s16(i_s16_high, i_s16_high)); + uint32x4_t qsq_high = 
vreinterpretq_u32_s32(vmull_s16(q_s16_high, q_s16_high));
+        uint32x4_t magsq_high = vqaddq_u32(isq_high, qsq_high);
+        float32x4_t magsq_f32_high = vcvtq_n_f32_u32(magsq_high, 30);
+        float32x4_t mag_f32_high = vmulq_f32(magsq_f32_high, vrsqrteq_f32(magsq_f32_high));
+        uint16x4_t mag_u16_high = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_high, 16));
+
+        sum_level = vaddq_f32(sum_level, vminq_f32(mag_f32_high, almost_one));
+        sum_power = vaddq_f32(sum_power, vminq_f32(magsq_f32_high, almost_one));
+
+        // store
+        uint16x8_t result = vcombine_u16(mag_u16_low, mag_u16_high);
+        vst1q_u16(out_align, result);
+
+        in_align += 16;
+        out_align += 8;
+    }
+
+    const int16x8_t lane0_mask = { -1, 0, 0, 0, 0, 0, 0, 0 }; /* all 16 bits of lane 0 set; the previous 0xFF kept only the low byte of the sample */
+
+    unsigned len1 = len & 7;
+    while (len1--) {
+        uint8x8x2_t iq = vld2_dup_u8(in_align);
+
+        // widen to 16 bits, convert to signed
+        uint16x8_t i_u16 = vshll_n_u8(iq.val[0], 8);
+        uint16x8_t q_u16 = vshll_n_u8(iq.val[1], 8);
+        int16x8_t i_s16 = vreinterpretq_s16_u16(vsubq_u16(i_u16, offset));
+        int16x8_t q_s16 = vreinterpretq_s16_u16(vsubq_u16(q_u16, offset));
+
+        // mask so only lane 0 has a non-zero value
+        // (important for sum_level / sum_power later)
+        i_s16 = vandq_s16(i_s16, lane0_mask);
+        q_s16 = vandq_s16(q_s16, lane0_mask);
+
+        // low half (don't care about high half)
+        int16x4_t i_s16_low = vget_low_s16(i_s16);
+        int16x4_t q_s16_low = vget_low_s16(q_s16);
+        uint32x4_t isq_low = vreinterpretq_u32_s32(vmull_s16(i_s16_low, i_s16_low));
+        uint32x4_t qsq_low = vreinterpretq_u32_s32(vmull_s16(q_s16_low, q_s16_low));
+        uint32x4_t magsq_low = vqaddq_u32(isq_low, qsq_low);
+        float32x4_t magsq_f32_low = vcvtq_n_f32_u32(magsq_low, 30); /* input values are Q15, magsq is Q30 */
+        float32x4_t mag_f32_low = vmulq_f32(magsq_f32_low, vrsqrteq_f32(magsq_f32_low)); /* NOTE(review): the masked lanes evaluate rsqrt at 0 here - confirm they cannot put NaN into the sums */
+        uint16x4_t mag_u16_low = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_low, 16));
+
+        sum_level = vaddq_f32(sum_level, vminq_f32(mag_f32_low, almost_one));
+        sum_power = vaddq_f32(sum_power, vminq_f32(magsq_f32_low,
almost_one)); + + // store 1 lane only + vst1_lane_u16(out_align, mag_u16_low, 0); + + in_align += 2; + out_align += 1; + } + + // add sums across vector + float32x2_t sum2_level = vadd_f32(vget_low_f32(sum_level), vget_high_f32(sum_level)); + float32x2_t sum4_level = vpadd_f32(sum2_level, sum2_level); + *out_level = vget_lane_f32(sum4_level, 0) / len; + + float32x2_t sum2_power = vadd_f32(vget_low_f32(sum_power), vget_high_f32(sum_power)); + float32x2_t sum4_power = vpadd_f32(sum2_power, sum2_power); + *out_power = vget_lane_f32(sum4_power, 0) / len; +} + +#endif diff --git a/dsp/impl/magnitude_sc16.c b/dsp/impl/magnitude_sc16.c new file mode 100644 index 0000000..1f45bde --- /dev/null +++ b/dsp/impl/magnitude_sc16.c @@ -0,0 +1,100 @@ +#include +#include + +/* Convert (little-endian) SC16 values to unsigned 16-bit magnitudes */ + +void STARCH_IMPL(magnitude_sc16, exact_u32) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const sc16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + uint32_t I = abs((int16_t) le16toh(in_align[0].I)); + uint32_t Q = abs((int16_t) le16toh(in_align[0].Q)); + + uint32_t magsq = I * I + Q * Q; + float mag = sqrtf(magsq) * 2; + if (mag > 65535.0) + mag = 65535.0; + out_align[0] = (uint16_t)mag; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_sc16, exact_float) (const sc16_t *in, uint16_t *out, unsigned len) +{ + const sc16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + float I = abs((int16_t) le16toh(in_align[0].I)) * 2; + float Q = abs((int16_t) le16toh(in_align[0].Q)) * 2; + + float magsq = I * I + Q * Q; + float mag = sqrtf(magsq); + if (mag > 65535.0) + mag = 65535.0; + out_align[0] = (uint16_t)mag; + + out_align += 1; + in_align += 1; + } +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(magnitude_sc16, neon_vrsqrte, STARCH_FEATURE_NEON) 
(const sc16_t *in, uint16_t *out, unsigned len) +{ + const int16_t * restrict in_align = (const int16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + /* This uses NEON's floating-point reciprocal square root estimate (vrsqrte instruction). + * The estimate is accurate to about 9 bits of mantissa, which is good enough for our purposes. + */ + + unsigned len4 = len >> 2; + while (len4--) { + int16x4x2_t iq = vld2_s16(in_align); + int16x4_t i16 = iq.val[0]; /* Q15 */ + int16x4_t q16 = iq.val[1]; /* Q15 */ + + uint32x4_t isq = vreinterpretq_u32_s32(vmull_s16(i16, i16)); /* Q30, unsigned */ + uint32x4_t qsq = vreinterpretq_u32_s32(vmull_s16(q16, q16)); /* Q30, unsigned */ + uint32x4_t magsq = vqaddq_u32(isq, qsq); /* Q30, unsigned */ + + float32x4_t magsq_f32 = vcvtq_n_f32_u32(magsq, 30); + float32x4_t mag_f32 = vmulq_f32(magsq_f32, vrsqrteq_f32(magsq_f32)); /* sqrt(x) = x * (1/sqrt(x)) */ + uint16x4_t mag_u16 = vqmovn_u32(vcvtq_n_u32_f32(mag_f32, 16)); + + vst1_u16(out_align, mag_u16); + + in_align += 8; + out_align += 4; + } + + unsigned len1 = len & 3; + while (len1--) { + int16x4x2_t iq = vld2_dup_s16(in_align); + int16x4_t i16 = iq.val[0]; + int16x4_t q16 = iq.val[1]; + + uint32x4_t isq = vreinterpretq_u32_s32(vmull_s16(i16, i16)); + uint32x4_t qsq = vreinterpretq_u32_s32(vmull_s16(q16, q16)); + uint32x4_t magsq = vqaddq_u32(isq, qsq); + + float32x4_t magsq_f32 = vcvtq_n_f32_u32(magsq, 30); + float32x4_t mag_f32 = vmulq_f32(magsq_f32, vrsqrteq_f32(magsq_f32)); + uint16x4_t mag_u16 = vqmovn_u32(vcvtq_n_u32_f32(mag_f32, 16)); + + vst1_lane_u16(out_align, mag_u16, 0); + + in_align += 2; + out_align += 1; + } +} + +#endif /* STARCH_FEATURE_NEON */ diff --git a/dsp/impl/magnitude_sc16q11.c b/dsp/impl/magnitude_sc16q11.c new file mode 100644 index 0000000..bd9a0f5 --- /dev/null +++ b/dsp/impl/magnitude_sc16q11.c @@ -0,0 +1,137 @@ +#include +#include + +#include "dsp/helpers/tables.h" + +/* Convert (little-endian) SC16 values with a 
range of -2048..+2047 to unsigned 16-bit magnitudes */
+
+void STARCH_IMPL(magnitude_sc16q11, exact_u32) (const sc16_t *in, uint16_t *out, unsigned len)
+{
+    const sc16_t * restrict in_align = STARCH_ALIGNED(in);
+    uint16_t * restrict out_align = STARCH_ALIGNED(out);
+
+    while (len--) {
+        uint32_t I = abs((int16_t) le16toh(in_align[0].I));
+        uint32_t Q = abs((int16_t) le16toh(in_align[0].Q));
+
+        uint32_t magsq = I * I + Q * Q; /* squared magnitude in integer arithmetic */
+        float mag = sqrtf(magsq) * 32; /* x32 after the square root, where exact_float scales before squaring */
+        if (mag > 65535.0)
+            mag = 65535.0; /* saturate at the largest uint16_t value */
+        out_align[0] = (uint16_t)mag;
+
+        out_align += 1;
+        in_align += 1;
+    }
+}
+
+void STARCH_IMPL(magnitude_sc16q11, exact_float) (const sc16_t *in, uint16_t *out, unsigned len)
+{
+    const sc16_t * restrict in_align = STARCH_ALIGNED(in);
+    uint16_t * restrict out_align = STARCH_ALIGNED(out);
+
+    while (len--) {
+        float I = abs((int16_t) le16toh(in_align[0].I)) * 32; /* scaled by 32 before squaring */
+        float Q = abs((int16_t) le16toh(in_align[0].Q)) * 32;
+
+        float magsq = I * I + Q * Q;
+        float mag = sqrtf(magsq);
+        if (mag > 65535.0)
+            mag = 65535.0; /* saturate at the largest uint16_t value */
+        out_align[0] = (uint16_t)mag;
+
+        out_align += 1;
+        in_align += 1;
+    }
+}
+
+void STARCH_IMPL(magnitude_sc16q11, 11bit_table) (const sc16_t *in, uint16_t *out, unsigned len)
+{
+    const uint16_t * restrict table = get_sc16q11_mag_11bit_table();
+    const sc16_t * restrict in_align = STARCH_ALIGNED(in);
+    uint16_t * restrict out_align = STARCH_ALIGNED(out);
+
+    while (len--) {
+        uint16_t I = abs((int16_t)le16toh(in_align[0].I));
+        if (I >= 2048)
+            I = 2047; /* clamp to the last table entry */
+        uint16_t Q = abs((int16_t)le16toh(in_align[0].Q));
+        if (Q >= 2048)
+            Q = 2047; /* clamp to the last table entry */
+        out_align[0] = table[(Q << 11) | I]; /* Q selects the upper 11 index bits, I the lower 11 */
+
+        in_align += 1;
+        out_align += 1;
+    }
+}
+
+void STARCH_IMPL(magnitude_sc16q11, 12bit_table) (const sc16_t *in, uint16_t *out, unsigned len)
+{
+    const uint16_t * restrict table = get_sc16q11_mag_12bit_table();
+    const sc16_t * restrict in_align = STARCH_ALIGNED(in);
+    uint16_t * restrict out_align = STARCH_ALIGNED(out);
+
+    while (len--) {
+        unsigned index = ((le16toh(in_align[0].I) & 4095) << 12) | (le16toh(in_align[0].Q) & 4095); /* convert from little-endian like the other variants in this file, then keep the low 12 bits of each component */
+        out_align[0] = table[index];
+
+        in_align += 1;
+        out_align += 1;
+    }
+}
+
+#ifdef STARCH_FEATURE_NEON
+
+#include <arm_neon.h>
+
+void STARCH_IMPL_REQUIRES(magnitude_sc16q11, neon_vrsqrte, STARCH_FEATURE_NEON) (const sc16_t *in, uint16_t *out, unsigned len)
+{
+    const int16_t * restrict in_align = (const int16_t *) STARCH_ALIGNED(in);
+    uint16_t * restrict out_align = STARCH_ALIGNED(out);
+
+    /* This uses NEON's floating-point reciprocal square root estimate instruction (vrsqrte).
+     * The estimate is accurate to about 9 bits of mantissa, which is good enough for our purposes.
+     */
+
+    unsigned len4 = len >> 2;
+    while (len4--) {
+        int16x4x2_t iq = vld2_s16(in_align);
+        int16x4_t i16 = iq.val[0]; /* Q11 */
+        int16x4_t q16 = iq.val[1]; /* Q11 */
+
+        uint32x4_t isq = vreinterpretq_u32_s32(vmull_s16(i16, i16)); /* Q22, unsigned */
+        uint32x4_t qsq = vreinterpretq_u32_s32(vmull_s16(q16, q16)); /* Q22, unsigned */
+        uint32x4_t magsq = vqaddq_u32(isq, qsq); /* Q22, unsigned */
+
+        float32x4_t magsq_f32 = vcvtq_n_f32_u32(magsq, 22);
+        float32x4_t mag_f32 = vmulq_f32(magsq_f32, vrsqrteq_f32(magsq_f32)); /* sqrt(x) = x * (1/sqrt(x)) */
+        uint16x4_t mag_u16 = vqmovn_u32(vcvtq_n_u32_f32(mag_f32, 16));
+
+        vst1_u16(out_align, mag_u16);
+
+        in_align += 8;
+        out_align += 4;
+    }
+
+    unsigned len1 = len & 3;
+    while (len1--) {
+        int16x4x2_t iq = vld2_dup_s16(in_align);
+        int16x4_t i16 = iq.val[0];
+        int16x4_t q16 = iq.val[1];
+
+        uint32x4_t isq = vreinterpretq_u32_s32(vmull_s16(i16, i16));
+        uint32x4_t qsq = vreinterpretq_u32_s32(vmull_s16(q16, q16));
+        uint32x4_t magsq = vqaddq_u32(isq, qsq);
+
+        float32x4_t magsq_f32 = vcvtq_n_f32_u32(magsq, 22);
+        float32x4_t mag_f32 = vmulq_f32(magsq_f32, vrsqrteq_f32(magsq_f32));
+        uint16x4_t mag_u16 = vqmovn_u32(vcvtq_n_u32_f32(mag_f32, 16));
+
+        vst1_lane_u16(out_align, mag_u16, 0); /* only lane 0 is written for a leftover sample */
+
+        in_align += 2;
+        out_align += 1;
+    }
+}
+
+#endif /* STARCH_FEATURE_NEON */
diff --git a/dsp/impl/magnitude_uc8.c
b/dsp/impl/magnitude_uc8.c new file mode 100644 index 0000000..71279c6 --- /dev/null +++ b/dsp/impl/magnitude_uc8.c @@ -0,0 +1,164 @@ +#include +#include +#include +#include +#include +#include + +#include "dsp/helpers/tables.h" + +/* Convert UC8 values to unsigned 16-bit magnitudes */ + +void STARCH_IMPL(magnitude_uc8, lookup) (const uc8_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * const mag_table = get_uc8_mag_table(); + + const uc8_u16_t * restrict in_align = (const uc8_u16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len1 = len; + while (len1--) { + uint16_t mag = mag_table[in_align[0].u16]; + out_align[0] = mag; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_uc8, lookup_unroll_4) (const uc8_t *in, uint16_t *out, unsigned len) +{ + const uint16_t * const mag_table = get_uc8_mag_table(); + + const uc8_u16_t * restrict in_align = (const uc8_u16_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len4 = len >> 2; + unsigned len1 = len & 3; + + while (len4--) { + uint16_t mag0 = mag_table[in_align[0].u16]; + uint16_t mag1 = mag_table[in_align[1].u16]; + uint16_t mag2 = mag_table[in_align[2].u16]; + uint16_t mag3 = mag_table[in_align[3].u16]; + + out_align[0] = mag0; + out_align[1] = mag1; + out_align[2] = mag2; + out_align[3] = mag3; + + out_align += 4; + in_align += 4; + } + + while (len1--) { + uint16_t mag = mag_table[in_align[0].u16]; + + out_align[0] = mag; + + out_align += 1; + in_align += 1; + } +} + +void STARCH_IMPL(magnitude_uc8, exact) (const uc8_t *in, uint16_t *out, unsigned len) +{ + const uc8_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len1 = len; + + while (len1--) { + float I = (in_align[0].I - 127.4); + float Q = (in_align[0].Q - 127.4); + + float magsq = I * I + Q * Q; + float mag = sqrtf(magsq) * 65536.0 / 128.0; + if (mag > 65535.0) + mag = 65535.0; + 
+ out_align[0] = (uint16_t)mag; + + in_align += 1; + out_align += 1; + } +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(magnitude_uc8, neon_vrsqrte, STARCH_FEATURE_NEON) (const uc8_t *in, uint16_t *out, unsigned len) +{ + const uint8_t * restrict in_align = (const uint8_t *) STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + const uint16x8_t offset = vdupq_n_u16((uint16_t) (127.4 * 256)); + + unsigned len8 = len >> 3; + while (len8--) { + uint8x8x2_t iq = vld2_u8(in_align); + + // widen to 16 bits, convert to signed + uint16x8_t i_u16 = vshll_n_u8(iq.val[0], 8); + uint16x8_t q_u16 = vshll_n_u8(iq.val[1], 8); + int16x8_t i_s16 = vreinterpretq_s16_u16(vsubq_u16(i_u16, offset)); + int16x8_t q_s16 = vreinterpretq_s16_u16(vsubq_u16(q_u16, offset)); + + // low half + int16x4_t i_s16_low = vget_low_s16(i_s16); + int16x4_t q_s16_low = vget_low_s16(q_s16); + uint32x4_t isq_low = vreinterpretq_u32_s32(vmull_s16(i_s16_low, i_s16_low)); + uint32x4_t qsq_low = vreinterpretq_u32_s32(vmull_s16(q_s16_low, q_s16_low)); + uint32x4_t magsq_low = vqaddq_u32(isq_low, qsq_low); + float32x4_t magsq_f32_low = vcvtq_n_f32_u32(magsq_low, 30); /* input values are Q15, magsq is Q30 */ + float32x4_t mag_f32_low = vmulq_f32(vrsqrteq_f32(magsq_f32_low), magsq_f32_low); /* sqrt(x) = x * (1/sqrt(x)) */ + uint16x4_t mag_u16_low = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_low, 16)); + + // high half + int16x4_t i_s16_high = vget_high_s16(i_s16); + int16x4_t q_s16_high = vget_high_s16(q_s16); + uint32x4_t isq_high = vreinterpretq_u32_s32(vmull_s16(i_s16_high, i_s16_high)); + uint32x4_t qsq_high = vreinterpretq_u32_s32(vmull_s16(q_s16_high, q_s16_high)); + uint32x4_t magsq_high = vqaddq_u32(isq_high, qsq_high); + float32x4_t magsq_f32_high = vcvtq_n_f32_u32(magsq_high, 30); + float32x4_t mag_f32_high = vmulq_f32(vrsqrteq_f32(magsq_f32_high), magsq_f32_high); + uint16x4_t mag_u16_high = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_high, 16)); + + // store + 
uint16x8_t result = vcombine_u16(mag_u16_low, mag_u16_high); + vst1q_u16(out_align, result); + + in_align += 16; + out_align += 8; + } + + unsigned len1 = len & 7; + while (len1--) { + uint8x8x2_t iq = vld2_dup_u8(in_align); + + // widen to 16 bits, convert to signed + uint16x8_t i_u16 = vshll_n_u8(iq.val[0], 8); + uint16x8_t q_u16 = vshll_n_u8(iq.val[1], 8); + int16x8_t i_s16 = vreinterpretq_s16_u16(vsubq_u16(i_u16, offset)); + int16x8_t q_s16 = vreinterpretq_s16_u16(vsubq_u16(q_u16, offset)); + + // low half (don't care about high half) + int16x4_t i_s16_low = vget_low_s16(i_s16); + int16x4_t q_s16_low = vget_low_s16(q_s16); + uint32x4_t isq_low = vreinterpretq_u32_s32(vmull_s16(i_s16_low, i_s16_low)); + uint32x4_t qsq_low = vreinterpretq_u32_s32(vmull_s16(q_s16_low, q_s16_low)); + uint32x4_t magsq_low = vqaddq_u32(isq_low, qsq_low); + float32x4_t magsq_f32_low = vcvtq_n_f32_u32(magsq_low, 30); /* input values are Q15, magsq is Q30 */ + float32x4_t mag_f32_low = vmulq_f32(vrsqrteq_f32(magsq_f32_low), magsq_f32_low); /* sqrt(x) = x * (1/sqrt(x)) */ + uint16x4_t mag_u16_low = vqmovn_u32(vcvtq_n_u32_f32(mag_f32_low, 16)); + + // store 1 lane only + vst1_lane_u16(out_align, mag_u16_low, 0); + + in_align += 2; + out_align += 1; + } +} + +#endif /* STARCH_FEATURE_NEON */ diff --git a/dsp/impl/mean_power_u16.c b/dsp/impl/mean_power_u16.c new file mode 100644 index 0000000..b236baa --- /dev/null +++ b/dsp/impl/mean_power_u16.c @@ -0,0 +1,122 @@ +/* + * Given a buffer of uint16_t Q16 magnitude values + * return the mean magnitude and mean squared magnitude + * (normalized to 0..1) + */ + +void STARCH_IMPL(mean_power_u16, float) (const uint16_t *in, unsigned len, double *out_mean_mag, double *out_mean_magsq) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + float sum = 0, sumsq = 0; + unsigned n = len; + while (n--) { + uint16_t mag = in_align[0]; + sum += mag; + sumsq += (uint32_t)mag * mag; + in_align += 1; + } + + *out_mean_mag = sum / len / 65536.0; + 
*out_mean_magsq = sumsq / len / 65536.0 / 65536.0; +} + +void STARCH_IMPL(mean_power_u16, u32) (const uint16_t *in, unsigned len, double *out_mean_mag, double *out_mean_magsq) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + double sum = 0, sumsq = 0; + + unsigned remaining = len; + while (remaining > 0) { + uint32_t sum32 = 0, sumsq32 = 0; + unsigned blocklen = (remaining > 65536 ? 65536 : remaining); + remaining -= blocklen; + + while (blocklen--) { + uint16_t mag = in_align[0]; + sum32 += mag; + sumsq32 += ((uint32_t)mag * mag) >> 16; + in_align += 1; + } + + sum += sum32; + sumsq += sumsq32; + } + + *out_mean_mag = (double)sum / len / 65536.0; + *out_mean_magsq = (double)sumsq / len / 65536.0; +} + +void STARCH_IMPL(mean_power_u16, u64) (const uint16_t *in, unsigned len, double *out_mean_mag, double *out_mean_magsq) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + uint64_t sum = 0, sumsq = 0; + unsigned n = len; + while (n--) { + uint16_t mag = in_align[0]; + sum += mag; + sumsq += (uint32_t)mag * mag; + in_align += 1; + } + + *out_mean_mag = (double)sum / len / 65536.0; + *out_mean_magsq = (double)sumsq / len / 65536.0 / 65536.0; +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(mean_power_u16, neon_float, STARCH_FEATURE_NEON) (const uint16_t *in, unsigned len, double *out_mean_mag, double *out_mean_magsq) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + + float32x4_t mag_sum_0 = vdupq_n_f32(0); + float32x4_t magsq_sum_0 = vdupq_n_f32(0); + float32x4_t mag_sum_1 = vdupq_n_f32(0); + float32x4_t magsq_sum_1 = vdupq_n_f32(0); + + unsigned len8 = len >> 3; + while (len8--) { + uint16x8_t mag_u16 = vld1q_u16(in_align); + uint16x4_t mag_u16_0 = vget_low_u16(mag_u16); + uint16x4_t mag_u16_1 = vget_high_u16(mag_u16); + + float32x4_t mag_float32_0 = vcvtq_n_f32_u32(vmovl_u16(mag_u16_0), 16); + float32x4_t mag_float32_1 = vcvtq_n_f32_u32(vmovl_u16(mag_u16_1), 16); + + mag_sum_0 = 
vaddq_f32(mag_sum_0, mag_float32_0); + mag_sum_1 = vaddq_f32(mag_sum_1, mag_float32_1); + + magsq_sum_0 = vfmaq_f32(magsq_sum_0, mag_float32_0, mag_float32_0); + magsq_sum_1 = vfmaq_f32(magsq_sum_1, mag_float32_1, mag_float32_1); + + in_align += 8; + } + + // reduce sums to lane 0 + float32x4_t mag_sum_q = vaddq_f32(mag_sum_0, mag_sum_1); + float32x2_t mag_sum = vadd_f32(vget_low_f32(mag_sum_q), vget_high_f32(mag_sum_q)); + mag_sum = vpadd_f32(mag_sum, mag_sum); + + float32x4_t magsq_sum_q = vaddq_f32(magsq_sum_0, magsq_sum_1); + float32x2_t magsq_sum = vadd_f32(vget_low_f32(magsq_sum_q), vget_high_f32(magsq_sum_q)); + magsq_sum = vpadd_f32(magsq_sum, magsq_sum); + + unsigned len1 = len & 7; + while (len1--) { + uint16x4_t mag_u16 = vld1_dup_u16(in_align); + // we process both lanes here, but lane 1's sums are ignored + float32x2_t mag_float32 = vcvt_n_f32_u32(vget_low_u32(vmovl_u16(mag_u16)), 16); + mag_sum = vadd_f32(mag_sum, mag_float32); + magsq_sum = vfma_f32(magsq_sum, mag_float32, mag_float32); + in_align += 1; + } + + *out_mean_mag = vget_lane_f32(mag_sum, 0) / len; + *out_mean_magsq = vget_lane_f32(magsq_sum, 0) / len; +} + +#endif /* STARCH_FEATURE_NEON */ diff --git a/dsp/starchgen.py b/dsp/starchgen.py new file mode 100755 index 0000000..ae963c8 --- /dev/null +++ b/dsp/starchgen.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 + +import os +import sys +import glob + +top_dir = sys.argv[1] +starch_dir = os.path.join(top_dir, 'starch') +sys.path.append(starch_dir) +import starch + +gen = starch.Generator(runtime_dir = top_dir, + output_dir = os.path.join(top_dir, 'dsp', 'generated')) + +gen.add_include('"dsp-types.h"') +gen.add_include('"cpu.h"') + +gen.add_function(name = 'magnitude_uc8', argtypes = ['const uc8_t *', 'uint16_t *', 'unsigned'], aligned = True) +gen.add_function(name = 'magnitude_power_uc8', argtypes = ['const uc8_t *', 'uint16_t *', 'unsigned', 'double *', 'double *'], aligned = True) +gen.add_function(name = 'magnitude_sc16', argtypes = 
['const sc16_t *', 'uint16_t *', 'unsigned'], aligned = True) +gen.add_function(name = 'magnitude_sc16q11', argtypes = ['const sc16_t *', 'uint16_t *', 'unsigned'], aligned = True) +gen.add_function(name = 'mean_power_u16', argtypes = ['const uint16_t *', 'unsigned', 'double *', 'double *'], aligned = True) + +gen.add_feature(name='neon', description='ARM NEON') + +gen.add_flavor(name = 'generic', + description = 'Generic build, default compiler options', + compile_flags = []) +gen.add_flavor(name = 'armv7a_neon_vfpv4', + description = 'ARMv7-A, NEON, VFPv4', + compile_flags = ['-march=armv7-a+neon-vfpv4', '-mfpu=neon-vfpv4', '-ffast-math'], + features = ['neon'], + test_function = 'cpu_supports_armv7_neon_vfpv4', + alignment = 16) +gen.add_flavor(name = 'x86_avx2', + description = 'x86 with AVX2', + compile_flags = ['-mavx2', '-ffast-math'], + test_function = 'cpu_supports_avx2', + alignment = 32) + +gen.add_mix(name = 'generic', + description = 'Generic build, compiler defaults only', + flavors = ['generic'], + wisdom_file = 'wisdom.generic') + +gen.add_mix(name = 'arm', + description = 'ARM', + flavors = ['armv7a_neon_vfpv4', 'generic'], + wisdom_file = 'wisdom.arm') + +gen.add_mix(name = 'x86', + description = 'x64', + flavors = ['x86_avx2', 'generic'], + wisdom_file = 'wisdom.x86') + +for pattern in ['dsp/impl/*.c', 'dsp/benchmark/*.c']: + for c_file in glob.glob(pattern): + gen.scan_file(c_file) + +gen.generate() diff --git a/dump1090.c b/dump1090.c index b15d45a..07c7042 100644 --- a/dump1090.c +++ b/dump1090.c @@ -48,6 +48,7 @@ // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dump1090.h" +#include "cpu.h" #include @@ -257,17 +258,36 @@ static void showVersion() #endif #ifdef ENABLE_LIMESDR "ENABLE_LIMESDR " -#endif -#ifdef SC16Q11_TABLE_BITS - // This is a little silly, but that's how the preprocessor works.. 
-#define _stringize(x) #x -#define stringize(x) _stringize(x) - "SC16Q11_TABLE_BITS=" stringize(SC16Q11_TABLE_BITS) -#undef stringize -#undef _stringize #endif ); printf("-----------------------------------------------------------------------------\n"); +} + +static void showDSP() +{ + printf(" detected runtime CPU features: "); + if (cpu_supports_avx()) + printf("AVX "); + if (cpu_supports_avx2()) + printf("AVX2 "); + if (cpu_supports_armv7_neon_vfpv4()) + printf("ARMv7+NEON+VFPv4 "); + printf("\n"); + + printf(" selected DSP implementations: \n"); +#define SHOW(x) do { \ + printf(" %-40s %s\n", #x , starch_ ## x ## _select()->name); \ + printf(" %-40s %s\n", #x "_aligned", starch_ ## x ## _aligned_select()->name); \ + } while(0) + + SHOW(magnitude_uc8); + SHOW(magnitude_power_uc8); + SHOW(magnitude_sc16); + SHOW(magnitude_sc16q11); + SHOW(mean_power_u16); + +#undef SHOW + printf("\n"); } @@ -327,8 +347,11 @@ static void showHelp(void) "--write-json Periodically write json output to (for serving by a separate webserver)\n" "--write-json-every Write json output every t seconds (default 1)\n" "--json-location-accuracy Accuracy of receiver location in json metadata: 0=no location, 1=approximate, 2=exact\n" +#if 0 "--dcfilter Apply a 1Hz DC filter to input data (requires more CPU)\n" -"--version Show version and build options\n" +#endif +"--wisdom Read DSP wisdom from given path\n" +"--version Show version, build and DSP options\n" "--help Show this help\n" ); } @@ -488,7 +511,11 @@ int main(int argc, char **argv) { } else if (!strcmp(argv[j],"--gain") && more) { Modes.gain = (int) (atof(argv[++j])*10); // Gain is in tens of DBs } else if (!strcmp(argv[j],"--dcfilter")) { +#if 0 Modes.dc_filter = 1; +#else + fprintf(stderr, "--dcfilter option ignored (please raise an issue on github if you have a usecase that needs this)\n"); +#endif } else if (!strcmp(argv[j],"--measure-noise")) { // Ignored } else if (!strcmp(argv[j],"--fix")) { @@ -612,6 +639,7 @@ int main(int 
argc, char **argv) { exit(0); } else if (!strcmp(argv[j],"--version")) { showVersion(); + showDSP(); exit(0); } else if (!strcmp(argv[j],"--quiet")) { Modes.quiet = 1; @@ -629,6 +657,12 @@ int main(int argc, char **argv) { Modes.json_interval = 100; } else if (!strcmp(argv[j], "--json-location-accuracy") && more) { Modes.json_location_accuracy = atoi(argv[++j]); + } else if (!strcmp(argv[j], "--wisdom") && more) { + if (starch_read_wisdom (argv[++j]) < 0) { + fprintf(stderr, + "Failed to read wisdom file %s: %s\n", argv[j], strerror(errno)); + exit(1); + } } else if (sdrHandleOption(argc, argv, &j)) { /* handled */ } else { diff --git a/dump1090.h b/dump1090.h index 3068ea0..c00c513 100644 --- a/dump1090.h +++ b/dump1090.h @@ -81,6 +81,7 @@ #include #include "compat/compat.h" +#include "dsp/generated/starch.h" // ============================= #defines =============================== diff --git a/oneoff/convert_benchmark.c b/oneoff/convert_benchmark.c index 6778cde..7ee2d0a 100644 --- a/oneoff/convert_benchmark.c +++ b/oneoff/convert_benchmark.c @@ -102,8 +102,12 @@ static void test(const char *what, input_format_t format, void **data, double sa struct timespec total = { 0, 0 }; int iterations = 0; + double level, power; + // Run it once to force init. 
- converter(data[0], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL); + for (int i = 0; i < 10; ++i) { + converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, &level, &power); + } while (total.tv_sec < 5) { fprintf(stderr, "."); @@ -112,7 +116,7 @@ static void test(const char *what, input_format_t format, void **data, double sa start_cpu_timing(&start); for (int i = 0; i < 10; ++i) { - converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, NULL, NULL); + converter(data[i], outdata, MODES_MAG_BUF_SAMPLES, state, &level, &power); } end_cpu_timing(&start, &total); @@ -135,6 +139,9 @@ int main(int argc, char **argv) MODES_NOTUSED(argc); MODES_NOTUSED(argv); + if (argc > 1) + starch_read_wisdom(argv[1]); + prepare(); test("SC16Q11, DC", INPUT_SC16Q11, testdata_sc16q11, 2400000, true); diff --git a/oneoff/dsp_error_measurement.c b/oneoff/dsp_error_measurement.c new file mode 100644 index 0000000..c46870c --- /dev/null +++ b/oneoff/dsp_error_measurement.c @@ -0,0 +1,229 @@ +/* measures actual vs expected magnitude values for various magnitude_* + * implementations + */ + +#include +#include +#include +#include +#include + +#include "dsp-types.h" +#include "dsp/generated/starch.h" + +static void write_results_uc8(const uc8_t *in, uint16_t *out, unsigned len, char *path) +{ /* writes one "I Q phase expected actual" row per sample to path; on fopen failure reports to stderr and returns */ + FILE *fp = fopen(path, "w"); + if (!fp) { + fprintf(stderr, "fopen(%s): %s\n", path, strerror(errno)); + return; + } + + while (len--) { /* post-decrement visits all len samples; --len skipped the last one and wrapped around when len == 0 */ + float I = (in[0].I - 127.4) / 128; + float Q = (in[0].Q - 127.4) / 128; + + float phase = atan2(Q, I) * 180.0 / M_PI; + float expected = round(sqrtf(I * I + Q * Q) * 65536); + if (expected > 65535) + expected = 65535; + fprintf(fp, "%u %u %.3f %.0f %u\n", in[0].I, in[0].Q, phase, expected, out[0]); + + ++in; + ++out; + } + + fclose(fp); + fprintf(stderr, "wrote %s\n", path); +} + +static void write_results_sc16(const sc16_t *in, uint16_t *out, unsigned len, char *path) +{ + FILE *fp = fopen(path, "w"); + if (!fp) { + fprintf(stderr, "fopen(%s): %s\n", 
path, strerror(errno)); + return; + } + + while (len--) { /* post-decrement visits all len samples; --len skipped the last one and wrapped around when len == 0 */ + float I = in[0].I / 32768.0; + float Q = in[0].Q / 32768.0; + + float phase = atan2(Q, I) * 180.0 / M_PI; + float expected = round(sqrtf(I * I + Q * Q) * 65536); + if (expected > 65535) + expected = 65535; + fprintf(fp, "%d %d %.3f %.0f %u\n", in[0].I, in[0].Q, phase, expected, out[0]); + + ++in; + ++out; + } + + fclose(fp); + fprintf(stderr, "wrote %s\n", path); +} + +static void write_results_sc16q11(const sc16_t *in, uint16_t *out, unsigned len, char *path) +{ /* writes one "I Q phase expected actual" row per sample to path, scaling I/Q by 1/2048; on fopen failure reports to stderr and returns */ + FILE *fp = fopen(path, "w"); + if (!fp) { + fprintf(stderr, "fopen(%s): %s\n", path, strerror(errno)); + return; + } + + while (len--) { /* post-decrement visits all len samples; --len skipped the last one and wrapped around when len == 0 */ + float I = in[0].I / 2048.0; + float Q = in[0].Q / 2048.0; + + float phase = atan2(Q, I) * 180.0 / M_PI; + float expected = round(sqrtf(I * I + Q * Q) * 65536); + if (expected > 65535) + expected = 65535; + fprintf(fp, "%d %d %.3f %.0f %u\n", in[0].I, in[0].Q, phase, expected, out[0]); + + ++in; + ++out; + } + + fclose(fp); + fprintf(stderr, "wrote %s\n", path); +} + +static void process_uc8() +{ + const float mag_min = 0.05; + const float mag_max = 0.95; + + const unsigned mag_steps = 5; + const unsigned phase_steps = 256; + + const unsigned len = mag_steps * phase_steps; + + uc8_t *in = malloc(len * sizeof(*in)); + uint16_t *out = malloc(len * sizeof(*out)); + uc8_t *fill = in; + + for (unsigned mag_step = 0; mag_step < mag_steps; ++mag_step) { + float mag = mag_min + mag_step * (mag_max - mag_min) / (mag_steps - 1); + for (unsigned phase_step = 0; phase_step < phase_steps; ++phase_step) { + float phase = 360.0 * phase_step / phase_steps; + fill->I = 128 * mag * cos(phase * M_PI / 180.0) + 127.4; + fill->Q = 128 * mag * sin(phase * M_PI / 180.0) + 127.4; + + if (fill == in || fill[-1].I != fill[0].I || fill[-1].Q != fill[0].Q) + ++fill; + } + } + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_uc8_exact_generic(in, out, len); + write_results_uc8(in, out, fill - in, "uc8-exact.tsv"); +#endif + +#ifdef 
STARCH_FLAVOR_GENERIC + starch_magnitude_uc8_lookup_generic(in, out, len); + write_results_uc8(in, out, fill - in, "uc8-lookup.tsv"); +#endif + +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 + starch_magnitude_uc8_neon_vrsqrte_armv7a_neon_vfpv4(in, out, len); + write_results_uc8(in, out, fill - in, "uc8-neon-vrsqrte.tsv"); +#endif +} + +static void process_sc16() +{ + const float mag_min = 0.05; + const float mag_max = 0.95; + + const unsigned mag_steps = 5; + const unsigned phase_steps = 65536; + + const unsigned len = mag_steps * phase_steps; + + sc16_t *in = malloc(len * sizeof(*in)); + uint16_t *out = malloc(len * sizeof(*out)); + sc16_t *fill = in; + + for (unsigned mag_step = 0; mag_step < mag_steps; ++mag_step) { + float mag = mag_min + mag_step * (mag_max - mag_min) / (mag_steps - 1); + for (unsigned phase_step = 0; phase_step < phase_steps; ++phase_step) { + float phase = 360.0 * phase_step / phase_steps; + fill->I = 32768.0f * mag * cos(phase * M_PI / 180.0); + fill->Q = 32768.0f * mag * sin(phase * M_PI / 180.0); + + if (fill == in || fill[-1].I != fill[0].I || fill[-1].Q != fill[0].Q) + ++fill; + } + } + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_sc16_exact_generic(in, out, len); + write_results_sc16(in, out, fill - in, "sc16-exact.tsv"); +#endif + +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 + starch_magnitude_sc16_neon_vrsqrte_armv7a_neon_vfpv4(in, out, len); + write_results_sc16(in, out, fill - in, "sc16-neon-vrsqrte.tsv"); +#endif +} + +static void process_sc16q11() +{ + const float mag_min = 0.05; + const float mag_max = 0.95; + + const unsigned mag_steps = 5; + const unsigned phase_steps = 2048; + + const unsigned len = mag_steps * phase_steps; + + sc16_t *in = malloc(len * sizeof(*in)); + uint16_t *out = malloc(len * sizeof(*out)); + sc16_t *fill = in; + + for (unsigned mag_step = 0; mag_step < mag_steps; ++mag_step) { + float mag = mag_min + mag_step * (mag_max - mag_min) / (mag_steps - 1); + for (unsigned phase_step = 0; phase_step < phase_steps; 
++phase_step) { + float phase = 360.0 * phase_step / phase_steps; + fill->I = 2048.0f * mag * cos(phase * M_PI / 180.0); + fill->Q = 2048.0f * mag * sin(phase * M_PI / 180.0); + if (fill == in || fill[-1].I != fill[0].I || fill[-1].Q != fill[0].Q) + ++fill; + } + } + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_sc16q11_exact_generic(in, out, len); + write_results_sc16q11(in, out, fill - in, "sc16q11-exact.tsv"); +#endif + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_sc16q11_11bit_table_generic(in, out, len); + write_results_sc16q11(in, out, fill - in, "sc16q11-lookup-11bit.tsv"); +#endif + +#ifdef STARCH_FLAVOR_GENERIC + starch_magnitude_sc16q11_12bit_table_generic(in, out, len); + write_results_sc16q11(in, out, fill - in, "sc16q11-lookup-12bit.tsv"); /* one file per table size; sharing a single name let the 12-bit run overwrite the 11-bit results */ +#endif + +#ifdef STARCH_FLAVOR_ARMV7A_NEON_VFPV4 + starch_magnitude_sc16q11_neon_vrsqrte_armv7a_neon_vfpv4(in, out, len); + write_results_sc16q11(in, out, fill - in, "sc16q11-neon-vrsqrte.tsv"); +#endif +} + +int main(int argc, char **argv) +{ + (void) argc; + (void) argv; + + process_uc8(); + process_sc16(); + process_sc16q11(); + + return 0; +} + + diff --git a/oneoff/uc8_capture_stats.c b/oneoff/uc8_capture_stats.c new file mode 100644 index 0000000..340d75b --- /dev/null +++ b/oneoff/uc8_capture_stats.c @@ -0,0 +1,106 @@ +/* measures min, max, mean I and Q values in a UC8-format capture */ + +#include +#include + +#include +#include +#include +#include + +#include + +#include "dsp-types.h" + +int main(int argc, char **argv) +{ + if (argc < 2) { + fprintf(stderr, "need a capture filename\n"); + return 1; + } + + const unsigned len = 1<<24; + uc8_t *buffer = malloc(len * sizeof(uc8_t)); + + int fd = open(argv[1], O_RDONLY); + if (fd < 0) { + perror("open"); + return 1; + } + + unsigned all_samples = 0; + double all_I = 0, all_Q = 0; + + for (;;) { + ssize_t count = read(fd, buffer, len * sizeof(uc8_t)); + if (count < 0) { + perror("read"); + close(fd); + return 1; + } + + if (count <= 0) + break; + + unsigned 
actual_len = count / sizeof(uc8_t); + + int min_I = INT_MAX, max_I = INT_MIN; + unsigned min_I_count = 0, max_I_count = 0; + int min_Q = INT_MAX, max_Q = INT_MIN; + unsigned min_Q_count = 0, max_Q_count = 0; + double sum_I = 0, sum_Q = 0; + + for (unsigned i = 0; i < actual_len; ++i) { + int I = buffer[i].I; + int Q = buffer[i].Q; + + if (I < min_I) { + min_I = I; + min_I_count = 0; + } + if (I == min_I) { + ++min_I_count; + } + + if (Q < min_Q) { + min_Q = Q; + min_Q_count = 0; + } + if (Q == min_Q) { + ++min_Q_count; + } + + if (I > max_I) { + max_I = I; + max_I_count = 0; + } + if (I == max_I) { + ++max_I_count; + } + + if (Q > max_Q) { + max_Q = Q; + max_Q_count = 0; + } + if (Q == max_Q) { + ++max_Q_count; + } + + sum_I += I; + sum_Q += Q; + } + + all_I += sum_I; + all_Q += sum_Q; + all_samples += actual_len; + + fprintf(stderr, + "%u samples; I: min %4d (%5u); max %4d (%5u); mean %7.2f; overall mean %7.2f; Q: min %4d (%5u); max %4d (%5u); mean %7.2f; overall mean %7.2f\n", + actual_len, + min_I, min_I_count, max_I, max_I_count, sum_I / actual_len, all_I / all_samples, + min_Q, min_Q_count, max_Q, max_Q_count, sum_Q / actual_len, all_Q / all_samples); + } + + close(fd); + return 0; +} diff --git a/starch/.gitignore b/starch/.gitignore new file mode 100644 index 0000000..9f7e9fd --- /dev/null +++ b/starch/.gitignore @@ -0,0 +1,6 @@ +__pycache__ +example/mako/ +example/starch-benchmark +*.o +*~ +.mypy_cache diff --git a/starch/LICENSE b/starch/LICENSE new file mode 100644 index 0000000..0443d73 --- /dev/null +++ b/starch/LICENSE @@ -0,0 +1,23 @@ +Copyright (c) 2020, FlightAware LLC. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/starch/Makefile b/starch/Makefile new file mode 100644 index 0000000..155a675 --- /dev/null +++ b/starch/Makefile @@ -0,0 +1,8 @@ +# all and mypy are commands, not files: declare them phony so they always run. +.PHONY: all mypy + +all: mypy + +# Type-check starch.py with mypy, with ./stubs on MYPYPATH. +mypy: + MYPYPATH=$(CURDIR)/stubs mypy --show-error-codes starch.py diff --git a/starch/README.md b/starch/README.md new file mode 100644 index 0000000..86f5e59 --- /dev/null +++ b/starch/README.md @@ -0,0 +1,182 @@ +# starch - a framework for selecting architecture-specific code at runtime + +`starch` helps generates glue code to *s*elec*t* *arch*itecture-specific +versions of code depending on the hardware detected at runtime. + +It arranges for code to be built multiple times with different compiler +options. At runtime, user code calls a dispatcher entry point which +selects the best compiled version of the versions that can safely run +on the hardware used at runtime. + +It tries to be agnostic about the details of the code being generated +and the details of the hardware. 
+ +## Caution caution work in progress + +This documentation isn't very complete. You'll need to look at the example +and the code itself. + +## Design notes + + * Architecture-independent generated output; the generated outputs can + be generated during development and committed as part of the main + source code, and at build time starch does not need to be re-run. + + * Doesn't care about the details of the functions you call; they can + have any signature. + + * Can automatically generate benchmarking code given a benchmarking + helper that sets up inputs to the function. + + * Does not do any hardware detection itself, and does not care about + the hardware details; for each combination of compiler flags, the user + code provides a test function to be called at runtime to determine if + it is safe to run code compiled with those flags. + + * Allows the same generic code to be compiled multiple times with different + compile flags to take advantage of compile auto-vectorization that + requires additional instruction set features (AVX, NEON, ..) being enabled. + + * Emits makefile fragments to be included into a larger makefile structure + +## License + +The generator script and templates are licensed under a BSD 2-clause license, +see the LICENSE file. + +No copyright claim is made on generated code. + +## Prerequisites + +At generation time (results can be committed to version control): + + * Python 3 + * [Mako](https://www.makotemplates.org/) + +At build time: + + * a C compiler + * make + +## Quickstart + +Look in example/ for a full example. + +## Concepts + +A *function* is the user-visible API to starch-generated code. It just looks +like a C function pointer. Initially, this pointer points to a dispatcher +routine which will select an appropriate implementation at runtime and call +it. For subsequent calls, the dispatcher updates the function pointer to +point directly to the selected implementation. 
+ +A *function impl* is one particular way of implementing a function. All +impls should produce the same results given the same inputs to avoid confusing +user code. There may be different impls with different performance +characteristics - for example, different degrees of manual loop unrolling, or +an impl that takes advantage of a particular instruction set (NEON, AVX, etc). +Each impl has a unique-within-the-function "variant" name that identifies it. + +Function impls may be conditionally compiled depending on build features +(see below). This is useful for impls that cannot always be compiled e.g. +they depend on the availability of a particular instruction set. + +A *build flavor* is a particular way of building the function impl. It +consists of a set of compiler flags to use, plus an associated test function +that determines at runtime if it is safe to run the code. For example, +a flavor may enable use of specific instructions that may or may not be +available at runtime via `-mavx`, `-march=...`, and similar flags. Each +flavor declares that it provides zero or more *features*. + +A *feature* is a characteristic of the build flavor compiler flags that +allows certain impls to be compiled. For example, an impl that uses NEON +intrinsics can only be compiled if the compiler is building for an ARM +instruction set that supports NEON. Features are defined in the build flavor, +and are advertised at compile time by the presence of a `STARCH_FEATURE_x` +macro; implementations may conditionally compile on this macro and should use +`STARCH_IMPL_REQUIRES` to indicate they will only be emitted when a given +feature is present. + +A *build mix* is a combination of build flavors that can coexist in the same +binary. For example, an "x86" mix might include build flavors that build +for generic x86, x86-with-AVX, and x86-with-AVX2; but it would not include +a build flavor for ARM, because ARM and x86 object code can't be linked +together into a single binary. 
+ +## Alignment + +A function can optionally include an aligned version; this is a version of the +function with an independent call point and wisdom, which assumes that +data passed to the function is already aligned. Each flavor has an associated +alignment in bytes, but otherwise it is up to the implementations to decide +what exactly is aligned. Implementations for an aligned function on a flavor +that specifies an alignment (>1 byte) will be compiled twice, once with an +alignment of 1 and once with the flavor's alignment, to generate two different +compiled versions. + +starch provides macros to help with alignment: + + * `STARCH_ALIGNMENT`, in implementations, is the alignment (in bytes) that + implementations can assume. + * `STARCH_MIX_ALIGNMENT`, defined in the generated header file, is the required + alignment (in bytes) for callers of the _aligned version of a function. + It is the largest alignment of all flavors in the mix. + * `STARCH_ALIGNED(ptr)` in implementations evaluates to `ptr` while hinting to + the compiler that the data is aligned according to STARCH_ALIGNMENT. This + maps to gcc's `__builtin_assume_aligned` builtin. + +## Benchmarks + +Functions can optionally provide a benchmark helper by defining a +(no args, void return type) function using the STARCH_BENCHMARK macro. This +macro is only present when benchmark code is being compiled. + +The benchmark helper should set up function inputs for benchmarking and then +use the `STARCH_BENCHMARK_RUN` macro. This macro expands to code that will +benchmark each possible impl in turn with the provided arguments. + +If the benchmark needs to allocate possibly-aligned buffers, +two macros `STARCH_BENCHMARK_ALLOC` and `STARCH_BENCHMARK_FREE` +will allocate suitably aligned buffers for the current `STARCH_ALIGNMENT` +value. 
`STARCH_BENCHMARK_ALLOC(count,type)` will allocate `count` elements of +type `type`, aligned to either `STARCH_ALIGNMENT` or the required alignment +for `type`, whichever is larger. `STARCH_BENCHMARK_FREE(ptr)` will free a +buffer previously allocated by `STARCH_BENCHMARK_ALLOC`. + +See `example/benchmark/subtract_n_benchmark.c` for examples. + +## Gotchas + +Files added by `scan_file` are `#include`-d into surrounding support files. +Multiple files may be included into the same compilation unit. You should +ensure that you don't pollute the global namespace (macros, static function +names, etc) for subsequent files that will follow. + +Files added by `scan_file` will be compiled multiple times. You should ensure +that any symbols other than those handled by STARCH_IMPL / STARCH_IMPL_REQUIRES +are either static or use the STARCH_SYMBOL macro to get a unique name for +this compilation pass. + +You probably want to separate out benchmark-support code into separate files +to avoid an extra version of any impls present in the same file from being +emitted. + +## Wisdom + +There is partial support for a wisdom implementation. Wisdom is a priori +information about the preferred code to use for a given function, for example +as the result of benchmarking to find the fastest version. It is simply the +order in which compiled impls are tried until one that is supported is found. + +To set wisdom, there are two options: + +1) Provide a wisdom ordering for the function when defining a build mix. This +controls the order in which the compiled impls are included in the generated +registry that is searched at runtime. + +2) Call `starch_<function>_set_wisdom` at runtime. This accepts an array of +function variants, terminated by NULL. When called, the registry is re-sorted +to prefer the listed variants in the order provided (and the function pointer +is reset to the dispatcher so the chosen code will be re-selected on the next +call). 
This could be used to load install-specific wisdom during program +startup. diff --git a/starch/example/Makefile b/starch/example/Makefile new file mode 100644 index 0000000..767deff --- /dev/null +++ b/starch/example/Makefile @@ -0,0 +1,31 @@ +CC ?= gcc +CFLAGS = -O3 -Wall -g + +STARCH_COMPILE := $(CC) $(CFLAGS) -c + +ARCH := $(shell uname -m) + +# all, generate and clean are commands, not files: declare them phony so they always run. +.PHONY: all generate clean + +all: generate starch-benchmark + +ifneq (,$(findstring arm,$(ARCH))) + -include generated/makefile.arm +else ifneq (,$(findstring x86_64,$(ARCH))) + -include generated/makefile.x86_64 +else + -include generated/makefile.generic +endif + +support.o: support.c + $(CC) $(CFLAGS) -c -o $@ $^ + +starch-benchmark: $(STARCH_OBJS) $(STARCH_BENCHMARK_OBJ) support.o + $(CC) $(CFLAGS) -o $@ $^ + +generate: + ./starchgen.py + +clean: + rm -f $(STARCH_OBJS) $(STARCH_BENCHMARK_OBJ) support.o starch-benchmark diff --git a/starch/example/benchmark/subtract_n_benchmark.c b/starch/example/benchmark/subtract_n_benchmark.c new file mode 100644 index 0000000..b083088 --- /dev/null +++ b/starch/example/benchmark/subtract_n_benchmark.c @@ -0,0 +1,33 @@ +#include + +void STARCH_BENCHMARK(subtract_n) (void) +{ + uint16_t *in = NULL, *out = NULL; + const unsigned len = 65536; + const unsigned n = 42; + + if (!(in = STARCH_BENCHMARK_ALLOC(len, uint16_t)) || !(out = STARCH_BENCHMARK_ALLOC(len, uint16_t))) { + goto done; + } + + STARCH_BENCHMARK_RUN( subtract_n, in, len, n, out ); + + done: + STARCH_BENCHMARK_FREE(in); + STARCH_BENCHMARK_FREE(out); +} + +bool STARCH_BENCHMARK_VERIFY(subtract_n)(const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + bool okay = true; + + for (unsigned i = 0; i < len; ++i) { + uint16_t expected = in[i] - n; + if (out[i] != expected) { + fprintf(stderr, "verification failed: in[%u]=%u n=%u out[%u]=%u expected=%u\n", i, in[i], n, i, out[i], expected); + okay = false; + } + } + + return okay; +} diff --git a/starch/example/generated/.keep b/starch/example/generated/.keep new file mode 100644 index 
0000000..e69de29 diff --git a/starch/example/generated/benchmark.c b/starch/example/generated/benchmark.c new file mode 100644 index 0000000..8e1908e --- /dev/null +++ b/starch/example/generated/benchmark.c @@ -0,0 +1,569 @@ + +/* starch generated code. Do not edit. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "starch.h" + +typedef struct timespec starch_benchmark_time; + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end); +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size); +void starch_benchmark_aligned_free(void *user_ptr); +void starch_benchmark_get_time(starch_benchmark_time *t); + +const unsigned starch_benchmark_warmup_loops = 10; + +typedef struct { + const char *name; + const char *impl; + uint64_t ns; +} starch_benchmark_result; + +static starch_benchmark_result *starch_benchmark_results = NULL; +static unsigned starch_benchmark_result_size = 0; +static unsigned starch_benchmark_result_count = 0; + +typedef struct benchmark_flavor_list_node { + const char *flavor; + struct benchmark_flavor_list_node *next; +} starch_benchmark_flavor_list; + +static starch_benchmark_flavor_list *starch_benchmark_flavor_whitelist = NULL; +static starch_benchmark_flavor_list *starch_benchmark_flavor_blacklist = NULL; + +static bool starch_benchmark_list_only = false; +static bool starch_benchmark_top_only = false; +static unsigned starch_benchmark_iterations = 1; + +typedef struct timespec starch_benchmark_time; +void starch_benchmark_get_time(starch_benchmark_time *t) +{ +#ifdef CLOCK_THREAD_CPUTIME_ID + clock_gettime(CLOCK_THREAD_CPUTIME_ID, t); +#else + clock_gettime(CLOCK_MONOTONIC, t); +#endif +} + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end) +{ + return ((uint64_t)end->tv_sec - (uint64_t)start->tv_sec) * 1000000000U + (uint64_t)end->tv_nsec - (uint64_t)start->tv_nsec; +} 
+ +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size) +{ + size_t use_alignment = (type_alignment > alignment ? type_alignment : alignment); + if ( (use_alignment % type_alignment) || (use_alignment % alignment) ) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: conflicting alignment requirements (%zu versus %zu)\n", size, alignment, type_alignment); + return NULL; + } + + /* Over-allocate so we can stash our own pointer before the start, and so that we can adjust + * the returned alignment so it is only aligned to the requested boundary, and not also + * aligned to a larger power of two (we don't want to accidentally benchmark the performance + * of a more restrictive larger alignment) + */ + size_t header_size = (use_alignment < sizeof(void*) ? sizeof(void*) : use_alignment); + char *block_ptr = aligned_alloc(use_alignment, header_size + size + use_alignment); + if (!block_ptr) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(errno)); + return NULL; + } + + char *user_ptr = block_ptr + header_size; + if ( (uintptr_t)user_ptr % (use_alignment * 2) == 0 ) { + // user_ptr is aligned to the next power of two, but we don't want that, move it on + user_ptr += use_alignment; + } + + void **stash = (void**)user_ptr - 1; + *stash = block_ptr; + + return user_ptr; +} + +void starch_benchmark_aligned_free(void *user_ptr) +{ + if (!user_ptr) + return; + void **stash = (void**)user_ptr - 1; + free(*stash); +} + +static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_benchmark_flavor_list *list) +{ + for (; list; list = list->next) { + if (!strcmp(flavor, list->flavor)) + return true; + } + return false; +} + + +/* prototypes for benchmark helpers provided by user code */ +void starch_subtract_n_benchmark (void); +bool starch_subtract_n_benchmark_verify ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); + +/* prototype the benchmarking 
function so that we can build with -Wmissing-declarations */ +void starch_subtract_n_benchmark(void); + +static void starch_benchmark_one_subtract_n( starch_subtract_n_regentry * _entry, const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + + /* verify correctness of the output */ + if (! 
starch_subtract_n_benchmark_verify ( arg0, arg1, arg2, arg3 )) { + fprintf(stderr, "skipped (verification failed)\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + 
starch_benchmark_results[starch_benchmark_result_count].name = "subtract_n"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_subtract_n( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) +{ + for (starch_subtract_n_regentry *_entry = starch_subtract_n_registry; _entry->name; ++_entry) { + starch_benchmark_one_subtract_n( _entry, arg0, arg1, arg2, arg3 ); + } +} + +/* prototypes for benchmark helpers provided by user code */ +void starch_subtract_n_aligned_benchmark (void); +bool starch_subtract_n_aligned_benchmark_verify ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void starch_subtract_n_aligned_benchmark(void); + +static void starch_benchmark_one_subtract_n_aligned( starch_subtract_n_aligned_regentry * _entry, const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + + /* verify correctness of the output */ + if (! 
starch_subtract_n_aligned_benchmark_verify ( arg0, arg1, arg2, arg3 )) { + fprintf(stderr, "skipped (verification failed)\n"); + return; + } + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( arg0, arg1, arg2, arg3 ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if (starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + 
starch_benchmark_results[starch_benchmark_result_count].name = "subtract_n_aligned"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_subtract_n_aligned( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) +{ + for (starch_subtract_n_aligned_regentry *_entry = starch_subtract_n_aligned_registry; _entry->name; ++_entry) { + starch_benchmark_one_subtract_n_aligned( _entry, arg0, arg1, arg2, arg3 ); + } +} + + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _benchmark_sym +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) starch_ ## _function ## _benchmark +#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + +#include "../benchmark/subtract_n_benchmark.c" + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES +#undef STARCH_BENCHMARK +#undef STARCH_BENCHMARK_VERIFY +#undef STARCH_BENCHMARK_RUN +#undef STARCH_BENCHMARK_ALLOC +#undef STARCH_BENCHMARK_FREE + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_benchmark_sym +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) starch_ ## _function ## _aligned_benchmark +#define STARCH_BENCHMARK_VERIFY(_function) starch_ ## _function ## _aligned_benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ## _aligned ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + +#include "../benchmark/subtract_n_benchmark.c" + +static void starch_benchmark_all_subtract_n(void) +{ + fprintf(stderr, "==== subtract_n ===\n"); + starch_subtract_n_benchmark (); +} +static void starch_benchmark_all_subtract_n_aligned(void) +{ + fprintf(stderr, "==== subtract_n_aligned ===\n"); + starch_subtract_n_aligned_benchmark (); +} + +static int starch_benchmark_compare_result(const void *a, const void *b) +{ + const starch_benchmark_result *left = (const starch_benchmark_result *) a; + const starch_benchmark_result *right = (const starch_benchmark_result *) b; + + int name_cmp = strcmp(left->name, right->name); + if (name_cmp) + return name_cmp; + + if (left->ns < right->ns) + return -1; + if (left->ns > right->ns) + return 1; + return 0; +} + +static void starch_benchmark_usage(const char *argv0) +{ + fprintf(stderr, + "Usage: %s [OPTION]... [FUNCTION]...\n" + "Benchmarks starch functions and optionally writes a sorted wisdom file.\n" + "\n" + " -r FILE Read initial wisdom from FILE\n" + " -o FILE Write sorted wisdom to FILE\n" + " -F FLAVOR Add FLAVOR to whitelist\n" + " (default: no whitelist, run all runtime-supported flavors)\n" + " -N FLAVOR Add FLAVOR to blacklist\n" + " (default: no blacklist, run all runtime-supported flavors)\n" + " -l List compiled-in implementations but don't benchmark them\n" + " -t Include only the top candidate per function in wisdom output\n" + " -i ITERS Run benchmark ITERS times and use the mean. 
If ITERS > 2, ignore\n" + " the smallest and largest runs when calculating the mean.\n" + " (default: 1 iteration)\n" + " FUNCTION Run benchmarks for these functions only\n" + " (default: benchmark all functions)\n" + "\n" + "Supported flavors: " +#ifdef STARCH_FLAVOR_GENERIC + "generic " +#endif +#ifdef STARCH_FLAVOR_ARMV7A_VFPV3 + "armv7a_vfpv3 " +#endif +#ifdef STARCH_FLAVOR_ARMV7A_VFPV4 + "armv7a_vfpv4 " +#endif +#ifdef STARCH_FLAVOR_X86_64_AVX + "x86_64_avx " +#endif +#ifdef STARCH_FLAVOR_X86_64_AVX2 + "x86_64_avx2 " +#endif + "\n" + "Supported functions: " + "subtract_n " + "subtract_n_aligned " + "\n", argv0); +} + +static void starch_benchmark_append_flavor(const char *flavor, starch_benchmark_flavor_list **list) +{ + starch_benchmark_flavor_list *newnode = malloc(sizeof(*newnode)); + if (!newnode) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + exit(1); + } + + newnode->flavor = flavor; + newnode->next = *list; + *list = newnode; +} + +int main(int argc, char **argv) +{ + int specific = 0; + const char *output_path = NULL; + + int opt; + while ((opt = getopt(argc, argv, "r:o:F:N:i:lht")) != -1) { + switch (opt) { + case 'r': + if (starch_read_wisdom(optarg) < 0) { + fprintf(stderr, "%s: cannot read %s: %s\n", argv[0], optarg, strerror(errno)); + return 1; + } + fprintf(stderr, "%s: loaded wisdom file %s\n", argv[0], optarg); + break; + + case 'o': + output_path = optarg; + break; + + case 'F': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_whitelist); + break; + + case 'N': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_blacklist); + break; 
+ + case 'l': + starch_benchmark_list_only = true; + break; + + case 't': + starch_benchmark_top_only = true; + break; + + case 'i': + starch_benchmark_iterations = atoi(optarg); + break; + + case 'h': + starch_benchmark_usage(argv[0]); + return 0; + + case '?': + default: + starch_benchmark_usage(argv[0]); + return 2; + } + } + + if (starch_benchmark_list_only && output_path) { + fprintf(stderr, "%s: -o and -l options cannot be specified together\n", argv[0]); + return 2; + } + + for (int i = optind; i < argc; ++i) { + if (!strcmp(argv[i], "subtract_n")) { + specific = 1; + starch_benchmark_all_subtract_n(); + continue; + } + if (!strcmp(argv[i], "subtract_n_aligned")) { + specific = 1; + starch_benchmark_all_subtract_n_aligned(); + continue; + } + + fprintf(stderr, "%s: unrecognized function name: %s\n", argv[0], argv[i]); + return 2; + } + + if (!specific) { + starch_benchmark_all_subtract_n(); + starch_benchmark_all_subtract_n_aligned(); + } + + if (output_path) { + FILE *out = fopen(output_path, "w"); + if (!out) { + fprintf(stderr, "%s: cannot open %s: %s\n", argv[0], output_path, strerror(errno)); + return 1; + } + + fprintf(out, "# generated by "); + for (int i = 0; i < argc; ++i) + fprintf(out, "%s ", argv[i]); + fprintf(out, "\n\n"); + + qsort(starch_benchmark_results, starch_benchmark_result_count, sizeof(*starch_benchmark_results), starch_benchmark_compare_result); + + const char *last_name = NULL; + bool first = true; + for (unsigned i = 0; i < starch_benchmark_result_count; ++i) { + starch_benchmark_result *result = &starch_benchmark_results[i]; + if (last_name && strcmp(last_name, result->name) != 0) { + fprintf(out, "\n"); + first = true; + } + last_name = result->name; + if (starch_benchmark_top_only && !first) + continue; + fprintf(out, "%-40s %-40s # %" PRIu64 " ns/call\n", result->name, result->impl, result->ns); + first = false; + } + + fclose(out); + fprintf(stderr, "%s: wrote sorted wisdom to %s\n", argv[0], output_path); + } + + return 0; +} 
diff --git a/starch/example/generated/dispatcher.c b/starch/example/generated/dispatcher.c new file mode 100644 index 0000000..0b6a061 --- /dev/null +++ b/starch/example/generated/dispatcher.c @@ -0,0 +1,313 @@ + +/* starch generated code. Do not edit. */ + +#include +#include +#include +#include + +#include "starch.h" + +/* helper for re-sorting registries */ +struct starch_regentry_prefix { + int rank; +}; + +static int starch_regentry_rank_compare (const void *l, const void *r) +{ + const struct starch_regentry_prefix *left = l, *right = r; + return left->rank - right->rank; +} + +/* dispatcher / registry for subtract_n */ + +starch_subtract_n_regentry * starch_subtract_n_select() { + for (starch_subtract_n_regentry *entry = starch_subtract_n_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_subtract_n_dispatch ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) { + starch_subtract_n_regentry *entry = starch_subtract_n_select(); + if (!entry) + abort(); + + starch_subtract_n = entry->callable; + starch_subtract_n ( arg0, arg1, arg2, arg3 ); +} + +starch_subtract_n_ptr starch_subtract_n = starch_subtract_n_dispatch; + +void starch_subtract_n_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_subtract_n_regentry *entry; + for (entry = starch_subtract_n_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_subtract_n_registry); + } + } + + /* re-sort based on the 
new ranking */ + qsort(starch_subtract_n_registry, entry - starch_subtract_n_registry, sizeof(starch_subtract_n_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_subtract_n = starch_subtract_n_dispatch; +} + +starch_subtract_n_regentry starch_subtract_n_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 1, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 2, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "neon_intrinsics_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_neon_intrinsics_armv7a_vfpv4, supports_neon_vfpv4 }, + { 1, "neon_intrinsics_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_neon_intrinsics_armv7a_vfpv3, supports_neon_vfpv3 }, + { 2, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 3, "generic_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_generic_armv7a_vfpv4, supports_neon_vfpv4 }, + { 4, "unroll_4_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_unroll_4_armv7a_vfpv4, supports_neon_vfpv4 }, + { 5, "bad_implementation_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_bad_implementation_armv7a_vfpv4, supports_neon_vfpv4 }, + { 6, "generic_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_generic_armv7a_vfpv3, supports_neon_vfpv3 }, + { 7, "unroll_4_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_unroll_4_armv7a_vfpv3, supports_neon_vfpv3 }, + { 8, "bad_implementation_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_bad_implementation_armv7a_vfpv3, supports_neon_vfpv3 }, + { 9, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 10, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86_64 + { 0, 
"generic_x86_64_avx2", "x86_64_avx2", starch_subtract_n_generic_x86_64_avx2, supports_x86_avx2 }, + { 1, "generic_x86_64_avx", "x86_64_avx", starch_subtract_n_generic_x86_64_avx, supports_x86_avx }, + { 2, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 3, "unroll_4_x86_64_avx2", "x86_64_avx2", starch_subtract_n_unroll_4_x86_64_avx2, supports_x86_avx2 }, + { 4, "bad_implementation_x86_64_avx2", "x86_64_avx2", starch_subtract_n_bad_implementation_x86_64_avx2, supports_x86_avx2 }, + { 5, "unroll_4_x86_64_avx", "x86_64_avx", starch_subtract_n_unroll_4_x86_64_avx, supports_x86_avx }, + { 6, "bad_implementation_x86_64_avx", "x86_64_avx", starch_subtract_n_bad_implementation_x86_64_avx, supports_x86_avx }, + { 7, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 8, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_X86_64 */ + { 0, NULL, NULL, NULL, NULL } +}; + +/* dispatcher / registry for subtract_n_aligned */ + +starch_subtract_n_aligned_regentry * starch_subtract_n_aligned_select() { + for (starch_subtract_n_aligned_regentry *entry = starch_subtract_n_aligned_registry; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static void starch_subtract_n_aligned_dispatch ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ) { + starch_subtract_n_aligned_regentry *entry = starch_subtract_n_aligned_select(); + if (!entry) + abort(); + + starch_subtract_n_aligned = entry->callable; + starch_subtract_n_aligned ( arg0, arg1, arg2, arg3 ); +} + +starch_subtract_n_aligned_ptr starch_subtract_n_aligned = starch_subtract_n_aligned_dispatch; + +void starch_subtract_n_aligned_set_wisdom (const char * const * received_wisdom) +{ + /* re-rank the registry based on received wisdom */ + starch_subtract_n_aligned_regentry *entry; + for (entry = 
starch_subtract_n_aligned_registry; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - starch_subtract_n_aligned_registry); + } + } + + /* re-sort based on the new ranking */ + qsort(starch_subtract_n_aligned_registry, entry - starch_subtract_n_aligned_registry, sizeof(starch_subtract_n_aligned_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_subtract_n_aligned = starch_subtract_n_aligned_dispatch; +} + +starch_subtract_n_aligned_regentry starch_subtract_n_aligned_registry[] = { + +#ifdef STARCH_MIX_GENERIC + { 0, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 1, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 2, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_GENERIC */ + +#ifdef STARCH_MIX_ARM + { 0, "generic_armv7a_vfpv4_aligned", "armv7a_vfpv4", starch_subtract_n_aligned_generic_armv7a_vfpv4, supports_neon_vfpv4 }, + { 1, "unroll_4_armv7a_vfpv4_aligned", "armv7a_vfpv4", starch_subtract_n_aligned_unroll_4_armv7a_vfpv4, supports_neon_vfpv4 }, + { 2, "bad_implementation_armv7a_vfpv4_aligned", "armv7a_vfpv4", starch_subtract_n_aligned_bad_implementation_armv7a_vfpv4, supports_neon_vfpv4 }, + { 3, "neon_intrinsics_armv7a_vfpv4_aligned", "armv7a_vfpv4", starch_subtract_n_aligned_neon_intrinsics_armv7a_vfpv4, supports_neon_vfpv4 }, + { 4, "generic_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_generic_armv7a_vfpv4, supports_neon_vfpv4 }, + { 5, "unroll_4_armv7a_vfpv4", 
"armv7a_vfpv4", starch_subtract_n_unroll_4_armv7a_vfpv4, supports_neon_vfpv4 }, + { 6, "bad_implementation_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_bad_implementation_armv7a_vfpv4, supports_neon_vfpv4 }, + { 7, "neon_intrinsics_armv7a_vfpv4", "armv7a_vfpv4", starch_subtract_n_neon_intrinsics_armv7a_vfpv4, supports_neon_vfpv4 }, + { 8, "generic_armv7a_vfpv3_aligned", "armv7a_vfpv3", starch_subtract_n_aligned_generic_armv7a_vfpv3, supports_neon_vfpv3 }, + { 9, "unroll_4_armv7a_vfpv3_aligned", "armv7a_vfpv3", starch_subtract_n_aligned_unroll_4_armv7a_vfpv3, supports_neon_vfpv3 }, + { 10, "bad_implementation_armv7a_vfpv3_aligned", "armv7a_vfpv3", starch_subtract_n_aligned_bad_implementation_armv7a_vfpv3, supports_neon_vfpv3 }, + { 11, "neon_intrinsics_armv7a_vfpv3_aligned", "armv7a_vfpv3", starch_subtract_n_aligned_neon_intrinsics_armv7a_vfpv3, supports_neon_vfpv3 }, + { 12, "generic_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_generic_armv7a_vfpv3, supports_neon_vfpv3 }, + { 13, "unroll_4_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_unroll_4_armv7a_vfpv3, supports_neon_vfpv3 }, + { 14, "bad_implementation_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_bad_implementation_armv7a_vfpv3, supports_neon_vfpv3 }, + { 15, "neon_intrinsics_armv7a_vfpv3", "armv7a_vfpv3", starch_subtract_n_neon_intrinsics_armv7a_vfpv3, supports_neon_vfpv3 }, + { 16, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 17, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 18, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_ARM */ + +#ifdef STARCH_MIX_X86_64 + { 0, "generic_x86_64_avx2_aligned", "x86_64_avx2", starch_subtract_n_aligned_generic_x86_64_avx2, supports_x86_avx2 }, + { 1, "unroll_4_x86_64_avx2_aligned", "x86_64_avx2", starch_subtract_n_aligned_unroll_4_x86_64_avx2, supports_x86_avx2 }, + { 2, "bad_implementation_x86_64_avx2_aligned", "x86_64_avx2", 
starch_subtract_n_aligned_bad_implementation_x86_64_avx2, supports_x86_avx2 }, + { 3, "generic_x86_64_avx2", "x86_64_avx2", starch_subtract_n_generic_x86_64_avx2, supports_x86_avx2 }, + { 4, "unroll_4_x86_64_avx2", "x86_64_avx2", starch_subtract_n_unroll_4_x86_64_avx2, supports_x86_avx2 }, + { 5, "bad_implementation_x86_64_avx2", "x86_64_avx2", starch_subtract_n_bad_implementation_x86_64_avx2, supports_x86_avx2 }, + { 6, "generic_x86_64_avx_aligned", "x86_64_avx", starch_subtract_n_aligned_generic_x86_64_avx, supports_x86_avx }, + { 7, "unroll_4_x86_64_avx_aligned", "x86_64_avx", starch_subtract_n_aligned_unroll_4_x86_64_avx, supports_x86_avx }, + { 8, "bad_implementation_x86_64_avx_aligned", "x86_64_avx", starch_subtract_n_aligned_bad_implementation_x86_64_avx, supports_x86_avx }, + { 9, "generic_x86_64_avx", "x86_64_avx", starch_subtract_n_generic_x86_64_avx, supports_x86_avx }, + { 10, "unroll_4_x86_64_avx", "x86_64_avx", starch_subtract_n_unroll_4_x86_64_avx, supports_x86_avx }, + { 11, "bad_implementation_x86_64_avx", "x86_64_avx", starch_subtract_n_bad_implementation_x86_64_avx, supports_x86_avx }, + { 12, "generic_generic", "generic", starch_subtract_n_generic_generic, NULL }, + { 13, "unroll_4_generic", "generic", starch_subtract_n_unroll_4_generic, NULL }, + { 14, "bad_implementation_generic", "generic", starch_subtract_n_bad_implementation_generic, NULL }, +#endif /* STARCH_MIX_X86_64 */ + { 0, NULL, NULL, NULL, NULL } +}; + + +int starch_read_wisdom (const char * path) +{ + FILE *fp = fopen(path, "r"); + if (!fp) + return -1; + + /* reset all ranks to identify entries not listed in the wisdom file; we'll assign ranks at the end to produce a stable sort */ + int rank_subtract_n = 0; + for (starch_subtract_n_regentry *entry = starch_subtract_n_registry; entry->name; ++entry) { + entry->rank = 0; + } + int rank_subtract_n_aligned = 0; + for (starch_subtract_n_aligned_regentry *entry = starch_subtract_n_aligned_registry; entry->name; ++entry) { + entry->rank 
= 0; + } + + char linebuf[512]; + while (fgets(linebuf, sizeof(linebuf), fp)) { + /* split name and impl on whitespace, handle comments etc */ + char *name = linebuf; + while (*name && isspace(*name)) + ++name; + + if (!*name || *name == '#') + continue; + + char *end = name; + while (*end && !isspace(*end)) + ++end; + + if (!*end) + continue; + *end = 0; + + char *impl = end + 1; + while (*impl && isspace(*impl)) + ++impl; + + if (!*impl) + continue; + + end = impl; + while (*end && !isspace(*end)) + ++end; + + *end = 0; + + /* try to find a matching registry entry */ + if (!strcmp(name, "subtract_n")) { + for (starch_subtract_n_regentry *entry = starch_subtract_n_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_subtract_n; + break; + } + } + continue; + } + if (!strcmp(name, "subtract_n_aligned")) { + for (starch_subtract_n_aligned_regentry *entry = starch_subtract_n_aligned_registry; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_subtract_n_aligned; + break; + } + } + continue; + } + } + + if (ferror(fp)) { + fclose(fp); + return -1; + } + + fclose(fp); + + /* assign ranks to unmatched items to (stable) sort them last; re-sort everything */ + { + starch_subtract_n_regentry *entry; + for (entry = starch_subtract_n_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_subtract_n; + } + qsort(starch_subtract_n_registry, entry - starch_subtract_n_registry, sizeof(starch_subtract_n_regentry), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_subtract_n = starch_subtract_n_dispatch; + } + { + starch_subtract_n_aligned_regentry *entry; + for (entry = starch_subtract_n_aligned_registry; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_subtract_n_aligned; + } + qsort(starch_subtract_n_aligned_registry, entry - starch_subtract_n_aligned_registry, sizeof(starch_subtract_n_aligned_regentry), 
starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + starch_subtract_n_aligned = starch_subtract_n_aligned_dispatch; + } + + return 0; +} diff --git a/starch/example/generated/flavor.armv7a_vfpv3.c b/starch/example/generated/flavor.armv7a_vfpv3.c new file mode 100644 index 0000000..ad81cf2 --- /dev/null +++ b/starch/example/generated/flavor.armv7a_vfpv3.c @@ -0,0 +1,33 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_ARMV7A_VFPV3 +#define STARCH_FEATURE_NEON + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv7a_vfpv3 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_vfpv3 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv7a_vfpv3 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_vfpv3 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/flavor.armv7a_vfpv4.c b/starch/example/generated/flavor.armv7a_vfpv4.c new file mode 100644 index 0000000..e6c117c --- /dev/null +++ b/starch/example/generated/flavor.armv7a_vfpv4.c @@ -0,0 +1,33 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_ARMV7A_VFPV4 +#define STARCH_FEATURE_NEON + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## armv7a_vfpv4 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## armv7a_vfpv4 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## armv7a_vfpv4 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## armv7a_vfpv4 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/flavor.generic.c b/starch/example/generated/flavor.generic.c new file mode 100644 index 0000000..02d52b4 --- /dev/null +++ b/starch/example/generated/flavor.generic.c @@ -0,0 +1,17 @@ + +/* starch generated code. Do not edit. */ + +#define STARCH_FLAVOR_GENERIC + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## generic +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## generic +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/flavor.x86_64_avx.c b/starch/example/generated/flavor.x86_64_avx.c new file mode 100644 index 0000000..8eea708 --- /dev/null +++ b/starch/example/generated/flavor.x86_64_avx.c @@ -0,0 +1,32 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_X86_64_AVX + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## x86_64_avx +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_64_avx +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## x86_64_avx +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_64_avx +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/flavor.x86_64_avx2.c b/starch/example/generated/flavor.x86_64_avx2.c new file mode 100644 index 0000000..50f2c5f --- /dev/null +++ b/starch/example/generated/flavor.x86_64_avx2.c @@ -0,0 +1,32 @@ + +/* starch generated code. Do not edit. 
*/ + +#define STARCH_FLAVOR_X86_64_AVX2 + +#include "starch.h" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _ ## x86_64_avx2 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _ ## _impl ## _ ## x86_64_avx2 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) starch_ ## _name ## _aligned_ ## x86_64_avx2 +#define STARCH_IMPL(_function,_impl) starch_ ## _function ## _aligned_ ## _impl ## _ ## x86_64_avx2 +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +#include "../impl/subtract_n.c" + diff --git a/starch/example/generated/makefile.arm b/starch/example/generated/makefile.arm new file mode 100644 index 0000000..d312adc --- /dev/null +++ b/starch/example/generated/makefile.arm @@ -0,0 +1,42 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. 
The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_ARM + + +generated/flavor.armv7a_vfpv4.o: generated/flavor.armv7a_vfpv4.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv4 -mfpu=neon-vfpv4 -ffast-math generated/flavor.armv7a_vfpv4.c -o generated/flavor.armv7a_vfpv4.o + +generated/flavor.armv7a_vfpv3.o: generated/flavor.armv7a_vfpv3.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -march=armv7-a+neon-vfpv3 -mfpu=neon-vfpv3 -ffast-math generated/flavor.armv7a_vfpv3.c -o generated/flavor.armv7a_vfpv3.o + +generated/flavor.generic.o: generated/flavor.generic.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/flavor.generic.c -o generated/flavor.generic.o + +generated/dispatcher.o: generated/dispatcher.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/dispatcher.c -o generated/dispatcher.o + +STARCH_OBJS := generated/flavor.armv7a_vfpv4.o generated/flavor.armv7a_vfpv3.o generated/flavor.generic.o generated/dispatcher.o + + +generated/benchmark.o: generated/benchmark.c benchmark/subtract_n_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) 
generated/benchmark.c -o generated/benchmark.o + +STARCH_BENCHMARK_OBJ := generated/benchmark.o diff --git a/starch/example/generated/makefile.generic b/starch/example/generated/makefile.generic new file mode 100644 index 0000000..a98971f --- /dev/null +++ b/starch/example/generated/makefile.generic @@ -0,0 +1,36 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_GENERIC + + +generated/flavor.generic.o: generated/flavor.generic.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/flavor.generic.c -o generated/flavor.generic.o + +generated/dispatcher.o: generated/dispatcher.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/dispatcher.c -o generated/dispatcher.o + +STARCH_OBJS := generated/flavor.generic.o generated/dispatcher.o + + +generated/benchmark.o: generated/benchmark.c benchmark/subtract_n_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/benchmark.c -o generated/benchmark.o + +STARCH_BENCHMARK_OBJ := 
generated/benchmark.o diff --git a/starch/example/generated/makefile.x86_64 b/starch/example/generated/makefile.x86_64 new file mode 100644 index 0000000..8cd9d6d --- /dev/null +++ b/starch/example/generated/makefile.x86_64 @@ -0,0 +1,42 @@ +# -*- makefile -*- + + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -DSTARCH_MIX_X86_64 + + +generated/flavor.x86_64_avx2.o: generated/flavor.x86_64_avx2.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx2 -ffast-math generated/flavor.x86_64_avx2.c -o generated/flavor.x86_64_avx2.o + +generated/flavor.x86_64_avx.o: generated/flavor.x86_64_avx.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) -mavx -ffast-math generated/flavor.x86_64_avx.c -o generated/flavor.x86_64_avx.o + +generated/flavor.generic.o: generated/flavor.generic.c impl/subtract_n.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/flavor.generic.c -o generated/flavor.generic.o + +generated/dispatcher.o: generated/dispatcher.c impl/subtract_n.c + $(STARCH_COMPILE) 
$(STARCH_CFLAGS) generated/dispatcher.c -o generated/dispatcher.o + +STARCH_OBJS := generated/flavor.x86_64_avx2.o generated/flavor.x86_64_avx.o generated/flavor.generic.o generated/dispatcher.o + + +generated/benchmark.o: generated/benchmark.c benchmark/subtract_n_benchmark.c + $(STARCH_COMPILE) $(STARCH_CFLAGS) generated/benchmark.c -o generated/benchmark.o + +STARCH_BENCHMARK_OBJ := generated/benchmark.o diff --git a/starch/example/generated/starch.h b/starch/example/generated/starch.h new file mode 100644 index 0000000..407b705 --- /dev/null +++ b/starch/example/generated/starch.h @@ -0,0 +1,133 @@ + +/* starch generated code. Do not edit. */ + +#include + +/* mixes */ + +/* Generic build, compiler defaults only */ +#ifdef STARCH_MIX_GENERIC +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 1 +#endif /* STARCH_MIX_GENERIC */ + +/* ARM */ +#ifdef STARCH_MIX_ARM +#define STARCH_FLAVOR_ARMV7A_VFPV4 +#define STARCH_FLAVOR_ARMV7A_VFPV3 +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 16 +#endif /* STARCH_MIX_ARM */ + +/* x64-64 */ +#ifdef STARCH_MIX_X86_64 +#define STARCH_FLAVOR_X86_64_AVX2 +#define STARCH_FLAVOR_X86_64_AVX +#define STARCH_FLAVOR_GENERIC +#define STARCH_MIX_ALIGNMENT 32 +#endif /* STARCH_MIX_X86_64 */ + + +#ifdef STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_IS_ALIGNED(_ptr) (((uintptr_t)(_ptr) & (STARCH_MIX_ALIGNMENT-1)) == 0) +#else +/* mix not defined, alignment is unknown, treat everything as unaligned */ +#define STARCH_IS_ALIGNED(_ptr) (0) +#endif + + +/* entry points and registries */ + +typedef void (* starch_subtract_n_ptr) ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +extern starch_subtract_n_ptr starch_subtract_n; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_subtract_n_ptr callable; + int (*flavor_supported)(); +} starch_subtract_n_regentry; + +extern starch_subtract_n_regentry starch_subtract_n_registry[]; 
+starch_subtract_n_regentry * starch_subtract_n_select(); +void starch_subtract_n_set_wisdom( const char * const * received_wisdom ); + +typedef void (* starch_subtract_n_aligned_ptr) ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +extern starch_subtract_n_aligned_ptr starch_subtract_n_aligned; + +typedef struct { + int rank; + const char *name; + const char *flavor; + starch_subtract_n_aligned_ptr callable; + int (*flavor_supported)(); +} starch_subtract_n_aligned_regentry; + +extern starch_subtract_n_aligned_regentry starch_subtract_n_aligned_registry[]; +starch_subtract_n_aligned_regentry * starch_subtract_n_aligned_select(); +void starch_subtract_n_aligned_set_wisdom( const char * const * received_wisdom ); + +/* flavors and prototypes */ + +#ifdef STARCH_FLAVOR_GENERIC +void starch_subtract_n_generic_generic ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_unroll_4_generic ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_generic ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_GENERIC */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_ARMV7A_VFPV3 +int supports_neon_vfpv3 (void); +void starch_subtract_n_generic_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_generic_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_unroll_4_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_unroll_4_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void 
starch_subtract_n_aligned_bad_implementation_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_neon_intrinsics_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_neon_intrinsics_armv7a_vfpv3 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_ARMV7A_VFPV3 */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_ARMV7A_VFPV4 +int supports_neon_vfpv4 (void); +void starch_subtract_n_generic_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_generic_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_unroll_4_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_unroll_4_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_bad_implementation_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_neon_intrinsics_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_neon_intrinsics_armv7a_vfpv4 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_ARMV7A_VFPV4 */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_X86_64_AVX +int supports_x86_avx (void); +void starch_subtract_n_generic_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_generic_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void 
starch_subtract_n_unroll_4_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_unroll_4_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_bad_implementation_x86_64_avx ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_X86_64_AVX */ + +int starch_read_wisdom (const char * path); + +#ifdef STARCH_FLAVOR_X86_64_AVX2 +int supports_x86_avx2 (void); +void starch_subtract_n_generic_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_generic_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_unroll_4_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_unroll_4_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_bad_implementation_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +void starch_subtract_n_aligned_bad_implementation_x86_64_avx2 ( const uint16_t * arg0, unsigned arg1, uint16_t arg2, uint16_t * arg3 ); +#endif /* STARCH_FLAVOR_X86_64_AVX2 */ + +int starch_read_wisdom (const char * path); + diff --git a/starch/example/impl/subtract_n.c b/starch/example/impl/subtract_n.c new file mode 100644 index 0000000..830ee0e --- /dev/null +++ b/starch/example/impl/subtract_n.c @@ -0,0 +1,94 @@ +void STARCH_IMPL(subtract_n, generic) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + out_align[0] = in_align[0] - n; + in_align++; + out_align++; + } 
+} + +void STARCH_IMPL(subtract_n, unroll_4) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + unsigned len4 = len >> 2; + unsigned len1 = len & 3; + + while (len4--) { + out_align[0] = in_align[0] - n; + out_align[1] = in_align[1] - n; + out_align[2] = in_align[2] - n; + out_align[3] = in_align[3] - n; + in_align += 4; + out_align += 4; + } + + while (len1--) { + out_align[0] = in_align[0] - n; + in_align++; + out_align++; + } +} + +void STARCH_IMPL(subtract_n, bad_implementation) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + // This is a deliberately bad implementation that produces + // incorrect results. The error should be caught during + // benchmarking via STARCH_BENCHMARK_VERIFY. + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + while (len--) { + out_align[0] = in_align[0] - n; + if (len % 10000 == 0) + out_align[0] += 1; + in_align++; + out_align++; + } +} + +#ifdef STARCH_FEATURE_NEON + +#include + +void STARCH_IMPL_REQUIRES(subtract_n, neon_intrinsics, STARCH_FEATURE_NEON) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out) +{ + const uint16_t * restrict in_align = STARCH_ALIGNED(in); + uint16_t * restrict out_align = STARCH_ALIGNED(out); + + uint16x8_t subtractor = vdupq_n_u16(n); + + unsigned len8 = len >> 3; + unsigned len4 = len & 4; + unsigned len1 = len & 3; + + while (len8--) { + uint16x8_t value = vld1q_u16(in_align); + uint16x8_t result = vsubq_u16(value, subtractor); + vst1q_u16(out_align, result); + in_align += 8; + out_align += 8; + } + + if (len4) { + uint16x4_t value = vld1_u16(in_align); + uint16x4_t result = vsub_u16(value, vget_low_u16(subtractor)); + vst1_u16(out_align, result); + in_align += 4; + out_align += 4; + } + + while (len1--) { + uint16x4_t value = vld1_dup_u16(in_align); + uint16x4_t result = 
vsub_u16(value, vget_low_u16(subtractor)); + vst1_lane_u16(out_align, result, 0); + in_align++; + out_align++; + } +} + +#endif diff --git a/starch/example/starchgen.py b/starch/example/starchgen.py new file mode 100755 index 0000000..993fa57 --- /dev/null +++ b/starch/example/starchgen.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 + +import sys +import os +import glob + +example_dir = os.path.dirname(sys.argv[0]) +starch_dir = os.path.join(example_dir, '..') +sys.path.append(starch_dir) +import starch + +gen = starch.Generator(runtime_dir = example_dir, + output_dir = os.path.join(example_dir, 'generated')) + +gen.add_include('') + +gen.add_function(name = 'subtract_n', argtypes = ['const uint16_t *', 'unsigned', 'uint16_t', 'uint16_t *'], aligned=True) + +gen.add_feature(name='neon', + description='ARM NEON v2') + +gen.add_flavor(name = 'generic', + description = 'Generic build, default compiler options', + compile_flags = []) +gen.add_flavor(name = 'armv7a_vfpv3', + description = 'ARMv7-A, NEON, VFPv3', + compile_flags = ['-march=armv7-a+neon-vfpv3', '-mfpu=neon-vfpv3', '-ffast-math'], + features = ['neon'], + test_function = 'supports_neon_vfpv3', + alignment=16) +gen.add_flavor(name = 'armv7a_vfpv4', + description = 'ARMv7-A, NEON, VFPv4', + compile_flags = ['-march=armv7-a+neon-vfpv4', '-mfpu=neon-vfpv4', '-ffast-math'], + features = ['neon'], + test_function = 'supports_neon_vfpv4', + alignment=16) +gen.add_flavor(name = 'x86_64_avx', + description = 'x86-64 with AVX', + compile_flags = ['-mavx', '-ffast-math'], + features = [], + test_function = 'supports_x86_avx', + alignment=32) +gen.add_flavor(name = 'x86_64_avx2', + description = 'x86-64 with AVX2', + compile_flags = ['-mavx2', '-ffast-math'], + features = [], + test_function = 'supports_x86_avx2', + alignment=32) + +gen.add_mix(name = 'generic', + description = 'Generic build, compiler defaults only', + flavors = ['generic']) + +gen.add_mix(name = 'arm', + description = 'ARM', + flavors = 
['armv7a_vfpv4', 'armv7a_vfpv3', 'generic'], + wisdom = { + 'subtract_n': ['neon_intrinsics_armv7a_vfpv4', 'neon_intrinsics_armv7a_vfpv3', 'generic_generic'] + }) +gen.add_mix(name = 'x86_64', + description = 'x64-64', + flavors = ['x86_64_avx2', 'x86_64_avx', 'generic'], + wisdom = { + 'subtract_n': ['generic_x86_64_avx2', 'generic_x86_64_avx', 'unroll_4_generic'] + }) + +for pattern in ['impl/*.c', 'benchmark/*.c']: + for c_file in glob.glob(pattern): + gen.scan_file(c_file) + +gen.generate() diff --git a/starch/example/support.c b/starch/example/support.c new file mode 100644 index 0000000..887a823 --- /dev/null +++ b/starch/example/support.c @@ -0,0 +1,53 @@ +#ifdef __arm__ + +#include +#include + +int supports_neon_vfpv3(void) +{ + long hwcaps = getauxval(AT_HWCAP); + return (hwcaps & HWCAP_ARM_NEON) && (hwcaps & HWCAP_ARM_VFPv3); +} + +int supports_neon_vfpv4(void) +{ + long hwcaps = getauxval(AT_HWCAP); + return (hwcaps & HWCAP_ARM_NEON) && (hwcaps & HWCAP_ARM_VFPv4); +} + +#endif /* __arm__ */ + +#ifdef __x86_64__ + +#include +#include + +int supports_x86_avx(void) +{ + unsigned int maxlevel = __get_cpuid_max (0, 0); + if (maxlevel < 1) + return 0; + + unsigned eax, ebx, ecx, edx; + __cpuid(1, eax, ebx, ecx, edx); + if (!(ecx & bit_AVX)) + return 0; + + return 1; +} + +int supports_x86_avx2(void) +{ + unsigned int maxlevel = __get_cpuid_max (0, 0); + if (maxlevel < 7) + return 0; + + unsigned eax, ebx, ecx, edx; + __cpuid_count(7, 0, eax, ebx, ecx, edx); + if (!(ebx & bit_AVX2)) + return 0; + + return 1; +} + +#endif diff --git a/starch/starch.py b/starch/starch.py new file mode 100644 index 0000000..f20ae3c --- /dev/null +++ b/starch/starch.py @@ -0,0 +1,583 @@ +# starch: framework glue for selecting ISA-specific code at runtime + +# Copyright (c) 2020, FlightAware LLC. +# All rights reserved. +# See the LICENSE file for licensing terms. 
+ +import sys +import re +import os +import mako.lookup + +from typing import Optional, Union, Iterable, Sequence, MutableSequence, Mapping, MutableMapping, FrozenSet + +class Feature(object): + """Feature represents a type of code that can only be built with +certain compiler flags. For example, code that uses NEON intrinsics +can only be compiled if the compiler is building for an ARM instruction +set that supports NEON. Implementation code should be conditionally +compiled using the corresponding macro name, and should declare +themselves using the STARCH_IMPL_REQUIRES macro.""" + + gen: 'Generator' + name: str + description: str + + def __init__(self, + gen: 'Generator', + name: str, + description: str): + self.gen = gen + self.name = name + self.description = description + + @property + def macro(self) -> str: + return 'STARCH_FEATURE_' + self.name.upper() + + +class BuildFlavor(object): + """BuildFlavor models code built with specific compiler flags. +Shared implementation code will be built multiple times, once per flavor. + +Each flavor has an associated test function that is called at runtime to +check if the current hardware supports the code emitted by the flavor. If +the test function returns false, no code built with the flavor will be executed. + +Each flavor has a (possibly empty) list of optional Features that may +be present at runtime. This list controls which feature-dependent code is +compiled for this flavor (e.g. 
an x86 flavor might try to build code that +depends on SSE, but should not try to build code that depends on ARM NEON +intrinsics)""" + + gen: 'Generator' + name: str + description: str + compile_flags: Sequence[str] + features: FrozenSet[Feature] + test_function: Optional[str] + alignment: int + + def __init__(self, + gen: 'Generator', + name: str, + description: str, + compile_flags: Iterable[str] = (), + features: Iterable[Feature] = (), + test_function: Optional[str] = None, + alignment: int = 1): + + self.gen = gen + self.name = name + self.compile_flags = tuple(compile_flags) + self.features = frozenset(features) + self.test_function = test_function + self.alignment = alignment + + @property + def macro(self) -> str: + return 'STARCH_FLAVOR_' + self.name.upper() + + @property + def test_function_expr(self) -> str: + if self.test_function is None: + return "NULL" + else: + return self.test_function + + @property + def cflags(self) -> str: + return ' '.join(self.compile_flags) + + +class Function(object): + """A user-callable function that will be dispatched to +one of the many possible implementations based on runtime feature +support.""" + + gen: 'Generator' + name: str + returntype: str + argtypes: Sequence[str] + argnames: Sequence[str] + impls: Sequence['FunctionImpl'] + benchmark: Optional['SourceFile'] = None + benchmark_verify: Optional['SourceFile'] = None + aligned: bool + aligned_pair: Optional['Function'] = None + + def __init__(self, + gen: 'Generator', + name: str, + argtypes: Iterable[str], + returntype: str = 'void', + argnames: Optional[Iterable[str]] = None, + aligned: bool = False): + + self.gen = gen + self.name = name + self.returntype = returntype + self.argtypes = tuple(argtypes) + self.aligned = aligned + self.impls = [] + + if argnames is None: + self.argnames = tuple( f'arg{n}' for n in range(len(self.argtypes)) ) + else: + self.argnames = tuple(argnames) + if len(self.argnames) != len(self.argtypes): + raise ValueError('length of 
argnames must match length of argtypes') + + @property + def declaration_arglist(self) -> str: + return ', '.join([f'{typename} {argname}' for typename, argname in zip(self.argtypes, self.argnames)]) + + @property + def named_arglist(self) -> str: + return ', '.join(self.argnames) + + @property + def callable_symbol(self) -> str: + if self.gen.prefix_function_symbols: + return self.gen.sym(self.name) + else: + return self.name + + @property + def select_symbol(self) -> str: + return self.gen.sym(self.name + '_select') + + @property + def dispatcher_symbol(self) -> str: + return self.gen.sym(self.name + '_dispatch') + + @property + def pointer_type(self) -> str: + return self.gen.sym(self.name + '_ptr') + + @property + def regentry_type(self) -> str: + return self.gen.sym(self.name + '_regentry') + + @property + def registry_symbol(self) -> str: + return self.gen.sym(self.name + '_registry') + + @property + def set_wisdom_symbol(self) -> str: + return self.gen.sym(self.name + '_set_wisdom') + + @property + def benchmark_symbol(self) -> str: + return self.gen.sym(self.name + '_benchmark') + + @property + def benchmark_verify_symbol(self) -> str: + return self.gen.sym(self.name + '_benchmark_verify') + + +class FunctionImpl(object): + """A possible implementation of a function, not built in any particular way yet.""" + + gen: 'Generator' + function: Function + name: str + feature: Optional[Feature] + source: 'SourceFile' + lineno: int + + def __init__(self, + gen: 'Generator', + function: Function, + name: str, + feature: Optional[Feature], + source: 'SourceFile', + lineno: int): + self.gen = gen + self.function = function + self.name = name + self.feature = feature + self.source = source + self.lineno = lineno + + def wisdom_name(self, flavor) -> str: + if self.function.aligned: + return self.name + '_' + flavor.name + '_aligned' + else: + return self.name + '_' + flavor.name + + def impl_symbol(self, flavor) -> str: + return self.gen.sym(self.function.name + '_' + 
self.name + '_' + flavor.name) + + +class SourceFile(object): + """A scanned source file that contains implementation code.""" + + path: str + impls: Sequence[FunctionImpl] + + def __init__(self, path): + self.path = path + self.impls = [] + + +class BuildMix(object): + """A combination of build flavors that make up one possible way of building all +the code. The output of a mix is a library that dispatches functions within the +mixed flavors. For example, when building a binary that is intended to run on +generic ARM systems, a mix could be used that includes flavors for ARMv6, ARMv7, +and ARMv8. + +The order of flavors within a mix is significant. At runtime, flavors will be tried +in order until a supported flavor is found; so more efficient flavors should be +specified first.""" + + name: str + description: str + flavors: Sequence[BuildFlavor] + wisdom: Mapping[Function,Sequence[str]] + + def __init__(self, + name: str, + description: str, + flavors: Iterable[BuildFlavor], + wisdom: Mapping[Function,Iterable[str]] = {}): + self.name = name + self.description = description + self.flavors = tuple(flavors) + self.wisdom = dict( (k,tuple(v)) for k, v in wisdom.items() ) + + @property + def macro(self): + return 'STARCH_MIX_' + self.name.upper() + + def function_wisdom(self, function) -> Sequence[str]: + return self.wisdom.get(function, []) + +class Generator(object): + functions: MutableMapping[str, Function] + features: MutableMapping[str, Feature] + features_by_macro: MutableMapping[str, Feature] + flavors: MutableMapping[str, BuildFlavor] + function_impls: MutableMapping[str, FunctionImpl] + impl_files: MutableSequence[SourceFile] + benchmark_files: MutableSequence[SourceFile] + mixes: MutableMapping[str, BuildMix] + symbol_prefix: str + templates: mako.lookup.TemplateLookup + generated_include_path: str + generated_flavor_pattern: str + generated_dispatcher_path: str + generated_benchmark_path: str + generated_makefile_pattern: str + includes: 
MutableSequence[str] = [] + + def __init__(self, + runtime_dir: str, + output_dir: str, + template_dir: Optional[str] = None, + mako_dir: Optional[str] = None, + generated_include_path: str = 'starch.h', + generated_flavor_pattern: str = 'flavor.{0}.c', + generated_dispatcher_path: str = 'dispatcher.c', + generated_benchmark_path: str = 'benchmark.c', + generated_makefile_pattern: str = 'makefile.{0}', + symbol_prefix: str = 'starch_', + prefix_function_symbols: bool = True): + self.runtime_dir = runtime_dir + self.output_dir = output_dir + self.generated_include_path = os.path.join(output_dir, generated_include_path) + self.generated_flavor_pattern = generated_flavor_pattern + self.generated_dispatcher_path = os.path.join(output_dir, generated_dispatcher_path) + self.generated_benchmark_path = os.path.join(output_dir, generated_benchmark_path) + self.generated_makefile_pattern = generated_makefile_pattern + self.symbol_prefix = symbol_prefix + self.prefix_function_symbols = prefix_function_symbols + + if template_dir is None and '__file__' in globals(): + template_dir = os.path.join(os.path.dirname(__file__), 'templates') + if template_dir is None: + raise RuntimeError('cannot determine template directory location, please specify template_dir') + self.templates = mako.lookup.TemplateLookup(directories = [template_dir], module_directory = mako_dir, imports=['import os']) + + self.functions = {} + self.features = {} + self.features_by_macro = {} + self.flavors = {} + self.function_impls = {} + self.impl_files = [] + self.benchmark_files = [] + self.mixes = {} + self.includes = [] + + def generated_flavor_path(self, flavor: BuildFlavor) -> str: + return os.path.join(self.output_dir, self.generated_flavor_pattern.format(flavor.name)) + + def generated_makefile_path(self, mix: BuildMix) -> str: + return os.path.join(self.output_dir, self.generated_makefile_pattern.format(mix.name)) + + def add_include(self, what): + if what[0] == '<' or what[0] == '"': + 
self.includes.append(what) + else: + self.includes.append('"' + what + '"') + + def add_feature(self, + name: str, + description: str): + if name in self.features: + raise RuntimeError('duplicated feature: ' + name) # report the clash as a feature, not a flavor + feature = Feature(self, name, description) + self.features[name] = self.features_by_macro[feature.macro] = feature + + def get_feature(self, key: Union[str, Feature]) -> Feature: + if isinstance(key, Feature): + return key + return self.features[key] + + def get_feature_macro(self, key: str) -> Optional[Feature]: + return self.features_by_macro.get(key, None) + + def add_function(self, + name: str, + argtypes: Iterable[str], + returntype: str = 'void', + argnames: Optional[Iterable[str]] = None, + aligned: bool = False): + if name in self.functions: + raise RuntimeError('duplicated function: ' + name) + + base_function = Function(self, name, argtypes, returntype, argnames, aligned = False) + aligned_function: Optional[Function] = None + if aligned: + aligned_function = Function(self, name + '_aligned', argtypes, returntype, argnames, aligned = True) + base_function.aligned_pair = aligned_function + aligned_function.aligned_pair = base_function + + self.functions[base_function.name] = base_function + if aligned_function: + self.functions[aligned_function.name] = aligned_function + + def get_function(self, key: Union[str, Function]) -> Function: + if isinstance(key, Function): + return key + return self.functions[key] + + def add_flavor(self, + name: str, + description: str, + compile_flags: Iterable[str] = (), + features: Iterable[Union[Feature,str]] = (), + test_function: Optional[str] = None, + alignment: int = 1): + if name in self.flavors: + raise RuntimeError('duplicated flavor: ' + name) + resolved_features = map(self.get_feature, features) + self.flavors[name] = BuildFlavor(self, name, description, compile_flags, resolved_features, test_function, alignment) + + def get_flavor(self, key: Union[str, BuildFlavor]) -> BuildFlavor: + if
isinstance(key, BuildFlavor): + return key + return self.flavors[key] + + def load_wisdom(self, path: str) -> Mapping[str,Sequence[str]]: + """Read "function impl" pairs from a wisdom file, grouped by known function in file order; blank lines, '#' comment lines and unknown functions are skipped, and a file that cannot be opened gives an empty result plus a warning.""" + results: Mapping[Function,Sequence[str]] = {} + + try: + f = open(path, 'r') + except IOError: + self.warning(None, None, f"ignoring missing wisdom file {path}") + return results + + with f: + for line in f: + line = line.strip() + if line == '' or line.startswith('#'): + continue + + # the line is already stripped, so a plain split() separates it on runs of whitespace + parts = line.split() + if len(parts) < 2: + continue + + func, impl = parts[:2] + if func in self.functions: + results.setdefault(self.functions[func], []).append(impl) + else: + self.warning(None, None, f"ignoring unknown function {func} in wisdom file {path}") + + return results + + def add_mix(self, + name: str, + description: str, + flavors: Iterable[Union[BuildFlavor,str]], + wisdom: Mapping[Union[Function,str],Iterable[str]] = {}, + wisdom_file: Optional[str] = None): + if name in self.mixes: + raise RuntimeError('duplicated mix: ' + name) + + resolved_flavors = map(self.get_flavor, flavors) + if wisdom_file: + resolved_wisdom = self.load_wisdom(wisdom_file) + else: + resolved_wisdom = dict( (self.get_function(name), values) for name,values in wisdom.items() ) + self.mixes[name] = BuildMix(name, description, resolved_flavors, resolved_wisdom) + + def sym(self, symbol: str) -> str: + return self.symbol_prefix + symbol + + def build_impls(self, source: SourceFile, lineno: int, function_name: str, impl_name: str, feature_name: Optional[str] = None) -> Sequence[FunctionImpl]: + if function_name not in self.functions: + self.warning(source, lineno, f"implementation defined for unknown function '{function_name}', skipped") + return [] + + function = self.functions[function_name] + + feature: Optional[Feature] = None + if feature_name is not None: + if feature_name not in self.features_by_macro: + self.warning(source, lineno, f"implementation {function_name} ({impl_name}) requires unknown feature '{feature_name}', skipped") + return [] + 
feature = self.features_by_macro.get(feature_name) + + result = [FunctionImpl(gen = self, + function = function, + name = impl_name, + source = source, + lineno = lineno, + feature = feature)] + + if function.aligned_pair: + result.append(FunctionImpl(gen = self, + function = function.aligned_pair, + name = impl_name, + source = source, + lineno = lineno, + feature = feature)) + + return result + + def add_impl(self, impl): + """Register impl under its (function, name) key; a second definition of the same key is reported and ignored.""" + key = (impl.function, impl.name) + old = self.function_impls.get(key) + if old: + # warning() takes (source, lineno, message); location[1] is assumed to be the line number, as the message itself uses it + self.warning(impl.source, impl.location[1], f'duplicate definition of {impl.function.name} / {impl.name}, previously defined at {old.location[0]}:{old.location[1]}') + return + self.function_impls[key] = impl + impl.function.impls.append(impl) + impl.source.impls.append(impl) + + def warning(self, source: Optional[SourceFile], lineno: Optional[int], message): + if source is not None: + if lineno is not None: + print(f'{source.path}:{lineno}: warning: {message}', file=sys.stderr) + else: + print(f'{source.path}: warning: {message}', file=sys.stderr) + else: + print(f'warning: {message}', file=sys.stderr) + + def scan_file(self, path: str): + source = SourceFile(path) + + match_impl = re.compile(r'''[^a-zA-Z0-9_]+ STARCH_IMPL \s* \( \s* # macro call + ([a-zA-Z0-9_]+) \s* , \s* # function name + ([a-zA-Z0-9_]+) \s* \) # implementation name + ''', re.VERBOSE) + match_impl_requires = re.compile(r'''[^a-zA-Z0-9_]+ STARCH_IMPL_REQUIRES \s* \( \s* # macro call + ([a-zA-Z0-9_]+) \s* , \s* # function name + ([a-zA-Z0-9_]+) \s* , \s* # implementation name + ([a-zA-Z0-9_]+) \s* \) # feature name + ''', re.VERBOSE) + + match_benchmark = re.compile(r'''[^a-zA-Z0-9_]+ STARCH_BENCHMARK \s* \( \s* # macro call + ([a-zA-Z0-9_]+) \s* \) # function name + ''', re.VERBOSE) + + match_verify = re.compile(r'''[^a-zA-Z0-9_]+ STARCH_BENCHMARK_VERIFY \s* \( \s* # macro call + ([a-zA-Z0-9_]+) \s* \) # function name + ''', re.VERBOSE) + + has_benchmark = has_impl = has_benchmark_verify = False + with open(path, 'r') as f: 
+ # count lines from 1, the usual convention for path:line diagnostics + for lineno, line in enumerate(f, 1): + if line[0] == '#': + continue # ignore preprocessor lines + + for match in match_impl.finditer(line): + for impl in self.build_impls(source, lineno, match.group(1), match.group(2)): + has_impl = True + self.add_impl(impl) + + for match in match_impl_requires.finditer(line): + for impl in self.build_impls(source, lineno, match.group(1), match.group(2), match.group(3)): + has_impl = True + self.add_impl(impl) + + for match in match_benchmark.finditer(line): + function_name = match.group(1) + if function_name in self.functions: + function = self.functions[function_name] + if function.benchmark: + # the function is known on this branch; the later definition replaces the earlier one + self.warning(source, lineno, f"duplicate benchmark defined for function {function_name}") + function.benchmark = source + if function.aligned_pair: + function.aligned_pair.benchmark = source + has_benchmark = True + else: + self.warning(source, lineno, f"benchmark defined for unknown function {function_name}, ignored") + + for match in match_verify.finditer(line): + function_name = match.group(1) + if function_name in self.functions: + function = self.functions[function_name] + if function.benchmark_verify: + self.warning(source, lineno, f"duplicate benchmark verifier defined for function {function_name}") + function.benchmark_verify = source + if function.aligned_pair: + function.aligned_pair.benchmark_verify = source + has_benchmark_verify = True + else: + self.warning(source, lineno, f"benchmark verifier defined for unknown function {function_name}, ignored") + + if has_impl: + self.impl_files.append(source) + if has_benchmark or has_benchmark_verify: + self.benchmark_files.append(source) + + def render(self, template_path, output_path, **kwargs): + t = self.templates.get_template(template_path) + result = t.render(gen=self, current_dir=os.path.dirname(output_path), **kwargs) + + if os.path.exists(output_path): + with open(output_path, 'r') as f: + contents = f.read() + if contents == result: + print(f'unchanged: 
{output_path}', file=sys.stderr) + return + + with open(output_path, 'w') as f: + f.write(result) + print(f' wrote: {output_path}', file=sys.stderr) + + def generate(self): + if not self.functions: + self.warning(None, None, 'no functions defined') + if not self.flavors: + self.warning(None, None, 'no flavors defined') + if not self.mixes: + self.warning(None, None, 'no mixes defined') + for function in self.functions.values(): + if not function.impls: + self.warning(None, None, f'no implementations of function {function.name} provided') + + self.render('/starch.h.template', self.generated_include_path) + + for name, flavor in self.flavors.items(): + self.render('/flavor.c.template', self.generated_flavor_path(flavor), flavor=flavor) + + self.render('/dispatcher.c.template', self.generated_dispatcher_path) + self.render('/benchmark.c.template', self.generated_benchmark_path) + + for name, mix in self.mixes.items(): + self.render('/makefile.template', self.generated_makefile_path(mix), mix=mix) + diff --git a/starch/stubs/mako/__init__.pyi b/starch/stubs/mako/__init__.pyi new file mode 100644 index 0000000..792da1d --- /dev/null +++ b/starch/stubs/mako/__init__.pyi @@ -0,0 +1,4 @@ +# -*- python -*- + +# typing stubs for mako + diff --git a/starch/stubs/mako/lookup.pyi b/starch/stubs/mako/lookup.pyi new file mode 100644 index 0000000..792b046 --- /dev/null +++ b/starch/stubs/mako/lookup.pyi @@ -0,0 +1,16 @@ +# -*- python -*- + +# typing stubs for mako + +from mako.template import Template +from typing import List, Optional + +class TemplateCollection(object): + def get_template(self, uri: str, relativeto: Optional[str] = None) -> Template: ... + +class TemplateLookup(TemplateCollection): + def __init__(self,directories: Optional[List[str]] = None, + module_directory: Optional[str] = None, + imports: Optional[List[str]] = None): ... 
+ + diff --git a/starch/stubs/mako/template.pyi b/starch/stubs/mako/template.pyi new file mode 100644 index 0000000..2076b1a --- /dev/null +++ b/starch/stubs/mako/template.pyi @@ -0,0 +1,6 @@ +# -*- python -*- + +# typing stubs for mako + +class Template(object): + def render(self, *args, **kwargs) -> str: ... diff --git a/starch/templates/benchmark.c.template b/starch/templates/benchmark.c.template new file mode 100644 index 0000000..c77bd59 --- /dev/null +++ b/starch/templates/benchmark.c.template @@ -0,0 +1,490 @@ +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +/* starch generated code. Do not edit. */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <inttypes.h> +#include <stdalign.h> +#include <string.h> +#include <errno.h> +#include <time.h> +#include <unistd.h> + +#include "${os.path.relpath(gen.generated_include_path, current_dir)}" + +typedef struct timespec starch_benchmark_time; + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end); +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size); +void starch_benchmark_aligned_free(void *user_ptr); +void starch_benchmark_get_time(starch_benchmark_time *t); + +const unsigned starch_benchmark_warmup_loops = 10; + +typedef struct { + const char *name; + const char *impl; + uint64_t ns; +} starch_benchmark_result; + +static starch_benchmark_result *starch_benchmark_results = NULL; +static unsigned starch_benchmark_result_size = 0; +static unsigned starch_benchmark_result_count = 0; + +typedef struct benchmark_flavor_list_node { + const char *flavor; + struct benchmark_flavor_list_node *next; +} starch_benchmark_flavor_list; + +static starch_benchmark_flavor_list *starch_benchmark_flavor_whitelist = NULL; +static starch_benchmark_flavor_list *starch_benchmark_flavor_blacklist = NULL; + +static bool starch_benchmark_list_only = false; +static bool starch_benchmark_validate_only = false; +static bool starch_benchmark_validation_failed = 
false; +static bool starch_benchmark_top_only = false; +static unsigned starch_benchmark_iterations = 1; + +typedef struct timespec starch_benchmark_time; +void starch_benchmark_get_time(starch_benchmark_time *t) +{ +#ifdef CLOCK_THREAD_CPUTIME_ID + clock_gettime(CLOCK_THREAD_CPUTIME_ID, t); +#else + clock_gettime(CLOCK_MONOTONIC, t); +#endif +} + +uint64_t starch_benchmark_elapsed(starch_benchmark_time *start, starch_benchmark_time *end) +{ + return ((uint64_t)end->tv_sec - (uint64_t)start->tv_sec) * 1000000000U + (uint64_t)end->tv_nsec - (uint64_t)start->tv_nsec; +} + +void *starch_benchmark_aligned_alloc(size_t alignment, size_t type_alignment, size_t size) +{ + size_t use_alignment = (type_alignment > alignment ? type_alignment : alignment); + if ( (use_alignment % type_alignment) || (use_alignment % alignment) ) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: conflicting alignment requirements (%zu versus %zu)\n", size, alignment, type_alignment); + return NULL; + } + + /* Over-allocate so we can stash our own pointer before the start, and so that we can adjust + * the returned alignment so it is only aligned to the requested boundary, and not also + * aligned to a larger power of two (we don't want to accidentally benchmark the performance + * of a more restrictive larger alignment) + */ + size_t header_size = (use_alignment < sizeof(void*) ? 
sizeof(void*) : use_alignment); + char *block_ptr = aligned_alloc(use_alignment, header_size + size + use_alignment); + if (!block_ptr) { + fprintf(stderr, "STARCH_BENCHMARK_ALLOC of %zu bytes failed: %s\n", size, strerror(errno)); + return NULL; + } + + char *user_ptr = block_ptr + header_size; + if ( (uintptr_t)user_ptr % (use_alignment * 2) == 0 ) { + // user_ptr is aligned to the next power of two, but we don't want that, move it on + user_ptr += use_alignment; + } + + void **stash = (void**)user_ptr - 1; + *stash = block_ptr; + + return user_ptr; +} + +void starch_benchmark_aligned_free(void *user_ptr) +{ + if (!user_ptr) + return; + void **stash = (void**)user_ptr - 1; + free(*stash); +} + +static bool starch_benchmark_flavor_in_list(const char *flavor, const starch_benchmark_flavor_list *list) +{ + for (; list; list = list->next) { + if (!strcmp(flavor, list->flavor)) + return true; + } + return false; +} + +<% functions_to_benchmark = [f for f in gen.functions.values() if f.benchmark] %> +% for function in functions_to_benchmark: +/* prototypes for benchmark helpers provided by user code */ +void ${function.benchmark_symbol} (void); +% if function.benchmark_verify: +bool ${function.benchmark_verify_symbol} ( ${function.declaration_arglist } ); +% endif + +/* prototype the benchmarking function so that we can build with -Wmissing-declarations */ +void ${gen.symbol_prefix}${function.name}_benchmark(void); + +static void starch_benchmark_one_${function.name}( ${function.regentry_type} * _entry, ${function.declaration_arglist } ) +{ + fprintf(stderr, " %-40s ", _entry->name); + + /* test for support */ + if (_entry->flavor_supported && !(_entry->flavor_supported())) { + fprintf(stderr, "unsupported\n"); + return; + } + + if (starch_benchmark_flavor_whitelist && !starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "skipped (not whitelisted)\n"); + return; + } + + if (starch_benchmark_flavor_blacklist && 
starch_benchmark_flavor_in_list(_entry->flavor, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "skipped (blacklisted)\n"); + return; + } + + if (starch_benchmark_list_only) { + fprintf(stderr, "supported\n"); + return; + } + + /* initial warmup */ + for (unsigned _loop = 0; _loop < starch_benchmark_warmup_loops; ++_loop) + _entry->callable ( ${function.named_arglist} ); + + % if function.benchmark_verify: + /* verify correctness of the output */ + if (! ${function.benchmark_verify_symbol} ( ${function.named_arglist} )) { + fprintf(stderr, "skipped (verification failed)\n"); + starch_benchmark_validation_failed = true; + return; + } + if (starch_benchmark_validate_only) { + fprintf(stderr, "validation ok\n"); + return; + } + % else: + if (starch_benchmark_validate_only) { + fprintf(stderr, "no validator defined\n"); + return; + } + % endif + + /* pre-benchmark, find a loop count that takes at least 100ms */ + starch_benchmark_time _start, _end; + uint64_t _elapsed = 0; + uint64_t _loops = 127; + while (_elapsed < 100000000) { + _loops *= 2; + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( ${function.named_arglist} ); + starch_benchmark_get_time(&_end); + _elapsed = starch_benchmark_elapsed(&_start, &_end); + } + + /* real benchmark, run for approx 1 second */ + _loops = _loops * 1000000000 / _elapsed; + + _elapsed = 0; + uint64_t _elapsed_min = UINT64_MAX; + uint64_t _elapsed_max = 0; + for (unsigned _iter = 0; _iter < starch_benchmark_iterations; ++_iter) { + starch_benchmark_get_time(&_start); + for (uint64_t _loop = 0; _loop < _loops; ++_loop) + _entry->callable ( ${function.named_arglist} ); + starch_benchmark_get_time(&_end); + uint64_t _elapsed_one = starch_benchmark_elapsed(&_start, &_end); + if (_elapsed_one < _elapsed_min) + _elapsed_min = _elapsed_one; + if (_elapsed_one > _elapsed_max) + _elapsed_max = _elapsed_one; + _elapsed += _elapsed_one; + } + + uint64_t _per_loop; + if 
(starch_benchmark_iterations > 2) + _per_loop = (_elapsed - _elapsed_min - _elapsed_max) / _loops / (starch_benchmark_iterations - 2); + else + _per_loop = _elapsed / _loops / starch_benchmark_iterations; + + fprintf(stderr, "%" PRIu64 " ns/call\n", _per_loop); + + if (starch_benchmark_result_count >= starch_benchmark_result_size) { + if (!starch_benchmark_result_size) + starch_benchmark_result_size = 64; + else + starch_benchmark_result_size *= 2; + starch_benchmark_results = realloc(starch_benchmark_results, starch_benchmark_result_size * sizeof(*starch_benchmark_results)); + if (!starch_benchmark_results) { + fprintf(stderr, "realloc: %s\n", strerror(errno)); + exit(1); + } + } + + starch_benchmark_results[starch_benchmark_result_count].name = "${function.name}"; + starch_benchmark_results[starch_benchmark_result_count].impl = _entry->name; + starch_benchmark_results[starch_benchmark_result_count].ns = _per_loop; + ++starch_benchmark_result_count; +} + +static void starch_benchmark_run_${function.name}( ${function.declaration_arglist } ) +{ + for (${function.regentry_type} *_entry = ${function.registry_symbol}; _entry->name; ++_entry) { + starch_benchmark_one_${function.name}( _entry, ${function.named_arglist} ); + } +} + +% endfor + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _benchmark_sym +#define STARCH_IMPL(_function,_impl) ${gen.symbol_prefix} ## _function ## _ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) ${gen.symbol_prefix} ## _function ## _benchmark +#define STARCH_BENCHMARK_VERIFY(_function) ${gen.symbol_prefix} ## _function ## _benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(1, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + +% for source in gen.benchmark_files: +#include "${os.path.relpath(source.path, current_dir)}" +% endfor + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES +#undef STARCH_BENCHMARK +#undef STARCH_BENCHMARK_VERIFY +#undef STARCH_BENCHMARK_RUN +#undef STARCH_BENCHMARK_ALLOC +#undef STARCH_BENCHMARK_FREE + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _aligned_benchmark_sym +#define STARCH_IMPL(_function,_impl) ${gen.symbol_prefix} ## _function ## _aligned_ ## _impl ## _dummy_benchmark +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) +#define STARCH_BENCHMARK(_function) ${gen.symbol_prefix} ## _function ## _aligned_benchmark +#define STARCH_BENCHMARK_VERIFY(_function) ${gen.symbol_prefix} ## _function ## _aligned_benchmark_verify +#define STARCH_BENCHMARK_RUN(_function, ...) 
starch_benchmark_run_ ## _function ## _aligned ( __VA_ARGS__ ) +#define STARCH_BENCHMARK_ALLOC(_count, _type) starch_benchmark_aligned_alloc(STARCH_MIX_ALIGNMENT, alignof(_type), (_count) * sizeof(_type)) +#define STARCH_BENCHMARK_FREE(_ptr) starch_benchmark_aligned_free(_ptr) + +% for source in gen.benchmark_files: + % if any( (function.aligned and function.benchmark == source) for function in gen.functions.values() ): +#include "${os.path.relpath(source.path, current_dir)}" + % endif +% endfor + +% for function in functions_to_benchmark: +static void starch_benchmark_all_${function.name}(void) +{ + fprintf(stderr, "==== ${function.name} ===\n"); + ${gen.symbol_prefix}${function.name}_benchmark (); +} +% endfor + +static int starch_benchmark_compare_result(const void *a, const void *b) +{ + const starch_benchmark_result *left = (const starch_benchmark_result *) a; + const starch_benchmark_result *right = (const starch_benchmark_result *) b; + + int name_cmp = strcmp(left->name, right->name); + if (name_cmp) + return name_cmp; + + if (left->ns < right->ns) + return -1; + if (left->ns > right->ns) + return 1; + return 0; +} + +static void starch_benchmark_usage(const char *argv0) +{ + fprintf(stderr, + "Usage: %s [OPTION]... [FUNCTION]...\n" + "Benchmarks starch functions and optionally writes a sorted wisdom file.\n" + "\n" + " -r FILE Read initial wisdom from FILE\n" + " -o FILE Write sorted wisdom to FILE\n" + " -F FLAVOR Add FLAVOR to whitelist\n" + " (default: no whitelist, run all runtime-supported flavors)\n" + " -N FLAVOR Add FLAVOR to blacklist\n" + " (default: no blacklist, run all runtime-supported flavors)\n" + " -l List compiled-in implementations but don't benchmark them\n" + " -V Run validation tests, but don't run benchmarks\n" + " -t Include only the top candidate per function in wisdom output\n" + " -i ITERS Run benchmark ITERS times and use the mean. 
If ITERS > 2, ignore\n" + " the smallest and largest runs when calculating the mean.\n" + " (default: 1 iteration)\n" + " FUNCTION Run benchmarks for these functions only\n" + " (default: benchmark all functions)\n" + "\n" + "Supported flavors: " +% for flavor in gen.flavors.values(): +#ifdef ${flavor.macro} + "${flavor.name} " +#endif +% endfor + "\n" + "Supported functions: " +% for function in gen.functions.values(): + "${function.name} " +% endfor + "\n", argv0); +} + +static void starch_benchmark_append_flavor(const char *flavor, starch_benchmark_flavor_list **list) +{ + starch_benchmark_flavor_list *newnode = malloc(sizeof(*newnode)); + if (!newnode) { + fprintf(stderr, "malloc: %s\n", strerror(errno)); + exit(1); + } + + newnode->flavor = flavor; + newnode->next = *list; + *list = newnode; +} + +int main(int argc, char **argv) +{ + int specific = 0; + const char *output_path = NULL; + + int opt; + while ((opt = getopt(argc, argv, "r:o:F:N:i:lhtV")) != -1) { + switch (opt) { + case 'r': + if (${gen.sym("read_wisdom")}(optarg) < 0) { + fprintf(stderr, "%s: cannot read %s: %s\n", argv[0], optarg, strerror(errno)); + return 1; + } + fprintf(stderr, "%s: loaded wisdom file %s\n", argv[0], optarg); + break; + + case 'o': + output_path = optarg; + break; + + case 'F': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_blacklist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_whitelist); + break; + + case 'N': + if (starch_benchmark_flavor_in_list(optarg, starch_benchmark_flavor_whitelist)) { + fprintf(stderr, "%s: conflicting -F and -N options for flavor %s\n", argv[0], optarg); + return 2; + } + starch_benchmark_append_flavor(optarg, &starch_benchmark_flavor_blacklist); + break; + + case 'l': + starch_benchmark_list_only = true; + break; + + case 't': + starch_benchmark_top_only = true; + break; + + case 'i': + 
{ + /* atoi() yields 0 for non-numeric input and may be negative; either would make the per-call average divide by zero or wrap the unsigned counter */ + int iterations = atoi(optarg); + if (iterations < 1) { + fprintf(stderr, "%s: -i requires an iteration count of at least 1, got '%s'\n", argv[0], optarg); + return 2; + } + starch_benchmark_iterations = (unsigned) iterations; + } + break; + + case 'V': + starch_benchmark_validate_only = true; + break; + + case 'h': + starch_benchmark_usage(argv[0]); + return 0; + + case '?': + default: + starch_benchmark_usage(argv[0]); + return 2; + } + } + + if (starch_benchmark_list_only && output_path) { + fprintf(stderr, "%s: -o and -l options cannot be specified together\n", argv[0]); + return 2; + } + + for (int i = optind; i < argc; ++i) { +% for function in gen.functions.values(): + if (!strcmp(argv[i], "${function.name}")) { + specific = 1; +% if function.benchmark: + starch_benchmark_all_${function.name}(); +% else: + fprintf(stderr, "=== ${function.name} ===\n"); + fprintf(stderr, " (no benchmark support defined)\n"); +% endif + continue; + } +% endfor + + fprintf(stderr, "%s: unrecognized function name: %s\n", argv[0], argv[i]); + return 2; + } + + if (!specific) { +% for function in gen.functions.values(): + % if function.benchmark: + starch_benchmark_all_${function.name}(); + % else: + fprintf(stderr, "=== ${function.name} ===\n"); + fprintf(stderr, " (no benchmark support defined)\n"); + % endif +% endfor + } + + if (output_path) { + FILE *out = fopen(output_path, "w"); + if (!out) { + fprintf(stderr, "%s: cannot open %s: %s\n", argv[0], output_path, strerror(errno)); + return 1; + } + + fprintf(out, "# generated by "); + for (int i = 0; i < argc; ++i) + fprintf(out, "%s ", argv[i]); + fprintf(out, "\n\n"); + + qsort(starch_benchmark_results, starch_benchmark_result_count, sizeof(*starch_benchmark_results), starch_benchmark_compare_result); + + const char *last_name = NULL; + bool first = true; + for (unsigned i = 0; i < starch_benchmark_result_count; ++i) { + starch_benchmark_result *result = &starch_benchmark_results[i]; + if (last_name && strcmp(last_name, result->name) != 0) { + fprintf(out, "\n"); + first = true; + } + last_name = result->name; + if (starch_benchmark_top_only && !first) + continue; + fprintf(out, "%-40s %-40s # %" PRIu64 " 
ns/call\n", result->name, result->impl, result->ns); + first = false; + } + + /* buffered output is only flushed here, so a failed close means the wisdom file is incomplete */ + if (fclose(out) != 0) { + fprintf(stderr, "%s: error writing %s: %s\n", argv[0], output_path, strerror(errno)); + return 1; + } + fprintf(stderr, "%s: wrote sorted wisdom to %s\n", argv[0], output_path); + } + + return starch_benchmark_validation_failed ? 1 : 0; +} diff --git a/starch/templates/dispatcher.c.template b/starch/templates/dispatcher.c.template new file mode 100644 index 0000000..cb3dbc3 --- /dev/null +++ b/starch/templates/dispatcher.c.template @@ -0,0 +1,206 @@ +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +/* starch generated code. Do not edit. */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <ctype.h> + +#include "${os.path.relpath(gen.generated_include_path, current_dir)}" + +/* helper for re-sorting registries */ +struct starch_regentry_prefix { + int rank; +}; + +static int starch_regentry_rank_compare (const void *l, const void *r) +{ + const struct starch_regentry_prefix *left = l, *right = r; + return left->rank - right->rank; +} + +% for function in gen.functions.values(): +/* dispatcher / registry for ${function.name} */ + +${function.regentry_type} * ${function.select_symbol}() { + for (${function.regentry_type} *entry = ${function.registry_symbol}; + entry->name; + ++entry) + { + if (entry->flavor_supported && !(entry->flavor_supported())) + continue; + return entry; + } + return NULL; +} + +static ${function.returntype} ${function.dispatcher_symbol} ( ${function.declaration_arglist} ) { + ${function.regentry_type} *entry = ${function.select_symbol}(); + if (!entry) + abort(); + + ${function.callable_symbol} = entry->callable; +% if function.returntype == 'void': + ${function.callable_symbol} ( ${function.named_arglist} ); +% else: + return ${function.callable_symbol} ( ${function.named_arglist} ); +% endif +} + +${function.pointer_type} ${function.callable_symbol} = ${function.dispatcher_symbol}; + +void ${function.set_wisdom_symbol} (const char * const * received_wisdom) +{ + /* re-rank the registry based on received 
wisdom */ + ${function.regentry_type} *entry; + for (entry = ${function.registry_symbol}; entry->name; ++entry) { + const char * const *search; + for (search = received_wisdom; *search; ++search) { + if (!strcmp(*search, entry->name)) { + break; + } + } + if (*search) { + /* matches an entry in the wisdom list, order by position in the list */ + entry->rank = search - received_wisdom; + } else { + /* no match, rank after all possible matches, retaining existing order */ + entry->rank = (search - received_wisdom) + (entry - ${function.registry_symbol}); + } + } + + /* re-sort based on the new ranking */ + qsort(${function.registry_symbol}, entry - ${function.registry_symbol}, sizeof(${function.regentry_type}), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + ${function.callable_symbol} = ${function.dispatcher_symbol}; +} + +${function.regentry_type} ${function.registry_symbol}[] = { +% for mix in gen.mixes.values(): + <% + # gather all implementations for this mix, sort by wisdom + def rank_key(value, wisdom=mix.function_wisdom(function)): + impl, flavor = value + try: + return wisdom.index(impl.wisdom_name(flavor)) + except ValueError: + return len(wisdom) + + mix_impls = [] + for flavor in mix.flavors: + if function.aligned and function.aligned_pair: + if flavor.alignment > 1: + # add aligned impls + for impl in function.impls: + if impl.feature is None or impl.feature in flavor.features: + mix_impls.append( (impl, flavor) ) + # add unaligned impls + for impl in function.aligned_pair.impls: + if impl.feature is None or impl.feature in flavor.features: + mix_impls.append( (impl, flavor) ) + else: + # no alignment specialization + for impl in function.impls: + if impl.feature is None or impl.feature in flavor.features: + mix_impls.append( (impl, flavor) ) + + mix_impls.sort(key=rank_key) + %> +#ifdef ${mix.macro} + % for rank, (impl, flavor) in enumerate(mix_impls): + { ${rank}, 
"${impl.wisdom_name(flavor)}", "${flavor.name}", ${impl.impl_symbol(flavor)}, ${flavor.test_function_expr} }, + % endfor +#endif /* ${mix.macro} */ +% endfor + { 0, NULL, NULL, NULL, NULL } +}; + +% endfor + +int ${gen.sym("read_wisdom")} (const char * path) +{ + FILE *fp = fopen(path, "r"); + if (!fp) + return -1; + + /* reset all ranks to identify entries not listed in the wisdom file; we'll assign ranks at the end to produce a stable sort */ +% for function in gen.functions.values(): + int rank_${function.name} = 0; + for (${function.regentry_type} *entry = ${function.registry_symbol}; entry->name; ++entry) { + entry->rank = 0; + } +% endfor + + char linebuf[512]; + while (fgets(linebuf, sizeof(linebuf), fp)) { + /* split name and impl on whitespace, handle comments etc */ + /* isspace() requires an unsigned char value; plain char may be negative for non-ASCII bytes, hence the casts */ + char *name = linebuf; + while (*name && isspace((unsigned char) *name)) + ++name; + + if (!*name || *name == '#') + continue; + + char *end = name; + while (*end && !isspace((unsigned char) *end)) + ++end; + + if (!*end) + continue; + *end = 0; + + char *impl = end + 1; + while (*impl && isspace((unsigned char) *impl)) + ++impl; + + if (!*impl) + continue; + + end = impl; + while (*end && !isspace((unsigned char) *end)) + ++end; + + *end = 0; + + /* try to find a matching registry entry */ +% for function in gen.functions.values(): + if (!strcmp(name, "${function.name}")) { + for (${function.regentry_type} *entry = ${function.registry_symbol}; entry->name; ++entry) { + if (!strcmp(impl, entry->name)) { + entry->rank = ++rank_${function.name}; + break; + } + } + continue; + } +% endfor + } + + if (ferror(fp)) { + fclose(fp); + return -1; + } + + fclose(fp); + + /* assign ranks to unmatched items to (stable) sort them last; re-sort everything */ +% for function in gen.functions.values(): + { + ${function.regentry_type} *entry; + for (entry = ${function.registry_symbol}; entry->name; ++entry) { + if (!entry->rank) + entry->rank = ++rank_${function.name}; + } + qsort(${function.registry_symbol}, entry - ${function.registry_symbol}, 
sizeof(${function.regentry_type}), starch_regentry_rank_compare); + + /* reset the implementation pointer so the next call will re-select */ + ${function.callable_symbol} = ${function.dispatcher_symbol}; + } +% endfor + + return 0; +} diff --git a/starch/templates/flavor.c.template b/starch/templates/flavor.c.template new file mode 100644 index 0000000..227ffa9 --- /dev/null +++ b/starch/templates/flavor.c.template @@ -0,0 +1,48 @@ +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +/* starch generated code. Do not edit. */ + +#define ${flavor.macro} +% for feature in flavor.features: +#define ${feature.macro} +% endfor + +#include "${os.path.relpath(gen.generated_include_path, current_dir)}" + +#undef STARCH_ALIGNMENT + +#define STARCH_ALIGNMENT 1 +#define STARCH_ALIGNED(_ptr) (_ptr) +#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _ ## ${flavor.name} +#define STARCH_IMPL(_function,_impl) ${gen.symbol_prefix} ## _function ## _ ## _impl ## _ ## ${flavor.name} +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +% for source in gen.impl_files: + % if any( ((impl.feature is None or impl.feature in flavor.features) and not impl.function.aligned) for impl in source.impls): +#include "${os.path.relpath(source.path, current_dir)}" + % endif +% endfor + +% if flavor.alignment > 1: + +#undef STARCH_ALIGNMENT +#undef STARCH_ALIGNED +#undef STARCH_SYMBOL +#undef STARCH_IMPL +#undef STARCH_IMPL_REQUIRES + +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNED(_ptr) (__builtin_assume_aligned((_ptr), STARCH_MIX_ALIGNMENT)) +#define STARCH_SYMBOL(_name) ${gen.symbol_prefix} ## _name ## _aligned_ ## ${flavor.name} +#define STARCH_IMPL(_function,_impl) ${gen.symbol_prefix} ## _function ## _aligned_ ## _impl ## _ ## ${flavor.name} +#define STARCH_IMPL_REQUIRES(_function,_impl,_feature) STARCH_IMPL(_function,_impl) + +% for source in gen.impl_files: + % 
if any( ((impl.feature is None or impl.feature in flavor.features) and impl.function.aligned) for impl in source.impls): +#include "${os.path.relpath(source.path, current_dir)}" + % endif +% endfor + +% endif diff --git a/starch/templates/makefile.template b/starch/templates/makefile.template new file mode 100644 index 0000000..3fae92c --- /dev/null +++ b/starch/templates/makefile.template @@ -0,0 +1,57 @@ +# -*- makefile -*- + +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +# starch generated makefile fragment. do not edit. +# +# This makefile is designed to be included in a surrounding makefile. The including makefile +# should set $(STARCH_COMPILE) to a (partial) command line that provides suitable cflags etc +# and handles the following appended things: +# * a C source filename to compile to the corresponding .o file +# * a -o option to specify the output object file +# * additional command-line arguments to set compile flags as defined in each flavor +# +# Including the makefile fragment provides these variables/rules: +# +# $(STARCH_CFLAGS): additional cflags that may be used when compiling other code that uses starch.h +# (not required - if omitted, the only change is that flavor-specific prototypes are unavailable) +# $(STARCH_OBJS): a list of object files to link to the main binary +# $(STARCH_BENCHMARK_OBJ): object files providing a standalone benchmarking app (link all of $(STARCH_OBJS) too) +# explicit build rules for each object file listed in $(STARCH_OBJS) + +STARCH_CFLAGS := -D${mix.macro} +<% + import os + o_files = [] + impl_c_files = ' '.join( map(lambda x, gen=gen: os.path.relpath(x.path, gen.runtime_dir), gen.impl_files) ) + benchmark_c_files = ' '.join( map(lambda x, gen=gen: os.path.relpath(x.path, gen.runtime_dir), gen.benchmark_files) ) +%> +% for flavor in mix.flavors: +<% + c_file = os.path.relpath(gen.generated_flavor_path(flavor), gen.runtime_dir) + o_file = 
os.path.splitext(c_file)[0] + '.o' + o_files.append(o_file) +%> +${o_file}: ${c_file} ${impl_c_files} + $(STARCH_COMPILE) $(STARCH_CFLAGS) ${flavor.cflags} ${c_file} -o ${o_file} +% endfor +<% + c_file = os.path.relpath(gen.generated_dispatcher_path, gen.runtime_dir) + o_file = os.path.splitext(c_file)[0] + '.o' + o_files.append(o_file) + %> +${o_file}: ${c_file} ${impl_c_files} + $(STARCH_COMPILE) $(STARCH_CFLAGS) ${c_file} -o ${o_file} + +STARCH_OBJS := ${' '.join(o_files)} + +<% + c_file = os.path.relpath(gen.generated_benchmark_path, gen.runtime_dir) + o_file = os.path.splitext(c_file)[0] + '.o' + %> +${o_file}: ${c_file} ${benchmark_c_files} + $(STARCH_COMPILE) $(STARCH_CFLAGS) ${c_file} -o ${o_file} + +STARCH_BENCHMARK_OBJ := ${o_file} diff --git a/starch/templates/starch.h.template b/starch/templates/starch.h.template new file mode 100644 index 0000000..ae77b59 --- /dev/null +++ b/starch/templates/starch.h.template @@ -0,0 +1,68 @@ +### Copyright (c) 2020, FlightAware LLC. +### All rights reserved. +### See the LICENSE file for licensing terms. + +/* starch generated code. Do not edit. 
*/ + +% for include in gen.includes: +#include ${include} +% endfor + +/* mixes */ + +% for mix in gen.mixes.values(): +/* ${mix.description} */ +#ifdef ${mix.macro} + % for flavor in mix.flavors: +#define ${flavor.macro} + % endfor +#define STARCH_MIX_ALIGNMENT ${max((flavor.alignment) for flavor in mix.flavors)} +#endif /* ${mix.macro} */ + +% endfor + +#ifdef STARCH_MIX_ALIGNMENT +#define STARCH_ALIGNMENT STARCH_MIX_ALIGNMENT +#define STARCH_IS_ALIGNED(_ptr) (((uintptr_t)(_ptr) & (STARCH_MIX_ALIGNMENT-1)) == 0) +#else +/* mix not defined, alignment is unknown, treat everything as unaligned */ +#define STARCH_IS_ALIGNED(_ptr) (0) +#endif + + +/* entry points and registries */ + +% for function in gen.functions.values(): +typedef ${function.returntype} (* ${function.pointer_type}) ( ${function.declaration_arglist} ); +extern ${function.pointer_type} ${function.callable_symbol}; + +typedef struct { + int rank; + const char *name; + const char *flavor; + ${function.pointer_type} callable; + int (*flavor_supported)(); +} ${function.regentry_type}; + +extern ${function.regentry_type} ${function.registry_symbol}[]; +${function.regentry_type} * ${function.select_symbol}(); +void ${function.set_wisdom_symbol}( const char * const * received_wisdom ); + +% endfor +/* flavors and prototypes */ + +% for flavor in gen.flavors.values(): +#ifdef ${flavor.macro} + % if flavor.test_function is not None: +int ${flavor.test_function} (void); + % endif + % for impl in gen.function_impls.values(): + % if (flavor.alignment > 1 or not impl.function.aligned) and (impl.feature is None or impl.feature in flavor.features): +${impl.function.returntype} ${impl.impl_symbol(flavor)} ( ${impl.function.declaration_arglist} ); + % endif + % endfor +#endif /* ${flavor.macro} */ + +int ${gen.sym("read_wisdom")} (const char * path); + +% endfor diff --git a/wisdom.arm b/wisdom.arm new file mode 100644 index 0000000..96dae84 --- /dev/null +++ b/wisdom.arm @@ -0,0 +1,31 @@ +# derived from wisdom.pi4b 
and wisdom.pi0w + +magnitude_power_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 225511 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 5464685 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 212204 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 5516196 ns/call + +magnitude_sc16 neon_vrsqrte_armv7a_neon_vfpv4 # 684978 ns/call +magnitude_sc16 exact_float_generic # 28623479 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 639779 ns/call +magnitude_sc16_aligned exact_float_generic # 28613950 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv7a_neon_vfpv4 # 166113 ns/call +magnitude_sc16q11 exact_float_generic # 7131190 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 155221 ns/call +magnitude_sc16q11_aligned exact_float_generic # 7124159 ns/call + +magnitude_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 188746 ns/call +magnitude_uc8 lookup_unroll_4_generic # 4179036 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 187209 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 4445877 ns/call + +mean_power_u16 u32_armv7a_neon_vfpv4 # 45484 ns/call +mean_power_u16 u64_generic # 990367 ns/call + +mean_power_u16_aligned u32_armv7a_neon_vfpv4_aligned # 44929 ns/call +mean_power_u16_aligned u64_generic # 934445 ns/call diff --git a/wisdom.generic b/wisdom.generic new file mode 100644 index 0000000..2c924d3 --- /dev/null +++ b/wisdom.generic @@ -0,0 +1,16 @@ +# some fairly arbitrary defaults for when we don't know the target architecture + +magnitude_power_uc8 twopass_generic +magnitude_power_uc8_aligned twopass_generic + +magnitude_sc16 exact_float_generic +magnitude_sc16_aligned exact_float_generic + +magnitude_sc16q11 exact_float_generic +magnitude_sc16q11_aligned exact_float_generic + +magnitude_uc8 lookup_unroll_4_generic +magnitude_uc8_aligned lookup_unroll_4_generic + +mean_power_u16 u32_generic +mean_power_u16_aligned u32_generic diff --git a/wisdom.x86 
b/wisdom.x86 new file mode 100644 index 0000000..28a7719 --- /dev/null +++ b/wisdom.x86 @@ -0,0 +1,31 @@ +# derived from wisdom.i7-6500u / wisdom.i7-6500u.generic + +magnitude_power_uc8 twopass_x86_avx2 # 65331 ns/call +magnitude_power_uc8 twopass_generic # 72679 ns/call + +magnitude_power_uc8_aligned twopass_x86_avx2_aligned # 66294 ns/call +magnitude_power_uc8_aligned twopass_generic # 68415 ns/call + +magnitude_sc16 exact_float_x86_avx2 # 238602 ns/call +magnitude_sc16 exact_float_generic # 1359997 ns/call + +magnitude_sc16_aligned exact_float_x86_avx2_aligned # 202484 ns/call +magnitude_sc16_aligned exact_float_generic # 1351564 ns/call + +magnitude_sc16q11 exact_float_x86_avx2 # 65311 ns/call +magnitude_sc16q11 exact_float_generic # 513012 ns/call + +magnitude_sc16q11_aligned exact_float_x86_avx2_aligned # 56217 ns/call +magnitude_sc16q11_aligned exact_float_generic # 510226 ns/call + +magnitude_uc8 lookup_unroll_4_x86_avx2 # 53581 ns/call +magnitude_uc8 lookup_unroll_4_generic # 52709 ns/call + +magnitude_uc8_aligned lookup_unroll_4_x86_avx2 # 53870 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 54033 ns/call + +mean_power_u16 u32_x86_avx2 # 11627 ns/call +mean_power_u16 u32_generic # 18252 ns/call + +mean_power_u16_aligned u32_x86_avx2_aligned # 11572 ns/call +mean_power_u16_aligned u32_generic # 18207 ns/call diff --git a/wisdom/wisdom.i7-6500u b/wisdom/wisdom.i7-6500u new file mode 100644 index 0000000..78b93bf --- /dev/null +++ b/wisdom/wisdom.i7-6500u @@ -0,0 +1,90 @@ +# model name : Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz +# +# "performance" cpufreq governor @ 2.50 GHz + +# generated by ./starch-benchmark -i 15 -r wisdom.local -o wisdom.local + +magnitude_power_uc8 twopass_x86_avx2 # 65331 ns/call +magnitude_power_uc8 twopass_generic # 65363 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 67147 ns/call +magnitude_power_uc8 lookup_unroll_4_x86_avx2 # 67202 ns/call +magnitude_power_uc8 lookup_generic # 74612 ns/call +magnitude_power_uc8 
lookup_x86_avx2 # 74801 ns/call + +magnitude_power_uc8_aligned twopass_generic # 66243 ns/call +magnitude_power_uc8_aligned twopass_x86_avx2 # 66258 ns/call +magnitude_power_uc8_aligned twopass_x86_avx2_aligned # 66294 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_x86_avx2_aligned # 67621 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_x86_avx2 # 67657 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 67684 ns/call +magnitude_power_uc8_aligned lookup_generic # 75036 ns/call +magnitude_power_uc8_aligned lookup_x86_avx2_aligned # 75191 ns/call +magnitude_power_uc8_aligned lookup_x86_avx2 # 75335 ns/call + +magnitude_sc16 exact_float_x86_avx2 # 256796 ns/call +magnitude_sc16 exact_u32_x86_avx2 # 300270 ns/call +magnitude_sc16 exact_float_generic # 1357410 ns/call +magnitude_sc16 exact_u32_generic # 2039745 ns/call + +magnitude_sc16_aligned exact_float_x86_avx2_aligned # 225583 ns/call +magnitude_sc16_aligned exact_float_x86_avx2 # 245087 ns/call +magnitude_sc16_aligned exact_u32_x86_avx2_aligned # 265908 ns/call +magnitude_sc16_aligned exact_u32_x86_avx2 # 289047 ns/call +magnitude_sc16_aligned exact_float_generic # 1345505 ns/call +magnitude_sc16_aligned exact_u32_generic # 2037905 ns/call + +magnitude_sc16q11 exact_float_x86_avx2 # 63530 ns/call +magnitude_sc16q11 exact_u32_x86_avx2 # 74567 ns/call +magnitude_sc16q11 exact_float_generic # 524297 ns/call +magnitude_sc16q11 12bit_table_x86_avx2 # 549772 ns/call +magnitude_sc16q11 12bit_table_generic # 551318 ns/call +magnitude_sc16q11 11bit_table_generic # 612628 ns/call +magnitude_sc16q11 11bit_table_x86_avx2 # 612833 ns/call +magnitude_sc16q11 exact_u32_generic # 652008 ns/call + +magnitude_sc16q11_aligned exact_float_x86_avx2_aligned # 56413 ns/call +magnitude_sc16q11_aligned exact_float_x86_avx2 # 61285 ns/call +magnitude_sc16q11_aligned exact_u32_x86_avx2_aligned # 66331 ns/call +magnitude_sc16q11_aligned exact_u32_x86_avx2 # 72272 ns/call +magnitude_sc16q11_aligned exact_float_generic # 
521575 ns/call +magnitude_sc16q11_aligned 12bit_table_x86_avx2 # 549193 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 549588 ns/call +magnitude_sc16q11_aligned 12bit_table_x86_avx2_aligned # 570064 ns/call +magnitude_sc16q11_aligned 11bit_table_x86_avx2 # 616504 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 616961 ns/call +magnitude_sc16q11_aligned 11bit_table_x86_avx2_aligned # 618931 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 650346 ns/call + +magnitude_uc8 lookup_unroll_4_x86_avx2 # 53027 ns/call +magnitude_uc8 lookup_unroll_4_generic # 53081 ns/call +magnitude_uc8 lookup_x86_avx2 # 53482 ns/call +magnitude_uc8 lookup_generic # 53489 ns/call +magnitude_uc8 exact_x86_avx2 # 91623 ns/call +magnitude_uc8 exact_generic # 801481 ns/call + +magnitude_uc8_aligned lookup_unroll_4_x86_avx2 # 53313 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 53329 ns/call +magnitude_uc8_aligned lookup_unroll_4_x86_avx2_aligned # 53358 ns/call +magnitude_uc8_aligned lookup_x86_avx2 # 53692 ns/call +magnitude_uc8_aligned lookup_x86_avx2_aligned # 53790 ns/call +magnitude_uc8_aligned lookup_generic # 55871 ns/call +magnitude_uc8_aligned exact_x86_avx2_aligned # 86939 ns/call +magnitude_uc8_aligned exact_x86_avx2 # 89688 ns/call +magnitude_uc8_aligned exact_generic # 802054 ns/call + +mean_power_u16 u32_x86_avx2 # 11601 ns/call +mean_power_u16 u32_generic # 18249 ns/call +mean_power_u16 float_x86_avx2 # 18556 ns/call +mean_power_u16 u64_x86_avx2 # 31297 ns/call +mean_power_u16 u64_generic # 39618 ns/call +mean_power_u16 float_generic # 105649 ns/call + +mean_power_u16_aligned u32_x86_avx2 # 11606 ns/call +mean_power_u16_aligned u32_x86_avx2_aligned # 11609 ns/call +mean_power_u16_aligned float_x86_avx2 # 18231 ns/call +mean_power_u16_aligned float_x86_avx2_aligned # 18253 ns/call +mean_power_u16_aligned u32_generic # 18254 ns/call +mean_power_u16_aligned u64_x86_avx2_aligned # 31282 ns/call +mean_power_u16_aligned u64_x86_avx2 # 31283 ns/call 
+mean_power_u16_aligned u64_generic # 39639 ns/call +mean_power_u16_aligned float_generic # 105615 ns/call diff --git a/wisdom/wisdom.i7-6500u.generic b/wisdom/wisdom.i7-6500u.generic new file mode 100644 index 0000000..96fbb6b --- /dev/null +++ b/wisdom/wisdom.i7-6500u.generic @@ -0,0 +1,43 @@ +# model name : Intel(R) Core(TM) i7-6500U CPU @ 2.50GHz +# +# "performance" cpufreq governor @ 2.50 GHz + +magnitude_power_uc8 twopass_generic # 72679 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 84247 ns/call +magnitude_power_uc8 lookup_generic # 87929 ns/call + +magnitude_power_uc8_aligned twopass_generic # 68415 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 71632 ns/call +magnitude_power_uc8_aligned lookup_generic # 79056 ns/call + +magnitude_sc16 exact_float_generic # 1350012 ns/call +magnitude_sc16 exact_u32_generic # 2036183 ns/call + +magnitude_sc16_aligned exact_float_generic # 1340202 ns/call +magnitude_sc16_aligned exact_u32_generic # 2035257 ns/call + +magnitude_sc16q11 exact_float_generic # 523422 ns/call +magnitude_sc16q11 12bit_table_generic # 539142 ns/call +magnitude_sc16q11 11bit_table_generic # 613256 ns/call +magnitude_sc16q11 exact_u32_generic # 651178 ns/call + +magnitude_sc16q11_aligned exact_float_generic # 520001 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 539652 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 616597 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 649809 ns/call + +magnitude_uc8 lookup_unroll_4_generic # 56626 ns/call +magnitude_uc8 lookup_generic # 57064 ns/call +magnitude_uc8 exact_generic # 809893 ns/call + +magnitude_uc8_aligned lookup_unroll_4_generic # 58632 ns/call +magnitude_uc8_aligned lookup_generic # 62214 ns/call +magnitude_uc8_aligned exact_generic # 808622 ns/call + +mean_power_u16 u32_generic # 18135 ns/call +mean_power_u16 u64_generic # 39496 ns/call +mean_power_u16 float_generic # 105266 ns/call + +mean_power_u16_aligned u32_generic # 18155 ns/call 
+mean_power_u16_aligned u64_generic # 39493 ns/call +mean_power_u16_aligned float_generic # 105261 ns/call diff --git a/wisdom/wisdom.pi0w b/wisdom/wisdom.pi0w new file mode 100644 index 0000000..0f01ff3 --- /dev/null +++ b/wisdom/wisdom.pi0w @@ -0,0 +1,47 @@ +# Hardware : BCM2835 +# Revision : 9000c1 +# Model : Raspberry Pi Zero W Rev 1.1 +# +# "performance" cpufreq governor @ 1GHz + +# generated by ./starch-benchmark -i 15 -r wisdom.local -o wisdom.local + +magnitude_power_uc8 lookup_unroll_4_generic # 5711147 ns/call +magnitude_power_uc8 twopass_generic # 6205338 ns/call +magnitude_power_uc8 lookup_generic # 6880126 ns/call + +magnitude_power_uc8_aligned lookup_unroll_4_generic # 5750495 ns/call +magnitude_power_uc8_aligned twopass_generic # 6209062 ns/call +magnitude_power_uc8_aligned lookup_generic # 6941100 ns/call + +magnitude_sc16 exact_float_generic # 28623479 ns/call +magnitude_sc16 exact_u32_generic # 28660776 ns/call + +magnitude_sc16_aligned exact_float_generic # 28613950 ns/call +magnitude_sc16_aligned exact_u32_generic # 28671952 ns/call + +magnitude_sc16q11 exact_float_generic # 7142819 ns/call +magnitude_sc16q11 exact_u32_generic # 7146487 ns/call +magnitude_sc16q11 11bit_table_generic # 17820638 ns/call +magnitude_sc16q11 12bit_table_generic # 19280398 ns/call + +magnitude_sc16q11_aligned exact_float_generic # 7130689 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 7152986 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 17872904 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 19332280 ns/call + +magnitude_uc8 lookup_unroll_4_generic # 4420819 ns/call +magnitude_uc8 lookup_generic # 6346327 ns/call +magnitude_uc8 exact_generic # 9264172 ns/call + +magnitude_uc8_aligned lookup_unroll_4_generic # 4699258 ns/call +magnitude_uc8_aligned lookup_generic # 6600661 ns/call +magnitude_uc8_aligned exact_generic # 9308206 ns/call + +mean_power_u16 u64_generic # 976416 ns/call +mean_power_u16 u32_generic # 1040812 ns/call 
+mean_power_u16 float_generic # 1794994 ns/call + +mean_power_u16_aligned u64_generic # 961388 ns/call +mean_power_u16_aligned u32_generic # 1024339 ns/call +mean_power_u16_aligned float_generic # 1778085 ns/call diff --git a/wisdom/wisdom.pi4b b/wisdom/wisdom.pi4b new file mode 100644 index 0000000..09a053c --- /dev/null +++ b/wisdom/wisdom.pi4b @@ -0,0 +1,107 @@ +# Hardware : BCM2711 +# Revision : a03111 +# Model : Raspberry Pi 4 Model B Rev 1.1 +# +# "performance" cpufreq governor @ 1.5GHz + +# generated by ./starch-benchmark -i 15 -r wisdom.local -o wisdom.local + +magnitude_power_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 225494 ns/call +magnitude_power_uc8 twopass_generic # 232985 ns/call +magnitude_power_uc8 twopass_armv7a_neon_vfpv4 # 233043 ns/call +magnitude_power_uc8 lookup_generic # 312890 ns/call +magnitude_power_uc8 lookup_armv7a_neon_vfpv4 # 313395 ns/call +magnitude_power_uc8 lookup_unroll_4_armv7a_neon_vfpv4 # 351108 ns/call +magnitude_power_uc8 lookup_unroll_4_generic # 392295 ns/call + +magnitude_power_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 212203 ns/call +magnitude_power_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 212204 ns/call +magnitude_power_uc8_aligned twopass_armv7a_neon_vfpv4_aligned # 232057 ns/call +magnitude_power_uc8_aligned twopass_armv7a_neon_vfpv4 # 232072 ns/call +magnitude_power_uc8_aligned twopass_generic # 232141 ns/call +magnitude_power_uc8_aligned lookup_generic # 304510 ns/call +magnitude_power_uc8_aligned lookup_armv7a_neon_vfpv4_aligned # 304855 ns/call +magnitude_power_uc8_aligned lookup_armv7a_neon_vfpv4 # 304863 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4 # 332848 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4_aligned # 333134 ns/call +magnitude_power_uc8_aligned lookup_unroll_4_generic # 377063 ns/call + +magnitude_sc16 neon_vrsqrte_armv7a_neon_vfpv4 # 685671 ns/call +magnitude_sc16 exact_u32_armv7a_neon_vfpv4 # 2471841 ns/call +magnitude_sc16 
exact_float_armv7a_neon_vfpv4 # 2488725 ns/call +magnitude_sc16 exact_u32_generic # 3475780 ns/call +magnitude_sc16 exact_float_generic # 3627016 ns/call + +magnitude_sc16_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 645434 ns/call +magnitude_sc16_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 646233 ns/call +magnitude_sc16_aligned exact_u32_armv7a_neon_vfpv4 # 2464487 ns/call +magnitude_sc16_aligned exact_u32_armv7a_neon_vfpv4_aligned # 2464639 ns/call +magnitude_sc16_aligned exact_float_armv7a_neon_vfpv4_aligned # 2489450 ns/call +magnitude_sc16_aligned exact_float_armv7a_neon_vfpv4 # 2495798 ns/call +magnitude_sc16_aligned exact_u32_generic # 3473976 ns/call +magnitude_sc16_aligned exact_float_generic # 3629034 ns/call + +magnitude_sc16q11 neon_vrsqrte_armv7a_neon_vfpv4 # 166102 ns/call +magnitude_sc16q11 exact_u32_armv7a_neon_vfpv4 # 615312 ns/call +magnitude_sc16q11 exact_float_armv7a_neon_vfpv4 # 822023 ns/call +magnitude_sc16q11 exact_u32_generic # 1151805 ns/call +magnitude_sc16q11 exact_float_generic # 1218908 ns/call +magnitude_sc16q11 11bit_table_armv7a_neon_vfpv4 # 1940816 ns/call +magnitude_sc16q11 12bit_table_armv7a_neon_vfpv4 # 2035932 ns/call +magnitude_sc16q11 12bit_table_generic # 2401932 ns/call +magnitude_sc16q11 11bit_table_generic # 2656593 ns/call + +magnitude_sc16q11_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 155218 ns/call +magnitude_sc16q11_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 155242 ns/call +magnitude_sc16q11_aligned exact_u32_armv7a_neon_vfpv4 # 612259 ns/call +magnitude_sc16q11_aligned exact_u32_armv7a_neon_vfpv4_aligned # 612269 ns/call +magnitude_sc16q11_aligned exact_float_armv7a_neon_vfpv4_aligned # 815733 ns/call +magnitude_sc16q11_aligned exact_float_armv7a_neon_vfpv4 # 821729 ns/call +magnitude_sc16q11_aligned exact_u32_generic # 1154414 ns/call +magnitude_sc16q11_aligned exact_float_generic # 1224252 ns/call +magnitude_sc16q11_aligned 11bit_table_armv7a_neon_vfpv4 # 1940788 ns/call +magnitude_sc16q11_aligned 
12bit_table_armv7a_neon_vfpv4_aligned # 2035889 ns/call +magnitude_sc16q11_aligned 12bit_table_armv7a_neon_vfpv4 # 2036579 ns/call +magnitude_sc16q11_aligned 11bit_table_armv7a_neon_vfpv4_aligned # 2077521 ns/call +magnitude_sc16q11_aligned 12bit_table_generic # 2405119 ns/call +magnitude_sc16q11_aligned 11bit_table_generic # 2657152 ns/call + +magnitude_uc8 neon_vrsqrte_armv7a_neon_vfpv4 # 188739 ns/call +magnitude_uc8 lookup_unroll_4_generic # 284930 ns/call +magnitude_uc8 lookup_armv7a_neon_vfpv4 # 291956 ns/call +magnitude_uc8 lookup_generic # 292047 ns/call +magnitude_uc8 lookup_unroll_4_armv7a_neon_vfpv4 # 298012 ns/call +magnitude_uc8 exact_armv7a_neon_vfpv4 # 921119 ns/call +magnitude_uc8 exact_generic # 1676587 ns/call + +magnitude_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4 # 187202 ns/call +magnitude_uc8_aligned neon_vrsqrte_armv7a_neon_vfpv4_aligned # 187203 ns/call +magnitude_uc8_aligned lookup_unroll_4_generic # 280048 ns/call +magnitude_uc8_aligned lookup_armv7a_neon_vfpv4_aligned # 282247 ns/call +magnitude_uc8_aligned lookup_generic # 282254 ns/call +magnitude_uc8_aligned lookup_armv7a_neon_vfpv4 # 282262 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4_aligned # 292923 ns/call +magnitude_uc8_aligned lookup_unroll_4_armv7a_neon_vfpv4 # 292985 ns/call +magnitude_uc8_aligned exact_armv7a_neon_vfpv4 # 921141 ns/call +magnitude_uc8_aligned exact_armv7a_neon_vfpv4_aligned # 921149 ns/call +magnitude_uc8_aligned exact_generic # 1676551 ns/call + +mean_power_u16 u32_armv7a_neon_vfpv4 # 45483 ns/call +mean_power_u16 neon_float_armv7a_neon_vfpv4 # 58654 ns/call +mean_power_u16 u64_armv7a_neon_vfpv4 # 79486 ns/call +mean_power_u16 float_armv7a_neon_vfpv4 # 94322 ns/call +mean_power_u16 u64_generic # 131666 ns/call +mean_power_u16 u32_generic # 132124 ns/call +mean_power_u16 float_generic # 187161 ns/call + +mean_power_u16_aligned u32_armv7a_neon_vfpv4_aligned # 44929 ns/call +mean_power_u16_aligned u32_armv7a_neon_vfpv4 # 44933 ns/call 
+mean_power_u16_aligned neon_float_armv7a_neon_vfpv4 # 58485 ns/call +mean_power_u16_aligned neon_float_armv7a_neon_vfpv4_aligned # 58488 ns/call +mean_power_u16_aligned u64_armv7a_neon_vfpv4 # 80349 ns/call +mean_power_u16_aligned u64_armv7a_neon_vfpv4_aligned # 80669 ns/call +mean_power_u16_aligned float_armv7a_neon_vfpv4_aligned # 86325 ns/call +mean_power_u16_aligned float_armv7a_neon_vfpv4 # 86326 ns/call +mean_power_u16_aligned u64_generic # 131637 ns/call +mean_power_u16_aligned u32_generic # 132092 ns/call +mean_power_u16_aligned float_generic # 187127 ns/call