dump1090-fa/starch/example/impl/subtract_n.c

95 lines
2.6 KiB
C

void STARCH_IMPL(subtract_n, generic) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out)
{
const uint16_t * restrict in_align = STARCH_ALIGNED(in);
uint16_t * restrict out_align = STARCH_ALIGNED(out);
while (len--) {
out_align[0] = in_align[0] - n;
in_align++;
out_align++;
}
}
void STARCH_IMPL(subtract_n, unroll_4) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out)
{
const uint16_t * restrict in_align = STARCH_ALIGNED(in);
uint16_t * restrict out_align = STARCH_ALIGNED(out);
unsigned len4 = len >> 2;
unsigned len1 = len & 3;
while (len4--) {
out_align[0] = in_align[0] - n;
out_align[1] = in_align[1] - n;
out_align[2] = in_align[2] - n;
out_align[3] = in_align[3] - n;
in_align += 4;
out_align += 4;
}
while (len1--) {
out_align[0] = in_align[0] - n;
in_align++;
out_align++;
}
}
void STARCH_IMPL(subtract_n, bad_implementation) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out)
{
// This is a deliberately bad implementation that produces
// incorrect results. The error should be caught during
// benchmarking via STARCH_BENCHMARK_VERIFY.
const uint16_t * restrict in_align = STARCH_ALIGNED(in);
uint16_t * restrict out_align = STARCH_ALIGNED(out);
while (len--) {
out_align[0] = in_align[0] - n;
if (len % 10000 == 0)
out_align[0] += 1;
in_align++;
out_align++;
}
}
#ifdef STARCH_FEATURE_NEON
#include <arm_neon.h>
void STARCH_IMPL_REQUIRES(subtract_n, neon_intrinsics, STARCH_FEATURE_NEON) (const uint16_t *in, unsigned len, uint16_t n, uint16_t *out)
{
const uint16_t * restrict in_align = STARCH_ALIGNED(in);
uint16_t * restrict out_align = STARCH_ALIGNED(out);
uint16x8_t subtractor = vdupq_n_u16(n);
unsigned len8 = len >> 3;
unsigned len4 = len & 4;
unsigned len1 = len & 3;
while (len8--) {
uint16x8_t value = vld1q_u16(in_align);
uint16x8_t result = vsubq_u16(value, subtractor);
vst1q_u16(out_align, result);
in_align += 8;
out_align += 8;
}
if (len4) {
uint16x4_t value = vld1_u16(in_align);
uint16x4_t result = vsub_u16(value, vget_low_u16(subtractor));
vst1_u16(out_align, result);
in_align += 4;
out_align += 4;
}
while (len1--) {
uint16x4_t value = vld1_dup_u16(in_align);
uint16x4_t result = vsub_u16(value, vget_low_u16(subtractor));
vst1_lane_u16(out_align, result, 0);
in_align++;
out_align++;
}
}
#endif