diff options
Diffstat (limited to 'lib/ffts/src/macros-sse.h')
-rw-r--r-- | lib/ffts/src/macros-sse.h | 250 |
1 files changed, 246 insertions, 4 deletions
diff --git a/lib/ffts/src/macros-sse.h b/lib/ffts/src/macros-sse.h
index 827aa67..46e1f29 100644
--- a/lib/ffts/src/macros-sse.h
+++ b/lib/ffts/src/macros-sse.h
@@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
 
 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 
 All rights reserved.
 
@@ -40,9 +41,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include <xmmintrin.h>
 
-#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
-#define FFTS_FREE(d) (_mm_free(d))
-
 typedef __m128 V4SF;
 
 #define V4SF_ADD _mm_add_ps
@@ -56,8 +54,9 @@ typedef __m128 V4SF;
 #define V4SF_SWAP_PAIRS(x) \
     (_mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1)))
 
+/* note: order is swapped */
 #define V4SF_UNPACK_HI(x,y) \
-    (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,3,2)))
+    (_mm_movehl_ps(y, x))
 
 #define V4SF_UNPACK_LO(x,y) \
     (_mm_movelh_ps(x, y))
@@ -97,4 +96,247 @@ V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
     return V4SF_ADD(re, im);
 }
 
+#ifdef FFTS_DOUBLE
+/*
+ * Plain-C stand-in for a vector of four doubles, stored as two interleaved
+ * (real, imaginary) pairs: (r1, i1) and (r2, i2). The u[] member overlays
+ * the same storage as eight 32-bit words so V4DF_XOR can work on raw bits.
+ */
+typedef union {
+    struct {
+        double r1;
+        double i1;
+        double r2;
+        double i2;
+    } r;
+    uint32_t u[8];
+} V4DF;
+
+/* Build a V4DF from four scalars, highest lane first: f0 -> r1 ... f3 -> i2. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_LIT4(double f3, double f2, double f1, double f0)
+{
+    V4DF z;
+
+    z.r.r1 = f0;
+    z.r.i1 = f1;
+    z.r.r2 = f2;
+    z.r.i2 = f3;
+
+    return z;
+}
+
+/* Lane-wise sum x + y. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_ADD(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1 + y.r.r1;
+    z.r.i1 = x.r.i1 + y.r.i1;
+    z.r.r2 = x.r.r2 + y.r.r2;
+    z.r.i2 = x.r.i2 + y.r.i2;
+
+    return z;
+}
+
+/* Lane-wise difference x - y. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_SUB(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1 - y.r.r1;
+    z.r.i1 = x.r.i1 - y.r.i1;
+    z.r.r2 = x.r.r2 - y.r.r2;
+    z.r.i2 = x.r.i2 - y.r.i2;
+
+    return z;
+}
+
+/* Lane-wise product x * y (each lane on its own, not a complex multiply). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_MUL(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1 * y.r.r1;
+    z.r.i1 = x.r.i1 * y.r.i1;
+    z.r.r2 = x.r.r2 * y.r.r2;
+    z.r.i2 = x.r.i2 * y.r.i2;
+
+    return z;
+}
+
+/* Bitwise XOR of x and y over all eight 32-bit words of the union. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_XOR(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.u[0] = x.u[0] ^ y.u[0];
+    z.u[1] = x.u[1] ^ y.u[1];
+    z.u[2] = x.u[2] ^ y.u[2];
+    z.u[3] = x.u[3] ^ y.u[3];
+    z.u[4] = x.u[4] ^ y.u[4];
+    z.u[5] = x.u[5] ^ y.u[5];
+    z.u[6] = x.u[6] ^ y.u[6];
+    z.u[7] = x.u[7] ^ y.u[7];
+
+    return z;
+}
+
+/* Swap the real and imaginary lanes within each pair: (r, i) -> (i, r). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_SWAP_PAIRS(V4DF x)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.i1;
+    z.r.i1 = x.r.r1;
+    z.r.r2 = x.r.i2;
+    z.r.i2 = x.r.r2;
+
+    return z;
+}
+
+/* Take the low pair (r1, i1) from x and the high pair (r2, i2) from y. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_BLEND(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = y.r.r2;
+    z.r.i2 = y.r.i2;
+
+    return z;
+}
+
+/* High pair of x goes to the low half, high pair of y to the high half. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_UNPACK_HI(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r2;
+    z.r.i1 = x.r.i2;
+    z.r.r2 = y.r.r2;
+    z.r.i2 = y.r.i2;
+
+    return z;
+}
+
+/* Low pair of x goes to the low half, low pair of y to the high half. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_UNPACK_LO(V4DF x, V4DF y)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = y.r.r1;
+    z.r.i2 = y.r.i1;
+
+    return z;
+}
+
+/* Copy each pair's real lane over its imaginary lane: (r, i) -> (r, r). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_DUPLICATE_RE(V4DF x)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.r1;
+    z.r.r2 = x.r.r2;
+    z.r.i2 = x.r.r2;
+
+    return z;
+}
+
+/* Copy each pair's imaginary lane over its real lane: (r, i) -> (i, i). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_DUPLICATE_IM(V4DF x)
+{
+    V4DF z;
+
+    z.r.r1 = x.r.i1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = x.r.i2;
+    z.r.i2 = x.r.i2;
+
+    return z;
+}
+
+/* Return re*d - im*s lane-wise, where s is V4DF_SWAP_PAIRS(d). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMUL(V4DF d, V4DF re, V4DF im)
+{
+    re = V4DF_MUL(re, d);
+    im = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));
+    return V4DF_SUB(re, im);
+}
+
+/* Return re*d + im*s lane-wise, where s is V4DF_SWAP_PAIRS(d). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMULJ(V4DF d, V4DF re, V4DF im)
+{
+    re = V4DF_MUL(re, d);
+    im = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));
+    return V4DF_ADD(re, im);
+}
+
+/*
+ * Negate one lane of each pair: the real lanes when inv is nonzero,
+ * otherwise the imaginary lanes. The other lane is copied unchanged.
+ */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_MULI(int inv, V4DF x)
+{
+    V4DF z;
+
+    if (inv) {
+        z.r.r1 = -x.r.r1;
+        z.r.i1 = x.r.i1;
+        z.r.r2 = -x.r.r2;
+        z.r.i2 = x.r.i2;
+    } else {
+        z.r.r1 = x.r.r1;
+        z.r.i1 = -x.r.i1;
+        z.r.r2 = x.r.r2;
+        z.r.i2 = -x.r.i2;
+    }
+
+    return z;
+}
+
+/* Multiply each complex pair by -j when inv is nonzero, otherwise by +j. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMULI(int inv, V4DF x)
+{
+    return V4DF_SWAP_PAIRS(V4DF_MULI(inv, x));
+}
+
+/* Load a V4DF from s with memcpy, so s need not be aligned for V4DF. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_LD(const void *s)
+{
+    V4DF z;
+    memcpy(&z, s, sizeof(z));
+    return z;
+}
+
+/*
+ * Store all four lanes of s to d. Uses memcpy, matching V4DF_LD, so d does
+ * not have to be aligned for V4DF.
+ */
+static FFTS_ALWAYS_INLINE void
+V4DF_ST(void *d, V4DF s)
+{
+    memcpy(d, &s, sizeof(s));
+}
+#endif
+
 #endif /* FFTS_MACROS_SSE_H */