diff options
Diffstat (limited to 'lib/ffts/src/macros.h')
-rw-r--r-- | lib/ffts/src/macros.h | 172 |
1 files changed, 170 insertions, 2 deletions
diff --git a/lib/ffts/src/macros.h b/lib/ffts/src/macros.h index e7e349f..99b0c53 100644 --- a/lib/ffts/src/macros.h +++ b/lib/ffts/src/macros.h @@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi> All rights reserved. @@ -41,14 +42,29 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef HAVE_NEON #include "macros-neon.h" #elif HAVE_SSE +#ifdef HAVE_AVX +#include "macros-avx.h" +#else #include "macros-sse.h" +#endif // NOTE: AltiVec support disabled until updated to provide new V4SF variable type -//#elif __powerpc__ -//#include "macros-altivec.h" +#elif __powerpc__ +#include "macros-altivec.h" #else #include "macros-alpha.h" #endif +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_TX2(V4DF *a, V4DF *b) +{ + V4DF t0 = V4DF_UNPACK_LO(*a, *b); + V4DF t1 = V4DF_UNPACK_HI(*a, *b); + *a = t0; + *b = t1; +} +#endif + static FFTS_INLINE void V4SF_TX2(V4SF *a, V4SF *b) { @@ -58,6 +74,34 @@ V4SF_TX2(V4SF *a, V4SF *b) *b = t1; } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_K_N(int inv, + V4DF re, + V4DF im, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF uk, uk2, zk_p, zk_n, zk, zk_d; + + uk = *r0; + uk2 = *r1; + + zk_p = V4DF_IMUL(*r2, re, im); + zk_n = V4DF_IMULJ(*r3, re, im); + + zk = V4DF_ADD(zk_p, zk_n); + zk_d = V4DF_IMULI(inv, V4DF_SUB(zk_p, zk_n)); + + *r2 = V4DF_SUB(uk, zk); + *r0 = V4DF_ADD(uk, zk); + *r3 = V4DF_ADD(uk2, zk_d); + *r1 = V4DF_SUB(uk2, zk_d); +} +#endif + static FFTS_INLINE void V4SF_K_N(int inv, V4SF re, @@ -84,6 +128,45 @@ V4SF_K_N(int inv, *r1 = V4SF_SUB(uk2, zk_d); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_L_2_4(int inv, + const double *FFTS_RESTRICT i0, + const double *FFTS_RESTRICT i1, + const double *FFTS_RESTRICT i2, + const double *FFTS_RESTRICT i3, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = V4DF_LD(i0); + t1 = V4DF_LD(i1); + t2 = V4DF_LD(i2); + t3 = V4DF_LD(i3); + + t4 = V4DF_ADD(t0, t1); + t5 = V4DF_SUB(t0, t1); + t6 = V4DF_ADD(t2, t3); + t7 = V4DF_SUB(t2, t3); + + *r0 = V4DF_UNPACK_LO(t4, t5); + *r1 = V4DF_UNPACK_LO(t6, t7); + + t5 = V4DF_IMULI(inv, t5); + + t0 = V4DF_ADD(t6, t4); + t2 = V4DF_SUB(t6, t4); + t1 = V4DF_SUB(t7, t5); + t3 = V4DF_ADD(t7, t5); + + *r3 = V4DF_UNPACK_HI(t0, t1); + *r2 = V4DF_UNPACK_HI(t2, t3); +} +#endif + static FFTS_INLINE void V4SF_L_2_4(int inv, const float *FFTS_RESTRICT i0, @@ -121,6 +204,46 @@ V4SF_L_2_4(int inv, *r2 = V4SF_UNPACK_HI(t2, t3); } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_L_4_4(int inv, + const double *FFTS_RESTRICT i0, + const double *FFTS_RESTRICT i1, + const double *FFTS_RESTRICT i2, + const double *FFTS_RESTRICT i3, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = V4DF_LD(i0); + t1 = V4DF_LD(i1); + t2 = V4DF_LD(i2); + t3 = V4DF_LD(i3); + + t4 = V4DF_ADD(t0, t1); + t5 = V4DF_SUB(t0, t1); + t6 = V4DF_ADD(t2, t3); + + t7 = V4DF_IMULI(inv, V4DF_SUB(t2, t3)); + + t0 = V4DF_ADD(t4, t6); + t2 = V4DF_SUB(t4, t6); + t1 = V4DF_SUB(t5, t7); + t3 = V4DF_ADD(t5, t7); + + V4DF_TX2(&t0, &t1); + V4DF_TX2(&t2, &t3); + + *r0 = t0; + *r2 = t1; + *r1 = t2; + *r3 = t3; +} +#endif + static FFTS_INLINE void V4SF_L_4_4(int inv, const float *FFTS_RESTRICT i0, @@ -159,6 +282,48 @@ V4SF_L_4_4(int inv, *r3 = t3; } +#ifdef FFTS_DOUBLE +static FFTS_INLINE void +V4DF_L_4_2(int inv, + const double *FFTS_RESTRICT i0, + const double *FFTS_RESTRICT i1, + const double *FFTS_RESTRICT i2, + const double *FFTS_RESTRICT i3, + V4DF *r0, + V4DF *r1, + V4DF *r2, + V4DF *r3) +{ + V4DF t0, t1, t2, t3, t4, t5, t6, t7; + + t0 = V4DF_LD(i0); + t1 = V4DF_LD(i1); + t6 = V4DF_LD(i2); + t7 = V4DF_LD(i3); + + t2 = V4DF_BLEND(t6, t7); + t3 = V4DF_BLEND(t7, t6); + + t4 = V4DF_ADD(t0, t1); + t5 = V4DF_SUB(t0, t1); + t6 = V4DF_ADD(t2, t3); + t7 = V4DF_SUB(t2, t3); + + *r2 = V4DF_UNPACK_HI(t4, t5); + *r3 = V4DF_UNPACK_HI(t6, t7); + + t7 = V4DF_IMULI(inv, t7); + + t0 = V4DF_ADD(t4, t6); + t2 = V4DF_SUB(t4, t6); + t1 = V4DF_SUB(t5, t7); + t3 = V4DF_ADD(t5, t7); + + *r0 = V4DF_UNPACK_LO(t0, t1); + *r1 = V4DF_UNPACK_LO(t2, t3); +} +#endif + static FFTS_INLINE void V4SF_L_4_2(int inv, const float *FFTS_RESTRICT i0, @@ -199,6 +364,9 @@ V4SF_L_4_2(int inv, *r1 = V4SF_UNPACK_LO(t2, t3); } +#define V4DF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \ + V4DF_ST(o0, r0); V4DF_ST(o1, r1); V4DF_ST(o2, r2); V4DF_ST(o3, r3); + #define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \ V4SF_ST(o0, r0); V4SF_ST(o1, r1); V4SF_ST(o2, r2); V4SF_ST(o3, r3); |