diff options
Diffstat (limited to 'lib/ffts/src/ffts.c')
-rw-r--r-- | lib/ffts/src/ffts.c | 292 |
1 files changed, 271 insertions, 21 deletions
diff --git a/lib/ffts/src/ffts.c b/lib/ffts/src/ffts.c index 7fa675a..35c5cad 100644 --- a/lib/ffts/src/ffts.c +++ b/lib/ffts/src/ffts.c @@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "ffts.h" #include "ffts_internal.h" +#include "ffts_chirp_z.h" #include "ffts_static.h" #include "ffts_trig.h" #include "macros.h" @@ -76,7 +77,8 @@ static const FFTS_ALIGN(64) float w_data[16] = { }; #endif -static FFTS_INLINE int ffts_allow_execute(void *start, size_t len) +static FFTS_INLINE int +ffts_allow_execute(void *start, size_t len) { int result; @@ -90,7 +92,8 @@ static FFTS_INLINE int ffts_allow_execute(void *start, size_t len) return result; } -static FFTS_INLINE int ffts_deny_execute(void *start, size_t len) +static FFTS_INLINE int +ffts_deny_execute(void *start, size_t len) { int result; @@ -104,7 +107,8 @@ static FFTS_INLINE int ffts_deny_execute(void *start, size_t len) return result; } -static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length) +static FFTS_INLINE int +ffts_flush_instruction_cache(void *start, size_t length) { #ifdef _WIN32 return !FlushInstructionCache(GetCurrentProcess(), start, length); @@ -124,7 +128,8 @@ static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length) #endif } -static FFTS_INLINE void *ffts_vmem_alloc(size_t length) +static FFTS_INLINE void* +ffts_vmem_alloc(size_t length) { #if __APPLE__ return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0); @@ -139,7 +144,8 @@ static FFTS_INLINE void *ffts_vmem_alloc(size_t length) #endif } -static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length) +static FFTS_INLINE void +ffts_vmem_free(void *addr, size_t length) { #ifdef _WIN32 (void) length; @@ -174,7 +180,8 @@ ffts_free(ffts_plan_t *p) } } -void ffts_free_1d(ffts_plan_t *p) +static void +ffts_free_1d(ffts_plan_t *p) { #if !defined(DYNAMIC_DISABLED) if (p->transform_base) { @@ -188,7 +195,7 @@ void ffts_free_1d(ffts_plan_t *p) } if (p->ws) { - FFTS_FREE(p->ws); + ffts_aligned_free(p->ws); } if (p->is) { @@ -233,7 +240,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f); #endif - p->ws = FFTS_MALLOC(lut_size, 32); + p->ws = ffts_aligned_malloc(lut_size); if (!p->ws) { goto cleanup; } @@ -253,7 +260,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) /* calculate factors */ m = leaf_N << (n_luts - 2); - tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32); + tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_32f)); ffts_generate_cosine_sine_pow2_32f(tmp, m); @@ -263,7 +270,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) p->ws_is[i] = w - (ffts_cpx_32f*) p->ws; if (!i) { - ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32); + ffts_cpx_32f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_32f)); float *fw0 = (float*) w0; float *fw = (float*) w; @@ -300,11 +307,11 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) w += n/4 * 2; #endif - FFTS_FREE(w0); + ffts_aligned_free(w0); } else { - ffts_cpx_32f *w0 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); - ffts_cpx_32f *w1 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); - ffts_cpx_32f *w2 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); + ffts_cpx_32f *w0 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f)); + ffts_cpx_32f *w1 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f)); + ffts_cpx_32f *w2 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f)); float *fw0 = (float*) w0; float *fw1 = (float*) w1; @@ -380,9 +387,9 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) w += n/8 * 3 * 2; #endif - FFTS_FREE(w0); - FFTS_FREE(w1); - FFTS_FREE(w2); + ffts_aligned_free(w0); + ffts_aligned_free(w1); + ffts_aligned_free(w2); } n *= 2; @@ -401,7 +408,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) } #endif - FFTS_FREE(tmp); + ffts_aligned_free(tmp); p->lastlut = w; p->n_luts = n_luts; @@ -411,18 +418,166 @@ cleanup: return -1; } +#ifdef FFTS_DOUBLE +static int +ffts_generate_luts_64f(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) +{ + V4DF MULI_SIGN; + size_t n_luts; + ffts_cpx_64f *w; + ffts_cpx_64f *tmp; + size_t i, j, m, n; + int stride; + + if (sign < 0) { + MULI_SIGN = V4DF_LIT4(-0.0, 0.0, -0.0, 0.0); + } else { + MULI_SIGN = V4DF_LIT4(0.0, -0.0, 0.0, -0.0); + } + + /* LUTS */ + n_luts = ffts_ctzl(N / leaf_N); + if (n_luts >= 32) { + n_luts = 0; + } + + if (n_luts) { + size_t lut_size; + + lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_64f); + + p->ws = ffts_aligned_malloc(lut_size); + if (!p->ws) { + goto cleanup; + } + + p->ws_is = (size_t*) malloc(n_luts * sizeof(*p->ws_is)); + if (!p->ws_is) { + goto cleanup; + } + } + + w = p->ws; + n = leaf_N * 2; + + /* calculate factors */ + m = leaf_N << (n_luts - 2); + tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_64f)); + + ffts_generate_cosine_sine_pow2_64f(tmp, m); + + /* generate lookup tables */ + stride = 1 << (n_luts - 1); + for (i = 0; i < n_luts; i++) { + p->ws_is[i] = w - (ffts_cpx_64f*) p->ws; + + if (!i) { + ffts_cpx_64f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_64f)); + double *fw0 = (double*) w0; + double *fw = (double*) w; + + for (j = 0; j < n/4; j++) { + w0[j][0] = tmp[j * stride][0]; + w0[j][1] = tmp[j * stride][1]; + } + + for (j = 0; j < n/4; j += 2) { + V4DF re, im, temp0; + temp0 = V4DF_LD(fw0 + j*2); + re = V4DF_DUPLICATE_RE(temp0); + im = V4DF_DUPLICATE_IM(temp0); + im = V4DF_XOR(im, MULI_SIGN); + V4DF_ST(fw + j*4 + 0, re); + V4DF_ST(fw + j*4 + 4, im); + } + + w += n/4 * 2; + ffts_aligned_free(w0); + } else { + ffts_cpx_64f *w0 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f)); + ffts_cpx_64f *w1 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f)); + ffts_cpx_64f *w2 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f)); + + double *fw0 = (double*) w0; + double *fw1 = (double*) w1; + double *fw2 = (double*) w2; + + double *fw = (double*)w; + + for (j = 0; j < n/8; j++) { + w0[j][0] = tmp[2 * j * stride][0]; + w0[j][1] = tmp[2 * j * stride][1]; + + w1[j][0] = tmp[j * stride][0]; + w1[j][1] = tmp[j * stride][1]; + + w2[j][0] = tmp[(j + (n/8)) * stride][0]; + w2[j][1] = tmp[(j + (n/8)) * stride][1]; + } + + for (j = 0; j < n/8; j += 2) { + V4DF temp0, temp1, temp2, re, im; + + temp0 = V4DF_LD(fw0 + j*2); + re = V4DF_DUPLICATE_RE(temp0); + im = V4DF_DUPLICATE_IM(temp0); + im = V4DF_XOR(im, MULI_SIGN); + V4DF_ST(fw + j*2*6+0, re); + V4DF_ST(fw + j*2*6+4, im); + + temp1 = V4DF_LD(fw1 + j*2); + re = V4DF_DUPLICATE_RE(temp1); + im = V4DF_DUPLICATE_IM(temp1); + im = V4DF_XOR(im, MULI_SIGN); + V4DF_ST(fw + j*2*6+8 , re); + V4DF_ST(fw + j*2*6+12, im); + + temp2 = V4DF_LD(fw2 + j*2); + re = V4DF_DUPLICATE_RE(temp2); + im = V4DF_DUPLICATE_IM(temp2); + im = V4DF_XOR(im, MULI_SIGN); + V4DF_ST(fw + j*2*6+16, re); + V4DF_ST(fw + j*2*6+20, im); + } + + w += n/8 * 3 * 2; + ffts_aligned_free(w0); + ffts_aligned_free(w1); + ffts_aligned_free(w2); + } + + n *= 2; + stride >>= 1; + } + + ffts_aligned_free(tmp); + + p->lastlut = w; + p->n_luts = n_luts; + return 0; + +cleanup: + return -1; +} +#endif + FFTS_API ffts_plan_t* ffts_init_1d(size_t N, int sign) { const size_t leaf_N = 8; ffts_plan_t *p; - if (N < 2 || (N & (N - 1)) != 0) { - LOG("FFT size must be a power of two\n"); + if (N < 2) { + LOG("FFT size must be greater than 1"); return NULL; } - p = calloc(1, sizeof(*p)); + /* check if size is not a power of two */ + if (N & (N - 1)) { + return ffts_chirp_z_init(N, sign); + } + + p = (ffts_plan_t*) calloc(1, sizeof(*p)); if (!p) { return NULL; } @@ -537,3 +692,98 @@ cleanup: ffts_free_1d(p); return NULL; } + +#ifdef FFTS_DOUBLE +FFTS_API ffts_plan_t* +ffts_init_1d_64f(size_t N, int sign) +{ + const size_t leaf_N = 8; + ffts_plan_t *p; + + if (N < 2) { + LOG("FFT size must be greater than 1"); + return NULL; + } + + p = (ffts_plan_t*) calloc(1, sizeof(*p)); + if (!p) { + return NULL; + } + + p->destroy = ffts_free_1d; + p->N = N; + + if (N >= 32) { + /* generate lookup tables */ + if (ffts_generate_luts_64f(p, N, leaf_N, sign)) { + goto cleanup; + } + + p->offsets = ffts_init_offsets(N, leaf_N); + if (!p->offsets) { + goto cleanup; + } + + p->is = ffts_init_is(N, leaf_N, 1); + if (!p->is) { + goto cleanup; + } + + p->i0 = N/leaf_N/3 + 1; + p->i1 = p->i2 = N/leaf_N/3; + if ((N/leaf_N) % 3 > 1) { + p->i1++; + } + + p->i0 /= 2; + p->i1 /= 2; + + if (sign < 0) { + p->transform = ffts_static_transform_f_64f; + } else { + p->transform = ffts_static_transform_i_64f; + } + } else { + switch (N) { + case 2: + p->transform = &ffts_small_2_64f; + break; + case 4: + if (sign == -1) { + p->transform = &ffts_small_forward4_64f; + } else if (sign == 1) { + p->transform = &ffts_small_backward4_64f; + } + break; + case 8: + if (sign == -1) { + p->transform = &ffts_small_forward8_64f; + } else if (sign == 1) { + p->transform = &ffts_small_backward8_64f; + } + break; + case 16: + default: + if (sign == -1) { + p->transform = &ffts_small_forward16_64f; + } else { + p->transform = &ffts_small_backward16_64f; + } + break; + } + } + + return p; + +cleanup: + ffts_free_1d(p); + return NULL; +} +#else +FFTS_API ffts_plan_t* +ffts_init_1d_64f(size_t N, int sign) +{ + /* disabled */ + return NULL; +} +#endif
\ No newline at end of file |