diff options
Diffstat (limited to 'lib/ffts/src/ffts.c')
| -rw-r--r-- | lib/ffts/src/ffts.c | 867 | 
1 files changed, 504 insertions, 363 deletions
| diff --git a/lib/ffts/src/ffts.c b/lib/ffts/src/ffts.c index bec2177..5d72a52 100644 --- a/lib/ffts/src/ffts.c +++ b/lib/ffts/src/ffts.c @@ -1,398 +1,539 @@  /* -  - This file is part of FFTS -- The Fastest Fourier Transform in the South -   - Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> - Copyright (c) 2012, The University of Waikato  -  - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - 	* Redistributions of source code must retain the above copyright - 		notice, this list of conditions and the following disclaimer. - 	* Redistributions in binary form must reproduce the above copyright - 		notice, this list of conditions and the following disclaimer in the - 		documentation and/or other materials provided with the distribution. - 	* Neither the name of the organization nor the -	  names of its contributors may be used to endorse or promote products - 		derived from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND - ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED - WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY - DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com> +Copyright (c) 2012, The University of Waikato + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  */ +  #include "ffts.h" + +#include "ffts_internal.h" +#include "ffts_static.h" +#include "ffts_trig.h"  #include "macros.h" -//#include "mini_macros.h"  #include "patterns.h" -#include "ffts_small.h" -#ifdef DYNAMIC_DISABLED -	#include "ffts_static.h" +#ifndef DYNAMIC_DISABLED +#include "codegen.h" +#endif + +#if _WIN32 +#include <windows.h> +#else +#if __APPLE__ +#include <libkern/OSCacheControl.h> +#endif + +#if HAVE_SYS_MMAN_H +#include <sys/mman.h> +#endif +#endif + +#if defined(HAVE_NEON) +static const FFTS_ALIGN(64) float w_data[16] = { +     0.70710678118654757273731092936941f, +     0.70710678118654746171500846685376f, +    -0.70710678118654757273731092936941f, +    -0.70710678118654746171500846685376f, +     1.0f, +     0.70710678118654757273731092936941f, +    -0.0f, +    -0.70710678118654746171500846685376f, +     0.70710678118654757273731092936941f, +     0.70710678118654746171500846685376f, +     0.70710678118654757273731092936941f, +     0.70710678118654746171500846685376f, +     1.0f, +     0.70710678118654757273731092936941f, +     0.0f, +     0.70710678118654746171500846685376f +}; +#endif + +static FFTS_INLINE int ffts_allow_execute(void *start, size_t len) +{ +    int result; + +#ifdef _WIN32 +    DWORD old_protect; +    result = !VirtualProtect(start, len, PAGE_EXECUTE_READ, &old_protect);  #else -	#include "codegen.h" +    result = mprotect(start, len, PROT_READ | PROT_EXEC);  #endif -#include <errno.h> -  #include <sys/mman.h> -  #include <string.h> -  #include <limits.h>	   /* for PAGESIZE */ +    return result; +} + +static FFTS_INLINE int ffts_deny_execute(void *start, size_t len) +{ +    int result; +#ifdef _WIN32 +    DWORD old_protect; +    result = (int) VirtualProtect(start, len, PAGE_READWRITE, &old_protect); +#else +    result = mprotect(start, len, PROT_READ | PROT_WRITE); +#endif + +    return result; +} + +static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length) +{ +#ifdef _WIN32 +    return !FlushInstructionCache(GetCurrentProcess(), start, length); +#else +#ifdef __APPLE__ +    sys_icache_invalidate(start, length); +#elif __ANDROID__ +    cacheflush((long) start, (long) start + length, 0); +#elif __linux__ +#if GCC_VERSION_AT_LEAST(4,3) +    __builtin___clear_cache(start, (char*) start + length); +#elif __GNUC__ +    __clear_cache((long) start, (long) start + length); +#endif +#endif +    return 0; +#endif +} + +static FFTS_INLINE void *ffts_vmem_alloc(size_t length) +{  #if __APPLE__ -  #include <libkern/OSCacheControl.h> +    return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0); +#elif _WIN32 +    return VirtualAlloc(NULL, length, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);  #else +#ifndef MAP_ANONYMOUS +#define MAP_ANONYMOUS 0x20  #endif -void ffts_execute(ffts_plan_t *p, const void *  in, void *  out) { -	p->transform(p, (const float *)in, (float *)out); +    return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); +#endif  } -void ffts_free(ffts_plan_t *p) { -	p->destroy(p); +static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length) +{ +#ifdef _WIN32 +    (void) length; +    VirtualFree(addr, 0, MEM_RELEASE); +#else +    munmap(addr, length); +#endif  } -void ffts_free_1d(ffts_plan_t *p) { -	 -	size_t i; - -	if(p->ws) { -		FFTS_FREE(p->ws); -	} -	if(p->is) free(p->is); -	if(p->ws_is) free(p->ws_is); -	if(p->offsets)		free(p->offsets); -	//free(p->transforms); -	if(p->transforms) free(p->transforms); - -	if(p->transform_base) { -		if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) { -			perror("Couldn't mprotect"); -			exit(errno); -		} -		munmap(p->transform_base, p->transform_size); -		//free(p->transform_base); -	} -	free(p); +FFTS_API void +ffts_execute(ffts_plan_t *p, const void *in, void *out) +{ +    /* TODO: Define NEEDS_ALIGNED properly instead */ +#if defined(HAVE_SSE) || defined(HAVE_NEON) +    if (((uintptr_t) in % 16) != 0) { +        LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n"); +    } + +    if (((uintptr_t) out % 16) != 0) { +        LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n"); +    } +#endif + +    p->transform(p, (const float*) in, (float*) out);  } -ffts_plan_t *ffts_init_1d(size_t N, int sign) { -	ffts_plan_t *p = malloc(sizeof(ffts_plan_t)); -	size_t leafN = 8;	 -	size_t i;	 - -#ifdef __arm__ -//#ifdef HAVE_NEON -	V MULI_SIGN; -	 -	if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); -	else         MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); -//#endif  +FFTS_API void +ffts_free(ffts_plan_t *p) +{ +    if (p) { +        p->destroy(p); +    } +} + +void ffts_free_1d(ffts_plan_t *p) +{ +#if !defined(DYNAMIC_DISABLED) +    if (p->transform_base) { +        ffts_deny_execute(p->transform_base, p->transform_size); +        ffts_vmem_free(p->transform_base, p->transform_size); +    } +#endif + +    if (p->ws_is) { +        free(p->ws_is); +    } + +    if (p->ws) { +        FFTS_FREE(p->ws); +    } + +    if (p->is) { +        free(p->is); +    } + +    if (p->offsets) { +        free(p->offsets); +    } + +    free(p); +} + +static int +ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign) +{ +    V4SF MULI_SIGN; +    size_t n_luts; +    ffts_cpx_32f *w; +    ffts_cpx_32f *tmp; +    size_t i, j, m, n; +    int stride; + +    if (sign < 0) { +        MULI_SIGN = V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f); +    } else { +        MULI_SIGN = V4SF_LIT4(0.0f, -0.0f, 0.0f, -0.0f); +    } + +    /* LUTS */ +    n_luts = ffts_ctzl(N / leaf_N); +    if (n_luts >= 32) { +        n_luts = 0; +    } + +    if (n_luts) { +        size_t lut_size; + +#if defined(__arm__) && !defined(HAVE_NEON) +        lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f) / 2;  #else -	V MULI_SIGN; -	 -	if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f); -	else         MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f); +        lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f);  #endif -	p->transform = NULL; -	p->transform_base = NULL; -	p->transforms = NULL; -	p->is = NULL; -	p->ws_is = NULL; -	p->ws = NULL; -	p->offsets = NULL; -	p->destroy = ffts_free_1d; - -	if(N >= 32) { -		ffts_init_offsets(p, N, leafN); -#ifdef __arm__ +        p->ws = FFTS_MALLOC(lut_size, 32); +        if (!p->ws) { +            goto cleanup; +        } + +        p->ws_is = (size_t*) malloc(n_luts * sizeof(*p->ws_is)); +        if (!p->ws_is) { +            goto cleanup; +        } +    } + +    w = p->ws; +    n = leaf_N * 2; +  #ifdef HAVE_NEON -		ffts_init_is(p, N, leafN, 1); +    V4SF neg = (sign < 0) ? V4SF_LIT4(0.0f, 0.0f, 0.0f, 0.0f) : V4SF_LIT4(-0.0f, -0.0f, -0.0f, -0.0f); +#endif + +    /* calculate factors */ +    m = leaf_N << (n_luts - 2); +    tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32); + +    ffts_generate_cosine_sine_pow2_32f(tmp, m); + +    /* generate lookup tables */ +    stride = 1 << (n_luts - 1); +    for (i = 0; i < n_luts; i++) { +        p->ws_is[i] = w - (ffts_cpx_32f*) p->ws; + +        if (!i) { +            ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32); +            float *fw0 = (float*) w0; +            float *fw = (float*) w; + +            for (j = 0; j < n/4; j++) { +                w0[j][0] = tmp[j * stride][0]; +                w0[j][1] = tmp[j * stride][1]; +            } + +#if defined(__arm__) +#ifdef HAVE_NEON +            for (j = 0; j < n/4; j += 4) { +                V4SF2 temp0 = V4SF2_LD(fw0 + j*2); +                temp0.val[1] = V4SF_XOR(temp0.val[1], neg); +                V4SF2_STORE_SPR(fw + j*2, temp0); +            }  #else -		ffts_init_is(p, N, leafN, 1); +            for (j = 0; j < n/4; j++) { +                fw[j*2+0] = fw0[j*2+0]; +                fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; +            }  #endif +            w += n/4;  #else -		ffts_init_is(p, N, leafN, 1); +            for (j = 0; j < n/4; j += 2) { +                V4SF re, im, temp0; +                temp0 = V4SF_LD(fw0 + j*2); +                re = V4SF_DUPLICATE_RE(temp0); +                im = V4SF_DUPLICATE_IM(temp0); +                im = V4SF_XOR(im, MULI_SIGN); +                V4SF_ST(fw + j*4 + 0, re); +                V4SF_ST(fw + j*4 + 4, im); +            } + +            w += n/4 * 2;  #endif -		 -		p->i0 = N/leafN/3+1; -		p->i1 = N/leafN/3; -		if((N/leafN) % 3 > 1) p->i1++; -		p->i2 = N/leafN/3; -		 -		#ifdef __arm__	 -		#ifdef HAVE_NEON -		p->i0/=2; -		p->i1/=2; -		#endif -		#else -		p->i0/=2; -		p->i1/=2; -		#endif - -	}else{ -		p->transforms = malloc(2 * sizeof(transform_index_t)); -		p->transforms[0] = 0; -		p->transforms[1] = 1; -		if(N == 2) p->transform = &firstpass_2; -		else if(N == 4 && sign == -1) p->transform = &firstpass_4_f; -		else if(N == 4 && sign == 1) p->transform = &firstpass_4_b; -		else if(N == 8 && sign == -1) p->transform = &firstpass_8_f; -		else if(N == 8 && sign == 1) p->transform = &firstpass_8_b; -		else if(N == 16 && sign == -1) p->transform = &firstpass_16_f; -		else if(N == 16 && sign == 1) p->transform = &firstpass_16_b; - -		p->is = NULL; -		p->offsets = NULL; -	} - -		int hardcoded = 0; - -		/*      LUTS           */ -		size_t n_luts = __builtin_ctzl(N/leafN); -		if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; } - -		if(n_luts >= 32) n_luts = 0; - -//		fprintf(stderr, "n_luts = %zu\n", n_luts); -		 -		cdata_t *w; - -		int n = leafN*2; -		if(hardcoded) n = 8; -		 -		size_t lut_size = 0; - -		for(i=0;i<n_luts;i++) { -			if(!i || hardcoded) { -			#ifdef __arm__  -				if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t); -				else lut_size += n/4 * sizeof(cdata_t); -			#else -				lut_size += n/4 * 2 * sizeof(cdata_t); -			#endif -				n *= 2; -			} else { -			#ifdef __arm__ -				lut_size += n/8 * 3 * sizeof(cdata_t); -			#else -				lut_size += n/8 * 3 * 2 * sizeof(cdata_t); -			#endif -			} -			n *= 2; -		} -		 -//		lut_size *= 16; -		 -	//	fprintf(stderr, "lut size = %zu\n", lut_size); -		if(n_luts) { -			p->ws = FFTS_MALLOC(lut_size,32); -			p->ws_is = malloc(n_luts * sizeof(size_t)); -		}else{ -			p->ws = NULL; -			p->ws_is = NULL; -		} -		w = p->ws; - -		n = leafN*2; -		if(hardcoded) n = 8; -		 -		#ifdef HAVE_NEON -			V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f); -		#endif -		 -		for(i=0;i<n_luts;i++) { -			p->ws_is[i] = w - (cdata_t *)p->ws;	 -			//fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);	 -			 -			if(!i || hardcoded) { -				cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); - -				size_t j; -				for(j=0;j<n/4;j++) { -					w0[j][0]	= W_re(n,j); -					w0[j][1]	= W_im(n,j); -				} - - -				float *fw0 = (float *)w0; -				#ifdef __arm__ -					if(N < 32) { -						//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); -						float *fw = (float *)w; -						V temp0, temp1, temp2; -						for(j=0;j<n/4;j+=2) { -						//	#ifdef HAVE_NEON -							temp0 = VLD(fw0 + j*2); -							V re, im; -							re = VDUPRE(temp0); -							im = VDUPIM(temp0); -							#ifdef HAVE_NEON  -								im = VXOR(im, MULI_SIGN); -								//im = IMULI(sign>0, im); -							#else -								im = MULI(sign>0, im); -							#endif -							VST(fw + j*4  , re); -							VST(fw + j*4+4, im); -					//		#endif -						} -						w += n/4 * 2; -					}else{ -						//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32); -						float *fw = (float *)w; -						#ifdef HAVE_NEON -							VS temp0, temp1, temp2; -							for(j=0;j<n/4;j+=4) { -								temp0 = VLD2(fw0 + j*2); -								temp0.val[1] = VXOR(temp0.val[1], neg); -								STORESPR(fw + j*2, temp0); -							} -						#else -							for(j=0;j<n/4;j+=1) { -								fw[j*2] = fw0[j*2]; -								fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; -							} -						#endif -						w += n/4; -					} -				#else -					//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32); -					float *fw = (float *)w; -					V temp0, temp1, temp2; -					for(j=0;j<n/4;j+=2) { -						temp0 = VLD(fw0 + j*2); -						V re, im; -						re = VDUPRE(temp0); -						im = VDUPIM(temp0); -						im = VXOR(im, MULI_SIGN); -						VST(fw + j*4  , re); -						VST(fw + j*4+4, im); -					} -					w += n/4 * 2; -				#endif - -				FFTS_FREE(w0); -			}else{ - -				cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); -				cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); -				cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32); - -				size_t j; -				for(j=0;j<n/8;j++) { -					w0[j][0]	= W_re(n,j*2); -					w0[j][1]	= W_im(n,j*2); -					w1[j][0]	= W_re(n,j); -					w1[j][1]	= W_im(n,j); -					w2[j][0]	= W_re(n,j + (n/8)); -					w2[j][1]	= W_im(n,j + (n/8)); - -				} - -				float *fw0 = (float *)w0; -				float *fw1 = (float *)w1; -				float *fw2 = (float *)w2; -				#ifdef __arm__ -					//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32); -					float *fw = (float *)w; -					#ifdef HAVE_NEON	 -						VS temp0, temp1, temp2; -						for(j=0;j<n/8;j+=4) { -							temp0 = VLD2(fw0 + j*2); -							temp0.val[1] = VXOR(temp0.val[1], neg); -							STORESPR(fw + j*2*3,      temp0); -							temp1 = VLD2(fw1 + j*2); -							temp1.val[1] = VXOR(temp1.val[1], neg); -							STORESPR(fw + j*2*3 + 8,  temp1); -							temp2 = VLD2(fw2 + j*2); -							temp2.val[1] = VXOR(temp2.val[1], neg); -							STORESPR(fw + j*2*3 + 16, temp2); -						} -					#else -						for(j=0;j<n/8;j+=1) { -								fw[j*6] = fw0[j*2]; -								fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; -								fw[j*6+2] = fw1[j*2+0]; -								fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1]; -								fw[j*6+4] = fw2[j*2+0]; -								fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1]; -						} -					#endif -					w += n/8 * 3; -				#else -					//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32); -					float *fw = (float *)w; -					V temp0, temp1, temp2, re, im; -					for(j=0;j<n/8;j+=2) { -						temp0 = VLD(fw0 + j*2); -						re = VDUPRE(temp0); -						im = VDUPIM(temp0); -						im = VXOR(im, MULI_SIGN); -						VST(fw + j*2*6  , re); -						VST(fw + j*2*6+4, im); - -						temp1 = VLD(fw1 + j*2); -						re = VDUPRE(temp1); -						im = VDUPIM(temp1); -						im = VXOR(im, MULI_SIGN); -						VST(fw + j*2*6+8 , re); -						VST(fw + j*2*6+12, im); - -						temp2 = VLD(fw2 + j*2); -						re = VDUPRE(temp2); -						im = VDUPIM(temp2); -						im = VXOR(im, MULI_SIGN); -						VST(fw + j*2*6+16, re); -						VST(fw + j*2*6+20, im); -					} -					w += n/8 * 3 * 2; -				#endif - -				FFTS_FREE(w0); -				FFTS_FREE(w1); -				FFTS_FREE(w2); -			} -			///p->ws[i] = w; - -			n *= 2; -		} - -	float *tmp = (float *)p->ws; - -	if(sign < 0) { -		p->oe_ws = (void *)(&w_data[4]); -		p->ee_ws = (void *)(w_data); -		p->eo_ws = (void *)(&w_data[4]); -	}else{ -		p->oe_ws = (void *)(w_data + 12); -		p->ee_ws = (void *)(w_data + 8); -		p->eo_ws = (void *)(w_data + 12); -	} - -	p->N = N; -	p->lastlut = w; -	p->n_luts = n_luts; -#ifdef DYNAMIC_DISABLED -	if(sign < 0) {  -		if(N >= 32) p->transform = ffts_static_transform_f;  -	}else{ -		if(N >= 32) p->transform = ffts_static_transform_i;  -	} +            FFTS_FREE(w0); +        } else { +            ffts_cpx_32f *w0 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); +            ffts_cpx_32f *w1 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); +            ffts_cpx_32f *w2 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32); + +            float *fw0 = (float*) w0; +            float *fw1 = (float*) w1; +            float *fw2 = (float*) w2; + +            float *fw = (float *)w; + +            for (j = 0; j < n/8; j++) { +                w0[j][0] = tmp[2 * j * stride][0]; +                w0[j][1] = tmp[2 * j * stride][1]; + +                w1[j][0] = tmp[j * stride][0]; +                w1[j][1] = tmp[j * stride][1]; + +                w2[j][0] = tmp[(j + (n/8)) * stride][0]; +                w2[j][1] = tmp[(j + (n/8)) * stride][1]; +            } + +#if defined(__arm__) +#ifdef HAVE_NEON +            for (j = 0; j < n/8; j += 4) { +                V4SF2 temp0, temp1, temp2; + +                temp0 = V4SF2_LD(fw0 + j*2); +                temp0.val[1] = V4SF_XOR(temp0.val[1], neg); +                V4SF2_STORE_SPR(fw + j*2*3, temp0); +                 +                temp1 = V4SF2_LD(fw1 + j*2); +                temp1.val[1] = V4SF_XOR(temp1.val[1], neg); +                V4SF2_STORE_SPR(fw + j*2*3 + 8,  temp1); +                 +                temp2 = V4SF2_LD(fw2 + j*2); +                temp2.val[1] = V4SF_XOR(temp2.val[1], neg); +                V4SF2_STORE_SPR(fw + j*2*3 + 16, temp2); +            }  #else -	if(N>=32)  ffts_generate_func_code(p, N, leafN, sign); +            for (j = 0; j < n/8; j++) { +                fw[j*6+0] = fw0[j*2+0]; +                fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1]; +                fw[j*6+2] = fw1[j*2+0]; +                fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1]; +                fw[j*6+4] = fw2[j*2+0]; +                fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1]; +            }  #endif +            w += n/8 * 3; +#else +            for (j = 0; j < n/8; j += 2) { +                V4SF temp0, temp1, temp2, re, im; + +                temp0 = V4SF_LD(fw0 + j*2); +                re = V4SF_DUPLICATE_RE(temp0); +                im = V4SF_DUPLICATE_IM(temp0); +                im = V4SF_XOR(im, MULI_SIGN); +                V4SF_ST(fw + j*2*6+0, re); +                V4SF_ST(fw + j*2*6+4, im); + +                temp1 = V4SF_LD(fw1 + j*2); +                re = V4SF_DUPLICATE_RE(temp1); +                im = V4SF_DUPLICATE_IM(temp1); +                im = V4SF_XOR(im, MULI_SIGN); +                V4SF_ST(fw + j*2*6+8 , re); +                V4SF_ST(fw + j*2*6+12, im); + +                temp2 = V4SF_LD(fw2 + j*2); +                re = V4SF_DUPLICATE_RE(temp2); +                im = V4SF_DUPLICATE_IM(temp2); +                im = V4SF_XOR(im, MULI_SIGN); +                V4SF_ST(fw + j*2*6+16, re); +                V4SF_ST(fw + j*2*6+20, im); +            } + +            w += n/8 * 3 * 2; +#endif + +            FFTS_FREE(w0); +            FFTS_FREE(w1); +            FFTS_FREE(w2); +        } + +        n *= 2; +        stride >>= 1; +    } + +#if defined(HAVE_NEON) +    if (sign < 0) { +        p->oe_ws = (void*)(w_data + 4); +        p->ee_ws = (void*)(w_data); +        p->eo_ws = (void*)(w_data + 4); +    } else { +        p->oe_ws = (void*)(w_data + 12); +        p->ee_ws = (void*)(w_data + 8); +        p->eo_ws = (void*)(w_data + 12); +    } +#endif + +    FFTS_FREE(tmp); -	return p; +    p->lastlut = w; +    p->n_luts = n_luts; +    return 0; + +cleanup: +    return -1;  } +FFTS_API ffts_plan_t* +ffts_init_1d(size_t N, int sign) +{ +    const size_t leaf_N = 8; +    ffts_plan_t *p; + +    if (N < 2 || (N & (N - 1)) != 0) { +        LOG("FFT size must be a power of two\n"); +        return NULL; +    } + +    p = calloc(1, sizeof(*p)); +    if (!p) { +        return NULL; +    } + +    p->destroy = ffts_free_1d; +    p->N = N; + +    if (N >= 32) { +        /* generate lookup tables */ +        if (ffts_generate_luts(p, N, leaf_N, sign)) { +            goto cleanup; +        } + +        p->offsets = ffts_init_offsets(N, leaf_N); +        if (!p->offsets) { +            goto cleanup; +        } + +        p->is = ffts_init_is(N, leaf_N, 1); +        if (!p->is) { +            goto cleanup; +        } + +        p->i0 = N/leaf_N/3 + 1; +        p->i1 = p->i2 = N/leaf_N/3; +        if ((N/leaf_N) % 3 > 1) { +            p->i1++; +        } + +#if !defined(HAVE_VFP) || defined(DYNAMIC_DISABLED) +        p->i0 /= 2; +        p->i1 /= 2; +#endif + +#ifdef DYNAMIC_DISABLED +        if (sign < 0) { +            p->transform = ffts_static_transform_f_32f; +        } else { +            p->transform = ffts_static_transform_i_32f; +        } +#else +        /* determinate transform size */ +#if defined(__arm__) +        if (N < 8192) { +            p->transform_size = 8192; +        } else { +            p->transform_size = N; +        } +#else +        if (N < 2048) { +            p->transform_size = 16384; +        } else { +            p->transform_size = 16384 + 2*N/8 * ffts_ctzl(N); +        } +#endif + +        /* allocate code/function buffer */ +        p->transform_base = ffts_vmem_alloc(p->transform_size); +        if (!p->transform_base) { +            goto cleanup; +        } + +        /* generate code */ +        p->transform = ffts_generate_func_code(p, N, leaf_N, sign); +        if (!p->transform) { +            goto cleanup; +        } + +        /* enable execution with read access for the block */ +        if (ffts_allow_execute(p->transform_base, p->transform_size)) { +            goto cleanup; +        } + +        /* flush from the instruction cache */ +        if (ffts_flush_instruction_cache(p->transform_base, p->transform_size)) { +            goto cleanup; +        } +#endif +    } else { +        switch (N) { +        case 2: +            p->transform = &ffts_small_2_32f; +            break; +        case 4: +            if (sign == -1) { +                p->transform = &ffts_small_forward4_32f; +            } else if (sign == 1) { +                p->transform = &ffts_small_backward4_32f; +            } +            break; +        case 8: +            if (sign == -1) { +                p->transform = &ffts_small_forward8_32f; +            } else if (sign == 1) { +                p->transform = &ffts_small_backward8_32f; +            } +            break; +        case 16: +        default: +            if (sign == -1) { +                p->transform = &ffts_small_forward16_32f; +            } else { +                p->transform = &ffts_small_backward16_32f; +            } +            break; +        } +    } + +    return p; + +cleanup: +    ffts_free_1d(p); +    return NULL; +}
\ No newline at end of file | 
