diff options
Diffstat (limited to 'lib/ffts/src/ffts_cpu.c')
-rw-r--r-- | lib/ffts/src/ffts_cpu.c | 371 |
1 files changed, 371 insertions, 0 deletions
diff --git a/lib/ffts/src/ffts_cpu.c b/lib/ffts/src/ffts_cpu.c new file mode 100644 index 0000000..daf92c8 --- /dev/null +++ b/lib/ffts/src/ffts_cpu.c @@ -0,0 +1,371 @@ +/* + +This file is part of FFTS -- The Fastest Fourier Transform in the South + +Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi> + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. +* Neither the name of the organization nor the +names of its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +*/ + +#include "ffts_cpu.h" + +#if defined(FFTS_BUILDING_CPU_TEST) +#include <stdio.h> +#endif + +#if defined(_WIN32) +#include <intrin.h> +#include <windows.h> +#endif + +/* TODO: add detection/declaration of these to CMake phase */ +#if !defined(FFTS_CPU_X64) +#if defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64) || defined(__x86_64__) +/* 64 bit x86 detected */ +#define FFTS_CPU_X64 +#endif +#endif + +#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86) +#if defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_) +/* 32 bit x86 detected */ +#define FFTS_CPU_X86 +#endif +#endif + +/* check if build is 32 bit or 64 bit x86 */ +#if defined(FFTS_CPU_X64) || defined(FFTS_CPU_X86) + +/* Build and tested on +CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313 +Mac OSX 10.9 - Apple Clang 6.0 +Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4 +Windows XP SP3 - Visual Studio 2005 SP1 x86/x64 +Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64 +Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64 +Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1) +Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3) +Windows 10 Pro - Visual Studio 2017 x86/x64 +*/ + +/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */ +#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219) +#define FFTS_HAVE_XGETBV +#endif + +#ifndef BIT +#define BIT(n) (1u << n) +#endif + +/* bit masks */ +#define FFTS_CPU_X86_SSE_BITS (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25)) +#define FFTS_CPU_X86_SSE2_BITS (BIT(26)) +#define FFTS_CPU_X86_SSE3_BITS (BIT(0)) +#define FFTS_CPU_X86_SSSE3_BITS (BIT(9)) +#define FFTS_CPU_X86_SSE4_1_BITS (BIT(19)) +#define FFTS_CPU_X86_SSE4_2_BITS (BIT(20) | BIT(23)) +#define FFTS_CPU_X86_AVX_BITS (BIT(26) | BIT(27) | BIT(28)) +#define FFTS_CPU_X86_XCR0_BITS ( +#define FFTS_CPU_X86_AVX2_BITS (BIT(5)) +#define FFTS_CPU_X86_AVX512_BITS (BIT(16)) + +/* Visual Studio 2008 or older */ +#if defined(FFTS_CPU_X64) && defined(_MSC_VER) && _MSC_VER <= 1500 +#pragma optimize("", off) +static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf) +{ + /* x64 uses a four register fast-call calling convention by default and + arguments are passed in registers RCX, RDX, R8, and R9. By disabling + optimization and passing subleaf as first argument we get __cpuidex + */ + (void) subleaf; + __cpuid(regs, leaf); +} +#pragma optimize("", on) +#endif + +static FFTS_INLINE void ffts_cpuid(int regs[4], int leaf, int subleaf) +{ +#if defined(_MSC_VER) +#if defined(FFTS_CPU_X64) + /* Visual Studio 2010 or newer */ +#if _MSC_VER > 1500 + __cpuidex(regs, leaf, subleaf); +#else + ffts_cpuidex(subleaf, regs, leaf); +#endif +#else + __asm { + mov eax, leaf + mov ecx, subleaf + mov esi, regs + cpuid + mov [esi + 0x0], eax + mov [esi + 0x4], ebx + mov [esi + 0x8], ecx + mov [esi + 0xc], edx + } +#endif +#elif defined(__GNUC__) && __GNUC__ +#if defined(FFTS_CPU_X64) + __asm__ __volatile__( + "cpuid\n\t" + : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3]) + : "a"(leaf), "c"(subleaf)); +#elif defined(__PIC__) + __asm__ __volatile__( + "xchgl %%ebx, %1\n\t" + "cpuid \n\t" + "xchgl %%ebx, %1\n\t" + : "=a"(regs[0]), "=r"(regs[1]), "=c"(regs[2]), "=d"(regs[3]) + : "a"(leaf), "c"(subleaf)); +#else + __asm__ __volatile__( + "cpuid\n\t" + : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3]) + : "a"(leaf), "c"(subleaf)); +#endif +#else + /* unknown compiler for x86 */ + regs[0] = regs[1] = regs[2] = regs[3] = 0; +#endif +} + +/* at least Visual Studio 2010 generates invalidate optimized _xgetbv */ +#if defined(FFTS_HAVE_XGETBV) +#pragma optimize("", off) +#endif +static FFTS_INLINE unsigned int ffts_get_xcr0(void) +{ +#if defined(FFTS_HAVE_XGETBV) + return (unsigned int) _xgetbv(0); +#elif defined(_MSC_VER) +#if defined(FFTS_CPU_X64) + /* emulate xgetbv(0) on Windows 7 SP1 or newer */ + typedef DWORD64 (WINAPI *PGETENABLEDXSTATEFEATURES)(VOID); + PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures = + (PGETENABLEDXSTATEFEATURES) GetProcAddress( + GetModuleHandle(TEXT("kernel32.dll")), "GetEnabledXStateFeatures"); + return pfnGetEnabledXStateFeatures ? (unsigned int) pfnGetEnabledXStateFeatures() : 0; +#else + /* note that we have to touch edx register to tell compiler it's used by emited xgetbv */ + unsigned __int32 hi, lo; + __asm { + xor ecx, ecx + _emit 0x0f + _emit 0x01 + _emit 0xd0 + mov lo, eax + mov hi, edx + } + return (unsigned int) lo; +#endif +#elif defined(__GNUC__) && __GNUC__ + unsigned int lo; + __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n" + : "=a"(lo) + : "c"(0) + : "edx"); + return lo; +#else + /* unknown x86 compiler */ + return 0; +#endif +} +#if defined(FFTS_HAVE_XGETBV) +#pragma optimize("", on) +#endif + +int +ffts_cpu_detect(int *extra_flags) +{ + static int cpu_flags = -1; + static int cpu_extra_flags = -1; + int max_basic_func; + int regs[4]; + unsigned int xcr0; + + if (cpu_flags >= 0) { + goto exit; + } + + /* initialize */ + cpu_flags = cpu_extra_flags = 0; + +#if defined(FFTS_BUILDING_CPU_TEST) + printf("cpuid check: "); +#endif +#if defined(FFTS_CPU_X64) + /* cpuid is always supported on x64 */ +#if defined(FFTS_BUILDING_CPU_TEST) + printf("skipped\n"); +#endif +#else +#if defined(_MSC_VER) + _asm { + pushfd + pop eax + mov ebx,eax + xor eax,200000h + push eax + popfd + pushfd + pop eax + push ebx + popfd + mov regs[0 * TYPE regs],eax + mov regs[1 * TYPE regs],ebx + } +#else + __asm__ ( + "pushfl\n\t" + "pop %0\n\t" + "movl %0,%1\n\t" + "xorl $0x200000,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "pushl %1\n\t" + "popfl\n\t" + : "=r" (regs[0]), "=r" (regs[1]) + ); +#endif + /* check CPUID bit (bit 21) in EFLAGS register can be toggled */ + if (((regs[0] ^ regs[1]) & 0x200000) == 0) { +#if defined(FFTS_BUILDING_CPU_TEST) + printf("not supported\n"); +#endif + goto exit; + } +#if defined(FFTS_BUILDING_CPU_TEST) + printf("supported\n"); +#endif +#endif + + /* get the number of basic functions */ + ffts_cpuid(regs, 0, 0); + max_basic_func = regs[0]; +#if defined(FFTS_BUILDING_CPU_TEST) + printf("cpuid eax=0, ecx=0: %d\n", max_basic_func); +#endif + if (max_basic_func == 0) + goto exit; + + /* get feature flags */ + ffts_cpuid(regs, 1, 0); + +#if defined(FFTS_BUILDING_CPU_TEST) + printf("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]); +#endif + +#if defined(FFTS_CPU_X64) + /* minimum for any x64 */ + cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2; +#else + /* test if SSE is supported */ + if ((regs[3] & FFTS_CPU_X86_SSE_BITS) != FFTS_CPU_X86_SSE_BITS) + goto exit; + cpu_flags = FFTS_CPU_X86_SSE; + + /* test if SSE2 is supported */ + if (!(regs[3] & FFTS_CPU_X86_SSE2_BITS)) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSE2; +#endif + + /* test if SSE3 is supported */ + if (!(regs[2] & FFTS_CPU_X86_SSE3_BITS)) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSE3; + + /* test if SSSE3 is supported */ + if (!(regs[2] & FFTS_CPU_X86_SSSE3_BITS)) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSSE3; + + /* test if SSE4.1 is supported */ + if (!(regs[2] & FFTS_CPU_X86_SSE4_1_BITS)) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSE4_1; + + /* test if SSE4.2 is supported */ + if ((regs[2] & FFTS_CPU_X86_SSE4_2_BITS) != FFTS_CPU_X86_SSE4_2_BITS) + goto exit; + cpu_flags |= FFTS_CPU_X86_SSE4_2; + + /* test if AVX is supported */ + if ((regs[2] & FFTS_CPU_X86_AVX_BITS) != FFTS_CPU_X86_AVX_BITS) + goto exit; + + /* test if legaxy x87, 128-bit SSE and 256-bit AVX states are enabled in XCR0 */ + xcr0 = ffts_get_xcr0(); +#if defined(FFTS_BUILDING_CPU_TEST) + printf("xcr0: %u\n", xcr0); +#endif + if ((xcr0 & 0x6) != 0x6) + goto exit; + + cpu_flags |= FFTS_CPU_X86_AVX; + + /* check that cpuid extended features exist */ + if (max_basic_func < 7) + goto exit; + + /* get extended features */ + ffts_cpuid(regs, 7, 0); + +#if defined(FFTS_BUILDING_CPU_TEST) + printf("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]); +#endif + + /* test if AVX2 is supported */ + if ((regs[1] & FFTS_CPU_X86_AVX2_BITS) != FFTS_CPU_X86_AVX2_BITS) + goto exit; + cpu_flags |= FFTS_CPU_X86_AVX2; + + /* test if AVX512 is supported */ + if ((regs[1] & FFTS_CPU_X86_AVX512_BITS) != FFTS_CPU_X86_AVX512_BITS) + goto exit; + cpu_flags |= FFTS_CPU_X86_AVX512; + +exit: + if (extra_flags) { + *extra_flags = cpu_extra_flags; + } + return cpu_flags; +} +#else +int +ffts_cpu_detect(int *extra_flags) +{ + /* not implemented */ +#if defined(FFTS_BUILDING_CPU_TEST) + printf("CPU detection not implemented!!\n"); +#endif + return 0; +} +#endif
\ No newline at end of file |