Diffstat (limited to 'debian/transcode/transcode-1.1.7/aclib/img_x86_common.h')
| -rw-r--r-- | debian/transcode/transcode-1.1.7/aclib/img_x86_common.h | 613 |
1 file changed, 613 insertions, 0 deletions
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h b/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h
new file mode 100644
index 00000000..13ed851f
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_x86_common.h
@@ -0,0 +1,613 @@
+/*
+ * img_x86_common.h - common x86/x86-64 assembly macros
+ * Written by Andrew Church <achurch@achurch.org>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later).  See the file COPYING
+ * for details.
+ */
+
+#ifndef ACLIB_IMG_X86_COMMON_H
+#define ACLIB_IMG_X86_COMMON_H
+
+/*************************************************************************/
+
+/* Register names for pointers */
+#ifdef ARCH_X86_64
+# define EAX "%%rax"
+# define EBX "%%rbx"
+# define ECX "%%rcx"
+# define EDX "%%rdx"
+# define ESP "%%rsp"
+# define EBP "%%rbp"
+# define ESI "%%rsi"
+# define EDI "%%rdi"
+#else
+# define EAX "%%eax"
+# define EBX "%%ebx"
+# define ECX "%%ecx"
+# define EDX "%%edx"
+# define ESP "%%esp"
+# define EBP "%%ebp"
+# define ESI "%%esi"
+# define EDI "%%edi"
+#endif
+
+/* Macros to push and pop one or two registers within an assembly block.
+ * The x86-64 ABI allows leaf functions to write to 128 bytes BELOW
+ * (yes, below) the stack pointer, so we can't just push our own stuff
+ * there.  Argh. */
+#ifdef ARCH_X86_64
+# define FAKE_PUSH_REG       "r12"
+# define FAKE_PUSH_REG_2     "r13"
+# define COMMA_FAKE_PUSH_REG ,FAKE_PUSH_REG
+# define PUSH(reg) "mov " reg ", %%" FAKE_PUSH_REG
+# define POP(reg)  "mov %%" FAKE_PUSH_REG ", " reg
+# define PUSH2(reg1,reg2) PUSH(reg1) "; mov " reg2 ", %%" FAKE_PUSH_REG_2
+# define POP2(reg2,reg1)  "mov %%" FAKE_PUSH_REG_2 ", " reg2 "; " POP(reg1)
+#else
+# define COMMA_FAKE_PUSH_REG /*nothing*/
+# define PUSH(reg) "push " reg
+# define POP(reg)  "pop " reg
+# define PUSH2(reg1,reg2) "push " reg1 "; push " reg2
+# define POP2(reg2,reg1)  "pop " reg2 "; pop " reg1
+#endif
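The fake-push macros above exist because of the x86-64 red zone: a leaf function may keep live data in the 128 bytes below %rsp, and a real `push` inside inline assembly would overwrite it, so a scratch register (%r12/%r13) stands in for the stack. A minimal usage sketch, not part of the patch, assuming GCC-style inline asm and ARCH_X86_64 set (or not) by the build system; the function itself is hypothetical:

    /* Preserve EBX across an asm block with PUSH()/POP().  On x86-64
     * this compiles to moves through %r12 (the "fake push" register);
     * on 32-bit x86 it uses real push/pop instructions. */
    static inline void spin_down(long n)   /* n must be > 0 */
    {
        asm(PUSH(EBX)"                  \n\
        0:  dec "EBX"                   \n\
            jnz 0b                      \n"
            POP(EBX)
            : /* no outputs */
            : "b" (n)
            : "cc" COMMA_FAKE_PUSH_REG);  /* clobbers %r12 on x86-64 */
    }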
+
+/* Data for isolating particular bytes.  Used by the SWAP32 macros; if you
+ * use them, make sure to define DEFINE_MASK_DATA before including this
+ * file! */
+#ifdef DEFINE_MASK_DATA
+static const struct { uint32_t n[64]; } __attribute__((aligned(16))) mask_data = {{
+    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
+    0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00,
+    0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
+    0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000,
+    0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
+    0x00FFFF00, 0x00FFFF00, 0x00FFFF00, 0x00FFFF00,
+    0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
+    0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
+    0xFF0000FF, 0xFF0000FF, 0xFF0000FF, 0xFF0000FF,
+    0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00,
+    0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF,
+    0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000,
+    0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF,
+    0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00,
+    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
+}};
+#endif
+
+/*************************************************************************/
+
+/* Basic assembly macros, used for odd-count loops */
+
+/* Swap bytes in pairs of 16-bit values */
+#define X86_SWAP16_2 \
+    "movl -4("ESI","ECX",4), %%eax      \n\
+    movl %%eax, %%edx                   \n\
+    shll $8, %%eax                      \n\
+    andl $0xFF00FF00, %%eax             \n\
+    shrl $8, %%edx                      \n\
+    andl $0x00FF00FF, %%edx             \n\
+    orl %%edx, %%eax                    \n\
+    movl %%eax, -4("EDI","ECX",4)"
+
+/* Swap words in a 32-bit value */
+#define X86_SWAP32 \
+    "movl -4("ESI","ECX",4), %%eax      \n\
+    roll $16, %%eax                     \n\
+    movl %%eax, -4("EDI","ECX",4)"
+
+/* Swap bytes 0 and 2 of a 32-bit value */
+#define X86_SWAP32_02 \
+    "movw -4("ESI","ECX",4), %%ax       \n\
+    movw -2("ESI","ECX",4), %%dx        \n\
+    xchg %%dl, %%al                     \n\
+    movw %%ax, -4("EDI","ECX",4)        \n\
+    movw %%dx, -2("EDI","ECX",4)"
+
+/* Swap bytes 1 and 3 of a 32-bit value */
+#define X86_SWAP32_13 \
+    "movw -4("ESI","ECX",4), %%ax       \n\
+    movw -2("ESI","ECX",4), %%dx        \n\
+    xchg %%dh, %%ah                     \n\
+    movw %%ax, -4("EDI","ECX",4)        \n\
+    movw %%dx, -2("EDI","ECX",4)"
+
+/* Reverse the order of bytes in a 32-bit value */
+#define X86_REV32 \
+    "movl -4("ESI","ECX",4), %%eax      \n\
+    xchg %%ah, %%al                     \n\
+    roll $16, %%eax                     \n\
+    xchg %%ah, %%al                     \n\
+    movl %%eax, -4("EDI","ECX",4)"
+
+/* The same, using the BSWAP instruction */
+#define X86_REV32_BSWAP \
+    "movl -4("ESI","ECX",4), %%eax      \n\
+    bswap %%eax                         \n\
+    movl %%eax, -4("EDI","ECX",4)"
+
+/* Rotate a 32-bit value left 8 bits */
+#define X86_ROL32 \
+    "movl -4("ESI","ECX",4), %%eax      \n\
+    roll $8, %%eax                      \n\
+    movl %%eax, -4("EDI","ECX",4)"
+
+/* Rotate a 32-bit value right 8 bits */
+#define X86_ROR32 \
+    "movl -4("ESI","ECX",4), %%eax      \n\
+    rorl $8, %%eax                      \n\
+    movl %%eax, -4("EDI","ECX",4)"
+
+/*************************************************************************/
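A note on how the SIMD code further down indexes mask_data: each mask occupies one 16-byte row (four copies of the same 32-bit value, so one row can feed a 64-bit MMX or 128-bit SSE2 `pand`), and row n sits at byte offset n*16, where the bits of n select which bytes of the 32-bit value are 0xFF (bit 0 selects byte 0, ..., bit 3 selects byte 3). Hence `pand 32("EDX")` applies 0x0000FF00 (byte 1) and `pand 160("EDX")` applies 0xFF00FF00 (bytes 1 and 3). A hypothetical helper, not in the patch, that makes the indexing explicit:

    #include <stdint.h>
    #define DEFINE_MASK_DATA           /* mask_data is only emitted on request */
    #include "img_x86_common.h"

    /* Return the 16-byte mask row whose set bytes match the bits of
     * `which'; e.g. mask_row(0x2) is four copies of 0x0000FF00, the
     * row the assembly reaches as 32(%edx). */
    static inline const uint32_t *mask_row(unsigned which)  /* which < 16 */
    {
        return &mask_data.n[which * 4];   /* rows are 16 bytes apart */
    }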
+
+/* Basic assembly routines.  Sizes are all given in 32-bit units. */
+
+#define ASM_SWAP16_2_X86(size) \
+    asm("0: "X86_SWAP16_2"              \n\
+        subl $1, %%ecx                  \n\
+        jnz 0b"                         \
+        : /* no outputs */              \
+        : "S" (src[0]), "D" (dest[0]), "c" (size) \
+        : "eax", "edx")
+
+#define ASM_SWAP32_X86(size) \
+    asm("0: "X86_SWAP32"                \n\
+        subl $1, %%ecx                  \n\
+        jnz 0b"                         \
+        : /* no outputs */              \
+        : "S" (src[0]), "D" (dest[0]), "c" (size) \
+        : "eax", "edx")
+
+#define ASM_SWAP32_02_X86(size) \
+    asm("0: "X86_SWAP32_02"             \n\
+        subl $1, %%ecx                  \n\
+        jnz 0b"                         \
+        : /* no outputs */              \
+        : "S" (src[0]), "D" (dest[0]), "c" (size) \
+        : "eax", "edx")
+
+#define ASM_SWAP32_13_X86(size) \
+    asm("0: "X86_SWAP32_13"             \n\
+        subl $1, %%ecx                  \n\
+        jnz 0b"                         \
+        : /* no outputs */              \
+        : "S" (src[0]), "D" (dest[0]), "c" (size) \
+        : "eax", "edx")
+
+#define ASM_REV32_X86(size) \
+    asm("0: "X86_REV32"                 \n\
+        subl $1, %%ecx                  \n\
+        jnz 0b"                         \
+        : /* no outputs */              \
+        : "S" (src[0]), "D" (dest[0]), "c" (size) \
+        : "eax")
+
+#define ASM_ROL32_X86(size) \
+    asm("0: "X86_ROL32"                 \n\
+        subl $1, %%ecx                  \n\
+        jnz 0b"                         \
+        : /* no outputs */              \
+        : "S" (src[0]), "D" (dest[0]), "c" (size) \
+        : "eax")
+
+#define ASM_ROR32_X86(size) \
+    asm("0: "X86_ROR32"                 \n\
+        subl $1, %%ecx                  \n\
+        jnz 0b"                         \
+        : /* no outputs */              \
+        : "S" (src[0]), "D" (dest[0]), "c" (size) \
+        : "eax")
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Wrapper for SIMD loops.  This generates the body of an asm() construct
+ * (the string only, not the input/output/clobber lists) given the data
+ * block size (number of data units processed per SIMD loop iteration),
+ * instructions to save and restore unclobberable registers (such as EBX),
+ * and the bodies of the odd-count and main loops.  The data count is
+ * assumed to be preloaded in ECX.  Parameters are:
+ *    blocksize: number of units of data processed per SIMD loop (must be
+ *                  a power of 2); can be a constant or a numerical
+ *                  expression containing only constants
+ *    push_regs: string constant containing instructions to push registers
+ *                  that must be saved over the small loop
+ *     pop_regs: string constant containing instructions to pop registers
+ *                  saved by `push_regs' (restored before the main loop)
+ *   small_loop: loop for handling data elements one at a time (when the
+ *                  count is not a multiple of `blocksize')
+ *    main_loop: main SIMD loop for processing data
+ *         emms: EMMS/SFENCE instructions to end main loop with, as needed
+ */
+
+#define SIMD_LOOP_WRAPPER(blocksize,push_regs,pop_regs,small_loop,main_loop,emms) \
+    /* Check whether the count is a multiple of the blocksize (this    \
+     * can cause branch mispredicts but seems to be faster overall) */ \
+    "testl $(("#blocksize")-1), %%ecx; "                               \
+    "jz 1f; "                                                          \
+    /* It's not--run the small loop to align the count */              \
+    push_regs"; "                                                      \
+    "0: "                                                              \
+    small_loop"; "                                                     \
+    "subl $1, %%ecx; "                                                 \
+    "testl $(("#blocksize")-1), %%ecx; "                               \
+    "jnz 0b; "                                                         \
+    pop_regs"; "                                                       \
+    /* Make sure there's some data left */                             \
+    "testl %%ecx, %%ecx; "                                             \
+    "jz 2f; "                                                          \
+    /* Now run the main SIMD loop */                                   \
+    "1: "                                                              \
+    main_loop"; "                                                      \
+    "subl $("#blocksize"), %%ecx; "                                    \
+    "jnz 1b; "                                                         \
+    /* Clear MMX state and/or SFENCE, as needed */                     \
+    emms"; "                                                           \
+    /* Done */                                                         \
+    "2: "
+
+/*************************************************************************/
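The wrapper's branch structure is easier to see in C. A sketch of the generated control flow, for illustration only (the real version is the asm string above, with the count live in ECX; note that the asm tests for zero only after the small loop, so callers must supply a nonzero initial count):

    /* C rendering of SIMD_LOOP_WRAPPER's shape: peel single elements
     * until the count is a multiple of blocksize, then do whole blocks.
     * Both loops count DOWN, which is why the loop bodies address data
     * as -4(ESI,ECX,4): element ECX-1 is processed each iteration. */
    static void simd_loop_shape(unsigned long count, unsigned long blocksize)
    {
        while (count & (blocksize - 1)) {
            /* small_loop: handle one element */
            count--;
        }
        while (count != 0) {
            /* main_loop: handle `blocksize' elements */
            count -= blocksize;
        }
        /* emms / sfence here, if the caller supplied them */
    }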
+
+/* MMX- and SSE2-optimized swap/rotate routines.  These routines are
+ * identical save for data size, so we use common macros to implement them,
+ * with register names and data offsets replaced by parameters to the
+ * macros. */
+
+#define ASM_SIMD_MMX(name,size) \
+    name((size), 64,                            \
+         "movq", "movq", "movq", "",            \
+         "%%mm0", "%%mm1", "%%mm2", "%%mm3",    \
+         "%%mm4", "%%mm5", "%%mm6", "%%mm7")
+#define ASM_SIMD_SSE2(name,size) \
+    name((size), 128,                           \
+         "movdqu", "movdqa", "movdqu", "",      \
+         "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\
+         "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")
+#define ASM_SIMD_SSE2_ALIGNED(name,size) \
+    name((size), 128,                           \
+         "movdqa", "movdqa", "movntdq", "sfence",\
+         "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3",\
+         "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")
+
+#define ASM_SWAP16_2_MMX(size)    ASM_SIMD_MMX(ASM_SWAP16_2_SIMD,(size))
+#define ASM_SWAP16_2_SSE2(size)   ASM_SIMD_SSE2(ASM_SWAP16_2_SIMD,(size))
+#define ASM_SWAP16_2_SSE2A(size)  ASM_SIMD_SSE2_ALIGNED(ASM_SWAP16_2_SIMD,(size))
+#define ASM_SWAP32_MMX(size)      ASM_SIMD_MMX(ASM_SWAP32_SIMD,(size))
+#define ASM_SWAP32_SSE2(size)     ASM_SIMD_SSE2(ASM_SWAP32_SIMD,(size))
+#define ASM_SWAP32_SSE2A(size)    ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_SIMD,(size))
+#define ASM_SWAP32_02_MMX(size)   ASM_SIMD_MMX(ASM_SWAP32_02_SIMD,(size))
+#define ASM_SWAP32_02_SSE2(size)  ASM_SIMD_SSE2(ASM_SWAP32_02_SIMD,(size))
+#define ASM_SWAP32_02_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_02_SIMD,(size))
+#define ASM_SWAP32_13_MMX(size)   ASM_SIMD_MMX(ASM_SWAP32_13_SIMD,(size))
+#define ASM_SWAP32_13_SSE2(size)  ASM_SIMD_SSE2(ASM_SWAP32_13_SIMD,(size))
+#define ASM_SWAP32_13_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_13_SIMD,(size))
+#define ASM_REV32_MMX(size)       ASM_SIMD_MMX(ASM_REV32_SIMD,(size))
+#define ASM_REV32_SSE2(size)      ASM_SIMD_SSE2(ASM_REV32_SIMD,(size))
+#define ASM_REV32_SSE2A(size)     ASM_SIMD_SSE2_ALIGNED(ASM_REV32_SIMD,(size))
+#define ASM_ROL32_MMX(size)       ASM_SIMD_MMX(ASM_ROL32_SIMD,(size))
+#define ASM_ROL32_SSE2(size)      ASM_SIMD_SSE2(ASM_ROL32_SIMD,(size))
+#define ASM_ROL32_SSE2A(size)     ASM_SIMD_SSE2_ALIGNED(ASM_ROL32_SIMD,(size))
+#define ASM_ROR32_MMX(size)       ASM_SIMD_MMX(ASM_ROR32_SIMD,(size))
+#define ASM_ROR32_SSE2(size)      ASM_SIMD_SSE2(ASM_ROR32_SIMD,(size))
+#define ASM_ROR32_SSE2A(size)     ASM_SIMD_SSE2_ALIGNED(ASM_ROR32_SIMD,(size))
+
+/*************************************************************************/
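How these pieces are meant to be combined, as a hedged sketch: the function and the HAVE_ASM_* feature macros below are hypothetical stand-ins for whatever CPU/feature detection the including code actually uses; `src`/`dest` follow the `src[0]`/`dest[0]` naming that the macros hard-code, and the pixel count must be nonzero:

    #include <stdint.h>
    #define DEFINE_MASK_DATA     /* SWAP32_02 variants need mask_data */
    #include "img_x86_common.h"

    /* Hypothetical dispatcher for a red/blue byte swap (e.g. RGBA<->BGRA). */
    static int swap_rb(uint8_t **src, uint8_t **dest, int width, int height)
    {
        int size = width * height;         /* 32-bit pixels; must be > 0 */
    #if defined(HAVE_ASM_SSE2)
        ASM_SWAP32_02_SSE2(size);          /* 4 pixels per iteration */
    #elif defined(HAVE_ASM_MMX)
        ASM_SWAP32_02_MMX(size);           /* 2 pixels per iteration */
    #else
        ASM_SWAP32_02_X86(size);           /* 1 pixel per iteration */
    #endif
        return 1;
    }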
+
+/* Actual implementations.  Note that unrolling the SIMD loops doesn't seem
+ * to be a win (only 2-3% improvement at most), and in fact can lose by a
+ * bit in short loops. */
+
+#define ASM_SWAP16_2_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+    asm(SIMD_LOOP_WRAPPER(                                      \
+        /* blocksize  */ (regsize)/32,                          \
+        /* push_regs  */ "",                                    \
+        /* pop_regs   */ "",                                    \
+        /* small_loop */ X86_SWAP16_2,                          \
+        /* main_loop  */                                        \
+        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0"              \n\
+                                # MM0: 7 6 5 4 3 2 1 0          \n\
+        "movq" "MM0", "MM1"     # MM1: 7 6 5 4 3 2 1 0          \n\
+        psrlw $8, "MM0"         # MM0: - 7 - 5 - 3 - 1          \n\
+        psllw $8, "MM1"         # MM1: 6 - 4 - 2 - 0 -          \n\
+        por "MM1", "MM0"        # MM0: 6 7 4 5 2 3 0 1          \n\
+        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)",           \
+        /* emms */ "emms; "sfence)                              \
+        : /* no outputs */                                      \
+        : "S" (src[0]), "D" (dest[0]), "c" (size)               \
+        : "eax", "edx")
+
+#define ASM_SWAP32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+    asm(SIMD_LOOP_WRAPPER(                                      \
+        /* blocksize  */ (regsize)/32,                          \
+        /* push_regs  */ "",                                    \
+        /* pop_regs   */ "",                                    \
+        /* small_loop */ X86_SWAP32,                            \
+        /* main_loop  */                                        \
+        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0"              \n\
+                                # MM0: 7 6 5 4 3 2 1 0          \n\
+        "movq" "MM0", "MM1"     # MM1: 7 6 5 4 3 2 1 0          \n\
+        psrld $16, "MM0"        # MM0: - - 7 6 - - 3 2          \n\
+        pslld $16, "MM1"        # MM1: 5 4 - - 1 0 - -          \n\
+        por "MM1", "MM0"        # MM0: 5 4 7 6 1 0 3 2          \n\
+        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)",           \
+        /* emms */ "emms; "sfence)                              \
+        : /* no outputs */                                      \
+        : "S" (src[0]), "D" (dest[0]), "c" (size)               \
+        : "eax")
+
+#define ASM_SWAP32_02_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+    asm(SIMD_LOOP_WRAPPER(                                      \
+        /* blocksize  */ (regsize)/32,                          \
+        /* push_regs  */ "push "EDX,                            \
+        /* pop_regs   */ "pop "EDX,                             \
+        /* small_loop */ X86_SWAP32_02,                         \
+        /* main_loop  */                                        \
+        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0"              \n\
+                                # MM0: 7 6 5 4 3 2 1 0          \n\
+        "movq" "MM0", "MM1"     # MM1: 7 6 5 4 3 2 1 0          \n\
+        "movq" "MM0", "MM2"     # MM2: 7 6 5 4 3 2 1 0          \n\
+        pand 16("EDX"), "MM1"   # MM1: - - - 4 - - - 0          \n\
+        pslld $16, "MM1"        # MM1: - 4 - - - 0 - -          \n\
+        pand 64("EDX"), "MM2"   # MM2: - 6 - - - 2 - -          \n\
+        psrld $16, "MM2"        # MM2: - - - 6 - - - 2          \n\
+        pand 160("EDX"), "MM0"  # MM0: 7 - 5 - 3 - 1 -          \n\
+        por "MM1", "MM0"        # MM0: 7 4 5 - 3 0 1 -          \n\
+        por "MM2", "MM0"        # MM0: 7 4 5 6 3 0 1 2          \n\
+        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)",           \
+        /* emms */ "emms; "sfence)                              \
+        : /* no outputs */                                      \
+        : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
+          "m" (mask_data)                                       \
+        : "eax")
+
+#define ASM_SWAP32_13_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
+    asm(SIMD_LOOP_WRAPPER(                                      \
+        /* blocksize  */ (regsize)/32,                          \
+        /* push_regs  */ "push "EDX,                            \
+        /* pop_regs   */ "pop "EDX,                             \
+        /* small_loop */ X86_SWAP32_13,                         \
+        /* main_loop  */                                        \
+        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0"              \n\
+                                # MM0: 7 6 5 4 3 2 1 0          \n\
+        "movq" "MM0", "MM1"     # MM1: 7 6 5 4 3 2 1 0          \n\
+        "movq" "MM0", "MM2"     # MM2: 7 6 5 4 3 2 1 0          \n\
+        pand 32("EDX"), "MM1"   # MM1: - - 5 - - - 1 -          \n\
+        pslld $16, "MM1"        # MM1: 5 - - - 1 - - -          \n\
+        pand 128("EDX"), "MM2"  # MM2: 7 - - - 3 - - -          \n\
+        psrld $16, "MM2"        # MM2: - - 7 - - - 3 -          \n\
+        pand 80("EDX"), "MM0"   # MM0: - 6 - 4 - 2 - 0          \n\
+        por "MM1", "MM0"        # MM0: 5 6 - 4 1 2 - 0          \n\
+        por "MM2", "MM0"        # MM0: 5 6 7 4 1 2 3 0          \n\
+        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)",           \
+        /* emms */ "emms; "sfence)                              \
+        : /* no outputs */                                      \
+        : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
+          "m" (mask_data)                                       \
+        : "eax")
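For reference, the per-pixel effect of the SWAP32_02 mask-and-shift sequence above, written as scalar C (illustration only, not in the patch): bytes 1 and 3 pass straight through the 0xFF00FF00 mask, byte 0 shifts up to position 2, and byte 2 shifts down to position 0, which is exactly a red/blue exchange for 32-bit pixels.

    #include <stdint.h>
    /* Scalar model of one ASM_SWAP32_02_SIMD element. */
    static inline uint32_t swap32_02(uint32_t x)
    {
        return (x & 0xFF00FF00u)            /* bytes 3 and 1 unchanged */
             | ((x & 0x000000FFu) << 16)    /* byte 0 -> byte 2 */
             | ((x & 0x00FF0000u) >> 16);   /* byte 2 -> byte 0 */
    }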
"", \ + /* small_loop */ X86_REV32_BSWAP, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM3" # MM3: 7 6 5 4 3 2 1 0 \n\ + psrld $24, "MM0" # MM0: - - - 7 - - - 3 \n\ + pand 32("EDX"), "MM2" # MM2: - - 5 - - - 1 - \n\ + psrld $8, "MM1" # MM1: - 7 6 5 - 3 2 1 \n\ + pand 32("EDX"), "MM1" # MM1: - - 6 - - - 2 - \n\ + pslld $8, "MM2" # MM2: - 5 - - - 1 - - \n\ + pslld $24, "MM3" # MM3: 4 - - - 0 - - - \n\ + por "MM1", "MM0" # MM0: - - 6 7 - - 2 3 \n\ + por "MM2", "MM0" # MM0: - 5 6 7 - 1 2 3 \n\ + por "MM3", "MM0" # MM0: 4 5 6 7 0 1 2 3 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \ + "m" (mask_data) \ + : "eax") + +#define ASM_ROL32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_ROL32, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + pslld $8, "MM0" # MM0: 6 5 4 - 2 1 0 - \n\ + psrld $24, "MM1" # MM1: - - - 7 - - - 3 \n\ + por "MM1", "MM0" # MM0: 6 5 4 7 2 1 0 3 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +#define ASM_ROR32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ + asm(SIMD_LOOP_WRAPPER( \ + /* blocksize */ (regsize)/32, \ + /* push_regs */ "", \ + /* pop_regs */ "", \ + /* small_loop */ X86_ROR32, \ + /* main_loop */ \ + ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ + # MM0: 7 6 5 4 3 2 1 0 \n\ + "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ + psrld $8, "MM0" # MM0: - 7 6 5 - 3 2 1 \n\ + pslld $24, "MM1" # MM1: 4 - - - 0 - - - \n\ + por "MM1", "MM0" # MM0: 4 7 6 5 0 3 2 1 \n\ + "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ + /* emms */ "emms; "sfence) \ + : /* no outputs */ \ + : "S" (src[0]), "D" (dest[0]), "c" (size) \ + : "eax") + +/*************************************************************************/ + +/* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as + * 16-bit values, used for RGB->YUV and RGB->grayscale conversions. + * ZERO is the number of the XMM register containing all zeroes. 
+
+/* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as
+ * 16-bit values, used for RGB->YUV and RGB->grayscale conversions.
+ * ZERO is the number of the XMM register containing all zeroes. */
+
+#define SSE2_LOAD_RGB24(ZERO) \
+    "movl -21("ESI","EBX"), %%eax                                       \n\
+    movd %%eax, %%xmm0           # XMM0: ----- ----- ----- xBGR1        \n\
+    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR1 ----- ----- -----        \n\
+    movl -18("ESI","EBX"), %%eax                                        \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm0           # XMM0: xBGR1 ----- ----- xBGR2        \n\
+    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR2 xBGR1 ----- -----        \n\
+    movl -15("ESI","EBX"), %%eax                                        \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm0           # XMM0: xBGR2 xBGR1 ----- xBGR3        \n\
+    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xBGR3 xBGR2 xBGR1 -----        \n\
+    movl -24("ESI","EBX"), %%eax                                        \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm0           # XMM0: xBGR3 xBGR2 xBGR1 xBGR0        \n\
+    movl -9("ESI","EBX"), %%eax                                         \n\
+    movd %%eax, %%xmm1           # XMM1: ----- ----- ----- xBGR5        \n\
+    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR5 ----- ----- -----        \n\
+    movl -6("ESI","EBX"), %%eax                                         \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm1           # XMM1: xBGR5 ----- ----- xBGR6        \n\
+    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR6 xBGR5 ----- -----        \n\
+    movl -3("ESI","EBX"), %%eax                                         \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm1           # XMM1: xBGR6 xBGR5 ----- xBGR7        \n\
+    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xBGR7 xBGR6 xBGR5 -----        \n\
+    movl -12("ESI","EBX"), %%eax                                        \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm1           # XMM1: xBGR7 xBGR6 xBGR5 xBGR4       \n"\
+    SSE2_MASSAGE_RGBA32(ZERO)
+
+#define SSE2_LOAD_BGR24(ZERO) \
+    "movl -21("ESI","EBX"), %%eax                                       \n\
+    movd %%eax, %%xmm0           # XMM0: ----- ----- ----- xRGB1        \n\
+    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB1 ----- ----- -----        \n\
+    movl -18("ESI","EBX"), %%eax                                        \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm0           # XMM0: xRGB1 ----- ----- xRGB2        \n\
+    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB2 xRGB1 ----- -----        \n\
+    movl -15("ESI","EBX"), %%eax                                        \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm0           # XMM0: xRGB2 xRGB1 ----- xRGB3        \n\
+    pshufd $0x39, %%xmm0, %%xmm0 # XMM0: xRGB3 xRGB2 xRGB1 -----        \n\
+    movl -24("ESI","EBX"), %%eax                                        \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm0           # XMM0: xRGB3 xRGB2 xRGB1 xRGB0        \n\
+    movl -9("ESI","EBX"), %%eax                                         \n\
+    movd %%eax, %%xmm1           # XMM1: ----- ----- ----- xRGB5        \n\
+    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB5 ----- ----- -----        \n\
+    movl -6("ESI","EBX"), %%eax                                         \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm1           # XMM1: xRGB5 ----- ----- xRGB6        \n\
+    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB6 xRGB5 ----- -----        \n\
+    movl -3("ESI","EBX"), %%eax                                         \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm1           # XMM1: xRGB6 xRGB5 ----- xRGB7        \n\
+    pshufd $0x39, %%xmm1, %%xmm1 # XMM1: xRGB7 xRGB6 xRGB5 -----        \n\
+    movl -12("ESI","EBX"), %%eax                                        \n\
+    movd %%eax, %%xmm2                                                  \n\
+    por %%xmm2, %%xmm1           # XMM1: xRGB7 xRGB6 xRGB5 xRGB4       \n"\
+    SSE2_MASSAGE_BGRA32(ZERO)
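A note on the 24-bit loaders' addressing, with a hypothetical helper (not in the patch) that decodes the magic offsets: ESI+EBX points one byte past the last pixel of the current 8-pixel group, so pixel i of the group starts at offset 3*i - 24, which is where the values -24, -21, ..., -3 come from. Each `movl` reads four bytes, three pixel bytes plus one stray byte (the "x" in the register comments) that the conversion code downstream ignores.

    #include <stdint.h>
    /* Address of pixel i (0..7) within a group, given a pointer one
     * past the group's last byte, mirroring -24+3*i(ESI,EBX) above. */
    static inline const uint8_t *rgb24_pixel(const uint8_t *end8, int i)
    {
        return end8 + 3 * i - 24;
    }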
+
+#define SSE2_LOAD_RGBA32(ZERO) "\
+    movdqu -32("ESI","ECX",4), %%xmm0 # XMM0: ABGR3 ABGR2 ABGR1 ABGR0   \n\
+    movdqu -16("ESI","ECX",4), %%xmm1 # XMM1: ABGR7 ABGR6 ABGR5 ABGR4  \n"\
+    SSE2_MASSAGE_RGBA32(ZERO)
+#define SSE2_MASSAGE_RGBA32(ZERO) "\
+    movdqa %%xmm0, %%xmm2       # XMM2: ABGR3 ABGR2 ABGR1 ABGR0         \n\
+    punpcklbw %%xmm1, %%xmm0    # X0.l: A4 A0 B4 B0 G4 G0 R4 R0         \n\
+    punpckhbw %%xmm1, %%xmm2    # X2.l: A6 A2 B6 B2 G6 G2 R6 R2         \n\
+    movdqa %%xmm0, %%xmm1       # X1.l: A4 A0 B4 B0 G4 G0 R4 R0         \n\
+    punpcklbw %%xmm2, %%xmm0    # X0.l: G6 G4 G2 G0 R6 R4 R2 R0         \n\
+    punpckhbw %%xmm2, %%xmm1    # X1.l: G7 G5 G3 G1 R7 R5 R3 R1         \n\
+    movdqa %%xmm0, %%xmm2       # X2.l: G6 G4 G2 G0 R6 R4 R2 R0         \n\
+    punpcklbw %%xmm1, %%xmm0    # XMM0: G7.......G0 R7.......R0         \n\
+    punpckhbw %%xmm1, %%xmm2    # XMM2: A7.......A0 B7.......B0         \n\
+    movdqa %%xmm0, %%xmm1       # XMM1: G7.......G0 R7.......R0         \n\
+    punpcklbw %%xmm4, %%xmm0    # XMM0: R7 R6 R5 R4 R3 R2 R1 R0         \n\
+    punpckhbw %%xmm4, %%xmm1    # XMM1: G7 G6 G5 G4 G3 G2 G1 G0         \n\
+    punpcklbw %%xmm4, %%xmm2    # XMM2: B7 B6 B5 B4 B3 B2 B1 B0         \n"
+
+#define SSE2_LOAD_BGRA32(ZERO) "\
+    movdqu -32("ESI","ECX",4), %%xmm0 # XMM0: ARGB3 ARGB2 ARGB1 ARGB0   \n\
+    movdqu -16("ESI","ECX",4), %%xmm1 # XMM1: ARGB7 ARGB6 ARGB5 ARGB4  \n"\
+    SSE2_MASSAGE_BGRA32(ZERO)
+#define SSE2_MASSAGE_BGRA32(ZERO) "\
+    movdqa %%xmm0, %%xmm2       # XMM2: ARGB3 ARGB2 ARGB1 ARGB0         \n\
+    punpcklbw %%xmm1, %%xmm2    # X2.l: A4 A0 R4 R0 G4 G0 B4 B0         \n\
+    punpckhbw %%xmm1, %%xmm0    # X0.l: A6 A2 R6 R2 G6 G2 B6 B2         \n\
+    movdqa %%xmm2, %%xmm1       # X1.l: A4 A0 R4 R0 G4 G0 B4 B0         \n\
+    punpcklbw %%xmm0, %%xmm2    # X2.l: G6 G4 G2 G0 B6 B4 B2 B0         \n\
+    punpckhbw %%xmm0, %%xmm1    # X1.l: G7 G5 G3 G1 B7 B5 B3 B1         \n\
+    movdqa %%xmm2, %%xmm0       # X0.l: G6 G4 G2 G0 B6 B4 B2 B0         \n\
+    punpcklbw %%xmm1, %%xmm2    # XMM2: G7.......G0 B7.......B0         \n\
+    punpckhbw %%xmm1, %%xmm0    # XMM0: A7.......A0 R7.......R0         \n\
+    movdqa %%xmm2, %%xmm1       # XMM1: G7.......G0 B7.......B0         \n\
+    punpcklbw %%xmm4, %%xmm0    # XMM0: R7 R6 R5 R4 R3 R2 R1 R0         \n\
+    punpckhbw %%xmm4, %%xmm1    # XMM1: G7 G6 G5 G4 G3 G2 G1 G0         \n\
+    punpcklbw %%xmm4, %%xmm2    # XMM2: B7 B6 B5 B4 B3 B2 B1 B0         \n"
+
+#define SSE2_LOAD_ARGB32(ZERO) "\
+    movdqu -32("ESI","ECX",4), %%xmm0 # XMM0: BGRA3 BGRA2 BGRA1 BGRA0   \n\
+    movdqu -16("ESI","ECX",4), %%xmm1 # XMM1: BGRA7 BGRA6 BGRA5 BGRA4  \n"\
+    SSE2_MASSAGE_ARGB32(ZERO)
+#define SSE2_MASSAGE_ARGB32(ZERO) "\
+    movdqa %%xmm0, %%xmm2       # XMM2: BGRA3 BGRA2 BGRA1 BGRA0         \n\
+    punpcklbw %%xmm1, %%xmm0    # X0.l: B4 B0 G4 G0 R4 R0 A4 A0         \n\
+    punpckhbw %%xmm1, %%xmm2    # X2.l: B6 B2 G6 G2 R6 R2 A6 A2         \n\
+    movdqa %%xmm0, %%xmm1       # X1.l: B4 B0 G4 G0 R4 R0 A4 A0         \n\
+    punpcklbw %%xmm2, %%xmm0    # X0.l: R6 R4 R2 R0 A6 A4 A2 A0         \n\
+    punpckhbw %%xmm2, %%xmm1    # X1.l: R7 R5 R3 R1 A7 A5 A3 A1         \n\
+    movdqa %%xmm0, %%xmm2       # X2.l: R6 R4 R2 R0 A6 A4 A2 A0         \n\
+    punpcklbw %%xmm1, %%xmm0    # XMM0: R7.......R0 A7.......A0         \n\
+    punpckhbw %%xmm1, %%xmm2    # XMM2: B7.......B0 G7.......G0         \n\
+    movdqa %%xmm2, %%xmm1       # XMM1: B7.......B0 G7.......G0         \n\
+    punpckhbw %%xmm4, %%xmm0    # XMM0: R7 R6 R5 R4 R3 R2 R1 R0         \n\
+    punpcklbw %%xmm4, %%xmm1    # XMM1: G7 G6 G5 G4 G3 G2 G1 G0         \n\
+    punpckhbw %%xmm4, %%xmm2    # XMM2: B7 B6 B5 B4 B3 B2 B1 B0         \n"
+
+#define SSE2_LOAD_ABGR32(ZERO) "\
+    movdqu -32("ESI","ECX",4), %%xmm0 # XMM0: RGBA3 RGBA2 RGBA1 RGBA0   \n\
+    movdqu -16("ESI","ECX",4), %%xmm1 # XMM1: RGBA7 RGBA6 RGBA5 RGBA4  \n"\
+    SSE2_MASSAGE_ABGR32(ZERO)
+#define SSE2_MASSAGE_ABGR32(ZERO) "\
+    movdqa %%xmm0, %%xmm2       # XMM2: RGBA3 RGBA2 RGBA1 RGBA0         \n\
+    punpcklbw %%xmm1, %%xmm2    # X2.l: R4 R0 G4 G0 B4 B0 A4 A0         \n\
+    punpckhbw %%xmm1, %%xmm0    # X0.l: R6 R2 G6 G2 B6 B2 A6 A2         \n\
+    movdqa %%xmm2, %%xmm1       # X1.l: R4 R0 G4 G0 B4 B0 A4 A0         \n\
+    punpcklbw %%xmm0, %%xmm2    # X2.l: B6 B4 B2 B0 A6 A4 A2 A0         \n\
+    punpckhbw %%xmm0, %%xmm1    # X1.l: B7 B5 B3 B1 A7 A5 A3 A1         \n\
+    movdqa %%xmm2, %%xmm0       # X0.l: B6 B4 B2 B0 A6 A4 A2 A0         \n\
+    punpcklbw %%xmm1, %%xmm2    # XMM2: B7.......B0 A7.......A0         \n\
+    punpckhbw %%xmm1, %%xmm0    # XMM0: R7.......R0 G7.......G0         \n\
+    movdqa %%xmm0, %%xmm1       # XMM1: R7.......R0 G7.......G0         \n\
+    punpckhbw %%xmm4, %%xmm0    # XMM0: R7 R6 R5 R4 R3 R2 R1 R0         \n\
+    punpcklbw %%xmm4, %%xmm1    # XMM1: G7 G6 G5 G4 G3 G2 G1 G0         \n\
+    punpckhbw %%xmm4, %%xmm2    # XMM2: B7 B6 B5 B4 B3 B2 B1 B0         \n"
+
+/*************************************************************************/
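The MASSAGE macros above deinterleave with three rounds of punpcklbw/punpckhbw, effectively a byte transpose, then widen each channel against a zero register (note that as transcribed here the widening step uses %%xmm4 literally, regardless of the ZERO argument, so callers evidently must keep %xmm4 zeroed). The same computation in SSE2 intrinsics, as an unofficial cross-check; the function name and layout are illustrative, not the patch's:

    #include <stdint.h>
    #include <emmintrin.h>
    /* Intrinsics rendering of SSE2_LOAD_RGBA32 + SSE2_MASSAGE_RGBA32:
     * split 8 RGBA pixels into 16-bit-per-sample R/G/B vectors. */
    static void load_rgba32_planes(const uint8_t *src,
                                   __m128i *r, __m128i *g, __m128i *b)
    {
        __m128i x0 = _mm_loadu_si128((const __m128i *)src);        /* px 0-3 */
        __m128i x1 = _mm_loadu_si128((const __m128i *)(src + 16)); /* px 4-7 */
        __m128i zero = _mm_setzero_si128();
        /* Three interleave rounds act as a byte transpose. */
        __m128i t0 = _mm_unpacklo_epi8(x0, x1);  /* R0 R4 G0 G4 B0 B4 ... */
        __m128i t1 = _mm_unpackhi_epi8(x0, x1);  /* R2 R6 G2 G6 B2 B6 ... */
        __m128i u0 = _mm_unpacklo_epi8(t0, t1);  /* R0 R2 R4 R6 G0 G2 ... */
        __m128i u1 = _mm_unpackhi_epi8(t0, t1);  /* R1 R3 R5 R7 G1 G3 ... */
        __m128i rg = _mm_unpacklo_epi8(u0, u1);  /* R0..R7 G0..G7 */
        __m128i ba = _mm_unpackhi_epi8(u0, u1);  /* B0..B7 A0..A7 */
        *r = _mm_unpacklo_epi8(rg, zero);        /* widen bytes to 16 bits */
        *g = _mm_unpackhi_epi8(rg, zero);
        *b = _mm_unpacklo_epi8(ba, zero);
    }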
+
+#endif  /* ACLIB_IMG_X86_COMMON_H */
+
+/*
+ * Local variables:
+ *   c-file-style: "stroustrup"
+ *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ *   indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */
