1 files changed, 2410 insertions, 0 deletions
diff --git a/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c b/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c
new file mode 100644
index 00000000..9dc04fcb
--- /dev/null
+++ b/debian/transcode/transcode-1.1.7/aclib/img_yuv_rgb.c
@@ -0,0 +1,2410 @@
+/*
+ * img_yuv_rgb.c - YUV<->RGB image format conversion routines
+ * Written by Andrew Church <achurch@achurch.org>
+ *
+ * This file is part of transcode, a video stream processing tool.
+ * transcode is free software, distributable under the terms of the GNU
+ * General Public License (version 2 or later).  See the file COPYING
+ * for details.
+ */
+
+#include "ac.h"
+#include "ac_internal.h"
+#include "imgconvert.h"
+#include "img_internal.h"
+
+#include <string.h>
+
+#define USE_LOOKUP_TABLES  /* for YUV420P->RGB24 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Standard C implementations */
+
+const int cY  =  76309;
+const int crV = 104597;
+const int cgU = -25675;
+const int cgV = -53279;
+const int cbU = 132201;
+
+/*************************************************************************/
+
+#ifdef USE_LOOKUP_TABLES
+# define TABLE_SCALE 16   /* scale factor for Y */
+static int Ylutbase[768*TABLE_SCALE];
+static int *Ylut = Ylutbase+256*TABLE_SCALE;
+static int rVlut[256];
+static int gUlut[256];
+static int gVlut[256];
+static int bUlut[256];
+static void yuv_create_tables(void) {
+    static int yuv_tables_created = 0;
+    if (!yuv_tables_created) {
+        int i;
+        for (i = -256*TABLE_SCALE; i < 512*TABLE_SCALE; i++) {
+            int v = ((cY*(i-16*TABLE_SCALE)/TABLE_SCALE) + 32768) >> 16;
+            Ylut[i] = v<0 ? 0 : v>255 ? 255 : v;
+        }
+        for (i = 0; i < 256; i++) {
+            rVlut[i] = ((crV * (i-128)) * TABLE_SCALE + cY/2) / cY;
+            gUlut[i] = ((cgU * (i-128)) * TABLE_SCALE + cY/2) / cY;
+            gVlut[i] = ((cgV * (i-128)) * TABLE_SCALE + cY/2) / cY;
+            bUlut[i] = ((cbU * (i-128)) * TABLE_SCALE + cY/2) / cY;
+        }
+        yuv_tables_created = 1;
+    }
+}
+# define YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs) do {               \
+    int Y = src[0][y*width+x] * TABLE_SCALE;                    \
+    int U = src[1][(uvofs)];                                    \
+    int V = src[2][(uvofs)];                                    \
+    dest[0][(y*width+x)*rgbsz+rofs] = Ylut[Y+rVlut[V]];         \
+    dest[0][(y*width+x)*rgbsz+gofs] = Ylut[Y+gUlut[U]+gVlut[V]];\
+    dest[0][(y*width+x)*rgbsz+bofs] = Ylut[Y+bUlut[U]];         \
+} while (0)
+# define YUV2RGB_PACKED(yofs,uofs,vofs,rgbsz,rofs,gofs,bofs) do { \
+    int Y = src[0][(y*width+x)*2+yofs] * TABLE_SCALE;           \
+    int U = src[0][(y*width+(x&~1))*2+uofs];                    \
+    int V = src[0][(y*width+(x&~1))*2+vofs];                    \
+    dest[0][(y*width+x)*rgbsz+rofs] = Ylut[Y+rVlut[V]];         \
+    dest[0][(y*width+x)*rgbsz+gofs] = Ylut[Y+gUlut[U]+gVlut[V]];\
+    dest[0][(y*width+x)*rgbsz+bofs] = Ylut[Y+bUlut[U]];         \
+} while (0)
+#else  /* !USE_LOOKUP_TABLES */
+# define yuv_create_tables() /*nothing*/
+# define YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs) do {               \
+    int Y = cY * (src[0][y*width+x] - 16);                      \
+    int U = src[1][(uvofs)] - 128;                              \
+    int V = src[2][(uvofs)] - 128;                              \
+    int r = (Y + crV*V + 32768) >> 16;                          \
+    int g = (Y + cgU*U + cgV*V + 32768) >> 16;                  \
+    int b = (Y + cbU*U + 32768) >> 16;                          \
+    dest[0][(y*width+x)*rgbsz+rofs] = r<0 ? 0 : r>255 ? 255 : r;\
+    dest[0][(y*width+x)*rgbsz+gofs] = g<0 ? 0 : g>255 ? 255 : g;\
+    dest[0][(y*width+x)*rgbsz+bofs] = b<0 ? 0 : b>255 ? 255 : b;\
+} while (0)
+# define YUV2RGB_PACKED(yofs,uofs,vofs,rgbsz,rofs,gofs,bofs) do { \
+    int Y = cY * (src[0][(y*width+x)*2+yofs] - 16);             \
+    int U = src[0][(y*width+(x&~1))*2+uofs] - 128;              \
+    int V = src[0][(y*width+(x&~1))*2+vofs] - 128;              \
+    int r = (Y + crV*V + 32768) >> 16;                          \
+    int g = (Y + cgU*U + cgV*V + 32768) >> 16;                  \
+    int b = (Y + cbU*U + 32768) >> 16;                          \
+    dest[0][(y*width+x)*rgbsz+rofs] = r<0 ? 0 : r>255 ? 255 : r;\
+    dest[0][(y*width+x)*rgbsz+gofs] = g<0 ? 0 : g>255 ? 255 : g;\
+    dest[0][(y*width+x)*rgbsz+bofs] = b<0 ? 0 : b>255 ? 255 : b;\
+} while (0)
+#endif
+
+#define YUV2RGB_420P(s,r,g,b)  YUV2RGB((y/2)*(width/2)+(x/2),s,r,g,b)
+#define YUV2RGB_411P(s,r,g,b)  YUV2RGB((y  )*(width/4)+(x/4),s,r,g,b)
+#define YUV2RGB_422P(s,r,g,b)  YUV2RGB((y  )*(width/2)+(x/2),s,r,g,b)
+#define YUV2RGB_444P(s,r,g,b)  YUV2RGB((y  )*(width  )+(x  ),s,r,g,b)
+#define YUV2RGB_YUY2(s,r,g,b)  YUV2RGB_PACKED(0,1,3,         s,r,g,b)
+#define YUV2RGB_UYVY(s,r,g,b)  YUV2RGB_PACKED(1,0,2,         s,r,g,b)
+#define YUV2RGB_YVYU(s,r,g,b)  YUV2RGB_PACKED(0,3,1,         s,r,g,b)
+
+#define DEFINE_YUV2RGB(name,op) \
+static int name(uint8_t **src, uint8_t **dest, int width, int height)   \
+{                                                                       \
+    int x, y;                                                           \
+                                                                        \
+    yuv_create_tables();                                                \
+    for (y = 0; y < height; y++) {                                      \
+        for (x = 0; x < width; x++) {                                   \
+            op;                                                         \
+        }                                                               \
+    }                                                                   \
+    return 1;                                                           \
+}
+
+#define DEFINE_YUV2RGB_SET(rgb,rgbsz,rofs,gofs,bofs) \
+    DEFINE_YUV2RGB(yuv420p_##rgb, YUV2RGB_420P(rgbsz,rofs,gofs,bofs)) \
+    DEFINE_YUV2RGB(yuv411p_##rgb, YUV2RGB_411P(rgbsz,rofs,gofs,bofs)) \
+    DEFINE_YUV2RGB(yuv422p_##rgb, YUV2RGB_422P(rgbsz,rofs,gofs,bofs)) \
+    DEFINE_YUV2RGB(yuv444p_##rgb, YUV2RGB_444P(rgbsz,rofs,gofs,bofs)) \
+    DEFINE_YUV2RGB(yuy2_##rgb,    YUV2RGB_YUY2(rgbsz,rofs,gofs,bofs)) \
+    DEFINE_YUV2RGB(uyvy_##rgb,    YUV2RGB_UYVY(rgbsz,rofs,gofs,bofs)) \
+    DEFINE_YUV2RGB(yvyu_##rgb,    YUV2RGB_YVYU(rgbsz,rofs,gofs,bofs))
+
+DEFINE_YUV2RGB_SET(rgb24,  3,0,1,2)
+DEFINE_YUV2RGB_SET(bgr24,  3,2,1,0)
+DEFINE_YUV2RGB_SET(rgba32, 4,0,1,2)
+DEFINE_YUV2RGB_SET(abgr32, 4,3,2,1)
+DEFINE_YUV2RGB_SET(argb32, 4,1,2,3)
+DEFINE_YUV2RGB_SET(bgra32, 4,2,1,0)
+
+/* Y8->RGB is defined as part of grayscale stuff below */
+
+/*************************************************************************/
+
+#define RGB2Y() \
+    (dest[0][y*width+x] = ((16829*r + 33039*g +  6416*b + 32768) >> 16) + 16)
+#define RGB2U(uvofs) \
+    (dest[1][(uvofs)]   = ((-9714*r - 19070*g + 28784*b + 32768) >> 16) + 128)
+#define RGB2V(uvofs) \
+    (dest[2][(uvofs)]   = ((28784*r - 24103*g -  4681*b + 32768) >> 16) + 128)
+#define RGB2Y_PACKED(ofs) \
+    (dest[0][(y*width+x)*2+(ofs)] = ((16829*r + 33039*g +  6416*b + 32768) >> 16) + 16)
+#define RGB2U_PACKED(ofs) \
+    (dest[0][(y*width+x)*2+(ofs)] = ((-9714*r - 19070*g + 28784*b + 32768) >> 16) + 128)
+#define RGB2V_PACKED(ofs) \
+    (dest[0][(y*width+x)*2+(ofs)] = ((28784*r - 24103*g -  4681*b + 32768) >> 16) + 128)
+
+#define RGB2YUV(utest,vtest,uvofs) \
+    RGB2Y(); if (utest) RGB2U(uvofs); if (vtest) RGB2V(uvofs)
+#define RGB2YUV_PACKED(utest,vtest,yofs,uvofs) \
+    RGB2Y_PACKED(yofs); \
+    if (utest) RGB2U_PACKED(uvofs); \
+    if (vtest) RGB2V_PACKED(uvofs)
+/* YUV420P: take Cb/Cr from opposite corners */
+#define RGB2YUV_420P  RGB2YUV(!((x|y) & 1), (x&y) & 1, (y/2)*(width/2)+(x/2))
+/* YUV411P: take Cb/Cr from points 2 pixels apart */
+#define RGB2YUV_411P  RGB2YUV(!(x & 3), !((x^2) & 3), y*(width/4)+(x/4))
+/* YUV422P: take Cb/Cr from adjacent pixels */
+#define RGB2YUV_422P  RGB2YUV(!(x & 1), x & 1, y*(width/2)+(x/2))
+/* YUV444P: every pixel is sampled */
+#define RGB2YUV_444P  RGB2YUV(1, 1, y*width+x)
+/* YUY2/UYVY/YVYU: take Cb/Cr from the corresponding pixel */
+#define RGB2YUV_YUY2  RGB2YUV_PACKED(!(x & 1), x & 1, 0,1)
+#define RGB2YUV_UYVY  RGB2YUV_PACKED(!(x & 1), x & 1, 1,0)
+#define RGB2YUV_YVYU  RGB2YUV_PACKED(x & 1, !(x & 1), 0,1)
+
+#define DEFINE_RGB2YUV(name,rgbsz,rofs,gofs,bofs,op) \
+static int name(uint8_t **src, uint8_t **dest, int width, int height)   \
+{                                                                       \
+    int x, y;                                                           \
+                                                                        \
+    for (y = 0; y < height; y++) {                                      \
+        for (x = 0; x < width; x++) {                                   \
+            int r = src[0][(y*width+x)*rgbsz+rofs];                     \
+            int g = src[0][(y*width+x)*rgbsz+gofs];                     \
+            int b = src[0][(y*width+x)*rgbsz+bofs];                     \
+            op;                                                         \
+        }                                                               \
+    }                                                                   \
+    return 1;                                                           \
+}
+
+#define DEFINE_RGB2Y8(name,rgbsz,rofs,gofs,bofs) \
+static int name(uint8_t **src, uint8_t **dest, int width, int height)   \
+{                                                                       \
+    int x, y;                                                           \
+                                                                        \
+    for (y = 0; y < height; y++) {                                      \
+        for (x = 0; x < width; x++) {                                   \
+            int r = src[0][(y*width+x)*rgbsz+rofs];                     \
+            int g = src[0][(y*width+x)*rgbsz+gofs];                     \
+            int b = src[0][(y*width+x)*rgbsz+bofs];                     \
+            RGB2Y();                                                    \
+        }                                                               \
+    }                                                                   \
+    return 1;                                                           \
+}
+
+#define DEFINE_RGB2YUV_SET(rgb,rgbsz,rofs,gofs,bofs) \
+    DEFINE_RGB2YUV(rgb##_yuv420p,  rgbsz,rofs,gofs,bofs, RGB2YUV_420P) \
+    DEFINE_RGB2YUV(rgb##_yuv411p,  rgbsz,rofs,gofs,bofs, RGB2YUV_411P) \
+    DEFINE_RGB2YUV(rgb##_yuv422p,  rgbsz,rofs,gofs,bofs, RGB2YUV_422P) \
+    DEFINE_RGB2YUV(rgb##_yuv444p,  rgbsz,rofs,gofs,bofs, RGB2YUV_444P) \
+    DEFINE_RGB2YUV(rgb##_yuy2,     rgbsz,rofs,gofs,bofs, RGB2YUV_YUY2) \
+    DEFINE_RGB2YUV(rgb##_uyvy,     rgbsz,rofs,gofs,bofs, RGB2YUV_UYVY) \
+    DEFINE_RGB2YUV(rgb##_yvyu,     rgbsz,rofs,gofs,bofs, RGB2YUV_YVYU) \
+    DEFINE_RGB2Y8 (rgb##_y8,       rgbsz,rofs,gofs,bofs)
+
+DEFINE_RGB2YUV_SET(rgb24,  3,0,1,2)
+DEFINE_RGB2YUV_SET(bgr24,  3,2,1,0)
+DEFINE_RGB2YUV_SET(rgba32, 4,0,1,2)
+DEFINE_RGB2YUV_SET(abgr32, 4,3,2,1)
+DEFINE_RGB2YUV_SET(argb32, 4,1,2,3)
+DEFINE_RGB2YUV_SET(bgra32, 4,2,1,0)
+
+/*************************************************************************/
+
+/* All YUV planar formats convert to grayscale the same way */
+
+#ifdef USE_LOOKUP_TABLES
+static uint8_t graylut[2][256];
+static int graylut_created = 0;
+static void gray8_create_tables(void)
+{
+    if (!graylut_created) {
+        int i;
+        for (i = 0; i < 256; i++) {
+            if (i <= 16)
+                graylut[0][i] = 0;
+            else if (i >= 235)
+                graylut[0][i] = 255;
+            else
+                graylut[0][i] = (i-16) * 255 / 219;
+            graylut[1][i] = 16 + i*219/255;
+        }
+        graylut_created = 1;
+    }
+}
+# define Y2GRAY(val) (graylut[0][(val)])
+# define GRAY2Y(val) (graylut[1][(val)])
+#else
+# define gray8_create_tables() /*nothing*/
+# define Y2GRAY(val) ((val)<16 ? 0 : (val)>=235 ? 255 : ((val)-16)*256/219)
+# define GRAY2Y(val) (16 + (val)*219/255)
+#endif
+
+static int yuvp_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++)
+        dest[0][i] = Y2GRAY(src[0][i]);
+    return 1;
+}
+
+static int yuy2_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++)
+        dest[0][i] = Y2GRAY(src[0][i*2]);
+    return 1;
+}
+
+static int uyvy_gray8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++)
+        dest[0][i] = Y2GRAY(src[0][i*2+1]);
+    return 1;
+}
+
+/*************************************************************************/
+
+static int gray8_y8(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++)
+        dest[0][i] = GRAY2Y(src[0][i]);
+    return 1;
+}
+
+static int gray8_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height))
+        return 0;
+    memset(dest[1], 128, (width/2)*(height/2));
+    memset(dest[2], 128, (width/2)*(height/2));
+    return 1;
+}
+
+static int gray8_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height))
+        return 0;
+    memset(dest[1], 128, (width/4)*height);
+    memset(dest[2], 128, (width/4)*height);
+    return 1;
+}
+
+static int gray8_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height))
+        return 0;
+    memset(dest[1], 128, (width/2)*height);
+    memset(dest[2], 128, (width/2)*height);
+    return 1;
+}
+
+static int gray8_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    if (!ac_imgconvert(src, IMG_GRAY8, dest, IMG_Y8, width, height))
+        return 0;
+    memset(dest[1], 128, width*height);
+    memset(dest[2], 128, width*height);
+    return 1;
+}
+
+static int gray8_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++) {
+        dest[0][i*2  ] = GRAY2Y(src[0][i]);
+        dest[0][i*2+1] = 128;
+    }
+    return 1;
+}
+
+static int gray8_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++) {
+        dest[0][i*2  ] = 128;
+        dest[0][i*2+1] = GRAY2Y(src[0][i]);
+    }
+    return 1;
+}
+
+/*************************************************************************/
+
+/* We only need 3 functions for Y8->RGB (no difference between RGB and BGR) */
+
+static int y8_rgb24(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++)
+        dest[0][i*3] = dest[0][i*3+1] = dest[0][i*3+2] = Y2GRAY(src[0][i]);
+    return 1;
+}
+
+static int y8_rgba32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++)
+        dest[0][i*4] = dest[0][i*4+1] = dest[0][i*4+2] = Y2GRAY(src[0][i]);
+    return 1;
+}
+
+static int y8_argb32(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    int i;
+    gray8_create_tables();
+    for (i = 0; i < width*height; i++)
+        dest[0][i*4+1] = dest[0][i*4+2] = dest[0][i*4+3] = Y2GRAY(src[0][i]);
+    return 1;
+}
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Accelerated versions of colorspace routines. */
+
+/* Common constant values used in routines: */
+
+#if defined(HAVE_ASM_MMX)
+
+#include "img_x86_common.h"
+
+static const struct { uint16_t n[72]; } __attribute__((aligned(16))) yuv_data = {{
+    0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF, /* for odd/even */
+    0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010, /* for Y -16    */
+    0x0080,0x0080,0x0080,0x0080,0x0080,0x0080,0x0080,0x0080, /* for U/V -128 */
+    0x2543,0x2543,0x2543,0x2543,0x2543,0x2543,0x2543,0x2543, /* Y constant   */
+    0x3313,0x3313,0x3313,0x3313,0x3313,0x3313,0x3313,0x3313, /* rV constant  */
+    0xF377,0xF377,0xF377,0xF377,0xF377,0xF377,0xF377,0xF377, /* gU constant  */
+    0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC,0xE5FC, /* gV constant  */
+    0x408D,0x408D,0x408D,0x408D,0x408D,0x408D,0x408D,0x408D, /* bU constant  */
+    0x0008,0x0008,0x0008,0x0008,0x0008,0x0008,0x0008,0x0008, /* for rounding */
+}};
+/* Note that G->Y exceeds 0x7FFF, so be careful to treat it as unsigned
+ * (the rest of the values are signed) */
+static const struct { uint16_t n[96]; } __attribute__((aligned(16))) rgb_data = {{
+    0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD,0x41BD, /* R->Y         */
+    0x810F,0x810F,0x810F,0x810F,0x810F,0x810F,0x810F,0x810F, /* G->Y         */
+    0x1910,0x1910,0x1910,0x1910,0x1910,0x1910,0x1910,0x1910, /* B->Y         */
+    0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E,0xDA0E, /* R->U         */
+    0xB582,0xB582,0xB582,0xB582,0xB582,0xB582,0xB582,0xB582, /* G->U         */
+    0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070, /* B->U         */
+    0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070,0x7070, /* R->V         */
+    0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9,0xA1D9, /* G->V         */
+    0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7,0xEDB7, /* B->V         */
+    0x0420,0x0420,0x0420,0x0420,0x0420,0x0420,0x0420,0x0420, /* Y +16.5      */
+    0x2020,0x2020,0x2020,0x2020,0x2020,0x2020,0x2020,0x2020, /* U/V +128.5   */
+    0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF,0x00FF, /* for odd/even */
+}};
+#define Y_GRAY 0x4A85
+#define GRAY_Y 0x36F7
+static const struct { uint16_t n[32]; } __attribute__((aligned(16))) gray_data = {{
+    Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY,Y_GRAY, /* 255/219      */
+    GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y,GRAY_Y, /* 219/255      */
+    0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010,0x0010, /* Y +/-16      */
+    0x00FF,0xFF00,0x0000,0x00FF,0xFF00,0x0000,0x0000,0x0000, /* for Y->RGB   */
+}};
+
+/* Convert 4 RGB32 pixels in EAX/EBX/ECX/EDX to RGB24 in EAX/EBX/ECX */
+#define IA32_RGB32_TO_RGB24 \
+        "movl %%ebx, %%esi      # ESI: 00 B1 G1 R1                      \n\
+        shll $24, %%esi         # ESI: R1 00 00 00                      \n\
+        shrl $8, %%ebx          # EBX: 00 00 B1 G1                      \n\
+        orl %%esi, %%eax        # EAX: R1 B0 G0 R0                      \n\
+        movl %%ecx, %%esi       # ESI: 00 B2 G2 R2                      \n\
+        shll $16, %%esi         # ESI: G2 R2 00 00                      \n\
+        shrl $16, %%ecx         # ECX: 00 00 00 B2                      \n\
+        shll $8, %%edx          # EDX: B3 G3 R3 00                      \n\
+        orl %%esi, %%ebx        # EBX: G2 R2 B1 G1                      \n\
+        orl %%edx, %%ecx        # ECX: B3 G3 R3 B2                      \n"
+
+/* Convert 4 RGB24 pixels in EAX/EBX/ECX to RGB32 in EAX/EBX/ECX/EDX */
+#define IA32_RGB24_TO_RGB32 \
+        "movl %%ecx, %%edx      # EDX: B3 G3 R3 B2                      \n\
+        shrl $8, %%edx          # EDX: 00 B3 G3 R3                      \n\
+        andl $0xFF, %%ecx       # ECX: 00 00 00 B2                      \n\
+        movl %%ebx, %%edi       # EDI: G2 R2 B1 G1                      \n\
+        andl $0xFFFF0000, %%edi # EDI: G2 R2 00 00                      \n\
+        orl %%edi, %%ecx        # ECX: G2 R2 00 B2                      \n\
+        rorl $16, %%ecx         # ECX: 00 B2 G2 R2                      \n\
+        movl %%eax, %%edi       # EDI: R1 B0 G0 R0                      \n\
+        andl $0xFF000000, %%edi # EDI: R1 00 00 00                      \n\
+        andl $0x0000FFFF, %%ebx # EBX: 00 00 B1 G1                      \n\
+        orl %%edi, %%ebx        # EBX: R1 00 B1 G1                      \n\
+        roll $8, %%ebx          # EBX: 00 B1 G1 R1                      \n\
+        andl $0x00FFFFFF, %%eax # EAX: 00 B0 G0 R0                      \n"
+
+#endif  /* HAVE_ASM_MMX */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* MMX routines */
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)  /* i.e. not x86_64 */
+
+static inline void mmx_yuv42Xp_to_rgb(uint8_t *srcY, uint8_t *srcU,
+                                      uint8_t *srcV);
+#define mmx_yuv420p_to_rgb mmx_yuv42Xp_to_rgb
+#define mmx_yuv422p_to_rgb mmx_yuv42Xp_to_rgb
+static inline void mmx_store_rgb24(uint8_t *dest);
+static inline void mmx_store_bgr24(uint8_t *dest);
+static inline void mmx_store_rgba32(uint8_t *dest);
+static inline void mmx_store_abgr32(uint8_t *dest);
+static inline void mmx_store_argb32(uint8_t *dest);
+static inline void mmx_store_bgra32(uint8_t *dest);
+
+#define DEFINE_YUV2RGB_MMX(yuv,rgb,uvofs,rgbsz,rofs,gofs,bofs) \
+static int yuv##_##rgb##_mmx(uint8_t **src, uint8_t **dest,             \
+                             int width, int height)                     \
+{                                                                       \
+    int x, y;                                                           \
+                                                                        \
+    yuv_create_tables();                                                \
+    for (y = 0; y < height; y++) {                                      \
+        for (x = 0; x < (width & ~7); x += 8) {                         \
+            mmx_##yuv##_to_rgb(src[0]+y*width+x,                        \
+                               src[1]+(uvofs), src[2]+(uvofs));         \
+            mmx_store_##rgb(dest[0]+(y*width+x)*rgbsz);                 \
+        }                                                               \
+        while (x < width) {                                             \
+            YUV2RGB(uvofs,rgbsz,rofs,gofs,bofs);                        \
+            x++;                                                        \
+        }                                                               \
+    }                                                                   \
+    asm("emms");                                                        \
+    return 1;                                                           \
+}
+
+#define DEFINE_YUV2RGB_MMX_SET(rgb,rgbsz,rofs,gofs,bofs) \
+    DEFINE_YUV2RGB_MMX(yuv420p,rgb,(y/2)*(width/2)+(x/2),rgbsz,rofs,gofs,bofs)\
+    DEFINE_YUV2RGB_MMX(yuv422p,rgb,(y  )*(width/2)+(x/2),rgbsz,rofs,gofs,bofs)
+
+DEFINE_YUV2RGB_MMX_SET(rgb24,  3,0,1,2)
+DEFINE_YUV2RGB_MMX_SET(bgr24,  3,2,1,0)
+DEFINE_YUV2RGB_MMX_SET(rgba32, 4,0,1,2)
+DEFINE_YUV2RGB_MMX_SET(abgr32, 4,3,2,1)
+DEFINE_YUV2RGB_MMX_SET(argb32, 4,1,2,3)
+DEFINE_YUV2RGB_MMX_SET(bgra32, 4,2,1,0)
+
+/************************************/
+
+static inline void mmx_yuv42Xp_to_rgb(uint8_t *srcY, uint8_t *srcU,
+                                      uint8_t *srcV)
+{
+    asm("\
+        # Load data, bias and expand to 16 bits                         \n\
+        pxor %%mm4, %%mm4       # MM4: 00 00 00 00 00 00 00 00          \n\
+        movq ("EAX"), %%mm6     # MM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0          \n\
+        movd ("ECX"), %%mm2     # MM2:             U3 U2 U1 U0          \n\
+        movd ("EDX"), %%mm3     # MM3:             V3 V2 V1 V0          \n\
+        movq %%mm6, %%mm7       # MM7: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0          \n\
+        pand ("ESI"), %%mm6     # MM6:  -Y6-  -Y4-  -Y2-  -Y0-          \n\
+        psrlw $8, %%mm7         # MM7:  -Y7-  -Y5-  -Y3-  -Y1-          \n\
+        punpcklbw %%mm4, %%mm2  # MM2:  -U3-  -U2-  -U1-  -U0-          \n\
+        punpcklbw %%mm4, %%mm3  # MM3:  -V3-  -V2-  -V1-  -V0-          \n\
+        psubw 16("ESI"), %%mm6  # MM6: subtract 16                      \n\
+        psubw 16("ESI"), %%mm7  # MM7: subtract 16                      \n\
+        psubw 32("ESI"), %%mm2  # MM2: subtract 128                     \n\
+        psubw 32("ESI"), %%mm3  # MM3: subtract 128                     \n\
+        psllw $7, %%mm6         # MM6: convert to fixed point 8.7       \n\
+        psllw $7, %%mm7         # MM7: convert to fixed point 8.7       \n\
+        psllw $7, %%mm2         # MM2: convert to fixed point 8.7       \n\
+        psllw $7, %%mm3         # MM3: convert to fixed point 8.7       \n\
+        # Multiply by constants                                         \n\
+        pmulhw 48("ESI"), %%mm6 # MM6: -cY6- -cY4- -cY2- -cY0-          \n\
+        pmulhw 48("ESI"), %%mm7 # MM6: -cY7- -cY5- -cY3- -cY1-          \n\
+        movq 80("ESI"), %%mm4   # MM4: gU constant                      \n\
+        movq 96("ESI"), %%mm5   # MM5: gV constant                      \n\
+        pmulhw %%mm2, %%mm4     # MM4: -gU3- -gU2- -gU1- -gU0-          \n\
+        pmulhw %%mm3, %%mm5     # MM5: -gV3- -gV2- -gV1- -gV0-          \n\
+        paddw %%mm5, %%mm4      # MM4:  -g3-  -g2-  -g1-  -g0-          \n\
+        pmulhw 64("ESI"), %%mm3 # MM3:  -r3-  -r2-  -r1-  -r0-          \n\
+        pmulhw 112("ESI"),%%mm2 # MM2:  -b3-  -b2-  -b1-  -b0-          \n\
+        movq %%mm3, %%mm0       # MM0:  -r3-  -r2-  -r1-  -r0-          \n\
+        movq %%mm4, %%mm1       # MM1:  -g3-  -g2-  -g1-  -g0-          \n\
+        movq %%mm2, %%mm5       # MM5:  -b3-  -b2-  -b1-  -b0-          \n\
+        # Add intermediate results and round/shift to get R/G/B values  \n\
+        paddw 128("ESI"), %%mm6 # Add rounding value (0.5 @ 8.4 fixed)  \n\
+        paddw 128("ESI"), %%mm7                                         \n\
+        paddw %%mm6, %%mm0      # MM0:  -R6-  -R4-  -R2-  -R0-          \n\
+        psraw $4, %%mm0         # Shift back to 8.0 fixed               \n\
+        paddw %%mm6, %%mm1      # MM1:  -G6-  -G4-  -G2-  -G0-          \n\
+        psraw $4, %%mm1                                                 \n\
+        paddw %%mm6, %%mm2      # MM2:  -B6-  -B4-  -B2-  -B0-          \n\
+        psraw $4, %%mm2                                                 \n\
+        paddw %%mm7, %%mm3      # MM3:  -R7-  -R5-  -R3-  -R1-          \n\
+        psraw $4, %%mm3                                                 \n\
+        paddw %%mm7, %%mm4      # MM4:  -G7-  -G5-  -G3-  -G1-          \n\
+        psraw $4, %%mm4                                                 \n\
+        paddw %%mm7, %%mm5      # MM5:  -B7-  -B5-  -B3-  -B1-          \n\
+        psraw $4, %%mm5                                                 \n\
+        # Saturate to 0-255 and pack into bytes                         \n\
+        packuswb %%mm0, %%mm0   # MM0: R6 R4 R2 R0 R6 R4 R2 R0          \n\
+        packuswb %%mm1, %%mm1   # MM1: G6 G4 G2 G0 G6 G4 G2 G0          \n\
+        packuswb %%mm2, %%mm2   # MM2: B6 B4 B2 B0 B6 B4 B2 B0          \n\
+        packuswb %%mm3, %%mm3   # MM3: R7 R5 R3 R1 R7 R5 R3 R1          \n\
+        packuswb %%mm4, %%mm4   # MM4: G7 G5 G3 G1 G7 G5 G3 G1          \n\
+        packuswb %%mm5, %%mm5   # MM5: B7 B5 B3 B1 B7 B5 B3 B1          \n\
+        punpcklbw %%mm3, %%mm0  # MM0: R7 R6 R5 R4 R3 R2 R1 R0          \n\
+        punpcklbw %%mm4, %%mm1  # MM1: G7 G6 G5 G4 G3 G2 G1 G0          \n\
+        punpcklbw %%mm5, %%mm2  # MM2: B7 B6 B5 B4 B3 B2 B1 B0          \n"
+        : /* no outputs */
+        : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+/************************************/
+
+/* Convert YUV->RGB output to RGBA pixels in MM0..MM3 */
+#define MMX_RGB_TO_RGBA "\
+        pxor %%mm7, %%mm7       # MM7: 00 00 00 00 00 00 00 00          \n\
+        movq %%mm0, %%mm3       # MM3: R7 R6 R5 R4 R3 R2 R1 R0          \n\
+        movq %%mm1, %%mm4       # MM4: G7 G6 G5 G4 G3 G2 G1 G0          \n\
+        movq %%mm2, %%mm5       # MM5: B7 B6 B5 B4 B3 B2 B1 B0          \n\
+        punpcklbw %%mm1, %%mm0  # MM0: G3 R3 G2 R2 G1 R1 G0 R0          \n\
+        punpcklbw %%mm7, %%mm2  # MM2: 00 B3 00 B2 00 B1 00 B0          \n\
+        movq %%mm0, %%mm1       # MM1: G3 R3 G2 R2 G1 R1 G0 R0          \n\
+        punpcklwd %%mm2, %%mm0  # MM0: 00 B1 G1 R1 00 B0 G0 R0          \n\
+        punpckhwd %%mm2, %%mm1  # MM1: 00 B3 G3 R3 00 B2 G2 R2          \n\
+        punpckhbw %%mm4, %%mm3  # MM3: G7 R7 G6 R6 G5 R5 G4 R4          \n\
+        punpckhbw %%mm7, %%mm5  # MM5: 00 B7 00 B6 00 B5 00 B4          \n\
+        movq %%mm3, %%mm2       # MM2: G7 R7 G6 R6 G5 R5 G4 R4          \n\
+        punpckhwd %%mm5, %%mm3  # MM3: 00 B7 G7 R7 00 B6 G6 R6          \n\
+        punpcklwd %%mm5, %%mm2  # MM2: 00 B5 G5 R5 00 B4 G4 R4          \n"
+
+/* Convert YUV->RGB output to BGRA pixels in MM0..MM3 */
+#define MMX_RGB_TO_BGRA "\
+        pxor %%mm7, %%mm7       # MM7: 00 00 00 00 00 00 00 00          \n\
+        movq %%mm0, %%mm5       # MM5: R7 R6 R5 R4 R3 R2 R1 R0          \n\
+        movq %%mm1, %%mm4       # MM4: G7 G6 G5 G4 G3 G2 G1 G0          \n\
+        movq %%mm2, %%mm3       # MM3: B7 B6 B5 B4 B3 B2 B1 B0          \n\
+        punpcklbw %%mm1, %%mm2  # MM2: G3 B3 G2 B2 G1 B1 G0 B0          \n\
+        punpcklbw %%mm7, %%mm0  # MM0: 00 R3 00 R2 00 R1 00 R0          \n\
+        movq %%mm2, %%mm1       # MM1: G3 B3 G2 B2 G1 B1 G0 B0          \n\
+        punpcklwd %%mm0, %%mm2  # MM2: 00 R1 G1 B1 00 R0 G0 B0          \n\
+        punpckhwd %%mm0, %%mm1  # MM1: 00 R3 G3 B3 00 R2 G2 B2          \n\
+        movq %%mm2, %%mm0       # MM0: 00 R1 G1 B1 00 R0 G0 B0          \n\
+        punpckhbw %%mm4, %%mm3  # MM3: G7 B7 G6 B6 G5 B5 G4 B4          \n\
+        punpckhbw %%mm7, %%mm5  # MM5: 00 R7 00 R6 00 R5 00 R4          \n\
+        movq %%mm3, %%mm2       # MM2: G7 B7 G6 B6 G5 B5 G4 B4          \n\
+        punpckhwd %%mm5, %%mm3  # MM3: 00 R7 G7 B7 00 R6 G6 B6          \n\
+        punpcklwd %%mm5, %%mm2  # MM2: 00 R5 G5 B5 00 R4 G4 B4          \n"
+
+
+static inline void mmx_store_rgb24(uint8_t *dest)
+{
+    /* It looks like it's fastest to go to RGB32 first, then shift the
+     * result to merge the 24-bit pixels together. */
+    asm(MMX_RGB_TO_RGBA                                                  "\
+        movq %%mm0, %%mm4       # MM4: 00 B1 G1 R1 00 B0 G0 R0          \n\
+        movq %%mm1, %%mm5       # MM5: 00 B3 G3 R3 00 B2 G2 R2          \n\
+        movq %%mm2, %%mm6       # MM6: 00 B5 G5 R5 00 B4 G4 R4          \n\
+        movq %%mm3, %%mm7       # MM7: 00 B7 G7 R7 00 B6 G6 R6          \n\
+        psrlq $32, %%mm4        # MM4: 00 00 00 00 00 B1 G1 R1          \n\
+        psrlq $32, %%mm5        # MM5: 00 00 00 00 00 B3 G3 R3          \n\
+        psrlq $32, %%mm6        # MM6: 00 00 00 00 00 B5 G5 R5          \n\
+        psrlq $32, %%mm7        # MM7: 00 00 00 00 00 B7 G7 R7          \n\
+        push "EBX"                                                      \n\
+        movd %%mm0, %%eax       # EAX: 00 B0 G0 R0                      \n\
+        movd %%mm4, %%ebx       # EBX: 00 B1 G1 R1                      \n\
+        movd %%mm1, %%ecx       # ECX: 00 B2 G2 R2                      \n\
+        movd %%mm5, %%edx       # EDX: 00 B3 G3 R3                      \n\
+        "IA32_RGB32_TO_RGB24"                                           \n\
+        movl %%eax, ("EDI")                                             \n\
+        movl %%ebx, 4("EDI")                                            \n\
+        movl %%ecx, 8("EDI")                                            \n\
+        movd %%mm2, %%eax       # EAX: 00 B4 G4 R4                      \n\
+        movd %%mm6, %%ebx       # EBX: 00 B5 G5 R5                      \n\
+        movd %%mm3, %%ecx       # ECX: 00 B6 G6 R6                      \n\
+        movd %%mm7, %%edx       # EDX: 00 B7 G7 R7                      \n\
+        "IA32_RGB32_TO_RGB24"                                           \n\
+        movl %%eax, 12("EDI")                                           \n\
+        movl %%ebx, 16("EDI")                                           \n\
+        movl %%ecx, 20("EDI")                                           \n\
+        pop "EBX"                                                       \n"
+        : /* no outputs */
+        : "D" (dest)
+        : "eax", "ecx", "edx", "esi"
+    );
+}
+
+static inline void mmx_store_bgr24(uint8_t *dest)
+{
+    asm(MMX_RGB_TO_BGRA                                                  "\
+        movq %%mm0, %%mm4       # MM4: 00 B1 G1 R1 00 B0 G0 R0          \n\
+        movq %%mm1, %%mm5       # MM5: 00 B3 G3 R3 00 B2 G2 R2          \n\
+        movq %%mm2, %%mm6       # MM6: 00 B5 G5 R5 00 B4 G4 R4          \n\
+        movq %%mm3, %%mm7       # MM7: 00 B7 G7 R7 00 B6 G6 R6          \n\
+        psrlq $32, %%mm4        # MM4: 00 00 00 00 00 B1 G1 R1          \n\
+        psrlq $32, %%mm5        # MM5: 00 00 00 00 00 B3 G3 R3          \n\
+        psrlq $32, %%mm6        # MM6: 00 00 00 00 00 B5 G5 R5          \n\
+        psrlq $32, %%mm7        # MM7: 00 00 00 00 00 B7 G7 R7          \n\
+        push "EBX"                                                      \n\
+        movd %%mm0, %%eax       # EAX: 00 B0 G0 R0                      \n\
+        movd %%mm4, %%ebx       # EBX: 00 B1 G1 R1                      \n\
+        movd %%mm1, %%ecx       # ECX: 00 B2 G2 R2                      \n\
+        movd %%mm5, %%edx       # EDX: 00 B3 G3 R3                      \n\
+        "IA32_RGB32_TO_RGB24"                                           \n\
+        movl %%eax, ("EDI")                                             \n\
+        movl %%ebx, 4("EDI")                                            \n\
+        movl %%ecx, 8("EDI")                                            \n\
+        movd %%mm2, %%eax       # EAX: 00 B4 G4 R4                      \n\
+        movd %%mm6, %%ebx       # EBX: 00 B5 G5 R5                      \n\
+        movd %%mm3, %%ecx       # ECX: 00 B6 G6 R6                      \n\
+        movd %%mm7, %%edx       # EDX: 00 B7 G7 R7                      \n\
+        "IA32_RGB32_TO_RGB24"                                           \n\
+        movl %%eax, 12("EDI")                                           \n\
+        movl %%ebx, 16("EDI")                                           \n\
+        movl %%ecx, 20("EDI")                                           \n\
+        pop "EBX"                                                       \n"
+        : /* no outputs */
+        : "D" (dest)
+        : "eax", "ecx", "edx", "esi"
+    );
+}
+
+static inline void mmx_store_rgba32(uint8_t *dest)
+{
+    asm(MMX_RGB_TO_RGBA                                                  "\
+        movq %%mm0,   ("EDI")                                           \n\
+        movq %%mm1,  8("EDI")                                           \n\
+        movq %%mm2, 16("EDI")                                           \n\
+        movq %%mm3, 24("EDI")                                           \n"
+        : /* no outputs */
+        : "D" (dest)
+    );
+}
+
+static inline void mmx_store_abgr32(uint8_t *dest)
+{
+    asm(MMX_RGB_TO_BGRA                                                  "\
+        psllq $8, %%mm0                                                 \n\
+        psllq $8, %%mm1                                                 \n\
+        psllq $8, %%mm2                                                 \n\
+        psllq $8, %%mm3                                                 \n\
+        movq %%mm0,   ("EDI")                                           \n\
+        movq %%mm1,  8("EDI")                                           \n\
+        movq %%mm2, 16("EDI")                                           \n\
+        movq %%mm3, 24("EDI")                                           \n"
+        : /* no outputs */
+        : "D" (dest)
+    );
+}
+
+static inline void mmx_store_argb32(uint8_t *dest)
+{
+    asm(MMX_RGB_TO_RGBA                                                  "\
+        psllq $8, %%mm0                                                 \n\
+        psllq $8, %%mm1                                                 \n\
+        psllq $8, %%mm2                                                 \n\
+        psllq $8, %%mm3                                                 \n\
+        movq %%mm0,   ("EDI")                                           \n\
+        movq %%mm1,  8("EDI")                                           \n\
+        movq %%mm2, 16("EDI")                                           \n\
+        movq %%mm3, 24("EDI")                                           \n"
+        : /* no outputs */
+        : "D" (dest)
+    );
+}
+
+static inline void mmx_store_bgra32(uint8_t *dest)
+{
+    asm(MMX_RGB_TO_BGRA                                                  "\
+        movq %%mm0,   ("EDI")                                           \n\
+        movq %%mm1,  8("EDI")                                           \n\
+        movq %%mm2, 16("EDI")                                           \n\
+        movq %%mm3, 24("EDI")                                           \n"
+        : /* no outputs */
+        : "D" (dest)
+    );
+}
+
+#endif  /* HAVE_ASM_MMX && ARCH_X86 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* SSE2 routines */
+
+#if defined(HAVE_ASM_SSE2)
+
+/*************************************************************************/
+
+static inline void sse2_load_yuv420p(uint8_t *srcY, uint8_t *srcU,
+                                     uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yuv411p(uint8_t *srcY, uint8_t *srcU,
+                                     uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yuv422p(uint8_t *srcY, uint8_t *srcU,
+                                     uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yuv444p(uint8_t *srcY, uint8_t *srcU,
+                                     uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yuy2(uint8_t *srcY, uint8_t *srcU,
+                                  uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_uyvy(uint8_t *srcY, uint8_t *srcU,
+                                  uint8_t *srcV, int x, int y, int width);
+static inline void sse2_load_yvyu(uint8_t *srcY, uint8_t *srcU,
+                                  uint8_t *srcV, int x, int y, int width);
+static inline void sse2_yuv_to_rgb(void);
+static inline void sse2_yuv444_to_rgb(void);
+static inline void sse2_store_rgb24(uint8_t *dest);
+static inline void sse2_store_bgr24(uint8_t *dest);
+static inline void sse2_store_rgba32(uint8_t *dest);
+static inline void sse2_store_abgr32(uint8_t *dest);
+static inline void sse2_store_argb32(uint8_t *dest);
+static inline void sse2_store_bgra32(uint8_t *dest);
+
+#define DEFINE_YUV2RGB_SSE2(yuv,y2r,rgb,rgbsz,slowop) \
+static int yuv##_##rgb##_sse2(uint8_t **src, uint8_t **dest,            \
+                              int width, int height)                    \
+{                                                                       \
+    int x, y;                                                           \
+                                                                        \
+    yuv_create_tables();                                                \
+    for (y = 0; y < height; y++) {                                      \
+        for (x = 0; x < (width & ~15); x += 16) {                       \
+            sse2_load_##yuv(src[0], src[1], src[2], x, y, width);       \
+            sse2_##y2r();                                               \
+            sse2_store_##rgb(dest[0] + (y*width+x)*rgbsz);              \
+        }                                                               \
+        while (x < width) {                                             \
+            slowop;                                                     \
+            x++;                                                        \
+        }                                                               \
+    }                                                                   \
+    asm("emms");                                                        \
+    return 1;                                                           \
+}
+
+#define DEFINE_YUV2RGB_SSE2_SET(rgb,sz,r,g,b) \
+    DEFINE_YUV2RGB_SSE2(yuv420p, yuv_to_rgb,   rgb,sz, YUV2RGB_420P(sz,r,g,b))\
+    DEFINE_YUV2RGB_SSE2(yuv411p, yuv_to_rgb,   rgb,sz, YUV2RGB_411P(sz,r,g,b))\
+    DEFINE_YUV2RGB_SSE2(yuv422p, yuv_to_rgb,   rgb,sz, YUV2RGB_422P(sz,r,g,b))\
+    DEFINE_YUV2RGB_SSE2(yuv444p, yuv444_to_rgb,rgb,sz, YUV2RGB_444P(sz,r,g,b))\
+    DEFINE_YUV2RGB_SSE2(yuy2,    yuv_to_rgb,   rgb,sz, YUV2RGB_YUY2(sz,r,g,b))\
+    DEFINE_YUV2RGB_SSE2(uyvy,    yuv_to_rgb,   rgb,sz, YUV2RGB_UYVY(sz,r,g,b))\
+    DEFINE_YUV2RGB_SSE2(yvyu,    yuv_to_rgb,   rgb,sz, YUV2RGB_YVYU(sz,r,g,b))
+
+DEFINE_YUV2RGB_SSE2_SET(rgb24,  3,0,1,2)
+DEFINE_YUV2RGB_SSE2_SET(bgr24,  3,2,1,0)
+DEFINE_YUV2RGB_SSE2_SET(rgba32, 4,0,1,2)
+DEFINE_YUV2RGB_SSE2_SET(abgr32, 4,3,2,1)
+DEFINE_YUV2RGB_SSE2_SET(argb32, 4,1,2,3)
+DEFINE_YUV2RGB_SSE2_SET(bgra32, 4,2,1,0)
+
+/************************************/
+
+static inline void sse2_load_yuv420p(uint8_t *srcY, uint8_t *srcU,
+                                     uint8_t *srcV, int x, int y, int width)
+{
+    srcY += y*width+x;
+    srcU += (y/2)*(width/2)+(x/2);
+    srcV += (y/2)*(width/2)+(x/2);
+    asm("\
+        # Load data, bias and expand to 16 bits                         \n\
+        pxor %%xmm4, %%xmm4     # XMM4: 00 00 00 00 00 00 00 00         \n\
+        movdqu ("EAX"), %%xmm6  # XMM6: YF...................Y0         \n\
+        movq ("ECX"), %%xmm2    # XMM2:             U7.......U0         \n\
+        movq ("EDX"), %%xmm3    # XMM3:             V7.......V0         \n\
+        movdqa %%xmm6, %%xmm7   # XMM7: YF...................Y0         \n\
+        pand ("ESI"), %%xmm6    # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0         \n\
+        psrlw $8, %%xmm7        # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1         \n\
+        punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0         \n\
+        punpcklbw %%xmm4,%%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0         \n"
+        : /* no outputs */
+        : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+static inline void sse2_load_yuv411p(uint8_t *srcY, uint8_t *srcU,
+                                     uint8_t *srcV, int x, int y, int width)
+{
+    srcY += y*width+x;
+    srcU += y*(width/4)+(x/4);
+    srcV += y*(width/4)+(x/4);
+    asm("\
+        # Load data, bias and expand to 16 bits                         \n\
+        pxor %%xmm4, %%xmm4     # XMM4: 00 00 00 00 00 00 00 00         \n\
+        movdqu ("EAX"), %%xmm6  # XMM6: YF...................Y0         \n\
+        movd ("ECX"), %%xmm2    # XMM2:                   U3.U0         \n\
+        punpcklbw %%xmm2,%%xmm2 # XMM2:             U3 U3.U0 U0         \n\
+        movd ("EDX"), %%xmm3    # XMM3:                   V3.V0         \n\
+        punpcklbw %%xmm3,%%xmm3 # XMM2:             V3 V3.V0 V0         \n\
+        movdqa %%xmm6, %%xmm7   # XMM7: YF...................Y0         \n\
+        pand ("ESI"), %%xmm6    # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0         \n\
+        psrlw $8, %%xmm7        # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1         \n\
+        punpcklbw %%xmm4,%%xmm2 # XMM2: U3 U3 U2 U2 U1 U1 U0 U0         \n\
+        punpcklbw %%xmm4,%%xmm3 # XMM3: V3 V3 V2 V2 V1 V1 V0 V0         \n"
+        : /* no outputs */
+        : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+static inline void sse2_load_yuv422p(uint8_t *srcY, uint8_t *srcU,
+                                     uint8_t *srcV, int x, int y, int width)
+{
+    srcY += y*width+x;
+    srcU += y*(width/2)+(x/2);
+    srcV += y*(width/2)+(x/2);
+    asm("\
+        # Load data, bias and expand to 16 bits                         \n\
+        pxor %%xmm4, %%xmm4     # XMM4: 00 00 00 00 00 00 00 00         \n\
+        movdqu ("EAX"), %%xmm6  # XMM6: YF...................Y0         \n\
+        movq ("ECX"), %%xmm2    # XMM2:             U7.......U0         \n\
+        movq ("EDX"), %%xmm3    # XMM3:             V7.......V0         \n\
+        movdqa %%xmm6, %%xmm7   # XMM7: YF...................Y0         \n\
+        pand ("ESI"), %%xmm6    # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0         \n\
+        psrlw $8, %%xmm7        # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1         \n\
+        punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0         \n\
+        punpcklbw %%xmm4,%%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0         \n"
+        : /* no outputs */
+        : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+static inline void sse2_load_yuv444p(uint8_t *srcY, uint8_t *srcU,
+                                     uint8_t *srcV, int x, int y, int width)
+{
+    srcY += y*width+x;
+    srcU += y*width+x;
+    srcV += y*width+x;
+    asm("\
+        # Load data, bias and expand to 16 bits                         \n\
+        pxor %%xmm4, %%xmm4     # XMM4: 00 00 00 00 00 00 00 00         \n\
+        movdqu ("EAX"), %%xmm6  # XMM6: YF...................Y0         \n\
+        movdqu ("ECX"), %%xmm2  # XMM2: UF...................U0         \n\
+        movdqu ("EDX"), %%xmm0  # XMM0: VF...................V0         \n\
+        movdqa %%xmm6, %%xmm7   # XMM7: YF...................Y0         \n\
+        punpcklbw %%xmm4,%%xmm6 # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0         \n\
+        punpckhbw %%xmm4,%%xmm7 # XMM7: YF YE YD YC YB YA Y9 Y8         \n\
+        movdqa %%xmm2, %%xmm5   # XMM5: UF...................U0         \n\
+        punpcklbw %%xmm4,%%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0         \n\
+        punpckhbw %%xmm4,%%xmm5 # XMM5: UF UE UD UC UB UA U9 U8         \n\
+        movdqa %%xmm0, %%xmm3   # XMM3: VF...................V0         \n\
+        punpcklbw %%xmm4,%%xmm0 # XMM0: V7 V6 V5 V4 V3 V2 V1 V0         \n\
+        punpckhbw %%xmm4,%%xmm3 # XMM3: VF VE VD VC VB VA V9 V8         \n"
+        : /* no outputs */
+        : "a" (srcY), "c" (srcU), "d" (srcV), "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+static inline void sse2_load_yuy2(uint8_t *srcY, uint8_t *srcU,
+                                  uint8_t *srcV, int x, int y, int width)
+{
+    srcY += (y*width+x)*2;
+    asm("\
+        # Load data, bias and expand to 16 bits                         \n\
+        pxor %%xmm4, %%xmm4     # XMM4: 00 00 00 00 00 00 00 00         \n\
+        movdqu ("EAX"), %%xmm6  # XMM6: V3 Y7.............U0 Y0         \n\
+        movdqu 16("EAX"),%%xmm7 # XMM7: V7 YF.............U4 Y8         \n\
+        movdqa %%xmm6, %%xmm2   # XMM2: V3 Y7.............U0 Y0         \n\
+        psrlw $8, %%xmm2        # XMM2: V3 U3 V2 U2 V1 U1 V0 U0         \n\
+        pand ("ESI"), %%xmm6    # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0         \n\
+        movdqa %%xmm7, %%xmm3   # XMM3: V7 YF.............U4 Y8         \n\
+        psrlw $8, %%xmm3        # XMM3: V7 U7 V6 U6 V5 U5 V4 U4         \n\
+        pand ("ESI"), %%xmm7    # XMM6: YF YE YD YC YB YA Y9 Y8         \n\
+        packuswb %%xmm3, %%xmm2 # XMM2: V7 U7.............V0 U0         \n\
+        movdqa %%xmm2, %%xmm3   # XMM3: V7 U7.............V0 U0         \n\
+        pand ("ESI"), %%xmm2    # XMM2: U7 U6 U5 U4 U3 U2 U1 U0         \n\
+        psrlw $8, %%xmm3        # XMM3: V7 V6 V5 V4 V3 V2 V1 V0         \n\
+        packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0         \n\
+        movdqa %%xmm6, %%xmm7   # XMM7: YF...................Y0         \n\
+        pand ("ESI"), %%xmm6    # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0         \n\
+        psrlw $8, %%xmm7        # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1         \n"
+        : /* no outputs */
+        : "a" (srcY), "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+static inline void sse2_load_uyvy(uint8_t *srcY, uint8_t *srcU,
+                                  uint8_t *srcV, int x, int y, int width)
+{
+    srcY += (y*width+x)*2;
+    asm("\
+        # Load data, bias and expand to 16 bits                         \n\
+        pxor %%xmm4, %%xmm4     # XMM4: 00 00 00 00 00 00 00 00         \n\
+        movdqu ("EAX"), %%xmm6  # XMM6: Y7 V3.............Y0 00         \n\
+        movdqu 16("EAX"),%%xmm7 # XMM7: YF V7.............Y8 U4         \n\
+        movdqa %%xmm6, %%xmm2   # XMM2: Y7 V3.............Y0 U0         \n\
+        pand ("ESI"), %%xmm2    # XMM2: V3 U3 V2 U2 V1 U1 V0 U0         \n\
+        psrlw $8, %%xmm6        # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0         \n\
+        movdqa %%xmm7, %%xmm3   # XMM3: YF V7.............Y8 U4         \n\
+        pand ("ESI"), %%xmm3    # XMM3: V7 U7 V6 U6 V5 U5 V4 U4         \n\
+        psrlw $8, %%xmm7        # XMM6: YF YE YD YC YB YA Y9 Y8         \n\
+        packuswb %%xmm3, %%xmm2 # XMM2: V7 U7.............V0 U0         \n\
+        movdqa %%xmm2, %%xmm3   # XMM3: V7 U7.............V0 U0         \n\
+        pand ("ESI"), %%xmm2    # XMM2: U7 U6 U5 U4 U3 U2 U1 U0         \n\
+        psrlw $8, %%xmm3        # XMM3: V7 V6 V5 V4 V3 V2 V1 V0         \n\
+        packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0         \n\
+        movdqa %%xmm6, %%xmm7   # XMM7: YF...................Y0         \n\
+        pand ("ESI"), %%xmm6    # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0         \n\
+        psrlw $8, %%xmm7        # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1         \n"
+        : /* no outputs */
+        : "a" (srcY), "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+static inline void sse2_load_yvyu(uint8_t *srcY, uint8_t *srcU,
+                                  uint8_t *srcV, int x, int y, int width)
+{
+    srcY += (y*width+x)*2;
+    asm("\
+        # Load data, bias and expand to 16 bits                         \n\
+        pxor %%xmm4, %%xmm4     # XMM4: 00 00 00 00 00 00 00 00         \n\
+        movdqu ("EAX"), %%xmm6  # XMM6: U3 Y7.............V0 Y0         \n\
+        movdqu 16("EAX"),%%xmm7 # XMM7: U7 YF.............V4 Y8         \n\
+        movdqa %%xmm6, %%xmm2   # XMM2: U3 Y7.............V0 Y0         \n\
+        psrlw $8, %%xmm2        # XMM2: U3 V3 U2 V2 U1 V1 U0 V0         \n\
+        pand ("ESI"), %%xmm6    # XMM6: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0         \n\
+        movdqa %%xmm7, %%xmm3   # XMM3: U7 YF.............V4 Y8         \n\
+        psrlw $8, %%xmm3        # XMM3: U7 V7 U6 V6 U5 V5 U4 V4         \n\
+        pand ("ESI"), %%xmm7    # XMM6: YF YE YD YC YB YA Y9 Y8         \n\
+        packuswb %%xmm3, %%xmm2 # XMM2: U7 V7.............U0 V0         \n\
+        movdqa %%xmm2, %%xmm3   # XMM3: U7 V7.............U0 V0         \n\
+        psrlw $8, %%xmm2        # XMM2: U7 U6 U5 U4 U3 U2 U1 U0         \n\
+        pand ("ESI"), %%xmm3    # XMM3: V7 V6 V5 V4 V3 V2 V1 V0         \n\
+        packuswb %%xmm7, %%xmm6 # XMM6: YF...................Y0         \n\
+        movdqa %%xmm6, %%xmm7   # XMM7: YF...................Y0         \n\
+        pand ("ESI"), %%xmm6    # XMM6: YE YC YA Y8 Y6 Y4 Y2 Y0         \n\
+        psrlw $8, %%xmm7        # XMM7: YF YD YB Y9 Y7 Y5 Y3 Y1         \n"
+        : /* no outputs */
+        : "a" (srcY), "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+/************************************/
+
+/* Standard YUV->RGB (Yodd=XMM7 Yeven=XMM6 U=XMM2 V=XMM3) */
+static inline void sse2_yuv_to_rgb(void)
+{
+    asm("\
+        psubw 16("ESI"), %%xmm6 # XMM6: subtract 16                     \n\
+        psllw $7, %%xmm6        # XMM6: convert to fixed point 8.7      \n\
+        psubw 16("ESI"), %%xmm7 # XMM7: subtract 16                     \n\
+        psllw $7, %%xmm7        # XMM7: convert to fixed point 8.7      \n\
+        psubw 32("ESI"), %%xmm2 # XMM2: subtract 128                    \n\
+        psllw $7, %%xmm2        # XMM2: convert to fixed point 8.7      \n\
+        psubw 32("ESI"), %%xmm3 # XMM3: subtract 128                    \n\
+        psllw $7, %%xmm3        # XMM3: convert to fixed point 8.7      \n\
+        # Multiply by constants                                         \n\
+        pmulhw 48("ESI"),%%xmm6 # XMM6: cYE.................cY0         \n\
+        pmulhw 48("ESI"),%%xmm7 # XMM7: cYF.................cY1         \n\
+        movdqa 80("ESI"),%%xmm4 # XMM4: gU constant                     \n\
+        pmulhw %%xmm2, %%xmm4   # XMM4: gU7.................gU0         \n\
+        movdqa 96("ESI"),%%xmm5 # XMM5: gV constant                     \n\
+        pmulhw %%xmm3, %%xmm5   # XMM5: gV7.................gV0         \n\
+        paddw %%xmm5, %%xmm4    # XMM4: g7 g6 g5 g4 g3 g2 g1 g0         \n\
+        pmulhw 64("ESI"),%%xmm3 # XMM3: r7 r6 r5 r4 r3 r2 r1 r0         \n\
+        pmulhw 112("ESI"),%%xmm2 #XMM2: b7 b6 b5 b4 b3 b2 b1 b0         \n\
+        movdqa %%xmm3, %%xmm0   # XMM0: r7 r6 r5 r4 r3 r2 r1 r0         \n\
+        movdqa %%xmm4, %%xmm1   # XMM1: g7 g6 g5 g4 g3 g2 g1 g0         \n\
+        movdqa %%xmm2, %%xmm5   # XMM5: b7 b6 b5 b4 b3 b2 b1 b0         \n\
+        # Add intermediate results and round/shift to get R/G/B values  \n\
+        paddw 128("ESI"),%%xmm6 # Add rounding value (0.5 @ 8.4 fixed)  \n\
+        paddw 128("ESI"),%%xmm7                                         \n\
+        paddw %%xmm6, %%xmm0    # XMM0: RE RC RA R8 R6 R4 R2 R0         \n\
+        psraw $4, %%xmm0        # Shift back to 8.0 fixed               \n\
+        paddw %%xmm6, %%xmm1    # XMM1: GE GC GA G8 G6 G4 G2 G0         \n\
+        psraw $4, %%xmm1                                                \n\
+        paddw %%xmm6, %%xmm2    # XMM2: BE BC BA B8 B6 B4 B2 B0         \n\
+        psraw $4, %%xmm2                                                \n\
+        paddw %%xmm7, %%xmm3    # XMM3: RF RD RB R9 R7 R5 R3 R1         \n\
+        psraw $4, %%xmm3                                                \n\
+        paddw %%xmm7, %%xmm4    # XMM4: GF GD GB G9 G7 G5 G3 G1         \n\
+        psraw $4, %%xmm4                                                \n\
+        paddw %%xmm7, %%xmm5    # XMM5: BF BD BB B9 B7 B5 B3 B1         \n\
+        psraw $4, %%xmm5                                                \n\
+        # Saturate to 0-255 and pack into bytes                         \n\
+        packuswb %%xmm0, %%xmm0 # XMM0: RE.......R0 RE.......R0         \n\
+        packuswb %%xmm1, %%xmm1 # XMM1: GE.......G0 GE.......G0         \n\
+        packuswb %%xmm2, %%xmm2 # XMM2: BE.......B0 BE.......B0         \n\
+        packuswb %%xmm3, %%xmm3 # XMM3: RF.......R1 RF.......R1         \n\
+        packuswb %%xmm4, %%xmm4 # XMM4: GF.......G1 GF.......G1         \n\
+        packuswb %%xmm5, %%xmm5 # XMM5: BF.......B1 BF.......B1         \n\
+        punpcklbw %%xmm3,%%xmm0 # XMM0: RF...................R0         \n\
+        punpcklbw %%xmm4,%%xmm1 # XMM1: GF...................G0         \n\
+        punpcklbw %%xmm5,%%xmm2 # XMM2: BF...................B0         \n"
+        : /* no outputs */
+        : "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+/* YUV444 YUV->RGB (Y=XMM7:XMM6 U=XMM5:XMM2 V=XMM3:XMM0) */
+static inline void sse2_yuv444_to_rgb(void)
+{
+    asm("\
+        psubw 16("ESI"), %%xmm6 # XMM6: subtract 16                     \n\
+        psllw $7, %%xmm6        # XMM6: convert to fixed point 8.7      \n\
+        psubw 16("ESI"), %%xmm7 # XMM7: subtract 16                     \n\
+        psllw $7, %%xmm7        # XMM7: convert to fixed point 8.7      \n\
+        psubw 32("ESI"), %%xmm2 # XMM2: subtract 128                    \n\
+        psllw $7, %%xmm2        # XMM2: convert to fixed point 8.7      \n\
+        psubw 32("ESI"), %%xmm5 # XMM5: subtract 128                    \n\
+        psllw $7, %%xmm5        # XMM5: convert to fixed point 8.7      \n\
+        psubw 32("ESI"), %%xmm0 # XMM0: subtract 128                    \n\
+        psllw $7, %%xmm0        # XMM0: convert to fixed point 8.7      \n\
+        psubw 32("ESI"), %%xmm3 # XMM3: subtract 128                    \n\
+        psllw $7, %%xmm3        # XMM3: convert to fixed point 8.7      \n\
+        # Multiply by constants                                         \n\
+        pmulhw 48("ESI"),%%xmm6 # XMM6: cY7.................cY0         \n\
+        movdqa 80("ESI"),%%xmm1 # XMM1: gU constant                     \n\
+        pmulhw %%xmm2, %%xmm1   # XMM1: gU7.................gU0         \n\
+        movdqa 96("ESI"),%%xmm4 # XMM4: gV constant                     \n\
+        pmulhw %%xmm0, %%xmm4   # XMM4: gV7.................gV0         \n\
+        paddw %%xmm4, %%xmm1    # XMM1: g7 g6 g5 g4 g3 g2 g1 g0         \n\
+        pmulhw 64("ESI"),%%xmm0 # XMM0: r7 r6 r5 r4 r3 r2 r1 r0         \n\
+        pmulhw 112("ESI"),%%xmm2 #XMM2: b7 b6 b5 b4 b3 b2 b1 b0         \n\
+        # Add intermediate results and round/shift to get R/G/B values  \n\
+        paddw 128("ESI"),%%xmm6 # Add rounding value (0.5 @ 8.4 fixed)  \n\
+        paddw %%xmm6, %%xmm0    # XMM0: R7 R6 R5 R4 R3 R2 R1 R0         \n\
+        psraw $4, %%xmm0        # Shift back to 8.0 fixed               \n\
+        paddw %%xmm6, %%xmm1    # XMM1: G7 G6 G5 G4 G3 G2 G1 G0         \n\
+        psraw $4, %%xmm1                                                \n\
+        paddw %%xmm6, %%xmm2    # XMM2: B7 B6 B5 B4 B3 B2 B1 B0         \n\
+        psraw $4, %%xmm2                                                \n\
+        # Do it all over again for pixels 8-15                          \n\
+        pmulhw 48("ESI"),%%xmm7 # XMM7: cYF.................cY8         \n\
+        movdqa 80("ESI"),%%xmm6 # XMM6: gU constant                     \n\
+        pmulhw %%xmm5, %%xmm6   # XMM6: gUF.................gU8         \n\
+        movdqa 96("ESI"),%%xmm4 # XMM4: gV constant                     \n\
+        pmulhw %%xmm3, %%xmm4   # XMM4: gVF.................gV8         \n\
+        paddw %%xmm6, %%xmm4    # XMM4: gF gE gD gC gB gA g9 g8         \n\
+        pmulhw 64("ESI"),%%xmm3 # XMM3: rF rE rD rC rB rA r9 r8         \n\
+        pmulhw 112("ESI"),%%xmm5 #XMM5: bF bE bD bC bB bA b9 b8         \n\
+        paddw 128("ESI"),%%xmm7 # Add rounding value (0.5 @ 8.4 fixed)  \n\
+        paddw %%xmm7, %%xmm3    # XMM3: RF RE RD RC RB RA R9 R8         \n\
+        psraw $4, %%xmm3                                                \n\
+        paddw %%xmm7, %%xmm4    # XMM4: GF GE GD GC GB GA G9 G8         \n\
+        psraw $4, %%xmm4                                                \n\
+        paddw %%xmm7, %%xmm5    # XMM5: BF BE BD BC BB BA B9 B8         \n\
+        psraw $4, %%xmm5                                                \n\
+        # Saturate to 0-255 and pack into bytes                         \n\
+        packuswb %%xmm3, %%xmm0 # XMM0: RF...................R0         \n\
+        packuswb %%xmm4, %%xmm1 # XMM1: GF...................G0         \n\
+        packuswb %%xmm5, %%xmm2 # XMM2: BF...................B0         \n"
+        : /* no outputs */
+        : "S" (&yuv_data), "m" (yuv_data)
+    );
+}
+
+/************************************/
+
+/* Convert YUV->RGB output to RGBA pixels in XMM0..XMM3 */
+#define SSE2_RGB_TO_RGBA "\
+        pxor %%xmm7, %%xmm7     # XMM7: 00 00 00 00 00 00 00 00         \n\
+        movdqa %%xmm0, %%xmm3   # XMM3: RF...................R0         \n\
+        movdqa %%xmm1, %%xmm4   # XMM4: GF...................G0         \n\
+        movdqa %%xmm2, %%xmm5   # XMM5: BF...................B0         \n\
+        punpcklbw %%xmm1,%%xmm0 # XMM0: G7 R7.............G0 R0         \n\
+        punpcklbw %%xmm7,%%xmm2 # XMM2: 00 B7.............00 B0         \n\
+        movdqa %%xmm0, %%xmm1   # XMM1: G7 R7.............G0 R0         \n\
+        punpcklwd %%xmm2,%%xmm0 # XMM0: 0BGR3 0BGR2 0BGR1 0BGR0         \n\
+        punpckhwd %%xmm2,%%xmm1 # XMM1: 0BGR7 0BGR6 0BGR5 0BGR4         \n\
+        punpckhbw %%xmm4,%%xmm3 # XMM3: GF RF.............G8 R8         \n\
+        punpckhbw %%xmm7,%%xmm5 # XMM5: 00 BF.............00 B8         \n\
+        movdqa %%xmm3, %%xmm2   # XMM2: GF RF.............G8 R8         \n\
+        punpckhwd %%xmm5,%%xmm3 # XMM3: 0BGRF 0BGRE 0BGRD 0BGRC         \n\
+        punpcklwd %%xmm5,%%xmm2 # XMM2: 0BGRB 0BGRA 0BGR9 0BGR8         \n"
+
+/* Convert YUV->RGB output to BGRA pixels in XMM0..XMM3 */
+#define SSE2_RGB_TO_BGRA "\
+        pxor %%xmm7, %%xmm7     # XMM7: 00 00 00 00 00 00 00 00         \n\
+        movdqa %%xmm0, %%xmm5   # XMM5: RF...................R0         \n\
+        movdqa %%xmm1, %%xmm4   # XMM4: GF...................G0         \n\
+        movdqa %%xmm2, %%xmm3   # XMM3: BF...................B0         \n\
+        punpcklbw %%xmm1,%%xmm2 # XMM0: G7 B7.............G0 B0         \n\
+        punpcklbw %%xmm7,%%xmm0 # XMM2: 00 R7.............00 R0         \n\
+        movdqa %%xmm2, %%xmm1   # XMM1: G7 B7.............G0 B0         \n\
+        punpcklwd %%xmm0,%%xmm2 # XMM2: 0RGB3 0RGB2 0RGB1 0RGB0         \n\
+        punpckhwd %%xmm0,%%xmm1 # XMM1: 0RGB7 0RGB6 0RGB5 0RGB4         \n\
+        movdqa %%xmm2, %%xmm0   # XMM0: 0RGB3 0RGB2 0RGB1 0RGB0         \n\
+        punpckhbw %%xmm4,%%xmm3 # XMM3: GF BF.............G8 B8         \n\
+        punpckhbw %%xmm7,%%xmm5 # XMM5: 00 RF.............00 R8         \n\
+        movdqa %%xmm3, %%xmm2   # XMM2: GF BF.............G8 B8         \n\
+        punpckhwd %%xmm5,%%xmm3 # XMM3: 0RGBF 0RGBE 0RGBD 0RGBC         \n\
+        punpcklwd %%xmm5,%%xmm2 # XMM2: 0RGBB 0RGBA 0RGB9 0RGB8         \n"
+
+/* Convert and 4 RGBA32 (BGRA32) pixels in XMMn to RGB24 (BGR24) and store
+ * at EDI+(12*n) */
+#define SSE2_RGB32_TO_RGB24(n) "\
+        movd %%xmm"#n", %%eax   # EAX: 00 B0 G0 R0                      \n\
+        psrldq $4, %%xmm"#n"    # XMMn: 00000 0BGR3 0BGR2 0BGR1         \n\
+        movd %%xmm"#n", %%ebx   # EBX: 00 B1 G1 R1                      \n\
+        psrldq $4, %%xmm"#n"    # XMMn: 00000 00000 0BGR3 0BGR2         \n\
+        movd %%xmm"#n", %%ecx   # ECX: 00 B2 G2 R2                      \n\
+        psrldq $4, %%xmm"#n"    # XMMn: 00000 00000 00000 0BGR3         \n\
+        movd %%xmm"#n", %%edx   # EDX: 00 B3 G3 R3                      \n\
+        "IA32_RGB32_TO_RGB24"                                           \n\
+        movl %%eax, 12*"#n"+0("EDI")                                    \n\
+        movl %%ebx, 12*"#n"+4("EDI")                                    \n\
+        movl %%ecx, 12*"#n"+8("EDI")                                    \n"
+
+
+static inline void sse2_store_rgb24(uint8_t *dest)
+{
+    /* It looks like it's fastest to go to RGB32 first, then shift the
+     * result to merge the 24-bit pixels together. */
+    asm(SSE2_RGB_TO_RGBA"                                               \n\
+        "PUSH(EBX)"                                                     \n\
+        "SSE2_RGB32_TO_RGB24(0)"                                        \n\
+        "SSE2_RGB32_TO_RGB24(1)"                                        \n\
+        "SSE2_RGB32_TO_RGB24(2)"                                        \n\
+        "SSE2_RGB32_TO_RGB24(3)"                                        \n\
+        "POP(EBX)"                                                      \n"
+        : /* no outputs */
+        : "D" (dest)
+        : "eax", "ecx", "edx", "esi" COMMA_FAKE_PUSH_REG
+    );
+}
+
+static inline void sse2_store_bgr24(uint8_t *dest)
+{
+    asm(SSE2_RGB_TO_BGRA                                                 "\
+        "PUSH(EBX)"                                                     \n\
+        "SSE2_RGB32_TO_RGB24(0)"                                        \n\
+        "SSE2_RGB32_TO_RGB24(1)"                                        \n\
+        "SSE2_RGB32_TO_RGB24(2)"                                        \n\
+        "SSE2_RGB32_TO_RGB24(3)"                                        \n\
+        "POP(EBX)"                                                      \n"
+        : /* no outputs */
+        : "D" (dest)
+        : "eax", "ecx", "edx", "esi" COMMA_FAKE_PUSH_REG
+    );
+}
+
+/* It would be nice to be able to use movntdq here for a 50% speedup,
+ * but we're not guaranteed alignment... (think 766x512 for example) */
+static inline void sse2_store_rgba32(uint8_t *dest)
+{
+    asm(SSE2_RGB_TO_RGBA                                                 "\
+        movdqu %%xmm0,   ("EDI")                                        \n\
+        movdqu %%xmm1, 16("EDI")                                        \n\
+        movdqu %%xmm2, 32("EDI")                                        \n\
+        movdqu %%xmm3, 48("EDI")                                        \n"
+        : /* no outputs */
+        : "D" (dest)
+    );
+}
+
+static inline void sse2_store_abgr32(uint8_t *dest)
+{
+    asm(SSE2_RGB_TO_BGRA                                                 "\
+        pslldq $1, %%xmm0                                               \n\
+        pslldq $1, %%xmm1                                               \n\
+        pslldq $1, %%xmm2                                               \n\
+        pslldq $1, %%xmm3                                               \n\
+        movdqu %%xmm0,   ("EDI")                                        \n\
+        movdqu %%xmm1, 16("EDI")                                        \n\
+        movdqu %%xmm2, 32("EDI")                                        \n\
+        movdqu %%xmm3, 48("EDI")                                        \n"
+        : /* no outputs */
+        : "D" (dest)
+    );
+}
+
+static inline void sse2_store_argb32(uint8_t *dest)
+{
+    asm(SSE2_RGB_TO_RGBA                                                 "\
+        pslldq $1, %%xmm0                                               \n\
+        pslldq $1, %%xmm1                                               \n\
+        pslldq $1, %%xmm2                                               \n\
+        pslldq $1, %%xmm3                                               \n\
+        movdqu %%xmm0,   ("EDI")                                        \n\
+        movdqu %%xmm1, 16("EDI")                                        \n\
+        movdqu %%xmm2, 32("EDI")                                        \n\
+        movdqu %%xmm3, 48("EDI")                                        \n"
+        : /* no outputs */
+        : "D" (dest)
+    );
+}
+
+static inline void sse2_store_bgra32(uint8_t *dest)
+{
+    asm(SSE2_RGB_TO_BGRA                                                 "\
+        movdqu %%xmm0,   ("EDI")                                        \n\
+        movdqu %%xmm1, 16("EDI")                                        \n\
+        movdqu %%xmm2, 32("EDI")                                        \n\
+        movdqu %%xmm3, 48("EDI")                                        \n"
+        : /* no outputs */
+        : "D" (dest)
+    );
+}
+
+/*************************************************************************/
+
+static inline void sse2_load_rgb24(uint8_t *src);
+static inline void sse2_load_bgr24(uint8_t *src);
+static inline void sse2_load_rgba32(uint8_t *src);
+static inline void sse2_load_abgr32(uint8_t *src);
+static inline void sse2_load_argb32(uint8_t *src);
+static inline void sse2_load_bgra32(uint8_t *src);
+static inline void sse2_rgb_to_yuv420p(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yuv411p(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yuv422p(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yuv444p(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yuy2(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_uyvy(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_yvyu(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+static inline void sse2_rgb_to_y8(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width);
+
+#define DEFINE_RGB2YUV_SSE2(rgb,yuv,rgbsz,rofs,gofs,bofs,slowop) \
+static int rgb##_##yuv##_sse2(uint8_t **src, uint8_t **dest,            \
+                              int width, int height)                    \
+{                                                                       \
+    int x, y;                                                           \
+                                                                        \
+    for (y = 0; y < height; y++) {                                      \
+        for (x = 0; x < (width & ~7); x += 8) {                         \
+            sse2_load_##rgb(src[0]+(y*width+x)*rgbsz);                  \
+            sse2_rgb_to_##yuv(dest[0], dest[1], dest[2], x, y, width);  \
+        }                                                               \
+        while (x < width) {                                             \
+            int r = src[0][(y*width+x)*rgbsz+rofs];                     \
+            int g = src[0][(y*width+x)*rgbsz+gofs];                     \
+            int b = src[0][(y*width+x)*rgbsz+bofs];                     \
+            slowop;                                                     \
+            x++;                                                        \
+        }                                                               \
+    }                                                                   \
+    asm("emms");                                                        \
+    return 1;                                                           \
+}
+
+#define DEFINE_RGB2YUV_SSE2_SET(rgb,sz,r,g,b) \
+    DEFINE_RGB2YUV_SSE2(rgb,yuv420p, sz,r,g,b, RGB2YUV_420P) \
+    DEFINE_RGB2YUV_SSE2(rgb,yuv411p, sz,r,g,b, RGB2YUV_411P) \
+    DEFINE_RGB2YUV_SSE2(rgb,yuv422p, sz,r,g,b, RGB2YUV_422P) \
+    DEFINE_RGB2YUV_SSE2(rgb,yuv444p, sz,r,g,b, RGB2YUV_444P) \
+    DEFINE_RGB2YUV_SSE2(rgb,yuy2,    sz,r,g,b, RGB2YUV_YUY2) \
+    DEFINE_RGB2YUV_SSE2(rgb,uyvy,    sz,r,g,b, RGB2YUV_UYVY) \
+    DEFINE_RGB2YUV_SSE2(rgb,yvyu,    sz,r,g,b, RGB2YUV_YVYU) \
+    DEFINE_RGB2YUV_SSE2(rgb,y8,      sz,r,g,b, RGB2Y())
+
+DEFINE_RGB2YUV_SSE2_SET(rgb24,  3,0,1,2)
+DEFINE_RGB2YUV_SSE2_SET(bgr24,  3,2,1,0)
+DEFINE_RGB2YUV_SSE2_SET(rgba32, 4,0,1,2)
+DEFINE_RGB2YUV_SSE2_SET(abgr32, 4,3,2,1)
+DEFINE_RGB2YUV_SSE2_SET(argb32, 4,1,2,3)
+DEFINE_RGB2YUV_SSE2_SET(bgra32, 4,2,1,0)
+
+/************************************/
+
+/* Split 8 RGBA pixels in XMMr/XMMb into R/G/B in XMM0/XMM1/XMM2.
+ * r and b are 0 and 2 for RGB, 2 and 0 for BGR */
+#define SSE2_SPLIT_RGB32(r,b) "\
+        movdqa 176("EDI"), %%xmm7       # XMM7: 00FF*8                  \n\
+        movdqa %%xmm"#r", %%xmm1        # XMM1: XBGR3 XBGR2 XBGR1 XBGR0 \n\
+        movdqa %%xmm"#b", %%xmm3        # XMM3: XBGR7 XBGR6 XBGR5 XBGR4 \n\
+        pand %%xmm7, %%xmm"#r"          # XMMr: B3 R3 B2 R2 B1 R1 B0 R0 \n\
+        psrld $8, %%xmm1                # XMM1: -XBG3 -XBG2 -XBG1 -XBG0 \n\
+        pand %%xmm7, %%xmm"#b"          # XMMb: B7 R7 B6 R6 B5 R5 B4 R4 \n\
+        psrld $8, %%xmm3                # XMM3: -XBG7 -XBG6 -XBG5 -XBG4 \n\
+        pand %%xmm7, %%xmm1             # XMM1: XX G3 XX G2 XX G1 XX G0 \n\
+        packuswb %%xmm"#b", %%xmm"#r"   # XMMr: B7 R7 ........... B0 R0 \n\
+        pand %%xmm7, %%xmm3             # XMM3: XX G7 XX G6 XX G5 XX G4 \n\
+        movdqa %%xmm"#r", %%xmm"#b"     # XMMb: B7 R7 ........... B0 R0 \n\
+        packuswb %%xmm3, %%xmm1         # XMM1: XX G7 ........... XX G0 \n\
+        pand %%xmm7, %%xmm"#r"          # XMMr: R7 R6 R5 R4 R3 R2 R1 R0 \n\
+        psrlw $8, %%xmm"#b"             # XMMb: B7 B6 B5 B4 B3 B2 B1 B0 \n\
+        pand %%xmm7, %%xmm1             # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n"
+
+static inline void sse2_load_rgb24(uint8_t *src)
+{
+    asm("\
+        "PUSH(EBX)"                                                     \n\
+        # Make stack space for loading XMM registers                    \n"
+#ifdef ARCH_X86_64
+"       sub $24+128, "ESP"                                              \n"
+#else
+"       sub $24, "ESP"                                                  \n"
+#endif
+"       # Copy source pixels to appropriate positions in stack (this    \n\
+        # seems to be the fastest way to get them where we want them)   \n\
+        movl $8, %%ebx                                                  \n\
+        movl $24, %%edx                                                 \n\
+        0:                                                              \n\
+        movb -3("ESI","EDX"), %%al                                      \n\
+        movb %%al, 0-1("ESP","EBX")                                     \n\
+        movb -2("ESI","EDX"), %%al                                      \n\
+        movb %%al, 8-1("ESP","EBX")                                     \n\
+        movb -1("ESI","EDX"), %%al                                      \n\
+        movb %%al, 16-1("ESP","EBX")                                    \n\
+        subl $3, %%edx                                                  \n\
+        subl $1, %%ebx                                                  \n\
+        jnz 0b                                                          \n\
+        # Load XMM0-XMM2 with R/G/B values and expand to 16-bit         \n\
+        pxor %%xmm7, %%xmm7                                             \n\
+        movq ("ESP"), %%xmm0                                            \n\
+        punpcklbw %%xmm7, %%xmm0                                        \n\
+        movq 8("ESP"), %%xmm1                                           \n\
+        punpcklbw %%xmm7, %%xmm1                                        \n\
+        movq 16("ESP"), %%xmm2                                          \n\
+        punpcklbw %%xmm7, %%xmm2                                        \n"
+#ifdef ARCH_X86_64
+"       add $24+128, "ESP"                                              \n"
+#else
+"       add $24, "ESP"                                                  \n"
+#endif
+"       "POP(EBX)"                                                      \n"
+        : /* no outputs */
+        : "S" (src)
+        : "eax", "ecx", "edx", "edi" COMMA_FAKE_PUSH_REG
+    );
+}
+
+static inline void sse2_load_bgr24(uint8_t *src)
+{
+    /* Load as RGB and swap registers */
+    sse2_load_rgb24(src);
+    asm("\
+        movdqa %%xmm0, %%xmm3                                           \n\
+        movdqa %%xmm2, %%xmm0                                           \n\
+        movdqa %%xmm3, %%xmm2                                           \n"
+        : /* no outputs */
+        : /* no inputs */
+    );
+}
+
+static inline void sse2_load_rgba32(uint8_t *src)
+{
+    asm("\
+        movdqu ("ESI"), %%xmm0          # XMM0: XBGR3 XBGR2 XBGR1 XBGR0 \n\
+        movdqu 16("ESI"), %%xmm2        # XMM2: XBGR7 XBGR6 XBGR5 XBGR4 \n\
+        "SSE2_SPLIT_RGB32(0,2)"                                         \n"
+        : /* no outputs */
+        : "S" (src), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+static inline void sse2_load_abgr32(uint8_t *src)
+{
+    asm("\
+        movdqu ("ESI"), %%xmm2          # XMM2: RGBX3 RGBX2 RGBX1 RGBX0 \n\
+        movdqu 16("ESI"), %%xmm0        # XMM0: RGBX7 RGBX6 RGBX5 RGBX4 \n\
+        psrld $8, %%xmm2                # XMM2: -RGB3 -RGB2 -RGB1 -RGB0 \n\
+        psrld $8, %%xmm0                # XMM0: -RGB7 -RGB6 -RGB5 -RGB4 \n\
+        "SSE2_SPLIT_RGB32(2,0)"                                         \n"
+        : /* no outputs */
+        : "S" (src), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+static inline void sse2_load_argb32(uint8_t *src)
+{
+    asm("\
+        movdqu ("ESI"), %%xmm0          # XMM0: BGRX3 BGRX2 BGRX1 BGRX0 \n\
+        movdqu 16("ESI"), %%xmm2        # XMM2: BGRX7 BGRX6 BGRX5 BGRX4 \n\
+        psrld $8, %%xmm0                # XMM0: -BGR3 -BGR2 -BGR1 -BGR0 \n\
+        psrld $8, %%xmm2                # XMM2: -BGR7 -BGR6 -BGR5 -BGR4 \n\
+        "SSE2_SPLIT_RGB32(0,2)"                                         \n"
+        : /* no outputs */
+        : "S" (src), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+static inline void sse2_load_bgra32(uint8_t *src)
+{
+    asm("\
+        movdqu ("ESI"), %%xmm2          # XMM2: XRGB3 XRGB2 XRGB1 XRGB0 \n\
+        movdqu 16("ESI"), %%xmm0        # XMM0: XRGB7 XRGB6 XRGB5 XRGB4 \n\
+        "SSE2_SPLIT_RGB32(2,0)"                                         \n"
+        : /* no outputs */
+        : "S" (src), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+/************************************/
+
+#define SSE2_RGB2Y "\
+        # Make RGB data into 8.6 fixed-point, then create 8.6           \n\
+        # fixed-point Y data in XMM3                                    \n\
+        psllw $6, %%xmm0                                                \n\
+        movdqa %%xmm0, %%xmm3                                           \n\
+        pmulhuw ("EDI"), %%xmm3                                         \n\
+        psllw $6, %%xmm1                                                \n\
+        movdqa %%xmm1, %%xmm6                                           \n\
+        pmulhuw 16("EDI"), %%xmm6                                       \n\
+        psllw $6, %%xmm2                                                \n\
+        movdqa %%xmm2, %%xmm7                                           \n\
+        pmulhuw 32("EDI"), %%xmm7                                       \n\
+        paddw %%xmm6, %%xmm3    # No possibility of overflow            \n\
+        paddw %%xmm7, %%xmm3                                            \n\
+        paddw 144("EDI"), %%xmm3                                        \n"
+#define SSE2_RGB2U "\
+        # Create 8.6 fixed-point U data in XMM4                         \n\
+        movdqa %%xmm0, %%xmm4                                           \n\
+        pmulhw 48("EDI"), %%xmm4                                        \n\
+        movdqa %%xmm1, %%xmm6                                           \n\
+        pmulhw 64("EDI"), %%xmm6                                        \n\
+        movdqa %%xmm2, %%xmm7                                           \n\
+        pmulhw 80("EDI"), %%xmm7                                        \n\
+        paddw %%xmm6, %%xmm4                                            \n\
+        paddw %%xmm7, %%xmm4                                            \n\
+        paddw 160("EDI"), %%xmm4                                        \n"
+#define SSE2_RGB2U0 "\
+        # Create 8.6 fixed-point U data in XMM0                         \n\
+        pmulhw 48("EDI"), %%xmm0                                        \n\
+        pmulhw 64("EDI"), %%xmm1                                        \n\
+        pmulhw 80("EDI"), %%xmm2                                        \n\
+        paddw %%xmm1, %%xmm0                                            \n\
+        paddw %%xmm2, %%xmm0                                            \n\
+        paddw 160("EDI"), %%xmm0                                        \n"
+#define SSE2_RGB2V "\
+        # Create 8.6 fixed-point V data in XMM0                         \n\
+        pmulhw 96("EDI"), %%xmm0                                        \n\
+        pmulhw 112("EDI"), %%xmm1                                       \n\
+        pmulhw 128("EDI"), %%xmm2                                       \n\
+        paddw %%xmm1, %%xmm0                                            \n\
+        paddw %%xmm2, %%xmm0                                            \n\
+        paddw 160("EDI"), %%xmm0                                        \n"
+#define SSE2_PACKYU "\
+        # Shift back down to 8-bit values                               \n\
+        psraw $6, %%xmm3                                                \n\
+        psraw $6, %%xmm0                                                \n\
+        # Pack into bytes                                               \n\
+        pxor %%xmm7, %%xmm7                                             \n\
+        packuswb %%xmm7, %%xmm3                                         \n\
+        packuswb %%xmm7, %%xmm0                                         \n"
+#define SSE2_PACKYUV "\
+        # Shift back down to 8-bit values                               \n\
+        psraw $6, %%xmm3                                                \n\
+        psraw $6, %%xmm4                                                \n\
+        psraw $6, %%xmm0                                                \n\
+        # Pack into bytes                                               \n\
+        pxor %%xmm7, %%xmm7                                             \n\
+        packuswb %%xmm7, %%xmm3                                         \n\
+        packuswb %%xmm7, %%xmm4                                         \n\
+        packuswb %%xmm7, %%xmm0                                         \n"
+#define SSE2_STRIPU(N) "\
+        # Remove every odd U value                                      \n\
+        pand 176("EDI"), %%xmm"#N"                                      \n\
+        packuswb %%xmm7, %%xmm"#N"                                      \n"
+#define SSE2_STRIPV "\
+        # Remove every even V value                                     \n\
+        psrlw $8, %%xmm0                                                \n\
+        packuswb %%xmm7, %%xmm0                                         \n"
+
+static inline void sse2_rgb_to_yuv420p(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+    if (y%2 == 0) {
+        asm("\
+            "SSE2_RGB2Y"                                                \n\
+            "SSE2_RGB2U0"                                               \n\
+            "SSE2_PACKYU"                                               \n\
+            "SSE2_STRIPU(0)"                                            \n\
+            # Store into destination pointers                           \n\
+            movq %%xmm3, ("EAX")                                        \n\
+            movd %%xmm0, ("ECX")                                        \n"
+            : /* no outputs */
+            : "a" (destY+y*width+x), "c" (destU+(y/2)*(width/2)+(x/2)),
+              "D" (&rgb_data), "m" (rgb_data)
+        );
+    } else {
+        asm("\
+            "SSE2_RGB2Y"                                                \n\
+            "SSE2_RGB2V"                                                \n\
+            "SSE2_PACKYU"                                               \n\
+            "SSE2_STRIPV"                                               \n\
+            # Store into destination pointers                           \n\
+            movq %%xmm3, ("EAX")                                        \n\
+            movd %%xmm0, ("EDX")                                        \n"
+            : /* no outputs */
+            : "a" (destY+y*width+x), "d" (destV+(y/2)*(width/2)+(x/2)),
+              "D" (&rgb_data), "m" (rgb_data)
+        );
+    }
+}
+
+static inline void sse2_rgb_to_yuv411p(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+    asm("\
+        "SSE2_RGB2Y"                                                    \n\
+        "SSE2_RGB2U"                                                    \n\
+        "SSE2_RGB2V"                                                    \n\
+        "SSE2_PACKYUV"                                                  \n\
+        "SSE2_STRIPU(4)"                                                \n\
+        "SSE2_STRIPU(4)"                                                \n\
+        "SSE2_STRIPU(0)"                                                \n\
+        "SSE2_STRIPV"                                                   \n\
+        # Store into destination pointers                               \n\
+        movq %%xmm3, ("EAX")                                            \n\
+        "PUSH(EAX)"  # needed because GCC might rely on it later        \n\
+        movd %%xmm4, %%eax                                              \n\
+        movw %%ax, ("ECX")                                              \n\
+        movd %%xmm0, %%eax                                              \n\
+        movw %%ax, ("EDX")                                              \n\
+        "POP(EAX)"                                                      \n"
+        : /* no outputs */
+        : "a" (destY+y*width+x), "c" (destU+y*(width/4)+(x/4)),
+          "d" (destV+y*(width/4)+(x/4)), "D" (&rgb_data), "m" (rgb_data)
+#ifdef ARCH_X86_64
+        : FAKE_PUSH_REG
+#endif
+    );
+}
+
+static inline void sse2_rgb_to_yuv422p(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+    asm("\
+        "SSE2_RGB2Y"                                                    \n\
+        "SSE2_RGB2U"                                                    \n\
+        "SSE2_RGB2V"                                                    \n\
+        "SSE2_PACKYUV"                                                  \n\
+        "SSE2_STRIPU(4)"                                                \n\
+        "SSE2_STRIPV"                                                   \n\
+        # Store into destination pointers                               \n\
+        movq %%xmm3, ("EAX")                                            \n\
+        movd %%xmm4, ("ECX")                                            \n\
+        movd %%xmm0, ("EDX")                                            \n"
+        : /* no outputs */
+        : "a" (destY+y*width+x), "c" (destU+y*(width/2)+(x/2)),
+          "d" (destV+y*(width/2)+(x/2)), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+static inline void sse2_rgb_to_yuv444p(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+    asm("\
+        "SSE2_RGB2Y"                                                    \n\
+        "SSE2_RGB2U"                                                    \n\
+        "SSE2_RGB2V"                                                    \n\
+        "SSE2_PACKYUV"                                                  \n\
+        # Store into destination pointers                               \n\
+        movq %%xmm3, ("EAX")                                            \n\
+        movq %%xmm4, ("ECX")                                            \n\
+        movq %%xmm0, ("EDX")                                            \n"
+        : /* no outputs */
+        : "a" (destY+y*width+x), "c" (destU+y*width+x), "d" (destV+y*width+x),
+          "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+static inline void sse2_rgb_to_yuy2(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+    asm("\
+        "SSE2_RGB2Y"                                                    \n\
+        "SSE2_RGB2U"                                                    \n\
+        "SSE2_RGB2V"                                                    \n\
+        "SSE2_PACKYUV"                                                  \n\
+        "SSE2_STRIPU(4)"                                                \n\
+        "SSE2_STRIPV"                                                   \n\
+        # Interleave Y/U/V                                              \n\
+        punpcklbw %%xmm0, %%xmm4                                        \n\
+        punpcklbw %%xmm4, %%xmm3                                        \n\
+        # Store into destination pointer                                \n\
+        movdqu %%xmm3, ("EAX")                                          \n"
+        : /* no outputs */
+        : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+static inline void sse2_rgb_to_uyvy(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+    asm("\
+        "SSE2_RGB2Y"                                                    \n\
+        "SSE2_RGB2U"                                                    \n\
+        "SSE2_RGB2V"                                                    \n\
+        "SSE2_PACKYUV"                                                  \n\
+        "SSE2_STRIPU(4)"                                                \n\
+        "SSE2_STRIPV"                                                   \n\
+        # Interleave Y/U/V                                              \n\
+        punpcklbw %%xmm0, %%xmm4                                        \n\
+        punpcklbw %%xmm3, %%xmm4                                        \n\
+        # Store into destination pointer                                \n\
+        movdqu %%xmm4, ("EAX")                                          \n"
+        : /* no outputs */
+        : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+static inline void sse2_rgb_to_yvyu(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+    asm("\
+        "SSE2_RGB2Y"                                                    \n\
+        "SSE2_RGB2U"                                                    \n\
+        "SSE2_RGB2V"                                                    \n\
+        "SSE2_PACKYUV"                                                  \n\
+        # Remove every odd V value                                      \n\
+        pand 176("EDI"), %%xmm0                                         \n\
+        packuswb %%xmm7, %%xmm0                                         \n\
+        # Remove every even U value                                     \n\
+        psrlw $8, %%xmm4                                                \n\
+        packuswb %%xmm7, %%xmm4                                         \n\
+        # Interleave Y/U/V                                              \n\
+        punpcklbw %%xmm4, %%xmm0                                        \n\
+        punpcklbw %%xmm0, %%xmm3                                        \n\
+        # Store into destination pointer                                \n\
+        movdqu %%xmm3, ("EAX")                                          \n"
+        : /* no outputs */
+        : "a" (destY+(y*width+x)*2), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+static inline void sse2_rgb_to_y8(
+    uint8_t *destY, uint8_t *destU, uint8_t *destV, int x, int y, int width)
+{
+    asm("\
+        psllw $6, %%xmm0                                                \n\
+        pmulhuw ("EDI"), %%xmm0                                         \n\
+        psllw $6, %%xmm1                                                \n\
+        pmulhuw 16("EDI"), %%xmm1                                       \n\
+        psllw $6, %%xmm2                                                \n\
+        pmulhuw 32("EDI"), %%xmm2                                       \n\
+        paddw %%xmm1, %%xmm0            # No possibility of overflow    \n\
+        paddw %%xmm2, %%xmm0                                            \n\
+        paddw 144("EDI"), %%xmm0                                        \n\
+        psraw $6, %%xmm0                                                \n\
+        packuswb %%xmm0, %%xmm0                                         \n\
+        movq %%xmm0, ("EAX")                                            \n"
+        : /* no outputs */
+        : "a" (destY+y*width+x), "D" (&rgb_data), "m" (rgb_data)
+    );
+}
+
+/*************************************************************************/
+
+static int yuvp_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    asm("movdqa ("EDX"), %%xmm7         # constant: 255/219             \n\
+        movdqa 32("EDX"), %%xmm6        # constant: 16                  \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 16,
+        /* push_regs */ "",
+        /* pop_regs  */ "",
+        /* small_loop */ "\
+        movzbl -1("ESI","ECX"), %%eax   # retrieve Y byte               \n\
+        subl $16, %%eax                 # subtract 16                   \n\
+        imull %3, %%eax                 # multiply by 255/219           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        testb %%ah, %%ah                # saturate to 0..255            \n\
+        movl $-1, %%edx                 # (trash EDX, we don't need it  \n\
+        cmovnz %%edx, %%eax             #  anymore)                     \n\
+        movl $0, %%edx                                                  \n\
+        cmovs %%edx, %%eax                                              \n\
+        movb %%al, -1("EDI","ECX")      # and store                     \n",
+        /* main_loop */ "\
+        movdqu -16("ESI","ECX"), %%xmm0 # XMM0: Y15..Y0                 \n\
+        movdqa %%xmm0, %%xmm1           # XMM1: Y15..Y0                 \n\
+        punpcklbw %%xmm4, %%xmm0        # XMM0: Y7..Y0                  \n\
+        psubw %%xmm6, %%xmm0            # XMM0: unbias by 16            \n\
+        psllw $2, %%xmm0                # XMM0: fixed point 8.2         \n\
+        pmulhw %%xmm7, %%xmm0           # XMM0: multiply by 255/219>>2  \n\
+        punpckhbw %%xmm4, %%xmm1        # XMM1: Y15..Y8 << 8            \n\
+        psubw %%xmm6, %%xmm1            # XMM1: unbias by 16            \n\
+        psllw $2, %%xmm1                # XMM1: fixed point 8.2         \n\
+        pmulhw %%xmm7, %%xmm1           # XMM1: multiply by 255/219>>2  \n\
+        packuswb %%xmm1, %%xmm0         # XMM0: G15..G0, saturated      \n\
+        movdqu %%xmm0, -16("EDI","ECX")                                 \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+static int yuy2_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    asm("movdqa ("EDX"), %%xmm7         # constant: 255/219             \n\
+        movdqa 32("EDX"), %%xmm6        # constant: 16                  \n\
+        pcmpeqd %%xmm5, %%xmm5                                          \n\
+        psrlw $8, %%xmm5                # constant: 0x00FF              \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 8,
+        /* push_regs */ "",
+        /* pop_regs  */ "",
+        /* small_loop */ "\
+        movzbl -2("ESI","ECX",2), %%eax # retrieve Y byte               \n\
+        subl $16, %%eax                 # subtract 16                   \n\
+        imull %3, %%eax                 # multiply by 255/219           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        testb %%ah, %%ah                # saturate to 0..255            \n\
+        movl $-1, %%edx                 # (trash EDX, we don't need it  \n\
+        cmovnz %%edx, %%eax             #  anymore)                     \n\
+        movl $0, %%edx                                                  \n\
+        cmovs %%edx, %%eax                                              \n\
+        movb %%al, -1("EDI","ECX")      # and store                     \n",
+        /* main_loop */ "\
+        movdqu -16("ESI","ECX",2),%%xmm0 #XMM0: V3 Y7..U0 Y0            \n\
+        pand %%xmm5, %%xmm0             # XMM0: Y7..Y0                  \n\
+        psubw %%xmm6, %%xmm0            # XMM0: unbias by 16            \n\
+        psllw $2, %%xmm0                # XMM0: fixed point 8.2         \n\
+        pmulhw %%xmm7, %%xmm0           # XMM0: multiply by 255/219>>2  \n\
+        packuswb %%xmm0, %%xmm0         # XMM0: G7..G0, saturated       \n\
+        movq %%xmm0, -8("EDI","ECX")                                    \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+static int uyvy_gray8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    asm("movdqa ("EDX"), %%xmm7         # constant: 255/219             \n\
+        movdqa 32("EDX"), %%xmm6                                        \n\
+        psllw $2, %%xmm6                # constant: 16<<2               \n\
+        pcmpeqd %%xmm5, %%xmm5                                          \n\
+        psllw $8, %%xmm5                # constant: 0xFF00              \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 8,
+        /* push_regs */ "",
+        /* pop_regs  */ "",
+        /* small_loop */ "\
+        movzbl -1("ESI","ECX",2), %%eax # retrieve Y byte               \n\
+        subl $16, %%eax                 # subtract 16                   \n\
+        imull %3, %%eax                 # multiply by 255/219           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        testb %%ah, %%ah                # saturate to 0..255            \n\
+        movl $-1, %%edx                 # (trash EDX, we don't need it  \n\
+        cmovnz %%edx, %%eax             #  anymore)                     \n\
+        movl $0, %%edx                                                  \n\
+        cmovs %%edx, %%eax                                              \n\
+        movb %%al, -1("EDI","ECX")      # and store                     \n",
+        /* main_loop */ "\
+        movdqu -16("ESI","ECX",2),%%xmm0 #XMM0: Y7 V3..Y0 U0            \n\
+        pand %%xmm5, %%xmm0             # XMM0: Y7..Y0 << 8             \n\
+        psrlw $6, %%xmm0                # XMM0: fixed point 8.2         \n\
+        psubw %%xmm6, %%xmm0            # XMM0: unbias by 16            \n\
+        pmulhw %%xmm7, %%xmm0           # XMM0: multiply by 255/219>>2  \n\
+        packuswb %%xmm0, %%xmm0         # XMM0: G7..G0, saturated       \n\
+        movq %%xmm0, -8("EDI","ECX")                                    \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+/*************************************************************************/
+
+static int gray8_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    asm("movdqa 16("EDX"), %%xmm7       # constant: 219/255             \n\
+        movdqa 32("EDX"), %%xmm6        # constant: 16                  \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 16,
+        /* push_regs */ "",
+        /* pop_regs  */ "",
+        /* small_loop */ "\
+        movzbl -1("ESI","ECX"), %%eax   # retrieve gray byte            \n\
+        imull %3, %%eax                 # multiply by 219/255           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        addl $16, %%eax                 # add 16                        \n\
+        movb %%al, -1("EDI","ECX")      # and store                     \n",
+        /* main_loop */ "\
+        movdqu -16("ESI","ECX"), %%xmm2 # XMM2: G15..G0                 \n\
+        movdqa %%xmm4, %%xmm0                                           \n\
+        punpcklbw %%xmm2, %%xmm0        # XMM0: G7..G0 << 8             \n\
+        pmulhuw %%xmm7, %%xmm0          # XMM0: multiply by 219/255>>2  \n\
+        movdqa %%xmm4, %%xmm1                                           \n\
+        punpckhbw %%xmm2, %%xmm1        # XMM1: G15..G8 << 8            \n\
+        pmulhuw %%xmm7, %%xmm1          # XMM1: multiply by 219/255>>2  \n\
+        psrlw $6, %%xmm0                # XMM0: shift down to 8 bits    \n\
+        paddw %%xmm6, %%xmm0            # XMM0: bias by 16              \n\
+        psrlw $6, %%xmm1                # XMM1: shift down to 8 bits    \n\
+        paddw %%xmm6, %%xmm1            # XMM1: bias by 16              \n\
+        packuswb %%xmm1, %%xmm0         # XMM0: Y15..Y0                 \n\
+        movdqu %%xmm0, -16("EDI","ECX")                                 \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+static int gray8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    asm("movdqa 16("EDX"), %%xmm7       # constant: 219/255             \n\
+        movdqa 32("EDX"), %%xmm6        # constant: 16                  \n\
+        pcmpeqd %%xmm5, %%xmm5                                          \n\
+        psllw $15, %%xmm5               # constant: 0x8000              \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 8,
+        /* push_regs */ "",
+        /* pop_regs  */ "",
+        /* small_loop */ "\
+        movzbl -1("ESI","ECX"), %%eax   # retrieve gray byte            \n\
+        imull %3, %%eax                 # multiply by 219/255           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        addl $16, %%eax                 # add 16                        \n\
+        movb %%al, -2("EDI","ECX",2)    # and store                     \n\
+        movb $128, -1("EDI","ECX",2)    # store 128 in U/V byte         \n",
+        /* main_loop */ "\
+        movq -8("ESI","ECX"), %%xmm2    # XMM2: G5..G0                  \n\
+        movdqa %%xmm4, %%xmm0                                           \n\
+        punpcklbw %%xmm2, %%xmm0        # XMM0: G7..G0 << 8             \n\
+        pmulhuw %%xmm7, %%xmm0          # XMM0: multiply by 219/255>>2  \n\
+        psrlw $6, %%xmm0                # XMM0: shift down to 8 bits    \n\
+        paddw %%xmm6, %%xmm0            # XMM0: bias by 16              \n\
+        por %%xmm5, %%xmm0              # XMM0: OR in U/V bytes         \n\
+        movdqu %%xmm0, -16("EDI","ECX",2)                               \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+static int gray8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height) {
+    asm("movdqa 16("EDX"), %%xmm7       # constant: 219/255             \n\
+        movdqa 32("EDX"), %%xmm6                                        \n\
+        psllw $8, %%xmm6                # constant: 16 << 8             \n\
+        pcmpeqd %%xmm5, %%xmm5                                          \n\
+        psllw $15, %%xmm5                                               \n\
+        psrlw $8, %%xmm5                # constant: 0x0080              \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n\
+        pcmpeqd %%xmm3, %%xmm3                                          \n\
+        psllw $8, %%xmm3                # constant: 0xFF00              \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 8,
+        /* push_regs */ "",
+        /* pop_regs  */ "",
+        /* small_loop */ "\
+        movzbl -1("ESI","ECX"), %%eax   # retrieve gray byte            \n\
+        imull %3, %%eax                 # multiply by 219/255           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        addl $16, %%eax                 # add 16                        \n\
+        movb %%al, -1("EDI","ECX",2)    # and store                     \n\
+        movb $128, -2("EDI","ECX",2)    # store 128 in U/V byte         \n",
+        /* main_loop */ "\
+        movq -8("ESI","ECX"), %%xmm2    # XMM2: G5..G0                  \n\
+        movdqa %%xmm4, %%xmm0                                           \n\
+        punpcklbw %%xmm2, %%xmm0        # XMM0: G7..G0 << 8             \n\
+        pmulhuw %%xmm7, %%xmm0          # XMM0: multiply by 219/255>>2  \n\
+        psllw $2, %%xmm0                # XMM0: shift results to hi byte\n\
+        pand %%xmm3, %%xmm0             # XMM0: clear low byte          \n\
+        paddw %%xmm6, %%xmm0            # XMM0: bias by 16              \n\
+        por %%xmm5, %%xmm0              # XMM0: OR in U/V bytes         \n\
+        movdqu %%xmm0, -16("EDI","ECX",2)                               \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (GRAY_Y), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+/*************************************************************************/
+
+static int y8_rgb24_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    asm("movdqa ("EDX"), %%xmm7         # constant: 255/219             \n\
+        movdqa 32("EDX"), %%xmm6        # constant: 16                  \n\
+        movdqa 48("EDX"), %%xmm5        # constant: bytes 0/3/6/9 mask  \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 4,
+        /* push_regs */ "push "EBX,
+        /* pop_regs  */ "pop "EBX,
+        /* small_loop */ "\
+        lea ("ECX","ECX",2), "EDX"      # 3*count for RGB offset        \n\
+        movzbl -1("ESI","ECX"), %%eax   # retrieve Y byte               \n\
+        subl $16, %%eax                 # subtract 16                   \n\
+        imull %3, %%eax                 # multiply by 255/219           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        testb %%ah, %%ah                # saturate to 0..255            \n\
+        movl $-1, %%ebx                                                 \n\
+        cmovnz %%ebx, %%eax                                             \n\
+        movl $0, %%ebx                                                  \n\
+        cmovs %%ebx, %%eax                                              \n\
+        movb %%al, -3("EDI","EDX")      # and store                     \n\
+        movb %%al, -2("EDI","EDX")                                      \n\
+        movb %%al, -1("EDI","EDX")                                      \n",
+        /* main_loop */ "\
+        lea ("ECX","ECX",2), "EDX"                                      \n\
+        movd -4("ESI","ECX"), %%xmm0    # XMM0: Y3..Y0                  \n\
+        punpcklbw %%xmm4, %%xmm0        # XMM0: Y3..Y0 in 16 bits       \n\
+        psubw %%xmm6, %%xmm0            # XMM0: unbias by 16            \n\
+        psllw $2, %%xmm0                # XMM0: fixed point 8.2         \n\
+        pmulhw %%xmm7, %%xmm0           # XMM0: multiply by 255/219>>2  \n\
+        packuswb %%xmm0, %%xmm0         # XMM0: G3..G0, saturated       \n\
+        pshuflw $0x50, %%xmm0, %%xmm0   # X0.l: G3 G2 G3 G2 G1 G0 G1 G0 \n\
+        pshufhw $0x55, %%xmm0, %%xmm0   # X0.h: G3 G2 G3 G2 G3 G2 G3 G2 \n\
+        pand %%xmm5, %%xmm0             # XMM0: ------3--2--1--0        \n\
+        movdqa %%xmm0, %%xmm1           # XMM1: ------3--2--1--0        \n\
+        pslldq $1, %%xmm1               # XMM1: -----3--2--1--0-        \n\
+        movdqa %%xmm0, %%xmm2           # XMM2: ------3--2--1--0        \n\
+        pslldq $2, %%xmm2               # XMM2: ----3--2--1--0--        \n\
+        por %%xmm1, %%xmm0              # XMM0: -----33-22-11-00        \n\
+        por %%xmm2, %%xmm0              # XMM0: ----333222111000        \n\
+        movd %%xmm0, -12("EDI","EDX")                                   \n\
+        pshufd $0xC9, %%xmm0, %%xmm0                                    \n\
+        movq %%xmm0, -8("EDI","EDX")                                    \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+/* 4BPP is slightly easier... */
+static int y8_rgba32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    asm("movdqa ("EDX"), %%xmm7         # constant: 255/219             \n\
+        movdqa 32("EDX"), %%xmm6        # constant: 16                  \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 4,
+        /* push_regs */ "",
+        /* pop_regs  */ "",
+        /* small_loop */ "\
+        movzbl -1("ESI","ECX"), %%eax   # retrieve Y byte               \n\
+        subl $16, %%eax                 # subtract 16                   \n\
+        imull %3, %%eax                 # multiply by 255/219           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        testb %%ah, %%ah                # saturate to 0..255            \n\
+        movl $-1, %%edx                                                 \n\
+        cmovnz %%edx, %%eax                                             \n\
+        movl $0, %%edx                                                  \n\
+        cmovs %%edx, %%eax                                              \n\
+        movb %%al, -4("EDI","ECX",4)      # and store                   \n\
+        movb %%al, -3("EDI","ECX",4)                                    \n\
+        movb %%al, -2("EDI","ECX",4)                                    \n",
+        /* main_loop */ "\
+        movd -4("ESI","ECX"), %%xmm0    # XMM0: Y3..Y0                  \n\
+        punpcklbw %%xmm4, %%xmm0        # XMM0: Y3..Y0 in 16 bits       \n\
+        psubw %%xmm6, %%xmm0            # XMM0: unbias by 16            \n\
+        psllw $2, %%xmm0                # XMM0: fixed point 8.2         \n\
+        pmulhw %%xmm7, %%xmm0           # XMM0: multiply by 255/219>>2  \n\
+        packuswb %%xmm0, %%xmm0         # XMM0: G3..G0, saturated       \n\
+        punpcklbw %%xmm4, %%xmm0        # XMM0: G3..G0 in 16 bits       \n\
+        punpcklbw %%xmm4, %%xmm0        # XMM0: ---3---2---1---0        \n\
+        movdqa %%xmm0, %%xmm1           # XMM1: ---3---2---1---0        \n\
+        pslldq $1, %%xmm1               # XMM1: --3---2---1---0-        \n\
+        movdqa %%xmm0, %%xmm2           # XMM2: ---3---2---1---0        \n\
+        pslldq $2, %%xmm2               # XMM2: -3---2---1---0--        \n\
+        por %%xmm1, %%xmm0              # XMM0: --33--22--11--00        \n\
+        por %%xmm2, %%xmm0              # XMM0: -333-222-111-000        \n\
+        movntdq %%xmm0, -16("EDI","ECX",4)                              \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+static int y8_argb32_sse2(uint8_t **src, uint8_t **dest, int width, int height)
+{
+    asm("movdqa ("EDX"), %%xmm7         # constant: 255/219             \n\
+        movdqa 32("EDX"), %%xmm6        # constant: 16                  \n\
+        pxor %%xmm4, %%xmm4             # constant: 0                   \n"
+        SIMD_LOOP_WRAPPER(
+        /* blocksize */ 4,
+        /* push_regs */ "",
+        /* pop_regs  */ "",
+        /* small_loop */ "\
+        movzbl -1("ESI","ECX"), %%eax   # retrieve Y byte               \n\
+        subl $16, %%eax                 # subtract 16                   \n\
+        imull %3, %%eax                 # multiply by 255/219           \n\
+        shrl $14, %%eax                 # shift down to 8 bits          \n\
+        testb %%ah, %%ah                # saturate to 0..255            \n\
+        movl $-1, %%edx                                                 \n\
+        cmovnz %%edx, %%eax                                             \n\
+        movl $0, %%edx                                                  \n\
+        cmovs %%edx, %%eax                                              \n\
+        movb %%al, -3("EDI","ECX",4)      # and store                   \n\
+        movb %%al, -2("EDI","ECX",4)                                    \n\
+        movb %%al, -1("EDI","ECX",4)                                    \n",
+        /* main_loop */ "\
+        movd -4("ESI","ECX"), %%xmm0    # XMM0: Y3..Y0                  \n\
+        punpcklbw %%xmm4, %%xmm0        # XMM0: Y3..Y0 in 16 bits       \n\
+        psubw %%xmm6, %%xmm0            # XMM0: unbias by 16            \n\
+        psllw $2, %%xmm0                # XMM0: fixed point 8.2         \n\
+        pmulhw %%xmm7, %%xmm0           # XMM0: multiply by 255/219>>2  \n\
+        packuswb %%xmm0, %%xmm0         # XMM0: G3..G0, saturated       \n\
+        punpcklbw %%xmm4, %%xmm0        # XMM0: G3..G0 in 16 bits       \n\
+        movdqa %%xmm4, %%xmm3           # XMM3: 0                       \n\
+        punpcklbw %%xmm0, %%xmm3        # XMM3: --3---2---1---0-        \n\
+        movdqa %%xmm3, %%xmm1           # XMM1: --3---2---1---0-        \n\
+        pslldq $1, %%xmm1               # XMM1: -3---2---1---0--        \n\
+        movdqa %%xmm3, %%xmm2           # XMM2: --3---2---1---0-        \n\
+        pslldq $2, %%xmm2               # XMM2: 3---2---1---0---        \n\
+        por %%xmm1, %%xmm3              # XMM3: -33--22--11--00-        \n\
+        por %%xmm2, %%xmm3              # XMM3: 333-222-111-000-        \n\
+        movntdq %%xmm3, -16("EDI","ECX",4)                              \n",
+        /* emms */ "emms")
+        : /* no outputs */
+        : "S" (src[0]), "D" (dest[0]), "c" (width*height),
+          "i" (Y_GRAY), "d" (&gray_data), "m" (gray_data)
+        : "eax");
+    return 1;
+}
+
+/*************************************************************************/
+
+#endif  /* HAVE_ASM_SSE2 */
+
+/*************************************************************************/
+/*************************************************************************/
+
+/* Initialization */
+
+int ac_imgconvert_init_yuv_rgb(int accel)
+{
+    /******** Standard C implementations ********/
+
+    //---- YUV->RGB ----//
+
+    if (!register_conversion(IMG_YUV420P, IMG_RGB24,   yuv420p_rgb24)
+     || !register_conversion(IMG_YUV411P, IMG_RGB24,   yuv411p_rgb24)
+     || !register_conversion(IMG_YUV422P, IMG_RGB24,   yuv422p_rgb24)
+     || !register_conversion(IMG_YUV444P, IMG_RGB24,   yuv444p_rgb24)
+     || !register_conversion(IMG_YUY2,    IMG_RGB24,   yuy2_rgb24)
+     || !register_conversion(IMG_UYVY,    IMG_RGB24,   uyvy_rgb24)
+     || !register_conversion(IMG_YVYU,    IMG_RGB24,   yvyu_rgb24)
+     || !register_conversion(IMG_Y8,      IMG_RGB24,   y8_rgb24)
+
+     || !register_conversion(IMG_YUV420P, IMG_BGR24,   yuv420p_bgr24)
+     || !register_conversion(IMG_YUV411P, IMG_BGR24,   yuv411p_bgr24)
+     || !register_conversion(IMG_YUV422P, IMG_BGR24,   yuv422p_bgr24)
+     || !register_conversion(IMG_YUV444P, IMG_BGR24,   yuv444p_bgr24)
+     || !register_conversion(IMG_YUY2,    IMG_BGR24,   yuy2_bgr24)
+     || !register_conversion(IMG_UYVY,    IMG_BGR24,   uyvy_bgr24)
+     || !register_conversion(IMG_YVYU,    IMG_BGR24,   yvyu_bgr24)
+     || !register_conversion(IMG_Y8,      IMG_BGR24,   y8_rgb24)
+
+     || !register_conversion(IMG_YUV420P, IMG_RGBA32,  yuv420p_rgba32)
+     || !register_conversion(IMG_YUV411P, IMG_RGBA32,  yuv411p_rgba32)
+     || !register_conversion(IMG_YUV422P, IMG_RGBA32,  yuv422p_rgba32)
+     || !register_conversion(IMG_YUV444P, IMG_RGBA32,  yuv444p_rgba32)
+     || !register_conversion(IMG_YUY2,    IMG_RGBA32,  yuy2_rgba32)
+     || !register_conversion(IMG_UYVY,    IMG_RGBA32,  uyvy_rgba32)
+     || !register_conversion(IMG_YVYU,    IMG_RGBA32,  yvyu_rgba32)
+     || !register_conversion(IMG_Y8,      IMG_RGBA32,  y8_rgba32)
+
+     || !register_conversion(IMG_YUV420P, IMG_ABGR32,  yuv420p_abgr32)
+     || !register_conversion(IMG_YUV411P, IMG_ABGR32,  yuv411p_abgr32)
+     || !register_conversion(IMG_YUV422P, IMG_ABGR32,  yuv422p_abgr32)
+     || !register_conversion(IMG_YUV444P, IMG_ABGR32,  yuv444p_abgr32)
+     || !register_conversion(IMG_YUY2,    IMG_ABGR32,  yuy2_abgr32)
+     || !register_conversion(IMG_UYVY,    IMG_ABGR32,  uyvy_abgr32)
+     || !register_conversion(IMG_YVYU,    IMG_ABGR32,  yvyu_abgr32)
+     || !register_conversion(IMG_Y8,      IMG_ABGR32,  y8_argb32)
+
+     || !register_conversion(IMG_YUV420P, IMG_ARGB32,  yuv420p_argb32)
+     || !register_conversion(IMG_YUV411P, IMG_ARGB32,  yuv411p_argb32)
+     || !register_conversion(IMG_YUV422P, IMG_ARGB32,  yuv422p_argb32)
+     || !register_conversion(IMG_YUV444P, IMG_ARGB32,  yuv444p_argb32)
+     || !register_conversion(IMG_YUY2,    IMG_ARGB32,  yuy2_argb32)
+     || !register_conversion(IMG_UYVY,    IMG_ARGB32,  uyvy_argb32)
+     || !register_conversion(IMG_YVYU,    IMG_ARGB32,  yvyu_argb32)
+     || !register_conversion(IMG_Y8,      IMG_ARGB32,  y8_argb32)
+
+     || !register_conversion(IMG_YUV420P, IMG_BGRA32,  yuv420p_bgra32)
+     || !register_conversion(IMG_YUV411P, IMG_BGRA32,  yuv411p_bgra32)
+     || !register_conversion(IMG_YUV422P, IMG_BGRA32,  yuv422p_bgra32)
+     || !register_conversion(IMG_YUV444P, IMG_BGRA32,  yuv444p_bgra32)
+     || !register_conversion(IMG_YUY2,    IMG_BGRA32,  yuy2_bgra32)
+     || !register_conversion(IMG_UYVY,    IMG_BGRA32,  uyvy_bgra32)
+     || !register_conversion(IMG_YVYU,    IMG_BGRA32,  yvyu_bgra32)
+     || !register_conversion(IMG_Y8,      IMG_BGRA32,  y8_rgba32)
+
+    //---- RGB->YUV ----//
+
+     || !register_conversion(IMG_RGB24,   IMG_YUV420P, rgb24_yuv420p)
+     || !register_conversion(IMG_RGB24,   IMG_YUV411P, rgb24_yuv411p)
+     || !register_conversion(IMG_RGB24,   IMG_YUV422P, rgb24_yuv422p)
+     || !register_conversion(IMG_RGB24,   IMG_YUV444P, rgb24_yuv444p)
+     || !register_conversion(IMG_RGB24,   IMG_YUY2,    rgb24_yuy2)
+     || !register_conversion(IMG_RGB24,   IMG_UYVY,    rgb24_uyvy)
+     || !register_conversion(IMG_RGB24,   IMG_YVYU,    rgb24_yvyu)
+     || !register_conversion(IMG_RGB24,   IMG_Y8,      rgb24_y8)
+
+     || !register_conversion(IMG_BGR24,   IMG_YUV420P, bgr24_yuv420p)
+     || !register_conversion(IMG_BGR24,   IMG_YUV411P, bgr24_yuv411p)
+     || !register_conversion(IMG_BGR24,   IMG_YUV422P, bgr24_yuv422p)
+     || !register_conversion(IMG_BGR24,   IMG_YUV444P, bgr24_yuv444p)
+     || !register_conversion(IMG_BGR24,   IMG_YUY2,    bgr24_yuy2)
+     || !register_conversion(IMG_BGR24,   IMG_UYVY,    bgr24_uyvy)
+     || !register_conversion(IMG_BGR24,   IMG_YVYU,    bgr24_yvyu)
+     || !register_conversion(IMG_BGR24,   IMG_Y8,      bgr24_y8)
+
+     || !register_conversion(IMG_RGBA32,  IMG_YUV420P, rgba32_yuv420p)
+     || !register_conversion(IMG_RGBA32,  IMG_YUV411P, rgba32_yuv411p)
+     || !register_conversion(IMG_RGBA32,  IMG_YUV422P, rgba32_yuv422p)
+     || !register_conversion(IMG_RGBA32,  IMG_YUV444P, rgba32_yuv444p)
+     || !register_conversion(IMG_RGBA32,  IMG_YUY2,    rgba32_yuy2)
+     || !register_conversion(IMG_RGBA32,  IMG_UYVY,    rgba32_uyvy)
+     || !register_conversion(IMG_RGBA32,  IMG_YVYU,    rgba32_yvyu)
+     || !register_conversion(IMG_RGBA32,  IMG_Y8,      rgba32_y8)
+
+     || !register_conversion(IMG_ABGR32,  IMG_YUV420P, abgr32_yuv420p)
+     || !register_conversion(IMG_ABGR32,  IMG_YUV411P, abgr32_yuv411p)
+     || !register_conversion(IMG_ABGR32,  IMG_YUV422P, abgr32_yuv422p)
+     || !register_conversion(IMG_ABGR32,  IMG_YUV444P, abgr32_yuv444p)
+     || !register_conversion(IMG_ABGR32,  IMG_YUY2,    abgr32_yuy2)
+     || !register_conversion(IMG_ABGR32,  IMG_UYVY,    abgr32_uyvy)
+     || !register_conversion(IMG_ABGR32,  IMG_YVYU,    abgr32_yvyu)
+     || !register_conversion(IMG_ABGR32,  IMG_Y8,      abgr32_y8)
+
+     || !register_conversion(IMG_ARGB32,  IMG_YUV420P, argb32_yuv420p)
+     || !register_conversion(IMG_ARGB32,  IMG_YUV411P, argb32_yuv411p)
+     || !register_conversion(IMG_ARGB32,  IMG_YUV422P, argb32_yuv422p)
+     || !register_conversion(IMG_ARGB32,  IMG_YUV444P, argb32_yuv444p)
+     || !register_conversion(IMG_ARGB32,  IMG_YUY2,    argb32_yuy2)
+     || !register_conversion(IMG_ARGB32,  IMG_UYVY,    argb32_uyvy)
+     || !register_conversion(IMG_ARGB32,  IMG_YVYU,    argb32_yvyu)
+     || !register_conversion(IMG_ARGB32,  IMG_Y8,      argb32_y8)
+
+     || !register_conversion(IMG_BGRA32,  IMG_YUV420P, bgra32_yuv420p)
+     || !register_conversion(IMG_BGRA32,  IMG_YUV411P, bgra32_yuv411p)
+     || !register_conversion(IMG_BGRA32,  IMG_YUV422P, bgra32_yuv422p)
+     || !register_conversion(IMG_BGRA32,  IMG_YUV444P, bgra32_yuv444p)
+     || !register_conversion(IMG_BGRA32,  IMG_YUY2,    bgra32_yuy2)
+     || !register_conversion(IMG_BGRA32,  IMG_UYVY,    bgra32_uyvy)
+     || !register_conversion(IMG_BGRA32,  IMG_YVYU,    bgra32_yvyu)
+     || !register_conversion(IMG_BGRA32,  IMG_Y8,      bgra32_y8)
+
+    //---- Grayscale ----//
+
+     || !register_conversion(IMG_YUV420P, IMG_GRAY8,   yuvp_gray8)
+     || !register_conversion(IMG_YUV411P, IMG_GRAY8,   yuvp_gray8)
+     || !register_conversion(IMG_YUV422P, IMG_GRAY8,   yuvp_gray8)
+     || !register_conversion(IMG_YUV444P, IMG_GRAY8,   yuvp_gray8)
+     || !register_conversion(IMG_YUY2,    IMG_GRAY8,   yuy2_gray8)
+     || !register_conversion(IMG_UYVY,    IMG_GRAY8,   uyvy_gray8)
+     || !register_conversion(IMG_YVYU,    IMG_GRAY8,   yuy2_gray8)
+     || !register_conversion(IMG_Y8,      IMG_GRAY8,   yuvp_gray8)
+
+     || !register_conversion(IMG_GRAY8,   IMG_YUV420P, gray8_yuv420p)
+     || !register_conversion(IMG_GRAY8,   IMG_YUV411P, gray8_yuv411p)
+     || !register_conversion(IMG_GRAY8,   IMG_YUV422P, gray8_yuv422p)
+     || !register_conversion(IMG_GRAY8,   IMG_YUV444P, gray8_yuv444p)
+     || !register_conversion(IMG_GRAY8,   IMG_YUY2,    gray8_yuy2)
+     || !register_conversion(IMG_GRAY8,   IMG_UYVY,    gray8_uyvy)
+     || !register_conversion(IMG_GRAY8,   IMG_YVYU,    gray8_yuy2)
+     || !register_conversion(IMG_GRAY8,   IMG_Y8,      gray8_y8)
+    ) {
+        return 0;
+    }
+
+    /******** MMX implementations ********/
+
+#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
+    if (accel & AC_MMX) {
+
+        //---- YUV->RGB ----//
+
+        if (!register_conversion(IMG_YUV420P, IMG_RGB24,   yuv420p_rgb24_mmx)
+         || !register_conversion(IMG_YUV422P, IMG_RGB24,   yuv422p_rgb24_mmx)
+         || !register_conversion(IMG_YUV420P, IMG_BGR24,   yuv420p_bgr24_mmx)
+         || !register_conversion(IMG_YUV422P, IMG_BGR24,   yuv422p_bgr24_mmx)
+         || !register_conversion(IMG_YUV420P, IMG_RGBA32,  yuv420p_rgba32_mmx)
+         || !register_conversion(IMG_YUV422P, IMG_RGBA32,  yuv422p_rgba32_mmx)
+         || !register_conversion(IMG_YUV420P, IMG_ABGR32,  yuv420p_abgr32_mmx)
+         || !register_conversion(IMG_YUV422P, IMG_ABGR32,  yuv422p_abgr32_mmx)
+         || !register_conversion(IMG_YUV420P, IMG_ARGB32,  yuv420p_argb32_mmx)
+         || !register_conversion(IMG_YUV422P, IMG_ARGB32,  yuv422p_argb32_mmx)
+         || !register_conversion(IMG_YUV420P, IMG_BGRA32,  yuv420p_bgra32_mmx)
+         || !register_conversion(IMG_YUV422P, IMG_BGRA32,  yuv422p_bgra32_mmx)
+        ) {
+            return 0;
+        }
+    }
+#endif
+
+    /******** SSE2 implementations ********/
+
+#if defined(HAVE_ASM_SSE2)
+    if (HAS_ACCEL(accel, AC_SSE2)) {
+
+        //---- YUV->RGB ----//
+
+        if (!register_conversion(IMG_YUV420P, IMG_RGB24,   yuv420p_rgb24_sse2)
+         || !register_conversion(IMG_YUV411P, IMG_RGB24,   yuv411p_rgb24_sse2)
+         || !register_conversion(IMG_YUV422P, IMG_RGB24,   yuv422p_rgb24_sse2)
+         || !register_conversion(IMG_YUV444P, IMG_RGB24,   yuv444p_rgb24_sse2)
+         || !register_conversion(IMG_YUY2,    IMG_RGB24,   yuy2_rgb24_sse2)
+         || !register_conversion(IMG_UYVY,    IMG_RGB24,   uyvy_rgb24_sse2)
+         || !register_conversion(IMG_YVYU,    IMG_RGB24,   yvyu_rgb24_sse2)
+         || !register_conversion(IMG_Y8,      IMG_RGB24,   y8_rgb24_sse2)
+
+         || !register_conversion(IMG_YUV420P, IMG_BGR24,   yuv420p_bgr24_sse2)
+         || !register_conversion(IMG_YUV411P, IMG_BGR24,   yuv411p_bgr24_sse2)
+         || !register_conversion(IMG_YUV422P, IMG_BGR24,   yuv422p_bgr24_sse2)
+         || !register_conversion(IMG_YUV444P, IMG_BGR24,   yuv444p_bgr24_sse2)
+         || !register_conversion(IMG_YUY2,    IMG_BGR24,   yuy2_bgr24_sse2)
+         || !register_conversion(IMG_UYVY,    IMG_BGR24,   uyvy_bgr24_sse2)
+         || !register_conversion(IMG_YVYU,    IMG_BGR24,   yvyu_bgr24_sse2)
+         || !register_conversion(IMG_Y8,      IMG_BGR24,   y8_rgb24_sse2)
+
+         || !register_conversion(IMG_YUV420P, IMG_RGBA32,  yuv420p_rgba32_sse2)
+         || !register_conversion(IMG_YUV411P, IMG_RGBA32,  yuv411p_rgba32_sse2)
+         || !register_conversion(IMG_YUV422P, IMG_RGBA32,  yuv422p_rgba32_sse2)
+         || !register_conversion(IMG_YUV444P, IMG_RGBA32,  yuv444p_rgba32_sse2)
+         || !register_conversion(IMG_YUY2,    IMG_RGBA32,  yuy2_rgba32_sse2)
+         || !register_conversion(IMG_UYVY,    IMG_RGBA32,  uyvy_rgba32_sse2)
+         || !register_conversion(IMG_YVYU,    IMG_RGBA32,  yvyu_rgba32_sse2)
+         || !register_conversion(IMG_Y8,      IMG_RGBA32,  y8_rgba32_sse2)
+
+         || !register_conversion(IMG_YUV420P, IMG_ABGR32,  yuv420p_abgr32_sse2)
+         || !register_conversion(IMG_YUV411P, IMG_ABGR32,  yuv411p_abgr32_sse2)
+         || !register_conversion(IMG_YUV422P, IMG_ABGR32,  yuv422p_abgr32_sse2)
+         || !register_conversion(IMG_YUV444P, IMG_ABGR32,  yuv444p_abgr32_sse2)
+         || !register_conversion(IMG_YUY2,    IMG_ABGR32,  yuy2_abgr32_sse2)
+         || !register_conversion(IMG_UYVY,    IMG_ABGR32,  uyvy_abgr32_sse2)
+         || !register_conversion(IMG_YVYU,    IMG_ABGR32,  yvyu_abgr32_sse2)
+         || !register_conversion(IMG_Y8,      IMG_ABGR32,  y8_argb32_sse2)
+
+         || !register_conversion(IMG_YUV420P, IMG_ARGB32,  yuv420p_argb32_sse2)
+         || !register_conversion(IMG_YUV411P, IMG_ARGB32,  yuv411p_argb32_sse2)
+         || !register_conversion(IMG_YUV422P, IMG_ARGB32,  yuv422p_argb32_sse2)
+         || !register_conversion(IMG_YUV444P, IMG_ARGB32,  yuv444p_argb32_sse2)
+         || !register_conversion(IMG_YUY2,    IMG_ARGB32,  yuy2_argb32_sse2)
+         || !register_conversion(IMG_UYVY,    IMG_ARGB32,  uyvy_argb32_sse2)
+         || !register_conversion(IMG_YVYU,    IMG_ARGB32,  yvyu_argb32_sse2)
+         || !register_conversion(IMG_Y8,      IMG_ARGB32,  y8_argb32_sse2)
+
+         || !register_conversion(IMG_YUV420P, IMG_BGRA32,  yuv420p_bgra32_sse2)
+         || !register_conversion(IMG_YUV411P, IMG_BGRA32,  yuv411p_bgra32_sse2)
+         || !register_conversion(IMG_YUV422P, IMG_BGRA32,  yuv422p_bgra32_sse2)
+         || !register_conversion(IMG_YUV444P, IMG_BGRA32,  yuv444p_bgra32_sse2)
+         || !register_conversion(IMG_YUY2,    IMG_BGRA32,  yuy2_bgra32_sse2)
+         || !register_conversion(IMG_UYVY,    IMG_BGRA32,  uyvy_bgra32_sse2)
+         || !register_conversion(IMG_YVYU,    IMG_BGRA32,  yvyu_bgra32_sse2)
+         || !register_conversion(IMG_Y8,      IMG_BGRA32,  y8_rgba32_sse2)
+
+        //---- RGB->YUV ----//
+
+         || !register_conversion(IMG_RGB24,   IMG_YUV420P, rgb24_yuv420p_sse2)
+         || !register_conversion(IMG_RGB24,   IMG_YUV411P, rgb24_yuv411p_sse2)
+         || !register_conversion(IMG_RGB24,   IMG_YUV422P, rgb24_yuv422p_sse2)
+         || !register_conversion(IMG_RGB24,   IMG_YUV444P, rgb24_yuv444p_sse2)
+         || !register_conversion(IMG_RGB24,   IMG_YUY2,    rgb24_yuy2_sse2)
+         || !register_conversion(IMG_RGB24,   IMG_UYVY,    rgb24_uyvy_sse2)
+         || !register_conversion(IMG_RGB24,   IMG_YVYU,    rgb24_yvyu_sse2)
+         || !register_conversion(IMG_RGB24,   IMG_Y8,      rgb24_y8_sse2)
+
+         || !register_conversion(IMG_BGR24,   IMG_YUV420P, bgr24_yuv420p_sse2)
+         || !register_conversion(IMG_BGR24,   IMG_YUV411P, bgr24_yuv411p_sse2)
+         || !register_conversion(IMG_BGR24,   IMG_YUV422P, bgr24_yuv422p_sse2)
+         || !register_conversion(IMG_BGR24,   IMG_YUV444P, bgr24_yuv444p_sse2)
+         || !register_conversion(IMG_BGR24,   IMG_YUY2,    bgr24_yuy2_sse2)
+         || !register_conversion(IMG_BGR24,   IMG_UYVY,    bgr24_uyvy_sse2)
+         || !register_conversion(IMG_BGR24,   IMG_YVYU,    bgr24_yvyu_sse2)
+         || !register_conversion(IMG_BGR24,   IMG_Y8,      bgr24_y8_sse2)
+
+         || !register_conversion(IMG_RGBA32,  IMG_YUV420P, rgba32_yuv420p_sse2)
+         || !register_conversion(IMG_RGBA32,  IMG_YUV411P, rgba32_yuv411p_sse2)
+         || !register_conversion(IMG_RGBA32,  IMG_YUV422P, rgba32_yuv422p_sse2)
+         || !register_conversion(IMG_RGBA32,  IMG_YUV444P, rgba32_yuv444p_sse2)
+         || !register_conversion(IMG_RGBA32,  IMG_YUY2,    rgba32_yuy2_sse2)
+         || !register_conversion(IMG_RGBA32,  IMG_UYVY,    rgba32_uyvy_sse2)
+         || !register_conversion(IMG_RGBA32,  IMG_YVYU,    rgba32_yvyu_sse2)
+         || !register_conversion(IMG_RGBA32,  IMG_Y8,      rgba32_y8_sse2)
+
+         || !register_conversion(IMG_ABGR32,  IMG_YUV420P, abgr32_yuv420p_sse2)
+         || !register_conversion(IMG_ABGR32,  IMG_YUV411P, abgr32_yuv411p_sse2)
+         || !register_conversion(IMG_ABGR32,  IMG_YUV422P, abgr32_yuv422p_sse2)
+         || !register_conversion(IMG_ABGR32,  IMG_YUV444P, abgr32_yuv444p_sse2)
+         || !register_conversion(IMG_ABGR32,  IMG_YUY2,    abgr32_yuy2_sse2)
+         || !register_conversion(IMG_ABGR32,  IMG_UYVY,    abgr32_uyvy_sse2)
+         || !register_conversion(IMG_ABGR32,  IMG_YVYU,    abgr32_yvyu_sse2)
+         || !register_conversion(IMG_ABGR32,  IMG_Y8,      abgr32_y8_sse2)
+
+         || !register_conversion(IMG_ARGB32,  IMG_YUV420P, argb32_yuv420p_sse2)
+         || !register_conversion(IMG_ARGB32,  IMG_YUV411P, argb32_yuv411p_sse2)
+         || !register_conversion(IMG_ARGB32,  IMG_YUV422P, argb32_yuv422p_sse2)
+         || !register_conversion(IMG_ARGB32,  IMG_YUV444P, argb32_yuv444p_sse2)
+         || !register_conversion(IMG_ARGB32,  IMG_YUY2,    argb32_yuy2_sse2)
+         || !register_conversion(IMG_ARGB32,  IMG_UYVY,    argb32_uyvy_sse2)
+         || !register_conversion(IMG_ARGB32,  IMG_YVYU,    argb32_yvyu_sse2)
+         || !register_conversion(IMG_ARGB32,  IMG_Y8,      argb32_y8_sse2)
+
+         || !register_conversion(IMG_BGRA32,  IMG_YUV420P, bgra32_yuv420p_sse2)
+         || !register_conversion(IMG_BGRA32,  IMG_YUV411P, bgra32_yuv411p_sse2)
+         || !register_conversion(IMG_BGRA32,  IMG_YUV422P, bgra32_yuv422p_sse2)
+         || !register_conversion(IMG_BGRA32,  IMG_YUV444P, bgra32_yuv444p_sse2)
+         || !register_conversion(IMG_BGRA32,  IMG_YUY2,    bgra32_yuy2_sse2)
+         || !register_conversion(IMG_BGRA32,  IMG_UYVY,    bgra32_uyvy_sse2)
+         || !register_conversion(IMG_BGRA32,  IMG_YVYU,    bgra32_yvyu_sse2)
+         || !register_conversion(IMG_BGRA32,  IMG_Y8,      bgra32_y8_sse2)
+
+        //---- Grayscale ----//
+
+         || !register_conversion(IMG_GRAY8,   IMG_YUY2,    gray8_yuy2_sse2)
+         || !register_conversion(IMG_GRAY8,   IMG_UYVY,    gray8_uyvy_sse2)
+         || !register_conversion(IMG_GRAY8,   IMG_YVYU,    gray8_yuy2_sse2)
+         || !register_conversion(IMG_GRAY8,   IMG_Y8,      gray8_y8_sse2)
+        ) {
+            return 0;
+        }
+    }
+
+    /* YUV->GRAY8 routines use CMOVcc */
+    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2)) {
+        if (!register_conversion(IMG_YUV420P, IMG_GRAY8,   yuvp_gray8_sse2)
+         || !register_conversion(IMG_YUV411P, IMG_GRAY8,   yuvp_gray8_sse2)
+         || !register_conversion(IMG_YUV422P, IMG_GRAY8,   yuvp_gray8_sse2)
+         || !register_conversion(IMG_YUV444P, IMG_GRAY8,   yuvp_gray8_sse2)
+         || !register_conversion(IMG_YUY2,    IMG_GRAY8,   yuy2_gray8_sse2)
+         || !register_conversion(IMG_UYVY,    IMG_GRAY8,   uyvy_gray8_sse2)
+         || !register_conversion(IMG_YVYU,    IMG_GRAY8,   yuy2_gray8_sse2)
+         || !register_conversion(IMG_Y8,      IMG_GRAY8,   yuvp_gray8_sse2)
+        ) {
+            return 0;
+        }
+    }
+#endif
+
+    return 1;
+}
+
+/*************************************************************************/
+
+/*
+ * Local variables:
+ *   c-file-style: "stroustrup"
+ *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
+ *   indent-tabs-mode: nil
+ * End:
+ *
+ * vim: expandtab shiftwidth=4:
+ */