Diffstat (limited to 'mpeglib/lib/util/render/dither2YUV')
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/Makefile.am    |   22
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/README         |   13
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/dither2YUV.cpp |  124
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/dither2YUV.h   |   64
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/rgb2yuv16.cpp  |  916
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/rgb2yuv16.h    |   74
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/rgb2yuv32.cpp  | 1143
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/rgb2yuv32.h    |   93
-rw-r--r--  mpeglib/lib/util/render/dither2YUV/rgb2yuvdefs.h  |   74
9 files changed, 2523 insertions(+), 0 deletions(-)
diff --git a/mpeglib/lib/util/render/dither2YUV/Makefile.am b/mpeglib/lib/util/render/dither2YUV/Makefile.am
new file mode 100644
index 00000000..374658a3
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/Makefile.am
@@ -0,0 +1,22 @@
+# libdivxutil_dither - Makefile.am
+
+EXTRA_DIST = README
+
+INCLUDES = -I.. $(all_includes)
+
+
+noinst_LTLIBRARIES = libdivxutil_dither.la
+
+noinst_HEADERS = dither2YUV.h rgb2yuvdefs.h rgb2yuv16.h \
+ rgb2yuv32.h
+
+libdivxutil_dither_la_SOURCES = dither2YUV.cpp rgb2yuv16.cpp \
+ rgb2yuv32.cpp
+
+
+
+
+
+
+
+
diff --git a/mpeglib/lib/util/render/dither2YUV/README b/mpeglib/lib/util/render/dither2YUV/README
new file mode 100644
index 00000000..66246c13
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/README
@@ -0,0 +1,13 @@
+
+
+* We have a Dither2YUV base class. Currently it is not derived
+  from a basic DitherWrapper class because we do not have such a
+  class in mpeglib yet.
+  TODO: rename DitherWrapper to Dither2RGB in mpeglib, make
+        DitherWrapper pure virtual, and derive both Dither2YUV
+        and Dither2RGB from it (see the sketch after this README).
+
+* Note: we do not support 8 bit here, thus the constructor looks
+  different.
+
+
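The class hierarchy proposed in this TODO could look roughly as follows. A sketch only: DitherWrapper and Dither2RGB are taken from the TODO above and the doDither() signature from dither2YUV.h below, not from existing mpeglib code.

```cpp
// Hypothetical layout once the README's TODO is done (not yet in mpeglib):
class YUVPicture;                          // from ../yuvPicture.h

class DitherWrapper {                      // pure virtual base
 public:
  virtual ~DitherWrapper() {}
  virtual void doDither(YUVPicture* pic, int depth, int ditherSize,
                        unsigned char* dest, int offset) = 0;
};

class Dither2RGB : public DitherWrapper {  // today's DitherWrapper, renamed
 public:
  void doDither(YUVPicture* pic, int depth, int ditherSize,
                unsigned char* dest, int offset);
};

class Dither2YUV : public DitherWrapper {  // the class in this directory
 public:
  void doDither(YUVPicture* pic, int depth, int ditherSize,
                unsigned char* dest, int offset);
};
```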
diff --git a/mpeglib/lib/util/render/dither2YUV/dither2YUV.cpp b/mpeglib/lib/util/render/dither2YUV/dither2YUV.cpp
new file mode 100644
index 00000000..db4a3288
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/dither2YUV.cpp
@@ -0,0 +1,124 @@
+/*
+ this class dithers RGB pictures to YUV12
+ Copyright (C) 2000 Martin Vogt
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Library General Public License as published by
+ the Free Software Foundation.
+
+ For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#include "dither2YUV.h"
+
+#include <iostream>
+
+using namespace std;
+
+
+Dither2YUV::Dither2YUV() {
+
+
+ lmmx=mm_support();
+
+}
+
+
+Dither2YUV::~Dither2YUV(){
+}
+
+
+
+
+
+
+void Dither2YUV::doDither(YUVPicture* pic,int depth,int ditherSize,
+ unsigned char* dest,int offset) {
+
+ int inputType=pic->getImageType();
+
+ switch(inputType) {
+ case PICTURE_RGB:
+ doDitherRGB_NORMAL(pic,depth,ditherSize,dest,offset);
+ break;
+ default:
+ std::cout << "unknown RGB type:"<<inputType<<" in Dither2YUV"<<std::endl;
+ exit(0);
+ }
+}
+
+
+void Dither2YUV::doDitherRGB_NORMAL(YUVPicture* rgbPic,
+ int depth,int ditherSize,
+ unsigned char* dest,int offset) {
+
+ switch (ditherSize) {
+ case _SIZE_NORMAL:
+ doDither2YUV_std(rgbPic,depth,dest,offset);
+ break;
+ case _SIZE_DOUBLE:
+ std::cout << "double not supported for RGB"<<std::endl;
+ break;
+ default:
+ std::cout << "unknown size:"<<ditherSize<<" in Dither2YUV"<<std::endl;
+ exit(0);
+ }
+}
+
+
+void Dither2YUV::doDither2YUV_std(YUVPicture* rgbPic,int depth,
+ unsigned char* dest,int offset){
+
+ int h=rgbPic->getHeight();
+ int w=rgbPic->getWidth();
+ int lumLength=w * h;
+ int colorLength=(w * h) / 4;
+
+ unsigned char* lum=dest;
+ unsigned char* cr=lum+lumLength;
+ unsigned char* cb=cr+colorLength;
+ unsigned char* rgbSource=rgbPic->getImagePtr();
+
+
+ switch (depth) {
+ case 8:
+ std::cout << "8 bit dither to yuv not supported"<<std::endl;
+ exit(0);
+ break;
+ case 16:
+ if (lmmx) {
+#ifdef INTEL
+ rgb2yuv16bit_mmx(rgbSource,lum,cr,cb,h,w);
+#endif
+ } else {
+ rgb2yuv16bit(rgbSource,lum,cr,cb,h,w);
+ }
+
+ break;
+ case 24:
+ if (lmmx) {
+#ifdef INTEL
+ rgb2yuv24bit_mmx(rgbSource,lum,cr,cb,h,w);
+#endif
+ } else {
+ rgb2yuv24bit(rgbSource,lum,cr,cb,h,w);
+ }
+ break;
+ case 32:
+ if (lmmx) {
+#ifdef INTEL
+ rgb2yuv32bit_mmx(rgbSource,lum,cr,cb,h,w);
+#endif
+ } else {
+ rgb2yuv32bit(rgbSource,lum,cr,cb,h,w);
+ }
+ break;
+ default:
+ std::cout << "cannot dither depth:"<<depth<<std::endl;
+ }
+
+}
+
+
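For reference, doDither2YUV_std() above writes the three YUV12 (4:2:0) planes back to back into dest: a full-resolution Y plane followed by two quarter-resolution chroma planes, w*h*3/2 bytes in total. The same layout arithmetic as a standalone sketch (Yuv12Planes and yuv12Layout are illustrative names, not mpeglib API):

```cpp
// YUV12 (4:2:0) plane layout used by Dither2YUV::doDither2YUV_std().
struct Yuv12Planes {
    unsigned char* lum;   // w*h bytes, one per pixel
    unsigned char* cr;    // w*h/4 bytes, chroma subsampled 2x2
    unsigned char* cb;    // w*h/4 bytes
};

Yuv12Planes yuv12Layout(unsigned char* dest, int w, int h) {
    int lumLength   = w * h;
    int colorLength = (w * h) / 4;

    Yuv12Planes p;
    p.lum = dest;                  // [0, w*h)
    p.cr  = p.lum + lumLength;     // [w*h, w*h + w*h/4)
    p.cb  = p.cr  + colorLength;   // the next w*h/4 bytes
    return p;
}
```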
diff --git a/mpeglib/lib/util/render/dither2YUV/dither2YUV.h b/mpeglib/lib/util/render/dither2YUV/dither2YUV.h
new file mode 100644
index 00000000..5ef26b2b
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/dither2YUV.h
@@ -0,0 +1,64 @@
+/*
+ this class dithers RGB pictures to YUV12
+ Copyright (C) 2000 Martin Vogt
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU Library General Public License as published by
+ the Free Software Foundation.
+
+ For more information look at the file COPYRIGHT in this package
+
+ */
+
+
+#ifndef __DITHER2YUV_H
+#define __DITHER2YUV_H
+
+
+#include "../../mmx/mmx.h"
+#include "../yuvPicture.h"
+
+
+#include <stdlib.h>
+#include "rgb2yuv16.h"
+#include "rgb2yuv32.h"
+
+#define _SIZE_NONE 0
+#define _SIZE_NORMAL 1
+#define _SIZE_DOUBLE 2
+
+
+/**
+ Wraps all calls to software ditherer and the different
+ resolutions,mmx enhancements, and doublesize ditherers.
+*/
+
+
+class Dither2YUV {
+
+ int lmmx;
+
+ int bpp;
+
+
+ public:
+ Dither2YUV();
+ ~Dither2YUV();
+
+ int getDitherSize();
+ void setDitherSize(int ditherSize);
+
+ void doDither(YUVPicture* pic,int depth,int ditherSize,
+ unsigned char* dest,int offset);
+
+
+ private:
+ void doDitherRGB_NORMAL(YUVPicture* pic,int depth,int ditherSize,
+ unsigned char* dest,int offset);
+
+ void doDither2YUV_std(YUVPicture* pic,int depth,
+ unsigned char* dest,int offset);
+
+};
+
+#endif
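A minimal caller of this class, assuming a YUVPicture that already holds a PICTURE_RGB image; the 24-bit depth and the surrounding function are just an illustration:

```cpp
// Hypothetical usage sketch for Dither2YUV (rgbPic setup not shown):
void convertToYuv12(YUVPicture* rgbPic) {
    int w = rgbPic->getWidth();
    int h = rgbPic->getHeight();

    // YUV12 output: w*h luma bytes plus two w*h/4 chroma planes.
    unsigned char* dest = new unsigned char[w * h * 3 / 2];

    Dither2YUV ditherer;   // constructor probes MMX via mm_support()
    ditherer.doDither(rgbPic, 24, _SIZE_NORMAL, dest, 0);

    // ... hand dest to a YUV renderer ...
    delete [] dest;
}
```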
diff --git a/mpeglib/lib/util/render/dither2YUV/rgb2yuv16.cpp b/mpeglib/lib/util/render/dither2YUV/rgb2yuv16.cpp
new file mode 100644
index 00000000..e0d7fc86
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/rgb2yuv16.cpp
@@ -0,0 +1,916 @@
+/***************************************************************************
+ rgb2yuv16.c - description
+ -------------------
+ begin : Tue Nov 2 2000
+ copyright : (C) 2000 by Christian Gerlach
+ email : cgerlach@rhrk.uni-kl.de
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#include "rgb2yuv16.h"
+#include <iostream>
+#include <cstdio>    // printf() in dummyRGB2YUV16Bit()
+
+static unsigned short KEEPR[4] = { 63488, 63488, 63488, 63488 };
+unsigned short KEEPG[4] = { 2016, 2016, 2016, 2016 };
+unsigned short KEEPB[4] = { 31, 31, 31, 31 };
+
+short Y_RED[4] = { 307, 307, 307, 307 };
+short Y_GREEN[4] = { 302, 302, 302, 302 };
+short Y_BLUE[4] = { 117, 117, 117, 117 };
+
+short U_RED[4] = { -150, -150, -150, -150 };
+short U_GREEN[4] = { -147, -147, -147, -147 };
+short U_BLUE[4] = { 444, 444, 444, 444 };
+
+short V_RED[4] = { 632, 632, 632, 632 };
+short V_GREEN[4] = { -265, -265, -265, -265 };
+short V_BLUE[4] = { -102, -102, -102, -102 };
+
+
+// how to avoid these nasty compiler warnings?
+// here's one (maybe bad) method
+void dummyRGB2YUV16Bit() {
+
+ printf("%p\n",KEEPR);
+ printf("%p\n",KEEPG);
+ printf("%p\n",KEEPB);
+ printf("%p\n",Y_RED);
+ printf("%p\n",Y_GREEN);
+ printf("%p\n",Y_BLUE);
+ printf("%p\n",U_RED);
+ printf("%p\n",U_GREEN);
+ printf("%p\n",U_BLUE);
+ printf("%p\n",V_RED);
+ printf("%p\n",V_GREEN);
+ printf("%p\n",V_BLUE);
+}
+
+
+#ifndef INTEL
+void rgb2yuv16bit_mmx(unsigned char* ,unsigned char* ,unsigned char* ,
+ unsigned char* ,int , int ) {
+ std::cout << "RGB->YUV not compiled with INTEL" << std::endl;
+ exit(0);
+}
+
+void rgb2yuv16bit_mmx_fast(unsigned char* ,unsigned char* ,unsigned char* ,
+ unsigned char* ,int , int ) {
+ std::cout << "RGB->YUV not compiled with INTEL" << std::endl;
+ exit(0);
+}
+#endif
+
+
+void rgb2yuv16(unsigned char* rgbSource, unsigned char* dest)
+{
+ int rgb = *((unsigned short*) rgbSource);
+ int r = RED(rgb);
+ int g = GREEN(rgb);
+ int b = BLUE(rgb);
+
+ dest[0] = Y_RGB(r, g, b);
+ dest[1] = U_RGB(r, g, b);
+ dest[2] = V_RGB(r, g, b);
+}
+
+void rgb2yuv16bit(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+
+ int height2 = height / 2;
+ int width2 = width / 2;
+ int r, g, b, row, col, rgb;
+
+ for (row=0 ; row<height2 ; row++) {
+ for (col=0 ; col<width2 ; col++) {
+ rgb = *((unsigned short*) rgbSource);
+ rgbSource += 2; // 16 bpp: advance two bytes per pixel
+ r = RED(rgb);
+ g = GREEN(rgb);
+ b = BLUE(rgb);
+
+ *lum++ = Y_RGB(r, g, b);
+ *cr++ = U_RGB(r, g, b);
+ *cb++ = V_RGB(r, g, b);
+
+ rgb = *((unsigned short*) rgbSource);
+ rgbSource += 2; // 16 bpp: advance two bytes per pixel
+ r = RED(rgb);
+ g = GREEN(rgb);
+ b = BLUE(rgb);
+
+ *lum++ = Y_RGB(r, g, b);
+ }
+ for (col=0 ; col<width ; col++) {
+ rgb = *((unsigned short*) rgbSource);
+ rgbSource += 2; // 16 bpp: advance two bytes per pixel
+ r = RED(rgb);
+ g = GREEN(rgb);
+ b = BLUE(rgb);
+
+ *lum++ = Y_RGB(r, g, b);
+ }
+ }
+}
+
+
+#ifdef INTEL
+
+void rgb2yuv16bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+ int height2 = height / 2;
+ int width2 = width / 2;
+ int bytesPerLine = width * 2;
+
+ for (int row=0 ; row<height2 ; row++) {
+ rgb2yuv16bit_mmx422_row(rgbSource, lum, cr, cb, width);
+ rgbSource += bytesPerLine;
+ lum += width;
+ cr += width2;
+ cb += width2;
+
+ rgb2y16bit_mmx_row(rgbSource, lum, width);
+ rgbSource += bytesPerLine;
+ lum += width;
+ }
+}
+
+void rgb2yuv16bit_mmx_fast(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+
+ int height2 = height / 2;
+ int width2 = width / 2;
+ int bytesPerLine = width * 2;
+
+ for (int row=0 ; row<height2 ; row++) {
+ rgb2yuv16bit_mmx422_row_fast(rgbSource, lum, cr, cb, width);
+ rgbSource += bytesPerLine;
+ lum += width;
+ cr += width2;
+ cb += width2;
+
+ rgb2y16bit_mmx_row_fast(rgbSource, lum, width);
+ rgbSource += bytesPerLine;
+ lum += width;
+ }
+}
+
+void rgb2yuv16bit_mmx422_row(unsigned char* rgb,
+ unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel) {
+ unsigned int buf[17];
+
+ // 36%5 = TEMP0
+ // 44%5 = TEMPY
+ // 52%5 = TEMPU
+ // 60%5 = TEMPV
+
+ __asm__ __volatile__ (
+ "1:\n"
+
+ // unpack hicolor ( pixel 0 - 3)
+ "movq (%0), %%mm0\n"
+
+ "movq %%mm0, %%mm1\n"
+ "pand KEEPR, %%mm1\n"
+ "psrlq $8, %%mm1\n" // B3B2B1B0 -> mm1
+ "movq %%mm0, %%mm2\n"
+ "pand KEEPG, %%mm2\n"
+ "psrlq $3, %%mm2\n" // G3G2G1G0 -> mm2
+ "movq %%mm0, %%mm3\n"
+ "pand KEEPB, %%mm3\n"
+ "psllq $3, %%mm3\n" // G3G2G1G0 -> mm3
+
+ "movq %%mm2, %%mm0\n"
+ "punpcklbw %%mm1, %%mm2\n"
+ "punpckhbw %%mm1, %%mm0\n"
+
+ "pxor %%mm5, %%mm5\n"
+ "movq %%mm3, %%mm4\n"
+ "punpcklbw %%mm5, %%mm3\n"
+ "punpckhbw %%mm5, %%mm4\n"
+
+ "psllq $8, %%mm2\n"
+ "por %%mm2, %%mm3\n" // 0B1G1R10B0G0G0 -> mm3
+ "psllq $8, %%mm0\n"
+ "por %%mm0, %%mm4\n" // 0B3G3R30B2G2G2 -> mm4
+
+ "movq %%mm3, %5\n"
+ "movq %%mm4, 8%5\n"
+
+ // next 4 pixels ------------------------------
+
+ "movq 8(%0), %%mm0\n"
+
+ "movq %%mm0, %%mm1\n"
+ "pand KEEPR, %%mm1\n"
+ "psrlq $8, %%mm1\n" // B3B2B1B0 -> mm1
+ "movq %%mm0, %%mm2\n"
+ "pand KEEPG, %%mm2\n"
+ "psrlq $3, %%mm2\n" // G3G2G1G0 -> mm2
+ "movq %%mm0, %%mm3\n"
+ "pand KEEPB, %%mm3\n"
+ "psllq $3, %%mm3\n" // G3G2G1G0 -> mm3
+
+ "movq %%mm2, %%mm0\n"
+ "punpcklbw %%mm1, %%mm2\n"
+ "punpckhbw %%mm1, %%mm0\n"
+
+ "pxor %%mm5, %%mm5\n"
+ "movq %%mm3, %%mm4\n"
+ "punpcklbw %%mm5, %%mm3\n"
+ "punpckhbw %%mm5, %%mm4\n"
+
+ "psllq $8, %%mm2\n"
+ "por %%mm2, %%mm3\n" // 0B1G1R10B0G0G0 -> mm3
+ "psllq $8, %%mm0\n"
+ "por %%mm0, %%mm4\n" // 0B3G3R30B2G2G2 -> mm4
+
+ "movq %%mm3, 16%5\n"
+ "movq %%mm4, 24%5\n"
+
+ "add $16, %0\n"
+
+ // standard algorithm --------------------------------------------------
+
+ // pack rgb
+ // was: "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
+ // ------------------------------
+ // (uses: mm0, mm1)
+ "movd 8%5, %%mm0\n"
+ "psllq $24, %%mm0\n"
+ "movd 4%5, %%mm1\n"
+ "por %%mm1, %%mm0\n"
+ "psllq $24, %%mm0\n"
+ "movd %5, %%mm1\n"
+ "por %%mm0, %%mm1\n"
+ // ------------------------------
+
+ "pxor %%mm6, %%mm6\n" // 0 -> mm6
+ "movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
+ "psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
+ "punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
+ "movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
+ "punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
+ "movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
+ "pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
+
+ "movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
+ "pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
+ "movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
+ "pmaddwd UR0GRX, %%mm2\n" // urR1,ugG0+urR0 -> mm2
+ "movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
+ "pmaddwd UBG0BX, %%mm3\n" // ubB1+ugG1,ubB0 -> mm3
+ "punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
+ "pmaddwd VR0GRX, %%mm4\n" // vrR1,vgG0+vrR0 -> mm4
+ "paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
+
+ "pmaddwd VBG0BX, %%mm5\n" // vbB1+vgG1,vbB0 -> mm5
+
+ // pack rgb
+ // was: "movq 8(%0),%%mm1\n" // R5B4G4R4B3G3R3B2 -> mm1
+ // ------------------------------
+ // (uses: mm1, mm6)
+ "movd 20%5, %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 16%5, %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 12%5, %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ "psllq $8, %%mm1\n"
+ "movd 8%5, %%mm6\n"
+ "psrlq $16, %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ // ------------------------------
+
+ "paddd %%mm3, %%mm2\n" // U1U0 -> mm2
+
+ "movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
+ "punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
+ "paddd %%mm5, %%mm4\n" // V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
+ "psllq $32, %%mm1\n" // R3B200 -> mm1
+
+ "paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
+
+ "punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
+ "movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
+
+ "pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
+ "movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
+
+ "pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
+ "psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
+
+ "movq %%mm6, 36%5\n" // R5B4G4R4 -> TEMP0
+ "movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
+ "pmaddwd UR0GRX, %%mm6\n" // urR3,ugG2+urR2 -> mm6
+ "psrad $15, %%mm2\n" // 32-bit scaled U1U0 -> mm2
+
+ "paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
+ "movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
+ "pmaddwd UBG0BX, %%mm7\n" // ubB3+ugG3,ubB2
+ "psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
+
+ "pmaddwd VR0GRX, %%mm3\n" // vrR3,vgG2+vgR2
+ "packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
+
+ "pmaddwd VBG0BX, %%mm5\n" // vbB3+vgG3,vbB2 -> mm5
+ "psrad $15, %%mm4\n" // 32-bit scaled V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "paddd %%mm7, %%mm6\n" // U3U2 -> mm6
+
+ // pack rgb
+ // was: "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm1
+ // ------------------------------
+ // (uses: mm1, mm7)
+ "movd 28%5, %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 24%5, %%mm7\n"
+ "por %%mm7, %%mm1\n"
+ "psllq $16, %%mm1\n"
+ "movd 20%5, %%mm7\n"
+ "psrlq $8, %%mm7\n"
+ "por %%mm7, %%mm1\n"
+ // ------------------------------
+
+ "movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm1
+ "psrad $15, %%mm6\n" // 32-bit scaled U3U2 -> mm6
+
+ "paddd %%mm5, %%mm3\n" // V3V2 -> mm3
+ "psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
+
+ "movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
+ "psrad $15, %%mm3\n" // 32-bit scaled V3V2 -> mm3
+
+ "movq %%mm0, 44%5\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
+
+ "packssdw %%mm6, %%mm2\n" // 32-bit scaled U3U2U1U0 -> mm2
+
+ "movq 36%5, %%mm0\n" // R5B4G4R4 -> mm0
+
+ "punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
+ "movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
+
+ "movq %%mm2, 52%5\n" // 32-bit scaled U3U2U1U0 -> TEMPU
+ "psrlq $32, %%mm0\n" // 00R5B4 -> mm0
+
+ "paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
+ "movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
+
+ "pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
+ "movq %%mm7, %%mm0\n" // B5G5R5B4 -> mm0
+
+ "pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
+ "packssdw %%mm3, %%mm4\n" // 32-bit scaled V3V2V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm4, 60%5\n" // (V3V2V1V0)/256 -> mm4
+
+ "movq %%mm6, %%mm4\n" // B5B4G4R4 -> mm4
+
+ "pmaddwd UR0GRX, %%mm6\n" // urR5,ugG4+urR4
+ "movq %%mm0, %%mm3\n" // B5G5R5B4 -> mm0
+
+ "pmaddwd UBG0BX, %%mm0\n" // ubB5+ugG5,ubB4
+ "paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "pmaddwd VR0GRX, %%mm4\n" // vrR5,vgG4+vrR4 -> mm4
+ "pxor %%mm7, %%mm7\n" // 0 -> mm7
+
+ "pmaddwd VBG0BX, %%mm3\n" // vbB5+vgG5,vbB4 -> mm3
+ "punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
+
+ "paddd %%mm6, %%mm0\n" // U5U4 -> mm0
+ "movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
+
+ "pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
+ "punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "movq %%mm5, %%mm7\n" // R7B6G6R6 -> mm7
+ "paddd %%mm4, %%mm3\n" // V5V4 -> mm3
+
+ "pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
+ "movq %%mm1, %%mm4\n" // B7G7R7B6 -> mm4
+
+ "pmaddwd UBG0BX, %%mm4\n" // ubB7+ugG7,ubB6 -> mm4
+ "psrad $15, %%mm0\n" // 32-bit scaled U5U4 -> %%mm0
+
+ //----------------------------------------------------------------------
+
+ "paddd OFFSETWX, %%mm0\n" // add offset to U5U4 -> mm0
+ "psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
+
+ "paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
+ "movq %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "pmaddwd UR0GRX, %%mm7\n" // urR7,ugG6+ugR6 -> mm7
+ "psrad $15, %%mm3\n" // 32-bit scaled V5V4 -> mm3
+
+ "pmaddwd VBG0BX, %%mm1\n" // vbB7+vgG7,vbB6 -> mm1
+ "psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
+
+ "paddd OFFSETDX, %%mm4\n" // add offset to U7U6
+ "packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
+
+ "pmaddwd VR0GRX, %%mm5\n" // vrR7,vgG6+vrR6 -> mm5
+ "paddd %%mm4, %%mm7\n" // U7U6 -> mm7
+
+ "psrad $15, %%mm7\n" // 32-bit scaled U7U6 -> mm7
+
+ //----------------------------------------------------------------------
+
+ "movq 44%5, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
+ "packssdw %%mm7, %%mm0\n" // 32-bit scaled U7U6U5U4 -> mm0
+
+ "movq 52%5, %%mm4\n" // 32-bit scaled U3U2U1U0 -> mm4
+ "packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
+
+ "movq OFFSETBX, %%mm7\n" // 128,128,128,128 -> mm7
+ "paddd %%mm5, %%mm1\n" // V7V6 -> mm1
+
+ "paddw %%mm7, %%mm4\n" // add offset to U3U2U1U0/256
+ "psrad $15, %%mm1\n" // 32-bit scaled V7V6 -> mm1
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm6, (%1)\n" // store Y
+
+ "packuswb %%mm0, %%mm4\n" // all 8 U values -> mm4
+ "movq 60%5, %%mm5\n" // 32-bit scaled V3V2V1V0 -> mm5
+
+ "packssdw %%mm1, %%mm3\n" // V7V6V5V4 -> mm3
+ "paddw %%mm7, %%mm5\n" // add offset to V3V2V1V0
+ "paddw %%mm7, %%mm3\n" // add offset to V7V6V5V4
+
+ "packuswb %%mm3, %%mm5\n" // ALL 8 V values -> mm5
+
+ "movq CLEARX, %%mm2\n"
+ "pand %%mm2, %%mm4\n"
+ "pand %%mm2, %%mm5\n"
+
+ "packuswb %%mm5, %%mm4\n"
+
+ "movd %%mm4, (%2)\n"
+ "psrlq $32, %%mm4\n"
+ "movd %%mm4, (%3)\n"
+
+ "add $8, %1\n"
+ "add $4, %2\n"
+ "add $4, %3\n"
+
+ "sub $8, %4\n"
+ "jnz 1b\n"
+
+ "emms\n"
+
+ :
+ : "r" (rgb), "r" (lum), "r" (cr), "r" (cb),
+ "m" (pixel), "m" (buf[0])
+
+ );
+}
+
+void rgb2yuv16bit_mmx422_row_fast(unsigned char* rgb,
+ unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel)
+{
+ __asm__ __volatile__ (
+ "1:\n"
+
+ // unpack hicolor ( pixel 0 - 3)
+ "movq (%0), %%mm0\n"
+
+ "movq %%mm0, %%mm1\n"
+ "pand KEEPR, %%mm1\n"
+ "psrlq $11, %%mm1\n" // B3B2B1B0 -> mm1
+
+ "movq %%mm0, %%mm2\n"
+ "pand KEEPG, %%mm2\n"
+ "psrlq $5, %%mm2\n" // G3G2G1G0 -> mm2
+
+ "movq %%mm0, %%mm3\n"
+ "pand KEEPB, %%mm3\n" // R3R2R1R0 -> mm3
+
+ // unpack hicolor ( pixel 4 - 7)
+ "movq 8(%0), %%mm0\n"
+
+ "movq %%mm0, %%mm4\n"
+ "pand KEEPR, %%mm4\n"
+ "psrlq $11, %%mm4\n" // B7B6B5B4 -> mm4
+
+ "movq %%mm0, %%mm5\n"
+ "pand KEEPG, %%mm5\n"
+ "psrlq $5, %%mm5\n" // G7G6G5G4 -> mm5
+
+ "movq %%mm0, %%mm6\n"
+ "pand KEEPB, %%mm6\n" // R7R6R5R4 -> mm6
+
+ // calculate Y
+ "movq %%mm6, %%mm7\n"
+ "pmullw Y_RED, %%mm7\n"
+
+ "movq %%mm5, %%mm0\n"
+ "pmullw Y_GREEN, %%mm0\n"
+ "paddw %%mm0, %%mm7\n"
+
+ "movq %%mm4, %%mm0\n"
+ "pmullw Y_BLUE, %%mm0\n"
+ "paddw %%mm0, %%mm7\n"
+
+ "psrlw $7, %%mm7\n" // Y3Y2Y1Y0 -> mm7
+
+ "pxor %%mm0, %%mm0\n"
+ "packuswb %%mm0, %%mm7\n"
+ "movd %%mm7, 4(%1)\n" // Y3Y2Y1Y0 -> lum
+
+ // --------
+
+ "movq %%mm3, %%mm7\n"
+ "pmullw Y_RED, %%mm7\n"
+
+ "movq %%mm2, %%mm0\n"
+ "pmullw Y_GREEN, %%mm0\n"
+ "paddw %%mm0, %%mm7\n"
+
+ "movq %%mm1, %%mm0\n"
+ "pmullw Y_BLUE, %%mm0\n"
+ "paddw %%mm0, %%mm7\n"
+
+ "psrlw $7, %%mm7\n" // Y7Y6Y5Y4 -> mm7
+
+ "pxor %%mm0, %%mm0\n"
+ "packuswb %%mm0, %%mm7\n"
+ "movd %%mm7, (%1)\n" // Y7Y6Y5Y4 -> lum
+ "add $8, %1\n"
+
+ // pack RGB
+ "packuswb %%mm4, %%mm1\n"
+ "pand CLEARX, %%mm1\n" // B6B4B2B0 -> mm1
+ "packuswb %%mm5, %%mm2\n"
+ "pand CLEARX, %%mm2\n" // GRG4G2G0 -> mm2
+ "packuswb %%mm6, %%mm3\n"
+ "pand CLEARX, %%mm3\n" // R6R4R2R0 -> mm3
+
+ // calculate U
+ "movq %%mm3, %%mm7\n"
+ "pmullw U_RED, %%mm7\n"
+
+ "movq %%mm2, %%mm0\n"
+ "pmullw U_GREEN, %%mm0\n"
+ "paddw %%mm0, %%mm7\n"
+
+ "movq %%mm1, %%mm0\n"
+ "pmullw U_BLUE, %%mm0\n"
+ "paddw %%mm0, %%mm7\n"
+
+ "psrlw $7, %%mm7\n" // U3U2U1U0 -> mm7
+ "paddw OFFSETBX,%%mm7\n"
+ "pand CLEARX, %%mm7\n"
+
+ "pxor %%mm0, %%mm0\n"
+ "packuswb %%mm0, %%mm7\n"
+ "movd %%mm7, (%2)\n" // U3U2U1U0 -> lum
+ "add $4, %2\n"
+
+ // calculate V
+ "movq %%mm3, %%mm7\n"
+ "pmullw V_RED, %%mm7\n"
+
+ "movq %%mm2, %%mm0\n"
+ "pmullw V_GREEN, %%mm0\n"
+ "paddw %%mm0, %%mm7\n"
+
+ "movq %%mm1, %%mm0\n"
+ "pmullw V_BLUE, %%mm0\n"
+ "paddw %%mm0, %%mm7\n"
+
+ "psrlw $7, %%mm7\n" // V3V2V1V0 -> mm7
+ "paddw OFFSETBX,%%mm7\n"
+ "pand CLEARX, %%mm7\n"
+
+ "pxor %%mm0, %%mm0\n"
+ "packuswb %%mm0, %%mm7\n"
+ "movd %%mm7, (%3)\n" // V3V2V1V0 -> lum
+ "add $4, %3\n"
+
+ "add $16, %0\n"
+
+ "sub $8, %4\n"
+ "jnz 1b\n"
+
+ "emms\n"
+
+ :
+ : "r" (rgb), "r" (lum), "r" (cr), "r" (cb), "m" (pixel)
+
+ );
+}
+
+void rgb2y16bit_mmx_row(unsigned char* rgbSource,
+ unsigned char* lum, int pixel)
+{
+ unsigned int buf[16];
+
+ // 36%3 = TEMP0
+ // 44%3 = TEMPY
+
+ __asm__ __volatile__ (
+ "1:\n"
+
+ // unpack hicolor ( pixel 0 - 3)
+ "movq (%0), %%mm0\n"
+
+ "movq %%mm0, %%mm1\n"
+ "pand KEEPR, %%mm1\n"
+ "psrlq $8, %%mm1\n" // B3B2B1B0 -> mm1
+ "movq %%mm0, %%mm2\n"
+ "pand KEEPG, %%mm2\n"
+ "psrlq $3, %%mm2\n" // G3G2G1G0 -> mm2
+ "movq %%mm0, %%mm3\n"
+ "pand KEEPB, %%mm3\n"
+ "psllq $3, %%mm3\n" // G3G2G1G0 -> mm3
+
+ "movq %%mm2, %%mm0\n"
+ "punpcklbw %%mm1, %%mm2\n"
+ "punpckhbw %%mm1, %%mm0\n"
+
+ "pxor %%mm5, %%mm5\n"
+ "movq %%mm3, %%mm4\n"
+ "punpcklbw %%mm5, %%mm3\n"
+ "punpckhbw %%mm5, %%mm4\n"
+
+ "psllq $8, %%mm2\n"
+ "por %%mm2, %%mm3\n" // 0B1G1R10B0G0G0 -> mm3
+ "psllq $8, %%mm0\n"
+ "por %%mm0, %%mm4\n" // 0B3G3R30B2G2G2 -> mm4
+
+ "movq %%mm3, %3\n"
+ "movq %%mm4, 8%3\n"
+
+ // next 4 pixels ------------------------------
+
+ "movq 8(%0), %%mm0\n"
+
+ "movq %%mm0, %%mm1\n"
+ "pand KEEPR, %%mm1\n"
+ "psrlq $8, %%mm1\n" // B3B2B1B0 -> mm1
+ "movq %%mm0, %%mm2\n"
+ "pand KEEPG, %%mm2\n"
+ "psrlq $3, %%mm2\n" // G3G2G1G0 -> mm2
+ "movq %%mm0, %%mm3\n"
+ "pand KEEPB, %%mm3\n"
+ "psllq $3, %%mm3\n" // G3G2G1G0 -> mm3
+
+ "movq %%mm2, %%mm0\n"
+ "punpcklbw %%mm1, %%mm2\n"
+ "punpckhbw %%mm1, %%mm0\n"
+
+ "pxor %%mm5, %%mm5\n"
+ "movq %%mm3, %%mm4\n"
+ "punpcklbw %%mm5, %%mm3\n"
+ "punpckhbw %%mm5, %%mm4\n"
+
+ "psllq $8, %%mm2\n"
+ "por %%mm2, %%mm3\n" // 0B1G1R10B0G0G0 -> mm3
+ "psllq $8, %%mm0\n"
+ "por %%mm0, %%mm4\n" // 0B3G3R30B2G2G2 -> mm4
+
+ "movq %%mm3, 16%3\n"
+ "movq %%mm4, 24%3\n"
+
+ "add $16, %0\n"
+
+ // standard algorithm --------------------------------------------------
+
+ // pack rgb
+ // was: "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
+ // ------------------------------
+ // (uses: mm0, mm1)
+ "movd 8%3, %%mm0\n"
+ "psllq $24, %%mm0\n"
+ "movd 4%3, %%mm1\n"
+ "por %%mm1, %%mm0\n"
+ "psllq $24, %%mm0\n"
+ "movd %3, %%mm1\n"
+ "por %%mm0, %%mm1\n"
+ // ------------------------------
+
+ "pxor %%mm6, %%mm6\n" // 0 -> mm6
+ "movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
+ "psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
+ "punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
+ "movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
+ "punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
+ "movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
+ "pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
+ "movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
+ "pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
+ "movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
+ "movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
+ "punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
+ "paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
+
+ // pack rgb
+ // was: "movq 8(%0),%%mm1\n" // R5B4G4R4B3G3R3B2 -> mm1
+ // ------------------------------
+ // (uses: mm1, mm6)
+ "movd 20%3, %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 16%3, %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 12%3, %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ "psllq $8, %%mm1\n"
+ "movd 8%3, %%mm6\n"
+ "psrlq $16, %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ // ------------------------------
+
+ "movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
+ "punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
+ "psllq $32, %%mm1\n" // R3B200 -> mm1
+
+ "paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
+
+ "punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
+ "movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
+
+ "pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
+ "movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
+
+ "pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
+ "psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
+
+ "movq %%mm6, 36%3\n" // R5B4G4R4 -> TEMP0
+ "movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
+
+ "paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
+ "movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
+ "psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
+
+ "packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
+
+ //----------------------------------------------------------------------
+
+ // pack rgb
+ // was: "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm1
+ // ------------------------------
+ // (uses: mm1, mm7)
+ "movd 28%3, %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 24%3, %%mm7\n"
+ "por %%mm7, %%mm1\n"
+ "psllq $16, %%mm1\n"
+ "movd 20%3, %%mm7\n"
+ "psrlq $8, %%mm7\n"
+ "por %%mm7, %%mm1\n"
+ // ------------------------------
+
+ "movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm1
+
+ "psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
+
+ "movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
+
+ "movq %%mm0, 44%3\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
+
+ "movq 36%3, %%mm0\n" // R5B4G4R4 -> mm0
+
+ "punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
+ "movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
+
+ "psrlq $32, %%mm0\n" // 00R5B4 -> mm0
+
+ "paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
+ "movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
+
+ "pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
+
+ "pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
+
+ //----------------------------------------------------------------------
+ "paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "pxor %%mm7, %%mm7\n" // 0 -> mm7
+
+ "punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
+
+ "movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
+
+ "pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
+ "punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
+
+ //----------------------------------------------------------------------
+
+ "psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
+
+ "paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
+ "psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
+
+ "packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "movq 44%3, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
+ "packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm6, (%1)\n" // store Y
+
+ "add $8, %1\n"
+
+ "sub $8, %2\n"
+ "jnz 1b\n"
+ "emms\n"
+
+ :
+ : "r" (rgbSource), "r" (lum), "m" (pixel), "m" (buf[0])
+
+ );
+}
+
+void rgb2y16bit_mmx_row_fast(unsigned char* rgb, unsigned char* lum, int pixel)
+{
+ __asm__ __volatile__ (
+ "1:\n"
+
+ // unpack hicolor ( pixel 0 - 3)
+ "movq (%0), %%mm0\n"
+
+ "movq %%mm0, %%mm1\n"
+ "pand KEEPR, %%mm1\n"
+ "psrlq $11, %%mm1\n" // B3B2B1B0 -> mm1
+
+ "movq %%mm0, %%mm2\n"
+ "pand KEEPG, %%mm2\n"
+ "psrlq $5, %%mm2\n" // G3G2G1G0 -> mm2
+ "movq %%mm0, %%mm3\n"
+ "pand KEEPB, %%mm3\n" // R3R2R1R0 -> mm3
+
+ // calculate Y
+ "movq %%mm3, %%mm4\n"
+ "pmullw Y_RED, %%mm4\n"
+
+ "movq %%mm2, %%mm5\n"
+ "pmullw Y_GREEN, %%mm5\n"
+ "paddw %%mm5, %%mm4\n"
+
+ "movq %%mm1, %%mm6\n"
+ "pmullw Y_BLUE, %%mm6\n"
+ "paddw %%mm6, %%mm4\n"
+
+ "psrlw $7, %%mm4\n" // Y3Y2Y1Y0 -> mm4
+
+ "pxor %%mm5, %%mm5\n"
+ "packuswb %%mm5, %%mm4\n"
+
+ "movd %%mm4, (%1)\n"
+ "add $4, %1\n"
+
+ "add $8, %0\n"
+
+ "sub $4, %2\n"
+ "jnz 1b\n"
+
+ "emms\n"
+
+ :
+ : "r" (rgb), "r" (lum), "m" (pixel)
+ );
+}
+
+
+#endif
+// INTEL
+
+
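The C fallback above relies on the RED()/GREEN()/BLUE() macros from rgb2yuvdefs.h, whose body is not part of this diff. For 5-6-5 pixels they would plausibly reduce to the sketch below; the masks mirror KEEPR/KEEPG/KEEPB (0xF800, 0x07E0, 0x001F) and the shifts mirror the psrlq/psllq amounts in the MMX rows, but the exact definitions in rgb2yuvdefs.h may differ:

```cpp
// Assumed RGB565 field extraction (illustrative, not from rgb2yuvdefs.h):
static inline int red565  (int rgb) { return (rgb >> 8) & 0xF8; } // bits 15-11
static inline int green565(int rgb) { return (rgb >> 3) & 0xFC; } // bits 10-5
static inline int blue565 (int rgb) { return (rgb << 3) & 0xF8; } // bits 4-0
```

Each 5- or 6-bit field is shifted up to the top of a byte so it spans roughly 0..255 before the Y/U/V fixed-point math, which is exactly what the MMX rows do with pand plus psrlq/psllq before the pmaddwd/pmullw color arithmetic.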
diff --git a/mpeglib/lib/util/render/dither2YUV/rgb2yuv16.h b/mpeglib/lib/util/render/dither2YUV/rgb2yuv16.h
new file mode 100644
index 00000000..7e4d6508
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/rgb2yuv16.h
@@ -0,0 +1,74 @@
+/***************************************************************************
+ rgb2yuv16.h - description
+ -------------------
+ begin : Tue Nov 2 2000
+ copyright : (C) 2000 by Christian Gerlach
+ email : cgerlach@rhrk.uni-kl.de
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#ifndef __RGB2YUV16_H
+#define __RGB2YUV16_H
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "../yuvPicture.h"
+#include "rgb2yuvdefs.h"
+
+// slow C implementation
+void rgb2yuv16bit(unsigned char* rgbSource,
+ unsigned char* destLum,
+ unsigned char* destCr,
+ unsigned char* destCb,int height, int width);
+
+
+
+//
+// We compile in the MMX code if we are on an INTEL arch
+// (this does not mean that the CPU really supports MMX;
+// that is a separate, runtime check)
+//
+
+#ifdef INTEL
+
+void rgb2yuv16bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width);
+
+void rgb2yuv16bit_mmx_fast(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width);
+
+void rgb2yuv16bit_mmx422_row(unsigned char* rgb,
+ unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel);
+
+void rgb2y16bit_mmx_row(unsigned char* rgbSource,
+ unsigned char* lum, int pixel);
+
+void rgb2yuv16bit_mmx422_row_fast(unsigned char* rgb,
+ unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel);
+
+void rgb2y16bit_mmx_row_fast(unsigned char* rgb,
+ unsigned char* lum, int pixel);
+
+
+#endif
+// INTEL
+
+
+
+#endif
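The comment above describes a two-level gate: #ifdef INTEL decides at compile time whether the MMX routines exist at all, and mm_support() (from ../../mmx/mmx.h, cached in Dither2YUV::lmmx) decides at runtime whether they may be called. Condensed into one sketch (convert16() is an illustrative name, not an mpeglib function):

```cpp
// Dispatch sketch combining the compile-time and runtime MMX checks:
void convert16(unsigned char* rgb, unsigned char* lum,
               unsigned char* cr, unsigned char* cb,
               int h, int w, int lmmx) {
#ifdef INTEL
    if (lmmx) {                               // the CPU really has MMX
        rgb2yuv16bit_mmx(rgb, lum, cr, cb, h, w);
        return;
    }
#endif
    rgb2yuv16bit(rgb, lum, cr, cb, h, w);     // portable C fallback
}
```

Note that doDither2YUV_std() in dither2YUV.cpp does the same test but silently produces no output when lmmx is set and INTEL is not compiled in; the sketch above always falls back to the C path.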
diff --git a/mpeglib/lib/util/render/dither2YUV/rgb2yuv32.cpp b/mpeglib/lib/util/render/dither2YUV/rgb2yuv32.cpp
new file mode 100644
index 00000000..3e246e25
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/rgb2yuv32.cpp
@@ -0,0 +1,1143 @@
+/***************************************************************************
+ rgb2yuv32.cpp - description
+ -------------------
+ begin : Tue Nov 2 2000
+ copyright : (C) 2000 by Christian Gerlach
+ email : cgerlach@rhrk.uni-kl.de
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#include "rgb2yuv32.h"
+#include <iostream>
+
+void rgb2yuv32(unsigned char* rgb, unsigned char* dest)
+{
+ dest[0] = Y_RGB(rgb[0], rgb[1], rgb[2]);
+ dest[1] = U_RGB(rgb[0], rgb[1], rgb[2]);
+ dest[2] = V_RGB(rgb[0], rgb[1], rgb[2]);
+}
+
+void rgb2yuv24bit(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+ int width2 = width / 2;
+ int height2 = height / 2;
+ int r, g, b, row, col;
+
+ for (row=0 ; row<height2 ; row++) {
+ for (col=0 ; col<width2 ; col++) {
+ r = *rgbSource++;
+ g = *rgbSource++;
+ b = *rgbSource++;
+
+ *lum++ = Y_RGB(r, g, b);
+ *cr++ = U_RGB(r, g, b);
+ *cb++ = V_RGB(r, g, b);
+
+ r = *rgbSource++;
+ g = *rgbSource++;
+ b = *rgbSource++;
+
+ *lum++ = Y_RGB(r, g, b);
+ }
+ for (col=0 ; col<width ; col++) {
+ r = *rgbSource++;
+ g = *rgbSource++;
+ b = *rgbSource++;
+
+ *lum++ = Y_RGB(r, g, b);
+ }
+ }
+}
+
+void rgb2yuv32bit(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+
+ int width2 = width / 2;
+ int height2 = height / 2;
+ int r, g, b, row, col;
+
+ for (row=0 ; row<height2 ; row++) {
+ for (col=0 ; col<width2 ; col++) {
+ r = *rgbSource++;
+ g = *rgbSource++;
+ b = *rgbSource++;
+ rgbSource++; // skip the unused alpha/pad byte
+
+ *lum++ = Y_RGB(r, g, b);
+ *cr++ = U_RGB(r, g, b);
+ *cb++ = V_RGB(r, g, b);
+
+ r = *rgbSource++;
+ g = *rgbSource++;
+ b = *rgbSource++;
+ rgbSource++; // skip the unused alpha/pad byte
+
+ *lum++ = Y_RGB(r, g, b);
+ }
+ for (col=0 ; col<width ; col++) {
+ r = *rgbSource++;
+ g = *rgbSource++;
+ b = *rgbSource++;
+ rgbSource++; // skip the unused alpha/pad byte
+
+ *lum++ = Y_RGB(r, g, b);
+ }
+ }
+}
+
+#ifndef INTEL
+void rgb2yuv24bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+ std::cout << "RGB->YUV render not compiled for INTEL"<<std::endl;
+ exit(0);
+}
+
+void rgb2yuv32bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+ std::cout << "RGB->YUV render not compiled for INTEL"<<std::endl;
+ exit(0);
+}
+
+#endif
+
+#ifdef INTEL
+
+void rgb2yuv24bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+ int width2 = width / 2;
+ int height2 = height / 2;
+ int row;
+ int bytesPerLine = width * 3;
+
+ for (row=0 ; row<height2 ; row++) {
+ rgb2yuv24bit_mmx422_row(rgbSource, lum, cr, cb, width);
+ rgbSource += bytesPerLine;
+ lum += width;
+ cr += width2;
+ cb += width2;
+
+ rgb2y24bit_mmx_row(rgbSource, lum, width);
+ rgbSource += bytesPerLine;
+ lum += width;
+ }
+}
+
+void rgb2yuv32bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width) {
+
+
+ int width2 = width / 2;
+ int height2 = height / 2;
+ int bytesPerLine = width * 4;
+
+ for (int row=0 ; row<height2 ; row++) {
+ rgb2yuv32bit_mmx422_row(rgbSource, lum,cr, cb, width);
+ rgbSource += bytesPerLine;
+ lum += width;
+ cr += width2;
+ cb += width2;
+
+ rgb2y32bit_mmx_row(rgbSource, lum, width);
+ rgbSource += bytesPerLine;
+ lum += width;
+ }
+}
+
+void rgb2yuv24bit_mmx444_row(unsigned char* rgb, unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel)
+{
+ unsigned int buf[8];
+
+ // %5 = TEMP0
+ // 8%5 = TEMPY
+ // 16%5 = TEMPU
+ // 24%5 = TEMPV
+
+ __asm__ __volatile__ (
+ "1:\n"
+
+ "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
+ "pxor %%mm6, %%mm6\n" // 0 -> mm6
+ "movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
+ "psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
+ "punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
+ "movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
+ "punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
+ "movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
+ "pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
+ "movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
+ "pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
+ "movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
+ "pmaddwd UR0GRX, %%mm2\n" // urR1,ugG0+urR0 -> mm2
+ "movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
+ "pmaddwd UBG0BX, %%mm3\n" // ubB1+ugG1,ubB0 -> mm3
+ "punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
+ "pmaddwd VR0GRX, %%mm4\n" // vrR1,vgG0+vrR0 -> mm4
+ "paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
+
+ "pmaddwd VBG0BX, %%mm5\n" // vbB1+vgG1,vbB0 -> mm5
+
+ "movq 8(%0),%%mm1\n" // load G2R2B1G1R1B0G0R0
+ "paddd %%mm3, %%mm2\n" // U1U0 -> mm2
+
+ "movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
+ "punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
+ "paddd %%mm5, %%mm4\n" // V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
+ "psllq $32, %%mm1\n" // R3B200 -> mm1
+
+ "paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
+
+ "punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
+ "movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
+
+ "pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
+ "movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
+
+ "pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
+ "psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
+
+ "movq %%mm6, %5\n" // R5B4G4R4 -> TEMP0
+ "movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
+ "pmaddwd UR0GRX, %%mm6\n" // urR3,ugG2+urR2 -> mm6
+ "psrad $15, %%mm2\n" // 32-bit scaled U1U0 -> mm2
+
+ "paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
+ "movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
+ "pmaddwd UBG0BX, %%mm7\n" // ubB3+ugG3,ubB2
+ "psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
+
+ "pmaddwd VR0GRX, %%mm3\n" // vrR3,vgG2+vgR2
+ "packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
+
+ "pmaddwd VBG0BX, %%mm5\n" // vbB3+vgG3,vbB2 -> mm5
+ "psrad $15, %%mm4\n" // 32-bit scaled V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm7
+ "paddd %%mm7, %%mm6\n" // U3U2 -> mm6
+
+ "movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm1
+ "psrad $15, %%mm6\n" // 32-bit scaled U3U2 -> mm6
+
+ "paddd %%mm5, %%mm3\n" // V3V2 -> mm3
+ "psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
+
+ "movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
+ "psrad $15, %%mm3\n" // 32-bit scaled V3V2 -> mm3
+
+ "movq %%mm0, 8%5\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
+ "packssdw %%mm6, %%mm2\n" // 32-bit scaled U3U2U1U0 -> mm2
+
+ "movq %5, %%mm0\n" // R5B4G4R4 -> mm0
+
+ "punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
+ "movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
+
+ "movq %%mm2, 16%5\n" // 32-bit scaled U3U2U1U0 -> TEMPU
+ "psrlq $32, %%mm0\n" // 00R5B4 -> mm0
+
+ "paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
+ "movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
+
+ "pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
+ "movq %%mm7, %%mm0\n" // B5G5R5B4 -> mm0
+
+ "pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
+ "packssdw %%mm3, %%mm4\n" // 32-bit scaled V3V2V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm4, 24%5\n" // (V3V2V1V0)/256 -> mm4
+
+ "movq %%mm6, %%mm4\n" // B5B4G4R4 -> mm4
+
+ "pmaddwd UR0GRX, %%mm6\n" // urR5,ugG4+urR4
+ "movq %%mm0, %%mm3\n" // B5G5R5B4 -> mm0
+
+ "pmaddwd UBG0BX, %%mm0\n" // ubB5+ugG5,ubB4
+ "paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "pmaddwd VR0GRX, %%mm4\n" // vrR5,vgG4+vrR4 -> mm4
+ "pxor %%mm7, %%mm7\n" // 0 -> mm7
+
+ "pmaddwd VBG0BX, %%mm3\n" // vbB5+vgG5,vbB4 -> mm3
+ "punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
+
+ "paddd %%mm6, %%mm0\n" // U5U4 -> mm0
+ "movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
+
+ "pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
+ "punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "movq %%mm5, %%mm7\n" // R7B6G6R6 -> mm7
+ "paddd %%mm4, %%mm3\n" // V5V4 -> mm3
+
+ "pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
+ "movq %%mm1, %%mm4\n" // B7G7R7B6 -> mm4
+
+ "pmaddwd UBG0BX, %%mm4\n" // ubB7+ugG7,ubB6 -> mm4
+ "psrad $15, %%mm0\n" // 32-bit scaled U5U4 -> %%mm0
+
+ //----------------------------------------------------------------------
+
+ "paddd OFFSETWX, %%mm0\n" // add offset to U5U4 -> mm0
+ "psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
+
+ "paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
+ "movq %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "pmaddwd UR0GRX, %%mm7\n" // urR7,ugG6+ugR6 -> mm7
+ "psrad $15, %%mm3\n" // 32-bit scaled V5V4 -> mm3
+
+ "pmaddwd VBG0BX, %%mm1\n" // vbB7+vgG7,vbB6 -> mm1
+ "psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
+
+ "paddd OFFSETDX, %%mm4\n" // add offset to U7U6
+ "packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
+
+ "pmaddwd VR0GRX, %%mm5\n" // vrR7,vgG6+vrR6 -> mm5
+ "paddd %%mm4, %%mm7\n" // U7U6 -> mm7
+
+ "psrad $15, %%mm7\n" // 32-bit scaled U7U6 -> mm7
+
+ //----------------------------------------------------------------------
+
+ "movq 8%5, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
+ "packssdw %%mm7, %%mm0\n" // 32-bit scaled U7U6U5U4 -> mm0
+
+ "movq 16%5, %%mm4\n" // 32-bit scaled U3U2U1U0 -> mm4
+ "packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
+
+ "movq OFFSETBX, %%mm7\n" // 128,128,128,128 -> mm7
+ "paddd %%mm5, %%mm1\n" // V7V6 -> mm1
+
+ "paddw %%mm7, %%mm4\n" // add offset to U3U2U1U0/256
+ "psrad $15, %%mm1\n" // 32-bit scaled V7V6 -> mm1
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm6, (%1)\n" // store Y
+ "packuswb %%mm0, %%mm4\n" // all 8 U values -> mm4
+
+ "movq 24%5, %%mm5\n" // 32-bit scaled V3V2V1V0 -> mm5
+ "packssdw %%mm1, %%mm3\n" // V7V6V5V4 -> mm3
+ "paddw %%mm7, %%mm5\n" // add offset to V3V2V1V0
+ "paddw %%mm7, %%mm3\n" // add offset to V7V6V5V4
+
+ "movq %%mm4, (%2)\n" // store U
+ "packuswb %%mm3, %%mm5\n" // ALL 8 V values -> mm5
+
+ "movq %%mm5, (%3)\n" // store V
+
+ "sub $8, %4\n"
+ "jnz 1b\n"
+ "emms\n"
+
+ :
+ : "r" (rgb), "r" (lum), "r" (cr), "r" (cb), "m" (pixel), "m" (buf[0])
+ );
+}
+
+void rgb2yuv24bit_mmx422_row(unsigned char* rgb, unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel)
+{
+ unsigned int buf[8];
+
+ // %5 = TEMP0
+ // 8%5 = TEMPY
+ // 16%5 = TEMPU
+ // 24%5 = TEMPV
+
+ __asm__ __volatile__ (
+ "1:\n"
+
+ "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
+ "pxor %%mm6, %%mm6\n" // 0 -> mm6
+ "movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
+ "psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
+ "punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
+ "movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
+ "punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
+ "movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
+ "pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
+
+ "movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
+ "pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
+ "movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
+ "pmaddwd UR0GRX, %%mm2\n" // urR1,ugG0+urR0 -> mm2
+ "movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
+ "pmaddwd UBG0BX, %%mm3\n" // ubB1+ugG1,ubB0 -> mm3
+ "punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
+ "pmaddwd VR0GRX, %%mm4\n" // vrR1,vgG0+vrR0 -> mm4
+ "paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
+
+ "pmaddwd VBG0BX, %%mm5\n" // vbB1+vgG1,vbB0 -> mm5
+
+ "movq 8(%0),%%mm1\n" // load G2R2B1G1R1B0G0R0
+ "paddd %%mm3, %%mm2\n" // U1U0 -> mm2
+
+ "movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
+ "punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
+ "paddd %%mm5, %%mm4\n" // V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
+ "psllq $32, %%mm1\n" // R3B200 -> mm1
+
+ "paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
+
+ "punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
+ "movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
+
+ "pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
+ "movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
+
+ "pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
+ "psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
+
+ "movq %%mm6, %5\n" // R5B4G4R4 -> TEMP0
+ "movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
+ "pmaddwd UR0GRX, %%mm6\n" // urR3,ugG2+urR2 -> mm6
+ "psrad $15, %%mm2\n" // 32-bit scaled U1U0 -> mm2
+
+ "paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
+ "movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
+ "pmaddwd UBG0BX, %%mm7\n" // ubB3+ugG3,ubB2
+ "psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
+
+ "pmaddwd VR0GRX, %%mm3\n" // vrR3,vgG2+vgR2
+ "packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
+
+ "pmaddwd VBG0BX, %%mm5\n" // vbB3+vgG3,vbB2 -> mm5
+ "psrad $15, %%mm4\n" // 32-bit scaled V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm7
+ "paddd %%mm7, %%mm6\n" // U3U2 -> mm6
+
+ "movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm1
+ "psrad $15, %%mm6\n" // 32-bit scaled U3U2 -> mm6
+
+ "paddd %%mm5, %%mm3\n" // V3V2 -> mm3
+ "psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
+
+ "movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
+ "psrad $15, %%mm3\n" // 32-bit scaled V3V2 -> mm3
+
+ "movq %%mm0, 8%5\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
+ "packssdw %%mm6, %%mm2\n" // 32-bit scaled U3U2U1U0 -> mm2
+
+ "movq %5, %%mm0\n" // R5B4G4R4 -> mm0
+
+ "punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
+ "movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
+
+ "movq %%mm2, 16%5\n" // 32-bit scaled U3U2U1U0 -> TEMPU
+ "psrlq $32, %%mm0\n" // 00R5B4 -> mm0
+
+ "paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
+ "movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
+
+ "pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
+ "movq %%mm7, %%mm0\n" // B5G5R5B4 -> mm0
+
+ "pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
+ "packssdw %%mm3, %%mm4\n" // 32-bit scaled V3V2V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm4, 24%5\n" // (V3V2V1V0)/256 -> mm4
+
+ "movq %%mm6, %%mm4\n" // B5B4G4R4 -> mm4
+
+ "pmaddwd UR0GRX, %%mm6\n" // urR5,ugG4+urR4
+ "movq %%mm0, %%mm3\n" // B5G5R5B4 -> mm0
+
+ "pmaddwd UBG0BX, %%mm0\n" // ubB5+ugG5,ubB4
+ "paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "pmaddwd VR0GRX, %%mm4\n" // vrR5,vgG4+vrR4 -> mm4
+ "pxor %%mm7, %%mm7\n" // 0 -> mm7
+
+ "pmaddwd VBG0BX, %%mm3\n" // vbB5+vgG5,vbB4 -> mm3
+ "punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
+
+ "paddd %%mm6, %%mm0\n" // U5U4 -> mm0
+ "movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
+
+ "pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
+ "punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "movq %%mm5, %%mm7\n" // R7B6G6R6 -> mm7
+ "paddd %%mm4, %%mm3\n" // V5V4 -> mm3
+
+ "pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
+ "movq %%mm1, %%mm4\n" // B7G7R7B6 -> mm4
+
+ "pmaddwd UBG0BX, %%mm4\n" // ubB7+ugG7,ubB6 -> mm4
+ "psrad $15, %%mm0\n" // 32-bit scaled U5U4 -> %%mm0
+
+ //----------------------------------------------------------------------
+
+ "paddd OFFSETWX, %%mm0\n" // add offset to U5U4 -> mm0
+ "psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
+
+ "paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
+ "movq %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "pmaddwd UR0GRX, %%mm7\n" // urR7,ugG6+ugR6 -> mm7
+ "psrad $15, %%mm3\n" // 32-bit scaled V5V4 -> mm3
+
+ "pmaddwd VBG0BX, %%mm1\n" // vbB7+vgG7,vbB6 -> mm1
+ "psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
+
+ "paddd OFFSETDX, %%mm4\n" // add offset to U7U6
+ "packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
+
+ "pmaddwd VR0GRX, %%mm5\n" // vrR7,vgG6+vrR6 -> mm5
+ "paddd %%mm4, %%mm7\n" // U7U6 -> mm7
+
+ "psrad $15, %%mm7\n" // 32-bit scaled U7U6 -> mm7
+
+ //----------------------------------------------------------------------
+
+ "movq 8%5, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
+ "packssdw %%mm7, %%mm0\n" // 32-bit scaled U7U6U5U4 -> mm0
+
+ "movq 16%5, %%mm4\n" // 32-bit scaled U3U2U1U0 -> mm4
+ "packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
+
+ "movq OFFSETBX, %%mm7\n" // 128,128,128,128 -> mm7
+ "paddd %%mm5, %%mm1\n" // V7V6 -> mm1
+
+ "paddw %%mm7, %%mm4\n" // add offset to U3U2U1U0/256
+ "psrad $15, %%mm1\n" // 32-bit scaled V7V6 -> mm1
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm6, (%1)\n" // store Y
+ "packuswb %%mm0, %%mm4\n" // all 8 U values -> mm4
+
+ "movq 24%5, %%mm5\n" // 32-bit scaled V3V2V1V0 -> mm5
+ "packssdw %%mm1, %%mm3\n" // V7V6V5V4 -> mm3
+ "paddw %%mm7, %%mm5\n" // add offset to V3V2V1V0
+ "paddw %%mm7, %%mm3\n" // add offset to V7V6V5V4
+
+ "packuswb %%mm3, %%mm5\n" // ALL 8 V values -> mm5
+
+ // pack U and V
+ "movq CLEARX, %%mm2\n"
+ "pand %%mm2, %%mm4\n"
+ "pand %%mm2, %%mm5\n"
+
+ "packuswb %%mm5, %%mm4\n"
+
+ "movd %%mm4, (%2)\n"
+ "psrlq $32, %%mm4\n"
+ "movd %%mm4, (%3)\n"
+
+ "add $24, %0\n"
+ "add $8, %1\n"
+ "add $4, %2\n"
+ "add $4, %3\n"
+
+ "sub $8, %4\n"
+ "jnz 1b\n"
+ "emms\n"
+
+ :
+ : "r" (rgb), "r" (lum), "r" (cr), "r" (cb), "m" (pixel), "m" (buf[0])
+ );
+}
+
+void rgb2yuv32bit_mmx422_row(unsigned char* rgb, unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel)
+{
+ unsigned int buf[8];
+
+ // %5 = TEMP0
+ // 8%5 = TEMPY
+ // 16%5 = TEMPU
+ // 24%5 = TEMPV
+
+ __asm__ __volatile__ (
+ "1:\n"
+
+ // pack rgb
+ // was: "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
+ // ------------------------------
+ // (uses: mm0, mm1)
+ "movd 8(%0), %%mm0\n"
+ "psllq $24, %%mm0\n"
+ "movd 4(%0), %%mm1\n"
+ "por %%mm1, %%mm0\n"
+ "psllq $24, %%mm0\n"
+ "movd (%0), %%mm1\n"
+ "por %%mm0, %%mm1\n"
+ // ------------------------------
+
+ "pxor %%mm6, %%mm6\n" // 0 -> mm6
+ "movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
+ "psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
+ "punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
+ "movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
+ "punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
+ "movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
+ "pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
+
+ "movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
+ "pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
+ "movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
+ "pmaddwd UR0GRX, %%mm2\n" // urR1,ugG0+urR0 -> mm2
+ "movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
+ "pmaddwd UBG0BX, %%mm3\n" // ubB1+ugG1,ubB0 -> mm3
+ "punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
+ "pmaddwd VR0GRX, %%mm4\n" // vrR1,vgG0+vrR0 -> mm4
+ "paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
+
+ "pmaddwd VBG0BX, %%mm5\n" // vbB1+vgG1,vbB0 -> mm5
+
+ // pack rgb
+ // was: "movq 8(%0),%%mm1\n" // R5B4G4R4B3G3R3B2 -> mm1
+ // ------------------------------
+ // (uses: mm1, mm6)
+ "movd 20(%0), %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 16(%0), %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 12(%0), %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ "psllq $8, %%mm1\n"
+ "movd 8(%0), %%mm6\n"
+ "psrlq $16, %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ // ------------------------------
+
+ "paddd %%mm3, %%mm2\n" // U1U0 -> mm2
+
+ "movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
+ "punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
+ "paddd %%mm5, %%mm4\n" // V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
+ "psllq $32, %%mm1\n" // R3B200 -> mm1
+
+ "paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
+
+ "punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
+ "movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
+
+ "pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
+ "movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
+
+ "pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
+ "psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
+
+ "movq %%mm6, %5\n" // R5B4G4R4 -> TEMP0
+ "movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
+ "pmaddwd UR0GRX, %%mm6\n" // urR3,ugG2+urR2 -> mm6
+ "psrad $15, %%mm2\n" // 32-bit scaled U1U0 -> mm2
+
+ "paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
+ "movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
+ "pmaddwd UBG0BX, %%mm7\n" // ubB3+ugG3,ubB2
+ "psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
+
+ "pmaddwd VR0GRX, %%mm3\n" // vrR3,vgG2+vgR2
+ "packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
+
+ "pmaddwd VBG0BX, %%mm5\n" // vbB3+vgG3,vbB2 -> mm5
+ "psrad $15, %%mm4\n" // 32-bit scaled V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "paddd %%mm7, %%mm6\n" // U3U2 -> mm6
+
+ // pack rgb
+ // was: "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm1
+ // ------------------------------
+ // (uses: mm1, mm7)
+ "movd 28(%0), %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 24(%0), %%mm7\n"
+ "por %%mm7, %%mm1\n"
+ "psllq $16, %%mm1\n"
+ "movd 20(%0), %%mm7\n"
+ "psrlq $8, %%mm7\n"
+ "por %%mm7, %%mm1\n"
+ // ------------------------------
+
+ "movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm1
+ "psrad $15, %%mm6\n" // 32-bit scaled U3U2 -> mm6
+
+ "paddd %%mm5, %%mm3\n" // V3V2 -> mm3
+ "psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
+
+ "movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
+ "psrad $15, %%mm3\n" // 32-bit scaled V3V2 -> mm3
+
+ "movq %%mm0, 8%5\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
+
+ "packssdw %%mm6, %%mm2\n" // 32-bit scaled U3U2U1U0 -> mm2
+
+ "movq %5, %%mm0\n" // R5B4G4R4 -> mm0
+
+ "punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
+ "movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
+
+ "movq %%mm2, 16%5\n" // 32-bit scaled U3U2U1U0 -> TEMPU
+ "psrlq $32, %%mm0\n" // 00R5B4 -> mm0
+
+ "paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
+ "movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
+
+ "pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
+ "movq %%mm7, %%mm0\n" // B5G5R5B4 -> mm0
+
+ "pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
+ "packssdw %%mm3, %%mm4\n" // 32-bit scaled V3V2V1V0 -> mm4
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm4, 24%5\n" // (V3V2V1V0)/256 -> mm4
+
+ "movq %%mm6, %%mm4\n" // B5B4G4R4 -> mm4
+
+ "pmaddwd UR0GRX, %%mm6\n" // urR5,ugG4+urR4
+ "movq %%mm0, %%mm3\n" // B5G5R5B4 -> mm0
+
+ "pmaddwd UBG0BX, %%mm0\n" // ubB5+ugG5,ubB4
+ "paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "pmaddwd VR0GRX, %%mm4\n" // vrR5,vgG4+vrR4 -> mm4
+ "pxor %%mm7, %%mm7\n" // 0 -> mm7
+
+ "pmaddwd VBG0BX, %%mm3\n" // vbB5+vgG5,vbB4 -> mm3
+ "punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
+
+ "paddd %%mm6, %%mm0\n" // U5U4 -> mm0
+ "movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
+
+ "pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
+ "punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "movq %%mm5, %%mm7\n" // R7B6G6R6 -> mm7
+ "paddd %%mm4, %%mm3\n" // V5V4 -> mm3
+
+ "pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
+ "movq %%mm1, %%mm4\n" // B7G7R7B6 -> mm4
+
+ "pmaddwd UBG0BX, %%mm4\n" // ubB7+ugG7,ubB6 -> mm4
+ "psrad $15, %%mm0\n" // 32-bit scaled U5U4 -> %%mm0
+
+ //----------------------------------------------------------------------
+
+ "paddd OFFSETWX, %%mm0\n" // add offset to U5U4 -> mm0
+ "psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
+
+ "paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
+ "movq %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "pmaddwd UR0GRX, %%mm7\n" // urR7,ugG6+ugR6 -> mm7
+ "psrad $15, %%mm3\n" // 32-bit scaled V5V4 -> mm3
+
+ "pmaddwd VBG0BX, %%mm1\n" // vbB7+vgG7,vbB6 -> mm1
+ "psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
+
+ "paddd OFFSETDX, %%mm4\n" // add offset to U7U6
+ "packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
+
+ "pmaddwd VR0GRX, %%mm5\n" // vrR7,vgG6+vrR6 -> mm5
+ "paddd %%mm4, %%mm7\n" // U7U6 -> mm7
+
+ "psrad $15, %%mm7\n" // 32-bit scaled U7U6 -> mm7
+
+ //----------------------------------------------------------------------
+
+ "movq 8%5, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
+ "packssdw %%mm7, %%mm0\n" // 32-bit scaled U7U6U5U4 -> mm0
+
+ "movq 16%5, %%mm4\n" // 32-bit scaled U3U2U1U0 -> mm4
+ "packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
+
+ "movq OFFSETBX, %%mm7\n" // 128,128,128,128 -> mm7
+ "paddd %%mm5, %%mm1\n" // V7V6 -> mm1
+
+ "paddw %%mm7, %%mm4\n" // add offset to U3U2U1U0/256
+ "psrad $15, %%mm1\n" // 32-bit scaled V7V6 -> mm1
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm6, (%1)\n" // store Y
+
+ "packuswb %%mm0, %%mm4\n" // all 8 U values -> mm4
+ "movq 24%5, %%mm5\n" // 32-bit scaled V3V2V1V0 -> mm5
+
+ "packssdw %%mm1, %%mm3\n" // V7V6V5V4 -> mm3
+ "paddw %%mm7, %%mm5\n" // add offset to V3V2V1V0
+ "paddw %%mm7, %%mm3\n" // add offset to V7V6V5V4
+
+ "packuswb %%mm3, %%mm5\n" // ALL 8 V values -> mm5
+
+ "movq CLEARX, %%mm2\n"
+ "pand %%mm2, %%mm4\n"
+ "pand %%mm2, %%mm5\n"
+
+ "packuswb %%mm5, %%mm4\n"
+
+ "movd %%mm4, (%2)\n"
+ "psrlq $32, %%mm4\n"
+ "movd %%mm4, (%3)\n"
+
+ "add $32, %0\n"
+ "add $8, %1\n"
+ "add $4, %2\n"
+ "add $4, %3\n"
+
+ "sub $8, %4\n"
+ "jnz 1b\n"
+
+ "emms\n"
+
+ :
+ : "r" (rgb), "r" (lum), "r" (cr), "r" (cb), "m" (pixel), "m" (buf[0])
+
+ );
+}
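+
+// For reference, a scalar sketch of what the MMX row above computes.
+// This is an illustration only, not part of the build: it assumes 32-bit
+// BGR0 input, the Y_RGB/U_RGB/V_RGB macros from rgb2yuvdefs.h, and takes
+// the chroma of each pixel pair from the even pixel, which is what the
+// CLEARX mask achieves above. Following the MMX store order, the U
+// samples land in the buffer passed as cr and the V samples in cb.
+//
+// static void rgb2yuv32bit_c422_row(unsigned char* rgb, unsigned char* lum,
+//                                   unsigned char* cr, unsigned char* cb,
+//                                   int pixel) {
+//   for (int x = 0; x < pixel; x += 2) {
+//     int b = rgb[0], g = rgb[1], r = rgb[2];   // even pixel: Y + chroma
+//     *lum++ = Y_RGB(r, g, b);
+//     *cr++  = U_RGB(r, g, b);
+//     *cb++  = V_RGB(r, g, b);
+//     b = rgb[4]; g = rgb[5]; r = rgb[6];       // odd pixel: Y only
+//     *lum++ = Y_RGB(r, g, b);
+//     rgb += 8;
+//   }
+// }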
+
+void rgb2y24bit_mmx_row(unsigned char* rgbSource, unsigned char* lum, int pixel)
+{
+ unsigned int buf[4];
+
+ // %3 = TEMP0
+ // 8%3 = TEMPY
+
+ __asm__ __volatile__ (
+ "1:\n"
+
+ "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
+ "pxor %%mm6, %%mm6\n" // 0 -> mm6
+ "movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
+ "psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
+ "punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
+ "movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
+ "punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
+ "movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
+ "pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
+ "movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
+ "pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
+ "movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
+ "movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
+ "punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
+ "paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
+
+ "movq 8(%0),%%mm1\n" // load G2R2B1G1R1B0G0R0
+
+ "movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
+ "punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
+ "psllq $32, %%mm1\n" // R3B200 -> mm1
+
+ "paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
+
+ "punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
+ "movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
+
+ "pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
+ "movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
+
+ "pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
+ "psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
+
+ "movq %%mm6, %3\n" // R5B4G4R4 -> TEMP0
+ "movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
+
+ "paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
+ "movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
+ "psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
+
+ "packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
+
+ //----------------------------------------------------------------------
+
+ "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm7
+
+ "movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm1
+
+ "psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
+
+ "movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
+
+ "movq %%mm0, 8%3\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
+
+ "movq %3, %%mm0\n" // R5B4G4R4 -> mm0
+
+ "punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
+ "movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
+
+ "psrlq $32, %%mm0\n" // 00R5B4 -> mm0
+
+ "paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
+ "movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
+
+ "pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
+
+ "pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
+
+ //----------------------------------------------------------------------
+ "paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "pxor %%mm7, %%mm7\n" // 0 -> mm7
+
+ "punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
+
+ "movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
+
+ "pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
+ "punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
+
+ //----------------------------------------------------------------------
+
+ "psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
+
+ "paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
+ "psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
+
+ "packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "movq 8%3, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
+ "packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm6, (%1)\n" // store Y
+
+ "add $24, %0\n"
+ "add $8, %1\n"
+
+ "sub $8, %2\n"
+ "jnz 1b\n"
+ "emms\n"
+
+ :
+ : "r" (rgbSource), "r" (lum), "m" (pixel), "m" (buf[0])
+
+ );
+}
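+
+// Scalar counterpart of the luminance-only row above, kept as a sketch
+// for readability (illustration only; assumes 24-bit BGR input and the
+// Y_RGB macro from rgb2yuvdefs.h):
+//
+// static void rgb2y24bit_c_row(unsigned char* rgb, unsigned char* lum,
+//                              int pixel) {
+//   while (pixel--) {
+//     *lum++ = Y_RGB(rgb[2], rgb[1], rgb[0]);   // Y from B,G,R bytes
+//     rgb += 3;
+//   }
+// }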
+
+void rgb2y32bit_mmx_row(unsigned char* rgbSource, unsigned char* lum, int pixel)
+{
+ unsigned int buf[4];
+
+ // %3 = TEMP0
+ // 8%3 = TEMPY
+
+ __asm__ __volatile__ (
+ "1:\n"
+
+ // pack rgb
+ // was: "movq (%0), %%mm1\n" // load G2R2B1G1R1B0G0R0
+ // ------------------------------
+ // (uses: mm0, mm1)
+ "movd 8(%0), %%mm0\n"
+ "psllq $24, %%mm0\n"
+ "movd 4(%0), %%mm1\n"
+ "por %%mm1, %%mm0\n"
+ "psllq $24, %%mm0\n"
+ "movd (%0), %%mm1\n"
+ "por %%mm0, %%mm1\n"
+ // ------------------------------
+
+ "pxor %%mm6, %%mm6\n" // 0 -> mm6
+ "movq %%mm1, %%mm0\n" // G2R2B1G1R1B0G0R0 -> mm0
+ "psrlq $16, %%mm1\n" // 00G2R2B1G1R1B0 -> mm1
+ "punpcklbw ZEROSX, %%mm0\n" // R1B0G0R0 -> mm0
+ "movq %%mm1, %%mm7\n" // 00G2R2B1G1R1B0 -> mm7
+ "punpcklbw ZEROSX, %%mm1\n" // B1G1R1B0 -> mm1
+ "movq %%mm0, %%mm2\n" // R1B0G0R0 -> mm2
+ "pmaddwd YR0GRX, %%mm0\n" // yrR1,ygG0+yrR0 -> mm0
+ "movq %%mm1, %%mm3\n" // B1G1R1B0 -> mm3
+ "pmaddwd YBG0BX, %%mm1\n" // ybB1+ygG1,ybB0 -> mm1
+ "movq %%mm2, %%mm4\n" // R1B0G0R0 -> mm4
+ "movq %%mm3, %%mm5\n" // B1G1R1B0 -> mm5
+ "punpckhbw %%mm6, %%mm7\n" // 00G2R2 -> mm7
+ "paddd %%mm1, %%mm0\n" // Y1Y0 -> mm0
+
+ // pack rgb
+ // was: "movq 8(%0),%%mm1\n" // R5B4G4R4B3G3R3B2 -> mm1
+ // ------------------------------
+ // (uses: mm1, mm6)
+ "movd 20(%0), %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 16(%0), %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 12(%0), %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ "psllq $8, %%mm1\n"
+ "movd 8(%0), %%mm6\n"
+ "psrlq $16, %%mm6\n"
+ "por %%mm6, %%mm1\n"
+ // ------------------------------
+
+ "movq %%mm1, %%mm6\n" // R5B4G4R4B3G3R3B2 -> mm6
+ "punpcklbw ZEROSX, %%mm1\n" // B3G3R3B2 -> mm1
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm1, %%mm5\n" // B3G3R3B2 -> mm5
+ "psllq $32, %%mm1\n" // R3B200 -> mm1
+
+ "paddd %%mm7, %%mm1\n" // R3B200+00G2R2=R3B2G2R2->mm1
+
+ "punpckhbw ZEROSX, %%mm6\n" // R5B4G4R3 -> mm6
+ "movq %%mm1, %%mm3\n" // R3B2G2R2 -> mm3
+
+ "pmaddwd YR0GRX, %%mm1\n" // yrR3,ygG2+yrR2 -> mm1
+ "movq %%mm5, %%mm7\n" // B3G3R3B2 -> mm7
+
+ "pmaddwd YBG0BX, %%mm5\n" // ybB3+ygG3,ybB2 -> mm5
+ "psrad $15, %%mm0\n" // 32-bit scaled Y1Y0 -> mm0
+
+ "movq %%mm6, %3\n" // R5B4G4R4 -> TEMP0
+ "movq %%mm3, %%mm6\n" // R3B2G2R2 -> mm6
+
+ "paddd %%mm5, %%mm1\n" // Y3Y2 -> mm1
+ "movq %%mm7, %%mm5\n" // B3G3R3B2 -> mm5
+ "psrad $15, %%mm1\n" // 32-bit scaled Y3Y2 -> mm1
+
+ "packssdw %%mm1, %%mm0\n" // Y3Y2Y1Y0 -> mm0
+
+ //----------------------------------------------------------------------
+
+ // pack rgb
+ // was: "movq 16(%0), %%mm1\n" // B7G7R7B6G6R6B5G5 -> mm1
+ // ------------------------------
+ // (uses: mm1, mm7)
+ "movd 28(%0), %%mm1\n"
+ "psllq $24, %%mm1\n"
+ "movd 24(%0), %%mm7\n"
+ "por %%mm7, %%mm1\n"
+ "psllq $16, %%mm1\n"
+ "movd 20(%0), %%mm7\n"
+ "psrlq $8, %%mm7\n"
+ "por %%mm7, %%mm1\n"
+ // ------------------------------
+
+ "movq %%mm1, %%mm7\n" // B7G7R7B6G6R6B5G5 -> mm1
+
+ "psllq $16, %%mm7\n" // R7B6G6R6B5G500 -> mm7
+
+ "movq %%mm7, %%mm5\n" // R7B6G6R6B5G500 -> mm5
+
+ "movq %%mm0, 8%3\n" // 32-bit scaled Y3Y2Y1Y0 -> TEMPY
+
+ "movq %3, %%mm0\n" // R5B4G4R4 -> mm0
+
+ "punpcklbw ZEROSX, %%mm7\n" // B5G500 -> mm7
+ "movq %%mm0, %%mm6\n" // R5B4G4R4 -> mm6
+
+ "psrlq $32, %%mm0\n" // 00R5B4 -> mm0
+
+ "paddw %%mm0, %%mm7\n" // B5G5R5B4 -> mm7
+ "movq %%mm6, %%mm2\n" // B5B4G4R4 -> mm2
+
+ "pmaddwd YR0GRX, %%mm2\n" // yrR5,ygG4+yrR4 -> mm2
+
+ "pmaddwd YBG0BX, %%mm7\n" // ybB5+ygG5,ybB4 -> mm7
+
+ //----------------------------------------------------------------------
+ "paddd %%mm7, %%mm2\n" // Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "pxor %%mm7, %%mm7\n" // 0 -> mm7
+
+ "punpckhbw %%mm7, %%mm1\n" // B7G7R7B6 -> mm1
+
+ "movq %%mm1, %%mm6\n" // B7G7R7B6 -> mm6
+
+ "pmaddwd YBG0BX, %%mm6\n" // ybB7+ygG7,ybB6 -> mm6
+ "punpckhbw %%mm7, %%mm5\n" // R7B6G6R6 -> mm5
+
+ "pmaddwd YR0GRX, %%mm5\n" // yrR7,ygG6+yrR6 -> mm5
+
+ //----------------------------------------------------------------------
+
+ "psrad $15, %%mm2\n" // 32-bit scaled Y5Y4 -> mm2
+
+ "paddd %%mm5, %%mm6\n" // Y7Y6 -> mm6
+ "psrad $15, %%mm6\n" // 32-bit scaled Y7Y6 -> mm6
+
+ "packssdw %%mm6, %%mm2\n" // Y7Y6Y5Y4 -> mm2
+
+ //----------------------------------------------------------------------
+
+ "movq 8%3, %%mm6\n" // 32-bit scaled Y3Y2Y1Y0 -> mm6
+ "packuswb %%mm2, %%mm6\n" // all 8 Y values -> mm6
+
+ //----------------------------------------------------------------------
+
+ "movq %%mm6, (%1)\n" // store Y
+
+ "add $32, %0\n"
+ "add $8, %1\n"
+
+ "sub $8, %2\n"
+ "jnz 1b\n"
+ "emms\n"
+
+ :
+ : "r" (rgbSource), "r" (lum), "r" (pixel), "m" (buf[0])
+
+ );
+}
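+
+// A hedged sketch of how these row converters combine into a full yuv12
+// (4:2:0) frame conversion: chroma is computed on even rows only, odd
+// rows get luminance only. The actual driver is presumably
+// rgb2yuv32bit_mmx(); details such as stride handling may differ.
+//
+// for (int y = 0; y < height; y++) {
+//   if ((y & 1) == 0) {
+//     rgb2yuv32bit_mmx422_row(rgb, lum, cr, cb, width);  // Y + chroma
+//     cr += width / 2;
+//     cb += width / 2;
+//   } else {
+//     rgb2y32bit_mmx_row(rgb, lum, width);               // Y only
+//   }
+//   rgb += width * 4;
+//   lum += width;
+// }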
+
+#endif
+// INTEL
+
diff --git a/mpeglib/lib/util/render/dither2YUV/rgb2yuv32.h b/mpeglib/lib/util/render/dither2YUV/rgb2yuv32.h
new file mode 100644
index 00000000..75fea27f
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/rgb2yuv32.h
@@ -0,0 +1,93 @@
+/***************************************************************************
+ rgb2yuv32.h - description
+ -------------------
+ begin : Tue Nov 2 2000
+ copyright : (C) 2000 by Christian Gerlach
+ email : cgerlach@rhrk.uni-kl.de
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#ifndef _RGB2YUV32_H_
+#define _RGB2YUV32_H_
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "../yuvPicture.h"
+#include "rgb2yuvdefs.h"
+
+void rgb2yuv32(unsigned char* rgb, unsigned char* dest);
+
+// slow C routines
+void rgb2yuv24bit(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width);
+
+void rgb2yuv32bit(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width);
+
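+// Typical call (a sketch only; the buffer layout assumes yuv12 output,
+// i.e. a width*height Y plane followed by two (width/2)*(height/2)
+// chroma planes, in the order the cr/cb parameters are passed here):
+//
+// unsigned char* lum = dest;
+// unsigned char* cr  = dest + width * height;
+// unsigned char* cb  = cr + (width * height) / 4;
+// rgb2yuv24bit(rgbSource, lum, cr, cb, height, width);
+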
+
+
+
+#ifndef INTEL
+void rgb2yuv24bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width);
+
+void rgb2yuv32bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width);
+#endif
+
+
+#ifdef INTEL
+
+void rgb2yuv24bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width);
+
+void rgb2yuv32bit_mmx(unsigned char* rgbSource,
+ unsigned char* lum,
+ unsigned char* cr,
+ unsigned char* cb,int height, int width);
+
+
+void rgb2yuv24bit_mmx444_row(unsigned char* rgb,
+ unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel);
+
+
+void rgb2yuv24bit_mmx422_row(unsigned char* rgb,
+ unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel);
+
+void rgb2yuv32bit_mmx422_row(unsigned char* rgb,
+ unsigned char* lum, unsigned char* cr,
+ unsigned char* cb, int pixel);
+
+void rgb2y24bit_mmx_row(unsigned char* rgbSource,
+ unsigned char* lum, int pixel);
+
+void rgb2y32bit_mmx_row(unsigned char* rgbSource,
+ unsigned char* lum, int pixel);
+
+#endif
+// INTEL
+
+
+#endif
diff --git a/mpeglib/lib/util/render/dither2YUV/rgb2yuvdefs.h b/mpeglib/lib/util/render/dither2YUV/rgb2yuvdefs.h
new file mode 100644
index 00000000..5c7ae574
--- /dev/null
+++ b/mpeglib/lib/util/render/dither2YUV/rgb2yuvdefs.h
@@ -0,0 +1,74 @@
+/***************************************************************************
+ rgb2yuvdefs.h - description
+ -------------------
+ begin : Tue Nov 2 2000
+ copyright : (C) 2000 by Christian Gerlach
+ email : cgerlach@rhrk.uni-kl.de
+ ***************************************************************************/
+
+/***************************************************************************
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ ***************************************************************************/
+
+#ifndef __RGB2YUVDEFS_H
+#define __RGB2YUVDEFS_H
+
+/* gcc 3.3.1 and later optimise away symbols that are "not used" (i.e.
+   referenced only from asm code), so we have to mark them as used. */
+#if defined(__GNUC_PREREQ__) && !defined(__GNUC_PREREQ)
+#define __GNUC_PREREQ __GNUC_PREREQ__
+#endif
+#ifdef __GNUC_PREREQ
+#if __GNUC_PREREQ (3,1)
+# define __attribute_used__ __attribute__ ((__used__))
+#else
+# define __attribute_used__
+#endif
+#else
+# define __attribute_used__
+#endif
+
+// hicolor mode (16 bit) with r(5) g(6) b(5) bits (reverse order b, g, r)
+#define RED(rgb) (unsigned char) ((rgb) << 3)
+#define GREEN(rgb) (((rgb) & 0x7e0) >> 3)
+#define BLUE(rgb) (((rgb) & 0xf800) >> 8)
+
+#define YUV_SHIFT 15
+#define YUV_HALF (1<<(YUV_SHIFT-1))
+#define YUV_ONE (1<<YUV_SHIFT)
+#define Y_R ((int)( 0.299 * YUV_ONE ))
+#define Y_G ((int)( 0.587 * YUV_ONE ))
+#define Y_B ((int)( 0.114 * YUV_ONE ))
+#define U_R ((int)(-0.146 * YUV_ONE ))
+#define U_G ((int)(-0.288 * YUV_ONE ))
+#define U_B ((int)( 0.434 * YUV_ONE ))
+#define V_R ((int)( 0.617 * YUV_ONE ))
+#define V_G ((int)(-0.517 * YUV_ONE ))
+#define V_B ((int)(-0.100 * YUV_ONE ))
+
+#define Y_RGB(R,G,B) (( Y_R * (R) + Y_G * (G) + Y_B * (B)) >> YUV_SHIFT)
+#define U_RGB(R,G,B) ((( U_R * (R) + U_G * (G) + U_B * (B)) >> YUV_SHIFT) + 128)
+#define V_RGB(R,G,B) ((( V_R * (R) + V_G * (G) + V_B * (B)) >> YUV_SHIFT) + 128)
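+
+/* Quick sanity check of the fixed-point macros (illustrative):
+   Y_RGB(255,255,255) = 254, since the truncated Y coefficients sum to
+   32766, just under YUV_ONE; U_RGB = V_RGB = 128 for any grey value,
+   since the truncated U and V coefficient rows each sum to exactly 0. */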
+
+static unsigned char __attribute_used__ CLEARX[8] = { 255, 0, 255, 0, 255, 0, 255, 0 };
+static short __attribute_used__ ZEROSX[4] = { 0, 0, 0, 0 };
+
+static short __attribute_used__ OFFSETDX[4] = { 0, 64, 0, 64 };
+static short __attribute_used__ OFFSETWX[4] = { 128, 0, 128, 0 };
+static short __attribute_used__ OFFSETBX[4] = { 128, 128, 128, 128 };
+
+static short __attribute_used__ YR0GRX[4] = { Y_R, Y_G, 0, Y_R };
+static short __attribute_used__ YBG0BX[4] = { Y_B, 0, Y_G, Y_B };
+
+static short __attribute_used__ UR0GRX[4] = { U_R, U_G, 0, U_R };
+static short __attribute_used__ UBG0BX[4] = { U_B, 0, U_G, U_B };
+
+static short __attribute_used__ VR0GRX[4] = { V_R, V_G, 0, V_R };
+static short __attribute_used__ VBG0BX[4] = { V_B, 0, V_G, V_B };
+
+#endif