/*
  colorTables for 16,32 Bit depth
  Copyright (C) 2000  Martin Vogt

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU Library General Public License as published by
  the Free Software Foundation.

  For more information look at the file COPYRIGHT in this package

 */


#include "colorTableHighBit.h"

//#define INTERPOLATE


/*
 * Erik Corry's multi-byte dither routines.
 *
 * The basic idea is that the Init generates all the necessary tables.
 * The tables incorporate the information about the layout of pixels
 * in the XImage, so that it should be able to cope with 15-bit, 16-bit,
 * 24-bit (non-packed) and 32-bit (10-11 bits per color!) screens.
 * At present it cannot cope with 24-bit packed mode, since this involves
 * getting down to byte level again. It is assumed that the bits for each
 * color are contiguous in the longword.
 * 
 * Writing to memory is done in shorts or ints. (Unfortunately, short is not
 * very fast on Alpha, so there is room for improvement here). There is no
 * dither time check for overflow - instead the tables have slack at
 * each end. This is likely to be faster than an 'if' test as many modern
 * architectures are really bad at ifs. Potentially, each '&&' causes a 
 * pipeline flush!
 *
 * There is no shifting and fixed point arithmetic, as I really doubt you
 * can see the difference, and it costs. This may be just my bias, since I
 * heard that Intel is really bad at shifting.
 */


/*
 * Count the 1 bits in a PIXVAL-sized word.
 * Inspects the word one bit at a time, so it is slow: do not call often.
 */
static int number_of_bits_set(unsigned PIXVAL a) {
    int ones = 0;
    while (a) {
        /* look at the lowest bit, then drop it */
        if (a & 1) ++ones;
        a >>= 1;
    }
    return ones;
}


/*
 * How many 0 bits are there at most significant end of PIXVALword.
 * Low performance, do not call often.
 */
static int free_bits_at_top(unsigned PIXVAL a) {
      /* assume char is 8 bits */
    if(!a) return sizeof(unsigned PIXVAL) * 8;
        /* assume twos complement */
    if(((PIXVAL)a) < 0l) return 0;
    return 1 + free_bits_at_top ( a << 1);
}

/*
 * Count the 0 bits at the least significant end of a PIXVAL-sized word.
 * An all-zero word reports the full width of the word.
 * Inspects the word one bit at a time, so it is slow: do not call often.
 */
static int free_bits_at_bottom(unsigned PIXVAL a) {
    /* assume char is 8 bits */
    if (a == 0) return sizeof(unsigned PIXVAL) * 8;

    /* the word is nonzero, so a 1 bit reaches the bottom eventually */
    int zeros = 0;
    for (; (a & 1) == 0; a >>= 1) {
        ++zeros;
    }
    return zeros;
}


/*
 * Build the lookup tables for a display with the given depth and
 * color masks.
 *
 *   bpp        bits per pixel of the target; values of 24 and above
 *              select the 32-bit table setup, anything else the
 *              16-bit setup
 *   redMask, greenMask, blueMask
 *              bit masks of the color components inside a pixel
 */
ColorTableHighBit::ColorTableHighBit(int bpp,unsigned int redMask,
				     unsigned int greenMask,
				     unsigned int blueMask) {
  const int tabEntries = 256;   /* entries per luminance/chroma table */
  const int pixEntries = 768;   /* entries per component-to-pixel table */

  /* remember the format the tables are built for */
  this->blueMask=blueMask;
  this->greenMask=greenMask;
  this->redMask=redMask;
  this->bpp=bpp;

  /* one allocation carries the five luminance/chroma tables */
  colortab = new TABTYPE[5*tabEntries];

  Cr_r_tab = colortab;
  Cr_g_tab = colortab + tabEntries;
  Cb_g_tab = colortab + 2*tabEntries;
  Cb_b_tab = colortab + 3*tabEntries;
  L_tab    = colortab + 4*tabEntries;

  /* one allocation carries the three component-to-pixel tables */
  rgb_2_pix = new PIXVAL [3*pixEntries];

  r_2_pix_alloc = rgb_2_pix;
  g_2_pix_alloc = rgb_2_pix + pixEntries;
  b_2_pix_alloc = rgb_2_pix + 2*pixEntries;

  /* fill every table */
  const int thirty2 = (bpp>=24);
  initHighColor(thirty2,redMask,greenMask,blueMask);
}


/*
 * Release the two table allocations made by the constructor.
 * Both were obtained with new[], so they must be released with
 * delete[]; a plain delete on them is undefined behavior.
 */
ColorTableHighBit::~ColorTableHighBit() {
  delete [] colortab;
  delete [] rgb_2_pix;
}

/*
 *--------------------------------------------------------------
 *
 * InitColor16Dither --
 *
 *	To get rid of the multiply and other conversions in color
 *	dither, we use a lookup table.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	The lookup tables are initialized.
 *
 *--------------------------------------------------------------
 */

void ColorTableHighBit::initHighColor(int thirty2,unsigned int redMask,
				      unsigned int greenMask,
				      unsigned int blueMask) {
  
  unsigned PIXVAL red_mask = redMask;
  unsigned PIXVAL green_mask =greenMask;
  unsigned PIXVAL blue_mask = blueMask;

  int CR, CB, i;
    

  for (i=0; i<256; i++) {
    L_tab[i] = i;
    if (gammaCorrectFlag) {
      L_tab[i] = (TABTYPE)GAMMA_CORRECTION(i);
    }
    
    CB = CR = i;
    
    if (chromaCorrectFlag) {
      CB -= 128; 
      CB = CHROMA_CORRECTION128(CB);
      CR -= 128;
      CR = CHROMA_CORRECTION128(CR);
    } else {
      CB -= 128; CR -= 128;
    }
/* was
      Cr_r_tab[i] =  1.596 * CR;
      Cr_g_tab[i] = -0.813 * CR;
      Cb_g_tab[i] = -0.391 * CB;   
      Cb_b_tab[i] =  2.018 * CB;
  but they were just messed up.
  Then was (_Video Deymstified_):
      Cr_r_tab[i] =  1.366 * CR;
      Cr_g_tab[i] = -0.700 * CR;
      Cb_g_tab[i] = -0.334 * CB;   
      Cb_b_tab[i] =  1.732 * CB;
  but really should be:
   (from ITU-R BT.470-2 System B, G and SMPTE 170M )
*/
      Cr_r_tab[i] = (TABTYPE) ( (0.419/0.299) * CR  );
      Cr_g_tab[i] = (TABTYPE) ( -(0.299/0.419) * CR );
      Cb_g_tab[i] = (TABTYPE) ( -(0.114/0.331) * CB ); 
      Cb_b_tab[i] = (TABTYPE) (  (0.587/0.331) * CB );

/*
  though you could argue for:
    SMPTE 240M
      Cr_r_tab[i] =  (0.445/0.212) * CR;
      Cr_g_tab[i] = -(0.212/0.445) * CR;
      Cb_g_tab[i] = -(0.087/0.384) * CB; 
      Cb_b_tab[i] =  (0.701/0.384) * CB;
    FCC 
      Cr_r_tab[i] =  (0.421/0.30) * CR;
      Cr_g_tab[i] = -(0.30/0.421) * CR;
      Cb_g_tab[i] = -(0.11/0.331) * CB; 
      Cb_b_tab[i] =  (0.59/0.331) * CB;
    ITU-R BT.709 
      Cr_r_tab[i] =  (0.454/0.2125) * CR;
      Cr_g_tab[i] = -(0.2125/0.454) * CR;
      Cb_g_tab[i] = -(0.0721/0.386) * CB; 
      Cb_b_tab[i] =  (0.7154/0.386) * CB;
*/
    }

    /* 
     * Set up entries 0-255 in rgb-to-pixel value tables.
     */
    for (i = 0; i < 256; i++) {
      r_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set(red_mask));
      r_2_pix_alloc[i + 256] <<= free_bits_at_bottom(red_mask);
      g_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set(green_mask));
      g_2_pix_alloc[i + 256] <<= free_bits_at_bottom(green_mask);
      b_2_pix_alloc[i + 256] = i >> (8 - number_of_bits_set(blue_mask));
      b_2_pix_alloc[i + 256] <<= free_bits_at_bottom(blue_mask);
      /*
       * If we have 16-bit output depth, then we double the value
       * in the top word. This means that we can write out both
       * pixels in the pixel doubling mode with one op. It is 
       * harmless in the normal case as storing a 32-bit value
       * through a short pointer will lose the top bits anyway.
       * A similar optimisation for Alpha for 64 bit has been
       * prepared for, but is not yet implemented.
       */
      if(!thirty2) {
	r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 16;
	g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 16;
	b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 16;

      }
#ifdef SIXTYFOUR_BIT
      if(thirty2) {

	r_2_pix_alloc[i + 256] |= (r_2_pix_alloc[i + 256]) << 32;
	g_2_pix_alloc[i + 256] |= (g_2_pix_alloc[i + 256]) << 32;
	b_2_pix_alloc[i + 256] |= (b_2_pix_alloc[i + 256]) << 32;

      }
#endif
    }

    /*
     * Spread out the values we have to the rest of the array so that
     * we do not need to check for overflow.
     */
    for (i = 0; i < 256; i++) {
      r_2_pix_alloc[i] = r_2_pix_alloc[256];
      r_2_pix_alloc[i+ 512] = r_2_pix_alloc[511];
      g_2_pix_alloc[i] = g_2_pix_alloc[256];
      g_2_pix_alloc[i+ 512] = g_2_pix_alloc[511];
      b_2_pix_alloc[i] = b_2_pix_alloc[256];
      b_2_pix_alloc[i+ 512] = b_2_pix_alloc[511];
    }

    r_2_pix = r_2_pix_alloc + 256;
    g_2_pix = g_2_pix_alloc + 256;
    b_2_pix = b_2_pix_alloc + 256;
}