/*
  libuta - a C++ widget library based on SDL (Simple Direct Layer)
  Copyright (C) 1999-2002  Karsten Laux <klaux@student.uni-kl.de>

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.
  
  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.
  
  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the
  Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  Boston, MA  02111-1307, USA.
*/

#include "blitters.h"
#include <painter.h>

namespace uta {

#ifdef __OpenBSD__
#undef X86_ASSEMBLER
#endif
#if !defined(X86_ASSEMBLER)

/** Constant gamma blit RGBA8888 -> RGBA8888, fallback variant.
    This definition is the one compiled when X86_ASSEMBLER is not defined.
    It does not touch either surface: it only writes a notice to cerr.
 */
void gammablit_RGBA8888_RGBA8888(Surface* dst, const Surface* src, 
				 char g, 
				 const Rect& srcRect, const Point& dstPoint)
{
  // the stub ignores every argument
  (void)dst;
  (void)src;
  (void)g;
  (void)srcRect;
  (void)dstPoint;

  cerr << "processor seems not to be MMX capable, sorry. ";
  cerr << endl;
}

/** Constant gamma blit with conversion RGBA8888 -> RGB565, fallback variant.
    This definition is the one compiled when X86_ASSEMBLER is not defined.
    It does not touch either surface: it only writes a notice to cerr.
 */
void gammablit_RGBA8888_RGB565(Surface* dst, const Surface* src, 
			       char g, 
			       const Rect& srcRect, const Point& dstPoint)
{
  // the stub ignores every argument
  (void)dst;
  (void)src;
  (void)g;
  (void)srcRect;
  (void)dstPoint;

  cerr << "processor seems not to be MMX capable, sorry. ";
  cerr << endl;
}

/** Constant gamma blit RGB565 -> RGB565, fallback variant.
    This definition is the one compiled when X86_ASSEMBLER is not defined.
    It does not touch either surface: it only writes a notice to cerr.
 */
void gammablit_RGB565_RGB565(Surface* dst, const Surface* src, 
			     char g, 
			     const Rect& srcRect, const Point& dstPoint)
{
  // the stub ignores every argument
  (void)dst;
  (void)src;
  (void)g;
  (void)srcRect;
  (void)dstPoint;

  cerr << "processor seems not to be MMX capable, sorry. ";
  cerr << endl;
}

/** Per pixel gamma blit RGBA8888 -> RGBA8888, fallback variant.
    This definition is the one compiled when X86_ASSEMBLER is not defined.
    It does not touch either surface: it only writes a notice to cerr.
 */
void gammablit_RGBA8888_RGBA8888(Surface* dst, const Surface* src, 
				 GammaFunction* func, 
				 const Rect& srcRect, const Point& dstPoint)
{
  // the stub ignores every argument
  (void)dst;
  (void)src;
  (void)func;
  (void)srcRect;
  (void)dstPoint;

  cerr << "processor seems not to be MMX capable, sorry. ";
  cerr << endl;
}

/** Per pixel gamma blit with conversion RGBA8888 -> RGB565, fallback variant.
    This definition is the one compiled when X86_ASSEMBLER is not defined.
    It does not touch either surface: it only writes a notice to cerr.
 */
void gammablit_RGBA8888_RGB565(Surface* dst, const Surface* src, 
			       GammaFunction* func, 
			       const Rect& srcRect, const Point& dstPoint)
{
  // the stub ignores every argument
  (void)dst;
  (void)src;
  (void)func;
  (void)srcRect;
  (void)dstPoint;

  cerr << "processor seems not to be MMX capable, sorry. ";
  cerr << endl;
}

/** Per pixel gamma blit RGB565 -> RGB565, fallback variant.
    This definition is the one compiled when X86_ASSEMBLER is not defined.
    It does not touch either surface: it only writes a notice to cerr.
 */
void gammablit_RGB565_RGB565(Surface* dst, const Surface* src, 
			     GammaFunction* func, 
			     const Rect& srcRect, const Point& dstPoint)
{
  // the stub ignores every argument
  (void)dst;
  (void)src;
  (void)func;
  (void)srcRect;
  (void)dstPoint;

  cerr << "processor seems not to be MMX capable, sorry. ";
  cerr << endl;
}

#else  // ! X86_ASSEMBLER

// include internal header file
#include "blitter_macros.h"

/** Apply const gamma value (MMX implementation).

    Copies the rectangle srcRect of src to dst at dstPoint. For g >= 0 the
    value g is added to bytes 1..3 of every 32 bit pixel, for g < 0 the value
    -g is subtracted from them; both operations use unsigned saturation.
    Byte 0 of every pixel gets a gamma of 0, i.e. it is copied unchanged
    (the blit just copies the alpha value of the source).

    Both surfaces must use 4 bytes per pixel and the destination must be
    large enough to hold the rectangle (checked by assert only).

    @param dst      destination surface
    @param src      source surface
    @param g        signed gamma offset
    @param srcRect  area of src to blit
    @param dstPoint upper left corner of the target area in dst
 */
void gammablit_RGBA8888_RGBA8888(Surface* dst, const Surface* src, 
				 char g, 
				 const Rect& srcRect, const Point& dstPoint)
{
  assert(dst->pixelformat().bpp() == 4);
  assert(src->pixelformat().bpp() == 4);
  assert(dst->width() >= dstPoint.x + srcRect.width());
  assert(dst->height() >= dstPoint.y + srcRect.height());
  dst->lock();
  src->lock();
  
  /* NOTE THAT ALL POINTERS ARE UNSIGNED CHAR* !!! */
  
  // the sign of g selects add/sub, the MMX register holds the magnitude
  bool brighten = (g >= 0);
  if(g < 0)
    g = -g;
  
  unsigned char* sPtr = 
    (unsigned char*)src->pixels() 
    + srcRect.upperLeft().x * src->pixelformat().bpp() 
    + srcRect.upperLeft().y * src->pitch();
  
  unsigned char* dPtr = 
    (unsigned char*)dst->pixels()
    + dstPoint.x * dst->pixelformat().bpp()
    + dstPoint.y * dst->pitch();
  
  unsigned y = 0;
  unsigned x = 0;
  // bytes to skip at the end of a row to reach the start of the next one
  unsigned dstSkip = dst->pitch() - srcRect.width() * dst->pixelformat().bpp();
  unsigned srcSkip = src->pitch() - srcRect.width() * src->pixelformat().bpp();
  // number of 8 byte (2 pixel) steps per row ...
  unsigned x_reload = srcRect.width()*src->pixelformat().bpp() / 8;
  // ... and the remaining bytes (0, or 4 for an odd width)
  unsigned x_misalign = srcRect.width()*src->pixelformat().bpp() % 8;

  mmx_t gamma_val;
  
  /* load gamma value into 6 bytes 
     we process all 3 color components of 2 pixels at a step;
     bytes 0 and 4 stay 0 so that byte 0 of each pixel is not modified */
  gamma_val.ub[0]= 0;
  gamma_val.ub[1]= g;
  gamma_val.ub[2]= g;
  gamma_val.ub[3]= g;
  gamma_val.ub[4]= 0;
  gamma_val.ub[5]= g;
  gamma_val.ub[6]= g;
  gamma_val.ub[7]= g;
  
  movq_m2r(gamma_val, mm0);
   
  /* hmm ... should we care for (double) word alignment when accessing memory ? 
     ... will need to investigate  */
  if(brighten)
    {
      y = srcRect.height();
      while(y--)
        {
          // odd width: handle one single pixel first
          if(x_misalign)
            {
              // read 4 bytes (1 pixel) from source memory -> mm1 
              movd_m2r(*(mmx_t*)sPtr, mm1);
              
              // add with unsigned saturation
              paddusb_r2r(mm0,mm1);  

              /* write back the lower 32 bits of the result.
                 The operand has to be the destination memory itself
                 (dereferenced), like in the 64 bit store below. */
              movd_r2m(mm1,*(mmx_t*)dPtr);
              sPtr += 4;
              dPtr += 4;
            }
          
          x = x_reload;
          while(x--)
            {
              // read 8 bytes (2 pixels) from source memory -> mm1 
              movq_m2r(*(mmx_t*)sPtr, mm1);

              // add 8 bytes with unsigned saturation
              paddusb_r2r(mm0,mm1);  
              
              // finally write back the results
              movq_r2m(mm1,*(mmx_t*)dPtr);
              
              sPtr+=sizeof(mmx_t);
              dPtr+=sizeof(mmx_t);
            }
          
          sPtr += srcSkip;
          dPtr += dstSkip;
        }
    }
  else
    {
      /* same loops again with subtraction instead of addition, so that
         brighten does not have to be tested for every pixel */ 
      y = srcRect.height();
      while(y--)
        {
          // odd width: handle one single pixel first
          if(x_misalign)
            {
              // read 4 bytes (1 pixel) from source memory -> mm1 
              movd_m2r(*(mmx_t*)sPtr, mm1);
              
              // subtract with unsigned saturation
              psubusb_r2r(mm0,mm1);  

              /* write back the lower 32 bits of the result
                 (operand dereferenced, see above) */
              movd_r2m(mm1,*(mmx_t*)dPtr);
              sPtr += 4;
              dPtr += 4;
            }
          
          x = x_reload;
          while(x--)
            {
              // read 8 bytes (2 pixels) from source memory -> mm1 
              movq_m2r(*(mmx_t*)sPtr, mm1);

              // subtract 8 bytes with unsigned saturation
              psubusb_r2r(mm0,mm1);  
              
              // finally write back the results
              movq_r2m(mm1,*(mmx_t*)dPtr);
              
              sPtr+=sizeof(mmx_t);
              dPtr+=sizeof(mmx_t);
            }
          
          sPtr += srcSkip;
          dPtr += dstSkip;
        } 
    }
  
  // reset CPU back to FPU mode (end MMX mode)
  emms();
  
  dst->unlock();
  src->unlock();
}


/** Convert RGBA8888 to RGB565 and apply const gamma value (MMX implementation).

    Copies the rectangle srcRect of src to dst at dstPoint. For g >= 0 the
    value g is added to bytes 1..3 of every source pixel, for g < 0 the value
    -g is subtracted (unsigned saturation); the result is then reduced to
    16 bit using the colour masks of the destination pixel format.
    Byte 0 of the source pixels does not end up in the destination.

    The source must use 4 and the destination 2 bytes per pixel, and the
    destination must be large enough (checked by assert only).

    Each row is processed as (width % 4) single pixels followed by
    (width / 4) groups of four pixels.

    @param dst      destination surface (16 bit)
    @param src      source surface (32 bit)
    @param g        signed gamma offset
    @param srcRect  area of src to blit
    @param dstPoint upper left corner of the target area in dst
 */
void gammablit_RGBA8888_RGB565(Surface* dst, const Surface* src, 
			      char g, 
			      const Rect& srcRect, const Point& dstPoint)
{
  assert(dst->pixelformat().bpp() == 2);
  assert(src->pixelformat().bpp() == 4);
  assert(dst->width() >= dstPoint.x + srcRect.width());
  assert(dst->height() >= dstPoint.y + srcRect.height());
  dst->lock();
  src->lock();
  
  /* NOTE THAT ALL POINTERS ARE UNSIGNED CHAR* !!! */
  
  // the sign of g selects add/sub, the MMX register holds the magnitude
  bool brighten = (g >= 0);
  if(g < 0)
    g = -g;
  
  unsigned char* sPtr = 
    (unsigned char*)src->pixels() 
    + srcRect.upperLeft().x * src->pixelformat().bpp() 
    + srcRect.upperLeft().y * src->pitch();
  
  unsigned char* dPtr = 
    (unsigned char*)dst->pixels()
    + dstPoint.x * dst->pixelformat().bpp()
    + dstPoint.y * dst->pitch();
  
  unsigned rMask = dst->pixelformat().rMask();
  unsigned gMask = dst->pixelformat().gMask();
  unsigned bMask = dst->pixelformat().bMask();

  /* rMask -> mm5[0..15], mm5[32..47]; gMask -> mm6, bMask -> mm7 likewise;
   * other bits = 0 
   * (as described for this macro; its definition lives in blitter_macros.h)
   */
  PREPARE_64interleave_COLOR_MASKS(rMask, gMask, bMask);
  
  unsigned y = 0;
  unsigned x = 0;
  // bytes to skip at the end of a row to reach the start of the next one
  unsigned dstSkip = dst->pitch() - srcRect.width() * dst->pixelformat().bpp();
  unsigned srcSkip = src->pitch() - srcRect.width() * src->pixelformat().bpp();
  /* The main loop converts 4 pixels per pass (16 source bytes -> 8
     destination bytes); the up to 3 remaining pixels of a row are
     converted one by one. Handling just one of them would leave the row
     pointers out of step with srcSkip/dstSkip. */
  unsigned x_reload = srcRect.width() / 4;
  unsigned x_misalign = srcRect.width() % 4;

  mmx_t gamma_val;
  mmx_t pixel_mask;
  pixel_mask.q = 0xFFFFFFFFLL;

  /* load gamma value into 6 bytes 
     we process RGB of two pixels at a step */
  gamma_val.ub[0]= 0;
  gamma_val.ub[1]= g;
  gamma_val.ub[2]= g;
  gamma_val.ub[3]= g;
  gamma_val.ub[4]= 0;
  gamma_val.ub[5]= g;
  gamma_val.ub[6]= g;
  gamma_val.ub[7]= g;
  
  movq_m2r(gamma_val, mm0);
  
  // scratch memory for 16 bit stores
  mmx_t h;
  
  /* hmm ... should we care for (double) word alignment when accessing memory ? 
     ... will need to investigate  */
  if(brighten)
    {
      y = srcRect.height();
      while(y--)
        {
          // the 0..3 pixels which do not fill a group of four
          x = x_misalign;
          while(x--)
            {
              // read 4 bytes (1 pixel) from source memory -> mm2 
              movd_m2r(*(mmx_t*)sPtr, mm2);

              // add gamma value (to the pixel in mm2) using saturation
              paddusb_r2r(mm0, mm2);	     
              
              /* and copy the value to mm3 and mm4 */ 
              movq_r2r(mm2, mm3); 
              movq_r2r(mm2, mm4); 
              
              /* move R from byte[3] to byte[0] */
              psrld_i2r(24,mm2); 
              /* move G from byte[2] to byte[0] */
              psrld_i2r(16,mm3); 
              /* move B from byte[1] to byte[0] */		  
              psrld_i2r(8,mm4);
             
              // pack components mm2,mm3,mm4 to mm2
              PACK_32_RGB888_RGB565;
              
              /* write back the result, only using its lower 16 bits */
              movd_r2m(mm2,h);
              *(unsigned short*)dPtr = h.uw[0];
              sPtr += 4;
              dPtr += 2;
            }
          
          x = x_reload;
          while(x--)
            {
              // read 8 bytes (2 pixels) from source memory -> mm1 
              movq_m2r(*(mmx_t*)sPtr, mm1);

              // add gamma value using saturation
              paddusb_r2r(mm0, mm1);

              /* and copy the value to mm3 and mm4 */ 
              movq_r2r(mm1, mm3); 
              movq_r2r(mm1, mm4); 

              /* shift the components R,G,B to their 16 bit positions */
              psrlq_i2r(16,mm1); 
              psrlq_i2r(13,mm3); 
              psrlq_i2r(11,mm4);

              /* mask out unwanted bits (using the color masks) 
               * -> pixels at [0...15] and [32..47]
               */ 
              pand_r2r(mm5,mm1); 
              pand_r2r(mm6,mm3); 
              pand_r2r(mm7,mm4); 
              
              /* pack color components again */ 
              por_r2r(mm3,mm1); 
              por_r2r(mm4,mm1); 		  
              
              /* keep mm1 as intermediate result,
                 now the same again, but using mm2 instead of mm1 */
              sPtr += sizeof(mmx_t);
              
              // read 8 bytes (2 pixels) from source memory -> mm2 
              movq_m2r(*(mmx_t*)sPtr, mm2);

              // add gamma value using saturation
              paddusb_r2r(mm0, mm2);

              /* and copy the value to mm3 and mm4 */ 
              movq_r2r(mm2, mm3); 
              movq_r2r(mm2, mm4); 

              /* shift the components R,G,B to their 16 bit positions */
              psrlq_i2r(16,mm2); 
              psrlq_i2r(13,mm3); 
              psrlq_i2r(11,mm4);

              /* mask out unwanted bits (using the color masks) */ 
              pand_r2r(mm5,mm2); 
              pand_r2r(mm6,mm3); 
              pand_r2r(mm7,mm4); 
              
              /* pack color components again */ 
              por_r2r(mm3,mm2); 
              por_r2r(mm4,mm2); 		  
        
              psllq_i2r(16,mm2);

              /* now we have got the following: pixels Q,R,S,T
                 mm1: 0x0000RRRR0000QQQQ
                 mm2: 0xTTTT0000SSSS0000
                 and we want:
                 mm1: 0xTTTTSSSSRRRRQQQQ ...
              */
             
              movq_m2r(pixel_mask, mm4);  /* -> mm4: 0x00000000FFFFFFFF */

              movq_r2r(mm1,mm3);  /* -> mm3: 0x0000RRRR0000QQQQ */
              psrlq_i2r(16,mm3);  /* -> mm3: 0x00000000RRRR0000 */
              por_r2r(mm3, mm1);  /* -> mm1: 0x0000RRRRRRRRQQQQ */
              pand_r2r(mm4, mm1); /* -> mm1: 0x00000000RRRRQQQQ */

              movq_r2r(mm2, mm3); /* -> mm3: 0xTTTT0000SSSS0000 */
              psllq_i2r(16,mm3);  /* -> mm3: 0x0000SSSS00000000 */
              por_r2r(mm3, mm2);  /* -> mm2: 0xTTTTSSSSSSSS0000 */
              pandn_r2r(mm2, mm4);/* -> mm4: 0xTTTTSSSS00000000 */

              por_r2r(mm4,mm1);   /* -> mm1: 0xTTTTSSSSRRRRQQQQ */

              // finally write back the results
              movq_r2m(mm1,*(mmx_t*)dPtr);
              
              sPtr+=sizeof(mmx_t);
              dPtr+=sizeof(mmx_t);
            }
          
          sPtr += srcSkip;
          dPtr += dstSkip;
        }
    }
  else
    {
      /* same loops again with subtraction instead of addition, so that
         brighten does not have to be tested for every pixel */ 
      y = srcRect.height();
      while(y--)
        {
          // the 0..3 pixels which do not fill a group of four
          x = x_misalign;
          while(x--)
            {
              // read 4 bytes (1 pixel) from source memory -> mm2 
              movd_m2r(*(mmx_t*)sPtr, mm2);

              // subtract gamma value (from the pixel in mm2) using saturation
              psubusb_r2r(mm0, mm2);	     
              
              /* and copy the value to mm3 and mm4 */ 
              movq_r2r(mm2, mm3); 
              movq_r2r(mm2, mm4); 
              
              /* move R from byte[3] to byte[0] */
              psrld_i2r(24,mm2); 
              /* move G from byte[2] to byte[0] */
              psrld_i2r(16,mm3); 
              /* move B from byte[1] to byte[0] */		  
              psrld_i2r(8,mm4);
             
              // pack components mm2,mm3,mm4 to mm2
              PACK_32_RGB888_RGB565;
              
              /* write back the result, only using its lower 16 bits */
              movd_r2m(mm2,h);
              *(unsigned short*)dPtr = h.uw[0];
              sPtr += 4;
              dPtr += 2;
            }
          
          x = x_reload;
          while(x--)
            {
              // read 8 bytes (2 pixels) from source memory -> mm1 
              movq_m2r(*(mmx_t*)sPtr, mm1);

              // subtract gamma value using saturation
              psubusb_r2r(mm0, mm1);

              /* and copy the value to mm3 and mm4 */ 
              movq_r2r(mm1, mm3); 
              movq_r2r(mm1, mm4); 

              /* shift the components R,G,B to their 16 bit positions */
              psrlq_i2r(16,mm1); 
              psrlq_i2r(13,mm3); 
              psrlq_i2r(11,mm4);

              /* mask out unwanted bits (using the color masks) 
               * -> pixels at [0...15] and [32..47]
               */ 
              pand_r2r(mm5,mm1); 
              pand_r2r(mm6,mm3); 
              pand_r2r(mm7,mm4); 
              
              /* pack color components again */ 
              por_r2r(mm3,mm1); 
              por_r2r(mm4,mm1); 		  
              
              /* keep mm1 as intermediate result,
                 now the same again, but using mm2 instead of mm1 */
              sPtr += sizeof(mmx_t);
              
              // read 8 bytes (2 pixels) from source memory -> mm2 
              movq_m2r(*(mmx_t*)sPtr, mm2);

              // subtract gamma value using saturation
              psubusb_r2r(mm0, mm2);

              /* and copy the value to mm3 and mm4 */ 
              movq_r2r(mm2, mm3); 
              movq_r2r(mm2, mm4); 

              /* shift the components R,G,B to their 16 bit positions */
              psrlq_i2r(16,mm2); 
              psrlq_i2r(13,mm3); 
              psrlq_i2r(11,mm4);

              /* mask out unwanted bits (using the color masks) */ 
              pand_r2r(mm5,mm2); 
              pand_r2r(mm6,mm3); 
              pand_r2r(mm7,mm4); 
              
              /* pack color components again */ 
              por_r2r(mm3,mm2); 
              por_r2r(mm4,mm2); 		  
              
              psllq_i2r(16,mm2);

              /* now we have got the following: pixels Q,R,S,T
                 mm1: 0x0000RRRR0000QQQQ
                 mm2: 0xTTTT0000SSSS0000
                 and we want:
                 mm1: 0xTTTTSSSSRRRRQQQQ ...
              */
             
              movq_m2r(pixel_mask, mm4);  /* -> mm4: 0x00000000FFFFFFFF */

              movq_r2r(mm1,mm3);  /* -> mm3: 0x0000RRRR0000QQQQ */
              psrlq_i2r(16,mm3);  /* -> mm3: 0x00000000RRRR0000 */
              por_r2r(mm3, mm1);  /* -> mm1: 0x0000RRRRRRRRQQQQ */
              pand_r2r(mm4, mm1); /* -> mm1: 0x00000000RRRRQQQQ */

              movq_r2r(mm2, mm3); /* -> mm3: 0xTTTT0000SSSS0000 */
              psllq_i2r(16,mm3);  /* -> mm3: 0x0000SSSS00000000 */
              por_r2r(mm3, mm2);  /* -> mm2: 0xTTTTSSSSSSSS0000 */
              pandn_r2r(mm2, mm4);/* -> mm4: 0xTTTTSSSS00000000 */

              por_r2r(mm4,mm1);   /* -> mm1: 0xTTTTSSSSRRRRQQQQ */

              // finally write back the results
              movq_r2m(mm1,*(mmx_t*)dPtr);
              
              sPtr+=sizeof(mmx_t);
              dPtr+=sizeof(mmx_t);
            }
          
          sPtr += srcSkip;
          dPtr += dstSkip;
        }
    }
  
  // reset CPU back to FPU mode (end MMX mode)
  emms();
  
  dst->unlock();
  src->unlock();
}

/** Apply const gamma value (MMX implementation, 16 bit surfaces).

    Copies the rectangle srcRect of src to dst at dstPoint. Every pixel is
    unpacked into its colour components, g is added (g >= 0) or -g is
    subtracted (g < 0) with unsigned saturation, and the components are
    packed to 16 bit again. The unpack/pack steps are macros from
    blitter_macros.h.

    Both surfaces must use 2 bytes per pixel and the destination must be
    large enough to hold the rectangle (checked by assert only).

    Each row is processed as one single pixel (odd remainder), one pair of
    pixels (remainder of 2 or 3) and then groups of four pixels.

    @param dst      destination surface
    @param src      source surface
    @param g        signed gamma offset
    @param srcRect  area of src to blit
    @param dstPoint upper left corner of the target area in dst
 */
void gammablit_RGB565_RGB565(Surface* dst, const Surface* src, 
			    char g, 
			    const Rect& srcRect, const Point& dstPoint)
{
  assert(dst->pixelformat().bpp() == 2);
  assert(src->pixelformat().bpp() == 2);
  assert(dst->width() >= dstPoint.x + srcRect.width());
  assert(dst->height() >= dstPoint.y + srcRect.height());
  dst->lock();
  src->lock();
  
  /* NOTE THAT ALL POINTERS ARE UNSIGNED CHAR* !!! */
  
  // the sign of g selects add/sub, the MMX register holds the magnitude
  bool brighten = (g >= 0);
  if(g < 0)
    g = -g;
  
  unsigned char* sPtr = 
    (unsigned char*)src->pixels() 
    + srcRect.upperLeft().x * src->pixelformat().bpp() 
    + srcRect.upperLeft().y * src->pitch();
  
  unsigned char* dPtr = 
    (unsigned char*)dst->pixels()
    + dstPoint.x * dst->pixelformat().bpp()
    + dstPoint.y * dst->pitch();
  
  unsigned rMask = dst->pixelformat().rMask();
  unsigned gMask = dst->pixelformat().gMask();
  unsigned bMask = dst->pixelformat().bMask();
  
  PREPARE_64_COLOR_MASKS(rMask, gMask, bMask);
  
  unsigned y = 0;
  unsigned x = 0;
  // bytes to skip at the end of a row to reach the start of the next one
  unsigned dstSkip = dst->pitch() - srcRect.width() * dst->pixelformat().bpp();
  unsigned srcSkip = src->pitch() - srcRect.width() * src->pixelformat().bpp();
  // number of 8 byte (4 pixel) steps per row
  unsigned x_reload = srcRect.width()*src->pixelformat().bpp() / 8;
  /* misalign : 0,1,2 or 3 PIXELS.
     It has to be a pixel count (not a byte count), because the tests
     "% 2" and "/ 2" below pick the single pixel and the pixel pair. */
  unsigned x_misalign = srcRect.width() % 4;  

  mmx_t gamma_val;
  
  /* load gamma value into 4 bytes 
     we process 1 color component of 4 pixels at a step */
  gamma_val.ub[0]= g;
  gamma_val.ub[1]= 0;
  gamma_val.ub[2]= g;
  gamma_val.ub[3]= 0;
  gamma_val.ub[4]= g;
  gamma_val.ub[5]= 0;
  gamma_val.ub[6]= g;
  gamma_val.ub[7]= 0;
  
  movq_m2r(gamma_val, mm0);
  
  // scratch memory for 16 bit stores
  mmx_t h;
  
  /* hmm ... should we care for (double) word alignment 
     when accessing memory ? 
     ... will need to investigate  */

  if(brighten)
    {
      y = srcRect.height();
      while(y--)
        {
          // one single leftover pixel (remainder 1 or 3)
          if(x_misalign % 2)
            {
              /* read 4 bytes from source memory -> mm2;
                 only the first pixel of them is written back */
              movd_m2r(*(mmx_t*)sPtr, mm2);
              
              // unpack mm2 to mm2,mm3,mm4
              UNPACK_32_RGB565_RGB888;
                
              // add with unsigned saturation
              paddusb_r2r(mm0,mm2);  
              paddusb_r2r(mm0,mm3);  
              paddusb_r2r(mm0,mm4);  

              // pack components mm2,mm3,mm4 to mm2
              PACK_32_RGB888_RGB565;
              
              /* write back the result, only using its lower 16 bits */
              movd_r2m(mm2,h);
              *(unsigned short*)dPtr = h.uw[0];
              sPtr += 2;
              dPtr += 2;
            }
                  
          // a pair of leftover pixels (remainder 2 or 3)
          if(x_misalign / 2)
            {
              // read 4 bytes (2 pixel) from source memory -> mm2 
              movd_m2r(*(mmx_t*)sPtr, mm2);
              
              // unpack mm2 to mm2,mm3,mm4
              UNPACK_32_RGB565_RGB888;
              
              // add with unsigned saturation
              paddusb_r2r(mm0,mm2);  
              paddusb_r2r(mm0,mm3);  
              paddusb_r2r(mm0,mm4);  
              
              // pack components mm2,mm3,mm4 to mm2
              PACK_32_RGB888_RGB565;
              
              // finally write back the results (dword: 32 bits)
              movd_r2m(mm2,*(mmx_t*)dPtr);
              
              sPtr+=4;
              dPtr+=4;
            }
          
          x = x_reload;
          while(x--)
            {
              // read 8 bytes (4 pixels) from source memory -> mm2 
              movq_m2r(*(mmx_t*)sPtr, mm2);
              
              // unpack mm2 to mm2,mm3,mm4
              UNPACK_64_RGB565_RGB888;
              
              // add 8 bytes with unsigned saturation
              paddusb_r2r(mm0,mm2);  
              paddusb_r2r(mm0,mm3);  
              paddusb_r2r(mm0,mm4);  
              
              // pack components mm2,mm3,mm4 to mm2
              PACK_64_RGB888_RGB565;
              
              // finally write back the results
              movq_r2m(mm2,*(mmx_t*)dPtr);
              
              sPtr+=sizeof(mmx_t);
              dPtr+=sizeof(mmx_t);
            }
          
          sPtr += srcSkip;
          dPtr += dstSkip;
        }
    }
  else
    {
      /* same loops again with subtraction instead of addition, so that
         brighten does not have to be tested for every pixel */ 
      y = srcRect.height();
      while(y--)
        {
          // one single leftover pixel (remainder 1 or 3)
          if(x_misalign % 2)
            {
              /* read 4 bytes from source memory -> mm2;
                 only the first pixel of them is written back */
              movd_m2r(*(mmx_t*)sPtr, mm2);
              
              // unpack mm2 to mm2,mm3,mm4
              UNPACK_32_RGB565_RGB888;
              
              // sub with unsigned saturation
              psubusb_r2r(mm0,mm2);  
              psubusb_r2r(mm0,mm3);  
              psubusb_r2r(mm0,mm4);  
              
              // pack components mm2,mm3,mm4 to mm2
              PACK_32_RGB888_RGB565;
              
              /* write back the result, only using its lower 16 bits */
              movd_r2m(mm2,h);
              *(unsigned short*)dPtr = h.uw[0];
              sPtr += 2;
              dPtr += 2;
            }
          
          // a pair of leftover pixels (remainder 2 or 3)
          if(x_misalign / 2)
            {
              // read 4 bytes (2 pixel) from source memory -> mm2 
              movd_m2r(*(mmx_t*)sPtr, mm2);
              
              // unpack mm2 to mm2,mm3,mm4
              UNPACK_32_RGB565_RGB888;
              
              // sub with unsigned saturation
              psubusb_r2r(mm0,mm2);  
              psubusb_r2r(mm0,mm3);  
              psubusb_r2r(mm0,mm4);  
              
              // pack components mm2,mm3,mm4 to mm2
              PACK_32_RGB888_RGB565;
              
              // finally write back the results (32 bits)
              movd_r2m(mm2,*(mmx_t*)dPtr);
              
              sPtr+=4;
              dPtr+=4;
            }
          
          x = x_reload;
          while(x--)
            {
              // read 8 bytes (4 pixels) from source memory -> mm2 
              movq_m2r(*(mmx_t*)sPtr, mm2);
              
              // unpack mm2 to mm2,mm3,mm4
              UNPACK_64_RGB565_RGB888;
              
              // sub 8 bytes with unsigned saturation
              psubusb_r2r(mm0,mm2);  
              psubusb_r2r(mm0,mm3);  
              psubusb_r2r(mm0,mm4);  
              
              // pack components mm2,mm3,mm4 to mm2
              PACK_64_RGB888_RGB565;
              
              // finally write back the results
              movq_r2m(mm2,*(mmx_t*)dPtr);
              
              sPtr+=sizeof(mmx_t);
              dPtr+=sizeof(mmx_t);
            }
          
          sPtr += srcSkip;
          dPtr += dstSkip;
        }
    }
  
  // reset CPU back to FPU mode (end MMX mode)
  emms();
  
  dst->unlock();
  src->unlock();
}

/* Cursor over the gamma map, which is addressed as rows of 256 values.
 *
 * GAMMA_PTR_INIT declares the cursor variables in the current scope:
 *   gPtrStartX  first value of the current row,
 *   gPtrStart   position (offset_x) in the current row where a scanline starts,
 *   gPtrEndY    limit for gPtrStart: offset_x in the row following row 255,
 *   gPtr        the cursor itself,
 *   flag        0 / 1 toggle used by GAMMA_PTR_SKIP.
 */
#define GAMMA_PTR_INIT(gamma)  \
  char* gPtrStartX = (gamma)->values + ((gamma)->offset_y << 8); \
  char* gPtrStart = gPtrStartX + (gamma)->offset_x; \
  char* gPtrEndY = (gamma)->values + (gamma)->offset_x + 256*256; \
  char* gPtr = gPtrStart; \
  char flag = 0;

/* Advance the cursor by one value; wrap to the first value of the current
 * row when the end of the row is passed.
 * NOTE: expands to two statements.
 */
#define GAMMA_PTR_STEP()          \
  ++gPtr;                         \
  if(gPtr - gPtrStartX >= 256)    \
    gPtr = gPtrStartX;

/* End of scanline. On every second call the cursor moves on to the next
 * row (wrapping to the first row when gPtrEndY is reached), on the calls in
 * between it is just reset to the start position of the current row.
 * NOTE: refers to a variable named "gamma" in the scope of the caller.
 */
#define GAMMA_PTR_SKIP()                        \
  if(flag) {                                    \
    flag = 0;                                   \
    gPtrStart += 256;                           \
    gPtrStartX += 256;                          \
                                                \
    if(gPtrStart >= gPtrEndY)  {                \
      gPtrStartX = gamma->values;               \
      gPtrStart = gPtrStartX + gamma->offset_x; \
    }                                           \
                                                \
    gPtr = gPtrStart;                           \
  }                                             \
  else {                                        \
    flag = 1;                                   \
    gPtr = gPtrStart;                           \
  }

/** Apply per pixel gamma value (MMX implementation).

    Copies the rectangle srcRect of src to dst at dstPoint. The gamma values
    are read from the map of the GammaFunction via the GAMMA_PTR_* macros:
    one value is fetched for the single leading pixel of a row with odd
    width and one for every following step of two pixels. A positive value
    is added to bytes 1..3 of the pixels, the magnitude of a negative one is
    subtracted (both with unsigned saturation). Byte 0 of every pixel is
    copied unchanged (the blit just copies the alpha value of the source).

    Both surfaces must use 4 bytes per pixel and the destination must be
    large enough to hold the rectangle (checked by assert only).

    @param dst      destination surface
    @param src      source surface
    @param gamma    gamma map (values, offset_x, offset_y are read)
    @param srcRect  area of src to blit
    @param dstPoint upper left corner of the target area in dst
 */
void gammablit_RGBA8888_RGBA8888(Surface* dst, const Surface* src, 
				 GammaFunction* gamma, 
				 const Rect& srcRect, const Point& dstPoint)
{
  assert(dst->pixelformat().bpp() == 4);
  assert(src->pixelformat().bpp() == 4);
  assert(dst->width() >= dstPoint.x + srcRect.width());
  assert(dst->height() >= dstPoint.y + srcRect.height());
  dst->lock();
  src->lock();
  
  /* NOTE THAT ALL POINTERS ARE UNSIGNED CHAR* !!! */
  
  unsigned char* sPtr = 
    (unsigned char*)src->pixels() 
    + srcRect.upperLeft().x * src->pixelformat().bpp() 
    + srcRect.upperLeft().y * src->pitch();
  
  unsigned char* dPtr = 
    (unsigned char*)dst->pixels()
    + dstPoint.x * dst->pixelformat().bpp()
    + dstPoint.y * dst->pitch();
 
  // declares gPtr and friends (cursor into the gamma map)
  GAMMA_PTR_INIT(gamma);

  unsigned y = 0;
  unsigned x = 0;
  // bytes to skip at the end of a row to reach the start of the next one
  unsigned dstSkip = dst->pitch() - srcRect.width() * dst->pixelformat().bpp();
  unsigned srcSkip = src->pitch() - srcRect.width() * src->pixelformat().bpp();
  // number of 8 byte (2 pixel) steps per row ...
  unsigned x_reload = srcRect.width()*src->pixelformat().bpp() / 8;
  // ... and the remaining bytes (0, or 4 for an odd width)
  unsigned x_misalign = srcRect.width()*src->pixelformat().bpp() % 8;

  mmx_t gamma_val;
  char g = 0;
  // only bytes 1 and 5 are ever set below, all others have to stay 0
  gamma_val.q = 0LL;
  
  /* hmm ... should we care for (double) word alignment when accessing memory ? 
     ... will need to investigate  */
  
  y = srcRect.height();
  while(y--)
    {
      // odd width: handle one single pixel first
      if(x_misalign)
        {
          g = *gPtr;
          GAMMA_PTR_STEP();

          // load the magnitude of the gamma value into two bytes
          if(g > 0)
            { 
              gamma_val.ub[1]=g;
              gamma_val.ub[5]=g;
            }
          else
            {
              gamma_val.ub[1]= -g;
              gamma_val.ub[5]= -g;  
            }
          // move to mm0
          movq_m2r(gamma_val,mm0);	      
          // then shift left and or until the value
          // is in each byte 1,2,3,5,6,7
          movq_r2r(mm0,mm1); // 1,5
          pslld_i2r(8,mm1);  // 2,6
          por_r2r(mm1,mm0);  // 1,2,5,6
          pslld_i2r(8,mm1);  // 3,7
          por_r2r(mm1,mm0);  // 1,2,3,5,6,7

          // read 4 bytes (1 pixel) from source memory -> mm1 
          movd_m2r(*(mmx_t*)sPtr, mm1);

          if(g < 0)
            { 
              // subtract with unsigned saturation
              psubusb_r2r(mm0,mm1);
            }
          else
            {
              // add with unsigned saturation
              paddusb_r2r(mm0,mm1);  
            }

          /* write back the lower 32 bits of the result.
             The operand has to be the destination memory itself
             (dereferenced), like in the 64 bit store below. */
          movd_r2m(mm1,*(mmx_t*)dPtr);
          sPtr += 4;
          dPtr += 4;
        }
      
      x = x_reload;
      while(x--)
        {
          // one gamma value for the two pixels of this step
          g = *gPtr;
          GAMMA_PTR_STEP();

          // load the magnitude of the gamma value into two bytes
          if(g > 0)
            { 
              gamma_val.ub[1]=g;
              gamma_val.ub[5]=g;
            }
          else
            {
              gamma_val.ub[1]= -g;
              gamma_val.ub[5]= -g;  
            }

          // move to mm0
          movq_m2r(gamma_val,mm0);	      
          // then shift left and or until the value
          // is in each byte 1,2,3,5,6,7
          movq_r2r(mm0,mm1); // 1,5
          pslld_i2r(8,mm1);  // 2,6
          por_r2r(mm1,mm0);  // 1,2,5,6
          pslld_i2r(8,mm1);  // 3,7
          por_r2r(mm1,mm0);  // 1,2,3,5,6,7

          // read 8 bytes (2 pixels) from source memory -> mm1 
          movq_m2r(*(mmx_t*)sPtr, mm1);

          if(g < 0)
            { 
              // subtract 8 bytes with unsigned saturation
              psubusb_r2r(mm0,mm1);
            }
          else
            {
              // add 8 bytes with unsigned saturation
              paddusb_r2r(mm0,mm1);  
            }
                  
          // finally write back the results
          movq_r2m(mm1,*(mmx_t*)dPtr);
          
          sPtr+=sizeof(mmx_t);
          dPtr+=sizeof(mmx_t);
        }
      
      // end of scanline: reposition the gamma cursor
      GAMMA_PTR_SKIP();

      sPtr += srcSkip;
      dPtr += dstSkip;
    }
    
  
  // reset CPU back to FPU mode (end MMX mode)
  emms();
  
  dst->unlock();
  src->unlock();
}

/** Convert RGBA8888 to RGB565 and apply per pixel gamma value
    (MMX implementation).

    Copies the rectangle srcRect of src to dst at dstPoint. The gamma values
    are read from the map of the GammaFunction via the GAMMA_PTR_* macros:
    one value is fetched for the (width % 4) leading pixels of a row and one
    for every following group of four pixels. A positive value is added to
    the colour components, the magnitude of a negative one is subtracted
    (both with unsigned saturation); the result is reduced to 16 bit using
    the colour masks of the destination pixel format.
    Byte 0 of the source pixels does not end up in the destination.

    The source must use 4 and the destination 2 bytes per pixel, and the
    destination must be large enough (checked by assert only).

    @param dst      destination surface (16 bit)
    @param src      source surface (32 bit)
    @param gamma    gamma map (values, offset_x, offset_y are read)
    @param srcRect  area of src to blit
    @param dstPoint upper left corner of the target area in dst
 */
void gammablit_RGBA8888_RGB565(Surface* dst, const Surface* src, 
			      GammaFunction* gamma, 
			      const Rect& srcRect, const Point& dstPoint)
{
  assert(dst->pixelformat().bpp() == 2);
  assert(src->pixelformat().bpp() == 4);
  assert(dst->width() >= dstPoint.x + srcRect.width());
  assert(dst->height() >= dstPoint.y + srcRect.height());
  dst->lock();
  src->lock();
  
  /* NOTE THAT ALL POINTERS ARE UNSIGNED CHAR* !!! */
  
  unsigned char* sPtr = 
    (unsigned char*)src->pixels() 
    + srcRect.upperLeft().x * src->pixelformat().bpp() 
    + srcRect.upperLeft().y * src->pitch();
  
  unsigned char* dPtr = 
    (unsigned char*)dst->pixels()
    + dstPoint.x * dst->pixelformat().bpp()
    + dstPoint.y * dst->pitch();

  // declares gPtr and friends (cursor into the gamma map)
  GAMMA_PTR_INIT(gamma);

  unsigned rMask = dst->pixelformat().rMask();
  unsigned gMask = dst->pixelformat().gMask();
  unsigned bMask = dst->pixelformat().bMask();
  
  /* color masks -> mm5 (R), mm6 (G), mm7 (B), each in the words 0 and 2
     (NOTE(review): taken from the comment at the other use of this macro,
     its definition lives in blitter_macros.h) */
  PREPARE_64interleave_COLOR_MASKS(rMask, gMask, bMask);

  unsigned y = 0;
  unsigned x = 0;
  // bytes to skip at the end of a row to reach the start of the next one
  unsigned dstSkip = dst->pitch() - srcRect.width() * dst->pixelformat().bpp();
  unsigned srcSkip = src->pitch() - srcRect.width() * src->pixelformat().bpp();
  /* The main loop converts 4 pixels per pass (16 source bytes -> 8
     destination bytes), so it has to run width / 4 times. The up to 3
     remaining pixels of a row are converted one by one. */
  unsigned x_reload = srcRect.width() / 4;
  unsigned x_misalign = srcRect.width() % 4;
  
  mmx_t gamma_val;
  char g = 0;
  /* the gamma value goes into bytes 0 and 4 only:
     we process 1 color component of 2 pixels at a step */
  gamma_val.ub[0]= 0;
  gamma_val.ub[1]= 0;
  gamma_val.ub[2]= 0;
  gamma_val.ub[3]= 0;
  gamma_val.ub[4]= 0;
  gamma_val.ub[5]= 0;
  gamma_val.ub[6]= 0;
  gamma_val.ub[7]= 0;
  
  // scratch memory for 16 bit stores
  mmx_t h;
  
  /* hmm ... should we care for (double) word alignment when accessing memory ? 
     ... will need to investigate  */

  y = srcRect.height();
  while(y--)
    {
      // the 0..3 pixels which do not fill a group of four share one gamma value
      if(x_misalign)
        {
          g = *gPtr;
          GAMMA_PTR_STEP();

          // store the magnitude, the sign is evaluated below
          if(g > 0)
            {
              gamma_val.ub[0]= g;
              gamma_val.ub[4]= g;
            }
          else
            {
              gamma_val.ub[0]= -g;
              gamma_val.ub[4]= -g;
            }

          x = x_misalign;
          while(x--)
            {
              // read 4 bytes (1 pixel) from source memory -> mm2 
              movd_m2r(*(mmx_t*)sPtr, mm2);
              
              /* and copy the value to mm3 and mm4 */ 
              movq_r2r(mm2, mm3); 
              movq_r2r(mm2, mm4); 
              
              /* move R from byte[3] to byte[0] */
              psrld_i2r(24,mm2); 
              /* move G from byte[2] to byte[0] */
              psrld_i2r(16,mm3); 
              /* move B from byte[1] to byte[0] */		  
              psrld_i2r(8,mm4);

              // (re)load the gamma value right before it is used
              movq_m2r(gamma_val, mm0);	      
              if(g > 0)
                {
                  // add with unsigned saturation
                  paddusb_r2r(mm0,mm2);  
                  paddusb_r2r(mm0,mm3);  
                  paddusb_r2r(mm0,mm4);  
                }
              else
                {
                  // subtract with unsigned saturation
                  psubusb_r2r(mm0,mm2);  
                  psubusb_r2r(mm0,mm3);  
                  psubusb_r2r(mm0,mm4); 
                }

              // pack components mm2,mm3,mm4 to mm2
              PACK_32_RGB888_RGB565;
              
              /* write back the result, only using its lower 16 bits */
              movd_r2m(mm2,h);
              *(unsigned short*)dPtr = h.uw[0];
              sPtr += 4;
              dPtr += 2;
            }
        }
      
      x = x_reload;
      while(x--)
        {
          // ... we apply the same gamma value to 4 adjacent pixels
          g = *gPtr;
          GAMMA_PTR_STEP();

          // store the magnitude, the sign is evaluated below
          if(g > 0)
            {
              gamma_val.ub[0]= g;
              gamma_val.ub[4]= g;
            }
          else
            {
              gamma_val.ub[0]= -g;
              gamma_val.ub[4]= -g;
            }
          movq_m2r(gamma_val, mm0);

          /* ---- first pair of pixels (Q,R) -> mm1 ---- */

          // read 8 bytes (2 pixels) from source memory -> mm1 
          movq_m2r(*(mmx_t*)sPtr, mm1);
          /* and copy the value to mm3 and mm4 */ 
          movq_r2r(mm1, mm3); 
          movq_r2r(mm1, mm4); 
          
          /* move R from byte[3|7] to byte[0|4] */
          psrld_i2r(24,mm1); 
          /* move G from byte[2|6] to byte[0|4] */
          psrld_i2r(16,mm3); 
          /* move B from byte[1|5] to byte[0|4] */		  
          psrld_i2r(8,mm4);

          if(g > 0)
            {
              // add with unsigned saturation
              paddusb_r2r(mm0,mm1);  
              paddusb_r2r(mm0,mm3);  
              paddusb_r2r(mm0,mm4);  
            }
          else
            {
              // subtract with unsigned saturation
              psubusb_r2r(mm0,mm1);  
              psubusb_r2r(mm0,mm3);  
              psubusb_r2r(mm0,mm4); 
            }

          /* shift values to their position in the 16 bit pixel */ 
          psrlq_i2r(3,mm4); 
          psllq_i2r(3,mm3); 
          psllq_i2r(8,mm1); 
          
          /* mask out unwanted bits (using the color masks) */ 
          pand_r2r(mm5,mm1); 
          pand_r2r(mm6,mm3); 
          pand_r2r(mm7,mm4); 
          
          /* pack color components again */ 
          por_r2r(mm3,mm1); 
          por_r2r(mm4,mm1); 		  
          
          /* keep mm1 as intermediate result,
             now the same again, but using mm2 instead of mm1 */
          sPtr += sizeof(mmx_t);

          /* ---- second pair of pixels (S,T) -> mm2 ---- */

          /* read 8 bytes (2 pixels) from source memory -> mm2 */
          movq_m2r(*(mmx_t*)sPtr, mm2);
          /* and copy the value to mm3 and mm4 */ 
          movq_r2r(mm2, mm3); 
          movq_r2r(mm2, mm4); 
          
          /* move R from byte[3|7] to byte[0|4] */
          psrld_i2r(24,mm2); 
          /* move G from byte[2|6] to byte[0|4] */
          psrld_i2r(16,mm3); 
          /* move B from byte[1|5] to byte[0|4] */		  
          psrld_i2r(8,mm4);

          /* the gamma value has to go to mm2 here;
             mm1 already holds the packed first pair */
          if(g > 0)
            {
              // add with unsigned saturation
              paddusb_r2r(mm0,mm2);  
              paddusb_r2r(mm0,mm3);  
              paddusb_r2r(mm0,mm4);  
            }
          else
            {
              // subtract with unsigned saturation
              psubusb_r2r(mm0,mm2);  
              psubusb_r2r(mm0,mm3);  
              psubusb_r2r(mm0,mm4); 
            }	  
          
          /* pack exactly like the first pair */
          psrlq_i2r(3,mm4); 
          psllq_i2r(3,mm3); 
          psllq_i2r(8,mm2); 

          pand_r2r(mm5,mm2); 
          pand_r2r(mm6,mm3); 
          pand_r2r(mm7,mm4); 

          por_r2r(mm3,mm2); 
          por_r2r(mm4,mm2); 		  
          
          /* now we have got the following: pixels Q,R,S,T
             mm1: 0x0000RRRR0000QQQQ
             mm2: 0x0000TTTT0000SSSS
             and we want:
             mm1: 0xTTTTSSSSRRRRQQQQ
             packssdw is no option here: it saturates signed, which
             destroys every 16 bit pixel >= 0x8000, so the words are
             moved by shifts.
          */
          movq_r2r(mm1, mm3);  /* -> mm3: 0x0000RRRR0000QQQQ */
          psrlq_i2r(16,mm3);   /* -> mm3: 0x00000000RRRR0000 */
          por_r2r(mm3, mm1);   /* -> mm1: 0x0000RRRRRRRRQQQQ */
          psllq_i2r(32,mm1);   /* -> mm1: 0xRRRRQQQQ00000000 */
          psrlq_i2r(32,mm1);   /* -> mm1: 0x00000000RRRRQQQQ */

          movq_r2r(mm2, mm3);  /* -> mm3: 0x0000TTTT0000SSSS */
          psrlq_i2r(16,mm3);   /* -> mm3: 0x00000000TTTT0000 */
          por_r2r(mm3, mm2);   /* -> mm2: 0x0000TTTTTTTTSSSS */
          psllq_i2r(32,mm2);   /* -> mm2: 0xTTTTSSSS00000000 */

          por_r2r(mm2, mm1);   /* -> mm1: 0xTTTTSSSSRRRRQQQQ */
          
          // finally write back the results
          movq_r2m(mm1,*(mmx_t*)dPtr);
          
          sPtr+=sizeof(mmx_t);
          dPtr+=sizeof(mmx_t);
        }

      // end of scanline: reposition the gamma cursor
      GAMMA_PTR_SKIP();
      sPtr += srcSkip;
      dPtr += dstSkip;
    }

  
  // reset CPU back to FPU mode (end MMX mode)
  emms();
  
  dst->unlock();
  src->unlock();
}

/** Apply per pixel gamma value.
    MMX blit of srcRect (taken from src) to dstPoint (in dst); both
    surfaces are asserted to use 2 bytes per pixel. A signed gamma value
    is read through gPtr once per processing step; its magnitude is
    added to (g >= 0) or subtracted from (g < 0) the unpacked colour
    components with unsigned saturation before the pixels are packed
    again and stored.

    Both surfaces are locked for the duration of the blit and unlocked
    before returning.

    @param dst      destination surface (asserted: bpp == 2 and large
                    enough to hold srcRect at dstPoint)
    @param src      source surface (asserted: bpp == 2)
    @param gamma    gamma function handed to GAMMA_PTR_INIT
    @param srcRect  area of src to read
    @param dstPoint upper left corner of the written area in dst
 */
void gammablit_RGB565_RGB565(Surface* dst, const Surface* src, 
			    GammaFunction* gamma, 
			    const Rect& srcRect, const Point& dstPoint)
{
  /* sanity checks: 16 bit surfaces only, and the destination has to be
     large enough to take srcRect at dstPoint */
  assert(dst->pixelformat().bpp() == 2);
  assert(src->pixelformat().bpp() == 2);
  assert(dst->width() >= dstPoint.x + srcRect.width());
  assert(dst->height() >= dstPoint.y + srcRect.height());
  dst->lock();
  src->lock();
  
  /* NOTE THAT ALL POINTERS ARE UNSIGNED CHAR* !!! */
  
  /* first source byte: upper left corner of srcRect */
  unsigned char* sPtr = 
    (unsigned char*)src->pixels() 
    + srcRect.upperLeft().x * src->pixelformat().bpp() 
    + srcRect.upperLeft().y * src->pitch();
  
  /* first destination byte: dstPoint */
  unsigned char* dPtr = 
    (unsigned char*)dst->pixels()
    + dstPoint.x * dst->pixelformat().bpp()
    + dstPoint.y * dst->pitch();
  
  /* colour masks are taken from the destination format */
  unsigned rMask = dst->pixelformat().rMask();
  unsigned gMask = dst->pixelformat().gMask();
  unsigned bMask = dst->pixelformat().bMask();
  
  /* presumably loads the masks into the MMX registers used by the
     PACK_* macros below -- defined elsewhere, TODO confirm */
  PREPARE_64_COLOR_MASKS(rMask, gMask, bMask);
  
  /* gPtr (read below) is not declared in this function; presumably it
     is introduced by this macro -- TODO confirm */
  GAMMA_PTR_INIT(gamma);

  unsigned y = 0;
  unsigned x = 0;
  /* bytes between the end of one processed row and the start of the next */
  unsigned dstSkip = dst->pitch() - srcRect.width() * dst->pixelformat().bpp();
  unsigned srcSkip = src->pitch() - srcRect.width() * src->pixelformat().bpp();
  /* number of complete 8 byte groups per row ... */
  unsigned x_reload = srcRect.width()*src->pixelformat().bpp() / 8;
  /* ... and the number of bytes of the row left over after those groups */
  unsigned x_misalign = srcRect.width()*src->pixelformat().bpp() % 8;
  
  mmx_t gamma_val;
  char g = 0;
  /* load gamma value into 4 bytes 
     we process 1 color component of 4 pixels at a step */
  gamma_val.q = 0LL;
  
  /* scratch value used to store only the lower 16 bits of a result */
  mmx_t h;
  
  /* hmm ... should we care for (double) word alignment 
     when accessing memory ? 
     ... will need to investigate  */
  
  /* one iteration per row of srcRect */
  y = srcRect.height();
  while(y--)
    {     	
      /* single leftover pixel.
	 NOTE(review): x_misalign is a byte count and bpp is asserted to
	 be 2 above, so x_misalign looks to be always even and this
	 branch never taken -- confirm whether (x_misalign / 2) % 2 was
	 intended */
      if(x_misalign % 2)
	{
	  /* fetch the gamma value for this step; how far the pointer
	     advances is defined by the macro, not visible here */
	  g = *gPtr;
	  GAMMA_PTR_STEP();

	  /* store the magnitude of g; the sign is handled further down
	     by choosing between saturated add and saturated subtract */
	  if(g > 0)
	    { 
	      // load gamma value as two double words
              gamma_val.ub[0]=g;
              gamma_val.ub[4]=g;
	    }
	  else
	    {
	      gamma_val.ub[0]= -g;
	      gamma_val.ub[4]= -g;  
	    }

	  movq_m2r(gamma_val,mm0);	      
	  // then shift left and or until the value
	  // is in each byte 0,2,4,6
	  movq_r2r(mm0,mm1);
	  pslld_i2r(16,mm0);
	  por_r2r(mm1,mm0);  

	  // read 4 bytes (2 pixel) from source memory -> mm2 
	  movd_m2r(*(mmx_t*)sPtr, mm2);
	  
	  // unpack mm2 to mm2,mm3,mm4
	  UNPACK_32_RGB565_RGB888;
	  
	  if(g >= 0)
	    {
	      // add 8 bytes with unsigned saturation
	      paddusb_r2r(mm0,mm2);  
	      paddusb_r2r(mm0,mm3);  
	      paddusb_r2r(mm0,mm4);  
	    }
	  else
	    {
	      // sub 8 bytes with unsigned saturation
	      psubusb_r2r(mm0,mm2);  
	      psubusb_r2r(mm0,mm3);  
	      psubusb_r2r(mm0,mm4);  
	    }

	  // pack components mm2,mm3,mm4 to mm2
	  PACK_32_RGB888_RGB565;
	  
	  /* finally write back the results (32bits),
	     only using the lower 16bits of result
	     ... this is somehow dumb, but only executed once
	     per scanline, so I do not care */
	  movd_r2m(mm2,h);
	  *(unsigned short*)dPtr = h.uw[0];
	  sPtr += 2;
	  dPtr += 2;
	}
      
     
      // now for the two remaining misaligned pixels
      /* NOTE(review): this test is true for 2, 4 and 6 leftover bytes,
	 but the branch always reads and writes exactly 4 bytes
	 (2 pixels) -- confirm the behaviour for odd row widths */
      if(x_misalign / 2)
      {
	  g = *gPtr;
	  GAMMA_PTR_STEP();

	  /* magnitude of g into the low byte of both double words */
	  if(g > 0)
	    { 
	      // load gamma value as two double words
              gamma_val.d[0]=g;
              gamma_val.d[1]=g;
	    }
	  else
	    {
	      gamma_val.d[0]= -g;
	      gamma_val.d[1]= -g;  
	    }

	  movq_m2r(gamma_val,mm0);	      
	  // then shift left and or until the value
	  // is in each byte 0,2,4,6
	  movq_r2r(mm0,mm1);
	  pslld_i2r(16,mm0);
	  por_r2r(mm1,mm0);  

	  // read 4 bytes (2 pixel) from source memory -> mm2 
	  movd_m2r(*(mmx_t*)sPtr, mm2);
	  
	  // unpack mm2 to mm2,mm3,mm4
	  UNPACK_32_RGB565_RGB888;

	  if(g >= 0)
	    {
	      // add 8 bytes with unsigned saturation
	      paddusb_r2r(mm0,mm2);  
	      paddusb_r2r(mm0,mm3);  
	      paddusb_r2r(mm0,mm4);  
	    }
	  else
	    {
	      // sub 8 bytes with unsigned saturation
	      psubusb_r2r(mm0,mm2);  
	      psubusb_r2r(mm0,mm3);  
	      psubusb_r2r(mm0,mm4);  
	    }	   
	  
	  // pack components mm2,mm3,mm4 to mm2
	  PACK_32_RGB888_RGB565;
	  
	  // finally write back the results (32 bits)
	  movd_r2m(mm2,*(mmx_t*)dPtr);
	  
	  sPtr+=4;
	  dPtr+=4;
	}
      
      /* main part of the row: 8 bytes (4 pixels) per step */
      x = x_reload;
      while(x--)
	{
	  /* a single gamma value is read per step and applied to all
	     pixels handled in that step */
	  g = *gPtr;
	  GAMMA_PTR_STEP();

	  if(g > 0)
	    { 
	      // load gamma value as two double words
              gamma_val.d[0]=g;
              gamma_val.d[1]=g;
	    }
	  else
	    {
	      gamma_val.d[0]= -g;
	      gamma_val.d[1]= -g;  
	    }

	  movq_m2r(gamma_val,mm0);	      
	  // then shift left and or until the value
	  // is in each byte 0,2,4,6
	  movq_r2r(mm0,mm1);
	  pslld_i2r(16,mm0);
	  por_r2r(mm1,mm0);  
	  
	  // read 8 bytes (4 pixels) from source memory -> mm2 
	  // and copy the value to mm3 and mm4
	  movq_m2r(*(mmx_t*)sPtr, mm2);
	  
	  UNPACK_64_RGB565_RGB888;
	  
	  if(g >= 0)
	    {
	      // add 8 bytes with unsigned saturation
	      paddusb_r2r(mm0,mm2);  
	      paddusb_r2r(mm0,mm3);  
	      paddusb_r2r(mm0,mm4);  
	    }
	  else
	    {
	      // sub 8 bytes with unsigned saturation
	      psubusb_r2r(mm0,mm2);  
	      psubusb_r2r(mm0,mm3);  
	      psubusb_r2r(mm0,mm4);  
	    }
	  
	  PACK_64_RGB888_RGB565;
	  
	  
	  // finally write back the results
	  movq_r2m(mm2,*(mmx_t*)dPtr);
	  
	  sPtr+=sizeof(mmx_t);
	  dPtr+=sizeof(mmx_t);
	}
      /* presumably moves gPtr on to the next row of gamma values --
	 macro defined elsewhere, TODO confirm */
      GAMMA_PTR_SKIP();

      /* jump over the part of the pitch that lies outside the blit area */
      sPtr += srcSkip;
      dPtr += dstSkip;
    }
    
  
  // reset CPU back to FPU mode (end MMX mode)
  emms();
  
  dst->unlock();
  src->unlock();
}

#endif // !X86_ASSEMBLER


/* --------------------------------------------------------
 * general purpose methods ...
 * --------------------------------------------------------
 */
/** Apply const gamma value (plain C++ version, no MMX needed).
    Every pixel of srcRect is read from src through a Painter, g is
    added to its r, g and b components (each result clamped to 0..255)
    and the pixel is written to dst, starting at dstPoint.
 */
void gammablit_C(Surface* dst, const Surface* src, 
	       char g, 
	       const Rect& srcRect, const Point& dstPoint)
{
  Painter* reader = new Painter((Surface*)src);
  Painter* writer = new Painter(dst);

  Point to, from;
  Color pixel;
  int level;

  to.y = dstPoint.y;
  from.y = srcRect.upperLeft().y;
  while(from.y < srcRect.lowerRight().y)
    {
      // each row starts again at the left edge of both areas
      to.x = dstPoint.x;
      from.x = srcRect.upperLeft().x;
      while(from.x < srcRect.lowerRight().x)
	{
	  pixel = reader->getPixel(from);

	  // red
	  level = (int)pixel.r + (int)g;
	  if(level < 0) level = 0;
	  else if(level > 255) level = 255;
	  pixel.r = level;

	  // green
	  level = (int)pixel.g + (int)g;
	  if(level < 0) level = 0;
	  else if(level > 255) level = 255;
	  pixel.g = level;

	  // blue
	  level = (int)pixel.b + (int)g;
	  if(level < 0) level = 0;
	  else if(level > 255) level = 255;
	  pixel.b = level;

	  writer->setPixel(to, pixel);

	  to.x++;
	  from.x++;
	}
      to.y++;
      from.y++;
    }

  delete reader;
  delete writer;
}

/** Apply per pixel gamma value (plain C++ version, no MMX needed).
    For every pixel of srcRect the gamma value is looked up in
    gamma->values, which is indexed as a table with 256 entries per row:
    column (gamma->offset_x + x) and row (gamma->offset_y + y), both
    wrapped to 0..255. The value is added to the r, g and b components
    (each result clamped to 0..255) and the pixel is written to dst,
    starting at dstPoint.

    @param dst      destination surface
    @param src      source surface
    @param gamma    gamma table and the offsets applied to it
    @param srcRect  area of src to read
    @param dstPoint upper left corner of the written area in dst
 */
void gammablit_C(Surface* dst, const Surface* src, 
		GammaFunction* gamma, 
		const Rect& srcRect, const Point& dstPoint)
{
  Painter* source = new Painter((Surface*)src);
  Painter* dest = new Painter(dst);

  Point dstP, srcP;
  Color c;
  int h;
  char g;
  dstP.y = dstPoint.y;
  for(srcP.y=srcRect.upperLeft().y; srcP.y < srcRect.lowerRight().y; srcP.y++)
    {
      /* The row part of the table index does not change within a row.
	 Wrapping is done with "& 255" instead of "% 256": both agree for
	 non-negative sums, but the remainder of a negative sum is
	 negative and would index in front of the table. */
      const int rowOffset = ((gamma->offset_y + srcP.y) & 255) * 256;

      dstP.x = dstPoint.x;
      for(srcP.x=srcRect.upperLeft().x; srcP.x < srcRect.lowerRight().x; srcP.x++)
	{
	  c = source->getPixel(srcP);
	  g = gamma->values[((gamma->offset_x + srcP.x) & 255) + rowOffset];

	  // add the gamma value to each colour component, clamped to 0..255
	  h = (int)c.r + (int)g;
	  if(h > 255) h = 255;
	  if(h < 0) h = 0;
	  c.r = h;
	  h = (int)c.g + (int)g;
	  if(h > 255) h = 255;
	  if(h < 0) h = 0;
	  c.g = h;
	  h = (int)c.b + (int)g;
	  if(h > 255) h = 255;
	  if(h < 0) h = 0;
	  c.b = h;
	  
	  dest->setPixel(dstP, c);

	  dstP.x++;
	}
      dstP.y++;
    }

  delete source;
  delete dest;
}

/* -------------------------------------------------------- */

/** Apply const gamma value while blitting srcRect of src to dstPoint
    of dst.
    With X86_ASSEMBLER defined, the format pairs RGBA8888 -> RGBA8888,
    RGBA8888 -> RGB565 and RGB565 -> RGB565 are handed to the specialised
    blitters; every other pair of formats is handled by gammablit_C.
    Without X86_ASSEMBLER gammablit_C is always used.

    @param dst      destination surface
    @param src      source surface
    @param g        gamma value passed on to the selected blitter
    @param srcRect  area of src to read
    @param dstPoint upper left corner of the written area in dst
 */
void gammablit(Surface* dst, const Surface* src, 
	       char g, 
	       const Rect& srcRect, const Point& dstPoint)
{
#ifdef X86_ASSEMBLER
  int dstFormat = dst->pixelformat()();
  int srcFormat = src->pixelformat()();
  
  if(srcFormat == Pixelformat::RGBA8888 && dstFormat == Pixelformat::RGBA8888)
    gammablit_RGBA8888_RGBA8888(dst,src,g,srcRect,dstPoint);
  else if(srcFormat == Pixelformat::RGBA8888 && dstFormat == Pixelformat::RGB565)
    gammablit_RGBA8888_RGB565(dst,src,g,srcRect,dstPoint);
  else if(srcFormat == Pixelformat::RGB565 && dstFormat == Pixelformat::RGB565)
    gammablit_RGB565_RGB565(dst,src,g,srcRect,dstPoint);
  else
    {
      /* no specialised blitter for this pair of formats; this also
	 covers an RGBA8888 source with any other destination format,
	 which used to be skipped without blitting anything */
      gammablit_C(dst,src,g,srcRect,dstPoint);
    }
#else
  gammablit_C(dst,src,g,srcRect,dstPoint);
#endif
}


/** Apply per pixel gamma value while blitting srcRect of src to
    dstPoint of dst.
    With X86_ASSEMBLER defined, the format pairs RGBA8888 -> RGBA8888,
    RGBA8888 -> RGB565 and RGB565 -> RGB565 are handed to the specialised
    blitters; every other pair of formats is handled by gammablit_C.
    Without X86_ASSEMBLER gammablit_C is always used.

    @param dst      destination surface
    @param src      source surface
    @param func     gamma function passed on to the selected blitter
    @param srcRect  area of src to read
    @param dstPoint upper left corner of the written area in dst
 */
void gammablit(Surface* dst, const Surface* src, 
		GammaFunction* func, 
		const Rect& srcRect, const Point& dstPoint)
{
#ifdef X86_ASSEMBLER
  int dstFormat = dst->pixelformat()();
  int srcFormat = src->pixelformat()();
  
  if(srcFormat == Pixelformat::RGBA8888 && dstFormat == Pixelformat::RGBA8888)
    gammablit_RGBA8888_RGBA8888(dst,src,func,srcRect,dstPoint);
  else if(srcFormat == Pixelformat::RGBA8888 && dstFormat == Pixelformat::RGB565)
    gammablit_RGBA8888_RGB565(dst,src,func,srcRect,dstPoint);
  else if(srcFormat == Pixelformat::RGB565 && dstFormat == Pixelformat::RGB565)
    gammablit_RGB565_RGB565(dst,src,func,srcRect,dstPoint);
  else
    {
      /* no specialised blitter for this pair of formats; this also
	 covers an RGBA8888 source with any other destination format,
	 which used to be skipped without blitting anything */
      gammablit_C(dst,src,func,srcRect,dstPoint);
    }
#else
  gammablit_C(dst,src,func,srcRect,dstPoint);
#endif
}


} // namespace uta
