/*
    libfame - Fast Assembly MPEG Encoder Library
    Copyright (C) 2000-2001 Vivien Chappelier

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/********************** MMX accelerated dequantisation *************************/

/* Four packed 16-bit ones.  The inline assembly in
   DEQUANTISE_INTRA_LOCAL_STEP refers to this object by symbol name as the
   memory operand of its "por" (force-odd) step.
   NOTE(review): presumably kept non-static so the symbol stays visible to
   the assembler - confirm before changing its linkage. */
const short dequantise_mmx_1[4] = { 1, 1, 1, 1 };

/*
 * Row helpers for dequantise_intra_global().
 *
 * `x` is a single digit 0..7 selecting one 16-byte row; it is pasted into the
 * byte offsets 0x<x>0 (first four words) and 0x<x>8 (last four words).
 * Asm operands: %0 = block, %1 = dqmatrix, %2 = cache, %3 = psmatrix.
 * Register roles: mm0/mm1 = row data, mm2/mm3 = per-word sign masks (reused
 * as scratch while prescaling), mm4/mm5 = scratch, mm6 = XOR accumulator.
 */

/* Load one row, multiply by dqmatrix, divide by 8 rounding toward zero, and
   fold the result into the XOR accumulator in mm6. */
#define DEQUANTISE_INTRA_GLOBAL_SCALE(x)                                      \
    "movq 0x" #x "0(%0), %%mm0\n"    /* words 0-3 of the row */               \
    "movq 0x" #x "8(%0), %%mm1\n"    /* words 4-7 of the row */               \
    "movq %%mm0, %%mm2\n"            /* copy for the sign mask */             \
    "movq %%mm1, %%mm3\n"            /* copy for the sign mask */             \
    "psraw $0x0f, %%mm2\n"           /* mm2 = 0xffff where word < 0, else 0 */\
    "psraw $0x0f, %%mm3\n"           /* mm3 = 0xffff where word < 0, else 0 */\
    "pmullw 0x" #x "0(%1), %%mm0\n"  /* low 16 bits of words 0-3 * Q */       \
    "pmullw 0x" #x "8(%1), %%mm1\n"  /* low 16 bits of words 4-7 * Q */       \
    "paddsw %%mm2, %%mm0\n"          /* negatives: -1 before the shift */     \
    "paddsw %%mm3, %%mm1\n"          /* negatives: -1 before the shift */     \
    "psraw $0x03, %%mm0\n"           /* arithmetic shift: divide by 8 */      \
    "psraw $0x03, %%mm1\n"           /* arithmetic shift: divide by 8 */      \
    "psubsw %%mm2, %%mm0\n"          /* negatives: +1 after the shift */      \
    "psubsw %%mm3, %%mm1\n"          /* negatives: +1 after the shift */      \
    "pxor %%mm0, %%mm6\n"            /* accumulate mismatch */                \
    "pxor %%mm1, %%mm6\n"            /* accumulate mismatch */

/* Multiply the row held in mm0/mm1 by psmatrix and store it to cache.  Each
   stored word is the high product word shifted left by 4, OR-ed with the top
   4 bits of the low product word after its own low 12 bits have been added
   to it with signed saturation. */
#define DEQUANTISE_INTRA_GLOBAL_PRESCALE(x)                                   \
    "movq %%mm0, %%mm4\n"            /* keep a copy for the low product */    \
    "movq %%mm1, %%mm5\n"            /* keep a copy for the low product */    \
    "pmulhw 0x" #x "0(%3), %%mm0\n"  /* high 16 bits of the product */        \
    "pmulhw 0x" #x "8(%3), %%mm1\n"  /* high 16 bits of the product */        \
    "pmullw 0x" #x "0(%3), %%mm4\n"  /* low 16 bits of the product */         \
    "pmullw 0x" #x "8(%3), %%mm5\n"  /* low 16 bits of the product */         \
    "movq %%mm4, %%mm2\n"            /* mm2 = low product word */             \
    "movq %%mm5, %%mm3\n"            /* mm3 = low product word */             \
    "psllw $0x04, %%mm2\n"           /* drop the top 4 bits ... */            \
    "psllw $0x04, %%mm3\n"           /* drop the top 4 bits ... */            \
    "psrlw $0x04, %%mm2\n"           /* ... leaving the low 12 bits */        \
    "psrlw $0x04, %%mm3\n"           /* ... leaving the low 12 bits */        \
    "paddsw %%mm2, %%mm4\n"          /* add the low 12 bits (saturating) */   \
    "paddsw %%mm3, %%mm5\n"          /* add the low 12 bits (saturating) */   \
    "psrlw $0x0c, %%mm4\n"           /* keep the most significant 4 bits */   \
    "psrlw $0x0c, %%mm5\n"           /* keep the most significant 4 bits */   \
    "psllw $0x04, %%mm0\n"           /* high word * 16 */                     \
    "psllw $0x04, %%mm1\n"           /* high word * 16 */                     \
    "por %%mm4, %%mm0\n"             /* merge in the 4 low-order bits */      \
    "por %%mm5, %%mm1\n"             /* merge in the 4 low-order bits */      \
    "movq %%mm0, 0x" #x "0(%2)\n"    /* store words 0-3 in cache */           \
    "movq %%mm1, 0x" #x "8(%2)\n"    /* store words 4-7 in cache */

/*
 * Intra dequantisation with a single, block-wide ("global") mismatch control.
 *
 * The assembly treats every buffer as 8 rows of 8 packed 16-bit words
 * (byte offsets 0x00..0x7f).
 *
 *   block    - in:  quantised coefficients
 *   cache    - out: dequantised, prescaled coefficients
 *   dqmatrix - in:  per-coefficient dequantisation multipliers
 *   psmatrix - in:  per-coefficient prescale multipliers
 *              (NOTE(review): "for iDCT" per the original comments - the
 *              consumer is not visible here)
 *
 * Each word is multiplied by its dqmatrix entry and divided by 8 with
 * rounding toward zero.  All results are XOR-ed together; if the XOR of the
 * four lanes has a clear bit 0 (i.e. the sum of the 64 values is even), bit 0
 * of the very last word is toggled.  Every row is then prescaled and stored.
 *
 * Uses mm0-mm7 and issues no emms; no MMX registers are listed as clobbers.
 */
static inline void dequantise_intra_global(dct_t *block,
                                           dct_t *cache,
                                           dct_t *dqmatrix,
                                           dct_t *psmatrix)
{
  __asm__ __volatile__ (
      "pxor %%mm7, %%mm7\n"          /* mm7 = 0, constant throughout */
      "pxor %%mm6, %%mm6\n"          /* mm6 = mismatch accumulator */

      /* rows 0-6: dequantise, then prescale and store straight away */
      DEQUANTISE_INTRA_GLOBAL_SCALE(0) DEQUANTISE_INTRA_GLOBAL_PRESCALE(0)
      DEQUANTISE_INTRA_GLOBAL_SCALE(1) DEQUANTISE_INTRA_GLOBAL_PRESCALE(1)
      DEQUANTISE_INTRA_GLOBAL_SCALE(2) DEQUANTISE_INTRA_GLOBAL_PRESCALE(2)
      DEQUANTISE_INTRA_GLOBAL_SCALE(3) DEQUANTISE_INTRA_GLOBAL_PRESCALE(3)
      DEQUANTISE_INTRA_GLOBAL_SCALE(4) DEQUANTISE_INTRA_GLOBAL_PRESCALE(4)
      DEQUANTISE_INTRA_GLOBAL_SCALE(5) DEQUANTISE_INTRA_GLOBAL_PRESCALE(5)
      DEQUANTISE_INTRA_GLOBAL_SCALE(6) DEQUANTISE_INTRA_GLOBAL_PRESCALE(6)

      /* row 7: the mismatch correction sits between the two stages because
         it patches the last word (top word of mm1) before prescaling */
      DEQUANTISE_INTRA_GLOBAL_SCALE(7)

      /* mismatch control: fold the four 16-bit lanes of mm6 into the top
         word, so that its bit 0 is the parity of all 64 values */
      "movq %%mm6, %%mm5\n"          /* copy the accumulator */
      "psllq $0x20, %%mm5\n"         /* low two words moved to the high half */
      "pxor %%mm6, %%mm5\n"          /* high half = (w3^w1, w2^w0) */
      "movq %%mm5, %%mm4\n"          /* copy the partial fold */
      "psllq $0x10, %%mm5\n"         /* shift up by one word */
      "pxor %%mm5, %%mm4\n"          /* top word = w0^w1^w2^w3 */
      "movq %%mm7, %%mm6\n"          /* mm6 = 0 */
      "pcmpeqw %%mm7, %%mm6\n"       /* mm6 = 0xffffffffffffffff */
      "psllq $0x3f, %%mm6\n"         /* mm6 = 0x8000000000000000 */
      "psrlq $0x0f, %%mm6\n"         /* mm6 = 0x0001000000000000 */
      "pxor %%mm6, %%mm1\n"          /* flip bit 0 of the last word */
      "pand %%mm6, %%mm4\n"          /* isolate the parity bit */
      "pxor %%mm4, %%mm1\n"          /* flip it back when the parity is odd */

      DEQUANTISE_INTRA_GLOBAL_PRESCALE(7)

      : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
      : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
      : "memory");
}

#undef DEQUANTISE_INTRA_GLOBAL_SCALE
#undef DEQUANTISE_INTRA_GLOBAL_PRESCALE

/*
 * Intra dequantisation with per-coefficient ("local") mismatch control:
 * every non-zero result is forced to be odd.
 *
 * The assembly treats every buffer as 8 rows of 8 packed 16-bit words
 * (byte offsets 0x00..0x7f).
 *
 *   block    - in:  quantised coefficients
 *   cache    - out: dequantised, prescaled coefficients
 *   dqmatrix - in:  per-coefficient dequantisation multipliers
 *   psmatrix - in:  per-coefficient prescale multipliers
 *
 * Per word: v = (coefficient * Q) / 8 rounded toward zero; then 1 is
 * subtracted when the coefficient is not negative and bit 0 is set, so an
 * even v moves one step toward zero and an odd v is unchanged.  Words whose
 * 16-bit product is zero are forced back to zero.  The row is finally
 * prescaled by psmatrix and stored to cache.
 *
 * Fix: the second "invert sign" instruction used to operate on mm2 again,
 * which restored mm2 and left mm3 untouched; the subsequent paddw then
 * subtracted 1 from negative values instead of positive ones, making even
 * values odd by moving them away from zero.  It now inverts mm3, matching
 * the "sub 1 if >0" intent of the code.
 *
 * Uses mm0-mm7 and issues no emms; no MMX registers are listed as clobbers.
 * mm6 is cleared but not otherwise used here.
 */
static void inline dequantise_intra_local(dct_t *block,
					  dct_t *cache,
					  dct_t *dqmatrix,
					  dct_t *psmatrix)
{
/* One 16-byte row; `x` is the row digit 0..7 pasted into the offsets.
   Operands: %0 = block, %1 = dqmatrix, %2 = cache, %3 = psmatrix. */
#define DEQUANTISE_INTRA_LOCAL_STEP(x)  				 \
    "movq 0x" #x "0(%0), %%mm0\n"     /* load words 0-3 of the row */    \
    "movq 0x" #x "8(%0), %%mm1\n"     /* load words 4-7 of the row */	 \
    "movq %%mm0, %%mm2\n"             /* copy for the sign mask */	 \
    "movq %%mm1, %%mm3\n"             /* copy for the sign mask */	 \
    "psraw $0x0f, %%mm2\n"            /* mm2 = 0xffff where word < 0 */	 \
    "psraw $0x0f, %%mm3\n"            /* mm3 = 0xffff where word < 0 */	 \
    "pmullw 0x" #x "0(%1), %%mm0\n"   /* mm0=[0-3]*Q */		         \
    "pmullw 0x" #x "8(%1), %%mm1\n"   /* mm1=[4-7]*Q */		         \
    "movq %%mm0, %%mm4\n"             /* mm4 = mm0 */       		 \
    "movq %%mm1, %%mm5\n"             /* mm5 = mm1 */			 \
    "pcmpeqw %%mm7, %%mm4\n"          /* mm4 = 0xffff where product==0 */ \
    "pcmpeqw %%mm7, %%mm5\n"          /* mm5 = 0xffff where product==0 */ \
    "pcmpeqw %%mm7, %%mm4\n"          /* mm4 = 0xffff where product!=0 */ \
    "pcmpeqw %%mm7, %%mm5\n"          /* mm5 = 0xffff where product!=0 */ \
    "paddsw %%mm2, %%mm0\n"           /* negatives: -1 before the shift */ \
    "paddsw %%mm3, %%mm1\n"           /* negatives: -1 before the shift */ \
    "psraw $0x03, %%mm0\n"            /* divide by 8 */                  \
    "psraw $0x03, %%mm1\n"            /* divide by 8 */			 \
    "psubw %%mm2, %%mm0\n"            /* negatives: +1 after the shift */ \
    "psubw %%mm3, %%mm1\n"            /* negatives: +1 after the shift */ \
    "pcmpeqw %%mm7, %%mm2\n"          /* mm2 = 0xffff where word >= 0 */ \
    "pcmpeqw %%mm7, %%mm3\n"          /* mm3 = 0xffff where word >= 0 */ \
    "paddw %%mm2, %%mm0\n"            /* subtract 1 where word >= 0 */   \
    "paddw %%mm3, %%mm1\n"            /* subtract 1 where word >= 0 */   \
    "por " ASMSYM "dequantise_mmx_1, %%mm0\n"   /* set bit 0: force odd */         \
    "por " ASMSYM "dequantise_mmx_1, %%mm1\n"   /* set bit 0: force odd */         \
    "pand %%mm4, %%mm0\n"             /* [0-3]=0 where product was zero */ \
    "pand %%mm5, %%mm1\n"             /* [4-7]=0 where product was zero */ \
    "movq %%mm0, %%mm4\n"             /* mm4 = mm0 */			 \
    "movq %%mm1, %%mm5\n"             /* mm5 = mm1 */			 \
    "pmulhw 0x" #x "0(%3), %%mm0\n"   /* high word of prescale product */ \
    "pmulhw 0x" #x "8(%3), %%mm1\n"   /* high word of prescale product */ \
    "pmullw 0x" #x "0(%3), %%mm4\n"   /* low word of prescale product */ \
    "pmullw 0x" #x "8(%3), %%mm5\n"   /* low word of prescale product */ \
    "movq %%mm4, %%mm2\n"             /* mm2 = low product word */	 \
    "movq %%mm5, %%mm3\n"             /* mm3 = low product word */	 \
    "psllw $0x04, %%mm2\n"            /* drop the top 4 bits ... */	 \
    "psllw $0x04, %%mm3\n"            /* drop the top 4 bits ... */	 \
    "psrlw $0x04, %%mm2\n"            /* ... leaving the low 12 bits */  \
    "psrlw $0x04, %%mm3\n"            /* ... leaving the low 12 bits */  \
    "paddsw %%mm2, %%mm4\n"           /* add the low 12 bits (saturating) */ \
    "paddsw %%mm3, %%mm5\n"           /* add the low 12 bits (saturating) */ \
    "psrlw $0x0c, %%mm4\n"            /* keep most significant 4 bits */ \
    "psrlw $0x0c, %%mm5\n"            /* keep most significant 4 bits */ \
    "psllw $0x04, %%mm0\n"            /* high word * 16 */		 \
    "psllw $0x04, %%mm1\n"            /* high word * 16 */		 \
    "por %%mm4, %%mm0\n"              /* add least significant part */	 \
    "por %%mm5, %%mm1\n"              /* add least significant part */	 \
    "movq %%mm0, 0x" #x "0(%2)\n"     /* store in cache */		 \
    "movq %%mm1, 0x" #x "8(%2)\n"     /* store in cache */

  asm volatile ("pxor %%mm7, %%mm7\n"        /* mm7 = 0 */
		"pxor %%mm6, %%mm6\n"        /* mm6 = 0 (unused below) */
		DEQUANTISE_INTRA_LOCAL_STEP(0)
		DEQUANTISE_INTRA_LOCAL_STEP(1)
		DEQUANTISE_INTRA_LOCAL_STEP(2)
		DEQUANTISE_INTRA_LOCAL_STEP(3)
		DEQUANTISE_INTRA_LOCAL_STEP(4)
		DEQUANTISE_INTRA_LOCAL_STEP(5)
		DEQUANTISE_INTRA_LOCAL_STEP(6)
		DEQUANTISE_INTRA_LOCAL_STEP(7)
	        : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
		: "memory");
}

static void inline dequantise_inter_global(dct_t *block,
					   dct_t *cache,
					   dct_t *dqmatrix,
					   dct_t *psmatrix)
{
  asm volatile ("pxor %%mm7, %%mm7\n"        /* mm7 = 0 */
		"pxor %%mm6, %%mm6\n"        /* mm6 = mismatch accumulator */
		/* first part */
		"movq 0x00(%0), %%mm0\n"     /* load 1st line 1st half */
		"movq 0x08(%0), %%mm1\n"     /* load 1st line 2nd half */
		"movq %%mm0, %%mm2\n"        /* mm2 = 1st line 1st half */
		"movq %%mm1, %%mm3\n"        /* mm3 = 1st line 1st half */
		"psraw $0x0f, %%mm2\n"       /* mm2 = (sign(mm0) - 1) / 2 */
		"psraw $0x0f, %%mm3\n"       /* mm3 = (sign(mm0) - 1) / 2 */
		"paddsw %%mm2, %%mm0\n"      /* mm0 = [0-3]+(sign([0-3])-1)/2*/
		"paddsw %%mm3, %%mm1\n"      /* mm1 = [4-7]+(sign([0-3])-1)/2*/
		"paddsw %%mm0, %%mm0\n"      /* mm0 = 2*[0-3]+sign([0-3])-1 */
		"paddsw %%mm1, %%mm1\n"      /* mm1 = 2*[4-7]+sign([4-7])-1 */
		"pmullw 0x00(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"pmullw 0x08(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */       
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"paddsw 0x00(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"paddsw 0x08(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm4[0-3]==0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm5[0-3]==0 */
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm0[0-3]!=0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm1[0-3]!=0 */
		"pand %%mm4, %%mm0\n"        /* [0-3]=0 if [0-3] was zero */
		"pand %%mm5, %%mm1\n"        /* [4-7]=0 if [4-7] was zero */
		"paddsw %%mm2, %%mm0\n"      /* sign adjust before shift */
		"paddsw %%mm3, %%mm1\n"      /* sign adjust before shift */
		"psraw $0x04, %%mm0\n"       /* divide by 16 */
		"psraw $0x04, %%mm1\n"       /* divide by 16 */
		"psubsw %%mm2, %%mm0\n"      /* sign adjust after shift */
		"psubsw %%mm3, %%mm1\n"      /* sign adjust after shift */
		"pxor %%mm0, %%mm6\n"        /* accumulate mismatch */
		"pxor %%mm1, %%mm6\n"        /* accumulate mismatch */
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"pmulhw 0x00(%3), %%mm0\n"   /* premultiply for iDCT */
		"pmulhw 0x08(%3), %%mm1\n"   /* premultiply for iDCT */
		"pmullw 0x00(%3), %%mm4\n"   /* premultiply for iDCT */
		"pmullw 0x08(%3), %%mm5\n"   /* premultiply for iDCT */
		"paddw %%mm2, %%mm4\n"       /* sign adjust before rounding */
		"paddw %%mm3, %%mm5\n"       /* sign adjust before rounding */
	 	"movq %%mm4, %%mm2\n"        /* mm2 = lower(prescale*[0-3]) */
		"movq %%mm5, %%mm3\n"        /* mm3 = lower(prescale*[0-3]) */
		"psllw $0x04, %%mm2\n"       /* keep 12 bits right aligned */
		"psllw $0x04, %%mm3\n"       /* keep 12 bits right aligned */
		"psrlw $0x04, %%mm2\n"       /* keep only 'fixed point' part */
		"psrlw $0x04, %%mm3\n"       /* keep only 'fixed point' part */
		"paddsw %%mm2, %%mm4\n"      /* add fixed point to number */
		"paddsw %%mm3, %%mm5\n"      /* add fixed point to number */
		"psrlw $0x0c, %%mm4\n"       /* keep most significant 4 bits */
		"psrlw $0x0c, %%mm5\n"       /* keep most significant 4 bits */
		"psllw $0x04, %%mm0\n"       /* multiply by 16 for iDCT */
		"psllw $0x04, %%mm1\n"       /* multiply by 16 for iDCT */
		"por %%mm4, %%mm0\n"         /* add least significant part */
		"por %%mm5, %%mm1\n"         /* add least significant part */
		"movq %%mm0, 0x00(%2)\n"     /* store in cache */
		"movq %%mm1, 0x08(%2)\n"     /* store in cache */
		/* second part */
		"movq 0x10(%0), %%mm0\n"     /* load 1st line 1st half */
		"movq 0x18(%0), %%mm1\n"     /* load 1st line 2nd half */
		"movq %%mm0, %%mm2\n"        /* mm2 = 1st line 1st half */
		"movq %%mm1, %%mm3\n"        /* mm3 = 1st line 1st half */
		"psraw $0x0f, %%mm2\n"       /* mm2 = (sign(mm0) - 1) / 2 */
		"psraw $0x0f, %%mm3\n"       /* mm3 = (sign(mm0) - 1) / 2 */
		"paddsw %%mm2, %%mm0\n"      /* mm0 = [0-3]+(sign([0-3])-1)/2*/
		"paddsw %%mm3, %%mm1\n"      /* mm1 = [4-7]+(sign([0-3])-1)/2*/
		"paddsw %%mm0, %%mm0\n"      /* mm0 = 2*[0-3]+sign([0-3])-1 */
		"paddsw %%mm1, %%mm1\n"      /* mm1 = 2*[4-7]+sign([4-7])-1 */
		"pmullw 0x10(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"pmullw 0x18(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */       
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"paddsw 0x10(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"paddsw 0x18(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm4[0-3]==0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm5[0-3]==0 */
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm0[0-3]!=0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm1[0-3]!=0 */
		"pand %%mm4, %%mm0\n"        /* [0-3]=0 if [0-3] was zero */
		"pand %%mm5, %%mm1\n"        /* [4-7]=0 if [4-7] was zero */
		"paddsw %%mm2, %%mm0\n"      /* sign adjust before shift */
		"paddsw %%mm3, %%mm1\n"      /* sign adjust before shift */
		"psraw $0x04, %%mm0\n"       /* divide by 16 */
		"psraw $0x04, %%mm1\n"       /* divide by 16 */
		"psubsw %%mm2, %%mm0\n"      /* sign adjust after shift */
		"psubsw %%mm3, %%mm1\n"      /* sign adjust after shift */
		"pxor %%mm0, %%mm6\n"        /* accumulate mismatch */
		"pxor %%mm1, %%mm6\n"        /* accumulate mismatch */
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"pmulhw 0x10(%3), %%mm0\n"   /* premultiply for iDCT */
		"pmulhw 0x18(%3), %%mm1\n"   /* premultiply for iDCT */
		"pmullw 0x10(%3), %%mm4\n"   /* premultiply for iDCT */
		"pmullw 0x18(%3), %%mm5\n"   /* premultiply for iDCT */
		"paddw %%mm2, %%mm4\n"       /* sign adjust before rounding */
		"paddw %%mm3, %%mm5\n"       /* sign adjust before rounding */
	 	"movq %%mm4, %%mm2\n"        /* mm2 = lower(prescale*[0-3]) */
		"movq %%mm5, %%mm3\n"        /* mm3 = lower(prescale*[0-3]) */
		"psllw $0x04, %%mm2\n"       /* keep 12 bits right aligned */
		"psllw $0x04, %%mm3\n"       /* keep 12 bits right aligned */
		"psrlw $0x04, %%mm2\n"       /* keep only 'fixed point' part */
		"psrlw $0x04, %%mm3\n"       /* keep only 'fixed point' part */
		"paddsw %%mm2, %%mm4\n"      /* add fixed point to number */
		"paddsw %%mm3, %%mm5\n"      /* add fixed point to number */
		"psrlw $0x0c, %%mm4\n"       /* keep most significant 4 bits */
		"psrlw $0x0c, %%mm5\n"       /* keep most significant 4 bits */
		"psllw $0x04, %%mm0\n"       /* multiply by 16 for iDCT */
		"psllw $0x04, %%mm1\n"       /* multiply by 16 for iDCT */
		"por %%mm4, %%mm0\n"         /* add least significant part */
		"por %%mm5, %%mm1\n"         /* add least significant part */
		"movq %%mm0, 0x10(%2)\n"     /* store in cache */
		"movq %%mm1, 0x18(%2)\n"     /* store in cache */
		/* third part */
		"movq 0x20(%0), %%mm0\n"     /* load 1st line 1st half */
		"movq 0x28(%0), %%mm1\n"     /* load 1st line 2nd half */
		"movq %%mm0, %%mm2\n"        /* mm2 = 1st line 1st half */
		"movq %%mm1, %%mm3\n"        /* mm3 = 1st line 1st half */
		"psraw $0x0f, %%mm2\n"       /* mm2 = (sign(mm0) - 1) / 2 */
		"psraw $0x0f, %%mm3\n"       /* mm3 = (sign(mm0) - 1) / 2 */
		"paddsw %%mm2, %%mm0\n"      /* mm0 = [0-3]+(sign([0-3])-1)/2*/
		"paddsw %%mm3, %%mm1\n"      /* mm1 = [4-7]+(sign([0-3])-1)/2*/
		"paddsw %%mm0, %%mm0\n"      /* mm0 = 2*[0-3]+sign([0-3])-1 */
		"paddsw %%mm1, %%mm1\n"      /* mm1 = 2*[4-7]+sign([4-7])-1 */
		"pmullw 0x20(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"pmullw 0x28(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */       
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"paddsw 0x20(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"paddsw 0x28(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm4[0-3]==0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm5[0-3]==0 */
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm0[0-3]!=0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm1[0-3]!=0 */
		"pand %%mm4, %%mm0\n"        /* [0-3]=0 if [0-3] was zero */
		"pand %%mm5, %%mm1\n"        /* [4-7]=0 if [4-7] was zero */
		"paddsw %%mm2, %%mm0\n"      /* sign adjust before shift */
		"paddsw %%mm3, %%mm1\n"      /* sign adjust before shift */
		"psraw $0x04, %%mm0\n"       /* divide by 16 */
		"psraw $0x04, %%mm1\n"       /* divide by 16 */
		"psubsw %%mm2, %%mm0\n"      /* sign adjust after shift */
		"psubsw %%mm3, %%mm1\n"      /* sign adjust after shift */
		"pxor %%mm0, %%mm6\n"        /* accumulate mismatch */
		"pxor %%mm1, %%mm6\n"        /* accumulate mismatch */
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"pmulhw 0x20(%3), %%mm0\n"   /* premultiply for iDCT */
		"pmulhw 0x28(%3), %%mm1\n"   /* premultiply for iDCT */
		"pmullw 0x20(%3), %%mm4\n"   /* premultiply for iDCT */
		"pmullw 0x28(%3), %%mm5\n"   /* premultiply for iDCT */
		"paddw %%mm2, %%mm4\n"       /* sign adjust before rounding */
		"paddw %%mm3, %%mm5\n"       /* sign adjust before rounding */
	 	"movq %%mm4, %%mm2\n"        /* mm2 = lower(prescale*[0-3]) */
		"movq %%mm5, %%mm3\n"        /* mm3 = lower(prescale*[0-3]) */
		"psllw $0x04, %%mm2\n"       /* keep 12 bits right aligned */
		"psllw $0x04, %%mm3\n"       /* keep 12 bits right aligned */
		"psrlw $0x04, %%mm2\n"       /* keep only 'fixed point' part */
		"psrlw $0x04, %%mm3\n"       /* keep only 'fixed point' part */
		"paddsw %%mm2, %%mm4\n"      /* add fixed point to number */
		"paddsw %%mm3, %%mm5\n"      /* add fixed point to number */
		"psrlw $0x0c, %%mm4\n"       /* keep most significant 4 bits */
		"psrlw $0x0c, %%mm5\n"       /* keep most significant 4 bits */
		"psllw $0x04, %%mm0\n"       /* multiply by 16 for iDCT */
		"psllw $0x04, %%mm1\n"       /* multiply by 16 for iDCT */
		"por %%mm4, %%mm0\n"         /* add least significant part */
		"por %%mm5, %%mm1\n"         /* add least significant part */
		"movq %%mm0, 0x20(%2)\n"     /* store in cache */
		"movq %%mm1, 0x28(%2)\n"     /* store in cache */
		/* fourth part */
		"movq 0x30(%0), %%mm0\n"     /* load 1st line 1st half */
		"movq 0x38(%0), %%mm1\n"     /* load 1st line 2nd half */
		"movq %%mm0, %%mm2\n"        /* mm2 = 1st line 1st half */
		"movq %%mm1, %%mm3\n"        /* mm3 = 1st line 1st half */
		"psraw $0x0f, %%mm2\n"       /* mm2 = (sign(mm0) - 1) / 2 */
		"psraw $0x0f, %%mm3\n"       /* mm3 = (sign(mm0) - 1) / 2 */
		"paddsw %%mm2, %%mm0\n"      /* mm0 = [0-3]+(sign([0-3])-1)/2*/
		"paddsw %%mm3, %%mm1\n"      /* mm1 = [4-7]+(sign([0-3])-1)/2*/
		"paddsw %%mm0, %%mm0\n"      /* mm0 = 2*[0-3]+sign([0-3])-1 */
		"paddsw %%mm1, %%mm1\n"      /* mm1 = 2*[4-7]+sign([4-7])-1 */
		"pmullw 0x30(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"pmullw 0x38(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */       
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"paddsw 0x30(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"paddsw 0x38(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm4[0-3]==0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm5[0-3]==0 */
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm0[0-3]!=0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm1[0-3]!=0 */
		"pand %%mm4, %%mm0\n"        /* [0-3]=0 if [0-3] was zero */
		"pand %%mm5, %%mm1\n"        /* [4-7]=0 if [4-7] was zero */
		"paddsw %%mm2, %%mm0\n"      /* sign adjust before shift */
		"paddsw %%mm3, %%mm1\n"      /* sign adjust before shift */
		"psraw $0x04, %%mm0\n"       /* divide by 16 */
		"psraw $0x04, %%mm1\n"       /* divide by 16 */
		"psubsw %%mm2, %%mm0\n"      /* sign adjust after shift */
		"psubsw %%mm3, %%mm1\n"      /* sign adjust after shift */
		"pxor %%mm0, %%mm6\n"        /* accumulate mismatch */
		"pxor %%mm1, %%mm6\n"        /* accumulate mismatch */
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"pmulhw 0x30(%3), %%mm0\n"   /* premultiply for iDCT */
		"pmulhw 0x38(%3), %%mm1\n"   /* premultiply for iDCT */
		"pmullw 0x30(%3), %%mm4\n"   /* premultiply for iDCT */
		"pmullw 0x38(%3), %%mm5\n"   /* premultiply for iDCT */
		"paddw %%mm2, %%mm4\n"       /* sign adjust before rounding */
		"paddw %%mm3, %%mm5\n"       /* sign adjust before rounding */
	 	"movq %%mm4, %%mm2\n"        /* mm2 = lower(prescale*[0-3]) */
		"movq %%mm5, %%mm3\n"        /* mm3 = lower(prescale*[0-3]) */
		"psllw $0x04, %%mm2\n"       /* keep 12 bits right aligned */
		"psllw $0x04, %%mm3\n"       /* keep 12 bits right aligned */
		"psrlw $0x04, %%mm2\n"       /* keep only 'fixed point' part */
		"psrlw $0x04, %%mm3\n"       /* keep only 'fixed point' part */
		"paddsw %%mm2, %%mm4\n"      /* add fixed point to number */
		"paddsw %%mm3, %%mm5\n"      /* add fixed point to number */
		"psrlw $0x0c, %%mm4\n"       /* keep most significant 4 bits */
		"psrlw $0x0c, %%mm5\n"       /* keep most significant 4 bits */
		"psllw $0x04, %%mm0\n"       /* multiply by 16 for iDCT */
		"psllw $0x04, %%mm1\n"       /* multiply by 16 for iDCT */
		"por %%mm4, %%mm0\n"         /* add least significant part */
		"por %%mm5, %%mm1\n"         /* add least significant part */
		"movq %%mm0, 0x30(%2)\n"     /* store in cache */
		"movq %%mm1, 0x38(%2)\n"     /* store in cache */
		/* fifth part */
		"movq 0x40(%0), %%mm0\n"     /* load 1st line 1st half */
		"movq 0x48(%0), %%mm1\n"     /* load 1st line 2nd half */
		"movq %%mm0, %%mm2\n"        /* mm2 = 1st line 1st half */
		"movq %%mm1, %%mm3\n"        /* mm3 = 1st line 1st half */
		"psraw $0x0f, %%mm2\n"       /* mm2 = (sign(mm0) - 1) / 2 */
		"psraw $0x0f, %%mm3\n"       /* mm3 = (sign(mm0) - 1) / 2 */
		"paddsw %%mm2, %%mm0\n"      /* mm0 = [0-3]+(sign([0-3])-1)/2*/
		"paddsw %%mm3, %%mm1\n"      /* mm1 = [4-7]+(sign([0-3])-1)/2*/
		"paddsw %%mm0, %%mm0\n"      /* mm0 = 2*[0-3]+sign([0-3])-1 */
		"paddsw %%mm1, %%mm1\n"      /* mm1 = 2*[4-7]+sign([4-7])-1 */
		"pmullw 0x40(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"pmullw 0x48(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */       
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"paddsw 0x40(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"paddsw 0x48(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm4[0-3]==0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm5[0-3]==0 */
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm0[0-3]!=0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm1[0-3]!=0 */
		"pand %%mm4, %%mm0\n"        /* [0-3]=0 if [0-3] was zero */
		"pand %%mm5, %%mm1\n"        /* [4-7]=0 if [4-7] was zero */
		"paddsw %%mm2, %%mm0\n"      /* sign adjust before shift */
		"paddsw %%mm3, %%mm1\n"      /* sign adjust before shift */
		"psraw $0x04, %%mm0\n"       /* divide by 16 */
		"psraw $0x04, %%mm1\n"       /* divide by 16 */
		"psubsw %%mm2, %%mm0\n"      /* sign adjust after shift */
		"psubsw %%mm3, %%mm1\n"      /* sign adjust after shift */
		"pxor %%mm0, %%mm6\n"        /* accumulate mismatch */
		"pxor %%mm1, %%mm6\n"        /* accumulate mismatch */
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"pmulhw 0x40(%3), %%mm0\n"   /* premultiply for iDCT */
		"pmulhw 0x48(%3), %%mm1\n"   /* premultiply for iDCT */
		"pmullw 0x40(%3), %%mm4\n"   /* premultiply for iDCT */
		"pmullw 0x48(%3), %%mm5\n"   /* premultiply for iDCT */
		"paddw %%mm2, %%mm4\n"       /* sign adjust before rounding */
		"paddw %%mm3, %%mm5\n"       /* sign adjust before rounding */
	 	"movq %%mm4, %%mm2\n"        /* mm2 = lower(prescale*[0-3]) */
		"movq %%mm5, %%mm3\n"        /* mm3 = lower(prescale*[0-3]) */
		"psllw $0x04, %%mm2\n"       /* keep 12 bits right aligned */
		"psllw $0x04, %%mm3\n"       /* keep 12 bits right aligned */
		"psrlw $0x04, %%mm2\n"       /* keep only 'fixed point' part */
		"psrlw $0x04, %%mm3\n"       /* keep only 'fixed point' part */
		"paddsw %%mm2, %%mm4\n"      /* add fixed point to number */
		"paddsw %%mm3, %%mm5\n"      /* add fixed point to number */
		"psrlw $0x0c, %%mm4\n"       /* keep most significant 4 bits */
		"psrlw $0x0c, %%mm5\n"       /* keep most significant 4 bits */
		"psllw $0x04, %%mm0\n"       /* multiply by 16 for iDCT */
		"psllw $0x04, %%mm1\n"       /* multiply by 16 for iDCT */
		"por %%mm4, %%mm0\n"         /* add least significant part */
		"por %%mm5, %%mm1\n"         /* add least significant part */
		"movq %%mm0, 0x40(%2)\n"     /* store in cache */
		"movq %%mm1, 0x48(%2)\n"     /* store in cache */
		/* sixth part */
		"movq 0x50(%0), %%mm0\n"     /* load 1st line 1st half */
		"movq 0x58(%0), %%mm1\n"     /* load 1st line 2nd half */
		"movq %%mm0, %%mm2\n"        /* mm2 = 1st line 1st half */
		"movq %%mm1, %%mm3\n"        /* mm3 = 1st line 1st half */
		"psraw $0x0f, %%mm2\n"       /* mm2 = (sign(mm0) - 1) / 2 */
		"psraw $0x0f, %%mm3\n"       /* mm3 = (sign(mm0) - 1) / 2 */
		"paddsw %%mm2, %%mm0\n"      /* mm0 = [0-3]+(sign([0-3])-1)/2*/
		"paddsw %%mm3, %%mm1\n"      /* mm1 = [4-7]+(sign([0-3])-1)/2*/
		"paddsw %%mm0, %%mm0\n"      /* mm0 = 2*[0-3]+sign([0-3])-1 */
		"paddsw %%mm1, %%mm1\n"      /* mm1 = 2*[4-7]+sign([4-7])-1 */
		"pmullw 0x50(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"pmullw 0x58(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */       
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"paddsw 0x50(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"paddsw 0x58(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm4[0-3]==0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm5[0-3]==0 */
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm0[0-3]!=0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm1[0-3]!=0 */
		"pand %%mm4, %%mm0\n"        /* [0-3]=0 if [0-3] was zero */
		"pand %%mm5, %%mm1\n"        /* [4-7]=0 if [4-7] was zero */
		"paddsw %%mm2, %%mm0\n"      /* sign adjust before shift */
		"paddsw %%mm3, %%mm1\n"      /* sign adjust before shift */
		"psraw $0x04, %%mm0\n"       /* divide by 16 */
		"psraw $0x04, %%mm1\n"       /* divide by 16 */
		"psubsw %%mm2, %%mm0\n"      /* sign adjust after shift */
		"psubsw %%mm3, %%mm1\n"      /* sign adjust after shift */
		"pxor %%mm0, %%mm6\n"        /* accumulate mismatch */
		"pxor %%mm1, %%mm6\n"        /* accumulate mismatch */
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"pmulhw 0x50(%3), %%mm0\n"   /* premultiply for iDCT */
		"pmulhw 0x58(%3), %%mm1\n"   /* premultiply for iDCT */
		"pmullw 0x50(%3), %%mm4\n"   /* premultiply for iDCT */
		"pmullw 0x58(%3), %%mm5\n"   /* premultiply for iDCT */
		"paddw %%mm2, %%mm4\n"       /* sign adjust before rounding */
		"paddw %%mm3, %%mm5\n"       /* sign adjust before rounding */
	 	"movq %%mm4, %%mm2\n"        /* mm2 = lower(prescale*[0-3]) */
		"movq %%mm5, %%mm3\n"        /* mm3 = lower(prescale*[0-3]) */
		"psllw $0x04, %%mm2\n"       /* keep 12 bits right aligned */
		"psllw $0x04, %%mm3\n"       /* keep 12 bits right aligned */
		"psrlw $0x04, %%mm2\n"       /* keep only 'fixed point' part */
		"psrlw $0x04, %%mm3\n"       /* keep only 'fixed point' part */
		"paddsw %%mm2, %%mm4\n"      /* add fixed point to number */
		"paddsw %%mm3, %%mm5\n"      /* add fixed point to number */
		"psrlw $0x0c, %%mm4\n"       /* keep most significant 4 bits */
		"psrlw $0x0c, %%mm5\n"       /* keep most significant 4 bits */
		"psllw $0x04, %%mm0\n"       /* multiply by 16 for iDCT */
		"psllw $0x04, %%mm1\n"       /* multiply by 16 for iDCT */
		"por %%mm4, %%mm0\n"         /* add least significant part */
		"por %%mm5, %%mm1\n"         /* add least significant part */
		"movq %%mm0, 0x50(%2)\n"     /* store in cache */
		"movq %%mm1, 0x58(%2)\n"     /* store in cache */
		/* seventh part */
		"movq 0x60(%0), %%mm0\n"     /* load 1st line 1st half */
		"movq 0x68(%0), %%mm1\n"     /* load 1st line 2nd half */
		"movq %%mm0, %%mm2\n"        /* mm2 = 1st line 1st half */
		"movq %%mm1, %%mm3\n"        /* mm3 = 1st line 1st half */
		"psraw $0x0f, %%mm2\n"       /* mm2 = (sign(mm0) - 1) / 2 */
		"psraw $0x0f, %%mm3\n"       /* mm3 = (sign(mm0) - 1) / 2 */
		"paddsw %%mm2, %%mm0\n"      /* mm0 = [0-3]+(sign([0-3])-1)/2*/
		"paddsw %%mm3, %%mm1\n"      /* mm1 = [4-7]+(sign([0-3])-1)/2*/
		"paddsw %%mm0, %%mm0\n"      /* mm0 = 2*[0-3]+sign([0-3])-1 */
		"paddsw %%mm1, %%mm1\n"      /* mm1 = 2*[4-7]+sign([4-7])-1 */
		"pmullw 0x60(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"pmullw 0x68(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */       
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"paddsw 0x60(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"paddsw 0x68(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm4[0-3]==0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm5[0-3]==0 */
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm0[0-3]!=0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm1[0-3]!=0 */
		"pand %%mm4, %%mm0\n"        /* [0-3]=0 if [0-3] was zero */
		"pand %%mm5, %%mm1\n"        /* [4-7]=0 if [4-7] was zero */
		"paddsw %%mm2, %%mm0\n"      /* sign adjust before shift */
		"paddsw %%mm3, %%mm1\n"      /* sign adjust before shift */
		"psraw $0x04, %%mm0\n"       /* divide by 16 */
		"psraw $0x04, %%mm1\n"       /* divide by 16 */
		"psubsw %%mm2, %%mm0\n"      /* sign adjust after shift */
		"psubsw %%mm3, %%mm1\n"      /* sign adjust after shift */
		"pxor %%mm0, %%mm6\n"        /* accumulate mismatch */
		"pxor %%mm1, %%mm6\n"        /* accumulate mismatch */
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"pmulhw 0x60(%3), %%mm0\n"   /* premultiply for iDCT */
		"pmulhw 0x68(%3), %%mm1\n"   /* premultiply for iDCT */
		"pmullw 0x60(%3), %%mm4\n"   /* premultiply for iDCT */
		"pmullw 0x68(%3), %%mm5\n"   /* premultiply for iDCT */
		"paddw %%mm2, %%mm4\n"       /* sign adjust before rounding */
		"paddw %%mm3, %%mm5\n"       /* sign adjust before rounding */
	 	"movq %%mm4, %%mm2\n"        /* mm2 = lower(prescale*[0-3]) */
		"movq %%mm5, %%mm3\n"        /* mm3 = lower(prescale*[0-3]) */
		"psllw $0x04, %%mm2\n"       /* keep 12 bits right aligned */
		"psllw $0x04, %%mm3\n"       /* keep 12 bits right aligned */
		"psrlw $0x04, %%mm2\n"       /* keep only 'fixed point' part */
		"psrlw $0x04, %%mm3\n"       /* keep only 'fixed point' part */
		"paddsw %%mm2, %%mm4\n"      /* add fixed point to number */
		"paddsw %%mm3, %%mm5\n"      /* add fixed point to number */
		"psrlw $0x0c, %%mm4\n"       /* keep most significant 4 bits */
		"psrlw $0x0c, %%mm5\n"       /* keep most significant 4 bits */
		"psllw $0x04, %%mm0\n"       /* multiply by 16 for iDCT */
		"psllw $0x04, %%mm1\n"       /* multiply by 16 for iDCT */
		"por %%mm4, %%mm0\n"         /* add least significant part */
		"por %%mm5, %%mm1\n"         /* add least significant part */
		"movq %%mm0, 0x60(%2)\n"     /* store in cache */
		"movq %%mm1, 0x68(%2)\n"     /* store in cache */
		/* eighth part */
		"movq 0x70(%0), %%mm0\n"     /* load 1st line 1st half */
		"movq 0x78(%0), %%mm1\n"     /* load 1st line 2nd half */
		"movq %%mm0, %%mm2\n"        /* mm2 = 1st line 1st half */
		"movq %%mm1, %%mm3\n"        /* mm3 = 1st line 1st half */
		"psraw $0x0f, %%mm2\n"       /* mm2 = (sign(mm0) - 1) / 2 */
		"psraw $0x0f, %%mm3\n"       /* mm3 = (sign(mm0) - 1) / 2 */
		"paddsw %%mm2, %%mm0\n"      /* mm0 = [0-3]+(sign([0-3])-1)/2*/
		"paddsw %%mm3, %%mm1\n"      /* mm1 = [4-7]+(sign([0-3])-1)/2*/
		"paddsw %%mm0, %%mm0\n"      /* mm0 = 2*[0-3]+sign([0-3])-1 */
		"paddsw %%mm1, %%mm1\n"      /* mm1 = 2*[4-7]+sign([4-7])-1 */
		"pmullw 0x70(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"pmullw 0x78(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */       
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"paddsw 0x70(%1), %%mm0\n"   /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/
		"paddsw 0x78(%1), %%mm1\n"   /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm4[0-3]==0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm5[0-3]==0 */
		"pcmpeqw %%mm7, %%mm4\n"     /* mm4[0-3]=0xFF if mm0[0-3]!=0 */
		"pcmpeqw %%mm7, %%mm5\n"     /* mm5[0-3]=0xFF if mm1[0-3]!=0 */
		"pand %%mm4, %%mm0\n"        /* [0-3]=0 if [0-3] was zero */
		"pand %%mm5, %%mm1\n"        /* [4-7]=0 if [4-7] was zero */
		"paddsw %%mm2, %%mm0\n"      /* sign adjust before shift */
		"paddsw %%mm3, %%mm1\n"      /* sign adjust before shift */
		"psraw $0x04, %%mm0\n"       /* divide by 16 */
		"psraw $0x04, %%mm1\n"       /* divide by 16 */
		"psubsw %%mm2, %%mm0\n"      /* sign adjust after shift */
		"psubsw %%mm3, %%mm1\n"      /* sign adjust after shift */
		"pxor %%mm0, %%mm6\n"        /* accumulate mismatch */
		"pxor %%mm1, %%mm6\n"        /* accumulate mismatch */
	        /* mismatch control */
		"movq %%mm6, %%mm5\n"        /* copy mismatch */
                "psllq $0x20, %%mm5\n"       /* mm5 = higher 32 bits */
                "pxor %%mm6, %%mm5\n"        /* sum mismatch */
		"movq %%mm5, %%mm4\n"        /* copy mismatch */
                "psllq $0x10, %%mm5\n"       /* mm5 =  higher 14 bits */
                "pxor %%mm5, %%mm4\n"        /* sum mismatch */
		"movq %%mm7, %%mm6\n"        /* mm6 = mm7 */
		"pcmpeqw %%mm7, %%mm6\n"     /* mm6 = 0xffffffffffffffff */
                "psllq $0x3f, %%mm6\n"       /* mm6 = 0x7000000000000000 */
		"psrlq $0x0f, %%mm6\n"       /* mm6 = 0x0001000000000000 */
                "pxor %%mm6, %%mm1\n"        /* last coeff ^= 1 */
                "pand %%mm6, %%mm4\n"        /* keep only lsb of mismatch */
                "pxor %%mm4, %%mm1\n"        /* last coeff  ^= !(mismatch&1) */
		"movq %%mm0, %%mm4\n"        /* mm4 = mm0 */
		"movq %%mm1, %%mm5\n"        /* mm5 = mm1 */
		"pmulhw 0x70(%3), %%mm0\n"   /* premultiply for iDCT */
		"pmulhw 0x78(%3), %%mm1\n"   /* premultiply for iDCT */
		"pmullw 0x70(%3), %%mm4\n"   /* premultiply for iDCT */
		"pmullw 0x78(%3), %%mm5\n"   /* premultiply for iDCT */
		"paddw %%mm2, %%mm4\n"       /* sign adjust before rounding */
		"paddw %%mm3, %%mm5\n"       /* sign adjust before rounding */
	 	"movq %%mm4, %%mm2\n"        /* mm2 = lower(prescale*[0-3]) */
		"movq %%mm5, %%mm3\n"        /* mm3 = lower(prescale*[0-3]) */
		"psllw $0x04, %%mm2\n"       /* keep 12 bits right aligned */
		"psllw $0x04, %%mm3\n"       /* keep 12 bits right aligned */
		"psrlw $0x04, %%mm2\n"       /* keep only 'fixed point' part */
		"psrlw $0x04, %%mm3\n"       /* keep only 'fixed point' part */
		"paddsw %%mm2, %%mm4\n"      /* add fixed point to number */
		"paddsw %%mm3, %%mm5\n"      /* add fixed point to number */
		"psrlw $0x0c, %%mm4\n"       /* keep most significant 4 bits */
		"psrlw $0x0c, %%mm5\n"       /* keep most significant 4 bits */
		"psllw $0x04, %%mm0\n"       /* multiply by 16 for iDCT */
		"psllw $0x04, %%mm1\n"       /* multiply by 16 for iDCT */
		"por %%mm4, %%mm0\n"         /* add least significant part */
		"por %%mm5, %%mm1\n"         /* add least significant part */
		"movq %%mm0, 0x70(%2)\n"     /* store in cache */
		"movq %%mm1, 0x78(%2)\n"     /* store in cache */
	        : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
		: "memory");
}

/*
 * dequantise_inter_local - dequantise one block of inter coefficients with
 * MMX and prescale the result for the iDCT.
 *
 *   block    : input coefficients, read as 64 signed 16-bit lanes (128 bytes)
 *   cache    : output, 64 16-bit lanes (128 bytes) are written
 *   dqmatrix : per-coefficient dequantisation multiplier Q (128 bytes read)
 *   psmatrix : per-coefficient iDCT prescale factor P (128 bytes read)
 *
 * Per 16-bit lane, with c the input coefficient and s = c >> 15 (0 or -1):
 *   v   = 2*(c + s) * Q + Q       i.e. (2c+1)*Q for c >= 0, (2c-1)*Q for c < 0
 *                                  (adds saturate, the product keeps 16 bits)
 *   v   = ((v + s) >> 4) - s      divide by 16; rounds toward zero when v has
 *                                  the sign of c
 *   v   = (c >= 0 ? v - 1 : v) | 1   force the value odd
 *   v   = 0 if 2*(c + s)*Q == 0   keeps zero coefficients zero
 *   hi:lo = v * P                 signed 16x16 multiply, high and low halves
 *   lo  = sat(lo + s + ((lo + s) & 0x0fff))
 *   out = (hi << 4) | (lo >> 12)  roughly (v * P) >> 12
 *
 * NOTE(review): the odd-forcing step looks like MPEG-1 style "oddification";
 * confirm against the encoder's reference (non-MMX) path.
 *
 * The asm leaves the MMX unit in use (no emms is executed here), so x87
 * floating point code must not run until the caller has issued emms.
 */
static void inline dequantise_inter_local(dct_t *block,
					  dct_t *cache,
					  dct_t *dqmatrix,
					  dct_t *psmatrix)
{
/*
 * One step handles row x of the block: the eight 16-bit lanes at byte
 * offsets 0x<x>0 and 0x<x>8. mm0/mm1 carry the two halves of the row,
 * mm2/mm3 their sign masks, mm4/mm5 are scratch, mm7 must be zero.
 * Operands: %0 = block, %1 = dqmatrix, %2 = cache, %3 = psmatrix.
 * ASMSYM is defined outside this function (presumably the platform's
 * assembler symbol prefix); dequantise_mmx_1 is the file-level {1,1,1,1}.
 */
#define DEQUANTISE_INTER_LOCAL_STEP(x)					 \
    "movq 0x" #x "0(%0), %%mm0\n"     /* mm0 = c[0-3] */		 \
    "movq 0x" #x "8(%0), %%mm1\n"     /* mm1 = c[4-7] */		 \
    "movq %%mm0, %%mm2\n"             /* mm2 = copy of c[0-3] */	 \
    "movq %%mm1, %%mm3\n"             /* mm3 = copy of c[4-7] */	 \
    "psraw $0x0f, %%mm2\n"            /* mm2 = s: 0xFFFF if c<0 else 0 */ \
    "psraw $0x0f, %%mm3\n"            /* mm3 = s: 0xFFFF if c<0 else 0 */ \
    "paddsw %%mm2, %%mm0\n"           /* mm0 = c + s (c-1 if c<0) */	 \
    "paddsw %%mm3, %%mm1\n"           /* mm1 = c + s (c-1 if c<0) */	 \
    "paddsw %%mm0, %%mm0\n"           /* mm0 = 2*(c + s), saturating */	 \
    "paddsw %%mm1, %%mm1\n"           /* mm1 = 2*(c + s), saturating */	 \
    "pmullw 0x" #x "0(%1), %%mm0\n"   /* mm0 = 2*(c + s)*Q, low 16 bits */ \
    "pmullw 0x" #x "8(%1), %%mm1\n"   /* mm1 = 2*(c + s)*Q, low 16 bits */ \
    "movq %%mm0, %%mm4\n"             /* mm4 = product, for zero test */ \
    "movq %%mm1, %%mm5\n"             /* mm5 = product, for zero test */ \
    "paddsw 0x" #x "0(%1), %%mm0\n"   /* mm0 = 2*(c + s)*Q + Q */	 \
    "paddsw 0x" #x "8(%1), %%mm1\n"   /* mm1 = 2*(c + s)*Q + Q */	 \
    "pcmpeqw %%mm7, %%mm4\n"          /* mm4 = 0xFFFF where product==0 */ \
    "pcmpeqw %%mm7, %%mm5\n"          /* mm5 = 0xFFFF where product==0 */ \
    "pcmpeqw %%mm7, %%mm4\n"          /* mm4 = 0xFFFF where product!=0 */ \
    "pcmpeqw %%mm7, %%mm5\n"          /* mm5 = 0xFFFF where product!=0 */ \
    "paddsw %%mm2, %%mm0\n"           /* sign adjust before shift */	 \
    "paddsw %%mm3, %%mm1\n"           /* sign adjust before shift */	 \
    "psraw $0x04, %%mm0\n"            /* arithmetic divide by 16 */	 \
    "psraw $0x04, %%mm1\n"            /* arithmetic divide by 16 */	 \
    "psubw %%mm2, %%mm0\n"            /* sign adjust after shift */	 \
    "psubw %%mm3, %%mm1\n"            /* sign adjust after shift */	 \
    "pcmpeqw %%mm7, %%mm2\n"          /* mm2 = ~s: 0xFFFF where c>=0 */	 \
    "pcmpeqw %%mm7, %%mm3\n"          /* mm3 = ~s: 0xFFFF where c>=0 */	 \
    "paddw %%mm2, %%mm0\n"            /* subtract 1 where c>=0 */	 \
    "paddw %%mm3, %%mm1\n"            /* subtract 1 where c>=0 */	 \
    "por " ASMSYM "dequantise_mmx_1, %%mm0\n"   /* set bit 0: force odd */ \
    "por " ASMSYM "dequantise_mmx_1, %%mm1\n"   /* set bit 0: force odd */ \
    "pand %%mm4, %%mm0\n"             /* zero lanes whose product was 0 */ \
    "pand %%mm5, %%mm1\n"             /* zero lanes whose product was 0 */ \
    "movq %%mm0, %%mm4\n"             /* mm4 = dequantised v[0-3] */	 \
    "movq %%mm1, %%mm5\n"             /* mm5 = dequantised v[4-7] */	 \
    "pmulhw 0x" #x "0(%3), %%mm0\n"   /* mm0 = high 16 bits of v*P */	 \
    "pmulhw 0x" #x "8(%3), %%mm1\n"   /* mm1 = high 16 bits of v*P */	 \
    "pmullw 0x" #x "0(%3), %%mm4\n"   /* mm4 = low 16 bits of v*P */	 \
    "pmullw 0x" #x "8(%3), %%mm5\n"   /* mm5 = low 16 bits of v*P */	 \
    "pcmpeqw %%mm7, %%mm2\n"          /* mm2 = s again (invert ~s) */	 \
    "pcmpeqw %%mm7, %%mm3\n"          /* mm3 = s again (invert ~s) */	 \
    "paddw %%mm2, %%mm4\n"            /* sign adjust before rounding */	 \
    "paddw %%mm3, %%mm5\n"            /* sign adjust before rounding */	 \
    "movq %%mm4, %%mm2\n"             /* mm2 = adjusted low half [0-3] */ \
    "movq %%mm5, %%mm3\n"             /* mm3 = adjusted low half [4-7] */ \
    "psllw $0x04, %%mm2\n"            /* drop the top 4 bits ... */	 \
    "psllw $0x04, %%mm3\n"            /* drop the top 4 bits ... */	 \
    "psrlw $0x04, %%mm2\n"            /* ... leaving low & 0x0fff */	 \
    "psrlw $0x04, %%mm3\n"            /* ... leaving low & 0x0fff */	 \
    "paddsw %%mm2, %%mm4\n"           /* low += low & 0x0fff (saturating) */ \
    "paddsw %%mm3, %%mm5\n"           /* low += low & 0x0fff (saturating) */ \
    "psrlw $0x0c, %%mm4\n"            /* keep most significant 4 bits */ \
    "psrlw $0x0c, %%mm5\n"            /* keep most significant 4 bits */ \
    "psllw $0x04, %%mm0\n"            /* high half * 16, frees low 4 bits */ \
    "psllw $0x04, %%mm1\n"            /* high half * 16, frees low 4 bits */ \
    "por %%mm4, %%mm0\n"              /* merge in the 4 bits from low */ \
    "por %%mm5, %%mm1\n"              /* merge in the 4 bits from low */ \
    "movq %%mm0, 0x" #x "0(%2)\n"     /* store lanes [0-3] in cache */	 \
    "movq %%mm1, 0x" #x "8(%2)\n"     /* store lanes [4-7] in cache */

  /*
   * All eight MMX registers are overwritten by the template, so they are
   * listed as clobbers: the compiler must not assume any of them survives
   * this statement.
   */
  asm volatile ("pxor %%mm7, %%mm7\n"        /* mm7 = 0, compare reference */
		"pxor %%mm6, %%mm6\n"        /* mm6 = 0 (not read by the steps) */
		DEQUANTISE_INTER_LOCAL_STEP(0)
		DEQUANTISE_INTER_LOCAL_STEP(1)
		DEQUANTISE_INTER_LOCAL_STEP(2)
		DEQUANTISE_INTER_LOCAL_STEP(3)
		DEQUANTISE_INTER_LOCAL_STEP(4)
		DEQUANTISE_INTER_LOCAL_STEP(5)
		DEQUANTISE_INTER_LOCAL_STEP(6)
		DEQUANTISE_INTER_LOCAL_STEP(7)
	        : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix)
		: "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix)
		: "memory",
		  "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
}
