/*
    libfame - Fast Assembly MPEG Encoder Library
    Copyright (C) 2000-2001 Vivien Chappelier

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*************************** MMX accelerated DCT *****************************/
/* Warning: Didn't check the DCT was IEEE compliant. It is probably not.     */
/* TODO: Write an IEEE compliant DCT/iDCT                                    */

/* DCT flags */
/*
 * Compile-time switches for the DCT code in this file.  Both are plain
 * presence flags (tested with #ifdef, the value is irrelevant).
 */
/* define to have a more accurate but slower DCT */
/* (dct_aan_pass tests this with #ifdef to add extra shift instructions) */
/* #define precision */
/* define to adjust DCT computation according to the sign */
/* NOTE(review): no #ifdef signbit appears in the code visible next to this
   define; the sign adjustment in dct_aan_pass is emitted unconditionally.
   Confirm whether the flag is still consulted anywhere.
   NOTE(review): `signbit` is also the name of a C99 <math.h> macro, so
   including <math.h> together with this file can clash -- confirm. */
#define signbit

/* AA&N coefficients */
/*
 * Fixed-point multipliers consumed by pmulhw in dct_aan_pass.  pmulhw keeps
 * the high 16 bits of each signed 16x16 product, i.e. (x * c) >> 16:
 *   - a constant scaled by 2^15 ("15-bit") yields x * c / 2, which is why
 *     the asm shifts the operand left by one before multiplying;
 *   - a constant scaled by 2^16 ("16-bit") yields x * c directly.
 * The "+ .5" rounds to nearest before the truncating cast to short.
 *
 * Despite the COSn names the values are the AA&N rotation factors
 * (the asm comments refer to them as A1..A4):
 *   COS2 = cos(pi/8) - cos(3pi/8)      (A2)
 *   COS4 = cos(pi/4)                   (A1 and A3)
 *   COS6 = cos(3pi/8)
 *   COS8 = cos(pi/8) + cos(3pi/8) - 1  (A4 - 1)
 */
#define COS2  ((short) (0.541196100 * (double)(1 << 15) + .5)) /* 15-bit */
#define COS4  ((short) (0.707106781 * (double)(1 << 15) + .5)) /* 15-bit */
#define COS6  ((short) (0.382683433 * (double)(1 << 16) + .5)) /* 16-bit */
#define COS8  ((short) (0.306562965 * (double)(1 << 16) + .5)) /* 16-bit */

/*
 * Coefficient table: each row replicates one constant across the four
 * 16-bit lanes of an MMX register.  dct_aan_pass addresses the rows by byte
 * offset (0, 8, 16, 24), so the row order and the four-entries-per-row
 * layout must not change.  Note that the first row stores COS2 negated.
 */
static short const _mmx_cos[] = {
  -COS2, -COS2, -COS2, -COS2,   /* byte offset  0: -A2       */
   COS4,  COS4,  COS4,  COS4,   /* byte offset  8:  A1 / A3  */
   COS6,  COS6,  COS6,  COS6,   /* byte offset 16:  COS6     */
   COS8,  COS8,  COS8,  COS8    /* byte offset 24:  A4 - 1   */
};

/*
 * One in-place 1-D AA&N DCT pass over four 16-bit columns at a time.
 *
 * cache: start of the data.  Eight "lines" are read at byte offsets
 *        0x00, 0x10, ..., 0x70 from this pointer; each line is one 8-byte
 *        movq, i.e. four 16-bit lanes processed in parallel.  Results are
 *        written back to the same eight locations.  Offset 0x70 doubles as
 *        scratch storage for v07 between STEP 2 and STEP 5.
 *
 * All additions and subtractions are the saturating word forms
 * (paddsw/psubsw).  Multiplications use pmulhw against the rows of
 * _mmx_cos, addressed by byte offset (0, 8, 16, 24).
 *
 * After each pmulhw the sequence "movq; psraw $15; psubsw" builds a mask
 * that is -1 in negative lanes and 0 elsewhere and subtracts it, which adds
 * 1 to every negative lane ("adjust multiply"); pmulhw on its own truncates
 * negative products toward minus infinity.
 * NOTE(review): the same +1 adjustment is also applied to v42/v43 and
 * v54..v57 after their add/sub even when `precision` is not defined and no
 * right shift took place -- confirm this is intended.
 *
 * The function leaves the CPU in MMX state: no emms is executed here.
 * NOTE(review): presumably a caller issues emms before any x87 code runs --
 * confirm.
 */
static void inline dct_aan_pass(dct_t *cache)
{
  /* _mmx_cos is an array of (signed) short; keep the pointer type matching
     instead of converting to a pointer to unsigned short. */
  register short const *mmx_cos = _mmx_cos;

  /*
   * Operands: %0 = cache, %1 = mmx_cos.  Neither register is modified by
   * the code, so both are plain inputs.  The data behind %0 is read and
   * written, which the "memory" clobber covers.  All eight MMX registers
   * are used as temporaries and are listed as clobbered.
   */
  __asm__ __volatile__ (
      /* STEP 1: load the eight lines, form sums and differences */
      "movq 0x00(%0),     %%mm0\n"           /* load line 0 */
      "movq 0x10(%0),     %%mm1\n"           /* load line 1 */
      "movq 0x20(%0),     %%mm2\n"           /* load line 2 */
      "movq 0x30(%0),     %%mm3\n"           /* load line 3 */
      "movq 0x40(%0),     %%mm4\n"           /* load line 4 */
      "movq 0x50(%0),     %%mm5\n"           /* load line 5 */
      "movq 0x60(%0),     %%mm6\n"           /* load line 6 */
      "movq 0x70(%0),     %%mm7\n"           /* load line 7 */
      "paddsw %%mm0, %%mm7\n"                /* line0 + line7 -> mm7 (v00) */
      "paddsw %%mm1, %%mm6\n"                /* line1 + line6 -> mm6 (v01) */
      "paddsw %%mm2, %%mm5\n"                /* line2 + line5 -> mm5 (v02) */
      "paddsw %%mm4, %%mm3\n"                /* line4 + line3 -> mm3 (v03) */
      "psubsw 0x70(%0), %%mm0\n"             /* line0 - line7 -> mm0 (v07) */
      "psubsw 0x60(%0), %%mm1\n"             /* line1 - line6 -> mm1 (v06) */
      "psubsw 0x50(%0), %%mm2\n"             /* line2 - line5 -> mm2 (v05) */
      "psubsw 0x30(%0), %%mm4\n"             /* line4 - line3 -> mm4 (-v04) */
      /* STEP 2 */
      "psubsw %%mm2, %%mm4\n"                /* -v04 - v05 -> mm4 (v14) */
      "paddsw %%mm1, %%mm2\n"                /*  v05 + v06 -> mm2 (v15) */
      "paddsw %%mm0, %%mm1\n"                /*  v06 + v07 -> mm1 (v16) */
      "movq %%mm0, 0x70(%0)\n"               /* spill v07 (line 7 slot is free) */
      "movq %%mm3, %%mm0\n"                  /*  v03 -> mm0 */
      "paddsw %%mm7, %%mm3\n"                /*  v00 + v03 -> mm3 (v10) */
      "psubsw %%mm0, %%mm7\n"                /*  v00 - v03 -> mm7 (v13) */
      "movq %%mm5, %%mm0\n"                  /*  v02 -> mm0 */
      "paddsw %%mm6, %%mm5\n"                /*  v01 + v02 -> mm5 (v11) */
      "psubsw %%mm0, %%mm6\n"                /*  v01 - v02 -> mm6 (v12) */
      /* STEP 3 */
      "movq %%mm5, %%mm0\n"                  /*  v11 -> mm0 */
      "paddsw %%mm3, %%mm5\n"                /*  v10 + v11 -> mm5 (v20) */
      "psubsw %%mm0, %%mm3\n"                /*  v10 - v11 -> mm3 (v21) */
      "paddsw %%mm7, %%mm6\n"                /*  v12 + v13 -> mm6 (v22) */
      "movq %%mm5, 0x00(%0)\n"               /* store line 0 */
      "movq %%mm3, 0x40(%0)\n"               /* store line 4 */
      "movq %%mm4, %%mm5\n"                  /*  v14 -> mm5 */
      "paddsw %%mm1, %%mm5\n"                /*  v14 + v16 -> mm5 */
#ifdef precision
      "psllw $0x02, %%mm5\n"                 /* precision(va0) += 2 bit */
#endif
      "pmulhw 16(%1), %%mm5\n"               /* (v14+v16)*COS6 -> mm5 (va0) */
      "movq %%mm5, %%mm3\n"                  /* mm5 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm5) -> mm3 */
      "psubsw %%mm3, %%mm5\n"                /* adjust multiply */
      /* STEP 4 */
#ifdef precision
      "psllw $0x03, %%mm6\n"                 /* 2*v22, plus 2 bits of precision */
#else
      "psllw $0x01, %%mm6\n"                 /* 2*v22 (coefficient is 15-bit) */
#endif
      "pmulhw  8(%1), %%mm6\n"               /* 2*v22*A1/2 -> mm6 (v32)*/
      "movq %%mm6, %%mm3\n"                  /* mm6 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm6) -> mm3 */
      "psubsw %%mm3, %%mm6\n"                /* adjust multiply */
#ifdef precision
      "psllw $0x03, %%mm2\n"                 /* 2*v15, plus 2 bits of precision */
#else
      "psllw $0x01, %%mm2\n"                 /* 2*v15 (coefficient is 15-bit) */
#endif
      "pmulhw  8(%1), %%mm2\n"               /* 2*v15*A3/2 -> mm2 (v35) */
      "movq %%mm2, %%mm3\n"                  /* mm2 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm2) -> mm3 */
      "psubsw %%mm3, %%mm2\n"                /* adjust multiply */
#ifdef precision
      "psllw $0x03, %%mm4\n"                 /* 2*v14, plus 2 bits of precision */
#else
      "psllw $0x01, %%mm4\n"                 /* 2*v14 (coefficient is 15-bit) */
#endif
      "pmulhw  0(%1), %%mm4\n"               /* 2 * v14 * -A2/2 -> mm4 */
      "movq %%mm4, %%mm3\n"                  /* mm4 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm4) -> mm3 */
      "psubsw %%mm3, %%mm4\n"                /* adjust multiply */
      "psubsw %%mm5, %%mm4\n"                /* v14 * -A2 - va0 -> mm4 (v34) */
#ifdef precision
      "psllw $0x02, %%mm1\n"                 /* precision(v16) += 2 bit */
#endif
      "psubsw %%mm1, %%mm5\n"                /* va0 - v16 -> mm5 */
      "pmulhw 24(%1), %%mm1\n"               /* v16 * (A4 - 1) -> mm1 */
      "movq %%mm1, %%mm3\n"                  /* mm1 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm1) -> mm3 */
      "psubsw %%mm3, %%mm1\n"                /* adjust multiply */
      "psubsw %%mm5, %%mm1\n"                /* v16 * A4 - va0 -> mm1 (v36) */
      /* STEP 5 */
      "movq 0x70(%0), %%mm0\n"               /* reload spilled v07 -> mm0 */
#ifdef precision
      "psllw $0x02, %%mm7\n"                 /* precision(v13) += 2 bit */
      "psllw $0x02, %%mm0\n"                 /* precision(v07) += 2 bit */
#endif
      "movq %%mm6, %%mm3\n"                  /* v32 -> mm3 */
      "paddsw %%mm7, %%mm6\n"                /* v13 + v32 -> mm6 (v42) */
      "psubsw %%mm3, %%mm7\n"                /* v13 - v32 -> mm7 (v43) */
#ifdef precision
      "psraw $0x02, %%mm6\n"                 /* precision(v42) -= 2 bit */
      "psraw $0x02, %%mm7\n"                 /* precision(v43) -= 2 bit */
#endif
      "movq %%mm6, %%mm3\n"                  /* mm6 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm6) -> mm3 */
      "psubsw %%mm3, %%mm6\n"                /* add 1 to negative lanes */
      "movq %%mm7, %%mm3\n"                  /* mm7 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm7) -> mm3 */
      "psubsw %%mm3, %%mm7\n"                /* add 1 to negative lanes */
      "movq %%mm6, 0x20(%0)\n"               /* store line 2 */
      "movq %%mm7, 0x60(%0)\n"               /* store line 6 */
      "movq %%mm2, %%mm5\n"                  /* v35 -> mm5 */
      "paddsw %%mm0, %%mm2\n"                /* v07 + v35 -> mm2 (v45) */
      "psubsw %%mm5, %%mm0\n"                /* v07 - v35 -> mm0 (v47) */
      /* STEP 6 */
      "movq %%mm4, %%mm3\n"                  /* v34 -> mm3 */
      "paddsw %%mm0, %%mm4\n"                /* v47 + v34 -> mm4 (v54) */
      "psubsw %%mm3, %%mm0\n"                /* v47 - v34 -> mm0 (v57) */
      "movq %%mm1, %%mm5\n"                  /* v36 -> mm5 */
      "paddsw %%mm2, %%mm1\n"                /* v45 + v36 -> mm1 (v55) */
      "psubsw %%mm5, %%mm2\n"                /* v45 - v36 -> mm2 (v56) */
#ifdef precision
      "psraw $0x02, %%mm4\n"                 /* precision(v54) -= 2 bit */
      "psraw $0x02, %%mm0\n"                 /* precision(v57) -= 2 bit */
      "psraw $0x02, %%mm1\n"                 /* precision(v55) -= 2 bit */
      "psraw $0x02, %%mm2\n"                 /* precision(v56) -= 2 bit */
#endif
      "movq %%mm4, %%mm3\n"                  /* mm4 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm4) -> mm3 */
      "psubsw %%mm3, %%mm4\n"                /* add 1 to negative lanes */
      "movq %%mm0, %%mm3\n"                  /* mm0 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm0) -> mm3 */
      "psubsw %%mm3, %%mm0\n"                /* add 1 to negative lanes */
      "movq %%mm1, %%mm3\n"                  /* mm1 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm1) -> mm3 */
      "psubsw %%mm3, %%mm1\n"                /* add 1 to negative lanes */
      "movq %%mm2, %%mm3\n"                  /* mm2 -> mm3 */
      "psraw $0x0f, %%mm3\n"                 /* sign(mm2) -> mm3 */
      "psubsw %%mm3, %%mm2\n"                /* add 1 to negative lanes */
      "movq %%mm1, 0x10(%0)\n"               /* store line 1 */
      "movq %%mm0, 0x30(%0)\n"               /* store line 3 */
      "movq %%mm4, 0x50(%0)\n"               /* store line 5 */
      "movq %%mm2, 0x70(%0)\n"               /* store line 7 */
      : /* no outputs: neither pointer register is written */
      : "r"(cache), "r"(mmx_cos)
      : "memory",
        "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
}


/*
 * In-place two-pass DCT of `block`.
 *
 * Each pass runs dct_aan_pass twice: once at `block` and once at
 * `block + 4` (dct_aan_pass handles four lanes per call).  Between the two
 * passes the block is transposed so the second pass works along the other
 * dimension.  The result is left in the orientation produced by that single
 * transpose; it is not transposed back here.
 * NOTE(review): assumes dct_t is a 16-bit type, so that `block + 4` is the
 * 8-byte step to the next group of four lanes -- confirm.
 */
static void inline dct(dct_t *block)
{
  dct_t *const first_half  = block;
  dct_t *const second_half = block + 4;

  /* first 1-D pass */
  dct_aan_pass(first_half);
  dct_aan_pass(second_half);

  /* swap rows and columns, then run the same pass again */
  transpose(block);

  dct_aan_pass(first_half);
  dct_aan_pass(second_half);
}
