
;	optimized mdct() for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

;		99/09/01	rewritten based on lamemdct.c (original Japanese note is mis-encoded; wording reconstructed -- confirm)
%include "nasm.h"

	; Exported entry point of this file.  globaldef/externdef/segment_* and
	; dwsizen are macros that come from nasm.h (not visible in this file).
	globaldef mdct_sub_SSE

	; Tables defined elsewhere.  All of them are read with movaps below, so
	; they have to be 16-byte aligned by whoever defines them.
	;   ca_arb, cs_arb : aliasing-reduction butterfly coefficients
	;                    (8 floats of each are loaded in mdct_sub_SSE)
	;   win_mdct       : window tables, addressed as win_mdct + block_type*36*4
	;   cos_s_mdct     : short-block cosine table, walked as 6 rows of 12 floats
	;   cos_l_*        : long-block cosine tables, grouped by the output
	;                    indices that are computed together in _long_mdct
	externdef ca_arb
	externdef cs_arb
	externdef win_mdct
	externdef cos_s_mdct
	externdef cos_l_017_2_3
	externdef cos_l_5_6_8_9
	externdef cos_l11121415
	externdef cos_l_1_71016
	externdef cos_l_4_0
	externdef cos_l_4_6

; Number of subbands; also the row stride (in floats) of the sample arrays.
%define	SBLIMIT	32

;&III_side_info_t.gr[0].ch[0].block_type = 12*4
; NOTE(review): the comment above says 12*4 but the define below uses
; dwsizen(4); the define was changed later ("by shigeo 00/01/30").
; Presumably dwsizen(n) converts a dword count to bytes -- confirm against
; nasm.h and the C struct layout.
%define Offset_block_type dwsizen(4) ; by shigeo 00/01/30

;sizeof(gr_info) = 28*4
%define Size_gr_info dwsizen(28)

	segment_data
		align	16
; Sign-bit mask for lanes 1 and 3 (0x80000000 is the float sign bit).
; XORing a vector with it negates the two odd lanes, i.e. the odd subbands
; of a group of four; see the xorps uses in _long_mdct and _short_mdct.
_MPMP	dd	0,0x80000000,0,0x80000000

	segment_code

;---------------------------------------------------------------------
; mdct_sub_SSE
;
; For every (granule, channel) pair: runs _long_mdct followed by the
; aliasing-reduction butterfly, or _short_mdct when (block_type & 3) == 2.
; Afterwards copies granule [mode_gr] of sb_sample to granule [0] for each
; channel so it can be used by the next call.
;
; History (the original Japanese notes are mis-encoded in this file; the
; English wording is a best-effort reconstruction -- confirm upstream):
;	99/11/01	first working version: 77k[clk](cf. 94k[clk] by C func.)
;	99/11/02	mdct_SSE and arb_SSE combined/optimised: 76k[clk]
;	99/11/18	movups eliminated: 74k(MT), 68k(ST)
;
; typedef float L3SBS[2][3][18][SBLIMIT]
; static void mdct_sub_SSE(L3SBS (*sb_sample), float (*mdct_freq)[2][576], int stereo, III_side_info_t *l3_side, int mode_gr )
;
; In:    all five arguments on the stack (see the "arg" %defines below)
; Saved: esi, edi, ebx, ebp (pushed on entry, popped before ret)
; Clobb: eax, ecx, edx, xmm0-xmm7, flags
; Note:  movaps is used on sb_sample, mdct_freq and the tables, so they
;        must be 16-byte aligned.
; NOTE(review): both loops are bottom-tested and the final copy loop counts
;        stereo down with dec/jnz, so stereo >= 1 and mode_gr >= 1 are
;        assumed -- confirm that callers guarantee this.
;{
		align	16
mdct_sub_SSE:
		push	esi
		push	edi
		push	ebx
		push	ebp

;	register gr_info *cod_info;
;	register float (*sample)[18][32], (*enc)[18];

;	int		block_type;

; Frame: (18*4+4)*4 = 304 bytes hold the 18*16-byte "fin" buffer plus
; 16 bytes of slack for aligning it; the remaining 12 bytes hold _ch
; (esp+304) and _gr (esp+308).
%define	local_area	12+(18*4+4)*4		; esp = fin = alloca(sizeof fin)
		sub		esp,local_area
		; edx = (esp+15) with the low 4 bits cleared: 16-byte aligned fin
		lea		edx,[esp+15]
		and		dl,0xf0
		
; arg
; Offsets: 16 bytes of pushed registers + local_area + 4-byte return address.
; These are only valid while esp is unchanged, i.e. not inside the callees.
%define	mode_gr				esp+16+local_area+20
%define	l3_side				esp+16+local_area+16
%define	stereo				esp+16+local_area+12
%define	mdct_freq			esp+16+local_area+8
%define	sb_sample			esp+16+local_area+4
; local
%define	_gr					esp+4+(18*4+4)*4
%define	_ch					esp+0+(18*4+4)*4
; fin lives in edx for the whole function; nothing below may clobber edx.
%define	fin					edx

;	for ( gr = 0; gr < mode_gr; gr++ ){
		xor		eax,eax
		mov		[_gr],eax
		jmp		short .f0		; skip the alignment padding

		align	16
.f0:
.lp0:

;		for ( ch = 0; ch < stereo; ch++ ){
		xor		eax,eax
		mov		[_ch],eax
		jmp		short .f1		; skip the alignment padding

		align	16
.f1:
.lp1:
;			cod_info = (gr_info *) &(l3_side->gr[gr].ch[ch]) ;
		mov		esi,[l3_side]
		mov		ebx,[_gr]
		shl		ebx,1
		add		ebx,[_ch]			; ebx = gr*2 + ch
		imul	ebx,Size_gr_info  ;28*4		; 28*4 = sizeof(gr_info)
;			block_type = cod_info->block_type;
; (previous version of the load, kept for reference)
;		mov		ecx,[esi+10*4+ebx+2*4]		; = block_type
		mov		ecx,[esi+ebx+Offset_block_type]		; = block_type
		and		ecx,3				; only the low two bits are used

; float (*sb_sample)[2][3][18][SBLIMIT]
;			sample=(*sb_sample)[ch];
		imul	esi,[_ch],3*18*SBLIMIT*4
		add		esi,[sb_sample]		; = sample
		imul	eax,[_gr],18*SBLIMIT*4
		add		esi,eax				; = sample[gr] = sb_sample[0][ch][gr]
; float (*mdct_freq)[2][576]
;			enc=((float (*)[2][32][18])mdct_freq)[gr][ch][0];
		imul	eax,[_ch],32*18*4
		imul	ebx,[_gr],2*32*18*4
		add		ebx,eax
		add		ebx,[mdct_freq]

		cmp		cl,2
		je		near .short_mdct
;	esi = sample[gr], ebx = enc[0]
;	(ecx = block_type and edx = fin are also inputs of _long_mdct)
		call	_long_mdct
;	esi = sample[gr], ebx = enc[32]
		sub		ebx,18*4		; = enc[31]

; aliasing reduction butterfly
;			enc=((float (*)[2][32][18])mdct_freq)[gr][ch];
; For band = 0..30 and i = 0..7, with lo = enc[band][17-i] and
; hi = enc[band+1][i]:
;	enc[band  ][17-i] = lo * cs_arb[i] + hi * ca_arb[i]
;	enc[band+1][i   ] = hi * cs_arb[i] - lo * ca_arb[i]
; xmm4/xmm5 = cs_arb[0..3]/[4..7], xmm6/xmm7 = ca_arb[0..3]/[4..7].
; eax is a negative byte offset from enc[31] that counts up to 0, so
; ebx+eax = &enc[band][0].  shufps ...,0x1B reverses the four lanes.
		movaps	xmm4,[cs_arb+ 0]
		movaps	xmm5,[cs_arb+16]
		movaps	xmm6,[ca_arb+ 0]
		movaps	xmm7,[ca_arb+16]
		mov		eax,-31*18*4
		jmp		.f12

; The loop body exists twice and the two copies alternate: .lp12 uses
; movaps, .f12 uses movlps/movhps pairs.  A band is 18*4 = 72 bytes, so
; the 16-byte alignment of the accessed addresses flips on every band.
; The loop is entered at .f12; .lp12 falls through into .f12, and only
; .f12 ends with the loop branch (31 iterations, the last one in .f12).
; NOTE(review): this presumes mdct_freq itself is 16-byte aligned -- confirm.
		align	16
.lp12:
		movaps	xmm0,[ebx+eax     +14*4]
		shufps	xmm0,xmm0,0x1B
		movaps	xmm2,xmm0
		mulps	xmm0,xmm4
		mulps	xmm2,xmm6
		movaps	xmm1,[ebx+eax+18*4+ 0*4]
		movaps	xmm3,xmm1
		mulps	xmm1,xmm4
		mulps	xmm3,xmm6
		addps	xmm0,xmm3
		subps	xmm1,xmm2
		shufps	xmm0,xmm0,0x1B
		movaps	[ebx+eax     +14*4],xmm0
		movaps	[ebx+eax+18*4+ 0*4],xmm1

		movaps	xmm2,[ebx+eax     +10*4]
		shufps	xmm2,xmm2,0x1B
		movaps	xmm0,xmm2
		mulps	xmm2,xmm5
		mulps	xmm0,xmm7
		movaps	xmm3,[ebx+eax+18*4+ 4*4]
		add		eax,18*4			; next band; the stores below compensate
		movaps	xmm1,xmm3
		mulps	xmm3,xmm5
		mulps	xmm1,xmm7
		addps	xmm2,xmm1
		subps	xmm3,xmm0
		shufps	xmm2,xmm2,0x1B
		movaps	[ebx+eax-18*4+10*4],xmm2
		movaps	[ebx+eax     + 4*4],xmm3
.f12:
		movlps	xmm0,[ebx+eax     +14*4]
		movhps	xmm0,[ebx+eax     +16*4]
		shufps	xmm0,xmm0,0x1B
		movaps	xmm2,xmm0
		mulps	xmm0,xmm4
		mulps	xmm2,xmm6
		movlps	xmm1,[ebx+eax+18*4+ 0*4]
		movhps	xmm1,[ebx+eax+18*4+ 2*4]
		movaps	xmm3,xmm1
		mulps	xmm1,xmm4
		mulps	xmm3,xmm6
		addps	xmm0,xmm3
		subps	xmm1,xmm2
		shufps	xmm0,xmm0,0x1B
		movlps	[ebx+eax     +14*4],xmm0
		movhps	[ebx+eax     +16*4],xmm0
		movlps	[ebx+eax+18*4+ 0*4],xmm1
		movhps	[ebx+eax+18*4+ 2*4],xmm1

		movlps	xmm2,[ebx+eax     +10*4]
		movhps	xmm2,[ebx+eax     +12*4]
		shufps	xmm2,xmm2,0x1B
		movaps	xmm0,xmm2
		mulps	xmm2,xmm5
		mulps	xmm0,xmm7
		movlps	xmm3,[ebx+eax+18*4+ 4*4]
		movhps	xmm3,[ebx+eax+18*4+ 6*4]
		; This add sets ZF for the jnz below; the SSE instructions in
		; between do not modify the flags.
		add		eax,18*4
		movaps	xmm1,xmm3
		mulps	xmm3,xmm5
		mulps	xmm1,xmm7
		addps	xmm2,xmm1
		subps	xmm3,xmm0
		shufps	xmm2,xmm2,0x1B
		movlps	[ebx+eax-18*4+10*4],xmm2
		movhps	[ebx+eax-18*4+12*4],xmm2
		movlps	[ebx+eax     + 4*4],xmm3
		movhps	[ebx+eax     + 6*4],xmm3
		jnz		near .lp12
		jmp		short .cont1

; block_type == 2: short transform, no aliasing reduction afterwards.
.short_mdct:
		call	_short_mdct

.cont1:
		mov		eax,[_ch]
		inc		eax
		cmp		eax,[stereo]
		mov		[_ch],eax			; mov leaves the flags of cmp intact
		jb		near .lp1
;		}

		mov		eax,[_gr]
		inc		eax
		cmp		eax,[mode_gr]
		mov		[_gr],eax			; mov leaves the flags of cmp intact
		jb		near .lp0
;	}

;      Save latest granule's subband samples to be used in
;      the next mdct call
; float (*sb_sample)[2][3][18][SBLIMIT]
		mov		edi,[sb_sample]		; sample=(*sb_sample)[0];
		imul	eax,[mode_gr],18*SBLIMIT*4
		lea		esi,[edi+eax]		; = sample[mode_gr]
;	for ( ch = 0; ch < stereo; ch++ ){
		mov		eax,[stereo]		; eax = channels left to copy

.lp3:
;		for ( k = 0; k < 18; k++ ){
;			for ( band = 0; band < SBLIMIT; band++ ){
;				sample[0][k][band] = sample[mode_gr][k][band];
		mov		ecx,18*SBLIMIT
		rep		movsd				; (original note mis-encoded; presumably "makes almost no difference" -- confirm)
;			}
;		}
; rep movsd advanced esi/edi by one granule; two more granules complete the
; 3-granule stride of one channel.
		add		esi,2*18*SBLIMIT*4	; advance pointer to the next channel
		add		edi,2*18*SBLIMIT*4	; advance pointer to the next channel

		dec		eax
		jnz		.lp3
;    }

		add		esp,local_area
		pop		ebp
		pop		ebx
		pop		edi
		pop		esi
		ret
;}

;---------------------------------------------------------------------
; _long_mdct -- windowed 36->18 point transform for block_type != 2,
;               processing the 32 subbands four at a time.
;
; History (the original Japanese notes are mis-encoded in this file; the
; English wording is a best-effort reconstruction -- confirm upstream):
;	99/07/19	tried using SSE, but it did not get faster (;_;) by K.SAKAI
;	99/07/21	rewrote it boldly: 806[clk]
;	99/09/01	rewrote it even more boldly: 600[clk]
;	99/09/04	fixed alignment, optimised FPU instructions: 530[clk]
;	99/09/07	a little unrolling, changed instructions: 490[clk]
;	99/09/08	changed instructions: 475[clk]
;	99/09/13	bug fix, optimised with *ss instructions: 430[clk](Thanks! URURI)
;	99/09/14	unrolled all loops after all: 400[clk]
;	99/09/29	changed register allocation: no change in speed
;	99/11/02	changed calling sequence, made it part of mdct_sub
;	99/11/18	rewrote short mdct, rewrote half of long mdct
;	99/11/20	rewritten for loop unrolling over band
;
; In:    esi = sample[gr]; rows of 32 floats, rows 0..35 are read
;        ebx = output, 18 floats per band
;        ecx = block_type, selects the window win_mdct + ecx*36 floats
;        edx = fin, 16-byte aligned scratch of 18 rows x 4 floats
; Out:   ebx advanced past the 32 bands (32*18 floats), ebp = 32
; Clobb: eax, ebp, xmm0-xmm7, flags
;        (edx is modified temporarily and is back at its entry value on ret;
;        ecx, esi, edi are not written)
; Note:  esi, the window and the cosine tables are read with movaps and so
;        must be 16-byte aligned.
;---------------------------------------------------------------------
		align	16
;			while(band < 32){
_long_mdct:
		xor		ebp,ebp			; band = 0;
		imul	eax,ecx,4*36
		add		eax,win_mdct		; eax = winp = window for this block type
		jmp		short .f0

		align	16
.lp0:
.f0:
; xmm7 = sign mask for lanes 1 and 3 (the odd bands of this group of four).
; It is reloaded on every pass because xmm7 is reused as scratch below.
		movaps	xmm7,[_MPMP]
; block_type != 2
; even
;	for ( k = 0; k < 18; k ++ ){
;		mdct_in[k   ] =  sample[ gr ][k][band];
;		mdct_in[k+18] =  sample[gr+1][k][band];
;	}
; odd
;	for ( k = 0; k < 18; k += 2 ){
;		mdct_in[k   ] =  sample[gr][k   ][band];
;		mdct_in[k+ 1] = -sample[gr][k+ 1][band];
;		mdct_in[k+18] =  sample[gr][k+18][band];
;		mdct_in[k+19] = -sample[gr][k+19][band];
;	}
;	for (k=0;k<9;k++){
;		fin[k] = winp[k] * in[k] - winp[17-k] * in[17-k];
;		fin[9+k] = winp[18+k] * in[18+k] + winp[35-k] * in[35-k];
;	}
;
; Each fin row below holds four bands (ebp..ebp+3).  In every row the
; odd-numbered input is XORed with xmm7, which applies the sign flip of the
; odd bands.  Window values are loaded four at a time into xmm5/xmm6 and
; broadcast with shufps (0x00/0x55/0xAA/0xFF select element 0/1/2/3).

; fin[17] = winp[27] * in[27] + winp[26] * in[26];
		movaps	xmm0,[esi + 32*27*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*26*4 + ebp*4]
		movaps	xmm6,[eax+24*4]			; = win[27:24]
		movaps	xmm3,xmm6
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0xFF
		shufps	xmm4,xmm4,0xAA
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		addps	xmm0,xmm1
		movaps	[fin+17*16],xmm0

; fin[16] = winp[28] * in[28] + winp[25] * in[25];
		movaps	xmm1,[esi + 32*25*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32*28*4 + ebp*4]
		movaps	xmm5,[eax+28*4]			; = win[31:28]
		movaps	xmm3,xmm5
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0x00
		shufps	xmm4,xmm4,0x55
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		addps	xmm0,xmm1
		movaps	[fin+16*16],xmm0

; fin[15] = winp[29] * in[29] + winp[24] * in[24];
		movaps	xmm0,[esi + 32*29*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*24*4 + ebp*4]
		movaps	xmm3,xmm5
		shufps	xmm3,xmm3,0x55
		shufps	xmm6,xmm6,0x00
		mulps	xmm0,xmm3
		mulps	xmm1,xmm6
		addps	xmm0,xmm1
		movaps	[fin+15*16],xmm0

; fin[14] = winp[30] * in[30] + winp[23] * in[23];
		movaps	xmm1,[esi + 32*23*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32*30*4 + ebp*4]
		movaps	xmm6,[eax+20*4]			; = win[23:20]
		movaps	xmm3,xmm5
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0xAA
		shufps	xmm4,xmm4,0xFF
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		addps	xmm0,xmm1
		movaps	[fin+14*16],xmm0

; fin[13] = winp[31] * in[31] + winp[22] * in[22];
		movaps	xmm0,[esi + 32*31*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*22*4 + ebp*4]
		movaps	xmm4,xmm6
		shufps	xmm5,xmm5,0xFF
		shufps	xmm4,xmm4,0xAA
		mulps	xmm0,xmm5
		mulps	xmm1,xmm4
		addps	xmm0,xmm1
		movaps	[fin+13*16],xmm0

; fin[12] = winp[32] * in[32] + winp[21] * in[21];
		movaps	xmm1,[esi + 32*21*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32*32*4 + ebp*4]
		movaps	xmm5,[eax+32*4]			; = win[35:32]
		movaps	xmm3,xmm5
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0x00
		shufps	xmm4,xmm4,0x55
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		addps	xmm0,xmm1
		movaps	[fin+12*16],xmm0

; fin[11] = winp[33] * in[33] + winp[20] * in[20];
		movaps	xmm0,[esi + 32*33*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*20*4 + ebp*4]
		movaps	xmm3,xmm5
		shufps	xmm3,xmm3,0x55
		shufps	xmm6,xmm6,0x00
		mulps	xmm0,xmm3
		mulps	xmm1,xmm6
		addps	xmm0,xmm1
		movaps	[fin+11*16],xmm0

; fin[10] = winp[34] * in[34] + winp[19] * in[19];
		movaps	xmm1,[esi + 32*19*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32*34*4 + ebp*4]
		movaps	xmm6,[eax+16*4]			; = win[19:16]
		movaps	xmm3,xmm5
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0xAA
		shufps	xmm4,xmm4,0xFF
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		addps	xmm0,xmm1
		movaps	[fin+10*16],xmm0

; fin[ 9] = winp[35] * in[35] + winp[18] * in[18];
		movaps	xmm0,[esi + 32*35*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*18*4 + ebp*4]
		movaps	xmm4,xmm6
		shufps	xmm5,xmm5,0xFF
		shufps	xmm4,xmm4,0xAA
		mulps	xmm0,xmm5
		mulps	xmm1,xmm4
		addps	xmm0,xmm1
		movaps	[fin+ 9*16],xmm0

; fin[ 0] = winp[ 0] * in[ 0] - winp[17] * in[17];
; (xmm6 still holds win[19:16] here, so 0x55 selects win[17])
		movaps	xmm1,[esi + 32*17*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32* 0*4 + ebp*4]
		movaps	xmm5,[eax+ 0*4]			; = win[ 3: 0]
		movaps	xmm3,xmm5
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0x00
		shufps	xmm4,xmm4,0x55
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		subps	xmm0,xmm1
		movaps	[fin+ 0*16],xmm0

; fin[ 1] = winp[ 1] * in[ 1] - winp[16] * in[16];
		movaps	xmm0,[esi + 32* 1*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*16*4 + ebp*4]
		movaps	xmm3,xmm5
		shufps	xmm3,xmm3,0x55
		shufps	xmm6,xmm6,0x00
		mulps	xmm0,xmm3
		mulps	xmm1,xmm6
		subps	xmm0,xmm1
		movaps	[fin+ 1*16],xmm0

; fin[ 2] = winp[ 2] * in[ 2] - winp[15] * in[15];
		movaps	xmm1,[esi + 32*15*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32* 2*4 + ebp*4]
		movaps	xmm6,[eax+12*4]			; = win[15:12]
		movaps	xmm3,xmm5
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0xAA
		shufps	xmm4,xmm4,0xFF
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		subps	xmm0,xmm1
		movaps	[fin+ 2*16],xmm0

; fin[ 3] = winp[ 3] * in[ 3] - winp[14] * in[14];
		movaps	xmm0,[esi + 32* 3*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*14*4 + ebp*4]
		movaps	xmm4,xmm6
		shufps	xmm5,xmm5,0xFF
		shufps	xmm4,xmm4,0xAA
		mulps	xmm0,xmm5
		mulps	xmm1,xmm4
		subps	xmm0,xmm1
		movaps	[fin+ 3*16],xmm0

; fin[ 4] = winp[ 4] * in[ 4] - winp[13] * in[13];
		movaps	xmm1,[esi + 32*13*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32* 4*4 + ebp*4]
		movaps	xmm5,[eax+ 4*4]			; = win[ 7: 4]
		movaps	xmm3,xmm5
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0x00
		shufps	xmm4,xmm4,0x55
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		subps	xmm0,xmm1
		movaps	[fin+ 4*16],xmm0

; fin[ 5] = winp[ 5] * in[ 5] - winp[12] * in[12];
		movaps	xmm0,[esi + 32* 5*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*12*4 + ebp*4]
		movaps	xmm3,xmm5
		shufps	xmm3,xmm3,0x55
		shufps	xmm6,xmm6,0x00
		mulps	xmm0,xmm3
		mulps	xmm1,xmm6
		subps	xmm0,xmm1
		movaps	[fin+ 5*16],xmm0

; fin[ 6] = winp[ 6] * in[ 6] - winp[11] * in[11];
		movaps	xmm1,[esi + 32*11*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32* 6*4 + ebp*4]
		movaps	xmm6,[eax+ 8*4]			; = win[11: 8]
		movaps	xmm3,xmm5
		movaps	xmm4,xmm6
		shufps	xmm3,xmm3,0xAA
		shufps	xmm4,xmm4,0xFF
		mulps	xmm0,xmm3
		mulps	xmm1,xmm4
		subps	xmm0,xmm1
		movaps	[fin+ 6*16],xmm0

; fin[ 7] = winp[ 7] * in[ 7] - winp[10] * in[10];
		movaps	xmm0,[esi + 32* 7*4 + ebp*4]
		xorps	xmm0,xmm7
		movaps	xmm1,[esi + 32*10*4 + ebp*4]
		movaps	xmm4,xmm6
		shufps	xmm5,xmm5,0xFF
		shufps	xmm4,xmm4,0xAA
		mulps	xmm0,xmm5
		mulps	xmm1,xmm4
		subps	xmm0,xmm1
		movaps	[fin+ 7*16],xmm0

; fin[ 8] = winp[ 8] * in[ 8] - winp[ 9] * in[ 9];
		movaps	xmm1,[esi + 32* 9*4 + ebp*4]
		xorps	xmm1,xmm7
		movaps	xmm0,[esi + 32* 8*4 + ebp*4]
		movaps	xmm3,xmm6
		shufps	xmm3,xmm3,0x00
		shufps	xmm6,xmm6,0x55
		mulps	xmm0,xmm3
		mulps	xmm1,xmm6
		subps	xmm0,xmm1
		movaps	[fin+ 8*16],xmm0

; edx = fin, esi = sample[gr], ebp = band, eax = win, ebx = out
; First pass: one band per iteration, four iterations.  fin is advanced by
; 4 bytes per iteration so that [fin+k*16] addresses row k of the current
; band; ebx is advanced by one band (18 floats).
.lp1:
; 4 data parallel x 3
; m = 0, 17, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15
;	sum  = ( fin[ 0] ) * cos_l[m][ 0]; /* 17 */
;	sum += ( fin[ 1] ) * cos_l[m][ 1]; /* 15 */
;	sum += ( fin[ 2] ) * cos_l[m][ 2]; /* 13 */
;	sum += ( fin[ 3] ) * cos_l[m][ 3]; /* 11 */
;	sum += ( fin[ 4] ) * cos_l[m][ 4]; /*  9 */
;	sum += ( fin[ 5] ) * cos_l[m][ 5]; /*  7  */
;	sum += ( fin[ 6] ) * cos_l[m][ 6]; /*  5  */
;	sum += ( fin[ 7] ) * cos_l[m][ 7]; /*  3 */
;	sum += ( fin[ 8] ) * cos_l[m][ 8]; /*  1  */
;	sum += ( fin[ 9] ) * cos_l[m][ 9]; /* 19*/
;	sum += ( fin[10] ) * cos_l[m][10]; /* 21 */
;	sum += ( fin[11] ) * cos_l[m][11]; /* 23 */
;	sum += ( fin[12] ) * cos_l[m][12]; /* 25 */
;	sum += ( fin[13] ) * cos_l[m][13]; /* 27 */
;	sum += ( fin[14] ) * cos_l[m][14]; /* 29 */
;	sum += ( fin[15] ) * cos_l[m][15]; /* 31*/
;	sum += ( fin[16] ) * cos_l[m][16]; /* 33 */
;	sum += ( fin[17] ) * cos_l[m][17]; /* 35 */
;	out[m]=sum;
;
; xmm0, xmm1, xmm2 are the three accumulators (four outputs each); each
; cosine table holds one 4-float row per fin row.

		movss	xmm2,[fin+ 0*4]			; fin row 0, current band
		movss	xmm5,[fin+ 4*4]			; fin row 1, current band
		shufps	xmm2,xmm2,0
		shufps	xmm5,xmm5,0

		movaps	xmm0,[cos_l_017_2_3+ 0*4]
		mulps	xmm0,xmm2		; = {out[ 3],out[ 2],out[17],out[ 0]}
		movaps	xmm1,[cos_l_5_6_8_9+ 0*4]
		mulps	xmm1,xmm2		; = {out[ 9],out[ 8],out[ 6],out[ 5]}
		mulps	xmm2,[cos_l11121415+ 0*4]		; = {out[15],out[14],out[12],out[11]}

		movaps	xmm3,[cos_l_017_2_3+(0+4)*4]
		mulps	xmm3,xmm5
		movaps	xmm4,[cos_l_5_6_8_9+(0+4)*4]
		mulps	xmm4,xmm5
		mulps	xmm5,[cos_l11121415+(0+4)*4]

		addps	xmm0,xmm3		; = {out[ 3],out[ 2],out[17],out[ 0]}
		addps	xmm1,xmm4		; = {out[ 9],out[ 8],out[ 6],out[ 5]}
		addps	xmm2,xmm5		; = {out[15],out[14],out[12],out[11]}

; iteration n xmm3 xmm4 xmm5 xmm6 xmm7
; Accumulates fin rows n/4 and n/4+1 of the current band into xmm0-xmm2.
;   %1      = float offset used for both fin and the cosine tables
;   %2 .. %6 = five distinct scratch registers (all are overwritten)
%macro iteration 6
		movss	%2,[fin+(%1+0)*4]
		movss	%3,[fin+(%1+4)*4]
		shufps	%2,%2,0
		shufps	%3,%3,0

		movaps	%4,[cos_l_017_2_3+%1*4]
		mulps	%4,%2
		movaps	%5,[cos_l_5_6_8_9+%1*4]
		mulps	%5,%2
		mulps	%2,[cos_l11121415+%1*4]

		addps	xmm0,%4		; = {out[ 3],out[ 2],out[17],out[ 0]}
		addps	xmm1,%5		; = {out[ 9],out[ 8],out[ 6],out[ 5]}
		addps	xmm2,%2		; = {out[15],out[14],out[12],out[11]}

		movaps	%6,[cos_l_017_2_3+(%1+4)*4]
		mulps	%6,%3
		movaps	%4,[cos_l_5_6_8_9+(%1+4)*4]
		mulps	%4,%3
		mulps	%3,[cos_l11121415+(%1+4)*4]

		addps	xmm0,%6		; = {out[ 3],out[ 2],out[17],out[ 0]}
		addps	xmm1,%4		; = {out[ 9],out[ 8],out[ 6],out[ 5]}
		addps	xmm2,%3		; = {out[15],out[14],out[12],out[11]}
%endmacro
; (original notes mis-encoded; presumably, confirm:)
; order in which the arguments are first used:   %2,%3,%4,%5,%6
; order in which they become free for reuse:     %5,%2,%6,%4,%3
; Each invocation below passes the registers in the "free for reuse" order
; of the previous one.
		iteration  8,xmm6,xmm7,xmm3,xmm4,xmm5
		iteration 16,xmm4,xmm6,xmm5,xmm3,xmm7
		iteration 24,xmm3,xmm4,xmm7,xmm5,xmm6
		iteration 32,xmm5,xmm3,xmm6,xmm7,xmm4
		iteration 40,xmm7,xmm5,xmm4,xmm6,xmm3
		iteration 48,xmm6,xmm7,xmm3,xmm4,xmm5
		iteration 56,xmm4,xmm6,xmm5,xmm3,xmm7
		iteration 64,xmm3,xmm4,xmm7,xmm5,xmm6

; Scatter the 12 results to out[m] of the current band.
		movss	[ebx+ 0*4],xmm0			; out[0]
		movhps	[ebx+ 2*4],xmm0			; out[2], out[3]
		movlps	[ebx+ 5*4],xmm1			; out[5], out[6]
		movhps	[ebx+ 8*4],xmm1			; out[8], out[9]
		shufps	xmm0,xmm0,0x55			; bring out[17] to lane 0
		movlps	[ebx+11*4],xmm2			; out[11], out[12]
		movhps	[ebx+14*4],xmm2			; out[14], out[15]
		movss	[ebx+17*4],xmm0			; out[17]

		add		fin,4
		add		ebx,18*4		; mdct_out
		test	fin,12			; low bits wrap to 0 after four bands
		jnz		near .lp1
		sub		fin,16			; restore fin to the start of the buffer
		add		ebp,4			; band += 4 (tested at the end of .lp0)

; Combine rows, four bands at a time; results overwrite fin rows 0..5.
; fin[ 0] = fin[ 0] + fin[ 5] + fin[15]
; fin[ 1] = fin[ 1] + fin[ 4] + fin[16]
; fin[ 2] = fin[ 2] + fin[ 3] + fin[17]
		movaps	xmm6,[fin+ 0*16]
		movaps	xmm1,[fin+ 1*16]
		movaps	xmm2,[fin+ 2*16]
		addps	xmm6,[fin+ 5*16]
		addps	xmm1,[fin+ 4*16]
		addps	xmm2,[fin+ 3*16]
		addps	xmm6,[fin+15*16]
		addps	xmm1,[fin+16*16]
		addps	xmm2,[fin+17*16]
		movaps	[fin+ 0*16],xmm6
		movaps	[fin+ 1*16],xmm1
		movaps	[fin+ 2*16],xmm2

; fin[ 3] = fin[ 6] - fin[ 9] + fin[14]
; fin[ 4] = fin[ 7] - fin[10] + fin[13]
; fin[ 5] = fin[ 8] - fin[11] + fin[12]
		movaps	xmm7,[fin+ 6*16]
		movaps	xmm4,[fin+ 7*16]
		movaps	xmm5,[fin+ 8*16]
		subps	xmm7,[fin+ 9*16]
		subps	xmm4,[fin+10*16]
		subps	xmm5,[fin+11*16]
		addps	xmm7,[fin+14*16]
		addps	xmm4,[fin+13*16]
		addps	xmm5,[fin+12*16]
		movaps	[fin+ 3*16],xmm7
		movaps	[fin+ 4*16],xmm4
		movaps	[fin+ 5*16],xmm5

; (in terms of the fin values before the combination above:)
; fin[0] = (fin[0]+fin[5]+fin[15])-(fin[1]+fin[4]+fin[16])+(fin[8]-fin[11]+fin[12]);
; fin[6] = (fin[6]-fin[9]+fin[14])-(fin[2]+fin[3]+fin[17])+(fin[7]-fin[10]+fin[13]);
; These two sums stay in xmm6 and xmm7; they are not written back to fin.
		subps	xmm6,xmm1
		subps	xmm7,xmm2
		addps	xmm6,xmm5
		addps	xmm7,xmm4

; cos_l13_0 = -cos_l_4_6
; cos_l13_6 = +cos_l_4_0
		movss	xmm2,[cos_l_4_0]
		movss	xmm3,[cos_l_4_6]
		shufps	xmm2,xmm2,0x00
		shufps	xmm3,xmm3,0x00
		movaps	xmm0,xmm2
		movaps	xmm1,xmm3

;		/* 4 */
; sum  = ((fin[0]+fin[5]+fin[15])-(fin[1]+fin[4]+fin[16])+(fin[8]-fin[11]+fin[12]))*cos_l[4][0];
; sum += ((fin[6]-fin[9]+fin[14])-(fin[2]+fin[3]+fin[17])+(fin[7]-fin[10]+fin[13]))*cos_l[4][6];
;		out[4]=sum;
; ebx is four bands past the start of this group, hence the -18*16;
; lanes 0..3 go to bands 0..3 (offsets +0, +18, +36, +54 floats).
		mulps	xmm2,xmm6
		mulps	xmm3,xmm7
		addps	xmm2,xmm3
		movhlps	xmm3,xmm2
		movss	[ebx-18*16+( 4+ 0)*4],xmm2
		shufps	xmm2,xmm2,0x55
		movss	[ebx-18*16+( 4+36)*4],xmm3
		shufps	xmm3,xmm3,0x55
		movss	[ebx-18*16+( 4+18)*4],xmm2
		movss	[ebx-18*16+( 4+54)*4],xmm3

;		/* 13 */
; sum  = ((fin[0]+fin[5]+fin[15])-(fin[1]+fin[4]+fin[16])+(fin[8]-fin[11]+fin[12]))*cos_l[13][0];
; sum += ((fin[6]-fin[9]+fin[14])-(fin[2]+fin[3]+fin[17])+(fin[7]-fin[10]+fin[13]))*cos_l[13][6];
;		out[13]=sum;
		mulps	xmm1,xmm6
		mulps	xmm0,xmm7
		subps	xmm0,xmm1
		movhlps	xmm1,xmm0
		movss	[ebx-18*16+(13+ 0)*4],xmm0
		shufps	xmm0,xmm0,0x55
		movss	[ebx-18*16+(13+36)*4],xmm1
		shufps	xmm1,xmm1,0x55
		movss	[ebx-18*16+(13+18)*4],xmm0
		movss	[ebx-18*16+(13+54)*4],xmm1

		sub		ebx,18*4*4		; rewind mdct_out
		jmp		short .lp2		; skip the alignment padding

; Second pass: one band per iteration again, using the combined rows 0..5.
		align 16
.lp2:
		movss	xmm0,[fin+ 0*16]
		movss	xmm1,[fin+ 1*16]
		movss	xmm2,[fin+ 2*16]
		movss	xmm3,[fin+ 3*16]
		movss	xmm4,[fin+ 4*16]
		movss	xmm5,[fin+ 5*16]
		shufps	xmm0,xmm0,0x00
		shufps	xmm1,xmm1,0x00
		shufps	xmm2,xmm2,0x00
		shufps	xmm3,xmm3,0x00
		shufps	xmm4,xmm4,0x00
		shufps	xmm5,xmm5,0x00

; m = 1, 7, 10, 16
;	sum  = ( fin[ 0]+fin[ 5]+fin[15] ) * cos_l[m][0]; /* mfc=15 0 */
;	sum += ( fin[ 1]+fin[ 4]+fin[16] ) * cos_l[m][1]; /* mfc= 9 1 */
;	sum += ( fin[ 2]+fin[ 3]+fin[17] ) * cos_l[m][2]; /* mfc= 3 2 */
;	sum += ( fin[ 6]-fin[ 9]+fin[14] ) * cos_l[m][6]; /* mfc=21 6 */
;	sum += ( fin[ 7]-fin[10]+fin[13] ) * cos_l[m][7]; /* mfc=27 7 */
;	sum += ( fin[ 8]-fin[11]+fin[12] ) * cos_l[m][8]; /* mfc=28 8 */
;	out[m]=sum;

		mulps	xmm0,[cos_l_1_71016+ 0*4]
		mulps	xmm1,[cos_l_1_71016+ 4*4]
		addps	xmm1,xmm0
		mulps	xmm2,[cos_l_1_71016+ 8*4]
		addps	xmm2,xmm1
		mulps	xmm3,[cos_l_1_71016+12*4]
		addps	xmm3,xmm2
		mulps	xmm4,[cos_l_1_71016+16*4]
		addps	xmm4,xmm3
		mulps	xmm5,[cos_l_1_71016+20*4]
		addps	xmm5,xmm4	; = {out[16], out[10], out[ 7], out[ 1]}

		movhlps	xmm3,xmm5
		movss	[ebx+ 1*4],xmm5
		shufps	xmm5,xmm5,0x55
		movss	[ebx+10*4],xmm3
		shufps	xmm3,xmm3,0x55
		movss	[ebx+ 7*4],xmm5
		movss	[ebx+16*4],xmm3

		add		fin,4
		add		ebx,18*4		; mdct_out
		test	fin,12			; low bits wrap to 0 after four bands
		jnz		near .lp2
		sub		fin,16			; restore fin to the start of the buffer

		cmp		ebp,32
		jb		near .lp0
;			}	; end of while(band < 32)

		ret

; block_type == 2
;---------------------------------------------------------------------
; _short_mdct -- three overlapping 12-point transforms per subband,
;                processing the 32 subbands four at a time.
;
; In:    esi = sample[gr]; rows of 32 floats, rows 6..29 are read
;        ebx = out, 18 floats per band
; Out:   ebx advanced past the 32 bands (32*18 floats), ebp = 32
; Clobb: eax (walks cos_s_mdct), ebp (band index), xmm0-xmm7, flags
;        ecx, edx, esi, edi: unchanged
; Note:  esi is read with movaps and so must be 16-byte aligned.
;
; Per row m of cos_s_mdct (12 floats each, 6 rows) the accumulators are
;	xmm5 = sum over k<12 of cos_s_mdct[m][k] * in[k+ 6]   -> out[3*m+0]
;	xmm6 = sum over k<12 of cos_s_mdct[m][k] * in[k+12]   -> out[3*m+1]
;	xmm7 = sum over k<12 of cos_s_mdct[m][k] * in[k+18]   -> out[3*m+2]
; with one band per lane.
;
; even
;	for ( k = 0; k < 18; k ++ ){
;		mdct_in[k   ] =  sample[ gr ][k][band];
;		mdct_in[k+18] =  sample[gr+1][k][band];
;	}
;	for ( m = 0; m < 6; m++ ){
;		for ( k = 0; k < 12; k++ ){
;			sum += {in[k+18],in[k+12],in[k+ 6]} * {3{cos_s_mdct[m][k]}};
;		}
;		out[3*m+2:3*m+0] = sum;
;	}
; odd
;	for ( k = 0; k < 18; k += 2 ){
;		mdct_in[k   ] =  sample[ gr ][k][band];
;		mdct_in[k+18] =  sample[gr+1][k][band];
;		mdct_in[k+ 1] = -sample[ gr ][k+1][band];
;		mdct_in[k+19] = -sample[gr+1][k+1][band];
;	}
;	for ( m = 0; m < 6; m++ ){
;		for ( k = 0; k < 12; k++ ){
;			sum += {in[k+18],in[k+12],in[k+ 6]} * {3{cos_s_mdct[m][k]}};
;		}
;		out[3*m+2:3*m+0] = sum;
;	}

; loop unrolling over band (original note mis-encoded; wording reconstructed)
;			for ( band = 0; band < 32; band+=4 ) {
		align	16
_short_mdct:
		xor		ebp,ebp
.lp0:	mov		eax,cos_s_mdct		; eax = &cos_s_mdct[m = 0][0]
		jmp		short .lp1		; skip the alignment padding

		align	16
; k = 0 term: initialises the three accumulators (no sign flip, k is even).
.lp1:	movss	xmm0,[eax +  0*4]	; cos_s_mdct[m][ 0]
		shufps	xmm0,xmm0,0
		movss	xmm3,[eax +  6*4]	; cos_s_mdct[m][ 6]
		shufps	xmm3,xmm3,0
		movaps	xmm5,[esi + 32*12*4 + ebp*4]
		movaps	xmm6,xmm5
		mulps	xmm6,xmm0
		mulps	xmm5,xmm3
		movaps	xmm2,[esi + 32*18*4 + ebp*4]
		movaps	xmm7,xmm2
		mulps	xmm7,xmm0
		mulps	xmm2,xmm3
		addps	xmm6,xmm2
		mulps	xmm0,[esi + 32* 6*4 + ebp*4]
		addps	xmm5,xmm0
		mulps	xmm3,[esi + 32*24*4 + ebp*4]
		addps	xmm7,xmm3

; short_mdct_quad_body k
; Adds the terms for cos_s_mdct[m][k] and cos_s_mdct[m][k+6] to xmm5-xmm7.
; For odd k all four sample rows touched (k+6, k+12, k+18, k+24) are odd,
; so instead of negating the samples of the odd bands the two broadcast
; cosines are XORed with _MPMP (sign flip in lanes 1 and 3).
; Clobbers xmm0-xmm4.
%macro	short_mdct_quad_body	1
%if (%1 % 2)
		movaps	xmm4,[_MPMP]
%endif
		movss	xmm0,[eax +  %1*4]		; cos_s_mdct[m][ %1]
		shufps	xmm0,xmm0,0
%if (%1 % 2)
		xorps	xmm0,xmm4
%endif
		movss	xmm3,[eax + (%1+ 6)*4]	; cos_s_mdct[m][(%1+ 6)]
		shufps	xmm3,xmm3,0
%if (%1 % 2)
		xorps	xmm3,xmm4
%endif
		movaps	xmm1,[esi + 32*(%1+12)*4 + ebp*4]
		movaps	xmm4,xmm1
		mulps	xmm4,xmm0
		addps	xmm6,xmm4
		mulps	xmm1,xmm3
		addps	xmm5,xmm1
		movaps	xmm2,[esi + 32*(%1+18)*4 + ebp*4]
		movaps	xmm4,xmm2
		mulps	xmm4,xmm0
		addps	xmm7,xmm4
		mulps	xmm2,xmm3
		addps	xmm6,xmm2
		mulps	xmm0,[esi + 32*(%1+ 6)*4 + ebp*4]
		addps	xmm5,xmm0
		mulps	xmm3,[esi + 32*(%1+24)*4 + ebp*4]
		addps	xmm7,xmm3
%endmacro
		short_mdct_quad_body	1
		short_mdct_quad_body	2
		short_mdct_quad_body	3
		short_mdct_quad_body	4
		short_mdct_quad_body	5

; Transpose and store: out[3*m+0..2] for bands 0..3 (band stride 18 floats).
		movaps	xmm4,xmm5
		unpcklps	xmm4,xmm6		; = {x5[0],x6[0],x5[1],x6[1]}
		unpckhps	xmm5,xmm6		; = {x5[2],x6[2],x5[3],x6[3]}
		movlps	[ebx+     0*4],xmm4	; band 0: out[3m+0], out[3m+1]
		movhps	[ebx+18*4+0*4],xmm4	; band 1: out[3m+0], out[3m+1]
		movhlps	xmm2,xmm7
		movss	[ebx+     2*4],xmm7	; band 0: out[3m+2]
		movss	[ebx+36*4+2*4],xmm2	; band 2: out[3m+2]
		movlps	[ebx+36*4+0*4],xmm5	; band 2: out[3m+0], out[3m+1]
		movhps	[ebx+54*4+0*4],xmm5	; band 3: out[3m+0], out[3m+1]
		shufps	xmm7,xmm7,00111001B	; rotate lanes down by one
		movhlps	xmm3,xmm7
		movss	[ebx+18*4+2*4],xmm7	; band 1: out[3m+2]
		movss	[ebx+54*4+2*4],xmm3	; band 3: out[3m+2]

		add		eax,12*4		; next row m of cos_s_mdct
		add		ebx,3*4
		cmp		eax,cos_s_mdct+12*6*4
		; eax is an address, so the bound check must be unsigned (jb, as in
		; the other loops of this file); a signed jl would give the wrong
		; answer if the table straddled the 0x80000000 boundary.
		jb		near .lp1

		add		ebp,4			; band += 4
		add		ebx,54*4		; 18 floats done above + 54 = 4 bands
		cmp		ebp,32
		jb		near .lp0
;			}	; end of while(band < 32)
		ret

		end

