
;	optimized mdct() for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

;	99/09/15	split off from subband.nas
;	variable names changed:
;	off -> sbd_off
;	xxx -> sbd_xxx
;	enwindow_sse -> enwindow

%include "nasm.h"

	globaldef	window_filter_subband_SSE
	externdef	enwindow
	externdef	idct_coefficient
	externdef	sbd_off
	externdef	sbd_xxx

; ---- assemble-time constants -------------------------------------------
HAN_SIZE	equ	512		; per the original note: defined in common.h
SBLIMIT		equ	32
F_SIZE		equ	4		; used below as the byte size of one float element
%define F_PTR	dword

	segment_data

; ---- constants and scratch variables -----------------------------------
		align	16
SCALER:		times 2 dd 0x38000000		; 2^-15 as IEEE-754 single (1.0/32768, see common.h)

		align	16
Q_MMMM:		times 4 dd 0x80000000		; sign bit set in each of the four lanes
s_ptr:		dd	0			; scratch dword, written/read by the routine below
saveOff:	dd	0			; scratch dword, written/read by the routine below

	segment_code

;	99/07/19	SSE support: window_filter_subbandKNI()
;	99/07/21	117k[clk]@PIII
;	99/07/22 	loop unrolling, 102k[clk]@PIII
;	99/07/23	code reworked, 90k -> 85k[clk]
;	99/09/01	now takes mode_gr as an argument, to support LSF (22.05kHz) encoding
;	99/09/03	improved iDCT routine, 68k[clk]@PIII
;	99/09/04	changed register allocation, 62k[clk]@PIII
;	99/09/29	reduced load/store count; run time unchanged
;	99/11/14	eliminated movups, 59k[clk]@PIII
; void window_filter_subband_SSE(int *win_buf,int ch ,float *s,int mode_gr)
;
; For each of 18*mode_gr passes:
;   1. convert 32 ints from win_buf to float, scale by SCALER (2^-15) and
;      store them in reversed order into the 512-entry circular buffer
;      c = sbd_xxx[ch != 0] at c[offset .. offset+31];
;   2. form 64 windowed sums y[0..63]; each is 8 products of enwindow
;      entries with c[] entries spaced 64 apart (index wrapped to 0..511);
;   3. fold y[] into 32 values yprime[] (upper 16 kept sign-inverted);
;   4. for 16 rows of idct_coefficient (row stride 32 floats) take the dot
;      product with yprime[] and write two outputs per row, s[i] and s[31-i];
;   5. advance s by 32 floats, win_buf by 32 ints, offset by -32 (mod 512).
; On exit the final offset is written back to sbd_off.
;
; In  (32-bit stack arguments, read relative to esp after the 4 pushes):
;	[esp+_P+ 4] = win_buf	source ints, read 8 bytes at a time by cvtpi2ps
;	[esp+_P+ 8] = ch	0 selects sbd_xxx[0], non-zero selects sbd_xxx[1]
;	[esp+_P+12] = s		destination floats (32 per pass, scalar stores)
;	[esp+_P+16] = mode_gr	original note: 1 or 2
; Out:	nothing in registers; s[] filled, sbd_xxx / sbd_off updated.
; Saved:   ebx, esi, edi, ebp (pushed/popped here).
; Clobbers: eax, ecx, edx, xmm0-xmm7, flags.
; Stack:   a 16-byte aligned scratch area yyy[64] floats is carved out below
;          esp; the pre-allocation esp is kept at [yyy+64*4].
; Alignment: movaps / mulps-with-memory are used on sbd_xxx+offset*4,
;          enwindow and idct_coefficient, so those addresses must be 16-byte
;          aligned (which implies offset is a multiple of 4).
; State:   the running win_buf and s pointers live in the static dwords
;          saveOff and s_ptr, so the routine is not reentrant.
; NOTE(review): mode_gr <= 0 is not guarded; the pass counter is decremented
;          after the first pass, so the loop would not stop at zero passes.
		align	16
window_filter_subband_SSE:
		push	ebx
		push	esi
		push	edi
		push	ebp
%assign _P 4*4
		mov		esi,[esp+_P+8]		;esi=ch
		mov		edi,sbd_xxx			;edi=c=&sbd_xxx[0][0]
		mov		ebx,[esp+_P+4]		;ebx=win_buf
		mov		ebp,[sbd_off+esi*4]		;ebp=offset=sbd_off[ch]
		and		esi,esi				;ch == 0 ?
		jz		short .F00
		add		edi,HAN_SIZE*F_SIZE	;c=&sbd_xxx[1][0]

.F00:
		;	ebx=win_buf, edi=c(fixed for the whole call), ebp=offset=sbd_off[ch]
		;	esi becomes the pass counter (18*mode_gr) below
		mov		[saveOff],ebx		;saveOff = current win_buf pointer (despite the name)
		mov		eax,[esp+_P+12]		;eax=s
		mov		esi,[esp+_P+16]		;esi=mode_gr(=1 or 2)
		add		esi,esi				;esi = 2*mode_gr
		lea		esi,[esi+esi*8]		;esi = 9*(2*mode_gr) = 18*mode_gr
		mov		[s_ptr],eax		;save the output pointer

; allocate yyy[64]
; (all stack arguments have been read above; [esp+_P+n] is not valid again
;  until esp is restored after the main loop)
		mov		edx,esp
		sub		esp,64*4+4
		and		esp,~15				; align to 16 byte boundary
		mov		[esp+64*4],edx		; save the original ESP
		jmp		short .LOOP			; skip the alignment padding
%define	yyy esp

		align	16
.LOOP:
;	for(i=31;i>=0;i--) c[i+offset] = (double)(*(*win_buf)++) * SCALER;
; loop unrolling
;       c += offset;
;	    c[28:31] = win_buf[ 3: 0]*{4{SCALER}};
;	    c[24:27] = win_buf[ 7: 4]*{4{SCALER}};
;	    c[20:23] = win_buf[11: 8]*{4{SCALER}};
;	    c[16:19] = win_buf[15:12]*{4{SCALER}};
;	    c[12:15] = win_buf[19:16]*{4{SCALER}};
;	    c[ 8:11] = win_buf[23:20]*{4{SCALER}};
;	    c[ 4: 7] = win_buf[27:24]*{4{SCALER}};
;	    c[ 0: 3] = win_buf[31:28]*{4{SCALER}};
;
; Each group: two cvtpi2ps fill lanes 0-1 of two registers (lanes 2-3 are
; left as they were and are not used); shufps ...,0x11 then builds
; {hi[1],hi[0],lo[1],lo[0]}, i.e. the four converted values in reversed order.
		mov		ebx,[saveOff]			;ebx = current win_buf pointer
		movss	xmm7,[SCALER]
		shufps	xmm7,xmm7,0				;xmm7 = {SCALER x4} (reloaded each pass: xmm7 is reused below)
		lea		edx,[edi+ebp*4]			;edx = &c[offset]

		cvtpi2ps	xmm0,[ebx + 0*F_SIZE]
		cvtpi2ps	xmm1,[ebx + 2*F_SIZE]
		shufps	xmm1,xmm0,0x11
		mulps	xmm1,xmm7
		movaps	[edx+28*F_SIZE],xmm1

		cvtpi2ps	xmm2,[ebx + 4*F_SIZE]
		cvtpi2ps	xmm3,[ebx + 6*F_SIZE]
		shufps	xmm3,xmm2,0x11
		mulps	xmm3,xmm7
		movaps	[edx+24*F_SIZE],xmm3

		cvtpi2ps	xmm4,[ebx + 8*F_SIZE]
		cvtpi2ps	xmm5,[ebx + 10*F_SIZE]
		shufps	xmm5,xmm4,0x11
		mulps	xmm5,xmm7
		movaps	[edx+20*F_SIZE],xmm5

		cvtpi2ps	xmm0,[ebx + 12*F_SIZE]
		cvtpi2ps	xmm1,[ebx + 14*F_SIZE]
		shufps	xmm1,xmm0,0x11
		mulps	xmm1,xmm7
		movaps	[edx+16*F_SIZE],xmm1

		cvtpi2ps	xmm2,[ebx + 16*F_SIZE]
		cvtpi2ps	xmm3,[ebx + 18*F_SIZE]
		shufps	xmm3,xmm2,0x11
		mulps	xmm3,xmm7
		movaps	[edx+12*F_SIZE],xmm3

		cvtpi2ps	xmm4,[ebx + 20*F_SIZE]
		cvtpi2ps	xmm5,[ebx + 22*F_SIZE]
		shufps	xmm5,xmm4,0x11
		mulps	xmm5,xmm7
		movaps	[edx+ 8*F_SIZE],xmm5

		cvtpi2ps	xmm0,[ebx + 24*F_SIZE]
		cvtpi2ps	xmm1,[ebx + 26*F_SIZE]
		shufps	xmm1,xmm0,0x11
		mulps	xmm1,xmm7
		movaps	[edx+ 4*F_SIZE],xmm1

		cvtpi2ps	xmm2,[ebx + 28*F_SIZE]
		cvtpi2ps	xmm3,[ebx + 30*F_SIZE]
		shufps	xmm3,xmm2,0x11
		mulps	xmm3,xmm7
		movaps	[edx+ 0*F_SIZE],xmm3

		add		ebx,32*F_SIZE			;consume 32 input ints
		mov		[saveOff],ebx
;;
; Windowing: 16 passes of .f1, each producing 4 consecutive y[] values.
;   ecx = number of y[] values produced so far (0,4,..,64)
;   ebx = HAN_SIZE-1, wrap mask for indices into c[]
;   edx = read pointer into enwindow, advanced 32 floats per pass
;   eax = current index into c[] (in floats), always masked with ebx
		xor		ecx,ecx
		mov		ebx,HAN_SIZE-1			;mask kept in a register to reduce code size
		mov		edx,enwindow

		mov		eax,ebp					;eax=i+offset
		jmp		short .f1
		align	16
.lp1:
		add		eax,64+4				;with the 7 steps of 64 below: net +4 (mod 512)
		and		eax,ebx
.f1:
		movaps	xmm0,[edx +  0*F_SIZE]
		mulps	xmm0,[edi+eax*4]
		add		eax,64
		and		eax,ebx

		movaps	xmm1,[edx +  4*F_SIZE]
		mulps	xmm1,[edi+eax*4]
		add		eax,64
		and		eax,ebx
		addps	xmm1,xmm0

		movaps	xmm2,[edx +  8*F_SIZE]
		mulps	xmm2,[edi+eax*4]
		add		eax,64
		and		eax,ebx
		addps	xmm2,xmm1

		movaps	xmm3,[edx + 12*F_SIZE]
		mulps	xmm3,[edi+eax*4]
		add		eax,64
		and		eax,ebx
		addps	xmm3,xmm2

		movaps	xmm4,[edx + 16*F_SIZE]
		mulps	xmm4,[edi+eax*4]
		add		eax,64
		and		eax,ebx
		addps	xmm4,xmm3

		movaps	xmm5,[edx + 20*F_SIZE]
		mulps	xmm5,[edi+eax*4]
		add		eax,64
		and		eax,ebx
		addps	xmm5,xmm4

		movaps	xmm6,[edx + 24*F_SIZE]
		add		ecx,4					;counted early; the store below compensates with -4*4
		mulps	xmm6,[edi+eax*4]
		add		eax,64
		and		eax,ebx
		addps	xmm6,xmm5

		movaps	xmm7,[edx + 28*F_SIZE]
		mulps	xmm7,[edi+eax*4]
		add		edx,32*F_SIZE
		cmp		ecx,HAN_SIZE/8			;64 outputs done? (flags survive the SSE ops below)
		addps	xmm7,xmm6

		movaps	[yyy+ecx*4-4*4],xmm7	;y[ecx-4 .. ecx-1]
		jb		near .lp1

		add		ebp,480					;offset = (offset - 32) mod 512
		and		ebp,ebx

;	yprime[0] = y[16]
;       for( i=1; i<=16; i++ ) yprime[i] = y[i+16]+y[16-i];
;
; loop unrolling
;	    yprime[0] = y[16]
;       yprime[ 3: 1] = y[19:17]+y[13:15];
;       yprime[ 7: 4] = y[23:20]+y[ 9:12];
;       yprime[11: 8] = y[27:24]+y[ 5: 8];
;       yprime[15:12] = y[31:28]+y[ 1: 4];
;
; Each shufps pair turns register A={a0,..} and B={b0,b1,b2,b3} into
; {a0,b3,b2,b1}: lane 0 is kept, lanes 1-3 are B's upper three reversed.
		xorps	xmm0,xmm0
		movaps	xmm1,[yyy+12*4]
		movaps	xmm2,[yyy+ 8*4]
		movaps	xmm3,[yyy+ 4*4]
		movaps	xmm4,[yyy+ 0*4]			;y[0..3], still needed for yprime[16] below

		shufps	xmm0,xmm1,11100100B	; movhps	xmm0,xmm1
		shufps	xmm0,xmm1,01101100B
		addps	xmm0,[yyy+16*4]

		shufps	xmm1,xmm2,11100100B	; movhps	xmm1,xmm2
		shufps	xmm1,xmm2,01101100B
		addps	xmm1,[yyy+20*4]

		shufps	xmm2,xmm3,11100100B	; movhps	xmm2,xmm3
		shufps	xmm2,xmm3,01101100B
		addps	xmm2,[yyy+24*4]

		shufps	xmm3,xmm4,11100100B	; movhps	xmm3,xmm4
		shufps	xmm3,xmm4,01101100B
		addps	xmm3,[yyy+28*4]

		movaps	[yyy+ 0*4],xmm0			;yprime[ 0.. 3]
		movaps	[yyy+ 4*4],xmm1			;yprime[ 4.. 7]
		movaps	[yyy+ 8*4],xmm2			;yprime[ 8..11]
;		movaps	[yyy+12*4],xmm3			;(not stored: yprime[12..15] stays in xmm3)

;       for( i=17; i<=31; i++ ) yprime[i] = y[i+16]-y[80-i];
; loop unrolling
;       yprime[16   ] = y[32   ]-(-y[ 0   ]);
;       yprime[19:17] = y[35:33]-y[61:63];
;       yprime[23:20] = y[39:36]-y[57:60];
;       yprime[27:24] = y[43:40]-y[53:56];
;       yprime[31:28] = y[47:44]-y[49:52];
; the sign of yprime[31:16] are inverted.
; (the subps operands are in the opposite order to the formulas above, so
;  xmm4..xmm7 hold -yprime[16..31]; .lp5 compensates by subtracting them)
		xorps	xmm4,[Q_MMMM]			;negate y[0..3]; only lane 0 survives the shuffle
		movaps	xmm5,[yyy + 60*4]
		movaps	xmm6,[yyy + 56*4]
		movaps	xmm7,[yyy + 52*4]
		movaps	xmm0,[yyy + 48*4]

		shufps	xmm4,xmm5,11100100B	; movhps	xmm4,xmm5
		shufps	xmm4,xmm5,01101100B
		subps	xmm4,[yyy + 32*4]

		shufps	xmm5,xmm6,11100100B	; movhps	xmm5,xmm6
		shufps	xmm5,xmm6,01101100B
		subps	xmm5,[yyy + 36*4]

		shufps	xmm6,xmm7,11100100B	; movhps	xmm6,xmm7
		shufps	xmm6,xmm7,01101100B
		subps	xmm6,[yyy + 40*4]

		shufps	xmm7,xmm0,11100100B	; movhps	xmm7,xmm0
		shufps	xmm7,xmm0,01101100B
		subps	xmm7,[yyy + 44*4]

; (not stored: -yprime[16..31] stay in xmm4..xmm7 for the loop below)
;		movaps	[yyy+16*4],xmm4
;		movaps	[yyy+20*4],xmm5
;		movaps	[yyy+24*4],xmm6
;		movaps	[yyy+28*4],xmm7

;       for( i=0; i<16; i++ ){
;               s0 = s1 = 0.0;
;               for( j=0; j<32; j+=4 ){
;                       s0 += (*m)[i+0][j  ]*yprime[j+0];
;                       s1 += (*m)[i+0][j+1]*yprime[j+1];
;                       s0 += (*m)[i+0][j+2]*yprime[j+2];
;                       s1 += (*m)[i+0][j+3]*yprime[j+3];
;               }
;               xout[i+ 0] = s0+s1;
;               xout[31-i] = s0-s1;
;       }
;
; Register use in .lp5:
;   ebx = &s[i], ecx = rows remaining (16..1),
;   edx = current idct_coefficient row + 16 floats (so offsets -64..+48 cover
;         all 32 entries of the row),
;   xmm3 = yprime[12..15], xmm4..xmm7 = -yprime[16..31] (must stay intact),
;   xmm0..xmm2 = scratch.
		mov		ebx,[s_ptr]
		mov		ecx,16
		mov		edx,idct_coefficient+16*F_SIZE
		jmp		short .lp5				; skip the alignment padding

		align	16
.lp5:
		movaps	xmm0,[edx-64]
		mulps	xmm0,[yyy+ 0*4]

		movaps	xmm1,[edx-48]
		mulps	xmm1,[yyy+ 4*4]
		addps	xmm1,xmm0

		movaps	xmm2,[edx-32]
		mulps	xmm2,[yyy+ 8*4]
		addps	xmm2,xmm1

		movaps	xmm0,[edx-16]
		mulps	xmm0,xmm3
		addps	xmm0,xmm2

		movaps	xmm1,[edx+ 0]
		mulps	xmm1,xmm4
		subps	xmm0,xmm1				;subtract: xmm4..xmm7 are sign-inverted

		movaps	xmm2,[edx+16]
		mulps	xmm2,xmm5
		subps	xmm0,xmm2

		movaps	xmm1,[edx+32]
		mulps	xmm1,xmm6
		subps	xmm0,xmm1

		movaps	xmm2,[edx+48]
		mulps	xmm2,xmm7
		subps	xmm0,xmm2

		add		edx,32*F_SIZE			;next coefficient row
		movhlps	xmm1,xmm0
		addps	xmm0,xmm1				;lane0 = s0 (lanes 0+2), lane1 = s1 (lanes 1+3)
		movaps	xmm1,xmm0
		shufps	xmm1,xmm0,0x01			;xmm1 lane0 = s1
		movaps	xmm2,xmm0
		addss	xmm0,xmm1				;s0+s1
		subss	xmm2,xmm1				;s0-s1

		movss	[ebx],xmm0				;s[i] = s0+s1
		add		ebx,4
		dec		ecx						;sets ZF for the jnz below (movss leaves flags alone)
		movss	[ebx+ecx*8],xmm2		;s[31-i] = s0-s1
		jnz		near .lp5

		add		ebx,16*4				;ebx was s+16 floats; now s+32 floats (next output row)
		mov		[s_ptr],ebx
		dec		esi						;one pass done
		jnz		near .LOOP

; free area for yyy[64]
		mov		esp,[esp+64*4]
%undef yyy

; write the updated circular-buffer offset back to sbd_off
; NOTE(review): the entry code reads sbd_off[ch] but this stores to
; sbd_off[ch & 1]; the two agree for ch = 0 or 1 - confirm ch is never larger.
		mov		edx,sbd_off
		test	dword [esp+_P+8],1
		jz		short .lp6
		add		edx,4
.lp6:
		mov	[edx],ebp

.exit:
		pop	ebp
		pop	edi
		pop	esi
		pop	ebx
		ret

		end
