
;	optimized mdct() for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

;	99/09/15	split off from subband.nas
;	variable names changed:
;	off -> sbd_off
;	xxx -> sbd_xxx
;	enwindow_sse -> enwindow

%include "nasm.h"

	globaldef	window_filter_subband_P5FPU
	globaldef	window_filter_subband_P6FPU
	externdef	enwindow
	externdef	idct_coefficient
	externdef	sbd_off
	externdef	sbd_xxx

HAN_SIZE	equ	512		;defined in common.h
SBLIMIT		equ	32

F_SIZE	equ	4
%define F_PTR	dword

	segment_data

		align	16
SCALER	dd	38000000h,38000000h			;1.0/32768 define in common.h

		align	16
s_ptr		dd 0
saveOff		dd 0

	segment_code
;********   P6 FPU routine   ********
; by K.SAKAI
;	99/09/05	split off from the SSE routine, 141k[clk]@PII
;   99/09/06	fixed one bug, optimized, 131k[clk]@PII
; void window_filter_subband_P6FPU(int *win_buf,int ch ,float *s,int mode_gr)
;
; All four arguments are read from the stack ([esp+_P+4] .. [esp+_P+16]).
; The routine runs 18*mode_gr passes.  Each pass:
;   1. converts 32 ints from win_buf to float, multiplies them by SCALER and
;      stores them in reverse order at c[offset .. offset+31], where
;      c = sbd_xxx (ch==0) or sbd_xxx + HAN_SIZE*F_SIZE bytes (ch!=0) and
;      offset starts as sbd_off[ch];
;   2. builds y[0..63] on the stack: each y[i] is an 8-term dot product of
;      enwindow coefficients with c[], the c[] index stepping by 64
;      modulo HAN_SIZE for every term;
;   3. folds y[0..63] into yprime[0..31] in place;
;   4. writes 32 floats to s from yprime[] and idct_coefficient[],
;      then advances s by 32 floats;
;   5. offset = (offset + 480) & (HAN_SIZE-1).
; On exit offset is written back to sbd_off[ch & 1].
;
; Register roles in the main loop: esi = passes left, edi = c, ebp = offset.
; win_buf and s are kept between passes in the file-level variables
; saveOff and s_ptr, so two calls running at the same time would clash.
; ebx/esi/edi/ebp are saved and restored; eax, ecx, edx, flags and the
; x87 registers are overwritten.
; In the x87 stack listings below the deepest register is written first
; and st0 last.
		align	16
window_filter_subband_P6FPU:
		push	ebx
		push	esi
		push	edi
		push	ebp
%assign _P 4*4
; _P = bytes pushed above; argument k lives at [esp+_P+4*k]
		mov		esi,[esp+_P+8]		;esi=ch
		and		esi,esi				;ZF=(ch==0); the three movs below keep flags
		mov		edi,sbd_xxx					;edi=c=&sbd_xxx[0][0]
		mov		ebx,[esp+_P+4]		;ebx=win_buf
		mov		ebp,[sbd_off+esi*4]		;ebp=offset=sbd_off[ch]
		jz		short .f0
		add		edi,HAN_SIZE*F_SIZE	;c=&sbd_xxx[1][0]

.f0:
		;	here: ebx=win_buf, edi=c(fixed), ebp=offset=sbd_off[ch]
		;	after the block below: esi=pass counter=18*mode_gr
		mov		[saveOff],ebx		;saveOff = current win_buf read pointer
		mov		eax,[esp+_P+12]		;eax=s
		mov		esi,[esp+_P+16]		;esi=mode_gr(=1 or 2)
		add		esi,esi
		lea		esi,[esi+esi*8]		;esi = 2*mode_gr*9 = 18*mode_gr
		mov		[s_ptr],eax		;save s

; allocate yyy[64]
		mov		edx,esp
		sub		esp,64*4+4
		and		esp,~15				; align to 16 byte boundary
		mov		[esp+64*4],edx		; save the original ESP
		jmp		short .f1
%define	yyy esp

		align	16
.lp1:
.f1:
;	for(i=31;i>=0;i--) c[i+offset] = (double)(*(*win_buf)++) * SCALER;
		mov		ebx,[saveOff]		;ebx=win_buf read pointer
		fld		dword [SCALER]
		lea		edx,[edi+ebp*4]		;edx=&c[offset]
		xor		ecx,ecx
		mov		cl,32				;ecx=32 samples left
		jmp		short .f10

		align	16
.lp10:
.f10:
		; four samples per turn; ecx is decremented before the stores,
		; so wb[0] goes to c[offset+ecx+3] ... wb[3] to c[offset+ecx+0]
		fild	dword [ebx + 0*F_SIZE]
		fild	dword [ebx + 1*F_SIZE]
		fild	dword [ebx + 2*F_SIZE]
		fild	dword [ebx + 3*F_SIZE]
		add		ebx,4*4
		sub		ecx,4			; sets ZF for the jnz below (x87 ops keep EFLAGS)
		fxch	st4				; wb[3], wb[0], wb[1], wb[2], SCALER
		fmul	st3,st0
		fmul	st2,st0
		fmul	st1,st0
		fmul	st4,st0
		fxch	st3				; wb[3], SCALER, wb[1], wb[2], wb[0]
		fstp	dword [edx+ecx*4+3*4]
		fxch					; wb[3], SCALER, wb[2], wb[1]
		fstp	dword [edx+ecx*4+2*4]
		fstp	dword [edx+ecx*4+1*4]
		fxch					; SCALER, wb[3]
		fstp	dword [edx+ecx*4+0*4]
		jnz		.lp10

		ffree	st0					; tag SCALER's register empty (the stack top is not moved)
		mov		[saveOff],ebx		; remember where the next pass reads win_buf
;;
;		xor		ecx,ecx				; not needed: ecx is already 0 after the loop above
		mov		edx,enwindow
		jmp		short .f11

		align	16
.lp11:
.f11:
		; produce y[ecx .. ecx+3]; edx = enwindow coefficients for this group
		lea		eax,[ecx+ebp]			;eax=i+offset
		and		eax,(HAN_SIZE-1)

		; first of eight terms: o[k] = enwindow[k] * c[eax+k], k=0..3
		fld		dword [edx+ 0*4]
		fmul	dword [edi+eax*4+ 0*4]
		fld		dword [edx+ 1*4]
		xor		ebx,ebx
		fmul	dword [edi+eax*4+ 1*4]
		fld		dword [edx+ 2*4]
		mov		bl,4					;ebx=4: coefficient index of the second term
		fmul	dword [edi+eax*4+ 2*4]
		fld		dword [edx+ 3*4]
		fmul	dword [edi+eax*4+ 3*4]
		jmp		short .f110

		align	16
.lp110:
.f110:
		; terms 2..8: step 64 entries through c[] (mod HAN_SIZE) and
		; accumulate into o[0..3]; stack on entry: o[0], o[1], o[2], o[3]
		add		eax,64
		and		eax,(HAN_SIZE-1)
		fld		dword [edx+ebx*4+ 0*4]
		fmul	dword [edi+eax*4+ 0*4]
		faddp	st4,st0
		fld		dword [edx+ebx*4+ 1*4]
		fmul	dword [edi+eax*4+ 1*4]
		faddp	st3,st0
		fld		dword [edx+ebx*4+ 2*4]
		fmul	dword [edi+eax*4+ 2*4]
		faddp	st2,st0
		fld		dword [edx+ebx*4+ 3*4]
		fmul	dword [edi+eax*4+ 3*4]
		add		ebx,4
		cmp		ebx,32			; flags consumed by jl below (faddp keeps EFLAGS)
		faddp	st1,st0
		jl		.lp110

		add		edx,32*F_SIZE	; next group of 32 window coefficients
		add		ecx,4

		cmp		ecx,HAN_SIZE/8	; 64 outputs in total; flags used by jl below
		fxch	st3			; o[3], o[1], o[2], o[0]
		fstp	dword [yyy+ecx*4-4*4]
		fxch				; o[3], o[2], o[1]
		fstp	dword [yyy+ecx*4-3*4]
		fstp	dword [yyy+ecx*4-2*4]
		fstp	dword [yyy+ecx*4-1*4]
		jl		near .lp11

		add		ebp,480			; offset = (offset + 480) mod HAN_SIZE
		and		ebp,(HAN_SIZE-1)

;	yprime[0] = y[16]
;       for( i=1; i<=16; i++ ) yprime[i] = y[i+16]+y[16-i];
;	(yprime[] overwrites y[] in place)
		lea		eax,[yyy+0x80]	; biased base: y[k] is at [eax-0x80+k*4], offsets -0x80..+0x7C (presumably so each fits a byte displacement)

;       yprime[16] = y[32]+y[ 0]; yprime[ 0] = y[16];
		mov		ebx,[eax-0x80+16*4]		; keep y[16] (as raw bits) before it is overwritten
		fld		dword [eax-0x80+ 0*4]
		fadd	dword [eax-0x80+32*4]
		fstp	dword [eax-0x80+16*4]

;       yprime[ 1] = y[17]+y[15]; yprime[15] = y[31]+y[ 1];
		fld		dword [eax-0x80+17*4]
		fadd	dword [eax-0x80+15*4]
		fld		dword [eax-0x80+31*4]
		mov		[eax-0x80+ 0*4],ebx		; yprime[0] = saved y[16]
		fadd	dword [eax-0x80+ 1*4]
		fxch
		fstp	dword [eax-0x80+ 1*4]
		fstp	dword [eax-0x80+15*4]

;       yprime[ 2] = y[18]+y[14]; yprime[14] = y[30]+y[ 2];
		fld		dword [eax-0x80+18*4]
		fadd	dword [eax-0x80+14*4]
		fld		dword [eax-0x80+30*4]
		fadd	dword [eax-0x80+ 2*4]
		fxch
		fstp	dword [eax-0x80+ 2*4]
		fstp	dword [eax-0x80+14*4]

;       yprime[ 3] = y[19]+y[13]; yprime[13] = y[29]+y[ 3];
		fld		dword [eax-0x80+19*4]
		fadd	dword [eax-0x80+13*4]
		fld		dword [eax-0x80+29*4]
		fadd	dword [eax-0x80+ 3*4]
		fxch
		fstp	dword [eax-0x80+ 3*4]
		fstp	dword [eax-0x80+13*4]

;       yprime[ 4] = y[20]+y[12]; yprime[12] = y[28]+y[ 4];
		fld		dword [eax-0x80+20*4]
		fadd	dword [eax-0x80+12*4]
		fld		dword [eax-0x80+28*4]
		fadd	dword [eax-0x80+ 4*4]
		fxch
		fstp	dword [eax-0x80+ 4*4]
		fstp	dword [eax-0x80+12*4]

;       yprime[ 5] = y[21]+y[11]; yprime[11] = y[27]+y[ 5];
		fld		dword [eax-0x80+21*4]
		fadd	dword [eax-0x80+11*4]
		fld		dword [eax-0x80+27*4]
		fadd	dword [eax-0x80+ 5*4]
		fxch
		fstp	dword [eax-0x80+ 5*4]
		fstp	dword [eax-0x80+11*4]

;       yprime[ 6] = y[22]+y[10]; yprime[10] = y[26]+y[ 6];
		fld		dword [eax-0x80+22*4]
		fadd	dword [eax-0x80+10*4]
		fld		dword [eax-0x80+26*4]
		fadd	dword [eax-0x80+ 6*4]
		fxch
		fstp	dword [eax-0x80+ 6*4]
		fstp	dword [eax-0x80+10*4]

;       yprime[ 7] = y[23]+y[ 9]; yprime[ 9] = y[25]+y[ 7];
		fld		dword [eax-0x80+23*4]
		fadd	dword [eax-0x80+ 9*4]
		fld		dword [eax-0x80+25*4]
		fadd	dword [eax-0x80+ 7*4]
		fxch
		fstp	dword [eax-0x80+ 7*4]
		fstp	dword [eax-0x80+ 9*4]


;       for( i=17; i<=31; i++ ) yprime[i] = y[i+16]-y[80-i];
; loop unrolling
;       yprime[ 8] = y[24]+y[ 8]; yprime[17] = y[33]-y[63];
		fld		dword [eax-0x80+24*4]
		fadd	dword [eax-0x80+ 8*4]
		fld		dword [eax-0x80+33*4]
		fsub	dword [eax-0x80+63*4]
		fxch
		fstp	dword [eax-0x80+ 8*4]
		fstp	dword [eax-0x80+17*4]

;       yprime[18] = y[34]-y[62]; yprime[19] = y[35]-y[61];
		fld		dword [eax-0x80+34*4]
		fsub	dword [eax-0x80+62*4]
		fld		dword [eax-0x80+35*4]
		fsub	dword [eax-0x80+61*4]
		fxch
		fstp	dword [eax-0x80+18*4]
		fstp	dword [eax-0x80+19*4]

;       yprime[20] = y[36]-y[60]; yprime[21] = y[37]-y[59];
		fld		dword [eax-0x80+36*4]
		fsub	dword [eax-0x80+60*4]
		fld		dword [eax-0x80+37*4]
		fsub	dword [eax-0x80+59*4]
		fxch
		fstp	dword [eax-0x80+20*4]
		fstp	dword [eax-0x80+21*4]

;       yprime[22] = y[38]-y[58]; yprime[23] = y[39]-y[57];
		fld		dword [eax-0x80+38*4]
		fsub	dword [eax-0x80+58*4]
		fld		dword [eax-0x80+39*4]
		fsub	dword [eax-0x80+57*4]
		fxch
		fstp	dword [eax-0x80+22*4]
		fstp	dword [eax-0x80+23*4]

;       yprime[24] = y[40]-y[56]; yprime[25] = y[41]-y[55];
		fld		dword [eax-0x80+40*4]
		fsub	dword [eax-0x80+56*4]
		fld		dword [eax-0x80+41*4]
		fsub	dword [eax-0x80+55*4]
		fxch
		fstp	dword [eax-0x80+24*4]
		fstp	dword [eax-0x80+25*4]

;       yprime[26] = y[42]-y[54]; yprime[27] = y[43]-y[53];
		fld		dword [eax-0x80+42*4]
		fsub	dword [eax-0x80+54*4]
		fld		dword [eax-0x80+43*4]
		fsub	dword [eax-0x80+53*4]
		fxch
		fstp	dword [eax-0x80+26*4]
		fstp	dword [eax-0x80+27*4]

;       yprime[28] = y[44]-y[52]; yprime[29] = y[45]-y[51];
		fld		dword [eax-0x80+44*4]
		fsub	dword [eax-0x80+52*4]
		fld		dword [eax-0x80+45*4]
		fsub	dword [eax-0x80+51*4]
		fxch
		fstp	dword [eax-0x80+28*4]
		fstp	dword [eax-0x80+29*4]

;       yprime[30] = y[46]-y[50]; yprime[31] = y[47]-y[49];
		fld		dword [eax-0x80+46*4]
		fsub	dword [eax-0x80+50*4]
		fld		dword [eax-0x80+47*4]
		fsub	dword [eax-0x80+49*4]
		fxch
		fstp	dword [eax-0x80+30*4]
		fstp	dword [eax-0x80+31*4]

;       for( i=0; i<16; i++ ){
;               s0 = s1 = 0.0;
;               for( j=0; j<32; j+=2 ){
;                       s0 += (*m)[i+0][j  ]*yprime[j+0];
;                       s1 += (*m)[i+0][j+1]*yprime[j+1];
;               }
;               xout[i+ 0] = s0+s1;
;               xout[31-i] = s0-s1;
;       }
;	(m = idct_coefficient, 32 floats per row; xout = s)
		mov		ebx,[s_ptr]			;ebx=&xout[i]
		xor		ecx,ecx
		mov		cl,16				;ecx=rows left
		mov		edx,idct_coefficient	;edx=&m[i][0]
		jmp		short .lp14

		align	16
.lp14:
		; j=0 terms start the two sums: stack becomes s0, s1
		fld		dword [edx+ 0*4]
		fmul	dword [yyy+ 0*4]
		xor		eax,eax
		fld		dword [edx+ 1*4]
		fmul	dword [yyy+ 1*4]
		mov		al,1				;eax=j/2, runs 1..15
		jmp		short .lp140

		align	16
.lp140:
		fld		dword [edx+eax*8+ 0*4]
		fmul	dword [yyy+eax*8+ 0*4]
		fld		dword [edx+eax*8+ 1*4]
		fmul	dword [yyy+eax*8+ 1*4]
		inc		eax
		cmp		al,16			; flags used by jl below (x87 ops keep EFLAGS)
		fxch
		faddp	st3,st0			; s0 += even-index product
		faddp	st1,st0			; s1 += odd-index product
		jl		short .lp140

		add		edx,32*F_SIZE	; next row of idct_coefficient
		fld		st1				; s0, s1, s0
		fxch					; s0, s0, s1
		fadd	st2,st0			; s0+s1, s0, s1
		fsubp	st1,st0			; s0+s1, s0-s1

		fxch					; s0-s1, s0+s1
		fstp	dword [ebx]		; xout[i] = s0+s1
		add		ebx,4
		dec		ecx				; sets ZF for jnz below (fstp keeps EFLAGS)
		fstp	dword [ebx+ecx*8]	; xout[31-i] = s0-s1
		jnz		near .lp14

		add		ebx,16*4		; ebx is 16 floats in; skip the other 16 -> next 32-float output row
		dec		esi				; one pass done
		mov		[s_ptr],ebx
		jnz		near .lp1

; free area for yyy[64]
		mov		esp,[esp+64*4]
%undef yyy

; write the updated offset back to sbd_off[ch & 1]
		test	dword [esp+_P+8],1
		mov		edx,sbd_off
		jz		short .f2
		add		edx,4
.f2:
		mov		[edx],ebp

.exit:
		pop		ebp
		pop		edi
		pop		esi
		pop		ebx
		ret

;********   P5 FPU routine   ********
; by K.SAKAI
;	99/09/05	split off from the SSE routine, 200k[clk]@P55C, 140k[clk]@PII
;   99/09/06	fixed one bug
; void window_filter_subband_P5FPU(int *win_buf,int ch ,float *s,int mode_gr)
;
; All four arguments are read from the stack ([esp+_P+4] .. [esp+_P+16]).
; The routine runs 18*mode_gr passes.  Each pass:
;   1. converts 32 ints from win_buf to float, multiplies them by SCALER and
;      stores them in reverse order at c[offset .. offset+31], where
;      c = sbd_xxx (ch==0) or sbd_xxx + HAN_SIZE*F_SIZE bytes (ch!=0) and
;      offset starts as sbd_off[ch];
;   2. builds y[0..63] on the stack: each y[i] is an 8-term dot product of
;      enwindow coefficients with c[], the c[] index stepping by 64
;      modulo HAN_SIZE for every term;
;   3. folds y[0..63] into yprime[0..31] in place;
;   4. writes 32 floats to s from yprime[] and idct_coefficient[],
;      then advances s by 32 floats;
;   5. offset = (offset + 480) & (HAN_SIZE-1).
; On exit offset is written back to sbd_off[ch & 1].
;
; Register roles in the main loop: esi = passes left, edi = c, ebp = offset.
; win_buf and s are kept between passes in the file-level variables
; saveOff and s_ptr, so two calls running at the same time would clash.
; ebx/esi/edi/ebp are saved and restored; eax, ecx, edx, flags and the
; x87 registers are overwritten.  The window loop (.lp110) uses all eight
; x87 registers, so the x87 stack must be empty on entry.
; In the x87 stack listings below the deepest register is written first
; and st0 last.
;
; `proc` is a macro that is not defined in this file (expected from nasm.h).
; NOTE(review): confirm whether it requires a matching end-of-procedure macro.
		align	16
proc window_filter_subband_P5FPU
		push	ebx
		push	esi
		push	edi
		push	ebp
%assign _P 4*4
; _P = bytes pushed above; argument k lives at [esp+_P+4*k]
		mov		esi,[esp+_P+8]		;esi=ch
		mov		edi,sbd_xxx			;edi=c=&sbd_xxx[0][0]
		mov		ebx,[esp+_P+4]		;ebx=win_buf
		mov		ebp,[sbd_off+esi*4]		;ebp=offset=sbd_off[ch]
		and		esi,esi				;ZF=(ch==0)
		jz		short .f0
		add		edi,HAN_SIZE*F_SIZE	;c=&sbd_xxx[1][0]

.f0:
		;	here: ebx=win_buf, edi=c(fixed), ebp=offset=sbd_off[ch]
		;	after the block below: esi=pass counter=18*mode_gr
		mov		[saveOff],ebx		;saveOff = current win_buf read pointer
		mov		eax,[esp+_P+12]		;eax=s
		mov		esi,[esp+_P+16]		;esi=mode_gr(=1 or 2)
		add		esi,esi
		lea		esi,[esi+esi*8]		;esi = 2*mode_gr*9 = 18*mode_gr
		mov		[s_ptr],eax		;save s

; allocate yyy[64]
		mov		edx,esp
		sub		esp,64*4+4
		and		esp,~15				; align to 16 byte boundary
		mov		[esp+64*4],edx		; save the original ESP
		jmp		short .f1
%define	yyy esp

		align	16
.lp1:
.f1:
;	for(i=31;i>=0;i--) c[i+offset] = (double)(*(*win_buf)++) * SCALER;
		mov		ebx,[saveOff]		;ebx=win_buf read pointer
		fld		dword [SCALER]
		lea		edx,[edi+ebp*4]		;edx=&c[offset]
		mov		ecx,32				;ecx=samples left
		jmp		short .f10

		align	16
.lp10:
.f10:
		; four samples per turn: wb[0] goes to c[offset+ecx-1] ...
		; wb[3] to c[offset+ecx-4]; SCALER sinks one slot per fild
		fild	dword [ebx + 0*4]
		fmul	st0,st1
		fild	dword [ebx + 1*4]
		fmul	st0,st2
		fild	dword [ebx + 2*4]
		fmul	st0,st3
		fild	dword [ebx + 3*4]
		fmul	st0,st4
		fxch	st3				; SCALER, wb[3], wb[1], wb[2], wb[0]
		fstp	dword [edx+ecx*4-1*4]
		fxch					; SCALER, wb[3], wb[2], wb[1]
		fstp	dword [edx+ecx*4-2*4]
		fstp	dword [edx+ecx*4-3*4]
		fstp	dword [edx+ecx*4-4*4]
		add		ebx,4*4
		sub		ecx,4
		jnz		.lp10

		fcomp	st0,st0				; compare-and-pop, used only for the pop: discards SCALER
		mov		[saveOff],ebx		; remember where the next pass reads win_buf
;;
;		xor		ecx,ecx				; not needed: ecx is already 0 after the loop above
		mov		edx,enwindow
		jmp		short .f11
		align	16
.lp11:
.f11:
		; produce y[ecx .. ecx+3]; edx = enwindow coefficients for this group
		lea		eax,[ecx+ebp]			;eax=i+offset
		and		eax,(HAN_SIZE-1)
		mov		ebx,4					;ebx=coefficient index of the second term

		; first of eight terms: o[k] = enwindow[k] * c[eax+k], k=0..3
		fld		dword [edx+ 0*4]
		fmul	dword [edi+eax*4+ 0*4]
		fld		dword [edx+ 1*4]
		fmul	dword [edi+eax*4+ 1*4]
		fld		dword [edx+ 2*4]
		fmul	dword [edi+eax*4+ 2*4]
		fld		dword [edx+ 3*4]
		fmul	dword [edi+eax*4+ 3*4]
		add		eax,64					; step 64 entries through c[] (mod HAN_SIZE)
		and		eax,(HAN_SIZE-1)
		jmp		short .f110

		align	16
.lp110:
.f110:
		; terms 2..8: four products x[0..3] are formed first, then added
		; to o[0..3]; this fills all eight x87 registers
		fld		dword [edx+ebx*4+ 0*4]
		fmul	dword [edi+eax*4+ 0*4]
		fld		dword [edx+ebx*4+ 1*4]
		fmul	dword [edi+eax*4+ 1*4]
		fld		dword [edx+ebx*4+ 2*4]
		fmul	dword [edi+eax*4+ 2*4]
		fld		dword [edx+ebx*4+ 3*4]
		fmul	dword [edi+eax*4+ 3*4]
		fxch	st3			; o[0], o[1], o[2], o[3], x[3], x[1], x[2], x[0]
		faddp	st7,st0
		fxch				; o[0], o[1], o[2], o[3], x[3], x[2], x[1]
		faddp	st5,st0
		faddp	st3,st0
		faddp	st1,st0
		add		eax,64
		add		ebx,4
		and		eax,(HAN_SIZE-1)
		cmp		ebx,32
		jl		.lp110

		add		edx,32*F_SIZE	; next group of 32 window coefficients
		add		ecx,4

		fxch	st3			; o[3], o[1], o[2], o[0]
		fstp	dword [yyy+ecx*4-4*4]
		fxch				; o[3], o[2], o[1]
		fstp	dword [yyy+ecx*4-3*4]
		fstp	dword [yyy+ecx*4-2*4]
		fstp	dword [yyy+ecx*4-1*4]
		cmp		ecx,HAN_SIZE/8	; 64 outputs in total
		jb		near .lp11

		add		ebp,480			; offset = (offset + 480) mod HAN_SIZE
		and		ebp,(HAN_SIZE-1)

;	yprime[0] = y[16]
;       for( i=1; i<=16; i++ ) yprime[i] = y[i+16]+y[16-i];
;	(yprime[] overwrites y[] in place)
		lea		eax,[yyy+0x80]	; biased base: y[k] is at [eax-0x80+k*4], offsets -0x80..+0x7C (presumably so each fits a byte displacement)

;       yprime[16] = y[32]+y[ 0]; yprime[ 0] = y[16];
		fld		dword [eax-0x80+ 0*4]
		fadd	dword [eax-0x80+32*4]
		fld		dword [eax-0x80+16*4]
		fxch
		fstp	dword [eax-0x80+16*4]
		fstp	dword [eax-0x80+ 0*4]

;       yprime[ 1] = y[17]+y[15]; yprime[15] = y[31]+y[ 1];
		fld		dword [eax-0x80+17*4]
		fadd	dword [eax-0x80+15*4]
		fld		dword [eax-0x80+31*4]
		fadd	dword [eax-0x80+ 1*4]
		fxch
		fstp	dword [eax-0x80+ 1*4]
		fstp	dword [eax-0x80+15*4]

;       yprime[ 2] = y[18]+y[14]; yprime[14] = y[30]+y[ 2];
		fld		dword [eax-0x80+18*4]
		fadd	dword [eax-0x80+14*4]
		fld		dword [eax-0x80+30*4]
		fadd	dword [eax-0x80+ 2*4]
		fxch
		fstp	dword [eax-0x80+ 2*4]
		fstp	dword [eax-0x80+14*4]

;       yprime[ 3] = y[19]+y[13]; yprime[13] = y[29]+y[ 3];
		fld		dword [eax-0x80+19*4]
		fadd	dword [eax-0x80+13*4]
		fld		dword [eax-0x80+29*4]
		fadd	dword [eax-0x80+ 3*4]
		fxch
		fstp	dword [eax-0x80+ 3*4]
		fstp	dword [eax-0x80+13*4]

;       yprime[ 4] = y[20]+y[12]; yprime[12] = y[28]+y[ 4];
		fld		dword [eax-0x80+20*4]
		fadd	dword [eax-0x80+12*4]
		fld		dword [eax-0x80+28*4]
		fadd	dword [eax-0x80+ 4*4]
		fxch
		fstp	dword [eax-0x80+ 4*4]
		fstp	dword [eax-0x80+12*4]

;       yprime[ 5] = y[21]+y[11]; yprime[11] = y[27]+y[ 5];
		fld		dword [eax-0x80+21*4]
		fadd	dword [eax-0x80+11*4]
		fld		dword [eax-0x80+27*4]
		fadd	dword [eax-0x80+ 5*4]
		fxch
		fstp	dword [eax-0x80+ 5*4]
		fstp	dword [eax-0x80+11*4]

;       yprime[ 6] = y[22]+y[10]; yprime[10] = y[26]+y[ 6];
		fld		dword [eax-0x80+22*4]
		fadd	dword [eax-0x80+10*4]
		fld		dword [eax-0x80+26*4]
		fadd	dword [eax-0x80+ 6*4]
		fxch
		fstp	dword [eax-0x80+ 6*4]
		fstp	dword [eax-0x80+10*4]

;       yprime[ 7] = y[23]+y[ 9]; yprime[ 9] = y[25]+y[ 7];
		fld		dword [eax-0x80+23*4]
		fadd	dword [eax-0x80+ 9*4]
		fld		dword [eax-0x80+25*4]
		fadd	dword [eax-0x80+ 7*4]
		fxch
		fstp	dword [eax-0x80+ 7*4]
		fstp	dword [eax-0x80+ 9*4]


;       for( i=17; i<=31; i++ ) yprime[i] = y[i+16]-y[80-i];
; loop unrolling
;       yprime[ 8] = y[24]+y[ 8]; yprime[17] = y[33]-y[63];
		fld		dword [eax-0x80+24*4]
		fadd	dword [eax-0x80+ 8*4]
		fld		dword [eax-0x80+33*4]
		fsub	dword [eax-0x80+63*4]
		fxch
		fstp	dword [eax-0x80+ 8*4]
		fstp	dword [eax-0x80+17*4]

;       yprime[18] = y[34]-y[62]; yprime[19] = y[35]-y[61];
		fld		dword [eax-0x80+34*4]
		fsub	dword [eax-0x80+62*4]
		fld		dword [eax-0x80+35*4]
		fsub	dword [eax-0x80+61*4]
		fxch
		fstp	dword [eax-0x80+18*4]
		fstp	dword [eax-0x80+19*4]

;       yprime[20] = y[36]-y[60]; yprime[21] = y[37]-y[59];
		fld		dword [eax-0x80+36*4]
		fsub	dword [eax-0x80+60*4]
		fld		dword [eax-0x80+37*4]
		fsub	dword [eax-0x80+59*4]
		fxch
		fstp	dword [eax-0x80+20*4]
		fstp	dword [eax-0x80+21*4]

;       yprime[22] = y[38]-y[58]; yprime[23] = y[39]-y[57];
		fld		dword [eax-0x80+38*4]
		fsub	dword [eax-0x80+58*4]
		fld		dword [eax-0x80+39*4]
		fsub	dword [eax-0x80+57*4]
		fxch
		fstp	dword [eax-0x80+22*4]
		fstp	dword [eax-0x80+23*4]

;       yprime[24] = y[40]-y[56]; yprime[25] = y[41]-y[55];
		fld		dword [eax-0x80+40*4]
		fsub	dword [eax-0x80+56*4]
		fld		dword [eax-0x80+41*4]
		fsub	dword [eax-0x80+55*4]
		fxch
		fstp	dword [eax-0x80+24*4]
		fstp	dword [eax-0x80+25*4]

;       yprime[26] = y[42]-y[54]; yprime[27] = y[43]-y[53];
		fld		dword [eax-0x80+42*4]
		fsub	dword [eax-0x80+54*4]
		fld		dword [eax-0x80+43*4]
		fsub	dword [eax-0x80+53*4]
		fxch
		fstp	dword [eax-0x80+26*4]
		fstp	dword [eax-0x80+27*4]

;       yprime[28] = y[44]-y[52]; yprime[29] = y[45]-y[51];
		fld		dword [eax-0x80+44*4]
		fsub	dword [eax-0x80+52*4]
		fld		dword [eax-0x80+45*4]
		fsub	dword [eax-0x80+51*4]
		fxch
		fstp	dword [eax-0x80+28*4]
		fstp	dword [eax-0x80+29*4]

;       yprime[30] = y[46]-y[50]; yprime[31] = y[47]-y[49];
		fld		dword [eax-0x80+46*4]
		fsub	dword [eax-0x80+50*4]
		fld		dword [eax-0x80+47*4]
		fsub	dword [eax-0x80+49*4]
		fxch
		fstp	dword [eax-0x80+30*4]
		fstp	dword [eax-0x80+31*4]

;       for( i=0; i<16; i++ ){
;               s0 = s1 = 0.0;
;               for( j=0; j<32; j+=2 ){
;                       s0 += (*m)[i+0][j  ]*yprime[j+0];
;                       s1 += (*m)[i+0][j+1]*yprime[j+1];
;               }
;               xout[i+ 0] = s0+s1;
;               xout[31-i] = s0-s1;
;       }
;	(m = idct_coefficient, 32 floats per row; xout = s)
		mov		ebx,[s_ptr]			;ebx=&xout[i]
		mov		ecx,16				;ecx=rows left
		mov		edx,idct_coefficient	;edx=&m[i][0]
		jmp		short .lp14

		align	16
.lp14:
		; j=0 terms start the two sums: stack becomes s0, s1
		fld		dword [edx+ 0*4]
		fmul	dword [yyy+ 0*4]
		fld		dword [edx+ 1*4]
		fmul	dword [yyy+ 1*4]

		mov		eax,1				;eax=j/2, runs 1..15
		jmp		short .lp140

		align	16
.lp140:
		fld		dword [edx+eax*8+ 0*4]
		fmul	dword [yyy+eax*8+ 0*4]
		fld		dword [edx+eax*8+ 1*4]
		fmul	dword [yyy+eax*8+ 1*4]
		fxch
		faddp	st3,st0			; s0 += even-index product
		faddp	st1,st0			; s1 += odd-index product
		inc		eax
		cmp		al,16
		jl		short .lp140

		fld		st1				; s0, s1, s0
		fxch					; s0, s0, s1
		fadd	st2,st0			; s0+s1, s0, s1
		fsubp	st1,st0			; s0+s1, s0-s1

		fxch					; s0-s1, s0+s1
		fstp	dword [ebx]		; xout[i] = s0+s1
		fstp	dword [ebx+ecx*8-4]	; xout[31-i] = s0-s1 (ecx = 16-i here)
		add		edx,32*F_SIZE	; next row of idct_coefficient
		add		ebx,4
		; dec/jnz instead of LOOP: LOOP is a slow, non-pairable instruction
		; on the Pentium.  Flags are dead here (the next instruction after
		; the loop is an add), so clobbering them is harmless.
		dec		ecx
		jnz		.lp14

		add		ebx,16*4		; ebx is 16 floats in; skip the other 16 -> next 32-float output row
		mov		[s_ptr],ebx

		dec		esi				; one pass done
		jnz		near .lp1

; free area for yyy[64]
		mov		esp,[esp+64*4]
%undef yyy

; write the updated offset back to sbd_off[ch & 1]
		mov		edx,sbd_off
		test	dword [esp+_P+8],1
		jz		short .f2
		add		edx,4
.f2:
		mov		[edx],ebp

.exit:
		pop		ebp
		pop		edi
		pop		esi
		pop		ebx
		ret

		end
