
;	for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI, URURI

;	constants corresponding to encoder.h / l3psy.h (original comment was mis-encoded)

; CBANDS: row and column count of the s3_l matrix, and the number of bands
; processed by the sprdngf_* routines.
%define CBANDS 63
; HBLKSIZE_s: row length (in elements) of the abx_s / energy_s arrays
; addressed by calc_phase_*.
%define HBLKSIZE_s 129
; HBLKSIZE: number of cw[] entries written by calc_phase_* (cw[0..5] are
; left untouched).
%define HBLKSIZE 513
; BLKSIZE: used only to size wsamp_r_int below.
%define	BLKSIZE	1024

; nasm.h presumably supplies the macros used throughout this file
; (globaldef, segment_*, proc/endproc, arg, sp(), pushd/popd, rN, ...).
; It is not visible here -- confirm against that header.
%include "nasm.h"

; Symbols exported to the C side.
	globaldef	s3_l
	globaldef	wsamp_r_int
	globaldef	wsamp_rs
	globaldef	sprdngf_3DN
	globaldef	sprdngf_FPU
	globaldef	sprdngf_SSE
	globaldef	frame_shiftin_MMX
	globaldef	calc_phase_SSE
	globaldef	calc_phase_3DN
	globaldef	calc_pe_3DN

	; Uninitialised storage shared with l3psy.c.  Nothing in this file reads
	; or writes wsamp_rs / wsamp_r_int; only s3_l is used here (by sprdngf_*).
	segment_bss
		align 16
wsamp_rs	resd	256			; exported to l3psy.c
wsamp_r_int	resd	2*BLKSIZE		; exported to l3psy.c
		align 16
		; CBANDS x CBANDS dwords, addressed as s3_l[b][k] (row stride 4*CBANDS bytes)
s3_l:	resd	(CBANDS*CBANDS)	; exported to l3psy.c

	segment_data
;	*** NOTE *** this table differs from s3ind in l3psy.c!
;	(translated from a mis-encoded comment)

;	Each entry is {first k, number of terms}.  The second element is stored
;	as ind[i][1] - ind[i][0] + 1 so it can be used directly as a loop count
;	(see sprdngf_3DN / sprdngf_FPU).
		align 16
s3ind:
	dd	 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9
	dd	 0,10, 0,11, 0,12, 0,13, 1,14, 1,14, 2,14
	dd	 3,13,  5,12,  6,12,  7,13,  9,12, 10,12, 11,12
	dd	12,12, 14,11, 15,11, 15,13, 16,13, 16,13, 17,13
	dd	18,13, 19,13, 19,14, 20,15, 21,15, 22,15, 22,15
	dd	23,15, 24,15, 25,15, 26,16, 27,16, 28,16, 29,16
	dd	30,16, 31,16, 32,16, 33,16, 34,16, 35,16, 36,16
	dd	37,16, 37,17, 38,17, 39,17, 40,17, 41,17, 42,17
	dd	43,17, 44,17, 45,17, 46,17, 47,16, 48,15, 48,15

;	Each entry is {first k, last k} (inclusive); used by sprdngf_SSE.
;	Same layout as s3ind in l3psy.c (translated from a mis-encoded comment).
s3ind_C	dd	 0, 2,  0, 3,  0, 4,  0, 5,  0, 6,  0, 7,  0, 8,  0, 9
		dd	 0,10,  0,11,  0,12,  1,14,  1,14,  2,15,  3,15,  5,16
		dd	 6,17,  7,19,  9,20, 10,21, 11,22, 12,23, 14,24, 15,25
		dd	15,27, 16,28, 16,28, 17,29, 18,30, 19,31, 19,32, 20,34
		dd	21,35, 22,36, 22,36, 23,37, 24,38, 25,39, 26,41, 27,42
		dd	28,43, 29,44, 30,45, 31,46, 32,47, 33,48, 34,49, 35,50
		dd	36,51, 37,52, 37,53, 38,54, 39,55, 40,56, 41,57, 42,58
		dd	43,59, 44,60, 45,61, 46,62, 47,62, 48,62, 48,62
s3ind_C_end:

;	Packed constants.  The Q_* labels name the full 16-byte vector (used as
;	SSE memory operands, hence the 16-byte alignment); the D_* labels alias
;	the same storage for 8-byte MMX/3DNow! loads.
	align 16
Q_not		dd	0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff	; all-ones mask (bitwise NOT via xorps)
Q_1:
D_1_1		dd	1.0, 1.0, 1.0, 1.0
Q_05:
D_05_05		dd	0.5, 0.5, 0.5, 0.5
Q_abs:
D_ABS		dd	0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF	; clears the IEEE sign bit (fabs)
Q_04:
D_04_04		dd	0.4, 0.4, 0.4, 0.4
D_M05_05	dd	0.5, -0.5		; low dword = +0.5, high dword = -0.5

;	Constants for the log approximation in calc_pe_3DN.
	align 16
log_data1   dd      1.4142135623,-1.4142135623	; = +sqrt(2), -sqrt(2)
log_data2   dd      1.4000000000,0          ;=7/5
log_data3   dd      0.1428571428,0          ;=1/7
log_data4   dd      0.3333333333,0          ;=1/3
log_data5   dd      0.3465735903,0          ;=log2/2

	segment_code

;************************************************************************

;	99/08/28
;	by shigeo
;void sprdngf_3DN( float dest[CBANDS][2], float src[CBANDS][2] );
;
;	3DNow! implementation.  For every band b (0 .. CBANDS-1):
;	    dest[b][0..1] = sum of s3_l[b][k] * src[k][0..1]
;	with k starting at s3ind[b][0] and running for s3ind[b][1] terms.
;	Arguments are taken from the stack; ebx, esi, edi and ebp are saved
;	and restored.  eax, ecx, edx and mm0/mm1 are used as scratch.
	align 16
sprdngf_3DN:
		push	ebx
		push	esi
		push	edi
		push	ebp
%assign _P 4*4
		femms						; enter MMX/3DNow! state
		mov		ebp,CBANDS			; ebp = bands still to process
		mov		esi,s3ind			; esi -> {first k, term count} of current band
		mov		ebx,s3_l			; ebx -> s3_l[b][0]
		mov		edx,[esp+_P+8]		; edx = src
		mov		edi,[esp+_P+4]		; edi = dest
		jmp		.band				; hop over the alignment padding

		align	16
.band:
		pxor	mm0,mm0				; running sum for both columns = 0
		mov		ecx,[esi+4]			; ecx = number of terms for this band
		mov		eax,[esi]			; eax = first k
		align	4
.term:
		movd	mm1,[ebx+eax*4]		; mm1 = [0 : s3_l[b][k]]
		punpckldq	mm1,mm1			; duplicate the weight into both halves
		pfmul	mm1,[edx+eax*8]		; weight * {src[k][0], src[k][1]}
		inc		eax					; next k
		pfadd	mm0,mm1				; accumulate
		dec		ecx
		jnz		.term

		movq	[edi],mm0			; dest[b][0..1] = sum
		add		ebx,4 * CBANDS		; next row of s3_l
		add		esi,8				; next s3ind entry
		add		edi,8				; next dest pair

		dec		ebp
		jnz		.band

		femms						; leave MMX/3DNow! state
		pop		ebp
		pop		edi
		pop		esi
		pop		ebx
		ret

;------------------------------------------------------------------------
;	99/11/09 8k[clk]@PIII
;	by K.SAKAI
;void sprdngf_SSE( float dest[CBANDS][2], float src[CBANDS][2] );
;
;	SSE implementation of dest[b][c] = sum_k s3_l[b][k] * src[k][c].
;	Band 0 is computed with x87 over k = 0..2; the remaining 62 bands are
;	processed two at a time (b, b+1) in one xmm register, summing k from
;	s3ind_C[b][0] to s3ind_C[b+1][1] inclusive for BOTH bands.
;	NOTE(review): sharing one k range between the two bands presumably relies
;	on s3_l being zero outside each band's own range -- confirm with the code
;	in l3psy.c that fills s3_l.
;	Arguments are read from the stack; ebx, esi and edi are saved/restored.
		align 16
sprdngf_SSE:
		push	ebx
		push	esi
		push	edi
%assign _P 3*4
		mov		edi,[esp+_P+4]		;edi = dest
		mov		edx,[esp+_P+8]		;edx = src
		mov		ebx,s3_l			;ebx = offset of s3_l

; band 0: k = 0..2 (the s3ind_C table holds {0, 2} for this band)
		fld		dword [edx+0*8]		;src[0][0]
		fld		dword [ebx+0*4]		;s3_l[0][0]
		fmul	st1,st0				;st1 = s3_l[0][0]*src[0][0]
		fmul	dword [edx+0*8+4]	;st0 = s3_l[0][0]*src[0][1]

		fld		dword [edx+1*8]		;src[1][0]
		fld		dword [ebx+1*4]		;s3_l[0][1]
		fmul	st1,st0
		fmul	dword [edx+1*8+4]	;src[1][1]
		fxch						;bring the column-0 product to st0
		faddp	st3,st0				;column-0 sum += product
		faddp	st1,st0				;column-1 sum += product

		fld		dword [edx+2*8]		;src[2][0]
		fld		dword [ebx+2*4]		;s3_l[0][2]
		fmul	st1,st0
		fmul	dword [edx+2*8+4]	;src[2][1]
		fxch
		faddp	st3,st0
		faddp	st1,st0

		fxch						;st0 = column-0 sum, st1 = column-1 sum
		fstp	dword [edi  ]		;dest[0][0]
		fstp	dword [edi+4]		;dest[0][1]

		mov		esi,s3ind_C+8		;esi = s3ind_C[1]
		add		edi,8				;next dest
		add		ebx,4 * CBANDS		;ebx -> s3_l[1][0]
		jmp		short .f0

		align	16
;	for(b = 1; b < CBANDS; b += 2){
.f0:
.lp0:
		mov		eax,[esi]			; eax = k = ind[b][0]
		mov		ecx,[esi+12]		; ecx = ind[b+1][1] (last k, inclusive)
		add		esi,16				; advance two s3ind_C entries

		; first term initialises the accumulator xmm0
		movlps	xmm1,[edx+eax*8]	; {src[k][1], src[k][0]}
		movlhps	xmm1,xmm1			; replicate the pair into the high half
		movss	xmm0,[ebx         +eax*4]	;s3_l[b  ][k]
		movss	xmm7,[ebx+4*CBANDS+eax*4]	;s3_l[b+1][k]
		shufps	xmm0,xmm7,0			; {w_b, w_b, w_b+1, w_b+1}
		mulps	xmm0,xmm1
		inc		eax
		jmp		short .f1

		align	16
.f1:
.lp1:
		movlps	xmm1,[edx+eax*8]	; {src[k][1], src[k][0]}
		movlhps	xmm1,xmm1
		movss	xmm6,[ebx         +eax*4]	;s3_l[b  ][k]
		movss	xmm7,[ebx+4*CBANDS+eax*4]	;s3_l[b+1][k]
		shufps	xmm6,xmm7,0
		mulps	xmm6,xmm1
		inc		eax
		cmp		eax,ecx				; flags survive the addps below
		addps	xmm0,xmm6
		jbe		.lp1				; continue while k <= last k

		movups	[edi],xmm0			; dest[b][0..1], dest[b+1][0..1]
		add		edi,16				;next dest
		add		ebx,8*CBANDS		;skip two rows of s3_l

		cmp		esi,s3ind_C_end
		jb		.lp0
;	}

		pop		edi
		pop		esi
		pop		ebx
		ret

;------------------------------------------------------------------------
;	99/09/03 9k[clk]@PII/PIII
;	by K.SAKAI
;void sprdngf_FPU( float dest[CBANDS][2], float src[CBANDS][2] );
;
;	x87 implementation.  For every band b (0 .. CBANDS-1):
;	    dest[b][0..1] = sum of s3_l[b][k] * src[k][0..1]
;	with k starting at s3ind[b][0] and running for s3ind[b][1] terms.
;	Arguments are taken from the stack; ebx, esi, edi and ebp are saved
;	and restored.  The x87 stack is balanced on return.
	align 16
sprdngf_FPU:
		push	ebx
		push	esi
		push	edi
		push	ebp
%assign _P 4*4
		mov		ebp,CBANDS			; ebp = bands still to process
		mov		esi,s3ind			; esi -> {first k, term count} of current band
		mov		ebx,s3_l			; ebx -> s3_l[b][0]
		mov		edx,[esp+_P+8]		; edx = src
		mov		edi,[esp+_P+4]		; edi = dest
		jmp		.band				; hop over the alignment padding

		align	16
.band:
		mov		ecx,[esi+4]			; ecx = number of terms for this band
		mov		eax,[esi]			; eax = first k
		fldz						; column-0 accumulator
		fldz						; column-1 accumulator (on top)
		align	4
.term:
		fld		dword [ebx+eax*4]		; w = s3_l[b][k]
		fld		dword [edx+eax*8]		; src[k][0]
		fld		dword [edx+eax*8+4]		; src[k][1]
		fxch	st2						; st0 = w, st1 = src[k][0], st2 = src[k][1]
		fmul	st1,st0					; st1 = w * src[k][0]
		fmulp	st2,st0					; st1 = w * src[k][1] after the pop
		faddp	st3,st0					; column-0 accumulator += w * src[k][0]
		faddp	st1,st0					; column-1 accumulator += w * src[k][1]
		inc		eax						; next k
		dec		ecx
		jnz		.term

		fstp	dword [edi+4]		; dest[b][1]
		fstp	dword [edi]			; dest[b][0]
		add		ebx,4 * CBANDS		; next row of s3_l
		add		esi,8				; next s3ind entry
		add		edi,8				; next dest pair

		dec		ebp
		jnz		.band

		pop		ebp
		pop		edi
		pop		esi
		pop		ebx
		ret

;************************************************************************
;	* frame_shiftin_NONE *
;	99/10/10  assumes (576 + EXTRADELAY) is a multiple of 8 (the shift loop
;	          below copies 8 dwords per iteration)
;	99/10/29  revision (original note was mis-encoded)
;
;	Plain integer version.  For each channel:
;	  1. moves (576 + EXTRADELAY) ints from mfbuf[ch][samplesPerFrame..]
;	     down to mfbuf[ch][0..];
;	  2. sign-extends samplesPerFrame 16-bit samples from frmBuffer[ch] into
;	     the ints that follow.
;	samplesPerFrame is divided by 8 for the second loop, so it is expected
;	to be a non-zero multiple of 8.
;	proc/arg/sp()/pushd/popd/endproc, the rN register names and
;	dwsize/dwsizen()/wsizen() are macros presumably defined in nasm.h
;	(not visible here) -- confirm there.

;EXTRADELAY defined in musenc.c
%define EXTRADELAY 56

;void frame_shiftin_NONE(int *mfbuf, short *frmBuffer, int samplesPerFrame, int stereo);

proc frame_shiftin_NONE

%$mfbuf				arg		4
%$frmBuffer			arg		4
%$samplesPerFrame	arg		4
%$stereo			arg		4

	pushd	ebp, ebx, esi, edi

.lp_ch:										; once per channel
	mov		r1, [sp(%$mfbuf)]				;r1=mfbuf[ch][0]
	mov		r0, [sp(%$samplesPerFrame)]
	lea		r0, [r1+r0*dwsize]				;r0=mfbuf[ch][samplesPerFrame]
	mov		r2, (576 + EXTRADELAY)/8		;r2=iterations of 8 ints each
	jmp		.lp1

	align	16
.lp1:										; shift loop: loads and stores are interleaved
	mov		r3, [r0+dwsizen(0)]
	mov		r4, [r0+dwsizen(1)]
	mov		[r1+dwsizen(0)], r3
	mov		r5, [r0+dwsizen(2)]
	mov		[r1+dwsizen(1)], r4
	mov		r6, [r0+dwsizen(3)]
	mov		[r1+dwsizen(2)], r5
	mov		r3, [r0+dwsizen(4)]
	mov		[r1+dwsizen(3)], r6
	mov		r4, [r0+dwsizen(5)]
	mov		[r1+dwsizen(4)], r3
	mov		r5, [r0+dwsizen(6)]
	mov		[r1+dwsizen(5)], r4
	mov		r6, [r0+dwsizen(7)]
	mov		[r1+dwsizen(6)], r5
	add		r0, dwsizen(8)
	mov		[r1+dwsizen(7)], r6

	add		r1, dwsizen(8)
	dec		r2
	jnz		.lp1

	mov		r0, [sp(%$frmBuffer)]	;r0=frmBuffer[ch][0]
									;r1=mfbuf[ch][576+EXTRADELAY]
	mov		r2, [sp(%$samplesPerFrame)]
	shr		r2, 3					;r2=samplesPerFrame/8 iterations
	jmp		.lp2

	align	16
.lp2:								; widen loop: 8 x int16 -> 8 x int32 (sign-extended)
	movsx	r3, word [r0+wsizen(0)]
	movsx	r4, word [r0+wsizen(1)]
	mov		[r1+dwsizen(0)], r3
	movsx	r5, word [r0+wsizen(2)]
	mov		[r1+dwsizen(1)], r4
	movsx	r6, word [r0+wsizen(3)]
	mov		[r1+dwsizen(2)], r5
	movsx	r3, word [r0+wsizen(4)]
	mov		[r1+dwsizen(3)], r6
	movsx	r4, word [r0+wsizen(5)]
	mov		[r1+dwsizen(4)], r3
	movsx	r5, word [r0+wsizen(6)]
	mov		[r1+dwsizen(5)], r4
	movsx	r6, word [r0+wsizen(7)]
	mov		[r1+dwsizen(6)], r5
	add		r0, wsizen(8)
	mov		[r1+dwsizen(7)], r6

	add		r1, dwsizen(8)
	dec		r2
	jnz		.lp2

	dec		dword [sp(%$stereo)]	; the stack argument itself is the channel counter
	jz		.exit

	; advance both stack arguments to the next channel's row
	add		dword [sp(%$mfbuf)],		dwsizen(1152+576+EXTRADELAY)
	add		dword [sp(%$frmBuffer)],	wsizen(1152)
	jmp		.lp_ch

.exit:
	popd	ebp, ebx, esi, edi
endproc
;------------------------------------------------------------------------
;	* frame_shiftin_3DN *
;	99/10/10 3DNow! version; assumes (576 + EXTRADELAY) is a multiple of 8
;	         (the shift loop below copies 8 dwords per iteration)
;	99/10/29 revision (original note was mis-encoded)
;void frame_shiftin_3DN(int *mfbuf, short *frmBuffer, int samplesPerFrame, int stereo);
;
;	Same job as frame_shiftin_NONE, moving data through the MMX registers:
;	shift (576 + EXTRADELAY) ints down by samplesPerFrame, then append
;	samplesPerFrame sign-extended 16-bit samples, once per channel.
;	femms is issued on entry and on exit.
;	pmov / puplwd / puphwd are macros not visible here -- presumably
;	movq / punpcklwd / punpckhwd; confirm in nasm.h.

proc frame_shiftin_3DN

%$mfbuf				arg		4
%$frmBuffer			arg		4
%$samplesPerFrame	arg		4
%$stereo			arg		4

	femms
	pushd	ebp, ebx, esi, edi

.lp_ch:										; once per channel
	mov		r1, [sp(%$mfbuf)]				;r1=mfbuf[ch][0]
	mov		r0, [sp(%$samplesPerFrame)]
	lea		r0, [r1+r0*dwsize]				;r0=mfbuf[ch][samplesPerFrame]
	mov		r2, (576 + EXTRADELAY)/8		;r2=iterations of 8 ints each
	jmp		.lp1

	align	16
.lp1:										; shift loop: 4 x 8 bytes per iteration
	pmov	mm0, [r0+dwsizen(0)]
	pmov	mm1, [r0+dwsizen(2)]
	pmov	[r1+dwsizen(0)], mm0
	pmov	mm2, [r0+dwsizen(4)]
	pmov	[r1+dwsizen(2)], mm1
	pmov	mm3, [r0+dwsizen(6)]
	pmov	[r1+dwsizen(4)], mm2
	add		r0, dwsizen(8)
	pmov	[r1+dwsizen(6)], mm3

	add		r1, dwsizen(8)
	dec		r2
	jnz		.lp1

	mov		r0, [sp(%$frmBuffer)]	;r0=frmBuffer[ch][0]
									;r1=mfbuf[ch][576+EXTRADELAY]
	mov		r2, [sp(%$samplesPerFrame)]
	shr		r2, 3					;r2=samplesPerFrame/8 iterations
	pxor	mm2, mm2
	pxor	mm3, mm3
	jmp		.lp2

	align	16
.lp2:								; widen loop: 8 x int16 -> 8 x int32
	pmov	mm0, [r0+wsizen(0)]		; samples 0..3
	pmov	mm1, [r0+wsizen(4)]		; samples 4..7

	pxor	mm4, mm4
	pxor	mm5, mm5

	; interleave so each sample lands in the upper 16 bits of a dword ...
	puplwd	mm2, mm0
	puphwd	mm3, mm0
	puplwd	mm4, mm1
	puphwd	mm5, mm1

	; ... then an arithmetic shift right by 16 sign-extends it
	psrad	mm2, 16
	psrad	mm3, 16
	psrad	mm4, 16
	psrad	mm5, 16

	pmov	[r1+dwsizen(0)], mm2
	pmov	[r1+dwsizen(2)], mm3
	pmov	[r1+dwsizen(4)], mm4
	pmov	[r1+dwsizen(6)], mm5

	add		r0, wsizen(8)
	add		r1, dwsizen(8)

	dec		r2						; sets ZF for the jnz below
	pxor	mm2, mm2				; MMX ops leave EFLAGS untouched
	pxor	mm3, mm3
	jnz		.lp2

	dec		dword [sp(%$stereo)]	; the stack argument itself is the channel counter
	jz		.exit

	; advance both stack arguments to the next channel's row
	add		dword [sp(%$mfbuf)],		dwsizen(1152+576+EXTRADELAY)
	add		dword [sp(%$frmBuffer)],	wsizen(1152)
	jmp		.lp_ch

.exit:
	popd	ebp, ebx, esi, edi
	femms
endproc

;------------------------------------------------------------------------
;	99/11/10	first cut, 17.4k[clk] by K.SAKAI
; (576 + EXTRADELAY) must be a multiple of 8: the shift loop moves 8 ints
; per pass.  samplesPerFrame is expected to be 1152 or 576; the widening
; loop counts it down in steps of 8.
;void frame_shiftin_MMX(int (*mfbuf)[1152+576+EXTRADELAY], short (*frmBuffer)[1152], int samplesPerFrame, int stereo);
;
; Per channel: slide (576 + EXTRADELAY) ints of mfbuf[ch] down by
; samplesPerFrame, then append samplesPerFrame samples from frmBuffer[ch],
; sign-extended from 16 to 32 bits.
; Arguments are taken from the stack; esi and edi are saved and restored.
; eax, ecx, edx and mm0-mm3 are used as scratch.
			align	16
frame_shiftin_MMX:
		push	esi
		push	edi
%assign	_P	2*4
		mov		eax,[esp+_P+16]				; eax = stereo (channels left)
		mov		edx,[esp+_P+12]				; edx = samplesPerFrame
		mov		esi,[esp+_P+ 8]				; esi = frmBuffer[ch]
		mov		edi,[esp+_P+ 4]				; edi = mfbuf[ch]

.channel:
		mov		ecx,(576 + EXTRADELAY)/8	; passes of 8 ints each
		jmp		short .shift

		align	16
.shift:
		movq	mm0,[edi+edx*4+ 0]			; read 8 ints from samplesPerFrame ahead ...
		movq	mm1,[edi+edx*4+ 8]
		movq	mm2,[edi+edx*4+16]
		movq	mm3,[edi+edx*4+24]
		movq	[edi+ 0],mm0				; ... and store them at the current position
		movq	[edi+ 8],mm1
		movq	[edi+16],mm2
		movq	[edi+24],mm3
		add		edi,32
		dec		ecx
		jnz		.shift
		; here edi = &mfbuf[ch][576+EXTRADELAY]

		mov		ecx,edx						; ecx = samples left, walked from the end
		jmp		short .widen

		align	16
.widen:
		movq	mm1,[esi+ecx*2-16]			; samples ecx-8 .. ecx-5
		movq	mm3,[esi+ecx*2- 8]			; samples ecx-4 .. ecx-1
		punpcklwd	mm0,mm1					; each sample into the high word of a dword;
		punpcklwd	mm2,mm3					; the stale low words are shifted out below
		punpckhwd	mm1,mm1
		punpckhwd	mm3,mm3
		psrad	mm0,16						; arithmetic shift = sign extension
		psrad	mm1,16
		psrad	mm2,16
		psrad	mm3,16
		movq	[edi+ecx*4-32],mm0
		movq	[edi+ecx*4-24],mm1
		movq	[edi+ecx*4-16],mm2
		movq	[edi+ecx*4- 8],mm3

		sub		ecx,8
		jnz		.widen

		add		edi,1152*4					; edi = mfbuf[ch+1]
		add		esi,1152*2					; esi = frmBuffer[ch+1]
		dec		eax
		jnz		near .channel

		emms
		pop		edi
		pop		esi
		ret

;************************************************************************
;	99/12/23	by shigeo
;				sped up (34k clk -> 8.7k clk)  (translated; original mis-encoded)
;void calc_phase_3DN(
;	float abx_s[3][HBLKSIZE_s][2], float energy_s[3][HBLKSIZE_s],float *cw);
;
;	3DNow! version.  For k = 2..51 one value is computed from the three
;	rows of abx_s / energy_s and written to four consecutive cw entries
;	(cw[6] .. cw[205]); cw[206] .. cw[512] are then filled with 0.4.
;	A value whose final denominator is zero is stored as 0.
;	sqrt() is obtained as x * (1/sqrt(x)) using pfrsqrt + pfrsqit1 + pfrcpit2.
;	Arguments are read from the stack; ebx, edi, esi, ebp are saved/restored
;	(ebx is not otherwise used).  femms is issued on entry and exit.

		align 16
calc_phase_3DN:
		push	ebx
		push	edi
		push	esi
		push	ebp
%assign _P 4*4
		femms
		mov		ecx,50			;50 values of k -> 200 cw entries
		mov		edi,[esp+_P+4]	;edi=&abx_s[0][0][0]
		mov		edx,[esp+_P+8]	;edx=&energy_s[0][0]
		mov		ebp,[esp+_P+12]	;ebp=cw
		mov		esi,HBLKSIZE_s * 4	;esi=bytes per energy_s row (half an abx_s row)
		add		ebp,6*4		;ebp=&cw[j=6]
		add		edi,2*2*4	;edi=&abx_s[0][k=2][0]
		add		edx,  2*4	;edx=&energy_s[0][k=2]
		movq	mm6, [D_05_05]	;mm6=[0.5:0.5]
		;abx_s[0], [1], [2] <=> edi, edi+esi*2, edi+esi*4
		;energy_s[0], [1], [2] <=> edx, edx+esi, edx+esi*2
		jmp		.lp

		align	16
.lp:
		mov		eax,[edx]	;eax=en0 (raw float bits)
		test	eax,eax		;all-zero bit pattern <=> en0 == +0.0
		jz		.F0
		movq	mm4,[edi]	;[b1:a1]
		movd	mm1,eax		;mm1=[0:en0]=[0:den]
		pfrsqrt	mm2,mm1		;first approximation of 1/sqrt(en0)
		movq	mm0,mm4		;mm0=[b1:a1]
		movq	mm3,mm2
		pfmul	mm2,mm2
		psrlq	mm0,32		;mm0=[0:b1]
		pfrsqit1	mm2,mm1
		pfmul	mm0,mm4		;mm0=[0:a1*b1]
		pfrcpit2	mm2,mm3	;mm2=[*:1/sqrt(en0)]
		pfmul	mm4,mm4		;mm4=[b1^2:a1^2]
		pfmul	mm2,mm1		;mm2=[0:r1=sqrt(en0)]
		pfmul	mm4,[D_M05_05]	;mm4=[-b1^2/2:a1^2/2]
		pfadd	mm2,mm2		;mm2=[0:2*r1]=[0:tmp3]
		pfacc	mm4,mm4		;mm4=[*:(a1^2-b1^2)/2]
		punpckldq	mm0,mm4	;mm0=[numim:numre]
		jmp		.F1
.F0:
		; en0 == 0: den = 1, tmp3 = 0, numre = 1, numim = 0
		movd	mm1,[D_1_1]	;mm1=[0:den=1]		;not movq!
		pxor	mm2,mm2		;mm2=[0:tmp3=0]
		movq	mm0,mm1		;mm0=[numim=0:numre=1]
.F1:
		mov		eax,[edx+esi*2]	;eax=en2
		test	eax,eax
		jnz		.F3
		; en2 == 0: only swap the halves so mm0 has the layout .F4 expects
		punpckldq	mm3,mm0
		punpckhdq	mm0,mm3	;mm0=[numre:numim]
		jmp		.F4
.F3:
		movd	mm3,eax		;mm3=[0:en2]
		pfrsqrt	mm4,mm3
		movq	mm5,mm4
		pfmul	mm4,mm4
		pfrsqit1	mm4,mm3
		pfrcpit2	mm4,mm5	;mm4=[*:1/sqrt(en2)]
		pfmul	mm3,mm4		;mm3=[0:r2=sqrt(en2)]
		pfsub	mm2,mm3		;mm2=[0:tmp3=2*r1-r2]
		pfmul	mm1,mm3		;mm1=[0:den*r2]
		movq	mm3,[edi+esi*4]	;mm3=[b2:a2]
		movq	mm4,mm3
		movq	mm5,mm0		;mm5=[numim:numre]
		pfacc	mm4,mm4		;mm4=[*:a2+b2]
		pfmul	mm0,mm3		;mm0=[b2*numim:a2*numre]
		pfacc	mm5,mm5		;mm5=[*:numre+numim]
		pfmul	mm4,mm6		;mm4=[*:(a2+b2)/2]
		pfmul	mm4,mm5		;mm4=[*:tmp2]
		punpckldq	mm4,mm4	;mm4=[tmp2:tmp2]
		pfsubr	mm0,mm4		;mm0=tmp2-mm0=[numre':numim']
.F4:
		;mm0=[numre:numim], mm1=[0:den], mm2=[0:tmp3]
		pfrcp	mm3,mm1
		pfrcpit1	mm1,mm3
		pfrcpit2	mm1,mm3	;mm1=[*:1/den]
		pfmul	mm1,mm2		;mm1=[0:tmp3/den]
		punpckldq	mm1,mm1	;mm1=[tmp:tmp]
		pfmul	mm0,mm1		;mm0=[numre:numim]*tmp
		movd	mm3,[edx+esi]	;mm3=[0:en1]
		pfrsqrt	mm4,mm3
		movq	mm5,mm4
		pfmul	mm4,mm4
		pand	mm2,[D_ABS]	;mm2=[0:|tmp3|]
		pfrsqit1	mm4,mm3
		pfrcpit2	mm4,mm5	;mm4=[*:1/sqrt(en1)]
		pfmul	mm3,mm4		;mm3=[0:rn=sqrt(en1)]
		pfadd	mm2,mm3		;mm2=[0:den=rn+|tmp3|]
		movd	eax,mm2
		test	eax,eax		;den == 0 ?
		jnz		.F5
		pxor	mm1,mm1		;den == 0 -> cw = 0
		jmp		.F6
.F5:
		movq	mm1,[edi+esi*2]	;mm1=[bn:an]
		movq	mm4,mm1
		psrlq	mm4,32			;mm4=[0:bn]
		movq	mm3,mm1			;mm3=[bn:an]
		pfsub	mm3,mm4			;mm3=[*:an-bn]
		pfacc	mm1,mm1			;mm1=[*:an+bn]
		punpckldq	mm3,mm1		;mm3=[an+bn:an-bn]
		pfmul	mm3,mm6			;mm3=[(an+bn)/2:(an-bn)/2]
		pfsub	mm3,mm0			;mm3=[numre'':numim'']
		pfmul	mm3,mm3
		pfacc	mm3,mm3			;mm3=[*:re^2+im^2]
		pfrsqrt	mm1,mm3
		movq	mm4,mm1
		pfmul	mm1,mm1
		pfrsqit1	mm1,mm3
		pfrcpit2	mm1,mm4		;mm1=[*:1/sqrt(re^2+im^2)]
		pfmul	mm1,mm2			;mm1=[*:den/sqrt(re^2+im^2)]=[*:1/cw[j]]
		pfrcp	mm0,mm1
		pfrcpit1	mm1,mm0
		pfrcpit2	mm1,mm0		;mm1=[*:cw[j]]
		punpckldq	mm1,mm1		;mm1=[cw[j]:cw[j]]
.F6:
		movq	[ebp],mm1		;cw[j], cw[j+1]
		movq	[ebp+8],mm1		;cw[j+2], cw[j+3]
		add		edi,8			;next k in abx_s
		add		edx,4			;next k in energy_s
		add		ebp,16			;j += 4
		dec		ecx
		jnz		near .lp

		; fill cw[206 .. 512] with 0.4: 76 passes of 4 floats, then 3 more
		mov		ecx,76	;=[(HBLKSIZE-206)/4] HBLKSIZE=513 rem=3
		movq	mm0,[D_04_04]
		jmp		.F7
		align	16
.F7:	movq	[ebp],mm0
		movq	[ebp+8],mm0
		add		ebp,16
		loop	.F7
		movq	[ebp],mm0		;cw[510], cw[511]
		movd	[ebp+8],mm0		;cw[512]
		pop		ebp
		pop		esi
		pop		edi
		pop		ebx
		femms
		ret

;************************************************************************
;	99/11/22	Initial version by K.SAKAI
; With use_reciprocal defined: 3.5k(ST)/4.3k(MT) clocks, but the output file
; changes.  Undefined: 5.7k(ST)/6.2k(MT) clocks, output file unchanged.
; (translated from a mis-encoded comment)
;
;void calc_phase_SSE(float abx_s[3][HBLKSIZE_s][2], float energy_s[3][HBLKSIZE_s],float *cw)
;
; SSE version.  Each pass of the main loop handles four consecutive k
; (starting at k = 2) and yields four results c0..c3, each replicated into
; four cw entries.  c0/c1 are stored at the end of the pass; c2/c3 stay in
; xmm7 and are stored at the top of the NEXT pass (.lp0).  The last pass
; never reaches .lp0 again, so its c2/c3 (cw[206..213]) are dropped, which
; is fine because cw[206..512] are then overwritten with 0.4.
;
; Arguments are read from the stack.  ebx is saved/restored; a 16-byte
; aligned scratch slot is carved out of the stack (original esp is kept at
; [esp+16]) so that r1 can be spilled with movaps.
;
; Fix: in the active (non-use_reciprocal) path the last step used to mask
; the numerator and THEN divide by den, so den == 0 produced 0/0 = NaN
; instead of the 0.0 required by the pseudo-code below.  The divide is now
; done first and the mask applied to the quotient.
;
; NOTE(review): the disabled use_reciprocal paths derive their "non-zero"
; masks with cmpordps on rsqrtps/rcpps results; those instructions return
; infinity (not NaN) for a zero input, so the masks look wrong for zero
; energies.  Left untouched because the path is compiled out -- verify
; before enabling it.
;{
;%define	use_reciprocal
		align	16
calc_phase_SSE:
		push	ebx
		mov		ebx,[esp+ 8]	; = abx_s
		mov		eax,[esp+12]	; = energy_s
		mov		edx,[esp+16]	; = cw
		mov		ecx,esp			; remember the incoming stack pointer
		sub		esp,20			; room for 16 bytes of scratch + saved esp
		and		esp,~15			; align the scratch area for movaps
		mov		[esp+16],ecx	; saved esp lives just above the scratch
		mov		ecx,6			; ecx = j
		jmp		short .f0

;	for ( j = 6; j < 206; j += 4 ){
;		k = (j+2) / 4;
; (the code advances j by 16 per pass, i.e. four k at a time)
		align	16
.lp0:
		; deferred store of c2/c3 from the previous pass (ecx already advanced by 16)
		movlps	[edx+ecx*4-64+32],xmm7
		movlps	[edx+ecx*4-64+40],xmm7
		movhps	[edx+ecx*4-64+48],xmm7
		movhps	[edx+ecx*4-64+56],xmm7

.f0:
;		/* square (x1,y1) */
;		den = energy_s[0][k];
;		a1 = abx_s[0][k][0];
;		b1 = abx_s[0][k][1];

		movlps	xmm0,[eax+ecx+ 2]
		movhps	xmm0,[eax+ecx+10]	; = energy_s[0][k+3:k]
		movlps	xmm4,[ebx+ecx*2+4+24]
		movlps	xmm3,[ebx+ecx*2+4+16]
		movlps	xmm2,[ebx+ecx*2+4+ 8]
		movlps	xmm1,[ebx+ecx*2+4]
		unpcklps	xmm3,xmm4		; de-interleave the (a,b) pairs
		unpcklps	xmm1,xmm2
		movaps	xmm2,xmm3
		movhlps	xmm2,xmm1			; = abx_s[0][k+3:k][1]
		movlhps	xmm1,xmm3			; = abx_s[0][k+3:k][0]

		movaps	xmm4,xmm1
		mulps	xmm1,xmm1
		mulps	xmm4,xmm2			; = a1*b1
		mulps	xmm2,xmm2
		subps	xmm1,xmm2
		mulps	xmm1,[Q_05]			; = (a1*a1-b1*b1)*0.5

;		r1    = (den) ? sqrt( den )       : 0.0;
;		numim = (den) ? (a1*a1-b1*b1)*0.5 : 0.0;
;		numre = (den) ? a1*b1             : 1.0;
;		den   = (den) ? den               : 1.0;

%ifdef use_reciprocal
		rsqrtps	xmm3,xmm0
		movaps	xmm2,xmm3
		cmpordps	xmm2,xmm2		; = (den)? all one: all zero

		andps	xmm3,xmm2
		mulps	xmm3,xmm0			; = r1
%else
		xorps	xmm2,xmm2
		cmpneqps	xmm2,xmm0		; = (den)? all one: all zero
		sqrtps	xmm3,xmm0
		andps	xmm3,xmm2			; = r1
%endif

		movaps	[esp],xmm3			; spill r1 to the aligned scratch slot
		andps	xmm1,xmm2			; = numim
		andps	xmm4,xmm2
		andps	xmm0,xmm2
		andnps	xmm2,[Q_1]			; = (den)? 0.0 : 1.0
		orps	xmm4,xmm2			; = numre
		orps	xmm0,xmm2			; = den

;		/* multiply by (x2,-y2) */
;		r2 = energy_s[2][k];
;		a2 = abx_s[2][k][0];
;		b2 = abx_s[2][k][1];
; xmm1 = numim, xmm4 = numre, xmm0 = den

		movlps	xmm2,[eax+2*HBLKSIZE_s*4+ecx+ 2]
		movhps	xmm2,[eax+2*HBLKSIZE_s*4+ecx+10]	; = energy_s[2][k+3:k]
		movlps	xmm5,[ebx+2*HBLKSIZE_s*8+ecx*2+4]
		movlps	xmm6,[ebx+2*HBLKSIZE_s*8+ecx*2+4+ 8]
		movhps	xmm6,[ebx+2*HBLKSIZE_s*8+ecx*2+4+16]
		movhps	xmm7,[ebx+2*HBLKSIZE_s*8+ecx*2+4+24]
		unpcklps	xmm5,xmm6
		unpckhps	xmm6,xmm7
		movaps	xmm3,xmm5
		movlhps	xmm5,xmm6			; = abx_s[2][k+3:k][0]
		movhlps	xmm6,xmm3			; = abx_s[2][k+3:k][1]

;		tmp2 = (numim+numre)*(a2+b2)*0.5;
;		tmp1 = -a2*numre+tmp2;
;		tmp2 = -b2*numim+tmp2

		movaps	xmm7,xmm5
		addps	xmm7,xmm6
		mulps	xmm7,[Q_05]
		movaps	xmm3,xmm1
		addps	xmm3,xmm4
		mulps	xmm7,xmm3			; = tmp2

		movaps	xmm3,xmm7
		mulps	xmm5,xmm4
		subps	xmm3,xmm5			; = tmp1
		mulps	xmm6,xmm1
		subps	xmm7,xmm6			; = tmp2

;		r2    = (energy_s[2][k]) ? sqrt( energy_s[2][k] ) : 0.0;
;		numim = (energy_s[2][k]) ? tmp1                   : numim;
;		numre = (energy_s[2][k]) ? tmp2                   : numre;
;		den   = (energy_s[2][k]) ? den*r2                 : den;

; xmm0 = den, xmm1 = numim, xmm2 = r2, xmm3 = tmp1, xmm4 = numre, xmm7 = tmp2

%ifdef use_reciprocal
		rsqrtps	xmm5,xmm2
		movaps	xmm6,xmm5
		cmpordps	xmm6,xmm6		; = (den)? all one: all zero

		andps	xmm5,xmm6
		mulps	xmm2,xmm5			; = r2
%else
		xorps	xmm6,xmm6
		cmpneqps	xmm6,xmm2		; = (energy_s[2][k])? all one: all zero
		sqrtps	xmm2,xmm2
		andps	xmm2,xmm6			; = r2
%endif

		movaps	xmm5,xmm2
		mulps	xmm5,xmm0			; = den*r2

		; branch-free select: new values where the mask is set, old ones elsewhere
		andps	xmm3,xmm6
		andps	xmm7,xmm6
		andps	xmm5,xmm6
		xorps	xmm6,[Q_not]		; invert the mask
		andps	xmm1,xmm6
		andps	xmm4,xmm6
		andps	xmm0,xmm6
		orps	xmm1,xmm3			; = numim
		orps	xmm4,xmm7			; = numre
		orps	xmm0,xmm5			; = den

;		tmp3 = r1 + r1 - r2;
;		tmp = tmp3 / den;
;		numre *= tmp;
;		numim *= tmp;

; xmm0 = den, xmm1 = numim, xmm2 = r2, xmm4 = numre

%ifdef use_reciprocal
		rcpps	xmm0,xmm0			; = 1.0/den
		movaps	xmm3,[esp]
		addps	xmm3,xmm3
		subps	xmm3,xmm2			; = tmp3
		mulps	xmm0,xmm3			; = tmp
		andps	xmm3,[Q_abs]		; = tmp3 = fabs(tmp3)
		mulps	xmm4,xmm0			; = numre
		mulps	xmm1,xmm0			; = numim
%else
		movaps	xmm3,[esp]			; reload r1
		addps	xmm3,xmm3
		subps	xmm3,xmm2			; = tmp3
		movaps	xmm2,xmm3
		divps	xmm2,xmm0			; = tmp
		andps	xmm3,[Q_abs]		; = tmp3 = fabs(tmp3)
		mulps	xmm4,xmm2			; = numre
		mulps	xmm1,xmm2			; = numim
%endif

;		rn = sqrt( energy_s[1][k] );

		movlps	xmm2,[eax+1*HBLKSIZE_s*4+ecx+ 2]
		movhps	xmm2,[eax+1*HBLKSIZE_s*4+ecx+10]	; = energy_s[1][k+3:k]
%ifdef use_reciprocal
		rsqrtps	xmm0,xmm2
		mulps	xmm2,xmm0			; = rn
%else
		sqrtps	xmm2,xmm2
%endif

;		den = rn + fabs( tmp3 );
;		an = abx_s[1][k][0];
;		bn = abx_s[1][k][1];
; xmm1 = numim, xmm2 = rn, xmm3 = tmp3, xmm4 = numre

		addps	xmm3,xmm2			; = den
		movlps	xmm5,[ebx+1*HBLKSIZE_s*8+ecx*2+4]
		movlps	xmm6,[ebx+1*HBLKSIZE_s*8+ecx*2+4+ 8]
		movhps	xmm6,[ebx+1*HBLKSIZE_s*8+ecx*2+4+16]
		movhps	xmm7,[ebx+1*HBLKSIZE_s*8+ecx*2+4+24]
		unpcklps	xmm5,xmm6
		unpckhps	xmm6,xmm7
		movaps	xmm2,xmm5
		movlhps	xmm5,xmm6			; = abx_s[1][k+3:k][0]
		movhlps	xmm6,xmm2			; = abx_s[1][k+3:k][1]

		movaps	xmm2,[Q_05]
		movaps	xmm7,xmm5
		addps	xmm5,xmm6			; = an + bn
		subps	xmm7,xmm6			; = an - bn
		mulps	xmm5,xmm2
		mulps	xmm7,xmm2
		subps	xmm5,xmm4			; = (an+bn)*0.5 - numre
		subps	xmm7,xmm1			; = (an-bn)*0.5 - numim
		mulps	xmm5,xmm5
		mulps	xmm7,xmm7
		addps	xmm5,xmm7			; = numre^2 + numim^2
%ifdef use_reciprocal
		rsqrtps	xmm0,xmm5
		mulps	xmm5,xmm0
%else
		sqrtps	xmm5,xmm5
%endif

;		numre = (den) ? ( an + bn ) * 0.5 - numre         : numre;
;		numim = (den) ? ( an - bn ) * 0.5 - numim         : numim;
;		cw[j] = (den) ? sqrt(numre*numre+numim*numim)/den : 0.0;
;		cw[j+1] = cw[j+2] = cw[j+3] = cw[j];

%ifdef use_reciprocal
		rcpps	xmm3,xmm3			; = 1/den
		movaps	xmm2,xmm3
		cmpordps	xmm2,xmm2
		andps	xmm3,xmm2
		mulps	xmm5,xmm3
%else
		xorps	xmm2,xmm2
		cmpneqps	xmm2,xmm3		; = (den)? all one: all zero
		divps	xmm5,xmm3			; divide first: den == 0 lanes become Inf/NaN ...
		andps	xmm5,xmm2			; ... and are forced to 0.0 by the mask
%endif
		movaps	xmm7,xmm5
		unpcklps	xmm5,xmm5		; = {c0,c0,c1,c1}
		unpckhps	xmm7,xmm7		; = {c2,c2,c3,c3}, stored at .lp0 on the next pass
		movlps	[edx+ecx*4   ],xmm5
		movlps	[edx+ecx*4+ 8],xmm5
		movhps	[edx+ecx*4+16],xmm5
		movhps	[edx+ecx*4+24],xmm5

		add		ecx,16
		cmp		ecx,206
		jl		near .lp0
;	}
		movaps	xmm0,[Q_04]
		mov		ecx,208
		movlps	[edx+206*4],xmm0	; cw[206], cw[207]
		jmp		short .f1

		align	16
;	for( j = 208; j < HBLKSIZE - 1; j++ ){
.lp1:
.f1:
;		cw[j] = 0.4;
%if 1
		movlps	[edx+ecx*4],xmm0
		movlps	[edx+ecx*4+ 8],xmm0
		movlps	[edx+ecx*4+16],xmm0
		movlps	[edx+ecx*4+24],xmm0
%else
		movaps	[edx+ecx*4   ],xmm0
		movaps	[edx+ecx*4+16],xmm0
%endif

		add		ecx,8
		cmp		ecx,HBLKSIZE-3
		jl		.lp1
;	}
;		movss	[edx+ecx*4+8],xmm0
		movss	[edx+ecx*4+0],xmm0	;bug fix by PEN ( 00/01/13 ) -- ecx = 512 here, the last cw entry

		mov		esp,[esp+16]		; restore the caller's stack pointer
		pop		ebx
		ret
;}
;************************************************************************
;void calc_pe_3DN( float *pe, int num, float *thr, float *ebc, int *numlines );
;	by shigeo
;	00/03/02 4000 clk on K6-2
;
;	Computes
;	    *pe = - sum over i < num of numlines[i] * log((thr[i]+1)/(ebc[i]+1))
;	taking only the terms where ebc[i]+1 > thr[i]+1.
;	thr and numlines advance 4 bytes per element, ebc advances 8 bytes.
;	NOTE(review): the 8-byte ebc stride presumably means ebc points into an
;	array of float pairs -- confirm against the caller.
;	num must be non-zero (the loop is do/while shaped).
;
;	Log routine notes (translated from mis-encoded comments):
;   99/02/11    38 clocks including femms; max error 1.25e-7 for 1<=x<2
;   (faster than AMD's LOG.ASM)
;   Method:
;   x=2^b*a, 1<=a<2, b: integer
;   x=[s:b:a]   s:1bit, b:8bit, a:23bit
;   z=(a-sqrt(2))/(a+sqrt(2))
;   logx=(2b+1)*log2/2+2{z+z^3/3+z^5/5+z^7/7}
;
;   The mantissa is isolated with "or 3F800000h" only, which forces the
;   exponent field to 127 only when it is already <= 127 (x < 2).  Larger x
;   yield a wrong log, but those lanes are zeroed by the ebc > thr mask.

;    input :x=mm0
;   output ;mm0=log(|x|)
;   destroy:mm0,mm1,mm2,mm3

proc	calc_pe_3DN
		push	ebx
		push	esi
		push	edi
%assign _P 4*3
		mov		ecx,[esp+_P+8]	;ecx=num
		mov		ebx,[esp+_P+12]	;ebx=thr
		mov		edx,[esp+_P+16]	;edx=ebc
		mov		edi,[esp+_P+20]	;edi=numlines
		femms
		movq	mm5,[D_1_1]		;mm5=[*:1]
		pxor	mm7,mm7		;mm7=running sum (ends up as -pe before negation by pfsub)
		movq	mm4,[log_data5]	;mm4=[0:log2/2]
		jmp		short .lp
		align	16
.lp:
		movd	mm0,[ebx]		;mm0=thr[i]
		add		ebx,4
		movd	mm6,[edx]		;mm6=ebc[i]
		add		edx,8
		pfadd	mm0,mm5			;thr+1
		pfadd	mm6,mm5			;ebc+1
		movq	mm2,mm6
		pfcmpgt	mm6,mm0		;mm6 = (ebc>thr) ? all ones : 0	;used later as a mask
		pfrcp	mm3,mm2
		pfrcpit1	mm2,mm3
		pfrcpit2	mm2,mm3	;mm2= 1 / (ebc + 1)
		pfmul	mm0,mm2		;x=(thr+1)/(ebc+1)
		movd	eax,mm0
		mov		esi,eax
		movq    mm1,[log_data1]	 ;mm1=[-sqrt(2):sqrt(2)]
		and     esi,7F800000h       ;exponent field
		or      eax,3F800000h       ;eax=a (mantissa with exponent forced to 127, valid for x<2)
		shr     esi,22	      ;esi=2*(b+127)
		movd    mm0,eax
		punpckldq   mm0,mm0	 ;mm0=[a:a]
		movq    mm3,[log_data2]	 ;mm3=[0:7/5]
		pfadd   mm1,mm0	     ;mm1=[a-sqrt(2):a+sqrt(2)]
		sub     esi,253	     ;esi=2*b+1
		movq    mm0,mm1
		movq    mm2,mm1
		pfrcp   mm1,mm0
		psrlq   mm2,32	      ;mm2=[0:a-sqrt(2)]
		pfrcpit1    mm0,mm1
		pfrcpit2    mm0,mm1	 ;mm0=[?:1/(a+sqrt(2))]
		pfmul   mm0,mm2	     ;mm0=[0:z]
		movq    mm2,[log_data3]	 ;m2=1/7
		movq    mm1,mm0	     ;m1=z
		pfmul   mm0,mm0	     ;m0=zz

		pfmul   mm2,mm0	     ;m2=zz/7
		pfadd   mm3,mm0	     ;m3=7/5+zz
		pfmul   mm2,mm3	     ;m2=zz(1/5+zz/7)
		pfadd   mm2,[log_data4]	 ;m2=1/3+zz(1/5+zz/7)
		pfmul   mm0,mm1	     ;m0=z^3
		movd    mm3,esi	     ;m3=(int)2b+1
		pfmul   mm0,mm2	     ;m0=z^3/3+z^5/5+z^7/7
		pi2fd   mm3,mm3	     ;m3=(float)2b+1
		pfadd   mm0,mm1	     ;m0=X:=z+z^3/3+z^5/5+z^7/7
		pfmul   mm3,mm4;=[log_data5]	 ;m3=(2b+1)*log2/2
		movd	mm2,[edi]	;mm2=numlines
		pfadd   mm0,mm0	     ;m0=2X
		pi2fd	mm2,mm2		;mm2=(float)numlines
		pfadd   mm0,mm3             ;m0=log(x)
		pand	mm0,mm6		;keep the term only when ebc>thr
		pfmul	mm0,mm2		;numlines*log(x)
		add		edi,4
		dec		ecx			;sets ZF for the jnz below (pfsub leaves EFLAGS alone)
		pfsub	mm7,mm0		;sum -= numlines*log(x)
		jnz		near	.lp
		movd	eax,mm7		;eax=float bits of the result
		femms
		mov		edx,[esp+_P+4]	;edx=pe
		pop		edi
		mov		[edx],eax	;*pe=result
		pop		esi
		pop		ebx
		ret
;************************************************************************
	end
