
;	optimized mdct() for new GOGO-no-coda (1999/09)
;	Copyright (C) 1999 shigeo
;	special thanks to Keiichi SAKAI

;	loop.cȤ

;礭ޤǤȤȤϤߤåפˤʤ
PRECALC_SIZE	equ 8206

%include "nasm.h"
%include "grinfo.inc"

	globaldef quantize_xrpow_FPU
	globaldef quantize_xrpow_3DN
	globaldef quantize_xrpow_SSE
%ifdef USE_E3DN
	globaldef quantize_xrpow_E3DN
%endif
	globaldef calc_pow4P3mono
	globaldef calc_pow4P3dual_FPU
	globaldef calc_pow4P3dual_3DN
	globaldef calc_pow4P3dual_SSE
	globaldef pow4P3_table
;	globaldef calc_runlen
	globaldef ix_max_NONE
	globaldef ix_max_MMX
	globaldef ix_max_3DN
	globaldef ix_max_SSE
	globaldef calc_pow075_3DN
	globaldef calc_pow075_SSE
	globaldef calc_pow075_FPU
	globaldef calc_pow075_NONE
%ifdef USE_E3DN
	globaldef calc_pow075_E3DN
%endif
	globaldef ms_convert_FPU
	globaldef ms_convert_3DN
	globaldef ms_convert_SSE
%ifdef USE_E3DN
	globaldef ms_convert_E3DN
%endif
;	globaldef count1_bitcount
	externdef ix_max
F_SIZE	equ	4
%define F_PTR	dword

	segment_data

		align 16
_M01875	dd	-0.1875
_05		dd	0.5
_1		dd	1.0
_00946	dd	0.0946
D_04054	dd	0.4054,0.4054	;=1-0.0946
_4P3	dd	1.33333333333	;=4/3
		align 16
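D_MSB1_1	dd	0x80000000,0x80000000	; sign-bit mask for both packed floats
D_MSB0_1	dd	0x80000000,0			; sign-bit mask for the low float only; must stay directly after D_MSB1_1 (calc_pow4P3dual_3DN reads D_MSB1_1 as [D_MSB0_1-8])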
D_05		dd	0.5,0.5
D_ABS		dd	0x7FFFFFFF,0x7FFFFFFF	;and

		align	8
_M025		dd	-0.25

; for pow(2, i/4); 
powiP4table	dd	1.000000000, 1.18920711498, 1.41421356237, 1.68179283048

; for pow(2, i/16);
powiP16table	dd	1.0000000000000000,1.0442737824274138,1.0905077326652577,1.1387886347566916
		dd	1.1892071150027210,1.2418578120734840,1.2968395546510096,1.3542555469368927
		dd	1.4142135623730951,1.4768261459394993,1.5422108254079407,1.6104903319492543
		dd	1.6817928305074290,1.7562521603732995,1.8340080864093424,1.9152065613971474

		align	8
D_0.4999	dd	0.499996011,0.499996011
D_1Psqr2	dd	0.70710678118,0.70710678118	;1/sqr2

;	lookup tables used by calc_pow075_NONE

; 6.26
;					(16/9)^.75, (32/9)^.75, (64/9)^.75, (128/9)^.75
pow075_table0	dd	0x06288D17, 0x0A5B6EB1, 0x116B28F5, 0x1D4B6220
;					(16/11)^.75, (32/11)^.75, (64/11)^.75, (128/11)^.75
				dd	0x054C448A, 0x08E8F61B, 0x0EFC1ABC, 0x1933886F
;					(16/14)^.75, (32/14)^.75, (64/14)^.75, (128/14)^.75
				dd	0x046BDCF6, 0x076F8F2F, 0x0C816474, 0x1508142B
pow075_table1	db	9, 11, 14, 0

		align	8
; number of 1 bits in the binary representation of each value 0..15
count1_table:	dd	0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4

; the hlen32[] part of huffcode.tbl
hlen32_table:	dd	1,4,4,5,4,6,5,6,4,5,5,6,5,6,6,6
	segment_bss
pow4P3_table: resd PRECALC_SIZE

	segment_code

;************************************************************************

;		98/07/12
;	in : st0
;	out: st0=2^st0
;	dst: none

		align	16
pow2x:
		; in : st0 = x
		; out: st0 = 2^x
		; Splits x into an integer y and a remainder f = x - y, then builds
		; 2^x = 2^f * 2^y with f2xm1 and fscale.  No integer register is
		; modified; one dword of stack is used as scratch.
		fld		st0					; x, x
		fsub	F_PTR [_05]			; x-0.5, x
		sub		esp,F_SIZE			; reserve the scratch dword
		fistp	F_PTR [esp]			; scratch = x-0.5 converted to integer (FPU rounding mode)
		fild	F_PTR [esp]			; y, x
		add		esp,F_SIZE			; scratch is no longer needed
		fxch						; x, y
		fsub	st0,st1				; f = x-y, y

		f2xm1						; 2^f - 1, y
		fadd	F_PTR [_1]			; 2^f, y
		fscale						; 2^f * 2^y, y

		fstp	st1					; drop y: st0 = 2^x
		ret

;************************************************************************

;	98/11/26
;	in :eax
;	out:st0=eax^(4/3)

		align 16
pow4P3:
		; in : eax = integer argument
		; out: st0 = eax^(4/3), computed as 2^((4/3)*log2(eax))
		; eax is preserved.
		;
		; The spill slot created by "push eax" is removed with "pop eax"
		; before returning.  The previous "ret 4" popped the spilled eax as
		; the return address (the real return address sat one dword higher),
		; so the routine could never return to its caller.
		push	eax					; spill eax so the FPU can load it from memory
		fld		dword [_4P3]		; 4/3
		fild	dword [esp]			; eax, 4/3
		fyl2x						; (4/3)*log2(eax)
		call	pow2x				; eax^(4/3)
		pop		eax					; release the spill slot and restore eax
		ret

;************************************************************************

;void quantize_xrpow( float xr[576], int ix[576], gr_info *cod_info );

;	99/08/03
;	11kclk

		align 16
quantize_xrpow_FPU:
		; cdecl args: [esp+4]=xr (float[576]), [esp+8]=ix (int[576]),
		;             [esp+12]=cod_info
		; Writes ix[i] = fistp(xr[i]*step - 0.0946) for all 576 samples, where
		; step = 2^(-3*quantizerStepSize/16), then tail-returns the value of
		; ix_max(ix, 0, 576) in eax.
		fld		dword [_00946]		; 0.0946
		mov		eax,[esp+12]		; eax = cod_info
		mov		eax,[eax+quantizerStepSize]
		neg		eax
		lea		eax,[eax+eax*2]		; eax = -3*quantizerStepSize
		call	pow2iP16			; step, 0.0946   (clobbers eax, edx)
		mov		edx,[esp+ 4]		; edx = xr
		mov		eax,[esp+8]			; eax = ix
		mov		ecx,576/2			; two samples per pass
		align	4
.pair:
		fld		st0					; step, step, 0.0946
		fmul	dword [edx]			; xr[0]*step, step, 0.0946
		fsub	st0,st2				; xr[0]*step - 0.0946, step, 0.0946
		fistp	dword [eax]			; store as integer (FPU rounding mode)

		fld		st0					; step, step, 0.0946
		fmul	dword [edx+4]		; xr[1]*step, step, 0.0946
		fsub	st0,st2				; xr[1]*step - 0.0946, step, 0.0946
		fistp	dword [eax+4]

		add		edx,8				; next pair of inputs
		add		eax,8				; next pair of outputs
		dec		ecx
		jnz		short .pair
		fcompp						; pop step and the bias: FPU stack as on entry

		; eax = ix_max(ix, 0, 576) through the ix_max function pointer
		push		dword 576
		push		dword 0
		push		dword [esp+8+8]	; the ix argument (two dwords already pushed)
		call		[ix_max]
		add		esp,12

		ret

;------------------------------------------------------------------------

;	99/08/03 by shigeo
;	3500clk
;	99/08/11
;	2300clk
;	99/08/13()
;	00/01/09 by shigeo
;	ix_max(830clk)Ȥ 2800clk

		align	16
quantize_xrpow_3DN:
		; 3DNow! quantizer.
		; cdecl args: [esp+4]=xr (float[576]), [esp+8]=ix (int[576]),
		;             [esp+12]=cod_info
		; Stores ix[i] = pf2id(xr[i]*step + 0.4054) and leaves in eax the
		; integer conversion of the largest biased value seen.
		; step = 2^(e/16) with e = -3*quantizerStepSize, assembled from an
		; IEEE exponent field (e>>4) and a table mantissa (powiP16table[e&15]).
		mov		eax,[esp+12]		;eax=cod_info
		mov		edx,[esp+ 4]		;edx=xr
		femms
		mov		eax,[eax+quantizerStepSize]
		neg		eax
		lea		ecx,[eax+eax*2]		;ecx = e = -3*quantizerStepSize
		mov		eax,ecx
		and		ecx,15				;ecx = e mod 16 (table index)
		shr		eax,4
		shl		eax,23				;(e>>4) moved into the float exponent field
		add		eax,0x3F800000		;eax = bit pattern of 2^(e>>4)
		movd		mm0,eax
		mov		eax,[esp+8]			;eax=ix
		pfmul		mm0,[powiP16table+ecx*4]	;low lane = step (8 bytes are read; only the low lane is used)
		movq	mm1,qword [D_04054]	;mm1=[0.4054:0.4054]
		punpckldq	mm0,mm0			;mm0=[step:step]
		pxor		mm7,mm7			;mm7=running max
		mov		ecx,576/8			;8 samples per iteration
		jmp	short .lp
		align	16
.lp:
		movq	mm2,[edx]			;xr[0..1]
		movq	mm3,[edx+8]			;xr[2..3]

		pfmul	mm2,mm0				;xr * step
		movq	mm4,[edx+16]		;xr[4..5]

		pfadd	mm2,mm1				;xr * step + 0.4054
		pfmul	mm3,mm0

		movq	mm5,[edx+24]		;xr[6..7]
		pfmax	mm7,mm2				;track the maximum before conversion
		pf2id	mm2,mm2				;convert to int (pf2id truncates)
		pfadd	mm3,mm1
		pfmul	mm4,mm0
		movq	[eax],mm2

		pfmax	mm7,mm3
		pf2id	mm3,mm3
		pfadd	mm4,mm1
		pfmul	mm5,mm0
		movq	[eax+8],mm3

		pfmax	mm7,mm4
		pf2id	mm4,mm4
		pfadd	mm5,mm1
		movq	[eax+16],mm4

		pfmax	mm7,mm5
		pf2id	mm5,mm5
		movq	[eax+24],mm5
		add		eax,32
		add		edx,32

		loop	.lp

		; reduce the two lanes of mm7 to one maximum and return it as an int
		movq	mm1,mm7
		psrlq	mm7,32
		pfmax	mm7,mm1
		pf2id	mm7,mm7
		movd	eax,mm7
		femms
		ret

;------------------------------------------------------------------------

;	00/01/09 by shigeo
;	ʤ®ʤʤ(T_T) 1270clk
;	E3DN̿ϻȤäƤʤɰAthlon̿¤(ΤϤ...)

%ifdef USE_E3DN
		align 16
quantize_xrpow_E3DN:
		; Variant of quantize_xrpow_3DN with prefetching and a single
		; negative index that counts up to zero.
		; cdecl args: [esp+4]=xr (float[576]), [esp+8]=ix (int[576]),
		;             [esp+12]=cod_info
		; Stores ix[i] = pf2id(xr[i]*step + 0.4054); eax returns the integer
		; conversion of the largest biased value.
		mov		eax,[esp+12]		;eax=cod_info
		mov		edx,[esp+ 4]		;edx=xr
		femms
		mov		eax,[eax+quantizerStepSize]
		neg		eax
		lea		ecx,[eax+eax*2]		;ecx = e = -3*quantizerStepSize
		mov		eax,ecx
		and		ecx,15				;table index e mod 16
		shr		eax,4
		shl		eax,23
		add		eax,0x3F800000		;bit pattern of 2^(e>>4)
		movd		mm0,eax
		mov		eax,[esp+8]			;eax=ix
		pfmul		mm0,[powiP16table+ecx*4]	;low lane = step
		mov			ecx,-576*4		;byte index, runs from -2304 up to 0
		movq	mm1,qword [D_04054]	;mm1=[0.4054:0.4054]
		punpckldq	mm0,mm0			;mm0=[step:step]
		pxor		mm7,mm7			;mm7=running max
		sub			eax,ecx			;eax = end of ix[]
		sub			edx,ecx			;edx = end of xr[]
		jmp	short .lp
		align	16
.lp:
		prefetch	[edx+ecx+32]
		prefetchw	[eax+ecx+32]
		movq	mm2,[edx+ecx]		;xr[0..1]
		movq	mm3,[edx+ecx+8]

		movq	mm4,[edx+ecx+16]
		movq	mm5,[edx+ecx+24]

		pfmul	mm2,mm0				;xr * step
		pfmul	mm3,mm0
		pfmul	mm4,mm0
		pfmul	mm5,mm0

		pfadd	mm2,mm1				;xr * step + 0.4054
		pfadd	mm3,mm1
		pfadd	mm4,mm1
		pfadd	mm5,mm1

		pfmax	mm7,mm2				;track the maximum before conversion
		pf2id	mm2,mm2				;convert to int (pf2id truncates)
		pfmax	mm7,mm3
		pf2id	mm3,mm3


		movq	[eax+ecx],mm2
		movq	[eax+ecx+8],mm3

		pfmax	mm7,mm4
		pf2id	mm4,mm4
		pfmax	mm7,mm5
		pf2id	mm5,mm5

		movq	[eax+ecx+16],mm4
		movq	[eax+ecx+24],mm5

		add		ecx,32				;8 samples done; ZF set when the index reaches 0
		jnz		.lp

		; reduce the two lanes of mm7 to one maximum and return it as an int
		movq	mm1,mm7
		psrlq	mm7,32
		pfmax	mm7,mm1
		pf2id	mm7,mm7
		movd	eax,mm7
		femms
		ret
%endif

;------------------------------------------------------------------------
;	by K. SAKAI
;	99/08/18	2.3k[clk], latency 礭®ʤʤ(;_;)
;	99/11/15	s/movups/movaps/ by K.SAKAI
;	00/01/21	1690[clk], ix_max  by K.SAKAI

		align 16
quantize_xrpow_SSE:
		; SSE quantizer.
		; cdecl args: [esp+4]=xr (float[576]), [esp+8]=ix (int[576]),
		;             [esp+12]=cod_info
		; Stores ix[i] = cvtps2pi(xr[i]*step - 0.0946) and returns in eax the
		; integer conversion of the largest biased value.
		; xr is read with movaps, so it must be 16-byte aligned.
		; The array is walked from the end towards the start, 12 floats per
		; iteration; the last two stores of an iteration (mm6, mm7) are
		; deferred to the top of the next iteration or to the loop exit.
		mov		edx,[esp+ 4]		;edx=xr
		mov		eax,[esp+12]		;eax=cod_info
		fild	F_PTR [eax+quantizerStepSize]
		fmul	F_PTR [_M01875]		;size * -0.1875
		mov		eax,[esp+8]			;eax=ix
		call	pow2x				;st0 = step = 2^(size * -0.1875)
		sub		esp,F_SIZE
		fstp	F_PTR [esp]			;pass step from the FPU to SSE through the stack
		mov		ecx,576*4			;byte offset of the end of the arrays
		movss	xmm0,[esp]
		add		esp,F_SIZE
		movss	xmm1,[_00946]
		shufps	xmm0,xmm0,0x00		;xmm0={step, step, step, step}
		shufps	xmm1,xmm1,0x00		;xmm1={0.0946, 0.0946, 0.0946, 0.0946}
		movaps	xmm2,[edx+ecx-16]	;*xr
		xorps	xmm7,xmm7			;xmm7 = running max
		jmp		.f0					;first pass: nothing pending to store

		align	16
.lp0:
		movaps	xmm2,[edx+ecx-16]	;*xr
		movq	[eax+ecx-48+48],mm6	;deferred stores from the previous iteration
		movq	[eax+ecx-40+48],mm7
.f0:
		movaps	xmm4,[edx+ecx-32]	;*xr
		mulps	xmm2,xmm0			;*xr * step
		subps	xmm2,xmm1			;step * *xr - 0.0946

		movaps	xmm6,[edx+ecx-48]	;*xr
		sub		ecx,48				;sets ZF for the jnz below; nothing in between writes EFLAGS
		mulps	xmm4,xmm0			;*xr * step
		subps	xmm4,xmm1			;step * *xr - 0.0946

		cvtps2pi	mm2,xmm2		;convert low two floats (MXCSR rounding mode; original note says it is set in haveunit.nas)
		movhlps	xmm3,xmm2
		maxps	xmm7,xmm2
		movq	[eax+ecx-16+48],mm2

		mulps	xmm6,xmm0			;*xr * step
		subps	xmm6,xmm1			;step * *xr - 0.0946
		cvtps2pi	mm3,xmm3

		cvtps2pi	mm4,xmm4		;convert low two floats (MXCSR rounding mode)
		movhlps	xmm5,xmm4
		maxps	xmm7,xmm4
		movq	[eax+ecx-8+48],mm3
		movq	[eax+ecx-32+48],mm4
		cvtps2pi	mm5,xmm5

		movhlps	xmm2,xmm6
		cvtps2pi	mm6,xmm6		;convert low two floats (MXCSR rounding mode)
		maxps	xmm7,xmm6
		movq	[eax+ecx-24+48],mm5
		cvtps2pi	mm7,xmm2
		jnz		.lp0

		movq	[eax-48+48],mm6		;flush the last deferred stores (ecx is 0 here)
		movq	[eax-40+48],mm7

		; horizontal maximum of the four lanes of xmm7, returned as an int
		movhlps	xmm1,xmm7
		maxps	xmm7,xmm1
		movss	xmm1,xmm7
		shufps	xmm7,xmm7,0xB1
		maxss	xmm7,xmm1
		cvtss2si	eax,xmm7
		emms
		ret

;************************************************************************

;	99/08/10
;	by shigeo
;	500clk

;void calc_pow4P3mono(int *ix,float *xr,float *step,float *sum,int n,int next);
		align 16
calc_pow4P3mono:
		; void calc_pow4P3mono(int *ix, float *xr, float *step, float *sum,
		;                      int n, int next);
		; *sum = sum over n samples of (|xr| - pow4P3_table[ix] * *step)^2,
		; advancing ix and xr by `next` elements per sample.
		; ix values index pow4P3_table directly; no range check is made.
		push	ebx
		push	esi
		push	edi
%assign _P 4*3
		mov		eax,[esp+_P+12]		; eax = step
		fldz						; s = 0
		fld		dword [eax]			; *step, s
		mov		ebx,[esp+_P+4]		; ebx = ix cursor
		mov		edx,[esp+_P+8]		; edx = xr cursor
		mov		ecx,[esp+_P+20]		; ecx = n
		mov		esi,[esp+_P+24]		; esi = next
		shl		esi,2				; esi = stride in bytes
		mov		edi,pow4P3_table	; edi = table base

		align	4
.term:
		mov		eax,[ebx]			; eax = *ix
		add		ebx,esi				; ix += next
		fld		dword [edi+eax*4]	; t = ix^(4/3), step, s
		fmul	st0,st1				; t*step, step, s
		fld		dword [edx]			; xr, t*step, step, s
		add		edx,esi				; xr += next
		fabs						; |xr|, t*step, step, s
		fsubrp	st1,st0				; d = |xr| - t*step, step, s
		fmul	st0,st0				; d*d, step, s
		faddp	st2,st0				; step, s += d*d
		dec		ecx
		jnz		short .term

		fstp	st0					; s
		mov		eax,[esp+_P+16]		; eax = sum
		fstp	dword [eax]			; *sum = s, FPU stack empty again
		pop		edi
		pop		esi
		pop		ebx
		ret

;************************************************************************

;	99/08/10
;	by shigeo
;	st0st7ȤĤΤǤδؿƤfreeǤʤȤʤ
;	1250clk
;	ɤƤpowȤǽΤ3D Now!Ϥäȿɤ
;	99/10/17
;	礭ΤpowȤǽ̵ʤ

;void
;calc_pow4P3dual_FPU(int *ix,float *xr,float *step,float *sum,int n,int next);

		align 16
calc_pow4P3dual_FPU:
		; void calc_pow4P3dual_FPU(int *ix, float *xr, float *step, float *sum,
		;                          int n, int next);
		; Two-channel version: channel 1 lives ADJ bytes after channel 0 in
		; both ix[] and xr[].  For every sample
		;   df_c = xr_c - sign(xr_c) * pow4P3_table[ix_c] * step[c]
		; and the results are
		;   sum[0] = 0.5 * sum (df0 + df1)^2
		;   sum[1] = 0.5 * sum (df0 - df1)^2
		; Uses up to seven x87 stack slots.
%assign ADJ	576*4
		push	ebx
		push	esi
		push	edi
%assign _P 4*3
		mov		eax,[esp+_P+12]		; eax = step
		fldz						; s0
		fld		st0					; s1, s0
		fld		dword [eax]			; stp0, s1, s0
		fld		dword [eax+4]		; stp1, stp0, s1, s0
		mov		ebx,[esp+_P+4]		; ebx = ix cursor
		mov		edx,[esp+_P+8]		; edx = xr cursor
		mov		ecx,[esp+_P+20]		; ecx = n
		mov		esi,[esp+_P+24]		; esi = next
		shl		esi,2				; esi = stride in bytes
		mov		edi,pow4P3_table	; edi = table base
		align	4
.sample:
		; channel 0
		mov		eax,[ebx]			; eax = ix0
		fld		dword [edi+eax*4]	; t, stp1, stp0, s1, s0
		fmul	st0,st2				; t*stp0, ...
		test	dword [edx],0x80000000
		jz		short .ch0_pos		; xr0 not negative: keep t positive
		fchs						; t = -t
.ch0_pos:
		fsubr	dword [edx]			; df0 = xr0 - t, stp1, stp0, s1, s0

		; channel 1
		mov		eax,[ebx+ADJ]		; eax = ix1
		fld		dword [edi+eax*4]	; t, df0, stp1, stp0, s1, s0
		fmul	st0,st2				; t*stp1, ...
		test	dword [edx+ADJ],0x80000000
		jz		short .ch1_pos
		fchs
.ch1_pos:
		fsubr	dword [edx+ADJ]		; df1, df0, stp1, stp0, s1, s0

		add		ebx,esi				; ix += next
		add		edx,esi				; xr += next

		fld		st0					; df1, df1, df0, stp1, stp0, s1, s0
		fadd	st0,st2				; df1+df0, df1, df0, ...
		fmul	st0,st0				; (df1+df0)^2, ...
		faddp	st6,st0				; df1, df0, stp1, stp0, s1, s0 (+= sum term)
		fsubp	st1,st0				; df0-df1, stp1, stp0, s1, s0
		fmul	st0,st0				; (df0-df1)^2, ...
		faddp	st3,st0				; stp1, stp0, s1 (+= difference term), s0
		dec		ecx
		jnz		.sample

		fcompp						; drop both steps: s1, s0
		fmul	dword [_05]			; s1*0.5, s0
		mov		eax,[esp+_P+16]		; eax = sum
		fstp	dword [eax+4]		; sum[1] stored; s0
		fmul	dword [_05]			; s0*0.5
		fstp	dword [eax]			; sum[0] stored; FPU stack empty again
		pop		edi
		pop		esi
		pop		ebx
		ret

;------------------------------------------------------------------------

;	99/08/11
;	Ǥ礨3D Now!äƤߤ
;	577clk
;	99/11/01
;	㴳κŬ 457clk by shigeo
;	99/11/23
;	㴳κŬ 435clk by kei


;void
;calc_pow4P3dual_3DN(int *ix,float *xr,float *step,float *sum,int n,int next);

		align 16
%if 1
calc_pow4P3dual_3DN:
		; void calc_pow4P3dual_3DN(int *ix, float *xr, float *step, float *sum,
		;                          int n, int next);
		; 3DNow! two-channel version.  Register notation [a:b] means
		; bits 63..32 : bits 31..0.  Channel 1 lives ADJ bytes after channel 0.
		; Per sample: dif_c = xr_c - sign(xr_c)*pow4P3_table[ix_c]*step[c];
		; results: sum[0] = 0.5*sum(dif0+dif1)^2, sum[1] = 0.5*sum(dif0-dif1)^2.
		; Two samples are handled per loop pass (n is halved below).
%assign ADJ	576*4	; byte offset to the second channel's ix/xr
		push	ebx
		push	esi
		push	edi
		push	ebp		; restored by the pop ebp before ret
%assign _P 4*4
		femms
		pxor	mm7,mm7				;mm7=[s1:s0]	
									;([hi:lo] = bits 63..32 : bits 31..0)
		mov		ebx,[esp+_P+4]		;ebx=ix
		mov		edx,[esp+_P+8]		;edx=xr
		mov		eax,[esp+_P+12]		;eax=step
		movq	mm6,[eax]			;mm6=[step1:step0]
		mov		ecx,[esp+_P+20]		;ecx=n
		shr		ecx,1				;two samples per pass; assumes n is even
									;(original note refers to sfBandIndex in loop.c -- confirm)
		mov		esi,[esp+_P+24]		;esi=next
		shl		esi,2				;esi=next*sizeof(int)
		mov		edi,pow4P3_table	;ptr to table
		mov		ebp,D_MSB0_1		;base register for both masks:
									;[ebp] = D_MSB0_1, [ebp-8] = D_MSB1_1

		align	4
.lp:
		; gather table values and inputs for two consecutive samples

		mov		eax,[ebx]			;eax=*ix
;		cmp		eax,PRECALC_SIZE
;		jae		dual_overflow
		movd	mm0,[edi+eax*4]	;mm0=[0: temp0=*ix^(4/3)]

		mov		eax,[ebx+ADJ]		;eax=*(ix+576)
;		cmp		eax,PRECALC_SIZE
;		jae		dual_overflow
		punpckldq	mm0,[edi+eax*4]	;mm0=[tmp1:tmp0]

		add		ebx,esi

		mov		eax,[ebx]			;eax=*ix
;		cmp		eax,PRECALC_SIZE
;		jae		dual_overflow
		movd	mm3,[edi+eax*4]	;mm3=[0: temp0=*ix^(4/3)]

		mov		eax,[ebx+ADJ]		;eax=*(ix+576)
;		cmp		eax,PRECALC_SIZE
;		jae		near dual_overflow
		punpckldq	mm3,[edi+eax*4]	;mm3=[tmp1:tmp0]

		movd	mm1,[edx]			;mm1=[0:xr0]
		punpckldq	mm1,[edx+ADJ]	;mm1=[xr1:xr0]

		add		edx,esi

		movd	mm4,[edx]			;mm4=[0:xr0]
		punpckldq	mm4,[edx+ADJ]	;mm4=[xr1:xr0]

		movq	mm2,[ebp-8];=[D_MSB1_1]		;mm2=[0x8000_0000:0x8000_0000]
		movq	mm5,mm2

		pfmul	mm0,mm6				;mm0=[tmp1*stp1:tmp0*stp0]
		pand	mm2,mm1				;mm2=[sign(xr1):sign(xr0)]

		pfmul	mm3,mm6
		pand	mm5,mm4

		pxor	mm0,mm2				;mm0=if( xr < 0)temp=-temp;
		pfsub	mm1,mm0				;mm1=diff:=[xr1-tmp1:xr0-tmp0]

		pxor	mm3,mm5
		pfsub	mm4,mm3

		add		ebx,esi				;ix += next;
		add		edx,esi				;xr += next;

		movq	mm0,mm1				;mm0=[dif1:dif0]
		movq	mm3,mm4

		punpckldq	mm2,mm0			;mm2=[dif0:*]
		punpckldq	mm5,mm3

		pxor	mm1,[ebp];=[D_MSB0_1]		;mm1=[dif1:-dif0]
		punpckhdq	mm0,mm2			;mm0=[dif0:dif1]

		pxor	mm4,[ebp];=[D_MSB0_1]
		punpckhdq	mm3,mm5

		pfsub	mm0,mm1				;mm0=tmp:=[dif0-dif1:dif0+dif1]
		pfsub	mm3,mm4
		pfmul	mm0,mm0				;mm0=tmp*tmp
		pfmul	mm3,mm3
		pfadd	mm7,mm0				;mm7=[s1:s0]
		pfadd	mm7,mm3
		dec		ecx
		jnz		near .lp

		pfmul	mm7,[D_05]		;mm7=[s1*0.5:s0*0.5]
		mov		eax,[esp+_P+16]		;eax=sum
		movq	qword [eax],mm7		;sum[0]=s0*0.5, sum[1]=s1*0.5
		femms
		pop		ebp
		pop		edi
		pop		esi
		pop		ebx
		ret
%else
;	¹Ԥ䤹(Ȼפ^^;;)֤ѤС
calc_pow4P3dual_3DN:
		; void calc_pow4P3dual_3DN(int *ix, float *xr, float *step, float *sum,
		;                          int n, int next);
		; Alternative scheduling (assembled only when the %if above is 0).
		; Each register holds the same channel for two consecutive samples:
		; [value at ix+next : value at ix].  Notation [a:b] = high : low dword.
		; Results, as in the other calc_pow4P3dual_* routines:
		;   sum[0] = 0.5*sum(dif0+dif1)^2, sum[1] = 0.5*sum(dif0-dif1)^2.
		; The final merge now places the (dif0+dif1)^2 total in the low lane
		; (s0) and the (dif0-dif1)^2 total in the high lane (s1); previously
		; the two lanes were exchanged relative to the other versions.
%assign ADJ	576*4	; byte offset to the second channel's ix/xr
		push	ebx
		push	esi
		push	edi
		push	ebp		
%assign _P 4*4
		femms
		mov		eax,[esp+_P+12]		;eax=step
		pxor	mm7,mm7				;mm7=[s1:s0]	
									;([hi:lo] = bits 63..32 : bits 31..0)
		mov		ebx,[esp+_P+4]		;ebx=ix
		movq	mm6,[eax]			;mm6=[step1:step0]

		mov		edx,[esp+_P+8]		;edx=xr
		movq	mm5,mm6				;mm5=[step1:step0]

		mov		ecx,[esp+_P+20]		;ecx=n
		punpckhdq mm6,[eax]			;mm6=[step1:step1]

		shr		ecx,1				;two samples per pass; assumes n is even
									;(original note refers to sfBandIndex in loop.c -- confirm)
		mov		esi,[esp+_P+24]		;esi=next

		punpckldq mm5,[eax]			;mm5=[step0:step0]
		mov		edi,pow4P3_table	;ptr to table

		mov		ebp,D_MSB1_1		;ebp -> packed sign-bit mask
		shl		esi,2				;esi=next*sizeof(int)

		align	4
.lp:
		mov		eax,[ebx]			;eax=*ix0
		movd	mm1,[edx]			;mm1=[0:xr0]

		movd	mm0,[edi+eax*4]		;mm0=[0:pow(*ix0)]
		mov		eax,[ebx+esi]		;eax=*(ix0+next)

		punpckldq	mm0,[edi+eax*4]	;mm0=[pow(*(ix0+next)):pow(*ix0)]
		mov		eax,[ebx+ADJ]		;eax=*(ix1)

		movd	mm4,[edx+ADJ]			;mm4=[0:xr1]
		punpckldq	mm1,[edx+esi]	;mm1=[xr0+next:xr0]

		movd	mm3,[edi+eax*4]		;mm3=[0:pow(*(ix1)]
		mov		eax,[ebx+esi+ADJ]	;eax=*(ix1+next)

		punpckldq	mm4,[edx+esi+ADJ]	;mm4=[xr1+next:xr1]
		movq	mm2,[ebp]			;mm2=[0x8000_0000:0x8000_0000]

		punpckldq	mm3,[edi+eax*4]	;mm3=[pow(*(ix1+next):pow(*(ix1)]
		pfmul	mm0,mm5				;channel 0 table values * step0

		pand	mm2,mm1				;signs of xr0
		pfmul	mm3,mm6				;channel 1 table values * step1

		pxor	mm0,mm2				;give the scaled value the sign of xr0
		lea		ebx,[ebx+esi*2]		;ix += next*2;

		movq	mm2,[ebp]			;mm2=[0x8000_0000:0x8000_0000]
		pfsub	mm1,mm0				;mm1 = [dif0(next):dif0]

		pand	mm2,mm4				;signs of xr1
		lea		edx,[edx+esi*2]		;xr += next*2;

		pxor	mm3,mm2				;give the scaled value the sign of xr1
		movq	mm0,mm1				;mm0=[dif0(next):dif0]

		pfsub	mm4,mm3				;mm4 = [dif1(next):dif1]
		pfadd	mm0,mm4				;mm0=[dif0(next)+dif1(next):dif0+dif1]

		pfsub	mm1,mm4				;mm1=[dif0(next)-dif1(next):dif0-dif1]
		pfmul	mm0,mm0				;squares of the sums

		pfmul	mm1,mm1				;squares of the differences

		pfacc	mm0,mm0				;both lanes = sum-term total of this pass
		pfacc	mm1,mm1				;both lanes = difference-term total of this pass

		punpckldq	mm0,mm1			;mm0=[difference total:sum total]
		pfadd	mm7,mm0				;mm7=[s1:s0]

		dec		ecx
		jnz		near .lp

		mov		eax,[esp+_P+16]		;eax=sum (read before the pops move esp)
		pop		ebp

		pfmul	mm7,[D_05]		;mm7=[s1*0.5:s0*0.5]
		pop		edi

		pop		esi
		pop		ebx

		movq	qword [eax],mm7		;sum[0]=s0*0.5, sum[1]=s1*0.5
		femms
		ret
%endif

;************************************************************************

;	99/11/12	Initial version for SSE by K.Sakai
;				pow ȤʤȲ:-)
;				790clk

;void
;calc_pow4P3dual_SSE(int *ix,float *xr,float *step,float *sum,int n,int next);

		align 16
calc_pow4P3dual_SSE:
		; void calc_pow4P3dual_SSE(int *ix, float *xr, float *step, float *sum,
		;                          int n, int next);
		; Scalar-SSE two-channel version.  Channel 1 lives ADJ bytes after
		; channel 0.  Per sample:
		;   d_c = xr_c - sign(xr_c) * pow4P3_table[ix_c] * step[c]
		; Results: sum[0] = 0.5*sum(d0+d1)^2, sum[1] = 0.5*sum(d0-d1)^2.
		; Several conditional jumps consume flags set a few instructions
		; earlier; the mov/SSE instructions in between do not write EFLAGS.
%assign ADJ	576*4
		push	ebx
		push	esi
		push	edi
%assign _P 4*3
		mov		eax,[esp+_P+12]		; eax = step
		movss	xmm6,[eax  ]		; xmm6 = step0
		movss	xmm7,[eax+4]		; xmm7 = step1
		xorps	xmm0,xmm0			; s0 = 0.0
		xorps	xmm1,xmm1			; s1 = 0.0
		mov		ebx,[esp+_P+4]		; ebx = ix cursor
		mov		edx,[esp+_P+8]		; edx = xr cursor
		mov		ecx,[esp+_P+20]		; ecx = n
		mov		esi,[esp+_P+24]		; esi = next
		shl		esi,2				; esi = stride in bytes
		mov		edi,pow4P3_table	; edi = table base
		jmp		short .sample

		align	16
.sample:

		movss	xmm4,[edx]			; xr0
		movss	xmm5,[edx+ADJ]		; xr1

		; channel 0
		mov		eax,[ebx]			; eax = ix0
		movss	xmm2,[edi+eax*4]	; t0 = ix0^(4/3)
		mulss	xmm2,xmm6			; t0 *= step0
		movmskps	eax,xmm4
		test	al,1				; ZF clear when xr0 has its sign bit set
		mov		eax,[ebx+ADJ]		; eax = ix1 (flags untouched)
		movss	xmm3,[edi+eax*4]	; t1 = ix1^(4/3) (flags untouched)
		jz		short .ch0_sub
		addss	xmm4,xmm2			; xr0 negative: d0 = xr0 + t0
		jmp		short .ch0_done
.ch0_sub:
		subss	xmm4,xmm2			; d0 = xr0 - t0
.ch0_done:
		movss	xmm2,xmm4			; xmm2 = d0 (copy for the sum term)

		; channel 1
		mulss	xmm3,xmm7			; t1 *= step1
		movmskps	eax,xmm5
		test	al,1
		jz		short .ch1_sub
		addss	xmm5,xmm3			; xr1 negative: d1 = xr1 + t1
		jmp		short .ch1_done
.ch1_sub:
		subss	xmm5,xmm3			; d1 = xr1 - t1
.ch1_done:

		add		ebx,esi				; ix += next
		add		edx,esi				; xr += next
		dec		ecx					; ZF consumed by the jnz below
		addss	xmm2,xmm5			; d0 + d1
		subss	xmm4,xmm5			; d0 - d1
		mulss	xmm2,xmm2
		mulss	xmm4,xmm4
		addss	xmm0,xmm2			; s0 += (d0+d1)^2
		addss	xmm1,xmm4			; s1 += (d0-d1)^2

		jnz		.sample

		mov		eax,[esp+_P+16]		; eax = sum
		movss	xmm3,[_05]
		mulss	xmm0,xmm3			; s0*0.5
		mulss	xmm1,xmm3			; s1*0.5
		movss	[eax],xmm0
		movss	[eax+4],xmm1
		pop		edi
		pop		esi
		pop		ebx
		ret
;************************************************************************

%if 0	;00/01/16
;	98/07/12 shigeo
;	00/01/05
;	פʬ(ɬ !cod_info->window_switching_flag || cod_info->block_type != SHORT_TYPE Ǥ)

;void calc_runlen( int ix[576], gr_info *cod_info );
;	®ʤ
;	MMXǤȤʤ

		align	16
calc_runlen:
		; void calc_runlen( int ix[576], gr_info *cod_info );
		; (Disabled: this whole routine sits inside an "%if 0" region.)
		; Scans ix[] backwards in pairs to find the last non-zero pair, then
		; counts the quadruples just below it whose OR-ed value is <= 1.
		; Stores that count in cod_info->count1 and (remaining byte
		; offset)/8 in cod_info->big_values.
		push	ebx
%assign _P 4*1
		mov		ecx,[esp+_P+4]		;ecx=ix
		mov		ebx,[esp+_P+8]		;ebx=cod_info
.start:
		sub		ecx,4				;bias the base so [ecx+edx] is the last element of a pair
		push	esi
		mov		edx,576*4			;edx=i*4
		mov		eax,[ecx+edx-4]
		mov		esi,[ecx+edx-0]
.lp1:
		or		eax,esi				;is this pair all zero?
		jne		short .next
		mov		eax,[ecx+edx-4-8]	;preload the pair below
		mov		esi,[ecx+edx-0-8]	;NOTE(review): when edx reaches 8 this reads 8 bytes below ix[0]
		sub		edx,2*4
		jnz		short .lp1
.next:
		mov		esi,0			;esi=count1=0
		cmp		edx,3*4
		jle		short .exit			;fewer than four elements left
%if 0
		align	4
.lp2:
		mov		eax,[ecx+edx-12]
		or		eax,[ecx+edx- 8]
		or		eax,[ecx+edx- 4]
		or		eax,[ecx+edx- 0]
		cmp		eax,1
		jg		short .exit2
		inc		esi
		sub		edx,4*4
		cmp		edx,4*3
		jg		short .lp2
.exit2:
%else					;register-only variant; original note records 669 vs 653 clocks
		push	ebp
.lp2:
		mov		eax,[ecx+edx-12]
		mov		ebp,[ecx+edx- 8]
		or		eax,ebp
		mov		ebp,[ecx+edx- 4]
		or		eax,ebp
		mov		ebp,[ecx+edx- 0]
		or		eax,ebp
		inc		esi					;count first, corrected by the dec at .exit2
		cmp		eax,1
		jg		short .exit2		;some value in the quadruple exceeds 1
		sub		edx,4*4
		cmp		edx,4*3
		jg		short .lp2
.exit2:
		dec		esi
		pop		ebp
%endif
.exit:
		mov		[ebx+count1],esi
		pop		esi
		shr		edx,3				;byte offset / 8
		mov		[ebx+big_values],edx
		pop		ebx
		ret
%endif

;************************************************************************

;int ix_max( int ix[576], unsigned int begin, unsigned int end );
;	99/08/14
;	1840clk
;	begin<end

		align	16
ix_max_NONE:
		; int ix_max( int ix[576], unsigned int begin, unsigned int end );
		; Plain integer version: returns in eax the largest value of
		; ix[begin..end-1], compared as unsigned, starting from 0.
		; Requires begin < end (the loop always runs at least once).
		push	ebx
%assign _P 4*1
		mov		ebx,[esp+_P+4]		; ebx = ix
		mov		edx,[esp+_P+8]		; edx = begin
		mov		ecx,[esp+_P+12]		; ecx = end
		xor		eax,eax				; eax = running maximum
		sub		ecx,edx				; ecx = element count
		lea		edx,[ebx+edx*4]		; edx = &ix[begin]
		align	4
.scan:
		mov		ebx,[edx]			; ebx = current element
		add		edx,4
		cmp		ebx,eax
		jbe		short .keep			; not larger: keep the old maximum
		mov		eax,ebx
.keep:
		dec		ecx
		jnz		short .scan
		pop		ebx
		ret

;------------------------------------------------------------------------

;int ix_max_MMX( int ix[576], unsigned int begin, unsigned int end );

;	99/08/14	Initial version by K.Sakai
;	99/11/12	suppose end > begin already
		align	16
ix_max_MMX:
		; int ix_max_MMX( int ix[576], unsigned int begin, unsigned int end );
		; MMX version using signed dword compares.  Keeps two running maxima
		; (mm0, mm1), each holding two lanes, and handles four elements per
		; loop pass, walking from the end of the range downwards.
		; Assumes end > begin and an even element count.
		pxor	mm0,mm0				; running max A = [0:0]
		pxor	mm1,mm1				; running max B = [0:0]
		mov		edx,[esp+8]			; edx = begin
		mov		ecx,[esp+12]		; ecx = end
		xor		eax,eax
		sub		ecx,edx				; ecx = end-begin
		shl		edx,2
		add		edx,[esp+4]			; edx = &ix[begin]
		shr		ecx,1				; ecx = number of element pairs
		test	cl,1
		jz		.quad				; even pair count: straight into the loop
		movq	mm1,[edx]			; odd pair count: seed B with the first pair
		and		ecx,0xFFFFFFFE
		jz		.fold				; that was the only pair
		add		edx,8

		align	16
.quad:
		movq	mm4,[edx+ecx*8-16]	; candidate for A
		movq	mm5,[edx+ecx*8- 8]	; candidate for B
		sub		ecx,2				; ZF consumed by the jnz below (MMX ops leave flags alone)
		movq	mm2,mm0
		movq	mm3,mm1
		pcmpgtd	mm2,mm4				; mask = max > candidate
		pcmpgtd	mm3,mm5
		pand	mm0,mm2				; keep max where it wins
		pand	mm1,mm3
		pandn	mm2,mm4				; take candidate elsewhere
		pandn	mm3,mm5
		por		mm0,mm2
		por		mm1,mm3
		jnz		.quad

.fold:
		; mm0 = lane-wise max(A, B)
		movq	mm2,mm0
		pcmpgtd	mm2,mm1
		pand	mm0,mm2
		pandn	mm2,mm1
		por		mm0,mm2

		; low lane of mm0 = max(low lane, high lane)
		movq	mm4,mm0
		punpckhdq	mm4,mm4
		movq	mm2,mm0
		pcmpgtd	mm2,mm4
		pand	mm0,mm2
		pandn	mm2,mm4
		por		mm0,mm2
		movd	eax,mm0
		emms
		ret

;------------------------------------------------------------------------

;int ix_max_3DN( int ix[576], unsigned int begin, unsigned int end );

;	pi2fdȤäƤΤΤ֤ͤˤ
;	ix[i]2^24ǤʤФʤʤ
;	Υ롼󤬻ȤʬǤϤʤ礭ͤ뤳ȤϤʤ
;	99/08/14
;	1040clk
;	begin,end˶Ǥ뤳Ȥꤹ
;	800clk
;	00/01/05
;	bug fix if end == begin+2  פɤ
		align	16
ix_max_3DN:
		; int ix_max_3DN( int ix[576], unsigned int begin, unsigned int end );
		; 3DNow! version: converts the integers to float with pi2fd and takes
		; the maximum with pfmax, four elements per loop pass.  Because the
		; compare is done on single-precision floats, the values must be small
		; enough to convert exactly.
		; Assumes end > begin and an even element count.
		mov		eax,[esp+4]			; eax = ix
		mov		edx,[esp+8]			; edx = begin
		mov		ecx,[esp+12]		; ecx = end
		sub		ecx,edx				; ecx = end-begin
		lea		edx,[eax+edx*4]		; edx = &ix[begin]
		xor		eax,eax				; eax = 0 (result floor)
		femms
		pxor	mm0,mm0				; mm0 = running max [0:0]
		shr		ecx,2				; ecx = count/4, CF = bit 1 of count
		jc		short .odd_pair		; count is 4k+2: take one pair first

		jmp		.quad
		align	16
.quad:
		pi2fd	mm1,[edx]
		pi2fd	mm2,[edx+8]
		add		edx,16
		pfmax	mm0,mm1
		pfmax	mm0,mm2
		loop	.quad

.reduce:
		movq	mm1,mm0
		psrlq	mm0,32
		pfmax	mm0,mm1				; low lane = max of both lanes
		pf2id	mm0,mm0
		movd	ecx,mm0
		cmp		eax,ecx				; eax is still 0 here
		jae		short .done
		mov		eax,ecx
.done:
		femms
		ret
.odd_pair:
		pi2fd	mm0,[edx]			; seed the maximum with the first pair
		jz		short .reduce		; ZF still from the shr: count was exactly 2
		add		edx,8
		jmp		short .quad

;------------------------------------------------------------------------
;	99/12/30	Initial version by kei-i
;int ix_max_E3DN( int ix[576], unsigned int begin, unsigned int end );

%ifdef USE_E3DN

; int ix_max_E3DN( int ix[576], unsigned int begin, unsigned int end );
; Enhanced-3DNow! version: same float-compare approach as ix_max_3DN, with
; two running maxima (mm6, mm7), a 16-element unrolled main loop with
; prefetch, and pswapd for the final lane reduction.
; The count is consumed as: one optional pair, then (count/4 mod 4) groups
; of four, then count/16 groups of sixteen.
proc ix_max_E3DN
        mov     eax,[esp+4]         ;eax=ix
        mov     edx,[esp+8]         ;edx=begin
        mov     ecx,[esp+12]        ;ecx=end
        sub     ecx,edx             ;ecx=end-begin
        lea     edx,[eax+edx*4]     ;edx=&ix[begin]

        femms
        pxor    mm6,mm6             ;mm6=[0:0]
        pxor    mm7,mm7             ;mm7=[0:0]

        shr     ecx,2               ;ecx=count/4, CF=bit 1 of count (count assumed even)
        jnc     short .check.lp4

        pi2fd   mm7,[edx]           ;count is 4k+2: take the first pair
        jz      near .nokori        ;ZF still from the shr: count was exactly 2
        add     edx,8

.check.lp4:
        mov     eax, ecx            ;eax = number of 4-element groups
        and     ecx, 3              ;groups to handle before the 16-element loop
        jz      short .check.lp16
        jmp     .lp4

        align   16
.lp4:
        pi2fd   mm1,[edx]           ;4 elements per pass
        pi2fd   mm2,[edx+8]
        add     edx,16
        pfmax   mm6,mm1
        pfmax   mm7,mm2
        loop    .lp4

.check.lp16:
        shr     eax, 2              ;eax = number of 16-element groups
        jz      .nokori
        jmp     short .lp16

        align   16
.lp16:
        prefetch    [edx+64]

        pi2fd   mm0,[edx]           ;16 elements per pass
        pi2fd   mm1,[edx+8]
        pi2fd   mm2,[edx+16]
        pi2fd   mm3,[edx+24]

        pfmax   mm6, mm0
        pfmax   mm7, mm1

        pi2fd   mm0,[edx+32]
        pi2fd   mm1,[edx+40]

        pfmax   mm6, mm2
        pfmax   mm7, mm3

        pi2fd   mm2,[edx+48]
        pi2fd   mm3,[edx+56]

        pfmax   mm6, mm0
        pfmax   mm7, mm1

        add     edx,64

        pfmax   mm6, mm2
        pfmax   mm7, mm3

        dec     eax
        jnz     near .lp16

.nokori:
        ; final reduction: merge both accumulators, then both lanes
        pfmax   mm7, mm6
        pswapd  mm0, mm7
        pfmax   mm7,mm0
        pf2id   mm7,mm7
        movd    eax,mm7             ;return value
.exit:
        femms
        ret
%endif

;------------------------------------------------------------------------

;	99/08/05	Initial version by K.Sakai 400clk
;	99/11/12	äѹ
;
;int ix_max_SSE( int ix[576], unsigned int begin, unsigned int end );
		align	16
ix_max_SSE:
		; int ix_max_SSE( int ix[576], unsigned int begin, unsigned int end );
		; SSE version: integers are converted with cvtpi2ps and compared as
		; floats, four elements per loop pass, walking from the end of the
		; range downwards.  Returns 0 when end <= begin.
		; Assumes an even element count.
		mov		edx,[esp+8]			; edx = begin
		mov		ecx,[esp+12]		; ecx = end
		xor		eax,eax				; result for the empty-range exit
		sub		ecx,edx				; ecx = end-begin
		jbe		.done				; nothing to scan
		shl		edx,2
		add		edx,[esp+4]			; edx = &ix[begin]
		shr		ecx,1				; ecx = number of element pairs
		xorps	xmm0,xmm0			; xmm0 = running max (4 lanes)
		xorps	xmm1,xmm1
		test	cl,1
		jnz		.odd_pair			; odd pair count: take one pair first
		jmp		short .quad

		align	16
.quad:
		cvtpi2ps	xmm1,[edx+ecx*8-16]
		cvtpi2ps	xmm2,[edx+ecx*8-8]
		sub		ecx,2				; ZF consumed by the jnz below (SSE ops leave flags alone)
		movlhps	xmm1,xmm2			; xmm1 = four converted elements
		maxps	xmm0,xmm1
		jnz		.quad

.reduce:
		; horizontal maximum of xmm0, converted back to an integer
		movhlps	xmm1,xmm0
		maxps	xmm0,xmm1
		movss	xmm1,xmm0
		shufps	xmm0,xmm0,0xB1
		maxss	xmm0,xmm1
		cvtss2si	eax,xmm0
.done:
		ret

.odd_pair:
		movlhps	xmm0,xmm0
		cvtpi2ps	xmm0,[edx]		; seed the low lanes with the first pair
		and		ecx,0xFFFFFFFE
		jz		.reduce				; that was the only pair
		add		edx,8
		jmp		.quad

;***********************************************************************
;	by shigeo
;	99/08/27
;	5ʬǺ(^^;
;	⤿ʤλĤäƤȤϡquantize_xrpowȤäĤ󤫤
;	㴳κŬ(13000clk)

;void calc_pow075_3DN( float *xr, float *xrpow );
		align 16
calc_pow075_3DN:
		; void calc_pow075_3DN( float *xr, float *xrpow );
		; xrpow[i] = |xr[i]|^(3/4) for 576 samples, two per loop pass.
		; Method: r = 1/sqrt(|x|) (pfrsqrt refined with pfrsqit1/pfrcpit2),
		; q = 1/sqrt(r) = |x|^(1/4) by the same sequence, result = q*q*q.
		; The indented instructions are the second sample's stream,
		; interleaved with the first.
		mov		eax,[esp+4]		;eax=xr
		mov		edx,[esp+8]		;edx=xrpow
		mov		ecx,576/2
		femms
		movq	mm7,[D_ABS]		;mm7 = mask that clears both sign bits
		jmp		.lp
		align 16
.lp:
		movq	mm0,[eax]
		add		eax,8
		pand	mm0,mm7			;mm0=[|xr[i+1]|:|xr[i]|]

		pfrsqrt	mm1,mm0			;approx 1/sqrt(|xr[i]|)
			movq	mm3,mm0
		movq	mm2,mm1
			psrlq	mm3,32		;mm3=|xr[i+1]|
		pfmul	mm1,mm1
			pfrsqrt	mm4,mm3		;approx 1/sqrt(|xr[i+1]|)
		pfrsqit1	mm1,mm0
			movq	mm5,mm4
			pfmul	mm4,mm4
		pfrcpit2	mm1,mm2		;mm1=refined 1/sqrt(|xr[i]|)
			pfrsqit1	mm4,mm3

		pfrsqrt	mm0,mm1			;approx 1/sqrt(mm1) = |xr[i]|^(1/4)
			pfrcpit2	mm4,mm5	;mm4=refined 1/sqrt(|xr[i+1]|)
		movq	mm2,mm0
			pfrsqrt		mm3,mm4	;approx |xr[i+1]|^(1/4)
		pfmul	mm0,mm0
			movq	mm5,mm3
			pfmul	mm3,mm3
		pfrsqit1	mm0,mm1
			pfrsqit1	mm3,mm4
		pfrcpit2	mm0,mm2		;mm0=refined |xr[i]|^(1/4)
			pfrcpit2	mm3,mm5	;mm3=refined |xr[i+1]|^(1/4)
		
		punpckldq	mm0,mm3		;mm0=[|xr[i+1]|^(1/4):|xr[i]|^(1/4)]
		movq	mm1,mm0
		pfmul	mm0,mm0
		pfmul	mm0,mm1			;mm0=[|xr|^(3/4)]
		movq	[edx],mm0
		add		edx,8
		loop	.lp
		femms
		ret

;***********************************************************************
;	by kei
;	00/03/02
%ifdef USE_E3DN
;void calc_pow075_E3DN( float *xr, float *xrpow );
	align 16
calc_pow075_E3DN:
;	void calc_pow075_E3DN( float *xr, float *xrpow );
;	xrpow[i] = |xr[i]|^(3/4) for 576 samples.  Same reciprocal-square-root
;	method as calc_pow075_3DN, with pswapd to reach the high lane,
;	prefetching, and the body repeated 8 times (16 samples = 64 bytes per
;	loop pass).  fsizen() is a macro from an include file; it is used here
;	as the byte offset of float i (presumably i*4 -- confirm in nasm.h).
;	extern _clkbegin
;	extern _clkend
;	call 		_clkbegin

	mov		eax,[esp+4]		;eax=xr
	mov		edx,[esp+8]		;edx=xrpow
	mov		ecx,-576*4		;byte index, runs from -2304 up to 0
	add		eax, 576*4		;eax = end of xr[]
	add		edx, 576*4		;edx = end of xrpow[]
	femms
	jmp		.lp

	align 16
.lp:
	prefetch	[eax+ecx+64]
	prefetchw	[edx+ecx+64]
%assign i 0
%rep 8
	movq	mm0, [eax+ecx+fsizen(i)]
	pand	mm0, [D_ABS]		; clear both sign bits
	pswapd	mm4, mm0			; mm4 low lane = second sample
	pfrsqrt	mm1, mm0			; approx 1/sqrt(|x0|)
	pfrsqrt	mm5, mm4			; approx 1/sqrt(|x1|)
	movq	mm2, mm1
	movq	mm6, mm5
	pfmul	mm1, mm1
	pfmul	mm5, mm5
	pfrsqit1 mm1, mm0
	pfrsqit1 mm5, mm4
	pfrcpit2 mm1, mm2			; refined 1/sqrt(|x0|)
	pfrcpit2 mm5, mm6			; refined 1/sqrt(|x1|)
	pfrsqrt mm3, mm1			; approx |x0|^(1/4)
	pfrsqrt mm7, mm5			; approx |x1|^(1/4)
	movq	mm2, mm3
	movq	mm6, mm7
	pfmul	mm3, mm3
	pfmul	mm7, mm7
	pfrsqit1 mm3, mm1
	pfrsqit1 mm7, mm5
	pfrcpit2 mm3, mm2			; refined |x0|^(1/4)
	pfrcpit2 mm7, mm6			; refined |x1|^(1/4)
	punpckldq	mm3,mm7			; mm3 = [|x1|^(1/4) : |x0|^(1/4)]
	movq	mm7, mm3
	pfmul	mm3, mm3
	pfmul 	mm3, mm7			; cube: |x|^(3/4)
	movq	[edx+ecx+fsizen(i)], mm3

%if 0  ; disabled: the same computation with the two lanes not interleaved
	movq	mm0, [eax+ecx+fsizen(i)]
	pand	mm0, [D_ABS]
	pswapd	mm4, mm0

	pfrsqrt	mm1, mm0
	movq	mm2, mm1
	pfmul	mm1, mm1
	pfrsqit1 mm1, mm0
	pfrcpit2 mm1, mm2

	pfrsqrt mm3, mm1
	movq	mm2, mm3
	pfmul	mm3, mm3
	pfrsqit1 mm3, mm1
	pfrcpit2 mm3, mm2

	pfrsqrt	mm5, mm4
	movq	mm6, mm5
	pfmul	mm5, mm5
	pfrsqit1 mm5, mm4
	pfrcpit2 mm5, mm6

	pfrsqrt mm7, mm5
	movq	mm6, mm7
	pfmul	mm7, mm7
	pfrsqit1 mm7, mm5
	pfrcpit2 mm7, mm6

	punpckldq	mm3,mm7
	movq	mm7, mm3
	pfmul	mm3, mm3
	pfmul 	mm3, mm7
	movq	[edx+ecx+fsizen(i)], mm3
%endif

%assign i i+2
%endrep
	add	ecx, 64				; 16 samples done; ZF set when the index reaches 0
	jnz	near .lp
	
.owari
	femms
;	call	_clkend
	ret
%endif

;************************************************************************
;	99/08/27	Initial version by K.SAKAI, 2700clk ۥޤ...
;	99/11/15	s/movups/movaps/ by K.SAKAI
;void calc_pow075_SSE( float *xr, float *xrpow );
		align 16
calc_pow075_SSE:
		; void calc_pow075_SSE( float *xr, float *xrpow );
		; xrpow[i] = |xr[i]|^(3/4) for 576 samples, eight per loop pass.
		; With r = rsqrtps(a) (about a^(-1/2)):
		;   a*r            is about a^(1/2)
		;   rsqrtps(r)     is about a^(1/4)
		;   their product  is about a^(3/4)
		; A NaN result (for example from a = 0, where a*r is 0*infinity) is
		; replaced by 0 through a self-compare mask.
		; Both pointers are accessed with movaps and must be 16-byte aligned.
		mov		eax,[esp+4]			; eax = xr
		mov		edx,[esp+8]			; edx = xrpow
		movlps	xmm7,[D_ABS]
		movlhps	xmm7,xmm7			; xmm7 = 0x7FFFFFFF in all four lanes
		mov		ecx,576/8
		jmp		short .octet

		align 16
.octet:
		movaps	xmm0,[eax+ 0]
		movaps	xmm1,[eax+16]
		add		eax,32
		andps	xmm0,xmm7			; a = |xr| (sign bits cleared)
		andps	xmm1,xmm7

		rsqrtps	xmm2,xmm0			; r = a^(-1/2)
		rsqrtps	xmm3,xmm1
		mulps	xmm0,xmm2			; a*r = a^(1/2)
		mulps	xmm1,xmm3
		rsqrtps	xmm2,xmm2			; r^(-1/2) = a^(1/4)
		rsqrtps	xmm3,xmm3
		mulps	xmm2,xmm0			; a^(3/4)
		mulps	xmm3,xmm1
		movaps	xmm0,xmm2
		movaps	xmm1,xmm3
		cmpps	xmm0,xmm0,0			; all-ones where the result equals itself (not NaN)
		cmpps	xmm1,xmm1,0
		andps	xmm2,xmm0			; NaN lanes become 0
		andps	xmm3,xmm1

		movaps	[edx+ 0],xmm2
		movaps	[edx+16],xmm3
		add		edx,32

		dec		ecx
		jnz		.octet

		ret

;	by K.SAKAI
;	99/08/27
;************************************************************************
;void calc_pow075_FPU( float *xr, float *xrpow );
; ľsqrt60k[clk]@PII, 68k[clk]@P55C
		align 16
calc_pow075_FPU:
		; void calc_pow075_FPU( float *xr, float *xrpow );
		; xrpow[i] = sqrt(|xr[i]| * sqrt(|xr[i]|)) = |xr[i]|^(3/4)
		; for 576 samples, one per loop pass, on the x87 FPU.
		mov		edx,[esp+8]			; edx = xrpow
		mov		eax,[esp+4]			; eax = xr
		mov		ecx,576
		jmp		short .each

		align 16
.each:
		fld		dword [eax]			; x
		fabs						; a = |x|

		fld		st0					; a, a
		fsqrt						; a^(1/2), a
		fmulp	st1,st0				; a^(3/2)
		fsqrt						; a^(3/4)

		fstp	dword [edx]			; store; FPU stack empty again
		add		eax,4
		add		edx,4

		dec		ecx
		jnz		.each

		ret

; by K.SAKAI
; 99/08/30	2ˤ 38k[clk]55k[clk]
;
; y = x^0.75
;
; b = x - 1, 0 < x < 2
; y = 1
;   + b*3/(4)
;   - b*b*3/(4*4*2)
;   + b*b*b*3*5/(4*4*4*2*3)
;   - b*b*b*b*3*5*9/(4*4*4*4*2*3*4)
;   + b*b*b*b*b*3*5*9*13/(4*4*4*4*4*2*3*4*5)
;   - b*b*b*b*b*b*3*5*9*13*17/(4*4*4*4*4*4*2*3*4*5*6)
;   + b*b*b*b*b*b*b*3*5*9*13*17*21/(4*4*4*4*4*4*4*2*3*4*5*6*7)
;   - b*b*b*b*b*b*b*b*3*5*9*13*17*21*25/(4*4*4*4*4*4*4*4*2*3*4*5*6*7*8)
;
; mantissa, exponent ʬΥƵmantissaŬѡ
; exponentñ3/4뤳Ȥˤơümantissa˾軻롣
;
; 1) -0.25 <= b <= 0.25 ˼
;    1.0 <= mantissa <= 1.25 Τޤ
;    1.2 <= mantissa < 2.0 x0.625롢̤1.6^0.75򤫤롩
;
; 2) -0.125 <= b <= 0.125 ˼
;    1.0 <= mantissa <= 1.28125(1.01001b)
;	 x0.875(14/16)
;    1.272727 < mantissa <= 1.625(1.101b)
;	x0.6875(11/16)
;    1.555555 < mantissa < 2.0
;	x0.5625(9/16)

		align 16

calc_pow075_NONE:
		; void calc_pow075_NONE( float *xr, float *xrpow );
		; Integer-only |x|^(3/4) for 576 floats, processed from the last
		; element to the first.  For each input:
		;  - exponent 0 (zero/denormal) or 255 (inf/NaN) stores 0;
		;  - the mantissa is scaled by 14/16, 11/16 or 9/16 (pow075_table1)
		;    so that b = scaled mantissa - 1 is small, and a truncated series
		;    for (1+b)^0.75 is evaluated in fixed point;
		;  - the result is multiplied by a pow075_table0 entry chosen by the
		;    scaling row and (exponent-127) mod 4, then repacked as a float
		;    with exponent 3*((exponent-127)>>2) plus a normalization shift.
		; The sign bit of the input is ignored (the output is non-negative).
		push	ebp
		push	ebx
		push	esi
		push	edi
%assign	_P 4*4
		mov		ebx,576				; ebx = remaining count, also the 1-based index
		jmp		short .lp0

		align 16
.lp0:
; ecx = xr[ebx-1]
		mov		esi,[esp+_P+4]
		mov		ecx,[esi+ebx*4-4]

		xor		edx,edx				; edx = 0: value stored for the special cases
		rol		ecx,9				; cl = biased exponent, mantissa in the top 23 bits
		test	cl,cl
		jz		near .store			; de-normalized number
		cmp		cl,255
		je		near .store			; not a number

		mov		edi,ecx
		stc
		rcr		edi,1				; shift in the implicit leading 1
		shr		edi,8		; mantissa, fixed point 9.23

		xor		ebp,ebp				; ebp = row selector; edx is still 0
		cmp		edi,0x00A40000		; 1.28125, fixed point 9.23
		adc		ebp,edx				; +1 if mantissa < 1.28125
		cmp		edi,0x00D00000		; 1.625, fixed point 9.23
		adc		ebp,edx				; +1 if mantissa < 1.625
		movzx	eax,byte [pow075_table1 + ebp]	; multiplier 9, 11 or 14
		shl		ebp,4				; ebp = byte offset of the matching pow075_table0 row
		mul		edi					; edx:eax = 37.27
%if 0
		mov		edi,eax
		sub		edi,(1<<27)			; edi - 1.0, 5.27
%else
		lea		edi,[eax-(1<<27)]	; edi = b = scaled mantissa - 1.0, 5.27
%endif
; start approximation
; y = 1
		mov		esi,(1<<29)			; esi = 1.0, 3.29
;   + b*3/(4)
		mov		eax,edi
		shl		eax,1
		add		eax,edi				; eax = 3.29
		add		esi,eax				; esi = 3.29
;   - b*b*3/(4*4*2)
		imul	edi					; edx:eax = 5.59
%if 0
		shld	edx,eax,2
%else
		shl		edx,2
%endif
		sub		esi,edx
; Original timing note for stopping after the term above:
; 28k[clk]@PII, 46k[clk]@P55C (the accompanying accuracy remark did not
; survive the encoding damage).
;   + b*b*b*3*5/(4*4*4*2*3)
		mov		eax,0x6AAAAAAA		; 5/(4*3), 0.32
		imul	edx					; edx:eax = 3.61
		mov		eax,edx
		imul	edi					; edx:eax = 8.56
%if 0
		shld	edx,eax,5
%else
		shl		edx,5
%endif
		add		esi,edx
; Original timing note for stopping after the term above:
; 33k[clk]@PII, 55k[clk]@P55C.  The remaining terms are disabled.
%if 0
;   - b*b*b*b*3*5*9/(4*4*4*4*2*3*4)
		mov		eax,0x48000000		; 9/(4*4), 1.31
		imul	edx					; edx:eax = 4.60
		mov		eax,edx
		imul	edi					; edx:eax = 9.55
		shld	edx,eax,6
		sub		esi,edx
; Original timing note for stopping after the term above: 40k[clk]@PII.
;   + b*b*b*b*b*3*5*9*13/(4*4*4*4*4*2*3*4*5)
		mov		eax,0x53333333		; 13/(4*5), 1.31
		imul	edx					; edx:eax = 4.60
		mov		eax,edx
		imul	edi					; edx:eax = 9.55
		shld	edx,eax,6
		add		esi,edx
; (original remark about the accuracy with this term at the mantissa LSB
; level, possibly limited by rounding differences -- text damaged)
%endif
;
; at here,
; esi is mantissa(3.29 fixed point)
; cl is exponent(offset binary)

; calculate exponent
		sub		cl,127				; cl = unbiased exponent e
		xor		eax,eax
		mov		al,cl
		and		al,3				; eax = e mod 4: column in pow075_table0
		sar		cl,2				; cl = e >> 2 (arithmetic)
		mov		eax,[pow075_table0+ebp+eax*4]
		imul	esi					; edx:eax = 9.55, 1.0 <= edx:eax < 8.0
									; edx now holds the product as 9.23
; normalize
		xor		eax,eax				; clear eax before building the exponent byte
		mov		al,cl
		add		cl,cl
		add		al,cl				; al = 3*(e>>2), two's complement
		bsr		ecx,edx				; ecx = index of the highest set bit of edx
		sub		cl,23				; cl = normalization shift (0..2 per the range above)
		add		al,cl
		add		al,127				; al = result exponent (offset binary)
%if 0
		neg		cl
		add		cl,9
		shl		edx,cl				; normalize the mantissa
		shrd	edx,eax,9
%else
%if 0
		neg		cl
		add		cl,9
		shl		edx,cl				; normalize the mantissa
		mov		dl,al
		ror		edx,8
		clc
		rcr		edx,1
%else
		shr		edx,cl				; normalize the mantissa
		and		edx,0x007fffff		; drop the implicit leading 1
		shl		eax,23				; exponent into bits 23..30
		or		edx,eax				; edx = packed single-precision result
%endif
%endif

; xrpow[ebx-1] = edx
.store:
		mov		edi,[esp+_P+8]
		mov		[edi+ebx*4-4],edx

		dec		ebx
		jnz		near .lp0

.return:
		pop		edi
		pop		esi
		pop		ebx
		pop		ebp
		ret


;************************************************************************

;	input	eax
;	output	st0=pow(2,eax/4)
;	dest	eax,edx
		align	16
pow2iP4:
;	st0 = 2^(eax/4).
;	The input n is split as n = 4*q + r.  2^(r/4) is read from
;	powiP4table; 2^q is built directly as single-precision bits by
;	placing q in the exponent field and adding the bits of 1.0f.
;	Clobbers eax (returns holding the bit pattern of 2^q) and edx.
		mov		edx,eax
		shr		eax,2				;q = n>>2
		and		edx,3				;r = n&3
		shl		eax,23				;q -> exponent field
		add		eax,0x3F800000		;+ bits of 1.0f : eax = bits of 2^q
		push	eax					;the FPU loads from memory only
		fld		dword [esp]			;2^q
		fld		dword [powiP4table+edx*4]	;2^(r/4),2^q
		fmulp	st1,st0				;2^(r/4)*2^q
		pop		eax					;release the temporary
		ret

;------------------------------------------------------------------------

;	input	eax
;	output	st0=pow(2,eax/16)
;	dest	eax,edx
		align	16
pow2iP16:
;	st0 = 2^(eax/16).
;	The input n is split as n = 16*q + r.  2^(r/16) is read from
;	powiP16table; 2^q is built directly as single-precision bits by
;	placing q in the exponent field and adding the bits of 1.0f.
;	Clobbers eax (returns holding the bit pattern of 2^q) and edx.
		mov		edx,eax
		shr		eax,4				;q = n>>4
		and		edx,15				;r = n&15
		shl		eax,23				;q -> exponent field
		add		eax,0x3F800000		;+ bits of 1.0f : eax = bits of 2^q
		push	eax					;the FPU loads from memory only
		fld		dword [esp]			;2^q
		fld		dword [powiP16table+edx*4]	;2^(r/16),2^q
		fmulp	st1,st0				;2^(r/16)*2^q
		pop		eax					;release the temporary
		ret

;************************************************************************
;	99/09/04 by shigeo
;	ʤ󤫤ΤФäƤ(^^;

;void ms_convert(float xr[2][576],float xr_org[2][576]);
		align	16
ms_convert_FPU:
;	void ms_convert(float xr[2][576],float xr_org[2][576]);   x87 version
;	For i = 0..575, with k = 1/sqrt(2) (D_1Psqr2):
;		xr[0][i] = (xr_org[0][i] + xr_org[1][i]) * k
;		xr[1][i] = (xr_org[0][i] - xr_org[1][i]) * k
;	Stack comments below use L = xr_org[0][i], R = xr_org[1][i].
		push	ebx
%assign _P 4*1
		mov		edx,[esp+_P+8]		;edx=xr_org[0]
		mov		eax,[esp+_P+4]		;eax=xr[0]
		mov		ebx,4*576			;byte offset from channel 0 to channel 1
		mov		ecx,576				;samples left
		fld		dword [D_1Psqr2]	;k
		jmp		.sample
		align	16
.sample:
		fld		dword [edx+ebx]		;R,k
		fld		dword [edx]			;L,R,k
		add		edx,4
		fld		st0					;L,L,R,k
		fsub	st0,st2				;L-R,L,R,k
		fxch	st2					;R,L,L-R,k
		faddp	st1,st0				;L+R,L-R,k
		fmul	st0,st2				;(L+R)*k,L-R,k
		fstp	dword [eax]			;L-R,k
		fmul	st0,st1				;(L-R)*k,k
		fstp	dword [eax+ebx]		;k
		add		eax,4
		dec		ecx
		jnz		.sample
		fstp	st0					;drop k: FPU stack is back to its entry depth
		pop		ebx
		ret
;------------------------------------------------------------------------
		align	16
ms_convert_3DN:
;	void ms_convert(float xr[2][576],float xr_org[2][576]);   3DNow! version
;	Same mid/side conversion as the FPU version, two samples per pass:
;		xr[0][i] = (xr_org[0][i] + xr_org[1][i]) * k
;		xr[1][i] = (xr_org[0][i] - xr_org[1][i]) * k      k = 1/sqrt(2)
		push	ebx
		femms
%assign _P 4*1
		mov		edx,[esp+_P+8]		;edx=xr_org[0]
		mov		eax,[esp+_P+4]		;eax=xr[0]
		mov		ebx,4*576			;byte offset from channel 0 to channel 1
		mov		ecx,576/2			;number of sample pairs
		movq	mm7,[D_1Psqr2]		;mm7=[k:k]
		jmp		.pair
		align	16
.pair:
		movq	mm1,[edx+ebx]		;mm1=org1
		movq	mm0,[edx]			;mm0=org0
		add		edx,8
		movq	mm2,mm0				;mm2=org0
		pfadd	mm0,mm1				;mm0=org0+org1
		pfsub	mm2,mm1				;mm2=org0-org1
		pfmul	mm0,mm7				;mm0=(org0+org1)*k
		pfmul	mm2,mm7				;mm2=(org0-org1)*k
		movq	[eax],mm0
		movq	[eax+ebx],mm2
		add		eax,8
		loop	.pair				;ecx counts the pairs down to zero
		femms						;leave the MMX/FPU state clean for the caller
		pop		ebx
		ret
;------------------------------------------------------------------------
;	00/01/21 by Kei
%ifdef USE_E3DN
		align	16
ms_convert_E3DN:
;	void ms_convert(float xr[2][576],float xr_org[2][576]);   Enhanced 3DNow! version
;	Mid/side conversion with k = 1/sqrt(2) (D_1Psqr2):
;		xr[0][i] = (xr_org[0][i] + xr_org[1][i]) * k
;		xr[1][i] = (xr_org[0][i] - xr_org[1][i]) * k
;	Both pointers are advanced to the end of channel 0 and ecx runs from
;	-576*4 up to 0, so [ptr+ecx] walks channel 0 and the loop ends on ZF.
;	Each pass handles 64 bytes (16 samples) per channel.

;	extern _clkbegin
;	extern _clkend
;	call 		_clkbegin
		
		femms
%assign _P 4*1
		; nothing is pushed here, so _P itself is the offset of the first argument
		mov		edx,[esp+_P+4]		;edx=xr_org[0]
		mov		eax,[esp+_P]		;eax=xr[0]
		mov		ecx,-576*4
		movq	mm7,[D_1Psqr2]		;mm7=[k:k]
		sub		edx, ecx			;edx=end of xr_org[0]
		sub		eax, ecx			;eax=end of xr[0]
		jmp		.block
		align	16
.block:
		prefetch	[edx+ecx+64]
;		prefetchw	[eax+ecx+64]
%assign	i 0
%rep 4
		; all four loads are issued before any store of this group
		movq	mm0,[edx+ecx+fsizen(i)]			;mm0=org0 (samples i,i+1)
		movq	mm1,[edx+ecx+fsizen(576+i)]		;mm1=org1
		movq	mm3,[edx+ecx+fsizen(i+2)]		;mm3=org0 (samples i+2,i+3)
		movq	mm4,[edx+ecx+fsizen(576+i+2)]	;mm4=org1

		movq	mm2,mm0				;mm2=org0
		pfadd	mm0,mm1				;mm0=org0+org1
		pfsub	mm2,mm1				;mm2=org0-org1

		movq	mm5,mm3				;mm5=org0
		pfadd	mm3,mm4				;mm3=org0+org1
		pfsub	mm5,mm4				;mm5=org0-org1

		pfmul	mm0,mm7
		pfmul	mm3,mm7
		pfmul	mm2,mm7
		pfmul	mm5,mm7

		movq	[eax+ecx+fsizen(i)],mm0
		movq	[eax+ecx+fsizen(i+2)],mm3
		movq	[eax+ecx+fsizen(576+i)],mm2
		movq	[eax+ecx+fsizen(576+i+2)],mm5
%assign	i i+4
%endrep
		add		ecx, 64
		jnz		near .block
		femms
;	call 		_clkend
		ret
%endif
;------------------------------------------------------------------------
;	99/11/??	Initial version by K.SAKAI
;	99/11/15	s/movups/movaps/ by K.SAKAI
		align	16
ms_convert_SSE:
;	void ms_convert(float xr[2][576],float xr_org[2][576]);   SSE version
;	Mid/side conversion with k = 1/sqrt(2) (D_1Psqr2), four samples per pass:
;		xr[0][i] = (xr_org[0][i] + xr_org[1][i]) * k
;		xr[1][i] = (xr_org[0][i] - xr_org[1][i]) * k
;	ecx is a byte offset that walks from the last group of four down to 0.
;	movaps is used throughout, so both arrays must be 16-byte aligned.
		mov		edx,[esp+8]		;edx=xr_org[0]
		mov		eax,[esp+4]		;eax=xr[0]
		movss	xmm7,[D_1Psqr2]		;xmm7=[-:-:-:k]
		shufps	xmm7,xmm7,0x00		;xmm7=[k:k:k:k]
		mov		ecx,(576-4)*4
		jmp		short .quad
		align	16
.quad:
		movaps	xmm1,[edx+ecx+4*576]		;xmm1=org1
		movaps	xmm0,[edx+ecx]			;xmm0=org0
		movaps	xmm2,xmm0
		addps	xmm0,xmm1				;xmm0=org0+org1
		subps	xmm2,xmm1				;xmm2=org0-org1
		mulps	xmm0,xmm7
		mulps	xmm2,xmm7
		movaps	[eax+ecx],xmm0
		movaps	[eax+ecx+4*576],xmm2
		sub		ecx,4*4
		jae		.quad				;no borrow: the offset just handled was not 0 yet
		ret
;************************************************************************
;	00/01/16 by shigeo
;int calc_runlen_count1( int *ix, gr_info *cod_info );
;	1190clk (calc_runlen + count1_bitcountؤѤ餺)
;	 970clk a little optimization

proc	calc_runlen_count1
;	int calc_runlen_count1( int *ix, gr_info *cod_info );
;	Phase 1 (.lp1): scan ix[] downwards in pairs, from ix[574..575], until a
;	                non-zero pair is found.
;	Phase 2 (.lp2): keep walking downwards in groups of four while every
;	                value of the group is 0 or 1 (unsigned), summing
;	                count1_table[p] into esi and hlen32_table[p] into edi,
;	                where p = (v<<3)+(w<<2)+(x<<1)+y.
;	Stores:  cod_info->big_values         = i/2
;	         cod_info->count1             = number of groups taken in phase 2
;	         cod_info->count1table_select = (sum0 < sum1) ? 0 : 1
;	Returns: eax = min(sum0, sum1) (unsigned),
;	         sum0 = esi+edi, sum1 = esi + 4*count1.
;	big_values / count1 / count1table_select are field offsets defined
;	outside this chunk (presumably in grinfo.inc).
		push	ebx
		push	esi
		push	edi
		push	ebp
%assign _P 4*4
		xor		esi,esi			;esi=sum00
		mov		ecx,576-2		;ecx=index of the pair under test
		mov		ebx,[esp+_P+4]	;ebx=ix
		xor		edi,edi			;edi=sum01
		add		ebx,(576-2)*4	;ebx=&ix[576-2]
		align	4
.lp1:
		mov		eax,[ebx]
		or		eax,[ebx+4]		;non-zero if either value of the pair is non-zero
		jnz		.F0
		sub		ebx,8
		sub		ecx,2
		jg		.lp1			;loops while ecx>0, so the pair ix[0..1] is never tested here
		xor		ebp,ebp			;count1=0
		; NOTE(review): ecx is 0 on this path, so .exit1 computes i=4 and stores
		; big_values=2 although ix[2..575] are all zero -- confirm this is intended.
		jmp		short .exit1
.F0:
		sub		ecx,2			;ecx=i-4
		sub		ebx,8			;ebx=&ix[i-4]
		push	ecx				;remember the starting index for the count1 computation
		align	4
.lp2:
		mov		ebp,[ebx]			;v
		mov		edx,[ebx+4]			;w
		lea		eax,[edx+ebp*2]		;w+(v<<1)
		or		edx,ebp
		mov		ebp,[ebx+8]			;x
		lea		eax,[ebp+eax*2]		;x+(w<<1)+(v<<2)
		or		edx,ebp
		mov		ebp,[ebx+12]		;y
		or		edx,ebp				;edx=v|w|x|y
		cmp		edx,1
		ja		.exit2				;some value is >1 (unsigned): group is not taken
		lea		eax,[ebp+eax*2]		;p=y+(x<<1)+(w<<2)+(v<<3)
		sub		ebx,16
		add		esi,[count1_table+eax*4]	;sum00 += count1_table[p] (original note: v + w + x + y)
		add		edi,[hlen32_table+eax*4]	;sum01 += hlen32_table[p]
		sub		ecx,4
		jge		.lp2				;stop once fewer than four values remain below
.exit2:
		pop		ebp					;ebp=ecx saved at .F0
		sub		ebp,ecx
		shr		ebp,2				;count1=(start-ecx)/4
.exit1:
		add		ecx,4				;ecx=i
		mov		edx,[esp+_P+8]		;edx=cod_info
		shr		ecx,1
		lea		eax,[esi+edi]		;eax=sum0
		mov		[edx+big_values],ecx	;big_values=i/2;
		mov		[edx+count1],ebp
		lea		ecx,[esi+ebp*4]		;ecx=sum1
		pop		ebp
		sub		eax,ecx				;eax=sum0-sum1, CF=(sum0<sum1)
		pop		edi
		sbb		ebx,ebx				;ebx = (sum0<sum1)?-1:0
		pop		esi
		and		eax,ebx				;eax = (sum0<sum1)?sum0-sum1:0
		inc		ebx					;ebx = (sum0<sum1)?0:1
		add		eax,ecx				;eax = (sum0<sum1)?sum0:sum1
		mov		[edx+count1table_select],ebx
		pop		ebx
		ret

%if 0	;00/01/16
;	99/09/04 by shigeo

	externdef hlen32
	externdef hlen33

;int count1_bitcount( int ix[ 576 ], gr_info *cod_info );
		align	16
count1_bitcount:
;	int count1_bitcount( int ix[ 576 ], gr_info *cod_info );
;	This routine sits inside an "%if 0" block, so it is not assembled.
;	Starting at ix[2*big_values], it walks cod_info->count1 groups of four
;	values and, with p = (v<<3)+(w<<2)+(x<<1)+y, accumulates
;		sum0 += count1_table[p] + hlen32[p]
;		sum1 += count1_table[p] + hlen33[p]
;	It stores count1table_select = (sum0 < sum1) ? 0 : 1 (unsigned compare)
;	and returns the smaller sum in eax.
		push	ebx
		push	esi
		push	edi
		push	ebp
%assign _P 4*4
		mov		edi,[esp+_P+4]		;edi=ix
		mov		eax,[esp+_P+8]		;eax=cod_info
		mov		ebx,[eax+big_values]
		lea		edi,[edi+ebx*8]		;ix += big_values * 2;
		mov		ecx,[eax+count1]	;ecx=count1
		xor		esi,esi				;esi=sum0
		xor		ebp,ebp				;ebp=sum1
		cmp		ecx,0
		jz		.next				;no groups: both sums stay 0
		align	4
.lp:
		mov		eax,[edi+12]		;y  (NOTE(review): the original remark here is encoding-damaged; it mentions "lame")
		mov		edx,[edi+8]			;x
		lea		eax,[eax+edx*2]
		mov		edx,[edi+4]			;w
		lea		eax,[eax+edx*4]
		mov		edx,[edi]			;v
		lea		eax,[eax+edx*8]		;p = (v << 3) + (w << 2) + (x << 1) + y;

		add		edi,4*4
		mov		edx,[count1_table+eax*4]	;edx=signbits
		add		esi,edx				;sum0 += signbits
		add		ebp,edx				;sum1 += signbits
		movzx	edx,byte [hlen32+eax]	;edx=ht[32].hlen[p]
		add		esi,edx
		movzx	edx,byte [hlen33+eax]	;edx=ht[33].hlen[p]
		add		ebp,edx
		dec		ecx
		jnz		.lp
.next:
		mov		eax,[esp+_P+8]		;eax=cod_info
		cmp		esi,ebp
		jae		.F0					;sum0 >= sum1 (unsigned): choose table 1
		mov		dword [eax+count1table_select],0
		mov		eax,esi				;return sum0
		jmp		.exit
.F0:
		mov		dword [eax+count1table_select],1
		mov		eax,ebp				;return sum1
.exit:
		pop		ebp
		pop		edi
		pop		esi
		pop		ebx
		ret
%endif


;************************************************************************
;	99/11/22 count_nz_xr_3DN, count_nz_xr_MMX by Kei
;	00/01/21 count_nz_xr_E3DN by Kei

;int
;count_nz_xr_c(float xr[])
;{
;	int  i;
;	int  ct = 0;
;	int* p = (int*)xr;
;	for( i = 0; i < 576; i++ ){
;		if( *p & 0x7FFFFFFF )ct++;	/* fabs(*p) > 0 */
;		p++;
;	}
;	return ct;
;}

%ifdef USE_E3DN
proc count_nz_xr_E3DN
;	int count_nz_xr(float xr[]) -- Enhanced 3DNow! version.
;	Counts the elements of xr[0..575] whose bits are non-zero once the sign
;	bit has been masked off (D_ABS).  Each pass handles 16 elements:
;	pcmpgtd yields -1 per non-zero lane and psubd of that mask adds 1 to the
;	matching lane of mm7.  The two lane totals are summed at the end and
;	the result is left in r0.

;	extern _clkbegin
;	extern _clkend
;	call 		_clkbegin

%$xr			arg		4

	femms

	mov			r0, [sp(%$xr)]

	pxor		mm5, mm5			; mm5 = 0, compare reference
	pxor		mm7, mm7			; mm7 = two running counters
	pmov		mm6, [D_ABS]		; mm6 = 0x7FFFFFFF in both lanes
	mov			r2, 576/16			; passes left
	jmp			near .block

	align		16
.block:
	; NOTE(review): the original (encoding-damaged) remark here appears to say
	; that prefetching 64 bytes ahead measured faster than the roughly
	; 700 bytes suggested by AMD's documentation -- not verified.
	prefetch	[r0+64]

%assign	i 0
%rep 2
	pmov		mm0, [r0+fsizen(i+0)]
	pmov		mm1, [r0+fsizen(i+2)]
	pmov		mm2, [r0+fsizen(i+4)]
	pmov		mm3, [r0+fsizen(i+6)]

	pand		mm0, mm6			; drop the sign bits
	pand		mm1, mm6
	pand		mm2, mm6
	pand		mm3, mm6

	pcmpgtd		mm0, mm5			; -1 where the element is non-zero
	pcmpgtd		mm1, mm5
	pcmpgtd		mm2, mm5
	pcmpgtd		mm3, mm5

	psubd		mm7, mm0			; counter -= -1
	psubd		mm7, mm1
	psubd		mm7, mm2
	psubd		mm7, mm3
%assign	i i+8
%endrep

	add			r0, fsizen(16)
	dec			r2
	jnz			near .block

	pmovd		r0, mm7				; low-lane total
	pswapd		mm6, mm7			; bring the high lane down
	pmovd		r2, mm6				; high-lane total
	femms
	add			r0, r2				; r0 = total count

;	call		_clkend

endproc
%endif

%macro count_nz_xr_MMX_macro 2
;	Counts the elements of a 576-entry float array whose bits are non-zero
;	once the sign bit has been masked off (D_ABS), 8 elements per pass.
;	in
;		%1 : xr
;	work
;		%2 : loop counter
;	out
;		%1 : result
;	Uses mm0-mm3 and mm5-mm7; the caller is responsible for emms/femms.
.for_init:
	pxor		mm5, mm5			; mm5 = 0, compare reference
	pxor		mm7, mm7			; mm7 = two running counters
	pmov		mm6, [D_ABS]		; mm6 = 0x7FFFFFFF in both lanes
	mov			%2, 576/8			; passes left
	jmp			near .for

	align		16
.for:
	pmov		mm0, [%1+fsizen(0)]
	pmov		mm1, [%1+fsizen(2)]
	pmov		mm2, [%1+fsizen(4)]
	pmov		mm3, [%1+fsizen(6)]
	add			%1, fsizen(8)

	pand		mm0, mm6			; drop the sign bits
	pand		mm1, mm6
	pand		mm2, mm6
	pand		mm3, mm6

	pcmpgtd		mm0, mm5			; -1 where the element is non-zero
	pcmpgtd		mm1, mm5
	pcmpgtd		mm2, mm5
	pcmpgtd		mm3, mm5

	psubd		mm7, mm0			; counter -= -1
	psubd		mm7, mm1
	psubd		mm7, mm2
	psubd		mm7, mm3

.for_next:
	dec			%2
	jnz			near .for

	pmovd		%1, mm7				; low-lane total
	psrlq		mm7, 32				; bring the high lane down
	pmovd		%2, mm7				; high-lane total
	add			%1, %2				; %1 = total count
%endmacro

proc count_nz_xr_3DN
;	int count_nz_xr(float xr[]) -- 3DNow! build.
;	Loads the xr pointer into r0 and expands count_nz_xr_MMX_macro, which
;	leaves the count in r0.  FEMMS brackets the MMX work.

;	extern _clkbegin
;	extern _clkend
;	call 		_clkbegin

%$xr			arg		4

	mov			r0, [sp(%$xr)]		; r0 = xr

	femms
	count_nz_xr_MMX_macro r0, r2
	femms

;	call		_clkend

endproc


proc count_nz_xr_MMX
;	int count_nz_xr(float xr[]) -- plain MMX build.
;	Loads the xr pointer into r0 and expands count_nz_xr_MMX_macro, which
;	leaves the count in r0.  EMMS brackets the MMX work.

;	call 		_clkbegin

%$xr			arg		4

	mov			r0, [sp(%$xr)]		; r0 = xr

	emms
	count_nz_xr_MMX_macro r0, r2
	emms

;	call		_clkend
endproc

;************************************************************************
;	99/11/22 set_l3_enc_sign_3DN, set_l3_enc_sign_MMX by Kei
;	00/01/21 set_l3_enc_sign_E3DN by Kei

;void
;set_l3_enc_sign_C(float xr[], int l3_enc[])
;{
;	int *pi = l3_enc;
;	int *p = (int *)xr;
;	int i;
;	for( i = 0; i < 576; i++ ){
;		if( (*p & 0x80000000) && (*pi > 0) )*pi = -*pi;
;		p++;
;		pi++;
;	}
;}

%ifdef USE_E3DN
proc	set_l3_enc_sign_E3DN
;	void set_l3_enc_sign(float xr[], int l3_enc[]) -- Enhanced 3DNow! version.
;	For i = 0..575: if xr[i] has its sign bit set and l3_enc[i] > 0,
;	l3_enc[i] is replaced by -l3_enc[i]; otherwise it is left unchanged.
;	Both pointers are advanced to the end of their arrays and r2 runs from
;	-fsizen(576) up to 0, 16 elements per pass.

;	extern _clkbegin
;	extern _clkend
;	call 		_clkbegin

%$xr			arg		4
%$l3_enc		arg		4

	femms

	mov			r3, [sp(%$l3_enc)]
	mov			r0, [sp(%$xr)]

	mov			r2, -fsizen(576)
	pxor		mm7, mm7			; mm7 = 0, compare reference
	sub			r3, r2				; r3 = end of l3_enc
	sub			r0, r2				; r0 = end of xr
	jmp			near .chunk

	align		16
.chunk:
	prefetch	[r0+r2+64]
	prefetchw	[r3+r2+64]

%assign	i 0
%rep 4
	pmov		mm1, [r3+r2+fsizen(i+0)]	; mm1 = l3_enc[i..i+1]
	pmov		mm4, [r3+r2+fsizen(i+2)]	; mm4 = l3_enc[i+2..i+3]

	pmov		mm0, mm1
	pmov		mm3, mm4

	pcmpgtd		mm0, mm7			; mm0 = (l3_enc > 0) ? -1 : 0
	pcmpgtd		mm3, mm7

	pand		mm0, [r0+r2+fsizen(i+0)]	; keep the xr bits where l3_enc > 0
	pand		mm3, [r0+r2+fsizen(i+2)]

	psrad		mm0, 31				; m = (l3_enc > 0 && xr sign bit set) ? -1 : 0
	psrad		mm3, 31

	pxor		mm1, mm0			; (x ^ m) - m  ==  m ? -x : x
	pxor		mm4, mm3
	psubd		mm1, mm0
	psubd		mm4, mm3

	pmov		[r3+r2+fsizen(i+0)], mm1
	pmov		[r3+r2+fsizen(i+2)], mm4
%assign	i i+4
%endrep

	add			r2, fsizen(16)
	jnz			near .chunk

	femms

;	call 		_clkend
endproc
%endif

%macro set_l3_enc_sign_MMX_macro 3
;	For each of 576 elements: if xr[i] has its sign bit set and
;	l3_enc[i] > 0, l3_enc[i] is replaced by -l3_enc[i]; otherwise it is
;	left unchanged.  Four elements are handled per pass.
;	in
;		%1 : xr
;		%2 : l3_enc
;	work
;		%3 : loop counter
;	%1 and %2 are advanced past the end of their arrays.  Uses mm0, mm1,
;	mm3, mm4 and mm7; the caller is responsible for emms/femms.
.for_init:
	pxor		mm7, mm7			; mm7 = 0, compare reference
	mov			%3, 576/4			; passes left
	jmp			near .for

	align		16
.for:
	pmov		mm1, [%2+fsizen(0)]	; mm1 = l3_enc[i..i+1]
	pmov		mm4, [%2+fsizen(2)]	; mm4 = l3_enc[i+2..i+3]

	pmov		mm0, mm1
	pmov		mm3, mm4

	pcmpgtd		mm0, mm7			; mm0 = (l3_enc > 0) ? -1 : 0
	pcmpgtd		mm3, mm7

	pand		mm0, [%1+fsizen(0)]	; keep the xr bits where l3_enc > 0
	pand		mm3, [%1+fsizen(2)]

	psrad		mm0, 31				; m = (l3_enc > 0 && xr sign bit set) ? -1 : 0
	psrad		mm3, 31

	pxor		mm1, mm0			; (x ^ m) - m  ==  m ? -x : x
	pxor		mm4, mm3
	psubd		mm1, mm0
	psubd		mm4, mm3

	pmov		[%2+fsizen(0)], mm1
	pmov		[%2+fsizen(2)], mm4

	add			%1, fsizen(4)
	add			%2, fsizen(4)

.for_next:
	dec			%3
	jnz			near .for
%endmacro

proc	set_l3_enc_sign_3DN
;	void set_l3_enc_sign(float xr[], int l3_enc[]) -- 3DNow! build.
;	Loads both pointers and expands set_l3_enc_sign_MMX_macro.
;	FEMMS brackets the MMX work.

%$xr			arg		4
%$l3_enc		arg		4

	mov			r3, [sp(%$l3_enc)]	; r3 = l3_enc
	mov			r0, [sp(%$xr)]		; r0 = xr

	femms
	set_l3_enc_sign_MMX_macro r0, r3, r2
	femms

endproc

proc	set_l3_enc_sign_MMX
;	void set_l3_enc_sign(float xr[], int l3_enc[]) -- plain MMX build.
;	Loads both pointers and expands set_l3_enc_sign_MMX_macro.
;	EMMS brackets the MMX work.

%$xr			arg		4
%$l3_enc		arg		4

	mov			r3, [sp(%$l3_enc)]	; r3 = l3_enc
	mov			r0, [sp(%$xr)]		; r0 = xr

	emms
	set_l3_enc_sign_MMX_macro r0, r3, r2
	emms

endproc

;************************************************************************

		end
