/*
 *	for new GOGO-no-coda (1999, 2000)
 *	based on lame3.28beta and optimized by shigeo
 */
/*
 * Build-time switches.  Within this file MIEasmCalcNoise selects the
 * external (assembly) calc_pow4P3mono/calc_pow4P3dual implementations
 * instead of the C fallbacks, and LAME355a compiles out part2_length().
 * The remaining switches are not referenced in the part of the file
 * reviewed here; presumably they enable the other assembly back ends.
 */
#define MIE
#define MIEasmHuffmana
#define MIEasmQuantize
#define MIEasmCalcNoise
#define MIEasmCalcRunlen
#define MIEasmIxMax
#define MIEasmCalcPow075
#define MIEasmQuantize_xrpow075
#define LAME355a /* NOTE(review): original note is mis-encoded; presumably "may be used even when LAME355 is not defined" - confirm */
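/*
 *	Change log.  The original Japanese text is mis-encoded in this file;
 *	the entries below are a best-effort English reconstruction and items
 *	marked (?) could not be recovered with confidence.  The "xN.NN"
 *	figures are kept as recorded (presumably measured speed ratios).
 *
 *	99/08/01
 *	Check whether the 4th argument of count_bit() is non-zero before
 *	making the function call (?)
 *		x2.00
 *	99/08/03
 *	Optimized under the following assumptions:
 *	quantizerStepSize may be an int
 *		(probably O.K. to change the type of the member variable) (?)
 *	subblock_gain[i]=0 (i=0,1,2)
 *		SBGAIN (so that it takes no value other than 0,1,2) (?)
 *	Is this really how C rounds?
 *		(int)(-1.4)=(int)(-1.5)=(int)(-1.6)=-1
 *		(int)( 1.4)=(int)( 1.5)=(int)( 1.6)= 1
 *	quantize_xrpow() rewritten in asm (FPU, 3DN)
 *		x1.30
 *	99/08/09
 *	calc_noise2() optimized & rewritten in asm (FPU)
 *		x1.10
 *	99/08/10
 *	calc_noise{1,2}() optimization, continued
 *		x1.05
 *	99/08/11
 *	calc_pow4P3dual() 3DN version
 *		x0.93
 *	99/08/14
 *	Optimized under the following assumptions:
 *	the entries of sfBandIndex[] are even (?)
 *	subdivide
 *		bigvalues_region = 2 * cod_info->big_values; holds
 *	(as a consequence) cod_info->address{1,2,3} are affected (?)
 *	begin/end of new_choose_table and ix_max may be assumed to be even (?)
 *	99/08/27
 *	Measured (?)
 *		calc_pow075() 3DN version
 *		x0.65
 *      99/10/11
 *      Changed with reference to lame3.28 & added VBR support
 *	99/10/30
 *		(entry about the stereo / non-stereo case; text not recoverable) (?)
 *	00/01/06
 *		Tried to follow lame3.55, but gave up part-way (?)
 *	00/01/11 Finished for the time being
 */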

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include "musenc.h"
#include "global.h"
#include "loop.h"
#include "huffman.h"
#include "l3bs.h"
#include "reserv.h"
#include "haveunit.h"
#include "l3psy.h"

/*
 * Pointers to routines that have several CPU-specific implementations.
 * ms_convert, calc_pow075 and calc_pow4P3dual are assigned by the
 * setup_*() functions in this file according to the useUNIT capability
 * mask; the setup code for the other pointers is not in the part of the
 * file reviewed here.
 */

static int (*quantize_xrpow)(float xr[576], int ix[576], gr_info *cod_info);
static int (*count_bit)(int ix[576],unsigned int start,unsigned int end,unsigned int table);
static void (*calc_pow4P3dual)(int *ix,float *xr,float *step,float *sum,int n,int next);
static void (*calc_pow075)( float *xr, float *xrpow );
static void (*ms_convert)(float xr[2][576],float xr_org[2][576]);
static int (*count_nz_xr)(float xr[]);
static void (*set_l3_enc_sign)(float xr[], int l3_enc[]);
static int (*new_choose_table)( int ix[576], unsigned int begin, unsigned int end, int *bits );

	/* exported to huffmana.nas (hence not static) */
int (*ix_max)( int ix[576], unsigned int begin, unsigned int end );

/*
 * Per-frame mid/side processing flags; iteration_init() resets and sets
 * them from l3_side->mode_ext and force_ms.
 */
static int convert_mdct, convert_psy, reduce_sidechannel;
//static int stereo
//static int mode_gr;
//static layer *info;

/* Forward declaration: used by bin_search_StepSize2() before its definition. */
static int count_bits( int  *ix, gr_info *cod_info);

/*
 * Scalefactor band boundaries: .l holds the long-block band edges
 * (last edge 576), .s the short-block band edges (last edge 192).
 * iteration_init() selects the entry with index
 * gl.freq_idx + gl.version * 3.
 * The 16 and 24 kHz tables were corrected on 00/01/20 from tables.c in
 * lame3.60.
 */
static struct scalefac_struct sfBandIndex[6] =
{

  { /* Table B.2.b: 22.05 kHz */
    {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},
    {0,4,8,12,18,24,32,42,56,74,100,132,174,192}
  },
  { /* Table B.2.c: 24 kHz */
    {0,6,12,18,24,30,36,44,54,66,80,96,114,136,162,194,232,278,332,394,464,540,576},
    {0,4,8,12,18,26,36,48,62,80,104,136,180,192}
  },
  { /* Table B.2.a: 16 kHz */
    {0,6,12,18,24,30,36,44,54,66,80,96,116,140,168,200,238,284,336,396,464,522,576},
    {0,4,8,12,18,26,36,48,62,80,104,134,174,192}
  },
  { /* Table B.8.b: 44.1 kHz */
    {0,4,8,12,16,20,24,30,36,44,52,62,74,90,110,134,162,196,238,288,342,418,576},
    {0,4,8,12,16,22,30,40,52,66,84,106,136,192}
  },
  { /* Table B.8.c: 48 kHz */
    {0,4,8,12,16,20,24,30,36,42,50,60,72,88,106,128,156,190,230,276,330,384,576},
    {0,4,8,12,16,22,28,38,50,64,80,100,126,192}
  },
  { /* Table B.8.a: 32 kHz */
    {0,4,8,12,16,20,24,30,36,44,54,66,82,102,126,156,194,240,296,364,448,550,576},
    {0,4,8,12,16,22,30,42,58,78,104,138,180,192}
  }
};

/*
 * Scalefactor-band counts, four per row (one per partition).
 * NOTE(review): not referenced in the part of the file reviewed here;
 * presumably rows of this table are what gr_info.sfb_partition_table
 * points at (part2_length() multiplies slen[k] by
 * sfb_partition_table[k] for k = 0..3) - confirm against the code that
 * selects scalefac_compress for MPEG-2.
 * Index order looks like [table][block kind][partition] - confirm.
 */
static const unsigned nr_of_sfb_block[6][3][4] =
{
  {
    {6, 5, 5, 5},
    {9, 9, 9, 9},
    {6, 9, 9, 9}
  },
  {
    {6, 5, 7, 3},
    {9, 9, 12, 6},
    {6, 9, 12, 6}
  },
  {
    {11, 10, 0, 0},
    {18, 18, 0, 0},
    {15,18,0,0}
  },
  {
    {7, 7, 7, 0},
    {12, 12, 12, 0},
    {6, 15, 12, 0}
  },
  {
    {6, 6, 6, 3},
    {12, 9, 9, 6},
    {6, 12, 9, 6}
  },
  {
    {8, 8, 5, 0},
    {15,12,9,0},
    {6,18,9,0}
  }
};

/*
 * Active scalefactor band edge tables.  They default to entry 3
 * (44.1 kHz) and are re-pointed by iteration_init() on its first call.
 */
const int *scalefac_long  = &sfBandIndex[3].l[0];
const int *scalefac_short = &sfBandIndex[3].s[0];

/* State that InitLoop() must reset before each encode. */
static int OldValue = -30;	/* initial step-size guess for bin_search_StepSize2(); updated with its last result */
static int firstcall = 1;	/* cleared by iteration_init() after its one-time setup */
#ifdef LAME355
static float masking_lower = 1;
#endif

/*
 * Size of the i^(4/3) lookup table filled by calc_noise_init().
 * bin_search_StepSize2() only counts bits when quantize_xrpow() returns
 * at most 8191 + 14, i.e. the largest valid index of this table.
 * NOTE(review): the original (mis-encoded) note says the table is also
 * used from quantize.nas - confirm.
 */

#define PRECALC_SIZE 8206 

extern float pow4P3_table[PRECALC_SIZE];

static
void
calc_noise_init(void)
{
	/* Fill the lookup table with idx^(4/3) for every valid index. */
	int idx = 0;

	while( idx < PRECALC_SIZE ){
		pow4P3_table[idx] = pow(idx, 4.0/3.0);
		idx++;
	}
}

/*
 * InitLoop: reset the file-scope quantizer state and rebuild the
 * i^(4/3) table.  Terminates the process with exit(1) if one of the
 * unsupported experimental switches (experimentalY / experimentalZ)
 * is set.
 */
void
InitLoop(void)
{
	const char *unsupported = NULL;

	firstcall = 1;
	OldValue = -30;	/* starting point of bin_search_StepSize2 */
#ifdef LAME355
	masking_lower = 1;
#endif
	calc_noise_init();

	/* experimentalY takes precedence when both switches are set */
	if( experimentalY ){
		unsupported = "experimentalY is not supported now.\n";
	}else if( experimentalZ ){
		unsupported = "experimentalZ is not supported now.\n";
	}
	if( unsupported != NULL ){
		fputs(unsupported, stderr);
		exit(1);
	}
}

/* Direction of the previous step taken by bin_search_StepSize2(). */
typedef enum {
    BINSEARCH_NONE,	/* no step taken yet */
    BINSEARCH_UP, 	/* step size was increased (too many bits) */
    BINSEARCH_DOWN	/* step size was decreased (too few bits) */
} binsearchDirection_t;

/*
 * bin_search_StepSize2: search for the quantizer step size whose bit
 * count approaches desired_rate.
 *
 * input:  xrpow, cod_info      output: ix, cod_info->quantizerStepSize
 *
 * The search starts at OldValue (the result of the previous call) and
 * moves in strides of 4.  After the first change of direction the
 * stride is halved on every iteration; the search stops once a
 * quantization has been done with stride 1, or when the bit count hits
 * desired_rate exactly.  Returns the bit count of the last quantization
 * (100000 when quantize_xrpow() reported a value above 8191 + 14).
 */
static
int
bin_search_StepSize2( int desired_rate, int *ix, float *xrpow, gr_info *cod_info )
{
	binsearchDirection_t last_move = BINSEARCH_NONE;
	int step_size = OldValue;
	int stride = 4;		/* never negative, so halving is done with a shift */
	int reversed = 0;
	int bits;

	while( 1 ){
		binsearchDirection_t move;

		cod_info->quantizerStepSize = step_size;
		bits = quantize_xrpow(xrpow, ix, cod_info);
		if( bits <= 8191 + 14 ){
			bits = count_bits(ix,cod_info);
		}else{
			bits = 100000;
		}

		/* finest stride already tried, or exact hit (most unlikely) */
		if( stride == 1 || bits == desired_rate ){
			break;
		}

		if( reversed ){
			stride >>= 1;
		}

		/* too many bits -> larger step size, too few -> smaller */
		move = ( bits > desired_rate ) ? BINSEARCH_UP : BINSEARCH_DOWN;
		if( !reversed && last_move != BINSEARCH_NONE && last_move != move ){
			reversed = 1;
			stride >>= 1;
		}
		last_move = move;
		step_size += ( move == BINSEARCH_UP ) ? stride : -stride;
	}

	OldValue = step_size;
	return bits;
}

/* convert from L/R <-> Mid/Side: CPU-specific implementations */
void ms_convert_3DN(float xr[2][576],float xr_org[2][576]);
void ms_convert_FPU(float xr[2][576],float xr_org[2][576]);
void ms_convert_SSE(float xr[2][576],float xr_org[2][576]);
#ifdef USE_E3DN
void ms_convert_E3DN(float xr[2][576],float xr_org[2][576]);
#endif

/*
 * setup_ms_convert: bind ms_convert to the first implementation whose
 * capability bit is set in useUNIT.  Priority: E3DN (only when built
 * with USE_E3DN), 3DN, SSE, then the FPU fallback.
 */
void
setup_ms_convert(int useUNIT)
{
#ifdef USE_E3DN
	if( useUNIT & tE3DN ){
		SETUP_DSP("use:ms_convert_E3DN:\n");
		ms_convert = ms_convert_E3DN;
		return;
	}
#endif
	if( useUNIT & t3DN ){
		SETUP_DSP("use:ms_convert_3DN:\n");
		ms_convert = ms_convert_3DN;
		return;
	}
	if( useUNIT & tSSE ){
		SETUP_DSP("use:ms_convert_SSE:\n");
		ms_convert = ms_convert_SSE;
		return;
	}
	SETUP_DSP("use:ms_convert_FPU:\n");
	ms_convert = ms_convert_FPU;
}
#if 0
/*
 * Plain C version of the L/R -> Mid/Side conversion, kept for reference
 * only (compiled out):
 *   xr[0] = (L + R) / sqrt(2),  xr[1] = (L - R) / sqrt(2)
 */
void ms_convert_C(float xr[2][576],float xr_org[2][576])
{
  int i;
	for ( i = 0; i < 576; i++ ){
		xr[0][i] = (xr_org[0][i]+xr_org[1][i])*(1./SQRT2);
		xr[1][i] = (xr_org[0][i]-xr_org[1][i])*(1./SQRT2);
	}
}
#endif

/* 99/09/21 ޤ */

/*
 * pow2i025: store 2^(n/4) in *ret without calling pow().
 *
 * n is split into n = 4*m + r with r = n & 3 (0..3).  2^m is built
 * directly as an IEEE-754 single by placing the biased exponent
 * (m + 127) in bits 23..30, and the result is scaled by 2^(r/4) from a
 * small table.  Equivalent to pow(2.0, n * 0.25) for results in the
 * normal float range.
 *
 * Assumes 32-bit unsigned int, IEEE-754 float, and an arithmetic right
 * shift for negative n (as the original code did).  The bit pattern is
 * assembled with unsigned arithmetic and read back through a union:
 * the previous "*(float *)&m" violated the aliasing rules and
 * "m << 23" is undefined for negative m, which is the normal case
 * because step sizes are usually negative.
 */
static
void
pow2i025(int n, float *ret){
	/* frac[i] = 2^(i/4) */
	static const float frac[4]={1,1.18920711498,1.41421356237,1.68179283048};
	union { unsigned int u; float f; } pw;
	int m;

	m = n >> 2;
	/* same bits as (m << 23) + 0x3F800000 on two's complement, i.e. 2^m */
	pw.u = ((unsigned int)m << 23) + 0x3F800000u;
	*ret = frac[ n & 3 ] * pw.f;
}

/*
 *	99/08/27 by shigeo
 *	(original note is mis-encoded and could not be recovered)
 *
 *	Implementations of calc_pow075(); the plain C version at the bottom
 *	shows the contract: xrpow[i] = |xr[i]|^(3/4) for i = 0..575.
 */

void calc_pow075_3DN( float *xr, float *xrpow );
#ifdef USE_E3DN
void calc_pow075_E3DN( float *xr, float *xrpow );
#endif
//void calc_pow075_FPU( float *xr, float *xrpow ); /* not used: setup_calc_pow075() notes 45k clk vs 40k clk for NONE */
void calc_pow075_SSE( float *xr, float *xrpow );
void calc_pow075_NONE( float *xr, float *xrpow );
#if 0
/* Reference C version (compiled out): sqrt(sqrt(t) * t) == t^(3/4). */
static
void
calc_pow075_C( float *xr, float *xrpow ){
	int i;
	for( i = 0; i < 576; i++ ){
		float temp;
		temp = fabs( *xr );
		xr++;
		*xrpow++ = sqrt( sqrt(temp) * temp );
	}
}
#endif

/*
 * setup_calc_pow075: bind calc_pow075 to the first implementation whose
 * capability bit is set in useUNIT.  Priority: E3DN (only when built
 * with USE_E3DN), 3DN, SSE, then the generic "NONE" version.
 */
void
setup_calc_pow075(int useUNIT)
{
#ifdef USE_E3DN
	if( useUNIT & tE3DN ){
		SETUP_DSP("use:calc_pow075_E3DN\n");
		calc_pow075 = calc_pow075_E3DN;
		return;
	}
#endif
	if( useUNIT & t3DN ){
		SETUP_DSP("use:calc_pow075_3DN\n");
		calc_pow075 = calc_pow075_3DN;
		return;
	}
	if( useUNIT & tSSE ){
		SETUP_DSP("use:calc_pow075_SSE\n");
		calc_pow075 = calc_pow075_SSE;
		return;
	}
	/*
	 * Fallback.  A dedicated FPU version (calc_pow075_FPU) was noted at
	 * 45k clk against 40k clk for NONE, so NONE is used.
	 */
	SETUP_DSP("use:calc_pow075_NONE\n");
	calc_pow075 = calc_pow075_NONE;		/* 40k clk */
}

/*
 * preemphasis: switch preemphasis on for a long-block granule when the
 * four upper scalefactor bands (17..20) all show positive distortion.
 *
 * When switched on, bands 11 and up are amplified by sqrt(2)^pretab
 * (squared when scalefac_scale is set): xr is scaled by that gain,
 * xrpow by gain^(3/4) and the allowed distortion by gain^2.
 *
 * Returns 1 if preemphasis was switched on by this call, 0 if nothing
 * was changed apart from possibly copying preflag from granule 0.
 */

static
int
preemphasis( float xr[576], float xrpow[576], III_psy_xmin  *l3_xmin,
		int gr, int ch, III_side_info_t *l3_side, float distort[4][CBLIMIT] )
{
	static const float gain_tbl[21] =	/* = sqrt(2) ^ pretab[i] */
	{
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, SQRT2, SQRT2, SQRT2, SQRT2,
		2, 2, 2*SQRT2, 2*SQRT2, 2*SQRT2, 2
	};
#define P 1.29683955463	/* = 2^(3/8) */
	static const float gain34_tbl[21] = /* = gain_tbl[i] ^ 0.75 */
	{
		1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, P, P, P, P,
		P*P,P*P,P*P*P,P*P*P,P*P*P,P*P
	};
#undef P
	gr_info *gi = &l3_side->gr[gr].ch[ch].tt;
	int band;

#ifndef no_scfsi
	if( gr == 1 ){
		/*
		 * Second granule: if scfsi is active in at least one
		 * scfsi_band, preemphasis follows the first granule.
		 */
		int k = 0;
		while( k < 4 ){
			if( l3_side->scfsi[ch][k] ){
				gi->preflag = l3_side->gr[0].ch[ch].tt.preflag;
				return 0;
			}
			k++;
		}
	}
#endif

	/* short blocks never use it; nothing to do if it is already on */
	if( gi->block_type == SHORT_TYPE || gi->preflag ){
		return 0;
	}
	/* every one of the upper four bands must exceed its threshold */
	for( band = 17; band <= 20; band++ ){
		if( distort[0][band] <= 0 ) return 0;
	}

	gi->preflag = 1;
	/* bands below 11 have gain 1 and are skipped */
	for( band = 11; band < gi->sfb_lmax; band++ ){
		float gain   = gain_tbl[band];
		float gain34 = gain34_tbl[band];
		int first = scalefac_long[ band ];
		int stop  = scalefac_long[ band+1 ];
		int k;

		if( gi->scalefac_scale ){
			gain *= gain;
			gain34 *= gain34;
		}
		l3_xmin->l[gr][ch][band] *= gain*gain;
		for( k = first; k < stop; k++ ) xr[k] *= gain;
		for( k = first; k < stop; k++ ) xrpow[k] *= gain34;
	}
	return 1;
}

/* Ƥdistort[]=0λ⤷ʤ0֤ */

/*
 * amp_scalefac_bands: amplify every scalefactor band whose distortion
 * exceeds a threshold and bump its scalefactor.
 *
 * The threshold is 0 unless all bands are below masking; in that case
 * it is 1.05 times the largest (negative) distortion, so only the bands
 * within 5% of the worst one are amplified.  Long bands use distort[0],
 * the three short windows use distort[1..3].
 *
 * For each amplified band xr is scaled by sqrt(2) (2 with
 * scalefac_scale), xrpow by that factor to the power 3/4 and the
 * allowed distortion by its square.
 *
 * Returns the number of bands that were amplified.  The iteration
 * argument is not used.
 */
static
int
amp_scalefac_bands( float xr[576], float xrpow[576],
		III_psy_xmin *l3_xmin, III_side_info_t *l3_side,
		III_scalefac_t *scalefac,
		int gr, int ch, int iteration, float distort[4][CBLIMIT])
{
	gr_info *gi = &l3_side->gr[gr].ch[ch].tt;
	D192_3 *xr_short = (D192_3 *) xr;
	D192_3 *xrpow_short = (D192_3 *) xrpow;
	float amp, amp2, amp34;
	float limit;
	int band, win, k;
	int amplified = 0;

	if( gi->scalefac_scale ){
		amp = 2;
		amp2 = 4;
		amp34 = 1.68179283050743; /* = 2^(3/4) */
	}else{
		amp = SQRT2;
		amp2 = 2;
		amp34 = 1.29683955465101; /* = 2^(3/8) */
	}

	/* ---- long bands ---- */
	limit = -900 / 1.05;
	for( band = 0; band < gi->sfb_lmax; band++ ){
		limit = Max( distort[0][band], limit );
	}
	limit *= 1.05;
	limit = Min( limit, 0 );

	for( band = 0; band < gi->sfb_lmax; band++ ){
		int first, stop;
		if( distort[0][band] <= limit ) continue;
		amplified++;
		l3_xmin->l[gr][ch][band] *= amp2;
		scalefac->l[gr][ch][band]++;
		first = scalefac_long[band];
		stop  = scalefac_long[band+1];
		for( k = first; k < stop; k++ ) xr[k] *= amp;
		for( k = first; k < stop; k++ ) xrpow[k] *= amp34;
	}

	/*
	 * ---- short bands ----
	 * Note that scfsi is not enabled for frames containing short blocks
	 */
	limit = -900 / 1.05;
	for( win = 0; win < 3; win++ ){
		for( band = gi->sfb_smax; band < SFB_SMAX - 1; band++ ){
			limit = Max( distort[win+1][band], limit );
		}
	}
	limit *= 1.05;
	limit = Min( limit, 0 );

	for( win = 0; win < 3; win++ ){
		for( band = gi->sfb_smax; band < SFB_SMAX - 1; band++ ){
			int first, stop;
			if( distort[win+1][band] <= limit ) continue;
			amplified++;
			l3_xmin->s[gr][ch][band][win] *= amp2;
			scalefac->s[gr][ch][band][win]++;
			first = scalefac_short[band];
			stop  = scalefac_short[band+1];
			for( k = first; k < stop; k++ ) (*xr_short)[k][win] *= amp;
			for( k = first; k < stop; k++ ) (*xrpow_short)[k][win] *= amp34;
		}
	}
	return amplified;
} /* end of amp_scalefac_bands */


/*
 *	ñͥcalc_noise(ΥǤʤƤƤФ뤳ȤΤ)
 */

#ifdef MIEasmCalcNoise

/* External implementation; same contract as the C fallback below. */
void calc_pow4P3mono(int *ix,float *xr,float *step,float *sum,int n,int next);

#else

/*
 *	99/08/10
 *	calc_pow4P3mono: sum of squared quantization errors over n values,
 *	  *sum = SUM ( |xr| - ix^(4/3) * *step )^2
 *	The last argument, next, is the number of elements by which the
 *	ix/xr pointers advance per value (1 for long blocks, 3 for one
 *	short window).
 *	(original, mis-encoded notes, presumably: "in practice the pointers
 *	advance by next*4 bytes" and "n must not be 0")
 */

void
calc_pow4P3mono(int *ix,float *xr,float *step,float *sum,int n,int next){
	double temp,s=0;
	for(;n>0;n--){
		if( *ix < PRECALC_SIZE ){
			temp = p_pow4P3_table[*ix];
		}else{
			/* outside the precomputed table: treated as a fatal error */
			temp = pow(*ix,4/3.);
			puts("err");exit(1);/* never run */
		}
		temp = fabs(*xr) - temp * *step;
		s += temp * temp;
		ix += next;
		xr += next;
	}
	*sum = s;
}

#endif /* MIEasmCalcNoise */


/*
 * calc_noise1: per-band quantization noise of one channel, in dB
 * relative to the allowed distortion l3_xmin.
 *
 * For every long band (index < sfb_lmax) and every short band/window
 * (index >= sfb_smax) the mean squared quantization error is divided by
 * the allowed distortion and converted to dB; ratios of 0.001 or less
 * are reported as -30.  Results go to distort[0][band] for long bands
 * and distort[window+1][band] for short ones.
 *
 * Outputs: *over_noise = sum of the positive dB values (divided by
 * their count when that count exceeds 1), *tot_noise = sum over all
 * bands (divided by the band count when it exceeds 1), *max_noise =
 * largest dB value (at least -999).
 * Returns the number of bands with positive noise.
 */

static
int
calc_noise1( float xr[576], int ix[576], gr_info *cod_info,
	float distort[4][CBLIMIT],
	III_psy_xmin *l3_xmin,int gr, int ch, float *over_noise,
	float *tot_noise, float *max_noise )
{
	D192_3 *xr_short = (D192_3 *)xr;
	I192_3 *ix_short = (I192_3 *)ix;
	int n_over = 0, n_bands = 0;
	int band, win;
	float step;

	*over_noise = 0;
	*tot_noise = 0;
	*max_noise = -999;

	pow2i025( cod_info->quantizerStepSize, &step );

	/* long bands */
	for( band = 0; band < cod_info->sfb_lmax; band++ ){
		int first = scalefac_long[ band ];
		int width = scalefac_long[ band+1 ] - first;
		float err;
		double db;

		calc_pow4P3mono( &ix[first], &xr[first], &step, &err, width, 1 );
		db = err / ( width * l3_xmin->l[gr][ch][band] );
		db = ( db > 0.001 ) ? 10 * log10( db ) : -30;

		distort[0][band] = db;
		if( db > 0 ){
			n_over++;
			*over_noise += db;
		}
		*tot_noise += db;
		*max_noise = Max( *max_noise, db );
		n_bands++;
	}

	/* short bands, one window at a time (subblock_gain is not supported) */
	for( win = 0; win < 3; win++ ){
		for( band = cod_info->sfb_smax; band < SFB_SMAX - 1; band++ ){
			int first = scalefac_short[ band ];
			int width = scalefac_short[ band+1 ] - first;
			float err;
			double db;

			calc_pow4P3mono( &(*ix_short)[first][win], &(*xr_short)[first][win], &step, &err, width, 3 );
			db = err / ( width * l3_xmin->s[gr][ch][band][win] );
			db = ( db > 0.001 ) ? 10 * log10( db ) : -30;

			distort[win+1][band] = db;
			if( db > 0 ){
				n_over++;
				*over_noise += db;
			}
			*tot_noise += db;
			*max_noise = Max( *max_noise, db );
			n_bands++;
		}
	}

	if( n_bands > 1 ) *tot_noise /= n_bands;
	if( n_over > 1 ) *over_noise /= n_over;
	return n_over;
} /* end of calc_noise1 */

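/*
 *	calc_noise for the dual-channel case.
 *	Called when quantizing the mid/side channels while using the
 *	thresholds (masking thresholds) obtained from the L/R channels.
 */

/*
 *	99/08/09
 *	Optimized under the following assumptions:
 *	1. cod_info[ch]->subblock_gain[i]=0 for i=0,1,2
 *	2. stereo=2
 */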

#ifdef MIEasmCalcNoise

void
calc_pow4P3dual_3DN(int *ix,float *xr,float *step,float *sum,int n,int next);

void
calc_pow4P3dual_FPU(int *ix,float *xr,float *step,float *sum,int n,int next);

void
calc_pow4P3dual_SSE(int *ix,float *xr,float *step,float *sum,int n,int next);

/*
 * setup_calc_pow4P3dual: bind calc_pow4P3dual to the first
 * implementation whose capability bit is set in useUNIT.
 * Priority: 3DN, SSE, then the FPU fallback.
 */
void
setup_calc_pow4P3dual(int useUNIT)
{
	if( useUNIT & t3DN ){
		SETUP_DSP("use:calc_pow4P3dual_3DN\n");
		calc_pow4P3dual=calc_pow4P3dual_3DN;
		return;
	}
	if( useUNIT & tSSE ){
		SETUP_DSP("use:calc_pow4P3dual_SSE\n");
		calc_pow4P3dual=calc_pow4P3dual_SSE;
		return;
	}
	SETUP_DSP("use:calc_pow4P3dual_FPU\n");
	calc_pow4P3dual=calc_pow4P3dual_FPU;
}

#else /* MIEasmCalcNoise */

/*
 * calc_pow4P3: signed quantization error of one coefficient,
 *   *diff = *xr - sign(*xr) * ix^(4/3) * *step
 * ix^(4/3) comes from the lookup table when ix is inside it.
 */
void
calc_pow4P3(int ix,float *xr, float *step,float *diff){
	float temp;
	temp = (ix < PRECALC_SIZE )? p_pow4P3_table[ix] : pow( ix, 4/3.);
	temp *= *step;
	if( *xr < 0 )temp = -temp;
	*diff = *xr - temp;
}

/*
 *	99/08/10
 *	calc_pow4P3dual: C reference version for two channels at once.
 *	The second channel is read 576 elements after the first
 *	(ix+576 / xr+576) and step[0]/step[1] are the per-channel steps.
 *	With d0/d1 the per-channel errors it returns
 *	  sum[0] = SUM (d0+d1)^2 / 2,   sum[1] = SUM (d0-d1)^2 / 2
 *	The last argument, next, is the number of elements by which the
 *	ix/xr pointers advance per value.
 *	(original, mis-encoded notes, presumably: "in practice the pointers
 *	advance by next*4 bytes" and "n must not be 0")
 *	99/08/11
 *	(mis-encoded note, presumably: part of the arithmetic was moved out
 *	of the loop - the 0.5 factor is applied once after it)
 */

void
calc_pow4P3dual(int *ix,float *xr,float *step,float *sum,int n,int next){
	float temp,diff0,diff1,s0=0,s1=0;
	for(;n>0;n--){
		calc_pow4P3(*ix,xr,&step[0],&diff0);
		calc_pow4P3(*(ix+576),xr+576,&step[1],&diff1);
		ix += next;
		xr += next;

		temp = diff0 + diff1;
		s0 += temp * temp;

		temp = diff0 - diff1;
		s1 += temp * temp;
    }
	sum[0] = s0*0.5;
	sum[1] = s1*0.5;
}

#endif /* MIEasmCalcNoise */

#ifndef LAME355a
#ifdef no_scfsi
/*
 * part2_length (build without scfsi): number of scalefactor bits of
 * granule gr / channel ch.  With gl.mode_gr == 2 the count is looked up
 * from scalefac_compress; otherwise it is the sum of
 * slen[k] * sfb_partition_table[k] over the four partitions.
 */
static
int
part2_length( int gr, int ch, III_side_info_t *si )
{
	/* (slen1 + slen2) * 18 for each scalefac_compress value */
	static const int short_bits[16] = {0,18,36,54,54,36,54,72,54,72,90,72,90,108,108,126};
	/* slen1 * 11 + slen2 * 10 for each scalefac_compress value */
	static const int long_bits[16] = {0,10,20,30,33,21,31,41,32,42,52,43,53,63,64,74};
	gr_info *gi = &si->gr[gr].ch[ch].tt;

	if( gl.mode_gr != 2 ){
		int part, bits = 0;
		for( part = 0; part < 4; part++ ){
			bits += gi->slen[part] * gi->sfb_partition_table[part];
		}
		return bits;
	}
	return ( gi->block_type == SHORT_TYPE )
		? short_bits[ gi->scalefac_compress ]
		: long_bits[ gi->scalefac_compress ];
}
#else /* no_scfsi */
/*
 * part2_length (build with scfsi): number of scalefactor bits of
 * granule gr / channel ch.  With gl.mode_gr == 2, slen1/slen2 are
 * looked up from scalefac_compress; in the second granule of a
 * long-block channel the scfsi bands that are shared with the first
 * granule cost nothing.  Otherwise the count is the sum of
 * slen[k] * sfb_partition_table[k] over the four partitions.
 */
static
int
part2_length( int gr, int ch, III_side_info_t *si )
{
	static const int slen1_tab[16] = {0,0,0,0,3,1,1,1,2,2,2,3,3,3,4,4};
	static const int slen2_tab[16] = {0,1,2,3,0,1,2,3,1,2,3,1,2,3,2,3};
	/* number of scalefactors in each of the four scfsi bands */
	static const int band_count[4] = {6,5,5,5};
	gr_info *gi = &si->gr[gr].ch[ch].tt;
	int slen1, slen2;
	int band, bits;

	if( gl.mode_gr != 2 ){
		bits = 0;
		for( band = 0; band < 4; band++ ){
			bits += gi->slen[band] * gi->sfb_partition_table[band];
		}
		return bits;
	}

	slen1 = slen1_tab[ gi->scalefac_compress ];
	slen2 = slen2_tab[ gi->scalefac_compress ];

	if( gi->block_type == SHORT_TYPE ){
		return ( slen1 + slen2 ) * 18;
	}
	if( gr == 0 ){
		return slen1 * 11 + slen2 * 10;
	}

	/* second granule: only the bands that are not shared are sent */
	bits = 0;
	for( band = 0; band < 4; band++ ){
		if( !si->scfsi[ch][band] ){
			bits += ( band < 2 ? slen1 : slen2 ) * band_count[band];
		}
	}
	return bits;
}
#endif /* no_scfsi */
#endif /* !LAME355a */

/* This function is only called when convert_psy == 1, so stereo == 2 holds. */

/*
 *	xfsf, distort[0][i][j] = distort[1][i][j]
 */

/*
 * calc_noise2: quantization noise of a channel pair, in dB relative to
 * the allowed distortion l3_xmin.
 *
 * xr/ix hold the two channels back to back; calc_pow4P3dual() delivers,
 * per band, the error energy of the sum (sum[0]) and of the difference
 * (sum[1]) of the two channels' quantization errors.  Each is divided
 * by bandwidth * l3_xmin of channel 0 / 1 and converted to dB (ratios
 * of 0.001 or less are reported as -30).
 *
 * Outputs, per channel ch: over[ch] = number of bands with positive
 * noise, over_noise[ch] = sum of the positive dB values, tot_noise[ch]
 * = sum of all dB values, max_noise[ch] = largest dB value (at least
 * -999).  Unlike calc_noise1() the sums are not averaged.
 * distort[0][band] (long) or distort[window+1][band] (short) receives
 * the larger of the two channels' dB values, so a single distort array
 * serves both channels.
 *
 * Both channels are assumed to use the same block type: only
 * cod_info[0]->block_type is examined.
 */
static
void
calc_noise2( float xr[2][576], int ix[2][576], gr_info *cod_info[2],
		float distort[4][CBLIMIT], III_psy_xmin *l3_xmin,int gr, int over[2], 
		float over_noise[2], float tot_noise[2], float max_noise[2] )
{
	int	start, sfb;
	float sum[2],step[2];
	int bw;
	int ch;

	/* NOTE(review): mis-encoded original note, presumably "lame only
	 * initialises these for block_type == SHORT_TYPE - on purpose?" */
	over_noise[0] = over_noise[1] = 0;
	tot_noise[0] = tot_noise[1] = 0;
	max_noise[0] = max_noise[1] = -999;

	pow2i025( cod_info[0]->quantizerStepSize, &step[0] );
	pow2i025( cod_info[1]->quantizerStepSize, &step[1] );
	over[0] = over[1] = 0;
	/* calc_noise2: both channels are taken to have the same block type */
	
	if( cod_info[0]->block_type != SHORT_TYPE ){
		/* long blocks */
		for ( sfb = 0; sfb < SFB_LMAX-1; sfb++ ){
			float dis_temp[2];
			start = scalefac_long[ sfb ];
			bw = scalefac_long[ sfb+1 ] - start;

			calc_pow4P3dual(&ix[0][start],&xr[0][start],step,sum,bw,1);
			
			for( ch = 0; ch < 2; ch++ ){
				float temp;

				temp = sum[ch] / ( bw * l3_xmin->l[gr][ch][sfb] );
				if( temp > 0.001 ){
					temp = 10 * log10( temp );
				}else{
					temp = -30;
				}
				dis_temp[ch] = temp;
				if( temp > 0 ){
					over[ch]++;
					over_noise[ch] += temp;
				}
				tot_noise[ch] += temp;
				max_noise[ch] = Max( max_noise[ch], temp );
			}
			distort[0][sfb] = Max( dis_temp[0], dis_temp[1] );
		}
		return;
	}
	{
		/* short blocks: coefficients are viewed as [192][3] (line, window) */
		D192_3 *xr_s[2];
		I192_3 *ix_s[2];
		xr_s[0] = (D192_3 *) xr[0];
		xr_s[1] = (D192_3 *) xr[1];
		ix_s[0] = (I192_3 *) ix[0];
		ix_s[1] = (I192_3 *) ix[1];

		for( sfb = 0; sfb < SFB_SMAX - 1; sfb++ ){
			float dis_temp[2];
			int i;
			start = scalefac_short[ sfb ];
			bw = scalefac_short[ sfb+1 ] - start;
			for( i = 0; i < 3; i++ ){
				calc_pow4P3dual(&(*ix_s[0])[start][i],&(*xr_s[0])[start][i], step,sum,bw,3);
				
				for( ch = 0; ch < 2; ch ++ ){
					float temp;
					temp = sum[ch] / ( bw * l3_xmin->s[gr][ch][sfb][i] );
					if( temp > 0.001 ){
						temp = 10 * log10( temp );
					}else{
						temp = -30;
					}
					
					/*
					 * Keep the dB value, exactly as the long-block
					 * path and calc_noise1() do.  A 0/1 flag here
					 * would hide how close a masked band is to its
					 * threshold, which amp_scalefac_bands() needs
					 * for its "within 5% of the worst band" rule.
					 */
					dis_temp[ch] = temp;
					if( temp > 0 ){
						over[ch]++;
						over_noise[ch] += temp;
					}
					tot_noise[ch] += temp;
					max_noise[ch] = Max( max_noise[ch], temp);
				}
				distort[i+1][sfb] = Max( dis_temp[0], dis_temp[1] );
			}
		}
	}
} /* calc_noise2() */

/* 
compute the ATH for each scalefactor band 
cd range:  0..96db

Input:  3.3kHz signal  32767 amplitude  (3.3kHz is where ATH is smallest = -5db)
longblocks:  sfb=12   en0/bw=-11db    max_en0 = 1.3db
shortblocks: sfb=5           -9db              0db

Input:  1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 (repeated)
longblocks:  amp=1      sfb=12   en0/bw=-103 db      max_en0 = -92db
            amp=32767   sfb=12           -12 db                 -1.4db 

Input:  1 1 1 1 1 1 1 -1 -1 -1 -1 -1 -1 -1 (repeated)
shortblocks: amp=1      sfb=5   en0/bw= -99                    -86 
            amp=32767   sfb=5           -9  db                  4db 


MAX energy of largest wave at 3.3kHz = 1db
AVE energy of largest wave at 3.3kHz = -11db
Let's take the average:  -5db = maximum signal in sfb=12.  
Dynamic range of CD: 96db.  Therefor energy of smallest audible wave 
in sfb=12  = -5  - 96 = -101db = ATH at 3.3kHz.  

ATH formula for this wave: -5db.  To adjust to LAME scaling, we need
ATH = ATH_formula  - 96  (db)
ATH = ATH * 2.5e-10      (ener)

*/
/* δؿϥ󥳡ɤκǽ˰٤ƤФ */
/* Absolute threshold of hearing per scalefactor band, filled by compute_ath(). */
static float ATH_l[CBLIMIT];	/* long blocks */
static float ATH_s[CBLIMIT];	/* short blocks */

/*
 * ATHformula: absolute threshold of hearing at frequency f (kHz),
 * returned as an energy value.  Frequencies below 0.02 kHz are treated
 * as 0.02 kHz.
 */
static
float
ATHformula(float f)
{
  float level;

  f = Max(0.02, f);

  /* threshold in dB, from Painter & Spanias, 1997 */
  /* minimum: (i=77) 3.3kHz = -5db */
  level = (3.640 * pow(f,-0.8)
           -  6.500 * exp(-0.6*pow(f-3.3,2.0))
           +  0.001 * pow(f,4.0));

  /* shift to the encoder's scaling, then convert dB to energy */
#ifdef LAME355
  level -= 114;
#else
  level -= 96;  /* MDCT scaling.  see comments above */
#endif
  level = pow( 10, level/10.0 );
  return level;
}
 
/* ATH_{s,l}ν */
static
void
compute_ath(void)
{
  int sfb,i,start,end;
  float ATH_f;
  float samp_freq = s_freq[gl.version][gl.freq_idx];

  /* last sfb is not used */
  for ( sfb = 0; sfb < SFB_LMAX-1; sfb++ ) {
    start = scalefac_long[ sfb ];
    end   = scalefac_long[ sfb+1 ];
    ATH_l[sfb]=1e38;//1e99;
    for (i=start ; i < end; i++) {
      ATH_f = ATHformula(samp_freq*i/(2*576)); /* freq in kHz */
      ATH_l[sfb]=Min(ATH_l[sfb],ATH_f);
    }
  }

  for ( sfb = 0; sfb < SFB_SMAX - 1; sfb++ ){
    start = scalefac_short[ sfb ];
    end   = scalefac_short[ sfb+1 ];
    ATH_s[sfb]=1e38;//1e99;
    for (i=start ; i < end; i++) {
      ATH_f = ATHformula(samp_freq*i/(2*192));     /* freq in kHz */
      ATH_s[sfb]=Min(ATH_s[sfb],ATH_f);
    }
  }
}

/*
 * on_pe: choose the target bit count of each channel of granule gr
 * from its perceptual entropy.
 *
 * ResvMaxBits2() supplies the base budget and the extra bits that may
 * be taken on top of it; the base budget is halved for two channels.
 * Each channel then gets base + (pe - 750) / 1.4 extra bits, with the
 * extra part raised to at least 500 for short blocks, never negative,
 * limited by the extra bits still available and by a total of 4095.
 */

static
void
on_pe( float pe[2], III_side_info_t *l3_side, int targ_bits[2],
		int mean_bits, int gr )
{
	int base, spare;
	int ch;

	ResvMaxBits2( mean_bits, &base, &spare, gr);
	if( gl.stereo == 2 ) base /= 2;

	for( ch = 0; ch < gl.stereo; ch++ ){
		gr_info *gi = &l3_side->gr[gr].ch[ch].tt;
		int room = 4095 - base;
		int bonus = ( pe[ch] - 750 ) * 0.7142857143; /* = 1/1.4 */

		if( bonus < 500 && gi->block_type == SHORT_TYPE ){
			bonus = 500;
		}
		if( bonus < 0 ) bonus = 0;
		if( bonus > spare ) bonus = spare;
		if( bonus > room ) bonus = room;

		targ_bits[ch] = base + bonus;
		spare -= bonus;
	}
}

/*
 * reduce_side: move bits from the side channel (targ_bits[1]) to the
 * mid channel (targ_bits[0]) according to ms_ener_ratio.
 *
 *  ms_ener_ratio = 0:  allocate 66/33  mid/side  fac=.33
 *  ms_ener_ratio =.5:  allocate 50/50 mid/side   fac= 0
 *
 * The side channel is never reduced below 125 bits, and both targets
 * are finally limited to min(4095, mean_bits / 2 + 1200).
 */
static
void
reduce_side( int targ_bits[2], float ms_ener_ratio, int mean_bits )
{
	float shift = 0.5 - ms_ener_ratio;
	int ceiling;
	int ch;

	if( shift > 0 && targ_bits[1] >= 125 ){
		/* number of bits to move: 0.66 * (0.5 - ratio) * side bits */
		shift *= (0.33 * 2) * targ_bits[1];
		if( targ_bits[1] - shift > 125 ){
			targ_bits[0] += shift;
			targ_bits[1] -= shift;
		}else{
			/* would drop below the floor: move only what exceeds 125 */
			targ_bits[0] += targ_bits[1] - 125;
			targ_bits[1] = 125;
		}
	}

	ceiling = Min( 4095, mean_bits / 2 + 1200 );
	for( ch = 0; ch < 2; ch++ ){
		if( targ_bits[ch] > ceiling ) targ_bits[ch] = ceiling;
	}
}

/* stereo == 2, convert_psy == TRUE */

/*
 * quant_compare_dual: decide whether the current quantization of a
 * channel pair beats the best one so far.
 *
 * The pair is judged as a whole: fewer distorted bands (over[0] +
 * over[1]) wins; on a tie the smaller summed over_noise wins.
 * noise is given in decibels (dB) relative to masking thresholds.
 * better[ch] receives that verdict AND-ed bitwise with notdone[ch].
 */
static
void
quant_compare_dual( int better[2], int notdone[2], int best_over[2],
		float best_over_noise[2], int over[2], float over_noise[2] )
{
	const int cur_count  = over[0] + over[1];
	const int best_count = best_over[0] + best_over[1];
	int improved;
	int ch;

	if( cur_count == best_count ){
		improved = (over_noise[0]+over_noise[1]) < (best_over_noise[0]+best_over_noise[1]);
	}else{
		improved = cur_count < best_count;
	}

	for( ch = 0; ch < 2; ch++ ){
		better[ch] = improved & notdone[ch];
	}
}

/*
 * quant_compare: return non-zero when the current quantization is
 * better than the best one so far, using the criterion selected by
 * experimentalX (1..4, plus 5 when built with LAME355).  Any other
 * value of experimentalX yields 0.
 *
 * noise is given in decibels (dB) relative to masking thresholds.
 * over_noise:  sum of quantization noise > masking
 * tot_noise:   sum of all quantization noise
 * max_noise:   max quantization noise
 *
 * The criterion formerly used for experimentalX == 0 was
 *   over < best_over || (over == best_over && over_noise < best_over_noise)
 * and is disabled.
 */
static
int
quant_compare( int best_over,float best_tot_noise,float best_over_noise,float best_max_noise, int over,float tot_noise, float over_noise, float max_noise)
{
	if( experimentalX == 1 ){
		return ( max_noise < best_max_noise );
	}

	if( experimentalX == 2 ){
		return ( tot_noise < best_tot_noise );
	}

	if( experimentalX == 3 ){
		return (tot_noise < best_tot_noise) && (max_noise < best_max_noise + 2);
	}

	if( experimentalX == 4 ){
		if( 0 >= max_noise ){
			/* nothing is above masking now */
			if( best_max_noise > 2 ) return 1;
			if( !( best_max_noise + 2 > max_noise ) ) return 0;
			return ( best_max_noise < 0 && tot_noise < best_tot_noise ) ||
				( best_max_noise > 0 && tot_noise < best_tot_noise + best_over_noise );
		}
		if( 0 < max_noise ){
			/* something is still above masking */
			return ( best_max_noise > -0.5 && best_max_noise + 1 > max_noise && tot_noise + over_noise < best_tot_noise + best_over_noise ) ||
				( best_max_noise > -1 && best_max_noise + 1.5 > max_noise && tot_noise + over_noise + over_noise < best_tot_noise + best_over_noise + best_over_noise );
		}
		return 0;
	}
#ifdef LAME355
	if( experimentalX == 5 ){
		if( over_noise < best_over_noise ) return 1;
		return (over_noise == best_over_noise) && (tot_noise < best_tot_noise);
	}
#endif
	return 0;
}

#ifdef USE_VBR
#ifdef LAME355

#if 1
/*
 * VBR_compare: true when the current quantization has no distortion at
 * all (over, over_n, tot_n and max_n all <= 0).  The four best_*
 * arguments (t0..t3) are ignored, i.e. treated as 0.
 * NOTE(review): the original note is mis-encoded; presumably it says
 * this matches lame up to 3.59 - confirm.
 */
#define VBR_compare( t0, t3, t2, t1, over, tot_n, over_n, max_n ) \
	( (over) <= 0 && (over_n) <= 0 && (tot_n) <= 0 && (max_n) <= 0 )
#else
/* Disabled general form: every measure must be no worse than the best so far. */
static
int
VBR_compare(int best_over,float best_tot_noise,float best_over_noise,float best_max_noise,int over,float tot_noise, float over_noise, float max_noise)
{
  /*
    noise is given in decibels (dB) relative to masking thresholds.

    over_noise:  sum of quantization noise > masking
    tot_noise:   sum of all quantization noise
    max_noise:   max quantization noise 

   */
  return over <= best_over &&
         over_noise <= best_over_noise &&
	 tot_noise <= best_tot_noise &&
	 max_noise <= best_max_noise;
}
#endif /* #if 1 */

#endif /* LAME355 */

#ifndef LAME355

/*
 *	ǽ
 *	VBRbits
 */

/*
 * VBR_on_pe: distribute the bits of the current frame over all
 * granules/channels in proportion to their perceptual entropy.
 *
 * Steps:
 *  1. If the frame contains short blocks, raise gl.rate_idx to at least
 *     9 + (number of short-block granules, at most 3), capped at 14.
 *  2. Give every granule/channel a minimum of 125 bits.
 *  3. Share the remaining bits by PE, where PE is limited to 2800 and
 *     raised to at least 1800 for short blocks.
 *  4. Limit every allocation to 4095 bits.
 *
 * Writes the result to VBRbits and the mean bit count to *mean_bits.
 * Returns 1 when the bit rate must not be decreased further (short
 * blocks present and gl.rate_idx is at its short-block minimum),
 * 0 otherwise.  The info argument is not used.
 *
 * NOTE(review): the frame size is queried with l3_side->rate_idx while
 * the rate is adjusted through gl.rate_idx; whether the two are kept in
 * sync cannot be told from this file - confirm.
 */
static
int
VBR_on_pe( layer *info, III_side_info_t *l3_side, 
	       int VBRbits[2][2],float pe[2][2], int *mean_bits )
{
	int index,gr,ch;
	int bitsPerFrame;
	int fullframebits;

	getframebits(&bitsPerFrame,mean_bits,l3_side->rate_idx,l3_side->padding);
	fullframebits= ResvFrameBegin( *mean_bits, bitsPerFrame );

	/* count the granules/channels that use short blocks */
	index = 0;
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		for( ch = 0; ch < gl.stereo; ch++ ){
			if( l3_side->gr[gr].ch[ch].tt.block_type == 2) index++;
		}
	}

	/* impose a minimum bit rate based on number of short blocks */  
	if( index > 0 ){
		if( index > 3 ) index = 3;
	/* index=1: min 160kbs.  (1 shortblock granule)
	 * index=2: min 190kbs.  (2 shortblock granules)
	 * index=3: min 220kbs   (3 or more shortblock granules)
	 */
		if( gl.rate_idx < 9+index ) gl.rate_idx = 9+index;  
		if( gl.rate_idx > 14 ) gl.rate_idx = 14;
 
		getframebits(&bitsPerFrame,mean_bits,l3_side->rate_idx,l3_side->padding);
		fullframebits= ResvFrameBegin( *mean_bits, bitsPerFrame );
	}

	/* allocate a minimum 125 bits per channel (approx 32kbs) */
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		for( ch = 0; ch < gl.stereo; ch++ ){
			VBRbits[gr][ch] = 125;
		}
	}
	fullframebits -= gl.stereo * gl.mode_gr * 125;

	/* divide the remaining bits based on PE */  
	if( fullframebits > 0 ){
		float pe2[2][2], pe_tot= 0.01;	/* 0.01 keeps the division safe when every PE is 0 */
		for( gr = 0; gr < gl.mode_gr; gr++ ){
			for( ch = 0; ch < gl.stereo; ch++ ){
				int shortblock = (l3_side->gr[gr].ch[ch].tt.block_type==2);
				pe2[gr][ch] = Min(pe[gr][ch],2800.);
				if( shortblock ) pe2[gr][ch] = Max(pe2[gr][ch],1800.0);
				pe_tot += pe2[gr][ch];
			}
		}
		for( gr = 0; gr < gl.mode_gr; gr++ ){
			for( ch = 0; ch < gl.stereo; ch++ ){
				/*
				 * Add to the 125-bit minimum instead of
				 * overwriting it: those bits were already
				 * taken out of fullframebits above.
				 */
				VBRbits[gr][ch] += (fullframebits*pe2[gr][ch])/pe_tot;
			}
		}
	}
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		for( ch = 0; ch < gl.stereo; ch++ ){
			if( VBRbits[gr][ch] > 4095 ) VBRbits[gr][ch]=4095;
		}
	}

	/* return value = 1 => do not decrease bit rate further */
	if( index ) return (gl.rate_idx <= 9 + index);
	return 0;
}

/*
 * VBR_on_over: adapt the bit rate to the amount of distortion found.
 *
 * If max_over <= VBR_q the rate index is lowered by one (not below
 * VBR_min_rate_idx), the bits are redistributed by VBR_on_pe() and its
 * result is stored in *VBR_no_decrease.
 * Otherwise the rate index is raised by one (not above
 * VBR_max_rate_idx) and the bits gained by that step are added to
 * VBRbits in proportion to over[gr][ch] + 1; every allocation is then
 * limited to 4095 bits.
 */
static
void
VBR_on_over( layer *info, III_side_info_t *l3_side, 
	       int VBRbits[2][2], float pe[2][2], int max_over, int VBR_q,
	       int over[2][2], int *mean_bits, int *VBR_no_decrease)
{ 
	int frame_bits;
	int before, gained;
	int gr, ch;

	if( max_over <= VBR_q ){
		/* quality target met: try one rate step lower */
		gl.rate_idx --;
		if( gl.rate_idx < VBR_min_rate_idx ){
			gl.rate_idx = VBR_min_rate_idx;
		}
		*VBR_no_decrease = VBR_on_pe(info,l3_side,VBRbits,pe,mean_bits);
		return;
	}

	/* bits available at the current rate ... */
	getframebits(&frame_bits,mean_bits,l3_side->rate_idx,l3_side->padding);
	before = ResvFrameBegin( *mean_bits, frame_bits );

	gl.rate_idx++;
	if( gl.rate_idx > VBR_max_rate_idx ){
		gl.rate_idx = VBR_max_rate_idx;
	}

	/* ... and what the next rate step adds on top */
	getframebits(&frame_bits,mean_bits,l3_side->rate_idx,l3_side->padding);
	gained = ResvFrameBegin( *mean_bits, frame_bits );
	gained -= before;

	/* divide extra bits based on OVER */
	if( gained > 0 ){
		float weight[2][2], weight_sum = 0;

		for( gr = 0; gr < gl.mode_gr; gr++ ){
			for( ch = 0; ch < gl.stereo; ch++ ){
				weight[gr][ch] = over[gr][ch] + 1;
				weight_sum += weight[gr][ch];
			}
		}
		for( gr = 0; gr < gl.mode_gr; gr++ ){
			for( ch = 0; ch < gl.stereo; ch++ ){
				VBRbits[gr][ch] += (gained*weight[gr][ch])/weight_sum;
			}
		}
	}

	for( gr = 0; gr < gl.mode_gr; gr++ ){
		for( ch = 0; ch < gl.stereo; ch++ ){
			if( VBRbits[gr][ch] > 4095 ) VBRbits[gr][ch] = 4095;
		}
	}
}
#endif /* !LAME355 */
#endif /* USE_VBR */

/* ޤǽ˸ƤӽФ */

/*
 * iteration_init: per-frame preparation of the quantization loop.
 *
 *  - On the first call only: clears main_data_begin, selects the
 *    scalefactor band tables for the current sample rate/version and
 *    computes the ATH tables.
 *  - Resets resvDrain and derives the mid/side flags (convert_mdct,
 *    convert_psy, reduce_sidechannel) from mode_ext and force_ms.
 *  - Band-limits xr_org: with LAME355 and lowpass1 > 0 a cosine roll-off
 *    between lowpass1 and lowpass2 is applied and everything above is
 *    zeroed; otherwise, if sfb21 is set, everything from the start of
 *    the last scalefactor band upwards is zeroed.
 *  - Sets sfb_lmax/sfb_smax of every granule/channel according to its
 *    block type and clears scfsi (unless built with no_scfsi).
 */
static
void
iteration_init( float xr_org[2][2][576], III_side_info_t *l3_side )
{
	gr_info *cod_info;
	int ch, gr, i;
#ifdef LAME355
	extern float lowpass1, lowpass2;
#endif

	if( firstcall ){
		/* one-time setup; InitLoop() re-arms firstcall */
		firstcall = 0;
		l3_side->main_data_begin = 0;
//    memset((char *) &l3_xmin, 0, sizeof(l3_xmin)); /* move in calc_xmin */
		scalefac_long  = &sfBandIndex[ gl.freq_idx + gl.version * 3 ].l[0];
		scalefac_short = &sfBandIndex[ gl.freq_idx + gl.version * 3 ].s[0];
		compute_ath();
	}

	l3_side->resvDrain = 0;

	convert_mdct = 0;
	convert_psy = 0;
	reduce_sidechannel = 0;
	if( l3_side->mode_ext == MPG_MD_MS_LR ){
#if 0
    if( highq ){
      convert_mdct = 1;
      convert_psy = 0;
      reduce_sidechannel = 1;
    }else
#endif
	{
		convert_mdct = 1;
		convert_psy = 1;
		reduce_sidechannel=1;
	}
	}
//QQQ NOTE(review): mis-encoded original note, presumably "this no longer exists in lame3.55"
	if( force_ms ){
		/* overrides the settings made for MPG_MD_MS_LR above */
		convert_mdct = 0;
		convert_psy = 0;
		reduce_sidechannel = 1;
	}

#ifdef LAME355
	if( lowpass1 > 0 ){
		/* NOTE(review): these float start/stop are used by the long-block
		 * branch only; the short-block branch shadows them with ints. */
		float start,stop;
		for( gr = 0; gr < gl.mode_gr; gr++ ){
			for( ch = 0; ch < gl.stereo; ch++ ){
				if( l3_side->gr[gr].ch[ch].tt.block_type == SHORT_TYPE ){
					int j, start, stop;
					start = lowpass1 * 192;
					stop  = lowpass2 * 192;
					for( j = 0; j < 3; j++ ){
						for( i = start;  i < 192; i++ ){
							/* short-block line i of window j is stored at 3*i+j */
							int i0 = 3*i+j; 
			/* NOTE(review): stop == start makes this 0/0 - confirm lowpass2 > lowpass1 is guaranteed */
			if (i<=stop) xr_org[gr][ch][i0]*=cos((PI/2)*(i-start)/(stop-start));
          else xr_org[gr][ch][i0] = 0;
						}
					}
				}else{
					start = lowpass1 * 576;
					stop  = lowpass2 * 576;
					for( i = start; i < 576; i++ ){
        /* NOTE(review): stop == start makes this 0/0 - confirm lowpass2 > lowpass1 is guaranteed */
        if (i<=stop) xr_org[gr][ch][i] *=  cos((PI/2)*(i-start)/(stop-start));
        else xr_org[gr][ch][i]=0;
					}
				}
			}
		}
	}else /* when lowpass1 > 0 the sfb21 cut below is skipped (this else binds to the if( sfb21 ) that follows) */
#endif

	/* original note (mis-encoded), presumably: cut everything above 16 kHz */
	if( sfb21 ){
		for ( gr = 0; gr < gl.mode_gr; gr++ ){
			int start;
			/* the block type of channel 0 decides the cut-off for both channels */
			if( l3_side->gr[gr].ch[0].tt.block_type == 2 ){
				start = scalefac_short[ SFB_SMAX-1 ] * 3;
			}else{
				start = scalefac_long[ SFB_LMAX-1 ];
			}
			/* NOTE(review): loops over 2 channels regardless of gl.stereo */
			for( ch = 0; ch < 2; ch++ ){
				for( i = start; i < 192 * 3; i++ ){
					xr_org[gr][ch][i] = 0;
				}
			}
		}
	}

	/* inline gr_deco */
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		for( ch = 0; ch < gl.stereo; ch++ ){
			/* NOTE(review): cast instead of .tt as used elsewhere - presumably tt is the first member; confirm */
			cod_info = (gr_info *) &(l3_side->gr[gr].ch[ch]);
			if( cod_info->block_type == SHORT_TYPE ){
				/* short block: no long bands, all short bands */
				cod_info->sfb_lmax = 0; /* No sb*/
				cod_info->sfb_smax = 0;
			}else{
	/* original note (mis-encoded), presumably: in MPEG 1 the last scalefactor is not used */
				cod_info->sfb_lmax = SFB_LMAX - 1;
				cod_info->sfb_smax = SFB_SMAX - 1;    /* No sb */
			}
		}
	}
#ifndef no_scfsi
	/* scfsi starts out disabled for every band of every channel */
	for( ch = 0; ch < gl.stereo; ch++ ){
		for( i = 0; i < 4; i++ ){
			l3_side->scfsi[ch][i] = 0;
		}
	}
#endif
} /* end of iteration_init */

/*
Calculate the allowed distortion for each scalefactor band,
as determined by the psychoacoustic model.
xmin(sb) = ratio(sb) * en(sb) / bw(sb)
*/

/* l3_xmin is initialized here */
#ifndef LAME355

/*
 * Compute the allowed distortion for every scalefactor band of one
 * granule/channel and store it in l3_xmin (pre-LAME3.55 variant).
 *
 * For each band the threshold is
 *     Max( ratio * Max(floor, energy / bandwidth), ATH )
 * with floor 1e-20 for short bands and 1e-10 for long bands.
 * Side effect: a band whose largest squared coefficient is below the
 * ATH value is zeroed in xr.
 */
static
void
calc_xmin( float xr[576], III_psy_ratio *ratio,
		gr_info *cod_info, III_psy_xmin *l3_xmin, int gr, int ch )
{
	D192_3 *win = (D192_3 *) xr;
	int band, lo, hi, idx, w;
	float width, energy, peak, sq;

	/* short-block bands, one threshold per window */
	band = cod_info->sfb_smax;
	while( band < SFB_SMAX - 1 ){
		lo = scalefac_short[ band ];
		hi = scalefac_short[ band + 1 ];
		width = hi - lo;
		for( w = 0; w < 3; w++ ){
			peak = 0;
			energy = 0;
			for( idx = lo; idx < hi; idx++ ){
				sq = (*win)[idx][w] * (*win)[idx][w];
				peak = Max( peak, sq );
				energy += sq;
			}
			l3_xmin->s[gr][ch][band][w] = Max( ratio->s[gr][ch][band][w] * Max( 1e-20, energy / width ), ATH_s[band] );
			if( peak < ATH_s[band] ){
				/* whole band is below the ATH value: drop it */
				for( idx = lo; idx < hi; idx++ ){
					(*win)[idx][w] = 0;
				}
			}
		}
		band++;
	}

	/* long-block bands */
	band = 0;
	while( band < cod_info->sfb_lmax ){
		lo = scalefac_long[ band ];
		hi = scalefac_long[ band + 1 ];
		width = hi - lo;
		peak = 0;
		energy = 0.0;
		for( idx = lo; idx < hi; idx++ ){
			sq = xr[idx] * xr[idx];
			peak = Max( peak, sq );
			energy += sq;
		}
		l3_xmin->l[gr][ch][band] = Max( ratio->l[gr][ch][band] * Max( 1e-10, energy / width ), ATH_l[band] );
		if( peak < ATH_l[band] ){
			for( idx = lo; idx < hi; idx++ ){
				xr[idx] = 0;
			}
		}
		band++;
	}
} /* end of calc_xmin */

#else /* LAME355 */

/*
 * Compute the allowed distortion for every scalefactor band of one
 * granule/channel and store it in l3_xmin (LAME3.55 variant).
 *
 * For each band the threshold is
 *     Max( ATH, ratio * energy * masking_lower / bandwidth )
 * xr is only read here.
 */
static
void
calc_xmin( float xr[576], III_psy_ratio *ratio,
		gr_info *cod_info, III_psy_xmin *l3_xmin, int gr, int ch )
{
	D192_3 *win = (D192_3 *) xr;
	int band, lo, hi, idx, w;
	float energy, scale;

	/* short-block bands, one threshold per window */
	for( band = cod_info->sfb_smax; band < SFB_SMAX - 1; band++ ){
		lo = scalefac_short[ band ];
		hi = scalefac_short[ band + 1 ];
		scale = masking_lower / ( hi - lo );
		for( w = 0; w < 3; w++ ){
			energy = 0;
			idx = lo;
			while( idx < hi ){
				energy += (*win)[idx][w] * (*win)[idx][w];
				idx++;
			}
			l3_xmin->s[gr][ch][band][w] = Max( ATH_s[band], ratio->s[gr][ch][band][w] * energy * scale );
		}
	}

	/* long-block bands */
	for( band = 0; band < cod_info->sfb_lmax; band++ ){
		lo = scalefac_long[ band ];
		hi = scalefac_long[ band + 1 ];
		scale = masking_lower / ( hi - lo );

		energy = 0.0;
		idx = lo;
		while( idx < hi ){
			energy += xr[idx] * xr[idx];
			idx++;
		}
		l3_xmin->l[gr][ch][band] = Max( ATH_l[band], ratio->l[gr][ch][band] * energy * scale );
	}
} /* end of calc_xmin */

#endif /* LAME355 */

/*
 *	Merges lame3.28's init_outer_loop and init_outer_loop_dual.
 *	Initialized here:
 *	l3_xmin, scalefac
 */

/*
 * Prepare one granule/channel for the outer quantization loop.
 *
 *   flag != 0  thresholds (l3_xmin) are computed from xr_org
 *   flag == 0  thresholds are computed from xr (xr_org is not touched
 *              and may be NULL)
 *
 * Afterwards the scalefactors of this granule/channel are cleared and
 * the coding fields of its gr_info are reset to zero. subblock_gain[]
 * is deliberately left untouched; an experimental subblock-gain
 * estimator that used to sit here was compiled out.
 */
static
void
init_outer_loop(
	float xr[576], float xr_org[576], III_psy_xmin  *l3_xmin,
	III_scalefac_t *scalefac, int gr, III_side_info_t *l3_side,
	III_psy_ratio *ratio, int ch, int flag)
{
	gr_info *gi = &l3_side->gr[gr].ch[ch].tt;
	int n;

	/* allowed distortion per scalefactor band */
	calc_xmin( flag ? xr_org : xr, ratio, gi, l3_xmin, gr, ch );

	/* start from all-zero scalefactors */
	memset( &scalefac->s[gr][ch][0][0], 0, sizeof(int) * SFB_SMAX * 3 );
	memset( &scalefac->l[gr][ch][0], 0, sizeof(int) * SFB_LMAX );

	/* scalefactor bit allocation */
	for( n = 0; n < 4; n++ ){
		gi->slen[n] = 0;
	}
	gi->sfb_partition_table = &nr_of_sfb_block[0][0][0];
	gi->scalefac_compress   = 0;
	gi->scalefac_scale      = 0;
	gi->preflag             = 0;
	gi->part2_length        = 0;

	/* quantizer state */
	gi->quantizerStepSize   = 0;

	/* Huffman coding state */
	for( n = 0; n < 3; n++ ){
		gi->table_select[n] = 0;
	}
	gi->count1table_select  = 0;
	gi->count1              = 0;
	gi->big_values          = 0;
	gi->part2_3_length      = 0;
	gi->region0_count       = 0;
	gi->region1_count       = 0;
	gi->adr1                = 0;
	gi->adr2                = 0;
	gi->adr3                = 0;
} /* end of init_outer_loop() */

/*
 * Rate loop: starting from the current quantizerStepSize, quantize and
 * count bits, raising the step size by one until the result fits in
 * max_bits. Returns the bit count of the accepted quantization.
 *
 * When quantize_xrpow() returns more than 8191 + 14 the bit count is
 * forced to 100000 instead of calling count_bits(), so the step size
 * keeps growing (presumably the return value is the largest quantized
 * value -- confirm against quantize_xrpow).
 */
static 
int
inner_loop( float *xrpow, int *l3_enc, int max_bits, gr_info *cod_info )
{
	int bits;

	for( ;; ){
		bits = quantize_xrpow( xrpow, l3_enc, cod_info );
		if( bits <= 8191 + 14 ){
			bits = count_bits( l3_enc, cod_info );
		}else{
			bits = 100000;
		}
		if( bits <= max_bits ){
			break;
		}
		cod_info->quantizerStepSize++;
	}
	return bits;
}

/* by kei */
/*
 * Portable C fallback for count_nz_xr: returns how many of the 576
 * coefficients in xr are non-zero.
 *
 * A value counts as non-zero when any bit other than the sign bit is
 * set, so +0.0 and -0.0 are both treated as zero. The bit pattern is
 * read through a union rather than by casting the float pointer to
 * int *, which violated the strict-aliasing rule. Like the original,
 * this assumes 32-bit IEEE-754 floats and a 32-bit unsigned int.
 */
static
int
count_nz_xr_C(float xr[])
{
	union { float f; unsigned int u; } bits;
	int i;
	int ct = 0;

	for( i = 0; i < 576; i++ ){
		bits.f = xr[i];
		if( bits.u & 0x7FFFFFFFu ) ct++;	/* fabs(xr[i]) > 0 */
	}
	return ct;
}

int count_nz_xr_3DN(float xr[]);
int count_nz_xr_MMX(float xr[]);
//int count_nz_xr_SSE(float xr[]);
//int count_nz_xr_NONE(float xr[]);
#ifdef USE_E3DN
int count_nz_xr_E3DN(float xr[]);
#endif

/*
 * Bind the count_nz_xr function pointer to an implementation chosen
 * from the useUNIT capability flags. Priority: E3DN (only when built
 * with USE_E3DN), then 3DN, then MMX, otherwise the plain C version.
 * No SSE variant is wired in.
 */
void setup_count_nz_xr(int useUNIT){
#ifdef USE_E3DN
	if( useUNIT & tE3DN ){
		SETUP_DSP("use:count_nz_xr_E3DN\n");
		count_nz_xr = count_nz_xr_E3DN;
		return;
	}
#endif
	if( useUNIT & t3DN ){
		SETUP_DSP("use:count_nz_xr_3DN\n");
		count_nz_xr = count_nz_xr_3DN;
		return;
	}
	if( useUNIT & tMMX ){
		SETUP_DSP("use:count_nz_xr_MMX\n");
		count_nz_xr = count_nz_xr_MMX;
		return;
	}
	/* no usable SIMD unit */
	SETUP_DSP("use:count_nz_xr_C\n");
	count_nz_xr = count_nz_xr_C;
}

/*
 * Choose scalefac_compress for one granule/channel (used when
 * gl.mode_gr == 2).
 *
 * The largest scalefactor in each of the two scalefactor groups is
 * found first; an index k is usable when both maxima are below
 * slen1[k] / slen2[k].
 *
 * With LAME355a all 16 indices are examined and the usable one with the
 * smallest scalefactor bit cost (tab[k]) wins; that cost is stored in
 * cod_info->part2_length. Previously the loop returned on the first
 * usable index, which contradicted the stated intent and could pick a
 * costlier entry (e.g. long block, maxima 1 and 0: k=4 costs 33 bits,
 * k=5 only 21). Without LAME355a the first usable index is taken.
 *
 * Returns 0 when an index was found, 2 when the scalefactors are too
 * large for every index (part2_length is then left at 10000 under
 * LAME355a).
 */
static
int
scale_bitcount( III_scalefac_t *scalefac, gr_info *cod_info, int gr, int ch )
{
	int k, max_slen1 = 0, max_slen2 = 0;
	int status = 2;
#ifdef LAME355a
	/* scalefactor bits needed for each scalefac_compress value */
	const int *tab;
	static const int slen1_tab[16] = {0,18,36,54,54,36,54,72,54,72,90,72,90,108,108,126};
	static const int slen2_tab[16] = {0,10,20,30,33,21,31,41,32,42,52,43,53,63,64,74};
#endif

	if( cod_info->block_type == SHORT_TYPE ){
		int (*fac)[3] = scalefac->s[gr][ch];
#ifdef LAME355a
		tab = slen1_tab;
#endif
		/* first group: short bands 0..5, all three windows */
		for ( k = 0; k < 6; k++ ){
			if( fac[k][0] > max_slen1 ) max_slen1 = fac[k][0];
			if( fac[k][1] > max_slen1 ) max_slen1 = fac[k][1];
			if( fac[k][2] > max_slen1 ) max_slen1 = fac[k][2];
		}
		/* second group: short bands 6..SFB_SMAX-2 */
		for( k = 6; k < SFB_SMAX - 1; k++ ){
			if( fac[k][0] > max_slen2 ) max_slen2 = fac[k][0];
			if( fac[k][1] > max_slen2 ) max_slen2 = fac[k][1];
			if( fac[k][2] > max_slen2 ) max_slen2 = fac[k][2];
		}
	}else{
		int *fac = scalefac->l[gr][ch];
#ifdef LAME355a
		tab = slen2_tab;
#endif
		/* first group: long bands 0..10 */
		for( k = 0; k < 11; k++ ){
			if( fac[k] > max_slen1 ) max_slen1 = fac[k];
		}
		/* second group: long bands 11..20 */
		for( k = 11; k < 21; k++ ){
			if( fac[k] > max_slen2 ) max_slen2 = fac[k];
		}
	}

#ifdef LAME355a
	/* from Takehiro TOMINAGA <tominaga@isoternet.org> 10/99
	 * loop over *all* possible values of scalefac_compress to find the
	 * one which uses the smallest number of bits.  ISO would stop
	 * at first valid index
	 */
	cod_info->part2_length = 10000;
#endif
	for( k = 0; k < 16; k++ ){
		/* exclusive upper bound on the scalefactors of each group */
		static const int slen1[16] = {1,1,1,1,8,2,2,2,4,4,4,8,8,8,16,16};
		static const int slen2[16] = {1,2,4,8,1,2,4,8,2,4,8,2,4,8,4,8};

		if( max_slen1 >= slen1[k] || max_slen2 >= slen2[k] ){
			continue;	/* scalefactors do not fit this index */
		}
#ifdef LAME355a
		if( cod_info->part2_length > tab[k] ){
			/* cheapest usable index so far */
			cod_info->part2_length = tab[k];
			cod_info->scalefac_compress = k;
		}
		status = 0;
#else
		cod_info->scalefac_compress = k;
		return 0;
#endif
	}
	return status;
} /* scale_bitcount */

/*
 * Choose slen[] and scalefac_compress for one granule/channel (used
 * when gl.mode_gr != 2).
 *
 * The scalefactor bands are split into four partitions taken from
 * nr_of_sfb_block (table 2 when preflag is set, table 0 otherwise).
 * The function returns the number of partitions whose largest
 * scalefactor exceeds the limit in max_sfacTbl. Only when that number
 * is 0 are sfb_partition_table, slen[], scalefac_compress and (with
 * LAME355a) part2_length written.
 *
 * NOTE(review): max_sfacTbl holds small values (2..5) that look like
 * bit widths but are compared against scalefactor values -- confirm
 * this is intended.
 */
static
int
scale_bitcount_lsf( III_scalefac_t *scalefac, gr_info *cod_info, int gr, int ch )
{
	static const unsigned max_sfacTbl[6][4] = {
		{4, 4, 3, 3}, {4, 4, 3, 0}, {3, 2, 0, 0},
		{4, 5, 5, 0}, {3, 3, 3, 0}, {2, 2, 0, 0}
	};
	static const int log2tab[] = {0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4 };
	const unsigned *partition_table;
	int max_sfac[4];
	int table_number;
	int over = 0;
	int part, n, w;
	int sfb = 0;

	if( cod_info->preflag ){
		table_number = 2;
	}else{
		table_number = 0;
	}

	/* largest scalefactor in each of the four partitions */
	if( cod_info->block_type == SHORT_TYPE ){
		int (*fac)[3] = scalefac->s[gr][ch];
		partition_table = &nr_of_sfb_block[table_number][1][0];
		for( part = 0; part < 4; part++ ){
			int peak = 0;
			for( n = 0; n < partition_table[part] / 3; n++ ){
				for( w = 0; w < 3; w++ ){
					if( fac[sfb][w] > peak ) peak = fac[sfb][w];
				}
				sfb++;
			}
			max_sfac[part] = peak;
		}
	}else{
		int *fac = scalefac->l[gr][ch];
		partition_table = &nr_of_sfb_block[table_number][0][0];
		for( part = 0; part < 4; part++ ){
			int peak = 0;
			for( n = 0; n < partition_table[part]; n++ ){
				if( fac[sfb] > peak ) peak = fac[sfb];
				sfb++;
			}
			max_sfac[part] = peak;
		}
	}

	/* count the partitions that exceed their limit */
	for( part = 0; part < 4; part++ ){
		if( max_sfac[part] > max_sfacTbl[table_number][part] ){
			over++;
		}
	}
	if( over ){
		return over;
	}

	/*
	  Since no bands have been over-amplified, we can set scalefac_compress
	  and slen[] for the formatter
	*/
	cod_info->sfb_partition_table = partition_table;
	for( part = 0; part < 4; part++ ){
		cod_info->slen[part] = log2tab[max_sfac[part]];
	}

	{
		unsigned len0 = cod_info->slen[0];
		unsigned len1 = cod_info->slen[1];
		unsigned len2 = cod_info->slen[2];
		unsigned len3 = cod_info->slen[3];

		if( table_number == 0 ){
			cod_info->scalefac_compress = (((len0 * 5) + len1) << 4)
			+ (len2 << 2)
			+ len3;
		}else{	/* table_number == 2 */
			cod_info->scalefac_compress = 500 + (len0 * 3) + len1;
		}
	}
#ifdef LAME355a
	/* from Takehiro TOMINAGA <tominaga@isoternet.org> 10/99:
	 * scalefactor bits = sum over partitions of slen * band count */
	{
		int bits = 0;
		for( part = 0; part < 4; part++ ){
			bits += cod_info->slen[part] * cod_info->sfb_partition_table[part];
		}
		cod_info->part2_length = bits;
	}
#endif
	return 0;
} /* scale_bitcount_lsf */

/*
 * Returns 1 when every scalefactor in use is non-zero (long bands
 * 0..sfb_lmax-1 and short bands sfb_smax..11, all three windows),
 * and 0 as soon as one of them is still zero.
 */
static
int
loop_break( III_scalefac_t *scalefac, gr_info *cod_info, int gr, int ch )
{
	int *long_fac = scalefac->l[gr][ch];
	int (*short_fac)[3] = scalefac->s[gr][ch];
	int band, w;

	for( band = 0; band < cod_info->sfb_lmax; band++ ){
		if( long_fac[band] == 0 ){
			return 0;
		}
	}

	for( band = cod_info->sfb_smax; band < 12; band++ ){
		for( w = 0; w < 3; w++ ){
			if( short_fac[band][w] == 0 ){
				return 0;
			}
		}
	}
	return 1;
} /* loop_break */

/*
 *	Variant of outer_loop specialised for stereo == 2, convert_psy == TRUE.
 *	Initialized here:
 *	l3_enc, best_over, xr
 *
 *	best_over is not exchanged with the caller, so it is a local variable.
 */

/*
 * Quantize both channels of one granule together (two-channel variant
 * of outer_loop, used when convert_psy is set).
 *
 *   xr            [out] data actually quantized: ms_convert(xr_org)
 *                 when convert_mdct is set, otherwise a copy of xr_org
 *   xr_org        [in]  original coefficients; also the source of the
 *                 masking thresholds (init_outer_loop is called with
 *                 flag = 1)
 *   mean_bits     bit budget handed to ResvMaxBits2 / ResvAdjust
 *   l3_xmin       [out] allowed distortion per scalefactor band
 *   l3_enc        [out] quantized values (magnitudes) per channel
 *   scalefac      [out] scalefactors of the best quantization found
 *   pe            perceptual entropy per channel, used to hand out
 *                 extra bits
 *   ms_ener_ratio indexed by gr; shifts bits from channel 1 to
 *                 channel 0 when reduce_sidechannel is set
 *
 * Fix over the previous revision: the 16-byte alignment of the xrpow
 * work buffer cast the address to int, which truncates it wherever a
 * pointer is wider than int. It is now done in size_t. The buffer was
 * also renamed from a reserved (double-underscore) identifier.
 */
static
void
outer_loop_dual( float xr[2][576], float xr_org[2][576],
    int mean_bits, III_psy_xmin  *l3_xmin, int l3_enc[2][576],
    III_scalefac_t *scalefac, int gr,
    III_side_info_t *l3_side, III_psy_ratio *ratio, float pe[2], float ms_ener_ratio[2])
{
	int notdone[2]={0,0},count[2];
	int targ_bits[2],real_bits[2];
	int scalesave_l[2][CBLIMIT], scalesave_s[2][CBLIMIT][3];
	int save_preflag[2], save_compress[2];
	float distort[4][CBLIMIT]; /* not used in fast_mode */
	/* raw storage for xrpow[2][576], over-allocated so it can be 16-byte aligned */
	char xrpow_buf[2*576*sizeof(float)+16];
	float (*xrpow)[576] = (float (*)[576])( ((size_t)xrpow_buf + 15) & ~(size_t)15 );

	/* the *_noise values are not used in fast_mode */
	float max_noise[2];
	float over_noise[2];
	float tot_noise[2];
	float best_max_noise[2];
	float best_over_noise[2];
	float best_tot_noise[2];

	int save_l3_enc[2][576];
	int save_real_bits[2];
	int iteration, ch;
	int best_over[2];
	gr_info save_cod_info[2];
	gr_info *cod_info[2];
	cod_info[0] = &l3_side->gr[gr].ch[0].tt;
	cod_info[1] = &l3_side->gr[gr].ch[1].tt;

	/* build the data to quantize */
	if( convert_mdct ){
		ms_convert( xr, xr_org );
	}else{
		memcpy(xr, xr_org, sizeof(float) * 2 * 576 );
	}
	/* flag = 1: thresholds come from xr_org rather than xr */
	init_outer_loop(xr[0], xr_org[0], l3_xmin, scalefac, gr, l3_side, ratio, 0, 1 );
	init_outer_loop(xr[1], xr_org[1], l3_xmin, scalefac, gr, l3_side, ratio, 1, 1 );

	/* a channel with no non-zero coefficient needs no iteration at all */
	for( ch = 0; ch < 2; ch++ ){
		int ct = count_nz_xr(&xr[ch][0]);

		count[ch] = ct;
		if( ct ){
			best_over[ch] = 100;
			notdone[ch] = 1;
		}else{
			best_over[ch] = notdone[ch] = 0;
			/* the loop below is skipped for this channel, so clear l3_enc here */
			memset( l3_enc[ch], 0, sizeof(int) * 576 );
		}
	}
  
	/* collect the target bits for each channel */
	{
		int add_bits[2], bits;
		int tbits, extra_bits;

		ResvMaxBits2( mean_bits, &tbits, &extra_bits, gr);
		targ_bits[0] = targ_bits[1] = tbits / 2;
		bits = 0;
		{
			/* extra demand derived from the larger perceptual entropy */
			float temp;
			temp = Max( pe[0], pe[1] );
			temp -= 750;
			if( temp < 0 ){
				temp = 0;
			}else{
				temp *= 0.7142857143; /* =1/1.4 */
			}
			/* short blocks ask for at least 500 extra bits */
			if( cod_info[0]->block_type == 2 && temp < 500 ){
				add_bits[0] = 500;
			}else{
				add_bits[0] = temp;
			}
			if( cod_info[1]->block_type == 2 && temp < 500 ){
				add_bits[1] = 500;
			}else{
				add_bits[1] = temp;
			}
			bits += add_bits[0] + add_bits[1];
		}
		/* scale the demand down when it exceeds what is available */
		for( ch = 0; ch < 2; ch++ ){
			if( bits > extra_bits ){
				add_bits[ch] = (extra_bits*add_bits[ch])/bits;
			}
			targ_bits[ch] += add_bits[ch];
		}
		extra_bits -= add_bits[0];
		extra_bits -= add_bits[1];
	}  
	/* move bits from channel 1 to channel 0, but never leave channel 1
	 * with 125 bits or fewer */
	if( reduce_sidechannel ){
		float fac;
		fac = 0.5 - ms_ener_ratio[gr];
		if( fac > 0 ){
			fac *= (0.33 * 2) * targ_bits[1];
			if( targ_bits[1] - fac > 125 ){
				targ_bits[0] += fac;
				targ_bits[1] -= fac;
			}
		}
	}

	/* cap the number of bits a single channel may receive */
	{
		int max_bits;
		max_bits = ( mean_bits > 5791 ) ? 4095 : ( mean_bits / 2 + 1200 );
		if( targ_bits[0] > max_bits )targ_bits[0] = max_bits;
		if( targ_bits[1] > max_bits )targ_bits[1] = max_bits;
	}

	/* BEGIN MAIN LOOP */
	iteration = 0;

	while( notdone[0] || notdone[1] ){
		int bits_found[2];
		int over[2];
		int better[2]; /* not used in fast_mode */
		iteration++;

		if( iteration == 1 ){
			/* compute initial quantization step */
			for( ch = 0; ch < 2; ch++ ){
				if( !notdone[ch] ) continue;
				calc_pow075( xr[ch], xrpow[ch] );
				/* l3_enc[ch] is filled in by this call */
				bits_found[ch] = 
					bin_search_StepSize2( targ_bits[ch], l3_enc[ch], xrpow[ch], cod_info[ch] );
			}
		}

		/* rate loop: fit each channel into its Huffman bit budget */
		for( ch = 0; ch < 2; ch++ ){
			int huff_bits;
			if( !notdone[ch] )continue;
#ifndef LAME355a
			cod_info[ch]->part2_length = part2_length( gr, ch, l3_side );
#endif
			huff_bits = targ_bits[ch] - cod_info[ch]->part2_length;
			if( huff_bits < 0 ){
				/* scalefactors alone exceed the budget: stop this channel
				 * and fall back to the quantization saved earlier */
				notdone[ch] = 0;
			}else{
				if( iteration == 1 && bits_found[ch] <= huff_bits ){
					/* the bin-search result already fits: reuse it */
					real_bits[ch] = bits_found[ch];
				}else{
					if( iteration == 1 ){
						cod_info[ch]->quantizerStepSize++;
					}
					real_bits[ch] = inner_loop( xrpow[ch], l3_enc[ch], huff_bits, cod_info[ch] );
				}
			}
		}

		if( fast_mode ){
			over[0] = over[1] = 0;
		}else{
		/* mid/side coefficiets, l/r thresholds */
		/* fills over, part of distort and the *_noise arrays */
			calc_noise2( xr, l3_enc, cod_info,
				distort, l3_xmin,gr,over,over_noise,tot_noise,max_noise);

			if( iteration == 1 ){
				better[0] = better[1] = 1;
			}else{
				/* fills better[] */
				quant_compare_dual(better,notdone,best_over,best_over_noise,over,over_noise);
			}
		}

		/* save data so we can restore this quantization later */    
		for( ch = 0; ch < 2; ch++ ){
			if( !fast_mode ){
				if( !better[ch] )continue;
				best_over[ch] = over[ch];
				best_over_noise[ch] = over_noise[ch];
				best_tot_noise[ch] = tot_noise[ch];
				best_max_noise[ch] = max_noise[ch];
			}
			if( !notdone[ch] )continue;
		/* NOTE(review): the long copy uses CBLIMIT entries, the short copy
		 * SFB_SMAX * 3; both must fit scalesave_* -- confirm the constants */
			memcpy( scalesave_l[ch], scalefac->l[gr][ch], CBLIMIT * sizeof(int) );
			memcpy( scalesave_s[ch], scalefac->s[gr][ch], SFB_SMAX * 3 * sizeof(int) );
			save_preflag[ch]  = cod_info[ch]->preflag;
			save_compress[ch] = cod_info[ch]->scalefac_compress;
			memcpy(save_l3_enc[ch],l3_enc[ch],sizeof(l3_enc[ch]));
			memcpy(&save_cod_info[ch],cod_info[ch],sizeof(save_cod_info[ch]));
			save_real_bits[ch] = real_bits[ch];
		}

		/* keep iterating only while at least one channel has distorted bands */
		notdone[0] &= over[0] || over[1];	/* notdone[i] ={0,1} */
		notdone[1] &= over[0] || over[1];

		if( !fast_mode ){
			for( ch = 0; ch < 2; ch ++ ){
				if( !notdone[ch] )continue;
				/* amplify the distorted bands unless preemphasis was switched on */
				if( !preemphasis( xr[ch], xrpow[ch], l3_xmin, gr, ch, l3_side, distort ) ){
					amp_scalefac_bands( xr[ch], xrpow[ch], l3_xmin,
						l3_side, scalefac, gr, ch, iteration, distort );
				}
			}
		}
	/* check to make sure we have not amplified too much */

		for( ch = 0; ch < 2; ch++ ){
			int stat;
			if( !notdone[ch] )continue;

			stat = loop_break(scalefac, cod_info[ch], gr, ch);
			if( !stat ){
				if( gl.mode_gr == 2 ){
					stat = scale_bitcount( scalefac, cod_info[ch], gr, ch );
				}else{
					stat = scale_bitcount_lsf( scalefac, cod_info[ch], gr, ch);
				}
			}
			notdone[ch] = !stat;
		}
	} /* done with main iteration */

	/* restore the best quantization saved above */
	for( ch = 0; ch < gl.stereo; ch++ ){
		if( !count[ch] )continue;

		cod_info[ch]->preflag = save_preflag[ch];
		cod_info[ch]->scalefac_compress = save_compress[ch];

		memcpy( scalefac->l[gr][ch], scalesave_l[ch], CBLIMIT * sizeof(int) );
		memcpy( scalefac->s[gr][ch], scalesave_s[ch], SFB_SMAX * 3 * sizeof(int) );
		real_bits[ch] = save_real_bits[ch];

		memcpy(l3_enc[ch],save_l3_enc[ch],sizeof(l3_enc[ch]));   
		memcpy(cod_info[ch],&save_cod_info[ch],sizeof(save_cod_info[ch]));

		/* recompute the scalefactor coding for the restored scalefactors */
		if( gl.mode_gr == 2 ){
			scale_bitcount( scalefac, cod_info[ch], gr, ch );
		}else{
			scale_bitcount_lsf( scalefac, cod_info[ch], gr, ch );
		}
#ifndef LAME355a
		cod_info[ch]->part2_length = part2_length( gr, ch, l3_side );
#endif
		cod_info[ch]->part2_3_length = cod_info[ch]->part2_length + real_bits[ch];
	}

	/* finish up */
	for( ch = 0; ch < gl.stereo; ch ++ ){
		ResvAdjust( cod_info[ch], mean_bits );
		cod_info[ch]->global_gain = cod_info[ch]->quantizerStepSize + 210;
	}
} /* end of outer_loop_dual() */


/*
 *	l3_enc is initialized here.
 *	00/01/05 sloppy is not necessarily 0 (when VBR is used)
 *	00/01/11 best_noise is referenced only when VBR is used
 */

/*
 * Quantize one granule/channel: search for the scalefactors and the
 * quantizer step size that fit into targ_bits with the least audible
 * noise.
 *
 *   xr          coefficients to quantize (also the source of the
 *               thresholds: init_outer_loop is called with flag = 0)
 *   targ_bits   bit budget for this granule/channel
 *   best_noise  [out] written only when built with USE_VBR and LAME355
 *               and VBR is set; may be NULL otherwise
 *   sloppy      non-zero: the best quantization is neither saved nor
 *               restored (used by the VBR bit search)
 *   l3_xmin     [out] allowed distortion per scalefactor band
 *   l3_enc      [out] quantized values (magnitudes) in l3_enc[gr][ch]
 *   scalefac    [out] scalefactors
 *
 * Returns best_over, the `over` value of the best quantization found
 * (as computed by calc_noise1); 0 when xr is all zero.
 *
 * Fix over the previous revision: the 16-byte alignment of xrpow cast
 * the buffer address to int, which truncates it wherever a pointer is
 * wider than int. It is now done in size_t.
 */
static
float
outer_loop( float xr[576], int targ_bits, float best_noise[4], int sloppy,
		III_psy_xmin  *l3_xmin, int l3_enc[2][2][576],
		III_scalefac_t *scalefac,int gr, III_side_info_t *l3_side,
		III_psy_ratio *ratio, float ms_ener_ratio, int ch )
{
	int iteration;
	int count = 0, bits_found = 0;
	int real_bits = 0;
	int scalesave_l[SFB_LMAX], scalesave_s[SFB_SMAX][3];
	/* storage for xrpow[576], over-allocated so it can be 16-byte aligned */
	float unaligned_xrpow[576+4];
	float (*xrpow) = (float *)(((size_t)unaligned_xrpow + 15) & ~(size_t)15);
	float distort[4][CBLIMIT];
	int save_l3_enc[576];
	int save_real_bits = 0;
	int save_preflag = 0, save_compress = 0;
	int better;
	int over = 0;
	float max_noise = -999;
	float over_noise = 0;
	float tot_noise = 0;

	int best_over = 0;
	float best_over_noise = 0;
	float best_max_noise = 0;	/* only compared when experimentalX is set */
	float best_tot_noise = 0;	/* only compared when experimentalX is set */
	gr_info save_cod_info;
	gr_info *cod_info;

	int notdone=1;

	cod_info = &l3_side->gr[gr].ch[ch].tt;
	/* flag = 0: thresholds are computed from xr, so xr_org is not needed */
	init_outer_loop(xr, NULL, l3_xmin,scalefac,gr,l3_side,ratio,ch,0); 
	best_over = 100;
	count = count_nz_xr(&xr[0]);
	if( count == 0 ){
		best_over = 0;
		notdone = 0;
	/* the loop below is skipped, so clear l3_enc here */
		memset( l3_enc[gr][ch], 0, sizeof(int) * 576 );
	}

	/* BEGIN MAIN LOOP */
	iteration = 0;
	while( notdone  ){
		int huff_bits;
		iteration++;

		if( iteration == 1 ){
			calc_pow075( xr, xrpow );
		/* l3_enc[gr][ch] is filled in by this call */
			bits_found = 
			bin_search_StepSize2( targ_bits, l3_enc[gr][ch], xrpow, cod_info );
		}

	/* inner_loop starts with the initial quantization step computed above
	 * and slowly increases until the bits < huff_bits.
	 * Thus is it important not to start with too large of an inital
	 * quantization step.  Too small is ok, but inner_loop will take longer 
	 */
#ifndef LAME355a
		cod_info->part2_length = part2_length( gr, ch, l3_side );
#endif
		huff_bits = targ_bits - cod_info->part2_length;
		if( huff_bits < 0 ){
			/* scalefactors alone exceed the budget: stop and fall back
			 * to the quantization saved earlier */
			notdone = 0;
		}else{
		/* if this is the first iteration, see if we can reuse the quantization
		 * computed in bin_search_StepSize above
		 */

			if( iteration == 1 && bits_found <= huff_bits ){
				real_bits = bits_found;
			}else{
				if( iteration == 1 ){
					cod_info->quantizerStepSize++;
				}
				real_bits = inner_loop( xrpow, l3_enc[gr][ch], huff_bits, cod_info );
			}
		}

		if( notdone ){
			if( fast_mode ){
				over = 0;
				better = 1;
			}else{
				over = calc_noise1( xr, l3_enc[gr][ch], cod_info, distort,
				l3_xmin,gr,ch, &over_noise, &tot_noise, &max_noise);

				if( iteration == 1 ){
					better = 1;
				}else{
					if( !experimentalX ){
						/* fewer distorted bands wins; ties go to less over-noise */
						better = over < best_over 
						|| ( over == best_over && over_noise < best_over_noise );
					}else{
						better = quant_compare(
				best_over,best_tot_noise,best_over_noise,best_max_noise,
					 over,     tot_noise,     over_noise,     max_noise);
					}
				}
			}
		/* save data so we can restore this quantization later */    
			if( better ){
				if( !fast_mode ){
					best_over = over;
					best_max_noise = max_noise;
					best_over_noise = over_noise;
					best_tot_noise = tot_noise;
				}
				if( !sloppy ){
					memcpy( scalesave_l, scalefac->l[gr][ch], CBLIMIT * sizeof(int) );
					memcpy( scalesave_s, scalefac->s[gr][ch], SFB_SMAX * 3 * sizeof(int) );
					save_preflag  = cod_info->preflag;
					save_compress = cod_info->scalefac_compress;

					memcpy(save_l3_enc,l3_enc[gr][ch],sizeof(l3_enc[gr][ch]));   
					memcpy(&save_cod_info,cod_info,sizeof(save_cod_info));
					save_real_bits = real_bits;
				}
			}
		}

		/* no distorted band left: done (unless experimentalX) */
		if( !experimentalX && !over ) notdone = 0;

	/* in sloppy mode, as soon as we know we can do better than targ_noise,
	 * quit.  This is used for the inital VBR bin search.  Turn it off for
	 * final (optimal) quantization */
#ifdef USE_VBR
		if( sloppy && notdone ){
			notdone = 
#ifndef LAME355
			over > VBR_q;
/* NOTE(review): condition ported from lame3.28 for experimentalX == 0;
 * the original remark here was unreadable -- confirm its intent */
#else
			!VBR_compare((int)targ_noise[0],targ_noise[3],targ_noise[2],
				targ_noise[1],over,tot_noise,over_noise,max_noise);
#endif
		}
#endif /* USE_VBR */
		if( notdone && !fast_mode ){
			/* amplify the distorted bands unless preemphasis was switched on */
			if( !preemphasis(xr,xrpow,l3_xmin,gr,ch,l3_side,distort) ){
				notdone = amp_scalefac_bands( xr, xrpow, l3_xmin,
					l3_side, scalefac, gr, ch, iteration,distort);
			}
		}

		/* check that the scalefactors can still be coded */
		if( notdone ){
			int status;
			status = loop_break( scalefac, cod_info, gr, ch );
			if( !status ){
				if( gl.mode_gr == 2 ){
					status = scale_bitcount( scalefac, cod_info, gr, ch );
				}else{
					status = scale_bitcount_lsf( scalefac, cod_info, gr, ch );
				}
			}
			notdone = !status;
		}
	} /* done with main iteration */

	/* restore the best quantization saved above */
	if( count && !sloppy ){
		cod_info->preflag = save_preflag;
		cod_info->scalefac_compress = save_compress;

		memcpy( scalefac->l[gr][ch], scalesave_l, sizeof(int) * CBLIMIT );
		memcpy( scalefac->s[gr][ch][0], scalesave_s[0], sizeof(int) * 3 * SFB_SMAX );
		real_bits = save_real_bits;
		memcpy( l3_enc[gr][ch], save_l3_enc, sizeof(l3_enc[gr][ch]) );   
		memcpy( cod_info, &save_cod_info, sizeof(save_cod_info) );

		/* recompute the scalefactor coding for the restored scalefactors */
		if( gl.mode_gr == 2 ){
			scale_bitcount( scalefac, cod_info, gr, ch );
		}else{
			scale_bitcount_lsf( scalefac, cod_info, gr, ch );
		}
#ifndef LAME355a
		cod_info->part2_length   = part2_length( gr, ch, l3_side );
#endif
		cod_info->part2_3_length = cod_info->part2_length + real_bits;
	}

	cod_info->global_gain = cod_info->quantizerStepSize + 210;
#if defined(USE_VBR) && defined(LAME355)
	if( VBR ){
		best_noise[0] = best_over;
		best_noise[1] = best_max_noise;
		best_noise[2] = best_over_noise;
		best_noise[3] = best_tot_noise;
	}
#endif
	return best_over;
} /* endof outer_loop() */


/*
 * Portable C fallback for set_l3_enc_sign: for each of the 576
 * entries, make l3_enc[i] negative when the sign bit of xr[i] is set
 * and l3_enc[i] is currently positive. Zero and already-negative
 * entries are left alone.
 *
 * The sign bit is read through a union rather than by casting the
 * float pointer to int *, which violated the strict-aliasing rule,
 * and the mask is now unsigned. Like the original, this assumes
 * 32-bit IEEE-754 floats and a 32-bit unsigned int.
 */
static
void
set_l3_enc_sign_C(float xr[], int l3_enc[])
{
	union { float f; unsigned int u; } bits;
	int i;

	for( i = 0; i < 576; i++ ){
		bits.f = xr[i];
		if( (bits.u & 0x80000000u) && (l3_enc[i] > 0) ){
			l3_enc[i] = -l3_enc[i];
		}
	}
}

void set_l3_enc_sign_3DN(float xr[], int l3_enc[]);
void set_l3_enc_sign_MMX(float xr[], int l3_enc[]);
//void set_l3_enc_sign_SSE(float xr[], int l3_enc[]);
//void set_l3_enc_sign_NONE(float xr[], int l3_enc[]);
#ifdef USE_E3DN
void set_l3_enc_sign_E3DN(float xr[], int l3_enc[]);
#endif

/*
 * Bind the set_l3_enc_sign function pointer to an implementation
 * chosen from the useUNIT capability flags. Priority: E3DN (only when
 * built with USE_E3DN), then 3DN, then MMX, otherwise the plain C
 * version. No SSE variant is wired in.
 */
void setup_set_l3_enc_sign(int useUNIT){
#ifdef USE_E3DN
	if( useUNIT & tE3DN ){
		SETUP_DSP("use:set_l3_enc_sign_E3DN\n");
		set_l3_enc_sign = set_l3_enc_sign_E3DN;
		return;
	}
#endif
	if( useUNIT & t3DN ){
		SETUP_DSP("use:set_l3_enc_sign_3DN\n");
		set_l3_enc_sign = set_l3_enc_sign_3DN;
		return;
	}
	if( useUNIT & tMMX ){
		SETUP_DSP("use:set_l3_enc_sign_MMX\n");
		set_l3_enc_sign = set_l3_enc_sign_MMX;
		return;
	}
	/* no usable SIMD unit */
	SETUP_DSP("use:set_l3_enc_sign_C\n");
	set_l3_enc_sign = set_l3_enc_sign_C;
}


/* VBR is not used in this function */
/*
 *	Variables initialized here:
 *	l3_enc, l3_xmin, part of l3_side, scalefac
 */

/*
 * Constant-bitrate quantization of one frame.
 *
 *   pe             perceptual entropy, indexed [gr][ch]
 *   ms_ener_ratio  indexed by gr; used for the side-channel reduction
 *   xr_org         MDCT coefficients [gr][ch][576] (may be modified by
 *                  iteration_init)
 *   ratio          psychoacoustic ratios, passed through to the
 *                  threshold computation
 *   l3_side        [in/out] side info
 *   l3_enc         [out] signed quantized values
 *   scalefac       [out] scalefactors
 *
 * Each granule is quantized either by outer_loop_dual (when
 * convert_psy is set) or channel by channel through outer_loop; the
 * signs of the quantized data are then taken from xr.
 *
 * Fix over the previous revision: the 16-byte alignment of the xr work
 * buffer cast the address to int, which truncates it wherever a
 * pointer is wider than int. It is now done in size_t.
 */
void
iteration_loop( float pe[][2], float ms_ener_ratio[2],
	float xr_org[2][2][576], III_psy_ratio *ratio,
		III_side_info_t *l3_side, int l3_enc[2][2][576],
		III_scalefac_t *scalefac )
{
	III_psy_xmin l3_xmin;
	int bitsPerFrame;
	int mean_bits;
	int ch, gr;
	/* storage for xr[2][2][576], over-allocated so it can be 16-byte aligned */
	float unaligned_xr[2*2*576+4];
	float (*xr)[2][576] = (float (*)[2][576])(((size_t)unaligned_xr + 15) & ~(size_t)15);

	iteration_init( xr_org, l3_side );

	getframebits(&bitsPerFrame,&mean_bits,l3_side->rate_idx,l3_side->padding);
	ResvFrameBegin( mean_bits, bitsPerFrame );

	for( gr = 0; gr < gl.mode_gr; gr++ ){
		/* convert_psy implies stereo == 2 */
		if( convert_psy ){
	/* dual channel version can quantize Mid/Side channels with L/R
	 * maskings (by constantly reconstructing L/R data).  Used before we
	 * we had proper mid/side maskings. */
			outer_loop_dual( xr[gr], xr_org[gr], mean_bits,
				&l3_xmin,l3_enc[gr], scalefac,gr, l3_side, ratio, pe[gr], ms_ener_ratio);
		}else{
			int targ_bits[2];
			/* copy data to be quantized into xr */
			if( convert_mdct ){
				ms_convert( xr[gr], xr_org[gr] );
			}else{
				memcpy( xr[gr], xr_org[gr], sizeof(float)*2*576 );
			}
			/* per-channel bit budget for this granule */
			on_pe( pe[gr], l3_side, targ_bits, mean_bits, gr );
			if( reduce_sidechannel ){
				reduce_side( targ_bits, ms_ener_ratio[gr], mean_bits );
			}

			for( ch = 0; ch < gl.stereo; ch++ ){
				gr_info *cod_info;
				/* best_noise = NULL, sloppy = 0: final quantization */
				outer_loop( xr[gr][ch], targ_bits[ch], NULL, 0, &l3_xmin,l3_enc, 
				scalefac,gr, l3_side, ratio, ms_ener_ratio[gr],ch);
				cod_info = &l3_side->gr[gr].ch[ch].tt;
				ResvAdjust( cod_info, mean_bits );
			}
		}
	}

	/* set the sign of l3_enc */
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		for( ch =  0; ch < gl.stereo; ch++ ){
			set_l3_enc_sign(&xr[gr][ch][0], &l3_enc[gr][ch][0]);
		}
	}
	ResvFrameEnd( l3_side, mean_bits );
} /* end of iteration_loop */

#ifdef USE_VBR

void
VBR_iteration_loop( float pe[2][2], float ms_ener_ratio[2],
		float xr_org[2][2][576], III_psy_ratio *ratio,
		III_side_info_t *l3_side, int l3_enc[2][2][576],
		III_scalefac_t *scalefac )
{
	III_psy_xmin l3_xmin;
	int mean_bits;
	int ch, gr;
#ifdef LAME355
	int frameBits[15];
	int min_mean_bits = 0;
	float noise[4];
/*	float targ_noise[4]; all zero */
	float __xr_save[576+4];
	float *xr_save = (float *)(( (int)__xr_save + 15 ) & -16 );
	float masking_lower_db;
	int save_bits[2][2], used_bits=0, bits;
	int idx;
#ifdef LAME357
	int analog_silence = 0;
#endif
#else
	int VBRbits[2][2]; /* OK */
	int best_over[2][2]; /* OK */
	int VBR_no_decrease;
#endif /* LAME355 */

	float unaligned_xr[2*2*576+4];
	float (*xr)[2][576] = (float (*)[2][576])(( (int)unaligned_xr + 15) & -16);

	iteration_init( xr_org, l3_side );
#ifdef LAME355
	/*******************************************************************
	 * how many bits are available for each bitrate?
	 *******************************************************************/

#ifndef UseFrameBitsTable
#error " getframebits may be changed "
#endif
	for( idx = VBR_min_rate_idx; idx <= VBR_max_rate_idx; idx++ ){
		mean_bits = mean_bits_table[idx][l3_side->padding];
		if( idx == VBR_min_rate_idx && gl.stereo == 2 ){
			min_mean_bits = mean_bits >>1;
		}
		frameBits[idx]= ResvFrameBegin( mean_bits, bitsPerFrame_table[idx][l3_side->padding]);
	}

	l3_side->rate_idx = VBR_max_rate_idx;

	/*******************************************************************
	 * how many bits would we use of it?
	 *******************************************************************/

	for( gr = 0; gr < gl.mode_gr; gr++){
		int num_chan = gl.stereo;
		/* determine quality based on mid channel only */
		if( reduce_sidechannel ) num_chan = 1;

		/* copy data to be quantized into xr */
		if( convert_mdct ){
			ms_convert(xr[gr],xr_org[gr]);
		}else{
			memcpy(xr[gr],xr_org[gr],sizeof(float)*2*576);   
		}

		for( ch = 0; ch < num_chan; ch++){
			gr_info *cod_info;
			int dbits, this_bits, min_bits, max_bits;
		/******************************************************************
		 * find smallest number of bits for an allowable quantization
		 ******************************************************************/
			memcpy(xr_save,xr[gr][ch],sizeof(float)*576);   
			cod_info = &l3_side->gr[gr].ch[ch].tt;
			min_bits = Max(125,min_mean_bits);
#ifdef LAME360
			/* check for analolg silence */
			/* if energy < ATH, set min_bits = 125 */
			if( 0 == calc_xmin( xr, ratio, cod_info, &l3_xmin, gr, ch ) ){
				analog_silence = 1;
				min_bits = 125;
			}
#endif
			if( cod_info->block_type == SHORT_TYPE ){
				min_bits += Max(1100,pe[gr][ch]);
				min_bits = Min(min_bits,1800);
			}

			max_bits = 1200 + frameBits[VBR_max_rate_idx]/(gl.stereo*gl.mode_gr);
			max_bits = Min(max_bits, 2500);
			max_bits = Max(max_bits, min_bits);

		/* in the case we will not find any better, we allocate max_bits */
			save_bits[gr][ch] = max_bits;

			dbits = ( max_bits - min_bits ) >>2;
			this_bits = ( max_bits + min_bits ) >>1;
		/* bin search to within +/- 10 bits of optimal */
			do{
				int better;
				float fac;
#ifdef LAME360
				masking_lower_db = -8 + 2 * VBR_q;
				fac = ( this_bits - 125 ) / (float)( 2500 -125 );
				fac = 4 * ( fac - 1 );
#else
				masking_lower_db = -10 + 2 * VBR_q;
				fac = 2.526315789e-3 * this_bits - 6.315789474;
#endif
				masking_lower_db += fac;
				masking_lower = pow(10.0, masking_lower_db * 0.1);
	/* VBR will look for a quantization which has better values
	 * then those specified below.*/
				memcpy(xr[gr][ch],xr_save,sizeof(float)*576);
				outer_loop( xr[gr][ch], this_bits, noise, 1,&l3_xmin,
				l3_enc, scalefac,gr, l3_side, ratio, ms_ener_ratio[gr], ch);

				better=VBR_compare((int)targ_noise[0],targ_noise[3],targ_noise[2],
				      targ_noise[1],(int)noise[0],noise[3],noise[2],noise[1]);

				if( better ){
					save_bits[gr][ch] = this_bits;
					this_bits -= dbits;
				}else{
				  this_bits += dbits;
				}
				dbits >>=1;
			}while( dbits > 10 );
			used_bits += save_bits[gr][ch];
		} /* ch */
	} /* gr */

	if( reduce_sidechannel ){
		/* number of bits needed was found for MID channel above.  Use formula
		 * (fixed bitrate code) to set the side channel bits */
		for( gr = 0; gr < gl.mode_gr; gr++ ){
			float fac = 0.33 - 0.66 * ms_ener_ratio[gr];
			save_bits[gr][1] = ( 1 - fac ) / ( 1 + fac ) * save_bits[gr][0];
			used_bits += save_bits[gr][1];
		}
	}

	/******************************************************************
	 * find lowest bitrate able to hold used bits
	 ******************************************************************/
#ifdef LAME360
	idx = ( analog_silence ) ? 1 : VBR_min_rate_idx;
#else
	idx = VBR_min_rate_idx;
#endif
	for( ; idx < VBR_max_rate_idx; idx++ ){
		if( used_bits <= frameBits[idx] ) break;
	}
	l3_side->rate_idx = idx;

  /*******************************************************************
   * calculate quantization for this bitrate
   *******************************************************************/  

	mean_bits = mean_bits_table[l3_side->rate_idx][l3_side->padding];
	bits = ResvFrameBegin( mean_bits, bitsPerFrame_table[l3_side->rate_idx][l3_side->padding]);

  /* repartition available bits in same proportion */
	if( used_bits > bits ){
		for( gr = 0; gr < gl.mode_gr; gr++ ){
			for( ch = 0; ch < gl.stereo; ch++){
				save_bits[gr][ch]=(save_bits[gr][ch]*frameBits[l3_side->rate_idx])/used_bits;
			}
		}
#ifndef NDEBUG
		used_bits = 0;
		for( gr = 0; gr < gl.mode_gr; gr++ ){
			used_bits += save_bits[gr][0];
			if( gl.stereo == 2 ) used_bits += save_bits[gr][1];
		}
#endif
	}
	assert(used_bits <= bits);

	for( gr = 0; gr < gl.mode_gr; gr++ ){
	/* copy data to be quantized into xr */
		if( convert_mdct ){
			ms_convert(xr[gr],xr_org[gr]);
		}else{
			memcpy(xr[gr],xr_org[gr],sizeof(float)*2*576);   
		}
		for( ch = 0; ch < gl.stereo; ch++ ){
#ifdef LAME360
			float fac;
			masking_lower_db = -8 + 2 * VBR_q;
			fac = ( save_bits[gr][ch] - 125 ) / (float)( 2500 -125 );
			fac = 4 * ( fac - 1 );
			masking_lower_db += fac;
			masking_lower = pow(10.0, masking_lower_db * 0.1);
#endif
			outer_loop( xr[gr][ch], save_bits[gr][ch], noise, 0,
			&l3_xmin,l3_enc, scalefac,gr, l3_side, ratio, ms_ener_ratio[gr], ch);
		}
	}
	/* update reservoir status after FINAL quantization/bitrate */
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		ResvAdjust( &l3_side->gr[gr].ch[0].tt, mean_bits );
		if( gl.stereo == 2 ) ResvAdjust( &l3_side->gr[gr].ch[1].tt, mean_bits );
	}

	/* set the sign of l3_enc */
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		set_l3_enc_sign(&xr[gr][0][0], &l3_enc[gr][0][0]);
		if( gl.stereo == 2 ) set_l3_enc_sign(&xr[gr][1][0], &l3_enc[gr][1][0]);
	}
	ResvFrameEnd( l3_side, mean_bits );
	return;

#else /* LAME355 ------------------------------------------------- */

	VBR_no_decrease = VBR_on_pe(info,l3_side,VBRbits,pe,&mean_bits);


	/* quantize! */
	for(;;){
		int max_over, VBR_q2;
		VBR_q2=VBR_q;
	/* Robert Hegemann 7/99 
Distortions in frames at lower bitrates are more annoying than at higher bitrates.
The user selects the allowed number of distortions at 112 kbs
frames and defines therefore the maximum for all other possible
bitrates. 
    */
		VBR_q2 = ((int)(0.5+
			(float)(bitrate[info->version][info->lay-1][gl.rate_idx])
			/ (float)(bitrate[info->version][info->lay-1][8])
			* (float)(VBR_q)   ));

		for( gr = 0; gr < gl.mode_gr; gr++ ){
			int targ_bits[2];
		/* copy data to be quantized into xr */
			if( convert_mdct ){
				ms_convert(xr[gr],xr_org[gr]);
			}else{
				memcpy(xr[gr],xr_org[gr],sizeof(float)*2*576);   
			}

			targ_bits[0] = VBRbits[gr][0];
			targ_bits[1] = VBRbits[gr][1]; /* stereo == 1 λǤⳲϤʤ */
			if( reduce_sidechannel ) reduce_side(targ_bits,ms_ener_ratio[gr],mean_bits);
			for( ch = 0; ch < gl.stereo; ch++ ){
				best_over[gr][ch] = outer_loop( xr[gr][ch], targ_bits[ch], NULL, 0,
				&l3_xmin,l3_enc, scalefac,gr, l3_side, ratio, ms_ener_ratio[gr], ch);
			}
		}

	/* see if we should try a higher bitrate quantization */
		max_over = 0;
		if( reduce_sidechannel ){
		/* for this case, allow a lot of extra distortion in the side channel 
		 * this means quality is mostly determined by Mid channel, unless
		 * side channel is *very* bad */
			for( gr = 0; gr < gl.mode_gr; gr++ ){
				max_over = Max( max_over, best_over[gr][0] );
				max_over = Max( max_over, best_over[gr][1] - 9 );
			}
		}else{
			for( gr = 0; gr < gl.mode_gr; gr++ ){
				for( ch = 0; ch < gl.stereo; ch ++ ){
					if( best_over[gr][ch] > max_over) max_over = best_over[gr][ch];
				}
			}
		}

		if( max_over <= VBR_q2 && gl.rate_idx <= VBR_min_rate_idx ) break;
		if( max_over >  VBR_q2 && gl.rate_idx >= VBR_max_rate_idx ) break;
	/* do not decrease bit rate if we have ever increased bit rate 
	 * otherwise infinite loop */
		if( max_over <= VBR_q2 && VBR_no_decrease ) break;

	/* compute a new bitrate, and allocate bits between granules */
		if( max_over > VBR_q2 ) VBR_no_decrease = 1;
		VBR_on_over( info,l3_side,VBRbits,pe,max_over,VBR_q2,best_over,&mean_bits,&VBR_no_decrease);
	}
	/* update reservoir status after FINAL quantization/bitrate */
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		for( ch = 0; ch < gl.stereo; ch++ ){
			gr_info *cod_info = &l3_side->gr[gr].ch[ch].tt;
			ResvAdjust( cod_info, mean_bits );
		}
	}

	/* set the sign of l3_enc */
	for( gr = 0; gr < gl.mode_gr; gr++ ){
		for( ch = 0; ch < gl.stereo; ch++ ){
			set_l3_enc_sign(&xr[gr][ch][0], &l3_enc[gr][ch][0]);
		}
	}
	ResvFrameEnd( l3_side, mean_bits );
#endif /* LAME355 */
} /* end of VBR_iteration_loop */
#endif /* USE_VBR */



/*
 *	99/08/03
 *	Note: optimized on the assumption that subblock_gain[i]=0.
 *	00/01/08
 *	ret = ix_max() is evaluated at the same time, and the result is
 *	if( ret > 8191 + 14 ) return 100000; else return 0;
 */
#ifdef MIEasmQuantize

int quantize_xrpow_FPU(float xr[576],int ix[576],gr_info *cod_info);
int quantize_xrpow_3DN(float xr[576],int ix[576],gr_info *cod_info);
int quantize_xrpow_SSE(float xr[576],int ix[576],gr_info *cod_info);
#ifdef USE_E3DN
int quantize_xrpow_E3DN(float xr[576],int ix[576],gr_info *cod_info);
#endif

/*
 * Point quantize_xrpow at one of the externally implemented variants,
 * chosen from the flag bits in useUNIT.
 * Order of preference: E3DN (only when compiled with USE_E3DN), 3DN, SSE,
 * and finally the FPU version when none of those bits is set.
 * SETUP_DSP is invoked with a message naming the selected variant.
 */
void setup_quantize_xrpow(int useUNIT){
#ifdef USE_E3DN
	if( useUNIT & tE3DN ){
		SETUP_DSP("use:quantize_xrpow_E3DN\n");
		quantize_xrpow = quantize_xrpow_E3DN;
		return;
	}
#endif
	if( useUNIT & t3DN ){
		SETUP_DSP("use:quantize_xrpow_3DN\n");
		quantize_xrpow = quantize_xrpow_3DN;
		return;
	}
	if( useUNIT & tSSE ){
		SETUP_DSP("use:quantize_xrpow_SSE\n");
		quantize_xrpow = quantize_xrpow_SSE;
		return;
	}
	/* no matching flag: plain FPU implementation */
	SETUP_DSP("use:quantize_xrpow_FPU\n");
	quantize_xrpow = quantize_xrpow_FPU;
}

#else
/*
 * Nothing to select in this configuration (the C quantize_xrpow below is
 * used directly), so this is intentionally a no-op.
 */
void
setup_quantize_xrpow(int useUNIT)
{
	(void)useUNIT;	/* unused */
}
/* input: xr, cod_info  output: ix */

/*
 * C fallback quantizer.
 *   xr       : 576 input values ("quantize on xr^(3/4) instead of xr")
 *   ix       : receives the 576 quantized integers
 *   cod_info : supplies quantizerStepSize
 * Returns ix_max() over the whole of ix[].
 *
 * The arrays are indexed rather than walked with ix++ / xr++, so that the
 * final ix_max() call scans the values just written.  (Advancing ix inside
 * the loop would make that call read 576 ints past the end of the buffer.)
 */
int
quantize_xrpow( float xr[576], int ix[576], gr_info *cod_info )
{
	int i;
	float step;

	/* step = 2^(-3/16 * quantizerStepSize) */
	step = pow ( 2.0, cod_info->quantizerStepSize * -0.1875 );

	for( i = 0; i < 576; i++ ){
		/* adding 0.4054 before truncation is the rounding rule used here */
		ix[i] = (int)( step * xr[i] + 0.4054);
	}
	return ix_max( ix, 0, 576 );
}
#endif	/* MIEasmQuantize */

int ix_max_3DN( int ix[576], unsigned int begin, unsigned int end );
int ix_max_MMX( int ix[576], unsigned int begin, unsigned int end );
int ix_max_SSE( int ix[576], unsigned int begin, unsigned int end );
int ix_max_NONE( int ix[576], unsigned int begin, unsigned int end );
#ifdef USE_E3DN
int ix_max_E3DN( int ix[576], unsigned int begin, unsigned int end );
#endif

/*
 * Point ix_max at one of the externally implemented variants, chosen from
 * the flag bits in useUNIT.
 * Order of preference: E3DN (only when compiled with USE_E3DN), 3DN, SSE,
 * MMX, and finally ix_max_NONE when none of those bits is set.
 * SETUP_DSP is invoked with a message naming the selected variant.
 */
void
setup_ix_max(int useUNIT)
{
#ifdef USE_E3DN
	if( useUNIT & tE3DN ){
		SETUP_DSP("use:ix_max_E3DN\n");
		ix_max = ix_max_E3DN;
		return;
	}
#endif
	if( useUNIT & t3DN ){
		SETUP_DSP("use:ix_max_3DN\n");
		ix_max = ix_max_3DN;
		return;
	}
	if( useUNIT & tSSE ){
		SETUP_DSP("use:ix_max_SSE\n");
		ix_max = ix_max_SSE;
		return;
	}
	if( useUNIT & tMMX ){
		SETUP_DSP("use:ix_max_MMX\n");
		ix_max = ix_max_MMX;
		return;
	}
	/* no matching flag: generic implementation */
	SETUP_DSP("use:ix_max_NONE\n");
	ix_max = ix_max_NONE;
}

#ifdef MIEasmIxMax
/*
 *	ix[]ˤʤʤ(ۤ)ñĴ
 *	end-beginʿ100
 *	99/08/14
 *	begin,end˶ begin<end Ǥ뤳ȤꤷƤ褤
 */
#else

/*
 * Return the largest value in ix[begin] .. ix[end-1].
 * The running maximum starts at 0, so the result is never negative and an
 * empty range (begin >= end) yields 0.
 */
int ix_max( int ix[576], unsigned int begin, unsigned int end ){
	int largest = 0;
	unsigned int pos = begin;

	while( pos < end ){
		if( ix[pos] > largest ){
			largest = ix[pos];
		}
		pos++;
	}
	return largest;
}
#endif /* MIEasmIxMax */

/* The functions below are helpers for count_bits() */

/* exported to huffmana.nas */
/*
 * Lookup table indexed by a maximum quantized value in 0..31.
 * The choose_table() macro and new_choose_table_NONE() read the entry as
 * the starting table number for that maximum.
 */
static const int choose_table_ptn[]={
/* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
   0, 1, 2, 5, 7, 7,10,10,13,13,13,13,13,13,13,13,
/* entries for 16 .. 31 */
  16,17,17,18,18,18,18,19,19,19,19,19,19,19,19,20};

/*
 * Table choice for large maxima (used by choose_table() when max >= 32).
 * Scans ht[15] .. ht[31] in order and returns the first index whose
 * linmax is at least (max - 15).
 * Returns 0 when no table qualifies; the original note reads:
 * "here if max >= 8192 (ht[24].linmax) cf. huffcode.tbl".
 */
static
int
choose_tableH( int max )
{
	const int excess = max - 15;
	int tbl = 15;

	while( tbl < 32 ){
		if( ht[tbl].linmax >= excess ){
			return tbl;
		}
		tbl++;
	}
	return 0;
}
/*
 * Map a maximum quantized value to a table number: values below 32 are
 * looked up in choose_table_ptn, larger ones go through choose_tableH().
 * The argument and the whole expansion are parenthesized so the macro
 * behaves as a single expression inside larger expressions.
 * Note: the argument is evaluated more than once; pass no side effects.
 */
#define choose_table( max ) ( ( (max) < 32 ) ? choose_table_ptn[(max)] : choose_tableH( (max) ) )

/*
 *	99/08/01
 *	count_bit()֤
 *	99/08/14
 *	begin<endꤷƤ褤
 *	99/12/23
 *	֤ͤλΤbitss­
 *	begin,end϶
 */

static int new_choose_table_NONE( int ix[576], unsigned int begin, unsigned int end, int *bits )
{
	int i, max;
	max = ix_max( ix, begin, end );

	if( max <= 15 ){	/* 8䤬 1500clk */
		int sum;
		int choice = 0;
		if( max == 0 ) return 0;
	/* try tables with no linbits */
#if 1
		choice = choose_table_ptn[max];
#else
		for( i = 0; i < 14; i++ ){
			if ( ht[i].xlen > max ){
				choice = i;
				break;
			}
		}
#endif
		/* ɬ choice != 0 */
		sum = count_bit( ix, begin, end, choice );
		switch( choice ){
		case 2:
			max = count_bit( ix, begin, end, 3 );
			if( max <= sum ){
				sum = max;
				choice = 3;
			}
			break;

		case 5:
			max = count_bit( ix, begin, end, 6 );
			if( max <= sum ){
				sum = max;
				choice = 6;
			}
			break;

		case 7:
			max = count_bit( ix, begin, end, 8 );
			if( max <= sum ){
				sum = max;
				choice = 8;
			}
			max = count_bit( ix, begin, end, 9 );
			if( max <= sum ){
				sum = max;
				choice = 9;
			}
			break;

		case 10:
			max = count_bit( ix, begin, end, 11 );
			if( max <= sum ){
				sum = max;
				choice = 11;
			}
			max = count_bit( ix, begin, end, 12 );
			if( max <= sum ){
				sum = max;
				choice = 12;
			}
			break;

		case 13:
			max = count_bit( ix, begin, end, 15 );
			if( max <= sum ){
				sum = max;
				choice = 15;
			}
			break;
		default:
			break;
		}
		*bits += sum;
		return choice;
	}else{	/* 2䤬 800clk */
		int sum[2];
		int choice[2];
		choice[0] = 0;
		choice[1] = 0;

	/* try tables with linbits */
		max -= 15;	// max  16ʾä max >= 1

//		for( i = 15; i < 24; i++ ){
		for( i = 16; i < 24; i++ ){
			if( ht[i].linmax >= max ){
				choice[0] = i;
			break;
			}
		}
		for( i = 24; i < 32; i++ ){
			if( ht[i].linmax >= max ){
				choice[1] = i;
				break;
			}
		}
		if( choice[0] ){
			sum[0] = count_bit( ix, begin, end, choice[0] );
		}else{
			sum[0] = 0;
		}
		if( choice[1] ){
			sum[1] = count_bit( ix, begin, end, choice[1] );
		}else{
			sum[1] = 0;
		}
		if( sum[1] < sum[0] ){
			sum[0] = sum[1];
			choice[0] = choice[1];
		}
		if( choice[0] ){
			*bits += sum[0];
		}
		return choice[0];
	}
}

#if 0
/*
 *	separate this from orginal function 99/12/23
 *	4400clk
 */

/*
 * (Disabled by the enclosing #if 0.)
 * Choose tables for up to three regions of ix[] and return the summed bit
 * cost reported by new_choose_table():
 *   tbl[0] : [0, min(adr1, adr2))  when that upper bound is > 0
 *   tbl[1] : [adr1, adr2)          when adr2 > adr1
 *   tbl[2] : [adr2, adr3)          when adr3 > adr2
 * Entries for regions that are not processed stay 0.
 */
static int bigv_bitcount2( int *ix, unsigned int *tbl, int adr1, int adr2, int adr3 ){
	int bits = 0;
	int adr;
	tbl[0] = tbl[1] = tbl[2] = 0;
	/* region 0 must not extend past adr2 */
	adr = Min( adr1, adr2 );
	if( adr > 0 ){
		tbl[0] = new_choose_table( ix, 0, adr, &bits );
	}
	if( adr2 > adr1 ){
		tbl[1] = new_choose_table( ix, adr1, adr2, &bits );
	}
	if( adr3 > adr2 ){
		tbl[2] = new_choose_table( ix, adr2, adr3, &bits );
	}
	return bits;
}
#endif

/*
 * Bit count for the layout used when count_bits() sees SHORT_TYPE.
 *
 * Within each scalefactor band, data is given for successive time
 * windows, beginning with window 0 and ending with window 2.  Within each
 * window, the quantized values are then arranged in order of increasing
 * frequency.
 *
 * Pass 1 finds the maximum value over bands starting below line 12 and
 * over the remaining bands, and stores choose_table() of each maximum in
 * table_select[0] and table_select[1] (table_select[2] is set to 0).
 * Pass 2 sums HuffmanCode() over every (line, line+1) pair of every window
 * using the table of the band; bands whose table is 0 contribute nothing.
 * Returns the summed bit count.
 */
static int bigv_bitcount( int ix[576], gr_info *cod_info ){
	const int region1Start = 12;
	I192_3 *ix_s = (I192_3 *) &ix[0];
	int peak[2] = { 0, 0 };	/* [0]: bands below region1Start, [1]: the rest */
	int total = 0;
	int sfb, window, line;

	/* pass 1: per-region maxima */
	for( sfb = 0; sfb < 13; sfb++ ){
		const int lo = scalefac_short[ sfb ];
		const int hi = scalefac_short[ sfb+1 ];
		int *dst = ( lo < region1Start ) ? &peak[0] : &peak[1];

		for( window = 0; window < 3; window++ ){
			for( line = lo; line < hi; line += 2 ){
				const int a = ix[ (line * 3) + window ];
				const int b = ix[ ((line + 1) * 3) + window ];
				if( a > *dst ) *dst = a;
				if( b > *dst ) *dst = b;
			}
		}
	}
	cod_info->table_select[0] = choose_table( peak[0] );
	cod_info->table_select[1] = choose_table( peak[1] );
	cod_info->table_select[2] = 0;

	/* pass 2: accumulate the code lengths */
	for( sfb = 0; sfb < 13; sfb++ ){
		const int lo = scalefac_short[ sfb ];
		const int hi = scalefac_short[ sfb+1 ];
		unsigned tableindex;

		if( lo < 12 ){
			tableindex = cod_info->table_select[0];
		}else{
			tableindex = cod_info->table_select[1];
		}
		if( tableindex == 0 ) continue;

		for( window = 0; window < 3; window++ ){
			for( line = lo; line < hi; line += 2 ){
				/* only the return value is used; the outputs are scratch */
				unsigned int code, ext;
				int cbits, xbits;
				int x = (*ix_s)[line][window];
				int y = (*ix_s)[line + 1][window];
				total += HuffmanCode( tableindex, x, y, &code, &ext, &cbits, &xbits );
			}
		}
	}
	return total;
}

#ifdef MIEasmHuffmana

int count_bit_NONE(int ix[576],unsigned int start,unsigned int end,unsigned int table);
int count_bit_MMX(int ix[576],unsigned int start,unsigned int end,unsigned int table);
/*
 * Point count_bit at count_bit_MMX or count_bit_NONE from the flag bits in
 * useUNIT.  The MMX version is used only when tMMX is set and the
 * tAMD + tFAMILY6 combination is not; every other case gets the NONE
 * version.  SETUP_DSP is invoked with a message naming the choice.
 */
void setup_count_bit(int useUNIT){
	const int amd_family6 = ( useUNIT & tAMD ) && ( useUNIT & tFAMILY6 );

	if( !amd_family6 && ( useUNIT & tMMX ) ){
		SETUP_DSP("use:count_bit_MMX\n");
		count_bit = count_bit_MMX;
	}else{
		SETUP_DSP("use:count_bit_NONE\n");
		count_bit = count_bit_NONE;
	}
}
#else /* MIEasmHuffmana */

/* table_select <=15 */
/*
 * Bit cost of the pair (x, y) for table_select <= 15:
 * h->hlen[x*16 + y] plus one extra bit for each non-zero value.
 */
int HuffmanCodeL(struct huffcodetab *h, int x, int y){
	const unsigned int nonzero = ( x != 0 ) + ( y != 0 );
	const unsigned int idx = (unsigned int)( x * 16 ) + (unsigned int)y;

	return h->hlen[idx] + nonzero;
}

/* table_select >15 */
/*
 * Bit cost of the pair (x, y) for table_select > 15.
 * A value above 14 is looked up as 15 and costs h->linbits extra bits;
 * each non-zero value costs one extra bit.  The result is
 * h->hlen[16*x + y] (after clamping) plus those extras.
 */
int HuffmanCodeH(struct huffcodetab *h, int x, int y){
	unsigned int extra = 0;
	unsigned int idx;

	if( x > 14 ){
		extra += h->linbits;
		x = 15;
	}
	if( y > 14 ){
		extra += h->linbits;
		y = 15;
	}
	if( x != 0 ) extra++;
	if( y != 0 ) extra++;

	idx = (unsigned int)( 16 * x ) + (unsigned int)y;
	return h->hlen[idx] + extra;
}

int
count_bit(int ix[576],unsigned int start,unsigned int end,unsigned int table){
    int i, sum;
	struct huffcodetab *h;

    sum = 0;
	h=&ht[table];
#if 0	//tableϾ0ʳǤϤ
	if(table == 0)return 0;
#endif
	if( table > 15){
		for(i = start; i < end; i += 2){
			sum += HuffmanCodeH(h,ix[i],ix[i+1]);
		}
	}else{
		for(i = start; i < end; i += 2){
			sum += HuffmanCodeL(h,ix[i],ix[i+1]);
        }
    }
    return sum;
}

#endif /* MIEasmHuffmana */

int new_choose_table_MMX( int ix[576], unsigned int begin, unsigned int end, int *bits );
static int new_choose_table_NONE( int ix[576], unsigned int begin, unsigned int end, int *bits );
/*
 * Point new_choose_table at the MMX variant when tMMX is set in useUNIT,
 * otherwise at new_choose_table_NONE.  SETUP_DSP is invoked with a message
 * naming the choice.
 *
 * A special case forcing the NONE variant for tAMD + tFAMILY6 used to sit
 * here but was disabled.  Original author's note (translated, best
 * effort): NONE is faster in benchmark mode, but MMX is faster in real
 * encoding on K7 -- possibly an effect of branch prediction.
 */
void setup_new_choose_table(int useUNIT)
{
	if( !( useUNIT & tMMX ) ){
		SETUP_DSP("use:new_choose_table_NONE\n");
		new_choose_table = new_choose_table_NONE;
		return;
	}
	SETUP_DSP("use:new_choose_table_MMX\n");
	new_choose_table = new_choose_table_MMX;
}

/*
 *	lame3.55äcalc_runlen  count1_bitcount  00/01/16
 *	®ŪѤʤ
 */
int
calc_runlen_count1( int *ix, gr_info *cod_info );

#if 0
/*
 * (Disabled by the enclosing #if 0.)
 * C version of the run-length / count1 analysis.
 * Scanning ix[] from the end, it skips trailing all-zero pairs, then
 * consumes quadruples whose values are all 0 or 1, counting them in
 * cod_info->count1.  It sets cod_info->big_values to half the index where
 * the scan stopped, selects count1table_select (0 or 1) for the cheaper
 * of the two cost sums, and returns that cost.
 */
int
calc_runlen_count1_C( int *ix, gr_info *cod_info ){
	/* per-quadruple length used for the count1table_select == 0 sum,
	 * indexed by (v<<3)+(w<<2)+(x<<1)+y */
	static unsigned int hlen32[]={
		1, 4, 4, 5, 4, 6, 5, 6, 4, 5, 5, 6, 5, 6, 6, 6
	};
	int i, k;
	int sum0, sum1;
	int sum00 = 0, sum01 = 0;
	cod_info->count1 = 0;
	/* skip trailing pairs that are entirely zero */
	for( i = 576; i > 1; i -= 2 ){
		if( ix[i-1] | ix[i-2] )break;
	}
	/* consume quadruples while every value is 0 or 1 */
	for( ; i > 3; i -= 4 ){
		int v, w, x, y;
		y = ix[i-1];
		x = ix[i-2];
		w = ix[i-3];
		v = ix[i-4];
		if( ( x | y | w | v ) <= 1 ){	/* v, w, x, y are each 0 or 1 */
			sum00 += v + w + x + y;	/* one extra bit per non-zero value */
			sum01 += hlen32[(v << 3) + (w << 2) + (x << 1) + y];
			cod_info->count1++;
		}else{
			break;
		}
	}
	sum0 = sum00 + sum01;
	sum1 = sum00 + cod_info->count1 * 4;	/* ht[33].hlen[p]=4 */
	cod_info->big_values = i/2;
	/* keep the cheaper alternative; ties select table 1 */
	if( sum0 < sum1 ){
        cod_info->count1table_select = 0;
		return	sum0;
	}else{
        cod_info->count1table_select = 1;
		return	sum1;
    }
}
#endif

#if 0
/* Υ롼 count_bits ΤߤƤФ */
/* cod_info->block_type != SHORT_TYPE */
/* big_values > 0 */
/* region0[i] + region1[i] <= i for i = 2,...22 */
/* region0[1] + region1[1] + 1 = 2 */
/*
 * (Disabled by the enclosing #if 0.)
 * Derive region0_count / region1_count and the boundaries adr1, adr2,
 * adr3 from cod_info->big_values: i is the first index (starting at 1)
 * with scalefac_long[i] >= 2*big_values, and the counts come from the
 * region0[] / region1[] tables at that index.
 */
static
void 
subdivide( gr_info *cod_info )
{
	static const int region0[23]={0,0,0,0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6};
	static const int region1[23]={0,0,0,0,0,1,1,1,2,2,3,3,4,4,4,5,5,6,6,6,7,7,7};
	int bigv;
	int i = 1;

	/* each big_values entry stands for a pair of lines */
	bigv = 2 * cod_info->big_values;
	while( scalefac_long[i] < bigv ){
		i++;
	}

	cod_info->region0_count = region0[i];
	cod_info->adr1 = scalefac_long[ region0[i] + 1 ];

	cod_info->region1_count = region1[i];
	cod_info->adr2 = scalefac_long[ region0[i] + region1[i] + 2 ];
	/* the last region ends where the big values end */
	cod_info->adr3 = bigv;
}
#endif
/*
 *	max=ix_max();礭뤫ɤΤ˻ȤΤʤ
 *	ǻߤ᤿äʤ, άäƤ褦ʵ⤹
 *	98/08/09	PENΤȤĴ٤ܤ餷
 *	total 11000clk
 *	8191 + 14 ѹԲ
 *	00/01/05 ʬʬ
 *	MMXȤʤʤ Takehiro 롼®ä
 *	00/01/18 non MMXǤʤι®
 *	00/03/03 subdivide ΥХ & ®
 */

/* input: ix, cod_info */

/*
 * Compute the number of bits needed for ix[] and fill in the related
 * fields of cod_info (count1, big_values, count1table_select, region
 * counts, adr1..adr3, table_select[]).
 *
 * input: ix, cod_info
 * Returns the bit count.
 *
 * The "ix_max(ix,0,576) > 8191 + 14 => return 100000" guard that used to
 * start this function was moved into quantize_xrpow().
 *
 * SHORT_TYPE blocks use fixed region settings and bigv_bitcount().
 * Other blocks get the count1 cost from calc_runlen_count1() and then add
 * the cost of up to three big-value regions via new_choose_table().
 * When there are no big values the function returns right after the
 * count1 cost with table_select[] zeroed: adr1/adr2 are not updated in
 * that case, so consulting them would count bits using boundaries left
 * over from an earlier call.
 */
static
int
count_bits( int  *ix, gr_info *cod_info)
{
	if( cod_info->block_type == SHORT_TYPE ){
		cod_info->count1 = 0;
		cod_info->big_values = 288;
		cod_info->count1table_select = 1;
		/* from subdivide */
		cod_info->region0_count =  8;
		cod_info->region1_count =  36;
		cod_info->adr1 = 36;
		cod_info->adr2 = 576;
		cod_info->adr3 = 0;
		return bigv_bitcount(ix,cod_info);
	}else{
		int bits;
		int adr;
		int bigv;

		/* sets cod_info->big_values (and the count1 fields) as a side effect */
		bits = calc_runlen_count1( ix, cod_info );
		bigv = cod_info->big_values * 2;

		cod_info->table_select[0] = cod_info->table_select[1] = cod_info->table_select[2] = 0;

		if( !cod_info->big_values ){
			cod_info->region0_count = 0;
			cod_info->region1_count = 0;
			/* adr1..adr3 are left untouched (not necessarily 0); with no
			 * big-value pairs there is nothing more to count */
			return bits;
		}else if( cod_info->window_switching_flag ){
			cod_info->region0_count = 7;
			cod_info->region1_count = 13;
			cod_info->adr1 = scalefac_long[8];
			cod_info->adr2 = bigv;
			cod_info->adr3 = 0;
		}else{
			static const int region0[23]={0,0,0,0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6};
			static const int region1[23]={0,0,0,0,0,1,1,1,2,2,3,3,4,4,4,5,5,6,6,6,7,7,7};
			int i = 1;

			/* first band boundary at or beyond the end of the big values */
			while( scalefac_long[i] < bigv ) i++;

			cod_info->region0_count = region0[i];
			cod_info->adr1 = scalefac_long[ region0[i] + 1 ];

			cod_info->region1_count = region1[i];
			cod_info->adr2 = scalefac_long[ region0[i] + region1[i] + 2 ];
			cod_info->adr3 = bigv;
		}

		/* region 0: [0, min(adr1, adr2)) */
		adr = Min( cod_info->adr1, cod_info->adr2 );
		if( adr > 0 ){
			cod_info->table_select[0] = new_choose_table( ix, 0, adr, &bits );
		}
		/* region 1: [adr1, adr2) */
		if( cod_info->adr2 > cod_info->adr1 ){
			cod_info->table_select[1] = new_choose_table( ix, cod_info->adr1, cod_info->adr2, &bits );
		}
		/* region 2: [adr2, bigv) */
		if( bigv > cod_info->adr2 ){
			cod_info->table_select[2] = new_choose_table( ix, cod_info->adr2, bigv, &bits );
		}
		return bits;
	}
}
