/* this file is a part of amp software, (C) tomislav uzelac 1996,1997
*/
 
/* transform.c  imdct and polyphase(DCT) transforms
 *
 * Created by: tomislav uzelac  May 1996
 * Karl Anders Oygard optimized this for speed, Mar 13 97
 * Tinic Urou: Optimized for RISC processors and cleaned up source.
 * It's now incredibly fast on a PowerPC! Jun 07 97
 */

/*
 * Comments for this file:
 *
 * The polyphase algorithm is clearly the most CPU-consuming part of MPEG-1
 * layer 3 decoding.  Thus, there has been some effort to optimise this
 * particular algorithm.  Currently, everything has been kept in straight C
 * with no assembler optimisations, but in order to provide efficient paths
 * for different architectures, alternative implementations of some
 * critical sections have been written.  You may want to experiment with
 * these to see which suits your architecture better.
 *
 * Selection of the different implementations is done with the following
 * defines:
 *
 *     HAS_AUTOINCREMENT
 *
 *         Define this if your architecture supports preincrementation of
 *         pointers when referencing (applies to e.g. 68k)
 *
 * For those who are optimising amp, check out the Pentium rdtsc code
 * (define PENTIUM_RDTSC).  This code uses the rdtsc counter for showing
 * how many cycles are spent in different parts of the code.
 */

#include <math.h>
#include <sys/types.h>
#include <unistd.h>
#include <string.h>

#include "audio.h"
#include "getdata.h"
#include "misc2.h"

#define TRANSFORM
#include "transform.h"

/* Selects the fully unrolled dewindowing code in poly(); when this is not
 * defined, poly() compiles its older loop-based dewindowing instead. */
#define SUPERFAST

/* pi/12 and pi/36 in single precision; imdct_init() uses them as the
 * angular step when filling the sine-shaped parts of the window tables. */
#define PI12      0.261799387f
#define PI36      0.087266462f

/*
 * Fill the IMDCT window tables win[0], win[1] and win[3] (36 entries each).
 * win[2] is not written here; imdct() has the window for win_type 2 folded
 * into its own constants.
 */
void imdct_init()
{
  int i;

  /* Window 0 is sin(PI36 * (i + 1/2)) over all 36 points.  Window 1 uses
   * the same values for its first 18 points and window 3 for its last 18,
   * so each sine is evaluated once and stored wherever it is needed. */
  for (i = 0; i < 36; i++) {
    float full = (float) sin(PI36 *(i+0.5));

    win[0][i] = full;
    if (i < 18)
      win[1][i] = full;
    else
      win[3][i] = full;
  }

  /* The remaining parts of windows 1 and 3 are made of six-point segments:
   * a flat 1.0 plateau, a short sine slope, and a run of zeros. */
  for (i = 0; i < 6; i++) {
    win[1][18 + i] = 1.0f;
    win[1][24 + i] = (float) sin(PI12 *((24 + i)+0.5-18));
    win[1][30 + i] = 0.0f;

    win[3][i]      = 0.0f;
    win[3][6 + i]  = (float) sin(PI12 * ((6 + i)+ 0.5 - 6.0));
    win[3][12 + i] = 1.0f;
  }
}

/* This uses Byeong Gi Lee's Fast Cosine Transform algorithm to decompose
   the 36 point and 12 point IDCT's into 9 point and 3 point IDCT's,
   respectively. Then the 9 point IDCT is computed by a modified version of
   Mikko Tommila's IDCT algorithm, based on the WFTA. See his comments
   before the first 9 point IDCT. The 3 point IDCT is already efficient to
   implement. -- Jeff Tsay. */
/* I got the unrolled IDCT from Jeff Tsay; the code is presumably by 
   Francois-Raymond Boyer - I unrolled it a little further. tu */

void imdct(int win_type,int sb,int ch)
{
/*------------------------------------------------------------------*/
/*                                                                  */
/*    Function: Calculation of the inverse MDCT                     */
/*    In the case of short blocks the 3 output vectors are already  */
/*    overlapped and added in this modul.                           */
/*                                                                  */
/*    New layer3                                                    */
/*                                                                  */
/*------------------------------------------------------------------*/

	float *win_bt;
	int i, p, ss;
	float *in = xr[ch][sb];
	float *s_p = s[ch][sb];
	float *res_p = res[sb];
	float out[36];

	if(win_type == 2)
	{	
		// Putting constants into registers first is much faster on PPC
 		float 	f00=0.0f;
		float 	f05=0.5f;

		// unrolled, fast version
		out[ 0] = f00;
		out[ 1] = f00;
		out[ 2] = f00;
		out[ 3] = f00;
		out[ 4] = f00;
		out[ 5] = f00;
		out[ 6] = f00;
		out[ 7] = f00;
		out[ 8] = f00;
		out[ 9] = f00;
		out[10] = f00;
		out[11] = f00;
		out[12] = f00;
		out[13] = f00;
		out[14] = f00;
		out[15] = f00;
		out[16] = f00;
		out[17] = f00;
		out[18] = f00;
		out[19] = f00;
		out[20] = f00;
		out[21] = f00;
		out[22] = f00;
		out[23] = f00;
		out[24] = f00;
		out[25] = f00;
		out[26] = f00;
		out[27] = f00;
		out[28] = f00;
		out[29] = f00;
		out[30] = f00;
		out[31] = f00;
		out[32] = f00;
		out[33] = f00;
		out[34] = f00;
		out[35] = f00;

		{
			float cospi6=0.866025403f;
			float tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, tmp10, tmp11;
	
			for(ss=0;ss<18;ss+=6) 
			{	
				/*
				*  12 point IMDCT
				*/
				
				float in0,in1,in2,in3,in4,in5;

				{
					float *inss=&in[ss];				
				
					in0=inss[0];	in1=inss[1];
					in2=inss[2];	in3=inss[3];
					in4=inss[4];	in5=inss[5];
				}		

			    /* Begin 12 point IDCT */
				
				// Input aliasing for 12 pt IDCT
				in5+=in4;in4+=in3;in3+=in2;
				in2+=in1;in1+=in0;
			
				// Input aliasing on odd indices (for 6 point IDCT)
				in5+=in3;in3+=in1;
				
				{
					float sum,pp1;
					// 3 point IDCT on even indices
			
					pp1 = in2 * cospi6;
					sum = in0 + in4 * f05;
					tmp1= in0 - in4;
					tmp0= sum + pp1;
					tmp2= sum - pp1;
			
					// End 3 point IDCT on even indices
			
					// 3 point IDCT on odd indices (for 6 point IDCT)
			
					pp1 = in3 * cospi6;
					sum = in1 + in5 * f05;
					tmp4 = in1 - in5;
					tmp5 = sum + pp1;
					tmp3 = sum - pp1;
				}

				/* End 3 point IDCT on odd indices */
				
				/* Twiddle factors on odd indices (for 6 point IDCT) */
				
				tmp3 *= 1.931851653f;
				tmp4 *= 0.707106781f;
				tmp5 *= 0.517638090f;
				
				/* Output butterflies on 2 3 point IDCT's (for 6 point IDCT) */
				
				{
					float save;
					save = tmp0;
					tmp0 += tmp5;
					tmp5 = save - tmp5;
					save = tmp1;
					tmp1 += tmp4;
					tmp4 = save - tmp4;
					save = tmp2;
					tmp2 += tmp3;
					tmp3 = save - tmp3;
				}
				
				/* End 6 point IDCT */
				
				/* Twiddle factors on indices (for 12 point IDCT) */
				
				tmp0 *= 0.504314480f;
				tmp1 *= 0.541196100f;
				tmp2 *= 0.630236207f;
				tmp3 *= 0.821339815f;
				tmp4 *= 1.306562965f;
				tmp5 *= 3.830648788f;
				
				/* End 12 point IDCT */
				
				/* Shift to 12 point modified IDCT, multiply by window type 2 */
				tmp8  = tmp0 * -0.793353340f;
				tmp9  = tmp0 * -0.608761429f;
				tmp7  = tmp1 * -0.923879532f;
				tmp10 = tmp1 * -0.382683432f;
				tmp6  = tmp2 * -0.991444861f;
				tmp11 = tmp2 * -0.130526192f;
				
				tmp0  = tmp3;
				tmp1  = tmp4 *  0.382683432f;
				tmp2  = tmp5 *  0.608761429f;
				
				tmp3  = tmp5 * -0.793353340f;
				tmp4  = tmp4 * -0.923879532f;
				tmp5  = tmp0 * -0.991444861f;
				
				tmp0  *= 0.130526192f;
				
				{
					float *outss=&out[ss];
	
					outss[6]  += tmp0;
					outss[7]  += tmp1;
					outss[8]  += tmp2;
					outss[9]  += tmp3;
					outss[10] += tmp4;
					outss[11] += tmp5;
					outss[12] += tmp6;
					outss[13] += tmp7;
					outss[14] += tmp8;
					outss[15] += tmp9;
					outss[16] += tmp10;
					outss[17] += tmp11;
				}	
			}
		}		
		
		if (sb&1) 
		{
			res_p[ 0] =   out[ 0] + s_p[ 0];
			res_p[ 1] = - out[ 1] - s_p[ 1];
			res_p[ 2] =   out[ 2] + s_p[ 2];
			res_p[ 3] = - out[ 3] - s_p[ 3];
			res_p[ 4] =   out[ 4] + s_p[ 4];
			res_p[ 5] = - out[ 5] - s_p[ 5];
			res_p[ 6] =   out[ 6] + s_p[ 6];
			res_p[ 7] = - out[ 7] - s_p[ 7];
			res_p[ 8] =   out[ 8] + s_p[ 8];
			res_p[ 9] = - out[ 9] - s_p[ 9];
			res_p[10] =   out[10] + s_p[10];
			res_p[11] = - out[11] - s_p[11];
			res_p[12] =   out[12] + s_p[12];
			res_p[13] = - out[13] - s_p[13];
			res_p[14] =   out[14] + s_p[14];
			res_p[15] = - out[15] - s_p[15];
			res_p[16] =   out[16] + s_p[16];
			res_p[17] = - out[17] - s_p[17];
		} 
		else
		{
			res_p[ 0] =   out[ 0] + s_p[ 0];
			res_p[ 1] =   out[ 1] + s_p[ 1];
			res_p[ 2] =   out[ 2] + s_p[ 2];
			res_p[ 3] =   out[ 3] + s_p[ 3];
			res_p[ 4] =   out[ 4] + s_p[ 4];
			res_p[ 5] =   out[ 5] + s_p[ 5];
			res_p[ 6] =   out[ 6] + s_p[ 6];
			res_p[ 7] =   out[ 7] + s_p[ 7];
			res_p[ 8] =   out[ 8] + s_p[ 8];
			res_p[ 9] =   out[ 9] + s_p[ 9];
			res_p[10] =   out[10] + s_p[10];
			res_p[11] =   out[11] + s_p[11];
			res_p[12] =   out[12] + s_p[12];
			res_p[13] =   out[13] + s_p[13];
			res_p[14] =   out[14] + s_p[14];
			res_p[15] =   out[15] + s_p[15];
			res_p[16] =   out[16] + s_p[16];
			res_p[17] =   out[17] + s_p[17];
		}
		
		s_p[ 0]	=out[18];
		s_p[ 1]	=out[19];
		s_p[ 2]	=out[20];
		s_p[ 3]	=out[21];
		s_p[ 4]	=out[22];
		s_p[ 5]	=out[23];
		s_p[ 6]	=out[24];
		s_p[ 7]	=out[25];
		s_p[ 8]	=out[26];
		s_p[ 9]	=out[27];
		s_p[10]	=out[28];
		s_p[11]	=out[29];
		s_p[12]	=out[30];
		s_p[13]	=out[31];
		s_p[14]	=out[32];
		s_p[15]	=out[33];
		s_p[16]	=out[34];
		s_p[17]	=out[35];

    } 
    else 
    {
/*
 * 36 point IDCT ****************************************************************
 */
		float t0,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16,t17;
		{
			float in0,in1,in2,in3,in4,in5,in6,in7,in8,in9,in10,in11,in12,in13,in14,in15,in16,in17;		
		
			in0=in[0];		in1=in[1];		in2=in[2];		in3=in[3];
			in4=in[4];		in5=in[5];		in6=in[6];		in7=in[7];
			in8=in[8];		in9=in[9];		in10=in[10];	in11=in[11];
			in12=in[12];	in13=in[13];	in14=in[14];	in15=in[15];
			in16=in[16];	in17=in[17];	
			
			// input aliasing for 36 point IDCT
		
			in17+=in16; in16+=in15; in15+=in14; in14+=in13;
			in13+=in12; in12+=in11; in11+=in10; in10+=in9;
			in9 +=in8;  in8 +=in7;  in7 +=in6;  in6 +=in5;
			in5 +=in4;  in4 +=in3;  in3 +=in2;  in2 +=in1;
			in1 +=in0;
		
			// 18 point IDCT for odd indices
		
			// input aliasing for 18 point IDCT
			in17+=in15; in15+=in13; in13+=in11; in11+=in9;
			in9 +=in7;  in7 +=in5;  in5 +=in3;  in3 +=in1;
				
			{
				float tmp0,tmp1,tmp2,tmp3,tmp4,tmp0_,tmp1_,tmp2_,tmp3_;
				float tmp0o,tmp1o,tmp2o,tmp3o,tmp4o,tmp0_o,tmp1_o,tmp2_o,tmp3_o;
				
				/* Fast 9 Point Inverse Discrete Cosine Transform
				//
				// By  Francois-Raymond Boyer
				//         mailto:boyerf@iro.umontreal.ca
				//         http://www.iro.umontreal.ca/~boyerf
				//
				// The code has been optimized for Intel processors
				//  (takes a lot of time to convert float to and from iternal FPU representation)
				//
				// It is a simple "factorization" of the IDCT matrix.
				*/
				/* 9 point IDCT on even indices */
				{
					/* 5 points on odd indices (not realy an IDCT) */
					float i0 = in0+in0;
					float i0p12 = i0 + in12;
					
					tmp0 = i0p12 + in4*1.87938524157180f + in8*1.53208888623800f + in16*0.34729635533386f;
					tmp1 = i0    + in4                   - in8 - in12 - in12     - in16;
					tmp2 = i0p12 - in4*0.34729635533386f - in8*1.87938524157180f + in16*1.53208888623800f;
					tmp3 = i0p12 - in4*1.53208888623800f + in8*0.34729635533386f - in16*1.87938524157180f;
					tmp4 = in0   - in4                   + in8 - in12            + in16;
				}
				{
					float i6_ = in6*1.732050808f;		
					
					tmp0_ = in2*1.96961550602440f + i6_ + in10*1.28557521937310f + in14 *0.68404028665134f;
					tmp1_ =(in2                         - in10                   - in14)*1.73205080800000f;
					tmp2_ = in2*1.28557521937310f - i6_ - in10*0.68404028665134f + in14 *1.96961550602440f;
					tmp3_ = in2*0.68404028665134f - i6_ + in10*1.96961550602440f - in14 *1.28557521937310f;
				}
				
				/* 9 point IDCT on odd indices */
				{
					/* 5 points on odd indices (not realy an IDCT) */
					float i0 = in1+in1;
					float i0p12 = i0 + in13;
					
					tmp0o = i0p12   + in5*1.8793852415718f  + in9*1.532088886238f       + in17*0.34729635533386f;
					tmp1o = i0      + in5                   - in9 - in13 - in13         - in17;
					tmp2o = i0p12   - in5*0.34729635533386f - in9*1.8793852415718f      + in17*1.532088886238f;
					tmp3o = i0p12   - in5*1.532088886238f   + in9*0.34729635533386f     - in17*1.8793852415718f;
					tmp4o = (in1    - in5                   + in9 - in13                + in17)*0.707106781f; /* Twiddled */
				}
				{
					/* 4 points on even indices */
					float i6_ = in7*1.732050808f;		/* Sqrt[3] */
					
					tmp0_o = in3*1.9696155060244f  + i6_ + in11*1.2855752193731f  + in15*0.68404028665134f;
					tmp1_o = (in3                        - in11                   - in15)*1.732050808f;
					tmp2_o = in3*1.2855752193731f  - i6_ - in11*0.68404028665134f + in15*1.9696155060244f;
					tmp3_o = in3*0.68404028665134f - i6_ + in11*1.9696155060244f  - in15*1.2855752193731f;
				}
				
				/* Twiddle factors on odd indices
				// and
				// Butterflies on 9 point IDCT's
				// and
				// twiddle factors for 36 point IDCT
				*/
				{
					float e, o;
					e = tmp0 + tmp0_; o = (tmp0o + tmp0_o)*0.501909918f; t0 = (e + o)*(-0.500476342f*.5f);    t17 = (e - o)*(-11.46279281f*.5f);
					e = tmp1 + tmp1_; o = (tmp1o + tmp1_o)*0.517638090f; t1 = (e + o)*(-0.504314480f*.5f);    t16 = (e - o)*(-3.830648788f*.5f);
					e = tmp2 + tmp2_; o = (tmp2o + tmp2_o)*0.551688959f; t2 = (e + o)*(-0.512139757f*.5f);    t15 = (e - o)*(-2.310113158f*.5f);
					e = tmp3 + tmp3_; o = (tmp3o + tmp3_o)*0.610387294f; t3 = (e + o)*(-0.524264562f*.5f);    t14 = (e - o)*(-1.662754762f*.5f);
					                                                     t4 = (tmp4 + tmp4o)*(-0.541196100f); t13 = (tmp4 - tmp4o)*(-1.306562965f);
					e = tmp3 - tmp3_; o = (tmp3o - tmp3_o)*0.871723397f; t5 = (e + o)*(-0.563690973f*.5f);    t12 = (e - o)*(-1.082840285f*.5f);
					e = tmp2 - tmp2_; o = (tmp2o - tmp2_o)*1.183100792f; t6 = (e + o)*(-0.592844523f*.5f);    t11 = (e - o)*(-0.930579498f*.5f);
					e = tmp1 - tmp1_; o = (tmp1o - tmp1_o)*1.931851653f; t7 = (e + o)*(-0.630236207f*.5f);    t10 = (e - o)*(-0.821339815f*.5f);
					e = tmp0 - tmp0_; o = (tmp0o - tmp0_o)*5.736856623f; t8 = (e + o)*(-0.678170852f*.5f);    t9  = (e - o)*(-0.740093616f*.5f);
				}
			}
		}
		/* shift to modified IDCT */
		win_bt = win[win_type];
		
		if (sb&1) 
		{
			res_p[ 0] = - t9  * win_bt[ 0] + s_p[ 0];
			res_p[ 1] = + t10 * win_bt[ 1] - s_p[ 1];
			res_p[ 2] = - t11 * win_bt[ 2] + s_p[ 2];
			res_p[ 3] = + t12 * win_bt[ 3] - s_p[ 3];
			res_p[ 4] = - t13 * win_bt[ 4] + s_p[ 4];
			res_p[ 5] = + t14 * win_bt[ 5] - s_p[ 5];
			res_p[ 6] = - t15 * win_bt[ 6] + s_p[ 6];
			res_p[ 7] = + t16 * win_bt[ 7] - s_p[ 7];
			res_p[ 8] = - t17 * win_bt[ 8] + s_p[ 8];
			
			res_p[ 9] = - t17 * win_bt[ 9] - s_p[ 9];
			res_p[10] = + t16 * win_bt[10] + s_p[10];
			res_p[11] = - t15 * win_bt[11] - s_p[11];
			res_p[12] = + t14 * win_bt[12] + s_p[12];
			res_p[13] = - t13 * win_bt[13] - s_p[13];
			res_p[14] = + t12 * win_bt[14] + s_p[14];
			res_p[15] = - t11 * win_bt[15] - s_p[15];
			res_p[16] = + t10 * win_bt[16] + s_p[16];
			res_p[17] = - t9  * win_bt[17] - s_p[17];
		} 
		else 
		{
			res_p[ 0] = -t9  * win_bt[ 0] + s_p[ 0];
			res_p[ 1] = -t10 * win_bt[ 1] + s_p[ 1];
			res_p[ 2] = -t11 * win_bt[ 2] + s_p[ 2];
			res_p[ 3] = -t12 * win_bt[ 3] + s_p[ 3];
			res_p[ 4] = -t13 * win_bt[ 4] + s_p[ 4];
			res_p[ 5] = -t14 * win_bt[ 5] + s_p[ 5];
			res_p[ 6] = -t15 * win_bt[ 6] + s_p[ 6];
			res_p[ 7] = -t16 * win_bt[ 7] + s_p[ 7];
			res_p[ 8] = -t17 * win_bt[ 8] + s_p[ 8];
			
			res_p[ 9] =  t17 * win_bt[ 9] + s_p[ 9];
			res_p[10] =  t16 * win_bt[10] + s_p[10];
			res_p[11] =  t15 * win_bt[11] + s_p[11];
			res_p[12] =  t14 * win_bt[12] + s_p[12];
			res_p[13] =  t13 * win_bt[13] + s_p[13];
			res_p[14] =  t12 * win_bt[14] + s_p[14];
			res_p[15] =  t11 * win_bt[15] + s_p[15];
			res_p[16] =  t10 * win_bt[16] + s_p[16];
			res_p[17] =  t9  * win_bt[17] + s_p[17];
		}
		
		s_p[ 0]= t8 * win_bt[18];
		s_p[ 1]= t7 * win_bt[19];
		s_p[ 2]= t6 * win_bt[20];
		s_p[ 3]= t5 * win_bt[21];
		s_p[ 4]= t4 * win_bt[22];
		s_p[ 5]= t3 * win_bt[23];
		s_p[ 6]= t2 * win_bt[24];
		s_p[ 7]= t1 * win_bt[25];
		s_p[ 8]= t0 * win_bt[26];
				
		s_p[ 9]= t0 * win_bt[27];
		s_p[10]= t1 * win_bt[28];
		s_p[11]= t2 * win_bt[29];
		s_p[12]= t3 * win_bt[30];
		s_p[13]= t4 * win_bt[31];
		s_p[14]= t5 * win_bt[32];
		s_p[15]= t6 * win_bt[33];
		s_p[16]= t7 * win_bt[34];
		s_p[17]= t8 * win_bt[35];
	}
}

/*
 * Convert one dewindowed sum to a 16-bit PCM sample.
 *
 * The value is clamped while it is still a float: converting a float that
 * does not fit in an int is undefined behaviour in C, so the range check
 * has to come before the conversion, not after it.  In-range values are
 * truncated toward zero; a NaN produces 0.
 */
static short clip_sample(float v)
{
	if (v != v)		/* NaN compares unequal to itself */
		return 0;
	if (v >= 32767.0f)
		return 32767;
	if (v <= -32768.0f)
		return -32768;
	return (short) (int) v;
}

/* fast DCT according to Lee[84]
 * reordering according to Konstantinides[94]
 *
 * Polyphase synthesis for one time slot of one channel.
 *
 * Takes the 32 subband values res[0..31][f], runs them through the
 * butterfly network (coefficients b1..b31), stores the results in column
 * u_start[ch] of the static buffer u[ch][][][], then dewindows with
 * t_dewindow and writes 32 clipped samples to sample_buffer[f][..][ch],
 * nch shorts apart.
 *
 *   ch  channel index; selects the per-channel state (the static arrays
 *       hold two channels)
 *   f   second index into res[][] and first index into sample_buffer
 *
 * State is kept in function-static arrays and is advanced at the end of
 * every call: the column index steps backwards modulo 16 and the active
 * half of u[ch] is toggled.
 */
void poly(const int ch,int f)
{
	static float u[2][2][17][16]; /* no v[][], it's redundant */
	static int u_start[2]={0,0}; /* first element of u[][] */
	static int u_div[2]={0,0}; /* which part of u[][] is currently used */
	int start = u_start[ch];
	int div = u_div[ch];
	float *u_p;

	{
		float d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31;
		float d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15;
	
		/* step 1: initial reordering and 1st (16 wide) butterflies
		*/
	
		/* r[k*18] addresses res[k][f]; this relies on each row of res
		 * being 18 floats long */
		float *r=&res[ 0][f];
		float t;
		
		d0 =r[ 0*18]; t = r[31*18]; d16=(d0  - t) *  b1; d0 += t;
		d1 =r[ 1*18]; t = r[30*18]; d17=(d1  - t) *  b3; d1 += t;
		d3 =r[ 2*18]; t = r[29*18]; d19=(d3  - t) *  b5; d3 += t;
		d2 =r[ 3*18]; t = r[28*18]; d18=(d2  - t) *  b7; d2 += t;
		d6 =r[ 4*18]; t = r[27*18]; d22=(d6  - t) *  b9; d6 += t;
		d7 =r[ 5*18]; t = r[26*18]; d23=(d7  - t) * b11; d7 += t;
		d5 =r[ 6*18]; t = r[25*18]; d21=(d5  - t) * b13; d5 += t;
		d4 =r[ 7*18]; t = r[24*18]; d20=(d4  - t) * b15; d4 += t;
		d12=r[ 8*18]; t = r[23*18]; d28=(d12 - t) * b17; d12+= t;
		d13=r[ 9*18]; t = r[22*18]; d29=(d13 - t) * b19; d13+= t;
		d15=r[10*18]; t = r[21*18]; d31=(d15 - t) * b21; d15+= t;
		d14=r[11*18]; t = r[20*18]; d30=(d14 - t) * b23; d14+= t;
		d10=r[12*18]; t = r[19*18]; d26=(d10 - t) * b25; d10+= t;
		d11=r[13*18]; t = r[18*18]; d27=(d11 - t) * b27; d11+= t;
		d9 =r[14*18]; t = r[17*18]; d25=(d9  - t) * b29; d9 += t;
		d8 =r[15*18]; t = r[16*18]; d24=(d8  - t) * b31; d8 += t;
	
		{
			float c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;
		
		/* a test to see what can be done with memory separation
		 * first we process indexes 0-15
		*/
			c0 = d0 + d8 ; c8 = ( d0 - d8 ) *  b2;
			c1 = d1 + d9 ; c9 = ( d1 - d9 ) *  b6;
			c2 = d2 + d10; c10= ( d2 - d10) * b14;
			c3 = d3 + d11; c11= ( d3 - d11) * b10;
			c4 = d4 + d12; c12= ( d4 - d12) * b30;
			c5 = d5 + d13; c13= ( d5 - d13) * b26;
			c6 = d6 + d14; c14= ( d6 - d14) * b18;
			c7 = d7 + d15; c15= ( d7 - d15) * b22;
			
		/* step 3: 4-wide butterflies
		*/
			d0 = c0 + c4 ; d4 = ( c0 - c4 ) *  b4;
			d1 = c1 + c5 ; d5 = ( c1 - c5 ) * b12;
			d2 = c2 + c6 ; d6 = ( c2 - c6 ) * b28;
			d3 = c3 + c7 ; d7 = ( c3 - c7 ) * b20;
			
			d8 = c8 + c12; d12= ( c8 - c12) *  b4;
			d9 = c9 + c13; d13= ( c9 - c13) * b12;
			d10= c10+ c14; d14= (c10 - c14) * b28;
			d11= c11+ c15; d15= (c11 - c15) * b20;
		
		/* step 4: 2-wide butterflies
		*/
			{
				float rb8=b8;
				float rb24=b24;			
				c0 = d0 + d2 ; c2 = ( d0 - d2 ) *  rb8;
				c1 = d1 + d3 ; c3 = ( d1 - d3 ) * rb24;
				c4 = d4 + d6 ; c6 = ( d4 - d6 ) *  rb8;
				c5 = d5 + d7 ; c7 = ( d5 - d7 ) * rb24;
				c8 = d8 + d10; c10= ( d8 - d10) *  rb8;
				c9 = d9 + d11; c11= ( d9 - d11) * rb24;
				c12= d12+ d14; c14= (d12 - d14) *  rb8;
				c13= d13+ d15; c15= (d13 - d15) * rb24;
			}
					
		/* step 5: 1-wide butterflies
		*/
			{
				float rb16 = b16;
			
				/* this is a little 'hacked up': part of the
				 * post-additions is folded into the butterflies,
				 * so the statement order below matters
				*/
				d0  = (-c0-c1) * 2; d1 = ( c0 - c1 ) * rb16; 
				d2  = c2 + c3; d3 = ( c2 - c3 ) * rb16; 
			 	d3  = d3 - d2;
			
				d4  =  c4 +c5; d5 = ( c4 - c5 ) * rb16;
				d5  += d4;
				d7  = -d5;
				d7  += ( c6 - c7 ) * rb16; d6 = +c6 +c7;
			
				d8  =  c8 + c9 ; d9 = ( c8 - c9 ) * rb16;
				d11 =  d8 +d9;
				d11 += (c10 - c11) * rb16; d10= c10+ c11; 
			
				d12 = c12 + c13; d13 = (c12 - c13) * rb16;
				d13 += -d8 - d9 + d12;
				d14 = c14 + c15; d15 = (c14 - c15) * rb16;
				d15 -= d11;
				d14 += -  d8 - d10;
			}
	
        /* step 6: final resolving & reordering
         * the other 32 are stored for use with the next granule
         */
	
	        /* u_p[k*16] addresses u[ch][div][k][start] */
	        u_p = (float *) &u[ch][div][0][start];
	
	/*16*/  u_p[ 0*16] = +d1 ;
	        u_p[ 2*16] = +d9 -d14;
	/*20*/  u_p[ 4*16] = +d5 -d6;
	        u_p[ 6*16] = -d10 +d13;
	/*24*/  u_p[ 8*16] = +d3;
	        u_p[10*16] = -d8 -d9 +d11 -d13;
	/*28*/  u_p[12*16] = +d7;
	        u_p[14*16] = +d15;
	
        /* the other 32 are stored for use with the next granule
         */
	
	        u_p = (float *) &u[ch][!div][0][start];
	
	/*0*/   u_p[16*16] = +d0;
	        u_p[14*16] = -d8;
	/*4*/   u_p[12*16] = -d4;
	        u_p[10*16] = +d8 -d12;
	/*8*/   u_p[ 8*16] = -d2;
	        u_p[ 6*16] = -d8 -d10 +d12;
	/*12*/  u_p[ 4*16] = +d4 -d6;
	        u_p[ 2*16] = -d14;
	        u_p[ 0*16] = -d1;
		}
	
		{
			float c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15;
	
/* memory separation, second part
*/
/* 2
*/
	        c0=d16 + d24; c8 = (d16 - d24) *  b2;
	        c1=d17 + d25; c9 = (d17 - d25) *  b6;
	        c2=d18 + d26; c10= (d18 - d26) * b14;
	        c3=d19 + d27; c11= (d19 - d27) * b10;
	        c4=d20 + d28; c12= (d20 - d28) * b30;
	        c5=d21 + d29; c13= (d21 - d29) * b26;
	        c6=d22 + d30; c14= (d22 - d30) * b18;
	        c7=d23 + d31; c15= (d23 - d31) * b22;
/* 3
*/
	        d16= c0 + c4;  d20= (c0  - c4 ) *  b4;
	        d17= c1 + c5;  d21= (c1  - c5 ) * b12;
	        d18= c2 + c6;  d22= (c2  - c6 ) * b28;
	        d19= c3 + c7;  d23= (c3  - c7 ) * b20;
	        
	        d24= c8 + c12; d28= (c8  - c12) *  b4;
	        d25= c9 + c13; d29= (c9  - c13) * b12;
	        d26= c10+ c14; d30= (c10 - c14) * b28;
	        d27= c11+ c15; d31= (c11 - c15) * b20;	
/* 4
*/
			{
				float rb8=b8;
				float rb24=b24;			

				c0 = d16+ d18; c2 = (d16 - d18) *  rb8;
				c1 = d17+ d19; c3 = (d17 - d19) * rb24;
				c4 = d20+ d22; c6 = (d20 - d22) *  rb8;
				c5 = d21+ d23; c7 = (d21 - d23) * rb24;
	
				c8 = d24+ d26; c10= (d24 - d26) *  rb8;
				c9 = d25+ d27; c11= (d25 - d27) * rb24;
				c12= d28+ d30; c14= (d28 - d30) *  rb8;
				c13= d29+ d31; c15= (d29 - d31) * rb24;
			}
/* 5
*/
			{
				float rb16 = b16;
				d16= c0+ c1; d17= (c0 - c1) * rb16;
				d18= c2+ c3; d19= (c2 - c3) * rb16;
				
				d20= c4+ c5; d21= (c4 - c5) * rb16;
				d20+=d16; d21+=d17;
				d22= c6+ c7; d23= (c6 - c7) * rb16;
				d22+=d16; d22+=d18;
				d23+=d16; d23+=d17; d23+=d19;
				
				
				d24= c8+ c9; d25= (c8 - c9) * rb16;
				d26= c10+ c11; d27= (c10 - c11) * rb16;
				d26+=d24;
				d27+=d24; d27+=d25;
				
				d28= c12+ c13; d29= (c12 - c13) * rb16;
				d28-=d20; d29+=d28; d29-=d21;
				d30= c14+ c15; d31= (c14 - c15) * rb16;
				d30-=d22;
				d31-=d23;
			}
	
		/* step 6: final resolving & reordering 
		 * the other 32 are stored for use with the next granule
		 */
			
			u_p = (float *) &u[ch][!div][0][start];
			
			u_p[ 1*16] = -d30;	
			u_p[ 3*16] = -d22 +d26;
			u_p[ 5*16] = +d18 +d20 -d26;
			u_p[ 7*16] = -d18 +d28;
			u_p[ 9*16] = -d28;
			u_p[11*16] = -d20 +d24;
			u_p[13*16] = +d16 -d24;
			u_p[15*16] = -d16;
			
		/* the other 32 are stored for use with the next granule
		 */
			
			u_p = (float *) &u[ch][div][0][start];
			
			u_p[15*16] = +d31;
			u_p[13*16] = +d23 -d27;
			u_p[11*16] = -d19 -d20 -d21 +d27;
			u_p[ 9*16] = +d19 -d29;
			u_p[ 7*16] = -d18 +d29;
			u_p[ 5*16] = +d18 +d20 +d21 -d25 -d26;
			u_p[ 3*16] = -d17 -d22 +d25 +d26;
			u_p[ 1*16] = +d17 -d30;
		}
	}

	/* we're doing dewindowing and calculating final samples now
	 */

#ifdef SUPERFAST
/* 
	Tinic: This should be the fastest possible method, especially
	for processors with a rich amount of registers like PPCs. 

	Rows 1..15 of u[ch][div] each yield two output samples: sample k
	(forward sum through dewindow2) and sample 32-k (reversed sum with
	alternating signs through dewindow1).  Rows 0 and 16 yield one
	sample each.
*/
	{
		float u0,u1,u2,u3,u4,u5,u6,u7,u8,u9,u10,u11,u12,u13,u14,u15;
		short *samples = &sample_buffer[f][0][ch];
		float *u_ptr = (float *)u[ch][div];
		const float *dewindow1 = t_dewindow[0] + 16 - start;
		const float *dewindow2 = dewindow1;
		int upper = 16*nch;
		int upper2 = 2*upper;
		int start2 = 2*start;
		float out;	/* kept as float until clip_sample() has clamped it */
		int j;

		out = + u_ptr[ 0] * dewindow1[0x0]
		      + u_ptr[ 1] * dewindow1[0x1]
		      + u_ptr[ 2] * dewindow1[0x2]
		      + u_ptr[ 3] * dewindow1[0x3]
		      + u_ptr[ 4] * dewindow1[0x4]
		      + u_ptr[ 5] * dewindow1[0x5]
		      + u_ptr[ 6] * dewindow1[0x6]
		      + u_ptr[ 7] * dewindow1[0x7]
		      + u_ptr[ 8] * dewindow1[0x8]
		      + u_ptr[ 9] * dewindow1[0x9]
		      + u_ptr[10] * dewindow1[0xa]
		      + u_ptr[11] * dewindow1[0xb]
		      + u_ptr[12] * dewindow1[0xc]
	 	      + u_ptr[13] * dewindow1[0xd]
		      + u_ptr[14] * dewindow1[0xe]
		      + u_ptr[15] * dewindow1[0xf];

		samples[0] = clip_sample(out);

		/* from here on dewindow1 uses column offset `start` (for the
		 * reversed sums) instead of `16 - start` */
		dewindow1 = dewindow1 - 16 + start2;	
			
		if (div & 0x1) 
		{
			for (j=nch;j<upper;j+=nch) 
			{
				u_ptr     += 16;
				dewindow2 += 32;	
				dewindow1 += 32;				

				u0 =u_ptr[ 0]; u1 =u_ptr[ 1];
				u2 =u_ptr[ 2]; u3 =u_ptr[ 3];
				u4 =u_ptr[ 4]; u5 =u_ptr[ 5];
				u6 =u_ptr[ 6]; u7 =u_ptr[ 7];
				u8 =u_ptr[ 8]; u9 =u_ptr[ 9];
				u10=u_ptr[10]; u11=u_ptr[11];
				u12=u_ptr[12]; u13=u_ptr[13];
				u14=u_ptr[14]; u15=u_ptr[15];

				out = + u0  * dewindow2[0x0]
				      + u1  * dewindow2[0x1]
				      + u2  * dewindow2[0x2]
				      + u3  * dewindow2[0x3]
				      + u4  * dewindow2[0x4]
				      + u5  * dewindow2[0x5]
				      + u6  * dewindow2[0x6]
				      + u7  * dewindow2[0x7]
				      + u8  * dewindow2[0x8]
				      + u9  * dewindow2[0x9]
				      + u10 * dewindow2[0xa]
				      + u11 * dewindow2[0xb]
				      + u12 * dewindow2[0xc]
			 	      + u13 * dewindow2[0xd]
				      + u14 * dewindow2[0xe]
				      + u15 * dewindow2[0xf];

				samples[j] = clip_sample(out);

				out = + u15 * dewindow1[0x0]
				      - u14 * dewindow1[0x1]
				      + u13 * dewindow1[0x2]
				      - u12 * dewindow1[0x3]
				      + u11 * dewindow1[0x4]
				      - u10 * dewindow1[0x5]
				      + u9  * dewindow1[0x6]
				      - u8  * dewindow1[0x7]
				      + u7  * dewindow1[0x8]
			  	      - u6  * dewindow1[0x9]
				      + u5  * dewindow1[0xa]
				      - u4  * dewindow1[0xb]
				      + u3  * dewindow1[0xc]
				      - u2  * dewindow1[0xd]
				      + u1  * dewindow1[0xe]
				      - u0  * dewindow1[0xf];
				
				samples[upper2-j] = clip_sample(out);
			}

			u_ptr  += 16;

			/* row 16, back to column offset `16 - start` */
			dewindow1 = dewindow1 + 48 - start2;

			out = + u_ptr[ 0] * dewindow1[0x0]
			      + u_ptr[ 2] * dewindow1[0x2]
			      + u_ptr[ 4] * dewindow1[0x4]
			      + u_ptr[ 6] * dewindow1[0x6]
			      + u_ptr[ 8] * dewindow1[0x8]
			      + u_ptr[10] * dewindow1[0xa]
			      + u_ptr[12] * dewindow1[0xc]
			      + u_ptr[14] * dewindow1[0xe];

			samples[upper] = clip_sample(out);
		} 
		else 
		{		
			for (j=nch;j<upper;j+=nch) 
			{
				u_ptr     += 16;
				dewindow2 += 32;	
				dewindow1 += 32;				

				u0 =u_ptr[ 0]; u1 =u_ptr[ 1];
				u2 =u_ptr[ 2]; u3 =u_ptr[ 3];
				u4 =u_ptr[ 4]; u5 =u_ptr[ 5];
				u6 =u_ptr[ 6]; u7 =u_ptr[ 7];
				u8 =u_ptr[ 8]; u9 =u_ptr[ 9];
				u10=u_ptr[10]; u11=u_ptr[11];
				u12=u_ptr[12]; u13=u_ptr[13];
				u14=u_ptr[14]; u15=u_ptr[15];

				out = + u0  * dewindow2[0x0]
				      + u1  * dewindow2[0x1]
				      + u2  * dewindow2[0x2]
				      + u3  * dewindow2[0x3]
				      + u4  * dewindow2[0x4]
				      + u5  * dewindow2[0x5]
				      + u6  * dewindow2[0x6]
				      + u7  * dewindow2[0x7]
				      + u8  * dewindow2[0x8]
				      + u9  * dewindow2[0x9]
				      + u10 * dewindow2[0xa]
				      + u11 * dewindow2[0xb]
				      + u12 * dewindow2[0xc]
			 	      + u13 * dewindow2[0xd]
				      + u14 * dewindow2[0xe]
				      + u15 * dewindow2[0xf];

				samples[j] = clip_sample(out);

				out = - u15 * dewindow1[0x0]
				      + u14 * dewindow1[0x1]
			   	      - u13 * dewindow1[0x2]
				      + u12 * dewindow1[0x3]
				      - u11 * dewindow1[0x4]
				      + u10 * dewindow1[0x5]
				      - u9  * dewindow1[0x6]
				      + u8  * dewindow1[0x7]
				      - u7  * dewindow1[0x8]
				      + u6  * dewindow1[0x9]
				      - u5  * dewindow1[0xa]
				      + u4  * dewindow1[0xb]
				      - u3  * dewindow1[0xc]
				      + u2  * dewindow1[0xd]
				      - u1  * dewindow1[0xe]
				      + u0  * dewindow1[0xf];
				
				samples[upper2-j] = clip_sample(out);
			}

			u_ptr += 16;

			/* row 16, back to column offset `16 - start` */
			dewindow1 = dewindow1 + 48 - start2;

			out = + u_ptr[ 1] * dewindow1[0x1]
			      + u_ptr[ 3] * dewindow1[0x3]
			      + u_ptr[ 5] * dewindow1[0x5]
			      + u_ptr[ 7] * dewindow1[0x7]
			      + u_ptr[ 9] * dewindow1[0x9]
			      + u_ptr[11] * dewindow1[0xb]
			      + u_ptr[13] * dewindow1[0xd]
			      + u_ptr[15] * dewindow1[0xf];

			samples[upper] = clip_sample(out);
		}
	}
#else
/*
	Older and slower version. Could be faster on a Intel.
*/
	{
	
/* store one clipped sample and step to this channel's next output slot */
#define PUT_SAMPLE(out)	 \
		do { \
			samples[0] = clip_sample(out); \
			samples += nch; \
		} while (0)

		float out;	/* kept as float until clip_sample() has clamped it */
		int j;
		short *samples = &sample_buffer[f][0][ch];
		float *u_ptr = (float *)u[ch][div];
		const float *dewindow = t_dewindow[0] + 16 - start;
		
		for (j=0;j<16;++j) 
		{
			out = u_ptr[ 0] * dewindow[0x0]
			    + u_ptr[ 1] * dewindow[0x1]
			    + u_ptr[ 2] * dewindow[0x2]
			    + u_ptr[ 3] * dewindow[0x3]
			    + u_ptr[ 4] * dewindow[0x4]
			    + u_ptr[ 5] * dewindow[0x5]
			    + u_ptr[ 6] * dewindow[0x6]
			    + u_ptr[ 7] * dewindow[0x7]
			    + u_ptr[ 8] * dewindow[0x8]
			    + u_ptr[ 9] * dewindow[0x9]
			    + u_ptr[10] * dewindow[0xa]
			    + u_ptr[11] * dewindow[0xb]
			    + u_ptr[12] * dewindow[0xc]
		 	    + u_ptr[13] * dewindow[0xd]
			    + u_ptr[14] * dewindow[0xe]
			    + u_ptr[15] * dewindow[0xf];

			dewindow += 32;
			u_ptr += 16;
			
			PUT_SAMPLE(out);
		}
			
		if (div & 0x1) 
		{
			out = u_ptr[ 0] * dewindow[0x0]
			    + u_ptr[ 2] * dewindow[0x2]
			    + u_ptr[ 4] * dewindow[0x4]
			    + u_ptr[ 6] * dewindow[0x6]
			    + u_ptr[ 8] * dewindow[0x8]
			    + u_ptr[10] * dewindow[0xa]
			    + u_ptr[12] * dewindow[0xc]
			    + u_ptr[14] * dewindow[0xe];
			
			PUT_SAMPLE(out);
			
			/* step back to row 15 and switch to column offset `start` */
			dewindow = dewindow - 48 + start*2;
			
			for (;j<31;++j) 
			{
				u_ptr -= 16;
				
				out =-u_ptr[ 0] * dewindow[0xf]
				    + u_ptr[ 1] * dewindow[0xe]
				    - u_ptr[ 2] * dewindow[0xd]
				    + u_ptr[ 3] * dewindow[0xc]
				    - u_ptr[ 4] * dewindow[0xb]
				    + u_ptr[ 5] * dewindow[0xa]
			  	    - u_ptr[ 6] * dewindow[0x9]
				    + u_ptr[ 7] * dewindow[0x8]
				    - u_ptr[ 8] * dewindow[0x7]
				    + u_ptr[ 9] * dewindow[0x6]
				    - u_ptr[10] * dewindow[0x5]
				    + u_ptr[11] * dewindow[0x4]
				    - u_ptr[12] * dewindow[0x3]
				    + u_ptr[13] * dewindow[0x2]
				    - u_ptr[14] * dewindow[0x1]
				    + u_ptr[15] * dewindow[0x0];
				
				dewindow -= 32;

				PUT_SAMPLE(out);
			}
		} 
		else 
		{
			out = u_ptr[ 1] * dewindow[0x1]
			    + u_ptr[ 3] * dewindow[0x3]
			    + u_ptr[ 5] * dewindow[0x5]
			    + u_ptr[ 7] * dewindow[0x7]
			    + u_ptr[ 9] * dewindow[0x9]
			    + u_ptr[11] * dewindow[0xb]
			    + u_ptr[13] * dewindow[0xd]
			    + u_ptr[15] * dewindow[0xf];
			
			PUT_SAMPLE(out);
				
			/* step back to row 15 and switch to column offset `start` */
			dewindow = dewindow - 48 + start*2;
				
			for (;j<31;++j) 
			{
				u_ptr -= 16;
				
				out = u_ptr[ 0] * dewindow[0xf]
				    - u_ptr[ 1] * dewindow[0xe]
				    + u_ptr[ 2] * dewindow[0xd]
				    - u_ptr[ 3] * dewindow[0xc]
				    + u_ptr[ 4] * dewindow[0xb]
				    - u_ptr[ 5] * dewindow[0xa]
				    + u_ptr[ 6] * dewindow[0x9]
				    - u_ptr[ 7] * dewindow[0x8]
				    + u_ptr[ 8] * dewindow[0x7]
				    - u_ptr[ 9] * dewindow[0x6]
				    + u_ptr[10] * dewindow[0x5]
				    - u_ptr[11] * dewindow[0x4]
				    + u_ptr[12] * dewindow[0x3]
			   	    - u_ptr[13] * dewindow[0x2]
				    + u_ptr[14] * dewindow[0x1]
				    - u_ptr[15] * dewindow[0x0];

				dewindow -= 32;
				
				PUT_SAMPLE(out);
			}
		}
#undef PUT_SAMPLE
	}
#endif	
	/* advance the per-channel state: step the column index backwards
	 * (wrapping 0 -> 15) and switch to the other half of u[ch] */
	--u_start[ch];
	u_start[ch] &= 0xf;
	u_div[ch]^=1;
}

/*
 * Scale every coefficient of the 17 x 32 dewindowing table t_dewindow by
 * 16383.5 in place.  The table is modified, so each call scales it again.
 */
void premultiply()
{
  int row;

  for (row = 0; row < 17; row++) {
    float *coef = t_dewindow[row];
    int left = 32;

    while (left-- > 0) {
      *coef *= 16383.5f;
      coef++;
    }
  }
}
