_real_f_f_tf48x_8cpp_source.html

/**********************************************************************


   Audacity: A Digital Audio Editor


   RealFFT48x.cpp


   Philip Van Baren

   Andrew Hallendorff (SSE Mods)


*******************************************************************//****************************************************************/


/*

*     Program: REALFFTF.C

*      Author: Philip Van Baren

*        Date: 2 September 1993

*

* Description: These routines perform an FFT on real data to get a conjugate-symmetric

*              output, and an inverse FFT on conjugate-symmetric input to get a real

*              output sequence.

*

*              This code is for floating point data.

*

*              Modified 8/19/1998 by Philip Van Baren

*                 - made the InitializeFFT and EndFFT routines take a structure

*                   holding the length and pointers to the BitReversed and SinTable

*                   tables.

*              Modified 5/23/2009 by Philip Van Baren

*                 - Added GetFFT and ReleaseFFT routines to retain common SinTable

*                   and BitReversed tables so they don't need to be reallocated

*                   and recomputed on every call.

*                 - Added Reorder* functions to undo the bit-reversal

*              Modified 15 April 2016 Paul Licameli

*                 - C++11 smart pointers

*

*  Copyright (C) 2009  Philip VanBaren

*

*  This program is free software; you can redistribute it and/or modify

*  it under the terms of the GNU General Public License as published by

*  the Free Software Foundation; either version 2 of the License, or

*  (at your option) any later version.

*

*  This program is distributed in the hope that it will be useful,

*  but WITHOUT ANY WARRANTY; without even the implied warranty of

*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

*  GNU General Public License for more details.

*

*  You should have received a copy of the GNU General Public License

*  along with this program; if not, write to the Free Software

*  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

*/


#include "RealFFTf48x.h"


#ifdef EXPERIMENTAL_EQ_SSE_THREADED


// at the moment these two are mutually exclusive

//#define USE_SSEMATHFUNC

//#define TEST_COSSINBIT_TABLES


#ifndef USE_SSE2

#define  USE_SSE2

#endif


#include <stdlib.h>

#include <math.h>

#include "RealFFTf.h"

#ifdef __WXMSW__

#pragma warning(disable:4305)

#else


#endif

#include "SseMathFuncs.h"

#include <intrin.h>


#ifndef M_PI

#define  M_PI     3.14159265358979323846  /* pi */

#endif


// Mask most recently handed to TableUsage(), together with its two decoded flags.
int tableMask = 0;
bool useBitReverseTable = false;
bool useSinCosTable = false;

/* Record which lookup tables are requested.
 *
 *   iMask bit 0 (value 1) -> useBitReverseTable
 *   iMask bit 1 (value 2) -> useSinCosTable
 *
 * The unmodified mask is also stored in tableMask. Other bits are kept in
 * tableMask but do not influence either flag.
 */
void TableUsage(int iMask)
{
   enum { kBitReverseBit = 1, kSinCosBit = 2 };

   useSinCosTable = ((iMask & kSinCosBit) == kSinCosBit);
   useBitReverseTable = ((iMask & kBitReverseBit) == kBitReverseBit);
   tableMask = iMask;
}


/* Build the negated sine/cosine lookup table.
 *
 * The table has 2^mSinCosTablePow entries (mSinCosTablePow is fixed at 13).
 * Entry i stores -sin(i*pi/size) in mSin and -cos(i*pi/size) in mCos, so the
 * table covers angles in [0, pi).
 */
SinCosTable::SinCosTable() :
mSinCosTablePow(13)
{
   const size_t entryCount = 1 << mSinCosTablePow;
   mSinCosTable.reinit(entryCount);

   for (size_t index = 0; index != entryCount; ++index) {
      // The index is converted to float first, then promoted by the double M_PI.
      const double angle = ((float)index) * M_PI / entryCount;
      mSinCosTable[index].mSin = (float)-sin(angle);
      mSinCosTable[index].mCos = (float)-cos(angle);
   }
}

// File-local table instance, constructed during static initialisation.
static SinCosTable sSinCosTable;


// sSmallRBTable[v] is the byte v with its 8 bits in reversed order.
static unsigned char sSmallRBTable[256];

/* Helper whose constructor fills sSmallRBTable.
 * A single file-local instance (sBitReverser) exists so the table is
 * populated during static initialisation.
 */
class BitReverser {
   public:
      BitReverser()
      {
         for (int value = 0; value < 256; ++value) {
            unsigned char mirrored = 0;
            // Bit k of the input lands on bit (7 - k) of the output.
            for (int bit = 0; bit < 8; ++bit) {
               if ((value >> bit) & 1)
                  mirrored |= static_cast<unsigned char>(0x80 >> bit);
            }
            sSmallRBTable[value] = mirrored;
         }
      }
   };

static BitReverser sBitReverser;


/* Fixed-width bit reversal helpers, one function per width, collected in the
 * SmallVRB[] table so a width can be selected by index:
 *
 *    SmallVRB[n](bits)  ->  the low n bits of `bits`, in reversed order
 *
 * Widths 1..8 index sSmallRBTable directly, so `bits` must be in 0..255.
 * Widths 9..16 reverse the two low-order bytes of `bits` separately and
 * recombine them; higher bytes are ignored.
 *
 * The byte extraction uses mask/shift on the value rather than reading the
 * int's storage through an unsigned char pointer, so the result does not
 * depend on the byte order of the host.
 */

// Reversed image of bits 0..7 of `bits`.
static inline int SmallVRBLowByte(int bits)
{
   return sSmallRBTable[static_cast<unsigned>(bits) & 0xFFu];
}

// Reversed image of bits 8..15 of `bits`.
static inline int SmallVRBHighByte(int bits)
{
   return sSmallRBTable[(static_cast<unsigned>(bits) >> 8) & 0xFFu];
}

int SmallVRB0(int bits) { return bits; }
int SmallVRB1(int bits) { return sSmallRBTable[bits]>>7; }
int SmallVRB2(int bits) { return sSmallRBTable[bits]>>6; }
int SmallVRB3(int bits) { return sSmallRBTable[bits]>>5; }
int SmallVRB4(int bits) { return sSmallRBTable[bits]>>4; }
int SmallVRB5(int bits) { return sSmallRBTable[bits]>>3; }
int SmallVRB6(int bits) { return sSmallRBTable[bits]>>2; }
int SmallVRB7(int bits) { return sSmallRBTable[bits]>>1; }
int SmallVRB8(int bits) { return sSmallRBTable[bits]; }
int SmallVRB9(int bits)  { return (SmallVRBLowByte(bits)<<1)+(SmallVRBHighByte(bits)>>7); }
int SmallVRB10(int bits) { return (SmallVRBLowByte(bits)<<2)+(SmallVRBHighByte(bits)>>6); }
int SmallVRB11(int bits) { return (SmallVRBLowByte(bits)<<3)+(SmallVRBHighByte(bits)>>5); }
int SmallVRB12(int bits) { return (SmallVRBLowByte(bits)<<4)+(SmallVRBHighByte(bits)>>4); }
int SmallVRB13(int bits) { return (SmallVRBLowByte(bits)<<5)+(SmallVRBHighByte(bits)>>3); }
int SmallVRB14(int bits) { return (SmallVRBLowByte(bits)<<6)+(SmallVRBHighByte(bits)>>2); }
int SmallVRB15(int bits) { return (SmallVRBLowByte(bits)<<7)+(SmallVRBHighByte(bits)>>1); }
int SmallVRB16(int bits) { return (SmallVRBLowByte(bits)<<8)+(SmallVRBHighByte(bits)); }

// Indexed by bit width (0..16).
int (*SmallVRB[])(int bits) = { SmallVRB0, SmallVRB1, SmallVRB2, SmallVRB3, SmallVRB4,
   SmallVRB5, SmallVRB6, SmallVRB7, SmallVRB8, SmallVRB9, SmallVRB10,
   SmallVRB11, SmallVRB12, SmallVRB13, SmallVRB14,SmallVRB15, SmallVRB16 };


/* Reverse the low `numberBits` bits of `bits` (run-time width variant).
 *
 * The two low-order bytes of `bits` are reversed via sSmallRBTable and
 * combined into a 16-bit reversed value, which is then shifted right by
 * (16 - numberBits). Only widths up to 16 are therefore meaningful; higher
 * bytes of `bits` are ignored.
 *
 * Bytes are extracted with mask/shift rather than by reading the int's
 * storage through an unsigned char pointer, so the result does not depend
 * on host byte order.
 */
int SmallRB(int bits, int numberBits)
{
   const unsigned value = static_cast<unsigned>(bits);
   const int reversed16 =
      (sSmallRBTable[value & 0xFFu] << 8) + sSmallRBTable[(value >> 8) & 0xFFu];
   return reversed16 >> (16 - numberBits);
}


/* Wrapper functions: each selects an implementation from functionType.
 * Any value that is not one of the listed cases (the original note mentions
 * passing -1) takes the default branch, i.e. the SinCosBRTable variant.
 */

// Forward real FFT, one channel: forwards (buffer, h) to the chosen variant.
void RealFFTf1x(fft_type *buffer, FFTParam *h, int functionType)
{
   switch (functionType) {
   case FFT_SinCosTableVBR16: RealFFTf1xSinCosTableVBR16(buffer, h); return;
   case FFT_SinCosTableBR16:  RealFFTf1xSinCosTableBR16(buffer, h);  return;
   case FFT_FastMathBR16:     RealFFTf1xFastMathBR16(buffer, h);     return;
   case FFT_FastMathBR24:     RealFFTf1xFastMathBR24(buffer, h);     return;
   case FFT_SinCosBRTable:
   default:
      break;
   }
   RealFFTf1xSinCosBRTable(buffer, h);
}


// Inverse real FFT, one channel: forwards (buffer, h) to the variant named by
// functionType; unlisted values use the SinCosBRTable variant.
void InverseRealFFTf1x(fft_type *buffer, FFTParam *h, int functionType)
{
   switch (functionType) {
   case FFT_SinCosTableVBR16: InverseRealFFTf1xSinCosTableVBR16(buffer, h); return;
   case FFT_SinCosTableBR16:  InverseRealFFTf1xSinCosTableBR16(buffer, h);  return;
   case FFT_FastMathBR16:     InverseRealFFTf1xFastMathBR16(buffer, h);     return;
   case FFT_FastMathBR24:     InverseRealFFTf1xFastMathBR24(buffer, h);     return;
   case FFT_SinCosBRTable:
   default:
      break;
   }
   InverseRealFFTf1xSinCosBRTable(buffer, h);
}


// Reorder to time domain, one channel: forwards (hFFT, buffer, TimeOut) to the
// variant named by functionType; unlisted values use the SinCosBRTable variant.
void ReorderToTime1x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType)
{
   switch (functionType) {
   case FFT_SinCosTableVBR16: ReorderToTime1xSinCosTableVBR16(hFFT, buffer, TimeOut); return;
   case FFT_SinCosTableBR16:  ReorderToTime1xSinCosTableBR16(hFFT, buffer, TimeOut);  return;
   case FFT_FastMathBR16:     ReorderToTime1xFastMathBR16(hFFT, buffer, TimeOut);     return;
   case FFT_FastMathBR24:     ReorderToTime1xFastMathBR24(hFFT, buffer, TimeOut);     return;
   case FFT_SinCosBRTable:
   default:
      break;
   }
   ReorderToTime1xSinCosBRTable(hFFT, buffer, TimeOut);
}


// Reorder to frequency domain, one channel: forwards its arguments to the
// variant named by functionType; unlisted values use the SinCosBRTable variant.
void ReorderToFreq1x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType)
{
   switch (functionType) {
   case FFT_SinCosTableVBR16: ReorderToFreq1xSinCosTableVBR16(hFFT, buffer, RealOut, ImagOut); return;
   case FFT_SinCosTableBR16:  ReorderToFreq1xSinCosTableBR16(hFFT, buffer, RealOut, ImagOut);  return;
   case FFT_FastMathBR16:     ReorderToFreq1xFastMathBR16(hFFT, buffer, RealOut, ImagOut);     return;
   case FFT_FastMathBR24:     ReorderToFreq1xFastMathBR24(hFFT, buffer, RealOut, ImagOut);     return;
   case FFT_SinCosBRTable:
   default:
      break;
   }
   ReorderToFreq1xSinCosBRTable(hFFT, buffer, RealOut, ImagOut);
}


// Forward real FFT, 4x (SIMD) layout: forwards (buffer, h) to the variant named
// by functionType; unlisted values use the SinCosBRTable variant.
void RealFFTf4x( fft_type *buffer, FFTParam *h, int functionType)
{
   switch (functionType) {
   case FFT_SinCosTableVBR16: RealFFTf4xSinCosTableVBR16(buffer, h); return;
   case FFT_SinCosTableBR16:  RealFFTf4xSinCosTableBR16(buffer, h);  return;
   case FFT_FastMathBR16:     RealFFTf4xFastMathBR16(buffer, h);     return;
   case FFT_FastMathBR24:     RealFFTf4xFastMathBR24(buffer, h);     return;
   case FFT_SinCosBRTable:
   default:
      break;
   }
   RealFFTf4xSinCosBRTable(buffer, h);
}


// Inverse real FFT, 4x (SIMD) layout: forwards (buffer, h) to the variant named
// by functionType; unlisted values use the SinCosBRTable variant.
void InverseRealFFTf4x( fft_type *buffer, FFTParam *h, int functionType)
{
   switch (functionType) {
   case FFT_SinCosTableVBR16: InverseRealFFTf4xSinCosTableVBR16(buffer, h); return;
   case FFT_SinCosTableBR16:  InverseRealFFTf4xSinCosTableBR16(buffer, h);  return;
   case FFT_FastMathBR16:     InverseRealFFTf4xFastMathBR16(buffer, h);     return;
   case FFT_FastMathBR24:     InverseRealFFTf4xFastMathBR24(buffer, h);     return;
   case FFT_SinCosBRTable:
   default:
      break;
   }
   InverseRealFFTf4xSinCosBRTable(buffer, h);
}


// Reorder to time domain, 4x (SIMD) layout: forwards (hFFT, buffer, TimeOut) to
// the variant named by functionType; unlisted values use the SinCosBRTable variant.
void ReorderToTime4x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType)
{
   switch (functionType) {
   case FFT_SinCosTableVBR16: ReorderToTime4xSinCosTableVBR16(hFFT, buffer, TimeOut); return;
   case FFT_SinCosTableBR16:  ReorderToTime4xSinCosTableBR16(hFFT, buffer, TimeOut);  return;
   case FFT_FastMathBR16:     ReorderToTime4xFastMathBR16(hFFT, buffer, TimeOut);     return;
   case FFT_FastMathBR24:     ReorderToTime4xFastMathBR24(hFFT, buffer, TimeOut);     return;
   case FFT_SinCosBRTable:
   default:
      break;
   }
   ReorderToTime4xSinCosBRTable(hFFT, buffer, TimeOut);
}


// Reorder to frequency domain, 4x (SIMD) layout: forwards its arguments to the
// variant named by functionType; unlisted values use the SinCosBRTable variant.
void ReorderToFreq4x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType)
{
   switch (functionType) {
   case FFT_SinCosTableVBR16: ReorderToFreq4xSinCosTableVBR16(hFFT, buffer, RealOut, ImagOut); return;
   case FFT_SinCosTableBR16:  ReorderToFreq4xSinCosTableBR16(hFFT, buffer, RealOut, ImagOut);  return;
   case FFT_FastMathBR16:     ReorderToFreq4xFastMathBR16(hFFT, buffer, RealOut, ImagOut);     return;
   case FFT_FastMathBR24:     ReorderToFreq4xFastMathBR24(hFFT, buffer, RealOut, ImagOut);     return;
   case FFT_SinCosBRTable:
   default:
      break;
   }
   ReorderToFreq4xSinCosBRTable(hFFT, buffer, RealOut, ImagOut);
}


#define REAL_SINCOSBRTABLE

#ifdef REAL_SINCOSBRTABLE


/*
*  Forward FFT routine.  Must call GetFFT(fftlen) first!
*
*  Variant that takes both its twiddle factors (h->SinTable) and its
*  bit-reversal indices (h->BitReversed) from the precomputed tables in h.
*  The transform is done in place on buffer, over 2 * h->Points values.
*
*  Note: Output is BIT-REVERSED! so you must use the BitReversed to
*        get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
*                                  Imag_i = buffer[ h->BitReversed[i]+1 ] )
*        Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
*  Note: The scaling on this is done according to the standard FFT definition,
*        so a unit amplitude DC signal will output an amplitude of (N)
*        (Older revisions would progressively scale the input, so the output
*        values would be similar in amplitude to the input values, which is
*        good when using fixed point arithmetic)
*
*  buffer: in/out data, overwritten with the result.
*  h:      FFT parameters; Points, SinTable and BitReversed are read.
*/
void RealFFTf1xSinCosBRTable(fft_type *buffer, FFTParam *h)
{
   fft_type *A,*B;
   fft_type *sptr;
   fft_type *endptr1,*endptr2;
   int *br1,*br2;
   fft_type HRplus,HRminus,HIplus,HIminus;
   fft_type v1,v2,sin,cos;

   // Distance (in complex pairs) between the A and B halves of a group;
   // halved after every pass until it reaches zero.
   auto ButterfliesPerGroup = h->Points / 2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   // One past the last value processed.
   endptr1 = buffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      // Each pass restarts at the beginning of the sin/cos table.
      sptr = h->SinTable.get();

      while(A < endptr1)
      {
         // One (sin, cos) pair per group; entries are interleaved in SinTable.
         sin = *sptr;
         cos = *(sptr + 1);
         endptr2 = B;
         while(A < endptr2)
         {
            v1 = *B * cos + *(B+1) * sin;
            v2 = *B * sin - *(B+1) * cos;
            // In place: B receives A+v1 first, then A is derived from the
            // freshly written B so that it ends up as (old A) - v1.
            *B = (*A + v1);
            *(A++) = *(B++) - 2 * v1;
            // Same trick for the second component, with the sign of v2 flipped.
            *B = (*A - v2);
            *(A++) = *(B++) + 2 * v2;
         }
         // Skip over the B half just processed to reach the next group.
         A = B;
         B += ButterfliesPerGroup * 2;
         sptr += 2;
      }
      ButterfliesPerGroup >>= 1;
   }
   /* Massage output to get the output for a real input sequence. */
   // br1 walks the bit-reversal table upwards from entry 1, br2 downwards
   // from entry Points-1; each iteration handles the pair of bins they name.
   br1 = h->BitReversed.get() + 1;
   br2 = h->BitReversed.get() + h->Points - 1;

   while(br1 < br2)
   {
      sin=h->SinTable[*br1];
      cos=h->SinTable[*br1+1];
      A=buffer+*br1;
      B=buffer+*br2;
      // HRminus/HIminus = A - B; HRplus/HIplus = A + B (formed as (A-B) + 2B).
      HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
      HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
      v1 = (sin*HRminus - cos*HIplus);
      v2 = (cos*HRminus + sin*HIplus);
      *A = (HRplus  + v1) * (fft_type)0.5;
      *B = *A - v1;
      *(A+1) = (HIminus + v2) * (fft_type)0.5;
      *(B+1) = *(A+1) - HIminus;

      br1++;
      br2--;
   }
   /* Handle the center bin (just need a conjugate) */
   // br1 now names the bin where the two walks met; negate its second value.
   A=buffer+*br1+1;
   *A=-*A;
   /* Handle DC bin separately - and ignore the Fs/2 bin
   buffer[0]+=buffer[1];
   buffer[1]=(fft_type)0;*/
   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1=buffer[0]-buffer[1];
   buffer[0]+=buffer[1];
   buffer[1]=v1;
}


/* Description: This routine performs an inverse FFT to real data.
*              This code is for floating point data.
*
*  Variant that takes both its twiddle factors (h->SinTable) and its
*  bit-reversal indices (h->BitReversed) from the precomputed tables in h.
*  The transform is done in place on buffer, over 2 * h->Points values.
*
*  Note: Output is BIT-REVERSED! so you must use the BitReversed to
*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
*        Input is in normal order, interleaved (real,imaginary) complex data
*        You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
*  Note: The scaling on this is done according to the standard FFT definition,
*        so a unit amplitude DC signal will output an amplitude of (N)
*        (Older revisions would progressively scale the input, so the output
*        values would be similar in amplitude to the input values, which is
*        good when using fixed point arithmetic)
*        NOTE(review): this paragraph is shared verbatim with the forward
*        routine; the butterflies below multiply by 0.5 on every pass, so
*        confirm the stated scaling actually applies to the inverse.
*
*  buffer: in/out data, overwritten with the result.
*  h:      FFT parameters; Points, SinTable and BitReversed are read.
*/
void InverseRealFFTf1xSinCosBRTable(fft_type *buffer, FFTParam *h)
{
   fft_type *A,*B;
   fft_type *sptr;
   fft_type *endptr1,*endptr2;
   int *br1;
   fft_type HRplus,HRminus,HIplus,HIminus;
   fft_type v1,v2,sin,cos;

   // Distance (in complex pairs) between the A and B halves of a group;
   // halved after every butterfly pass until it reaches zero.
   auto ButterfliesPerGroup = h->Points / 2;

   /* Massage input to get the input for a real output sequence. */
   // A walks upwards from the second complex pair, B downwards from the last;
   // br1 supplies the SinTable offset used for each pair.
   A = buffer + 2;
   B = buffer + h->Points * 2 - 2;
   br1 = h->BitReversed.get() + 1;
   while(A < B)
   {
      sin=h->SinTable[*br1];
      cos=h->SinTable[*br1+1];
      // HRminus/HIminus = A - B; HRplus/HIplus = A + B (formed as (A-B) + 2B).
      HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
      HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
      v1 = (sin*HRminus + cos*HIplus);
      v2 = (cos*HRminus - sin*HIplus);
      *A = (HRplus  + v1) * (fft_type)0.5;
      *B = *A - v1;
      *(A+1) = (HIminus - v2) * (fft_type)0.5;
      *(B+1) = *(A+1) - HIminus;

      A+=2;
      B-=2;
      br1++;
   }
   /* Handle center bin (just need conjugate) */
   // A is where the two walks met; negate the second value of that pair.
   *(A+1)=-*(A+1);
   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1=0.5f*(buffer[0]+buffer[1]);
   v2=0.5f*(buffer[0]-buffer[1]);
   buffer[0]=v1;
   buffer[1]=v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   // One past the last value processed.
   endptr1 = buffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      // Each pass restarts at the beginning of the sin/cos table.
      sptr = h->SinTable.get();

      while(A < endptr1)
      {
         // One (sin, cos) pair per group; entries are interleaved in SinTable.
         sin = *(sptr++);
         cos = *(sptr++);
         endptr2 = B;
         while(A < endptr2)
         {
            v1 = *B * cos - *(B + 1) * sin;
            v2 = *B * sin + *(B + 1) * cos;
            // In place: B receives (A+v)/2 first, then A is derived from the
            // freshly written B so that it ends up as (old A - v)/2.
            *B = (*A + v1) * (fft_type)0.5;
            *(A++) = *(B++) - v1;
            *B = (*A + v2) * (fft_type)0.5;
            *(A++) = *(B++) - v2;
         }
         // Skip over the B half just processed to reach the next group.
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }
}


/* Undo the bit-reversed ordering of a forward transform and split it into
 * separate real and imaginary arrays.
 *
 * Bins 1 .. Points-1 are fetched through hFFT->BitReversed. Bin 0 (DC) comes
 * from buffer[0] and bin Points (Fs/2) from buffer[1]; both get a zero
 * imaginary part. RealOut and ImagOut are written at indices 0 .. Points
 * inclusive.
 */
void ReorderToFreq1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   for (size_t bin = 1; bin < hFFT->Points; ++bin) {
      const fft_type *pair = &buffer[hFFT->BitReversed[bin]];
      RealOut[bin] = pair[0];
      ImagOut[bin] = pair[1];
   }

   // DC component
   RealOut[0] = buffer[0];
   ImagOut[0] = 0;

   // Fs/2 component
   RealOut[hFFT->Points] = buffer[1];
   ImagOut[hFFT->Points] = 0;
}


/* Undo the bit-reversed ordering of an inverse transform.
 *
 * For every i in 0 .. Points-1 the pair found at buffer[BitReversed[i]] is
 * copied to TimeOut[2*i] and TimeOut[2*i+1], so 2 * Points values are written.
 */
void ReorderToTime1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   for (size_t pairIndex = 0; pairIndex < hFFT->Points; ++pairIndex) {
      const fft_type *pair = &buffer[hFFT->BitReversed[pairIndex]];
      fft_type *dest = &TimeOut[pairIndex * 2];
      dest[0] = pair[0];
      dest[1] = pair[1];
   }
}


// 4x processing simd
//
// Forward FFT, SSE version of RealFFTf1xSinCosBRTable. buffer is reinterpreted
// as an array of __m128, so every step below operates on four floats at once;
// 2 * h->Points __m128 elements are processed in place. Twiddle factors come
// from h->SinTable (each scalar is broadcast to all four lanes) and the
// bit-reversal indices from h->BitReversed.
// NOTE(review): the __m128 accesses presumably require buffer to be 16-byte
// aligned and to hold four interleaved signals - confirm against the callers.
void RealFFTf4xSinCosBRTable(fft_type *buffer, FFTParam *h)
{

   __m128 *localBuffer=(__m128 *)buffer;

   __m128 *A,*B;
   fft_type *sptr;
   __m128 *endptr1,*endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   __m128 HRplus,HRminus,HIplus,HIminus;
   __m128 v1,v2,sin,cos;
   // Distance (in complex pairs) between the A and B halves of a group;
   // halved after every pass until it reaches zero.
   auto ButterfliesPerGroup = h->Points / 2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   // One past the last element processed.
   endptr1 = &localBuffer[h->Points * 2];

   while(ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = &localBuffer[ButterfliesPerGroup * 2];
      // Each pass restarts at the beginning of the sin/cos table.
      sptr = h->SinTable.get();
      while(A < endptr1)
      {
         // One (sin, cos) pair per group, broadcast to all lanes.
         sin = _mm_set1_ps(*(sptr++));
         cos = _mm_set1_ps(*(sptr++));
         endptr2 = B;
         while(A < endptr2)
         {
            v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
            v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
            // In place: B receives A+v1 first, then A is derived from the
            // freshly written B so that it ends up as (old A) - v1.
            *B = _mm_add_ps( *A, v1);
            __m128 temp128 = _mm_set1_ps( 2.0);
            *(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
            // Same trick for the second component, with the sign of v2 flipped.
            *B = _mm_sub_ps(*A,v2);
            *(A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
         }
         // Skip over the B half just processed to reach the next group.
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }
   /* Massage output to get the output for a real input sequence. */

   // Indices into h->BitReversed: br1Index walks upwards from 1, br2Index
   // downwards from Points-1; each iteration handles the pair of bins they name.
   br1Index = 1; // h->BitReversed + 1;
   br2Index = h->Points - 1;   //h->BitReversed + h->Points - 1;

   while(br1Index<br2Index)
   {
      br1Value=h->BitReversed[br1Index];
      br2Value=h->BitReversed[br2Index];
      sin=_mm_set1_ps(h->SinTable[br1Value]);
      cos=_mm_set1_ps(h->SinTable[br1Value+1]);
      A=&localBuffer[br1Value];
      B=&localBuffer[br2Value];
      __m128 temp128 = _mm_set1_ps( 2.0);
      // HRminus/HIminus = A - B; HRplus/HIplus = A + B (formed as (A-B) + 2B).
      HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
      HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
      v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      // temp128 is reused as the 0.5 scale factor from here on.
      temp128 = _mm_set1_ps( 0.5);
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      br1Index++;
      br2Index--;
   }
   /* Handle the center bin (just need a conjugate) */
   A=&localBuffer[h->BitReversed[br1Index]+1];
   // negate sse style
   // XOR with -0.0f flips only the sign bit of each lane.
   *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
   localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
   localBuffer[1]=v1;
}


/* Description: This routine performs an inverse FFT to real data.
*              This code is for floating point data.
*
*  SSE version of InverseRealFFTf1xSinCosBRTable: buffer is reinterpreted as
*  an array of __m128, so every step operates on four floats at once, and
*  2 * h->Points __m128 elements are processed in place. Twiddle factors come
*  from h->SinTable and bit-reversal indices from h->BitReversed.
*  NOTE(review): the __m128 accesses presumably require buffer to be 16-byte
*  aligned and to hold four interleaved signals - confirm against the callers.
*
*  Note: Output is BIT-REVERSED! so you must use the BitReversed to
*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
*        Input is in normal order, interleaved (real,imaginary) complex data
*        You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
*  Note: The scaling on this is done according to the standard FFT definition,
*        so a unit amplitude DC signal will output an amplitude of (N)
*        (Older revisions would progressively scale the input, so the output
*        values would be similar in amplitude to the input values, which is
*        good when using fixed point arithmetic)
*        NOTE(review): this paragraph is shared verbatim with the forward
*        routine; the butterflies below multiply by 0.5 on every pass, so
*        confirm the stated scaling actually applies to the inverse.
*/
void InverseRealFFTf4xSinCosBRTable(fft_type *buffer, FFTParam *h)
{

   __m128 *localBuffer=(__m128 *)buffer;

   __m128 *A,*B;
   fft_type *sptr;
   __m128 *endptr1,*endptr2;
   int br1Index, br1Value;
   __m128 HRplus,HRminus,HIplus,HIminus;
   __m128 v1,v2,sin,cos;

   // Distance (in complex pairs) between the A and B halves of a group;
   // halved after every butterfly pass until it reaches zero.
   auto ButterfliesPerGroup = h->Points / 2;

   /* Massage input to get the input for a real output sequence. */
   // A walks upwards from the second complex pair, B downwards from the last;
   // br1Index selects the BitReversed entry that gives the SinTable offset.
   A = localBuffer + 2;
   B = localBuffer + h->Points * 2 - 2;
   br1Index = 1; //h->BitReversed + 1;
   while(A < B)
   {
      br1Value = h->BitReversed[br1Index];
      sin = _mm_set1_ps(h->SinTable[br1Value]);
      cos = _mm_set1_ps(h->SinTable[br1Value + 1]);
      // HRminus/HIminus = A - B; HRplus/HIplus = A + B (formed as (A-B) + 2B).
      HRminus = _mm_sub_ps(*A,  *B);
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B,  _mm_set1_ps(2.0)));
      HIminus = _mm_sub_ps( *(A+1), *(B+1));
      HIplus = _mm_add_ps(HIminus,  _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
      v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      A=&A[2];
      B=&B[-2];
      br1Index++;
   }
   /* Handle center bin (just need conjugate) */
   // negate sse style
   // XOR with -0.0f flips only the sign bit of each lane.
   *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));

   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
   v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
   localBuffer[0]=v1;
   localBuffer[1]=v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   // One past the last element processed.
   endptr1 = localBuffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = localBuffer + ButterfliesPerGroup * 2;
      // Each pass restarts at the beginning of the sin/cos table.
      sptr = h->SinTable.get();
      while(A < endptr1)
      {
         // One (sin, cos) pair per group, broadcast to all lanes.
         sin = _mm_set1_ps(*(sptr++));
         cos = _mm_set1_ps(*(sptr++));
         endptr2 = B;
         while(A < endptr2)
         {
            v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B + 1), sin));
            v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B + 1), cos));
            // In place: B receives (A+v)/2 first, then A is derived from the
            // freshly written B so that it ends up as (old A - v)/2.
            *B = _mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
            *(A++) = _mm_sub_ps(*(B++), v1);
            *B = _mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
            *(A++) = _mm_sub_ps(*(B++), v2);
         }
         // Skip over the B half just processed to reach the next group.
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }
}


/* SSE counterpart of ReorderToFreq1xSinCosBRTable.
 *
 * All three pointers are treated as arrays of __m128, so each copy moves four
 * floats. Bins 1 .. Points-1 are fetched through hFFT->BitReversed; bin 0 (DC)
 * comes from element 0 and bin Points (Fs/2) from element 1, both with a zero
 * imaginary part. Output elements 0 .. Points inclusive are written.
 */
void ReorderToFreq4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   __m128 *source = (__m128 *)buffer;
   __m128 *realDest = (__m128 *)RealOut;
   __m128 *imagDest = (__m128 *)ImagOut;

   for (size_t bin = 1; bin < hFFT->Points; ++bin) {
      const int position = hFFT->BitReversed[bin];
      realDest[bin] = source[position];
      imagDest[bin] = source[position + 1];
   }

   // DC component
   realDest[0] = source[0];
   imagDest[0] = _mm_set1_ps(0.0);

   // Fs/2 component
   realDest[hFFT->Points] = source[1];
   imagDest[hFFT->Points] = _mm_set1_ps(0.0);
}


/* SSE counterpart of ReorderToTime1xSinCosBRTable.
 *
 * buffer and TimeOut are treated as arrays of __m128. For every i in
 * 0 .. Points-1 the two elements at BitReversed[i] and BitReversed[i]+1 are
 * copied to output elements 2*i and 2*i+1.
 */
void ReorderToTime4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   __m128 *source = (__m128 *)buffer;
   __m128 *dest = (__m128 *)TimeOut;

   for (size_t pairIndex = 0; pairIndex < hFFT->Points; ++pairIndex) {
      const int position = hFFT->BitReversed[pairIndex];
      dest[pairIndex * 2] = source[position];
      dest[pairIndex * 2 + 1] = source[position + 1];
   }
}


#endif


#define REAL_SINCOSTABLE_VBR16

#ifdef REAL_SINCOSTABLE_VBR16


/*
*  Forward FFT routine.  Must call GetFFT(fftlen) first!
*
*  Variant that reads its twiddle factors from the file-local sSinCosTable
*  and computes bit-reversed indices on the fly through the SmallVRB[]
*  function table (selected by h->pow2Bits) instead of using h->SinTable and
*  h->BitReversed. The transform is done in place over 2 * h->Points values.
*
*  Note: Output is BIT-REVERSED! so you must use the BitReversed to
*        get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
*                                  Imag_i = buffer[ h->BitReversed[i]+1 ] )
*        Input is in normal order.
*
* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
*  Note: The scaling on this is done according to the standard FFT definition,
*        so a unit amplitude DC signal will output an amplitude of (N)
*        (Older revisions would progressively scale the input, so the output
*        values would be similar in amplitude to the input values, which is
*        good when using fixed point arithmetic)
*
*  buffer: in/out data, overwritten with the result.
*  h:      FFT parameters; Points and pow2Bits are read.
*/
void RealFFTf1xSinCosTableVBR16(fft_type *buffer, FFTParam *h)
{
   fft_type *A,*B;
   fft_type *endptr1,*endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   fft_type HRplus,HRminus,HIplus,HIminus;
   fft_type v1,v2,sin,cos;
   // Distance (in complex pairs) between the A and B halves of a group;
   // halved after every pass until it reaches zero.
   auto ButterfliesPerGroup = h->Points / 2;
   int pow2BitsMinus1 = h->pow2Bits - 1;
   // Scales an index of (pow2Bits-1) bits up to the resolution of sSinCosTable.
   int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);

   // One past the last value processed.
   endptr1 = buffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      // Group counter within this pass; its bit-reversed value selects the twiddle.
      int iSinCosIndex = 0;
      while(A < endptr1)
      {
         int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
         sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
         cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
         iSinCosIndex++;
         endptr2 = B;
         while(A < endptr2)
         {
            v1 = *B*cos + *(B+1)*sin;
            v2 = *B*sin - *(B+1)*cos;
            // In place: B receives A+v1 first, then A is derived from the
            // freshly written B so that it ends up as (old A) - v1.
            *B = (*A+v1);
            *(A++) = *(B++) - 2 * v1;
            // Same trick for the second component, with the sign of v2 flipped.
            *B = (*A - v2);
            *(A++) = *(B++) + 2 * v2;
         }
         // Skip over the B half just processed to reach the next group.
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }
   /* Massage output to get the output for a real input sequence. */

   // br1Index walks upwards from 1, br2Index downwards from Points-1; their
   // bit-reversed values give the buffer offsets of the pair of bins handled.
   br1Index = 1;
   br2Index = h->Points - 1;

   while(br1Index < br2Index)
   {
      br1Value=(*SmallVRB[h->pow2Bits])(br1Index);
      br2Value=(*SmallVRB[h->pow2Bits])(br2Index);
      int sinCosIndex=br1Index<<sinCosShift;
      sin=sSinCosTable.mSinCosTable[sinCosIndex].mSin;
      cos=sSinCosTable.mSinCosTable[sinCosIndex].mCos;
      A=&buffer[br1Value];
      B=&buffer[br2Value];
      // HRminus/HIminus = A - B; HRplus/HIplus = A + B (formed as (A-B) + 2B).
      HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
      HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
      v1 = (sin*HRminus - cos*HIplus);
      v2 = (cos*HRminus + sin*HIplus);
      *A = (HRplus  + v1) * (fft_type)0.5;
      *B = *A - v1;
      *(A+1) = (HIminus + v2) * (fft_type)0.5;
      *(B+1) = *(A+1) - HIminus;

      br1Index++;
      br2Index--;
   }
   /* Handle the center bin (just need a conjugate) */
   A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
   // The negation was missing here although the address was computed; the
   // SinCosBRTable variants and the matching inverse all conjugate this bin.
   *A=-*A;

   /* Handle DC bin separately - and ignore the Fs/2 bin
   buffer[0]+=buffer[1];
   buffer[1]=(fft_type)0;*/
   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1=buffer[0]-buffer[1];
   buffer[0]+=buffer[1];
   buffer[1]=v1;
}


/* Description: This routine performs an inverse FFT to real data.

*              This code is for floating point data.

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]

*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )

*        Input is in normal order, interleaved (real,imaginary) complex data

*        You must call GetFFT(fftlen) first to initialize some buffers!

*

* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

void InverseRealFFTf1xSinCosTableVBR16(fft_type *buffer, FFTParam *h)
{
   // In-place inverse real FFT over one interleaved (real, imaginary) buffer
   // of h->Points complex values.  Twiddle factors are read from sSinCosTable.
   fft_type *A, *B;
   fft_type *endptr1, *endptr2;
   fft_type HRplus, HRminus, HIplus, HIminus;
   fft_type v1, v2, sin, cos;
   auto ButterfliesPerGroup = h->Points / 2;
   int pow2BitsMinus1 = h->pow2Bits - 1;
   int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);

   /* Massage input to get the input for a real output sequence. */
   // A walks upward from the second complex value and B walks downward from
   // the last one.  Both advance in natural order, so the running index alone
   // selects the twiddle factor; no reordered offset is needed in this pass.
   A = buffer + 2;
   B = buffer + h->Points * 2 - 2;
   int br1Index = 1;
   while(A < B)
   {
      int sinCosIndex = br1Index << sinCosShift;
      sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
      cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
      // X - Y is formed first, then X + Y is recovered as (X - Y) + 2*Y
      HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
      HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
      v1 = (sin*HRminus + cos*HIplus);
      v2 = (cos*HRminus - sin*HIplus);
      *A = (HRplus  + v1) * (fft_type)0.5;
      *B = *A - v1;
      *(A+1) = (HIminus - v2) * (fft_type)0.5;
      *(B+1) = *(A+1) - HIminus;

      A = &A[2];
      B = &B[-2];
      br1Index++;
   }
   /* Handle center bin (just need conjugate) */
   *(A+1) = -*(A+1);
   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1 = 0.5f*(buffer[0]+buffer[1]);
   v2 = 0.5f*(buffer[0]-buffer[1]);
   buffer[0] = v1;
   buffer[1] = v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = buffer + h->Points * 2;

   // Each pass halves the group size; within a pass every group uses one
   // sin/cos pair chosen through the SmallVRB lookup of the group counter.
   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      int iSinCosIndex = 0;
      while(A < endptr1)
      {
         int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
         sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
         cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
         iSinCosIndex++;
         endptr2 = B;
         while(A < endptr2)
         {
            // v1/v2 must be taken from B before B is overwritten below
            v1 = *B * cos - *(B + 1) * sin;
            v2 = *B * sin + *(B + 1) * cos;
            *B = (*A + v1) * (fft_type)0.5;
            *(A++) = *(B++) - v1;
            *B = (*A + v2) * (fft_type)0.5;
            *(A++) = *(B++) - v2;
         }
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }
}


void ReorderToTime1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   // Gather pairs of values out of buffer into TimeOut in natural order.
   // For output pair n, the source offset comes from the SmallVRB lookup
   // selected by hFFT->pow2Bits.
   for(size_t pair = 0; pair < hFFT->Points; ++pair) {
      const int srcOffset = (*SmallVRB[hFFT->pow2Bits])(pair);
      fft_type *const dst = &TimeOut[pair * 2];
      dst[0] = buffer[srcOffset];
      dst[1] = buffer[srcOffset + 1];
   }
}


void ReorderToFreq1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   // Split the interleaved transform in buffer into separate real and
   // imaginary arrays.  Entries 1 .. Points-1 are fetched through the
   // SmallVRB lookup; entries 0 and Points are filled in afterwards, so both
   // output arrays are written at indices 0 .. Points inclusive.
   for(size_t bin = 1; bin < hFFT->Points; ++bin) {
      const int srcOffset = (*SmallVRB[hFFT->pow2Bits])(bin);
      RealOut[bin] = buffer[srcOffset];
      ImagOut[bin] = buffer[srcOffset + 1];
   }

   // DC component: real only
   RealOut[0] = buffer[0];
   ImagOut[0] = 0;

   // Fs/2 component is stored in buffer[1]: real only
   RealOut[hFFT->Points] = buffer[1];
   ImagOut[hFFT->Points] = 0;
}


// 4x processing simd

void RealFFTf4xSinCosTableVBR16(fft_type *buffer, FFTParam *h)
{
   // Forward real FFT with buffer viewed as an array of __m128, so every
   // arithmetic step below operates on four packed floats at once.
   __m128 *const vec = (__m128 *)buffer;
   const __m128 two = _mm_set1_ps( 2.0);
   const __m128 half = _mm_set1_ps( 0.5);

   const int stageBits = h->pow2Bits - 1;
   const int tableShift = (sSinCosTable.mSinCosTablePow - stageBits);
   __m128 *const vecEnd = &vec[h->Points * 2];

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */
   // One pass per group size, halving each time.
   for(auto groupSize = h->Points / 2; groupSize > 0; groupSize >>= 1)
   {
      __m128 *lo = vec;
      __m128 *hi = &vec[groupSize * 2];
      int groupNumber = 0;
      while(lo < vecEnd)
      {
         // One sin/cos pair per group, broadcast to all four lanes
         const int tableIndex = (*SmallVRB[stageBits])(groupNumber) << tableShift;
         const __m128 sinV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mSin);
         const __m128 cosV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mCos);
         groupNumber++;

         for(__m128 *const groupEnd = hi; lo < groupEnd; lo += 2, hi += 2)
         {
            // Both rotated terms are read from hi before hi is overwritten
            const __m128 rotA = _mm_add_ps( _mm_mul_ps(hi[0], cosV), _mm_mul_ps(hi[1], sinV));
            const __m128 rotB = _mm_sub_ps( _mm_mul_ps(hi[0], sinV), _mm_mul_ps(hi[1], cosV));
            hi[0] = _mm_add_ps(lo[0], rotA);
            lo[0] = _mm_sub_ps(hi[0], _mm_mul_ps(two, rotA));
            hi[1] = _mm_sub_ps(lo[1], rotB);
            lo[1] = _mm_add_ps(hi[1], _mm_mul_ps(two, rotB));
         }
         lo = hi;
         hi = &hi[groupSize * 2];
      }
   }

   /* Massage output to get the output for a real input sequence. */
   // lowBin counts up from 1 and highBin counts down from Points-1; each is
   // mapped to its buffer offset through the SmallVRB lookup.
   int lowBin = 1;
   int highBin = h->Points - 1;
   for(; lowBin < highBin; ++lowBin, --highBin)
   {
      const int lowOffset = (*SmallVRB[h->pow2Bits])(lowBin);
      const int highOffset = (*SmallVRB[h->pow2Bits])(highBin);
      const int tableIndex = lowBin << tableShift;
      const __m128 sinV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mSin);
      const __m128 cosV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mCos);
      __m128 *const p = &vec[lowOffset];
      __m128 *const q = &vec[highOffset];

      // Difference first, then the sum recovered as (p - q) + 2*q
      const __m128 reDiff = _mm_sub_ps(p[0], q[0]);
      const __m128 reSum = _mm_add_ps(reDiff, _mm_mul_ps(q[0], two));
      const __m128 imDiff = _mm_sub_ps(p[1], q[1]);
      const __m128 imSum = _mm_add_ps(imDiff, _mm_mul_ps(q[1], two));
      const __m128 t1 = _mm_sub_ps(_mm_mul_ps(sinV, reDiff), _mm_mul_ps(cosV, imSum));
      const __m128 t2 = _mm_add_ps(_mm_mul_ps(cosV, reDiff), _mm_mul_ps(sinV, imSum));
      p[0] = _mm_mul_ps(_mm_add_ps(reSum, t1), half);
      q[0] = _mm_sub_ps(p[0], t1);
      p[1] = _mm_mul_ps(_mm_add_ps(imDiff, t2), half);
      q[1] = _mm_sub_ps(p[1], imDiff);
   }

   /* Handle the center bin (just need a conjugate) */
   // Flip the sign bit of each lane of the imaginary part
   __m128 *const centerImag = &vec[(*SmallVRB[h->pow2Bits])(lowBin)+1];
   *centerImag = _mm_xor_ps(*centerImag, _mm_set1_ps(-0.f));

   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   const __m128 fsHalf = _mm_sub_ps(vec[0], vec[1]);
   vec[0] = _mm_add_ps(vec[0], vec[1]);
   vec[1] = fsHalf;
}


/* Description: This routine performs an inverse FFT to real data.

*              This code is for floating point data.

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]

*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )

*        Input is in normal order, interleaved (real,imaginary) complex data

*        You must call GetFFT(fftlen) first to initialize some buffers!

*

* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

void InverseRealFFTf4xSinCosTableVBR16(fft_type *buffer, FFTParam *h)
{
   // Inverse real FFT with buffer viewed as an array of __m128, so every
   // arithmetic step below operates on four packed floats at once.
   __m128 *localBuffer = (__m128 *)buffer;

   __m128 *A, *B;
   __m128 *endptr1, *endptr2;
   __m128 HRplus, HRminus, HIplus, HIminus;
   __m128 v1, v2, sin, cos;
   // Loop-invariant constants, broadcast once
   const __m128 two = _mm_set1_ps(2.0);
   const __m128 half = _mm_set1_ps(0.5);
   auto ButterfliesPerGroup = h->Points / 2;
   int pow2BitsMinus1 = h->pow2Bits - 1;
   int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);

   /* Massage input to get the input for a real output sequence. */
   // A walks upward and B walks downward in natural order, so the running
   // index alone selects the twiddle factor; no reordered offset is needed.
   A = localBuffer + 2;
   B = localBuffer + h->Points * 2 - 2;
   int br1Index = 1;
   while(A < B)
   {
      int sinCosIndex = br1Index << sinCosShift;
      sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
      cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
      // Difference first, then the sum recovered as (A - B) + 2*B
      HRminus = _mm_sub_ps(*A, *B);
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, two));
      HIminus = _mm_sub_ps(*(A+1), *(B+1));
      HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), two));
      v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), half);
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2), half);
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      A = &A[2];
      B = &B[-2];
      br1Index++;
   }
   /* Handle center bin (just need conjugate) */
   // Flip the sign bit of each lane of the imaginary part
   *(A+1) = _mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));

   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1 = _mm_mul_ps(half, _mm_add_ps(localBuffer[0], localBuffer[1]));
   v2 = _mm_mul_ps(half, _mm_sub_ps(localBuffer[0], localBuffer[1]));
   localBuffer[0] = v1;
   localBuffer[1] = v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = localBuffer + h->Points * 2;

   // Each pass halves the group size; within a pass every group uses one
   // sin/cos pair chosen through the SmallVRB lookup of the group counter.
   while(ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = localBuffer + ButterfliesPerGroup * 2;
      int iSinCosIndex = 0;
      while(A < endptr1)
      {
         int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
         sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
         cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
         iSinCosIndex++;
         endptr2 = B;
         while(A < endptr2)
         {
            // v1/v2 must be taken from B before B is overwritten below
            v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
            v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
            *B = _mm_mul_ps( _mm_add_ps(*A, v1), half);
            *(A++) = _mm_sub_ps(*(B++), v1);
            *B = _mm_mul_ps(_mm_add_ps(*A, v2), half);
            *(A++) = _mm_sub_ps(*(B++), v2);
         }
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }
}


void ReorderToTime4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   // Both buffers are viewed as arrays of __m128, so each copy below moves
   // four floats.  For output pair n the source offset comes from the
   // SmallVRB lookup selected by hFFT->pow2Bits.
   const __m128 *const src = (__m128 *)buffer;
   __m128 *const dst = (__m128 *)TimeOut;

   for(size_t pair = 0; pair < hFFT->Points; ++pair) {
      const int srcOffset = (*SmallVRB[hFFT->pow2Bits])(pair);
      dst[pair * 2] = src[srcOffset];
      dst[pair * 2 + 1] = src[srcOffset + 1];
   }
}


void ReorderToFreq4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   // All three buffers are viewed as arrays of __m128 (four floats per
   // element).  Entries 1 .. Points-1 are fetched through the SmallVRB
   // lookup; entries 0 and Points are filled in afterwards, so both output
   // arrays are written at indices 0 .. Points inclusive.
   const __m128 *const src = (__m128 *)buffer;
   __m128 *const re = (__m128 *)RealOut;
   __m128 *const im = (__m128 *)ImagOut;
   const __m128 zero = _mm_set1_ps(0.0);

   for(size_t bin = 1; bin < hFFT->Points; ++bin) {
      const int srcOffset = (*SmallVRB[hFFT->pow2Bits])(bin);
      re[bin] = src[srcOffset];
      im[bin] = src[srcOffset + 1];
   }

   // DC component: real only
   re[0] = src[0];
   im[0] = zero;

   // Fs/2 component is stored in element 1: real only
   re[hFFT->Points] = src[1];
   im[hFFT->Points] = zero;
}

#endif


#define REAL_SINCOSTABLE_BR16

#ifdef REAL_SINCOSTABLE_BR16


/*

*  Forward FFT routine.  Must call GetFFT(fftlen) first!

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]

*                                  Imag_i = buffer[ h->BitReversed[i]+1 ] )

*        Input is in normal order.

*

* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

void RealFFTf1xSinCosTableBR16(fft_type *buffer, FFTParam *h)
{
   // In-place forward real FFT over one interleaved buffer.  Index reordering
   // is done with the byte table sSmallRBTable: the two low bytes of an index
   // are looked up, swapped into a 16-bit value and shifted down to the
   // transform's bit width.
   fft_type *A,*B;
   fft_type *endptr1, *endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   fft_type HRplus, HRminus, HIplus, HIminus;
   fft_type v1, v2, sin, cos;
   auto ButterfliesPerGroup = h->Points / 2;
   int pow2BitsMinus1 = h->pow2Bits - 1;
   int bitReverseShiftM1 = 17 - h->pow2Bits;
   int bitReverseShift = bitReverseShiftM1 - 1;
   int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);

   endptr1 = buffer + h->Points * 2;

   // Each pass halves the group size; within a pass every group uses one
   // sin/cos pair chosen from the reordered group counter.
   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      int iSinCosIndex = 0;
      while(A < endptr1)
      {
//         int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
         // Bytes are extracted with mask/shift rather than through a byte
         // pointer, so the result does not depend on host byte order
         int sinCosLookup = ( ((sSmallRBTable[iSinCosIndex & 0xFF]<<8) + (sSmallRBTable[(iSinCosIndex >> 8) & 0xFF]) )>>bitReverseShiftM1)<<sinCosShift;
         sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
         cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
         iSinCosIndex++;
         endptr2 = B;
         while(A < endptr2)
         {
            // v1/v2 must be taken from B before B is overwritten below
            v1=*B*cos + *(B+1)*sin;
            v2=*B*sin - *(B+1)*cos;
            *B=(*A+v1);
            *(A++)=*(B++)-2*v1;
            *B=(*A-v2);
            *(A++)=*(B++)+2*v2;
         }
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }
   /* Massage output to get the output for a real input sequence. */

   br1Index = 1;
   br2Index = h->Points - 1;

   while(br1Index < br2Index)
   {
      br1Value=( ((sSmallRBTable[br1Index & 0xFF]<<8) + (sSmallRBTable[(br1Index >> 8) & 0xFF]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
      br2Value=( ((sSmallRBTable[br2Index & 0xFF]<<8) + (sSmallRBTable[(br2Index >> 8) & 0xFF]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
      int sinCosIndex = br1Index << sinCosShift;
      sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
      cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
      A = &buffer[br1Value];
      B = &buffer[br2Value];
      // Difference first, then the sum recovered as (A - B) + 2*B
      HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
      HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
      v1 = (sin*HRminus - cos*HIplus);
      v2 = (cos*HRminus + sin*HIplus);
      *A = (HRplus  + v1) * (fft_type)0.5;
      *B = *A - v1;
      *(A+1) = (HIminus + v2) * (fft_type)0.5;
      *(B+1) = *(A+1) - HIminus;

      br1Index++;
      br2Index--;
   }
   /* Handle the center bin (just need a conjugate) */
//   A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
   A=&buffer[( ((sSmallRBTable[br1Index & 0xFF]<<8) + (sSmallRBTable[(br1Index >> 8) & 0xFF]) )>>bitReverseShift)+1];
   // Negate the imaginary part.  The pointer was previously computed but
   // never used, leaving the center bin unconjugated, unlike the 4x
   // routine and the matching inverse transform.
   *A = -*A;

   /* Handle DC bin separately - and ignore the Fs/2 bin
   buffer[0]+=buffer[1];
   buffer[1]=(fft_type)0;*/
   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1=buffer[0]-buffer[1];
   buffer[0]+=buffer[1];
   buffer[1]=v1;
}


/* Description: This routine performs an inverse FFT to real data.

*              This code is for floating point data.

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]

*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )

*        Input is in normal order, interleaved (real,imaginary) complex data

*        You must call GetFFT(fftlen) first to initialize some buffers!

*

* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

void InverseRealFFTf1xSinCosTableBR16(fft_type *buffer, FFTParam *h)
{
   // In-place inverse real FFT over one interleaved (real, imaginary) buffer.
   // The butterfly passes pick their sin/cos pair by reordering the group
   // counter through the byte table sSmallRBTable.
   fft_type *A,*B;
   fft_type *endptr1,*endptr2;
   int br1Index;
   fft_type HRplus, HRminus, HIplus, HIminus;
   fft_type v1, v2, sin, cos;
   auto ButterfliesPerGroup = h->Points / 2;
   int pow2BitsMinus1 = h->pow2Bits - 1;
   int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1);
   int bitReverseShiftM1 = 17 - h->pow2Bits;

   /* Massage input to get the input for a real output sequence. */
   // A walks upward and B walks downward in natural order, so the running
   // index alone selects the twiddle factor.
   A = buffer + 2;
   B = buffer + h->Points * 2 - 2;
   br1Index = 1;
   while(A < B)
   {
      int sinCosIndex = br1Index << sinCosShift;
      sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
      cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
      // Difference first, then the sum recovered as (A - B) + 2*B
      HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
      HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
      v1 = (sin*HRminus + cos*HIplus);
      v2 = (cos*HRminus - sin*HIplus);
      *A = (HRplus  + v1) * (fft_type)0.5;
      *B = *A - v1;
      *(A+1) = (HIminus - v2) * (fft_type)0.5;
      *(B+1) = *(A+1) - HIminus;

      A=&A[2];
      B=&B[-2];
      br1Index++;
   }
   /* Handle center bin (just need conjugate) */
   *(A+1)=-*(A+1);
   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1=0.5f*(buffer[0]+buffer[1]);
   v2=0.5f*(buffer[0]-buffer[1]);
   buffer[0]=v1;
   buffer[1]=v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = buffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      int iSinCosIndex = 0;
      while(A < endptr1)
      {
//         int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
         // Bytes are extracted with mask/shift rather than through a byte
         // pointer, so the result does not depend on host byte order
         int sinCosLookup=( ((sSmallRBTable[iSinCosIndex & 0xFF]<<8) + (sSmallRBTable[(iSinCosIndex >> 8) & 0xFF]) )>>bitReverseShiftM1)<<sinCosShift;
         sin=sSinCosTable.mSinCosTable[sinCosLookup].mSin;
         cos=sSinCosTable.mSinCosTable[sinCosLookup].mCos;
         iSinCosIndex++;
         endptr2=B;
         while(A<endptr2)
         {
            // v1/v2 must be taken from B before B is overwritten below
            v1=*B*cos - *(B+1)*sin;
            v2=*B*sin + *(B+1)*cos;
            *B=(*A+v1)*(fft_type)0.5;
            *(A++)=*(B++)-v1;
            *B=(*A+v2)*(fft_type)0.5;
            *(A++)=*(B++)-v2;
         }
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }
}


void ReorderToFreq1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   // Split the interleaved transform in buffer into separate real and
   // imaginary arrays.  Entries 1 .. Points-1 are fetched at an offset built
   // from the byte table sSmallRBTable; entries 0 and Points are filled in
   // afterwards, so both outputs are written at indices 0 .. Points inclusive.
   int bitReverseShift=16-hFFT->pow2Bits;
   // Copy the data into the real and imaginary outputs
   for(size_t i = 1; i < hFFT->Points; i++) {
      int brValue;
//      brValue=(*SmallVRB[hFFT->pow2Bits])(i);
      // The two low bytes of i are taken with mask/shift.  Reading them
      // through a byte pointer would pick the high-order bytes of the
      // size_t on a big-endian host.
      brValue = ( ((sSmallRBTable[i & 0xFF]<<8) + (sSmallRBTable[(i >> 8) & 0xFF]) )>>bitReverseShift);
      RealOut[i] = buffer[brValue  ];
      ImagOut[i] = buffer[brValue+1];
   }
   RealOut[0] = buffer[0]; // DC component
   ImagOut[0] = 0;
   RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
   ImagOut[hFFT->Points] = 0;
}


void ReorderToTime1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   // Gather pairs of values out of buffer into TimeOut in natural order.
   // The source offset for each pair is built from the byte table
   // sSmallRBTable and shifted down to the transform's bit width.
   int bitReverseShift=16-hFFT->pow2Bits;
   // Copy the data into the real outputs
   for(size_t i = 0; i < hFFT->Points; i++) {
      int brValue;
//      brValue=(*SmallVRB[hFFT->pow2Bits])(i);
      // The two low bytes of i are taken with mask/shift.  Reading them
      // through a byte pointer would pick the high-order bytes of the
      // size_t on a big-endian host.
      brValue=( ((sSmallRBTable[i & 0xFF]<<8) + (sSmallRBTable[(i >> 8) & 0xFF]) )>>bitReverseShift);
      TimeOut[i*2  ] = buffer[brValue  ];
      TimeOut[i*2+1] = buffer[brValue+1];
   }
}


// 4x processing simd

void RealFFTf4xSinCosTableBR16(fft_type *buffer, FFTParam *h)
{
   // Forward real FFT with buffer viewed as an array of __m128, so every
   // arithmetic step below operates on four packed floats at once.  Index
   // reordering goes through the byte table sSmallRBTable, fed with the
   // first two bytes of the index's in-memory representation.
   __m128 *const vec = (__m128 *)buffer;
   const __m128 two = _mm_set1_ps( 2.0);
   const __m128 half = _mm_set1_ps( 0.5);

   const int stageBits = h->pow2Bits - 1;
   const int tableShift = (sSinCosTable.mSinCosTablePow-stageBits);
   const int stageReverseShift = 17 - h->pow2Bits;
   const int binReverseShift = stageReverseShift - 1;
   __m128 *const vecEnd = &vec[h->Points * 2];

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */
   // One pass per group size, halving each time.
   for(auto groupSize = h->Points / 2; groupSize > 0; groupSize >>= 1)
   {
      __m128 *lo = vec;
      __m128 *hi = &vec[groupSize * 2];
      int groupNumber = 0;
      const unsigned char *const groupBytes = (const unsigned char *)&groupNumber;
      while(lo < vecEnd)
      {
         // One sin/cos pair per group, broadcast to all four lanes
         const int tableIndex = ( ((sSmallRBTable[groupBytes[0]]<<8) + (sSmallRBTable[groupBytes[1]]) )>>stageReverseShift)<<tableShift;
         const __m128 sinV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mSin);
         const __m128 cosV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mCos);
         groupNumber++;

         for(__m128 *const groupEnd = hi; lo < groupEnd; lo += 2, hi += 2)
         {
            // Both rotated terms are read from hi before hi is overwritten
            const __m128 rotA = _mm_add_ps( _mm_mul_ps(hi[0], cosV), _mm_mul_ps(hi[1], sinV));
            const __m128 rotB = _mm_sub_ps( _mm_mul_ps(hi[0], sinV), _mm_mul_ps(hi[1], cosV));
            hi[0] = _mm_add_ps(lo[0], rotA);
            lo[0] = _mm_sub_ps(hi[0], _mm_mul_ps(two, rotA));
            hi[1] = _mm_sub_ps(lo[1], rotB);
            lo[1] = _mm_add_ps(hi[1], _mm_mul_ps(two, rotB));
         }
         lo = hi;
         hi = &hi[groupSize * 2];
      }
   }

   /* Massage output to get the output for a real input sequence. */
   // lowBin counts up from 1 and highBin counts down from Points-1; each is
   // mapped to its buffer offset through the byte table.
   int lowBin = 1;
   int highBin = h->Points - 1;
   const unsigned char *const lowBytes = (const unsigned char *)&lowBin;
   const unsigned char *const highBytes = (const unsigned char *)&highBin;
   for(; lowBin < highBin; ++lowBin, --highBin)
   {
      const int lowOffset = ( ((sSmallRBTable[lowBytes[0]]<<8) + (sSmallRBTable[lowBytes[1]]) )>>binReverseShift);
      const int highOffset = ( ((sSmallRBTable[highBytes[0]]<<8) + (sSmallRBTable[highBytes[1]]) )>>binReverseShift);
      const int tableIndex = lowBin<<tableShift;
      const __m128 sinV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mSin);
      const __m128 cosV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mCos);
      __m128 *const p = &vec[lowOffset];
      __m128 *const q = &vec[highOffset];

      // Difference first, then the sum recovered as (p - q) + 2*q
      const __m128 reDiff = _mm_sub_ps(p[0], q[0]);
      const __m128 reSum = _mm_add_ps(reDiff, _mm_mul_ps(q[0], two));
      const __m128 imDiff = _mm_sub_ps(p[1], q[1]);
      const __m128 imSum = _mm_add_ps(imDiff, _mm_mul_ps(q[1], two));
      const __m128 t1 = _mm_sub_ps(_mm_mul_ps(sinV, reDiff), _mm_mul_ps(cosV, imSum));
      const __m128 t2 = _mm_add_ps(_mm_mul_ps(cosV, reDiff), _mm_mul_ps(sinV, imSum));
      p[0] = _mm_mul_ps(_mm_add_ps(reSum, t1), half);
      q[0] = _mm_sub_ps(p[0], t1);
      p[1] = _mm_mul_ps(_mm_add_ps(imDiff, t2), half);
      q[1] = _mm_sub_ps(p[1], imDiff);
   }

   /* Handle the center bin (just need a conjugate) */
   // Flip the sign bit of each lane of the imaginary part
   __m128 *const centerImag = &vec[( ((sSmallRBTable[lowBytes[0]]<<8) + (sSmallRBTable[lowBytes[1]]) )>>binReverseShift)+1];
   *centerImag = _mm_xor_ps(*centerImag, _mm_set1_ps(-0.f));

   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   const __m128 fsHalf = _mm_sub_ps(vec[0], vec[1]);
   vec[0] = _mm_add_ps(vec[0], vec[1]);
   vec[1] = fsHalf;
}


/* Description: This routine performs an inverse FFT to real data.

*              This code is for floating point data.

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]

*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )

*        Input is in normal order, interleaved (real,imaginary) complex data

*        You must call GetFFT(fftlen) first to initialize some buffers!

*

* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

void InverseRealFFTf4xSinCosTableBR16(fft_type *buffer, FFTParam *h)
{
   // Inverse real FFT with buffer viewed as an array of __m128, so every
   // arithmetic step below operates on four packed floats at once.
   __m128 *const vec = (__m128 *)buffer;
   const __m128 two = _mm_set1_ps(2.0);
   const __m128 half = _mm_set1_ps(0.5);

   const int stageBits = h->pow2Bits - 1;
   const int tableShift = (sSinCosTable.mSinCosTablePow-stageBits);
   const int stageReverseShift = 17 - h->pow2Bits;

   /* Massage input to get the input for a real output sequence. */
   // lo walks upward and hi walks downward in natural order, so the running
   // bin number alone selects the twiddle factor.
   __m128 *lo = vec + 2;
   __m128 *hi = vec + h->Points * 2 - 2;
   for(int bin = 1; lo < hi; ++bin, lo += 2, hi -= 2)
   {
      const int tableIndex = bin << tableShift;
      const __m128 sinV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mSin);
      const __m128 cosV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mCos);

      // Difference first, then the sum recovered as (lo - hi) + 2*hi
      const __m128 reDiff = _mm_sub_ps(lo[0], hi[0]);
      const __m128 reSum = _mm_add_ps(reDiff, _mm_mul_ps(hi[0], two));
      const __m128 imDiff = _mm_sub_ps(lo[1], hi[1]);
      const __m128 imSum = _mm_add_ps(imDiff, _mm_mul_ps(hi[1], two));
      const __m128 t1 = _mm_add_ps(_mm_mul_ps(sinV, reDiff), _mm_mul_ps(cosV, imSum));
      const __m128 t2 = _mm_sub_ps(_mm_mul_ps(cosV, reDiff), _mm_mul_ps(sinV, imSum));
      lo[0] = _mm_mul_ps(_mm_add_ps(reSum, t1), half);
      hi[0] = _mm_sub_ps(lo[0], t1);
      lo[1] = _mm_mul_ps(_mm_sub_ps(imDiff, t2), half);
      hi[1] = _mm_sub_ps(lo[1], imDiff);
   }

   /* Handle center bin (just need conjugate) */
   // Flip the sign bit of each lane of the imaginary part
   lo[1] = _mm_xor_ps(lo[1], _mm_set1_ps(-0.f));

   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   const __m128 halfSum = _mm_mul_ps(half, _mm_add_ps(vec[0], vec[1]));
   const __m128 halfDiff = _mm_mul_ps(half, _mm_sub_ps(vec[0], vec[1]));
   vec[0] = halfSum;
   vec[1] = halfDiff;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */
   __m128 *const vecEnd = vec + h->Points * 2;

   // One pass per group size, halving each time.
   for(auto groupSize = h->Points / 2; groupSize > 0; groupSize >>= 1)
   {
      lo = vec;
      hi = vec + groupSize * 2;
      int groupNumber = 0;
      const unsigned char *const groupBytes = (const unsigned char *)&groupNumber;
      while(lo < vecEnd)
      {
         // One sin/cos pair per group; the table index is built from the
         // first two bytes of the group counter's in-memory representation
         const int tableIndex = ( ((sSmallRBTable[groupBytes[0]]<<8) + (sSmallRBTable[groupBytes[1]]) )>>stageReverseShift)<<tableShift;
         const __m128 sinV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mSin);
         const __m128 cosV = _mm_set1_ps(sSinCosTable.mSinCosTable[tableIndex].mCos);
         groupNumber++;

         for(__m128 *const groupEnd = hi; lo < groupEnd; lo += 2, hi += 2)
         {
            // Both rotated terms are read from hi before hi is overwritten
            const __m128 rotA = _mm_sub_ps( _mm_mul_ps(hi[0], cosV), _mm_mul_ps(hi[1], sinV));
            const __m128 rotB = _mm_add_ps( _mm_mul_ps(hi[0], sinV), _mm_mul_ps(hi[1], cosV));
            hi[0] = _mm_mul_ps( _mm_add_ps(lo[0], rotA), half);
            lo[0] = _mm_sub_ps(hi[0], rotA);
            hi[1] = _mm_mul_ps(_mm_add_ps(lo[1], rotB), half);
            lo[1] = _mm_sub_ps(hi[1], rotB);
         }
         lo = hi;
         hi = &hi[groupSize * 2];
      }
   }
}


void ReorderToTime4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   // Both buffers are viewed as arrays of __m128, so each copy below moves
   // four floats.  The source offset for each output pair is built from the
   // byte table sSmallRBTable, fed with the first two bytes of the counter's
   // in-memory representation.
   const __m128 *const src = (__m128 *)buffer;
   __m128 *const dst = (__m128 *)TimeOut;
   const int reverseShift = 16 - hFFT->pow2Bits;

   size_t pair = 0;
   const unsigned char *const pairBytes = (const unsigned char *)&pair;
   while(pair < hFFT->Points) {
      const int srcOffset = ( ((sSmallRBTable[pairBytes[0]]<<8) + (sSmallRBTable[pairBytes[1]]) )>>reverseShift);
      dst[pair * 2] = src[srcOffset];
      dst[pair * 2 + 1] = src[srcOffset + 1];
      pair++;
   }
}


void ReorderToFreq4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   // All three buffers are viewed as arrays of __m128 (four floats per
   // element).  Entries 1 .. Points-1 are fetched at an offset built from the
   // byte table sSmallRBTable; entries 0 and Points are filled in afterwards,
   // so both output arrays are written at indices 0 .. Points inclusive.
   const __m128 *const src = (__m128 *)buffer;
   __m128 *const re = (__m128 *)RealOut;
   __m128 *const im = (__m128 *)ImagOut;
   const int reverseShift = 16 - hFFT->pow2Bits;
   const __m128 zero = _mm_set1_ps(0.0);

   size_t bin = 1;
   // The table is indexed with the first two bytes of the counter's
   // in-memory representation
   const unsigned char *const binBytes = (const unsigned char *)&bin;
   while(bin < hFFT->Points) {
      const int srcOffset = ( ((sSmallRBTable[binBytes[0]]<<8) + (sSmallRBTable[binBytes[1]]) )>>reverseShift);
      re[bin] = src[srcOffset];
      im[bin] = src[srcOffset + 1];
      bin++;
   }

   // DC component: real only
   re[0] = src[0];
   im[0] = zero;

   // Fs/2 component is stored in element 1: real only
   re[hFFT->Points] = src[1];
   im[hFFT->Points] = zero;
}

#endif


#define FAST_MATH_BR24

#ifdef FAST_MATH_BR24


/*

*  Forward FFT routine.  Must call GetFFT(fftlen) first!

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]

*                                  Imag_i = buffer[ h->BitReversed[i]+1 ] )

*        Input is in normal order.

*

* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

// Scalar ("1x") forward real FFT, performed in place on buffer, which is
// treated as h->Points interleaved (real, imaginary) pairs.
// Twiddle factors are not read from a table: sincos_ps evaluates four angles
// per call and the results are consumed one lane per butterfly group / bin.
// Bit-reversed indices are built from three sSmallRBTable byte lookups
// (24 bits) and shifted down according to h->pow2Bits.
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void RealFFTf1xFastMathBR24(fft_type *buffer, FFTParam *h)
{
   fft_type *A, *B;
   fft_type *endptr1, *endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   fft_type HRplus, HRminus, HIplus, HIminus;
   fft_type v1, v2, sinW, cosW;
   fft_type iToRad = 2 * M_PI/(2 * h->Points);
   int bitReverseShift = 24 - h->pow2Bits;
   int bitReverseShiftM1 = bitReverseShift + 1;
   auto ButterfliesPerGroup = h->Points / 2;

   // 24-bit reversal of a non-negative index, one table lookup per byte.
   // Bytes are extracted with shifts/masks so the result does not depend on
   // the host byte order.
   const auto reverse24 = [](int index) {
      return (sSmallRBTable[index & 0xFF] << 16)
           + (sSmallRBTable[(index >> 8) & 0xFF] << 8)
           + sSmallRBTable[(index >> 16) & 0xFF];
   };

   // The current batch of four sines/cosines.  These must outlive a single
   // loop iteration: a batch is computed on every fourth pass and read on the
   // three passes that follow.  They are always written by sincos_ps before
   // the first read because the lane counters start at 0.
   v4sf sin4_2, cos4_2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = buffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      int sinCosCalIndex = 0; // lane of the current batch to use next
      int iSinCosIndex = 0;   // running group counter within this stage
      while(A < endptr1)
      {
         if(sinCosCalIndex == 0)
         {
            // MSVC may report C4701 for vx; it is filled lane by lane below.
            v4sf vx;
            for(int i = 0; i < 4; i++)
               vx.m128_f32[i] = (reverse24(iSinCosIndex + i) >> bitReverseShiftM1) * iToRad;
            sincos_ps(vx, &sin4_2, &cos4_2);
         }
         sinW = -sin4_2.m128_f32[sinCosCalIndex];
         cosW = -cos4_2.m128_f32[sinCosCalIndex];
         sinCosCalIndex = (sinCosCalIndex + 1) & 3;
         iSinCosIndex++;

         endptr2 = B;
         while(A < endptr2)
         {
            v1 = B[0]*cosW + B[1]*sinW;
            v2 = B[0]*sinW - B[1]*cosW;
            B[0] = (A[0] + v1);
            A[0] = B[0] - 2*v1;
            B[1] = (A[1] - v2);
            A[1] = B[1] + 2*v2;
            A += 2;
            B += 2;
         }
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }

   /* Massage output to get the output for a real input sequence. */

   br1Index = 1;
   br2Index = h->Points - 1;

   int sinCosCalIndex = 0;
   while(br1Index < br2Index)
   {
      br1Value = reverse24(br1Index) >> bitReverseShift;
      br2Value = reverse24(br2Index) >> bitReverseShift;

      if(sinCosCalIndex == 0)
      {
         v4sf vx;
         for(int i = 0; i < 4; i++)
            vx.m128_f32[i] = ((float)(br1Index + i)) * iToRad;
         sincos_ps(vx, &sin4_2, &cos4_2);
      }
      sinW = -sin4_2.m128_f32[sinCosCalIndex];
      cosW = -cos4_2.m128_f32[sinCosCalIndex];
      sinCosCalIndex = (sinCosCalIndex + 1) & 3;

      A = &buffer[br1Value];
      B = &buffer[br2Value];
      HRminus = A[0] - B[0];
      HRplus = HRminus + (B[0] * 2);
      HIminus = A[1] - B[1];
      HIplus = HIminus + (B[1] * 2);
      v1 = (sinW*HRminus - cosW*HIplus);
      v2 = (cosW*HRminus + sinW*HIplus);
      A[0] = (HRplus + v1) * (fft_type)0.5;
      B[0] = A[0] - v1;
      A[1] = (HIminus + v2) * (fft_type)0.5;
      B[1] = A[1] - HIminus;

      br1Index++;
      br2Index--;
   }

   /* Handle the center bin (just need a conjugate) */
   // The pointer was previously computed but never used, so the conjugate was
   // silently skipped; RealFFTf4xFastMathBR24 negates the same element.
   A = &buffer[(reverse24(br1Index) >> bitReverseShift) + 1];
   *A = -*A;

   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1 = buffer[0] - buffer[1];
   buffer[0] += buffer[1];
   buffer[1] = v1;
}


/* Description: This routine performs an inverse FFT to real data.

*              This code is for floating point data.

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]

*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )

*        Input is in normal order, interleaved (real,imaginary) complex data

*        You must call GetFFT(fftlen) first to initialize some buffers!

*

* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

// Scalar ("1x") inverse real FFT, performed in place on buffer, which is
// treated as h->Points interleaved (real, imaginary) pairs in normal order.
// Twiddle factors are produced by sincos_ps four angles at a time; the
// butterfly stage derives its angles from a 24-bit byte-table bit reversal of
// the group counter.
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void InverseRealFFTf1xFastMathBR24(fft_type *buffer, FFTParam *h)
{
   fft_type *A, *B;
   fft_type *endptr1, *endptr2;
   int br1Index;
   fft_type HRplus, HRminus, HIplus, HIminus;
   fft_type v1, v2, sinW, cosW;
   fft_type iToRad = 2 * M_PI / (2 * h->Points);
   int bitReverseShiftM1 = 25 - h->pow2Bits;

   auto ButterfliesPerGroup = h->Points / 2;

   // 24-bit reversal of a non-negative index, one table lookup per byte.
   // Shift/mask extraction keeps the result independent of host byte order.
   const auto reverse24 = [](int index) {
      return (sSmallRBTable[index & 0xFF] << 16)
           + (sSmallRBTable[(index >> 8) & 0xFF] << 8)
           + sSmallRBTable[(index >> 16) & 0xFF];
   };

   // Current batch of four sines/cosines.  Declared at function scope because
   // a batch is computed on one loop pass and read on the next three; a
   // loop-local declaration would be re-created (indeterminate) on every pass.
   v4sf sin4_2, cos4_2;

   /* Massage input to get the input for a real output sequence. */
   A = buffer + 2;
   B = buffer + h->Points * 2 - 2;
   br1Index = 1;
   int massageLane = 0; // lane of the current batch to use next
   while(A < B)
   {
      if(massageLane == 0)
      {
         v4sf vx;
         for(int i = 0; i < 4; i++)
            vx.m128_f32[i] = ((float)(br1Index + i)) * iToRad;
         sincos_ps(vx, &sin4_2, &cos4_2);
      }
      sinW = -sin4_2.m128_f32[massageLane];
      cosW = -cos4_2.m128_f32[massageLane];
      massageLane = (massageLane + 1) & 3;

      HRminus = A[0] - B[0];
      HRplus = HRminus + (B[0] * 2);
      HIminus = A[1] - B[1];
      HIplus = HIminus + (B[1] * 2);
      v1 = (sinW*HRminus + cosW*HIplus);
      v2 = (cosW*HRminus - sinW*HIplus);
      A[0] = (HRplus + v1) * (fft_type)0.5;
      B[0] = A[0] - v1;
      A[1] = (HIminus - v2) * (fft_type)0.5;
      B[1] = A[1] - HIminus;

      A += 2;
      B -= 2;
      br1Index++;
   }
   /* Handle center bin (just need conjugate) */
   A[1] = -A[1];

   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1 = 0.5f*(buffer[0] + buffer[1]);
   v2 = 0.5f*(buffer[0] - buffer[1]);
   buffer[0] = v1;
   buffer[1] = v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = buffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      int twiddleLane = 0;  // lane of the current batch to use next
      int iSinCosIndex = 0; // running group counter within this stage
      while(A < endptr1)
      {
         if(twiddleLane == 0)
         {
            v4sf vx;
            for(int i = 0; i < 4; i++)
               vx.m128_f32[i] = (reverse24(iSinCosIndex + i) >> bitReverseShiftM1) * iToRad;
            sincos_ps(vx, &sin4_2, &cos4_2);
         }
         sinW = -sin4_2.m128_f32[twiddleLane];
         cosW = -cos4_2.m128_f32[twiddleLane];
         twiddleLane = (twiddleLane + 1) & 3;
         iSinCosIndex++;

         endptr2 = B;
         while(A < endptr2)
         {
            v1 = B[0]*cosW - B[1]*sinW;
            v2 = B[0]*sinW + B[1]*cosW;
            B[0] = (A[0] + v1)*(fft_type)0.5;
            A[0] = B[0] - v1;
            B[1] = (A[1] + v2)*(fft_type)0.5;
            A[1] = B[1] - v2;
            A += 2;
            B += 2;
         }
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }
}


// Splits the bit-reversed, interleaved contents of buffer into separate real
// and imaginary arrays in natural order.  Bins 1 .. Points-1 are fetched via a
// 24-bit byte-table bit reversal shifted down by (24 - pow2Bits); bin 0 takes
// buffer[0] and bin Points takes buffer[1], both with a zero imaginary part.
// Index Points is written, so RealOut and ImagOut must hold Points + 1 values.
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void ReorderToFreq1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   const int bitReverseShift = 24 - hFFT->pow2Bits;

   for(size_t i = 1; i < hFFT->Points; i++) {
      // Shift/mask byte extraction: independent of host byte order.
      const int byte0 = (int)(i & 0xFF);
      const int byte1 = (int)((i >> 8) & 0xFF);
      const int byte2 = (int)((i >> 16) & 0xFF);
      const int brValue =
         ((sSmallRBTable[byte0] << 16) + (sSmallRBTable[byte1] << 8) + sSmallRBTable[byte2])
            >> bitReverseShift;

      RealOut[i] = buffer[brValue];
      ImagOut[i] = buffer[brValue + 1];
   }

   RealOut[0] = buffer[0]; // DC component
   ImagOut[0] = 0;
   RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
   ImagOut[hFFT->Points] = 0;
}


// Copies the bit-reversed contents of buffer into TimeOut in natural order.
// For each i in [0, Points) the pair buffer[br], buffer[br+1] is written to
// TimeOut[2*i], TimeOut[2*i+1], where br is the 24-bit byte-table reversal of
// i shifted down by (24 - pow2Bits).
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void ReorderToTime1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   const int bitReverseShift = 24 - hFFT->pow2Bits;

   for(size_t i = 0; i < hFFT->Points; i++) {
      // Shift/mask byte extraction: independent of host byte order.
      const int byte0 = (int)(i & 0xFF);
      const int byte1 = (int)((i >> 8) & 0xFF);
      const int byte2 = (int)((i >> 16) & 0xFF);
      const int brValue =
         ((sSmallRBTable[byte0] << 16) + (sSmallRBTable[byte1] << 8) + sSmallRBTable[byte2])
            >> bitReverseShift;

      TimeOut[i * 2] = buffer[brValue];
      TimeOut[i * 2 + 1] = buffer[brValue + 1];
   }
}


// SSE ("4x") forward real FFT, performed in place.  buffer is reinterpreted as
// an array of __m128, so every arithmetic step acts on a 4-float vector; the
// same scalar twiddle is broadcast to all four lanes (presumably one
// independent signal per lane -- TODO confirm against callers).
// Twiddles come from sincos_ps four angles at a time; bit-reversed indices are
// built from three sSmallRBTable byte lookups (24 bits).
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void RealFFTf4xFastMathBR24(fft_type *buffer, FFTParam *h)
{
   __m128 *localBuffer = (__m128 *)buffer;

   __m128 *A, *B;
   __m128 *endptr1, *endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   __m128 HRplus, HRminus, HIplus, HIminus;
   __m128 v1, v2, sinW, cosW;
   fft_type iToRad = 2 * M_PI/(2 * h->Points);
   auto ButterfliesPerGroup = h->Points / 2;
   int bitReverseShift = 24 - h->pow2Bits;
   int bitReverseShiftM1 = bitReverseShift + 1;

   // Loop-invariant broadcast constants.
   const __m128 two = _mm_set1_ps(2.0f);
   const __m128 half = _mm_set1_ps(0.5f);

   // 24-bit reversal of a non-negative index, one table lookup per byte.
   // Shift/mask extraction keeps the result independent of host byte order.
   const auto reverse24 = [](int index) {
      return (sSmallRBTable[index & 0xFF] << 16)
           + (sSmallRBTable[(index >> 8) & 0xFF] << 8)
           + sSmallRBTable[(index >> 16) & 0xFF];
   };

   // Current batch of four sines/cosines.  Declared at function scope because
   // a batch is computed on one loop pass and read on the next three; a
   // loop-local declaration would be re-created (indeterminate) on every pass.
   v4sf sin4_2, cos4_2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = &localBuffer[h->Points * 2];

   while(ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = &localBuffer[ButterfliesPerGroup * 2];
      int sinCosCalIndex = 0; // lane of the current batch to use next
      int iSinCosIndex = 0;   // running group counter within this stage
      while(A < endptr1)
      {
         if(sinCosCalIndex == 0)
         {
            v4sf vx;
            for(int i = 0; i < 4; i++)
               vx.m128_f32[i] = (reverse24(iSinCosIndex + i) >> bitReverseShiftM1) * iToRad;
            sincos_ps(vx, &sin4_2, &cos4_2);
         }
         sinW = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
         cosW = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
         sinCosCalIndex = (sinCosCalIndex + 1) & 3;
         iSinCosIndex++;

         endptr2 = B;
         while(A < endptr2)
         {
            v1 = _mm_add_ps(_mm_mul_ps(B[0], cosW), _mm_mul_ps(B[1], sinW));
            v2 = _mm_sub_ps(_mm_mul_ps(B[0], sinW), _mm_mul_ps(B[1], cosW));
            B[0] = _mm_add_ps(A[0], v1);
            A[0] = _mm_sub_ps(B[0], _mm_mul_ps(two, v1));
            B[1] = _mm_sub_ps(A[1], v2);
            A[1] = _mm_add_ps(B[1], _mm_mul_ps(two, v2));
            A += 2;
            B += 2;
         }
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }

   /* Massage output to get the output for a real input sequence. */

   br1Index = 1;
   br2Index = h->Points - 1;

   int sinCosCalIndex = 0;
   while(br1Index < br2Index)
   {
      br1Value = reverse24(br1Index) >> bitReverseShift;
      br2Value = reverse24(br2Index) >> bitReverseShift;

      if(sinCosCalIndex == 0)
      {
         v4sf vx;
         for(int i = 0; i < 4; i++)
            vx.m128_f32[i] = ((float)(br1Index + i)) * iToRad;
         sincos_ps(vx, &sin4_2, &cos4_2);
      }
      sinW = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
      cosW = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
      sinCosCalIndex = (sinCosCalIndex + 1) & 3;

      A = &localBuffer[br1Value];
      B = &localBuffer[br2Value];
      HRminus = _mm_sub_ps(A[0], B[0]);
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(B[0], two));
      HIminus = _mm_sub_ps(A[1], B[1]);
      HIplus = _mm_add_ps(HIminus, _mm_mul_ps(B[1], two));
      v1 = _mm_sub_ps(_mm_mul_ps(sinW, HRminus), _mm_mul_ps(cosW, HIplus));
      v2 = _mm_add_ps(_mm_mul_ps(cosW, HRminus), _mm_mul_ps(sinW, HIplus));
      A[0] = _mm_mul_ps(_mm_add_ps(HRplus, v1), half);
      B[0] = _mm_sub_ps(A[0], v1);
      A[1] = _mm_mul_ps(_mm_add_ps(HIminus, v2), half);
      B[1] = _mm_sub_ps(A[1], HIminus);

      br1Index++;
      br2Index--;
   }

   /* Handle the center bin (just need a conjugate) */
   A = &localBuffer[(reverse24(br1Index) >> bitReverseShift) + 1];
   // Negate by flipping the sign bit of every lane.
   *A = _mm_xor_ps(*A, _mm_set1_ps(-0.f));

   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1 = _mm_sub_ps(localBuffer[0], localBuffer[1]);
   localBuffer[0] = _mm_add_ps(localBuffer[0], localBuffer[1]);
   localBuffer[1] = v1;
}


/* Description: This routine performs an inverse FFT to real data.

*              This code is for floating point data.

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]

*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )

*        Input is in normal order, interleaved (real,imaginary) complex data

*        You must call GetFFT(fftlen) first to initialize some buffers!

*

* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

// SSE ("4x") inverse real FFT, performed in place.  buffer is reinterpreted as
// an array of __m128, so every arithmetic step acts on a 4-float vector with
// the same scalar twiddle broadcast to all lanes (presumably one independent
// signal per lane -- TODO confirm against callers).
// Twiddles come from sincos_ps four angles at a time; the butterfly stage
// derives its angles from a 24-bit byte-table bit reversal of the group counter.
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void InverseRealFFTf4xFastMathBR24(fft_type *buffer, FFTParam *h)
{
   __m128 *localBuffer = (__m128 *)buffer;

   __m128 *A, *B;
   __m128 *endptr1, *endptr2;
   int br1Index;
   __m128 HRplus, HRminus, HIplus, HIminus;
   __m128 v1, v2, sinW, cosW;
   fft_type iToRad = 2 * M_PI/(2 * h->Points);
   int bitReverseShiftM1 = 25 - h->pow2Bits;
   auto ButterfliesPerGroup = h->Points / 2;

   // Loop-invariant broadcast constants.
   const __m128 two = _mm_set1_ps(2.0f);
   const __m128 half = _mm_set1_ps(0.5f);

   // 24-bit reversal of a non-negative index, one table lookup per byte.
   // Shift/mask extraction keeps the result independent of host byte order.
   const auto reverse24 = [](int index) {
      return (sSmallRBTable[index & 0xFF] << 16)
           + (sSmallRBTable[(index >> 8) & 0xFF] << 8)
           + sSmallRBTable[(index >> 16) & 0xFF];
   };

   // Current batch of four sines/cosines.  Declared at function scope because
   // a batch is computed on one loop pass and read on the next three; a
   // loop-local declaration would be re-created (indeterminate) on every pass.
   v4sf sin4_2, cos4_2;

   /* Massage input to get the input for a real output sequence. */
   A = localBuffer + 2;
   B = localBuffer + h->Points * 2 - 2;
   br1Index = 1;
   int massageLane = 0; // lane of the current batch to use next
   while(A < B)
   {
      if(massageLane == 0)
      {
         v4sf vx;
         for(int i = 0; i < 4; i++)
            vx.m128_f32[i] = ((float)(br1Index + i)) * iToRad;
         sincos_ps(vx, &sin4_2, &cos4_2);
      }
      sinW = _mm_set1_ps(-sin4_2.m128_f32[massageLane]);
      cosW = _mm_set1_ps(-cos4_2.m128_f32[massageLane]);
      massageLane = (massageLane + 1) & 3;

      HRminus = _mm_sub_ps(A[0], B[0]);
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(B[0], two));
      HIminus = _mm_sub_ps(A[1], B[1]);
      HIplus = _mm_add_ps(HIminus, _mm_mul_ps(B[1], two));
      v1 = _mm_add_ps(_mm_mul_ps(sinW, HRminus), _mm_mul_ps(cosW, HIplus));
      v2 = _mm_sub_ps(_mm_mul_ps(cosW, HRminus), _mm_mul_ps(sinW, HIplus));
      A[0] = _mm_mul_ps(_mm_add_ps(HRplus, v1), half);
      B[0] = _mm_sub_ps(A[0], v1);
      A[1] = _mm_mul_ps(_mm_sub_ps(HIminus, v2), half);
      B[1] = _mm_sub_ps(A[1], HIminus);

      A += 2;
      B -= 2;
      br1Index++;
   }
   /* Handle center bin (just need conjugate) */
   // Negate by flipping the sign bit of every lane.
   A[1] = _mm_xor_ps(A[1], _mm_set1_ps(-0.f));

   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1 = _mm_mul_ps(half, _mm_add_ps(localBuffer[0], localBuffer[1]));
   v2 = _mm_mul_ps(half, _mm_sub_ps(localBuffer[0], localBuffer[1]));
   localBuffer[0] = v1;
   localBuffer[1] = v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = localBuffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = localBuffer + ButterfliesPerGroup * 2;
      int twiddleLane = 0;  // lane of the current batch to use next
      int iSinCosIndex = 0; // running group counter within this stage
      while(A < endptr1)
      {
         if(twiddleLane == 0)
         {
            v4sf vx;
            for(int i = 0; i < 4; i++)
               vx.m128_f32[i] = (reverse24(iSinCosIndex + i) >> bitReverseShiftM1) * iToRad;
            sincos_ps(vx, &sin4_2, &cos4_2);
         }
         sinW = _mm_set1_ps(-sin4_2.m128_f32[twiddleLane]);
         cosW = _mm_set1_ps(-cos4_2.m128_f32[twiddleLane]);
         twiddleLane = (twiddleLane + 1) & 3;
         iSinCosIndex++;

         endptr2 = B;
         while(A < endptr2)
         {
            v1 = _mm_sub_ps(_mm_mul_ps(B[0], cosW), _mm_mul_ps(B[1], sinW));
            v2 = _mm_add_ps(_mm_mul_ps(B[0], sinW), _mm_mul_ps(B[1], cosW));
            B[0] = _mm_mul_ps(_mm_add_ps(A[0], v1), half);
            A[0] = _mm_sub_ps(B[0], v1);
            B[1] = _mm_mul_ps(_mm_add_ps(A[1], v2), half);
            A[1] = _mm_sub_ps(B[1], v2);
            A += 2;
            B += 2;
         }
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }
}


// Splits the bit-reversed, interleaved contents of buffer into separate real
// and imaginary outputs in natural order.  All three pointers are
// reinterpreted as arrays of __m128 (one 4-float SSE vector per element).
// Bins 1 .. Points-1 are fetched via a 24-bit byte-table bit reversal shifted
// down by (24 - pow2Bits); bin 0 takes localBuffer[0] and bin Points takes
// localBuffer[1], both with a zero imaginary part.  Index Points is written,
// so RealOut and ImagOut must hold Points + 1 vectors.
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void ReorderToFreq4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   const __m128 *src = (const __m128 *)buffer;
   __m128 *realDst = (__m128 *)RealOut;
   __m128 *imagDst = (__m128 *)ImagOut;
   const int bitReverseShift = 24 - hFFT->pow2Bits;

   for(size_t i = 1; i < hFFT->Points; i++) {
      // Shift/mask byte extraction: independent of host byte order.
      const int byte0 = (int)(i & 0xFF);
      const int byte1 = (int)((i >> 8) & 0xFF);
      const int byte2 = (int)((i >> 16) & 0xFF);
      const int brValue =
         ((sSmallRBTable[byte0] << 16) + (sSmallRBTable[byte1] << 8) + sSmallRBTable[byte2])
            >> bitReverseShift;

      realDst[i] = src[brValue];
      imagDst[i] = src[brValue + 1];
   }

   realDst[0] = src[0]; // DC component
   imagDst[0] = _mm_set1_ps(0.0);
   realDst[hFFT->Points] = src[1]; // Fs/2 component
   imagDst[hFFT->Points] = _mm_set1_ps(0.0);
}


// Copies the bit-reversed contents of buffer into TimeOut in natural order.
// Both pointers are reinterpreted as arrays of __m128 (one 4-float SSE vector
// per element).  For each i in [0, Points) the vector pair at localBuffer[br],
// localBuffer[br+1] is written to TimeOut vectors 2*i and 2*i+1, where br is
// the 24-bit byte-table reversal of i shifted down by (24 - pow2Bits).
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void ReorderToTime4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   const __m128 *src = (const __m128 *)buffer;
   __m128 *dst = (__m128 *)TimeOut;
   const int bitReverseShift = 24 - hFFT->pow2Bits;

   for(size_t i = 0; i < hFFT->Points; i++) {
      // Shift/mask byte extraction: independent of host byte order.
      const int byte0 = (int)(i & 0xFF);
      const int byte1 = (int)((i >> 8) & 0xFF);
      const int byte2 = (int)((i >> 16) & 0xFF);
      const int brValue =
         ((sSmallRBTable[byte0] << 16) + (sSmallRBTable[byte1] << 8) + sSmallRBTable[byte2])
            >> bitReverseShift;

      dst[i * 2] = src[brValue];
      dst[i * 2 + 1] = src[brValue + 1];
   }
}


#endif


#define FAST_MATH_BR16

#ifdef FAST_MATH_BR16


/*

*  Forward FFT routine.  Must call GetFFT(fftlen) first!

*

*  Note: Output is BIT-REVERSED! so you must use the BitReversed to

*        get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]

*                                  Imag_i = buffer[ h->BitReversed[i]+1 ] )

*        Input is in normal order.

*

* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin

* - this can be done because both values will always be real only

* - this allows us to not have to allocate an extra complex value for the Fs/2 bin

*

*  Note: The scaling on this is done according to the standard FFT definition,

*        so a unit amplitude DC signal will output an amplitude of (N)

*        (Older revisions would progressively scale the input, so the output

*        values would be similar in amplitude to the input values, which is

*        good when using fixed point arithmetic)

*/

// Scalar ("1x") forward real FFT, performed in place on buffer, which is
// treated as h->Points interleaved (real, imaginary) pairs.
// Twiddle factors are not read from a table: sincos_ps evaluates four angles
// per call and the results are consumed one lane per butterfly group / bin.
// Bit-reversed indices are built from two sSmallRBTable byte lookups (16 bits)
// and shifted down according to h->pow2Bits.
// sSmallRBTable is presumably a 256-entry byte bit-reversal table -- TODO confirm.
void RealFFTf1xFastMathBR16(fft_type *buffer, FFTParam *h)
{
   fft_type *A, *B;
   fft_type *endptr1, *endptr2;
   int br1Index, br2Index;
   int br1Value, br2Value;
   fft_type HRplus, HRminus, HIplus, HIminus;
   fft_type v1, v2, sinW, cosW;
   fft_type iToRad = 2 * M_PI / (2 * h->Points);
   int bitReverseShiftM1 = 17 - h->pow2Bits;
   int bitReverseShift = bitReverseShiftM1 - 1;
   auto ButterfliesPerGroup = h->Points / 2;

   // 16-bit reversal of a non-negative index, one table lookup per byte.
   // Bytes are extracted with shifts/masks so the result does not depend on
   // the host byte order.
   const auto reverse16 = [](int index) {
      return (sSmallRBTable[index & 0xFF] << 8)
           + sSmallRBTable[(index >> 8) & 0xFF];
   };

   // The current batch of four sines/cosines.  These must outlive a single
   // loop iteration: a batch is computed on every fourth pass and read on the
   // three passes that follow.  They are always written by sincos_ps before
   // the first read because the lane counters start at 0.
   v4sf sin4_2, cos4_2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = buffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      int sinCosCalIndex = 0; // lane of the current batch to use next
      int iSinCosIndex = 0;   // running group counter within this stage
      while(A < endptr1)
      {
         if(sinCosCalIndex == 0)
         {
            v4sf vx;
            for(int i = 0; i < 4; i++)
               vx.m128_f32[i] = (reverse16(iSinCosIndex + i) >> bitReverseShiftM1) * iToRad;
            sincos_ps(vx, &sin4_2, &cos4_2);
         }
         sinW = -sin4_2.m128_f32[sinCosCalIndex];
         cosW = -cos4_2.m128_f32[sinCosCalIndex];
         sinCosCalIndex = (sinCosCalIndex + 1) & 3;
         iSinCosIndex++;

         endptr2 = B;
         while(A < endptr2)
         {
            v1 = B[0]*cosW + B[1]*sinW;
            v2 = B[0]*sinW - B[1]*cosW;
            B[0] = (A[0] + v1);
            A[0] = B[0] - 2*v1;
            B[1] = (A[1] - v2);
            A[1] = B[1] + 2*v2;
            A += 2;
            B += 2;
         }
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }

   /* Massage output to get the output for a real input sequence. */

   br1Index = 1;
   br2Index = h->Points - 1;

   int sinCosCalIndex = 0;
   while(br1Index < br2Index)
   {
      br1Value = reverse16(br1Index) >> bitReverseShift;
      br2Value = reverse16(br2Index) >> bitReverseShift;

      if(sinCosCalIndex == 0)
      {
         v4sf vx;
         for(int i = 0; i < 4; i++)
            vx.m128_f32[i] = ((float)(br1Index + i)) * iToRad;
         sincos_ps(vx, &sin4_2, &cos4_2);
      }
      sinW = -sin4_2.m128_f32[sinCosCalIndex];
      cosW = -cos4_2.m128_f32[sinCosCalIndex];
      sinCosCalIndex = (sinCosCalIndex + 1) & 3;

      A = &buffer[br1Value];
      B = &buffer[br2Value];
      HRminus = A[0] - B[0];
      HRplus = HRminus + (B[0] * 2);
      HIminus = A[1] - B[1];
      HIplus = HIminus + (B[1] * 2);
      v1 = (sinW*HRminus - cosW*HIplus);
      v2 = (cosW*HRminus + sinW*HIplus);
      A[0] = (HRplus + v1) * (fft_type)0.5;
      B[0] = A[0] - v1;
      A[1] = (HIminus + v2) * (fft_type)0.5;
      B[1] = A[1] - HIminus;

      br1Index++;
      br2Index--;
   }

   /* Handle the center bin (just need a conjugate) */
   // The pointer was previously computed but never used, so the conjugate was
   // silently skipped; the SSE variant RealFFTf4xFastMathBR24 negates the
   // corresponding element.
   A = &buffer[(reverse16(br1Index) >> bitReverseShift) + 1];
   *A = -*A;

   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1 = buffer[0] - buffer[1];
   buffer[0] += buffer[1];
   buffer[1] = v1;
}


/* Description: This routine performs an inverse FFT to real data.
*              This code is for floating point data.
*
*  "FastMath BR16" variant: the sine/cosine factors are computed on the fly,
*  four angles per sincos_ps() call, and the bit-reversed indices are built
*  from two byte lookups in sSmallRBTable (presumably a 256-entry byte
*  bit-reversal table - confirm against its definition).
*
*  buffer  in/out, transformed in place; 2 * h->Points values are touched.
*  h       supplies Points and pow2Bits; nothing in h is modified.
*
*  Note: Output is BIT-REVERSED! so you must undo the bit reversal to
*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
*        ReorderToTime1xFastMathBR16 performs a copy of that shape.
*        Input is in normal order, interleaved (real,imaginary) complex data
*        You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
*  Note: The scaling on this is done according to the standard FFT definition,
*        so a unit amplitude DC signal will output an amplitude of (N)
*        (Older revisions would progressively scale the input, so the output
*        values would be similar in amplitude to the input values, which is
*        good when using fixed point arithmetic)
*  NOTE(review): the scaling note above is shared with the forward routine,
*        but every butterfly below multiplies by 0.5 - confirm which
*        normalisation callers expect.
*/
void InverseRealFFTf1xFastMathBR16(fft_type *buffer, FFTParam *h)
{
   fft_type *A,*B;
   fft_type *endptr1,*endptr2;
   fft_type HRplus,HRminus,HIplus,HIminus;
   fft_type v1,v2,sin,cos;
   fft_type iToRad=2 * M_PI / (2 * h->Points);
   int bitReverseShiftM1=17-h->pow2Bits;

   auto ButterfliesPerGroup = h->Points / 2;

   // 16-bit lookup through the byte table, then drop the unused low bits.
   // Mask/shift is used instead of reading the int through an unsigned char
   // pointer so the result does not depend on host byte order.
   auto reverse16 = [](int value, int shift) {
      return ((sSmallRBTable[value & 0xFF] << 8)
         + sSmallRBTable[(value >> 8) & 0xFF]) >> shift;
   };

   // One batch of four sines/cosines.  These must outlive a single loop
   // iteration: a batch is filled when sinCosCalIndex is 0 and consumed over
   // the following three iterations.  Declaring them inside the loop body
   // (as before) read a freshly created, indeterminate object on those
   // iterations.
   v4sf sin4_2, cos4_2;

   /* Massage input to get the input for a real output sequence. */
   A = buffer + 2;
   B = buffer + h->Points * 2 - 2;
   int br1Index = 1; //h->BitReversed + 1;
   int sinCosCalIndex = 0;
   while(A < B)
   {
      if(sinCosCalIndex == 0)
      {
         // Refill the batch with the angles for this bin and the next three
         v4sf vx;
         for(int i = 0; i < 4; i++)
            vx.m128_f32[i] = ((float)(br1Index + i)) * iToRad;
         sincos_ps(vx, &sin4_2, &cos4_2);
      }
      sin = -sin4_2.m128_f32[sinCosCalIndex];
      cos = -cos4_2.m128_f32[sinCosCalIndex];
      sinCosCalIndex = (sinCosCalIndex + 1) & 3;   // cycles 0,1,2,3,0,...

      HRplus = (HRminus = *A     - *B    ) + (*B     * 2);
      HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
      v1 = (sin*HRminus + cos*HIplus);
      v2 = (cos*HRminus - sin*HIplus);
      *A = (HRplus  + v1) * (fft_type)0.5;
      *B = *A - v1;
      *(A+1) = (HIminus - v2) * (fft_type)0.5;
      *(B+1) = *(A+1) - HIminus;

      A = &A[2];
      B = &B[-2];
      br1Index++;
   }

   /* Handle center bin (just need conjugate) */
   *(A+1) = -*(A+1);

   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1 = 0.5f*(buffer[0]+buffer[1]);
   v2 = 0.5f*(buffer[0]-buffer[1]);
   buffer[0] = v1;
   buffer[1] = v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = buffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = buffer;
      B = buffer + ButterfliesPerGroup * 2;
      sinCosCalIndex = 0;      // every pass starts with a fresh batch
      int iSinCosIndex = 0;
      while(A < endptr1)
      {
         if(sinCosCalIndex == 0)
         {
            // Angles for this group and the next three, in bit-reversed order
            v4sf vx;
            for(int i = 0; i < 4; i++)
               vx.m128_f32[i] = reverse16(iSinCosIndex + i, bitReverseShiftM1) * iToRad;
//               vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
            sincos_ps(vx, &sin4_2, &cos4_2);
         }
         sin = -sin4_2.m128_f32[sinCosCalIndex];
         cos = -cos4_2.m128_f32[sinCosCalIndex];
         sinCosCalIndex = (sinCosCalIndex + 1) & 3;
         iSinCosIndex++;

         endptr2 = B;
         while(A < endptr2)
         {
            v1=*B*cos - *(B+1)*sin;
            v2=*B*sin + *(B+1)*cos;
            *B=(*A+v1)*(fft_type)0.5;
            *(A++)=*(B++)-v1;
            *B=(*A+v2)*(fft_type)0.5;
            *(A++)=*(B++)-v2;
         }
         A = B;
         B += ButterfliesPerGroup * 2;
      }
      ButterfliesPerGroup >>= 1;
   }
}


/* Copies the bit-reversed, interleaved (real,imaginary) contents of buffer
*  into separate real and imaginary arrays in natural bin order.
*
*  hFFT     supplies Points and pow2Bits.
*  buffer   source data; read only.
*  RealOut  receives hFFT->Points + 1 values (indices 0..Points).
*  ImagOut  receives hFFT->Points + 1 values (indices 0..Points).
*
*  buffer[0] is stored as the DC bin and buffer[1] as the Fs/2 bin; both get
*  an imaginary part of 0.
*/
void ReorderToFreq1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   const int bitReverseShift = 16 - hFFT->pow2Bits;

   // Copy the data into the real and imaginary outputs
   for(size_t i = 1; i < hFFT->Points; i++) {
      // Low and second-lowest byte of i are taken with mask/shift rather than
      // by reading the size_t through an unsigned char pointer, so the lookup
      // does not depend on host byte order.
//      brValue=(*SmallVRB[hFFT->pow2Bits])(i);
      const int brValue =
         ((sSmallRBTable[i & 0xFF] << 8) + sSmallRBTable[(i >> 8) & 0xFF]) >> bitReverseShift;
      RealOut[i] = buffer[brValue  ];
      ImagOut[i] = buffer[brValue+1];
   }

   RealOut[0] = buffer[0]; // DC component
   ImagOut[0] = 0;
   RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
   ImagOut[hFFT->Points] = 0;
}


/* Copies the bit-reversed contents of buffer into TimeOut in natural order.
*
*  hFFT     supplies Points and pow2Bits.
*  buffer   source data; read only.
*  TimeOut  receives 2 * hFFT->Points values: each pair
*           buffer[br], buffer[br+1] goes to TimeOut[2*i], TimeOut[2*i+1].
*/
void ReorderToTime1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   const int bitReverseShift = 16 - hFFT->pow2Bits;

   // Copy the data into the real outputs
   for(size_t i = 0; i < hFFT->Points; i++) {
      // Mask/shift instead of reading the size_t through an unsigned char
      // pointer, so the lookup does not depend on host byte order.
//      brValue=(*SmallVRB[hFFT->pow2Bits])(i);
      const int brValue =
         ((sSmallRBTable[i & 0xFF] << 8) + sSmallRBTable[(i >> 8) & 0xFF]) >> bitReverseShift;
      TimeOut[i*2  ] = buffer[brValue  ];
      TimeOut[i*2+1] = buffer[brValue+1];
   }
}


/* Forward real FFT, "4x FastMath BR16" variant.
*
*  buffer is reinterpreted as an array of __m128 and every arithmetic step is
*  a lane-wise SSE operation with the sine/cosine broadcast to all lanes, so
*  the four float lanes of each __m128 are transformed independently.
*  2 * h->Points __m128 elements are processed in place.
*  NOTE(review): buffer is dereferenced as __m128, so it presumably has to be
*  16-byte aligned - confirm against callers.
*
*  h supplies Points and pow2Bits; nothing in h is modified.
*
*  Sines/cosines are computed on the fly, four angles per sincos_ps() call;
*  bit-reversed indices are built from two byte lookups in sSmallRBTable.
*
*  On return localBuffer[0] holds the DC bin and localBuffer[1] the Fs/2 bin
*  (stored in the imaginary slot of the DC bin).
*/
void RealFFTf4xFastMathBR16(fft_type *buffer, FFTParam *h)
{
   __m128 *localBuffer = (__m128 *)buffer;

   __m128 *A,*B;
   __m128 *endptr1, *endptr2;
   __m128 HRplus, HRminus, HIplus, HIminus;
   __m128 v1, v2, sin, cos;
   fft_type iToRad = 2 * M_PI/(2 * h->Points);
   auto ButterfliesPerGroup = h->Points / 2;
   int bitReverseShiftM1 = 17 - h->pow2Bits;
   int bitReverseShift = bitReverseShiftM1 - 1;

   // Loop-invariant constants, built once instead of on every iteration
   const __m128 two = _mm_set1_ps(2.0f);
   const __m128 half = _mm_set1_ps(0.5f);

   // 16-bit lookup through the byte table, then drop the unused low bits.
   // Mask/shift is used instead of reading the int through an unsigned char
   // pointer so the result does not depend on host byte order.
   auto reverse16 = [](int value, int shift) {
      return ((sSmallRBTable[value & 0xFF] << 8)
         + sSmallRBTable[(value >> 8) & 0xFF]) >> shift;
   };

   // One batch of four sines/cosines.  These must outlive a single loop
   // iteration: a batch is filled when sinCosCalIndex is 0 and consumed over
   // the following three iterations.  Declaring them inside the loop body
   // (as before) read a freshly created, indeterminate object on those
   // iterations.
   v4sf sin4_2, cos4_2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = &localBuffer[h->Points * 2];

   int sinCosCalIndex = 0;
   while(ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = &localBuffer[ButterfliesPerGroup * 2];
      sinCosCalIndex = 0;      // every pass starts with a fresh batch
      int iSinCosIndex = 0;
      while(A < endptr1)
      {
         if(sinCosCalIndex == 0)
         {
            // Angles for this group and the next three, in bit-reversed order
            v4sf vx;
            for(int i = 0; i < 4; i++)
               vx.m128_f32[i] = reverse16(iSinCosIndex + i, bitReverseShiftM1) * iToRad;
//               vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
            sincos_ps(vx, &sin4_2, &cos4_2);
         }
         sin = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
         cos = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
         sinCosCalIndex = (sinCosCalIndex + 1) & 3;   // cycles 0,1,2,3,0,...
         iSinCosIndex++;

         endptr2 = B;
         while(A < endptr2)
         {
            v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
            v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
            *B = _mm_add_ps( *A, v1);
            *(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(two, v1));
            *B = _mm_sub_ps(*A, v2);
            *(A++) = _mm_add_ps(*(B++), _mm_mul_ps(two, v2));
         }
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }

   /* Massage output to get the output for a real input sequence. */

   int br1Index = 1; // h->BitReversed + 1;
   int br2Index = h->Points - 1;   //h->BitReversed + h->Points - 1;

   sinCosCalIndex = 0;
   while(br1Index < br2Index)
   {
      const int br1Value = reverse16(br1Index, bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
      const int br2Value = reverse16(br2Index, bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);

      if(sinCosCalIndex == 0)
      {
         // Refill the batch with the angles for this bin and the next three
         v4sf vx;
         for(int i = 0; i < 4; i++)
            vx.m128_f32[i] = ((float)(br1Index+i)) * iToRad;
         sincos_ps(vx, &sin4_2, &cos4_2);
      }
      sin = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
      cos = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
      sinCosCalIndex = (sinCosCalIndex + 1) & 3;

      A = &localBuffer[br1Value];
      B = &localBuffer[br2Value];

      HRminus = _mm_sub_ps( *A, *B );
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, two));
      HIminus = _mm_sub_ps(*(A+1), *(B+1) );
      HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), two));
      v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), half);
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), half);
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      br1Index++;
      br2Index--;
   }

   /* Handle the center bin (just need a conjugate) */
//   A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
   A = &localBuffer[reverse16(br1Index, bitReverseShift) + 1];

   // negate sse style: flipping the sign bit of every lane
   *A = _mm_xor_ps(*A, _mm_set1_ps(-0.f));

   /* Handle DC and Fs/2 bins separately */
   /* Put the Fs/2 value into the imaginary part of the DC bin */
   v1 = _mm_sub_ps(localBuffer[0], localBuffer[1]);
   localBuffer[0] = _mm_add_ps(localBuffer[0], localBuffer[1]);
   localBuffer[1] = v1;
}


/* Description: This routine performs an inverse FFT to real data.
*              This code is for floating point data.
*
*  "4x FastMath BR16" variant: buffer is reinterpreted as an array of __m128
*  and every arithmetic step is a lane-wise SSE operation with the
*  sine/cosine broadcast to all lanes, so the four float lanes of each __m128
*  are transformed independently.  2 * h->Points __m128 elements are
*  processed in place.
*  NOTE(review): buffer is dereferenced as __m128, so it presumably has to be
*  16-byte aligned - confirm against callers.
*
*  h supplies Points and pow2Bits; nothing in h is modified.
*
*  Note: Output is BIT-REVERSED! so you must undo the bit reversal to
*        get legible output, (i.e. wave[2*i]   = buffer[ BitReversed[i] ]
*                                  wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
*        ReorderToTime4xFastMathBR16 performs a copy of that shape.
*        Input is in normal order, interleaved (real,imaginary) complex data
*        You must call GetFFT(fftlen) first to initialize some buffers!
*
* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
* - this can be done because both values will always be real only
* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
*
*  Note: The scaling on this is done according to the standard FFT definition,
*        so a unit amplitude DC signal will output an amplitude of (N)
*        (Older revisions would progressively scale the input, so the output
*        values would be similar in amplitude to the input values, which is
*        good when using fixed point arithmetic)
*  NOTE(review): the scaling note above is shared with the forward routine,
*        but every butterfly below multiplies by 0.5 - confirm which
*        normalisation callers expect.
*/
void InverseRealFFTf4xFastMathBR16(fft_type *buffer, FFTParam *h)
{
   __m128 *localBuffer=(__m128 *)buffer;

   __m128 *A, *B;
   __m128 *endptr1, *endptr2;
   __m128 HRplus, HRminus, HIplus, HIminus;
   __m128 v1, v2, sin, cos;
   fft_type iToRad = 2 * M_PI/(2 * h->Points);
   int bitReverseShiftM1 = 17 - h->pow2Bits;
   auto ButterfliesPerGroup = h->Points / 2;

   // Loop-invariant constants, built once instead of on every iteration
   const __m128 two = _mm_set1_ps(2.0f);
   const __m128 half = _mm_set1_ps(0.5f);

   // 16-bit lookup through the byte table, then drop the unused low bits.
   // Mask/shift is used instead of reading the int through an unsigned char
   // pointer so the result does not depend on host byte order.
   auto reverse16 = [](int value, int shift) {
      return ((sSmallRBTable[value & 0xFF] << 8)
         + sSmallRBTable[(value >> 8) & 0xFF]) >> shift;
   };

   // One batch of four sines/cosines.  These must outlive a single loop
   // iteration: a batch is filled when sinCosCalIndex is 0 and consumed over
   // the following three iterations.  Declaring them inside the loop body
   // (as before) read a freshly created, indeterminate object on those
   // iterations.
   v4sf sin4_2, cos4_2;

   /* Massage input to get the input for a real output sequence. */
   A = localBuffer + 2;
   B = localBuffer + h->Points * 2 - 2;
   int br1Index = 1; //h->BitReversed+1;
   int sinCosCalIndex = 0;
   while(A < B)
   {
      if(sinCosCalIndex == 0)
      {
         // Refill the batch with the angles for this bin and the next three
         v4sf vx;
         for(int i = 0; i < 4; i++)
            vx.m128_f32[i] = ((float)(br1Index+i))*iToRad;
         sincos_ps(vx, &sin4_2, &cos4_2);
      }
      sin = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
      cos = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
      sinCosCalIndex = (sinCosCalIndex + 1) & 3;   // cycles 0,1,2,3,0,...

      HRminus = _mm_sub_ps(*A,  *B);
      HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, two));
      HIminus = _mm_sub_ps( *(A+1), *(B+1));
      HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), two));
      v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
      v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
      *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), half);
      *B = _mm_sub_ps(*A, v1);
      *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2), half);
      *(B+1) = _mm_sub_ps(*(A+1), HIminus);

      A = &A[2];
      B = &B[-2];
      br1Index++;
   }

   /* Handle center bin (just need conjugate) */
   // negate sse style: flipping the sign bit of every lane
   *(A+1) = _mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));

   /* Handle DC bin separately - this ignores any Fs/2 component
   buffer[1]=buffer[0]=buffer[0]/2;*/
   /* Handle DC and Fs/2 bins specially */
   /* The DC bin is passed in as the real part of the DC complex value */
   /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
   /* (v1+v2) = buffer[0] == the DC component */
   /* (v1-v2) = buffer[1] == the Fs/2 component */
   v1 = _mm_mul_ps(half, _mm_add_ps(localBuffer[0], localBuffer[1]));
   v2 = _mm_mul_ps(half, _mm_sub_ps(localBuffer[0], localBuffer[1]));
   localBuffer[0] = v1;
   localBuffer[1] = v2;

   /*
   *  Butterfly:
   *     Ain-----Aout
   *         \ /
   *         / \
   *     Bin-----Bout
   */

   endptr1 = localBuffer + h->Points * 2;

   while(ButterfliesPerGroup > 0)
   {
      A = localBuffer;
      B = localBuffer + ButterfliesPerGroup * 2;
      sinCosCalIndex = 0;      // every pass starts with a fresh batch
      int iSinCosIndex = 0;
      while(A < endptr1)
      {
         if(sinCosCalIndex == 0)
         {
            // Angles for this group and the next three, in bit-reversed order
            v4sf vx;
            for(int i = 0; i < 4; i++)
               vx.m128_f32[i] = reverse16(iSinCosIndex + i, bitReverseShiftM1) * iToRad;
//               vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
            sincos_ps(vx, &sin4_2, &cos4_2);
         }
         sin = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
         cos = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
         sinCosCalIndex = (sinCosCalIndex + 1) & 3;
         iSinCosIndex++;

         endptr2 = B;
         while(A < endptr2)
         {
            v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
            v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
            *B = _mm_mul_ps( _mm_add_ps(*A, v1), half);
            *(A++) = _mm_sub_ps(*(B++), v1);
            *B = _mm_mul_ps(_mm_add_ps(*A, v2), half);
            *(A++) = _mm_sub_ps(*(B++), v2);
         }
         A = B;
         B = &B[ButterfliesPerGroup * 2];
      }
      ButterfliesPerGroup >>= 1;
   }
}


/* Copies the bit-reversed, interleaved (real,imaginary) contents of buffer
*  into separate real and imaginary arrays in natural bin order, one __m128
*  (four floats) per element.
*
*  hFFT     supplies Points and pow2Bits.
*  buffer   source data, accessed as __m128; read only.
*  RealOut  receives hFFT->Points + 1 __m128 elements (indices 0..Points).
*  ImagOut  receives hFFT->Points + 1 __m128 elements (indices 0..Points).
*
*  Element 0 of buffer is stored as the DC bin and element 1 as the Fs/2 bin;
*  both get an imaginary part of 0.
*  NOTE(review): all three pointers are dereferenced as __m128, so they
*  presumably have to be 16-byte aligned - confirm against callers.
*/
void ReorderToFreq4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
{
   __m128 *localBuffer=(__m128 *)buffer;
   __m128 *localRealOut=(__m128 *)RealOut;
   __m128 *localImagOut=(__m128 *)ImagOut;
   const int bitReverseShift = 16 - hFFT->pow2Bits;

   // Copy the data into the real and imaginary outputs
   for(size_t i = 1; i < hFFT->Points; i++) {
      // Mask/shift instead of reading the size_t through an unsigned char
      // pointer, so the lookup does not depend on host byte order.
//      brValue=(*SmallVRB[hFFT->pow2Bits])(i);
      const int brValue =
         ((sSmallRBTable[i & 0xFF] << 8) + sSmallRBTable[(i >> 8) & 0xFF]) >> bitReverseShift;
      localRealOut[i] = localBuffer[brValue  ];
      localImagOut[i] = localBuffer[brValue+1];
   }

   localRealOut[0] = localBuffer[0]; // DC component
   localImagOut[0] = _mm_set1_ps(0.0);
   localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
   localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
}


/* Copies the bit-reversed contents of buffer into TimeOut in natural order,
*  one __m128 (four floats) per element.
*
*  hFFT     supplies Points and pow2Bits.
*  buffer   source data, accessed as __m128; read only.
*  TimeOut  receives 2 * hFFT->Points __m128 elements: each pair
*           localBuffer[br], localBuffer[br+1] goes to elements 2*i, 2*i+1.
*  NOTE(review): both pointers are dereferenced as __m128, so they presumably
*  have to be 16-byte aligned - confirm against callers.
*/
void ReorderToTime4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
{
   __m128 *localBuffer=(__m128 *)buffer;
   __m128 *localTimeOut=(__m128 *)TimeOut;
   const int bitReverseShift = 16 - hFFT->pow2Bits;

   // Copy the data into the real outputs
   for(size_t i = 0; i < hFFT->Points; i++) {
      // Mask/shift instead of reading the size_t through an unsigned char
      // pointer, so the lookup does not depend on host byte order.
//      brValue=(*SmallVRB[hFFT->pow2Bits])(i);
      const int brValue =
         ((sSmallRBTable[i & 0xFF] << 8) + sSmallRBTable[(i >> 8) & 0xFF]) >> bitReverseShift;
      localTimeOut[i*2  ] = localBuffer[brValue  ];
      localTimeOut[i*2+1] = localBuffer[brValue+1];
   }
}


#endif

#endif

M_PI
#define M_PI
Definition: Distortion.cpp:30

RealFFTf48x.h

RealFFTf4x
void RealFFTf4x(fft_type *, FFTParam *, int functionType=-1)

fft_type
float fft_type
Definition: RealFFTf48x.h:6

ReorderToFreq1x
void ReorderToFreq1x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType=-1)

InverseRealFFTf4xFastMathBR16
void InverseRealFFTf4xFastMathBR16(fft_type *, FFTParam *)

InverseRealFFTf4xSinCosTableVBR16
void InverseRealFFTf4xSinCosTableVBR16(fft_type *, FFTParam *)

RealFFTf4xFastMathBR16
void RealFFTf4xFastMathBR16(fft_type *, FFTParam *)

ReorderToTime1xSinCosTableVBR16
void ReorderToTime1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

ReorderToFreq4xFastMathBR16
void ReorderToFreq4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

RealFFTf4xSinCosTableVBR16
void RealFFTf4xSinCosTableVBR16(fft_type *, FFTParam *)

ReorderToTime1xSinCosTableBR16
void ReorderToTime1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

ReorderToTime1xSinCosBRTable
void ReorderToTime1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

ReorderToFreq1xSinCosTableVBR16
void ReorderToFreq1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

RealFFTf1xSinCosTableBR16
void RealFFTf1xSinCosTableBR16(fft_type *, FFTParam *)

ReorderToTime4xFastMathBR16
void ReorderToTime4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

RealFFTf1xSinCosBRTable
void RealFFTf1xSinCosBRTable(fft_type *, FFTParam *)

ReorderToFreq4xFastMathBR24
void ReorderToFreq4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

ReorderToFreq4xSinCosTableVBR16
void ReorderToFreq4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

InverseRealFFTf1xSinCosTableVBR16
void InverseRealFFTf1xSinCosTableVBR16(fft_type *, FFTParam *)

TableUsage
void TableUsage(int iMask)

RealFFTf4xFastMathBR24
void RealFFTf4xFastMathBR24(fft_type *, FFTParam *)

ReorderToFreq4xSinCosBRTable
void ReorderToFreq4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

ReorderToFreq1xFastMathBR16
void ReorderToFreq1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

ReorderToFreq1xSinCosTableBR16
void ReorderToFreq1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

RealFFTf1x
void RealFFTf1x(fft_type *, FFTParam *, int functionType=-1)

InverseRealFFTf1xFastMathBR16
void InverseRealFFTf1xFastMathBR16(fft_type *, FFTParam *)

ReorderToFreq1xFastMathBR24
void ReorderToFreq1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

ReorderToTime4x
void ReorderToTime4x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType=-1)

ReorderToTime4xSinCosBRTable
void ReorderToTime4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

InverseRealFFTf1xSinCosTableBR16
void InverseRealFFTf1xSinCosTableBR16(fft_type *, FFTParam *)

InverseRealFFTf4xSinCosBRTable
void InverseRealFFTf4xSinCosBRTable(fft_type *, FFTParam *)

InverseRealFFTf1xFastMathBR24
void InverseRealFFTf1xFastMathBR24(fft_type *, FFTParam *)

ReorderToTime1xFastMathBR24
void ReorderToTime1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

ReorderToFreq4xSinCosTableBR16
void ReorderToFreq4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

RealFFTf4xSinCosTableBR16
void RealFFTf4xSinCosTableBR16(fft_type *, FFTParam *)

ReorderToTime4xFastMathBR24
void ReorderToTime4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

ReorderToTime1x
void ReorderToTime1x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType=-1)

InverseRealFFTf4x
void InverseRealFFTf4x(fft_type *, FFTParam *, int functionType=-1)

RealFFTf1xFastMathBR16
void RealFFTf1xFastMathBR16(fft_type *, FFTParam *)

InverseRealFFTf1x
void InverseRealFFTf1x(fft_type *, FFTParam *, int functionType=-1)

InverseRealFFTf1xSinCosBRTable
void InverseRealFFTf1xSinCosBRTable(fft_type *, FFTParam *)

ReorderToTime1xFastMathBR16
void ReorderToTime1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

RealFFTf1xSinCosTableVBR16
void RealFFTf1xSinCosTableVBR16(fft_type *, FFTParam *)

InverseRealFFTf4xSinCosTableBR16
void InverseRealFFTf4xSinCosTableBR16(fft_type *, FFTParam *)

SmallVRB
int(* SmallVRB[])(int bits)

ReorderToFreq4x
void ReorderToFreq4x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType=-1)

RealFFTf1xFastMathBR24
void RealFFTf1xFastMathBR24(fft_type *, FFTParam *)

InverseRealFFTf4xFastMathBR24
void InverseRealFFTf4xFastMathBR24(fft_type *, FFTParam *)

RealFFTf4xSinCosBRTable
void RealFFTf4xSinCosBRTable(fft_type *, FFTParam *)

SmallRB
int SmallRB(int bits, int numberBits)

ReorderToTime4xSinCosTableVBR16
void ReorderToTime4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

ReorderToTime4xSinCosTableBR16
void ReorderToTime4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)

ReorderToFreq1xSinCosBRTable
void ReorderToFreq1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)

FFT_SinCosBRTable
@ FFT_SinCosBRTable
Definition: RealFFTf48x.h:11

FFT_SinCosTableBR16
@ FFT_SinCosTableBR16
Definition: RealFFTf48x.h:13

FFT_FastMathBR16
@ FFT_FastMathBR16
Definition: RealFFTf48x.h:14

FFT_SinCosTableVBR16
@ FFT_SinCosTableVBR16
Definition: RealFFTf48x.h:12

FFT_FastMathBR24
@ FFT_FastMathBR24
Definition: RealFFTf48x.h:15

RealFFTf.h

SseMathFuncs.h
SSE maths functions (for FFTs)

sincos_ps
void sincos_ps(v4sf x, v4sf *s, v4sf *c)
Definition: SseMathFuncs.h:631

v4sf
__m128 v4sf
Definition: SseMathFuncs.h:108

A
#define A(N)
Definition: ToChars.cpp:62

SinCosTable
Definition: RealFFTf48x.h:91

SinCosTable::mSinCosTable
ArrayOf< SinCosStruct > mSinCosTable
Definition: RealFFTf48x.h:94

SinCosTable::SinCosTable
SinCosTable()

SinCosTable::mSinCosTablePow
int mSinCosTablePow
Definition: RealFFTf48x.h:93

FFTParam
Definition: RealFFTf.h:7

FFTParam::Points
size_t Points
Definition: RealFFTf.h:10

FFTParam::BitReversed
ArrayOf< int > BitReversed
Definition: RealFFTf.h:8

FFTParam::SinTable
ArrayOf< fft_type > SinTable
Definition: RealFFTf.h:9