Audacity 3.2.0
RealFFTf48x.cpp
Go to the documentation of this file.
1/**********************************************************************
2
3 Audacity: A Digital Audio Editor
4
5 RealFFT48x.cpp
6
7 Philip Van Baren
8 Andrew Hallendorff (SSE Mods)
9
10*******************************************************************//****************************************************************/
16
17/*
18* Program: REALFFTF.C
19* Author: Philip Van Baren
20* Date: 2 September 1993
21*
22* Description: These routines perform an FFT on real data to get a conjugate-symmetric
23* output, and an inverse FFT on conjugate-symmetric input to get a real
24* output sequence.
25*
26* This code is for floating point data.
27*
28* Modified 8/19/1998 by Philip Van Baren
29* - made the InitializeFFT and EndFFT routines take a structure
30* holding the length and pointers to the BitReversed and SinTable
31* tables.
32* Modified 5/23/2009 by Philip Van Baren
33* - Added GetFFT and ReleaseFFT routines to retain common SinTable
34* and BitReversed tables so they don't need to be reallocated
35* and recomputed on every call.
36* - Added Reorder* functions to undo the bit-reversal
37* Modified 15 April 2016 Paul Licameli
38* - C++11 smart pointers
39*
40* Copyright (C) 2009 Philip VanBaren
41*
42* This program is free software; you can redistribute it and/or modify
43* it under the terms of the GNU General Public License as published by
44* the Free Software Foundation; either version 2 of the License, or
45* (at your option) any later version.
46*
47* This program is distributed in the hope that it will be useful,
48* but WITHOUT ANY WARRANTY; without even the implied warranty of
49* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
50* GNU General Public License for more details.
51*
52* You should have received a copy of the GNU General Public License
53* along with this program; if not, write to the Free Software
54* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
55*/
56
57
58#include "RealFFTf48x.h"
59
60
61
62#ifdef EXPERIMENTAL_EQ_SSE_THREADED
63
64// at the moment these two are mutually exclusive
65//#define USE_SSEMATHFUNC
66//#define TEST_COSSINBIT_TABLES
67
68#ifndef USE_SSE2
69#define USE_SSE2
70#endif
71
72#include <stdlib.h>
73#include <math.h>
74#include "RealFFTf.h"
75#ifdef __WXMSW__
76#pragma warning(disable:4305)
77#else
78
79#endif
80#include "SseMathFuncs.h"
81#include <intrin.h>
82
83#ifndef M_PI
84#define M_PI 3.14159265358979323846 /* pi */
85#endif
86
87int tableMask=0;
88bool useBitReverseTable=false;
89bool useSinCosTable=false;
90
91void TableUsage(int iMask)
92{
93 tableMask=iMask;
94 useBitReverseTable=(iMask&1)!=0;
95 useSinCosTable=(iMask&2)!=0;
96}
97
99mSinCosTablePow(13)
100{
101 size_t tableSize=1<<mSinCosTablePow;
102 mSinCosTable.reinit(tableSize);
103 for(size_t i=0;i<tableSize;i++) {
104 mSinCosTable[i].mSin=(float)-sin(((float)i)*M_PI/tableSize);
105 mSinCosTable[i].mCos=(float)-cos(((float)i)*M_PI/tableSize);
106 }
107};
108
109static SinCosTable sSinCosTable;
110
111static unsigned char sSmallRBTable[256];
112
113class BitReverser {
114 public:
115 BitReverser()
116 {
117 for(int i=0;i<256;i++) {
118 sSmallRBTable[i]=0;
119 for(int maskLow=1, maskHigh=128;maskLow<256;maskLow<<=1,maskHigh>>=1)
120 if(i&maskLow)
121 sSmallRBTable[i]|=maskHigh;
122 }
123 };
124 };
125static BitReverser sBitReverser;
126
127/* array of functions there prob is a better way to do this */
128int SmallVRB0(int bits) { return bits; }; int SmallVRB1(int bits) { return sSmallRBTable[bits]>>7; };
129int SmallVRB2(int bits) { return sSmallRBTable[bits]>>6; }; int SmallVRB3(int bits) { return sSmallRBTable[bits]>>5; };
130int SmallVRB4(int bits) { return sSmallRBTable[bits]>>4; }; int SmallVRB5(int bits) { return sSmallRBTable[bits]>>3; };
131int SmallVRB6(int bits) { return sSmallRBTable[bits]>>2; }; int SmallVRB7(int bits) { return sSmallRBTable[bits]>>1; };
132int SmallVRB8(int bits) { return sSmallRBTable[bits]; };
133int SmallVRB9(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<1)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>7); };
134int SmallVRB10(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<2)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>6); };
135int SmallVRB11(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<3)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>5); };
136int SmallVRB12(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<4)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>4); };
137int SmallVRB13(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<5)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>3); };
138int SmallVRB14(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<6)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>2); };
139int SmallVRB15(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<7)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]>>1); };
140int SmallVRB16(int bits) { return (sSmallRBTable[*((unsigned char *)&bits)]<<8)+(sSmallRBTable[*(((unsigned char *)&bits)+1)]); };
141
142int (*SmallVRB[])(int bits) = { SmallVRB0, SmallVRB1, SmallVRB2, SmallVRB3, SmallVRB4,
143 SmallVRB5, SmallVRB6, SmallVRB7, SmallVRB8, SmallVRB9, SmallVRB10,
144 SmallVRB11, SmallVRB12, SmallVRB13, SmallVRB14,SmallVRB15, SmallVRB16 };
145
146int SmallRB(int bits, int numberBits)
147{
148// return ( (sSmallRBTable[*((unsigned char *)&bits)]<<16) + (sSmallRBTable[*(((unsigned char *)&bits)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&bits)+2)] ))>>(24-numberBits);
149 return ( (sSmallRBTable[*((unsigned char *)&bits)]<<8) + (sSmallRBTable[*(((unsigned char *)&bits)+1)]) )>>(16-numberBits);
150};
151
152/* wrapper functions. If passed -1 function choice will be made locally */
153void RealFFTf1x(fft_type *buffer, FFTParam *h, int functionType)
154{
155 switch(functionType) {
157 RealFFTf1xSinCosTableVBR16( buffer, h);
158 break;
160 RealFFTf1xSinCosTableBR16( buffer, h);
161 break;
162 case FFT_FastMathBR16:
163 RealFFTf1xFastMathBR16( buffer, h);
164 break;
165 case FFT_FastMathBR24:
166 RealFFTf1xFastMathBR24( buffer, h);
167 break;
169 default:
170 RealFFTf1xSinCosBRTable( buffer, h);
171 };
172}
173
174void InverseRealFFTf1x(fft_type *buffer, FFTParam *h, int functionType)
175{
176 switch(functionType) {
179 break;
182 break;
183 case FFT_FastMathBR16:
185 break;
186 case FFT_FastMathBR24:
188 break;
190 default:
192 };
193}
194
195void ReorderToTime1x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType)
196{
197 switch(functionType) {
199 ReorderToTime1xSinCosTableVBR16( hFFT, buffer, TimeOut);
200 break;
202 ReorderToTime1xSinCosTableBR16( hFFT, buffer, TimeOut);
203 break;
204 case FFT_FastMathBR16:
205 ReorderToTime1xFastMathBR16( hFFT, buffer, TimeOut);
206 break;
207 case FFT_FastMathBR24:
208 ReorderToTime1xFastMathBR24( hFFT, buffer, TimeOut);
209 break;
211 default:
212 ReorderToTime1xSinCosBRTable( hFFT, buffer, TimeOut);
213 };
214}
215
216void ReorderToFreq1x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType)
217{
218 switch(functionType) {
220 ReorderToFreq1xSinCosTableVBR16(hFFT, buffer, RealOut, ImagOut);
221 break;
223 ReorderToFreq1xSinCosTableBR16(hFFT, buffer, RealOut, ImagOut);
224 break;
225 case FFT_FastMathBR16:
226 ReorderToFreq1xFastMathBR16(hFFT, buffer, RealOut, ImagOut);
227 break;
228 case FFT_FastMathBR24:
229 ReorderToFreq1xFastMathBR24(hFFT, buffer, RealOut, ImagOut);
230 break;
232 default:
233 ReorderToFreq1xSinCosBRTable(hFFT, buffer, RealOut, ImagOut);
234 };
235}
236
237void RealFFTf4x( fft_type *buffer, FFTParam *h, int functionType)
238{
239 switch(functionType) {
241 RealFFTf4xSinCosTableVBR16( buffer, h);
242 break;
244 RealFFTf4xSinCosTableBR16( buffer, h);
245 break;
246 case FFT_FastMathBR16:
247 RealFFTf4xFastMathBR16( buffer, h);
248 break;
249 case FFT_FastMathBR24:
250 RealFFTf4xFastMathBR24( buffer, h);
251 break;
253 default:
254 RealFFTf4xSinCosBRTable( buffer, h);
255 };
256}
257
258void InverseRealFFTf4x( fft_type *buffer, FFTParam *h, int functionType)
259{
260 switch(functionType) {
263 break;
266 break;
267 case FFT_FastMathBR16:
269 break;
270 case FFT_FastMathBR24:
272 break;
274 default:
276 };
277}
278
279void ReorderToTime4x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType)
280{
281 switch(functionType) {
283 ReorderToTime4xSinCosTableVBR16( hFFT, buffer, TimeOut);
284 break;
286 ReorderToTime4xSinCosTableBR16( hFFT, buffer, TimeOut);
287 break;
288 case FFT_FastMathBR16:
289 ReorderToTime4xFastMathBR16( hFFT, buffer, TimeOut);
290 break;
291 case FFT_FastMathBR24:
292 ReorderToTime4xFastMathBR24( hFFT, buffer, TimeOut);
293 break;
295 default:
296 ReorderToTime4xSinCosBRTable( hFFT, buffer, TimeOut);
297 };
298}
299
300void ReorderToFreq4x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType)
301{
302 switch(functionType) {
304 ReorderToFreq4xSinCosTableVBR16(hFFT, buffer, RealOut, ImagOut);
305 break;
307 ReorderToFreq4xSinCosTableBR16(hFFT, buffer, RealOut, ImagOut);
308 break;
309 case FFT_FastMathBR16:
310 ReorderToFreq4xFastMathBR16(hFFT, buffer, RealOut, ImagOut);
311 break;
312 case FFT_FastMathBR24:
313 ReorderToFreq4xFastMathBR24(hFFT, buffer, RealOut, ImagOut);
314 break;
316 default:
317 ReorderToFreq4xSinCosBRTable(hFFT, buffer, RealOut, ImagOut);
318 };
319}
320
321#define REAL_SINCOSBRTABLE
322#ifdef REAL_SINCOSBRTABLE
323
324/*
325* Forward FFT routine. Must call GetFFT(fftlen) first!
326*
327* Note: Output is BIT-REVERSED! so you must use the BitReversed to
328* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
329* Imag_i = buffer[ h->BitReversed[i]+1 ] )
330* Input is in normal order.
331*
332* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
333* - this can be done because both values will always be real only
334* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
335*
336* Note: The scaling on this is done according to the standard FFT definition,
337* so a unit amplitude DC signal will output an amplitude of (N)
338* (Older revisions would progressively scale the input, so the output
339* values would be similar in amplitude to the input values, which is
340* good when using fixed point arithmetic)
341*/
343{
344 fft_type *A,*B;
345 fft_type *sptr;
346 fft_type *endptr1,*endptr2;
347 int *br1,*br2;
348 fft_type HRplus,HRminus,HIplus,HIminus;
349 fft_type v1,v2,sin,cos;
350
351 auto ButterfliesPerGroup = h->Points / 2;
352
353 /*
354 * Butterfly:
355 * Ain-----Aout
356 * \ /
357 * / \
358 * Bin-----Bout
359 */
360
361 endptr1 = buffer + h->Points * 2;
362
363 while(ButterfliesPerGroup > 0)
364 {
365 A = buffer;
366 B = buffer + ButterfliesPerGroup * 2;
367 sptr = h->SinTable.get();
368
369 while(A < endptr1)
370 {
371 sin = *sptr;
372 cos = *(sptr + 1);
373 endptr2 = B;
374 while(A < endptr2)
375 {
376 v1 = *B * cos + *(B+1) * sin;
377 v2 = *B * sin - *(B+1) * cos;
378 *B = (*A + v1);
379 *(A++) = *(B++) - 2 * v1;
380 *B = (*A - v2);
381 *(A++) = *(B++) + 2 * v2;
382 }
383 A = B;
384 B += ButterfliesPerGroup * 2;
385 sptr += 2;
386 }
387 ButterfliesPerGroup >>= 1;
388 }
389 /* Massage output to get the output for a real input sequence. */
390 br1 = h->BitReversed.get() + 1;
391 br2 = h->BitReversed.get() + h->Points - 1;
392
393 while(br1 < br2)
394 {
395 sin=h->SinTable[*br1];
396 cos=h->SinTable[*br1+1];
397 A=buffer+*br1;
398 B=buffer+*br2;
399 HRplus = (HRminus = *A - *B ) + (*B * 2);
400 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
401 v1 = (sin*HRminus - cos*HIplus);
402 v2 = (cos*HRminus + sin*HIplus);
403 *A = (HRplus + v1) * (fft_type)0.5;
404 *B = *A - v1;
405 *(A+1) = (HIminus + v2) * (fft_type)0.5;
406 *(B+1) = *(A+1) - HIminus;
407
408 br1++;
409 br2--;
410 }
411 /* Handle the center bin (just need a conjugate) */
412 A=buffer+*br1+1;
413 *A=-*A;
414 /* Handle DC bin separately - and ignore the Fs/2 bin
415 buffer[0]+=buffer[1];
416 buffer[1]=(fft_type)0;*/
417 /* Handle DC and Fs/2 bins separately */
418 /* Put the Fs/2 value into the imaginary part of the DC bin */
419 v1=buffer[0]-buffer[1];
420 buffer[0]+=buffer[1];
421 buffer[1]=v1;
422}
423
424
425/* Description: This routine performs an inverse FFT to real data.
426* This code is for floating point data.
427*
428* Note: Output is BIT-REVERSED! so you must use the BitReversed to
429* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
430* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
431* Input is in normal order, interleaved (real,imaginary) complex data
432* You must call GetFFT(fftlen) first to initialize some buffers!
433*
434* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
435* - this can be done because both values will always be real only
436* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
437*
438* Note: The scaling on this is done according to the standard FFT definition,
439* so a unit amplitude DC signal will output an amplitude of (N)
440* (Older revisions would progressively scale the input, so the output
441* values would be similar in amplitude to the input values, which is
442* good when using fixed point arithmetic)
443*/
445{
446 fft_type *A,*B;
447 fft_type *sptr;
448 fft_type *endptr1,*endptr2;
449 int *br1;
450 fft_type HRplus,HRminus,HIplus,HIminus;
451 fft_type v1,v2,sin,cos;
452
453 auto ButterfliesPerGroup = h->Points / 2;
454
455 /* Massage input to get the input for a real output sequence. */
456 A = buffer + 2;
457 B = buffer + h->Points * 2 - 2;
458 br1 = h->BitReversed.get() + 1;
459 while(A < B)
460 {
461 sin=h->SinTable[*br1];
462 cos=h->SinTable[*br1+1];
463 HRplus = (HRminus = *A - *B ) + (*B * 2);
464 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
465 v1 = (sin*HRminus + cos*HIplus);
466 v2 = (cos*HRminus - sin*HIplus);
467 *A = (HRplus + v1) * (fft_type)0.5;
468 *B = *A - v1;
469 *(A+1) = (HIminus - v2) * (fft_type)0.5;
470 *(B+1) = *(A+1) - HIminus;
471
472 A+=2;
473 B-=2;
474 br1++;
475 }
476 /* Handle center bin (just need conjugate) */
477 *(A+1)=-*(A+1);
478 /* Handle DC bin separately - this ignores any Fs/2 component
479 buffer[1]=buffer[0]=buffer[0]/2;*/
480 /* Handle DC and Fs/2 bins specially */
481 /* The DC bin is passed in as the real part of the DC complex value */
482 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
483 /* (v1+v2) = buffer[0] == the DC component */
484 /* (v1-v2) = buffer[1] == the Fs/2 component */
485 v1=0.5f*(buffer[0]+buffer[1]);
486 v2=0.5f*(buffer[0]-buffer[1]);
487 buffer[0]=v1;
488 buffer[1]=v2;
489
490 /*
491 * Butterfly:
492 * Ain-----Aout
493 * \ /
494 * / \
495 * Bin-----Bout
496 */
497
498 endptr1 = buffer + h->Points * 2;
499
500 while(ButterfliesPerGroup > 0)
501 {
502 A = buffer;
503 B = buffer + ButterfliesPerGroup * 2;
504 sptr = h->SinTable.get();
505
506 while(A < endptr1)
507 {
508 sin = *(sptr++);
509 cos = *(sptr++);
510 endptr2 = B;
511 while(A < endptr2)
512 {
513 v1 = *B * cos - *(B + 1) * sin;
514 v2 = *B * sin + *(B + 1) * cos;
515 *B = (*A + v1) * (fft_type)0.5;
516 *(A++) = *(B++) - v1;
517 *B = (*A + v2) * (fft_type)0.5;
518 *(A++) = *(B++) - v2;
519 }
520 A = B;
521 B += ButterfliesPerGroup * 2;
522 }
523 ButterfliesPerGroup >>= 1;
524 }
525}
526
527void ReorderToFreq1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
528{
529 // Copy the data into the real and imaginary outputs
530 for(size_t i = 1; i < hFFT->Points; i++) {
531 RealOut[i]=buffer[hFFT->BitReversed[i] ];
532 ImagOut[i]=buffer[hFFT->BitReversed[i]+1];
533 }
534 RealOut[0] = buffer[0]; // DC component
535 ImagOut[0] = 0;
536 RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
537 ImagOut[hFFT->Points] = 0;
538}
539
540void ReorderToTime1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
541{
542 // Copy the data into the real outputs
543 for(size_t i = 0; i < hFFT->Points; i++) {
544 TimeOut[i*2 ]=buffer[hFFT->BitReversed[i] ];
545 TimeOut[i*2+1]=buffer[hFFT->BitReversed[i]+1];
546 }
547}
548
549// 4x processing simd
551{
552
553 __m128 *localBuffer=(__m128 *)buffer;
554
555 __m128 *A,*B;
556 fft_type *sptr;
557 __m128 *endptr1,*endptr2;
558 int br1Index, br2Index;
559 int br1Value, br2Value;
560 __m128 HRplus,HRminus,HIplus,HIminus;
561 __m128 v1,v2,sin,cos;
562 auto ButterfliesPerGroup = h->Points / 2;
563
564 /*
565 * Butterfly:
566 * Ain-----Aout
567 * \ /
568 * / \
569 * Bin-----Bout
570 */
571
572 endptr1 = &localBuffer[h->Points * 2];
573
574 while(ButterfliesPerGroup > 0)
575 {
576 A = localBuffer;
577 B = &localBuffer[ButterfliesPerGroup * 2];
578 sptr = h->SinTable.get();
579 while(A < endptr1)
580 {
581 sin = _mm_set1_ps(*(sptr++));
582 cos = _mm_set1_ps(*(sptr++));
583 endptr2 = B;
584 while(A < endptr2)
585 {
586 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
587 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
588 *B = _mm_add_ps( *A, v1);
589 __m128 temp128 = _mm_set1_ps( 2.0);
590 *(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
591 *B = _mm_sub_ps(*A,v2);
592 *(A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
593 }
594 A = B;
595 B = &B[ButterfliesPerGroup * 2];
596 }
597 ButterfliesPerGroup >>= 1;
598 }
599 /* Massage output to get the output for a real input sequence. */
600
601 br1Index = 1; // h->BitReversed + 1;
602 br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
603
604 while(br1Index<br2Index)
605 {
606 br1Value=h->BitReversed[br1Index];
607 br2Value=h->BitReversed[br2Index];
608 sin=_mm_set1_ps(h->SinTable[br1Value]);
609 cos=_mm_set1_ps(h->SinTable[br1Value+1]);
610 A=&localBuffer[br1Value];
611 B=&localBuffer[br2Value];
612 __m128 temp128 = _mm_set1_ps( 2.0);
613 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
614 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
615 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
616 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
617 temp128 = _mm_set1_ps( 0.5);
618 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
619 *B = _mm_sub_ps(*A, v1);
620 *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
621 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
622
623 br1Index++;
624 br2Index--;
625 }
626 /* Handle the center bin (just need a conjugate) */
627 A=&localBuffer[h->BitReversed[br1Index]+1];
628 // negate sse style
629 *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
630 /* Handle DC and Fs/2 bins separately */
631 /* Put the Fs/2 value into the imaginary part of the DC bin */
632 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
633 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
634 localBuffer[1]=v1;
635}
636
637/* Description: This routine performs an inverse FFT to real data.
638* This code is for floating point data.
639*
640* Note: Output is BIT-REVERSED! so you must use the BitReversed to
641* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
642* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
643* Input is in normal order, interleaved (real,imaginary) complex data
644* You must call GetFFT(fftlen) first to initialize some buffers!
645*
646* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
647* - this can be done because both values will always be real only
648* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
649*
650* Note: The scaling on this is done according to the standard FFT definition,
651* so a unit amplitude DC signal will output an amplitude of (N)
652* (Older revisions would progressively scale the input, so the output
653* values would be similar in amplitude to the input values, which is
654* good when using fixed point arithmetic)
655*/
657{
658
659 __m128 *localBuffer=(__m128 *)buffer;
660
661 __m128 *A,*B;
662 fft_type *sptr;
663 __m128 *endptr1,*endptr2;
664 int br1Index, br1Value;
665 __m128 HRplus,HRminus,HIplus,HIminus;
666 __m128 v1,v2,sin,cos;
667
668 auto ButterfliesPerGroup = h->Points / 2;
669
670 /* Massage input to get the input for a real output sequence. */
671 A = localBuffer + 2;
672 B = localBuffer + h->Points * 2 - 2;
673 br1Index = 1; //h->BitReversed + 1;
674 while(A < B)
675 {
676 br1Value = h->BitReversed[br1Index];
677 sin = _mm_set1_ps(h->SinTable[br1Value]);
678 cos = _mm_set1_ps(h->SinTable[br1Value + 1]);
679 HRminus = _mm_sub_ps(*A, *B);
680 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
681 HIminus = _mm_sub_ps( *(A+1), *(B+1));
682 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
683 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
684 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
685 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
686 *B = _mm_sub_ps(*A, v1);
687 *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
688 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
689
690 A=&A[2];
691 B=&B[-2];
692 br1Index++;
693 }
694 /* Handle center bin (just need conjugate) */
695 // negate sse style
696 *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
697
698 /* Handle DC bin separately - this ignores any Fs/2 component
699 buffer[1]=buffer[0]=buffer[0]/2;*/
700 /* Handle DC and Fs/2 bins specially */
701 /* The DC bin is passed in as the real part of the DC complex value */
702 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
703 /* (v1+v2) = buffer[0] == the DC component */
704 /* (v1-v2) = buffer[1] == the Fs/2 component */
705 v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
706 v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
707 localBuffer[0]=v1;
708 localBuffer[1]=v2;
709
710 /*
711 * Butterfly:
712 * Ain-----Aout
713 * \ /
714 * / \
715 * Bin-----Bout
716 */
717
718 endptr1 = localBuffer + h->Points * 2;
719
720 while(ButterfliesPerGroup > 0)
721 {
722 A = localBuffer;
723 B = localBuffer + ButterfliesPerGroup * 2;
724 sptr = h->SinTable.get();
725 while(A < endptr1)
726 {
727 sin = _mm_set1_ps(*(sptr++));
728 cos = _mm_set1_ps(*(sptr++));
729 endptr2 = B;
730 while(A < endptr2)
731 {
732 v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B + 1), sin));
733 v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B + 1), cos));
734 *B = _mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
735 *(A++) = _mm_sub_ps(*(B++), v1);
736 *B = _mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
737 *(A++) = _mm_sub_ps(*(B++), v2);
738 }
739 A = B;
740 B = &B[ButterfliesPerGroup * 2];
741 }
742 ButterfliesPerGroup >>= 1;
743 }
744}
745
746void ReorderToFreq4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
747{
748 __m128 *localBuffer=(__m128 *)buffer;
749 __m128 *localRealOut=(__m128 *)RealOut;
750 __m128 *localImagOut=(__m128 *)ImagOut;
751
752 // Copy the data into the real and imaginary outputs
753 for(size_t i = 1; i < hFFT->Points; i++) {
754 int brValue;
755 brValue=hFFT->BitReversed[i];
756 localRealOut[i]=localBuffer[brValue ];
757 localImagOut[i]=localBuffer[brValue+1];
758 }
759 localRealOut[0] = localBuffer[0]; // DC component
760 localImagOut[0] = _mm_set1_ps(0.0);
761 localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
762 localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
763}
764
765void ReorderToTime4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
766{
767 __m128 *localBuffer=(__m128 *)buffer;
768 __m128 *localTimeOut=(__m128 *)TimeOut;
769 // Copy the data into the real outputs
770 for(size_t i = 0; i < hFFT->Points; i++) {
771 int brValue;
772 brValue = hFFT->BitReversed[i];
773 localTimeOut[i*2 ] = localBuffer[brValue ];
774 localTimeOut[i*2+1] = localBuffer[brValue+1];
775 }
776}
777
778#endif
779
780#define REAL_SINCOSTABLE_VBR16
781#ifdef REAL_SINCOSTABLE_VBR16
782
783/*
784* Forward FFT routine. Must call GetFFT(fftlen) first!
785*
786* Note: Output is BIT-REVERSED! so you must use the BitReversed to
787* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
788* Imag_i = buffer[ h->BitReversed[i]+1 ] )
789* Input is in normal order.
790*
791* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
792* - this can be done because both values will always be real only
793* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
794*
795* Note: The scaling on this is done according to the standard FFT definition,
796* so a unit amplitude DC signal will output an amplitude of (N)
797* (Older revisions would progressively scale the input, so the output
798* values would be similar in amplitude to the input values, which is
799* good when using fixed point arithmetic)
800*/
802{
803 fft_type *A,*B;
804 fft_type *endptr1,*endptr2;
805 int br1Index, br2Index;
806 int br1Value, br2Value;
807 fft_type HRplus,HRminus,HIplus,HIminus;
808 fft_type v1,v2,sin,cos;
809 auto ButterfliesPerGroup = h->Points / 2;
810 int pow2BitsMinus1 = h->pow2Bits - 1;
811 int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
812
813 endptr1 = buffer + h->Points * 2;
814
815 while(ButterfliesPerGroup > 0)
816 {
817 A = buffer;
818 B = buffer + ButterfliesPerGroup * 2;
819 int iSinCosIndex = 0;
820 while(A < endptr1)
821 {
822 int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
823 sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
824 cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
825 iSinCosIndex++;
826 endptr2 = B;
827 while(A < endptr2)
828 {
829 v1 = *B*cos + *(B+1)*sin;
830 v2 = *B*sin - *(B+1)*cos;
831 *B = (*A+v1);
832 *(A++) = *(B++) - 2 * v1;
833 *B = (*A - v2);
834 *(A++) = *(B++) + 2 * v2;
835 }
836 A = B;
837 B += ButterfliesPerGroup * 2;
838 }
839 ButterfliesPerGroup >>= 1;
840 }
841 /* Massage output to get the output for a real input sequence. */
842
843 br1Index = 1;
844 br2Index = h->Points - 1;
845
846 while(br1Index < br2Index)
847 {
848 br1Value=(*SmallVRB[h->pow2Bits])(br1Index);
849 br2Value=(*SmallVRB[h->pow2Bits])(br2Index);
850 int sinCosIndex=br1Index<<sinCosShift;
851 sin=sSinCosTable.mSinCosTable[sinCosIndex].mSin;
852 cos=sSinCosTable.mSinCosTable[sinCosIndex].mCos;
853 A=&buffer[br1Value];
854 B=&buffer[br2Value];
855 HRplus = (HRminus = *A - *B ) + (*B * 2);
856 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
857 v1 = (sin*HRminus - cos*HIplus);
858 v2 = (cos*HRminus + sin*HIplus);
859 *A = (HRplus + v1) * (fft_type)0.5;
860 *B = *A - v1;
861 *(A+1) = (HIminus + v2) * (fft_type)0.5;
862 *(B+1) = *(A+1) - HIminus;
863
864 br1Index++;
865 br2Index--;
866 }
867 /* Handle the center bin (just need a conjugate) */
868 A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
869
870 /* Handle DC bin separately - and ignore the Fs/2 bin
871 buffer[0]+=buffer[1];
872 buffer[1]=(fft_type)0;*/
873 /* Handle DC and Fs/2 bins separately */
874 /* Put the Fs/2 value into the imaginary part of the DC bin */
875 v1=buffer[0]-buffer[1];
876 buffer[0]+=buffer[1];
877 buffer[1]=v1;
878}
879
880/* Description: This routine performs an inverse FFT to real data.
881* This code is for floating point data.
882*
883* Note: Output is BIT-REVERSED! so you must use the BitReversed to
884* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
885* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
886* Input is in normal order, interleaved (real,imaginary) complex data
887* You must call GetFFT(fftlen) first to initialize some buffers!
888*
889* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
890* - this can be done because both values will always be real only
891* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
892*
893* Note: The scaling on this is done according to the standard FFT definition,
894* so a unit amplitude DC signal will output an amplitude of (N)
895* (Older revisions would progressively scale the input, so the output
896* values would be similar in amplitude to the input values, which is
897* good when using fixed point arithmetic)
898*/
900{
901 fft_type *A,*B;
902 fft_type *endptr1,*endptr2;
903 int br1Index, br1Value;
904 int *br1;
905 fft_type HRplus,HRminus,HIplus,HIminus;
906 fft_type v1,v2,sin,cos;
907 auto ButterfliesPerGroup = h->Points / 2;
908 int pow2BitsMinus1 = h->pow2Bits - 1;
909 int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
910
911 /* Massage input to get the input for a real output sequence. */
912 A = buffer + 2;
913 B = buffer + h->Points * 2 - 2;
914 br1 = h->BitReversed.get() + 1;
915 br1Index = 1; //h->BitReversed + 1;
916 while(A < B)
917 {
918 br1Value = (*SmallVRB[h->pow2Bits])(br1Index);
919 int sinCosIndex = br1Index << sinCosShift;
920 sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
921 cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
922 HRplus = (HRminus = *A - *B ) + (*B * 2);
923 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
924 v1 = (sin*HRminus + cos*HIplus);
925 v2 = (cos*HRminus - sin*HIplus);
926 *A = (HRplus + v1) * (fft_type)0.5;
927 *B = *A - v1;
928 *(A+1) = (HIminus - v2) * (fft_type)0.5;
929 *(B+1) = *(A+1) - HIminus;
930
931 A=&A[2];
932 B=&B[-2];
933 br1Index++;
934 }
935 /* Handle center bin (just need conjugate) */
936 *(A+1)=-*(A+1);
937 /* Handle DC bin separately - this ignores any Fs/2 component
938 buffer[1]=buffer[0]=buffer[0]/2;*/
939 /* Handle DC and Fs/2 bins specially */
940 /* The DC bin is passed in as the real part of the DC complex value */
941 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
942 /* (v1+v2) = buffer[0] == the DC component */
943 /* (v1-v2) = buffer[1] == the Fs/2 component */
944 v1=0.5f*(buffer[0]+buffer[1]);
945 v2=0.5f*(buffer[0]-buffer[1]);
946 buffer[0]=v1;
947 buffer[1]=v2;
948
949 /*
950 * Butterfly:
951 * Ain-----Aout
952 * \ /
953 * / \
954 * Bin-----Bout
955 */
956
957 endptr1 = buffer + h->Points * 2;
958
959 while(ButterfliesPerGroup > 0)
960 {
961 A = buffer;
962 B = buffer + ButterfliesPerGroup * 2;
963 int iSinCosIndex = 0;
964 while(A < endptr1)
965 {
966 int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
967 sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
968 cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
969 iSinCosIndex++;
970 endptr2 = B;
971 while(A < endptr2)
972 {
973 v1 = *B * cos - *(B + 1) * sin;
974 v2 = *B * sin + *(B + 1) * cos;
975 *B = (*A + v1) * (fft_type)0.5;
976 *(A++) = *(B++) - v1;
977 *B = (*A + v2) * (fft_type)0.5;
978 *(A++) = *(B++) - v2;
979 }
980 A = B;
981 B += ButterfliesPerGroup * 2;
982 }
983 ButterfliesPerGroup >>= 1;
984 }
985}
986
987void ReorderToTime1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
988{
989 // Copy the data into the real outputs
990 for(size_t i = 0;i < hFFT->Points; i++) {
991 int brValue;
992 brValue=(*SmallVRB[hFFT->pow2Bits])(i);
993 TimeOut[i*2 ] = buffer[brValue ];
994 TimeOut[i*2+1] = buffer[brValue+1];
995 }
996}
997
998void ReorderToFreq1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
999{
1000 // Copy the data into the real and imaginary outputs
1001 for(size_t i = 1; i < hFFT->Points; i++) {
1002 int brValue;
1003 brValue = (*SmallVRB[hFFT->pow2Bits])(i);
1004 RealOut[i] = buffer[brValue ];
1005 ImagOut[i] = buffer[brValue+1];
1006 }
1007 RealOut[0] = buffer[0]; // DC component
1008 ImagOut[0] = 0;
1009 RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
1010 ImagOut[hFFT->Points] = 0;
1011}
1012
1013// 4x processing simd
1015{
1016
1017 __m128 *localBuffer=(__m128 *)buffer;
1018
1019 __m128 *A,*B;
1020 __m128 *endptr1,*endptr2;
1021 int br1Index, br2Index;
1022 int br1Value, br2Value;
1023 __m128 HRplus,HRminus,HIplus,HIminus;
1024 __m128 v1,v2,sin,cos;
1025 auto ButterfliesPerGroup = h->Points / 2;
1026 int pow2BitsMinus1 = h->pow2Bits - 1;
1027 int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
1028
1029 /*
1030 * Butterfly:
1031 * Ain-----Aout
1032 * \ /
1033 * / \
1034 * Bin-----Bout
1035 */
1036
1037 endptr1 = &localBuffer[h->Points * 2];
1038
1039 while(ButterfliesPerGroup > 0)
1040 {
1041 A = localBuffer;
1042 B = &localBuffer[ButterfliesPerGroup * 2];
1043 int iSinCosIndex = 0;
1044 while(A < endptr1)
1045 {
1046 int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
1047 sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
1048 cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
1049 iSinCosIndex++;
1050 endptr2 = B;
1051 while(A < endptr2)
1052 {
1053 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
1054 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
1055 *B = _mm_add_ps( *A, v1);
1056 __m128 temp128 = _mm_set1_ps( 2.0);
1057 *(A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
1058 *B = _mm_sub_ps(*A,v2);
1059 *(A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
1060 }
1061 A = B;
1062 B = &B[ButterfliesPerGroup * 2];
1063 }
1064 ButterfliesPerGroup >>= 1;
1065 }
1066 /* Massage output to get the output for a real input sequence. */
1067
1068 br1Index = 1; // h->BitReversed + 1;
1069 br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
1070
1071 while(br1Index < br2Index)
1072 {
1073 br1Value=(*SmallVRB[h->pow2Bits])(br1Index);
1074 br2Value=(*SmallVRB[h->pow2Bits])(br2Index);
1075 int sinCosIndex=br1Index<<sinCosShift;
1076 sin=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
1077 cos=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
1078 A=&localBuffer[br1Value];
1079 B=&localBuffer[br2Value];
1080 __m128 temp128 = _mm_set1_ps( 2.0);
1081 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
1082 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
1083 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
1084 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
1085 temp128 = _mm_set1_ps( 0.5);
1086 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
1087 *B = _mm_sub_ps(*A, v1);
1088 *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
1089 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
1090
1091 br1Index++;
1092 br2Index--;
1093 }
1094 /* Handle the center bin (just need a conjugate) */
1095 A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
1096 // negate sse style
1097 *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
1098 /* Handle DC and Fs/2 bins separately */
1099 /* Put the Fs/2 value into the imaginary part of the DC bin */
1100 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
1101 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
1102 localBuffer[1]=v1;
1103}
1104
1105/* Description: This routine performs an inverse FFT to real data.
1106* This code is for floating point data.
1107*
1108* Note: Output is BIT-REVERSED! so you must use the BitReversed to
1109* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
1110* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
1111* Input is in normal order, interleaved (real,imaginary) complex data
1112* You must call GetFFT(fftlen) first to initialize some buffers!
1113*
1114* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
1115* - this can be done because both values will always be real only
1116* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
1117*
1118* Note: The scaling on this is done according to the standard FFT definition,
1119* so a unit amplitude DC signal will output an amplitude of (N)
1120* (Older revisions would progressively scale the input, so the output
1121* values would be similar in amplitude to the input values, which is
1122* good when using fixed point arithmetic)
1123*/
1125{
1126
1127 __m128 *localBuffer=(__m128 *)buffer;
1128
1129 __m128 *A, *B;
1130 __m128 *endptr1, *endptr2;
1131 int br1Index, br1Value;
1132 __m128 HRplus, HRminus, HIplus, HIminus;
1133 __m128 v1, v2, sin, cos;
1134 auto ButterfliesPerGroup = h->Points / 2;
1135 int pow2BitsMinus1 = h->pow2Bits - 1;
1136 int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
1137
1138 /* Massage input to get the input for a real output sequence. */
1139 A = localBuffer + 2;
1140 B = localBuffer + h->Points * 2 - 2;
1141 br1Index = 1; //h->BitReversed + 1;
1142 while(A < B)
1143 {
1144 br1Value = (*SmallVRB[h->pow2Bits])(br1Index);
1145 int sinCosIndex = br1Index << sinCosShift;
1146 sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
1147 cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
1148 HRminus = _mm_sub_ps(*A, *B);
1149 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
1150 HIminus = _mm_sub_ps( *(A+1), *(B+1));
1151 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
1152 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
1153 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
1154 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
1155 *B = _mm_sub_ps(*A, v1);
1156 *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
1157 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
1158
1159 A = &A[2];
1160 B = &B[-2];
1161 br1Index++;
1162 }
1163 /* Handle center bin (just need conjugate) */
1164 // negate sse style
1165 *(A+1) = _mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
1166
1167 /* Handle DC bin separately - this ignores any Fs/2 component
1168 buffer[1]=buffer[0]=buffer[0]/2;*/
1169 /* Handle DC and Fs/2 bins specially */
1170 /* The DC bin is passed in as the real part of the DC complex value */
1171 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
1172 /* (v1+v2) = buffer[0] == the DC component */
1173 /* (v1-v2) = buffer[1] == the Fs/2 component */
1174 v1 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
1175 v2 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
1176 localBuffer[0] = v1;
1177 localBuffer[1] = v2;
1178
1179 /*
1180 * Butterfly:
1181 * Ain-----Aout
1182 * \ /
1183 * / \
1184 * Bin-----Bout
1185 */
1186
1187 endptr1 = localBuffer + h->Points * 2;
1188
1189 while(ButterfliesPerGroup > 0)
1190 {
1191 A = localBuffer;
1192 B = localBuffer + ButterfliesPerGroup * 2;
1193 int iSinCosIndex = 0;
1194 while(A < endptr1)
1195 {
1196 int sinCosLookup = (*SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
1197 sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
1198 cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
1199 iSinCosIndex++;
1200 endptr2 = B;
1201 while(A < endptr2)
1202 {
1203 v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
1204 v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
1205 *B = _mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
1206 *(A++) = _mm_sub_ps(*(B++), v1);
1207 *B = _mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
1208 *(A++) = _mm_sub_ps(*(B++),v2);
1209 }
1210 A = B;
1211 B = &B[ButterfliesPerGroup * 2];
1212 }
1213 ButterfliesPerGroup >>= 1;
1214 }
1215}
1216
1217void ReorderToTime4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
1218{
1219 __m128 *localBuffer = (__m128 *)buffer;
1220 __m128 *localTimeOut = (__m128 *)TimeOut;
1221 // Copy the data into the real outputs
1222 for(size_t i = 0; i < hFFT->Points; i++) {
1223 int brValue;
1224 brValue = (*SmallVRB[hFFT->pow2Bits])(i);
1225 localTimeOut[i*2 ] = localBuffer[brValue ];
1226 localTimeOut[i*2+1] = localBuffer[brValue+1];
1227 }
1228}
1229
1230void ReorderToFreq4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
1231{
1232 __m128 *localBuffer = (__m128 *)buffer;
1233 __m128 *localRealOut = (__m128 *)RealOut;
1234 __m128 *localImagOut = (__m128 *)ImagOut;
1235
1236 // Copy the data into the real and imaginary outputs
1237 for(size_t i = 1; i < hFFT->Points; i++) {
1238 int brValue;
1239 brValue = (*SmallVRB[hFFT->pow2Bits])(i);
1240 localRealOut[i] = localBuffer[brValue ];
1241 localImagOut[i] = localBuffer[brValue+1];
1242 }
1243 localRealOut[0] = localBuffer[0]; // DC component
1244 localImagOut[0] = _mm_set1_ps(0.0);
1245 localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
1246 localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
1247}
1248#endif
1249
1250#define REAL_SINCOSTABLE_BR16
1251#ifdef REAL_SINCOSTABLE_BR16
1252
1253/*
1254* Forward FFT routine. Must call GetFFT(fftlen) first!
1255*
1256* Note: Output is BIT-REVERSED! so you must use the BitReversed to
1257* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
1258* Imag_i = buffer[ h->BitReversed[i]+1 ] )
1259* Input is in normal order.
1260*
1261* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
1262* - this can be done because both values will always be real only
1263* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
1264*
1265* Note: The scaling on this is done according to the standard FFT definition,
1266* so a unit amplitude DC signal will output an amplitude of (N)
1267* (Older revisions would progressively scale the input, so the output
1268* values would be similar in amplitude to the input values, which is
1269* good when using fixed point arithmetic)
1270*/
1272{
1273 fft_type *A,*B;
1274 fft_type *endptr1, *endptr2;
1275 int br1Index, br2Index;
1276 int br1Value, br2Value;
1277 fft_type HRplus, HRminus, HIplus, HIminus;
1278 fft_type v1, v2, sin, cos;
1279 auto ButterfliesPerGroup = h->Points / 2;
1280 int pow2BitsMinus1 = h->pow2Bits - 1;
1281 int bitReverseShiftM1 = 17 - h->pow2Bits;
1282 int bitReverseShift = bitReverseShiftM1 - 1;
1283 int sinCosShift = (sSinCosTable.mSinCosTablePow - pow2BitsMinus1);
1284
1285 endptr1 = buffer + h->Points * 2;
1286
1287 while(ButterfliesPerGroup > 0)
1288 {
1289 A = buffer;
1290 B = buffer + ButterfliesPerGroup * 2;
1291 int iSinCosIndex = 0;
1292 while(A < endptr1)
1293 {
1294// int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
1295 int sinCosLookup = ( ((sSmallRBTable[*((unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
1296 sin = sSinCosTable.mSinCosTable[sinCosLookup].mSin;
1297 cos = sSinCosTable.mSinCosTable[sinCosLookup].mCos;
1298 iSinCosIndex++;
1299 endptr2 = B;
1300 while(A < endptr2)
1301 {
1302 v1=*B*cos + *(B+1)*sin;
1303 v2=*B*sin - *(B+1)*cos;
1304 *B=(*A+v1);
1305 *(A++)=*(B++)-2*v1;
1306 *B=(*A-v2);
1307 *(A++)=*(B++)+2*v2;
1308 }
1309 A = B;
1310 B += ButterfliesPerGroup * 2;
1311 }
1312 ButterfliesPerGroup >>= 1;
1313 }
1314 /* Massage output to get the output for a real input sequence. */
1315
1316 br1Index = 1;
1317 br2Index = h->Points - 1;
1318
1319 while(br1Index < br2Index)
1320 {
1321 br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
1322 br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
1323 int sinCosIndex = br1Index << sinCosShift;
1324 sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
1325 cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
1326 A = &buffer[br1Value];
1327 B = &buffer[br2Value];
1328 HRplus = (HRminus = *A - *B ) + (*B * 2);
1329 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
1330 v1 = (sin*HRminus - cos*HIplus);
1331 v2 = (cos*HRminus + sin*HIplus);
1332 *A = (HRplus + v1) * (fft_type)0.5;
1333 *B = *A - v1;
1334 *(A+1) = (HIminus + v2) * (fft_type)0.5;
1335 *(B+1) = *(A+1) - HIminus;
1336
1337 br1Index++;
1338 br2Index--;
1339 }
1340 /* Handle the center bin (just need a conjugate) */
1341// A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
1342 A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
1343
1344 /* Handle DC bin separately - and ignore the Fs/2 bin
1345 buffer[0]+=buffer[1];
1346 buffer[1]=(fft_type)0;*/
1347 /* Handle DC and Fs/2 bins separately */
1348 /* Put the Fs/2 value into the imaginary part of the DC bin */
1349 v1=buffer[0]-buffer[1];
1350 buffer[0]+=buffer[1];
1351 buffer[1]=v1;
1352}
1353
1354/* Description: This routine performs an inverse FFT to real data.
1355* This code is for floating point data.
1356*
1357* Note: Output is BIT-REVERSED! so you must use the BitReversed to
1358* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
1359* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
1360* Input is in normal order, interleaved (real,imaginary) complex data
1361* You must call GetFFT(fftlen) first to initialize some buffers!
1362*
1363* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
1364* - this can be done because both values will always be real only
1365* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
1366*
1367* Note: The scaling on this is done according to the standard FFT definition,
1368* so a unit amplitude DC signal will output an amplitude of (N)
1369* (Older revisions would progressively scale the input, so the output
1370* values would be similar in amplitude to the input values, which is
1371* good when using fixed point arithmetic)
1372*/
1374{
1375 fft_type *A,*B;
1376 fft_type *endptr1,*endptr2;
1377 int br1Index;
1378 fft_type HRplus, HRminus, HIplus, HIminus;
1379 fft_type v1, v2, sin, cos;
1380 auto ButterfliesPerGroup = h->Points / 2;
1381 int pow2BitsMinus1 = h->pow2Bits - 1;
1382 int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1);
1383 int bitReverseShiftM1 = 17 - h->pow2Bits;
1384
1385 /* Massage input to get the input for a real output sequence. */
1386 A = buffer + 2;
1387 B = buffer + h->Points * 2 - 2;
1388 br1Index = 1; //h->BitReversed + 1;
1389 while(A < B)
1390 {
1391 int sinCosIndex = br1Index << sinCosShift;
1392 sin = sSinCosTable.mSinCosTable[sinCosIndex].mSin;
1393 cos = sSinCosTable.mSinCosTable[sinCosIndex].mCos;
1394 HRplus = (HRminus = *A - *B ) + (*B * 2);
1395 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
1396 v1 = (sin*HRminus + cos*HIplus);
1397 v2 = (cos*HRminus - sin*HIplus);
1398 *A = (HRplus + v1) * (fft_type)0.5;
1399 *B = *A - v1;
1400 *(A+1) = (HIminus - v2) * (fft_type)0.5;
1401 *(B+1) = *(A+1) - HIminus;
1402
1403 A=&A[2];
1404 B=&B[-2];
1405 br1Index++;
1406 }
1407 /* Handle center bin (just need conjugate) */
1408 *(A+1)=-*(A+1);
1409 /* Handle DC bin separately - this ignores any Fs/2 component
1410 buffer[1]=buffer[0]=buffer[0]/2;*/
1411 /* Handle DC and Fs/2 bins specially */
1412 /* The DC bin is passed in as the real part of the DC complex value */
1413 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
1414 /* (v1+v2) = buffer[0] == the DC component */
1415 /* (v1-v2) = buffer[1] == the Fs/2 component */
1416 v1=0.5f*(buffer[0]+buffer[1]);
1417 v2=0.5f*(buffer[0]-buffer[1]);
1418 buffer[0]=v1;
1419 buffer[1]=v2;
1420
1421 /*
1422 * Butterfly:
1423 * Ain-----Aout
1424 * \ /
1425 * / \
1426 * Bin-----Bout
1427 */
1428
1429 endptr1 = buffer + h->Points * 2;
1430
1431 while(ButterfliesPerGroup > 0)
1432 {
1433 A = buffer;
1434 B = buffer + ButterfliesPerGroup * 2;
1435 int iSinCosIndex = 0;
1436 while(A < endptr1)
1437 {
1438// int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
1439 int sinCosLookup=( ((sSmallRBTable[*((unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
1440 sin=sSinCosTable.mSinCosTable[sinCosLookup].mSin;
1441 cos=sSinCosTable.mSinCosTable[sinCosLookup].mCos;
1442 iSinCosIndex++;
1443 endptr2=B;
1444 while(A<endptr2)
1445 {
1446 v1=*B*cos - *(B+1)*sin;
1447 v2=*B*sin + *(B+1)*cos;
1448 *B=(*A+v1)*(fft_type)0.5;
1449 *(A++)=*(B++)-v1;
1450 *B=(*A+v2)*(fft_type)0.5;
1451 *(A++)=*(B++)-v2;
1452 }
1453 A = B;
1454 B += ButterfliesPerGroup * 2;
1455 }
1456 ButterfliesPerGroup >>= 1;
1457 }
1458}
1459
1460void ReorderToFreq1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
1461{
1462 int bitReverseShift=16-hFFT->pow2Bits;
1463 // Copy the data into the real and imaginary outputs
1464 for(size_t i = 1; i < hFFT->Points; i++) {
1465 int brValue;
1466// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
1467 brValue = ( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
1468 RealOut[i] = buffer[brValue ];
1469 ImagOut[i] = buffer[brValue+1];
1470 }
1471 RealOut[0] = buffer[0]; // DC component
1472 ImagOut[0] = 0;
1473 RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
1474 ImagOut[hFFT->Points] = 0;
1475}
1476
1477void ReorderToTime1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
1478{
1479 int bitReverseShift=16-hFFT->pow2Bits;
1480 // Copy the data into the real outputs
1481 for(size_t i = 0; i < hFFT->Points; i++) {
1482 int brValue;
1483// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
1484 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
1485 TimeOut[i*2 ] = buffer[brValue ];
1486 TimeOut[i*2+1] = buffer[brValue+1];
1487 }
1488}
1489
1490// 4x processing simd
1492{
1493
1494 __m128 *localBuffer=(__m128 *)buffer;
1495
1496 __m128 *A, *B;
1497 __m128 *endptr1, *endptr2;
1498 int br1Index, br2Index;
1499 int br1Value, br2Value;
1500 __m128 HRplus, HRminus, HIplus, HIminus;
1501 __m128 v1, v2, sin, cos;
1502 auto ButterfliesPerGroup = h->Points / 2;
1503 int pow2BitsMinus1 = h->pow2Bits - 1;
1504 int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1);
1505 int bitReverseShiftM1 = 17 - h->pow2Bits;
1506 int bitReverseShift = bitReverseShiftM1 - 1;
1507
1508 /*
1509 * Butterfly:
1510 * Ain-----Aout
1511 * \ /
1512 * / \
1513 * Bin-----Bout
1514 */
1515
1516 endptr1 = &localBuffer[h->Points * 2];
1517
1518 while(ButterfliesPerGroup > 0)
1519 {
1520 A = localBuffer;
1521 B = &localBuffer[ButterfliesPerGroup * 2];
1522 int iSinCosIndex = 0;
1523 while(A < endptr1)
1524 {
1525// int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
1526 int sinCosLookup=( ((sSmallRBTable[*((unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
1527 sin=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
1528 cos=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
1529 iSinCosIndex++;
1530 endptr2=B;
1531 while(A<endptr2)
1532 {
1533 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
1534 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
1535 *B=_mm_add_ps( *A, v1);
1536 __m128 temp128 = _mm_set1_ps( 2.0);
1537 *(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
1538 *B=_mm_sub_ps(*A,v2);
1539 *(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
1540 }
1541 A = B;
1542 B = &B[ButterfliesPerGroup * 2];
1543 }
1544 ButterfliesPerGroup >>= 1;
1545 }
1546 /* Massage output to get the output for a real input sequence. */
1547
1548 br1Index = 1; // h->BitReversed + 1;
1549 br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
1550
1551 while(br1Index < br2Index)
1552 {
1553 br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
1554 br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
1555 int sinCosIndex=br1Index<<sinCosShift;
1556 sin=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
1557 cos=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
1558 A=&localBuffer[br1Value];
1559 B=&localBuffer[br2Value];
1560 __m128 temp128 = _mm_set1_ps( 2.0);
1561 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
1562 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
1563 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
1564 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
1565 temp128 = _mm_set1_ps( 0.5);
1566 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
1567 *B = _mm_sub_ps(*A, v1);
1568 *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
1569 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
1570
1571 br1Index++;
1572 br2Index--;
1573 }
1574 /* Handle the center bin (just need a conjugate) */
1575// A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
1576 A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
1577 // negate sse style
1578 *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
1579 /* Handle DC and Fs/2 bins separately */
1580 /* Put the Fs/2 value into the imaginary part of the DC bin */
1581 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
1582 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
1583 localBuffer[1]=v1;
1584}
1585
1586/* Description: This routine performs an inverse FFT to real data.
1587* This code is for floating point data.
1588*
1589* Note: Output is BIT-REVERSED! so you must use the BitReversed to
1590* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
1591* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
1592* Input is in normal order, interleaved (real,imaginary) complex data
1593* You must call GetFFT(fftlen) first to initialize some buffers!
1594*
1595* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
1596* - this can be done because both values will always be real only
1597* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
1598*
1599* Note: The scaling on this is done according to the standard FFT definition,
1600* so a unit amplitude DC signal will output an amplitude of (N)
1601* (Older revisions would progressively scale the input, so the output
1602* values would be similar in amplitude to the input values, which is
1603* good when using fixed point arithmetic)
1604*/
1606{
1607
1608 __m128 *localBuffer=(__m128 *)buffer;
1609
1610 __m128 *A, *B;
1611 __m128 *endptr1, *endptr2;
1612 int br1Index;
1613 __m128 HRplus, HRminus, HIplus, HIminus;
1614 __m128 v1, v2, sin, cos;
1615 auto ButterfliesPerGroup = h->Points / 2;
1616 int pow2BitsMinus1 = h->pow2Bits - 1;
1617 int sinCosShift = (sSinCosTable.mSinCosTablePow-pow2BitsMinus1);
1618 int bitReverseShiftM1 = 17 - h->pow2Bits;
1619
1620 /* Massage input to get the input for a real output sequence. */
1621 A = localBuffer + 2;
1622 B = localBuffer + h->Points * 2 - 2;
1623 br1Index = 1; //h->BitReversed + 1;
1624 while(A < B)
1625 {
1626 int sinCosIndex = br1Index << sinCosShift;
1627 sin = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mSin);
1628 cos = _mm_set1_ps(sSinCosTable.mSinCosTable[sinCosIndex].mCos);
1629 HRminus = _mm_sub_ps(*A, *B);
1630 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
1631 HIminus = _mm_sub_ps( *(A+1), *(B+1));
1632 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
1633 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
1634 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
1635 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
1636 *B = _mm_sub_ps(*A, v1);
1637 *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
1638 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
1639
1640 A=&A[2];
1641 B=&B[-2];
1642 br1Index++;
1643 }
1644 /* Handle center bin (just need conjugate) */
1645 // negate sse style
1646 *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
1647
1648 /* Handle DC bin separately - this ignores any Fs/2 component
1649 buffer[1]=buffer[0]=buffer[0]/2;*/
1650 /* Handle DC and Fs/2 bins specially */
1651 /* The DC bin is passed in as the real part of the DC complex value */
1652 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
1653 /* (v1+v2) = buffer[0] == the DC component */
1654 /* (v1-v2) = buffer[1] == the Fs/2 component */
1655 v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
1656 v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
1657 localBuffer[0] = v1;
1658 localBuffer[1] = v2;
1659
1660 /*
1661 * Butterfly:
1662 * Ain-----Aout
1663 * \ /
1664 * / \
1665 * Bin-----Bout
1666 */
1667
1668 endptr1 = localBuffer + h->Points * 2;
1669
1670 while(ButterfliesPerGroup > 0)
1671 {
1672 A = localBuffer;
1673 B = localBuffer + ButterfliesPerGroup * 2;
1674 int iSinCosIndex = 0;
1675 while(A < endptr1)
1676 {
1677// int sinCosLookup=(*SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
1678 int sinCosLookup=( ((sSmallRBTable[*((unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
1679 sin=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mSin);
1680 cos=_mm_set1_ps(sSinCosTable.mSinCosTable[sinCosLookup].mCos);
1681 iSinCosIndex++;
1682 endptr2=B;
1683 while(A<endptr2)
1684 {
1685 v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
1686 v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
1687 *B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
1688 *(A++)=_mm_sub_ps(*(B++), v1);
1689 *B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
1690 *(A++)=_mm_sub_ps(*(B++),v2);
1691 }
1692 A = B;
1693 B = &B[ButterfliesPerGroup * 2];
1694 }
1695 ButterfliesPerGroup >>= 1;
1696 }
1697}
1698
1699void ReorderToTime4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
1700{
1701 __m128 *localBuffer = (__m128 *)buffer;
1702 __m128 *localTimeOut = (__m128 *)TimeOut;
1703 int bitReverseShift = 16 - hFFT->pow2Bits;
1704
1705 // Copy the data into the real outputs
1706 for(size_t i = 0; i < hFFT->Points; i++) {
1707 int brValue;
1708 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
1709// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
1710 localTimeOut[i*2 ] = localBuffer[brValue ];
1711 localTimeOut[i*2+1] = localBuffer[brValue+1];
1712 }
1713}
1714
1715void ReorderToFreq4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
1716{
1717 __m128 *localBuffer = (__m128 *)buffer;
1718 __m128 *localRealOut = (__m128 *)RealOut;
1719 __m128 *localImagOut = (__m128 *)ImagOut;
1720 int bitReverseShift = 16 - hFFT->pow2Bits;
1721
1722 // Copy the data into the real and imaginary outputs
1723 for(size_t i = 1; i < hFFT->Points; i++) {
1724 int brValue;
1725 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
1726// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
1727 localRealOut[i] = localBuffer[brValue ];
1728 localImagOut[i] = localBuffer[brValue+1];
1729 }
1730 localRealOut[0] = localBuffer[0]; // DC component
1731 localImagOut[0] = _mm_set1_ps(0.0);
1732 localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
1733 localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
1734}
1735#endif
1736
1737#define FAST_MATH_BR24
1738#ifdef FAST_MATH_BR24
1739
1740/*
1741* Forward FFT routine. Must call GetFFT(fftlen) first!
1742*
1743* Note: Output is BIT-REVERSED! so you must use the BitReversed to
1744* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
1745* Imag_i = buffer[ h->BitReversed[i]+1 ] )
1746* Input is in normal order.
1747*
1748* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
1749* - this can be done because both values will always be real only
1750* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
1751*
1752* Note: The scaling on this is done according to the standard FFT definition,
1753* so a unit amplitude DC signal will output an amplitude of (N)
1754* (Older revisions would progressively scale the input, so the output
1755* values would be similar in amplitude to the input values, which is
1756* good when using fixed point arithmetic)
1757*/
1759{
1760 fft_type *A, *B;
1761 fft_type *endptr1, *endptr2;
1762 int br1Index, br2Index;
1763 int br1Value, br2Value;
1764 fft_type HRplus, HRminus, HIplus, HIminus;
1765 fft_type v1, v2, sin, cos;
1766 fft_type iToRad = 2 * M_PI/(2 * h->Points);
1767 int bitReverseShift = 24 - h->pow2Bits;
1768 int bitReverseShiftM1 = bitReverseShift + 1;
1769 auto ButterfliesPerGroup = h->Points / 2;
1770
1771 /*
1772 * Butterfly:
1773 * Ain-----Aout
1774 * \ /
1775 * / \
1776 * Bin-----Bout
1777 */
1778
1779 endptr1 = buffer + h->Points * 2;
1780
1781 const v4sf zeroes = {0.0,0.0,0.0,0.0};
1782 while(ButterfliesPerGroup > 0)
1783 {
1784 A = buffer;
1785 B = buffer + ButterfliesPerGroup * 2;
1786 int sinCosCalIndex = 0;
1787 int iSinCosIndex = 0;
1788 while(A < endptr1)
1789 {
1790 v4sf sin4_2, cos4_2;
1791 if(!sinCosCalIndex)
1792 {
1793 //v4sf vx=zeroes; // <-- If we want to suppress the C4701 warning later.
1794 v4sf vx;
1795 for(int i=0;i<4;i++) {
1796 int brTemp=iSinCosIndex+i;
1797 vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
1798// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
1799 }
1800 //"Warning C4701 potentially uninitialized local variable 'vx' " is OK.
1801 //vx is initialised component by component, and MSVC doesn't realize.
1802 sincos_ps(vx, &sin4_2, &cos4_2);
1803 sin=-sin4_2.m128_f32[0];
1804 cos=-cos4_2.m128_f32[0];
1805 sinCosCalIndex++;
1806 } else {
1807 sin=-sin4_2.m128_f32[sinCosCalIndex];
1808 cos=-cos4_2.m128_f32[sinCosCalIndex];
1809 if(sinCosCalIndex==3)
1810 sinCosCalIndex=0;
1811 else
1812 sinCosCalIndex++;
1813 }
1814 iSinCosIndex++;
1815 endptr2=B;
1816 while(A<endptr2)
1817 {
1818 v1=*B*cos + *(B+1)*sin;
1819 v2=*B*sin - *(B+1)*cos;
1820 *B=(*A+v1);
1821 *(A++)=*(B++)-2*v1;
1822 *B=(*A-v2);
1823 *(A++)=*(B++)+2*v2;
1824 }
1825 A = B;
1826 B += ButterfliesPerGroup * 2;
1827 }
1828 ButterfliesPerGroup >>= 1;
1829 }
1830 /* Massage output to get the output for a real input sequence. */
1831
1832 br1Index = 1; // h->BitReversed + 1;
1833 br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
1834
1835 int sinCosCalIndex = 0;
1836 while(br1Index < br2Index)
1837 {
1838 v4sf sin4_2, cos4_2;
1839 br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
1840 br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br2Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
1841 if(!sinCosCalIndex)
1842 {
1843 v4sf vx;
1844 for(int i=0;i<4;i++)
1845 vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
1846 sincos_ps(vx, &sin4_2, &cos4_2);
1847 sin=-sin4_2.m128_f32[0];
1848 cos=-cos4_2.m128_f32[0];
1849 sinCosCalIndex++;
1850 } else {
1851 sin=-sin4_2.m128_f32[sinCosCalIndex];
1852 cos=-cos4_2.m128_f32[sinCosCalIndex];
1853 if(sinCosCalIndex==3)
1854 sinCosCalIndex=0;
1855 else
1856 sinCosCalIndex++;
1857 }
1858 A=&buffer[br1Value];
1859 B=&buffer[br2Value];
1860 HRplus = (HRminus = *A - *B ) + (*B * 2);
1861 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
1862 v1 = (sin*HRminus - cos*HIplus);
1863 v2 = (cos*HRminus + sin*HIplus);
1864 *A = (HRplus + v1) * (fft_type)0.5;
1865 *B = *A - v1;
1866 *(A+1) = (HIminus + v2) * (fft_type)0.5;
1867 *(B+1) = *(A+1) - HIminus;
1868
1869 br1Index++;
1870 br2Index--;
1871 }
1872 /* Handle the center bin (just need a conjugate) */
1873// A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
1874 A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift)+1];
1875
1876 /* Handle DC bin separately - and ignore the Fs/2 bin
1877 buffer[0]+=buffer[1];
1878 buffer[1]=(fft_type)0;*/
1879 /* Handle DC and Fs/2 bins separately */
1880 /* Put the Fs/2 value into the imaginary part of the DC bin */
1881 v1=buffer[0]-buffer[1];
1882 buffer[0]+=buffer[1];
1883 buffer[1]=v1;
1884}
1885
1886/* Description: This routine performs an inverse FFT to real data.
1887* This code is for floating point data.
1888*
1889* Note: Output is BIT-REVERSED! so you must use the BitReversed to
1890* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
1891* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
1892* Input is in normal order, interleaved (real,imaginary) complex data
1893* You must call GetFFT(fftlen) first to initialize some buffers!
1894*
1895* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
1896* - this can be done because both values will always be real only
1897* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
1898*
1899* Note: The scaling on this is done according to the standard FFT definition,
1900* so a unit amplitude DC signal will output an amplitude of (N)
1901* (Older revisions would progressively scale the input, so the output
1902* values would be similar in amplitude to the input values, which is
1903* good when using fixed point arithmetic)
1904*/
1906{
1907 fft_type *A,*B;
1908 fft_type *endptr1,*endptr2;
1909 int br1Index;
1910 fft_type HRplus, HRminus, HIplus, HIminus;
1911 fft_type v1, v2, sin, cos;
1912 fft_type iToRad = 2 * M_PI / (2 * h->Points);
1913 int bitReverseShiftM1 = 25 - h->pow2Bits;
1914
1915 auto ButterfliesPerGroup = h->Points / 2;
1916
1917 /* Massage input to get the input for a real output sequence. */
1918 A = buffer + 2;
1919 B = buffer + h->Points * 2 - 2;
1920 br1Index = 1; //h->BitReversed + 1;
1921 int sinCosCalIndex = 0;
1922 while(A < B)
1923 {
1924 v4sf sin4_2, cos4_2;
1925 if(!sinCosCalIndex)
1926 {
1927 v4sf vx;
1928 for(int i=0;i<4;i++)
1929 vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
1930 sincos_ps(vx, &sin4_2, &cos4_2);
1931 sin=-sin4_2.m128_f32[0];
1932 cos=-cos4_2.m128_f32[0];
1933 sinCosCalIndex++;
1934 } else {
1935 sin=-sin4_2.m128_f32[sinCosCalIndex];
1936 cos=-cos4_2.m128_f32[sinCosCalIndex];
1937 if(sinCosCalIndex==3)
1938 sinCosCalIndex=0;
1939 else
1940 sinCosCalIndex++;
1941 }
1942 HRplus = (HRminus = *A - *B ) + (*B * 2);
1943 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
1944 v1 = (sin*HRminus + cos*HIplus);
1945 v2 = (cos*HRminus - sin*HIplus);
1946 *A = (HRplus + v1) * (fft_type)0.5;
1947 *B = *A - v1;
1948 *(A+1) = (HIminus - v2) * (fft_type)0.5;
1949 *(B+1) = *(A+1) - HIminus;
1950
1951 A=&A[2];
1952 B=&B[-2];
1953 br1Index++;
1954 }
1955 /* Handle center bin (just need conjugate) */
1956 *(A+1)=-*(A+1);
1957 /* Handle DC bin separately - this ignores any Fs/2 component
1958 buffer[1]=buffer[0]=buffer[0]/2;*/
1959 /* Handle DC and Fs/2 bins specially */
1960 /* The DC bin is passed in as the real part of the DC complex value */
1961 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
1962 /* (v1+v2) = buffer[0] == the DC component */
1963 /* (v1-v2) = buffer[1] == the Fs/2 component */
1964 v1=0.5f*(buffer[0]+buffer[1]);
1965 v2=0.5f*(buffer[0]-buffer[1]);
1966 buffer[0]=v1;
1967 buffer[1]=v2;
1968
1969 /*
1970 * Butterfly:
1971 * Ain-----Aout
1972 * \ /
1973 * / \
1974 * Bin-----Bout
1975 */
1976
1977 endptr1 = buffer + h->Points * 2;
1978
1979 while(ButterfliesPerGroup > 0)
1980 {
1981 A = buffer;
1982 B = buffer + ButterfliesPerGroup * 2;
1983 int sinCosCalIndex = 0;
1984 int iSinCosIndex = 0;
1985 while(A < endptr1)
1986 {
1987 v4sf sin4_2, cos4_2;
1988 if(!sinCosCalIndex)
1989 {
1990 v4sf vx;
1991 for(int i=0;i<4;i++) {
1992 int brTemp=iSinCosIndex+i;
1993 vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
1994// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
1995 }
1996 sincos_ps(vx, &sin4_2, &cos4_2);
1997 sin=-sin4_2.m128_f32[0];
1998 cos=-cos4_2.m128_f32[0];
1999 sinCosCalIndex++;
2000 } else {
2001 sin=-sin4_2.m128_f32[sinCosCalIndex];
2002 cos=-cos4_2.m128_f32[sinCosCalIndex];
2003 if(sinCosCalIndex==3)
2004 sinCosCalIndex=0;
2005 else
2006 sinCosCalIndex++;
2007 }
2008 iSinCosIndex++;
2009 endptr2=B;
2010 while(A<endptr2)
2011 {
2012 v1=*B*cos - *(B+1)*sin;
2013 v2=*B*sin + *(B+1)*cos;
2014 *B=(*A+v1)*(fft_type)0.5;
2015 *(A++)=*(B++)-v1;
2016 *B=(*A+v2)*(fft_type)0.5;
2017 *(A++)=*(B++)-v2;
2018 }
2019 A = B;
2020 B += ButterfliesPerGroup * 2;
2021 }
2022 ButterfliesPerGroup >>= 1;
2023 }
2024}
2025
2026void ReorderToFreq1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
2027{
2028 int bitReverseShift = 24 - hFFT->pow2Bits;
2029 // Copy the data into the real and imaginary outputs
2030 for(size_t i = 1; i < hFFT->Points; i++) {
2031 int brValue;
2032// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
2033 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift);
2034
2035 RealOut[i] = buffer[brValue ];
2036 ImagOut[i] = buffer[brValue+1];
2037 }
2038 RealOut[0] = buffer[0]; // DC component
2039 ImagOut[0] = 0;
2040 RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
2041 ImagOut[hFFT->Points] = 0;
2042}
2043
2044void ReorderToTime1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
2045{
2046 int bitReverseShift = 24 - hFFT->pow2Bits;
2047 // Copy the data into the real outputs
2048 for(size_t i = 0; i < hFFT->Points; i++) {
2049 int brValue;
2050 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift);
2051// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
2052 TimeOut[i*2 ] = buffer[brValue ];
2053 TimeOut[i*2+1] = buffer[brValue+1];
2054 }
2055}
2056
2058{
2059
2060 __m128 *localBuffer=(__m128 *)buffer;
2061
2062 __m128 *A,*B;
2063 __m128 *endptr1,*endptr2;
2064 int br1Index, br2Index;
2065 int br1Value, br2Value;
2066 __m128 HRplus,HRminus,HIplus,HIminus;
2067 __m128 v1,v2,sin,cos;
2068 fft_type iToRad = 2 * M_PI/(2 * h->Points);
2069 auto ButterfliesPerGroup = h->Points / 2;
2070 int bitReverseShift = 24 - h->pow2Bits;
2071 int bitReverseShiftM1 = bitReverseShift + 1;
2072
2073 /*
2074 * Butterfly:
2075 * Ain-----Aout
2076 * \ /
2077 * / \
2078 * Bin-----Bout
2079 */
2080
2081 endptr1 = &localBuffer[h->Points * 2];
2082
2083 while(ButterfliesPerGroup > 0)
2084 {
2085 A = localBuffer;
2086 B = &localBuffer[ButterfliesPerGroup * 2];
2087 int sinCosCalIndex = 0;
2088 int iSinCosIndex = 0;
2089 while(A < endptr1)
2090 {
2091 v4sf sin4_2, cos4_2;
2092 if(!sinCosCalIndex)
2093 {
2094 v4sf vx;
2095 for(int i=0;i<4;i++) {
2096 int brTemp=iSinCosIndex+i;
2097 vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
2098// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
2099 }
2100 sincos_ps(vx, &sin4_2, &cos4_2);
2101 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2102 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2103 sinCosCalIndex++;
2104 } else {
2105 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2106 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2107 if(sinCosCalIndex==3)
2108 sinCosCalIndex=0;
2109 else
2110 sinCosCalIndex++;
2111 }
2112 iSinCosIndex++;
2113 endptr2=B;
2114 while(A<endptr2)
2115 {
2116 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
2117 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
2118 *B=_mm_add_ps( *A, v1);
2119 __m128 temp128 = _mm_set1_ps( 2.0);
2120 *(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
2121 *B=_mm_sub_ps(*A,v2);
2122 *(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
2123 }
2124 A = B;
2125 B = &B[ButterfliesPerGroup * 2];
2126 }
2127 ButterfliesPerGroup >>= 1;
2128 }
2129 /* Massage output to get the output for a real input sequence. */
2130
2131 br1Index = 1; // h->BitReversed+1;
2132 br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
2133
2134 int sinCosCalIndex = 0;
2135 while(br1Index < br2Index)
2136 {
2137 v4sf sin4_2, cos4_2;
2138 br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
2139 br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br2Index)+2)] )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
2140 if(!sinCosCalIndex)
2141 {
2142 v4sf vx;
2143 for(int i=0;i<4;i++)
2144 vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
2145 sincos_ps(vx, &sin4_2, &cos4_2);
2146 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2147 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2148 sinCosCalIndex++;
2149 } else {
2150 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2151 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2152 if(sinCosCalIndex==3)
2153 sinCosCalIndex=0;
2154 else
2155 sinCosCalIndex++;
2156 }
2157 A=&localBuffer[br1Value];
2158 B=&localBuffer[br2Value];
2159 __m128 temp128 = _mm_set1_ps( 2.0);
2160 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
2161 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
2162 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
2163 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
2164 temp128 = _mm_set1_ps( 0.5);
2165 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
2166 *B = _mm_sub_ps(*A, v1);
2167 *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
2168 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
2169
2170 br1Index++;
2171 br2Index--;
2172 }
2173 /* Handle the center bin (just need a conjugate) */
2174// A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
2175 A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&br1Index)+2)] )>>bitReverseShift)+1];
2176
2177 // negate sse style
2178 *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
2179 /* Handle DC and Fs/2 bins separately */
2180 /* Put the Fs/2 value into the imaginary part of the DC bin */
2181 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
2182 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
2183 localBuffer[1]=v1;
2184}
2185
2186/* Description: This routine performs an inverse FFT to real data.
2187* This code is for floating point data.
2188*
2189* Note: Output is BIT-REVERSED! so you must use the BitReversed to
2190* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
2191* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
2192* Input is in normal order, interleaved (real,imaginary) complex data
2193* You must call GetFFT(fftlen) first to initialize some buffers!
2194*
2195* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
2196* - this can be done because both values will always be real only
2197* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
2198*
2199* Note: The scaling on this is done according to the standard FFT definition,
2200* so a unit amplitude DC signal will output an amplitude of (N)
2201* (Older revisions would progressively scale the input, so the output
2202* values would be similar in amplitude to the input values, which is
2203* good when using fixed point arithmetic)
2204*/
2206{
2207
2208 __m128 *localBuffer=(__m128 *)buffer;
2209
2210 __m128 *A,*B;
2211 __m128 *endptr1,*endptr2;
2212 int br1Index;
2213 __m128 HRplus,HRminus,HIplus,HIminus;
2214 __m128 v1,v2,sin,cos;
2215 fft_type iToRad = 2 * M_PI/(2 * h->Points);
2216 int bitReverseShiftM1 = 25 - h->pow2Bits;
2217 auto ButterfliesPerGroup = h->Points / 2;
2218
2219 /* Massage input to get the input for a real output sequence. */
2220 A = localBuffer + 2;
2221 B = localBuffer + h->Points * 2 - 2;
2222 br1Index = 1; //h->BitReversed + 1;
2223 int sinCosCalIndex = 0;
2224 while(A < B)
2225 {
2226 v4sf sin4_2, cos4_2;
2227 if(!sinCosCalIndex)
2228 {
2229 v4sf vx;
2230 for(int i=0;i<4;i++)
2231 vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
2232 sincos_ps(vx, &sin4_2, &cos4_2);
2233 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2234 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2235 sinCosCalIndex++;
2236 } else {
2237 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2238 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2239 if(sinCosCalIndex==3)
2240 sinCosCalIndex=0;
2241 else
2242 sinCosCalIndex++;
2243 }
2244 HRminus = _mm_sub_ps(*A, *B);
2245 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
2246 HIminus = _mm_sub_ps( *(A+1), *(B+1));
2247 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
2248 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
2249 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
2250 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
2251 *B = _mm_sub_ps(*A, v1);
2252 *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
2253 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
2254
2255 A=&A[2];
2256 B=&B[-2];
2257 br1Index++;
2258 }
2259 /* Handle center bin (just need conjugate) */
2260 // negate sse style
2261 *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
2262
2263 /* Handle DC bin separately - this ignores any Fs/2 component
2264 buffer[1]=buffer[0]=buffer[0]/2;*/
2265 /* Handle DC and Fs/2 bins specially */
2266 /* The DC bin is passed in as the real part of the DC complex value */
2267 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
2268 /* (v1+v2) = buffer[0] == the DC component */
2269 /* (v1-v2) = buffer[1] == the Fs/2 component */
2270 v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
2271 v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
2272 localBuffer[0]=v1;
2273 localBuffer[1]=v2;
2274
2275 /*
2276 * Butterfly:
2277 * Ain-----Aout
2278 * \ /
2279 * / \
2280 * Bin-----Bout
2281 */
2282
2283 endptr1 = localBuffer + h->Points * 2;
2284
2285 while(ButterfliesPerGroup > 0)
2286 {
2287 A = localBuffer;
2288 B = localBuffer + ButterfliesPerGroup * 2;
2289 int sinCosCalIndex = 0;
2290 int iSinCosIndex = 0;
2291 while(A < endptr1)
2292 {
2293 v4sf sin4_2, cos4_2;
2294 if(!sinCosCalIndex)
2295 {
2296 v4sf vx;
2297 for(int i=0;i<4;i++) {
2298 int brTemp=iSinCosIndex+i;
2299 vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
2300// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
2301 }
2302 sincos_ps(vx, &sin4_2, &cos4_2);
2303 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2304 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2305 sinCosCalIndex++;
2306 } else {
2307 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2308 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2309 if(sinCosCalIndex==3)
2310 sinCosCalIndex=0;
2311 else
2312 sinCosCalIndex++;
2313 }
2314 iSinCosIndex++;
2315 endptr2=B;
2316 while(A<endptr2)
2317 {
2318 v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
2319 v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
2320 *B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
2321 *(A++)=_mm_sub_ps(*(B++), v1);
2322 *B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
2323 *(A++)=_mm_sub_ps(*(B++),v2);
2324 }
2325 A = B;
2326 B = &B[ButterfliesPerGroup * 2];
2327 }
2328 ButterfliesPerGroup >>= 1;
2329 }
2330}
2331
2332void ReorderToFreq4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
2333{
2334 __m128 *localBuffer = (__m128 *)buffer;
2335 __m128 *localRealOut = (__m128 *)RealOut;
2336 __m128 *localImagOut = (__m128 *)ImagOut;
2337 int bitReverseShift = 24-hFFT->pow2Bits;
2338
2339
2340 // Copy the data into the real and imaginary outputs
2341 for(size_t i = 1; i < hFFT->Points; i++) {
2342 int brValue;
2343 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift);
2344// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
2345 localRealOut[i]=localBuffer[brValue ];
2346 localImagOut[i]=localBuffer[brValue+1];
2347 }
2348 localRealOut[0] = localBuffer[0]; // DC component
2349 localImagOut[0] = _mm_set1_ps(0.0);
2350 localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
2351 localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
2352}
2353
2354void ReorderToTime4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
2355{
2356 __m128 *localBuffer = (__m128 *)buffer;
2357 __m128 *localTimeOut = (__m128 *)TimeOut;
2358 int bitReverseShift = 24-hFFT->pow2Bits;
2359
2360 // Copy the data into the real outputs
2361 for(size_t i = 0; i < hFFT->Points; i++) {
2362 int brValue;
2363 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<16) + (sSmallRBTable[*(((unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((unsigned char *)&i)+2)] )>>bitReverseShift);
2364// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
2365 localTimeOut[i*2 ] = localBuffer[brValue ];
2366 localTimeOut[i*2+1] = localBuffer[brValue+1];
2367 }
2368}
2369
2370#endif
2371
2372#define FAST_MATH_BR16
2373#ifdef FAST_MATH_BR16
2374
2375/*
2376* Forward FFT routine. Must call GetFFT(fftlen) first!
2377*
2378* Note: Output is BIT-REVERSED! so you must use the BitReversed to
2379* get legible output, (i.e. Real_i = buffer[ h->BitReversed[i] ]
2380* Imag_i = buffer[ h->BitReversed[i]+1 ] )
2381* Input is in normal order.
2382*
2383* Output buffer[0] is the DC bin, and output buffer[1] is the Fs/2 bin
2384* - this can be done because both values will always be real only
2385* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
2386*
2387* Note: The scaling on this is done according to the standard FFT definition,
2388* so a unit amplitude DC signal will output an amplitude of (N)
2389* (Older revisions would progressively scale the input, so the output
2390* values would be similar in amplitude to the input values, which is
2391* good when using fixed point arithmetic)
2392*/
2394{
2395 fft_type *A,*B;
2396 fft_type *endptr1,*endptr2;
2397 int br1Index, br2Index;
2398 int br1Value, br2Value;
2399 fft_type HRplus,HRminus,HIplus,HIminus;
2400 fft_type v1,v2,sin,cos;
2401 fft_type iToRad = 2 * M_PI / (2 * h->Points);
2402 int bitReverseShiftM1 = 17 - h->pow2Bits;
2403 int bitReverseShift = bitReverseShiftM1 - 1;
2404 auto ButterfliesPerGroup = h->Points / 2;
2405
2406 /*
2407 * Butterfly:
2408 * Ain-----Aout
2409 * \ /
2410 * / \
2411 * Bin-----Bout
2412 */
2413
2414 endptr1 = buffer + h->Points * 2;
2415
2416 while(ButterfliesPerGroup > 0)
2417 {
2418 A = buffer;
2419 B = buffer + ButterfliesPerGroup * 2;
2420 int sinCosCalIndex = 0;
2421 int iSinCosIndex = 0;
2422 while(A < endptr1)
2423 {
2424 v4sf sin4_2, cos4_2;
2425 if(!sinCosCalIndex)
2426 {
2427 v4sf vx;
2428 for(int i=0;i<4;i++) {
2429 int brTemp=iSinCosIndex+i;
2430 vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
2431// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
2432 }
2433 sincos_ps(vx, &sin4_2, &cos4_2);
2434 sin=-sin4_2.m128_f32[0];
2435 cos=-cos4_2.m128_f32[0];
2436 sinCosCalIndex++;
2437 } else {
2438 sin=-sin4_2.m128_f32[sinCosCalIndex];
2439 cos=-cos4_2.m128_f32[sinCosCalIndex];
2440 if(sinCosCalIndex==3)
2441 sinCosCalIndex=0;
2442 else
2443 sinCosCalIndex++;
2444 }
2445 iSinCosIndex++;
2446 endptr2=B;
2447 while(A<endptr2)
2448 {
2449 v1=*B*cos + *(B+1)*sin;
2450 v2=*B*sin - *(B+1)*cos;
2451 *B=(*A+v1);
2452 *(A++)=*(B++)-2*v1;
2453 *B=(*A-v2);
2454 *(A++)=*(B++)+2*v2;
2455 }
2456 A = B;
2457 B += ButterfliesPerGroup * 2;
2458 }
2459 ButterfliesPerGroup >>= 1;
2460 }
2461 /* Massage output to get the output for a real input sequence. */
2462
2463 br1Index = 1; // h->BitReversed+1;
2464 br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
2465
2466 int sinCosCalIndex = 0;
2467 while(br1Index < br2Index)
2468 {
2469 v4sf sin4_2, cos4_2;
2470 br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
2471 br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
2472 if(!sinCosCalIndex)
2473 {
2474 v4sf vx;
2475 for(int i = 0; i < 4; i++)
2476 vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
2477 sincos_ps(vx, &sin4_2, &cos4_2);
2478 sin = -sin4_2.m128_f32[0];
2479 cos=-cos4_2.m128_f32[0];
2480 sinCosCalIndex++;
2481 } else {
2482 sin=-sin4_2.m128_f32[sinCosCalIndex];
2483 cos=-cos4_2.m128_f32[sinCosCalIndex];
2484 if(sinCosCalIndex==3)
2485 sinCosCalIndex=0;
2486 else
2487 sinCosCalIndex++;
2488 }
2489 A=&buffer[br1Value];
2490 B=&buffer[br2Value];
2491 HRplus = (HRminus = *A - *B ) + (*B * 2);
2492 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
2493 v1 = (sin*HRminus - cos*HIplus);
2494 v2 = (cos*HRminus + sin*HIplus);
2495 *A = (HRplus + v1) * (fft_type)0.5;
2496 *B = *A - v1;
2497 *(A+1) = (HIminus + v2) * (fft_type)0.5;
2498 *(B+1) = *(A+1) - HIminus;
2499
2500 br1Index++;
2501 br2Index--;
2502 }
2503 /* Handle the center bin (just need a conjugate) */
2504// A=&buffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
2505 A=&buffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
2506
2507 /* Handle DC bin separately - and ignore the Fs/2 bin
2508 buffer[0]+=buffer[1];
2509 buffer[1]=(fft_type)0;*/
2510 /* Handle DC and Fs/2 bins separately */
2511 /* Put the Fs/2 value into the imaginary part of the DC bin */
2512 v1=buffer[0]-buffer[1];
2513 buffer[0]+=buffer[1];
2514 buffer[1]=v1;
2515}
2516
2517/* Description: This routine performs an inverse FFT to real data.
2518* This code is for floating point data.
2519*
2520* Note: Output is BIT-REVERSED! so you must use the BitReversed to
2521* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
2522* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
2523* Input is in normal order, interleaved (real,imaginary) complex data
2524* You must call GetFFT(fftlen) first to initialize some buffers!
2525*
2526* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
2527* - this can be done because both values will always be real only
2528* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
2529*
2530* Note: The scaling on this is done according to the standard FFT definition,
2531* so a unit amplitude DC signal will output an amplitude of (N)
2532* (Older revisions would progressively scale the input, so the output
2533* values would be similar in amplitude to the input values, which is
2534* good when using fixed point arithmetic)
2535*/
2537{
2538 fft_type *A,*B;
2539 fft_type *endptr1,*endptr2;
2540 int br1Index;
2541 fft_type HRplus,HRminus,HIplus,HIminus;
2542 fft_type v1,v2,sin,cos;
2543 fft_type iToRad=2 * M_PI / (2 * h->Points);
2544 int bitReverseShiftM1=17-h->pow2Bits;
2545
2546 auto ButterfliesPerGroup = h->Points / 2;
2547
2548 /* Massage input to get the input for a real output sequence. */
2549 A = buffer + 2;
2550 B = buffer + h->Points * 2 - 2;
2551 br1Index = 1; //h->BitReversed + 1;
2552 int sinCosCalIndex = 0;
2553 while(A < B)
2554 {
2555 v4sf sin4_2, cos4_2;
2556 if(!sinCosCalIndex)
2557 {
2558 v4sf vx;
2559 for(int i=0;i<4;i++)
2560 vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
2561 sincos_ps(vx, &sin4_2, &cos4_2);
2562 sin=-sin4_2.m128_f32[0];
2563 cos=-cos4_2.m128_f32[0];
2564 sinCosCalIndex++;
2565 } else {
2566 sin=-sin4_2.m128_f32[sinCosCalIndex];
2567 cos=-cos4_2.m128_f32[sinCosCalIndex];
2568 if(sinCosCalIndex==3)
2569 sinCosCalIndex=0;
2570 else
2571 sinCosCalIndex++;
2572 }
2573 HRplus = (HRminus = *A - *B ) + (*B * 2);
2574 HIplus = (HIminus = *(A+1) - *(B+1)) + (*(B+1) * 2);
2575 v1 = (sin*HRminus + cos*HIplus);
2576 v2 = (cos*HRminus - sin*HIplus);
2577 *A = (HRplus + v1) * (fft_type)0.5;
2578 *B = *A - v1;
2579 *(A+1) = (HIminus - v2) * (fft_type)0.5;
2580 *(B+1) = *(A+1) - HIminus;
2581
2582 A=&A[2];
2583 B=&B[-2];
2584 br1Index++;
2585 }
2586 /* Handle center bin (just need conjugate) */
2587 *(A+1)=-*(A+1);
2588 /* Handle DC bin separately - this ignores any Fs/2 component
2589 buffer[1]=buffer[0]=buffer[0]/2;*/
2590 /* Handle DC and Fs/2 bins specially */
2591 /* The DC bin is passed in as the real part of the DC complex value */
2592 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
2593 /* (v1+v2) = buffer[0] == the DC component */
2594 /* (v1-v2) = buffer[1] == the Fs/2 component */
2595 v1=0.5f*(buffer[0]+buffer[1]);
2596 v2=0.5f*(buffer[0]-buffer[1]);
2597 buffer[0]=v1;
2598 buffer[1]=v2;
2599
2600 /*
2601 * Butterfly:
2602 * Ain-----Aout
2603 * \ /
2604 * / \
2605 * Bin-----Bout
2606 */
2607
2608 endptr1 = buffer + h->Points * 2;
2609
2610 while(ButterfliesPerGroup > 0)
2611 {
2612 A = buffer;
2613 B = buffer + ButterfliesPerGroup * 2;
2614 int sinCosCalIndex = 0;
2615 int iSinCosIndex = 0;
2616 while(A < endptr1)
2617 {
2618 v4sf sin4_2, cos4_2;
2619 if(!sinCosCalIndex)
2620 {
2621 v4sf vx;
2622 for(int i = 0; i < 4; i++) {
2623 int brTemp = iSinCosIndex + i;
2624 vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
2625// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
2626 }
2627 sincos_ps(vx, &sin4_2, &cos4_2);
2628 sin=-sin4_2.m128_f32[0];
2629 cos=-cos4_2.m128_f32[0];
2630 sinCosCalIndex++;
2631 } else {
2632 sin=-sin4_2.m128_f32[sinCosCalIndex];
2633 cos=-cos4_2.m128_f32[sinCosCalIndex];
2634 if(sinCosCalIndex==3)
2635 sinCosCalIndex=0;
2636 else
2637 sinCosCalIndex++;
2638 }
2639 iSinCosIndex++;
2640 endptr2=B;
2641 while(A<endptr2)
2642 {
2643 v1=*B*cos - *(B+1)*sin;
2644 v2=*B*sin + *(B+1)*cos;
2645 *B=(*A+v1)*(fft_type)0.5;
2646 *(A++)=*(B++)-v1;
2647 *B=(*A+v2)*(fft_type)0.5;
2648 *(A++)=*(B++)-v2;
2649 }
2650 A = B;
2651 B += ButterfliesPerGroup * 2;
2652 }
2653 ButterfliesPerGroup >>= 1;
2654 }
2655}
2656
2657void ReorderToFreq1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
2658{
2659 int bitReverseShift = 16 - hFFT->pow2Bits;
2660 // Copy the data into the real and imaginary outputs
2661 for(size_t i = 1; i < hFFT->Points; i++) {
2662 int brValue;
2663// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
2664 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
2665 RealOut[i] = buffer[brValue ];
2666 ImagOut[i] = buffer[brValue+1];
2667 }
2668 RealOut[0] = buffer[0]; // DC component
2669 ImagOut[0] = 0;
2670 RealOut[hFFT->Points] = buffer[1]; // Fs/2 component
2671 ImagOut[hFFT->Points] = 0;
2672}
2673
2674void ReorderToTime1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
2675{
2676 int bitReverseShift=16-hFFT->pow2Bits;
2677 // Copy the data into the real outputs
2678 for(size_t i = 0; i < hFFT->Points; i++) {
2679 int brValue;
2680 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
2681// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
2682 TimeOut[i*2 ] = buffer[brValue ];
2683 TimeOut[i*2+1] = buffer[brValue+1];
2684 }
2685}
2686
2688{
2689
2690 __m128 *localBuffer = (__m128 *)buffer;
2691
2692 __m128 *A,*B;
2693 __m128 *endptr1, *endptr2;
2694 int br1Index, br2Index;
2695 int br1Value, br2Value;
2696 __m128 HRplus, HRminus, HIplus, HIminus;
2697 __m128 v1, v2, sin, cos;
2698 fft_type iToRad = 2 * M_PI/(2 * h->Points);
2699 auto ButterfliesPerGroup = h->Points / 2;
2700 int bitReverseShiftM1 = 17 - h->pow2Bits;
2701 int bitReverseShift = bitReverseShiftM1 - 1;
2702
2703 /*
2704 * Butterfly:
2705 * Ain-----Aout
2706 * \ /
2707 * / \
2708 * Bin-----Bout
2709 */
2710
2711 endptr1 = &localBuffer[h->Points * 2];
2712
2713 while(ButterfliesPerGroup > 0)
2714 {
2715 A = localBuffer;
2716 B = &localBuffer[ButterfliesPerGroup * 2];
2717 int sinCosCalIndex = 0;
2718 int iSinCosIndex = 0;
2719 while(A < endptr1)
2720 {
2721 v4sf sin4_2, cos4_2;
2722 if(!sinCosCalIndex)
2723 {
2724 v4sf vx;
2725 for(int i=0;i<4;i++) {
2726 int brTemp=iSinCosIndex+i;
2727 vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
2728// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
2729 }
2730 sincos_ps(vx, &sin4_2, &cos4_2);
2731 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2732 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2733 sinCosCalIndex++;
2734 } else {
2735 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2736 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2737 if(sinCosCalIndex==3)
2738 sinCosCalIndex=0;
2739 else
2740 sinCosCalIndex++;
2741 }
2742 iSinCosIndex++;
2743 endptr2=B;
2744 while(A<endptr2)
2745 {
2746 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
2747 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
2748 *B=_mm_add_ps( *A, v1);
2749 __m128 temp128 = _mm_set1_ps( 2.0);
2750 *(A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
2751 *B=_mm_sub_ps(*A,v2);
2752 *(A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
2753 }
2754 A = B;
2755 B = &B[ButterfliesPerGroup * 2];
2756 }
2757 ButterfliesPerGroup >>= 1;
2758 }
2759 /* Massage output to get the output for a real input sequence. */
2760
2761 br1Index = 1; // h->BitReversed + 1;
2762 br2Index = h->Points - 1; //h->BitReversed + h->Points - 1;
2763
2764 int sinCosCalIndex = 0;
2765 while(br1Index < br2Index)
2766 {
2767 v4sf sin4_2, cos4_2;
2768 br1Value=( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br1Index);
2769 br2Value=( ((sSmallRBTable[*((unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br2Index)+1)]) )>>bitReverseShift); // (*SmallVRB[h->pow2Bits])(br2Index);
2770 if(!sinCosCalIndex)
2771 {
2772 v4sf vx;
2773 for(int i = 0; i < 4; i++)
2774 vx.m128_f32[i] = ((float)(br1Index+i)) * iToRad;
2775 sincos_ps(vx, &sin4_2, &cos4_2);
2776 sin = _mm_set1_ps(-sin4_2.m128_f32[0]);
2777 cos = _mm_set1_ps(-cos4_2.m128_f32[0]);
2778 sinCosCalIndex++;
2779 } else {
2780 sin = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2781 cos = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2782 if(sinCosCalIndex == 3)
2783 sinCosCalIndex = 0;
2784 else
2785 sinCosCalIndex++;
2786 }
2787 A = &localBuffer[br1Value];
2788 B = &localBuffer[br2Value];
2789 __m128 temp128 = _mm_set1_ps( 2.0);
2790 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *A, *B ), _mm_mul_ps(*B, temp128));
2791 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
2792 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
2793 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
2794 temp128 = _mm_set1_ps( 0.5);
2795 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
2796 *B = _mm_sub_ps(*A, v1);
2797 *(A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
2798 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
2799
2800 br1Index++;
2801 br2Index--;
2802 }
2803 /* Handle the center bin (just need a conjugate) */
2804// A=&localBuffer[(*SmallVRB[h->pow2Bits])(br1Index)+1];
2805 A=&localBuffer[( ((sSmallRBTable[*((unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
2806
2807 // negate sse style
2808 *A=_mm_xor_ps(*A, _mm_set1_ps(-0.f));
2809 /* Handle DC and Fs/2 bins separately */
2810 /* Put the Fs/2 value into the imaginary part of the DC bin */
2811 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
2812 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
2813 localBuffer[1]=v1;
2814}
2815
2816/* Description: This routine performs an inverse FFT to real data.
2817* This code is for floating point data.
2818*
2819* Note: Output is BIT-REVERSED! so you must use the BitReversed to
2820* get legible output, (i.e. wave[2*i] = buffer[ BitReversed[i] ]
2821* wave[2*i+1] = buffer[ BitReversed[i]+1 ] )
2822* Input is in normal order, interleaved (real,imaginary) complex data
2823* You must call GetFFT(fftlen) first to initialize some buffers!
2824*
2825* Input buffer[0] is the DC bin, and input buffer[1] is the Fs/2 bin
2826* - this can be done because both values will always be real only
2827* - this allows us to not have to allocate an extra complex value for the Fs/2 bin
2828*
2829* Note: The scaling on this is done according to the standard FFT definition,
2830* so a unit amplitude DC signal will output an amplitude of (N)
2831* (Older revisions would progressively scale the input, so the output
2832* values would be similar in amplitude to the input values, which is
2833* good when using fixed point arithmetic)
2834*/
2836{
2837
2838 __m128 *localBuffer=(__m128 *)buffer;
2839
2840 __m128 *A, *B;
2841 __m128 *endptr1, *endptr2;
2842 int br1Index;
2843 __m128 HRplus, HRminus, HIplus, HIminus;
2844 __m128 v1, v2, sin, cos;
2845 fft_type iToRad = 2 * M_PI/(2 * h->Points);
2846 int bitReverseShiftM1 = 17 - h->pow2Bits;
2847 auto ButterfliesPerGroup = h->Points / 2;
2848
2849 /* Massage input to get the input for a real output sequence. */
2850 A = localBuffer + 2;
2851 B = localBuffer + h->Points * 2 - 2;
2852 br1Index=1; //h->BitReversed+1;
2853 int sinCosCalIndex=0;
2854 while(A<B)
2855 {
2856 v4sf sin4_2, cos4_2;
2857 if(!sinCosCalIndex)
2858 {
2859 v4sf vx;
2860 for(int i=0;i<4;i++)
2861 vx.m128_f32[i]=((float)(br1Index+i))*iToRad;
2862 sincos_ps(vx, &sin4_2, &cos4_2);
2863 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2864 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2865 sinCosCalIndex++;
2866 } else {
2867 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2868 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2869 if(sinCosCalIndex==3)
2870 sinCosCalIndex=0;
2871 else
2872 sinCosCalIndex++;
2873 }
2874 HRminus = _mm_sub_ps(*A, *B);
2875 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
2876 HIminus = _mm_sub_ps( *(A+1), *(B+1));
2877 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
2878 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
2879 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
2880 *A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
2881 *B = _mm_sub_ps(*A, v1);
2882 *(A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
2883 *(B+1) = _mm_sub_ps(*(A+1), HIminus);
2884
2885 A=&A[2];
2886 B=&B[-2];
2887 br1Index++;
2888 }
2889 /* Handle center bin (just need conjugate) */
2890 // negate sse style
2891 *(A+1)=_mm_xor_ps(*(A+1), _mm_set1_ps(-0.f));
2892
2893 /* Handle DC bin separately - this ignores any Fs/2 component
2894 buffer[1]=buffer[0]=buffer[0]/2;*/
2895 /* Handle DC and Fs/2 bins specially */
2896 /* The DC bin is passed in as the real part of the DC complex value */
2897 /* The Fs/2 bin is passed in as the imaginary part of the DC complex value */
2898 /* (v1+v2) = buffer[0] == the DC component */
2899 /* (v1-v2) = buffer[1] == the Fs/2 component */
2900 v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
2901 v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
2902 localBuffer[0]=v1;
2903 localBuffer[1]=v2;
2904
2905 /*
2906 * Butterfly:
2907 * Ain-----Aout
2908 * \ /
2909 * / \
2910 * Bin-----Bout
2911 */
2912
2913 endptr1 = localBuffer + h->Points * 2;
2914
2915 while(ButterfliesPerGroup > 0)
2916 {
2917 A = localBuffer;
2918 B = localBuffer + ButterfliesPerGroup * 2;
2919 int sinCosCalIndex = 0;
2920 int iSinCosIndex = 0;
2921 while(A < endptr1)
2922 {
2923 v4sf sin4_2, cos4_2;
2924 if(!sinCosCalIndex)
2925 {
2926 v4sf vx;
2927 for(int i=0;i<4;i++) {
2928 int brTemp=iSinCosIndex+i;
2929 vx.m128_f32[i]=( ((sSmallRBTable[*((unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
2930// vx.m128_f32[i]=((fft_type )SmallRB(iSinCosIndex+i,h->pow2Bits-1))*iToRad;
2931 }
2932 sincos_ps(vx, &sin4_2, &cos4_2);
2933 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2934 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2935 sinCosCalIndex++;
2936 } else {
2937 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2938 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2939 if(sinCosCalIndex==3)
2940 sinCosCalIndex=0;
2941 else
2942 sinCosCalIndex++;
2943 }
2944 iSinCosIndex++;
2945 endptr2=B;
2946 while(A<endptr2)
2947 {
2948 v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
2949 v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
2950 *B=_mm_mul_ps( _mm_add_ps(*A, v1), _mm_set1_ps(0.5));
2951 *(A++)=_mm_sub_ps(*(B++), v1);
2952 *B=_mm_mul_ps(_mm_add_ps(*A, v2), _mm_set1_ps(0.5));
2953 *(A++)=_mm_sub_ps(*(B++),v2);
2954 }
2955 A = B;
2956 B = &B[ButterfliesPerGroup * 2];
2957 }
2958 ButterfliesPerGroup >>= 1;
2959 }
2960}
2961
2962void ReorderToFreq4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
2963{
2964 __m128 *localBuffer=(__m128 *)buffer;
2965 __m128 *localRealOut=(__m128 *)RealOut;
2966 __m128 *localImagOut=(__m128 *)ImagOut;
2967 int bitReverseShift=16-hFFT->pow2Bits;
2968
2969
2970 // Copy the data into the real and imaginary outputs
2971 for(size_t i = 1; i < hFFT->Points; i++) {
2972 int brValue;
2973 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
2974// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
2975 localRealOut[i]=localBuffer[brValue ];
2976 localImagOut[i]=localBuffer[brValue+1];
2977 }
2978 localRealOut[0] = localBuffer[0]; // DC component
2979 localImagOut[0] = _mm_set1_ps(0.0);
2980 localRealOut[hFFT->Points] = localBuffer[1]; // Fs/2 component
2981 localImagOut[hFFT->Points] = _mm_set1_ps(0.0);
2982}
2983
2984void ReorderToTime4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
2985{
2986 __m128 *localBuffer=(__m128 *)buffer;
2987 __m128 *localTimeOut=(__m128 *)TimeOut;
2988 int bitReverseShift=16-hFFT->pow2Bits;
2989
2990 // Copy the data into the real outputs
2991 for(size_t i = 0; i < hFFT->Points; i++) {
2992 int brValue;
2993 brValue=( ((sSmallRBTable[*((unsigned char *)&i)]<<8) + (sSmallRBTable[*(((unsigned char *)&i)+1)]) )>>bitReverseShift);
2994// brValue=(*SmallVRB[hFFT->pow2Bits])(i);
2995 localTimeOut[i*2 ] = localBuffer[brValue ];
2996 localTimeOut[i*2+1] = localBuffer[brValue+1];
2997 }
2998}
2999
3000#endif
3001#endif
#define M_PI
Definition: Distortion.cpp:22
void RealFFTf4x(fft_type *, FFTParam *, int functionType=-1)
float fft_type
Definition: RealFFTf48x.h:6
void ReorderToFreq1x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType=-1)
void InverseRealFFTf4xFastMathBR16(fft_type *, FFTParam *)
void InverseRealFFTf4xSinCosTableVBR16(fft_type *, FFTParam *)
void RealFFTf4xFastMathBR16(fft_type *, FFTParam *)
void ReorderToTime1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToFreq4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void RealFFTf4xSinCosTableVBR16(fft_type *, FFTParam *)
void ReorderToTime1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToTime1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToFreq1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void RealFFTf1xSinCosTableBR16(fft_type *, FFTParam *)
void ReorderToTime4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void RealFFTf1xSinCosBRTable(fft_type *, FFTParam *)
void ReorderToFreq4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void ReorderToFreq4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void InverseRealFFTf1xSinCosTableVBR16(fft_type *, FFTParam *)
void TableUsage(int iMask)
void RealFFTf4xFastMathBR24(fft_type *, FFTParam *)
void ReorderToFreq4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void ReorderToFreq1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void ReorderToFreq1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void RealFFTf1x(fft_type *, FFTParam *, int functionType=-1)
void InverseRealFFTf1xFastMathBR16(fft_type *, FFTParam *)
void ReorderToFreq1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void ReorderToTime4x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType=-1)
void ReorderToTime4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
@ FFT_SinCosBRTable
Definition: RealFFTf48x.h:11
@ FFT_SinCosTableBR16
Definition: RealFFTf48x.h:13
@ FFT_FastMathBR16
Definition: RealFFTf48x.h:14
@ FFT_SinCosTableVBR16
Definition: RealFFTf48x.h:12
@ FFT_FastMathBR24
Definition: RealFFTf48x.h:15
void InverseRealFFTf1xSinCosTableBR16(fft_type *, FFTParam *)
void InverseRealFFTf4xSinCosBRTable(fft_type *, FFTParam *)
void InverseRealFFTf1xFastMathBR24(fft_type *, FFTParam *)
void ReorderToTime1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToFreq4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void RealFFTf4xSinCosTableBR16(fft_type *, FFTParam *)
void ReorderToTime4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToTime1x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType=-1)
void InverseRealFFTf4x(fft_type *, FFTParam *, int functionType=-1)
void RealFFTf1xFastMathBR16(fft_type *, FFTParam *)
void InverseRealFFTf1x(fft_type *, FFTParam *, int functionType=-1)
void InverseRealFFTf1xSinCosBRTable(fft_type *, FFTParam *)
void ReorderToTime1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void RealFFTf1xSinCosTableVBR16(fft_type *, FFTParam *)
void InverseRealFFTf4xSinCosTableBR16(fft_type *, FFTParam *)
int(* SmallVRB[])(int bits)
void ReorderToFreq4x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType=-1)
void RealFFTf1xFastMathBR24(fft_type *, FFTParam *)
void InverseRealFFTf4xFastMathBR24(fft_type *, FFTParam *)
void RealFFTf4xSinCosBRTable(fft_type *, FFTParam *)
int SmallRB(int bits, int numberBits)
void ReorderToTime4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToTime4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToFreq1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
SSE maths functions (for FFTs)
void sincos_ps(v4sf x, v4sf *s, v4sf *c)
Definition: SseMathFuncs.h:631
__m128 v4sf
Definition: SseMathFuncs.h:108
#define A(N)
Definition: ToChars.cpp:62
ArrayOf< SinCosStruct > mSinCosTable
Definition: RealFFTf48x.h:94
int mSinCosTablePow
Definition: RealFFTf48x.h:93
size_t Points
Definition: RealFFTf.h:10
ArrayOf< int > BitReversed
Definition: RealFFTf.h:8
ArrayOf< fft_type > SinTable
Definition: RealFFTf.h:9