62#ifdef EXPERIMENTAL_EQ_SSE_THREADED
76#pragma warning(disable:4305)
84#define M_PI 3.14159265358979323846
// Run-time switches, both off by default. Elsewhere in this file they are
// assigned from a mask value: bit 0 (value 1) drives useBitReverseTable and
// bit 1 (value 2) drives useSinCosTable.
bool useBitReverseTable = false;
bool useSinCosTable = false;
94 useBitReverseTable=(iMask&1)!=0;
95 useSinCosTable=(iMask&2)!=0;
101 size_t tableSize=1<<mSinCosTablePow;
102 mSinCosTable.reinit(tableSize);
103 for(
size_t i=0;i<tableSize;i++) {
104 mSinCosTable[i].mSin=(float)-sin(((
float)i)*
M_PI/tableSize);
105 mSinCosTable[i].mCos=(float)-cos(((
float)i)*
M_PI/tableSize);
111static unsigned char sSmallRBTable[256];
117 for(
int i=0;i<256;i++) {
119 for(
int maskLow=1, maskHigh=128;maskLow<256;maskLow<<=1,maskHigh>>=1)
121 sSmallRBTable[i]|=maskHigh;
125static BitReverser sBitReverser;
// Zero-width entry of the SmallVRB family: there is nothing to reverse, so
// the argument is handed back unchanged.
int SmallVRB0(int bits)
{
    return bits;
}
// 1-bit member of the SmallVRB family: looks `bits` up in sSmallRBTable and
// keeps only the top bit of that table byte (shift right by 7).
// `bits` indexes the 256-entry table directly, so it must lie in 0..255.
int SmallVRB1(int bits)
{
    const unsigned char tableByte = sSmallRBTable[bits];
    return tableByte >> 7;
}
129int SmallVRB2(
int bits) {
return sSmallRBTable[bits]>>6; };
// 3-bit member of the SmallVRB family: looks `bits` up in sSmallRBTable and
// keeps the top three bits of that table byte (shift right by 5).
// `bits` indexes the 256-entry table directly, so it must lie in 0..255.
int SmallVRB3(int bits)
{
    const unsigned char tableByte = sSmallRBTable[bits];
    return tableByte >> 5;
}
130int SmallVRB4(
int bits) {
return sSmallRBTable[bits]>>4; };
// 5-bit member of the SmallVRB family: looks `bits` up in sSmallRBTable and
// keeps the top five bits of that table byte (shift right by 3).
// `bits` indexes the 256-entry table directly, so it must lie in 0..255.
int SmallVRB5(int bits)
{
    const unsigned char tableByte = sSmallRBTable[bits];
    return tableByte >> 3;
}
131int SmallVRB6(
int bits) {
return sSmallRBTable[bits]>>2; };
// 7-bit member of the SmallVRB family: looks `bits` up in sSmallRBTable and
// keeps the top seven bits of that table byte (shift right by 1).
// `bits` indexes the 256-entry table directly, so it must lie in 0..255.
int SmallVRB7(int bits)
{
    const unsigned char tableByte = sSmallRBTable[bits];
    return tableByte >> 1;
}
132int SmallVRB8(
int bits) {
return sSmallRBTable[bits]; };
133int SmallVRB9(
int bits) {
return (sSmallRBTable[*((
unsigned char *)&bits)]<<1)+(sSmallRBTable[*(((
unsigned char *)&bits)+1)]>>7); };
134int SmallVRB10(
int bits) {
return (sSmallRBTable[*((
unsigned char *)&bits)]<<2)+(sSmallRBTable[*(((
unsigned char *)&bits)+1)]>>6); };
135int SmallVRB11(
int bits) {
return (sSmallRBTable[*((
unsigned char *)&bits)]<<3)+(sSmallRBTable[*(((
unsigned char *)&bits)+1)]>>5); };
136int SmallVRB12(
int bits) {
return (sSmallRBTable[*((
unsigned char *)&bits)]<<4)+(sSmallRBTable[*(((
unsigned char *)&bits)+1)]>>4); };
137int SmallVRB13(
int bits) {
return (sSmallRBTable[*((
unsigned char *)&bits)]<<5)+(sSmallRBTable[*(((
unsigned char *)&bits)+1)]>>3); };
138int SmallVRB14(
int bits) {
return (sSmallRBTable[*((
unsigned char *)&bits)]<<6)+(sSmallRBTable[*(((
unsigned char *)&bits)+1)]>>2); };
139int SmallVRB15(
int bits) {
return (sSmallRBTable[*((
unsigned char *)&bits)]<<7)+(sSmallRBTable[*(((
unsigned char *)&bits)+1)]>>1); };
140int SmallVRB16(
int bits) {
return (sSmallRBTable[*((
unsigned char *)&bits)]<<8)+(sSmallRBTable[*(((
unsigned char *)&bits)+1)]); };
142int (*
SmallVRB[])(
int bits) = { SmallVRB0, SmallVRB1, SmallVRB2, SmallVRB3, SmallVRB4,
143 SmallVRB5, SmallVRB6, SmallVRB7, SmallVRB8, SmallVRB9, SmallVRB10,
144 SmallVRB11, SmallVRB12, SmallVRB13, SmallVRB14,SmallVRB15, SmallVRB16 };
146int SmallRB(
int bits,
int numberBits)
149 return ( (sSmallRBTable[*((
unsigned char *)&bits)]<<8) + (sSmallRBTable[*(((
unsigned char *)&bits)+1)]) )>>(16-numberBits);
155 switch(functionType) {
176 switch(functionType) {
197 switch(functionType) {
218 switch(functionType) {
239 switch(functionType) {
260 switch(functionType) {
281 switch(functionType) {
302 switch(functionType) {
321#define REAL_SINCOSBRTABLE
322#ifdef REAL_SINCOSBRTABLE
348 fft_type HRplus,HRminus,HIplus,HIminus;
351 auto ButterfliesPerGroup = h->
Points / 2;
361 endptr1 = buffer + h->
Points * 2;
363 while(ButterfliesPerGroup > 0)
366 B = buffer + ButterfliesPerGroup * 2;
376 v1 = *B * cos + *(B+1) * sin;
377 v2 = *B * sin - *(B+1) * cos;
379 *(
A++) = *(B++) - 2 * v1;
381 *(
A++) = *(B++) + 2 * v2;
384 B += ButterfliesPerGroup * 2;
387 ButterfliesPerGroup >>= 1;
399 HRplus = (HRminus = *
A - *B ) + (*B * 2);
400 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
401 v1 = (sin*HRminus - cos*HIplus);
402 v2 = (cos*HRminus + sin*HIplus);
406 *(B+1) = *(
A+1) - HIminus;
419 v1=buffer[0]-buffer[1];
420 buffer[0]+=buffer[1];
450 fft_type HRplus,HRminus,HIplus,HIminus;
453 auto ButterfliesPerGroup = h->
Points / 2;
457 B = buffer + h->
Points * 2 - 2;
463 HRplus = (HRminus = *
A - *B ) + (*B * 2);
464 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
465 v1 = (sin*HRminus + cos*HIplus);
466 v2 = (cos*HRminus - sin*HIplus);
470 *(B+1) = *(
A+1) - HIminus;
485 v1=0.5f*(buffer[0]+buffer[1]);
486 v2=0.5f*(buffer[0]-buffer[1]);
498 endptr1 = buffer + h->
Points * 2;
500 while(ButterfliesPerGroup > 0)
503 B = buffer + ButterfliesPerGroup * 2;
513 v1 = *B * cos - *(B + 1) * sin;
514 v2 = *B * sin + *(B + 1) * cos;
516 *(
A++) = *(B++) - v1;
518 *(
A++) = *(B++) - v2;
521 B += ButterfliesPerGroup * 2;
523 ButterfliesPerGroup >>= 1;
530 for(
size_t i = 1; i < hFFT->
Points; i++) {
534 RealOut[0] = buffer[0];
536 RealOut[hFFT->
Points] = buffer[1];
537 ImagOut[hFFT->
Points] = 0;
543 for(
size_t i = 0; i < hFFT->
Points; i++) {
553 __m128 *localBuffer=(__m128 *)buffer;
557 __m128 *endptr1,*endptr2;
558 int br1Index, br2Index;
559 int br1Value, br2Value;
560 __m128 HRplus,HRminus,HIplus,HIminus;
561 __m128 v1,v2,sin,cos;
562 auto ButterfliesPerGroup = h->
Points / 2;
572 endptr1 = &localBuffer[h->
Points * 2];
574 while(ButterfliesPerGroup > 0)
577 B = &localBuffer[ButterfliesPerGroup * 2];
581 sin = _mm_set1_ps(*(sptr++));
582 cos = _mm_set1_ps(*(sptr++));
586 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
587 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
588 *B = _mm_add_ps( *
A, v1);
589 __m128 temp128 = _mm_set1_ps( 2.0);
590 *(
A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
591 *B = _mm_sub_ps(*
A,v2);
592 *(
A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
595 B = &B[ButterfliesPerGroup * 2];
597 ButterfliesPerGroup >>= 1;
604 while(br1Index<br2Index)
608 sin=_mm_set1_ps(h->
SinTable[br1Value]);
609 cos=_mm_set1_ps(h->
SinTable[br1Value+1]);
610 A=&localBuffer[br1Value];
611 B=&localBuffer[br2Value];
612 __m128 temp128 = _mm_set1_ps( 2.0);
613 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *
A, *B ), _mm_mul_ps(*B, temp128));
614 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(
A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
615 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
616 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
617 temp128 = _mm_set1_ps( 0.5);
618 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
619 *B = _mm_sub_ps(*
A, v1);
620 *(
A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
621 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
629 *
A=_mm_xor_ps(*
A, _mm_set1_ps(-0.f));
632 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
633 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
659 __m128 *localBuffer=(__m128 *)buffer;
663 __m128 *endptr1,*endptr2;
664 int br1Index, br1Value;
665 __m128 HRplus,HRminus,HIplus,HIminus;
666 __m128 v1,v2,sin,cos;
668 auto ButterfliesPerGroup = h->
Points / 2;
672 B = localBuffer + h->
Points * 2 - 2;
677 sin = _mm_set1_ps(h->
SinTable[br1Value]);
678 cos = _mm_set1_ps(h->
SinTable[br1Value + 1]);
679 HRminus = _mm_sub_ps(*
A, *B);
680 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
681 HIminus = _mm_sub_ps( *(
A+1), *(B+1));
682 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
683 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
684 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
685 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
686 *B = _mm_sub_ps(*
A, v1);
687 *(
A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
688 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
696 *(
A+1)=_mm_xor_ps(*(
A+1), _mm_set1_ps(-0.f));
705 v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
706 v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
718 endptr1 = localBuffer + h->
Points * 2;
720 while(ButterfliesPerGroup > 0)
723 B = localBuffer + ButterfliesPerGroup * 2;
727 sin = _mm_set1_ps(*(sptr++));
728 cos = _mm_set1_ps(*(sptr++));
732 v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B + 1), sin));
733 v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B + 1), cos));
734 *B = _mm_mul_ps( _mm_add_ps(*
A, v1), _mm_set1_ps(0.5));
735 *(
A++) = _mm_sub_ps(*(B++), v1);
736 *B = _mm_mul_ps(_mm_add_ps(*
A, v2), _mm_set1_ps(0.5));
737 *(
A++) = _mm_sub_ps(*(B++), v2);
740 B = &B[ButterfliesPerGroup * 2];
742 ButterfliesPerGroup >>= 1;
748 __m128 *localBuffer=(__m128 *)buffer;
749 __m128 *localRealOut=(__m128 *)RealOut;
750 __m128 *localImagOut=(__m128 *)ImagOut;
753 for(
size_t i = 1; i < hFFT->
Points; i++) {
756 localRealOut[i]=localBuffer[brValue ];
757 localImagOut[i]=localBuffer[brValue+1];
759 localRealOut[0] = localBuffer[0];
760 localImagOut[0] = _mm_set1_ps(0.0);
761 localRealOut[hFFT->
Points] = localBuffer[1];
762 localImagOut[hFFT->
Points] = _mm_set1_ps(0.0);
767 __m128 *localBuffer=(__m128 *)buffer;
768 __m128 *localTimeOut=(__m128 *)TimeOut;
770 for(
size_t i = 0; i < hFFT->
Points; i++) {
773 localTimeOut[i*2 ] = localBuffer[brValue ];
774 localTimeOut[i*2+1] = localBuffer[brValue+1];
780#define REAL_SINCOSTABLE_VBR16
781#ifdef REAL_SINCOSTABLE_VBR16
805 int br1Index, br2Index;
806 int br1Value, br2Value;
807 fft_type HRplus,HRminus,HIplus,HIminus;
809 auto ButterfliesPerGroup = h->
Points / 2;
810 int pow2BitsMinus1 = h->pow2Bits - 1;
813 endptr1 = buffer + h->
Points * 2;
815 while(ButterfliesPerGroup > 0)
818 B = buffer + ButterfliesPerGroup * 2;
819 int iSinCosIndex = 0;
822 int sinCosLookup = (*
SmallVRB[pow2BitsMinus1])(iSinCosIndex)<<sinCosShift;
829 v1 = *B*cos + *(B+1)*sin;
830 v2 = *B*sin - *(B+1)*cos;
832 *(
A++) = *(B++) - 2 * v1;
834 *(
A++) = *(B++) + 2 * v2;
837 B += ButterfliesPerGroup * 2;
839 ButterfliesPerGroup >>= 1;
846 while(br1Index < br2Index)
848 br1Value=(*
SmallVRB[h->pow2Bits])(br1Index);
849 br2Value=(*
SmallVRB[h->pow2Bits])(br2Index);
850 int sinCosIndex=br1Index<<sinCosShift;
855 HRplus = (HRminus = *
A - *B ) + (*B * 2);
856 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
857 v1 = (sin*HRminus - cos*HIplus);
858 v2 = (cos*HRminus + sin*HIplus);
862 *(B+1) = *(
A+1) - HIminus;
868 A=&buffer[(*
SmallVRB[h->pow2Bits])(br1Index)+1];
875 v1=buffer[0]-buffer[1];
876 buffer[0]+=buffer[1];
903 int br1Index, br1Value;
905 fft_type HRplus,HRminus,HIplus,HIminus;
907 auto ButterfliesPerGroup = h->
Points / 2;
908 int pow2BitsMinus1 = h->pow2Bits - 1;
913 B = buffer + h->
Points * 2 - 2;
918 br1Value = (*
SmallVRB[h->pow2Bits])(br1Index);
919 int sinCosIndex = br1Index << sinCosShift;
922 HRplus = (HRminus = *
A - *B ) + (*B * 2);
923 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
924 v1 = (sin*HRminus + cos*HIplus);
925 v2 = (cos*HRminus - sin*HIplus);
929 *(B+1) = *(
A+1) - HIminus;
944 v1=0.5f*(buffer[0]+buffer[1]);
945 v2=0.5f*(buffer[0]-buffer[1]);
957 endptr1 = buffer + h->
Points * 2;
959 while(ButterfliesPerGroup > 0)
962 B = buffer + ButterfliesPerGroup * 2;
963 int iSinCosIndex = 0;
966 int sinCosLookup = (*
SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
973 v1 = *B * cos - *(B + 1) * sin;
974 v2 = *B * sin + *(B + 1) * cos;
976 *(
A++) = *(B++) - v1;
978 *(
A++) = *(B++) - v2;
981 B += ButterfliesPerGroup * 2;
983 ButterfliesPerGroup >>= 1;
990 for(
size_t i = 0;i < hFFT->
Points; i++) {
992 brValue=(*
SmallVRB[hFFT->pow2Bits])(i);
993 TimeOut[i*2 ] = buffer[brValue ];
994 TimeOut[i*2+1] = buffer[brValue+1];
1001 for(
size_t i = 1; i < hFFT->
Points; i++) {
1003 brValue = (*
SmallVRB[hFFT->pow2Bits])(i);
1004 RealOut[i] = buffer[brValue ];
1005 ImagOut[i] = buffer[brValue+1];
1007 RealOut[0] = buffer[0];
1009 RealOut[hFFT->
Points] = buffer[1];
1010 ImagOut[hFFT->
Points] = 0;
1017 __m128 *localBuffer=(__m128 *)buffer;
1020 __m128 *endptr1,*endptr2;
1021 int br1Index, br2Index;
1022 int br1Value, br2Value;
1023 __m128 HRplus,HRminus,HIplus,HIminus;
1024 __m128 v1,v2,sin,cos;
1025 auto ButterfliesPerGroup = h->
Points / 2;
1026 int pow2BitsMinus1 = h->pow2Bits - 1;
1037 endptr1 = &localBuffer[h->
Points * 2];
1039 while(ButterfliesPerGroup > 0)
1042 B = &localBuffer[ButterfliesPerGroup * 2];
1043 int iSinCosIndex = 0;
1046 int sinCosLookup = (*
SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
1047 sin = _mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosLookup].mSin);
1048 cos = _mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosLookup].mCos);
1053 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
1054 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
1055 *B = _mm_add_ps( *
A, v1);
1056 __m128 temp128 = _mm_set1_ps( 2.0);
1057 *(
A++) = _mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
1058 *B = _mm_sub_ps(*
A,v2);
1059 *(
A++) = _mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
1062 B = &B[ButterfliesPerGroup * 2];
1064 ButterfliesPerGroup >>= 1;
1069 br2Index = h->
Points - 1;
1071 while(br1Index < br2Index)
1073 br1Value=(*
SmallVRB[h->pow2Bits])(br1Index);
1074 br2Value=(*
SmallVRB[h->pow2Bits])(br2Index);
1075 int sinCosIndex=br1Index<<sinCosShift;
1076 sin=_mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosIndex].mSin);
1077 cos=_mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosIndex].mCos);
1078 A=&localBuffer[br1Value];
1079 B=&localBuffer[br2Value];
1080 __m128 temp128 = _mm_set1_ps( 2.0);
1081 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *
A, *B ), _mm_mul_ps(*B, temp128));
1082 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(
A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
1083 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
1084 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
1085 temp128 = _mm_set1_ps( 0.5);
1086 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
1087 *B = _mm_sub_ps(*
A, v1);
1088 *(
A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
1089 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
1095 A=&localBuffer[(*
SmallVRB[h->pow2Bits])(br1Index)+1];
1097 *
A=_mm_xor_ps(*
A, _mm_set1_ps(-0.f));
1100 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
1101 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
1127 __m128 *localBuffer=(__m128 *)buffer;
1130 __m128 *endptr1, *endptr2;
1131 int br1Index, br1Value;
1132 __m128 HRplus, HRminus, HIplus, HIminus;
1133 __m128 v1, v2, sin, cos;
1134 auto ButterfliesPerGroup = h->
Points / 2;
1135 int pow2BitsMinus1 = h->pow2Bits - 1;
1139 A = localBuffer + 2;
1140 B = localBuffer + h->
Points * 2 - 2;
1144 br1Value = (*
SmallVRB[h->pow2Bits])(br1Index);
1145 int sinCosIndex = br1Index << sinCosShift;
1146 sin = _mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosIndex].mSin);
1147 cos = _mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosIndex].mCos);
1148 HRminus = _mm_sub_ps(*
A, *B);
1149 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
1150 HIminus = _mm_sub_ps( *(
A+1), *(B+1));
1151 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
1152 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
1153 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
1154 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
1155 *B = _mm_sub_ps(*
A, v1);
1156 *(
A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
1157 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
1165 *(
A+1) = _mm_xor_ps(*(
A+1), _mm_set1_ps(-0.f));
1174 v1 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
1175 v2 = _mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
1176 localBuffer[0] = v1;
1177 localBuffer[1] = v2;
1187 endptr1 = localBuffer + h->
Points * 2;
1189 while(ButterfliesPerGroup > 0)
1192 B = localBuffer + ButterfliesPerGroup * 2;
1193 int iSinCosIndex = 0;
1196 int sinCosLookup = (*
SmallVRB[pow2BitsMinus1])(iSinCosIndex) << sinCosShift;
1197 sin = _mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosLookup].mSin);
1198 cos = _mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosLookup].mCos);
1203 v1 = _mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
1204 v2 = _mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
1205 *B = _mm_mul_ps( _mm_add_ps(*
A, v1), _mm_set1_ps(0.5));
1206 *(
A++) = _mm_sub_ps(*(B++), v1);
1207 *B = _mm_mul_ps(_mm_add_ps(*
A, v2), _mm_set1_ps(0.5));
1208 *(
A++) = _mm_sub_ps(*(B++),v2);
1211 B = &B[ButterfliesPerGroup * 2];
1213 ButterfliesPerGroup >>= 1;
1219 __m128 *localBuffer = (__m128 *)buffer;
1220 __m128 *localTimeOut = (__m128 *)TimeOut;
1222 for(
size_t i = 0; i < hFFT->
Points; i++) {
1224 brValue = (*
SmallVRB[hFFT->pow2Bits])(i);
1225 localTimeOut[i*2 ] = localBuffer[brValue ];
1226 localTimeOut[i*2+1] = localBuffer[brValue+1];
1232 __m128 *localBuffer = (__m128 *)buffer;
1233 __m128 *localRealOut = (__m128 *)RealOut;
1234 __m128 *localImagOut = (__m128 *)ImagOut;
1237 for(
size_t i = 1; i < hFFT->
Points; i++) {
1239 brValue = (*
SmallVRB[hFFT->pow2Bits])(i);
1240 localRealOut[i] = localBuffer[brValue ];
1241 localImagOut[i] = localBuffer[brValue+1];
1243 localRealOut[0] = localBuffer[0];
1244 localImagOut[0] = _mm_set1_ps(0.0);
1245 localRealOut[hFFT->
Points] = localBuffer[1];
1246 localImagOut[hFFT->
Points] = _mm_set1_ps(0.0);
1250#define REAL_SINCOSTABLE_BR16
1251#ifdef REAL_SINCOSTABLE_BR16
1275 int br1Index, br2Index;
1276 int br1Value, br2Value;
1277 fft_type HRplus, HRminus, HIplus, HIminus;
1279 auto ButterfliesPerGroup = h->
Points / 2;
1280 int pow2BitsMinus1 = h->pow2Bits - 1;
1281 int bitReverseShiftM1 = 17 - h->pow2Bits;
1282 int bitReverseShift = bitReverseShiftM1 - 1;
1285 endptr1 = buffer + h->
Points * 2;
1287 while(ButterfliesPerGroup > 0)
1290 B = buffer + ButterfliesPerGroup * 2;
1291 int iSinCosIndex = 0;
1295 int sinCosLookup = ( ((sSmallRBTable[*((
unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((
unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
1302 v1=*B*cos + *(B+1)*sin;
1303 v2=*B*sin - *(B+1)*cos;
1310 B += ButterfliesPerGroup * 2;
1312 ButterfliesPerGroup >>= 1;
1317 br2Index = h->
Points - 1;
1319 while(br1Index < br2Index)
1321 br1Value=( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]) )>>bitReverseShift);
1322 br2Value=( ((sSmallRBTable[*((
unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br2Index)+1)]) )>>bitReverseShift);
1323 int sinCosIndex = br1Index << sinCosShift;
1326 A = &buffer[br1Value];
1327 B = &buffer[br2Value];
1328 HRplus = (HRminus = *
A - *B ) + (*B * 2);
1329 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
1330 v1 = (sin*HRminus - cos*HIplus);
1331 v2 = (cos*HRminus + sin*HIplus);
1334 *(
A+1) = (HIminus + v2) * (
fft_type)0.5;
1335 *(B+1) = *(
A+1) - HIminus;
1342 A=&buffer[( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
1349 v1=buffer[0]-buffer[1];
1350 buffer[0]+=buffer[1];
1378 fft_type HRplus, HRminus, HIplus, HIminus;
1380 auto ButterfliesPerGroup = h->
Points / 2;
1381 int pow2BitsMinus1 = h->pow2Bits - 1;
1383 int bitReverseShiftM1 = 17 - h->pow2Bits;
1387 B = buffer + h->
Points * 2 - 2;
1391 int sinCosIndex = br1Index << sinCosShift;
1394 HRplus = (HRminus = *
A - *B ) + (*B * 2);
1395 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
1396 v1 = (sin*HRminus + cos*HIplus);
1397 v2 = (cos*HRminus - sin*HIplus);
1400 *(
A+1) = (HIminus - v2) * (
fft_type)0.5;
1401 *(B+1) = *(
A+1) - HIminus;
1416 v1=0.5f*(buffer[0]+buffer[1]);
1417 v2=0.5f*(buffer[0]-buffer[1]);
1429 endptr1 = buffer + h->
Points * 2;
1431 while(ButterfliesPerGroup > 0)
1434 B = buffer + ButterfliesPerGroup * 2;
1435 int iSinCosIndex = 0;
1439 int sinCosLookup=( ((sSmallRBTable[*((
unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((
unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
1446 v1=*B*cos - *(B+1)*sin;
1447 v2=*B*sin + *(B+1)*cos;
1454 B += ButterfliesPerGroup * 2;
1456 ButterfliesPerGroup >>= 1;
1462 int bitReverseShift=16-hFFT->pow2Bits;
1464 for(
size_t i = 1; i < hFFT->
Points; i++) {
1467 brValue = ( ((sSmallRBTable[*((
unsigned char *)&i)]<<8) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]) )>>bitReverseShift);
1468 RealOut[i] = buffer[brValue ];
1469 ImagOut[i] = buffer[brValue+1];
1471 RealOut[0] = buffer[0];
1473 RealOut[hFFT->
Points] = buffer[1];
1474 ImagOut[hFFT->
Points] = 0;
1479 int bitReverseShift=16-hFFT->pow2Bits;
1481 for(
size_t i = 0; i < hFFT->
Points; i++) {
1484 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<8) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]) )>>bitReverseShift);
1485 TimeOut[i*2 ] = buffer[brValue ];
1486 TimeOut[i*2+1] = buffer[brValue+1];
1494 __m128 *localBuffer=(__m128 *)buffer;
1497 __m128 *endptr1, *endptr2;
1498 int br1Index, br2Index;
1499 int br1Value, br2Value;
1500 __m128 HRplus, HRminus, HIplus, HIminus;
1501 __m128 v1, v2, sin, cos;
1502 auto ButterfliesPerGroup = h->
Points / 2;
1503 int pow2BitsMinus1 = h->pow2Bits - 1;
1505 int bitReverseShiftM1 = 17 - h->pow2Bits;
1506 int bitReverseShift = bitReverseShiftM1 - 1;
1516 endptr1 = &localBuffer[h->
Points * 2];
1518 while(ButterfliesPerGroup > 0)
1521 B = &localBuffer[ButterfliesPerGroup * 2];
1522 int iSinCosIndex = 0;
1526 int sinCosLookup=( ((sSmallRBTable[*((
unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((
unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
1527 sin=_mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosLookup].mSin);
1528 cos=_mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosLookup].mCos);
1533 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
1534 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
1535 *B=_mm_add_ps( *
A, v1);
1536 __m128 temp128 = _mm_set1_ps( 2.0);
1537 *(
A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
1538 *B=_mm_sub_ps(*
A,v2);
1539 *(
A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
1542 B = &B[ButterfliesPerGroup * 2];
1544 ButterfliesPerGroup >>= 1;
1549 br2Index = h->
Points - 1;
1551 while(br1Index < br2Index)
1553 br1Value=( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]) )>>bitReverseShift);
1554 br2Value=( ((sSmallRBTable[*((
unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br2Index)+1)]) )>>bitReverseShift);
1555 int sinCosIndex=br1Index<<sinCosShift;
1556 sin=_mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosIndex].mSin);
1557 cos=_mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosIndex].mCos);
1558 A=&localBuffer[br1Value];
1559 B=&localBuffer[br2Value];
1560 __m128 temp128 = _mm_set1_ps( 2.0);
1561 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *
A, *B ), _mm_mul_ps(*B, temp128));
1562 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(
A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
1563 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
1564 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
1565 temp128 = _mm_set1_ps( 0.5);
1566 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
1567 *B = _mm_sub_ps(*
A, v1);
1568 *(
A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
1569 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
1576 A=&localBuffer[( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
1578 *
A=_mm_xor_ps(*
A, _mm_set1_ps(-0.f));
1581 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
1582 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
1608 __m128 *localBuffer=(__m128 *)buffer;
1611 __m128 *endptr1, *endptr2;
1613 __m128 HRplus, HRminus, HIplus, HIminus;
1614 __m128 v1, v2, sin, cos;
1615 auto ButterfliesPerGroup = h->
Points / 2;
1616 int pow2BitsMinus1 = h->pow2Bits - 1;
1618 int bitReverseShiftM1 = 17 - h->pow2Bits;
1621 A = localBuffer + 2;
1622 B = localBuffer + h->
Points * 2 - 2;
1626 int sinCosIndex = br1Index << sinCosShift;
1627 sin = _mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosIndex].mSin);
1628 cos = _mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosIndex].mCos);
1629 HRminus = _mm_sub_ps(*
A, *B);
1630 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
1631 HIminus = _mm_sub_ps( *(
A+1), *(B+1));
1632 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
1633 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
1634 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
1635 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
1636 *B = _mm_sub_ps(*
A, v1);
1637 *(
A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
1638 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
1646 *(
A+1)=_mm_xor_ps(*(
A+1), _mm_set1_ps(-0.f));
1655 v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
1656 v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
1657 localBuffer[0] = v1;
1658 localBuffer[1] = v2;
1668 endptr1 = localBuffer + h->
Points * 2;
1670 while(ButterfliesPerGroup > 0)
1673 B = localBuffer + ButterfliesPerGroup * 2;
1674 int iSinCosIndex = 0;
1678 int sinCosLookup=( ((sSmallRBTable[*((
unsigned char *)&iSinCosIndex)]<<8) + (sSmallRBTable[*(((
unsigned char *)&iSinCosIndex)+1)]) )>>bitReverseShiftM1)<<sinCosShift;
1679 sin=_mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosLookup].mSin);
1680 cos=_mm_set1_ps(sSinCosTable.
mSinCosTable[sinCosLookup].mCos);
1685 v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
1686 v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
1687 *B=_mm_mul_ps( _mm_add_ps(*
A, v1), _mm_set1_ps(0.5));
1688 *(
A++)=_mm_sub_ps(*(B++), v1);
1689 *B=_mm_mul_ps(_mm_add_ps(*
A, v2), _mm_set1_ps(0.5));
1690 *(
A++)=_mm_sub_ps(*(B++),v2);
1693 B = &B[ButterfliesPerGroup * 2];
1695 ButterfliesPerGroup >>= 1;
1701 __m128 *localBuffer = (__m128 *)buffer;
1702 __m128 *localTimeOut = (__m128 *)TimeOut;
1703 int bitReverseShift = 16 - hFFT->pow2Bits;
1706 for(
size_t i = 0; i < hFFT->
Points; i++) {
1708 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<8) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]) )>>bitReverseShift);
1710 localTimeOut[i*2 ] = localBuffer[brValue ];
1711 localTimeOut[i*2+1] = localBuffer[brValue+1];
1717 __m128 *localBuffer = (__m128 *)buffer;
1718 __m128 *localRealOut = (__m128 *)RealOut;
1719 __m128 *localImagOut = (__m128 *)ImagOut;
1720 int bitReverseShift = 16 - hFFT->pow2Bits;
1723 for(
size_t i = 1; i < hFFT->
Points; i++) {
1725 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<8) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]) )>>bitReverseShift);
1727 localRealOut[i] = localBuffer[brValue ];
1728 localImagOut[i] = localBuffer[brValue+1];
1730 localRealOut[0] = localBuffer[0];
1731 localImagOut[0] = _mm_set1_ps(0.0);
1732 localRealOut[hFFT->
Points] = localBuffer[1];
1733 localImagOut[hFFT->
Points] = _mm_set1_ps(0.0);
1737#define FAST_MATH_BR24
1738#ifdef FAST_MATH_BR24
1762 int br1Index, br2Index;
1763 int br1Value, br2Value;
1764 fft_type HRplus, HRminus, HIplus, HIminus;
1767 int bitReverseShift = 24 - h->pow2Bits;
1768 int bitReverseShiftM1 = bitReverseShift + 1;
1769 auto ButterfliesPerGroup = h->
Points / 2;
1779 endptr1 = buffer + h->
Points * 2;
1781 const v4sf zeroes = {0.0,0.0,0.0,0.0};
1782 while(ButterfliesPerGroup > 0)
1785 B = buffer + ButterfliesPerGroup * 2;
1786 int sinCosCalIndex = 0;
1787 int iSinCosIndex = 0;
1790 v4sf sin4_2, cos4_2;
1795 for(
int i=0;i<4;i++) {
1796 int brTemp=iSinCosIndex+i;
1797 vx.m128_f32[i]=( ((sSmallRBTable[*((
unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((
unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
1803 sin=-sin4_2.m128_f32[0];
1804 cos=-cos4_2.m128_f32[0];
1807 sin=-sin4_2.m128_f32[sinCosCalIndex];
1808 cos=-cos4_2.m128_f32[sinCosCalIndex];
1809 if(sinCosCalIndex==3)
1818 v1=*B*cos + *(B+1)*sin;
1819 v2=*B*sin - *(B+1)*cos;
1826 B += ButterfliesPerGroup * 2;
1828 ButterfliesPerGroup >>= 1;
1833 br2Index = h->
Points - 1;
1835 int sinCosCalIndex = 0;
1836 while(br1Index < br2Index)
1838 v4sf sin4_2, cos4_2;
1839 br1Value=( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&br1Index)+2)] )>>bitReverseShift);
1840 br2Value=( ((sSmallRBTable[*((
unsigned char *)&br2Index)]<<16) + (sSmallRBTable[*(((
unsigned char *)&br2Index)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&br2Index)+2)] )>>bitReverseShift);
1844 for(
int i=0;i<4;i++)
1845 vx.m128_f32[i]=((
float)(br1Index+i))*iToRad;
1847 sin=-sin4_2.m128_f32[0];
1848 cos=-cos4_2.m128_f32[0];
1851 sin=-sin4_2.m128_f32[sinCosCalIndex];
1852 cos=-cos4_2.m128_f32[sinCosCalIndex];
1853 if(sinCosCalIndex==3)
1858 A=&buffer[br1Value];
1859 B=&buffer[br2Value];
1860 HRplus = (HRminus = *
A - *B ) + (*B * 2);
1861 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
1862 v1 = (sin*HRminus - cos*HIplus);
1863 v2 = (cos*HRminus + sin*HIplus);
1866 *(
A+1) = (HIminus + v2) * (
fft_type)0.5;
1867 *(B+1) = *(
A+1) - HIminus;
1874 A=&buffer[( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&br1Index)+2)] )>>bitReverseShift)+1];
1881 v1=buffer[0]-buffer[1];
1882 buffer[0]+=buffer[1];
1910 fft_type HRplus, HRminus, HIplus, HIminus;
1913 int bitReverseShiftM1 = 25 - h->pow2Bits;
1915 auto ButterfliesPerGroup = h->
Points / 2;
1919 B = buffer + h->
Points * 2 - 2;
1921 int sinCosCalIndex = 0;
1924 v4sf sin4_2, cos4_2;
1928 for(
int i=0;i<4;i++)
1929 vx.m128_f32[i]=((
float)(br1Index+i))*iToRad;
1931 sin=-sin4_2.m128_f32[0];
1932 cos=-cos4_2.m128_f32[0];
1935 sin=-sin4_2.m128_f32[sinCosCalIndex];
1936 cos=-cos4_2.m128_f32[sinCosCalIndex];
1937 if(sinCosCalIndex==3)
1942 HRplus = (HRminus = *
A - *B ) + (*B * 2);
1943 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
1944 v1 = (sin*HRminus + cos*HIplus);
1945 v2 = (cos*HRminus - sin*HIplus);
1948 *(
A+1) = (HIminus - v2) * (
fft_type)0.5;
1949 *(B+1) = *(
A+1) - HIminus;
1964 v1=0.5f*(buffer[0]+buffer[1]);
1965 v2=0.5f*(buffer[0]-buffer[1]);
1977 endptr1 = buffer + h->
Points * 2;
1979 while(ButterfliesPerGroup > 0)
1982 B = buffer + ButterfliesPerGroup * 2;
1983 int sinCosCalIndex = 0;
1984 int iSinCosIndex = 0;
1987 v4sf sin4_2, cos4_2;
1991 for(
int i=0;i<4;i++) {
1992 int brTemp=iSinCosIndex+i;
1993 vx.m128_f32[i]=( ((sSmallRBTable[*((
unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((
unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
1997 sin=-sin4_2.m128_f32[0];
1998 cos=-cos4_2.m128_f32[0];
2001 sin=-sin4_2.m128_f32[sinCosCalIndex];
2002 cos=-cos4_2.m128_f32[sinCosCalIndex];
2003 if(sinCosCalIndex==3)
2012 v1=*B*cos - *(B+1)*sin;
2013 v2=*B*sin + *(B+1)*cos;
2020 B += ButterfliesPerGroup * 2;
2022 ButterfliesPerGroup >>= 1;
2028 int bitReverseShift = 24 - hFFT->pow2Bits;
2030 for(
size_t i = 1; i < hFFT->
Points; i++) {
2033 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<16) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&i)+2)] )>>bitReverseShift);
2035 RealOut[i] = buffer[brValue ];
2036 ImagOut[i] = buffer[brValue+1];
2038 RealOut[0] = buffer[0];
2040 RealOut[hFFT->
Points] = buffer[1];
2041 ImagOut[hFFT->
Points] = 0;
2046 int bitReverseShift = 24 - hFFT->pow2Bits;
2048 for(
size_t i = 0; i < hFFT->
Points; i++) {
2050 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<16) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&i)+2)] )>>bitReverseShift);
2052 TimeOut[i*2 ] = buffer[brValue ];
2053 TimeOut[i*2+1] = buffer[brValue+1];
2060 __m128 *localBuffer=(__m128 *)buffer;
2063 __m128 *endptr1,*endptr2;
2064 int br1Index, br2Index;
2065 int br1Value, br2Value;
2066 __m128 HRplus,HRminus,HIplus,HIminus;
2067 __m128 v1,v2,sin,cos;
2069 auto ButterfliesPerGroup = h->
Points / 2;
2070 int bitReverseShift = 24 - h->pow2Bits;
2071 int bitReverseShiftM1 = bitReverseShift + 1;
2081 endptr1 = &localBuffer[h->
Points * 2];
2083 while(ButterfliesPerGroup > 0)
2086 B = &localBuffer[ButterfliesPerGroup * 2];
2087 int sinCosCalIndex = 0;
2088 int iSinCosIndex = 0;
2091 v4sf sin4_2, cos4_2;
2095 for(
int i=0;i<4;i++) {
2096 int brTemp=iSinCosIndex+i;
2097 vx.m128_f32[i]=( ((sSmallRBTable[*((
unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((
unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
2101 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2102 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2105 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2106 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2107 if(sinCosCalIndex==3)
2116 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
2117 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
2118 *B=_mm_add_ps( *
A, v1);
2119 __m128 temp128 = _mm_set1_ps( 2.0);
2120 *(
A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
2121 *B=_mm_sub_ps(*
A,v2);
2122 *(
A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
2125 B = &B[ButterfliesPerGroup * 2];
2127 ButterfliesPerGroup >>= 1;
2132 br2Index = h->
Points - 1;
2134 int sinCosCalIndex = 0;
2135 while(br1Index < br2Index)
2137 v4sf sin4_2, cos4_2;
2138 br1Value=( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&br1Index)+2)] )>>bitReverseShift);
2139 br2Value=( ((sSmallRBTable[*((
unsigned char *)&br2Index)]<<16) + (sSmallRBTable[*(((
unsigned char *)&br2Index)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&br2Index)+2)] )>>bitReverseShift);
2143 for(
int i=0;i<4;i++)
2144 vx.m128_f32[i]=((
float)(br1Index+i))*iToRad;
2146 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2147 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2150 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2151 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2152 if(sinCosCalIndex==3)
2157 A=&localBuffer[br1Value];
2158 B=&localBuffer[br2Value];
2159 __m128 temp128 = _mm_set1_ps( 2.0);
2160 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *
A, *B ), _mm_mul_ps(*B, temp128));
2161 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(
A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
2162 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
2163 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
2164 temp128 = _mm_set1_ps( 0.5);
2165 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
2166 *B = _mm_sub_ps(*
A, v1);
2167 *(
A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
2168 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
2175 A=&localBuffer[( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<16) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&br1Index)+2)] )>>bitReverseShift)+1];
2178 *
A=_mm_xor_ps(*
A, _mm_set1_ps(-0.f));
2181 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
2182 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
2208 __m128 *localBuffer=(__m128 *)buffer;
2211 __m128 *endptr1,*endptr2;
2213 __m128 HRplus,HRminus,HIplus,HIminus;
2214 __m128 v1,v2,sin,cos;
2216 int bitReverseShiftM1 = 25 - h->pow2Bits;
2217 auto ButterfliesPerGroup = h->
Points / 2;
2220 A = localBuffer + 2;
2221 B = localBuffer + h->
Points * 2 - 2;
2223 int sinCosCalIndex = 0;
2226 v4sf sin4_2, cos4_2;
2230 for(
int i=0;i<4;i++)
2231 vx.m128_f32[i]=((
float)(br1Index+i))*iToRad;
2233 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2234 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2237 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2238 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2239 if(sinCosCalIndex==3)
2244 HRminus = _mm_sub_ps(*
A, *B);
2245 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
2246 HIminus = _mm_sub_ps( *(
A+1), *(B+1));
2247 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
2248 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
2249 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
2250 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
2251 *B = _mm_sub_ps(*
A, v1);
2252 *(
A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
2253 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
2261 *(
A+1)=_mm_xor_ps(*(
A+1), _mm_set1_ps(-0.f));
2270 v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
2271 v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
2283 endptr1 = localBuffer + h->
Points * 2;
2285 while(ButterfliesPerGroup > 0)
2288 B = localBuffer + ButterfliesPerGroup * 2;
2289 int sinCosCalIndex = 0;
2290 int iSinCosIndex = 0;
2293 v4sf sin4_2, cos4_2;
2297 for(
int i=0;i<4;i++) {
2298 int brTemp=iSinCosIndex+i;
2299 vx.m128_f32[i]=( ((sSmallRBTable[*((
unsigned char *)&brTemp)]<<16) + (sSmallRBTable[*(((
unsigned char *)&brTemp)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&brTemp)+2)] )>>bitReverseShiftM1)*iToRad;
2303 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2304 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2307 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2308 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2309 if(sinCosCalIndex==3)
2318 v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
2319 v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
2320 *B=_mm_mul_ps( _mm_add_ps(*
A, v1), _mm_set1_ps(0.5));
2321 *(
A++)=_mm_sub_ps(*(B++), v1);
2322 *B=_mm_mul_ps(_mm_add_ps(*
A, v2), _mm_set1_ps(0.5));
2323 *(
A++)=_mm_sub_ps(*(B++),v2);
2326 B = &B[ButterfliesPerGroup * 2];
2328 ButterfliesPerGroup >>= 1;
2334 __m128 *localBuffer = (__m128 *)buffer;
2335 __m128 *localRealOut = (__m128 *)RealOut;
2336 __m128 *localImagOut = (__m128 *)ImagOut;
2337 int bitReverseShift = 24-hFFT->pow2Bits;
2341 for(
size_t i = 1; i < hFFT->
Points; i++) {
2343 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<16) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&i)+2)] )>>bitReverseShift);
2345 localRealOut[i]=localBuffer[brValue ];
2346 localImagOut[i]=localBuffer[brValue+1];
2348 localRealOut[0] = localBuffer[0];
2349 localImagOut[0] = _mm_set1_ps(0.0);
2350 localRealOut[hFFT->
Points] = localBuffer[1];
2351 localImagOut[hFFT->
Points] = _mm_set1_ps(0.0);
2356 __m128 *localBuffer = (__m128 *)buffer;
2357 __m128 *localTimeOut = (__m128 *)TimeOut;
2358 int bitReverseShift = 24-hFFT->pow2Bits;
2361 for(
size_t i = 0; i < hFFT->
Points; i++) {
2363 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<16) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]<<8) + sSmallRBTable[*(((
unsigned char *)&i)+2)] )>>bitReverseShift);
2365 localTimeOut[i*2 ] = localBuffer[brValue ];
2366 localTimeOut[i*2+1] = localBuffer[brValue+1];
2372#define FAST_MATH_BR16
2373#ifdef FAST_MATH_BR16
2397 int br1Index, br2Index;
2398 int br1Value, br2Value;
2399 fft_type HRplus,HRminus,HIplus,HIminus;
2402 int bitReverseShiftM1 = 17 - h->pow2Bits;
2403 int bitReverseShift = bitReverseShiftM1 - 1;
2404 auto ButterfliesPerGroup = h->
Points / 2;
2414 endptr1 = buffer + h->
Points * 2;
2416 while(ButterfliesPerGroup > 0)
2419 B = buffer + ButterfliesPerGroup * 2;
2420 int sinCosCalIndex = 0;
2421 int iSinCosIndex = 0;
2424 v4sf sin4_2, cos4_2;
2428 for(
int i=0;i<4;i++) {
2429 int brTemp=iSinCosIndex+i;
2430 vx.m128_f32[i]=( ((sSmallRBTable[*((
unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((
unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
2434 sin=-sin4_2.m128_f32[0];
2435 cos=-cos4_2.m128_f32[0];
2438 sin=-sin4_2.m128_f32[sinCosCalIndex];
2439 cos=-cos4_2.m128_f32[sinCosCalIndex];
2440 if(sinCosCalIndex==3)
2449 v1=*B*cos + *(B+1)*sin;
2450 v2=*B*sin - *(B+1)*cos;
2457 B += ButterfliesPerGroup * 2;
2459 ButterfliesPerGroup >>= 1;
2464 br2Index = h->
Points - 1;
2466 int sinCosCalIndex = 0;
2467 while(br1Index < br2Index)
2469 v4sf sin4_2, cos4_2;
2470 br1Value=( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]) )>>bitReverseShift);
2471 br2Value=( ((sSmallRBTable[*((
unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br2Index)+1)]) )>>bitReverseShift);
2475 for(
int i = 0; i < 4; i++)
2476 vx.m128_f32[i]=((
float)(br1Index+i))*iToRad;
2478 sin = -sin4_2.m128_f32[0];
2479 cos=-cos4_2.m128_f32[0];
2482 sin=-sin4_2.m128_f32[sinCosCalIndex];
2483 cos=-cos4_2.m128_f32[sinCosCalIndex];
2484 if(sinCosCalIndex==3)
2489 A=&buffer[br1Value];
2490 B=&buffer[br2Value];
2491 HRplus = (HRminus = *
A - *B ) + (*B * 2);
2492 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
2493 v1 = (sin*HRminus - cos*HIplus);
2494 v2 = (cos*HRminus + sin*HIplus);
2497 *(
A+1) = (HIminus + v2) * (
fft_type)0.5;
2498 *(B+1) = *(
A+1) - HIminus;
2505 A=&buffer[( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
2512 v1=buffer[0]-buffer[1];
2513 buffer[0]+=buffer[1];
2541 fft_type HRplus,HRminus,HIplus,HIminus;
2544 int bitReverseShiftM1=17-h->pow2Bits;
2546 auto ButterfliesPerGroup = h->
Points / 2;
2550 B = buffer + h->
Points * 2 - 2;
2552 int sinCosCalIndex = 0;
2555 v4sf sin4_2, cos4_2;
2559 for(
int i=0;i<4;i++)
2560 vx.m128_f32[i]=((
float)(br1Index+i))*iToRad;
2562 sin=-sin4_2.m128_f32[0];
2563 cos=-cos4_2.m128_f32[0];
2566 sin=-sin4_2.m128_f32[sinCosCalIndex];
2567 cos=-cos4_2.m128_f32[sinCosCalIndex];
2568 if(sinCosCalIndex==3)
2573 HRplus = (HRminus = *
A - *B ) + (*B * 2);
2574 HIplus = (HIminus = *(
A+1) - *(B+1)) + (*(B+1) * 2);
2575 v1 = (sin*HRminus + cos*HIplus);
2576 v2 = (cos*HRminus - sin*HIplus);
2579 *(
A+1) = (HIminus - v2) * (
fft_type)0.5;
2580 *(B+1) = *(
A+1) - HIminus;
2595 v1=0.5f*(buffer[0]+buffer[1]);
2596 v2=0.5f*(buffer[0]-buffer[1]);
2608 endptr1 = buffer + h->
Points * 2;
2610 while(ButterfliesPerGroup > 0)
2613 B = buffer + ButterfliesPerGroup * 2;
2614 int sinCosCalIndex = 0;
2615 int iSinCosIndex = 0;
2618 v4sf sin4_2, cos4_2;
2622 for(
int i = 0; i < 4; i++) {
2623 int brTemp = iSinCosIndex + i;
2624 vx.m128_f32[i]=( ((sSmallRBTable[*((
unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((
unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
2628 sin=-sin4_2.m128_f32[0];
2629 cos=-cos4_2.m128_f32[0];
2632 sin=-sin4_2.m128_f32[sinCosCalIndex];
2633 cos=-cos4_2.m128_f32[sinCosCalIndex];
2634 if(sinCosCalIndex==3)
2643 v1=*B*cos - *(B+1)*sin;
2644 v2=*B*sin + *(B+1)*cos;
2651 B += ButterfliesPerGroup * 2;
2653 ButterfliesPerGroup >>= 1;
2659 int bitReverseShift = 16 - hFFT->pow2Bits;
2661 for(
size_t i = 1; i < hFFT->
Points; i++) {
2664 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<8) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]) )>>bitReverseShift);
2665 RealOut[i] = buffer[brValue ];
2666 ImagOut[i] = buffer[brValue+1];
2668 RealOut[0] = buffer[0];
2670 RealOut[hFFT->
Points] = buffer[1];
2671 ImagOut[hFFT->
Points] = 0;
2676 int bitReverseShift=16-hFFT->pow2Bits;
2678 for(
size_t i = 0; i < hFFT->
Points; i++) {
2680 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<8) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]) )>>bitReverseShift);
2682 TimeOut[i*2 ] = buffer[brValue ];
2683 TimeOut[i*2+1] = buffer[brValue+1];
2690 __m128 *localBuffer = (__m128 *)buffer;
2693 __m128 *endptr1, *endptr2;
2694 int br1Index, br2Index;
2695 int br1Value, br2Value;
2696 __m128 HRplus, HRminus, HIplus, HIminus;
2697 __m128 v1, v2, sin, cos;
2699 auto ButterfliesPerGroup = h->
Points / 2;
2700 int bitReverseShiftM1 = 17 - h->pow2Bits;
2701 int bitReverseShift = bitReverseShiftM1 - 1;
2711 endptr1 = &localBuffer[h->
Points * 2];
2713 while(ButterfliesPerGroup > 0)
2716 B = &localBuffer[ButterfliesPerGroup * 2];
2717 int sinCosCalIndex = 0;
2718 int iSinCosIndex = 0;
2721 v4sf sin4_2, cos4_2;
2725 for(
int i=0;i<4;i++) {
2726 int brTemp=iSinCosIndex+i;
2727 vx.m128_f32[i]=( ((sSmallRBTable[*((
unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((
unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
2731 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2732 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2735 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2736 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2737 if(sinCosCalIndex==3)
2746 v1 = _mm_add_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
2747 v2 = _mm_sub_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
2748 *B=_mm_add_ps( *
A, v1);
2749 __m128 temp128 = _mm_set1_ps( 2.0);
2750 *(
A++)=_mm_sub_ps(*(B++), _mm_mul_ps(temp128, v1));
2751 *B=_mm_sub_ps(*
A,v2);
2752 *(
A++)=_mm_add_ps(*(B++), _mm_mul_ps(temp128, v2));
2755 B = &B[ButterfliesPerGroup * 2];
2757 ButterfliesPerGroup >>= 1;
2762 br2Index = h->
Points - 1;
2764 int sinCosCalIndex = 0;
2765 while(br1Index < br2Index)
2767 v4sf sin4_2, cos4_2;
2768 br1Value=( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]) )>>bitReverseShift);
2769 br2Value=( ((sSmallRBTable[*((
unsigned char *)&br2Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br2Index)+1)]) )>>bitReverseShift);
2773 for(
int i = 0; i < 4; i++)
2774 vx.m128_f32[i] = ((
float)(br1Index+i)) * iToRad;
2776 sin = _mm_set1_ps(-sin4_2.m128_f32[0]);
2777 cos = _mm_set1_ps(-cos4_2.m128_f32[0]);
2780 sin = _mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2781 cos = _mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2782 if(sinCosCalIndex == 3)
2787 A = &localBuffer[br1Value];
2788 B = &localBuffer[br2Value];
2789 __m128 temp128 = _mm_set1_ps( 2.0);
2790 HRplus = _mm_add_ps(HRminus = _mm_sub_ps( *
A, *B ), _mm_mul_ps(*B, temp128));
2791 HIplus = _mm_add_ps(HIminus = _mm_sub_ps(*(
A+1), *(B+1) ), _mm_mul_ps(*(B+1), temp128));
2792 v1 = _mm_sub_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
2793 v2 = _mm_add_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
2794 temp128 = _mm_set1_ps( 0.5);
2795 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), temp128);
2796 *B = _mm_sub_ps(*
A, v1);
2797 *(
A+1) = _mm_mul_ps(_mm_add_ps(HIminus, v2), temp128);
2798 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
2805 A=&localBuffer[( ((sSmallRBTable[*((
unsigned char *)&br1Index)]<<8) + (sSmallRBTable[*(((
unsigned char *)&br1Index)+1)]) )>>bitReverseShift)+1];
2808 *
A=_mm_xor_ps(*
A, _mm_set1_ps(-0.f));
2811 v1=_mm_sub_ps(localBuffer[0], localBuffer[1]);
2812 localBuffer[0]=_mm_add_ps(localBuffer[0], localBuffer[1]);
2838 __m128 *localBuffer=(__m128 *)buffer;
2841 __m128 *endptr1, *endptr2;
2843 __m128 HRplus, HRminus, HIplus, HIminus;
2844 __m128 v1, v2, sin, cos;
2846 int bitReverseShiftM1 = 17 - h->pow2Bits;
2847 auto ButterfliesPerGroup = h->
Points / 2;
2850 A = localBuffer + 2;
2851 B = localBuffer + h->
Points * 2 - 2;
2853 int sinCosCalIndex=0;
2856 v4sf sin4_2, cos4_2;
2860 for(
int i=0;i<4;i++)
2861 vx.m128_f32[i]=((
float)(br1Index+i))*iToRad;
2863 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2864 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2867 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2868 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2869 if(sinCosCalIndex==3)
2874 HRminus = _mm_sub_ps(*
A, *B);
2875 HRplus = _mm_add_ps(HRminus, _mm_mul_ps(*B, _mm_set1_ps(2.0)));
2876 HIminus = _mm_sub_ps( *(
A+1), *(B+1));
2877 HIplus = _mm_add_ps(HIminus, _mm_mul_ps(*(B+1), _mm_set1_ps(2.0)));
2878 v1 = _mm_add_ps(_mm_mul_ps(sin, HRminus), _mm_mul_ps(cos, HIplus));
2879 v2 = _mm_sub_ps(_mm_mul_ps(cos, HRminus), _mm_mul_ps(sin, HIplus));
2880 *
A = _mm_mul_ps(_mm_add_ps(HRplus, v1), _mm_set1_ps(0.5));
2881 *B = _mm_sub_ps(*
A, v1);
2882 *(
A+1) = _mm_mul_ps(_mm_sub_ps(HIminus, v2) , _mm_set1_ps(0.5));
2883 *(B+1) = _mm_sub_ps(*(
A+1), HIminus);
2891 *(
A+1)=_mm_xor_ps(*(
A+1), _mm_set1_ps(-0.f));
2900 v1=_mm_mul_ps(_mm_set1_ps(0.5), _mm_add_ps(localBuffer[0], localBuffer[1]));
2901 v2=_mm_mul_ps(_mm_set1_ps(0.5), _mm_sub_ps(localBuffer[0], localBuffer[1]));
2913 endptr1 = localBuffer + h->
Points * 2;
2915 while(ButterfliesPerGroup > 0)
2918 B = localBuffer + ButterfliesPerGroup * 2;
2919 int sinCosCalIndex = 0;
2920 int iSinCosIndex = 0;
2923 v4sf sin4_2, cos4_2;
2927 for(
int i=0;i<4;i++) {
2928 int brTemp=iSinCosIndex+i;
2929 vx.m128_f32[i]=( ((sSmallRBTable[*((
unsigned char *)&brTemp)]<<8) + (sSmallRBTable[*(((
unsigned char *)&brTemp)+1)]) )>>bitReverseShiftM1)*iToRad;
2933 sin=_mm_set1_ps(-sin4_2.m128_f32[0]);
2934 cos=_mm_set1_ps(-cos4_2.m128_f32[0]);
2937 sin=_mm_set1_ps(-sin4_2.m128_f32[sinCosCalIndex]);
2938 cos=_mm_set1_ps(-cos4_2.m128_f32[sinCosCalIndex]);
2939 if(sinCosCalIndex==3)
2948 v1=_mm_sub_ps( _mm_mul_ps(*B, cos), _mm_mul_ps(*(B+1), sin));
2949 v2=_mm_add_ps( _mm_mul_ps(*B, sin), _mm_mul_ps(*(B+1), cos));
2950 *B=_mm_mul_ps( _mm_add_ps(*
A, v1), _mm_set1_ps(0.5));
2951 *(
A++)=_mm_sub_ps(*(B++), v1);
2952 *B=_mm_mul_ps(_mm_add_ps(*
A, v2), _mm_set1_ps(0.5));
2953 *(
A++)=_mm_sub_ps(*(B++),v2);
2956 B = &B[ButterfliesPerGroup * 2];
2958 ButterfliesPerGroup >>= 1;
2964 __m128 *localBuffer=(__m128 *)buffer;
2965 __m128 *localRealOut=(__m128 *)RealOut;
2966 __m128 *localImagOut=(__m128 *)ImagOut;
2967 int bitReverseShift=16-hFFT->pow2Bits;
2971 for(
size_t i = 1; i < hFFT->
Points; i++) {
2973 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<8) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]) )>>bitReverseShift);
2975 localRealOut[i]=localBuffer[brValue ];
2976 localImagOut[i]=localBuffer[brValue+1];
2978 localRealOut[0] = localBuffer[0];
2979 localImagOut[0] = _mm_set1_ps(0.0);
2980 localRealOut[hFFT->
Points] = localBuffer[1];
2981 localImagOut[hFFT->
Points] = _mm_set1_ps(0.0);
2986 __m128 *localBuffer=(__m128 *)buffer;
2987 __m128 *localTimeOut=(__m128 *)TimeOut;
2988 int bitReverseShift=16-hFFT->pow2Bits;
2991 for(
size_t i = 0; i < hFFT->
Points; i++) {
2993 brValue=( ((sSmallRBTable[*((
unsigned char *)&i)]<<8) + (sSmallRBTable[*(((
unsigned char *)&i)+1)]) )>>bitReverseShift);
2995 localTimeOut[i*2 ] = localBuffer[brValue ];
2996 localTimeOut[i*2+1] = localBuffer[brValue+1];
void RealFFTf4x(fft_type *, FFTParam *, int functionType=-1)
void ReorderToFreq1x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType=-1)
void InverseRealFFTf4xFastMathBR16(fft_type *, FFTParam *)
void InverseRealFFTf4xSinCosTableVBR16(fft_type *, FFTParam *)
void RealFFTf4xFastMathBR16(fft_type *, FFTParam *)
void ReorderToTime1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToFreq4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void RealFFTf4xSinCosTableVBR16(fft_type *, FFTParam *)
void ReorderToTime1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToTime1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToFreq1xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void RealFFTf1xSinCosTableBR16(fft_type *, FFTParam *)
void ReorderToTime4xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void RealFFTf1xSinCosBRTable(fft_type *, FFTParam *)
void ReorderToFreq4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void ReorderToFreq4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void InverseRealFFTf1xSinCosTableVBR16(fft_type *, FFTParam *)
void TableUsage(int iMask)
void RealFFTf4xFastMathBR24(fft_type *, FFTParam *)
void ReorderToFreq4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void ReorderToFreq1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void ReorderToFreq1xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void RealFFTf1x(fft_type *, FFTParam *, int functionType=-1)
void InverseRealFFTf1xFastMathBR16(fft_type *, FFTParam *)
void ReorderToFreq1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void ReorderToTime4x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType=-1)
void ReorderToTime4xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void InverseRealFFTf1xSinCosTableBR16(fft_type *, FFTParam *)
void InverseRealFFTf4xSinCosBRTable(fft_type *, FFTParam *)
void InverseRealFFTf1xFastMathBR24(fft_type *, FFTParam *)
void ReorderToTime1xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToFreq4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
void RealFFTf4xSinCosTableBR16(fft_type *, FFTParam *)
void ReorderToTime4xFastMathBR24(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToTime1x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType=-1)
void InverseRealFFTf4x(fft_type *, FFTParam *, int functionType=-1)
void RealFFTf1xFastMathBR16(fft_type *, FFTParam *)
void InverseRealFFTf1x(fft_type *, FFTParam *, int functionType=-1)
void InverseRealFFTf1xSinCosBRTable(fft_type *, FFTParam *)
void ReorderToTime1xFastMathBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void RealFFTf1xSinCosTableVBR16(fft_type *, FFTParam *)
void InverseRealFFTf4xSinCosTableBR16(fft_type *, FFTParam *)
int(* SmallVRB[])(int bits)
void ReorderToFreq4x(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut, int functionType=-1)
void RealFFTf1xFastMathBR24(fft_type *, FFTParam *)
void InverseRealFFTf4xFastMathBR24(fft_type *, FFTParam *)
void RealFFTf4xSinCosBRTable(fft_type *, FFTParam *)
int SmallRB(int bits, int numberBits)
void ReorderToTime4xSinCosTableVBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToTime4xSinCosTableBR16(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut)
void ReorderToFreq1xSinCosBRTable(FFTParam *hFFT, fft_type *buffer, fft_type *RealOut, fft_type *ImagOut)
SSE maths functions (for FFTs)
void sincos_ps(v4sf x, v4sf *s, v4sf *c)
ArrayOf< SinCosStruct > mSinCosTable
ArrayOf< int > BitReversed
ArrayOf< fft_type > SinTable