Audacity 3.2.0
Equalization48x.cpp
Go to the documentation of this file.
1/**********************************************************************
2
3 Audacity: A Digital Audio Editor
4
5 Equalization48x.cpp
6
7 Andrew Hallendorff
8
9*******************************************************************//****************************************************************/
15
16
17#include "Equalization48x.h"
18
19#ifdef EXPERIMENTAL_EQ_SSE_THREADED
20#include "../Project.h"
21#include "Equalization.h"
22#include "WaveClip.h"
23#include "WaveTrack.h"
24#include "../float_cast.h"
25#include <vector>
26
27#include <wx/setup.h> // for wxUSE_* macros
28
29#if wxUSE_TOOLTIPS
30#include <wx/tooltip.h>
31#endif
32
33#include <math.h>
34
35#include "../RealFFTf48x.h"
36
37#ifndef USE_SSE2
38#define USE_SSE2
39#endif
40
41#include <stdlib.h>
42
43#ifdef __WXMSW__
44#include <malloc.h>
45#endif
46
47#include <math.h>
48#include <emmintrin.h>
49
50#ifdef _WIN32
51
52// Windows
53#include <intrin.h>
54#define cpuid __cpuid
55
56#else
57
58// GCC Inline Assembly
// Stand-in for the MSVC __cpuid intrinsic on GCC-compatible compilers.
//
// Executes the CPUID instruction for leaf InfoType and stores the resulting
// EAX, EBX, ECX and EDX values in CPUInfo[0..3] respectively.
//
// ECX is loaded with zero before the instruction runs.  The MSVC intrinsic
// this function replaces does the same, so both builds query sub-leaf 0
// instead of depending on whatever value was left in the register.
void cpuid(int CPUInfo[4], int InfoType)
{
   __asm__ __volatile__ (
      "cpuid"
      : "=a" (CPUInfo[0]),
        "=b" (CPUInfo[1]),
        "=c" (CPUInfo[2]),
        "=d" (CPUInfo[3])
      : "a" (InfoType),
        "c" (0)
   );
}
69
70#endif
71
72bool sMathCapsInitialized = false;
73
74MathCaps sMathCaps;
75
76// dirty switcher
77int sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED;
78
79void EffectEqualization48x::SetMathPath(int mathPath) { sMathPath=mathPath; };
80
81int EffectEqualization48x::GetMathPath() { return sMathPath; };
82
83void EffectEqualization48x::AddMathPathOption(int mathPath) { sMathPath|=mathPath; };
84
85void EffectEqualization48x::RemoveMathPathOption(int mathPath) { sMathPath&=~mathPath; };
86
87MathCaps *EffectEqualization48x::GetMathCaps()
88{
89 if(!sMathCapsInitialized)
90 {
91 sMathCapsInitialized=true;
92 sMathCaps.x64 = false;
93 sMathCaps.MMX = false;
94 sMathCaps.SSE = false;
95 sMathCaps.SSE2 = false;
96 sMathCaps.SSE3 = false;
97 sMathCaps.SSSE3 = false;
98 sMathCaps.SSE41 = false;
99 sMathCaps.SSE42 = false;
100 sMathCaps.SSE4a = false;
101 sMathCaps.AVX = false;
102 sMathCaps.XOP = false;
103 sMathCaps.FMA3 = false;
104 sMathCaps.FMA4 = false;
105
106 int info[4];
107 cpuid(info, 0);
108 int nIds = info[0];
109
110 cpuid(info, 0x80000000);
111 int nExIds = info[0];
112
113 // Detect Instruction Set
114 if (nIds >= 1){
115 cpuid(info,0x00000001);
116 sMathCaps.MMX = (info[3] & ((int)1 << 23)) != 0;
117 sMathCaps.SSE = (info[3] & ((int)1 << 25)) != 0;
118 sMathCaps.SSE2 = (info[3] & ((int)1 << 26)) != 0;
119 sMathCaps.SSE3 = (info[2] & ((int)1 << 0)) != 0;
120
121 sMathCaps.SSSE3 = (info[2] & ((int)1 << 9)) != 0;
122 sMathCaps.SSE41 = (info[2] & ((int)1 << 19)) != 0;
123 sMathCaps.SSE42 = (info[2] & ((int)1 << 20)) != 0;
124
125 sMathCaps.AVX = (info[2] & ((int)1 << 28)) != 0;
126 sMathCaps.FMA3 = (info[2] & ((int)1 << 12)) != 0;
127 }
128
129 if (nExIds >= 0x80000001){
130 cpuid(info,0x80000001);
131 sMathCaps.x64 = (info[3] & ((int)1 << 29)) != 0;
132 sMathCaps.SSE4a = (info[2] & ((int)1 << 6)) != 0;
133 sMathCaps.FMA4 = (info[2] & ((int)1 << 16)) != 0;
134 sMathCaps.XOP = (info[2] & ((int)1 << 11)) != 0;
135 }
136 if(sMathCaps.SSE)
137 sMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED; // we are starting on.
138 }
139 return &sMathCaps;
140};
141
// Allocates `size` bytes aligned to a 16-byte boundary, as required by the
// SSE loads and stores used in this file.
//
// Returns nullptr when the allocation fails.  The memory must be released
// through free_simd, which uses _aligned_free under WIN32 and free()
// everywhere else.
//
// Outside Windows this uses posix_memalign, which is declared in <stdlib.h>
// (already included by this file) and whose result may be passed to free().
// The previous per-platform calls depended on a header this file only
// includes on Windows (memalign), on an obsolete function (valloc), or on the
// platform allocator's default alignment (malloc).
void * malloc_simd(const size_t size)
{
#if defined WIN32 // WIN32
   return _aligned_malloc(size, 16);
#else // POSIX systems
   void *mem = nullptr;
   if (posix_memalign(&mem, 16, size) != 0)
      return nullptr;
   return mem;
#endif
}
154
155void free_simd::operator() (void* mem) const
156{
157#if defined WIN32 // WIN32
158 _aligned_free(mem);
159#else
160 free(mem);
161#endif
162}
163
164EffectEqualization48x::EffectEqualization48x():
165 mThreadCount(0),mFilterSize(0),mWindowSize(0),mBlockSize(0),mWorkerDataCount(0),mBlocksPerBuffer(20),
166 mScratchBufferSize(0),mSubBufferSize(0),mThreaded(false),
167 mBenching(false),mBufferCount(0)
168{
169}
170
171EffectEqualization48x::~EffectEqualization48x()
172{
173}
174
175bool EffectEqualization48x::AllocateBuffersWorkers(int nThreads)
176{
177 if(mBigBuffer)
178 FreeBuffersWorkers();
179 mFilterSize=(mEffectEqualization->mM-1)&(~15); // 4000 !!! Filter MUST BE QUAD WORD ALIGNED !!!!
180 mWindowSize=mEffectEqualization->windowSize;
181 wxASSERT(mFilterSize < mWindowSize);
182 mBlockSize=mWindowSize-mFilterSize; // 12,384
183 auto threadCount = wxThread::GetCPUCount();
184 mThreaded = (nThreads > 0 && threadCount > 0);
185 if(mThreaded)
186 {
187 mThreadCount = threadCount;
188 mWorkerDataCount=mThreadCount+2; // 2 extra slots (maybe double later)
189 } else {
190 mWorkerDataCount=1;
191 mThreadCount=0;
192 }
193#ifdef __AVX_ENABLED
194 mBufferCount=sMathPath&MATH_FUNCTION_AVX?8:4;
195#else
196 mBufferCount=4;
197#endif
198 // we're skewing the data by one block to allow for 1/4 block intersections.
199 // this will remove the disparity in data at the intersections of the runs
200
201 // The nice magic allocation
202 // megabyte - 3 windows - 4 overlapping buffers - filter
203 // 2^20 = 1,048,576 - 3 * 2^14 (16,384) - ((4 * 20) - 3) * 12,384 - 4000
204 // 1,048,576 - 49,152 - 953,568 - 4000 = 41,856 (leftover)
205
206 mScratchBufferSize=mWindowSize*3*sizeof(float)*mBufferCount; // 3 window size blocks of instruction size
207 mSubBufferSize=mBlockSize*(mBufferCount*(mBlocksPerBuffer-1)); // we are going to do a full block overlap
208 mBigBuffer.reset( (float *)malloc_simd(sizeof(float) * (mSubBufferSize + mFilterSize + mScratchBufferSize) * mWorkerDataCount) ); // we run over by filtersize
209 // fill the bufferInfo
210 mBufferInfo.reinit(mWorkerDataCount);
211 for(int i=0;i<mWorkerDataCount;i++) {
212 mBufferInfo[i].mFftWindowSize=mWindowSize;
213 mBufferInfo[i].mFftFilterSize=mFilterSize;
214 mBufferInfo[i].mBufferLength=mBlockSize*mBlocksPerBuffer;
215 mBufferInfo[i].mContiguousBufferSize=mSubBufferSize;
216 mBufferInfo[i].mScratchBuffer=&mBigBuffer[(mSubBufferSize+mScratchBufferSize)*i+mSubBufferSize];
217 for(int j=0;j<mBufferCount;j++)
218 mBufferInfo[i].mBufferDest[j]=mBufferInfo[i].mBufferSouce[j]=&mBigBuffer[j*(mBufferInfo[i].mBufferLength-mBlockSize)+(mSubBufferSize+mScratchBufferSize)*i];
219 }
220 if(mThreadCount) {
221 // start the workers
222 mDataMutex.IsOk();
223 mEQWorkers.reinit(mThreadCount);
224 for(int i=0;i<mThreadCount;i++) {
225 mEQWorkers[i].SetData( mBufferInfo.get(), mWorkerDataCount, &mDataMutex, this);
226 mEQWorkers[i].Create();
227 mEQWorkers[i].Run();
228 }
229 }
230 return true;
231}
232
233bool EffectEqualization48x::FreeBuffersWorkers()
234{
235 if(mThreaded) {
236 for(int i=0;i<mThreadCount;i++) { // tell all the workers to exit
237 mEQWorkers[i].ExitLoop();
238 }
239 for(int i=0;i<mThreadCount;i++) {
240 mEQWorkers[i].Wait();
241 }
242 mEQWorkers.reset(); // kill the workers ( go directly to jail)
243 mThreadCount=0;
244 mWorkerDataCount=0;
245 }
246 mBufferInfo.reset();
247 mBigBuffer.reset();
248 return true;
249}
250
251
252#pragma warning(push)
253// Disable the unreachable code warning in MSVC, for this function.
254#pragma warning(disable: 4702)
255bool EffectEqualization48x::RunFunctionSelect(int flags, int count, WaveTrack * track, sampleCount start, sampleCount len)
256{
257 // deal with tables here
258 flags&=~(MATH_FUNCTION_BITREVERSE_TABLE|MATH_FUNCTION_SIN_COS_TABLE); // clear out the table flags
259 switch (flags)
260 {
261 case MATH_FUNCTION_SSE:
262 return ProcessOne4x(count, track, start, len);
263 break;
264 case MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED:
265 return ProcessOne1x4xThreaded(count, track, start, len);
266 break;
267 case MATH_FUNCTION_THREADED:
268 case MATH_FUNCTION_THREADED|MATH_FUNCTION_SEGMENTED_CODE:
269 return ProcessOne1x4xThreaded(count, track, start, len, 1);
270 break;
271 case MATH_FUNCTION_SEGMENTED_CODE:
272 return ProcessOne1x(count, track, start, len);
273 break;
274 default:
275 return !mEffectEqualization->ProcessOne(count, track, start, len);
276 break;
277 }
278 return false;
279}
280#pragma warning(pop)
281
282bool EffectEqualization48x::Process(EffectEqualization* effectEqualization)
283{
284 mEffectEqualization=effectEqualization;
285// return TrackCompare(); // used for debugging data
286 mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
287 bool bBreakLoop = false;
288
289 TableUsage(sMathPath);
290 if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
291 mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
292 AllocateBuffersWorkers(sMathPath&MATH_FUNCTION_THREADED);
293 auto cleanup = finally( [&] { FreeBuffersWorkers(); } );
294 int count = 0;
295 for( auto track :
296 mEffectEqualization->mOutputTracks->Selected< WaveTrack >() ) {
297 double trackStart = track->GetStartTime();
298 double trackEnd = track->GetEndTime();
299 double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
300 double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
301
302 if (t1 > t0) {
303 auto start = track->TimeToLongSamples(t0);
304 auto end = track->TimeToLongSamples(t1);
305 auto len = end - start;
306 bBreakLoop=RunFunctionSelect(sMathPath, count, track, start, len);
307 if( bBreakLoop )
308 break;
309 }
310 count++;
311 }
312
313 mEffectEqualization->ReplaceProcessedTracks(!bBreakLoop);
314 return !bBreakLoop;
315}
316
317bool EffectEqualization48x::TrackCompare()
318{
319 mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
320 bool bBreakLoop = false;
321
322 TableUsage(sMathPath);
323 if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
324 mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
325 AllocateBuffersWorkers(sMathPath&MATH_FUNCTION_THREADED);
326 auto cleanup = finally( [&] { FreeBuffersWorkers(); } );
327 // Reset map
328 // PRL: These two maps aren't really used
329 std::vector<const Track*> SecondIMap;
330 std::vector<Track*> SecondOMap;
331 SecondIMap.clear();
332 SecondOMap.clear();
333
334 auto pSecondOutputTracks = TrackList::Create( nullptr );
335 auto &SecondOutputTracks = *pSecondOutputTracks;
336
337 for (auto aTrack :
338 mEffectEqualization->inputTracks()->Any< const WaveTrack >()) {
339
340 // Include selected tracks, plus sync-lock selected tracks
341 if (aTrack->GetSelected() || aTrack->IsSyncLockSelected())
342 {
343 auto o = mEffectEqualization->mFactory->DuplicateWaveTrack( *aTrack );
344 SecondIMap.push_back(aTrack);
345 SecondIMap.push_back(o.get());
346 SecondOutputTracks.Add( o );
347 }
348 }
349
350 for(int i = 0; i < 2; i++) {
351 i?sMathPath=sMathPath:sMathPath=0;
352 int count = 0;
353 for( auto track :
354 ( i ? mEffectEqualization->mOutputTracks.get()
355 : &SecondOutputTracks ) -> Selected< WaveTrack >() ) {
356 double trackStart = track->GetStartTime();
357 double trackEnd = track->GetEndTime();
358 double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
359 double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
360
361 if (t1 > t0) {
362 auto start = track->TimeToLongSamples(t0);
363 auto end = track->TimeToLongSamples(t1);
364 auto len = end - start;
365 bBreakLoop=RunFunctionSelect(sMathPath, count, track, start, len);
366 if( bBreakLoop )
367 break;
368 }
369 count++;
370 }
371 }
372
373 auto iter2 = (SecondOutputTracks.Selected< const WaveTrack >()).first;
374 auto track2 = *iter2;
375 for ( auto track :
376 mEffectEqualization->mOutputTracks->Selected< WaveTrack >() ) {
377 double trackStart = track->GetStartTime();
378 double trackEnd = track->GetEndTime();
379 double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
380 double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
381
382 if (t1 > t0) {
383 auto start = track->TimeToLongSamples(t0);
384 auto end = track->TimeToLongSamples(t1);
385 auto len = end - start;
386 DeltaTrack(track, track2, start, len);
387 }
388 track2 = * ++iter2;
389 }
390 mEffectEqualization->ReplaceProcessedTracks(!bBreakLoop);
391 return bBreakLoop; // return !bBreakLoop ?
392}
393
394bool EffectEqualization48x::DeltaTrack(
395 WaveTrack * t, const WaveTrack * t2, sampleCount start, sampleCount len)
396{
397
398 auto trackBlockSize = t->GetMaxBlockSize();
399
400 Floats buffer1{ trackBlockSize };
401 Floats buffer2{ trackBlockSize };
402
403 auto output = t->EmptyCopy();
405 auto originalLen = len;
406 auto currentSample = start;
407
408 while(len > 0) {
409 auto curretLength = limitSampleBufferSize(trackBlockSize, len);
410 t->Get((samplePtr)buffer1.get(), floatSample, currentSample, curretLength);
411 t2->Get((samplePtr)buffer2.get(), floatSample, currentSample, curretLength);
412 for(decltype(curretLength) i=0;i<curretLength;i++)
413 buffer1[i]-=buffer2[i];
414 output->Append((samplePtr)buffer1.get(), floatSample, curretLength);
415 currentSample+=curretLength;
416 len-=curretLength;
417 }
418 output->Flush();
419 len=originalLen;
420 ProcessTail(t, output.get(), start, len);
421 return true;
422}
423
424#include <wx/stopwatch.h>
425
426bool EffectEqualization48x::Benchmark(EffectEqualization* effectEqualization)
427{
428 mEffectEqualization=effectEqualization;
429 mEffectEqualization->CopyInputTracks(); // Set up mOutputTracks.
430 bool bBreakLoop = false;
431
432 TableUsage(sMathPath);
433 if(sMathPath) // !!! Filter MUST BE QUAD WORD ALIGNED !!!!
434 mEffectEqualization->mM=(mEffectEqualization->mM&(~15))+1;
435 AllocateBuffersWorkers(MATH_FUNCTION_THREADED);
436 auto cleanup = finally( [&] { FreeBuffersWorkers(); } );
437 long times[] = { 0,0,0,0,0 };
438 wxStopWatch timer;
439 mBenching = true;
440 for(int i = 0; i < 5 && !bBreakLoop; i++) {
441 int localMathPath;
442 switch(i) {
443 case 0: localMathPath=MATH_FUNCTION_SSE|MATH_FUNCTION_THREADED;
444 if(!sMathCaps.SSE)
445 localMathPath=-1;
446 break;
447 case 1: localMathPath=MATH_FUNCTION_SSE;
448 if(!sMathCaps.SSE)
449 localMathPath=-1;
450 break;
451 case 2: localMathPath=MATH_FUNCTION_SEGMENTED_CODE;
452 break;
453 case 3: localMathPath=MATH_FUNCTION_THREADED|MATH_FUNCTION_SEGMENTED_CODE;
454 break;
455 case 4: localMathPath=0;
456 break;
457 default: localMathPath=-1;
458 }
459 if(localMathPath >= 0) {
460 timer.Start();
461 int count = 0;
462 for (auto track :
463 mEffectEqualization->mOutputTracks->Selected< WaveTrack >() ) {
464 double trackStart = track->GetStartTime();
465 double trackEnd = track->GetEndTime();
466 double t0 = mEffectEqualization->mT0 < trackStart? trackStart: mEffectEqualization->mT0;
467 double t1 = mEffectEqualization->mT1 > trackEnd? trackEnd: mEffectEqualization->mT1;
468
469 if (t1 > t0) {
470 auto start = track->TimeToLongSamples(t0);
471 auto end = track->TimeToLongSamples(t1);
472 auto len = end - start;
473 bBreakLoop=RunFunctionSelect( localMathPath, count, track, start, len);
474 if( bBreakLoop )
475 break;
476 }
477 count++;
478 }
479 times[i]=timer.Time();
480 }
481 }
482 mBenching=false;
483 bBreakLoop=false;
484 mEffectEqualization->ReplaceProcessedTracks(bBreakLoop);
485
486 wxTimeSpan tsSSEThreaded(0, 0, 0, times[0]);
487 wxTimeSpan tsSSE(0, 0, 0, times[1]);
488 wxTimeSpan tsDefaultEnhanced(0, 0, 0, times[2]);
489 wxTimeSpan tsDefaultThreaded(0, 0, 0, times[3]);
490 wxTimeSpan tsDefault(0, 0, 0, times[4]);
491
492 mEffectEqualization->MessageBox(
493 XO(
494"Benchmark times:\nOriginal: %s\nDefault Segmented: %s\nDefault Threaded: %s\nSSE: %s\nSSE Threaded: %s\n")
495 .Format(
496 tsDefault.Format(wxT("%M:%S.%l")),
497 tsDefaultEnhanced.Format(wxT("%M:%S.%l")),
498 tsDefaultThreaded.Format(wxT("%M:%S.%l")),
499 tsSSE.Format(wxT("%M:%S.%l")),
500 tsSSEThreaded.Format(wxT("%M:%S.%l")) ) );
501 return bBreakLoop; // return !bBreakLoop ?
502}
503
504bool EffectEqualization48x::ProcessTail(WaveTrack * t, WaveTrack * output, sampleCount start, sampleCount len)
505{
506 // double offsetT0 = t->LongSamplesToTime(offset);
507 double lenT = t->LongSamplesToTime(len);
508 // 'start' is the sample offset in 't', the passed in track
509 // 'startT' is the equivalent time value
510 // 'output' starts at zero
511 double startT = t->LongSamplesToTime(start);
512
513 //output has one waveclip for the total length, even though
514 //t might have whitespace separating multiple clips
515 //we want to maintain the original clip structure, so
516 //only paste the intersections of the NEW clip.
517
518 //Find the bits of clips that need replacing
519 std::vector<std::pair<double, double> > clipStartEndTimes;
520 std::vector<std::pair<double, double> > clipRealStartEndTimes; //the above may be truncated due to a clip being partially selected
521 for (const auto &clip: t->GetClips())
522 {
523 double clipStartT;
524 double clipEndT;
525
526 clipStartT = clip->GetStartTime();
527 clipEndT = clip->GetEndTime();
528 if( clipEndT <= startT )
529 continue; // clip is not within selection
530 if( clipStartT >= startT + lenT )
531 continue; // clip is not within selection
532
533 //save the actual clip start/end so that we can rejoin them after we paste.
534 clipRealStartEndTimes.push_back(std::pair<double,double>(clipStartT,clipEndT));
535
536 if( clipStartT < startT ) // does selection cover the whole clip?
537 clipStartT = startT; // don't copy all the NEW clip
538 if( clipEndT > startT + lenT ) // does selection cover the whole clip?
539 clipEndT = startT + lenT; // don't copy all the NEW clip
540
541 //save them
542 clipStartEndTimes.push_back(std::pair<double,double>(clipStartT,clipEndT));
543 }
544 //now go thru and replace the old clips with NEW
545 for(unsigned int i=0;i<clipStartEndTimes.size();i++)
546 {
547 //remove the old audio and get the NEW
548 t->Clear(clipStartEndTimes[i].first,clipStartEndTimes[i].second);
549 // output->Copy(clipStartEndTimes[i].first-startT+offsetT0,clipStartEndTimes[i].second-startT+offsetT0, &toClipOutput);
550 auto toClipOutput = output->Copy(clipStartEndTimes[i].first-startT, clipStartEndTimes[i].second-startT);
551 //put the processed audio in
552 t->Paste(clipStartEndTimes[i].first, toClipOutput.get());
553 //if the clip was only partially selected, the Paste will have created a split line. Join is needed to take care of this
554 //This is not true when the selection is fully contained within one clip (second half of conditional)
555 if( (clipRealStartEndTimes[i].first != clipStartEndTimes[i].first ||
556 clipRealStartEndTimes[i].second != clipStartEndTimes[i].second) &&
557 !(clipRealStartEndTimes[i].first <= startT &&
558 clipRealStartEndTimes[i].second >= startT+lenT) )
559 t->Join(clipRealStartEndTimes[i].first,clipRealStartEndTimes[i].second);
560 }
561 return true;
562}
563
564bool EffectEqualization48x::ProcessBuffer(fft_type *sourceBuffer, fft_type *destBuffer, size_t bufferLength)
565{
566 BufferInfo bufferInfo;
567 bufferInfo.mContiguousBufferSize=bufferLength;
568 bufferInfo.mBufferSouce[0]=sourceBuffer;
569 bufferInfo.mBufferDest[0]=destBuffer;
570 bufferInfo.mScratchBuffer=&sourceBuffer[mSubBufferSize];
571 return ProcessBuffer1x(&bufferInfo);
572}
573
574bool EffectEqualization48x::ProcessBuffer1x(BufferInfo *bufferInfo)
575{
576 int bufferCount=bufferInfo->mContiguousBufferSize?1:4;
577 for(int bufferIndex=0;bufferIndex<bufferCount;bufferIndex++)
578 {
579 auto bufferLength=bufferInfo->mBufferLength;
580 if(bufferInfo->mContiguousBufferSize)
581 bufferLength=bufferInfo->mContiguousBufferSize;
582
583 auto blockCount=bufferLength/mBlockSize;
584 auto lastBlockSize=bufferLength%mBlockSize;
585 if(lastBlockSize)
586 blockCount++;
587
588 float *workBuffer=bufferInfo->mScratchBuffer; // all scratch buffers are at the end
589 float *scratchBuffer=&workBuffer[mWindowSize*2]; // all scratch buffers are at the end
590 float *sourceBuffer=bufferInfo->mBufferSouce[bufferIndex];
591 float *destBuffer=bufferInfo->mBufferDest[bufferIndex];
592 for(size_t runx=0;runx<blockCount;runx++)
593 {
594 float *currentBuffer=&workBuffer[mWindowSize*(runx&1)];
595 for(int i=0;i<mBlockSize;i++)
596 currentBuffer[i]=sourceBuffer[i];
597 sourceBuffer+=mBlockSize;
598 float *currentFilter=&currentBuffer[mBlockSize];
599 for(int i=0;i<mFilterSize;i++)
600 currentFilter[i]=0;
601// mEffectEqualization->Filter(mWindowSize, currentBuffer);
602 Filter1x(mWindowSize, currentBuffer, scratchBuffer);
603 float *writeEnd=currentBuffer+mBlockSize;
604 if(runx==blockCount)
605 writeEnd=currentBuffer+(lastBlockSize+mFilterSize);
606 if(runx) {
607 float *lastOverrun=&workBuffer[mWindowSize*((runx+1)&1)+mBlockSize];
608 for(int j=0;j<mFilterSize;j++)
609 *destBuffer++= *currentBuffer++ + *lastOverrun++;
610 } else
611 currentBuffer+=mFilterSize>>1; // this will skip the first filterSize on the first run
612 while(currentBuffer<writeEnd)
613 *destBuffer++ = *currentBuffer++;
614 }
615 }
616 return true;
617}
618
619bool EffectEqualization48x::ProcessOne1x(int count, WaveTrack * t,
620 sampleCount start, sampleCount len)
621{
622 //sampleCount blockCount=len/mBlockSize;
623
624 auto trackBlockSize = t->GetMaxBlockSize();
625
626 auto output = t->EmptyCopy();
628
629 mEffectEqualization->TrackProgress(count, 0.0);
630 int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active
631 auto bigRuns=len/(subBufferSize-mBlockSize);
632 int trackBlocksPerBig=subBufferSize/trackBlockSize;
633 int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize;
634 size_t singleProcessLength;
635 if(bigRuns == 0)
636 singleProcessLength = len.as_size_t();
637 else
638 singleProcessLength =
639 ((mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize)))
640 .as_size_t();
641 auto currentSample=start;
642 bool bBreakLoop = false;
643 for(int bigRun=0;bigRun<bigRuns;bigRun++)
644 {
645 // fill the buffer
646 for(int i=0;i<trackBlocksPerBig;i++) {
647 t->Get((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize);
648 currentSample+=trackBlockSize;
649 }
650 if(trackLeftovers) {
651 t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
652 currentSample+=trackLeftovers;
653 }
654 currentSample-=mBlockSize+(mFilterSize>>1);
655
656 ProcessBuffer1x(mBufferInfo.get());
657 bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/bigRuns.as_double());
658 if( bBreakLoop )
659 break;
660 output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1)));
661 }
662 if(singleProcessLength && !bBreakLoop) {
663 t->Get((samplePtr)mBigBuffer.get(), floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
664 ProcessBuffer(mBigBuffer.get(), mBigBuffer.get(), singleProcessLength+mBlockSize+(mFilterSize>>1));
665 output->Append((samplePtr)&mBigBuffer[bigRuns > 0 ? mBlockSize : 0], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
666 }
667 output->Flush();
668 if(!bBreakLoop)
669 ProcessTail(t, output.get(), start, len);
670 return bBreakLoop;
671}
672
673void EffectEqualization48x::Filter1x(size_t len,
674 float *buffer, float *scratchBuffer)
675{
676 int i;
677 float real, imag;
678 // Apply FFT
679 RealFFTf1x(buffer, mEffectEqualization->hFFT.get());
680
681 // Apply filter
682 // DC component is purely real
683
684 float filterFuncR, filterFuncI;
685 filterFuncR = mEffectEqualization->mFilterFuncR[0];
686 scratchBuffer[0] = buffer[0] * filterFuncR;
687 auto halfLength = (len / 2);
688
689 bool useBitReverseTable=sMathPath&1;
690
691 for(i = 1; i < halfLength; i++)
692 {
693 if(useBitReverseTable) {
694 real=buffer[mEffectEqualization->hFFT->BitReversed[i] ];
695 imag=buffer[mEffectEqualization->hFFT->BitReversed[i]+1];
696 } else {
697 int bitReversed=SmallRB(i,mEffectEqualization->hFFT->pow2Bits);
698 real=buffer[bitReversed];
699 imag=buffer[bitReversed+1];
700 }
701 filterFuncR=mEffectEqualization->mFilterFuncR[i];
702 filterFuncI=mEffectEqualization->mFilterFuncI[i];
703
704 scratchBuffer[2*i ] = real*filterFuncR - imag*filterFuncI;
705 scratchBuffer[2*i+1] = real*filterFuncI + imag*filterFuncR;
706 }
707 // Fs/2 component is purely real
708 filterFuncR=mEffectEqualization->mFilterFuncR[halfLength];
709 scratchBuffer[1] = buffer[1] * filterFuncR;
710
711 // Inverse FFT and normalization
712 InverseRealFFTf1x(scratchBuffer, mEffectEqualization->hFFT.get());
713 ReorderToTime1x(mEffectEqualization->hFFT.get(), scratchBuffer, buffer);
714}
715
716bool EffectEqualization48x::ProcessBuffer4x(BufferInfo *bufferInfo)
717{
718 // length must be a factor of window size for 4x processing.
719 if(bufferInfo->mBufferLength%mBlockSize)
720 return false;
721
722 auto blockCount=bufferInfo->mBufferLength/mBlockSize;
723
724 __m128 *readBlocks[4]; // some temps so we don't destroy the vars in the struct
725 __m128 *writeBlocks[4];
726 for(int i=0;i<4;i++) {
727 readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i];
728 writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i];
729 }
730
731 __m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer;
732 __m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*2];
733
734 for(size_t run4x=0;run4x<blockCount;run4x++)
735 {
736 // swizzle the data to the swizzle buffer
737 __m128 *currentSwizzledBlock=&swizzledBuffer128[mWindowSize*(run4x&1)];
738 for(int i=0,j=0;j<mBlockSize;i++,j+=4) {
739 __m128 tmp0 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(1,0,1,0));
740 __m128 tmp1 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(3,2,3,2));
741 __m128 tmp2 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(1,0,1,0));
742 __m128 tmp3 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(3,2,3,2));
743 currentSwizzledBlock[j] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,2,0));
744 currentSwizzledBlock[j+1] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3,1,3,1));
745 currentSwizzledBlock[j+2] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2,0,2,0));
746 currentSwizzledBlock[j+3] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3,1,3,1));
747 }
748 __m128 *thisOverrun128=&currentSwizzledBlock[mBlockSize];
749 for(int i=0;i<mFilterSize;i++)
750 thisOverrun128[i]=_mm_set1_ps(0.0);
751 Filter4x(mWindowSize, (float *)currentSwizzledBlock, (float *)scratchBuffer);
752 int writeStart=0, writeToStart=0; // note readStart is where the read data is written
753 int writeEnd=mBlockSize;
754 if(run4x) {
755 // maybe later swizzle add and write in one
756 __m128 *lastOverrun128=&swizzledBuffer128[mWindowSize*((run4x+1)&1)+mBlockSize];
757 // add and swizzle data + filter
758 for(int i=0,j=0;j<mFilterSize;i++,j+=4) {
759 __m128 tmps0 = _mm_add_ps(currentSwizzledBlock[j], lastOverrun128[j]);
760 __m128 tmps1 = _mm_add_ps(currentSwizzledBlock[j+1], lastOverrun128[j+1]);
761 __m128 tmps2 = _mm_add_ps(currentSwizzledBlock[j+2], lastOverrun128[j+2]);
762 __m128 tmps3 = _mm_add_ps(currentSwizzledBlock[j+3], lastOverrun128[j+3]);
763 __m128 tmp0 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(0,1,0,1));
764 __m128 tmp1 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(2,3,2,3));
765 __m128 tmp2 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(0,1,0,1));
766 __m128 tmp3 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(2,3,2,3));
767 writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
768 writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
769 writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
770 writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
771 }
772 writeStart=mFilterSize;
773 writeToStart=mFilterSize>>2;
774 // swizzle it back.
775 for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=4) {
776 __m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
777 __m128 tmp1 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(2,3,2,3));
778 __m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(0,1,0,1));
779 __m128 tmp3 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(2,3,2,3));
780 writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
781 writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
782 writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
783 writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
784 }
785 } else {
786 // swizzle it back. We overlap one block so we only write the first block on the first run
787 writeStart=0;
788 writeToStart=0;
789 for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=4) {
790 __m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+1], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
791 __m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+2], _MM_SHUFFLE(0,1,0,1));
792 writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
793 }
794 }
795 for(int i=0;i<4;i++) { // shift each block
796 readBlocks[i]+=mBlockSize>>2; // these are 128b pointers, each window is 1/4 blockSize for those
797 writeBlocks[i]+=mBlockSize>>2;
798 }
799 }
800 return true;
801}
802
803bool EffectEqualization48x::ProcessOne4x(int count, WaveTrack * t,
804 sampleCount start, sampleCount len)
805{
806 int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active
807
808 if(len<subBufferSize) // it's not worth 4x processing do a regular process
809 return ProcessOne1x(count, t, start, len);
810
811 auto trackBlockSize = t->GetMaxBlockSize();
812
813 auto output = t->EmptyCopy();
815
816 mEffectEqualization->TrackProgress(count, 0.0);
817 auto bigRuns = len/(subBufferSize-mBlockSize);
818 int trackBlocksPerBig=subBufferSize/trackBlockSize;
819 int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize;
820 size_t singleProcessLength =
821 ((mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize)))
822 .as_size_t();
823 auto currentSample=start;
824
825 bool bBreakLoop = false;
826 for(int bigRun=0;bigRun<bigRuns;bigRun++)
827 {
828 // fill the buffer
829 for(int i=0;i<trackBlocksPerBig;i++) {
830 t->Get((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize);
831 currentSample+=trackBlockSize;
832 }
833 if(trackLeftovers) {
834 t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
835 currentSample+=trackLeftovers;
836 }
837 currentSample-=mBlockSize+(mFilterSize>>1);
838
839 ProcessBuffer4x(mBufferInfo.get());
840 bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/bigRuns.as_double());
841 if( bBreakLoop )
842 break;
843 output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1)));
844 }
845 if(singleProcessLength && !bBreakLoop) {
846 t->Get((samplePtr)mBigBuffer.get(), floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
847 ProcessBuffer(mBigBuffer.get(), mBigBuffer.get(), singleProcessLength+mBlockSize+(mFilterSize>>1));
848 output->Append((samplePtr)&mBigBuffer[bigRuns > 0 ? mBlockSize : 0], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
849// output->Append((samplePtr)&mBigBuffer[bigRuns?mBlockSize:0], floatSample, singleProcessLength);
850 }
851 output->Flush();
852 if(!bBreakLoop)
853 ProcessTail(t, output.get(), start, len);
854 return bBreakLoop;
855}
856
857#include <wx/thread.h>
858
859void *EQWorker::Entry()
860{
861 while(!mExitLoop) {
862 int i = 0;
863 {
864 wxMutexLocker locker( *mMutex );
865 for(; i < mBufferInfoCount; i++) {
866 if(mBufferInfoList[i].mBufferStatus==BufferReady) { // we found an unlocked ready buffer
867 mBufferInfoList[i].mBufferStatus=BufferBusy; // we own it now
868 break;
869 }
870 }
871 }
872 if ( i < mBufferInfoCount ) {
873 switch (mProcessingType)
874 {
875 case 1:
876 mEffectEqualization48x->ProcessBuffer1x(&mBufferInfoList[i]);
877 break;
878 case 4:
879 mEffectEqualization48x->ProcessBuffer4x(&mBufferInfoList[i]);
880 break;
881 }
882 mBufferInfoList[i].mBufferStatus=BufferDone; // we're done
883 }
884 }
885 return NULL;
886}
887
888bool EffectEqualization48x::ProcessOne1x4xThreaded(int count, WaveTrack * t,
889 sampleCount start, sampleCount len, int processingType)
890{
891 int subBufferSize=mBufferCount==8?(mSubBufferSize>>1):mSubBufferSize; // half the buffers if avx is active
892
893 sampleCount blockCount=len/mBlockSize;
894
895 if(blockCount<16) // it's not worth 4x processing do a regular process
896 return ProcessOne4x(count, t, start, len);
897 if(mThreadCount<=0 || blockCount<256) // don't do it without cores or big data
898 return ProcessOne4x(count, t, start, len);
899
900 for(int i=0;i<mThreadCount;i++)
901 mEQWorkers[i].mProcessingType=processingType;
902
903 auto output = t->EmptyCopy();
905
906 auto trackBlockSize = t->GetMaxBlockSize();
907 mEffectEqualization->TrackProgress(count, 0.0);
908 auto bigRuns = len/(subBufferSize-mBlockSize);
909 int trackBlocksPerBig=subBufferSize/trackBlockSize;
910 int trackLeftovers=subBufferSize-trackBlocksPerBig*trackBlockSize;
911 size_t singleProcessLength =
912 ((mFilterSize>>1)*bigRuns + len%(bigRuns*(subBufferSize-mBlockSize)))
913 .as_size_t();
914 auto currentSample=start;
915
916 int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0;
917
918 // fill the first workerDataCount buffers we checked above and there is at least this data
919 auto maxPreFill = bigRuns < mWorkerDataCount ? bigRuns : mWorkerDataCount;
920 for(int i=0;i<maxPreFill;i++)
921 {
922 // fill the buffer
923 for(int j=0;j<trackBlocksPerBig;j++) {
924 t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
925 currentSample+=trackBlockSize;
926 }
927 if(trackLeftovers) {
928 t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
929 currentSample+=trackLeftovers;
930 }
931 currentSample-=mBlockSize+(mFilterSize>>1);
932 mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin
933 }
934 int currentIndex=0;
935 bool bBreakLoop = false;
936 while(bigBlocksWritten<bigRuns && !bBreakLoop) {
937 bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigBlocksWritten)/bigRuns.as_double());
938 if( bBreakLoop )
939 break;
940 wxMutexLocker locker( mDataMutex ); // Get in line for data
941 // process as many blocks as we can
942 while((mBufferInfo[currentIndex].mBufferStatus==BufferDone) && (bigBlocksWritten<bigRuns)) { // data is ours
943 output->Append((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, subBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)));
944 bigBlocksWritten++;
945 if(bigBlocksRead<bigRuns) {
946 // fill the buffer
947 for(int j=0;j<trackBlocksPerBig;j++) {
948 t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
949 currentSample+=trackBlockSize;
950 }
951 if(trackLeftovers) {
952 t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
953 currentSample+=trackLeftovers;
954 }
955 currentSample-=mBlockSize+(mFilterSize>>1);
956 mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin
957 bigBlocksRead++;
958 } else mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // this is completely unnecessary
959 currentIndex=(currentIndex+1)%mWorkerDataCount;
960 }
961 }
962 if(singleProcessLength && !bBreakLoop) {
963 t->Get((samplePtr)mBigBuffer.get(), floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
964 ProcessBuffer(mBigBuffer.get(), mBigBuffer.get(), singleProcessLength+mBlockSize+(mFilterSize>>1));
965 output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
966 }
967 output->Flush();
968 if(!bBreakLoop)
969 ProcessTail(t, output.get(), start, len);
970 return bBreakLoop;
971}
972
973void EffectEqualization48x::Filter4x(size_t len,
974 float *buffer, float *scratchBuffer)
975{
976 int i;
977 __m128 real128, imag128;
978 // Apply FFT
979 RealFFTf4x(buffer, mEffectEqualization->hFFT.get());
980
981 // Apply filter
982 // DC component is purely real
983 __m128 *localFFTBuffer=(__m128 *)scratchBuffer;
984 __m128 *localBuffer=(__m128 *)buffer;
985
986 __m128 filterFuncR, filterFuncI;
987 filterFuncR = _mm_set1_ps(mEffectEqualization->mFilterFuncR[0]);
988 localFFTBuffer[0] = _mm_mul_ps(localBuffer[0], filterFuncR);
989 auto halfLength = (len / 2);
990
991 bool useBitReverseTable = sMathPath & 1;
992
993 for(i = 1; i < halfLength; i++)
994 {
995 if(useBitReverseTable) {
996 real128=localBuffer[mEffectEqualization->hFFT->BitReversed[i] ];
997 imag128=localBuffer[mEffectEqualization->hFFT->BitReversed[i]+1];
998 } else {
999 int bitReversed=SmallRB(i,mEffectEqualization->hFFT->pow2Bits);
1000 real128=localBuffer[bitReversed];
1001 imag128=localBuffer[bitReversed+1];
1002 }
1003 filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[i]);
1004 filterFuncI=_mm_set1_ps(mEffectEqualization->mFilterFuncI[i]);
1005 localFFTBuffer[2*i ] = _mm_sub_ps( _mm_mul_ps(real128, filterFuncR), _mm_mul_ps(imag128, filterFuncI));
1006 localFFTBuffer[2*i+1] = _mm_add_ps( _mm_mul_ps(real128, filterFuncI), _mm_mul_ps(imag128, filterFuncR));
1007 }
1008 // Fs/2 component is purely real
1009 filterFuncR=_mm_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]);
1010 localFFTBuffer[1] = _mm_mul_ps(localBuffer[1], filterFuncR);
1011
1012 // Inverse FFT and normalization
1013 InverseRealFFTf4x(scratchBuffer, mEffectEqualization->hFFT.get());
1014 ReorderToTime4x(mEffectEqualization->hFFT.get(), scratchBuffer, buffer);
1015}
1016
1017#ifdef __AVX_ENABLED
1018
1019// Note: although written, the AVX (8x) code below has not been tested.
1020
// Filters the eight source streams of `bufferInfo` block by block and writes
// the results to its eight destination streams.
//
// For every block of mBlockSize samples the eight streams are interleaved
// ("swizzled") into the scratch buffer, the rest of the window is zeroed,
// Filter8x is run on the window, and the result is de-interleaved into the
// destination streams, adding the overrun left behind by the previous block.
//
// Returns false without touching any data when mBufferLength is not a
// multiple of mBlockSize or when mBufferCount is not 8; returns true otherwise.
1021bool EffectEqualization48x::ProcessBuffer8x(BufferInfo *bufferInfo)
1022{
1023 // length must be a factor of window size for 4x processing.
// (the test below checks that the length is a whole number of mBlockSize blocks)
1024 if(bufferInfo->mBufferLength%mBlockSize || mBufferCount!=8)
1025 return false;
1026
1027 auto blockCount=bufferInfo->mBufferLength/mBlockSize;
1028
1029 __m128 *readBlocks[8]; // some temps so we don't destroy the vars in the struct
1030 __m128 *writeBlocks[8];
1031 for(int i=0;i<8;i++) {
1032 readBlocks[i]=(__m128 *)bufferInfo->mBufferSouce[i];
1033 writeBlocks[i]=(__m128 *)bufferInfo->mBufferDest[i];
1034 }
1035
// The first mWindowSize*4 __m128 elements of the scratch buffer hold two
// swizzled windows of doubleWindow elements each (selected alternately per
// block below); the area after them is the work buffer handed to Filter8x.
1036 __m128 *swizzledBuffer128=(__m128 *)bufferInfo->mScratchBuffer;
1037 __m128 *scratchBuffer=&swizzledBuffer128[mWindowSize*4];
1038
// All three sizes are counted in __m128 elements of the swizzled layout.
1039 int doubleFilter=mFilterSize<<1;
1040 int doubleWindow=mWindowSize<<1;
1041 int doubleBlock=mBlockSize<<1;
1042 for(int run4x=0;run4x<blockCount;run4x++)
1043 {
1044 // swizzle the data to the swizzle buffer
// Even and odd blocks use different halves, so the previous block's overrun
// is still available after this block has been filtered.
1045 __m128 *currentSwizzledBlock=&swizzledBuffer128[doubleWindow*(run4x&1)];
// Each pass reads one __m128 from each of the eight streams and stores eight
// __m128: streams 0-3 are transposed into the even slots (j, j+2, j+4, j+6),
// streams 4-7 into the odd slots. Filter8x views this buffer as __m256, i.e.
// each even/odd pair forms one 256-bit element.
1046 for(int i=0,j=0;j<doubleBlock;i++,j+=8) { // mBlockSize or doubleBlock???
1047 __m128 tmp0 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(1,0,1,0));
1048 __m128 tmp1 = _mm_shuffle_ps(readBlocks[0][i], readBlocks[1][i], _MM_SHUFFLE(3,2,3,2));
1049 __m128 tmp2 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(1,0,1,0));
1050 __m128 tmp3 = _mm_shuffle_ps(readBlocks[2][i], readBlocks[3][i], _MM_SHUFFLE(3,2,3,2));
1051 currentSwizzledBlock[j] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,2,0));
1052 currentSwizzledBlock[j+2] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3,1,3,1));
1053 currentSwizzledBlock[j+4] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2,0,2,0));
1054 currentSwizzledBlock[j+6] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3,1,3,1));
1055 tmp0 = _mm_shuffle_ps(readBlocks[4][i], readBlocks[5][i], _MM_SHUFFLE(1,0,1,0));
1056 tmp1 = _mm_shuffle_ps(readBlocks[4][i], readBlocks[5][i], _MM_SHUFFLE(3,2,3,2));
1057 tmp2 = _mm_shuffle_ps(readBlocks[6][i], readBlocks[7][i], _MM_SHUFFLE(1,0,1,0));
1058 tmp3 = _mm_shuffle_ps(readBlocks[6][i], readBlocks[7][i], _MM_SHUFFLE(3,2,3,2));
1059 currentSwizzledBlock[j+1] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,2,0));
1060 currentSwizzledBlock[j+3] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3,1,3,1));
1061 currentSwizzledBlock[j+5] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2,0,2,0));
1062 currentSwizzledBlock[j+7] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3,1,3,1));
1063 }
// Zero the doubleFilter elements that follow the block data before filtering.
1064 __m128 *thisOverrun128=&currentSwizzledBlock[doubleBlock];
1065 for(int i=0;i<doubleFilter;i++)
1066 thisOverrun128[i]=_mm_set1_ps(0.0);
1067 Filter8x(mWindowSize, (float *)currentSwizzledBlock, (float *)scratchBuffer);
1068 int writeStart=0, writeToStart=0; // note readStart is where the read data is written
1069 int writeEnd=doubleBlock;
1070 if(run4x) {
1071 // maybe later swizzle add and write in one
// Overrun of the previous block: it lives in the other half of the swizzle
// area, right after that block's doubleBlock elements.
1072 __m128 *lastOverrun128=&swizzledBuffer128[doubleWindow*((run4x+1)&1)+doubleBlock];
1073 // add and swizzle data + filter
// The first doubleFilter elements of this block get the previous overrun
// added and are de-interleaved into the start of the eight output streams.
1074 for(int i=0,j=0;j<doubleFilter;i++,j+=8) {
1075 __m128 tmps0 = _mm_add_ps(currentSwizzledBlock[j], lastOverrun128[j]);
1076 __m128 tmps1 = _mm_add_ps(currentSwizzledBlock[j+2], lastOverrun128[j+2]);
1077 __m128 tmps2 = _mm_add_ps(currentSwizzledBlock[j+4], lastOverrun128[j+4]);
1078 __m128 tmps3 = _mm_add_ps(currentSwizzledBlock[j+6], lastOverrun128[j+6]);
1079 __m128 tmp0 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(0,1,0,1));
1080 __m128 tmp1 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(2,3,2,3));
1081 __m128 tmp2 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(0,1,0,1));
1082 __m128 tmp3 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(2,3,2,3));
1083 writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
1084 writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
1085 writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
1086 writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
1087 tmps0 = _mm_add_ps(currentSwizzledBlock[j+1], lastOverrun128[j+1]);
1088 tmps1 = _mm_add_ps(currentSwizzledBlock[j+3], lastOverrun128[j+3]);
1089 tmps2 = _mm_add_ps(currentSwizzledBlock[j+5], lastOverrun128[j+5]);
1090 tmps3 = _mm_add_ps(currentSwizzledBlock[j+7], lastOverrun128[j+7]);
1091 tmp0 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(0,1,0,1));
1092 tmp1 = _mm_shuffle_ps(tmps1, tmps0, _MM_SHUFFLE(2,3,2,3));
1093 tmp2 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(0,1,0,1));
1094 tmp3 = _mm_shuffle_ps(tmps3, tmps2, _MM_SHUFFLE(2,3,2,3));
1095 writeBlocks[4][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
1096 writeBlocks[5][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
1097 writeBlocks[6][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
1098 writeBlocks[7][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
1099 }
// The remainder of the block (swizzled elements doubleFilter .. doubleBlock)
// is de-interleaved as is; in the output streams it starts at element
// mFilterSize>>2.
1100 writeStart=doubleFilter;
1101 writeToStart=mFilterSize>>2;
1102 // swizzle it back.
1103 for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=8) {
1104 __m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+2], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
1105 __m128 tmp1 = _mm_shuffle_ps(currentSwizzledBlock[j+2], currentSwizzledBlock[j], _MM_SHUFFLE(2,3,2,3));
1106 __m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+6], currentSwizzledBlock[j+4], _MM_SHUFFLE(0,1,0,1));
1107 __m128 tmp3 = _mm_shuffle_ps(currentSwizzledBlock[j+6], currentSwizzledBlock[j+4], _MM_SHUFFLE(2,3,2,3));
1108 writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
1109 writeBlocks[1][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
1110 writeBlocks[2][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
1111 writeBlocks[3][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
1112 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+1], _MM_SHUFFLE(0,1,0,1));
1113 tmp1 = _mm_shuffle_ps(currentSwizzledBlock[j+3], currentSwizzledBlock[j+1], _MM_SHUFFLE(2,3,2,3));
1114 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+7], currentSwizzledBlock[j+5], _MM_SHUFFLE(0,1,0,1));
1115 tmp3 = _mm_shuffle_ps(currentSwizzledBlock[j+7], currentSwizzledBlock[j+5], _MM_SHUFFLE(2,3,2,3));
1116 writeBlocks[4][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
1117 writeBlocks[5][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(0,2,0,2));
1118 writeBlocks[6][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(1,3,1,3));
1119 writeBlocks[7][i] = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(0,2,0,2));
1120 }
1121 } else {
1122 // swizzle it back. We overlap one block so we only write the first block on the first run
// NOTE(review): only writeBlocks[0] is written here; the other seven output
// streams receive nothing for the first block - confirm this is intended.
1123 writeStart=0;
1124 writeToStart=0;
1125 for(int i=writeToStart,j=writeStart;j<writeEnd;i++,j+=8) {
1126 __m128 tmp0 = _mm_shuffle_ps(currentSwizzledBlock[j+2], currentSwizzledBlock[j], _MM_SHUFFLE(0,1,0,1));
1127 __m128 tmp2 = _mm_shuffle_ps(currentSwizzledBlock[j+6], currentSwizzledBlock[j+4], _MM_SHUFFLE(0,1,0,1));
1128 writeBlocks[0][i] = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(1,3,1,3));
1129 }
1130 }
// Advance all stream pointers by one block (mBlockSize floats = mBlockSize>>2
// __m128 elements).
1131 for(int i=0;i<8;i++) { // shift each block
1132 readBlocks[i]+=mBlockSize>>2; // these are 128b pointers, each window is 1/4 blockSize for those
1133 writeBlocks[i]+=mBlockSize>>2;
1134 }
1135 }
1136 return true;
1137}
1138
1139bool EffectEqualization48x::ProcessOne8x(int count, WaveTrack * t,
1140 sampleCount start, sampleCount len)
1141{
1142 sampleCount blockCount=len/mBlockSize;
1143
1144 if(blockCount<32) // it's not worth 8x processing do a regular process
1145 return ProcessOne4x(count, t, start, len);
1146
1147 auto trackBlockSize = t->GetMaxBlockSize();
1148
1149 auto output = t->EmptyCopy();
1151
1152 mEffectEqualization->TrackProgress(count, 0.0);
1153 int bigRuns=len/(mSubBufferSize-mBlockSize);
1154 int trackBlocksPerBig=mSubBufferSize/trackBlockSize;
1155 int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize;
1156 int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize));
1157 auto currentSample=start;
1158
1159 bool bBreakLoop = false;
1160 for(int bigRun=0;bigRun<bigRuns;bigRun++)
1161 {
1162 // fill the buffer
1163 for(int i=0;i<trackBlocksPerBig;i++) {
1164 t->Get((samplePtr)&mBigBuffer[i*trackBlockSize], floatSample, currentSample, trackBlockSize);
1165 currentSample+=trackBlockSize;
1166 }
1167 if(trackLeftovers) {
1168 t->Get((samplePtr)&mBigBuffer[trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
1169 currentSample+=trackLeftovers;
1170 }
1171 currentSample-=mBlockSize+(mFilterSize>>1);
1172
1173 ProcessBuffer4x(mBufferInfo);
1174 if (bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigRun)/(double)bigRuns))
1175 {
1176 break;
1177 }
1178 output->Append((samplePtr)&mBigBuffer[(bigRun?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigRun?mBlockSize:0)+(mFilterSize>>1)));
1179 }
1180 if(singleProcessLength && !bBreakLoop) {
1181 t->Get((samplePtr)mBigBuffer.get(), floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
1182 ProcessBuffer(mBigBuffer.get(), mBigBuffer.get(), singleProcessLength+mBlockSize+(mFilterSize>>1));
1183 output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
1184 }
1185 output->Flush();
1186 if(!bBreakLoop)
1187 ProcessTail(t, output.get(), start, len);
1188 return bBreakLoop;
1189}
1190
1191bool EffectEqualization48x::ProcessOne8xThreaded(int count, WaveTrack * t,
1192 sampleCount start, sampleCount len)
1193{
1194 sampleCount blockCount=len/mBlockSize;
1195
1196 if(blockCount<16) // it's not worth 4x processing do a regular process
1197 return ProcessOne4x(count, t, start, len);
1198 if(mThreadCount<=0 || blockCount<256) // don't do it without cores or big data
1199 return ProcessOne4x(count, t, start, len);
1200
1201 auto output = t->EmptyCopy();
1203
1204 auto trackBlockSize = t->GetMaxBlockSize();
1205 mEffectEqualization->TrackProgress(count, 0.0);
1206 int bigRuns=len/(mSubBufferSize-mBlockSize);
1207 int trackBlocksPerBig=mSubBufferSize/trackBlockSize;
1208 int trackLeftovers=mSubBufferSize-trackBlocksPerBig*trackBlockSize;
1209 int singleProcessLength=(mFilterSize>>1)*bigRuns + len%(bigRuns*(mSubBufferSize-mBlockSize));
1210 auto currentSample=start;
1211
1212 int bigBlocksRead=mWorkerDataCount, bigBlocksWritten=0;
1213
1214 // fill the first workerDataCount buffers we checked above and there is at least this data
1215 for(int i=0;i<mWorkerDataCount;i++)
1216 {
1217 // fill the buffer
1218 for(int j=0;j<trackBlocksPerBig;j++) {
1219 t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
1220 currentSample+=trackBlockSize;
1221 }
1222 if(trackLeftovers) {
1223 t->Get((samplePtr)&mBufferInfo[i].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
1224 currentSample+=trackLeftovers;
1225 }
1226 currentSample-=mBlockSize+(mFilterSize>>1);
1227 mBufferInfo[i].mBufferStatus=BufferReady; // free for grabbin
1228 }
1229 int currentIndex=0;
1230 bool bBreakLoop = false;
1231 while(bigBlocksWritten<bigRuns) {
1232 if (bBreakLoop=mEffectEqualization->TrackProgress(count, (double)(bigBlocksWritten)/(double)bigRuns))
1233 {
1234 break;
1235 }
1236 wxMutexLocker locker( mDataMutex ); // Get in line for data
1237 // process as many blocks as we can
1238 while((mBufferInfo[currentIndex].mBufferStatus==BufferDone) && (bigBlocksWritten<bigRuns)) { // data is ours
1239 output->Append((samplePtr)&mBufferInfo[currentIndex].mBufferDest[0][(bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)], floatSample, mSubBufferSize-((bigBlocksWritten?mBlockSize:0)+(mFilterSize>>1)));
1240 bigBlocksWritten++;
1241 if(bigBlocksRead<bigRuns) {
1242 // fill the buffer
1243 for(int j=0;j<trackBlocksPerBig;j++) {
1244 t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][j*trackBlockSize], floatSample, currentSample, trackBlockSize);
1245 currentSample+=trackBlockSize;
1246 }
1247 if(trackLeftovers) {
1248 t->Get((samplePtr)&mBufferInfo[currentIndex].mBufferSouce[0][trackBlocksPerBig*trackBlockSize], floatSample, currentSample, trackLeftovers);
1249 currentSample+=trackLeftovers;
1250 }
1251 currentSample-=mBlockSize+(mFilterSize>>1);
1252 mBufferInfo[currentIndex].mBufferStatus=BufferReady; // free for grabbin
1253 bigBlocksRead++;
1254 } else mBufferInfo[currentIndex].mBufferStatus=BufferEmpty; // this is completely unnecessary
1255 currentIndex=(currentIndex+1)%mWorkerDataCount;
1256 }
1257 }
1258 if(singleProcessLength && !bBreakLoop) {
1259 t->Get((samplePtr)mBigBuffer.get(), floatSample, currentSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
1260 ProcessBuffer(mBigBuffer.get(), mBigBuffer.get(), singleProcessLength+mBlockSize+(mFilterSize>>1));
1261 output->Append((samplePtr)&mBigBuffer[mBlockSize], floatSample, singleProcessLength+mBlockSize+(mFilterSize>>1));
1262 }
1263 output->Flush();
1264 if(!bBreakLoop)
1265 ProcessTail(t, output.get(), start, len);
1266 return bBreakLoop;
1267}
1268
1269
1270
1271
1272void EffectEqualization48x::Filter8x(size_t len,
1273 float *buffer, float *scratchBuffer)
1274{
1275 int i;
1276 __m256 real256, imag256;
1277 // Apply FFT
1278 RealFFTf8x(buffer, mEffectEqualization->hFFT);
1279
1280 // Apply filter
1281 // DC component is purely real
1282 __m256 *localFFTBuffer=(__m256 *)scratchBuffer;
1283 __m256 *localBuffer=(__m256 *)buffer;
1284
1285 __m256 filterFuncR, filterFuncI;
1286 filterFuncR = _mm256_set1_ps(mEffectEqualization->mFilterFuncR[0]);
1287 localFFTBuffer[0] = _mm256_mul_ps(localBuffer[0], filterFuncR);
1288 auto halfLength = (len / 2);
1289
1290 bool useBitReverseTable = sMathPath & 1;
1291
1292 for(i = 1; i < halfLength; i++)
1293 {
1294 if(useBitReverseTable) {
1295 real256=localBuffer[mEffectEqualization->hFFT->BitReversed[i] ];
1296 imag256=localBuffer[mEffectEqualization->hFFT->BitReversed[i]+1];
1297 } else {
1298 int bitReversed=SmallRB(i,mEffectEqualization->hFFT->pow2Bits);
1299 real256=localBuffer[bitReversed];
1300 imag256=localBuffer[bitReversed+1];
1301 }
1302 filterFuncR=_mm256_set1_ps(mEffectEqualization->mFilterFuncR[i]);
1303 filterFuncI=_mm256_set1_ps(mEffectEqualization->mFilterFuncI[i]);
1304 localFFTBuffer[2*i ] = _mm256_sub_ps( _mm256_mul_ps(real256, filterFuncR), _mm256_mul_ps(imag256, filterFuncI));
1305 localFFTBuffer[2*i+1] = _mm256_add_ps( _mm256_mul_ps(real256, filterFuncI), _mm256_mul_ps(imag256, filterFuncR));
1306 }
1307 // Fs/2 component is purely real
1308 filterFuncR=_mm256_set1_ps(mEffectEqualization->mFilterFuncR[halfLength]);
1309 localFFTBuffer[1] = _mm256_mul_ps(localBuffer[1], filterFuncR);
1310
1311 // Inverse FFT and normalization
1312 InverseRealFFTf8x(scratchBuffer, mEffectEqualization->hFFT);
1313 ReorderToTime8x(mEffectEqualization->hFFT, scratchBuffer, buffer);
1314}
1315
1316#endif
1317
1318#endif
wxT("CloseDown"))
XO("Cut/Copy/Paste")
void RealFFTf4x(fft_type *, FFTParam *, int functionType=-1)
float fft_type
Definition: RealFFTf48x.h:6
void TableUsage(int iMask)
void RealFFTf1x(fft_type *, FFTParam *, int functionType=-1)
void ReorderToTime4x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType=-1)
void ReorderToTime1x(FFTParam *hFFT, fft_type *buffer, fft_type *TimeOut, int functionType=-1)
void InverseRealFFTf4x(fft_type *, FFTParam *, int functionType=-1)
void InverseRealFFTf1x(fft_type *, FFTParam *, int functionType=-1)
int SmallRB(int bits, int numberBits)
size_t limitSampleBufferSize(size_t bufferSize, sampleCount limit)
Definition: SampleCount.cpp:22
constexpr sampleFormat floatSample
Definition: SampleFormat.h:45
char * samplePtr
Definition: SampleFormat.h:57
Subclass & Get(const RegisteredFactory &key)
Get reference to an attachment, creating on demand if not present, down-cast it to Subclass.
Definition: ClientData.h:318
An Effect that modifies volume in different frequency bands.
Definition: Equalization.h:23
Abstract base class used in importing a file.
static TrackListHolder Create(AudacityProject *pOwner)
Definition: Track.cpp:330
A Track that contains audio waveform data.
Definition: WaveTrack.h:203
double GetStartTime() const override
Implement WideSampleSequence.
Definition: WaveTrack.cpp:2576
void ConvertToSampleFormat(sampleFormat format, const std::function< void(size_t)> &progressReport={})
Definition: WaveTrack.cpp:901
bool Append(size_t iChannel, constSamplePtr buffer, sampleFormat format, size_t len, unsigned int stride=1, sampleFormat effectiveFormat=widestSampleFormat) override
Definition: WaveTrack.cpp:2227
void Join(double t0, double t1, const ProgressReporter &reportProgress)
Definition: WaveTrack.cpp:2127
void Clear(double t0, double t1) override
Definition: WaveTrack.cpp:1138
void Flush() override
Definition: WaveTrack.cpp:2294
void Paste(double t0, const Track &src) override
Definition: WaveTrack.cpp:1965
double GetEndTime() const override
Implement WideSampleSequence.
Definition: WaveTrack.cpp:2586
size_t GetMaxBlockSize() const
Definition: WaveTrack.cpp:2258
Holder EmptyCopy(size_t nChannels, const SampleBlockFactoryPtr &pFactory={}) const
Definition: WaveTrack.cpp:989
Track::Holder Copy(double t0, double t1, bool forClipboard=true) const override
Create new tracks and don't modify this track.
Definition: WaveTrack.cpp:1073
double LongSamplesToTime(sampleCount pos) const
sampleCount TimeToLongSamples(double t0) const
Positions or offsets within audio files need a wide type.
Definition: SampleCount.h:19
size_t as_size_t() const
Definition: SampleCount.cpp:16
const char * end(const char *str) noexcept
Definition: StringUtils.h:106
void free(void *ptr)
Definition: VectorOps.h:34