Audacity 3.2.0
TimeAndPitch.cpp
Go to the documentation of this file.
1#include "TimeAndPitch.h"
2
3#include <algorithm>
4#include <array>
5#include <cassert>
6#include <random>
7#include <stdlib.h>
8#include <utility>
9
12#include "SamplesFloat.h"
13#include "SimdTypes.h"
14
15using namespace staffpad::audio;
16
17namespace staffpad {
18
19namespace {
20constexpr double twoPi = 6.28318530717958647692f;
21
30inline float lagrange6(const float (&smp)[6], float t)
31{
32 auto* y = &smp[2];
33
34 auto ym1_y1 = y[-1] + y[1];
35 auto twentyfourth_ym2_y2 = 1.f / 24.f * (y[-2] + y[2]);
36 auto c0 = y[0];
37 auto c1 = 0.05f * y[-2] - 0.5f * y[-1] - 1.f / 3.f * y[0] + y[1] - 0.25f * y[2] + 1.f / 30.f * y[3];
38 auto c2 = 2.f / 3.f * ym1_y1 - 1.25f * y[0] - twentyfourth_ym2_y2;
39 auto c3 = 5.f / 12.f * y[0] - 7.f / 12.f * y[1] + 7.f / 24.f * y[2] - 1.f / 24.f * (y[-2] + y[-1] + y[3]);
40 auto c4 = 0.25f * y[0] - 1.f / 6.f * ym1_y1 + twentyfourth_ym2_y2;
41 auto c5 = 1.f / 120.f * (y[3] - y[-2]) + 1.f / 24.f * (y[-1] - y[2]) + 1.f / 12.f * (y[1] - y[0]);
42
43 // Estrin's scheme
44 auto t2 = t * t;
45 return (c0 + c1 * t) + (c2 + c3 * t) * t2 + (c4 + c5 * t) * t2 * t2;
46}
47
48void generateRandomPhaseVector(float* dst, size_t size, std::mt19937& gen)
49{
50 std::vector<std::complex<float>> v(size);
51 std::uniform_real_distribution<float> dis(
52 -3.14159265358979323846f, 3.14159265358979323846f);
53 std::for_each(dst, dst + size, [&dis, &gen](auto& a) { a = dis(gen); });
54}
55} // namespace
56
58{
59 impl(int fft_size) : fft(fft_size)
60 {
61 randomGenerator.seed(0);
62 }
63
65 std::mt19937 randomGenerator;
70
81
82 double exact_hop_a = 512.0, hop_a_err = 0.0;
83 double exact_hop_s = 0.0;
84 double next_exact_hop_s = 512.0;
85 double hop_s_err = 0.0;
86
87 std::vector<int> peak_index, trough_index;
88};
89
91 int fftSize, bool reduceImaging, ShiftTimbreCb shiftTimbreCb)
92 : fftSize(fftSize)
93 , _reduceImaging(reduceImaging)
94 , _shiftTimbreCb(std::move(shiftTimbreCb))
95{
96}
97
99{
100 // Here for ~std::shared_ptr<impl>() to know ~impl()
101}
102
103void TimeAndPitch::setup(int numChannels, int maxBlockSize)
104{
105 assert(numChannels == 1 || numChannels == 2);
106 _numChannels = numChannels;
107
108 d = std::make_unique<impl>(fftSize);
110 _numBins = fftSize / 2 + 1;
111
112 // audio sample buffers
113 d->fft_timeseries.setSize(_numChannels, fftSize);
114 int outBufferSize = fftSize + 2 * _maxBlockSize; // 1 block extra for safety
115 for (int ch = 0; ch < _numChannels; ++ch)
116 {
117 d->inResampleInputBuffer[ch].setSize(_maxBlockSize +
118 6); // needs additional history for resampling. more in the future
119 d->inCircularBuffer[ch].setSize(fftSize);
120 d->outCircularBuffer[ch].setSize(outBufferSize);
121 }
122 d->normalizationBuffer.setSize(outBufferSize);
123
124 // fft coefficient buffers
125 d->spectrum.setSize(_numChannels, _numBins);
126 d->norm.setSize(1, _numBins);
127 d->last_norm.setSize(1, _numBins);
128 d->phase.setSize(_numChannels, _numBins);
129 d->last_phase.setSize(_numChannels, _numBins);
130 d->phase_accum.setSize(_numChannels, _numBins);
131 d->random_phases.setSize(1, _numBins);
133 d->random_phases.getPtr(0), _numBins, d->randomGenerator);
134
136
137 // Cos window
138 // For perfect int-factor overlaps the first sample needs to be 0 and the last non-zero.
139 d->cosWindow.setSize(1, fftSize);
140 d->sqWindow.setSize(1, fftSize);
141 auto* w = d->cosWindow.getPtr(0);
142 auto* sqw = d->sqWindow.getPtr(0);
143 for (int i = 0; i < fftSize; ++i)
144 {
145 w[i] = -0.5f * std::cos(float(twoPi * (float)i / (float)fftSize)) + 0.5f;
146 sqw[i] = w[i] * w[i];
147 }
148
149 d->peak_index.reserve(_numBins);
150 d->trough_index.reserve(_numBins);
151
152 // force fft to allocate all buffers now. Currently there is no other way in the fft class
153 d->fft.forwardReal(d->fft_timeseries, d->spectrum);
154
155 reset();
156}
157
159{
160 return std::max(0, int(std::ceil(d->exact_hop_a)) - _analysis_hop_counter +
161 1); // 1 extra to counter pitch factor error accumulation
162}
163
165{
167}
168
170{
173 for (int ch = 0; ch < _numChannels; ++ch)
174 {
175 d->inResampleInputBuffer[ch].reset();
176 d->inCircularBuffer[ch].reset();
177 d->outCircularBuffer[ch].reset();
178 }
179 d->normalizationBuffer.reset();
180 d->last_norm.zeroOut();
181 d->last_phase.zeroOut();
182 d->phase_accum.zeroOut();
184 d->hop_a_err = 0.0;
185 d->hop_s_err = 0.0;
186 d->exact_hop_s = 0.0;
187 _resampleReadPos = 0.0;
188}
189
190namespace {
191
192// wrap a phase value into -PI..PI
193inline float _unwrapPhase(float arg)
194{
195 using namespace audio::simd;
196 return arg - rint(arg * 0.15915494309f) * 6.283185307f;
197}
198
199void _unwrapPhaseVec(float* v, int n)
200{
201 audio::simd::perform_parallel_simd_aligned(v, n, [](auto& a) { a = a - rint(a * 0.15915494309f) * 6.283185307f; });
202}
203
205void _fft_shift(float* v, int n)
206{
207 assert((n & 1) == 0);
208 int n2 = n >> 1;
209 audio::simd::perform_parallel_simd_aligned(v, v + n2, n2, [](auto& a, auto& b) {
210 auto tmp = a;
211 a = b;
212 b = tmp;
213 });
214}
215
216void _lr_to_ms(float* ch1, float* ch2, int n)
217{
218 audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
219 auto l = a, r = b;
220 a = 0.5f * (l + r);
221 b = 0.5f * (l - r);
222 });
223}
224
225void _ms_to_lr(float* ch1, float* ch2, int n)
226{
227 audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
228 auto m = a, s = b;
229 a = m + s;
230 b = m - s;
231 });
232}
233
234} // namespace
235
236// ----------------------------------------------------------------------------
237
238template <int num_channels>
239void TimeAndPitch::_time_stretch(float a_a, float a_s)
240{
241 auto alpha = a_s / a_a; // this is the real stretch factor based on integer hop sizes
242
243 // Create a norm array
244 const auto* norms = d->norm.getPtr(0); // for stereo, just use the mid-channel
245 const auto* norms_last = d->last_norm.getPtr(0);
246
247 d->peak_index.clear();
248 d->trough_index.clear();
249 float lowest = norms[0];
250 int trough = 0;
251 if (norms_last[0] >= norms[1])
252 {
253 d->peak_index.emplace_back(0);
254 d->trough_index.emplace_back(0);
255 }
256 for (int i = 1; i < _numBins - 1; ++i)
257 {
258 if (norms_last[i] >= norms[i - 1] && norms_last[i] >= norms[i + 1])
259 {
260 d->peak_index.emplace_back(i);
261 d->trough_index.emplace_back(trough);
262 trough = i;
263 lowest = norms[i];
264 }
265 else if (norms[i] < lowest)
266 {
267 lowest = norms[i];
268 trough = i;
269 }
270 }
271 if (norms_last[_numBins - 1] > norms[_numBins - 2])
272 {
273 d->peak_index.emplace_back(_numBins - 1);
274 d->trough_index.emplace_back(trough);
275 }
276
277 if (d->peak_index.size() == 0)
278 {
279 // use max norm of last frame
280 int max_idx = 0;
281 float max_norm = 0.f;
282 vo::findMaxElement(norms_last, _numBins, max_idx, max_norm);
283 d->peak_index.emplace_back(max_idx);
284 }
285
286 const float** p = const_cast<const float**>(d->phase.getPtrs());
287 const float** p_l = const_cast<const float**>(d->last_phase.getPtrs());
288 float** acc = d->phase_accum.getPtrs();
289
290 float expChange_a = a_a * float(_expectedPhaseChangePerBinPerSample);
291 float expChange_s = a_s * float(_expectedPhaseChangePerBinPerSample);
292
293 int num_peaks = (int)d->peak_index.size();
294 // time integration for all peaks
295 for (int i = 0; i < num_peaks; ++i)
296 {
297 const int n = d->peak_index[i];
298 auto fn = float(n);
299
300 // determine phase from last frame
301 float fn_expChange_a = fn * expChange_a;
302 float fn_expChange_s = fn * expChange_s;
303
304 for (int ch = 0; ch < num_channels; ++ch)
305 acc[ch][n] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n] - p_l[ch][n] - fn_expChange_a) + fn_expChange_s;
306 }
307
308 // go from first peak to 0
309 for (int n = d->peak_index[0]; n > 0; --n)
310 {
311 for (int ch = 0; ch < num_channels; ++ch)
312 acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
313 }
314
315 // 'grow' from pairs of peaks to the lowest norm in between
316 for (int i = 0; i < num_peaks - 1; ++i)
317 {
318 const int mid = d->trough_index[i + 1];
319 for (int n = d->peak_index[i]; n < mid; ++n)
320 {
321 for (int ch = 0; ch < num_channels; ++ch)
322 acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
323 }
324 for (int n = d->peak_index[i + 1]; n > mid + 1; --n)
325 {
326 for (int ch = 0; ch < num_channels; ++ch)
327 acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
328 }
329 }
330
331 // last peak to the end
332 for (int n = d->peak_index[num_peaks - 1]; n < _numBins - 1; ++n)
333 {
334 for (int ch = 0; ch < num_channels; ++ch)
335 acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
336 }
337
338 d->last_norm.assignSamples(d->norm);
339 d->last_phase.assignSamples(d->phase);
340}
341
343{
344 // Upsampling brings spectral components down, including those that were
345 // beyond the Nyquist. From `imagingBeginBin`, we have a mirroring of the
346 // spectrum, which, if not lowpass-filtered by the resampler, may yield
347 // an aliasing-like quality if what is mirrored is a harmonic series. A
348 // simple thing to do against that is to scramble the phase values there.
349 // This way we preserve natural energy fluctuations, but the artifact
350 // isn't noticeable as much anymore, because much more noise-like, which
351 // is what one typically gets at those frequencies.
352 constexpr auto alignment = 64 / sizeof(float);
353 const int imagingBeginBin =
354 std::ceil((fftSize / 2 * _pitchFactor + 1) / alignment) * alignment;
355
356 if (imagingBeginBin >= _numBins)
357 return;
358
359 const auto n = _numBins - imagingBeginBin;
360 auto pSpec = d->spectrum.getPtr(0);
361 auto pRand = d->random_phases.getPtr(0);
362 vo::rotate(nullptr, pRand, pSpec + imagingBeginBin, n);
363 // Just rotating the random phase vector to produce pseudo-random
364 // sequences of phases.
365 std::uniform_int_distribution<size_t> dis(0, n - 1);
366 const auto middle = dis(d->randomGenerator);
367 std::rotate(pRand, pRand + middle, pRand + n);
368}
369
371void TimeAndPitch::_process_hop(int hop_a, int hop_s)
372{
373 if (d->exact_hop_a != d->exact_hop_s)
374 {
375 if (_numChannels == 2)
376 _lr_to_ms(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
377
378 for (int ch = 0; ch < _numChannels; ++ch)
379 {
380 vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
381 _fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
382 }
383
384 // determine norm/phase
385 d->fft.forwardReal(d->fft_timeseries, d->spectrum);
386 // norms of the mid channel only (or sole channel) are needed in
387 // _time_stretch
388 vo::calcNorms(d->spectrum.getPtr(0), d->norm.getPtr(0), d->spectrum.getNumSamples());
389 for (int ch = 0; ch < _numChannels; ++ch)
390 vo::calcPhases(d->spectrum.getPtr(ch), d->phase.getPtr(ch), d->spectrum.getNumSamples());
391
392 if (_shiftTimbreCb)
394 1 / _pitchFactor, d->spectrum.getPtr(0), d->norm.getPtr(0));
395
396 if (_reduceImaging && _pitchFactor < 1.)
398
399 if (_numChannels == 1)
400 _time_stretch<1>((float)hop_a, (float)hop_s);
401 else if (_numChannels == 2)
402 _time_stretch<2>((float)hop_a, (float)hop_s);
403
404 for (int ch = 0; ch < _numChannels; ++ch)
405 _unwrapPhaseVec(d->phase_accum.getPtr(ch), _numBins);
406
407 for (int ch = 0; ch < _numChannels; ++ch)
408 vo::rotate(d->phase.getPtr(ch), d->phase_accum.getPtr(ch), d->spectrum.getPtr(ch),
409 d->spectrum.getNumSamples());
410 d->fft.inverseReal(d->spectrum, d->fft_timeseries);
411
412 for (int ch = 0; ch < _numChannels; ++ch)
413 vo::constantMultiply(d->fft_timeseries.getPtr(ch), 1.f / fftSize, d->fft_timeseries.getPtr(ch),
414 d->fft_timeseries.getNumSamples());
415
416 if (_numChannels == 2)
417 _ms_to_lr(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
418
419 for (int ch = 0; ch < _numChannels; ++ch)
420 {
421 _fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
422 vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
423 }
424 }
425 else
426 { // stretch factor == 1.0 => just apply window
427 for (int ch = 0; ch < _numChannels; ++ch)
428 vo::multiply(d->fft_timeseries.getPtr(ch), d->sqWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
429 }
430
431 {
432 float gainFact = float(_timeStretch * ((8.f / 3.f) / _overlap_a)); // overlap add normalization factor
433 for (int ch = 0; ch < _numChannels; ++ch)
434 d->outCircularBuffer[ch].writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->fft_timeseries.getPtr(ch),
435 gainFact);
436
438 d->normalizationBuffer.writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->sqWindow.getPtr(0), gainFact);
439 }
440 _outBufferWriteOffset += hop_s;
442}
443
444void TimeAndPitch::setTimeStretchAndPitchFactor(double timeScale, double pitchFactor)
445{
446 assert(timeScale > 0.0);
447 assert(pitchFactor > 0.0);
448 _pitchFactor = pitchFactor;
449 _timeStretch = timeScale * pitchFactor;
450
452 double overlap_s = overlap;
455 else
456 overlap_s /= _timeStretch;
457
458 d->exact_hop_a = double(fftSize) / _overlap_a;
460 {
461 d->exact_hop_s = double(fftSize) / overlap_s;
462 }
463 else
464 {
465 // switch after processing next block, unless it's the first one
466 d->next_exact_hop_s = double(fftSize) / overlap_s;
467 if (d->exact_hop_s == 0.0) // on first chunk set it immediately
468 d->exact_hop_s = d->next_exact_hop_s;
469 }
470}
471
472void TimeAndPitch::feedAudio(const float* const* input_smp, int numSamples)
473{
474 assert(numSamples <= _maxBlockSize);
475 for (int ch = 0; ch < _numChannels; ++ch)
476 {
477 d->inResampleInputBuffer[ch].writeBlock(0, numSamples, input_smp[ch]);
478 d->inResampleInputBuffer[ch].advance(numSamples);
479 }
480 _resampleReadPos -= numSamples;
481
482 // determine integer hop size for the next hop.
483 // The error accumulators will make the value fluctuate by one to reach arbitrary scale
484 // ratios
485 if (d->exact_hop_s == 0.0) // this happens if feedAudio is called without setting up stretch factors
486 d->exact_hop_s = d->next_exact_hop_s;
487
488 const int hop_s = int(d->exact_hop_s + d->hop_s_err);
489 const int hop_a = int(d->exact_hop_a + d->hop_a_err);
490
491 double step = 0.0;
492 double read_pos = _resampleReadPos;
493 while (read_pos < 0.0)
494 {
495 auto int_pos = int(std::floor(read_pos));
496 float frac_pos = float(read_pos - int_pos);
497 for (int ch = 0; ch < _numChannels; ++ch)
498 {
499 float smp[6];
500 d->inResampleInputBuffer[ch].readBlock(int_pos - 6, 6, smp);
501 float s = (frac_pos == 0) ? smp[2] : lagrange6(smp, frac_pos);
502 d->inCircularBuffer[ch].writeOffset0(s);
503 d->inCircularBuffer[ch].advance(1);
504 }
506 step++;
507
508 if (_analysis_hop_counter >= hop_a)
509 {
510 _analysis_hop_counter -= hop_a;
511 d->hop_s_err += d->exact_hop_s - hop_s;
512 d->hop_a_err += d->exact_hop_a - hop_a;
513 for (int ch = 0; ch < _numChannels; ++ch)
514 d->inCircularBuffer[ch].readBlock(-fftSize, fftSize, d->fft_timeseries.getPtr(ch));
515 _process_hop(hop_a, hop_s);
516 }
517 read_pos = _resampleReadPos + step * _pitchFactor;
518 }
519 _resampleReadPos = read_pos;
520}
521
522void TimeAndPitch::retrieveAudio(float* const* out_smp, int numSamples)
523{
524 assert(numSamples <= _maxBlockSize);
525 for (int ch = 0; ch < _numChannels; ++ch)
526 {
527 d->outCircularBuffer[ch].readAndClearBlock(0, numSamples, out_smp[ch]);
529 {
530 constexpr float curve =
531 4.f * 4.f; // the curve approximates 1/x over 1 but fades to 0 near 0 to avoid fade-in clicks
532 for (int i = 0; i < numSamples; ++i)
533 {
534 float x = d->normalizationBuffer.read(i);
535 out_smp[ch][i] *= x / (x * x + 1 / curve);
536 }
537 }
538 d->outCircularBuffer[ch].advance(numSamples);
539 }
540
541 d->normalizationBuffer.clearBlock(0, numSamples);
542 d->normalizationBuffer.advance(numSamples);
543
544 _outBufferWriteOffset -= numSamples;
545 _availableOutputSamples -= numSamples;
546
548 d->exact_hop_s = d->next_exact_hop_s;
549}
550
551void TimeAndPitch::processPitchShift(float* const* smp, int numSamples, double pitchFactor)
552{
553 setTimeStretchAndPitchFactor(1.0, pitchFactor);
554 feedAudio(smp, numSamples);
555 retrieveAudio(smp, numSamples);
556}
557
559{
560 return fftSize - fftSize / overlap + 3; // 3 for resampling
561}
562
563
565{
566 const float coeff = (timeStretch < 1.f) ? (1.f / 3.f) : (2.f / 3.f);
567 return int(getLatencySamples() * (timeStretch * coeff + (1 - coeff)));
568}
569
570} // namespace staffpad
static const auto fn
int getNumAvailableOutputSamples() const
void setTimeStretchAndPitchFactor(double timeStretch, double pitchFactor)
int getSamplesToNextHop() const
void _time_stretch(float hop_a, float hop_s)
std::function< void(double factor, std::complex< float > *spectrum, const float *magnitude)> ShiftTimbreCb
Definition: TimeAndPitch.h:20
void setup(int numChannels, int maxBlockSize)
void retrieveAudio(float *const *out_smp, int numSamples)
const ShiftTimbreCb _shiftTimbreCb
Definition: TimeAndPitch.h:101
void _process_hop(int hop_a, int hop_s)
process one hop/chunk in _fft_timeSeries and add the result to output circular buffer
int getLatencySamples() const
static constexpr bool normalize_window
Definition: TimeAndPitch.h:89
double _expectedPhaseChangePerBinPerSample
Definition: TimeAndPitch.h:112
void processPitchShift(float *const *smp, int numSamples, double pitchFactor)
int getLatencySamplesForStretchRatio(float timeStretch) const
TimeAndPitch(int fftSize, bool reduceImaging=true, ShiftTimbreCb shiftTimbreCb={})
std::shared_ptr< impl > d
Definition: TimeAndPitch.h:97
static constexpr bool modulate_synthesis_hop
Definition: TimeAndPitch.h:90
void feedAudio(const float *const *in_smp, int numSamples)
static constexpr int overlap
Definition: TimeAndPitch.h:88
void generateRandomPhaseVector(float *dst, size_t size, std::mt19937 &gen)
void _fft_shift(float *v, int n)
rotate even-sized array by half its size to align fft phase at the center
float lagrange6(const float(&smp)[6], float t)
void _lr_to_ms(float *ch1, float *ch2, int n)
void _ms_to_lr(float *ch1, float *ch2, int n)
__finl float __vecc rint(float a)
__finl void perform_parallel_simd_aligned(float *a, float *b, int n, const fnc &f)
Definition: SimdTypes.h:87
void multiply(const T *src1, const T *src2, T *dst, int32_t n)
Definition: VectorOps.h:74
void calcPhases(const std::complex< float > *src, float *dst, int32_t n)
Definition: VectorOps.h:128
void rotate(const float *oldPhase, const float *newPhase, std::complex< float > *dst, int32_t n)
Definition: VectorOps.h:140
void findMaxElement(const T *src, int32_t n, int32_t &maxIndex, T &maxValue)
Definition: VectorOps.h:87
void calcNorms(const std::complex< float > *src, float *dst, int32_t n)
Definition: VectorOps.h:134
void constantMultiply(const T *src, T constant, T *dst, int32_t n)
Definition: VectorOps.h:60
STL namespace.
std::vector< int > trough_index
CircularSampleBuffer< float > inCircularBuffer[2]
CircularSampleBuffer< float > normalizationBuffer
std::vector< int > peak_index
CircularSampleBuffer< float > outCircularBuffer[2]
CircularSampleBuffer< float > inResampleInputBuffer[2]