Audacity 3.2.0
TimeAndPitch.cpp
Go to the documentation of this file.
1#include "TimeAndPitch.h"
2
3#include <algorithm>
4#include <array>
5#include <cassert>
6#include <utility>
7
10#include "SamplesFloat.h"
11#include "SimdTypes.h"
12
13using namespace staffpad::audio;
14
15namespace staffpad {
16
17namespace {
18constexpr double twoPi = 6.28318530717958647692f;
19
28inline float lagrange6(const float (&smp)[6], float t)
29{
30 auto* y = &smp[2];
31
32 auto ym1_y1 = y[-1] + y[1];
33 auto twentyfourth_ym2_y2 = 1.f / 24.f * (y[-2] + y[2]);
34 auto c0 = y[0];
35 auto c1 = 0.05f * y[-2] - 0.5f * y[-1] - 1.f / 3.f * y[0] + y[1] - 0.25f * y[2] + 1.f / 30.f * y[3];
36 auto c2 = 2.f / 3.f * ym1_y1 - 1.25f * y[0] - twentyfourth_ym2_y2;
37 auto c3 = 5.f / 12.f * y[0] - 7.f / 12.f * y[1] + 7.f / 24.f * y[2] - 1.f / 24.f * (y[-2] + y[-1] + y[3]);
38 auto c4 = 0.25f * y[0] - 1.f / 6.f * ym1_y1 + twentyfourth_ym2_y2;
39 auto c5 = 1.f / 120.f * (y[3] - y[-2]) + 1.f / 24.f * (y[-1] - y[2]) + 1.f / 12.f * (y[1] - y[0]);
40
41 // Estrin's scheme
42 auto t2 = t * t;
43 return (c0 + c1 * t) + (c2 + c3 * t) * t2 + (c4 + c5 * t) * t2 * t2;
44}
45
46inline int getFftSize(int sampleRate)
47{
48 // 44.1kHz maps to 4096 samples (i.e., 93ms).
49 // We grow the FFT size proportionally with the sample rate to keep the
50 // window duration roughly constant, with quantization due to the
51 // power-of-two constraint.
52 // If needed some time in the future, we can decouple analysis window and
53 // FFT sizes by zero-padding, allowing for very fine-grained window duration
54 // without compromising performance.
55 return 1 << (12 + (int)std::round(std::log2(sampleRate / 44100.)));
56}
57} // namespace
58
60{
61 impl(int fft_size) : fft(fft_size)
62 {
63 }
64
70
80
81 double exact_hop_a = 512.0, hop_a_err = 0.0;
82 double exact_hop_s = 0.0;
83 double next_exact_hop_s = 512.0;
84 double hop_s_err = 0.0;
85
86 std::vector<int> peak_index, trough_index;
87};
88
90 : fftSize(getFftSize(sampleRate))
91{
92}
93
95{
96 // Here for ~std::shared_ptr<impl>() to know ~impl()
97}
98
99void TimeAndPitch::setup(int numChannels, int maxBlockSize)
100{
101 assert(numChannels == 1 || numChannels == 2);
102 _numChannels = numChannels;
103
104 d = std::make_unique<impl>(fftSize);
106 _numBins = fftSize / 2 + 1;
107
108 // audio sample buffers
109 d->fft_timeseries.setSize(_numChannels, fftSize);
110 int outBufferSize = fftSize + 2 * _maxBlockSize; // 1 block extra for safety
111 for (int ch = 0; ch < _numChannels; ++ch)
112 {
113 d->inResampleInputBuffer[ch].setSize(_maxBlockSize +
114 6); // needs additional history for resampling. more in the future
115 d->inCircularBuffer[ch].setSize(fftSize);
116 d->outCircularBuffer[ch].setSize(outBufferSize);
117 }
118 d->normalizationBuffer.setSize(outBufferSize);
119
120 // fft coefficient buffers
121 d->spectrum.setSize(_numChannels, _numBins);
122 d->norm.setSize(1, _numBins);
123 d->last_norm.setSize(1, _numBins);
124 d->phase.setSize(_numChannels, _numBins);
125 d->last_phase.setSize(_numChannels, _numBins);
126 d->phase_accum.setSize(_numChannels, _numBins);
127
129
130 // Cos window
131 // For perfect int-factor overlaps the first sample needs to be 0 and the last non-zero.
132 d->cosWindow.setSize(1, fftSize);
133 d->sqWindow.setSize(1, fftSize);
134 auto* w = d->cosWindow.getPtr(0);
135 auto* sqw = d->sqWindow.getPtr(0);
136 for (int i = 0; i < fftSize; ++i)
137 {
138 w[i] = -0.5f * std::cos(float(twoPi * (float)i / (float)fftSize)) + 0.5f;
139 sqw[i] = w[i] * w[i];
140 }
141
142 d->peak_index.reserve(_numBins);
143 d->trough_index.reserve(_numBins);
144
145 // force fft to allocate all buffers now. Currently there is no other way in the fft class
146 d->fft.forwardReal(d->fft_timeseries, d->spectrum);
147
148 reset();
149}
150
152{
153 return std::max(0, int(std::ceil(d->exact_hop_a)) - _analysis_hop_counter +
154 1); // 1 extra to counter pitch factor error accumulation
155}
156
158{
160}
161
163{
166 for (int ch = 0; ch < _numChannels; ++ch)
167 {
168 d->inResampleInputBuffer[ch].reset();
169 d->inCircularBuffer[ch].reset();
170 d->outCircularBuffer[ch].reset();
171 }
172 d->normalizationBuffer.reset();
173 d->last_norm.zeroOut();
174 d->last_phase.zeroOut();
175 d->phase_accum.zeroOut();
177 d->hop_a_err = 0.0;
178 d->hop_s_err = 0.0;
179 d->exact_hop_s = 0.0;
180 _resampleReadPos = 0.0;
181}
182
183namespace {
184
185// wrap a phase value into -PI..PI
186inline float _unwrapPhase(float arg)
187{
188 using namespace audio::simd;
189 return arg - rint(arg * 0.15915494309f) * 6.283185307f;
190}
191
192void _unwrapPhaseVec(float* v, int n)
193{
194 audio::simd::perform_parallel_simd_aligned(v, n, [](auto& a) { a = a - rint(a * 0.15915494309f) * 6.283185307f; });
195}
196
198void _fft_shift(float* v, int n)
199{
200 assert((n & 1) == 0);
201 int n2 = n >> 1;
202 audio::simd::perform_parallel_simd_aligned(v, v + n2, n2, [](auto& a, auto& b) {
203 auto tmp = a;
204 a = b;
205 b = tmp;
206 });
207}
208
209void _lr_to_ms(float* ch1, float* ch2, int n)
210{
211 audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
212 auto l = a, r = b;
213 a = 0.5f * (l + r);
214 b = 0.5f * (l - r);
215 });
216}
217
218void _ms_to_lr(float* ch1, float* ch2, int n)
219{
220 audio::simd::perform_parallel_simd_aligned(ch1, ch2, n, [](auto& a, auto& b) {
221 auto m = a, s = b;
222 a = m + s;
223 b = m - s;
224 });
225}
226
227} // namespace
228
229// ----------------------------------------------------------------------------
230
231template <int num_channels>
232void TimeAndPitch::_time_stretch(float a_a, float a_s)
233{
234 auto alpha = a_s / a_a; // this is the real stretch factor based on integer hop sizes
235
236 // Create a norm array
237 auto* norms = d->norm.getPtr(0); // for stereo, just use the mid-channel
238 const auto* norms_last = d->last_norm.getPtr(0);
239
240 d->peak_index.clear();
241 d->trough_index.clear();
242 float lowest = norms[0];
243 int trough = 0;
244 if (norms_last[0] >= norms[1])
245 {
246 d->peak_index.emplace_back(0);
247 d->trough_index.emplace_back(0);
248 }
249 for (int i = 1; i < _numBins - 1; ++i)
250 {
251 if (norms_last[i] >= norms[i - 1] && norms_last[i] >= norms[i + 1])
252 {
253 d->peak_index.emplace_back(i);
254 d->trough_index.emplace_back(trough);
255 trough = i;
256 lowest = norms[i];
257 }
258 else if (norms[i] < lowest)
259 {
260 lowest = norms[i];
261 trough = i;
262 }
263 }
264 if (norms_last[_numBins - 1] > norms[_numBins - 2])
265 {
266 d->peak_index.emplace_back(_numBins - 1);
267 d->trough_index.emplace_back(trough);
268 }
269
270 if (d->peak_index.size() == 0)
271 {
272 // use max norm of last frame
273 int max_idx = 0;
274 float max_norm = 0.f;
275 vo::findMaxElement(norms_last, _numBins, max_idx, max_norm);
276 d->peak_index.emplace_back(max_idx);
277 }
278
279 const float** p = const_cast<const float**>(d->phase.getPtrs());
280 const float** p_l = const_cast<const float**>(d->last_phase.getPtrs());
281 float** acc = d->phase_accum.getPtrs();
282
283 float expChange_a = a_a * float(_expectedPhaseChangePerBinPerSample);
284 float expChange_s = a_s * float(_expectedPhaseChangePerBinPerSample);
285
286 int num_peaks = (int)d->peak_index.size();
287 // time integration for all peaks
288 for (int i = 0; i < num_peaks; ++i)
289 {
290 const int n = d->peak_index[i];
291 auto fn = float(n);
292
293 // determine phase from last frame
294 float fn_expChange_a = fn * expChange_a;
295 float fn_expChange_s = fn * expChange_s;
296
297 for (int ch = 0; ch < num_channels; ++ch)
298 acc[ch][n] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n] - p_l[ch][n] - fn_expChange_a) + fn_expChange_s;
299 }
300
301 // go from first peak to 0
302 for (int n = d->peak_index[0]; n > 0; --n)
303 {
304 for (int ch = 0; ch < num_channels; ++ch)
305 acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
306 }
307
308 // 'grow' from pairs of peaks to the lowest norm in between
309 for (int i = 0; i < num_peaks - 1; ++i)
310 {
311 const int mid = d->trough_index[i + 1];
312 for (int n = d->peak_index[i]; n < mid; ++n)
313 {
314 for (int ch = 0; ch < num_channels; ++ch)
315 acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
316 }
317 for (int n = d->peak_index[i + 1]; n > mid + 1; --n)
318 {
319 for (int ch = 0; ch < num_channels; ++ch)
320 acc[ch][n - 1] = acc[ch][n] - alpha * _unwrapPhase(p[ch][n] - p[ch][n - 1]);
321 }
322 }
323
324 // last peak to the end
325 for (int n = d->peak_index[num_peaks - 1]; n < _numBins - 1; ++n)
326 {
327 for (int ch = 0; ch < num_channels; ++ch)
328 acc[ch][n + 1] = acc[ch][n] + alpha * _unwrapPhase(p[ch][n + 1] - p[ch][n]);
329 }
330
331 d->last_norm.assignSamples(d->norm);
332 d->last_phase.assignSamples(d->phase);
333}
334
336void TimeAndPitch::_process_hop(int hop_a, int hop_s)
337{
338 if (d->exact_hop_a != d->exact_hop_s)
339 {
340 if (_numChannels == 2)
341 _lr_to_ms(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
342
343 for (int ch = 0; ch < _numChannels; ++ch)
344 {
345 vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
346 _fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
347 }
348
349 // determine norm/phase
350 d->fft.forwardReal(d->fft_timeseries, d->spectrum);
351 // norms of the mid channel only (or sole channel) are needed in
352 // _time_stretch
353 vo::calcNorms(d->spectrum.getPtr(0), d->norm.getPtr(0), d->spectrum.getNumSamples());
354 for (int ch = 0; ch < _numChannels; ++ch)
355 vo::calcPhases(d->spectrum.getPtr(ch), d->phase.getPtr(ch), d->spectrum.getNumSamples());
356
357 if (_numChannels == 1)
358 _time_stretch<1>((float)hop_a, (float)hop_s);
359 else if (_numChannels == 2)
360 _time_stretch<2>((float)hop_a, (float)hop_s);
361
362 for (int ch = 0; ch < _numChannels; ++ch)
363 _unwrapPhaseVec(d->phase_accum.getPtr(ch), _numBins);
364
365 for (int ch = 0; ch < _numChannels; ++ch)
366 vo::rotate(d->phase.getPtr(ch), d->phase_accum.getPtr(ch), d->spectrum.getPtr(ch),
367 d->spectrum.getNumSamples());
368 d->fft.inverseReal(d->spectrum, d->fft_timeseries);
369
370 for (int ch = 0; ch < _numChannels; ++ch)
371 vo::constantMultiply(d->fft_timeseries.getPtr(ch), 1.f / fftSize, d->fft_timeseries.getPtr(ch),
372 d->fft_timeseries.getNumSamples());
373
374 if (_numChannels == 2)
375 _ms_to_lr(d->fft_timeseries.getPtr(0), d->fft_timeseries.getPtr(1), fftSize);
376
377 for (int ch = 0; ch < _numChannels; ++ch)
378 {
379 _fft_shift(d->fft_timeseries.getPtr(ch), fftSize);
380 vo::multiply(d->fft_timeseries.getPtr(ch), d->cosWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
381 }
382 }
383 else
384 { // stretch factor == 1.0 => just apply window
385 for (int ch = 0; ch < _numChannels; ++ch)
386 vo::multiply(d->fft_timeseries.getPtr(ch), d->sqWindow.getPtr(0), d->fft_timeseries.getPtr(ch), fftSize);
387 }
388
389 {
390 float gainFact = float(_timeStretch * ((8.f / 3.f) / _overlap_a)); // overlap add normalization factor
391 for (int ch = 0; ch < _numChannels; ++ch)
392 d->outCircularBuffer[ch].writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->fft_timeseries.getPtr(ch),
393 gainFact);
394
396 d->normalizationBuffer.writeAddBlockWithGain(_outBufferWriteOffset, fftSize, d->sqWindow.getPtr(0), gainFact);
397 }
398 _outBufferWriteOffset += hop_s;
400}
401
402void TimeAndPitch::setTimeStretchAndPitchFactor(double timeScale, double pitchFactor)
403{
404 assert(timeScale > 0.0);
405 assert(pitchFactor > 0.0);
406 _pitchFactor = pitchFactor;
407 _timeStretch = timeScale * pitchFactor;
408
410 double overlap_s = overlap;
413 else
414 overlap_s /= _timeStretch;
415
416 d->exact_hop_a = double(fftSize) / _overlap_a;
418 {
419 d->exact_hop_s = double(fftSize) / overlap_s;
420 }
421 else
422 {
423 // switch after processing next block, unless it's the first one
424 d->next_exact_hop_s = double(fftSize) / overlap_s;
425 if (d->exact_hop_s == 0.0) // on first chunk set it immediately
426 d->exact_hop_s = d->next_exact_hop_s;
427 }
428}
429
430void TimeAndPitch::feedAudio(const float* const* input_smp, int numSamples)
431{
432 assert(numSamples <= _maxBlockSize);
433 for (int ch = 0; ch < _numChannels; ++ch)
434 {
435 d->inResampleInputBuffer[ch].writeBlock(0, numSamples, input_smp[ch]);
436 d->inResampleInputBuffer[ch].advance(numSamples);
437 }
438 _resampleReadPos -= numSamples;
439
440 // determine integer hop size for the next hop.
441 // The error accumulators will make the value fluctuate by one to reach arbitrary scale
442 // ratios
443 if (d->exact_hop_s == 0.0) // this happens if feedAudio is called without setting up stretch factors
444 d->exact_hop_s = d->next_exact_hop_s;
445
446 const int hop_s = int(d->exact_hop_s + d->hop_s_err);
447 const int hop_a = int(d->exact_hop_a + d->hop_a_err);
448
449 double step = 0.0;
450 double read_pos = _resampleReadPos;
451 while (read_pos < 0.0)
452 {
453 auto int_pos = int(std::floor(read_pos));
454 float frac_pos = float(read_pos - int_pos);
455 for (int ch = 0; ch < _numChannels; ++ch)
456 {
457 float smp[6];
458 d->inResampleInputBuffer[ch].readBlock(int_pos - 6, 6, smp);
459 float s = (frac_pos == 0) ? smp[2] : lagrange6(smp, frac_pos);
460 d->inCircularBuffer[ch].writeOffset0(s);
461 d->inCircularBuffer[ch].advance(1);
462 }
464 step++;
465
466 if (_analysis_hop_counter >= hop_a)
467 {
468 _analysis_hop_counter -= hop_a;
469 d->hop_s_err += d->exact_hop_s - hop_s;
470 d->hop_a_err += d->exact_hop_a - hop_a;
471 for (int ch = 0; ch < _numChannels; ++ch)
472 d->inCircularBuffer[ch].readBlock(-fftSize, fftSize, d->fft_timeseries.getPtr(ch));
473 _process_hop(hop_a, hop_s);
474 }
475 read_pos = _resampleReadPos + step * _pitchFactor;
476 }
477 _resampleReadPos = read_pos;
478}
479
480void TimeAndPitch::retrieveAudio(float* const* out_smp, int numSamples)
481{
482 assert(numSamples <= _maxBlockSize);
483 for (int ch = 0; ch < _numChannels; ++ch)
484 {
485 d->outCircularBuffer[ch].readAndClearBlock(0, numSamples, out_smp[ch]);
487 {
488 constexpr float curve =
489 4.f * 4.f; // the curve approximates 1/x over 1 but fades to 0 near 0 to avoid fade-in clicks
490 for (int i = 0; i < numSamples; ++i)
491 {
492 float x = d->normalizationBuffer.read(i);
493 out_smp[ch][i] *= x / (x * x + 1 / curve);
494 }
495 }
496 d->outCircularBuffer[ch].advance(numSamples);
497 }
498
499 d->normalizationBuffer.clearBlock(0, numSamples);
500 d->normalizationBuffer.advance(numSamples);
501
502 _outBufferWriteOffset -= numSamples;
503 _availableOutputSamples -= numSamples;
504
506 d->exact_hop_s = d->next_exact_hop_s;
507}
508
509void TimeAndPitch::processPitchShift(float* const* smp, int numSamples, double pitchFactor)
510{
511 setTimeStretchAndPitchFactor(1.0, pitchFactor);
512 feedAudio(smp, numSamples);
513 retrieveAudio(smp, numSamples);
514}
515
517{
518 return fftSize - fftSize / overlap + 3; // 3 for resampling
519}
520
521
523{
524 const float coeff = (timeStretch < 1.f) ? (1.f / 3.f) : (2.f / 3.f);
525 return int(getLatencySamples() * (timeStretch * coeff + (1 - coeff)));
526}
527
528} // namespace staffpad
static const auto fn
int getNumAvailableOutputSamples() const
void setTimeStretchAndPitchFactor(double timeStretch, double pitchFactor)
int getSamplesToNextHop() const
void _time_stretch(float hop_a, float hop_s)
TimeAndPitch(int sampleRate)
void setup(int numChannels, int maxBlockSize)
void retrieveAudio(float *const *out_smp, int numSamples)
void _process_hop(int hop_a, int hop_s)
process one hop/chunk in _fft_timeSeries and add the result to output circular buffer
int getLatencySamples() const
static constexpr bool normalize_window
Definition: TimeAndPitch.h:81
double _expectedPhaseChangePerBinPerSample
Definition: TimeAndPitch.h:100
void processPitchShift(float *const *smp, int numSamples, double pitchFactor)
int getLatencySamplesForStretchRatio(float timeStretch) const
std::shared_ptr< impl > d
Definition: TimeAndPitch.h:88
static constexpr bool modulate_synthesis_hop
Definition: TimeAndPitch.h:82
void feedAudio(const float *const *in_smp, int numSamples)
static constexpr int overlap
Definition: TimeAndPitch.h:80
fastfloat_really_inline void round(adjusted_mantissa &am, callback cb) noexcept
Definition: fast_float.h:2512
void _fft_shift(float *v, int n)
rotate even-sized array by half its size to align fft phase at the center
float lagrange6(const float(&smp)[6], float t)
void _lr_to_ms(float *ch1, float *ch2, int n)
void _ms_to_lr(float *ch1, float *ch2, int n)
__finl float __vecc rint(float a)
__finl void perform_parallel_simd_aligned(float *a, float *b, int n, const fnc &f)
Definition: SimdTypes.h:87
void multiply(const T *src1, const T *src2, T *dst, int32_t n)
Definition: VectorOps.h:74
void calcPhases(const std::complex< float > *src, float *dst, int32_t n)
Definition: VectorOps.h:128
void rotate(const float *oldPhase, const float *newPhase, std::complex< float > *dst, int32_t n)
Definition: VectorOps.h:140
void findMaxElement(const T *src, int32_t n, int32_t &maxIndex, T &maxValue)
Definition: VectorOps.h:87
void calcNorms(const std::complex< float > *src, float *dst, int32_t n)
Definition: VectorOps.h:134
void constantMultiply(const T *src, T constant, T *dst, int32_t n)
Definition: VectorOps.h:60
std::vector< int > trough_index
CircularSampleBuffer< float > inCircularBuffer[2]
CircularSampleBuffer< float > normalizationBuffer
std::vector< int > peak_index
CircularSampleBuffer< float > outCircularBuffer[2]
CircularSampleBuffer< float > inResampleInputBuffer[2]