Audacity 3.2.0
GetMeterUsingTatumQuantizationFit.cpp
Go to the documentation of this file.
1/* SPDX-License-Identifier: GPL-2.0-or-later */
2/*!********************************************************************
3
4Audacity: A Digital Audio Editor
5
6GetMeterUsingTatumQuantizationFit.cpp
7
8Matthieu Hodgkinson
9
10**********************************************************************/
11
13#include "IteratorX.h"
15#include "MirDsp.h"
16#include "MirTypes.h"
17#include "MirUtils.h"
19#include <array>
20#include <cassert>
21#include <cmath>
22#include <map>
23#include <numeric>
24#include <regex>
25#include <unordered_map>
26#include <unordered_set>
27
28namespace MIR
29{
30namespace
31{
32constexpr auto minTatumsPerMinute = 100;
33constexpr auto maxTatumsPerMinute = 700;
34constexpr auto minBpm = 50.;
35constexpr auto maxBpm = 200.;
36constexpr auto minBeatsPerBar = 2;
37constexpr auto maxBeatsPerBar = 4;
38constexpr std::array<std::pair<int, int>, 9> possibleTatumsPerBeat {
39 std::pair<int, int> { 1, 1 }, //
40 std::pair<int, int> { 2, 1 }, std::pair<int, int> { 3, 1 },
41 std::pair<int, int> { 4, 1 }, std::pair<int, int> { 6, 1 }, //
42 std::pair<int, int> { 1, 2 }, std::pair<int, int> { 1, 3 },
43 std::pair<int, int> { 1, 4 }, std::pair<int, int> { 1, 6 } //
44};
45
46// Number of bars and beats per bar dividing the input audio recording.
48{
49 const int numBars;
50 const int beatsPerBar;
51};
52
53// A map of possible number of tatums to a list of number of bars and beats per
54// bar which explain it.
56 std::unordered_map<int /*num tatums*/, std::vector<BarDivision>>;
57
58// Gather a collection of possible numbers of divisions based on the assumption
59// that the audio is a loop (hence there must be a round number of bars) and on
60// reasonable bar and tatum durations.
62{
63 constexpr auto minBarDuration = 1.;
64 constexpr auto maxBarDuration = 4.;
65 const int minNumBars =
66 std::max(std::round(audioFileDuration / maxBarDuration), 1.);
67 const int maxNumBars = std::round(audioFileDuration / minBarDuration);
68 PossibleDivHierarchies possibleDivHierarchies;
70 IotaRange { minNumBars, maxNumBars + 1 }, [&](int numBars) {
71 const auto barDuration = audioFileDuration / numBars;
72 const auto minBpb = std::clamp<int>(
73 std::floor(minBpm * barDuration / 60), minBeatsPerBar,
75 const auto maxBpb = std::clamp<int>(
76 std::ceil(maxBpm * barDuration / 60), minBeatsPerBar,
79 IotaRange { minBpb, maxBpb + 1 }, [&](int beatsPerBar) {
80 std::for_each(
82 [&](const std::pair<int, int>& tatumsPerBeat) {
83 const auto [tatumsPerBeatNum, tatumsPerBeatDen] =
84 tatumsPerBeat;
85
86 // Number of tatums per bar must be round:
87 if (
88 (beatsPerBar * tatumsPerBeatNum) % tatumsPerBeatDen !=
89 0)
90 return;
91
92 const int tatumsPerBar =
93 beatsPerBar * tatumsPerBeatNum / tatumsPerBeatDen;
94 const int numTatums = tatumsPerBar * numBars;
95 const auto tatumRate = 60. * numTatums / audioFileDuration;
96 if (
97 minTatumsPerMinute < tatumRate &&
98 tatumRate < maxTatumsPerMinute)
99 possibleDivHierarchies[numTatums].push_back(
100 BarDivision { numBars, beatsPerBar });
101 });
102 });
103 });
104 return possibleDivHierarchies;
105}
106
107int GetOnsetLag(const std::vector<float>& odf, int numTatums)
108{
109 // Understandably, the first onset of a loop recording isn't typically
110 // centered on time 0, or there would be a click at the onset. The entire
111 // recording is therefore likely to have a small lag. It is important that we
112 // take this lag into account for the quantization distance measure.
113 // The code below is equivalent to cross-correlating the odf with a pulse
114 // train of frequency `numTatums / odf.size()`. We take the position of the
115 // first peak to be the lag.
116 const auto pulseTrainPeriod = 1. * odf.size() / numTatums;
117 auto max = std::numeric_limits<float>::lowest();
118 auto lag = 0;
119 while (true)
120 {
121 auto val = 0.f;
122 for_each_in_range(IotaRange { 0, numTatums }, [&](int i) {
123 const int j = std::round(i * pulseTrainPeriod) + lag;
124 val += (j < odf.size() ? odf[j] : 0.f);
125 });
126 if (val < max)
127 break;
128 max = val;
129 ++lag;
130 }
131 return lag - 1;
132}
133
134// This is the fundament of the algorithm. It gives a weighted average of the
135// normalized distance between ODF peaks and the closest tatum. The weights are
136// the ODF values at the peaks. The distance is normalized by the tatum
137// duration, so that the final value ranges between 0 and 1.
138// This can be seen as a performance accuracy measure. The higher the tempo, the
139// more accurate the performer has to be for this distance to stay low. In
140// general, this is desired. We could nevertheless consider introducing a slight
141// tolerance which increases with the tatum rate, as very fast rhythmic patterns
142// recorded without post-editing still tend to achieve lower scores.
144 const std::vector<int>& peakIndices, const std::vector<float>& peakValues,
145 size_t size, int numDivisions, int lag)
146{
147 std::vector<double> peakDistances(peakIndices.size());
148 const auto peakSum =
149 std::accumulate(peakValues.begin(), peakValues.end(), 0.);
150 const auto odfSamplesPerDiv = 1. * size / numDivisions;
151 std::transform(
152 peakIndices.begin(), peakIndices.end(), peakDistances.begin(),
153 [&](int peakIndex) {
154 const auto shiftedIndex = peakIndex - lag;
155 const auto closestDiv = std::round(shiftedIndex / odfSamplesPerDiv);
156 // Normalized distance between 0 and 1:
157 const auto distance =
158 (shiftedIndex - closestDiv * odfSamplesPerDiv) / odfSamplesPerDiv;
159 // Mutliply by two such that the error spans `[0, 1)`.
160 return 2 * std::abs(distance);
161 });
162 // Calculate the score as the sum of the distances weighted by
163 // the odf values:
164 const auto weightedAverage =
165 std::inner_product(
166 peakDistances.begin(), peakDistances.end(), peakValues.begin(), 0.) /
167 peakSum;
168 return weightedAverage;
169}
170
172 const std::vector<float>& odf, const std::vector<int>& peakIndices,
173 const std::vector<float>& peakValues,
174 const std::vector<int>& possibleNumTatums,
175 QuantizationFitDebugOutput* debugOutput)
176{
177 const auto quantizations = [&]() {
178 std::unordered_map<int, OnsetQuantization> quantizations;
179 std::transform(
180 possibleNumTatums.begin(), possibleNumTatums.end(),
181 std::inserter(quantizations, quantizations.end()), [&](int numTatums) {
182 const auto lag = GetOnsetLag(odf, numTatums);
183 const auto distance = GetQuantizationDistance(
184 peakIndices, peakValues, odf.size(), numTatums, lag);
185 return std::make_pair(
186 numTatums, OnsetQuantization { distance, lag, numTatums });
187 });
188 return quantizations;
189 }();
190
191 const auto bestFitIt = std::min_element(
192 quantizations.begin(), quantizations.end(),
193 [](const std::pair<int, OnsetQuantization>& a,
194 const std::pair<int, OnsetQuantization>& b) {
195 return a.second.error < b.second.error;
196 });
197
198 const auto error = bestFitIt->second.error;
199 const auto mostLikelyNumTatums = bestFitIt->first;
200 const auto lag = bestFitIt->second.lag;
201
202 return { error, lag, mostLikelyNumTatums };
203}
204
205std::optional<TimeSignature>
206GetTimeSignature(const BarDivision& barDivision, int numTatums)
207{
208 const auto numBeats = barDivision.numBars * barDivision.beatsPerBar;
209 const auto tatumsPerBeat = 1. * numTatums / numBeats;
210 switch (barDivision.beatsPerBar)
211 {
212 case 2:
213 return (tatumsPerBeat == 3) ? TimeSignature::SixEight :
214 TimeSignature::TwoTwo;
215 case 3:
216 return TimeSignature::ThreeFour;
217 case 4:
218 return TimeSignature::FourFour;
219 default:
220 // Since the bar-division hypotheses are based on these very time
221 // signatures, this should never happen. But the code would need to be
222 // structured more clearly to really have confidence and risk crashing.
223 // For now we accept that the time-signature cannot be recognized in
224 // release builds.
225 assert(false);
226 return std::nullopt;
227 }
228}
229
230double
231GetTimeSignatureLikelihood(const std::optional<TimeSignature>& ts, double bpm)
232{
233 // TODO these were taken from a rather old PoC - review.
234 if (!ts.has_value())
235 return 0.;
236
237 static const std::unordered_map<TimeSignature, double> expectedBpms {
238 { TimeSignature::TwoTwo, 115. },
239 { TimeSignature::FourFour, 115. },
240 { TimeSignature::ThreeFour, 140. },
241 { TimeSignature::SixEight, 64. },
242 };
243
244 static const std::unordered_map<TimeSignature, double> bpmStdDevs {
245 { TimeSignature::TwoTwo, 25. },
246 { TimeSignature::FourFour, 25. },
247 { TimeSignature::ThreeFour, 25. },
248 { TimeSignature::SixEight, 15. },
249 };
250
251 // Add a slight bias towards 4/4, which is the most common time signature.
252 static const std::unordered_map<TimeSignature, double> tsPrior {
253 { TimeSignature::TwoTwo, .1 },
254 { TimeSignature::FourFour, .45 },
255 { TimeSignature::ThreeFour, .2 },
256 { TimeSignature::SixEight, .25 },
257 };
258
259 const auto mu = expectedBpms.at(*ts);
260 const auto sigma = bpmStdDevs.at(*ts);
261 const auto tmp = (bpm - mu) / sigma;
262 const auto likelihood = std::exp(-.5 * tmp * tmp);
263 return likelihood * tsPrior.at(*ts);
264}
265
266// To evaluate how likely a certain BPM is, we evaluate how much the ODF repeats
267// itself at that beat rate. We do this by looking at the auto-correlation of
268// the ODF, "comb-filtering" it at integer multiples of the beat period.
270 double odfAutoCorrSampleRate, double bpm,
271 const std::vector<float>& odfAutoCorr, int odfAutocorrFullSize,
272 QuantizationFitDebugOutput* debugOutput)
273{
274 // Look for closest peak in `odfAutoCorr`:
275 const auto lag = odfAutoCorrSampleRate * 60 / bpm;
276 // Traverse all the auto-correlation by steps of `k*lag`, each time
277 // finding the closest peak, and average the values. Doing this rather
278 // than evaluating only just one peak is more robust to rhythms which
279 // miss beats.
280 auto periodIndex = 1;
281 auto sum = 0.;
282 auto numIndices = 0;
283 while (true)
284 {
285 auto j = static_cast<int>(periodIndex++ * lag + .5);
286 if (j >= odfAutoCorr.size())
287 break;
288 while (true)
289 {
290 const auto i = MapToPositiveHalfIndex(j - 1, odfAutocorrFullSize);
291 const auto k = MapToPositiveHalfIndex(j + 1, odfAutocorrFullSize);
292 if (
293 odfAutoCorr[i] <= odfAutoCorr[j] &&
294 odfAutoCorr[j] >= odfAutoCorr[k])
295 break;
296 j = odfAutoCorr[i] > odfAutoCorr[k] ? i : k;
297 }
298 sum += odfAutoCorr[j];
299 ++numIndices;
300 if (debugOutput)
301 debugOutput->odfAutoCorrPeakIndices.push_back(j);
302 }
303 return sum / numIndices;
304}
305
307 const std::vector<BarDivision>& possibleBarDivisions,
308 double audioFileDuration, int numTatums, const std::vector<float>& odf,
309 QuantizationFitDebugOutput* debugOutput)
310
311{
312 const auto odfAutoCorr = GetNormalizedCircularAutocorr(odf);
313 if (debugOutput)
314 debugOutput->odfAutoCorr = odfAutoCorr;
315 const auto odfAutocorrFullSize = 2 * (odfAutoCorr.size() - 1);
316 assert(IsPowOfTwo(odfAutocorrFullSize));
317 const auto odfAutoCorrSampleRate = odfAutocorrFullSize / audioFileDuration;
318
319 std::vector<double> scores(possibleBarDivisions.size());
320 // These (still) only depend on the beat rate. We look at
321 // self-similarities at beat intervals, so to say, without examining the
322 // contents of the beats. Since several bar divisions may yield the same
323 // number of beats, we may be able to re-use some calculations.
324 std::unordered_map<int /*numBeats*/, double> autocorrScoreCache;
325 std::transform(
326 possibleBarDivisions.begin(), possibleBarDivisions.end(), scores.begin(),
327 [&](const BarDivision& barDivision) {
328 const auto numBeats = barDivision.numBars * barDivision.beatsPerBar;
329 const auto timeSignature = GetTimeSignature(barDivision, numTatums);
330 const auto bpm = 1. * numBeats / audioFileDuration * 60;
331 const auto likelihood = GetTimeSignatureLikelihood(timeSignature, bpm);
332 if (!autocorrScoreCache.count(numBeats))
333 autocorrScoreCache[numBeats] = GetBeatSelfSimilarityScore(
334 odfAutoCorrSampleRate, bpm, odfAutoCorr, odfAutocorrFullSize,
335 debugOutput);
336 const auto selfSimilarityScore = autocorrScoreCache.at(numBeats);
337 return likelihood * selfSimilarityScore;
338 });
339
340 return std::max_element(scores.begin(), scores.end()) - scores.begin();
341}
342
344 const std::vector<float>& odf, int numTatums,
345 std::vector<BarDivision> possibleBarDivisions, double audioFileDuration,
346 QuantizationFitDebugOutput* debugOutput)
347{
348 std::vector<BarDivision> fourFourDivs;
350 IotaRange<size_t>(0, possibleBarDivisions.size()), [&](size_t i) {
351 if (
352 GetTimeSignature(possibleBarDivisions[i], numTatums) ==
353 TimeSignature::FourFour)
354 fourFourDivs.push_back(possibleBarDivisions[i]);
355 });
356
357 // If there is one or more 4/4 possibilities, don't take any risk and only
358 // consider these because much more frequent, especially in the world of
359 // loops. When we get more clever about time-signature detection, we may be
360 // more assertive.
361 if (!fourFourDivs.empty())
362 std::swap(possibleBarDivisions, fourFourDivs);
363
364 const auto winnerIndex = GetBestBarDivisionIndex(
365 possibleBarDivisions, audioFileDuration, numTatums, odf, debugOutput);
366
367 const auto& barDivision = possibleBarDivisions[winnerIndex];
368 const auto numBeats = barDivision.numBars * barDivision.beatsPerBar;
369 const auto signature = GetTimeSignature(barDivision, numTatums);
370 const auto bpm = 60. * numBeats / audioFileDuration;
371
372 return { bpm, signature };
373}
374
376 const std::vector<int>& peakIndices, const std::vector<float>& peakValues)
377{
378 // Detect single-event recordings, e.g. only just one crash cymbal hit. This
379 // will translate as one large peak and maybe a few much smaller ones. In
380 // that case we can expect the average to be between these two groups.
381 const auto peakAvg =
382 std::accumulate(peakValues.begin(), peakValues.end(), 0.) /
383 peakIndices.size();
384 const auto numPeaksAboveAvg =
385 std::count_if(peakValues.begin(), peakValues.end(), [&](float v) {
386 return v > peakAvg;
387 });
388 return numPeaksAboveAvg <= 1;
389}
390} // namespace
391
392std::optional<MusicalMeter> GetMeterUsingTatumQuantizationFit(
394 const std::function<void(double)>& progressCallback,
395 QuantizationFitDebugOutput* debugOutput)
396{
397 const auto odf =
398 GetOnsetDetectionFunction(audio, progressCallback, debugOutput);
399 const auto odfSr =
400 1. * audio.GetSampleRate() * odf.size() / audio.GetNumSamples();
401 const auto audioFileDuration =
402 1. * audio.GetNumSamples() / audio.GetSampleRate();
403
404 const auto peakIndices = GetPeakIndices(odf);
405 if (debugOutput)
406 {
407 debugOutput->audioFileDuration = audioFileDuration;
408 debugOutput->odfSr = odfSr;
409 debugOutput->odfPeakIndices = peakIndices;
410 }
411
412 const auto peakValues = ([&]() {
413 std::vector<float> peakValues(peakIndices.size());
414 std::transform(
415 peakIndices.begin(), peakIndices.end(), peakValues.begin(),
416 [&](int i) { return odf[i]; });
417 return peakValues;
418 })();
419
420 if (IsSingleEvent(peakIndices, peakValues))
421 return {};
422
423 const auto possibleDivs = GetPossibleDivHierarchies(audioFileDuration);
424 if (possibleDivs.empty())
425 // The file is probably too short to be a loop.
426 return {};
427
428 const auto possibleNumTatums = [&]() {
429 std::vector<int> possibleNumTatums(possibleDivs.size());
430 std::transform(
431 possibleDivs.begin(), possibleDivs.end(), possibleNumTatums.begin(),
432 [&](const auto& entry) { return entry.first; });
433 return possibleNumTatums;
434 }();
435
436 const auto experiment = RunQuantizationExperiment(
437 odf, peakIndices, peakValues, possibleNumTatums, debugOutput);
438
439 const auto winnerMeter = GetMostLikelyMeterFromQuantizationExperiment(
440 odf, experiment.numDivisions, possibleDivs.at(experiment.numDivisions),
441 audioFileDuration, debugOutput);
442
443 const auto score = 1 - experiment.error;
444
445 if (debugOutput)
446 {
447 debugOutput->tatumQuantization = experiment;
448 debugOutput->bpm = winnerMeter.bpm;
449 debugOutput->timeSignature = winnerMeter.timeSignature;
450 debugOutput->odf = odf;
451 debugOutput->odfSr = odfSr;
452 debugOutput->audioFileDuration = audioFileDuration;
453 debugOutput->score = score;
454 }
455
456 return score < loopClassifierSettings.at(tolerance).threshold ?
457 std::optional<MusicalMeter> {} :
458 winnerMeter;
459}
460} // namespace MIR
static ProjectFileIORegistry::AttributeWriterEntry entry
void for_each_in_range(Range &&range, Function &&fn)
Definition: IteratorX.h:246
constexpr auto MapToPositiveHalfIndex(int index, int fullSize)
Useful when dealing with symmetric spectra reduced only to their positive half. See tests below for m...
MockedAudio audio
bool IsSingleEvent(const std::vector< int > &peakIndices, const std::vector< float > &peakValues)
std::optional< TimeSignature > GetTimeSignature(const BarDivision &barDivision, int numTatums)
size_t GetBestBarDivisionIndex(const std::vector< BarDivision > &possibleBarDivisions, double audioFileDuration, int numTatums, const std::vector< float > &odf, QuantizationFitDebugOutput *debugOutput)
double GetQuantizationDistance(const std::vector< int > &peakIndices, const std::vector< float > &peakValues, size_t size, int numDivisions, int lag)
double GetBeatSelfSimilarityScore(double odfAutoCorrSampleRate, double bpm, const std::vector< float > &odfAutoCorr, int odfAutocorrFullSize, QuantizationFitDebugOutput *debugOutput)
std::unordered_map< int, std::vector< BarDivision > > PossibleDivHierarchies
double GetTimeSignatureLikelihood(const std::optional< TimeSignature > &ts, double bpm)
MusicalMeter GetMostLikelyMeterFromQuantizationExperiment(const std::vector< float > &odf, int numTatums, std::vector< BarDivision > possibleBarDivisions, double audioFileDuration, QuantizationFitDebugOutput *debugOutput)
OnsetQuantization RunQuantizationExperiment(const std::vector< float > &odf, const std::vector< int > &peakIndices, const std::vector< float > &peakValues, const std::vector< int > &possibleNumTatums, QuantizationFitDebugOutput *debugOutput)
std::vector< float > GetOnsetDetectionFunction(const MirAudioReader &audio, const std::function< void(double)> &progressCallback, QuantizationFitDebugOutput *debugOutput)
Definition: MirDsp.cpp:109
std::vector< float > GetNormalizedCircularAutocorr(const std::vector< float > &ux)
Get the normalized, circular auto-correlation for a signal x whose length already is a power of two....
Definition: MirDsp.cpp:73
static const std::unordered_map< FalsePositiveTolerance, LoopClassifierSettings > loopClassifierSettings
FalsePositiveTolerance
Definition: MirTypes.h:25
std::optional< MusicalMeter > GetMeterUsingTatumQuantizationFit(const MirAudioReader &audio, FalsePositiveTolerance tolerance, const std::function< void(double)> &progressCallback, QuantizationFitDebugOutput *debugOutput)
Get the BPM of the given audio file, using the Tatum Quantization Fit method.
std::vector< int > GetPeakIndices(const std::vector< float > &x)
Definition: MirUtils.cpp:67
constexpr auto IsPowOfTwo(int x)
Definition: MirUtils.h:28
void swap(std::unique_ptr< Alg_seq > &a, std::unique_ptr< Alg_seq > &b)
Definition: NoteTrack.cpp:634
fastfloat_really_inline void round(adjusted_mantissa &am, callback cb) noexcept
Definition: fast_float.h:2512
std::vector< float > odf
Definition: MirTypes.h:145
OnsetQuantization tatumQuantization
Definition: MirTypes.h:138
std::vector< float > odfAutoCorr
Definition: MirTypes.h:149
std::optional< TimeSignature > timeSignature
Definition: MirTypes.h:140
std::vector< int > odfPeakIndices
Definition: MirTypes.h:148
std::vector< int > odfAutoCorrPeakIndices
Definition: MirTypes.h:150