488 lines
17 KiB
C++
488 lines
17 KiB
C++
// Copyright 2011-2023 The Mumble Developers. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style license
|
|
// that can be found in the LICENSE file at the root of the
|
|
// Mumble source tree or at <https://www.mumble.info/LICENSE>.
|
|
|
|
#include "AudioOutputSpeech.h"
|
|
|
|
#include "Audio.h"
|
|
#include "ClientUser.h"
|
|
#include "PacketDataStream.h"
|
|
#include "Utils.h"
|
|
#include "Global.h"
|
|
|
|
#include <opus.h>
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cmath>
|
|
|
|
std::mutex AudioOutputSpeech::s_audioCachesMutex;
|
|
std::vector< AudioOutputCache > AudioOutputSpeech::s_audioCaches(100);
|
|
|
|
void AudioOutputSpeech::invalidateAudioOutputCache(void *maskedIndex) {
|
|
// The given "pointer" actually is to be understood as an index
|
|
const std::size_t index = reinterpret_cast< std::size_t >(maskedIndex) - 1;
|
|
|
|
std::lock_guard< std::mutex > lock(s_audioCachesMutex);
|
|
|
|
if (index < s_audioCaches.size()) {
|
|
s_audioCaches[index].clear();
|
|
}
|
|
}
|
|
|
|
std::size_t AudioOutputSpeech::storeAudioOutputCache(const Mumble::Protocol::AudioData &audioData) {
|
|
std::lock_guard< std::mutex > lock(s_audioCachesMutex);
|
|
|
|
// Find free spot in s_audioCaches
|
|
auto it = std::find_if(s_audioCaches.begin(), s_audioCaches.end(),
|
|
[](const AudioOutputCache &chunk) { return !chunk.isValid(); });
|
|
|
|
if (it != s_audioCaches.end()) {
|
|
// Write audio data to that free (currently unused) chunk
|
|
it->loadFrom(audioData);
|
|
|
|
return static_cast< std::size_t >(std::distance(s_audioCaches.begin(), it));
|
|
} else {
|
|
// The list of audio chunks is full -> extend it
|
|
AudioOutputCache chunk;
|
|
chunk.loadFrom(audioData);
|
|
|
|
s_audioCaches.push_back(std::move(chunk));
|
|
|
|
return s_audioCaches.size() - 1;
|
|
}
|
|
}
|
|
|
|
|
|
AudioOutputSpeech::AudioOutputSpeech(ClientUser *user, unsigned int freq, Mumble::Protocol::AudioCodec codec,
|
|
unsigned int systemMaxBufferSize)
|
|
: iMixerFreq(freq), m_codec(codec), p(user) {
|
|
int err;
|
|
|
|
opusState = nullptr;
|
|
|
|
bHasTerminator = false;
|
|
bStereo = false;
|
|
|
|
iSampleRate = SAMPLE_RATE;
|
|
|
|
// opus's "frame" means different from normal audio term "frame"
|
|
// normally, a frame means a bundle of only one sample from each channel,
|
|
// e.Global::get(). for a stereo stream, ...[LR]LRLRLR.... where the bracket indicates a frame
|
|
// in opus term, a frame means samples that span a period of time, which can be either stereo or mono
|
|
// e.Global::get(). ...[LRLR....LRLR].... or ...[MMMM....MMMM].... for mono stream
|
|
// opus supports frames with: 2.5, 5, 10, 20, 40 or 60 ms of audio data.
|
|
// sample rate / 100 means 10ms mono audio data per frame.
|
|
iFrameSizePerChannel = iFrameSize = iSampleRate / 100; // for mono stream
|
|
|
|
assert(m_codec == Mumble::Protocol::AudioCodec::Opus);
|
|
|
|
// Always pretend Stereo mode is true by default. since opus will convert mono stream to stereo stream.
|
|
// https://tools.ietf.org/html/rfc6716#section-2.1.2
|
|
bStereo = true;
|
|
opusState = opus_decoder_create(static_cast< int >(iSampleRate), bStereo ? 2 : 1, nullptr);
|
|
opus_decoder_ctl(opusState,
|
|
OPUS_SET_PHASE_INVERSION_DISABLED(1)); // Disable phase inversion for better mono downmix.
|
|
|
|
// iAudioBufferSize: size (in unit of float) of the buffer used to store decoded pcm data.
|
|
// For opus, the maximum frame size of a packet is 60ms.
|
|
iAudioBufferSize = iSampleRate * 60 / 1000; // = SampleRate * 60ms = 48000Hz * 0.06s = 2880, ~12KB
|
|
|
|
// iBufferSize: size of the buffer to store the resampled audio data.
|
|
// Note that the number of samples in each opus packet can be different from the number of samples the system
|
|
// requests from us each time (this is known as the system's audio buffer size).
|
|
// For example, the maximum size of an opus packet can be 60ms, but the system's audio buffer size is typically
|
|
// ~5ms on my laptop.
|
|
// Whenever the system's audio callback is called, we have two choice:
|
|
// 1. Decode a new opus packet. Then we need a buffer to store unused samples (which don't fit in the system's
|
|
// buffer),
|
|
// 2. Use unused samples from the buffer (remaining from the last decoded frame).
|
|
// How large should this buffer be? Consider the case in which remaining samples in the buffer can not fill
|
|
// the system's audio buffer. In that case, we need to decode a new opus packet. In the worst case, the buffer size
|
|
// needed is
|
|
// 60ms of new decoded audio data + system's buffer size - 1.
|
|
iOutputSize = static_cast< unsigned int >(
|
|
ceilf(static_cast< float >(iAudioBufferSize * iMixerFreq) / static_cast< float >(iSampleRate)));
|
|
iBufferSize = iOutputSize + systemMaxBufferSize; // -1 has been rounded up
|
|
|
|
if (bStereo) {
|
|
iAudioBufferSize *= 2;
|
|
iOutputSize *= 2;
|
|
iBufferSize *= 2;
|
|
iFrameSize *= 2;
|
|
}
|
|
|
|
pfBuffer = new float[iBufferSize];
|
|
|
|
srs = nullptr;
|
|
fResamplerBuffer = nullptr;
|
|
if (iMixerFreq != iSampleRate) {
|
|
srs = speex_resampler_init(bStereo ? 2 : 1, iSampleRate, iMixerFreq, 3, &err);
|
|
fResamplerBuffer = new float[iAudioBufferSize];
|
|
}
|
|
|
|
iBufferOffset = iBufferFilled = iLastConsume = 0;
|
|
bLastAlive = true;
|
|
|
|
iMissCount = 0;
|
|
iMissedFrames = 0;
|
|
|
|
m_audioContext = Mumble::Protocol::AudioContext::INVALID;
|
|
|
|
jbJitter = jitter_buffer_init(static_cast< int >(iFrameSize));
|
|
int margin = Global::get().s.iJitterBufferSize * static_cast< int >(iFrameSize);
|
|
jitter_buffer_ctl(jbJitter, JITTER_BUFFER_SET_MARGIN, &margin);
|
|
|
|
// We are configuring our Jitter buffer to use a custom deleter function. This prevents the buffer from
|
|
// copying the stored data into the buffer itself and also from releasing the memory of it. Instead it
|
|
// will now call this "deleter" function instead.
|
|
// This allows us to manage our own (global) storage for our audio data. With that, we can reuse the same
|
|
// memory regions in order to avoid frequent memory allocations and deallocations.
|
|
// Also this is the basis for using our trick of actually only storing indices instead of proper data
|
|
// pointers in the buffer.
|
|
jitter_buffer_ctl(jbJitter, JITTER_BUFFER_SET_DESTROY_CALLBACK,
|
|
reinterpret_cast< void * >(&AudioOutputSpeech::invalidateAudioOutputCache));
|
|
|
|
fFadeIn = new float[iFrameSizePerChannel];
|
|
fFadeOut = new float[iFrameSizePerChannel];
|
|
|
|
float mul = static_cast< float >(M_PI / (2.0 * static_cast< double >(iFrameSizePerChannel)));
|
|
for (unsigned int i = 0; i < iFrameSizePerChannel; ++i)
|
|
fFadeIn[i] = fFadeOut[iFrameSizePerChannel - i - 1] = sinf(static_cast< float >(i) * mul);
|
|
}
|
|
|
|
AudioOutputSpeech::~AudioOutputSpeech() {
|
|
if (opusState) {
|
|
opus_decoder_destroy(opusState);
|
|
}
|
|
|
|
if (srs)
|
|
speex_resampler_destroy(srs);
|
|
|
|
jitter_buffer_destroy(jbJitter);
|
|
|
|
if (p) {
|
|
p->setTalking(Settings::Passive);
|
|
}
|
|
|
|
delete[] fFadeIn;
|
|
delete[] fFadeOut;
|
|
delete[] fResamplerBuffer;
|
|
}
|
|
|
|
void AudioOutputSpeech::addFrameToBuffer(const Mumble::Protocol::AudioData &audioData) {
|
|
QMutexLocker lock(&qmJitter);
|
|
|
|
if (audioData.payload.empty()) {
|
|
return;
|
|
}
|
|
|
|
int samples = 0;
|
|
|
|
assert(m_codec == Mumble::Protocol::AudioCodec::Opus);
|
|
assert(audioData.usedCodec == m_codec);
|
|
|
|
samples = opus_decoder_get_nb_samples(
|
|
opusState, audioData.payload.data(),
|
|
static_cast< int >(audioData.payload.size())); // this function return samples per channel
|
|
samples *= 2; // since we assume all input stream is stereo.
|
|
|
|
// We can't handle frames which are not a multiple of our configured framesize.
|
|
if (static_cast< unsigned int >(samples) % iFrameSize != 0) {
|
|
qWarning("AudioOutputSpeech: Dropping Opus audio packet, because its sample count (%d) is not a "
|
|
"multiple of our frame size (%d)",
|
|
samples, iFrameSize);
|
|
return;
|
|
}
|
|
|
|
// Copy the audio data to an AudioOutputCache instance and store that in our global chunk list
|
|
std::size_t storageIndex = storeAudioOutputCache(audioData);
|
|
|
|
// We cheat a bit and instead of storing the actual audio data in the jitter buffer, we store the index to
|
|
// the created audio chunk in the buffer. Passing a length of 0 should ensure that this "pointer" will never
|
|
// be dereferenced.
|
|
|
|
// A call to jitter_buffer_put stores the packet in an internal array used for book-keeping.
|
|
// The library uses jbp.data == NULL to differentiate between unused and reserved elements
|
|
// of the book-keeping array.
|
|
// Since a storageIndex of zero will look the same as a null pointer, we always add one to
|
|
// ensure the library never erroneously confuses this entry with a free slot.
|
|
JitterBufferPacket jbp;
|
|
jbp.data = reinterpret_cast< char * >(storageIndex) + 1;
|
|
jbp.len = 0;
|
|
jbp.span = static_cast< unsigned int >(samples);
|
|
jbp.timestamp = static_cast< unsigned int >(iFrameSize * audioData.frameNumber);
|
|
|
|
jitter_buffer_put(jbJitter, &jbp);
|
|
}
|
|
|
|
bool AudioOutputSpeech::prepareSampleBuffer(unsigned int frameCount) {
|
|
unsigned int channels = bStereo ? 2 : 1;
|
|
// Note: all stereo supports are crafted for opus, since other codecs are deprecated and will soon be removed.
|
|
|
|
unsigned int sampleCount = frameCount * channels;
|
|
|
|
// we can not control exactly how many frames decoder returns
|
|
// so we need a buffer to keep unused frames
|
|
// shift the buffer, remove decoded and played frames
|
|
for (unsigned int i = iLastConsume; i < iBufferFilled; ++i)
|
|
pfBuffer[i - iLastConsume] = pfBuffer[i];
|
|
|
|
iBufferFilled -= iLastConsume;
|
|
|
|
iLastConsume = sampleCount;
|
|
|
|
// Maximum interaural delay is accounted for to prevent audio glitches
|
|
if (iBufferFilled >= sampleCount + INTERAURAL_DELAY)
|
|
return bLastAlive;
|
|
|
|
float *pOut;
|
|
bool nextalive = bLastAlive;
|
|
|
|
while (iBufferFilled < sampleCount + INTERAURAL_DELAY) {
|
|
int decodedSamples = static_cast< int >(iFrameSize);
|
|
resizeBuffer(iBufferFilled + iOutputSize + INTERAURAL_DELAY);
|
|
// TODO: allocating memory in the audio callback will crash mumble in some cases.
|
|
// we need to initialize the buffer with an appropriate size when initializing
|
|
// this class. See #4250.
|
|
|
|
pOut = (srs) ? fResamplerBuffer : (pfBuffer + iBufferFilled);
|
|
|
|
if (!bLastAlive) {
|
|
memset(pOut, 0, iFrameSize * sizeof(float));
|
|
} else {
|
|
if (p == &LoopUser::lpLoopy) {
|
|
LoopUser::lpLoopy.fetchFrames();
|
|
}
|
|
|
|
int avail = 0;
|
|
int ts = jitter_buffer_get_pointer_timestamp(jbJitter);
|
|
jitter_buffer_ctl(jbJitter, JITTER_BUFFER_GET_AVAILABLE_COUNT, &avail);
|
|
|
|
if (p && (ts == 0)) {
|
|
int want = static_cast< int >(p->fAverageAvailable);
|
|
if (avail < want) {
|
|
++iMissCount;
|
|
if (iMissCount < 20) {
|
|
memset(pOut, 0, iFrameSize * sizeof(float));
|
|
goto nextframe;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (qlFrames.isEmpty()) {
|
|
QMutexLocker lock(&qmJitter);
|
|
|
|
JitterBufferPacket jbp;
|
|
|
|
spx_int32_t startofs = 0;
|
|
if (jitter_buffer_get(jbJitter, &jbp, static_cast< int >(iFrameSize), &startofs) == JITTER_BUFFER_OK) {
|
|
std::lock_guard< std::mutex > audioChunkLock(s_audioCachesMutex);
|
|
|
|
iMissCount = 0;
|
|
|
|
// The "data pointer" that is stored in the buffer is actually just an index to s_audioCaches
|
|
const std::size_t index = reinterpret_cast< std::size_t >(jbp.data) - 1;
|
|
assert(jbp.len == 0);
|
|
assert(index < s_audioCaches.size());
|
|
|
|
AudioOutputCache &cache = s_audioCaches[index];
|
|
assert(cache.isValid());
|
|
|
|
bHasTerminator = cache.isLastFrame();
|
|
|
|
assert(m_codec == Mumble::Protocol::AudioCodec::Opus);
|
|
|
|
// Copy audio data into qlFrames
|
|
qlFrames << QByteArray(reinterpret_cast< const char * >(cache.getAudioData().data()),
|
|
static_cast< int >(cache.getAudioData().size()));
|
|
|
|
if (cache.containsPositionalInformation()) {
|
|
assert(cache.getPositionalInformation().size() == 3);
|
|
assert(fPos.size() == 3);
|
|
|
|
for (unsigned int i = 0; i < 3; ++i) {
|
|
fPos[i] = cache.getPositionalInformation()[i];
|
|
}
|
|
} else {
|
|
fPos[0] = fPos[1] = fPos[2] = 0.0f;
|
|
}
|
|
|
|
m_suggestedVolumeAdjustment = cache.getVolumeAdjustment();
|
|
m_audioContext = cache.getContext();
|
|
|
|
if (p) {
|
|
float a = static_cast< float >(avail);
|
|
if (static_cast< float >(avail) >= p->fAverageAvailable)
|
|
p->fAverageAvailable = a;
|
|
else
|
|
p->fAverageAvailable *= 0.99f;
|
|
}
|
|
|
|
// If a destroy callback has been registered, jitter_buffer_get expects the caller to
|
|
// invoke the destroy callback on the returned packet.
|
|
// We registered a destroy callback in our constructor, so we clean up the packet here.
|
|
cache.clear();
|
|
} else {
|
|
// Let the jitter buffer know it's the right time to adjust the buffering delay to the network
|
|
// conditions.
|
|
jitter_buffer_update_delay(jbJitter, &jbp, nullptr);
|
|
|
|
iMissCount++;
|
|
if (iMissCount > 10)
|
|
nextalive = false;
|
|
}
|
|
}
|
|
|
|
if (!qlFrames.isEmpty()) {
|
|
QByteArray qba = qlFrames.takeFirst();
|
|
|
|
assert(m_codec == Mumble::Protocol::AudioCodec::Opus);
|
|
|
|
if (qba.isEmpty() || !(p && p->bLocalMute)) {
|
|
// If qba is empty, we have to let Opus know about the packet loss
|
|
// Otherwise if the associated user is not locally muted, we want to decode the audio
|
|
// packet normally in order to be able to play it.
|
|
decodedSamples = opus_decode_float(
|
|
opusState, qba.isEmpty() ? nullptr : reinterpret_cast< const unsigned char * >(qba.constData()),
|
|
qba.size(), pOut, static_cast< int >(iAudioBufferSize), 0);
|
|
} else {
|
|
// If the packet is non-empty, but the associated user is locally muted,
|
|
// we don't have to decode the packet. Instead it is enough to know how many
|
|
// samples it contained so that we can then mute the appropriate output length
|
|
decodedSamples = opus_packet_get_samples_per_frame(
|
|
reinterpret_cast< const unsigned char * >(qba.constData()), SAMPLE_RATE);
|
|
}
|
|
|
|
// The returned sample count we get from the Opus functions refer to samples per channel.
|
|
// Thus in order to get the total amount, we have to multiply by the channel count.
|
|
decodedSamples *= static_cast< int >(channels);
|
|
|
|
if (decodedSamples < 0) {
|
|
decodedSamples = static_cast< int >(iFrameSize);
|
|
memset(pOut, 0, iFrameSize * sizeof(float));
|
|
}
|
|
|
|
bool update = true;
|
|
if (p) {
|
|
float &fPowerMax = p->fPowerMax;
|
|
float &fPowerMin = p->fPowerMin;
|
|
|
|
float pow = 0.0f;
|
|
for (int i = 0; i < decodedSamples; ++i) {
|
|
pow += pOut[i] * pOut[i];
|
|
}
|
|
pow = sqrtf(pow / static_cast< float >(decodedSamples)); // Average over both L and R channel.
|
|
|
|
if (pow >= fPowerMax) {
|
|
fPowerMax = pow;
|
|
} else {
|
|
if (pow <= fPowerMin) {
|
|
fPowerMin = pow;
|
|
} else {
|
|
fPowerMax = 0.99f * fPowerMax;
|
|
fPowerMin += 0.0001f * pow;
|
|
}
|
|
}
|
|
|
|
update = (pow < (fPowerMin + 0.01f * (fPowerMax - fPowerMin))); // Update jitter buffer when quiet.
|
|
}
|
|
|
|
if (qlFrames.isEmpty() && update) {
|
|
jitter_buffer_update_delay(jbJitter, nullptr, nullptr);
|
|
}
|
|
|
|
if (qlFrames.isEmpty() && bHasTerminator) {
|
|
nextalive = false;
|
|
}
|
|
} else {
|
|
assert(m_codec == Mumble::Protocol::AudioCodec::Opus);
|
|
decodedSamples = opus_decode_float(opusState, nullptr, 0, pOut, static_cast< int >(iFrameSize), 0);
|
|
decodedSamples *= static_cast< int >(channels);
|
|
|
|
if (decodedSamples < 0) {
|
|
decodedSamples = static_cast< int >(iFrameSize);
|
|
memset(pOut, 0, iFrameSize * sizeof(float));
|
|
}
|
|
}
|
|
|
|
if (!nextalive) {
|
|
for (unsigned int i = 0; i < static_cast< unsigned int >(iFrameSizePerChannel); ++i) {
|
|
for (unsigned int s = 0; s < channels; ++s)
|
|
pOut[i * channels + s] *= fFadeOut[i];
|
|
}
|
|
} else if (ts == 0) {
|
|
for (unsigned int i = 0; i < static_cast< unsigned int >(iFrameSizePerChannel); ++i) {
|
|
for (unsigned int s = 0; s < channels; ++s)
|
|
pOut[i * channels + s] *= fFadeIn[i];
|
|
}
|
|
}
|
|
|
|
for (unsigned int i = static_cast< unsigned int >(decodedSamples) / iFrameSize; i > 0; --i) {
|
|
jitter_buffer_tick(jbJitter);
|
|
}
|
|
}
|
|
nextframe:
|
|
if (p && p->bLocalMute) {
|
|
// Overwrite the output with zeros as this user is muted
|
|
// NOTE: If Opus is used, then in this case no samples have actually been decoded and thus
|
|
// we don't discard previously done work (in form of decoding the audio stream) by overwriting
|
|
// it with zeros.
|
|
memset(pOut, 0, static_cast< unsigned int >(decodedSamples) * sizeof(float));
|
|
}
|
|
|
|
spx_uint32_t inlen = static_cast< unsigned int >(decodedSamples) / channels; // per channel
|
|
spx_uint32_t outlen = static_cast< unsigned int >(
|
|
ceilf(static_cast< float >(static_cast< unsigned int >(decodedSamples) / channels * iMixerFreq)
|
|
/ static_cast< float >(iSampleRate)));
|
|
if (srs && bLastAlive) {
|
|
if (channels == 1) {
|
|
speex_resampler_process_float(srs, 0, fResamplerBuffer, &inlen, pfBuffer + iBufferFilled, &outlen);
|
|
} else if (channels == 2) {
|
|
speex_resampler_process_interleaved_float(srs, fResamplerBuffer, &inlen, pfBuffer + iBufferFilled,
|
|
&outlen);
|
|
}
|
|
}
|
|
iBufferFilled += outlen * channels;
|
|
}
|
|
|
|
if (p) {
|
|
Settings::TalkState ts;
|
|
if (!nextalive) {
|
|
m_audioContext = Mumble::Protocol::AudioContext::INVALID;
|
|
}
|
|
|
|
switch (m_audioContext) {
|
|
case Mumble::Protocol::AudioContext::LISTEN:
|
|
// Fallthrough
|
|
case Mumble::Protocol::AudioContext::NORMAL:
|
|
ts = Settings::Talking;
|
|
break;
|
|
case Mumble::Protocol::AudioContext::SHOUT:
|
|
ts = Settings::Shouting;
|
|
break;
|
|
case Mumble::Protocol::AudioContext::INVALID:
|
|
ts = Settings::Passive;
|
|
break;
|
|
case Mumble::Protocol::AudioContext::WHISPER:
|
|
ts = Settings::Whispering;
|
|
break;
|
|
default:
|
|
// Default to normal talking, if we don't know the used context
|
|
ts = Settings::Talking;
|
|
break;
|
|
}
|
|
|
|
if (ts != Settings::Passive && p->bLocalMute) {
|
|
ts = Settings::MutedTalking;
|
|
}
|
|
|
|
p->setTalking(ts);
|
|
}
|
|
|
|
bool tmp = bLastAlive;
|
|
bLastAlive = nextalive;
|
|
return tmp;
|
|
}
|