mumble-voip_mumble/src/mumble/AudioInput.cpp

// Copyright 2007-2023 The Mumble Developers. All rights reserved.
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file at the root of the
// Mumble source tree or at <https://www.mumble.info/LICENSE>.
#include "AudioInput.h"
#include "API.h"
#include "AudioOutput.h"
#include "MainWindow.h"
#include "MumbleProtocol.h"
#include "NetworkConfig.h"
#include "PacketDataStream.h"
#include "PluginManager.h"
#include "ServerHandler.h"
#include "User.h"
#include "Utils.h"
#include "VoiceRecorder.h"
#include "Global.h"
#include <opus.h>
#ifdef USE_RENAMENOISE
extern "C" {
# include "renamenoise.h"
}
#endif
#include <algorithm>
#include <cassert>
#include <exception>
#include <limits>
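// The Resynchronizer pairs each captured microphone chunk with the speaker
// chunk that was being played back at the same time, so the echo canceller
// sees time-aligned signals. The states appear to encode the mic queue depth
// (S0 = 0, S1a/S1b = 1, S2 = 2, S3 = 3, S4a/S4b = 4, S5 = 5), with the
// 'a'/'b' variants presumably distinguishing whether a depth was reached
// while filling or while draining. addMic() pushes a chunk and drops the
// oldest one on overflow; addSpeaker() pops the oldest mic chunk to pair
// with the incoming speaker chunk, or drops the speaker chunk on underflow.
// Either way the queue is steered back towards its nominal depth.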
void Resynchronizer::addMic(short *mic) {
bool drop = false;
{
std::unique_lock< std::mutex > l(m);
micQueue.push_back(mic);
switch (state) {
case S0:
state = S1a;
break;
case S1a:
state = S2;
break;
case S1b:
state = S2;
break;
case S2:
state = S3;
break;
case S3:
state = S4a;
break;
case S4a:
state = S5;
break;
case S4b:
drop = true;
break;
case S5:
drop = true;
break;
}
if (drop) {
delete[] micQueue.front();
micQueue.pop_front();
}
}
if (bDebugPrintQueue) {
if (drop)
qWarning("Resynchronizer::addMic(): dropped microphone chunk due to overflow");
printQueue('+');
}
}
AudioChunk Resynchronizer::addSpeaker(short *speaker) {
AudioChunk result;
bool drop = false;
{
std::unique_lock< std::mutex > l(m);
switch (state) {
case S0:
drop = true;
break;
case S1a:
drop = true;
break;
case S1b:
state = S0;
break;
case S2:
state = S1b;
break;
case S3:
state = S2;
break;
case S4a:
state = S3;
break;
case S4b:
state = S3;
break;
case S5:
state = S4b;
break;
}
if (!drop) {
result = AudioChunk(micQueue.front(), speaker);
micQueue.pop_front();
}
}
if (drop)
delete[] speaker;
if (bDebugPrintQueue) {
if (drop)
qWarning("Resynchronizer::addSpeaker(): dropped speaker chunk due to underflow");
printQueue('-');
}
return result;
}
void Resynchronizer::reset() {
if (bDebugPrintQueue)
qWarning("Resetting echo queue");
std::unique_lock< std::mutex > l(m);
state = S0;
while (!micQueue.empty()) {
delete[] micQueue.front();
micQueue.pop_front();
}
}
Resynchronizer::~Resynchronizer() {
reset();
}
void Resynchronizer::printQueue(char who) {
unsigned int mic;
{
std::unique_lock< std::mutex > l(m);
mic = static_cast< unsigned int >(micQueue.size());
}
std::string line;
line.reserve(32);
line += who;
line += " Echo queue [";
for (unsigned int i = 0; i < 5; i++)
line += i < mic ? '#' : ' ';
line += "]\r";
// This relies on \r to rewrite the same line on every call, so we can't use qWarning
printf("%s", line.c_str());
fflush(stdout);
}
// Remember that we cannot use non-pointer static member objects, as the
// AudioInputRegistrar() constructor runs during global initialization and
// might therefore be called before they are initialized.
// Hence, we allocate upon first call.
QMap< QString, AudioInputRegistrar * > *AudioInputRegistrar::qmNew;
QString AudioInputRegistrar::current = QString();
AudioInputRegistrar::AudioInputRegistrar(const QString &n, int p) : name(n), priority(p), echoOptions() {
if (!qmNew)
qmNew = new QMap< QString, AudioInputRegistrar * >();
qmNew->insert(name, this);
}
AudioInputRegistrar::~AudioInputRegistrar() {
qmNew->remove(name);
}
AudioInputPtr AudioInputRegistrar::newFromChoice(QString choice) {
if (!qmNew)
return AudioInputPtr();
if (!choice.isEmpty() && qmNew->contains(choice)) {
Global::get().s.qsAudioInput = choice;
current = choice;
return AudioInputPtr(qmNew->value(current)->create());
}
choice = Global::get().s.qsAudioInput;
if (qmNew->contains(choice)) {
current = choice;
return AudioInputPtr(qmNew->value(choice)->create());
}
AudioInputRegistrar *r = nullptr;
foreach (AudioInputRegistrar *air, *qmNew)
if (!r || (air->priority > r->priority))
r = air;
if (r) {
current = r->name;
return AudioInputPtr(r->create());
}
return AudioInputPtr();
}
bool AudioInputRegistrar::canExclusive() const {
return false;
}
bool AudioInputRegistrar::isMicrophoneAccessDeniedByOS() {
return false;
}
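// The Opus staging buffer is sized for one full packet's worth of audio:
// iFramesPerPacket frames of SAMPLE_RATE / 100 samples (i.e. 10 ms) each.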
AudioInput::AudioInput()
: opusBuffer(static_cast< std::size_t >(Global::get().s.iFramesPerPacket * (SAMPLE_RATE / 100))) {
bDebugDumpInput = Global::get().bDebugDumpInput;
resync.bDebugPrintQueue = Global::get().bDebugPrintQueue;
if (bDebugDumpInput) {
outMic.open("raw_microphone_dump", std::ios::binary);
outSpeaker.open("speaker_dump", std::ios::binary);
outProcessed.open("processed_microphone_dump", std::ios::binary);
}
adjustBandwidth(Global::get().iMaxBandwidth, iAudioQuality, iAudioFrames, bAllowLowDelay);
Global::get().iAudioBandwidth = getNetworkBandwidth(iAudioQuality, iAudioFrames);
m_codec = Mumble::Protocol::AudioCodec::Opus;
activityState = ActivityStateActive;
opusState = nullptr;
if (bAllowLowDelay && iAudioQuality >= 64000) { // >= 64 kbit/s bitrate and low delay allowed
opusState = opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_RESTRICTED_LOWDELAY, nullptr);
qWarning("AudioInput: Opus encoder set for low delay");
} else if (iAudioQuality >= 32000) { // >= 32 kbit/s bitrate
opusState = opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_AUDIO, nullptr);
qWarning("AudioInput: Opus encoder set for high quality speech");
} else {
opusState = opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_VOIP, nullptr);
qWarning("AudioInput: Opus encoder set for low quality speech");
}
opus_encoder_ctl(opusState, OPUS_SET_VBR(0)); // CBR
#ifdef USE_RENAMENOISE
denoiseState = renamenoise_create(nullptr);
#endif
qWarning("AudioInput: %d bits/s, %d hz, %d sample", iAudioQuality, iSampleRate, iFrameSize);
iEchoFreq = iMicFreq = iSampleRate;
iFrameCounter = 0;
iSilentFrames = 0;
iHoldFrames = 0;
iBufferedFrames = 0;
bUserIsMuted = false;
bResetProcessor = true;
bEchoMulti = false;
sppPreprocess = nullptr;
sesEcho = nullptr;
srsMic = srsEcho = nullptr;
iEchoChannels = iMicChannels = 0;
iEchoFilled = iMicFilled = 0;
eMicFormat = eEchoFormat = SampleFloat;
iMicSampleSize = iEchoSampleSize = 0;
bPreviousVoice = false;
bResetEncoder = true;
pfMicInput = pfEchoInput = nullptr;
iBitrate = 0;
dPeakSignal = dPeakSpeaker = dPeakMic = dPeakCleanMic = 0.0;
if (Global::get().uiSession) {
setMaxBandwidth(Global::get().iMaxBandwidth);
}
bRunning = true;
connect(this, SIGNAL(doDeaf()), Global::get().mw->qaAudioDeaf, SLOT(trigger()), Qt::QueuedConnection);
connect(this, SIGNAL(doMute()), Global::get().mw->qaAudioMute, SLOT(trigger()), Qt::QueuedConnection);
connect(this, SIGNAL(doMuteCue()), Global::get().mw, SLOT(on_muteCuePopup_triggered()));
}
AudioInput::~AudioInput() {
bRunning = false;
wait();
if (opusState) {
opus_encoder_destroy(opusState);
}
#ifdef USE_RENAMENOISE
if (denoiseState) {
renamenoise_destroy(denoiseState);
}
#endif
if (sppPreprocess)
speex_preprocess_state_destroy(sppPreprocess);
if (sesEcho)
speex_echo_state_destroy(sesEcho);
if (srsMic)
speex_resampler_destroy(srsMic);
if (srsEcho)
speex_resampler_destroy(srsEcho);
delete[] pfMicInput;
delete[] pfEchoInput;
}
bool AudioInput::isTransmitting() const {
return bPreviousVoice;
}
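// The IN_MIXER_* macros below stamp out one downmix function per channel
// count (1 through 8, plus a generic N-channel fallback). Each generated
// function averages the interleaved input channels into a mono float
// buffer; the short variants additionally rescale 16-bit PCM into the
// [-1, 1] float range. The *Mask variants instead average only the
// channels selected by the bitmask, for devices where some channels carry
// no microphone signal.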
#define IN_MIXER_FLOAT(channels) \
static void inMixerFloat##channels(float *RESTRICT buffer, const void *RESTRICT ipt, unsigned int nsamp, \
unsigned int N, quint64 mask) { \
const float *RESTRICT input = reinterpret_cast< const float * >(ipt); \
const float m = 1.0f / static_cast< float >(channels); \
Q_UNUSED(N); \
Q_UNUSED(mask); \
for (unsigned int i = 0; i < nsamp; ++i) { \
float v = 0.0f; \
for (unsigned int j = 0; j < channels; ++j) \
v += input[i * channels + j]; \
buffer[i] = v * m; \
} \
}
#define IN_MIXER_SHORT(channels) \
static void inMixerShort##channels(float *RESTRICT buffer, const void *RESTRICT ipt, unsigned int nsamp, \
unsigned int N, quint64 mask) { \
const short *RESTRICT input = reinterpret_cast< const short * >(ipt); \
const float m = 1.0f / (32768.f * static_cast< float >(channels)); \
Q_UNUSED(N); \
Q_UNUSED(mask); \
for (unsigned int i = 0; i < nsamp; ++i) { \
float v = 0.0f; \
for (unsigned int j = 0; j < channels; ++j) \
v += static_cast< float >(input[i * channels + j]); \
buffer[i] = v * m; \
} \
}
static void inMixerFloatMask(float *RESTRICT buffer, const void *RESTRICT ipt, unsigned int nsamp, unsigned int N,
quint64 mask) {
const float *RESTRICT input = reinterpret_cast< const float * >(ipt);
unsigned int chancount = 0;
static std::vector< unsigned int > chanindex;
chanindex.resize(N);
for (unsigned int j = 0; j < N; ++j) {
if ((mask & (1ULL << j)) == 0) {
continue;
}
chanindex[chancount] = j; // Use chancount as index into chanindex.
++chancount;
}
const float m = 1.0f / static_cast< float >(chancount);
for (unsigned int i = 0; i < nsamp; ++i) {
float v = 0.0f;
for (unsigned int j = 0; j < chancount; ++j) {
v += input[i * N + chanindex[j]];
}
buffer[i] = v * m;
}
}
static void inMixerShortMask(float *RESTRICT buffer, const void *RESTRICT ipt, unsigned int nsamp, unsigned int N,
quint64 mask) {
const short *RESTRICT input = reinterpret_cast< const short * >(ipt);
unsigned int chancount = 0;
static std::vector< unsigned int > chanindex;
chanindex.resize(N);
for (unsigned int j = 0; j < N; ++j) {
if ((mask & (1ULL << j)) == 0) {
continue;
}
chanindex[chancount] = j; // Use chancount as index into chanindex.
++chancount;
}
const float m = 1.0f / static_cast< float >(chancount);
for (unsigned int i = 0; i < nsamp; ++i) {
float v = 0.0f;
for (unsigned int j = 0; j < chancount; ++j) {
v += static_cast< float >(input[i * N + chanindex[j]]);
}
buffer[i] = v * m;
}
}
IN_MIXER_FLOAT(1)
IN_MIXER_FLOAT(2)
IN_MIXER_FLOAT(3)
IN_MIXER_FLOAT(4)
IN_MIXER_FLOAT(5)
IN_MIXER_FLOAT(6)
IN_MIXER_FLOAT(7)
IN_MIXER_FLOAT(8)
IN_MIXER_FLOAT(N)
IN_MIXER_SHORT(1)
IN_MIXER_SHORT(2)
IN_MIXER_SHORT(3)
IN_MIXER_SHORT(4)
IN_MIXER_SHORT(5)
IN_MIXER_SHORT(6)
IN_MIXER_SHORT(7)
IN_MIXER_SHORT(8)
IN_MIXER_SHORT(N)
#undef IN_MIXER_FLOAT
#undef IN_MIXER_SHORT
AudioInput::inMixerFunc AudioInput::chooseMixer(const unsigned int nchan, SampleFormat sf, quint64 chanmask) {
inMixerFunc r = nullptr;
if (chanmask != 0xffffffffffffffffULL) {
if (sf == SampleFloat) {
r = inMixerFloatMask;
} else if (sf == SampleShort) {
r = inMixerShortMask;
}
return r;
}
if (sf == SampleFloat) {
switch (nchan) {
case 1:
r = inMixerFloat1;
break;
case 2:
r = inMixerFloat2;
break;
case 3:
r = inMixerFloat3;
break;
case 4:
r = inMixerFloat4;
break;
case 5:
r = inMixerFloat5;
break;
case 6:
r = inMixerFloat6;
break;
case 7:
r = inMixerFloat7;
break;
case 8:
r = inMixerFloat8;
break;
default:
r = inMixerFloatN;
break;
}
} else {
switch (nchan) {
case 1:
r = inMixerShort1;
break;
case 2:
r = inMixerShort2;
break;
case 3:
r = inMixerShort3;
break;
case 4:
r = inMixerShort4;
break;
case 5:
r = inMixerShort5;
break;
case 6:
r = inMixerShort6;
break;
case 7:
r = inMixerShort7;
break;
case 8:
r = inMixerShort8;
break;
default:
r = inMixerShortN;
break;
}
}
return r;
}
void AudioInput::initializeMixer() {
int err;
if (srsMic)
speex_resampler_destroy(srsMic);
if (srsEcho)
speex_resampler_destroy(srsEcho);
delete[] pfMicInput;
delete[] pfEchoInput;
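// Rebuild the mic-side resampler if the device rate differs from the codec
// rate. The Speex resampler quality argument of 3 is a low-to-mid setting,
// presumably chosen to keep CPU usage down for a voice signal.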
if (iMicFreq != iSampleRate)
srsMic = speex_resampler_init(1, iMicFreq, iSampleRate, 3, &err);
iMicLength = (iFrameSize * iMicFreq) / iSampleRate;
pfMicInput = new float[iMicLength];
if (iEchoChannels > 0) {
bEchoMulti = (Global::get().s.echoOption == EchoCancelOptionID::SPEEX_MULTICHANNEL);
if (iEchoFreq != iSampleRate)
srsEcho = speex_resampler_init(bEchoMulti ? iEchoChannels : 1, iEchoFreq, iSampleRate, 3, &err);
iEchoLength = (iFrameSize * iEchoFreq) / iSampleRate;
iEchoMCLength = bEchoMulti ? iEchoLength * iEchoChannels : iEchoLength;
iEchoFrameSize = bEchoMulti ? iFrameSize * iEchoChannels : iFrameSize;
pfEchoInput = new float[iEchoMCLength];
} else {
srsEcho = nullptr;
pfEchoInput = nullptr;
}
uiMicChannelMask = Global::get().s.uiAudioInputChannelMask;
// There is no channel mask setting for the echo canceller, so allow all channels.
uiEchoChannelMask = 0xffffffffffffffffULL;
imfMic = chooseMixer(iMicChannels, eMicFormat, uiMicChannelMask);
imfEcho = chooseMixer(iEchoChannels, eEchoFormat, uiEchoChannelMask);
iMicSampleSize =
static_cast< unsigned int >(iMicChannels * ((eMicFormat == SampleFloat) ? sizeof(float) : sizeof(short)));
iEchoSampleSize =
static_cast< unsigned int >(iEchoChannels * ((eEchoFormat == SampleFloat) ? sizeof(float) : sizeof(short)));
bResetProcessor = true;
qWarning("AudioInput: Initialized mixer for %d channel %d hz mic and %d channel %d hz echo", iMicChannels, iMicFreq,
iEchoChannels, iEchoFreq);
if (uiMicChannelMask != 0xffffffffffffffffULL) {
qWarning("AudioInput: using mic channel mask 0x%llx", static_cast< unsigned long long >(uiMicChannelMask));
}
}
void AudioInput::addMic(const void *data, unsigned int nsamp) {
while (nsamp > 0) {
// Make sure we don't overrun the frame buffer
const unsigned int left = qMin(nsamp, iMicLength - iMicFilled);
// Mix into the pfMicInput frame buffer (converts 16-bit PCM to float if necessary)
imfMic(pfMicInput + iMicFilled, data, left, iMicChannels, uiMicChannelMask);
iMicFilled += left;
nsamp -= left;
// If samples remain, advance the data pointer to the first unconsumed sample for the next iteration
if (nsamp > 0) {
if (eMicFormat == SampleFloat)
data = reinterpret_cast< const float * >(data) + left * iMicChannels;
else
data = reinterpret_cast< const short * >(data) + left * iMicChannels;
}
if (iMicFilled == iMicLength) {
// Frame complete
iMicFilled = 0;
// If needed resample frame
float *pfOutput = srsMic ? (float *) alloca(iFrameSize * sizeof(float)) : nullptr;
float *ptr = srsMic ? pfOutput : pfMicInput;
if (srsMic) {
spx_uint32_t inlen = iMicLength;
spx_uint32_t outlen = iFrameSize;
speex_resampler_process_float(srsMic, 0, pfMicInput, &inlen, pfOutput, &outlen);
}
// If echo cancellation is enabled the pointer ends up in the resynchronizer queue
// and may need to outlive this function's frame
short *psMic = iEchoChannels > 0 ? new short[iFrameSize] : (short *) alloca(iFrameSize * sizeof(short));
// Convert float to 16bit PCM
const float mul = 32768.f;
for (int j = 0; j < iFrameSize; ++j)
psMic[j] = static_cast< short >(qBound(-32768.f, (ptr[j] * mul), 32767.f));
// If we have echo cancellation enabled...
if (iEchoChannels > 0) {
resync.addMic(psMic);
} else {
encodeAudioFrame(AudioChunk(psMic));
}
}
}
}
void AudioInput::addEcho(const void *data, unsigned int nsamp) {
while (nsamp > 0) {
// Make sure we don't overrun the echo frame buffer
const unsigned int left = qMin(nsamp, iEchoLength - iEchoFilled);
if (bEchoMulti) {
const unsigned int samples = left * iEchoChannels;
if (eEchoFormat == SampleFloat) {
for (unsigned int i = 0; i < samples; ++i)
pfEchoInput[i + iEchoFilled * iEchoChannels] = reinterpret_cast< const float * >(data)[i];
} else {
// 16bit PCM -> float
for (unsigned int i = 0; i < samples; ++i)
pfEchoInput[i + iEchoFilled * iEchoChannels] =
static_cast< float >(reinterpret_cast< const short * >(data)[i]) * (1.0f / 32768.f);
}
} else {
// Mix echo channels (converts 16bit PCM -> float if needed)
imfEcho(pfEchoInput + iEchoFilled, data, left, iEchoChannels, uiEchoChannelMask);
}
iEchoFilled += left;
nsamp -= left;
// If samples remain, advance the data pointer to the first unconsumed sample for the next iteration
if (nsamp > 0) {
if (eEchoFormat == SampleFloat)
data = reinterpret_cast< const float * >(data) + left * iEchoChannels;
else
data = reinterpret_cast< const short * >(data) + left * iEchoChannels;
}
if (iEchoFilled == iEchoLength) {
// Frame complete
iEchoFilled = 0;
// Resample if necessary
float *pfOutput = srsEcho ? (float *) alloca(iEchoFrameSize * sizeof(float)) : nullptr;
float *ptr = srsEcho ? pfOutput : pfEchoInput;
if (srsEcho) {
spx_uint32_t inlen = iEchoLength;
spx_uint32_t outlen = iFrameSize;
speex_resampler_process_interleaved_float(srsEcho, pfEchoInput, &inlen, pfOutput, &outlen);
}
short *outbuff = new short[iEchoFrameSize];
// float -> 16bit PCM
const float mul = 32768.f;
for (unsigned int j = 0; j < iEchoFrameSize; ++j) {
outbuff[j] = static_cast< short >(qBound(-32768.f, (ptr[j] * mul), 32767.f));
}
auto chunk = resync.addSpeaker(outbuff);
if (!chunk.empty()) {
encodeAudioFrame(chunk);
delete[] chunk.mic;
delete[] chunk.speaker;
}
}
}
}
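// Fits the configured quality into the server's bandwidth limit in two
// steps: first pack more 10 ms frames into each packet (amortizing the
// fixed per-packet overhead), then, if that is still not enough, lower the
// bitrate in 1 kbit/s steps down to a floor of 8 kbit/s.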
void AudioInput::adjustBandwidth(int bitspersec, int &bitrate, int &frames, bool &allowLowDelay) {
frames = Global::get().s.iFramesPerPacket;
bitrate = Global::get().s.iQuality;
allowLowDelay = Global::get().s.bAllowLowDelay;
if (bitspersec == -1) {
// No limit
} else {
if (getNetworkBandwidth(bitrate, frames) > bitspersec) {
if ((frames <= 4) && (bitspersec <= 32000))
frames = 4;
else if ((frames == 1) && (bitspersec <= 64000))
frames = 2;
else if ((frames == 2) && (bitspersec <= 48000))
frames = 4;
if (getNetworkBandwidth(bitrate, frames) > bitspersec) {
do {
bitrate -= 1000;
} while ((bitrate > 8000) && (getNetworkBandwidth(bitrate, frames) > bitspersec));
}
}
}
if (bitrate <= 8000)
bitrate = 8000;
}
void AudioInput::setMaxBandwidth(int bitspersec) {
if (bitspersec == Global::get().iMaxBandwidth)
return;
int frames;
int bitrate;
bool allowLowDelay;
adjustBandwidth(bitspersec, bitrate, frames, allowLowDelay);
Global::get().iMaxBandwidth = bitspersec;
if (bitspersec != -1) {
if ((bitrate != Global::get().s.iQuality) || (frames != Global::get().s.iFramesPerPacket))
Global::get().mw->msgBox(
tr("Server maximum network bandwidth is only %1 kbit/s. Audio quality auto-adjusted to %2 "
"kbit/s (%3 ms)")
.arg(bitspersec / 1000)
.arg(bitrate / 1000)
.arg(frames * 10));
}
AudioInputPtr ai = Global::get().ai;
if (ai) {
Global::get().iAudioBandwidth = getNetworkBandwidth(bitrate, frames);
ai->iAudioQuality = bitrate;
ai->iAudioFrames = frames;
ai->bAllowLowDelay = allowLowDelay;
return;
}
ai.reset();
Audio::stopInput();
Audio::startInput();
}
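// Estimates the total network rate in bit/s for a given codec bitrate and
// frames-per-packet count. The fixed per-packet byte overhead is presumed
// to break down as IP (20) + UDP (8) + crypto (4) + packet type/target (1)
// + sequence number (2), plus optional positional data and TCP tunneling,
// plus one length byte per frame. With 100 frames captured per second we
// send 100 / frames packets per second, so multiplying the byte overhead
// by (800 / frames) converts it to bit/s. For example, with bitrate =
// 40000 and frames = 2 over plain UDP without positional data the overhead
// is (20 + 8 + 4 + 1 + 2 + 2) * 400 = 14800 bit/s, giving 54800 bit/s.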
int AudioInput::getNetworkBandwidth(int bitrate, int frames) {
int overhead = 20 + 8 + 4 + 1 + 2 + (Global::get().s.bTransmitPosition ? 12 : 0)
+ (NetworkConfig::TcpModeEnabled() ? 12 : 0) + frames;
overhead *= (800 / frames);
int bw = overhead + bitrate;
return bw;
}
void AudioInput::resetAudioProcessor() {
if (!bResetProcessor)
return;
int iArg;
if (sppPreprocess)
speex_preprocess_state_destroy(sppPreprocess);
if (sesEcho)
speex_echo_state_destroy(sesEcho);
sppPreprocess = speex_preprocess_state_init(iFrameSize, iSampleRate);
resync.reset();
selectNoiseCancel();
iArg = 1;
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_VAD, &iArg);
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC, &iArg);
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_DEREVERB, &iArg);
iArg = 30000;
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_TARGET, &iArg);
float v = 30000.0f / static_cast< float >(Global::get().s.iMinLoudness);
iArg = static_cast< int >(floorf(20.0f * log10f(v)));
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_MAX_GAIN, &iArg);
iArg = -60;
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_DECREMENT, &iArg);
if (noiseCancel == Settings::NoiseCancelSpeex) {
iArg = Global::get().s.iSpeexNoiseCancelStrength;
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &iArg);
}
if (iEchoChannels > 0) {
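// Size the echo filter tail as ten 10 ms frames (~100 ms) plus the
// resynchronizer's nominal queue lag, presumably so the canceller can
// model the full playback-to-capture delay.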
int filterSize = iFrameSize * (10 + resync.getNominalLag());
sesEcho =
speex_echo_state_init_mc(iFrameSize, filterSize, 1, bEchoMulti ? static_cast< int >(iEchoChannels) : 1);
iArg = iSampleRate;
speex_echo_ctl(sesEcho, SPEEX_ECHO_SET_SAMPLING_RATE, &iArg);
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_ECHO_STATE, sesEcho);
qWarning("AudioInput: ECHO CANCELLER ACTIVE");
} else {
sesEcho = nullptr;
}
bResetEncoder = true;
bResetProcessor = false;
}
bool AudioInput::selectCodec() {
// We only ever use Opus
Mumble::Protocol::AudioCodec previousCodec = m_codec;
assert(previousCodec == Mumble::Protocol::AudioCodec::Opus);
m_codec = Mumble::Protocol::AudioCodec::Opus;
if (m_codec != previousCodec) {
iBufferedFrames = 0;
qlFrames.clear();
opusBuffer.clear();
}
return true;
}
void AudioInput::selectNoiseCancel() {
noiseCancel = Global::get().s.noiseCancelMode;
if (noiseCancel == Settings::NoiseCancelRNN || noiseCancel == Settings::NoiseCancelBoth) {
#ifdef USE_RENAMENOISE
if (!denoiseState || iFrameSize != 480) {
qWarning("AudioInput: Ignoring request to enable ReNameNoise: internal error");
noiseCancel = Settings::NoiseCancelSpeex;
}
#else
qWarning("AudioInput: Ignoring request to enable ReNameNoise: Mumble was built without support for it");
noiseCancel = Settings::NoiseCancelSpeex;
#endif
}
int iArg = 0;
switch (noiseCancel) {
case Settings::NoiseCancelOff:
qWarning("AudioInput: Noise canceller disabled");
break;
case Settings::NoiseCancelSpeex:
qWarning("AudioInput: Using Speex as noise canceller");
iArg = 1;
break;
case Settings::NoiseCancelRNN:
qWarning("AudioInput: Using ReNameNoise as noise canceller");
break;
case Settings::NoiseCancelBoth:
iArg = 1;
qWarning("AudioInput: Using ReNameNoise and Speex as noise canceller");
break;
}
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_DENOISE, &iArg);
}
int AudioInput::encodeOpusFrame(short *source, int size, EncodingOutputBuffer &buffer) {
int len;
if (bResetEncoder) {
opus_encoder_ctl(opusState, OPUS_RESET_STATE, nullptr);
bResetEncoder = false;
}
opus_encoder_ctl(opusState, OPUS_SET_BITRATE(iAudioQuality));
len = opus_encode(opusState, source, size, &buffer[0], static_cast< opus_int32 >(buffer.size()));
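// Derive the effective wire bitrate: len bytes cover size / iFrameSize
// frames of 10 ms each, so (bytes per 10 ms) * 100 * 8 yields bit/s.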
const int tenMsFrameCount = (size / iFrameSize);
iBitrate = (len * 100 * 8) / tenMsFrameCount;
return len;
}
void AudioInput::encodeAudioFrame(AudioChunk chunk) {
int iArg;
float sum;
short max;
short *psSource;
iFrameCounter++;
// As Global::get().iTarget is not protected by any locks, we avoid race conditions by
// copying it once at this point and sticking to whatever value it holds here. Thus
// if the value of Global::get().iTarget changes during the execution of this function,
// it won't cause any inconsistencies, and the change is reflected once this
// function is called again.
std::int32_t voiceTargetID = Global::get().iTarget;
if (!bRunning)
return;
sum = 1.0f;
max = 1;
for (unsigned int i = 0; i < iFrameSize; i++) {
sum += static_cast< float >(chunk.mic[i] * chunk.mic[i]);
max = std::max(static_cast< short >(abs(chunk.mic[i])), max);
}
dPeakMic = qMax(20.0f * log10f(sqrtf(sum / static_cast< float >(iFrameSize)) / 32768.0f), -96.0f);
dMaxMic = max;
if (chunk.speaker && (iEchoChannels > 0)) {
sum = 1.0f;
for (unsigned int i = 0; i < iEchoFrameSize; ++i) {
sum += static_cast< float >(chunk.speaker[i] * chunk.speaker[i]);
}
dPeakSpeaker = qMax(20.0f * log10f(sqrtf(sum / static_cast< float >(iFrameSize)) / 32768.0f), -96.0f);
} else {
dPeakSpeaker = 0.0;
}
QMutexLocker l(&qmSpeex);
resetAudioProcessor();
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_GET_AGC_GAIN, &iArg);
float gainValue = static_cast< float >(iArg);
if (noiseCancel == Settings::NoiseCancelSpeex || noiseCancel == Settings::NoiseCancelBoth) {
iArg = Global::get().s.iSpeexNoiseCancelStrength - iArg;
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &iArg);
}
short psClean[iFrameSize];
if (sesEcho && chunk.speaker) {
speex_echo_cancellation(sesEcho, chunk.mic, chunk.speaker, psClean);
psSource = psClean;
} else {
psSource = chunk.mic;
}
#ifdef USE_RENAMENOISE
// At the time of writing this code, ReNameNoise only supports a sample rate of 48000 Hz.
if (noiseCancel == Settings::NoiseCancelRNN || noiseCancel == Settings::NoiseCancelBoth) {
float denoiseFrames[480];
for (unsigned int i = 0; i < 480; i++) {
denoiseFrames[i] = psSource[i];
}
renamenoise_process_frame_clamped(denoiseState, psSource, denoiseFrames);
}
#endif
speex_preprocess_run(sppPreprocess, psSource);
sum = 1.0f;
for (unsigned int i = 0; i < iFrameSize; i++)
sum += static_cast< float >(psSource[i] * psSource[i]);
float micLevel = sqrtf(sum / static_cast< float >(iFrameSize));
dPeakSignal = qMax(20.0f * log10f(micLevel / 32768.0f), -96.0f);
if (bDebugDumpInput) {
outMic.write(reinterpret_cast< const char * >(chunk.mic), iFrameSize * sizeof(short));
if (chunk.speaker) {
outSpeaker.write(reinterpret_cast< const char * >(chunk.speaker),
static_cast< std::streamsize >(iEchoFrameSize * sizeof(short)));
}
outProcessed.write(reinterpret_cast< const char * >(psSource),
static_cast< std::streamsize >(iFrameSize * sizeof(short)));
}
spx_int32_t prob = 0;
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_GET_PROB, &prob);
fSpeechProb = static_cast< float >(prob) / 100.0f;
// clean microphone level: peak of filtered signal attenuated by AGC gain
dPeakCleanMic = qMax(dPeakSignal - gainValue, -96.0f);
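// Map the chosen VAD metric into [0, 1]: either the Speex speech
// probability directly, or the clean-mic peak rescaled from its
// [-96 dB, 0 dB] range via 1 + dPeakCleanMic / 96.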
float level = (Global::get().s.vsVAD == Settings::SignalToNoise) ? fSpeechProb : (1.0f + dPeakCleanMic / 96.0f);
bool bIsSpeech = false;
if (level > Global::get().s.fVADmax) {
// Voice-activation threshold has been reached
bIsSpeech = true;
} else if (level > Global::get().s.fVADmin && bPreviousVoice) {
// Voice-deactivation threshold has not yet been reached
bIsSpeech = true;
}
if (!bIsSpeech) {
iHoldFrames++;
if (iHoldFrames < Global::get().s.iVoiceHold)
// Hold mic open until iVoiceHold threshold is reached
bIsSpeech = true;
} else {
iHoldFrames = 0;
}
// If Global::get().iPushToTalk > 0, we are currently in some sort of PTT action;
// for instance, we could be whispering.
bool isPTT = Global::get().iPushToTalk > 0;
if (Global::get().s.atTransmit == Settings::Continuous
|| API::PluginData::get().overwriteMicrophoneActivation.load()) {
// Continuous transmission is enabled
bIsSpeech = true;
} else if (Global::get().s.atTransmit == Settings::PushToTalk) {
// PTT is enabled, so check if it is currently active
bool doublePush = Global::get().s.uiDoublePush > 0
&& ((Global::get().uiDoublePush < Global::get().s.uiDoublePush)
|| (Global::get().tDoublePush.elapsed() < Global::get().s.uiDoublePush));
// With double push enabled, we might be in a PTT state without pressing any PTT key
isPTT = isPTT || doublePush;
bIsSpeech = isPTT;
}
bIsSpeech = bIsSpeech || isPTT;
ClientUser *p = ClientUser::get(Global::get().uiSession);
bool bTalkingWhenMuted = false;
if (Global::get().s.bMute || ((Global::get().s.lmLoopMode != Settings::Local) && p && (p->bMute || p->bSuppress))
|| Global::get().bPushToMute || (voiceTargetID < 0)) {
bTalkingWhenMuted = bIsSpeech;
bIsSpeech = false;
}
if (bIsSpeech) {
iSilentFrames = 0;
} else {
iSilentFrames++;
if (iSilentFrames > 500)
iFrameCounter = 0;
}
if (p) {
if (!bIsSpeech)
p->setTalking(Settings::Passive);
else if (voiceTargetID == Mumble::Protocol::ReservedTargetIDs::REGULAR_SPEECH)
p->setTalking(Settings::Talking);
else
p->setTalking(Settings::Shouting);
}
if (Global::get().uiSession != 0) {
AudioOutputPtr ao = Global::get().ao;
if (ao) {
const bool treatAsPTT = isPTT || previousPTT;
const bool audioCueEnabledPTT = Global::get().s.audioCueEnabledPTT && treatAsPTT;
const bool audioCueEnabledVAD =
Global::get().s.audioCueEnabledVAD && Global::get().s.atTransmit == Settings::VAD && !treatAsPTT;
const bool audioCueEnabled = audioCueEnabledPTT || audioCueEnabledVAD;
const bool playAudioOnCue = bIsSpeech && !bPreviousVoice && audioCueEnabled;
const bool playAudioOffCue = !bIsSpeech && bPreviousVoice && audioCueEnabled;
const bool stopActiveCue = m_activeAudioCue && (playAudioOnCue || playAudioOffCue);
if (stopActiveCue) {
// Cancel active cue first, if there is any
ao->removeToken(m_activeAudioCue);
}
if (playAudioOnCue) {
m_activeAudioCue = ao->playSample(Global::get().s.qsTxAudioCueOn, Global::get().s.cueVolume);
} else if (playAudioOffCue) {
m_activeAudioCue = ao->playSample(Global::get().s.qsTxAudioCueOff, Global::get().s.cueVolume);
}
if (Global::get().s.bTxMuteCue && !Global::get().bPushToMute && !Global::get().s.bDeaf
&& bTalkingWhenMuted) {
if (!qetLastMuteCue.isValid() || qetLastMuteCue.elapsed() > MUTE_CUE_DELAY) {
qetLastMuteCue.start();
ao->playSample(Global::get().s.qsTxMuteCue, Global::get().s.cueVolume);
emit doMuteCue();
}
}
}
}
if (!bIsSpeech && !bPreviousVoice) {
iBitrate = 0;
if ((tIdle.elapsed() / 1000000ULL) > Global::get().s.iIdleTime) {
activityState = ActivityStateIdle;
tIdle.restart();
if (Global::get().s.iaeIdleAction == Settings::Deafen && !Global::get().s.bDeaf) {
emit doDeaf();
} else if (Global::get().s.iaeIdleAction == Settings::Mute && !Global::get().s.bMute) {
emit doMute();
}
}
if (activityState == ActivityStateReturnedFromIdle) {
activityState = ActivityStateActive;
if (Global::get().s.iaeIdleAction != Settings::Nothing && Global::get().s.bUndoIdleActionUponActivity) {
if (Global::get().s.iaeIdleAction == Settings::Deafen && Global::get().s.bDeaf) {
emit doDeaf();
} else if (Global::get().s.iaeIdleAction == Settings::Mute && Global::get().s.bMute) {
emit doMute();
}
}
}
spx_int32_t increment = 0;
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &increment);
return;
} else {
spx_int32_t increment = 12;
speex_preprocess_ctl(sppPreprocess, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &increment);
}
if (bIsSpeech && !bPreviousVoice) {
bResetEncoder = true;
}
tIdle.restart();
EncodingOutputBuffer buffer;
Q_ASSERT(buffer.size() >= static_cast< size_t >(iAudioQuality / 100 * iAudioFrames / 8));
emit audioInputEncountered(psSource, iFrameSize, iMicChannels, SAMPLE_RATE, bIsSpeech);
int len = 0;
bool encoded = true;
if (!selectCodec())
return;
assert(m_codec == Mumble::Protocol::AudioCodec::Opus);
// Encode via Opus
encoded = false;
opusBuffer.insert(opusBuffer.end(), psSource, psSource + iFrameSize);
++iBufferedFrames;
if (!bIsSpeech || iBufferedFrames >= iAudioFrames) {
if (iBufferedFrames < iAudioFrames) {
// Pad the buffer to a full packet if speech ends before we have enough
// audio; this way we are guaranteed a valid frame count and won't cause
// a codec configuration switch by suddenly using a wildly different
// frame count per packet.
const int missingFrames = iAudioFrames - iBufferedFrames;
opusBuffer.insert(opusBuffer.end(), static_cast< std::size_t >(iFrameSize * missingFrames), 0);
iBufferedFrames += missingFrames;
iFrameCounter += missingFrames;
}
Q_ASSERT(iBufferedFrames == iAudioFrames);
len = encodeOpusFrame(&opusBuffer[0], iBufferedFrames * iFrameSize, buffer);
opusBuffer.clear();
if (len <= 0) {
iBitrate = 0;
qWarning() << "encodeOpusFrame failed" << iBufferedFrames << iFrameSize << len;
iBufferedFrames = 0; // These are lost. Make sure not to mess up our sequence counter next flushCheck.
return;
}
encoded = true;
}
if (encoded) {
flushCheck(QByteArray(reinterpret_cast< char * >(&buffer[0]), len), !bIsSpeech, voiceTargetID);
}
if (!bIsSpeech)
iBitrate = 0;
bPreviousVoice = bIsSpeech;
previousPTT = isPTT;
}
static void sendAudioFrame(gsl::span< const Mumble::Protocol::byte > encodedPacket) {
ServerHandlerPtr sh = Global::get().sh;
if (sh) {
sh->sendMessage(encodedPacket.data(), static_cast< int >(encodedPacket.size()));
}
}
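// Accumulates encoded frames and sends a packet once either a full
// packet's worth of frames is buffered or the transmission is being
// terminated.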
void AudioInput::flushCheck(const QByteArray &frame, bool terminator, std::int32_t voiceTargetID) {
qlFrames << frame;
if (!terminator && iBufferedFrames < iAudioFrames)
return;
Mumble::Protocol::AudioData audioData;
audioData.targetOrContext = static_cast< std::uint32_t >(voiceTargetID);
audioData.isLastFrame = terminator;
if (terminator && Global::get().iPrevTarget > 0) {
// If we have been whispering to some target but have just stopped, terminator will be true. In
// the case of whispering this means we just released the whisper key, so this is the last audio
// frame sent for this whisper. Releasing the whisper key resets Global::get().iTarget to 0, so in
// order to address the last whisper frame correctly, we have to use Global::get().iPrevTarget,
// which holds whatever Global::get().iTarget was before its last change.
audioData.targetOrContext = static_cast< std::uint32_t >(Global::get().iPrevTarget);
// We reset Global::get().iPrevTarget as it has fulfilled its purpose for this whisper action. It'll be
// set accordingly the next time the client whispers.
Global::get().iPrevTarget = 0;
}
if (Global::get().s.lmLoopMode == Settings::Server) {
audioData.targetOrContext = Mumble::Protocol::ReservedTargetIDs::SERVER_LOOPBACK;
}
audioData.usedCodec = m_codec;
int frames = iBufferedFrames;
iBufferedFrames = 0;
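// The packet's frame number is the running frame counter rewound to the
// first frame contained in this packet.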
audioData.frameNumber = static_cast< std::size_t >(iFrameCounter - frames);
if (Global::get().s.bTransmitPosition && Global::get().pluginManager && !Global::get().bCenterPosition
&& Global::get().pluginManager->fetchPositionalData()) {
Position3D currentPos = Global::get().pluginManager->getPositionalData().getPlayerPos();
audioData.position[0] = currentPos.x;
audioData.position[1] = currentPos.y;
audioData.position[2] = currentPos.z;
audioData.containsPositionalData = true;
}
assert(m_codec == Mumble::Protocol::AudioCodec::Opus);
// In Opus mode we only expect a single frame per packet
assert(qlFrames.size() == 1);
audioData.payload = gsl::span< const Mumble::Protocol::byte >(
reinterpret_cast< const Mumble::Protocol::byte * >(qlFrames[0].constData()),
static_cast< std::size_t >(qlFrames[0].size()));
{
ServerHandlerPtr sh = Global::get().sh;
if (sh) {
VoiceRecorderPtr recorder(sh->recorder);
if (recorder) {
recorder->getRecordUser().addFrame(audioData);
}
m_udpEncoder.setProtocolVersion(sh->m_version);
}
}
if (Global::get().s.lmLoopMode == Settings::Local) {
// Only add audio data to local loop buffer
LoopUser::lpLoopy.addFrame(audioData);
} else {
// Encode audio frame and send out
gsl::span< const Mumble::Protocol::byte > encodedAudioPacket = m_udpEncoder.encodeAudioPacket(audioData);
sendAudioFrame(encodedAudioPacket);
}
qlFrames.clear();
}
bool AudioInput::isAlive() const {
return isRunning();
}
void AudioInput::onUserMuteDeafStateChanged() {
const ClientUser *user = qobject_cast< ClientUser * >(QObject::sender());
updateUserMuteDeafState(user);
}
void AudioInput::updateUserMuteDeafState(const ClientUser *user) {
bool bMuted = user->bSuppress || user->bSelfMute;
if (bUserIsMuted != bMuted) {
bUserIsMuted = bMuted;
onUserMutedChanged();
}
}
void AudioInput::onUserMutedChanged() {
}