mumble-voip_mumble/src/mumble/AudioInput.h

// Copyright 2007-2023 The Mumble Developers. All rights reserved.
// Use of this source code is governed by a BSD-style license
// that can be found in the LICENSE file at the root of the
// Mumble source tree or at <https://www.mumble.info/LICENSE>.

#ifndef MUMBLE_MUMBLE_AUDIOINPUT_H_
#define MUMBLE_MUMBLE_AUDIOINPUT_H_

#include <QElapsedTimer>
#include <QObject>
#include <QThread>

#include <boost/array.hpp>
#include <boost/shared_ptr.hpp>

#include <cstdint>
#include <fstream>
#include <list>
#include <memory>
#include <mutex>
#include <vector>

#include <speex/speex_echo.h>
#include <speex/speex_preprocess.h>
#include <speex/speex_resampler.h>

#include "Audio.h"
#include "AudioOutputToken.h"
#include "EchoCancelOption.h"
#include "MumbleProtocol.h"
#include "Settings.h"
#include "Timer.h"

// Forward declarations. AudioInput is defined further down in this header;
// OpusEncoder and ReNameNoiseDenoiseState are only referred to through
// pointers here, so their full definitions are not required.
class AudioInput;
struct OpusEncoder;
struct ReNameNoiseDenoiseState;
/// Shared-ownership handle to an AudioInput, as returned by AudioInputRegistrar::newFromChoice().
typedef boost::shared_ptr< AudioInput > AudioInputPtr;

/**
 * A chunk of audio data to process
 * This struct wraps pointers to two dynamically allocated arrays, containing
 * PCM samples of microphone and speaker readback data (for echo cancellation).
 * Does not handle pointer ownership, so you'll have to deallocate them yourself.
 */
struct AudioChunk {
	/// Creates an empty chunk that references neither microphone nor speaker samples.
	AudioChunk() = default;

	/// Creates a chunk that carries microphone samples only; the speaker pointer stays null.
	explicit AudioChunk(short *mic) : mic(mic) {}

	/// Creates a chunk that carries both microphone samples and speaker readback samples.
	AudioChunk(short *mic, short *speaker) : mic(mic), speaker(speaker) {}

	/// \return true when no microphone samples are attached (the speaker pointer is not consulted)
	bool empty() const { return !mic; }

	short *mic     = nullptr; ///< Pointer to microphone samples (not owned by the chunk)
	short *speaker = nullptr; ///< Pointer to speaker samples, nullptr if echo cancellation is disabled
};

/*
 * According to https://www.speex.org/docs/manual/speex-manual/node7.html
 * "It is important that, at any time, any echo that is present in the input
 * has already been sent to the echo canceller as echo_frame."
 * Thus, we artificially introduce a small lag in the microphone by means of
 * a queue, so as to be sure the speaker data always precedes the microphone.
 *
 * There are conflicting requirements for the queue:
 * - it has to be small enough not to cause a noticeable lag in the voice
 * - it has to be large enough not to force us to drop packets frequently
 *   when the addMic() and addEcho() callbacks are called in a jittery way
 * - its fill level must be controlled so that it does not drift towards
 *   zero elements, as this would not provide the lag required for the
 *   echo canceller to work properly.
 *
 * The current implementation uses a 5-element queue, with a control
 * state machine that introduces packet drops to keep the fill level
 * at 2 elements or more (plus or minus one) and below 4 elements.
 * With a 10ms chunk, this queue should introduce a ~20ms lag to the voice.
 */
class Resynchronizer {
public:
	~Resynchronizer();

	/**
	 * Queue a buffer of microphone samples.
	 * The resynchronizer may decide to drop the buffer; in that case the
	 * pointer is deallocated so that no memory is leaked.
	 *
	 * \param mic pointer to a dynamically allocated array with PCM data
	 */
	void addMic(short *mic);

	/**
	 * Hand a buffer of speaker samples to the resynchronizer.
	 * The resynchronizer may decide to drop the buffer; in that case the
	 * pointer is deallocated so that no memory is leaked.
	 *
	 * \param speaker pointer to a dynamically allocated array with PCM data
	 * \return a valid audio chunk to encode if microphone data is available,
	 * an empty chunk otherwise
	 */
	AudioChunk addSpeaker(short *speaker);

	/**
	 * Bring the resynchronizer back to its initial state; the queue is
	 * emptied in the process.
	 */
	void reset();

	/**
	 * \return the nominal lag the resynchronizer tries to impose on the
	 * microphone data so that the speaker data always reaches the echo
	 * canceller first (currently the constant 2; presumably counted in
	 * queued chunks)
	 */
	int getNominalLag() const {
		return 2;
	}

	/// When set, queue fill level stats are printed
	bool bDebugPrintQueue = false;

private:
	/// States of the queue fill control state machine
	enum QueueState { S0, S1a, S1b, S2, S3, S4a, S4b, S5 };

	/**
	 * Print queue level stats for debugging purposes
	 * \param who used to distinguish between addMic() and addSpeaker()
	 */
	void printQueue(char who);

	// TODO: there was a mutex (qmEcho), but can the callbacks be called concurrently?
	mutable std::mutex m;
	/// Queue of microphone samples
	std::list< short * > micQueue;
	/// Current state of the queue fill control state machine
	QueueState state = S0;
};

/**
 * Abstract interface describing one audio input backend.
 *
 * A concrete backend derives from this class and implements the pure virtual
 * functions below: create() yields the backend's AudioInput, the
 * get/setDeviceChoice functions expose and select input devices, canEcho()
 * reports echo cancellation compatibility and isMicrophoneAccessDeniedByOS()
 * reports OS-level permission problems.
 * Copying is disabled through Q_DISABLE_COPY.
 */
class AudioInputRegistrar {
private:
	Q_DISABLE_COPY(AudioInputRegistrar)
public:
	/// Registrars keyed by a QString.
	/// NOTE(review): presumably the key is the backend name and the map is maintained by the
	/// constructor/destructor - confirm in the implementation file.
	static QMap< QString, AudioInputRegistrar * > *qmNew;
	/// NOTE(review): presumably the name of the backend currently in use - confirm.
	static QString current;
	/// Obtain an AudioInput for the given choice.
	/// \param choice backend selector, a default-constructed QString when omitted
	///        (NOTE(review): presumably this lets the implementation pick a backend itself - confirm)
	/// \return shared pointer to the AudioInput
	static AudioInputPtr newFromChoice(QString choice = QString());

	/// Name of this registrar; fixed at construction time (const member).
	const QString name;
	/// NOTE(review): presumably used to rank backends against each other - confirm how it is compared.
	int priority;

	/// A list of echo cancellation options available for this backend.
	std::vector< EchoCancelOptionID > echoOptions;

	/// \param n value stored in name
	/// \param priority value for the priority member, 0 when omitted
	AudioInputRegistrar(const QString &n, int priority = 0);
	virtual ~AudioInputRegistrar();
	/// Create the AudioInput of this backend.
	/// NOTE(review): a raw pointer is returned; who owns it is not visible here - confirm at the call site.
	virtual AudioInput *create()                               = 0;
	/// \return the device choice of this backend as a QVariant
	/// (NOTE(review): presumably the currently selected device - confirm)
	virtual const QVariant getDeviceChoice()                   = 0;
	/// \return the devices this backend offers
	virtual const QList< audioDevice > getDeviceChoices()      = 0;
	/// Apply a device choice; the Settings object is passed by non-const reference
	/// (NOTE(review): presumably the choice is written into it - confirm).
	virtual void setDeviceChoice(const QVariant &, Settings &) = 0;

	/// Check that given combination of echoOption and outputSystem combination is suitable for echo cancellation
	virtual bool canEcho(EchoCancelOptionID echoOptionId, const QString &outputSystem) const = 0;
	/// NOTE(review): presumably reports whether the backend supports exclusive device access;
	/// unlike its neighbours this function is not pure, so a default implementation exists elsewhere.
	virtual bool canExclusive() const;

	/**
	 * Check if Mumble's microphone access has been denied by the OS.
	 * Both Windows and macOS have builtin privacy safeguards that display a message asking for users'
	 * consent when apps are trying to use the microphone, and/or provide ways to deny the microphone
	 * access of some apps.
	 * This function should check if Mumble has the permission to use the microphone.
	 * Note: It is possible that this result could only be known after trying to initialize the audio backend.
	 * Generally, call this function after attempts to initialize the AudioInput have been made.
	 * @return true if microphone access is denied.
	 */
	virtual bool isMicrophoneAccessDeniedByOS() = 0;
};

/**
 * Base class of the audio input (capture) backends.
 *
 * AudioInput is a QThread whose run() is left pure virtual, so every concrete
 * backend has to supply its own thread body. The members declared here hold
 * the sample format/mixer configuration, the Speex resampler, preprocessor
 * and echo canceller states, the Opus encoder state and assorted statistics.
 *
 * NOTE(review): the member functions are defined outside this header; the
 * comments below describe only what the declarations themselves show.
 */
class AudioInput : public QThread {
	// These classes are granted access to the private and protected members.
	friend class AudioNoiseWidget;
	friend class AudioEchoWidget;
	friend class AudioStats;
	friend class AudioInputDialog;

private:
	Q_OBJECT
	Q_DISABLE_COPY(AudioInput)
protected:
	/// Sample representation of raw input data.
	/// NOTE(review): presumably SampleShort is 16 bit integer PCM and SampleFloat is float PCM - confirm.
	typedef enum { SampleShort, SampleFloat } SampleFormat;
	/// Signature of a mixer routine: a float buffer (first argument, writable) and an
	/// untyped read-only buffer (second argument).
	/// NOTE(review): the three trailing arguments are presumably sample count, channel count and
	/// channel mask - confirm against chooseMixer().
	typedef void (*inMixerFunc)(float *RESTRICT, const void *RESTRICT, unsigned int, unsigned int, quint64);

private:
	bool bDebugDumpInput;                           ///< When true, dump pcm data to debug the echo canceller
	std::ofstream outMic, outSpeaker, outProcessed; ///< Files to dump raw pcm data

	/// Speex resampler states, one for the microphone path and one for the echo path.
	SpeexResamplerState *srsMic, *srsEcho;

	std::unique_ptr< Mumble::Protocol::byte[] > m_legacyBuffer;
	Mumble::Protocol::UDPAudioEncoder< Mumble::Protocol::Role::Client > m_udpEncoder;

	/// NOTE(review): presumably the number of samples currently buffered for microphone and echo - confirm.
	unsigned int iMicFilled, iEchoFilled;
	/// Mixer routines used for the microphone and for the echo data.
	inMixerFunc imfMic, imfEcho;
	/// Pick a mixer routine for the given channel count, sample format and channel mask.
	inMixerFunc chooseMixer(const unsigned int nchan, SampleFormat sf, quint64 mask);
	void resetAudioProcessor();

	/// Opus encoder state; OpusEncoder is only forward-declared in this header.
	OpusEncoder *opusState;
#ifdef USE_RENAMENOISE
	/// Denoiser state, only present when built with USE_RENAMENOISE.
	ReNameNoiseDenoiseState *denoiseState;
#endif
	bool selectCodec();
	void selectNoiseCancel();

	/// Fixed-size byte buffer (960 bytes) that receives encoder output.
	typedef boost::array< unsigned char, 960 > EncodingOutputBuffer;

	/// Encode samples from source into buffer.
	/// NOTE(review): presumably size is the number of samples and the return value is the encoded
	/// length in bytes - confirm in the implementation.
	int encodeOpusFrame(short *source, int size, EncodingOutputBuffer &buffer);

	/// NOTE(review): presumably measures the time since the last mute cue, see MUTE_CUE_DELAY - confirm.
	QElapsedTimer qetLastMuteCue;

	AudioOutputToken m_activeAudioCue;

protected:
	Mumble::Protocol::AudioCodec m_codec;
	/// Sample formats of the microphone and of the echo input.
	SampleFormat eMicFormat, eEchoFormat;

	// Per-path parameters; each pair holds the microphone value and the echo value.
	unsigned int iMicChannels, iEchoChannels;
	unsigned int iMicFreq, iEchoFreq;
	unsigned int iMicLength, iEchoLength;
	unsigned int iMicSampleSize, iEchoSampleSize;
	unsigned int iEchoMCLength, iEchoFrameSize;
	quint64 uiMicChannelMask, uiEchoChannelMask;

	bool bEchoMulti;
	Settings::NoiseCancel noiseCancel;
	// Standard microphone sample rate (samples/s)
	static const unsigned int iSampleRate = SAMPLE_RATE;
	/// Based on the sample rate, 48,000 samples/s = 48 samples/ms.
	/// For each 10 ms, this yields 480 samples. This corresponds numerically with the calculation:
	/// iFrameSize = 48000 / 100 = 480 samples, allowing a consistent 10ms of audio data per frame.
	static const int iFrameSize = SAMPLE_RATE / 100;

	/// NOTE(review): presumably guards sppPreprocess and sesEcho - confirm which accesses take this lock.
	QMutex qmSpeex;
	SpeexPreprocessState *sppPreprocess;
	SpeexEchoState *sesEcho;

	/// bResetEncoder is a flag that notifies
	/// our encoder functions that the encoder
	/// needs to be reset.
	bool bResetEncoder;

	/// Encoded audio rate in bit/s
	int iAudioQuality;
	bool bAllowLowDelay;
	/// Number of 10ms audio "frames" per packet (!= frames in packet)
	int iAudioFrames;

	/// The minimum time in ms that has to pass between the playback of two consecutive mute cues.
	static constexpr unsigned int MUTE_CUE_DELAY = 5000;

	/// Float buffers for microphone and echo input.
	/// NOTE(review): raw pointers; allocation and release happen outside this header - confirm ownership.
	float *pfMicInput;
	float *pfEchoInput;

	/// Delays microphone data relative to speaker data for the echo canceller (see Resynchronizer).
	Resynchronizer resync;
	std::vector< short > opusBuffer;

	/// Process one chunk of audio; the chunk is taken by value and only wraps raw pointers.
	void encodeAudioFrame(AudioChunk chunk);
	/// Feed raw microphone data.
	/// NOTE(review): nsamp is presumably a sample count, the layout of data presumably follows eMicFormat - confirm.
	void addMic(const void *data, unsigned int nsamp);
	/// Feed raw echo (speaker readback) data; parameters analogous to addMic().
	void addEcho(const void *data, unsigned int nsamp);

	// NOTE(review): volatile gives neither atomicity nor inter-thread ordering in C++. If these
	// flags are written and read from different threads, std::atomic< bool > would be the
	// appropriate type - confirm how they are accessed before changing anything.
	volatile bool bRunning;
	volatile bool bPreviousVoice;
	volatile bool previousPTT;

	int iFrameCounter;
	int iSilentFrames;
	int iHoldFrames;
	int iBufferedFrames;

	QList< QByteArray > qlFrames;
	void flushCheck(const QByteArray &, bool terminator, std::int32_t voiceTargetID);

	void initializeMixer();

	/// Adjust bitrate, frames and allowLowDelay (all passed by reference) for the given bitspersec.
	static void adjustBandwidth(int bitspersec, int &bitrate, int &frames, bool &allowLowDelay);

	bool bUserIsMuted;

signals:
	void doDeaf();
	void doMute();
	void doMuteCue();
	/// A signal emitted if audio input is being encountered
	///
	/// @param inputPCM The encountered input PCM
	/// @param sampleCount The amount of samples in the input
	/// @param channelCount The amount of channels in the input
	/// @param sampleRate The used sample rate in Hz
	/// @param isSpeech Whether Mumble considers the input to be speech
	void audioInputEncountered(short *inputPCM, unsigned int sampleCount, unsigned int channelCount,
							   unsigned int sampleRate, bool isSpeech);

public:
	typedef enum { ActivityStateIdle, ActivityStateReturnedFromIdle, ActivityStateActive } ActivityState;

	ActivityState activityState;

	bool bResetProcessor;

	Timer tIdle;

	int iBitrate;
	// Publicly readable statistics.
	// NOTE(review): presumably peak levels and the speech probability shown by the friend widgets - confirm.
	float dPeakSpeaker, dPeakSignal, dMaxMic, dPeakMic, dPeakCleanMic;
	float fSpeechProb;

	static int getNetworkBandwidth(int bitrate, int frames);
	static void setMaxBandwidth(int bitspersec);

	/// Construct an AudioInput.
	///
	/// This constructor is only ever called by Audio::startInput(), and is guaranteed
	/// to be called on the application's main thread.
	AudioInput();

	/// Destroy an AudioInput.
	///
	/// This destructor is only ever called by Audio::stopInput() and Audio::stop(),
	/// and is guaranteed to be called on the application's main thread.
	~AudioInput() Q_DECL_OVERRIDE;
	/// Thread body; pure virtual, so each backend has to implement it.
	void run() Q_DECL_OVERRIDE = 0;
	virtual bool isAlive() const;
	bool isTransmitting() const;

	void updateUserMuteDeafState(const ClientUser *user);

protected:
	/// Overridable hook.
	/// NOTE(review): presumably invoked when bUserIsMuted changes - confirm in the implementation.
	virtual void onUserMutedChanged();

public slots:
	void onUserMuteDeafStateChanged();
};

#endif