Skip to content
Snippets Groups Projects
Commit a47c9ba2 authored by Aleksandras Kostarevas
Browse files

Add option to disable VAD stopping

parent f2807064
No related branches found
No related tags found
No related merge requests found
......@@ -44,6 +44,11 @@ val AUDIO_FOCUS = SettingsKey(
default = true
)
/**
 * Boolean setting persisted under the "use_vad_autostop" preference key.
 *
 * Defaults to `true`. The voice input action reads this value and passes it on as
 * `RecordingSettings.useVADAutoStop`; the voice input settings screen exposes it as
 * the "Auto-stop on silence" toggle.
 */
val USE_VAD_AUTOSTOP = SettingsKey(
key = booleanPreferencesKey("use_vad_autostop"),
default = true
)
val ENGLISH_MODEL_INDEX = SettingsKey(
key = intPreferencesKey("english_model_index"),
default = 0
......
......@@ -35,6 +35,7 @@ import org.futo.inputmethod.latin.uix.KeyboardManagerForAction
import org.futo.inputmethod.latin.uix.PREFER_BLUETOOTH
import org.futo.inputmethod.latin.uix.PersistentActionState
import org.futo.inputmethod.latin.uix.ResourceHelper
import org.futo.inputmethod.latin.uix.USE_VAD_AUTOSTOP
import org.futo.inputmethod.latin.uix.VERBOSE_PROGRESS
import org.futo.inputmethod.latin.uix.getSetting
import org.futo.inputmethod.latin.uix.setSetting
......@@ -108,6 +109,7 @@ private class VoiceInputActionWindow(
val useBluetoothAudio = context.getSetting(PREFER_BLUETOOTH)
val requestAudioFocus = context.getSetting(AUDIO_FOCUS)
val canExpandSpace = context.getSetting(CAN_EXPAND_SPACE)
val useVAD = context.getSetting(USE_VAD_AUTOSTOP)
val primaryModel = model
val languageSpecificModels = mutableMapOf<Language, ModelLoader>()
......@@ -132,7 +134,8 @@ private class VoiceInputActionWindow(
recordingConfiguration = RecordingSettings(
preferBluetoothMic = useBluetoothAudio,
requestAudioFocus = requestAudioFocus,
canExpandSpace = canExpandSpace
canExpandSpace = canExpandSpace,
useVADAutoStop = useVAD
)
)
}
......
......@@ -15,6 +15,7 @@ import org.futo.inputmethod.latin.uix.DISALLOW_SYMBOLS
import org.futo.inputmethod.latin.uix.ENABLE_SOUND
import org.futo.inputmethod.latin.uix.PREFER_BLUETOOTH
import org.futo.inputmethod.latin.uix.USE_SYSTEM_VOICE_INPUT
import org.futo.inputmethod.latin.uix.USE_VAD_AUTOSTOP
import org.futo.inputmethod.latin.uix.VERBOSE_PROGRESS
import org.futo.inputmethod.latin.uix.settings.NavigationItem
import org.futo.inputmethod.latin.uix.settings.NavigationItemStyle
......@@ -67,7 +68,6 @@ fun VoiceInputScreen(navController: NavHostController = rememberNavController())
setting = PREFER_BLUETOOTH
)
SettingToggleDataStore(
title = "Audio Focus",
subtitle = "Pause videos/music when voice input is activated",
......@@ -80,11 +80,17 @@ fun VoiceInputScreen(navController: NavHostController = rememberNavController())
)
SettingToggleDataStore(
title = "Experimental long-form voice input",
subtitle = "This disables the 30 second limit, but the output quality may be degraded with long inputs.",
title = "Long-form voice input",
subtitle = "If disabled, voice input will auto-stop after 30 seconds.",
setting = CAN_EXPAND_SPACE
)
SettingToggleDataStore(
title = "Auto-stop on silence",
subtitle = "Automatically stop when silence is detected. You may need to manually stop regardless if there's too much background noise. Please also enable long-form voice input to prevent stopping after 30s.",
setting = USE_VAD_AUTOSTOP
)
NavigationItem(
title = "Models",
subtitle = "To change the models, visit Languages & Models menu",
......
......@@ -79,7 +79,8 @@ private fun getRecordingDeviceKind(type: Int): String {
data class RecordingSettings(
val preferBluetoothMic: Boolean,
val requestAudioFocus: Boolean,
val canExpandSpace: Boolean
val canExpandSpace: Boolean,
val useVADAutoStop: Boolean
)
data class AudioRecognizerSettings(
......@@ -103,6 +104,7 @@ class AudioRecognizer(
private val modelRunner = MultiModelRunner(modelManager)
private val canExpandSpace = settings.recordingConfiguration.canExpandSpace
private val useVAD = settings.recordingConfiguration.useVADAutoStop
private var floatSamples: FloatBuffer = FloatBuffer.allocate(16000 * 30)
private var recorderJob: Job? = null
......@@ -293,7 +295,7 @@ class AudioRecognizer(
modelRunner.preload(settings.modelRunConfiguration)
}
private suspend fun recordingJob(recorder: AudioRecord, vad: VadModel) {
private suspend fun recordingJob(recorder: AudioRecord, vad: VadModel?) {
var hasTalked = false
var anyNoiseAtAll = false
......@@ -328,7 +330,7 @@ class AudioRecognizer(
isRunningOutOfSpace = false
}
val hasNotTalkedRecently = hasTalked && (numConsecutiveNonSpeech > 66)
val hasNotTalkedRecently = hasTalked && (numConsecutiveNonSpeech > 66) && useVAD
if (isRunningOutOfSpace || hasNotTalkedRecently) {
yield()
withContext(Dispatchers.Main) {
......@@ -338,30 +340,32 @@ class AudioRecognizer(
}
// Run VAD
var remainingSamples = nRead
var offset = 0
while (remainingSamples > 0) {
if (!vadSampleBuffer.hasRemaining()) {
val isSpeech = vad.isSpeech(vadSampleBuffer.array())
vadSampleBuffer.clear()
vadSampleBuffer.rewind()
if (!isSpeech) {
numConsecutiveNonSpeech++
numConsecutiveSpeech = 0
} else {
numConsecutiveNonSpeech = 0
numConsecutiveSpeech++
if(useVAD && vad != null) {
var remainingSamples = nRead
var offset = 0
while (remainingSamples > 0) {
if (!vadSampleBuffer.hasRemaining()) {
val isSpeech = vad.isSpeech(vadSampleBuffer.array())
vadSampleBuffer.clear()
vadSampleBuffer.rewind()
if (!isSpeech) {
numConsecutiveNonSpeech++
numConsecutiveSpeech = 0
} else {
numConsecutiveNonSpeech = 0
numConsecutiveSpeech++
}
}
}
val samplesToRead = min(min(remainingSamples, 480), vadSampleBuffer.remaining())
for (i in 0 until samplesToRead) {
vadSampleBuffer.put(
samples[offset]
)
offset += 1
remainingSamples -= 1
val samplesToRead = min(min(remainingSamples, 480), vadSampleBuffer.remaining())
for (i in 0 until samplesToRead) {
vadSampleBuffer.put(
samples[offset]
)
offset += 1
remainingSamples -= 1
}
}
}
......@@ -456,8 +460,12 @@ class AudioRecognizer(
recorderJob = lifecycleScope.launch {
withContext(Dispatchers.Default) {
createVad().use { vad ->
recordingJob(recorder, vad)
if(useVAD) {
createVad().use { vad ->
recordingJob(recorder, vad)
}
} else {
recordingJob(recorder, null)
}
}
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment