Add option to disable VAD stopping

a47c9ba2 · Aleksandras Kostarevas · f2807064
Commit a47c9ba2 authored 3 weeks ago by Aleksandras Kostarevas
--- a/java/src/org/futo/inputmethod/latin/uix/VoiceInputSettingKeys.kt
+++ b/java/src/org/futo/inputmethod/latin/uix/VoiceInputSettingKeys.kt
@@ -44,6 +44,11 @@ val AUDIO_FOCUS = SettingsKey(
    default = true
 )

+val USE_VAD_AUTOSTOP = SettingsKey(
+    key = booleanPreferencesKey("use_vad_autostop"),
+    default = true
+)
+
 val ENGLISH_MODEL_INDEX = SettingsKey(
    key = intPreferencesKey("english_model_index"),
    default = 0

--- a/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt
+++ b/java/src/org/futo/inputmethod/latin/uix/actions/VoiceInputAction.kt
@@ -35,6 +35,7 @@ import org.futo.inputmethod.latin.uix.KeyboardManagerForAction
 import org.futo.inputmethod.latin.uix.PREFER_BLUETOOTH
 import org.futo.inputmethod.latin.uix.PersistentActionState
 import org.futo.inputmethod.latin.uix.ResourceHelper
+import org.futo.inputmethod.latin.uix.USE_VAD_AUTOSTOP
 import org.futo.inputmethod.latin.uix.VERBOSE_PROGRESS
 import org.futo.inputmethod.latin.uix.getSetting
 import org.futo.inputmethod.latin.uix.setSetting
@@ -108,6 +109,7 @@ private class VoiceInputActionWindow(
        val useBluetoothAudio = context.getSetting(PREFER_BLUETOOTH)
        val requestAudioFocus = context.getSetting(AUDIO_FOCUS)
        val canExpandSpace = context.getSetting(CAN_EXPAND_SPACE)
+        val useVAD = context.getSetting(USE_VAD_AUTOSTOP)

        val primaryModel = model
        val languageSpecificModels = mutableMapOf<Language, ModelLoader>()
@@ -132,7 +134,8 @@ private class VoiceInputActionWindow(
            recordingConfiguration = RecordingSettings(
                preferBluetoothMic = useBluetoothAudio,
                requestAudioFocus = requestAudioFocus,
-                canExpandSpace = canExpandSpace
+                canExpandSpace = canExpandSpace,
+                useVADAutoStop = useVAD
            )
        )
    }

--- a/java/src/org/futo/inputmethod/latin/uix/settings/pages/VoiceInput.kt
+++ b/java/src/org/futo/inputmethod/latin/uix/settings/pages/VoiceInput.kt
@@ -15,6 +15,7 @@ import org.futo.inputmethod.latin.uix.DISALLOW_SYMBOLS
 import org.futo.inputmethod.latin.uix.ENABLE_SOUND
 import org.futo.inputmethod.latin.uix.PREFER_BLUETOOTH
 import org.futo.inputmethod.latin.uix.USE_SYSTEM_VOICE_INPUT
+import org.futo.inputmethod.latin.uix.USE_VAD_AUTOSTOP
 import org.futo.inputmethod.latin.uix.VERBOSE_PROGRESS
 import org.futo.inputmethod.latin.uix.settings.NavigationItem
 import org.futo.inputmethod.latin.uix.settings.NavigationItemStyle
@@ -67,7 +68,6 @@ fun VoiceInputScreen(navController: NavHostController = rememberNavController())
                setting = PREFER_BLUETOOTH
            )

-
            SettingToggleDataStore(
                title = "Audio Focus",
                subtitle = "Pause videos/music when voice input is activated",
@@ -80,11 +80,17 @@ fun VoiceInputScreen(navController: NavHostController = rememberNavController())
            )

            SettingToggleDataStore(
-                title = "Experimental long-form voice input",
-                subtitle = "This disables the 30 second limit, but the output quality may be degraded with long inputs.",
+                title = "Long-form voice input",
+                subtitle = "If disabled, voice input will auto-stop after 30 seconds.",
                setting = CAN_EXPAND_SPACE
            )

+            SettingToggleDataStore(
+                title = "Auto-stop on silence",
+                subtitle = "Automatically stop when silence is detected. You may need to manually stop regardless if there's too much background noise. Please also enable long-form voice input to prevent stopping after 30s.",
+                setting = USE_VAD_AUTOSTOP
+            )
+
            NavigationItem(
                title = "Models",
                subtitle = "To change the models, visit Languages & Models menu",

--- a/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt
+++ b/voiceinput-shared/src/main/java/org/futo/voiceinput/shared/AudioRecognizer.kt
@@ -79,7 +79,8 @@ private fun getRecordingDeviceKind(type: Int): String {
 data class RecordingSettings(
    val preferBluetoothMic: Boolean,
    val requestAudioFocus: Boolean,
-    val canExpandSpace: Boolean
+    val canExpandSpace: Boolean,
+    val useVADAutoStop: Boolean
 )

 data class AudioRecognizerSettings(
@@ -103,6 +104,7 @@ class AudioRecognizer(
    private val modelRunner = MultiModelRunner(modelManager)

    private val canExpandSpace = settings.recordingConfiguration.canExpandSpace
+    private val useVAD = settings.recordingConfiguration.useVADAutoStop

    private var floatSamples: FloatBuffer = FloatBuffer.allocate(16000 * 30)
    private var recorderJob: Job? = null
@@ -293,7 +295,7 @@ class AudioRecognizer(
        modelRunner.preload(settings.modelRunConfiguration)
    }

-    private suspend fun recordingJob(recorder: AudioRecord, vad: VadModel) {
+    private suspend fun recordingJob(recorder: AudioRecord, vad: VadModel?) {
        var hasTalked = false
        var anyNoiseAtAll = false

@@ -328,7 +330,7 @@ class AudioRecognizer(
                isRunningOutOfSpace = false
            }

-            val hasNotTalkedRecently = hasTalked && (numConsecutiveNonSpeech > 66)
+            val hasNotTalkedRecently = hasTalked && (numConsecutiveNonSpeech > 66) && useVAD
            if (isRunningOutOfSpace || hasNotTalkedRecently) {
                yield()
                withContext(Dispatchers.Main) {
@@ -338,30 +340,32 @@ class AudioRecognizer(
            }

            // Run VAD
-            var remainingSamples = nRead
-            var offset = 0
-            while (remainingSamples > 0) {
-                if (!vadSampleBuffer.hasRemaining()) {
-                    val isSpeech = vad.isSpeech(vadSampleBuffer.array())
-                    vadSampleBuffer.clear()
-                    vadSampleBuffer.rewind()
-
-                    if (!isSpeech) {
-                        numConsecutiveNonSpeech++
-                        numConsecutiveSpeech = 0
-                    } else {
-                        numConsecutiveNonSpeech = 0
-                        numConsecutiveSpeech++
+            if(useVAD && vad != null) {
+                var remainingSamples = nRead
+                var offset = 0
+                while (remainingSamples > 0) {
+                    if (!vadSampleBuffer.hasRemaining()) {
+                        val isSpeech = vad.isSpeech(vadSampleBuffer.array())
+                        vadSampleBuffer.clear()
+                        vadSampleBuffer.rewind()
+
+                        if (!isSpeech) {
+                            numConsecutiveNonSpeech++
+                            numConsecutiveSpeech = 0
+                        } else {
+                            numConsecutiveNonSpeech = 0
+                            numConsecutiveSpeech++
+                        }
                    }
-                }

-                val samplesToRead = min(min(remainingSamples, 480), vadSampleBuffer.remaining())
-                for (i in 0 until samplesToRead) {
-                    vadSampleBuffer.put(
-                        samples[offset]
-                    )
-                    offset += 1
-                    remainingSamples -= 1
+                    val samplesToRead = min(min(remainingSamples, 480), vadSampleBuffer.remaining())
+                    for (i in 0 until samplesToRead) {
+                        vadSampleBuffer.put(
+                            samples[offset]
+                        )
+                        offset += 1
+                        remainingSamples -= 1
+                    }
                }
            }

@@ -456,8 +460,12 @@ class AudioRecognizer(

            recorderJob = lifecycleScope.launch {
                withContext(Dispatchers.Default) {
-                    createVad().use { vad ->
-                        recordingJob(recorder, vad)
+                    if(useVAD) {
+                        createVad().use { vad ->
+                            recordingJob(recorder, vad)
+                        }
+                    } else {
+                        recordingJob(recorder, null)
                    }
                }
            }