diff --git a/native/jni/org_futo_voiceinput_WhisperGGML.cpp b/native/jni/org_futo_voiceinput_WhisperGGML.cpp index 8511c370f592a241a46357592e347e06ea904d86..caa7b34f1df3ca451f4e8ee9c282210079c2e5c9 100644 --- a/native/jni/org_futo_voiceinput_WhisperGGML.cpp +++ b/native/jni/org_futo_voiceinput_WhisperGGML.cpp @@ -114,7 +114,7 @@ static jstring WhisperGGML_infer(JNIEnv *env, jobject instance, jlong handle, jf wparams.suppress_blank = false; wparams.suppress_non_speech_tokens = suppress_non_speech_tokens; - wparams.no_timestamps = true; + wparams.no_timestamps = num_samples < 16000 * 25; if(allowed_languages.size() == 0) { wparams.language = nullptr; @@ -140,7 +140,10 @@ static jstring WhisperGGML_infer(JNIEnv *env, jobject instance, jlong handle, jf wparams.partial_text_callback_user_data = state; wparams.partial_text_callback = [](struct whisper_context * ctx, struct whisper_state * state, const whisper_token_data *tokens, size_t n_tokens, void * user_data) { std::string partial; + + //AKLOGI(" -- - - - - -- "); for(size_t i=0; i < n_tokens; i++) { + bool skipping = false; if(tokens[i].id == whisper_token_beg(ctx) || tokens[i].id == whisper_token_eot(ctx) || tokens[i].id == whisper_token_nosp(ctx) || @@ -149,8 +152,17 @@ static jstring WhisperGGML_infer(JNIEnv *env, jobject instance, jlong handle, jf tokens[i].id == whisper_token_solm(ctx) || tokens[i].id == whisper_token_sot(ctx) || tokens[i].id == whisper_token_transcribe(ctx) || - tokens[i].id == whisper_token_translate(ctx)) continue; + tokens[i].id == whisper_token_translate(ctx)) skipping = true; + + // Skip timestamp token + if(tokens[i].id >= whisper_token_beg(ctx) + && tokens[i].id <= whisper_token_beg(ctx)+1500) { + skipping = true; + } + //AKLOGI("[%d] %d: %d (%s) %c", whisper_full_n_segments_from_state(state), i, tokens[i].id, whisper_token_to_str(ctx, tokens[i].id), skipping ? '>' : ' '); + + if(skipping) continue; partial += whisper_token_to_str(ctx, tokens[i].id); } @@ -202,8 +214,11 @@ static jstring WhisperGGML_infer(JNIEnv *env, jobject instance, jlong handle, jf const int n_segments = whisper_full_n_segments(state->context); for (int i = 0; i < n_segments; i++) { - auto seg = whisper_full_get_segment_text(state->context, i); + auto seg = std::string(whisper_full_get_segment_text(state->context, i)); + if(seg == " you" && i == n_segments - 1) continue; output.append(seg); + + //AKLOGI("Final segment [%d]: %s", i, seg.c_str()); } if(std::find(forbidden_languages.begin(), diff --git a/native/jni/src/ggml/whisper.cpp b/native/jni/src/ggml/whisper.cpp index 3d6296e0c9ed2d1db039745915705222b2970358..6ad63abcad97b08590ee80044d4050f0da5fa64b 100644 --- a/native/jni/src/ggml/whisper.cpp +++ b/native/jni/src/ggml/whisper.cpp @@ -5213,6 +5213,8 @@ int whisper_full_with_state( if (params.no_timestamps) { prompt_init.push_back(whisper_token_not(ctx)); + } else { + prompt_init.push_back(whisper_token_beg(ctx)); } int seek = seek_start;