def stream_transcribe(processor, model, language):
    """Transcribe live microphone audio until interrupted with Ctrl+C.

    Audio is read from a ``sounddevice`` input stream, cut into
    ``FRAME_SIZE``-sample frames and fed to a ``VADStateMachine`` built from
    the threshold returned by ``calibrate_silence()``. Every non-``None``
    result of ``process_frame`` is queued and transcribed on a background
    thread, so the audio callback never waits on the model. Non-empty
    transcripts are printed with a ``[MM:SS]`` prefix derived from the
    segment's start offset.

    Args:
        processor: Passed through unchanged to ``transcribe_audio``.
        model: Passed through unchanged to ``transcribe_audio``.
        language: Passed through unchanged to ``transcribe_audio``.
    """
    threshold = calibrate_silence()
    vad = VADStateMachine(threshold)
    seg_queue = queue.Queue()
    stop_event = threading.Event()
    start_time = time.monotonic()
    # Sentinel queued after the last real segment. Real items are never None
    # because the callback only enqueues non-None VAD results.
    end_of_stream = None

    def transcription_worker():
        # Block on the queue and leave only when the sentinel arrives. Polling
        # "stop set and queue empty" could exit in the gap between the stop
        # signal and the final flushed segment being queued, losing it.
        while True:
            item = seg_queue.get()
            if item is end_of_stream:
                return
            seg_start, audio = item
            minutes, seconds = divmod(int(seg_start), 60)
            text = transcribe_audio(processor, model, audio, language).strip()
            if text:
                print(f"[{minutes:02d}:{seconds:02d}] {text}")

    worker = threading.Thread(target=transcription_worker, daemon=True)
    worker.start()

    # Samples that have arrived but do not yet fill a whole frame.
    frame_buf = np.empty(0, dtype="float32")

    def audio_callback(indata, frames, time_info, status):
        nonlocal frame_buf
        if stop_event.is_set():
            return
        if status:
            # Report the stream's status flags instead of dropping them.
            print(status, file=sys.stderr)
        # np.append copies, so the driver-owned ``indata`` is not retained.
        frame_buf = np.append(frame_buf, indata[:, 0])
        while len(frame_buf) >= FRAME_SIZE:
            frame = frame_buf[:FRAME_SIZE]
            frame_buf = frame_buf[FRAME_SIZE:]
            elapsed = time.monotonic() - start_time
            result = vad.process_frame(frame, elapsed)
            if result is not None:
                seg_queue.put(result)

    print("Listening... (Ctrl+C to stop)")
    stream = sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="float32",
        callback=audio_callback,
        blocksize=FRAME_SIZE,
    )

    try:
        with stream:
            # All work happens in the callback and the worker thread; the
            # main thread only waits for Ctrl+C.
            while True:
                time.sleep(0.1)
    except KeyboardInterrupt:
        pass

    stop_event.set()

    # The ``with`` block has exited, so the VAD state is no longer being
    # updated. Queue any speech still buffered *before* the sentinel so the
    # worker cannot exit without transcribing it.
    if vad.speaking and vad.segment:
        seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
    seg_queue.put(end_of_stream)

    worker.join(timeout=30)
    if worker.is_alive():
        # The worker is a daemon thread, so returning abandons pending work.
        print(
            "Warning: transcription still running after 30s; "
            "remaining segments were dropped.",
            file=sys.stderr,
        )
    print("\nDone.")