feat: implement live streaming transcription with VAD

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-29 02:46:13 +08:00
parent d62fcdd1cd
commit 747a4772b6
1 changed files with 61 additions and 1 deletions
@@ -1,5 +1,8 @@
 import sys
 import argparse
 import queue
 import threading
 import time
 import numpy as np
 import sounddevice as sd
 from transformers import AutoProcessor, CohereAsrForConditionalGeneration
@@ -138,7 +141,64 @@ class VADStateMachine:
 def stream_transcribe(processor, model, language):
     """Transcribe microphone audio live, one VAD-delimited segment at a time.

     Audio is captured through a ``sounddevice`` input stream. The stream
     callback cuts incoming samples into ``FRAME_SIZE`` frames and feeds them
     to a ``VADStateMachine``; every non-None result it returns is queued and
     transcribed on a background thread so that the audio callback never
     waits on the model. Runs until interrupted with Ctrl+C.

     Args:
         processor: Passed through unchanged to ``transcribe_audio``.
         model: Passed through unchanged to ``transcribe_audio``.
         language: Passed through unchanged to ``transcribe_audio``.
     """
     threshold = calibrate_silence()
     vad = VADStateMachine(threshold)
     seg_queue = queue.Queue()
     stop_event = threading.Event()
     start_time = time.monotonic()

     def transcription_worker():
         """Print a timestamped transcript for each queued segment."""
         # Keep draining after stop is requested so already-queued segments
         # are still transcribed before the thread exits.
         while not stop_event.is_set() or not seg_queue.empty():
             try:
                 seg_start, audio = seg_queue.get(timeout=0.5)
             except queue.Empty:
                 continue
             minutes, seconds = divmod(int(seg_start), 60)
             text = transcribe_audio(processor, model, audio, language).strip()
             if text:
                 print(f"[{minutes:02d}:{seconds:02d}] {text}")

     worker = threading.Thread(target=transcription_worker, daemon=True)
     worker.start()

     # Samples left over from the previous callback that did not fill a frame.
     frame_buf = np.empty(0, dtype="float32")

     def audio_callback(indata, frames, time_info, status):
         """Split captured samples into frames and run them through the VAD."""
         nonlocal frame_buf
         if stop_event.is_set():
             return
         # Only the first channel is used (the stream is opened with one).
         frame_buf = np.append(frame_buf, indata[:, 0])
         while len(frame_buf) >= FRAME_SIZE:
             frame = frame_buf[:FRAME_SIZE]
             frame_buf = frame_buf[FRAME_SIZE:]
             elapsed = time.monotonic() - start_time
             result = vad.process_frame(frame, elapsed)
             if result is not None:
                 seg_queue.put(result)

     print("Listening... (Ctrl+C to stop)")
     try:
         stream = sd.InputStream(
             samplerate=SAMPLE_RATE, channels=1, dtype="float32",
             callback=audio_callback, blocksize=FRAME_SIZE,
         )
         with stream:
             while True:
                 time.sleep(0.1)
     except KeyboardInterrupt:
         pass
     finally:
         # The stream is closed by now, so the callback no longer mutates the
         # VAD. Queue the unfinished segment BEFORE signalling stop: the
         # worker exits as soon as it sees the stop flag with an empty queue,
         # so setting the flag first could drop the final utterance.
         if vad.speaking and vad.segment:
             seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
         # Always release the worker, even if opening the stream failed.
         stop_event.set()
         worker.join(timeout=30)
     print("\nDone.")
 if __name__ == "__main__":