From 747a4772b6d0b8890b3187377b39ff358f3fb129 Mon Sep 17 00:00:00 2001
From: Wong Ding Feng <dingfengwong@gmail.com>
Date: Fri, 29 May 2026 02:46:13 +0800
Subject: [PATCH] feat: implement live streaming transcription with VAD

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 transcribe.py | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 61 insertions(+), 1 deletion(-)

diff --git a/transcribe.py b/transcribe.py
index e935be2..2212cc1 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -1,5 +1,8 @@
 import sys
 import argparse
+import queue
+import threading
+import time
 import numpy as np
 import sounddevice as sd
 from transformers import AutoProcessor, CohereAsrForConditionalGeneration
@@ -138,7 +141,64 @@ class VADStateMachine:
 
 
 def stream_transcribe(processor, model, language):
-    print("TODO: streaming mode")
+    """Transcribe microphone audio live until Ctrl+C.
+
+    Speech segments from the VAD are transcribed on a worker thread and
+    printed as ``[MM:SS] text``, stamped with the segment start time.
+    """
+    vad = VADStateMachine(calibrate_silence())
+    seg_queue = queue.Queue()
+    stop_event = threading.Event()
+    start_time = time.monotonic()
+    frame_buf = np.empty(0, dtype="float32")  # samples awaiting a full frame
+
+    def transcription_worker():
+        # Keep draining after stop is requested so queued segments still print.
+        while not stop_event.is_set() or not seg_queue.empty():
+            try:
+                seg_start, audio = seg_queue.get(timeout=0.5)
+            except queue.Empty:
+                continue  # timeout lets the loop re-check stop_event
+            minutes, seconds = divmod(int(seg_start), 60)
+            text = transcribe_audio(processor, model, audio, language).strip()
+            if text:
+                print(f"[{minutes:02d}:{seconds:02d}] {text}")
+
+    worker = threading.Thread(target=transcription_worker, daemon=True)
+    worker.start()
+
+    def audio_callback(indata, frames, time_info, status):
+        nonlocal frame_buf
+        if stop_event.is_set():
+            return
+        frame_buf = np.append(frame_buf, indata[:, 0])
+        while len(frame_buf) >= FRAME_SIZE:
+            frame = frame_buf[:FRAME_SIZE]
+            frame_buf = frame_buf[FRAME_SIZE:]
+            elapsed = time.monotonic() - start_time
+            result = vad.process_frame(frame, elapsed)
+            if result is not None:
+                seg_queue.put(result)
+
+    print("Listening... (Ctrl+C to stop)")
+    stream = sd.InputStream(
+        samplerate=SAMPLE_RATE, channels=1, dtype="float32",
+        callback=audio_callback, blocksize=FRAME_SIZE,
+    )
+    try:
+        with stream:
+            while True:
+                time.sleep(0.1)
+    except KeyboardInterrupt:
+        pass  # Ctrl+C is the normal way to stop listening
+
+    # Flush the unfinished segment BEFORE signalling stop: the worker exits once
+    # it sees stop set with an empty queue, so a later put could be dropped.
+    if vad.speaking and vad.segment:
+        seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
+    stop_event.set()
+    worker.join(timeout=30)
+    print("\nDone.")
 
 
 if __name__ == "__main__":