feat: implement live streaming transcription with VAD

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-29 02:46:13 +08:00
parent d62fcdd1cd
commit 747a4772b6
+61 -1
View File
@@ -1,5 +1,8 @@
import sys import sys
import argparse import argparse
import queue
import threading
import time
import numpy as np import numpy as np
import sounddevice as sd import sounddevice as sd
from transformers import AutoProcessor, CohereAsrForConditionalGeneration from transformers import AutoProcessor, CohereAsrForConditionalGeneration
@@ -138,7 +141,64 @@ class VADStateMachine:
def stream_transcribe(processor, model, language):
    """Transcribe microphone audio live, one speech segment at a time.

    Audio is captured through a ``sounddevice`` input stream, cut into
    ``FRAME_SIZE``-sample frames and fed to a ``VADStateMachine``.  Every
    segment the VAD hands back is queued for a background worker thread,
    which runs ``transcribe_audio`` and prints non-empty text prefixed
    with an ``[MM:SS]`` timestamp.  Runs until interrupted with Ctrl+C.

    Args:
        processor: Passed through unchanged to ``transcribe_audio``.
        model: Passed through unchanged to ``transcribe_audio``.
        language: Passed through unchanged to ``transcribe_audio``.
    """
    threshold = calibrate_silence()
    vad = VADStateMachine(threshold)
    seg_queue = queue.Queue()
    stop_event = threading.Event()
    start_time = time.monotonic()

    def transcription_worker():
        # Keep draining after a stop is requested so that segments which
        # are already queued still get transcribed.
        while not stop_event.is_set() or not seg_queue.empty():
            try:
                seg_start, audio = seg_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            minutes, seconds = divmod(int(seg_start), 60)
            text = transcribe_audio(processor, model, audio, language).strip()
            if text:
                print(f"[{minutes:02d}:{seconds:02d}] {text}")

    worker = threading.Thread(target=transcription_worker, daemon=True)
    worker.start()

    # Samples left over from the previous callback that did not fill a
    # whole frame yet.
    frame_buf = np.empty(0, dtype="float32")

    def audio_callback(indata, frames, time_info, status):
        nonlocal frame_buf
        if stop_event.is_set():
            return
        # Only the first channel is used (the stream is opened mono).
        frame_buf = np.append(frame_buf, indata[:, 0])
        while len(frame_buf) >= FRAME_SIZE:
            frame = frame_buf[:FRAME_SIZE]
            frame_buf = frame_buf[FRAME_SIZE:]
            elapsed = time.monotonic() - start_time
            # A non-None result is queued as-is; the worker unpacks it as
            # a (segment start time, audio) pair.
            result = vad.process_frame(frame, elapsed)
            if result is not None:
                seg_queue.put(result)

    print("Listening... (Ctrl+C to stop)")
    stream = sd.InputStream(
        samplerate=SAMPLE_RATE, channels=1, dtype="float32",
        callback=audio_callback, blocksize=FRAME_SIZE,
    )
    try:
        with stream:
            while True:
                time.sleep(0.1)
    except KeyboardInterrupt:
        pass

    # Flush any remaining speech segment BEFORE signalling the worker to
    # stop.  The worker exits as soon as it sees "stop set and queue
    # empty", so queuing after stop_event.set() could race with that check
    # and silently drop the final utterance.  The stream context has
    # already exited here, so the callback should no longer be mutating
    # the VAD state.
    if vad.speaking and vad.segment:
        seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))
    stop_event.set()

    # Bounded wait so a stuck transcription cannot hang the exit; the
    # worker is a daemon thread and will not keep the process alive.
    worker.join(timeout=30)
    print("\nDone.")
if __name__ == "__main__": if __name__ == "__main__":