feat: implement live streaming transcription with VAD
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+61
-1
@@ -1,5 +1,8 @@
|
||||
import sys
|
||||
import argparse
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
import numpy as np
|
||||
import sounddevice as sd
|
||||
from transformers import AutoProcessor, CohereAsrForConditionalGeneration
|
||||
@@ -138,7 +141,64 @@ class VADStateMachine:
|
||||
|
||||
|
||||
def stream_transcribe(processor, model, language):
    """Transcribe live microphone audio until interrupted with Ctrl+C.

    Audio is captured with a ``sounddevice`` input stream, cut into
    ``FRAME_SIZE``-sample frames and fed to a ``VADStateMachine``. Every
    segment the state machine returns is queued and transcribed on a
    background thread, which prints each non-empty result prefixed with an
    ``[MM:SS]`` timestamp.

    Args:
        processor: Passed through unchanged to ``transcribe_audio``.
        model: Passed through unchanged to ``transcribe_audio``.
        language: Passed through unchanged to ``transcribe_audio``.

    Returns:
        None. Output is printed to stdout; diagnostics go to stderr.
    """
    threshold = calibrate_silence()
    vad = VADStateMachine(threshold)
    seg_queue = queue.Queue()
    stop_event = threading.Event()
    start_time = time.monotonic()

    def transcription_worker():
        """Consume queued ``(start_seconds, audio)`` segments and print text."""
        # Keep draining after stop is requested so queued segments still get
        # transcribed before the thread exits.
        while not stop_event.is_set() or not seg_queue.empty():
            try:
                seg_start, audio = seg_queue.get(timeout=0.5)
            except queue.Empty:
                continue
            minutes, seconds = divmod(int(seg_start), 60)
            try:
                text = transcribe_audio(processor, model, audio, language).strip()
            except Exception as exc:
                # Thread boundary: one failed segment must not kill the worker
                # and silently end all further transcription.
                print(
                    f"[{minutes:02d}:{seconds:02d}] transcription failed: {exc}",
                    file=sys.stderr,
                )
                continue
            if text:
                print(f"[{minutes:02d}:{seconds:02d}] {text}")

    worker = threading.Thread(target=transcription_worker, daemon=True)
    worker.start()

    # Samples received from the stream that do not yet fill a whole frame.
    frame_buf = np.empty(0, dtype="float32")

    def audio_callback(indata, frames, time_info, status):
        """Split incoming audio into frames and queue finished VAD segments."""
        nonlocal frame_buf
        if stop_event.is_set():
            return
        if status:
            # Surface stream problems (e.g. input overflow) instead of hiding them.
            print(status, file=sys.stderr)
        # Only the first channel is used (the stream is opened with channels=1).
        frame_buf = np.append(frame_buf, indata[:, 0])
        while len(frame_buf) >= FRAME_SIZE:
            frame = frame_buf[:FRAME_SIZE]
            frame_buf = frame_buf[FRAME_SIZE:]
            elapsed = time.monotonic() - start_time
            result = vad.process_frame(frame, elapsed)
            if result is not None:
                seg_queue.put(result)

    print("Listening... (Ctrl+C to stop)")
    stream = sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="float32",
        callback=audio_callback,
        blocksize=FRAME_SIZE,
    )

    try:
        with stream:
            while True:
                time.sleep(0.1)
    except KeyboardInterrupt:
        pass

    # Flush any remaining speech segment. The stream has already been closed
    # by the ``with`` block, so the callback no longer touches ``vad``. The
    # segment must be queued BEFORE the stop flag is raised: otherwise the
    # worker may see "stopped and queue empty", exit, and drop the final
    # utterance.
    if vad.speaking and vad.segment:
        seg_queue.put((vad.segment_start_time, np.concatenate(vad.segment)))

    stop_event.set()
    worker.join(timeout=30)
    print("\nDone.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user