From f083e424c9f7c99552395a5dc86bab6d41955a44 Mon Sep 17 00:00:00 2001 From: Wong Ding Feng Date: Sat, 30 May 2026 21:12:26 +0800 Subject: [PATCH] feat: make silence pause duration configurable via --pause flag Default is 0.3s for responsive typing. Configurable on both `cohere on --pause` and `cohere transcribe --stream --pause`. Co-Authored-By: Claude Opus 4.6 --- src/cohere_transcribe/cli/cli.py | 12 +++++++++--- src/cohere_transcribe/daemon.py | 7 ++++--- src/cohere_transcribe/daemon_main.py | 3 ++- src/cohere_transcribe/stream.py | 6 +++--- src/cohere_transcribe/vad.py | 11 ++++++++--- 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/cohere_transcribe/cli/cli.py b/src/cohere_transcribe/cli/cli.py index 4aee2ad..d4561a1 100644 --- a/src/cohere_transcribe/cli/cli.py +++ b/src/cohere_transcribe/cli/cli.py @@ -15,6 +15,7 @@ console = Console() @app.command() def on( language: str = typer.Option("en", "--lang", "-l", help="Language code"), + pause: float = typer.Option(0.3, "--pause", "-p", help="Seconds of silence before sending text"), foreground: bool = typer.Option(False, "--fg", help="Run in foreground (don't daemonize)"), ): """Start transcribing and typing into your focused window.""" @@ -25,13 +26,16 @@ def on( if foreground: from ..daemon import run_daemon console.print("[green]Starting cohere (foreground)...[/green]") - run_daemon(language) + run_daemon(language, pause=pause) return console.print("[green]Starting cohere daemon...[/green]") os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True) + cmd = [sys.executable, "-m", "cohere_transcribe.daemon_main", "--lang", language] + if pause != 0.3: + cmd += ["--pause", str(pause)] subprocess.Popen( - [sys.executable, "-m", "cohere_transcribe.daemon_main", "--lang", language], + cmd, start_new_session=True, stdin=subprocess.DEVNULL, stdout=open(os.path.join(os.path.dirname(STATE_FILE), "daemon.log"), "a"), @@ -85,14 +89,16 @@ def transcribe( mic: int = typer.Option(None, "--mic", "-m", help="Record from mic for N seconds"), stream: bool = typer.Option(False, "--stream", "-s", help="Live streaming mode (prints to terminal)"), language: str = typer.Option("en", "--lang", "-l", help="Language code"), + pause: float = typer.Option(0.3, "--pause", "-p", help="Seconds of silence before sending text"), ): """One-shot transcription (file, mic, or stream to terminal).""" from ..model import load_model, transcribe_audio + from ..vad import pause_seconds_to_frames if stream: from ..stream import stream_transcribe processor, model = load_model() - stream_transcribe(processor, model, language) + stream_transcribe(processor, model, language, silence_frames=pause_seconds_to_frames(pause)) elif mic is not None: from ..model import record_audio processor, model = load_model() diff --git a/src/cohere_transcribe/daemon.py b/src/cohere_transcribe/daemon.py index 6c7be11..2d04325 100644 --- a/src/cohere_transcribe/daemon.py +++ b/src/cohere_transcribe/daemon.py @@ -11,7 +11,7 @@ import numpy as np import sounddevice as sd from .model import SAMPLE_RATE, load_model, transcribe_audio -from .vad import FRAME_SIZE, VADStateMachine, calibrate_silence +from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence, pause_seconds_to_frames STATE_DIR = os.path.expanduser("~/.local/state/cohere") STATE_FILE = os.path.join(STATE_DIR, "state.json") @@ -77,7 +77,7 @@ def stop_daemon() -> bool: return False -def run_daemon(language: str = "en"): +def run_daemon(language: str = "en", pause: float | None = None): pid = os.getpid() _write_state(pid, "starting") @@ -86,9 +86,10 @@ def run_daemon(language: str = "en"): signal.signal(signal.SIGTERM, handle_sigterm) + silence_frames = pause_seconds_to_frames(pause) if pause else DEFAULT_SILENCE_FRAMES processor, model = load_model() threshold = calibrate_silence() - vad = VADStateMachine(threshold) + vad = VADStateMachine(threshold, silence_frames=silence_frames) seg_queue: queue.Queue = queue.Queue() stop_event = threading.Event() start_time = time.monotonic() diff --git a/src/cohere_transcribe/daemon_main.py b/src/cohere_transcribe/daemon_main.py index 4a5d02f..0d21305 100644 --- a/src/cohere_transcribe/daemon_main.py +++ b/src/cohere_transcribe/daemon_main.py @@ -4,5 +4,6 @@ from .daemon import run_daemon parser = argparse.ArgumentParser() parser.add_argument("--lang", default="en") +parser.add_argument("--pause", type=float, default=None) args = parser.parse_args() -run_daemon(args.lang) +run_daemon(args.lang, pause=args.pause) diff --git a/src/cohere_transcribe/stream.py b/src/cohere_transcribe/stream.py index 5f06412..7040941 100644 --- a/src/cohere_transcribe/stream.py +++ b/src/cohere_transcribe/stream.py @@ -7,12 +7,12 @@ import numpy as np import sounddevice as sd from .model import SAMPLE_RATE, transcribe_audio -from .vad import FRAME_SIZE, VADStateMachine, calibrate_silence +from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence -def stream_transcribe(processor, model, language): +def stream_transcribe(processor, model, language, silence_frames=DEFAULT_SILENCE_FRAMES): threshold = calibrate_silence() - vad = VADStateMachine(threshold) + vad = VADStateMachine(threshold, silence_frames=silence_frames) seg_queue = queue.Queue() stop_event = threading.Event() start_time = time.monotonic() diff --git a/src/cohere_transcribe/vad.py b/src/cohere_transcribe/vad.py index e292c22..f9e9d96 100644 --- a/src/cohere_transcribe/vad.py +++ b/src/cohere_transcribe/vad.py @@ -7,11 +7,15 @@ from .model import SAMPLE_RATE FRAME_SIZE = 800 # 50ms at 16kHz PRE_ROLL_FRAMES = 6 # ~0.3s of audio before speech onset -SILENCE_FRAMES = 16 # ~0.8s of silence to end a segment +DEFAULT_SILENCE_FRAMES = 16 # ~0.8s of silence to end a segment SPEECH_ONSET_FRAMES = 3 # ~150ms of speech to trigger MAX_SPEECH_SECONDS = 30 # force chunk boundary +def pause_seconds_to_frames(seconds: float) -> int: + return max(1, round(seconds / (FRAME_SIZE / SAMPLE_RATE))) + + def calibrate_silence(duration=0.5): print("Calibrating silence threshold...") audio = sd.rec(int(duration * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype="float32") @@ -23,8 +27,9 @@ def calibrate_silence(duration=0.5): class VADStateMachine: - def __init__(self, threshold): + def __init__(self, threshold, silence_frames=DEFAULT_SILENCE_FRAMES): self.threshold = threshold + self.silence_limit = silence_frames self.speaking = False self.speech_frames = 0 self.silence_frames = 0 @@ -61,7 +66,7 @@ class VADStateMachine: self.silence_frames += 1 segment_duration = len(self.segment) * FRAME_SIZE / SAMPLE_RATE - if self.silence_frames >= SILENCE_FRAMES or segment_duration >= MAX_SPEECH_SECONDS: + if self.silence_frames >= self.silence_limit or segment_duration >= MAX_SPEECH_SECONDS: result = (self.segment_start_time, np.concatenate(self.segment)) self.speaking = False self.speech_frames = 0