From f083e424c9f7c99552395a5dc86bab6d41955a44 Mon Sep 17 00:00:00 2001
From: Wong Ding Feng <dingfengwong@gmail.com>
Date: Sat, 30 May 2026 21:12:26 +0800
Subject: [PATCH] feat: make silence pause duration configurable via --pause
 flag

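The `--pause` option defaults to 0.3s for responsive typing and is
available on both `cohere on --pause` and
`cohere transcribe --stream --pause`. Library callers that omit
`silence_frames` keep the previous ~0.8s (DEFAULT_SILENCE_FRAMES).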

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/cohere_transcribe/cli/cli.py     | 12 +++++++++---
 src/cohere_transcribe/daemon.py      |  7 ++++---
 src/cohere_transcribe/daemon_main.py |  3 ++-
 src/cohere_transcribe/stream.py      |  6 +++---
 src/cohere_transcribe/vad.py         | 11 ++++++++---
 5 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/cohere_transcribe/cli/cli.py b/src/cohere_transcribe/cli/cli.py
index 4aee2ad..d4561a1 100644
--- a/src/cohere_transcribe/cli/cli.py
+++ b/src/cohere_transcribe/cli/cli.py
@@ -15,6 +15,7 @@ console = Console()
 @app.command()
 def on(
     language: str = typer.Option("en", "--lang", "-l", help="Language code"),
+    pause: float = typer.Option(0.3, "--pause", "-p", help="Seconds of silence before sending text"),
     foreground: bool = typer.Option(False, "--fg", help="Run in foreground (don't daemonize)"),
 ):
     """Start transcribing and typing into your focused window."""
@@ -25,13 +26,16 @@ def on(
     if foreground:
         from ..daemon import run_daemon
         console.print("[green]Starting cohere (foreground)...[/green]")
-        run_daemon(language)
+        run_daemon(language, pause=pause)
         return
 
     console.print("[green]Starting cohere daemon...[/green]")
     os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
+    # Always forward --pause: without it daemon_main passes pause=None and run_daemon
+    # falls back to DEFAULT_SILENCE_FRAMES (~0.8s), not the 0.3s default declared above.
+    cmd = [sys.executable, "-m", "cohere_transcribe.daemon_main", "--lang", language, "--pause", str(pause)]
     subprocess.Popen(
-        [sys.executable, "-m", "cohere_transcribe.daemon_main", "--lang", language],
+        cmd,
         start_new_session=True,
         stdin=subprocess.DEVNULL,
         stdout=open(os.path.join(os.path.dirname(STATE_FILE), "daemon.log"), "a"),
@@ -85,14 +89,16 @@ def transcribe(
     mic: int = typer.Option(None, "--mic", "-m", help="Record from mic for N seconds"),
     stream: bool = typer.Option(False, "--stream", "-s", help="Live streaming mode (prints to terminal)"),
     language: str = typer.Option("en", "--lang", "-l", help="Language code"),
+    pause: float = typer.Option(0.3, "--pause", "-p", help="Seconds of silence before sending text"),
 ):
     """One-shot transcription (file, mic, or stream to terminal)."""
     from ..model import load_model, transcribe_audio
+    from ..vad import pause_seconds_to_frames
 
     if stream:
         from ..stream import stream_transcribe
         processor, model = load_model()
-        stream_transcribe(processor, model, language)
+        stream_transcribe(processor, model, language, silence_frames=pause_seconds_to_frames(pause))
     elif mic is not None:
         from ..model import record_audio
         processor, model = load_model()
diff --git a/src/cohere_transcribe/daemon.py b/src/cohere_transcribe/daemon.py
index 6c7be11..2d04325 100644
--- a/src/cohere_transcribe/daemon.py
+++ b/src/cohere_transcribe/daemon.py
@@ -11,7 +11,7 @@ import numpy as np
 import sounddevice as sd
 
 from .model import SAMPLE_RATE, load_model, transcribe_audio
-from .vad import FRAME_SIZE, VADStateMachine, calibrate_silence
+from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence, pause_seconds_to_frames
 
 STATE_DIR = os.path.expanduser("~/.local/state/cohere")
 STATE_FILE = os.path.join(STATE_DIR, "state.json")
@@ -77,7 +77,7 @@ def stop_daemon() -> bool:
         return False
 
 
-def run_daemon(language: str = "en"):
+def run_daemon(language: str = "en", pause: float | None = None):
     pid = os.getpid()
     _write_state(pid, "starting")
 
@@ -86,9 +86,10 @@ def run_daemon(language: str = "en"):
 
     signal.signal(signal.SIGTERM, handle_sigterm)
 
+    silence_frames = DEFAULT_SILENCE_FRAMES if pause is None else pause_seconds_to_frames(pause)  # 0.0 is a real value, not "unset"
     processor, model = load_model()
     threshold = calibrate_silence()
-    vad = VADStateMachine(threshold)
+    vad = VADStateMachine(threshold, silence_frames=silence_frames)
     seg_queue: queue.Queue = queue.Queue()
     stop_event = threading.Event()
     start_time = time.monotonic()
diff --git a/src/cohere_transcribe/daemon_main.py b/src/cohere_transcribe/daemon_main.py
index 4a5d02f..0d21305 100644
--- a/src/cohere_transcribe/daemon_main.py
+++ b/src/cohere_transcribe/daemon_main.py
@@ -4,5 +4,6 @@ from .daemon import run_daemon
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--lang", default="en")
+parser.add_argument("--pause", type=float, default=None)
 args = parser.parse_args()
-run_daemon(args.lang)
+run_daemon(args.lang, pause=args.pause)
diff --git a/src/cohere_transcribe/stream.py b/src/cohere_transcribe/stream.py
index 5f06412..7040941 100644
--- a/src/cohere_transcribe/stream.py
+++ b/src/cohere_transcribe/stream.py
@@ -7,12 +7,12 @@ import numpy as np
 import sounddevice as sd
 
 from .model import SAMPLE_RATE, transcribe_audio
-from .vad import FRAME_SIZE, VADStateMachine, calibrate_silence
+from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence
 
 
-def stream_transcribe(processor, model, language):
+def stream_transcribe(processor, model, language, silence_frames=DEFAULT_SILENCE_FRAMES):
     threshold = calibrate_silence()
-    vad = VADStateMachine(threshold)
+    vad = VADStateMachine(threshold, silence_frames=silence_frames)
     seg_queue = queue.Queue()
     stop_event = threading.Event()
     start_time = time.monotonic()
diff --git a/src/cohere_transcribe/vad.py b/src/cohere_transcribe/vad.py
index e292c22..f9e9d96 100644
--- a/src/cohere_transcribe/vad.py
+++ b/src/cohere_transcribe/vad.py
@@ -7,11 +7,15 @@ from .model import SAMPLE_RATE
 
 FRAME_SIZE = 800          # 50ms at 16kHz
 PRE_ROLL_FRAMES = 6       # ~0.3s of audio before speech onset
-SILENCE_FRAMES = 16       # ~0.8s of silence to end a segment
+DEFAULT_SILENCE_FRAMES = 16  # ~0.8s of silence to end a segment
 SPEECH_ONSET_FRAMES = 3   # ~150ms of speech to trigger
 MAX_SPEECH_SECONDS = 30   # force chunk boundary
 
 
+def pause_seconds_to_frames(seconds: float) -> int:  # silence duration in seconds -> count of FRAME_SIZE-sample frames
+    return max(1, round(seconds / (FRAME_SIZE / SAMPLE_RATE)))  # FRAME_SIZE / SAMPLE_RATE is seconds per frame; never below 1 frame
+
+
 def calibrate_silence(duration=0.5):
     print("Calibrating silence threshold...")
     audio = sd.rec(int(duration * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype="float32")
@@ -23,8 +27,9 @@ def calibrate_silence(duration=0.5):
 
 
 class VADStateMachine:
-    def __init__(self, threshold):
+    def __init__(self, threshold, silence_frames=DEFAULT_SILENCE_FRAMES):
         self.threshold = threshold
+        self.silence_limit = silence_frames
         self.speaking = False
         self.speech_frames = 0
         self.silence_frames = 0
@@ -61,7 +66,7 @@ class VADStateMachine:
             self.silence_frames += 1
 
         segment_duration = len(self.segment) * FRAME_SIZE / SAMPLE_RATE
-        if self.silence_frames >= SILENCE_FRAMES or segment_duration >= MAX_SPEECH_SECONDS:
+        if self.silence_frames >= self.silence_limit or segment_duration >= MAX_SPEECH_SECONDS:
             result = (self.segment_start_time, np.concatenate(self.segment))
             self.speaking = False
             self.speech_frames = 0