feat: make silence pause duration configurable via --pause flag
Default is 0.3s for responsive typing. Configurable on both `cohere on --pause` and `cohere transcribe --stream --pause`.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,7 @@ console = Console()
|
|||||||
@app.command()
|
@app.command()
|
||||||
def on(
|
def on(
|
||||||
language: str = typer.Option("en", "--lang", "-l", help="Language code"),
|
language: str = typer.Option("en", "--lang", "-l", help="Language code"),
|
||||||
|
pause: float = typer.Option(0.3, "--pause", "-p", help="Seconds of silence before sending text"),
|
||||||
foreground: bool = typer.Option(False, "--fg", help="Run in foreground (don't daemonize)"),
|
foreground: bool = typer.Option(False, "--fg", help="Run in foreground (don't daemonize)"),
|
||||||
):
|
):
|
||||||
"""Start transcribing and typing into your focused window."""
|
"""Start transcribing and typing into your focused window."""
|
||||||
@@ -25,13 +26,16 @@ def on(
|
|||||||
if foreground:
|
if foreground:
|
||||||
from ..daemon import run_daemon
|
from ..daemon import run_daemon
|
||||||
console.print("[green]Starting cohere (foreground)...[/green]")
|
console.print("[green]Starting cohere (foreground)...[/green]")
|
||||||
run_daemon(language)
|
run_daemon(language, pause=pause)
|
||||||
return
|
return
|
||||||
|
|
||||||
console.print("[green]Starting cohere daemon...[/green]")
|
console.print("[green]Starting cohere daemon...[/green]")
|
||||||
os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
|
os.makedirs(os.path.dirname(STATE_FILE), exist_ok=True)
|
||||||
|
cmd = [sys.executable, "-m", "cohere_transcribe.daemon_main", "--lang", language]
|
||||||
|
if pause != 0.3:
|
||||||
|
cmd += ["--pause", str(pause)]
|
||||||
subprocess.Popen(
|
subprocess.Popen(
|
||||||
[sys.executable, "-m", "cohere_transcribe.daemon_main", "--lang", language],
|
cmd,
|
||||||
start_new_session=True,
|
start_new_session=True,
|
||||||
stdin=subprocess.DEVNULL,
|
stdin=subprocess.DEVNULL,
|
||||||
stdout=open(os.path.join(os.path.dirname(STATE_FILE), "daemon.log"), "a"),
|
stdout=open(os.path.join(os.path.dirname(STATE_FILE), "daemon.log"), "a"),
|
||||||
@@ -85,14 +89,16 @@ def transcribe(
|
|||||||
mic: int = typer.Option(None, "--mic", "-m", help="Record from mic for N seconds"),
|
mic: int = typer.Option(None, "--mic", "-m", help="Record from mic for N seconds"),
|
||||||
stream: bool = typer.Option(False, "--stream", "-s", help="Live streaming mode (prints to terminal)"),
|
stream: bool = typer.Option(False, "--stream", "-s", help="Live streaming mode (prints to terminal)"),
|
||||||
language: str = typer.Option("en", "--lang", "-l", help="Language code"),
|
language: str = typer.Option("en", "--lang", "-l", help="Language code"),
|
||||||
|
pause: float = typer.Option(0.3, "--pause", "-p", help="Seconds of silence before sending text"),
|
||||||
):
|
):
|
||||||
"""One-shot transcription (file, mic, or stream to terminal)."""
|
"""One-shot transcription (file, mic, or stream to terminal)."""
|
||||||
from ..model import load_model, transcribe_audio
|
from ..model import load_model, transcribe_audio
|
||||||
|
from ..vad import pause_seconds_to_frames
|
||||||
|
|
||||||
if stream:
|
if stream:
|
||||||
from ..stream import stream_transcribe
|
from ..stream import stream_transcribe
|
||||||
processor, model = load_model()
|
processor, model = load_model()
|
||||||
stream_transcribe(processor, model, language)
|
stream_transcribe(processor, model, language, silence_frames=pause_seconds_to_frames(pause))
|
||||||
elif mic is not None:
|
elif mic is not None:
|
||||||
from ..model import record_audio
|
from ..model import record_audio
|
||||||
processor, model = load_model()
|
processor, model = load_model()
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ import numpy as np
|
|||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
|
|
||||||
from .model import SAMPLE_RATE, load_model, transcribe_audio
|
from .model import SAMPLE_RATE, load_model, transcribe_audio
|
||||||
from .vad import FRAME_SIZE, VADStateMachine, calibrate_silence
|
from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence, pause_seconds_to_frames
|
||||||
|
|
||||||
STATE_DIR = os.path.expanduser("~/.local/state/cohere")
|
STATE_DIR = os.path.expanduser("~/.local/state/cohere")
|
||||||
STATE_FILE = os.path.join(STATE_DIR, "state.json")
|
STATE_FILE = os.path.join(STATE_DIR, "state.json")
|
||||||
@@ -77,7 +77,7 @@ def stop_daemon() -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def run_daemon(language: str = "en"):
|
def run_daemon(language: str = "en", pause: float | None = None):
|
||||||
pid = os.getpid()
|
pid = os.getpid()
|
||||||
_write_state(pid, "starting")
|
_write_state(pid, "starting")
|
||||||
|
|
||||||
@@ -86,9 +86,10 @@ def run_daemon(language: str = "en"):
|
|||||||
|
|
||||||
signal.signal(signal.SIGTERM, handle_sigterm)
|
signal.signal(signal.SIGTERM, handle_sigterm)
|
||||||
|
|
||||||
|
silence_frames = pause_seconds_to_frames(pause) if pause else DEFAULT_SILENCE_FRAMES
|
||||||
processor, model = load_model()
|
processor, model = load_model()
|
||||||
threshold = calibrate_silence()
|
threshold = calibrate_silence()
|
||||||
vad = VADStateMachine(threshold)
|
vad = VADStateMachine(threshold, silence_frames=silence_frames)
|
||||||
seg_queue: queue.Queue = queue.Queue()
|
seg_queue: queue.Queue = queue.Queue()
|
||||||
stop_event = threading.Event()
|
stop_event = threading.Event()
|
||||||
start_time = time.monotonic()
|
start_time = time.monotonic()
|
||||||
|
|||||||
@@ -4,5 +4,6 @@ from .daemon import run_daemon
|
|||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--lang", default="en")
|
parser.add_argument("--lang", default="en")
|
||||||
|
parser.add_argument("--pause", type=float, default=None)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
run_daemon(args.lang)
|
run_daemon(args.lang, pause=args.pause)
|
||||||
|
|||||||
@@ -7,12 +7,12 @@ import numpy as np
|
|||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
|
|
||||||
from .model import SAMPLE_RATE, transcribe_audio
|
from .model import SAMPLE_RATE, transcribe_audio
|
||||||
from .vad import FRAME_SIZE, VADStateMachine, calibrate_silence
|
from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence
|
||||||
|
|
||||||
|
|
||||||
def stream_transcribe(processor, model, language):
|
def stream_transcribe(processor, model, language, silence_frames=DEFAULT_SILENCE_FRAMES):
|
||||||
threshold = calibrate_silence()
|
threshold = calibrate_silence()
|
||||||
vad = VADStateMachine(threshold)
|
vad = VADStateMachine(threshold, silence_frames=silence_frames)
|
||||||
seg_queue = queue.Queue()
|
seg_queue = queue.Queue()
|
||||||
stop_event = threading.Event()
|
stop_event = threading.Event()
|
||||||
start_time = time.monotonic()
|
start_time = time.monotonic()
|
||||||
|
|||||||
@@ -7,11 +7,15 @@ from .model import SAMPLE_RATE
|
|||||||
|
|
||||||
FRAME_SIZE = 800 # 50ms at 16kHz
|
FRAME_SIZE = 800 # 50ms at 16kHz
|
||||||
PRE_ROLL_FRAMES = 6 # ~0.3s of audio before speech onset
|
PRE_ROLL_FRAMES = 6 # ~0.3s of audio before speech onset
|
||||||
SILENCE_FRAMES = 16 # ~0.8s of silence to end a segment
|
DEFAULT_SILENCE_FRAMES = 16 # ~0.8s of silence to end a segment
|
||||||
SPEECH_ONSET_FRAMES = 3 # ~150ms of speech to trigger
|
SPEECH_ONSET_FRAMES = 3 # ~150ms of speech to trigger
|
||||||
MAX_SPEECH_SECONDS = 30 # force chunk boundary
|
MAX_SPEECH_SECONDS = 30 # force chunk boundary
|
||||||
|
|
||||||
|
|
||||||
|
def pause_seconds_to_frames(seconds: float) -> int:
|
||||||
|
return max(1, round(seconds / (FRAME_SIZE / SAMPLE_RATE)))
|
||||||
|
|
||||||
|
|
||||||
def calibrate_silence(duration=0.5):
|
def calibrate_silence(duration=0.5):
|
||||||
print("Calibrating silence threshold...")
|
print("Calibrating silence threshold...")
|
||||||
audio = sd.rec(int(duration * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype="float32")
|
audio = sd.rec(int(duration * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype="float32")
|
||||||
@@ -23,8 +27,9 @@ def calibrate_silence(duration=0.5):
|
|||||||
|
|
||||||
|
|
||||||
class VADStateMachine:
|
class VADStateMachine:
|
||||||
def __init__(self, threshold):
|
def __init__(self, threshold, silence_frames=DEFAULT_SILENCE_FRAMES):
|
||||||
self.threshold = threshold
|
self.threshold = threshold
|
||||||
|
self.silence_limit = silence_frames
|
||||||
self.speaking = False
|
self.speaking = False
|
||||||
self.speech_frames = 0
|
self.speech_frames = 0
|
||||||
self.silence_frames = 0
|
self.silence_frames = 0
|
||||||
@@ -61,7 +66,7 @@ class VADStateMachine:
|
|||||||
self.silence_frames += 1
|
self.silence_frames += 1
|
||||||
|
|
||||||
segment_duration = len(self.segment) * FRAME_SIZE / SAMPLE_RATE
|
segment_duration = len(self.segment) * FRAME_SIZE / SAMPLE_RATE
|
||||||
if self.silence_frames >= SILENCE_FRAMES or segment_duration >= MAX_SPEECH_SECONDS:
|
if self.silence_frames >= self.silence_limit or segment_duration >= MAX_SPEECH_SECONDS:
|
||||||
result = (self.segment_start_time, np.concatenate(self.segment))
|
result = (self.segment_start_time, np.concatenate(self.segment))
|
||||||
self.speaking = False
|
self.speaking = False
|
||||||
self.speech_frames = 0
|
self.speech_frames = 0
|
||||||
|
|||||||
Reference in New Issue
Block a user