# Reconstructed from a whitespace-collapsed git patch.
#
#   Commit:  50f8d158c40d09f87a162562fdb278b35f9db936
#   From:    Wong Ding Feng
#   Date:    Sat, 30 May 2026 21:37:20 +0800
#   Subject: feat: add voice command processing and input backend interface
#
#   Introduce InputBackend protocol with WtypeBackend and PrintBackend, and
#   a command processor that translates spoken commands (enter, new line,
#   question mark, comma, etc.) into key presses and punctuation.
#
#   Co-Authored-By: Claude Opus 4.6
#
# Files touched by the patch:
#   src/cohere_transcribe/backend.py   (new file)
#   src/cohere_transcribe/commands.py  (new file)
#   src/cohere_transcribe/daemon.py    (modified)

# ---------------------------------------------------------------------------
# src/cohere_transcribe/backend.py
# ---------------------------------------------------------------------------
import subprocess
import sys
from typing import Protocol


class InputBackend(Protocol):
    """Structural interface for anything that can receive dictated output.

    Implementations provide two operations: emitting literal text and
    emitting a single named key (the names used by the command table are
    "Return", "Tab" and "BackSpace").
    """

    def type_text(self, text: str) -> None:
        """Emit *text* literally."""
        ...

    def send_key(self, key: str) -> None:
        """Emit the single key named *key*."""
        ...


def _run_wtype(args: list[str]) -> None:
    """Run ``wtype`` with *args*, reporting failures on stderr.

    Failures are deliberately not raised: a missing binary, a non-zero exit
    status or a timeout is printed to stderr and the call returns normally.
    """
    try:
        subprocess.run(["wtype", *args], check=True, timeout=10)
    except FileNotFoundError:
        print("wtype not found — install it for keyboard injection", file=sys.stderr)
    except subprocess.SubprocessError as e:
        # Covers CalledProcessError (check=True) and TimeoutExpired.
        print(f"wtype error: {e}", file=sys.stderr)


class WtypeBackend:
    """Backend that injects input by invoking the ``wtype`` command."""

    def type_text(self, text: str) -> None:
        """Type *text* via ``wtype -- <text>``.

        The ``--`` separator keeps text that starts with ``-`` from being
        parsed as a command-line option.
        """
        _run_wtype(["--", text])

    def send_key(self, key: str) -> None:
        """Press the key named *key* via ``wtype -k <key>``."""
        _run_wtype(["-k", key])


class PrintBackend:
    """Backend that writes to stdout instead of injecting keystrokes."""

    # Keys that have a direct character equivalent; anything else is shown
    # as "[KeyName]".
    _KEY_GLYPHS = {"Return": "\n", "Tab": "\t", "BackSpace": "\b"}

    def type_text(self, text: str) -> None:
        """Print *text* without a trailing newline and flush immediately."""
        print(text, end="", flush=True)

    def send_key(self, key: str) -> None:
        """Print the character for *key*, or ``[key]`` when it has none."""
        print(self._KEY_GLYPHS.get(key, f"[{key}]"), end="", flush=True)


# ---------------------------------------------------------------------------
# src/cohere_transcribe/commands.py
#
# In the package layout this module begins with
#     from .backend import InputBackend
# In this single-file listing the protocol is already defined above.
# ---------------------------------------------------------------------------
import re
from collections.abc import Mapping

# Spoken phrase -> sequence of key names sent through InputBackend.send_key.
KEY_COMMANDS: dict[str, list[str]] = {
    "new line": ["Return"],
    "newline": ["Return"],
    "enter": ["Return"],
    "press enter": ["Return"],
    "new paragraph": ["Return", "Return"],
    "tab": ["Tab"],
    "backspace": ["BackSpace"],
}

# Spoken phrase -> punctuation symbol substituted into the text.
PUNCTUATION: dict[str, str] = {
    "question mark": "?",
    "exclamation mark": "!",
    "exclamation point": "!",
    "period": ".",
    "full stop": ".",
    "comma": ",",
    "colon": ":",
    "semicolon": ";",
    "open quote": '"',
    "close quote": '"',
    "open paren": "(",
    "close paren": ")",
}


def _build_pattern(commands: Mapping[str, object]) -> re.Pattern[str]:
    """Compile a case-insensitive whole-word matcher for the keys of *commands*.

    Longer phrases are tried first so that "press enter" wins over "enter".
    The matched phrase is capture group 1, which makes ``Pattern.split``
    return the matched phrases interleaved with the surrounding text.

    An empty mapping yields a pattern that never matches. Without that
    guard the alternation would be empty and would match the empty string
    at every word boundary.
    """
    if not commands:
        return re.compile(r"(?!)")
    longest_first = sorted(commands, key=len, reverse=True)
    alternatives = "|".join(re.escape(phrase) for phrase in longest_first)
    return re.compile(r"\b(" + alternatives + r")\b", re.IGNORECASE)


_KEY_PATTERN = _build_pattern(KEY_COMMANDS)
_PUNCT_PATTERN = _build_pattern(PUNCTUATION)

# Both quote phrases map to the same '"' character, so once substituted the
# direction is lost. Private-use code points stand in for the two quotes
# while spacing is fixed up, and are turned into '"' at the very end.
_OPEN_QUOTE_MARK = "\ue000"
_CLOSE_QUOTE_MARK = "\ue001"
_QUOTE_MARKS = {"open quote": _OPEN_QUOTE_MARK, "close quote": _CLOSE_QUOTE_MARK}

# Closing punctuation hugs the word before it; opening punctuation hugs the
# word after it.
_SPACE_BEFORE_CLOSER = re.compile(rf"\s+([?.!,;:){_CLOSE_QUOTE_MARK}])")
_SPACE_AFTER_OPENER = re.compile(rf"([({_OPEN_QUOTE_MARK}])\s+")


def _spoken_to_symbol(match: re.Match[str]) -> str:
    """Return the replacement for one spoken punctuation phrase."""
    phrase = match.group(1).lower()
    return _QUOTE_MARKS.get(phrase, PUNCTUATION[phrase])


def _apply_punctuation(text: str) -> str:
    """Replace spoken punctuation in *text* and tidy the spacing around it."""
    text = _PUNCT_PATTERN.sub(_spoken_to_symbol, text)
    text = _SPACE_BEFORE_CLOSER.sub(r"\1", text)
    text = _SPACE_AFTER_OPENER.sub(r"\1", text)
    return text.replace(_OPEN_QUOTE_MARK, '"').replace(_CLOSE_QUOTE_MARK, '"')


def process_and_output(text: str, backend: InputBackend) -> None:
    """Translate spoken commands in *text* and send the result to *backend*.

    Punctuation phrases listed in ``PUNCTUATION`` are replaced by their
    symbols, with closing symbols attached to the preceding word and opening
    quotes/parentheses attached to the following word. The text is then
    split on the phrases in ``KEY_COMMANDS``: each command is sent as key
    presses via ``backend.send_key``, and every non-empty stretch of text in
    between is stripped and sent via ``backend.type_text`` followed by a
    single space.

    Matching is purely lexical: any whole-word occurrence of a phrase
    (for example the word "period") is treated as a command.
    """
    pieces = _KEY_PATTERN.split(_apply_punctuation(text))

    # Splitting on a pattern with one capture group alternates plain text
    # (even indices) with the matched command phrases (odd indices).
    for index, piece in enumerate(pieces):
        if index % 2:
            for key in KEY_COMMANDS[piece.lower()]:
                backend.send_key(key)
            continue
        cleaned = piece.strip()
        if cleaned:
            backend.type_text(cleaned + " ")


# ---------------------------------------------------------------------------
# src/cohere_transcribe/daemon.py  (modified by the patch; only hunks are
# available, so the changes are summarised rather than reproduced as code)
#
#   * removed imports:  ``import subprocess`` and ``import sys``
#   * added imports:    ``from .backend import WtypeBackend``
#                       ``from .commands import process_and_output``
#   * the private helper ``_type_text(text)``, which ran
#     ``wtype -- <text>`` with the same error handling now found in
#     WtypeBackend, is deleted and replaced by a module-level instance:
#         _backend = WtypeBackend()
#   * in ``run_daemon``'s transcription worker the call
#         _type_text(text + " ")
#     becomes
#         process_and_output(text, _backend)
# ---------------------------------------------------------------------------