feat: add voice command processing and input backend interface
Introduce InputBackend protocol with WtypeBackend and PrintBackend, and a command processor that translates spoken commands (enter, new line, question mark, comma, etc.) into key presses and punctuation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,35 @@
|
|||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from typing import Protocol
|
||||||
|
|
||||||
|
|
||||||
|
class InputBackend(Protocol):
    """Structural interface for anything that can receive dictated output.

    Any object providing both methods below satisfies the protocol; no
    inheritance is required.
    """

    def type_text(self, text: str) -> None:
        """Emit *text* as literal characters."""
        ...

    def send_key(self, key: str) -> None:
        """Emit a single named key, e.g. ``"Return"`` or ``"Tab"``."""
        ...
|
||||||
|
|
||||||
|
|
||||||
|
class WtypeBackend:
    """Input backend that injects text and key presses by running ``wtype``.

    Every failure is reported on stderr and swallowed, so a missing or
    failing ``wtype`` never raises into the caller.
    """

    def type_text(self, text: str) -> None:
        """Type *text* by running ``wtype -- <text>``."""
        self._run(["--", text])

    def send_key(self, key: str) -> None:
        """Send the key named *key* by running ``wtype -k <key>``."""
        self._run(["-k", key])

    @staticmethod
    def _run(args: list[str]) -> None:
        """Run ``wtype`` with *args*, printing any failure to stderr."""
        try:
            subprocess.run(["wtype", *args], check=True, timeout=10)
        except FileNotFoundError:
            print("wtype not found — install it for keyboard injection", file=sys.stderr)
        except (subprocess.SubprocessError, OSError) as e:
            # OSError covers launch failures other than "not found" (for
            # example a wtype file that is not executable), which would
            # otherwise escape this best-effort handler.
            print(f"wtype error: {e}", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
class PrintBackend:
    """Backend that echoes text and keys to stdout instead of injecting them."""

    def type_text(self, text: str) -> None:
        """Print *text* with no trailing newline and flush immediately."""
        print(text, end="", flush=True)

    def send_key(self, key: str) -> None:
        """Print a rendering of *key*.

        ``Return``, ``Tab`` and ``BackSpace`` are printed as their control
        characters; any other key name is printed as ``[name]``.
        """
        control_chars = {"Return": "\n", "Tab": "\t", "BackSpace": "\b"}
        try:
            rendered = control_chars[key]
        except KeyError:
            rendered = f"[{key}]"
        print(rendered, end="", flush=True)
|
||||||
@@ -0,0 +1,55 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
from .backend import InputBackend
|
||||||
|
|
||||||
|
# Spoken phrase -> key names sent, in order, through ``backend.send_key``.
# Phrases are looked up in lowercase, so keys here must be lowercase.
# Several phrases may map to the same key; "new paragraph" sends Return twice.
KEY_COMMANDS: dict[str, list[str]] = {
    "new line": ["Return"],
    "newline": ["Return"],
    "enter": ["Return"],
    "press enter": ["Return"],
    "new paragraph": ["Return", "Return"],
    "tab": ["Tab"],
    "backspace": ["BackSpace"],
}
|
||||||
|
|
||||||
|
# Spoken phrase -> punctuation symbol substituted into the transcript.
# Phrases are looked up in lowercase, so keys here must be lowercase.
# "open quote" and "close quote" both produce the same straight double quote.
PUNCTUATION: dict[str, str] = {
    "question mark": "?",
    "exclamation mark": "!",
    "exclamation point": "!",
    "period": ".",
    "full stop": ".",
    "comma": ",",
    "colon": ":",
    "semicolon": ";",
    "open quote": '"',
    "close quote": '"',
    "open paren": "(",
    "close paren": ")",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pattern(commands: dict) -> re.Pattern:
    """Compile a regex matching any key of *commands* as a whole phrase.

    Matching is case-insensitive and anchored on word boundaries. Longer
    phrases are tried first so that, for example, "press enter" wins over
    "enter". The matched phrase is available as group 1.

    Args:
        commands: Mapping whose keys are the phrases to recognise.

    Returns:
        The compiled pattern. For an empty mapping, a pattern that never
        matches.
    """
    if not commands:
        # An empty alternation would compile to \b()\b and match the empty
        # string at every word boundary, corrupting sub() and split().
        return re.compile(r"((?!))")

    longest_first = sorted(commands, key=len, reverse=True)
    alternation = "|".join(map(re.escape, longest_first))
    return re.compile(r"\b(" + alternation + r")\b", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
# Compiled once at import time; process_and_output reuses them on every call.
_KEY_PATTERN = _build_pattern(KEY_COMMANDS)
_PUNCT_PATTERN = _build_pattern(PUNCTUATION)
|
||||||
|
|
||||||
|
|
||||||
|
# Spoken punctuation that opens a group. It attaches to the text that follows
# it, whereas every other entry attaches to the text that precedes it.
_OPENING_PUNCTUATION = frozenset({"open quote", "open paren"})


def _apply_punctuation(text: str) -> str:
    """Replace spoken punctuation in *text* with symbols and fix the spacing.

    Whitespace after an opening symbol and before any other symbol is
    dropped, so "say open quote hi close quote" becomes 'say "hi"'.
    """
    pieces: list[str] = []
    pos = 0
    after_opener = False
    for match in _PUNCT_PATTERN.finditer(text):
        spoken = match.group(1).lower()
        gap = text[pos:match.start()]
        if after_opener:
            gap = gap.lstrip()
        after_opener = spoken in _OPENING_PUNCTUATION
        if not after_opener:
            gap = gap.rstrip()
        pieces.append(gap)
        pieces.append(PUNCTUATION[spoken])
        pos = match.end()

    tail = text[pos:]
    if after_opener:
        tail = tail.lstrip()
    pieces.append(tail)
    return "".join(pieces)


def process_and_output(text: str, backend: InputBackend) -> None:
    """Translate spoken commands in *text* and send the result to *backend*.

    Spoken punctuation ("comma", "question mark", ...) is replaced by its
    symbol first. The text is then split on key commands ("enter", "tab",
    ...): each command is sent via ``backend.send_key`` and each non-empty
    stretch of remaining text via ``backend.type_text`` with one trailing
    space appended.

    Args:
        text: Transcribed utterance.
        backend: Receiver of the typed text and key presses.
    """
    text = _apply_punctuation(text)
    # Tidy punctuation that was already present in the transcript. The double
    # quote is deliberately excluded: a bare '"' may be an opening quote, and
    # stripping the space before it would glue it to the previous word.
    text = re.sub(r"\s+([?.!,;:)])", r"\1", text)

    for part in _KEY_PATTERN.split(text):
        cleaned = part.strip()
        keys = KEY_COMMANDS.get(cleaned.lower())
        if keys is not None:
            for key in keys:
                backend.send_key(key)
        elif cleaned:
            backend.type_text(cleaned + " ")
|
||||||
@@ -1,8 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import signal
|
import signal
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
import queue
|
import queue
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
@@ -10,6 +8,8 @@ import time
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
|
|
||||||
|
from .backend import WtypeBackend
|
||||||
|
from .commands import process_and_output
|
||||||
from .model import SAMPLE_RATE, load_model, transcribe_audio
|
from .model import SAMPLE_RATE, load_model, transcribe_audio
|
||||||
from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence, pause_seconds_to_frames
|
from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence, pause_seconds_to_frames
|
||||||
|
|
||||||
@@ -24,13 +24,7 @@ def _write_state(pid: int, status: str):
|
|||||||
json.dump({"pid": pid, "status": status, "started_at": time.time()}, f)
|
json.dump({"pid": pid, "status": status, "started_at": time.time()}, f)
|
||||||
|
|
||||||
|
|
||||||
def _type_text(text: str):
|
_backend = WtypeBackend()
|
||||||
try:
|
|
||||||
subprocess.run(["wtype", "--", text], check=True, timeout=10)
|
|
||||||
except FileNotFoundError:
|
|
||||||
print("wtype not found — install it for keyboard injection", file=sys.stderr)
|
|
||||||
except subprocess.SubprocessError as e:
|
|
||||||
print(f"wtype error: {e}", file=sys.stderr)
|
|
||||||
|
|
||||||
|
|
||||||
def read_state() -> dict | None:
|
def read_state() -> dict | None:
|
||||||
@@ -105,7 +99,7 @@ def run_daemon(language: str = "en", pause: float | None = None):
|
|||||||
text = transcribe_audio(processor, model, audio, language)
|
text = transcribe_audio(processor, model, audio, language)
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
if text:
|
if text:
|
||||||
_type_text(text + " ")
|
process_and_output(text, _backend)
|
||||||
|
|
||||||
worker = threading.Thread(target=transcription_worker, daemon=True)
|
worker = threading.Thread(target=transcription_worker, daemon=True)
|
||||||
worker.start()
|
worker.start()
|
||||||
|
|||||||
Reference in New Issue
Block a user