From 50f8d158c40d09f87a162562fdb278b35f9db936 Mon Sep 17 00:00:00 2001
From: Wong Ding Feng <dingfengwong@gmail.com>
Date: Sat, 30 May 2026 21:37:20 +0800
Subject: [PATCH] feat: add voice command processing and input backend
 interface

Introduce InputBackend protocol with WtypeBackend and PrintBackend,
and a command processor that translates spoken commands (enter, new line,
question mark, comma, etc.) into key presses and punctuation.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 src/cohere_transcribe/backend.py  | 35 ++++++++++++++++++++
 src/cohere_transcribe/commands.py | 55 +++++++++++++++++++++++++++++++
 src/cohere_transcribe/daemon.py   | 14 +++-----
 3 files changed, 94 insertions(+), 10 deletions(-)
 create mode 100644 src/cohere_transcribe/backend.py
 create mode 100644 src/cohere_transcribe/commands.py

diff --git a/src/cohere_transcribe/backend.py b/src/cohere_transcribe/backend.py
new file mode 100644
index 0000000..7269ad7
--- /dev/null
+++ b/src/cohere_transcribe/backend.py
@@ -0,0 +1,52 @@
+import subprocess
+import sys
+from typing import Protocol
+
+
+class InputBackend(Protocol):
+    """Structural interface for anything that can receive text and key presses."""
+
+    def type_text(self, text: str) -> None:
+        """Emit *text* as literal characters."""
+        ...
+
+    def send_key(self, key: str) -> None:
+        """Send the key named *key* (for example ``"Return"``)."""
+        ...
+
+
+class WtypeBackend:
+    """Backend that delivers input by invoking the ``wtype`` command."""
+
+    def type_text(self, text: str) -> None:
+        """Type *text*; the ``--`` separator is passed before it."""
+        self._run("--", text)
+
+    def send_key(self, key: str) -> None:
+        """Send *key* through ``wtype -k``."""
+        self._run("-k", key)
+
+    @staticmethod
+    def _run(*args: str) -> None:
+        """Run ``wtype`` with *args*; a missing binary or failed run is reported on stderr."""
+        try:
+            subprocess.run(["wtype", *args], check=True, timeout=10)
+        except FileNotFoundError:
+            print("wtype not found — install it for keyboard injection", file=sys.stderr)
+        except subprocess.SubprocessError as e:
+            print(f"wtype error: {e}", file=sys.stderr)
+
+
+class PrintBackend:
+    """Backend that writes to stdout instead of injecting keystrokes."""
+
+    # Keys with a printable equivalent; any other key is shown as "[key]".
+    _KEY_MAP = {"Return": "\n", "Tab": "\t", "BackSpace": "\b"}
+
+    def type_text(self, text: str) -> None:
+        """Write *text* to stdout with no trailing newline."""
+        print(text, end="", flush=True)
+
+    def send_key(self, key: str) -> None:
+        """Write the character mapped to *key*, or ``[key]`` if it has none."""
+        print(self._KEY_MAP.get(key, f"[{key}]"), end="", flush=True)
diff --git a/src/cohere_transcribe/commands.py b/src/cohere_transcribe/commands.py
new file mode 100644
index 0000000..ec3478a
--- /dev/null
+++ b/src/cohere_transcribe/commands.py
@@ -0,0 +1,91 @@
+import re
+
+from .backend import InputBackend
+
+# Spoken phrase -> key names sent, in order, through InputBackend.send_key.
+KEY_COMMANDS: dict[str, list[str]] = {
+    "new line": ["Return"],
+    "newline": ["Return"],
+    "enter": ["Return"],
+    "press enter": ["Return"],
+    "new paragraph": ["Return", "Return"],
+    "tab": ["Tab"],
+    "backspace": ["BackSpace"],
+}
+
+# Spoken phrase -> punctuation character substituted into the text.
+PUNCTUATION: dict[str, str] = {
+    "question mark": "?",
+    "exclamation mark": "!",
+    "exclamation point": "!",
+    "period": ".",
+    "full stop": ".",
+    "comma": ",",
+    "colon": ":",
+    "semicolon": ";",
+    "open quote": '"',
+    "close quote": '"',
+    "open paren": "(",
+    "close paren": ")",
+}
+
+# Phrases whose mark attaches to the word that follows it; every other mark
+# attaches to the word before it.
+_OPENING_PUNCTUATION = frozenset({"open quote", "open paren"})
+
+
+def _build_pattern(commands: dict) -> re.Pattern:
+    """Compile a case-insensitive whole-word pattern matching any key of *commands*.
+
+    The matched phrase is captured as group 1. Longer phrases are listed first
+    so a phrase is never shadowed by a shorter one that is its prefix.
+    """
+    longest_first = sorted(commands, key=len, reverse=True)
+    alternation = "|".join(re.escape(phrase) for phrase in longest_first)
+    return re.compile(rf"\b({alternation})\b", re.IGNORECASE)
+
+
+_KEY_PATTERN = _build_pattern(KEY_COMMANDS)
+_PUNCT_PATTERN = _build_pattern(PUNCTUATION)
+
+
+def _apply_punctuation(text: str) -> str:
+    """Replace spoken punctuation phrases in *text* with their characters.
+
+    Opening marks drop the whitespace after them; all other marks drop the
+    whitespace before them, so ``say open quote hi close quote`` becomes
+    ``say "hi"``.
+    """
+    # split() with one capturing group alternates: text, phrase, text, ...
+    pieces = _PUNCT_PATTERN.split(text)
+    out = [pieces[0]]
+    for phrase, following in zip(pieces[1::2], pieces[2::2]):
+        name = phrase.lower()
+        if name in _OPENING_PUNCTUATION:
+            following = following.lstrip()
+        else:
+            out[-1] = out[-1].rstrip()
+        out += (PUNCTUATION[name], following)
+    return "".join(out)
+
+
+def process_and_output(text: str, backend: InputBackend) -> None:
+    """Send *text* to *backend*, applying spoken punctuation and key commands.
+
+    Punctuation phrases are replaced first; the result is then split on key
+    command phrases, which are sent with ``send_key`` while the text between
+    them is sent with ``type_text`` followed by a single space.
+    """
+    text = _apply_punctuation(text)
+    # Tighten punctuation already present in the transcript. Quotes are left
+    # alone here: a literal '"' may open a quotation and needs its leading space.
+    text = re.sub(r"\s+([?.!,;:)])", r"\1", text)
+
+    for part in _KEY_PATTERN.split(text):
+        cleaned = part.strip()
+        keys = KEY_COMMANDS.get(cleaned.lower())
+        if keys is not None:
+            for key in keys:
+                backend.send_key(key)
+        elif cleaned:
+            backend.type_text(cleaned + " ")
diff --git a/src/cohere_transcribe/daemon.py b/src/cohere_transcribe/daemon.py
index 2d04325..131e7c2 100644
--- a/src/cohere_transcribe/daemon.py
+++ b/src/cohere_transcribe/daemon.py
@@ -1,8 +1,6 @@
 import json
 import os
 import signal
-import subprocess
-import sys
 import queue
 import threading
 import time
@@ -10,6 +8,8 @@ import time
 import numpy as np
 import sounddevice as sd
 
+from .backend import WtypeBackend
+from .commands import process_and_output
 from .model import SAMPLE_RATE, load_model, transcribe_audio
 from .vad import DEFAULT_SILENCE_FRAMES, FRAME_SIZE, VADStateMachine, calibrate_silence, pause_seconds_to_frames
 
@@ -24,13 +24,7 @@ def _write_state(pid: int, status: str):
         json.dump({"pid": pid, "status": status, "started_at": time.time()}, f)
 
 
-def _type_text(text: str):
-    try:
-        subprocess.run(["wtype", "--", text], check=True, timeout=10)
-    except FileNotFoundError:
-        print("wtype not found — install it for keyboard injection", file=sys.stderr)
-    except subprocess.SubprocessError as e:
-        print(f"wtype error: {e}", file=sys.stderr)
+_backend = WtypeBackend()
 
 
 def read_state() -> dict | None:
@@ -105,7 +99,7 @@ def run_daemon(language: str = "en", pause: float | None = None):
             text = transcribe_audio(processor, model, audio, language)
             text = text.strip()
             if text:
-                _type_text(text + " ")
+                process_and_output(text, _backend)
 
     worker = threading.Thread(target=transcription_worker, daemon=True)
     worker.start()