puck / server /ocr.py
vu1n's picture
Puck — desktop fairy familiar (HF Build Small)
3c124f3
Raw
History Blame Contribute Delete
3.44 kB
"""macOS Vision OCR for peeks β€” shells out to the compiled `recognize/bin/ocr`.
Two jobs: (1) identify a coding-CLI from the text in its prompt/status line β€” reliable on
dark, small-text terminal screens where CLIP fingerprinting is fuzzy; (2) ground Puck's
quip in the actual on-screen text so it's topical instead of a guess from pixels.
Recognition is REGION-LOCAL (the OCR runs on Puck's peek crop), so it matches what's under
the sprite β€” unlike a window title, which is window-global and lies under tabbed terminals
(Ghostty) and browsers (Chrome).
"""
import base64
import os
import subprocess
import tempfile
from pathlib import Path
# Directory two levels above this module's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Expected path of the compiled OCR helper that this module shells out to.
OCR_BIN = ROOT.joinpath("recognize", "bin", "ocr")
# Distinctive CONTENT markers (lowercased) from each CLI's prompt/status — NOT the window
# title. codex & pi both surface "gpt-5.5", so neither uses it: pi is pinned by its
# "(openai-codex)" backend tag / "pi v0." banner, codex by the SPACED "openai codex".
# Non-ASCII markers are written as escapes (\u2014 = em dash) so a mis-decoded copy of
# this file cannot silently turn them into byte salad that never matches OCR output.
_TOOL_MARKERS = {
    "claude-code": ["claude code", "claude max", "auto mode on", "for agents", "/release-notes"],
    "codex": ["openai codex", "/model to change", "codex app", "/fast to enable"],
    "opencode": ["opencode", "glm-5", "z.ai coding", "esc interrupt"],
    "pi": ["pi v0.", "openai-codex", "/272k", "ctrl+c/ctrl+d"],
    # amp's mode banner is matched with both ASCII hyphens and real em dashes.
    "amp": ["welcome to amp", "ctrl+o for help", "- smart -", "\u2014 smart \u2014"],
}
# Lowercase substrings of shell/login chrome; a line containing any of them is
# treated as noise for a topical quip.
_NOISE = (
    "last login",
    "cd /",
    "exec ",
    "ttys",
    "fnm_version",
)
def available() -> bool:
    """Report whether the OCR helper exists on disk at ``OCR_BIN``."""
    binary_present = OCR_BIN.exists()
    return binary_present
def ocr_lines(image_data_url: str, timeout: float = 8.0) -> list[str]:
    """Return the text lines recognized in a data-URL image.

    The payload after the first comma of ``image_data_url`` is base64-decoded,
    written to a temporary ``.png`` file and handed to the ``OCR_BIN``
    executable; its stdout is split into stripped, non-blank lines.

    OCR is best-effort: a missing binary, an undecodable or empty payload, a
    timeout, or any other error while writing/running yields ``[]`` instead of
    an exception. The temporary file is removed in every case.

    Args:
        image_data_url: Image as a data URL. Only the text after the first
            comma is used; a string without a comma has an empty payload.
        timeout: Seconds to wait for the OCR process before giving up.

    Returns:
        Recognized lines in the order the OCR process printed them, or an
        empty list on any failure.
    """
    if not OCR_BIN.exists():
        return []
    _, _, payload = image_data_url.partition(",")
    # Decode BEFORE creating the temp file: malformed base64 must neither raise
    # to the caller nor leave an orphaned file behind.
    try:
        image_bytes = base64.b64decode(payload)
    except ValueError:  # binascii.Error (bad padding) is a ValueError subclass
        return []
    if not image_bytes:  # nothing to recognize; don't spawn a process for it
        return []
    tmp = None
    try:
        # delete=False so the file can be closed (flushed) and then read by the
        # child process by name; cleanup happens in ``finally``.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
            tmp = f.name
            f.write(image_bytes)
        # The exit status is not inspected: whatever text was printed is used.
        out = subprocess.run(
            [str(OCR_BIN), tmp], capture_output=True, text=True, timeout=timeout
        )
        return [ln.strip() for ln in out.stdout.splitlines() if ln.strip()]
    except Exception:  # noqa: BLE001 — OCR is best-effort; a failure just means no hint
        return []
    finally:
        if tmp is not None:
            try:
                os.unlink(tmp)
            except OSError:
                # Cleanup failure must not replace the result with an exception.
                pass
def detect_tool(lines: list[str]) -> str | None:
    """Pick the tool whose markers appear most often in the OCR text.

    All lines are joined and lowercased, then each tool is scored by how many
    of its ``_TOOL_MARKERS`` entries occur as substrings. The highest score
    wins; on a tie the tool listed first in ``_TOOL_MARKERS`` is kept. Returns
    ``None`` when no marker matched at all.
    """
    haystack = "\n".join(lines).lower()
    hits = {
        label: sum(marker in haystack for marker in markers)
        for label, markers in _TOOL_MARKERS.items()
    }
    # max() returns the first key reaching the top score, matching a strict
    # "greater than" scan in dictionary order.
    leader = max(hits, key=hits.get, default=None)
    if leader is None or hits[leader] == 0:
        return None
    return leader
def _is_noise(line: str) -> bool:
    """Return True for shell/terminal furniture that should not reach the quip.

    A line counts as noise when it contains any ``_NOISE`` substring
    (case-insensitive), is at most three characters long, or contains the
    multiplication sign used in terminal title-bar dimensions.
    """
    lowered = line.lower()
    if any(token in lowered for token in _NOISE):
        return True
    if len(line) <= 3:  # tiny OCR crumbs (powerline fragments, time glyphs)
        return True
    # Terminal title-bar dimensions such as "140\u00d743". The sign is written as
    # an escape (U+00D7 MULTIPLICATION SIGN) so a mis-decoded copy of this file
    # cannot turn it into a sequence that never occurs in OCR output.
    return "\u00d7" in line
def topical_excerpt(lines: list[str], cap: int = 240) -> str:
    """Build a short, denoised snippet of on-screen text to anchor the quip.

    Lines flagged by ``_is_noise`` (shell/title-bar furniture) are dropped so
    real content (prompt, output, status) leads the character budget. The
    survivors are joined with a spaced middle dot and the result is truncated
    to ``cap`` characters.

    Args:
        lines: OCR text lines, in reading order.
        cap: Maximum length of the returned string.

    Returns:
        The joined excerpt, or ``""`` when nothing survives filtering.
    """
    kept = [ln for ln in lines if not _is_noise(ln)]
    # U+00B7 MIDDLE DOT, written as an escape so the separator survives any
    # re-encoding of this file instead of leaking mojibake into the excerpt.
    return " \u00b7 ".join(kept)[:cap]