esfiles / background_model.py
Besjon Cifliku
feat: implement anomaly detection to filter suspicious word relations
9c3ade2
"""
Background language model — pretrained general-English word vectors.
This is the "what's normal in English" reference used by anomaly detection.
It is deliberately a *distributional* word-embedding model (GloVe), the same
*kind* of object as the corpus Word2Vec: comparing a word's neighbourhood in
the corpus against its neighbourhood here is only meaningful when both spaces
are word-co-occurrence embeddings. A sentence-transformer (or an OpenAI text
embedding) is the wrong type for this role — it models text similarity, not
word co-occurrence, and lives in an unrelated vector space.
The model is loaded lazily on first use (gensim downloads & caches it to
~/gensim-data, or $GENSIM_DATA_DIR). A failed download degrades gracefully:
callers see `ready == False` instead of a crash.
"""
import logging
import os
import threading
from typing import Optional
logger = logging.getLogger(__name__)

# glove-wiki-gigaword-100: ~128MB, 400k lowercase words, dim=100.
# Override with BACKGROUND_MODEL env var (e.g. glove-wiki-gigaword-50).
# An empty or whitespace-only override is treated as unset, so a blank entry in
# a deployment config falls back to the default instead of naming the model "".
DEFAULT_MODEL = (
    os.environ.get("BACKGROUND_MODEL", "").strip() or "glove-wiki-gigaword-100"
)


class BackgroundModel:
    """Lazily-loaded pretrained general-English word vectors (gensim KeyedVectors).

    Nothing is loaded at construction time; call ``load()`` to populate the
    vectors. Until a load succeeds, ``ready`` is False, ``has()`` returns False
    and ``similarity()`` returns None.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL):
        """Record which model to load. Performs no I/O."""
        self.model_name = model_name
        self._kv = None  # gensim KeyedVectors once loaded
        self._lock = threading.Lock()  # serialises the slow path of load()
        self._load_failed = False  # latched after the first failed attempt

    @property
    def ready(self) -> bool:
        """True once vectors have been loaded successfully."""
        return self._kv is not None

    @property
    def kv(self):
        """The underlying gensim KeyedVectors, or None if not loaded."""
        return self._kv

    def load(self) -> bool:
        """
        Load the model if needed. Returns True on success, False on failure.

        Thread-safe and idempotent. The first call may download ~128MB; later
        calls (and restarts, if the cache survives) are instant. On failure the
        model is marked failed so we don't retry a doomed download every request.

        The loaded object is published to ``self._kv`` only after its vocabulary
        size and vector dimension have been read successfully, so a failed load
        can never leave the instance reporting ``ready`` and ``load_failed`` at
        the same time.
        """
        if self._kv is not None:
            return True
        with self._lock:
            # Re-check under the lock: another thread may have completed (or
            # failed) the load while this one was waiting.
            if self._kv is not None:
                return True
            if self._load_failed:
                return False
            try:
                # Imported inside the try so a missing or broken gensim install
                # is reported as a failed load rather than raised to the caller.
                import gensim.downloader as gd

                logger.info("Loading background model '%s' (first use may download ~100MB)...",
                            self.model_name)
                vectors = gd.load(self.model_name)
                # Probe the attributes the rest of the class relies on BEFORE
                # publishing. If the name resolved to something that is not a
                # set of word vectors, this raises and self._kv stays None.
                vocab_size = len(vectors)
                vector_size = vectors.vector_size
            except Exception:
                logger.exception("Failed to load background model '%s' — "
                                 "anomaly detection will be unavailable.", self.model_name)
                self._load_failed = True
                return False
            self._kv = vectors
            logger.info("Background model ready: %d words, dim=%d",
                        vocab_size, vector_size)
            return True

    def has(self, word: str) -> bool:
        """True if the word exists in the background vocabulary (i.e. common English)."""
        return self._kv is not None and word in self._kv

    def similarity(self, a: str, b: str) -> Optional[float]:
        """Cosine similarity between two words in general English, or None if either is OOV."""
        if self._kv is None or a not in self._kv or b not in self._kv:
            return None
        return float(self._kv.similarity(a, b))

    def status(self) -> dict:
        """Return a snapshot of the load state; sizes are 0 until vectors are loaded."""
        return {
            "model_name": self.model_name,
            "ready": self.ready,
            "load_failed": self._load_failed,
            "vocab_size": len(self._kv) if self._kv is not None else 0,
            "vector_size": int(self._kv.vector_size) if self._kv is not None else 0,
        }