"""
Background language model — pretrained general-English word vectors.

This is the "what's normal in English" reference used by anomaly detection. It
is deliberately a *distributional* word-embedding model (GloVe), the same
*kind* of object as the corpus Word2Vec: comparing a word's neighbourhood in
the corpus against its neighbourhood here is only meaningful when both spaces
are word-co-occurrence embeddings. A sentence-transformer (or an OpenAI text
embedding) is the wrong type for this role — it models text similarity, not
word co-occurrence, and lives in an unrelated vector space.

The model is loaded lazily, on the first call to ``BackgroundModel.load()``
(gensim downloads & caches it to ~/gensim-data, or $GENSIM_DATA_DIR). A failed
download degrades gracefully: callers see ``ready == False`` instead of a
crash.
"""

import logging
import os
import threading
from typing import Optional

logger = logging.getLogger(__name__)

# glove-wiki-gigaword-100: ~128MB, 400k lowercase words, dim=100.
_FALLBACK_MODEL = "glove-wiki-gigaword-100"

# Override with BACKGROUND_MODEL env var (e.g. glove-wiki-gigaword-50).
# An unset, empty or whitespace-only value falls back to the built-in default:
# a blank name (e.g. `BACKGROUND_MODEL=` in an env file) can never be loaded
# and would otherwise latch the model into its permanent "load failed" state.
DEFAULT_MODEL = (os.environ.get("BACKGROUND_MODEL") or "").strip() or _FALLBACK_MODEL


class BackgroundModel:
    """Lazily-loaded pretrained general-English word vectors (gensim KeyedVectors).

    Nothing is loaded at construction time. Call :meth:`load` to fetch the
    vectors; until it has succeeded, :attr:`ready` is False, :meth:`has`
    returns False and :meth:`similarity` returns None.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL):
        """Remember which model to load; does not load or download anything.

        Args:
            model_name: Name handed to ``gensim.downloader.load`` by
                :meth:`load`.
        """
        self.model_name = model_name
        self._kv = None  # gensim KeyedVectors once loaded
        self._lock = threading.Lock()  # serialises the (slow) first load
        self._load_failed = False  # set once a load attempt has raised

    @property
    def ready(self) -> bool:
        """True once the vectors have been loaded successfully."""
        return self._kv is not None

    @property
    def kv(self):
        """The underlying gensim KeyedVectors, or None if not loaded."""
        return self._kv

    def load(self) -> bool:
        """
        Load the model if needed. Returns True on success, False on failure.

        Thread-safe and idempotent. The first call may download ~128MB; later
        calls (and restarts, if the cache survives) are instant. On failure the
        model is marked failed so we don't retry a doomed download every
        request.
        """
        # Fast path: already loaded, no need to take the lock.
        if self._kv is not None:
            return True

        with self._lock:
            # Re-check under the lock: another thread may have finished the
            # load (or recorded a failure) while this one was waiting.
            if self._kv is not None:
                return True
            if self._load_failed:
                return False

            try:
                # Imported here so that a missing/broken gensim install is
                # handled like any other load failure instead of breaking
                # import of this module.
                import gensim.downloader as gd

                logger.info("Loading background model '%s' (first use may download ~100MB)...",
                            self.model_name)
                self._kv = gd.load(self.model_name)
                logger.info("Background model ready: %d words, dim=%d",
                            len(self._kv), self._kv.vector_size)
                return True
            except Exception:
                # Deliberately broad: any failure (import, network, bad model
                # name) must degrade to "not ready" rather than crash callers.
                logger.exception("Failed to load background model '%s' — "
                                 "anomaly detection will be unavailable.", self.model_name)
                self._load_failed = True
                return False

    def has(self, word: str) -> bool:
        """True if the word exists in the background vocabulary (i.e. common English).

        Does not trigger a load: returns False while the model is not loaded.
        """
        return self._kv is not None and word in self._kv

    def similarity(self, a: str, b: str) -> Optional[float]:
        """Cosine similarity between two words in general English, or None if either is OOV.

        Does not trigger a load: returns None while the model is not loaded.
        """
        vectors = self._kv
        if vectors is None or a not in vectors or b not in vectors:
            return None
        # Convert to a plain Python float for callers.
        return float(vectors.similarity(a, b))

    def status(self) -> dict:
        """Return a snapshot of the model state as a plain dict.

        Keys: ``model_name``, ``ready``, ``load_failed``, ``vocab_size`` and
        ``vector_size``; the two sizes are 0 while the model is not loaded.
        """
        vectors = self._kv
        loaded = vectors is not None
        return {
            "model_name": self.model_name,
            "ready": self.ready,
            "load_failed": self._load_failed,
            "vocab_size": len(vectors) if loaded else 0,
            "vector_size": int(vectors.vector_size) if loaded else 0,
        }