"""
Background language model — pretrained general-English word vectors.

This is the "what's normal in English" reference used by anomaly detection.
It is deliberately a *distributional* word-embedding model (GloVe), the same
*kind* of object as the corpus Word2Vec: comparing a word's neighbourhood in
the corpus against its neighbourhood here is only meaningful when both spaces
are word-co-occurrence embeddings. A sentence-transformer (or an OpenAI text
embedding) is the wrong type for this role — it models text similarity, not
word co-occurrence, and lives in an unrelated vector space.

The model is loaded lazily on first use (gensim downloads & caches it to
~/gensim-data, or $GENSIM_DATA_DIR). A failed download degrades gracefully:
callers see `ready == False` instead of a crash.
"""
|
|
import logging
import os
import threading
from typing import Optional


logger = logging.getLogger(__name__)


# Name of the gensim-data model to load. Overridable through the
# BACKGROUND_MODEL environment variable (read once, at import time).
DEFAULT_MODEL = os.environ.get("BACKGROUND_MODEL", "glove-wiki-gigaword-100")


class BackgroundModel:
    """Lazily-loaded pretrained general-English word vectors (gensim KeyedVectors).

    Nothing is loaded at construction time; call :meth:`load` to fetch the
    vectors. Until a load succeeds, the query helpers (:meth:`has`,
    :meth:`similarity`, :meth:`status`) answer with "not available" values
    rather than raising.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL):
        """Remember which model to load; no I/O happens here.

        Args:
            model_name: Name passed to ``gensim.downloader.load``.
        """
        self.model_name = model_name
        self._kv = None  # loaded vectors; stays None until load() succeeds
        self._lock = threading.Lock()  # serialises the one-time load
        self._load_failed = False  # set after a failed load to suppress retries

    @property
    def ready(self) -> bool:
        """True once the vectors have been loaded successfully."""
        return self._kv is not None

    @property
    def kv(self):
        """The underlying gensim KeyedVectors, or None if not loaded."""
        return self._kv

    def load(self) -> bool:
        """Load the model if needed.

        The first call may trigger a sizeable download (on the order of
        100MB for the default model). Once the vectors are loaded, later
        calls return immediately. After a failure the instance is marked
        failed, so subsequent calls return False without retrying.

        Returns:
            True if the vectors are available, False if loading failed
            (now or on an earlier call).
        """
        # Fast path: already loaded, no need to take the lock.
        if self._kv is not None:
            return True
        with self._lock:
            # Re-check under the lock: another thread may have finished the
            # load (or failed it) while we were waiting.
            if self._kv is not None:
                return True
            if self._load_failed:
                return False
            try:
                # Imported here so gensim is only required when the
                # background model is actually used.
                import gensim.downloader as gd
                logger.info("Loading background model '%s' (first use may download ~100MB)...",
                            self.model_name)
                loaded = gd.load(self.model_name)
                # len() / vector_size double as a sanity check that we got a
                # vectors-like object back; if they raise, the load counts
                # as failed and nothing is published.
                logger.info("Background model ready: %d words, dim=%d",
                            len(loaded), loaded.vector_size)
            except Exception:
                logger.exception("Failed to load background model '%s' — "
                                 "anomaly detection will be unavailable.", self.model_name)
                self._load_failed = True
                return False
            # Publish only after every step succeeded, so `ready` and
            # `load_failed` can never both be True.
            self._kv = loaded
            return True

    def has(self, word: str) -> bool:
        """True if the word exists in the background vocabulary (i.e. common English)."""
        kv = self._kv
        return kv is not None and word in kv

    def similarity(self, a: str, b: str) -> Optional[float]:
        """Cosine similarity between two words in general English.

        Returns:
            The similarity as a float, or None if the model is not loaded
            or either word is out of vocabulary.
        """
        kv = self._kv
        if kv is None or a not in kv or b not in kv:
            return None
        return float(kv.similarity(a, b))

    def status(self) -> dict:
        """Return a snapshot of the model state as a plain dict.

        Keys: ``model_name``, ``ready``, ``load_failed``, ``vocab_size`` and
        ``vector_size`` (the last two are 0 while the model is not loaded).
        """
        kv = self._kv  # read once so every field describes the same state
        loaded = kv is not None
        return {
            "model_name": self.model_name,
            "ready": loaded,
            "load_failed": self._load_failed,
            "vocab_size": len(kv) if loaded else 0,
            "vector_size": int(kv.vector_size) if loaded else 0,
        }
|
|