"""Build data.json for the Gradio Space.

Positions: pre-computed 3D embeddings of the `src/transformers/**` source files
(downloaded from a private HF bucket). Falls back to PCA on the original
384-dim embeddings if the 3D file is unavailable.
Colors: recency-weighted edit score from `git log` on the cloned transformers repo.
"""
import datetime as dt
import json
import math
import os
import re
import shutil
import subprocess
import urllib.error
import urllib.request
from collections import defaultdict
from pathlib import Path

import numpy as np
|
|
# Directory holding this script; every input and output path hangs off it.
ROOT = Path(__file__).parent
_CACHE_DIR = ROOT / ".cache"

# Local transformers checkout that `git log` is run against.
REPO_DIR = _CACHE_DIR / "transformers"

# Pre-computed 3D embeddings (primary source of point positions).
EMBEDDINGS_3D_URL = (
    "https://huggingface.co/buckets/the-best-team/data/resolve/"
    "transformers-embeddings-src-3d.jsonl"
)
EMBEDDINGS_3D_FILE = _CACHE_DIR / "transformers-embeddings-src-3d.jsonl"

# Original 384-dim embeddings (fallback, reduced with PCA).
EMBEDDINGS_URL = (
    "https://huggingface.co/buckets/the-best-team/data/resolve/"
    "transformers-embeddings-src.jsonl"
)
EMBEDDINGS_FILE = _CACHE_DIR / "transformers-embeddings-src.jsonl"

# Output consumed by the Space.
DATA_FILE = ROOT / "data.json"

# Prefix prepended to the relative paths stored in the embedding files.
SRC_PREFIX = "src/transformers/"

# An edit's weight halves every 365 days.
HALF_LIFE_SECONDS = 365 * 24 * 60 * 60
|
|
| |
# Files whose path matches any of these regexes are left out of the plot.
SKIP_PATH_PATTERNS = [
    re.compile(source)
    for source in (
        r"(^|/)__init__\.py$",
        r"(^|/)modeling_.*\.py$",
        r"^src/transformers/cli/transformers\.py$",
    )
]
|
|
|
|
def is_skipped(path):
    """Return True when `path` matches at least one of SKIP_PATH_PATTERNS."""
    for pattern in SKIP_PATH_PATTERNS:
        if pattern.search(path):
            return True
    return False
|
|
|
|
def run(cmd):
    """Execute `cmd` (an argument list) and return its captured stdout as text.

    Raises `subprocess.CalledProcessError` when the command exits non-zero.
    """
    completed = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        check=True,
    )
    return completed.stdout
|
|
|
|
def hf_token():
    """Return the Hugging Face token, or "" when none is configured.

    The token file under ~/.cache/huggingface takes priority; the HF_TOKEN
    environment variable is consulted only when that file does not exist.
    """
    token_file = Path.home() / ".cache" / "huggingface" / "token"
    if token_file.exists():
        return token_file.read_text().strip()
    return os.environ.get("HF_TOKEN", "")
|
|
|
|
def download(url, dest):
    """Fetch `url` into `dest` unless `dest` already exists.

    Args:
        url: Address to download from.
        dest: `pathlib.Path` of the local cache file. Its parent directory is
            created when missing.

    Returns:
        True when `dest` already existed or was downloaded completely, False
        when the request failed with a `urllib.error.URLError` (which includes
        `HTTPError`); the failure is printed rather than raised.
    """
    if dest.exists():
        return True
    dest.parent.mkdir(parents=True, exist_ok=True)

    # Only authenticate when a token is configured: an empty "Bearer " value
    # is a malformed header.
    headers = {}
    token = hf_token()
    if token:
        headers["Authorization"] = f"Bearer {token}"
    req = urllib.request.Request(url, headers=headers)

    # Stream into a sibling temp file and rename it into place only once the
    # transfer is complete. Writing straight to `dest` would leave a truncated
    # file behind after an interrupted download, and the `dest.exists()` check
    # above would then treat it as a valid cache on every later run.
    partial = dest.with_name(dest.name + ".part")
    try:
        with urllib.request.urlopen(req) as resp, partial.open("wb") as out:
            shutil.copyfileobj(resp, out)
        os.replace(partial, dest)
    except urllib.error.URLError as e:  # HTTPError is a subclass of URLError
        print(f" download failed for {url}: {e}")
        return False
    finally:
        # Already gone after a successful rename; left over on any failure.
        if partial.exists():
            partial.unlink()
    return True
|
|
|
|
def load_embeddings_3d():
    """Primary source: per-file 3D vectors keyed under `reduced_embedding`.

    Each non-blank line of the JSONL file is one JSON object whose `path` is
    prefixed with SRC_PREFIX and whose `reduced_embedding` becomes one row of
    the returned array.

    Returns ordered (paths, coords) or (None, None) if the file isn't available.
    """
    if not download(EMBEDDINGS_3D_URL, EMBEDDINGS_3D_FILE):
        return None, None
    paths, vecs = [], []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with EMBEDDINGS_3D_FILE.open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. trailing newlines), which would
            # otherwise make json.loads raise.
            if not line.strip():
                continue
            d = json.loads(line)
            paths.append(SRC_PREFIX + d["path"])
            vecs.append(d["reduced_embedding"])
    return paths, np.asarray(vecs, dtype=np.float64)
|
|
|
|
def load_embeddings_pca_fallback():
    """Fallback: load 384-dim embeddings and reduce via PCA.

    Each non-blank line of the JSONL file is one JSON object whose `path` is
    prefixed with SRC_PREFIX and whose `embedding` becomes one row of the
    matrix handed to `pca_3d`.

    Returns ordered (paths, coords).

    Raises:
        RuntimeError: if the embedding file cannot be downloaded.
    """
    if not download(EMBEDDINGS_URL, EMBEDDINGS_FILE):
        raise RuntimeError("Neither the 3D nor the 384-dim embedding file is available.")
    paths, vecs = [], []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with EMBEDDINGS_FILE.open(encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. trailing newlines), which would
            # otherwise make json.loads raise.
            if not line.strip():
                continue
            d = json.loads(line)
            paths.append(SRC_PREFIX + d["path"])
            vecs.append(d["embedding"])
    matrix = np.asarray(vecs, dtype=np.float64)
    return paths, pca_3d(matrix)
|
|
|
|
def pca_3d(matrix):
    """Reduce an (N, D) matrix to (N, 3) using the top right-singular vectors.

    The data is mean-centered per column, projected onto the first three rows
    of Vt from a thin SVD, and each output axis is divided by its standard
    deviation (plus a tiny epsilon to avoid dividing by zero).
    """
    centered = matrix - matrix.mean(axis=0, keepdims=True)
    components = np.linalg.svd(centered, full_matrices=False)[2][:3]
    projected = centered @ components.T
    axis_scale = projected.std(axis=0, keepdims=True) + 1e-12
    return projected / axis_scale
|
|
|
|
def load_positions():
    """Return (paths, coords), preferring the pre-computed 3D embeddings.

    When the 3D file cannot be loaded, the 384-dim embeddings are loaded and
    reduced with PCA instead.
    """
    paths, coords = load_embeddings_3d()
    if paths is None:
        print("3D embeddings unavailable; falling back to PCA on 384-dim file.")
        return load_embeddings_pca_fallback()
    print(f"Using pre-computed 3D embeddings: {len(paths)} files.")
    return paths, coords
|
|
|
|
def edit_timelines():
    """Map each file path in the repo's history to its commit timestamps.

    Parses `git log --name-only` output in which every commit starts with a
    `COMMIT:<unix time>` line followed by the names of the files it touched.
    Returns a defaultdict(list) keyed by path, timestamps in log order.
    """
    marker = "COMMIT:"
    log_cmd = ["git", "-C", str(REPO_DIR), "log", "--name-only"]
    log_cmd.append("--pretty=format:COMMIT:%ct")
    log_output = run(log_cmd)

    history = defaultdict(list)
    commit_time = None
    for raw in log_output.split("\n"):
        if raw.startswith(marker):
            commit_time = int(raw[len(marker):])
            continue
        name = raw.strip()
        # Skip separators and anything seen before the first commit header.
        if not name or commit_time is None:
            continue
        history[name].append(commit_time)
    return history
|
|
|
|
def recency_weighted_score(timestamps, now_ts):
    """Score edit activity so that each edit's weight halves per half-life.

    An edit made exactly at `now_ts` contributes 1.0, one made
    HALF_LIFE_SECONDS earlier contributes 0.5, and so on. No edits gives 0.0.
    """
    if not timestamps:
        return 0.0
    ages_in_half_lives = ((now_ts - ts) / HALF_LIFE_SECONDS for ts in timestamps)
    return sum(0.5 ** age for age in ages_in_half_lives)
|
|
|
|
def redness_scores(scores):
    """Log-compress, min-max normalize, invert so high score → 0 (red).

    Args:
        scores: Iterable of scores greater than -1 (required by `log1p`).

    Returns:
        A list of floats in [0, 1], one per input score: the highest score
        maps to 0.0 and the lowest to 1.0. When all scores are equal every
        value is 1.0. Empty input yields an empty list.
    """
    log_scores = [math.log1p(s) for s in scores]
    # min()/max() raise ValueError on an empty sequence; nothing to normalize.
    if not log_scores:
        return []
    lo, hi = min(log_scores), max(log_scores)
    # Fall back to 1.0 so identical scores do not divide by zero.
    span = (hi - lo) or 1.0
    return [1.0 - (ls - lo) / span for ls in log_scores]
|
|
|
|
def main():
    """Assemble point positions, colors and hover text, then write DATA_FILE."""
    paths, coords = load_positions()

    # Drop files excluded by SKIP_PATH_PATTERNS, keeping paths and rows aligned.
    kept = [idx for idx, path in enumerate(paths) if not is_skipped(path)]
    if len(kept) < len(paths):
        print(f"Skipping {len(paths) - len(kept)} files via SKIP_PATH_PATTERNS.")
        paths = [paths[idx] for idx in kept]
        coords = coords[kept]
    print(f"Per-axis std: {coords.std(axis=0)}")

    timelines = edit_timelines()
    now_ts = int(dt.datetime.now().timestamp())

    # One list of commit timestamps per plotted file (empty if never edited).
    edit_times = [timelines.get(path, []) for path in paths]
    scores = [recency_weighted_score(stamps, now_ts) for stamps in edit_times]

    hovers = []
    for path, stamps in zip(paths, edit_times):
        if stamps:
            last = dt.date.fromtimestamp(max(stamps)).isoformat()
        else:
            last = "never"
        hovers.append(f"{path}<br>edits: {len(stamps)} (last: {last})")

    x_col, y_col, z_col = (coords[:, axis].tolist() for axis in range(3))
    data = {
        "x": x_col,
        "y": y_col,
        "z": z_col,
        "color": redness_scores(scores),
        "edit_times": edit_times,
        "hover": hovers,
    }
    DATA_FILE.write_text(json.dumps(data))
    print(
        f"Wrote {DATA_FILE} — {len(paths)} points, "
        f"max recency-weighted score: {max(scores):.2f}"
    )


# Run the build only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|