# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2026 World Labs.
"""HuggingFace ZeroGPU Space for flux_rgbd.

Wraps the same generate-and-render path used by the local demo with
``@spaces.GPU`` so HF ZeroGPU can attach a GPU per call. Nothing is loaded at
import time: ``_ensure_runner()`` builds the runner lazily, directly on CUDA,
the first time it is called.
"""

import os
import sys
import time
import uuid
from pathlib import Path

# The vendored flux_rgbd package lives next to this file.
sys.path.insert(0, str(Path(__file__).resolve().parent))

import gradio as gr
import numpy as np
import spaces
import torch

from flux_rgbd import FluxRGBDRunner
from flux_rgbd.pointcloud import depth_edge_mask, statistical_outlier_mask

# BF16 checkpoint, downloaded from the Hub on first use. Override with the
# WEIGHTS_REPO env var (repo id or local path).
WEIGHTS_REPO = os.environ.get("WEIGHTS_REPO", "bartduis/modality_forcing")

DEFAULT_PROMPT = (
    "A warm, inviting kitchen with a rustic-modern feel, where soft morning "
    "light filters through a small window above the sink."
)

# Lazy-loaded runner. On ZeroGPU the model is loaded inside the first
# @spaces.GPU call so the import path costs nothing.
_runner: FluxRGBDRunner | None = None


def _ensure_runner() -> FluxRGBDRunner:
    """Return the process-wide runner, creating it on first use.

    Reads ``TEXT_ENCODER_REPO`` (default ``Qwen/Qwen3-8B``) and
    ``IMG_RESOLUTION`` (default ``512``) from the environment the first time
    it runs; later calls return the cached instance.

    Raises:
        ValueError: if ``IMG_RESOLUTION`` is not a positive integer.
    """
    global _runner
    if _runner is None:
        # Use BF16 Qwen3-8B instead of Qwen3-8B-FP8 to avoid the
        # finegrained-fp8 / deep-gemm kernel dependency, which currently
        # hits a metadata.json parse bug on HF's kernels-community.
        text_encoder = os.environ.get("TEXT_ENCODER_REPO", "Qwen/Qwen3-8B")
        # Generation resolution must match the checkpoint's training resolution
        # (512 for the default model, 1024 for the 1024 checkpoint). Set
        # IMG_RESOLUTION=1024 alongside WEIGHTS_REPO when using the 1024 ckpt.
        raw_res = os.environ.get("IMG_RESOLUTION", "512")
        try:
            res = int(raw_res)
        except ValueError as err:
            # Name the variable so a misconfigured Space is easy to diagnose.
            raise ValueError(
                f"IMG_RESOLUTION must be an integer, got {raw_res!r}"
            ) from err
        if res <= 0:
            raise ValueError(f"IMG_RESOLUTION must be positive, got {res}")
        print(f"[boot] loading {WEIGHTS_REPO} @ {res}px (text encoder: {text_encoder})…", flush=True)
        _runner = FluxRGBDRunner.from_pretrained(
            WEIGHTS_REPO,
            device="cuda",
            dtype=torch.bfloat16,
            head_dtype=torch.float32,
            text_encoder=text_encoder,
            img_hw=(res, res),
        )
        print("[boot] runner ready.", flush=True)
    return _runner


# --- helpers (kept inline so the Space repo doesn't depend on demo/app_lib) ---

# NOTE(review): not referenced in the visible part of this file. The value
# equals 1 / (2 * sqrt(pi)); presumably the degree-0 spherical-harmonics
# constant — confirm before removing.
_SH_C0 = 0.28209479177387814


def _letterbox(img: np.ndarray, target: int = 512):
    """Resize so long side = target, then zero-pad to (target, target).

    Args:
        img: 2-D (H, W) or 3-D (H, W, C) image array.
        target: side length of the square output canvas.

    Returns:
        ``(canvas, (top, left, h_out, w_out))`` where ``canvas`` has the dtype
        and channel layout of ``img`` and the tuple locates the resized image
        inside it (the padding is split evenly, rounding down on top/left).
    """
    import cv2

    h_in, w_in = img.shape[:2]
    if h_in >= w_in:
        h_out, w_out = target, max(1, int(round(w_in * target / h_in)))
    else:
        w_out, h_out = target, max(1, int(round(h_in * target / w_in)))
    resized = cv2.resize(img, (w_out, h_out), interpolation=cv2.INTER_AREA)
    if img.ndim == 3 and resized.ndim == 2:
        # cv2.resize drops the channel axis of (H, W, 1) inputs; restore it so
        # the paste below matches the 3-D canvas.
        resized = resized[..., None]

    if img.ndim == 2:
        canvas = np.zeros((target, target), dtype=img.dtype)
    else:
        canvas = np.zeros((target, target, img.shape[2]), dtype=img.dtype)
    top = (target - h_out) // 2
    left = (target - w_out) // 2
    canvas[top:top + h_out, left:left + w_out] = resized
    return canvas, (top, left, h_out, w_out)


def _depth_to_pointcloud(rgb_u8, depth, *, fov_deg=65.0, max_points=1_200_000,
                         edge_rtol=0.04, sor=False):
    """Unproject a depth map into a colored point cloud.

    Args:
        rgb_u8: per-pixel colors, indexed with the same (row, col) as
            ``depth``.
        depth: 2-D depth map; non-positive and non-finite pixels are dropped.
        fov_deg: horizontal field of view used to derive the focal length.
        max_points: upper bound on returned points; larger clouds are
            subsampled with a fixed seed.
        edge_rtol: tolerance passed to ``depth_edge_mask``; falsy or
            non-positive disables the edge mask.
        sor: when true, also apply ``statistical_outlier_mask``.

    Returns:
        ``(pts, cols)``: ``pts`` is float32 of shape (N, 3), re-centered on
        its per-axis median when non-empty; ``cols`` holds the matching rows
        of ``rgb_u8``.
    """
    h, w = depth.shape
    # Pinhole intrinsics from the horizontal FOV; the same focal length is
    # used for both image axes below.
    fx = w / (2.0 * np.tan(np.deg2rad(fov_deg) / 2.0))
    cx, cy = w * 0.5, h * 0.5

    # Keep every valid pixel — no percentile clip. The earlier [1, 99] clip
    # discarded the nearest 1% of points, carving a hole in the closest
    # surface (e.g. the front edge of a table) and also dropping the far
    # background. The i2d depth is clean enough that this clipping isn't
    # needed and it was cutting off the geometry users care most about.
    valid = (depth > 0) & np.isfinite(depth)
    # Depth-edge mask: drop occlusion-boundary "veil" pixels (MoGe-style).
    if edge_rtol and edge_rtol > 0:
        valid &= ~depth_edge_mask(depth, rtol=float(edge_rtol))

    v_idx, u_idx = np.where(valid)
    z = depth[v_idx, u_idx]
    # +0.5 samples each pixel at its center.
    x = (u_idx + 0.5 - cx) * z / fx
    y = (v_idx + 0.5 - cy) * z / fx
    # glTF / Three.js: +Y up, camera looks down -Z. Flip image-y
    # (which points down) and depth (which points into the scene).
    pts = np.stack([x, -y, -z], axis=-1).astype(np.float32)
    cols = rgb_u8[v_idx, u_idx]

    if sor:
        # Statistical outlier rejection: drops isolated floaters, but can
        # over-trim fine structures — opt-in (the edge mask above is the
        # default cleanup).
        inliers = statistical_outlier_mask(pts)
        pts, cols = pts[inliers], cols[inliers]

    if pts.shape[0] > max_points:
        # Fixed seed keeps the subsample reproducible across calls.
        idx = np.random.default_rng(0).choice(pts.shape[0], max_points, replace=False)
        pts, cols = pts[idx], cols[idx]

    if pts.shape[0]:
        # Re-center on the median; skipped for an empty cloud.
        pts -= np.median(pts, axis=0, keepdims=True)
    return pts, cols


def _save_glb(path, points, colors):
    """Colored point cloud → binary glTF, the format gr.Model3D handles cleanly.

    Args:
        path: destination file path (converted with ``str``).
        points: point positions passed to ``trimesh.PointCloud`` as vertices.
        colors: per-point colors passed to ``trimesh.PointCloud``.
    """
    import trimesh

    cloud = trimesh.PointCloud(vertices=points, colors=colors)
    scene = trimesh.Scene()
    scene.add_geometry(cloud)
    scene.export(str(path))


def _depth_to_magma(depth: np.ndarray) -> np.ndarray:
    """Depth → magma-colormapped disparity image (uint8 RGB).

    Visualizes 1/depth (so near = bright) robustly normalized to the 5–95th
    percentile, matching the depth panel in the reference notebook. Invalid
    pixels (non-positive or non-finite depth) map to disparity 0.
    """
    from matplotlib import cm

    valid = (depth > 0) & np.isfinite(depth)
    disparity = np.zeros_like(depth, dtype=np.float32)
    if valid.any():
        disparity[valid] = 1.0 / np.maximum(depth[valid], 1e-8)
        lo, hi = np.percentile(disparity[valid], [5, 95])
        # Guard the denominator so a constant-disparity image doesn't divide
        # by zero.
        disparity = np.clip((disparity - lo) / max(hi - lo, 1e-8), 0, 1)
        disparity[~valid] = 0.0
    # Drop the alpha channel returned by the colormap.
    return (cm.magma(disparity)[..., :3] * 255).astype(np.uint8)


# /tmp is the writable mount on HF Spaces. We write the GLB here from the
# parent process (i.e. NOT inside the @spaces.GPU subprocess) so Gradio's
# file route can read it.
# Unique filename per call so Gradio's content-hashed cache always serves
# fresh bytes.
_ARTIFACT_DIR = Path("/tmp/flux_rgbd_artifacts")
# 2h is comfortably longer than any viewer session; keeps a busy Space's
# /tmp bounded since nothing else ever deletes these.
_ARTIFACT_TTL_S = 2 * 3600.0


def _prune_old_artifacts() -> None:
    """Delete ``cloud_*.glb`` files in the artifact dir older than the TTL.

    Best-effort: any ``OSError`` on an individual file is ignored.
    """
    now = time.time()
    for f in _ARTIFACT_DIR.glob("cloud_*.glb"):
        try:
            if now - f.stat().st_mtime > _ARTIFACT_TTL_S:
                f.unlink()
        except OSError:
            pass  # concurrent delete / fs hiccup — never fail a generation


_ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)


@spaces.GPU(duration=120)
def _sample_on_gpu(prompt: str, input_image, num_steps: int, cfg_scale: float,
                   seed: int, refine_depth: bool = True, log2_alpha: float = 5.0):
    """GPU-only step: text encode + diffusion sample + VAE decode.

    Returns plain numpy arrays so the parent process (which is what serves
    Gradio files) can do the rest. Writing the GLB here would leave it in the
    subprocess's filesystem view where the parent's Gradio file route can't
    find it (returns 404).

    Args:
        prompt: text prompt; stripped, and treated as "" when falsy.
        input_image: optional image array. When given, the "i2d" path is used
            and ``cfg_scale`` / ``refine_depth`` / ``log2_alpha`` are ignored.
        num_steps: sampling steps (cast to int).
        cfg_scale: guidance scale for the "joint" path.
        seed: sampling seed (cast to int).
        refine_depth: "joint" path only; forwarded as ``refine_depth_i2d``.
        log2_alpha: "joint" path only; forwarded to the runner.

    Returns:
        ``(rgb, depth, mode, elapsed)`` with ``mode`` in {"i2d", "joint"} and
        ``elapsed`` the seconds spent inside ``runner.generate``.
    """
    runner = _ensure_runner()
    target = runner.img_hw[0]
    text = prompt.strip() if prompt else ""

    if input_image is not None:
        mode = "i2d"
        letterboxed, (top, left, vh, vw) = _letterbox(input_image, target)
        # perf_counter is monotonic, so the reported duration can't be skewed
        # by wall-clock adjustments.
        t0 = time.perf_counter()
        # Image given: single image→depth pass at CFG 1.0 (no guidance — the
        # RGB is fixed, so there is nothing for CFG to steer).
        result = runner.generate(
            text,
            mode="i2d",
            num_steps=int(num_steps),
            cfg_scale=1.0,
            seed=int(seed),
            clean_rgb_image=letterboxed,
        )
        elapsed = time.perf_counter() - t0
        # Crop the zero padding back off both outputs so they cover only the
        # original image content.
        rows, cols = slice(top, top + vh), slice(left, left + vw)
        rgb_for_pc = letterboxed[rows, cols]
        depth = result["depth"][rows, cols]
    else:
        mode = "joint"
        t0 = time.perf_counter()
        # Text→RGBD. Stage 1 joint at the requested CFG (default 4.0), rgb-first
        # trajectory (log2_alpha=5) for cleaner depth. When `refine_depth` is on,
        # a stage 2 re-derives depth via i2d on that RGB at CFG 1.0 for sharper,
        # RGB-consistent geometry; otherwise the single joint pass is used.
        result = runner.generate(
            text,
            mode="joint",
            num_steps=int(num_steps),
            cfg_scale=float(cfg_scale),
            seed=int(seed),
            log2_alpha=float(log2_alpha),
            refine_depth_i2d=bool(refine_depth),
            i2d_cfg_scale=1.0,
        )
        elapsed = time.perf_counter() - t0
        rgb_for_pc = result["rgb"]
        depth = result["depth"]

    return rgb_for_pc, depth, mode, elapsed


def generate(prompt: str, input_image, num_steps: int, cfg_scale: float, seed: int,
             refine_depth: bool = True, log2_alpha: float = 5.0,
             edge_rtol: float = 0.04, sor: bool = False):
    """Public Gradio handler.

    Runs the GPU step, then builds the point cloud and writes the GLB here in
    the parent process so the file persists for Gradio.

    Returns:
        ``(rgb, depth_vis, cloud_path, status)``: the RGB image, the
        magma-colormapped depth image, the path of the written ``.glb`` (or
        ``None`` when the cloud has no points), and a one-line status string.
    """
    rgb_for_pc, depth, mode, elapsed = _sample_on_gpu(
        prompt, input_image, num_steps, cfg_scale, seed, refine_depth, log2_alpha,
    )
    pts, cols = _depth_to_pointcloud(rgb_for_pc, depth, edge_rtol=edge_rtol, sor=bool(sor))

    _prune_old_artifacts()
    if pts.shape[0]:
        cloud_path = str(_ARTIFACT_DIR / f"cloud_{uuid.uuid4().hex[:12]}.glb")
        _save_glb(cloud_path, pts, cols)
    else:
        # Nothing to show: skip the export instead of writing an empty GLB.
        # NOTE(review): assumes the 3-D output component accepts None (the UI
        # wiring is outside this block) — confirm.
        cloud_path = None

    valid = (depth > 0) & np.isfinite(depth)
    if valid.any():
        d = depth[valid]
        depth_summary = (
            f"depth median={float(np.median(d)):.2f}  "
            f"p5={float(np.percentile(d, 5)):.2f}  "
            f"p95={float(np.percentile(d, 95)):.2f}"
        )
    else:
        depth_summary = "depth has no valid pixels"

    status = f"{mode} · {elapsed:.1f} s · {depth_summary} · {pts.shape[0]:,} points"
    return rgb_for_pc, _depth_to_magma(depth), cloud_path, status


# --- Presentation layer ----------------------------------------------------
# Only the Gradio UI definition lives below. The generation/model code above
# is untouched.
WORLD_LABS_URL = "https://www.worldlabs.ai" _PROJECT_URL = "https://modality-forcing.github.io/" _ARXIV_URL = "https://arxiv.org/abs/2606.13676" _CODE_URL = "https://github.com/Duisterhof/modality-forcing" # Editorial monochrome: a fully neutral palette, Inter for body, JetBrains # Mono for the small uppercase "eyebrow" labels. The serif display face for # the title (Gilda Display) is pulled in via @import in the CSS below. _THEME = gr.themes.Default( # System fonts only — no Google-fetched web fonts for the body/mono, which # were loading unreliably (falling back to Arial and looking cheap). The # serif display title uses Gilda Display, pulled in via @import in the CSS. font=( "system-ui", "-apple-system", "Segoe UI", "Helvetica Neue", "Arial", "sans-serif", ), font_mono=( "ui-monospace", "SFMono-Regular", "Menlo", "Consolas", "monospace", ), primary_hue=gr.themes.colors.neutral, secondary_hue=gr.themes.colors.neutral, neutral_hue=gr.themes.colors.neutral, ).set( # Hairline, low-contrast borders; no heavy shadows or filled labels. block_border_width="1px", block_border_color="*neutral_200", block_background_fill="white", block_shadow="none", block_label_background_fill="transparent", block_label_border_width="0px", block_label_text_weight="500", input_border_width="1px", input_border_color="*neutral_200", input_shadow="none", # Near-black, fully-rounded primary button (pill); white secondary. 
button_large_radius="*radius_xxl", button_small_radius="*radius_xxl", button_primary_background_fill="#111111", button_primary_background_fill_hover="#1f1f1f", button_primary_text_color="white", button_primary_border_color="#111111", button_secondary_background_fill="white", button_secondary_border_color="rgba(0,0,0,0.16)", ) _CSS = """ @import url('https://fonts.googleapis.com/css2?family=Gilda+Display&display=swap'); /* Warm "paper" canvas everywhere (page + app + container) so the white component cards lift off the background and the layout reads premium rather than flat white-on-white. Body uses the system UI font. */ html, body, gradio-app, .gradio-container, .gradio-container .gap { background: #f4f3ef !important; } .gradio-container { color: #141414 !important; max-width: 1120px !important; margin: 0 auto !important; padding: 28px 24px 12px !important; font-family: system-ui, -apple-system, "Segoe UI", "Helvetica Neue", Arial, sans-serif !important; } /* Components become quiet white cards: hairline edge, soft round corners, and a whisper of shadow for depth. (Also overrides this Gradio build's hardcoded 3px black .block border.) */ .gradio-container .block { border: 1px solid rgba(20,20,20,0.07) !important; border-radius: 16px !important; background: #ffffff !important; box-shadow: 0 1px 2px rgba(20,20,20,0.04), 0 12px 28px -18px rgba(20,20,20,0.18) !important; } /* Text/HTML blocks float on the page — no card border, fill, or shadow. */ .gradio-container .mf-bare { border: 0 !important; background: transparent !important; box-shadow: none !important; padding: 0 !important; } /* Hairline rule separating the masthead from the workspace. */ .mf-rule { height: 1px; border: 0; background: rgba(20,20,20,0.08); max-width: 1080px; margin: 0.75rem auto 1.5rem; } /* ---- Publication header (mirrors the project page) ---- */ /* Everything in the masthead is centered. Forced with !important because Gradio's prose CSS otherwise left-aligns
/
'
'
'
'