John Ho
added input for reasoning and settings
c9780e6
import json
from typing import Literal, Optional, Union

import gradio as gr
import spaces, torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
# NOTE: deliberately not decorated with @spaces.GPU(duration=30); this
# function is called at module import time, outside any ZeroGPU scope.
def load_model():
    """Load the moondream3-preview checkpoint on CUDA and compile it.

    Returns:
        The model object produced by ``AutoModelForCausalLM.from_pretrained``
        after its ``compile()`` method has been called.
    """
    # Previously this loaded "vikhyatk/moondream2" (revision "2025-04-14")
    # with the same trust_remote_code / device_map options.
    load_options = {
        "trust_remote_code": True,  # the checkpoint ships its own model code
        "dtype": torch.bfloat16,
        "device_map": {"": "cuda"},  # place the whole model on the GPU
    }
    model = AutoModelForCausalLM.from_pretrained(
        "moondream/moondream3-preview", **load_options
    )
    model.compile()
    return model
# Load the model once at import time. load_model() is intentionally a plain
# function: calling spaces.GPU-decorated functions outside ZeroGPU scope will
# cause a PicklingError.
_MODEL = load_model()
@spaces.GPU(duration=30)
def detect(
    im: Image.Image,
    object_name: str,
    mode: Literal["point", "object_detection", "query"],
    reasoning: bool = False,
    settings: Optional[Union[dict, str]] = None,
):
    """
    Open Vocabulary Detection and Visual Question Answering using moondream3-preview

    Args:
        im: Pillow Image
        object_name: the object you would like to detect, or the question to ask when mode is "query"
        mode: point, object_detection, or query
        reasoning: enable chain-of-thought reasoning; only used when mode is "query"
        settings: model settings as a dict or a JSON object string; when omitted or blank, temperature 0.0, top_p 0.95 and max_tokens 512 are used

    Returns:
        For "point" / "object_detection": a list of points (xy) or bounding boxes (xyxy) with normalized coordinates.
        For "query": a dict {"answer": str} with the answer to the question.

    Raises:
        ValueError: if mode is not one of the supported values, or if settings is neither a dict nor a JSON object string.
    """
    supported_modes = ("point", "object_detection", "query")
    if mode not in supported_modes:
        # Fail loudly instead of silently returning None for an unknown mode.
        raise ValueError(
            f"unsupported mode {mode!r}; expected one of {supported_modes}"
        )

    if isinstance(settings, str):
        # The UI textbox delivers settings as JSON text; a blank box means
        # "use the defaults" rather than a JSON parse error.
        settings = json.loads(settings) if settings.strip() else None

    if settings is None:
        # Built fresh on every call so no mutable default is shared.
        settings = {"temperature": 0.0, "top_p": 0.95, "max_tokens": 512}
    elif isinstance(settings, dict):
        # Shallow copy: the model never sees (or mutates) the caller's dict.
        settings = dict(settings)
    else:
        raise ValueError(
            "settings must be a dict or a JSON object string, got "
            f"{type(settings).__name__}"
        )

    model = _MODEL  # loaded once at import time
    if mode == "point":
        return model.point(im, object_name, settings=settings)["points"]
    if mode == "object_detection":
        return model.detect(im, object_name, settings=settings)["objects"]
    # mode == "query": the only mode that honours the reasoning flag
    return model.query(im, object_name, reasoning=reasoning, settings=settings)
# Gradio UI: the inputs map positionally onto detect()'s parameters
# (im, object_name, mode, reasoning, settings); the result is shown as JSON.
demo = gr.Interface(
    fn=detect,
    title="moondream-pointer",
    description="using [moondream3-preview](https://huggingface.co/moondream/moondream3-preview) for object grounding",
    inputs=[
        gr.Image(label="Input Image", type="pil"),
        gr.Textbox(
            label="Object / Question",
            info="object to detect (for points / object_detection) or question for a query",
        ),
        gr.Dropdown(label="Mode", choices=["point", "object_detection", "query"]),
        gr.Checkbox(
            label="Reasoning",
            value=False,
            info="enable [chain-of-thought](https://huggingface.co/moondream/moondream3-preview#query) (query mode only)",
        ),
        # Settings are entered as JSON text and arrive in detect() as a string.
        gr.Textbox(
            label="Settings (JSON)",
            value='{"temperature": 0.0, "top_p": 0.95, "max_tokens": 512}',
            info="query: temperature / top_p / max_tokens · point & object_detection: max_objects",
        ),
    ],
    outputs=gr.JSON(label="Output JSON"),
)
# Start the app with the MCP server enabled; "docs_url" adds the FastAPI
# Swagger API docs at /docs.
demo.launch(
    mcp_server=True,
    app_kwargs={"docs_url": "/docs"},
)