Spaces:

GF-John
/

moondream-pointer

Running on Zero

John Ho

added input for reasoning and settings

c9780e6 1 day ago

3.08 kB

	import json
	import gradio as gr
	import spaces, torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	from PIL import Image
	from typing import Literal


	# NOTE: load_model is intentionally left without the @spaces.GPU(duration=30)
	# decorator; see the comment at its module-level call site for the reason.
	def load_model():
	    """Load the moondream3-preview checkpoint onto CUDA, compile it and return it.

	    The weights are requested in bfloat16 and the whole model is mapped to
	    the "cuda" device. ``trust_remote_code=True`` is passed, so the modelling
	    code shipped with the checkpoint is executed.

	    Returns:
	        The model object after ``compile()`` has been called on it.
	    """
	    # Previous backend, kept for reference: "vikhyatk/moondream2" pinned at
	    # revision="2025-04-14", loaded with the same trust_remote_code and
	    # device_map arguments (no explicit dtype).
	    model = AutoModelForCausalLM.from_pretrained(
	        "moondream/moondream3-preview",
	        device_map={"": "cuda"},
	        dtype=torch.bfloat16,
	        trust_remote_code=True,
	    )
	    model.compile()
	    return model


	# Load the model once at import time and share it between requests.
	# load_model is called here, outside ZeroGPU scope, which is why it must not
	# be decorated with spaces.GPU: calling spaces.GPU decorated functions outside
	# ZeroGPU scope will cause a PicklingError.
	_MODEL = load_model()


	@spaces.GPU(duration=30)
	def detect(
	    im: Image.Image,
	    object_name: str,
	    mode: Literal["point", "object_detection", "query"],
	    reasoning: bool = False,
	    settings: dict = {"temperature": 0.0, "top_p": 0.95, "max_tokens": 512},
	):
	    """
	    Open Vocabulary Detection and Visual Question Answering using moondream3-preview

	    Args:
	        im: Pillow Image
	        object_name: the object you would like to detect, or the question to ask when mode is "query"
	        mode: point, object_detection, or query
	        reasoning: enable chain-of-thought reasoning, only used when mode is "query"
	        settings: dict or JSON object string of model settings (query uses temperature / top_p / max_tokens, point and object_detection use max_objects)
	    Returns:
	        For "point" / "object_detection": a list of points (xy) or bounding boxes (xyxy) with normalized coordinates.
	        For "query": a dict {"answer": str} with the answer to the question.
	    """
	    # NOTE(review): each argument is kept on a single docstring line on purpose;
	    # the docstring is presumably what the MCP/API schema is generated from
	    # (the app is launched with mcp_server=True) - confirm before reflowing it.
	    model = _MODEL  # shared instance loaded at import time

	    if isinstance(settings, str):
	        # The UI sends the settings as raw text from a textbox.
	        try:
	            settings = json.loads(settings)
	        except json.JSONDecodeError as err:
	            # Surface a readable message instead of an opaque traceback.
	            raise gr.Error(f"Settings must be valid JSON: {err}") from err

	    if settings is not None:
	        if not isinstance(settings, dict):
	            # e.g. the text "3" or "[1, 2]" parses fine but is not a settings object.
	            raise gr.Error('Settings must be a JSON object, e.g. {"max_tokens": 512}')
	        # Work on a copy so the shared default dict in the signature (or a
	        # caller-owned dict) can never be mutated downstream.
	        settings = dict(settings)

	    if mode == "point":
	        return model.point(im, object_name, settings=settings)["points"]
	    if mode == "object_detection":
	        return model.detect(im, object_name, settings=settings)["objects"]
	    if mode == "query":
	        return model.query(im, object_name, reasoning=reasoning, settings=settings)

	    # Previously an unknown or missing mode fell through and returned None.
	    raise gr.Error(
	        f"Unknown mode {mode!r}; expected 'point', 'object_detection' or 'query'"
	    )


	# Gradio UI: one input widget per detect() parameter, in positional order
	# (image, object/question, mode, reasoning, settings); the result is shown as JSON.
	demo = gr.Interface(
	    fn=detect,
	    title="moondream-pointer",
	    description="using [moondream3-preview](https://huggingface.co/moondream/moondream3-preview) for object grounding",
	    inputs=[
	        gr.Image(label="Input Image", type="pil"),
	        gr.Textbox(
	            label="Object / Question",
	            info="object to detect (for points / object_detection) or question for a query",
	        ),
	        # Explicit default so submitting without touching the dropdown always
	        # sends one of the modes detect() handles.
	        gr.Dropdown(
	            label="Mode",
	            choices=["point", "object_detection", "query"],
	            value="point",
	        ),
	        gr.Checkbox(
	            label="Reasoning",
	            value=False,
	            info="enable [chain-of-thought](https://huggingface.co/moondream/moondream3-preview#query) (query mode only)",
	        ),
	        # Sent to detect() as a string; detect() parses it as JSON.
	        gr.Textbox(
	            label="Settings (JSON)",
	            value='{"temperature": 0.0, "top_p": 0.95, "max_tokens": 512}',
	            info="query: temperature / top_p / max_tokens · point & object_detection: max_objects",
	        ),
	    ],
	    outputs=gr.JSON(label="Output JSON"),
	)
	demo.launch(
	    mcp_server=True,
	    app_kwargs={"docs_url": "/docs"},  # add FastAPI Swagger API Docs
	)