max044
/

vl-jepa-custom

temporal-grounding

action-retrieval

Model card Files Files and versions

vl-jepa-custom / sample_inference.py

max044's picture

Upload folder using huggingface_hub

2fd5fdc verified 3 months ago

history blame contribute delete

2.22 kB

	import torch
	import cv2
	import numpy as np
	from PIL import Image
	from vljepa.config import Config
	from vljepa.models import VLJepa
	from vljepa.utils import nms

	def load_model(checkpoint_path, device="cpu"):
	    """Build a VLJepa model and restore its saved sub-module weights.

	    Args:
	        checkpoint_path: Location of the checkpoint file passed to ``torch.load``.
	        device: Device string stored on the config and used as ``map_location``.

	    Returns:
	        A ``(model, config)`` tuple, with the model switched to eval mode.
	    """
	    cfg = Config()
	    cfg.device = device
	    net = VLJepa(cfg)

	    print(f"Loading weights from {checkpoint_path}...")
	    state = torch.load(checkpoint_path, map_location=device, weights_only=True)

	    # Only two sub-modules are restored from the checkpoint: the predictor
	    # and the projection head of the y-encoder.
	    net.predictor.load_state_dict(state["predictor_state_dict"])
	    net.y_encoder.projection.load_state_dict(state["y_projection_state_dict"])

	    net.eval()
	    return net, cfg

	def extract_frames(video_path, num_frames=16):
	    """Sample evenly spaced RGB frames from a video file.

	    Args:
	        video_path: Path handed to ``cv2.VideoCapture``.
	        num_frames: How many frame indices to sample between the first and
	            the last frame of the video.

	    Returns:
	        A list of frames converted from BGR to RGB. The list is empty when
	        the reported frame count is not positive, and it can be shorter than
	        ``num_frames`` because frames that fail to read are skipped.
	    """
	    cap = cv2.VideoCapture(video_path)
	    try:
	        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
	        if total_frames <= 0:
	            return []

	        indices = np.linspace(0, total_frames - 1, num_frames).astype(int)
	        frames = []
	        for idx in indices:
	            # Seek to the sampled index; pass a plain int rather than a
	            # NumPy scalar to the OpenCV binding.
	            cap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
	            ret, frame = cap.read()
	            if ret:
	                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
	        return frames
	    finally:
	        # Release the capture on every exit path, including the early
	        # return above and any exception raised while seeking or decoding.
	        cap.release()

	def main():
	    """Load the checkpoint and demonstrate tokenizing and encoding one query.

	    Prints the selected device, the configured model components, the query,
	    and the shape of the resulting text embedding. No video is processed.
	    """
	    query = "a person is opening a door"
	    video_path = "sample_video.mp4" # Replace with a real video path
	    checkpoint_path = "best.pth"

	    if torch.cuda.is_available():
	        device = "cuda"
	    else:
	        device = "cpu"

	    model, config = load_model(checkpoint_path, device)

	    # This is a simplified inference demonstration.
	    # In a real scenario, you would use a sliding window approach as seen in infer.py
	    print(f"Ready for inference on {device}.")
	    print(f"Model architecture: {config.clip_model} + {config.predictor_model} (LoRA) + {config.text_model}")

	    # Tokenization example; the tokens are not used further in this demo.
	    query_tokens = model.query_encoder.tokenize([query], device=device)

	    # Text encoding example, run without tracking gradients.
	    with torch.no_grad():
	        text_embedding = model.encode_text([query], device=device)

	    print(f"Query: '{query}'")
	    print(f"Text embedding shape: {text_embedding.shape}")
	    print("\nTo perform full temporal localization, use the infer.py script which implements sliding window and NMS.")


	# Run the demonstration only when this file is executed as a script.
	if __name__ == "__main__":
	    main()