InternVideo3: Agentify Foundation Models with Multimodal Contextual Reasoning
Paper • 2606.12195 • Published • 20
How to use yanziang/InternVideo3-8B-Instruct with Transformers:
# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("yanziang/InternVideo3-8B-Instruct", trust_remote_code=True, dtype="auto")

InternVideo3 is a multimodal large language model designed for long-horizon video understanding and agentic reasoning. It introduces Multimodal Contextual Reasoning (MCR), an efficient formulation that unifies perception, planning, tool use, self-reflection, and memory within a single shared context, enabling recursive multi-step reasoning over long videos.
| Component | Details |
|---|---|
| Vision Encoder | 27-layer ViT, hidden_size=1152, patch_size=16, temporal_patch_size=2 |
| Language Model | 36-layer, hidden_size=4096, 32 attention heads |
| KV Latent Rank | 896 per layer |
| Max Context | 262,144 tokens |
| Precision | BFloat16 |
# Quote the version specifier: unquoted, the shell parses ">" as an output
# redirection, installs an unpinned transformers and creates a file "=4.57.3".
pip install "transformers>=4.57.3" torch qwen-vl-utils
import torch
from transformers import AutoModelForCausalLM, AutoProcessor
# Hub repository to load the checkpoint and processor from.
# NOTE(review): the "Load model directly" snippet earlier in this card uses
# the "yanziang/" namespace instead — confirm which repo id is canonical.
model_path = "OpenGVLab/InternVideo3-8B-Instruct"

# Load the weights in bfloat16 with the SDPA attention implementation.
# trust_remote_code=True is passed for both the model and the processor.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map="auto",
    attn_implementation="sdpa",
    dtype=torch.bfloat16,
)

# The processor builds the chat prompt and the model inputs in the examples
# that follow.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
# Text-only chat: one user turn, no image or video attached.
messages = [
    {
        "role": "user",
        "content": [{"type": "text", "text": "Please introduce yourself."}],
    }
]

# Render the conversation to a prompt string (tokenize=False) and append the
# generation prompt.
prompt = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,
)

# Tokenize the prompt (no visual inputs) and move the tensors to the model's
# device.
model_inputs = processor(
    text=prompt,
    images=None,
    videos=None,
    do_resize=False,
    return_tensors="pt",
).to(model.device)

sequences = model.generate(**model_inputs, max_new_tokens=1024, use_cache=True)

# Drop the leading prompt tokens from every returned sequence so that only the
# newly generated tokens are decoded.
completions = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(model_inputs.input_ids, sequences)
]
print(processor.batch_decode(completions, skip_special_tokens=True)[0])
# Video question answering.
video_path = "your_video.mp4"
fps = 1  # sampling rate forwarded to the processor, in the message and below

# Per-frame pixel bounds (both set to the same value here).
min_pixels = 128 * 32 * 32
max_pixels = 128 * 32 * 32

# Frame-count bounds used to scale the per-frame pixel bounds below.
# NOTE(review): the original snippet used these names without defining them
# (NameError). The values here are illustrative placeholders — confirm the
# recommended settings with the model authors and tune them to your video
# length and GPU memory.
min_frames = 4
max_frames = 256

messages = [
    {
        "role": "user",
        "content": [
            {"type": "video", "video": video_path, "fps": fps},
            {"type": "text", "text": "Please describe this video in detail."},
        ],
    }
]

# The video processor's size bounds are set to per-frame pixels multiplied by
# the frame count.
processor.video_processor.size = {
    "longest_edge": max_pixels * max_frames,
    "shortest_edge": min_pixels * min_frames,
}

# tokenize=True with return_dict=True makes apply_chat_template return the
# model inputs directly instead of a prompt string.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    fps=fps,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

output = model.generate(**inputs, max_new_tokens=1024, use_cache=True)

# Drop the leading prompt tokens from every returned sequence so that only the
# newly generated tokens are decoded.
generated_ids = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output)
]
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
# Image question answering.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "your_image.jpg"},
            {"type": "text", "text": "Please describe this image in detail."},
        ],
    }
]

# The original snippet passed an undefined `images` variable to the processor
# (NameError). Here the processor loads the image referenced in `messages`
# itself, mirroring the video example: tokenize=True with return_dict=True
# returns the model inputs directly.
# NOTE(review): do_resize=False is intentionally not forwarded, since nothing
# pre-resizes the image on this path — confirm against the authors' intended
# preprocessing.
inputs = processor.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
    enable_thinking=True,
)
inputs = inputs.to(model.device)

output = model.generate(**inputs, max_new_tokens=1024, use_cache=True)

# Drop the leading prompt tokens from every returned sequence so that only the
# newly generated tokens are decoded.
generated_ids = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, output)
]
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
@misc{yan2026internvideo3agentifyfoundationmodels,
title={InternVideo3: Agentify Foundation Models with Multimodal Contextual Reasoning},
author={Ziang Yan and Sheng Xia and Jiashuo Yu and Yue Wu and Tianxiang Jiang and Songze Li and Kanghui Tian and Yicheng Xu and Yinan He and Kai Chen and Limin Wang and Yu Qiao and Yi Wang},
year={2026},
eprint={2606.12195},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2606.12195},
}
## License
This project is released under the Apache 2.0 License.