Instructions to use OpenGVLab/InternVL-14B-224px with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use OpenGVLab/InternVL-14B-224px with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("image-feature-extraction", model="OpenGVLab/InternVL-14B-224px", trust_remote_code=True)# Load model directly from transformers import AutoProcessor, AutoModel processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL-14B-224px", trust_remote_code=True) model = AutoModel.from_pretrained("OpenGVLab/InternVL-14B-224px", trust_remote_code=True) - Notebooks
- Google Colab
- Kaggle
| # -------------------------------------------------------- | |
| # InternVL | |
| # Copyright (c) 2023 OpenGVLab | |
| # Licensed under The MIT License [see LICENSE for details] | |
| # -------------------------------------------------------- | |
| import torch | |
| import torch.nn as nn | |
| import torchvision.transforms as T | |
| from torchvision.transforms import InterpolationMode | |
| from transformers import LlamaTokenizer | |
| from .configuration_intern_vit import InternVisionConfig | |
| from .configuration_internvl import InternVLConfig | |
| from .modeling_intern_vit import InternVisionModel | |
| from .modeling_internvl import InternVL_C, InternVL_G, InternVLModel | |
| __all__ = ['InternVisionConfig', 'InternVisionModel', 'InternVLConfig', | |
| 'InternVLModel', 'InternVL_C', 'InternVL_G'] | |
| # Prefix the text "summarize:" | |
| class InternVLTokenizer(nn.Module): | |
| def __init__(self, model_path): | |
| super(InternVLTokenizer, self).__init__() | |
| self.tokenizer = LlamaTokenizer.from_pretrained(model_path) | |
| self.tokenizer.pad_token = ' ' # allow padding | |
| self.tokenizer.add_eos_token = True | |
| def forward(self, text, prefix='summarize:'): | |
| if type(text) == str: | |
| text = prefix + text | |
| elif type(text) == list: | |
| text = [prefix + item for item in text] | |
| text = self.tokenizer(text, return_tensors='pt', max_length=80, truncation=True, padding='max_length').input_ids | |
| return text | |
| def build_transform(task, image_size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]): | |
| if task == 'retrieval': | |
| transform = T.Compose([ | |
| T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), | |
| T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), | |
| T.ToTensor(), | |
| T.Normalize(mean=mean, std=std)]) | |
| else: | |
| transform = T.Compose([ | |
| T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), | |
| T.Resize(image_size, interpolation=InterpolationMode.BICUBIC), | |
| T.CenterCrop(image_size), | |
| T.ToTensor(), | |
| T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) | |
| return transform | |
| def load_internvl_c_huggingface(ckpt_path, device, task): | |
| model = InternVL_C.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device) | |
| if model.config.use_backbone_lora: | |
| model.vision_model.merge_and_unload() | |
| model.vision_model = model.vision_model.model | |
| if model.config.use_qllama_lora: | |
| model.qllama.merge_and_unload() | |
| model.qllama = model.qllama.model | |
| if model.config.force_image_size is not None: | |
| image_size = model.config.force_image_size | |
| else: | |
| image_size = model.config.vision_config.image_size | |
| transform = build_transform(task, image_size) | |
| tokenizer = InternVLTokenizer(ckpt_path) | |
| return model, transform, tokenizer | |
| def load_internvl_g_huggingface(ckpt_path, device, task): | |
| model = InternVL_G.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device) | |
| if model.config.use_backbone_lora: | |
| model.vision_model.merge_and_unload() | |
| model.vision_model = model.vision_model.model | |
| if model.config.use_qllama_lora: | |
| model.qllama.merge_and_unload() | |
| model.qllama = model.qllama.model | |
| if model.config.force_image_size is not None: | |
| image_size = model.config.force_image_size | |
| else: | |
| image_size = model.config.vision_config.image_size | |
| transform = build_transform(task, image_size) | |
| tokenizer = InternVLTokenizer(ckpt_path) | |
| return model, transform, tokenizer | |