Instructions to use DeepXR/Helion-V1 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use DeepXR/Helion-V1 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="DeepXR/Helion-V1")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForMultimodalLM

tokenizer = AutoTokenizer.from_pretrained("DeepXR/Helion-V1")
model = AutoModelForMultimodalLM.from_pretrained("DeepXR/Helion-V1")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use DeepXR/Helion-V1 with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "DeepXR/Helion-V1"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V1",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker
docker model run hf.co/DeepXR/Helion-V1
- SGLang
How to use DeepXR/Helion-V1 with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "DeepXR/Helion-V1" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V1",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "DeepXR/Helion-V1" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "DeepXR/Helion-V1",
    "messages": [
      {
        "role": "user",
        "content": "What is the capital of France?"
      }
    ]
  }'
```
- Docker Model Runner
How to use DeepXR/Helion-V1 with Docker Model Runner:
docker model run hf.co/DeepXR/Helion-V1
| """ | |
| Helion-V1 Production Deployment Script | |
| Optimized for serving with vLLM, TGI, or custom inference servers | |
| """ | |
| import os | |
| import json | |
| import logging | |
| from typing import Dict, List, Optional | |
| from dataclasses import dataclass | |
| import asyncio | |
# Configure root logging once, at import time: INFO and above, with every
# record stamped with its timestamp, logger name and severity.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)
@dataclass
class DeploymentConfig:
    """Configuration for model deployment.

    Declared as a dataclass so it can be built with keyword overrides, e.g.
    ``DeploymentConfig(model_name=..., quantization="awq")``; without the
    decorator the class has no generated ``__init__`` and that call raises
    ``TypeError``. Every field has a default, so ``DeploymentConfig()`` is
    also valid.
    """

    # --- Engine settings (forwarded to the vLLM engine constructor) ---
    model_name: str = "DeepXR/Helion-V1"
    tensor_parallel_size: int = 1
    max_model_len: int = 4096
    max_num_seqs: int = 256
    gpu_memory_utilization: float = 0.90
    trust_remote_code: bool = True
    quantization: Optional[str] = None  # "awq", "gptq", or None
    dtype: str = "bfloat16"
    enforce_eager: bool = False

    # --- Safety settings (default sampling parameters for generation) ---
    max_tokens: int = 2048
    temperature: float = 0.7
    top_p: float = 0.9
    frequency_penalty: float = 0.1
    presence_penalty: float = 0.1

    # --- Rate limiting ---
    # NOTE(review): these two values are declared but not read by any code
    # visible in this file; enforcement presumably lives elsewhere.
    rate_limit_requests_per_minute: int = 60
    rate_limit_tokens_per_minute: int = 90000
class HelionDeployment:
    """
    Production deployment handler for Helion-V1.

    Wraps a vLLM engine for generation, can expose it through a FastAPI
    application, export the model to ONNX, and write a Dockerfile plus
    docker-compose.yml for containerized deployment. vLLM is the only
    inference backend implemented in this class.
    """

    def __init__(self, config: "DeploymentConfig"):
        """Store the configuration; the engine and tokenizer are loaded lazily.

        Args:
            config: Deployment settings (model name, engine and sampling options).
        """
        self.config = config
        self.model = None      # vLLM engine, set by deploy_vllm()
        self.tokenizer = None  # chat-template tokenizer, set on first use

    def deploy_vllm(self) -> bool:
        """Deploy using vLLM for high-throughput inference.

        Returns:
            True when the engine was created and stored on ``self.model``;
            False when vLLM is not importable or engine construction failed
            (the reason is logged in both cases).
        """
        # Keep the import in its own try block so that only a genuinely
        # missing vLLM produces the "not installed" message.
        try:
            from vllm import LLM
        except ImportError:
            logger.error("vLLM not installed. Install with: pip install vllm")
            return False

        logger.info("Initializing vLLM engine...")
        cfg = self.config
        try:
            self.model = LLM(
                model=cfg.model_name,
                tensor_parallel_size=cfg.tensor_parallel_size,
                max_model_len=cfg.max_model_len,
                max_num_seqs=cfg.max_num_seqs,
                gpu_memory_utilization=cfg.gpu_memory_utilization,
                trust_remote_code=cfg.trust_remote_code,
                quantization=cfg.quantization,
                dtype=cfg.dtype,
                enforce_eager=cfg.enforce_eager,
            )
        except Exception as e:  # top-level boundary: report and signal failure
            logger.exception("Failed to initialize vLLM: %s", e)
            return False

        logger.info("✅ vLLM engine initialized successfully")
        return True

    def get_sampling_params(
        self,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        top_p: Optional[float] = None,
    ) -> 'SamplingParams':
        """Get vLLM sampling parameters.

        Called without arguments this returns the configured defaults. Each
        argument, when not None, overrides the corresponding default for a
        single request.

        Args:
            max_tokens: Requested completion length; capped at
                ``config.max_tokens`` so a request cannot exceed the
                configured ceiling.
            temperature: Sampling temperature override.
            top_p: Nucleus-sampling override.

        Returns:
            A ``vllm.SamplingParams`` instance.
        """
        from vllm import SamplingParams

        cfg = self.config
        if max_tokens is None:
            effective_max_tokens = cfg.max_tokens
        else:
            effective_max_tokens = min(max_tokens, cfg.max_tokens)

        return SamplingParams(
            temperature=cfg.temperature if temperature is None else temperature,
            top_p=cfg.top_p if top_p is None else top_p,
            max_tokens=effective_max_tokens,
            frequency_penalty=cfg.frequency_penalty,
            presence_penalty=cfg.presence_penalty,
        )

    def generate_vllm(
        self,
        prompts: List[str],
        sampling_params: Optional['SamplingParams'] = None,
    ) -> List[str]:
        """Generate responses using vLLM.

        Args:
            prompts: Fully formatted prompt strings.
            sampling_params: Optional per-call sampling parameters; the
                configured defaults are used when omitted.

        Returns:
            The text of the first candidate for each prompt, in input order.

        Raises:
            RuntimeError: If deploy_vllm() has not successfully run.
        """
        if self.model is None:
            raise RuntimeError("Model not initialized. Call deploy_vllm() first.")
        if not prompts:
            return []
        if sampling_params is None:
            sampling_params = self.get_sampling_params()
        outputs = self.model.generate(prompts, sampling_params)
        return [output.outputs[0].text for output in outputs]

    def _get_tokenizer(self):
        """Return the tokenizer for the configured model, loading it once."""
        if self.tokenizer is None:
            from transformers import AutoTokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
        return self.tokenizer

    def create_fastapi_server(self):
        """Create FastAPI server for HTTP API.

        Registers three routes on the returned application:
        ``POST /v1/chat/completions``, ``GET /health`` and ``GET /``.

        Returns:
            The FastAPI application, or None when FastAPI/uvicorn/pydantic
            cannot be imported.
        """
        try:
            from fastapi import FastAPI, HTTPException
            from fastapi.middleware.cors import CORSMiddleware
            from pydantic import BaseModel
            import uvicorn  # noqa: F401 - imported only to fail early if the server runner is missing
        except ImportError:
            logger.error("FastAPI not installed. Install with: pip install fastapi uvicorn")
            return None

        app = FastAPI(
            title="Helion-V1 API",
            description="Safe and helpful AI assistant API",
            version="1.0.0"
        )

        # CORS middleware
        # NOTE(review): wildcard origins together with allow_credentials=True is
        # very permissive; restrict allow_origins before exposing this publicly.
        app.add_middleware(
            CORSMiddleware,
            allow_origins=["*"],
            allow_credentials=True,
            allow_methods=["*"],
            allow_headers=["*"],
        )

        class ChatRequest(BaseModel):
            messages: List[Dict[str, str]]
            max_tokens: Optional[int] = 512
            temperature: Optional[float] = 0.7
            top_p: Optional[float] = 0.9

        class ChatResponse(BaseModel):
            response: str
            model: str
            usage: Dict[str, int]

        @app.post("/v1/chat/completions", response_model=ChatResponse)
        async def chat_completion(request: ChatRequest):
            """OpenAI-compatible chat completion endpoint."""
            try:
                # Format messages with the model's chat template. The
                # tokenizer is cached on the instance instead of being
                # reloaded for every request.
                tokenizer = self._get_tokenizer()
                prompt = tokenizer.apply_chat_template(
                    request.messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                # Honour the per-request sampling fields (max_tokens is
                # capped by the configured ceiling).
                sampling_params = self.get_sampling_params(
                    max_tokens=request.max_tokens,
                    temperature=request.temperature,
                    top_p=request.top_p,
                )

                # Generate response.
                # NOTE(review): generate_vllm is synchronous, so this call
                # occupies the event loop until generation finishes.
                completion = self.generate_vllm([prompt], sampling_params)[0]

                prompt_tokens = len(tokenizer.encode(prompt))
                completion_tokens = len(tokenizer.encode(completion))
                return ChatResponse(
                    response=completion,
                    model=self.config.model_name,
                    usage={
                        "prompt_tokens": prompt_tokens,
                        "completion_tokens": completion_tokens,
                        # Sum the parts so the three figures always agree.
                        "total_tokens": prompt_tokens + completion_tokens,
                    }
                )
            except Exception as e:
                logger.error("Generation error: %s", e)
                raise HTTPException(status_code=500, detail=str(e)) from e

        @app.get("/health")
        async def health_check():
            """Health check endpoint."""
            return {"status": "healthy", "model": self.config.model_name}

        @app.get("/")
        async def root():
            """Root endpoint."""
            return {
                "name": "Helion-V1 API",
                "version": "1.0.0",
                "status": "online"
            }

        return app

    def export_onnx(self, output_path: str = "./helion_onnx") -> bool:
        """Export model to ONNX format for optimized deployment.

        Args:
            output_path: Directory that receives the exported model and tokenizer.

        Returns:
            True on success; False when Optimum/Transformers are not
            importable or the export fails (the reason is logged).
        """
        try:
            from optimum.onnxruntime import ORTModelForCausalLM
            from transformers import AutoTokenizer
        except ImportError:
            logger.error("Optimum not installed. Install with: pip install optimum[onnxruntime-gpu]")
            return False

        logger.info("Exporting model to ONNX...")
        try:
            model = ORTModelForCausalLM.from_pretrained(
                self.config.model_name,
                export=True
            )
            tokenizer = AutoTokenizer.from_pretrained(self.config.model_name)
            model.save_pretrained(output_path)
            tokenizer.save_pretrained(output_path)
        except Exception as e:  # top-level boundary: report and signal failure
            logger.error("ONNX export failed: %s", e)
            return False

        logger.info("✅ Model exported to %s", output_path)
        return True

    def _render_dockerfile(self) -> str:
        """Return the Dockerfile text for the current configuration."""
        cfg = self.config
        return f"""FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04

# Set working directory
WORKDIR /app

# Install Python and dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \\
    python3.10 \\
    python3-pip \\
    git \\
    curl \\
    && rm -rf /var/lib/apt/lists/*

# Install Python packages
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Install vLLM for high-performance inference
RUN pip3 install vllm

# Copy application code
COPY . .

# Set environment variables
ENV MODEL_NAME={cfg.model_name}
ENV MAX_MODEL_LEN={cfg.max_model_len}
ENV GPU_MEMORY_UTILIZATION={cfg.gpu_memory_utilization}
ENV TENSOR_PARALLEL_SIZE={cfg.tensor_parallel_size}

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \\
    CMD curl -f http://localhost:8000/health || exit 1

# Run the application
CMD ["python3", "deployment.py", "--server"]
"""

    def _render_compose(self) -> str:
        """Return the docker-compose.yml text for the current configuration."""
        return f"""version: '3.8'

services:
  helion-v1:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL_NAME={self.config.model_name}
      - CUDA_VISIBLE_DEVICES=0
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    volumes:
      - model_cache:/root/.cache/huggingface
    restart: unless-stopped

volumes:
  model_cache:
"""

    def create_docker_config(self, output_path: str = "./"):
        """Generate Dockerfile for containerized deployment.

        Writes ``Dockerfile`` and ``docker-compose.yml`` into *output_path*,
        creating the directory first if it does not exist.

        Args:
            output_path: Target directory for both files.
        """
        # An empty path means "current directory"; os.makedirs("") would raise.
        if output_path:
            os.makedirs(output_path, exist_ok=True)

        dockerfile_path = os.path.join(output_path, "Dockerfile")
        with open(dockerfile_path, 'w', encoding="utf-8") as f:
            f.write(self._render_dockerfile())

        # Also create docker-compose.yml
        compose_path = os.path.join(output_path, "docker-compose.yml")
        with open(compose_path, 'w', encoding="utf-8") as f:
            f.write(self._render_compose())

        logger.info("✅ Docker configuration created in %s", output_path)
        logger.info("Build with: docker-compose build")
        logger.info("Run with: docker-compose up -d")
def main(argv: Optional[List[str]] = None):
    """Main deployment function.

    Parses the command line and runs the requested actions in order:
    ONNX export, Docker config generation, then (optionally) the HTTP server.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]`` when None.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Deploy Helion-V1")
    parser.add_argument("--model", default="DeepXR/Helion-V1", help="Model name or path")
    parser.add_argument("--backend", choices=["vllm", "tgi", "fastapi"], default="vllm")
    parser.add_argument("--server", action="store_true", help="Start HTTP server")
    parser.add_argument("--export-onnx", action="store_true", help="Export to ONNX")
    parser.add_argument("--create-docker", action="store_true", help="Create Docker config")
    parser.add_argument("--tensor-parallel", type=int, default=1)
    # None cannot be typed on a command line, so it is not listed as a choice;
    # omitting the flag still yields None through the default.
    parser.add_argument("--quantization", choices=["awq", "gptq"], default=None)
    parser.add_argument("--host", default="0.0.0.0", help="Interface the HTTP server binds to")
    parser.add_argument("--port", type=int, default=8000, help="Port the HTTP server listens on")
    args = parser.parse_args(argv)

    # Report "nothing to do" only when no action flag was given at all, so that
    # --export-onnx / --create-docker on their own do not log a misleading hint.
    if not (args.export_onnx or args.create_docker or args.server):
        logger.info("No action specified. Use --help for options.")
        return

    # Create config
    config = DeploymentConfig(
        model_name=args.model,
        tensor_parallel_size=args.tensor_parallel,
        quantization=args.quantization
    )
    deployment = HelionDeployment(config)

    if args.export_onnx:
        deployment.export_onnx()

    if args.create_docker:
        deployment.create_docker_config()

    if not args.server:
        return

    if args.backend != "vllm":
        logger.error(f"Backend {args.backend} not implemented yet")
        return

    # Both steps log their own failure reason before returning a falsy value.
    if not deployment.deploy_vllm():
        return
    app = deployment.create_fastapi_server()
    if not app:
        return

    import uvicorn
    logger.info(f"🚀 Starting Helion-V1 server on http://{args.host}:{args.port}")
    uvicorn.run(app, host=args.host, port=args.port)
# Script entry point: run the CLI only when this file is executed directly,
# never as a side effect of importing it.
if __name__ == "__main__":
    main()