Text Generation
MLX
Safetensors
English
Korean
solar_open
upstage
solar
Mixture of Experts
100b
llm
conversational
custom_code
4-bit precision
Instructions to use mlx-community/Solar-Open-100B-4bit with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- MLX
How to use mlx-community/Solar-Open-100B-4bit with MLX:
# Make sure mlx-lm is installed
# pip install --upgrade mlx-lm

# Generate text with mlx-lm
from mlx_lm import load, generate

model, tokenizer = load("mlx-community/Solar-Open-100B-4bit")

prompt = "Write a story about Einstein"
messages = [{"role": "user", "content": prompt}]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True
)

text = generate(model, tokenizer, prompt=prompt, verbose=True)
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- LM Studio
- Pi new
How to use mlx-community/Solar-Open-100B-4bit with Pi:
Start the MLX server
# Install MLX LM: uv tool install mlx-lm # Start a local OpenAI-compatible server: mlx_lm.server --model "mlx-community/Solar-Open-100B-4bit"
Configure the model in Pi
# Install Pi:
npm install -g @mariozechner/pi-coding-agent
# Add to ~/.pi/agent/models.json:
{
  "providers": {
    "mlx-lm": {
      "baseUrl": "http://localhost:8080/v1",
      "api": "openai-completions",
      "apiKey": "none",
      "models": [
        { "id": "mlx-community/Solar-Open-100B-4bit" }
      ]
    }
  }
}
Run Pi
# Start Pi in your project directory: pi
- Hermes Agent new
How to use mlx-community/Solar-Open-100B-4bit with Hermes Agent:
Start the MLX server
# Install MLX LM: uv tool install mlx-lm # Start a local OpenAI-compatible server: mlx_lm.server --model "mlx-community/Solar-Open-100B-4bit"
Configure Hermes
# Install Hermes: curl -fsSL https://hermes-agent.nousresearch.com/install.sh | bash hermes setup # Point Hermes at the local server: hermes config set model.provider custom hermes config set model.base_url http://127.0.0.1:8080/v1 hermes config set model.default mlx-community/Solar-Open-100B-4bit
Run Hermes
hermes
- MLX LM
How to use mlx-community/Solar-Open-100B-4bit with MLX LM:
Generate or start a chat session
# Install MLX LM uv tool install mlx-lm # Interactive chat REPL mlx_lm.chat --model "mlx-community/Solar-Open-100B-4bit"
Run an OpenAI-compatible server
# Install MLX LM
uv tool install mlx-lm
# Start the server (listens on port 8080 by default)
mlx_lm.server --model "mlx-community/Solar-Open-100B-4bit"
# Calling the OpenAI-compatible server with curl
curl -X POST "http://localhost:8080/v1/chat/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "mlx-community/Solar-Open-100B-4bit",
    "messages": [
      {"role": "user", "content": "Hello"}
    ]
  }'
| # coding=utf-8 | |
| # Copyright 2025 Upstage AI. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| from typing import TYPE_CHECKING | |
| import torch | |
| from vllm.sampling_params import SamplingParams | |
| from vllm.v1.sample.logits_processor import ( | |
| AdapterLogitsProcessor, | |
| RequestLogitsProcessor, | |
| ) | |
| if TYPE_CHECKING: | |
| from vllm.config import VllmConfig | |
# Token IDs are hardcoded for the Solar tokenizer.

# ID of the <|tool_call:end|> token.
TOOL_CALL_END_TOKEN_ID = 32

# ID of the <|calls|> token.
CALLS_TOKEN_ID = 25
class SingleToolCallEnforcer:
    """Request-level logits processor that enforces a single tool call.

    Once the most recently generated token is <|tool_call:end|>, every
    logit except the one for <|calls|> (which is a stop token) is set to
    -inf, so <|calls|> is the only token that can be sampled next. This
    prevents a second, parallel tool call from being started.
    """

    def __init__(
        self,
        tool_call_end_token_id: int,
        calls_token_id: int,
    ):
        """Store the token IDs that drive the enforcement.

        Args:
            tool_call_end_token_id: ID of the <|tool_call:end|> token.
            calls_token_id: ID of the <|calls|> token to force afterwards.
        """
        self._tool_call_end_token_id = tool_call_end_token_id
        self._calls_token_id = calls_token_id

    def __call__(
        self,
        output_token_ids: list[int],
        logits: torch.Tensor,
    ) -> torch.Tensor:
        """Return the logits to sample the next token from.

        Args:
            output_token_ids: Tokens generated so far for this request.
            logits: Logits for the next token, indexed by token ID.

        Returns:
            ``logits`` unchanged (the same object) unless the last generated
            token is <|tool_call:end|>; in that case a new tensor that is
            -inf everywhere except at the <|calls|> position.
        """
        # Nothing to enforce until a tool call has just been closed.
        if (
            not output_token_ids
            or output_token_ids[-1] != self._tool_call_end_token_id
        ):
            return logits

        forced = torch.full_like(logits, -float("inf"))
        # Keep the original logit when it is finite. If it is already
        # -inf/NaN/+inf, copying it verbatim would leave no usable finite
        # entry and softmax over the row would produce NaN, so fall back
        # to 0.0 to keep <|calls|> selectable.
        forced[self._calls_token_id] = torch.nan_to_num(
            logits[self._calls_token_id],
            nan=0.0,
            posinf=0.0,
            neginf=0.0,
        )
        return forced
class ParallelToolCallLogitsProcessor(AdapterLogitsProcessor):
    """Logits processor that enforces a single tool call on request.

    For each request whose sampling params explicitly disable
    ``parallel_tool_calls``, a ``SingleToolCallEnforcer`` is attached so
    that after <|tool_call:end|> is generated the next token is forced to
    be <|calls|> (a stop token), preventing multiple tool calls.
    """

    def __init__(
        self,
        vllm_config: "VllmConfig",
        device: torch.device,
        is_pin_memory: bool,
    ):
        """Forward all construction arguments to ``AdapterLogitsProcessor``."""
        super().__init__(vllm_config, device, is_pin_memory)

    def is_argmax_invariant(self) -> bool:
        """Return False: forcing a specific token can change the argmax."""
        return False

    def new_req_logits_processor(
        self,
        params: SamplingParams,
    ) -> RequestLogitsProcessor | None:
        """Return a request-level logits processor if parallel_tool_calls=False.

        The flag is read from ``params.parallel_tool_calls`` when that
        attribute exists. If it is missing or ``None``, the
        ``"parallel_tool_calls"`` key of ``params.extra_args`` is consulted
        instead.
        NOTE(review): SOURCE does not show which vLLM build defines
        ``SamplingParams.parallel_tool_calls``; the attribute is read
        defensively so a build without the field does not raise
        ``AttributeError`` for every request. Confirm against the deployed
        vLLM version.

        Args:
            params: Request sampling params.

        Returns:
            SingleToolCallEnforcer if parallel_tool_calls is explicitly
            ``False``, otherwise None.
        """
        flag = getattr(params, "parallel_tool_calls", None)
        if flag is None:
            # Fall back to the free-form per-request arguments, if any.
            extra_args = getattr(params, "extra_args", None) or {}
            flag = extra_args.get("parallel_tool_calls")

        # Only apply when parallel_tool_calls is explicitly disabled; a
        # missing or None flag leaves generation unconstrained.
        if flag is not False:
            return None

        return SingleToolCallEnforcer(
            tool_call_end_token_id=TOOL_CALL_END_TOKEN_ID,
            calls_token_id=CALLS_TOKEN_ID,
        )