Text Generation
Transformers
Safetensors
stripedhyena
long context
deep signal processing
hybrid
biology
genomics
custom_code
Instructions to use togethercomputer/evo-1-131k-base with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use togethercomputer/evo-1-131k-base with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="togethercomputer/evo-1-131k-base", trust_remote_code=True)# Load model directly from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("togethercomputer/evo-1-131k-base", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use togethercomputer/evo-1-131k-base with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "togethercomputer/evo-1-131k-base" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "togethercomputer/evo-1-131k-base", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker
docker model run hf.co/togethercomputer/evo-1-131k-base
- SGLang
How to use togethercomputer/evo-1-131k-base with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "togethercomputer/evo-1-131k-base" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "togethercomputer/evo-1-131k-base", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "togethercomputer/evo-1-131k-base" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "togethercomputer/evo-1-131k-base", "prompt": "Once upon a time,", "max_tokens": 512, "temperature": 0.5 }' - Docker Model Runner
How to use togethercomputer/evo-1-131k-base with Docker Model Runner:
docker model run hf.co/togethercomputer/evo-1-131k-base
| # Copyright (c) Together | |
| # This software is distributed under the terms of the Apache License, Version 2.0 | |
| # Author: Michael Poli | |
| import gc | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| try: | |
| import conv1d_cpp | |
| except: | |
| pass | |
| from .utils import column_split | |
| IIR_PREFILL_MODES = [ | |
| "recurrence", | |
| "modal-fft", | |
| "hybrid-modal-recurrence", | |
| "modal-scan", | |
| "canonical-fft", | |
| "iir-fir-caching", | |
| ] | |
| def canonicalize_modal_system(poles, residues): | |
| """Canonicalize a modal system. | |
| Args: | |
| poles (Tensor): The poles of the system. | |
| residues (Tensor): The residues of the system. | |
| Returns: | |
| Tuple[Tensor, Tensor]: The canonicalized poles and residues. | |
| """ | |
| raise NotImplementedError | |
| def list_tensors(idx): | |
| for obj in gc.get_objects(): | |
| try: | |
| if torch.is_tensor(obj) and isinstance(obj, torch.Tensor): | |
| # dump to log | |
| print(type(obj), obj.size()) | |
| el = obj[0] | |
| with open(f"tensors_{idx}.txt", "a") as f: | |
| f.write(f"{type(obj)} {obj.size()} {el}\n") | |
| except Exception as e: | |
| pass | |
| class HyenaInferenceEngine: | |
| def __init__( | |
| self, | |
| fir_fn=None, | |
| iir_prefill_style="modal-fft", | |
| layer_idx=None, | |
| ) -> None: | |
| self.fir_fn = fir_fn | |
| assert iir_prefill_style in IIR_PREFILL_MODES, f"iir_prefill_style must be one of {IIR_PREFILL_MODES}" | |
| self.iir_prefill_style = iir_prefill_style | |
| self.layer_idx = layer_idx | |
| self.low_mem_mode = False | |
| def parallel_fir( | |
| self, | |
| fir_fn, | |
| u, | |
| weight, | |
| bias, | |
| L, | |
| fir_length=3, | |
| inference_params=None, | |
| prefill_mode=None, | |
| padding_mask=None, | |
| ): | |
| """Compute the output state of the long convolutional filter.""" | |
| # prepare input layout, dimensions and dispatch to fir kernel | |
| if fir_fn != torch.nn.functional.conv1d: | |
| z_pre = fir_fn(u)[:, :L] # B, L, D | |
| z_pre = z_pre.permute(0, 2, 1) | |
| else: | |
| u = u.permute(0, 2, 1) # B, D, L | |
| z_pre = fir_fn( | |
| u, | |
| weight, | |
| bias=None, # don't pass it here, add manually instead! source of small error | |
| stride=1, | |
| padding=fir_length - 1, | |
| groups=u.shape[1], | |
| )[..., :L] | |
| # add manually instead! source of small error | |
| z_pre = z_pre + bias[None, :, None] | |
| # handle padding post fir, the only place with biases | |
| if type(padding_mask) == torch.Tensor: | |
| z_pre = z_pre * padding_mask[:, None] | |
| if inference_params is not None: | |
| # handle seqlen last and dim last cases for `u` | |
| if fir_fn != torch.nn.functional.conv1d: | |
| fir_state = u[:, -fir_length + 1 :].permute(0, 2, 1) | |
| else: | |
| fir_state = u[..., -fir_length + 1 :] | |
| else: | |
| fir_state = None | |
| return z_pre, fir_state | |
| def parallel_iir( | |
| self, | |
| z_pre, | |
| h, | |
| D, | |
| L, | |
| poles, | |
| residues, | |
| t, | |
| dims, | |
| layer_idx, | |
| inference_params=None, | |
| prefill_style="fft", | |
| fftconv_fn=None, | |
| padding_mask=None, | |
| use_flashfft=False, | |
| column_split_hyena=False, | |
| long_fir_threshold=None, | |
| ): | |
| """Compute the output state of the short convolutional filter.""" | |
| fft_size = 2 * L | |
| hidden_size, num_attention_heads, hidden_size_per_attention_head, _, _ = dims | |
| # Compatibility with training infra that column splits the projections | |
| if column_split_hyena: | |
| z = z_pre.reshape( | |
| z_pre.shape[0], | |
| num_attention_heads, | |
| 3 * hidden_size_per_attention_head, | |
| z_pre.shape[2], | |
| ) | |
| x2, x1, v = ( | |
| z[:, :, :hidden_size_per_attention_head], | |
| z[ | |
| :, | |
| :, | |
| hidden_size_per_attention_head : 2 * hidden_size_per_attention_head, | |
| ], | |
| z[:, :, 2 * hidden_size_per_attention_head :], | |
| ) | |
| x2, x1, v = ( | |
| x2.reshape(x2.shape[0], -1, x2.shape[-1]), | |
| x1.reshape(x1.shape[0], -1, x1.shape[-1]), | |
| v.reshape(v.shape[0], -1, v.shape[-1]), | |
| ) | |
| else: | |
| x2, x1, v = z_pre.split([hidden_size, hidden_size, hidden_size], dim=1) | |
| x1v = x1 * v | |
| if inference_params is not None and prefill_style == "recurrence": | |
| y = self.prefill_via_direct_recurrence( | |
| inference_params=inference_params, | |
| x1v=x1v, | |
| L=L, | |
| poles=poles, | |
| residues=residues, | |
| ) | |
| else: | |
| if use_flashfft and (L % 2) == 0: # only works with even L | |
| y = fftconv_fn( | |
| x1v.to(dtype=torch.bfloat16).contiguous(), | |
| h.to(dtype=torch.float32), | |
| ) | |
| X_s = None | |
| elif long_fir_threshold is None: | |
| H = torch.fft.rfft(h.to(dtype=torch.float32), n=fft_size) / fft_size | |
| X_s = torch.fft.fft(x1v.to(dtype=torch.float32), n=fft_size) | |
| X = X_s[..., : H.shape[-1]] | |
| if len(z_pre.shape) > 3: | |
| H = H.unsqueeze(1) | |
| y = torch.fft.irfft(X * H, n=fft_size, norm="forward")[..., :L] | |
| else: | |
| assert h.shape[0] == 1, "batch size must be 1 for long_fir_threshold" | |
| h = h[0][:, None] # rearrange to d, 1, l for depthwise conv1d | |
| h = h[..., :long_fir_threshold] | |
| y = F.conv1d( | |
| x1v, | |
| h.to(dtype=x1v.dtype), | |
| stride=1, | |
| groups=x1v.shape[1], | |
| padding=h.shape[-1] - 1, | |
| )[..., :L] | |
| y = y.to(dtype=x1v.dtype) | |
| y = (y + x1v * D.unsqueeze(-1)) * x2 | |
| if inference_params is not None: | |
| if prefill_style == "fft": | |
| self.prefill_via_modal_fft( | |
| inference_params=inference_params, | |
| x1v=x1v, | |
| X_s=X_s, | |
| L=L, | |
| t=t, | |
| poles=poles, | |
| dims=dims, | |
| layer_idx=layer_idx, | |
| use_flashfft=use_flashfft, | |
| fftconv_fn=fftconv_fn, | |
| ) | |
| elif prefill_style == "recurrence": | |
| # recurrent prefill is done before | |
| pass | |
| else: | |
| raise NotImplementedError | |
| if self.low_mem_mode: | |
| # TODO: smarter gc | |
| del z_pre, x2, x1, v, x1v, h, poles, residues | |
| torch.cuda.empty_cache() | |
| return y.permute(0, 2, 1) | |
| def step_fir(self, u, fir_state, weight, bias=None): | |
| """Step the FIR filter. | |
| Note: | |
| `fir_state` contains the last `short_filter_length - 1` elements of `u`: `u_(L-2), u_{L-1), ...` | |
| We assume dimensions of `short_filter_weight` to be `[d, 1, short_filter_len]` (SISO / multi SISO layout). | |
| """ | |
| h0, h = weight[..., 0, -1], weight[..., 0, :-1] | |
| h0, h = h0[None], h[None] | |
| y = h0 * u + torch.sum(fir_state * h, dim=-1) + bias | |
| # update | |
| fir_state = torch.roll(fir_state, -1, dims=2) | |
| fir_state[..., -1] = u | |
| return y, fir_state | |
| def step_iir(self, x2, x1, v, D, residues, poles, iir_state, iir_groups=1): | |
| x1v = x1 * v | |
| residues, poles = ( | |
| torch.view_as_complex(residues.to(torch.float32)), | |
| torch.view_as_complex(poles.to(torch.float32)), | |
| ) | |
| # squeeze the dummy seqlen dimension | |
| # D, state_dim, 1 -> 1, D, state_dim | |
| residues, poles = residues[..., 0][None], poles[..., 0][None] | |
| iir_state = poles * iir_state + x1v[..., None] | |
| res_state = torch.sum(residues * iir_state, dim=-1).real | |
| if iir_groups > 1: | |
| raise NotImplementedError | |
| y = x2 * (res_state + D * x1v) | |
| return y, iir_state | |
| def prefill_via_fir_caching(self, u, inference_params, L, *args, **kwargs): | |
| """Turns the IIR filter into a FIR and uses a cache for decoding.""" | |
| raise NotImplementedError(":)") | |
| def prefill_via_direct_recurrence( | |
| self, inference_params, x1v, L, residues, poles, *args, **kwargs | |
| ) -> torch.Tensor: | |
| """ | |
| Compute the IIR state via explicit SSM recurrence (modal form) | |
| This is the most memory efficient prefilling method for Hyena filters. | |
| Note: | |
| dtypes: [state: float32, poles: float32, x1v: bfloat16, output: bfloat16] | |
| """ | |
| state_dim = poles.shape[1] | |
| x1v_ = x1v[..., None, None] # b, d, l, sdim, reim | |
| x1v_ = x1v_.repeat(1, 1, 1, state_dim, 2) # b, d, l, sdim, reim | |
| x1v_[..., 1] = 0 | |
| state = 0 * x1v_[:, :, 0] | |
| output = 0 * x1v_[:, :, :, 0, 0] # b, d, l | |
| # suppress dummy seqlen dimension | |
| poles = poles[:, :, 0][None] | |
| residues = residues[:, :, 0][None].repeat(x1v_.shape[0], 1, 1, 1) # b, d, sdim, reim | |
| # state: b, d, sdim, reim | |
| # poles: 1, d, sdim, reim | |
| # x1v_: b, d, l, sdim, reim | |
| for i in range(L): | |
| state[..., 0] = poles[..., 0] * state[..., 0] - poles[..., 1] * state[..., 1] + x1v_[:, :, i, :, 0] | |
| state[..., 1] = poles[..., 0] * state[..., 1] + poles[..., 1] * state[..., 0] + x1v_[:, :, i, :, 1] | |
| output[:, :, i] = torch.sum(residues * state, dim=-2)[..., 0] # .real | |
| inference_params.state_dict[self.layer_idx] = torch.view_as_complex(state.to(dtype=torch.float32)) | |
| return output | |
| def prefill_via_hybrid_recurrence(self, inference_params, u, log_poles, x1v_f_a, L, *args, **kwargs): | |
| """ | |
| Compute the IIR state via hybrid recurrence-convolution over blocks | |
| """ | |
| raise NotImplementedError(":)") | |
| def prefill_via_scan(self, u, inference_params=None, *args, **kwargs): | |
| raise NotImplementedError | |
| def prefill_via_canonical_fft(self, u, inference_params=None, *args, **kwargs): | |
| """ | |
| Compute the IIR state via a single FFT with the denominator of the SSM in companion form. | |
| This is the most memory efficient "parallelized" prefilling method for Hyena. | |
| From: https://arxiv.org/abs/2310.18780 | |
| """ | |
| raise NotImplementedError(":)") | |
| def prefill_via_modal_fft( | |
| self, | |
| inference_params, | |
| x1v, | |
| L, | |
| poles, | |
| t, | |
| dims, | |
| layer_idx, | |
| X_s=None, | |
| use_flashfft=False, | |
| fftconv_fn=None, | |
| state_dtype=torch.complex64, | |
| *args, | |
| **kwargs, | |
| ): | |
| """ | |
| Compute the IIR state via a single FFT, using the poles of the SSM in modal form. | |
| """ | |
| # When the model has a long convolution derived from a SSM in modal form and prefill_style is "fft", | |
| # we split the filter into poles and residues and reuse FFT computation on the input. | |
| # This optimization is currently not supported when using flashfftconv. | |
| hidden_size, _, _, state_size, hyena_filter_groups = dims | |
| if use_flashfft: | |
| # using real states | |
| poles = poles.squeeze().reshape(poles.shape[0], -1)[..., None] | |
| state_s = poles**t | |
| if hyena_filter_groups > 1: | |
| raise NotImplementedError | |
| x1v = x1v[:, :, None].repeat(1, 1, 2 * state_size, 1) | |
| x1v = x1v.reshape(x1v.shape[0], -1, x1v.shape[-1]) | |
| state_s = state_s[None] | |
| state = fftconv_fn( | |
| x1v.contiguous(), | |
| state_s.to(dtype=torch.float32), | |
| ) | |
| state = state[..., L - 1].reshape(x1v.shape[0], hidden_size, state_size, 2) | |
| state = torch.view_as_complex(state.contiguous().to(dtype=torch.float32)) | |
| inference_params.state_dict[self.layer_idx] = state | |
| else: | |
| assert X_s is not None | |
| bs = x1v.shape[0] | |
| fft_size = 2 * L | |
| poles = torch.view_as_complex(poles.to(torch.float32)) | |
| state_s = poles**t | |
| state_S = torch.fft.fft(state_s, n=fft_size).repeat(bs, 1, 1, 1) # B, D, state_dim, 2 * L | |
| if hyena_filter_groups > 1: | |
| state_S = state_S.repeat_interleave(hidden_size // hyena_filter_groups, 1) | |
| state = torch.fft.ifft(X_s[..., None, :] * state_S, n=fft_size) | |
| inference_params.state_dict[layer_idx] = state[..., L - 1].to(dtype=state_dtype) | |
| def _compute_state(self, log_poles, u, t, L, *args, **kwargs): | |
| """ | |
| Compute the IIR state given an input `u` and log_poles of the modal system. | |
| """ | |
| bs = u.shape[0] | |
| fft_size = 2 * L | |
| U = torch.fft.rfft(u.to(torch.float32), n=fft_size) | |
| fft_size = 2 * L | |
| x = (log_poles * t).exp() | |
| # [batch, hidden_size, state_dim, 2 * seqlen] | |
| X = torch.fft.fft(x, n=fft_size).repeat(bs, 1, 1, 1) | |
| state = torch.fft.ifft(U[..., None, :] * X, n=fft_size)[..., :L] | |
| return state | |