juno-labs/text-voice-activity-detection
Viewer • Updated • 77.4k • 4
How to use juno-labs/gemma-text-vad with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-generation pipeline backed by the gemma-text-vad checkpoint.
pipe = pipeline("text-generation", model="juno-labs/gemma-text-vad")

# Single-turn chat input: one user message.
messages = [{"role": "user", "content": "Who are you?"}]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and model are loaded from the same checkpoint.
checkpoint = "juno-labs/gemma-text-vad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# Single-turn chat input: one user message.
messages = [{"role": "user", "content": "Who are you?"}]

# Render the chat through the model's chat template straight to tensors,
# then move them to whichever device the model lives on.
encoded = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs = encoded.to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

How to use juno-labs/gemma-text-vad with vLLM:
# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "juno-labs/gemma-text-vad"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "juno-labs/gemma-text-vad",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'
# Use Docker:
docker model run hf.co/juno-labs/gemma-text-vad
How to use juno-labs/gemma-text-vad with SGLang:
# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
--model-path "juno-labs/gemma-text-vad" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "juno-labs/gemma-text-vad",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'
# Or start the SGLang server with Docker:
docker run --gpus all \
--shm-size 32g \
-p 30000:30000 \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HF_TOKEN=<secret>" \
--ipc=host \
lmsysorg/sglang:latest \
python3 -m sglang.launch_server \
--model-path "juno-labs/gemma-text-vad" \
--host 0.0.0.0 \
--port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
-H "Content-Type: application/json" \
--data '{
"model": "juno-labs/gemma-text-vad",
"messages": [
{
"role": "user",
"content": "What is the capital of France?"
}
]
}'

How to use juno-labs/gemma-text-vad with Docker Model Runner:
docker model run hf.co/juno-labs/gemma-text-vad
This is a text-based Voice Activity Detection model that determines whether a given speech fragment is complete enough for processing by a smart speaker assistant. Instead of relying on time-based pauses (300 ms – 1000 ms) to detect the end of voice input, a smart speaker can use this model to decide whether the voice input is complete.
Example:
Model prompting requirements:
System prompt: "You are a Voice Activity Detection system. Determine if the given speech fragment is complete enough for processing. Answer with only 'yes' if complete or 'no' if incomplete."
User prompt: "Is this sentence fragment complete for processing: '{fragment}'"

To use with pipeline from transformers:
from transformers import pipeline

# Text-generation pipeline backed by the VAD checkpoint.
pipe = pipeline("text-generation", model="juno-labs/gemma-text-vad")

# System prompt listed under "Model prompting requirements".
SYSTEM_PROMPT = (
    "You are a Voice Activity Detection system. "
    "Determine if the given speech fragment is complete enough for processing. Answer with only 'yes' if complete or 'no' if incomplete."
)
# Speech fragment (as text) to classify.
SENTENCE = "Hey Juno can you set the temperature to"

messages = [
    {'content': SYSTEM_PROMPT, 'role': 'system'},
    {'content': f"Is this sentence fragment complete for processing: '{SENTENCE}'", 'role': 'user'},
]

generated = pipe(messages)
# For chat input the pipeline returns the conversation with the model's reply
# appended, so take the last message instead of a hard-coded index that only
# works for exactly two input messages.
classification = generated[0]["generated_text"][-1]["content"]
print(f"Classification: {classification}")  # "yes" or "no"
To use with transformers:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "juno-labs/gemma-text-vad"

# Load model + tokenizer
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# System prompt listed under "Model prompting requirements".
SYSTEM_PROMPT = (
    "You are a Voice Activity Detection system. "
    "Determine if the given speech fragment is complete enough for processing. Answer with only 'yes' if complete or 'no' if incomplete."
)
# Speech fragment (as text) to classify.
SENTENCE = "Set the temperature to 68"

messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": f"Is this sentence fragment complete for processing: '{SENTENCE}'"},
]

# Render the chat to a prompt string that ends with the generation prompt.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# NOTE(review): if the chat template already emits the BOS token, tokenizing
# the rendered string again may add a second one; consider
# add_special_tokens=False here -- confirm against how the model was trained.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=1,  # only 1 token
        do_sample=False,  # greedy decoding
        pad_token_id=tokenizer.eos_token_id,
    )

# Skip the prompt tokens so only the newly generated token is decoded.
decoded = tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print(f"Classification: {decoded}")  # "yes" or "no"