Instructions to use Tiiny/SmallThinker-3B-Preview with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Tiiny/SmallThinker-3B-Preview with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="Tiiny/SmallThinker-3B-Preview") messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("Tiiny/SmallThinker-3B-Preview") model = AutoModelForCausalLM.from_pretrained("Tiiny/SmallThinker-3B-Preview") messages = [ {"role": "user", "content": "Who are you?"}, ] inputs = tokenizer.apply_chat_template( messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", ).to(model.device) outputs = model.generate(**inputs, max_new_tokens=40) print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:])) - Inference
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Tiiny/SmallThinker-3B-Preview with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "Tiiny/SmallThinker-3B-Preview" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Tiiny/SmallThinker-3B-Preview", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/Tiiny/SmallThinker-3B-Preview
- SGLang
How to use Tiiny/SmallThinker-3B-Preview with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "Tiiny/SmallThinker-3B-Preview" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Tiiny/SmallThinker-3B-Preview", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "Tiiny/SmallThinker-3B-Preview" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Tiiny/SmallThinker-3B-Preview", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use Tiiny/SmallThinker-3B-Preview with Docker Model Runner:
docker model run hf.co/Tiiny/SmallThinker-3B-Preview
Eval script
Hi! Do you have the evaluation scripts available? My numbers are a bit different locally:
AIME: 8/90 = 8.88%, AIME2024: 4/30=13.33%
AMC: 32/83 = 38.55%
Is there anything finicky with the prompt? I'm running with vLLM.
This is my evaluation script.
import re
import time
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
import re
def extract_final_answer(text):
    """Extract the numeric final answer from a model response.

    The patterns are tried in order, from the most specific layout
    (a bold "Final Answer" heading followed by a boxed LaTeX value) to the
    loosest ("final answer: <number>"). The first pattern that matches wins,
    and within a pattern the first occurrence in ``text`` is used.

    Args:
        text: The decoded model output to search.

    Returns:
        The captured number as a string (digits, "." and "-" only), with any
        trailing sentence period removed, or ``None`` if no pattern matched.
    """
    patterns = (
        # **Final Answer** followed by \[ \boxed{<number>} \]
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*\\boxed\s*{\s*([0-9.-]+)\s*}\s*\\\]",
        # **Final Answer** followed by \[ <number> \]
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*([0-9.-]+)\s*\\\]",
        # Final Answer: <number>  (optionally wrapped in single asterisks)
        r"\*?Final\s+Answer\*?\s*[:=]\s*([0-9.-]+)",
        # The final answer is <number>
        r"[Tt]he\s+[Ff]inal\s+[Aa]nswer\s+[Ii]s\s*[:=]?\s*([0-9.-]+)",
        # final answer: <number>
        r"[Ff]inal\s+[Aa]nswer\s*[:=]\s*([0-9.-]+)",
    )

    # Echo the text being parsed so it ends up in the evaluation log.
    print(text)

    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL)
        if match is None:
            continue
        # The capture class accepts ".", so a sentence-ending period
        # ("... is 3.14.") is swallowed too; drop it so the value parses.
        answer = match.group(1).strip().rstrip(".")
        if answer:
            return answer

    # Report the miss once, after every pattern has been tried.
    print("Not found match")
    return None
def evaluate_answer(predicted, actual):
    """Decide whether a predicted answer matches the ground truth.

    Both values are first compared numerically with an absolute tolerance
    of 1e-5. If either one cannot be converted to ``float``, the comparison
    falls back to exact equality of their whitespace-stripped string forms.

    Args:
        predicted: The extracted answer, or ``None`` if extraction failed.
        actual: The ground-truth answer; may be a string or a number.

    Returns:
        ``True`` if the answers match, ``False`` otherwise (including when
        either value is ``None``).
    """
    if predicted is None or actual is None:
        return False
    try:
        pred_val = float(predicted)
        actual_val = float(actual)
    except (TypeError, ValueError):
        # Not numeric: compare as text. str() keeps this safe when the
        # ground truth was loaded as a number rather than a string.
        return str(predicted).strip() == str(actual).strip()
    return abs(pred_val - actual_val) < 1e-5
def run_inference(model, tokenizer, question, max_new_tokens=16384):
    """Generate the model's reply to a single question.

    The question is wrapped in a chat conversation with a fixed system
    prompt, rendered with the tokenizer's chat template, and passed to
    ``model.generate``.

    Args:
        model: A causal LM exposing ``generate`` and ``device``.
        tokenizer: The tokenizer matching ``model``.
        question: The user question text.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion only (special tokens skipped). The prompt is
        excluded so that answer extraction cannot match text from the system
        prompt or from the question itself.
    """
    prompt = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question},
    ]
    input_text = tokenizer.apply_chat_template(
        prompt, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    generated_ids = model.generate(
        inputs.input_ids,
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs.attention_mask,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )
    # generate() returns prompt + completion; keep only the new tokens.
    prompt_length = inputs.input_ids.shape[-1]
    completion_ids = generated_ids[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True)
def main(
    model_path="saves/qwen2-01/full/sft/checkpoint-44000",
    data_file="/home/syx/Qwen2.5-Math/evaluation/data/aime24/test.jsonl",
    output_path="aime24.jsonl",
):
    """Run the evaluation end to end and write the results to disk.

    Loads the model and tokenizer, runs every dataset item through
    ``run_inference``, extracts and scores the answer, prints the accuracy,
    and dumps all per-item records to ``output_path``.

    Args:
        model_path: Checkpoint directory or model id to evaluate; replace
            the default with your own model path.
        data_file: JSON/JSONL file whose records have a "question" field and
            optionally an "answer" field; replace the default with your own
            dataset.
        output_path: Destination file. Note that the results are written as
            a single indented JSON document, despite the default ".jsonl"
            suffix.
    """
    import json

    # 1. Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="cuda",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # 2. Load the dataset (the records are read from the "train" split)
    dataset = load_dataset("json", data_files=data_file)
    eval_dataset = dataset["train"]

    # 3. Inference and validation
    results = {
        "correct": 0,
        "total": 0,
        "predictions": [],
    }
    for item in tqdm(eval_dataset):
        question = item["question"]
        ground_truth = item.get("answer")

        # Inference
        start_time = time.time()
        response = run_inference(model, tokenizer, question)
        inference_time = time.time() - start_time

        # Extract
        predicted_answer = extract_final_answer(response)

        # Evaluate; items without a ground truth are recorded but not scored,
        # so they do not affect the accuracy.
        is_correct = None
        if ground_truth is not None:
            is_correct = evaluate_answer(predicted_answer, ground_truth)
            print(is_correct)
            results["correct"] += int(is_correct)
            results["total"] += 1

        # Save
        results["predictions"].append({
            "question": question,
            "response": response,
            "extracted_answer": predicted_answer,
            "ground_truth": ground_truth,
            "is_correct": is_correct,
            "inference_time": inference_time,
        })

    # 4. Output
    if results["total"] > 0:
        accuracy = results["correct"] / results["total"] * 100
        print(f"\nAccuracy: {accuracy:.2f}%")
        print(f"Correct: {results['correct']}/{results['total']}")

    # Save to disk. ensure_ascii=False emits raw non-ASCII characters, so the
    # file must be opened as UTF-8 rather than with the locale's encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()
This is my evaluation script. I conducted testing based on transformers, using configurations consistent with those in the generation_config.json of the released model.
You can refer to the result folder to get the evaluation log.
Thanks! It might be the answer parsing.
For the full fine-tune, was a system prompt included in the messages? Should I be evaluating with "You are a helpful assistant"?
For the full finetune no system prompt was included, but I did use a system prompt during evaluation.