Instructions to use Tiiny/SmallThinker-3B-Preview with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use Tiiny/SmallThinker-3B-Preview with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a "text-generation" pipeline for the Tiiny/SmallThinker-3B-Preview checkpoint.
pipe = pipeline("text-generation", model="Tiiny/SmallThinker-3B-Preview")
# Chat-style input: a list of {"role", "content"} message dicts (a single user turn here).
messages = [
    {"role": "user", "content": "Who are you?"},
]
# The return value of this call is neither assigned nor printed here; assign it
# or wrap it in print(...) when running the snippet as a plain script.
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("Tiiny/SmallThinker-3B-Preview")
model = AutoModelForCausalLM.from_pretrained("Tiiny/SmallThinker-3B-Preview")
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Render the conversation through the tokenizer's chat template and tokenize it
# in one step; return_dict=True yields a mapping that is unpacked into
# generate() below, and the tensors are moved to the model's device.
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=40)
# Skip the first input_ids.shape[-1] positions of the output so that only the
# tokens following the prompt are decoded and printed.
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

Inference
Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use Tiiny/SmallThinker-3B-Preview with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
# NOTE(review): the server presumably keeps running in the foreground, so the
# curl request further down would be issued from a second shell — confirm.
vllm serve "Tiiny/SmallThinker-3B-Preview"
# Call the server using curl (OpenAI-compatible API):
# The request is a JSON POST to /v1/chat/completions on localhost port 8000 and
# names the same model that was passed to `vllm serve`.
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Tiiny/SmallThinker-3B-Preview",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker

docker model run hf.co/Tiiny/SmallThinker-3B-Preview

SGLang

How to use Tiiny/SmallThinker-3B-Preview with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
# --port 30000 here must match the port used in the curl URL below.
python3 -m sglang.launch_server \
    --model-path "Tiiny/SmallThinker-3B-Preview" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
# The request is a JSON POST to /v1/chat/completions on localhost port 30000.
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Tiiny/SmallThinker-3B-Preview",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Use Docker images

# Start the SGLang server inside the lmsysorg/sglang:latest image:
#   -p 30000:30000  publishes the server port used by the curl call below
#   -v ...          mounts ~/.cache/huggingface at /root/.cache/huggingface
#   --env HF_TOKEN  passes a Hugging Face token; <secret> is a placeholder to
#                   replace with your own value
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "Tiiny/SmallThinker-3B-Preview" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "Tiiny/SmallThinker-3B-Preview",
		"messages": [
			{
				"role": "user",
				"content": "What is the capital of France?"
			}
		]
	}'

Docker Model Runner
How to use Tiiny/SmallThinker-3B-Preview with Docker Model Runner:
```
docker model run hf.co/Tiiny/SmallThinker-3B-Preview
```

Eval script

by rawsh - opened Jan 16, 2025

Discussion

rawsh

Jan 16, 2025

Hi! Do you have the evaluation scripts available? My numbers are a bit different locally

AIME: 8/90 = 8.89%, AIME2024: 4/30 = 13.33%
AMC: 32/83 = 38.55%

Is there anything finicky about the prompt? I'm running with vLLM.

jeremyii

Tiiny AI org Jan 16, 2025

•

edited Jan 16, 2025

This is my evaluation script.

import re
import time
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

import re

def extract_final_answer(text):
    """Extract the numeric final answer from a model response.

    The response is searched against a prioritized list of "Final Answer"
    patterns; patterns are tried in order and the first one that yields a
    non-empty answer wins.

    Args:
        text: The response text to search.

    Returns:
        The captured answer as a string, with surrounding whitespace and any
        trailing sentence period removed, or ``None`` when no pattern
        produces an answer.
    """
    patterns = [
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*\\boxed\s*{\s*([0-9.-]+)\s*}\s*\\\]",
        r"\*+Final\s+Answer\*+\s*\n*\s*\\\[\s*([0-9.-]+)\s*\\\]",
        r"\*?Final\s+Answer\*?\s*[:=]\s*([0-9.-]+)",
        r"[Tt]he\s+[Ff]inal\s+[Aa]nswer\s+[Ii]s\s*[:=]?\s*([0-9.-]+)",
        r"[Ff]inal\s+[Aa]nswer\s*[:=]\s*([0-9.-]+)",
    ]

    # Echo the text being parsed to stdout.
    print(text)

    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            # The capture class admits ".", so "... is 1.5." is captured as
            # "1.5."; drop the sentence-ending period(s) so the value can be
            # parsed as a number later.
            answer = match.group(1).strip().rstrip(".")
            if answer:
                return answer
        print("Not found match")

    return None

def evaluate_answer(predicted, actual):
    """Return whether the extracted prediction matches the reference answer.

    Both values are compared numerically when they can be converted to
    ``float`` (absolute tolerance 1e-5); otherwise they are compared as
    whitespace-stripped text.

    Args:
        predicted: Extracted answer string, or ``None`` if nothing was
            extracted.
        actual: Reference answer; may be a string or a number.

    Returns:
        ``True`` if the answers match, ``False`` otherwise (including when
        ``predicted`` is ``None``).
    """
    if predicted is None:
        return False
    try:
        return abs(float(predicted) - float(actual)) < 1e-5
    except (TypeError, ValueError, OverflowError):
        # At least one side is not numeric: fall back to a text comparison.
        # str() keeps this safe when the reference is not a string (e.g. an
        # int loaded from JSON), which has no .strip() method.
        return str(predicted).strip() == str(actual).strip()

def run_inference(model, tokenizer, question, max_new_tokens=16384):
    """Generate a response to ``question`` and return it as decoded text.

    The question is wrapped in a two-message chat (a fixed system prompt plus
    the user turn), rendered to text with the tokenizer's chat template,
    tokenized, and passed to ``model.generate``.

    Args:
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        question: The user question to ask.
        max_new_tokens: Upper bound forwarded to ``model.generate``.

    Returns:
        The first decoded sequence, with special tokens skipped.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": question},
    ]

    # Render to plain text first, then tokenize that text separately.
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    encoded = tokenizer(rendered, return_tensors="pt")
    prompt_ids = encoded.input_ids.to(model.device)

    output_ids = model.generate(
        prompt_ids,
        max_new_tokens=max_new_tokens,
        use_cache=True,
    )

    # NOTE(review): the whole generate() output is decoded; it presumably
    # still contains the prompt tokens, so the returned text likely includes
    # the rendered prompt as well as the completion — confirm.
    decoded = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return decoded[0]

def main(
    model_path="saves/qwen2-01/full/sft/checkpoint-44000",
    data_file="/home/syx/Qwen2.5-Math/evaluation/data/aime24/test.jsonl",
    output_path="aime24.jsonl",
):
    """Run the evaluation loop and write the results to disk.

    For every dataset item the model is queried with the item's "question",
    the final answer is extracted from the response, and, when the item has
    an "answer", it is scored against that reference.

    Args:
        model_path: Checkpoint to load the model and tokenizer from. Replace
            the default with your own model path.
        data_file: JSON/JSONL file with the evaluation items. Replace the
            default with your own dataset.
        output_path: File the results are written to.
    """
    import json

    # 1. Load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16,
        device_map="cuda",
    )
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # 2. Load the dataset (the "json" loader exposes the file as the "train" split)
    dataset = load_dataset("json", data_files=data_file)
    eval_dataset = dataset["train"]

    # 3. Inference and validation
    results = {
        "correct": 0,
        "total": 0,
        "predictions": [],
    }

    for item in tqdm(eval_dataset):
        question = item["question"]
        # Items without a reference answer are still run, just not scored.
        ground_truth = item.get("answer")

        # Inference (timed per question)
        start_time = time.time()
        response = run_inference(model, tokenizer, question)
        inference_time = time.time() - start_time

        # Extract
        predicted_answer = extract_final_answer(response)

        # Evaluate
        is_correct = None
        if ground_truth is not None:
            is_correct = evaluate_answer(predicted_answer, ground_truth)
            print(is_correct)
            results["correct"] += int(is_correct)
            results["total"] += 1

        # Save
        results["predictions"].append({
            "question": question,
            "response": response,
            "extracted_answer": predicted_answer,
            "ground_truth": ground_truth,
            "is_correct": is_correct,
            "inference_time": inference_time,
        })

    # 4. Output
    if results["total"] > 0:
        accuracy = results["correct"] / results["total"] * 100
        print(f"\nAccuracy: {accuracy:.2f}%")
        print(f"Correct: {results['correct']}/{results['total']}")

    # Save to disk. The whole results dict is written as one indented JSON
    # document (not one record per line, despite the default ".jsonl" name).
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file is
    # opened as UTF-8 explicitly instead of relying on the locale default.
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)


if __name__ == "__main__":
    main()

The script above is the one I used for evaluation. I ran the tests with transformers, using the same settings as the generation_config.json of the released model.
You can refer to the result folder for the evaluation logs.

rawsh

Jan 16, 2025

Thanks! It might be the answer parsing.

rawsh

Jan 16, 2025

For the full finetune, did the training messages include a system prompt? Should I be evaluating with "You are a helpful assistant"?

jeremyii

Tiiny AI org Jan 16, 2025

For the full finetune no system prompt was included, but I did use a system prompt during evaluation.

Upload images, audio, and videos by dragging in the text input, pasting, or clicking here.

Tap or paste here to upload images

· Sign up or log in to comment