3.2 TB
5,727 files
Updated about 6 hours ago
NameSize
.eval_results
.gitattributes2.5 kB
xet
.gitignore22 Bytes
xet
2026-01-16T02-31-35-233Z_8072a61b-67e6-4618-8f35-5c5616aea2be.jsonl2.34 kB
xet
2026-01-16T02-37-34-075Z_4293a326-81ca-4327-b450-85275e1ca645.jsonl1.11 MB
xet
2026-01-16T02-44-55-123Z_99e99531-f376-4820-92f4-3c88afca3af9.jsonl712 kB
xet
2026-01-16T02-54-21-340Z_917258fc-d6b9-48f6-a846-41c328b35952.jsonl1.72 kB
xet
2026-01-16T02-58-05-814Z_14c806f6-4fde-4121-9a73-a8a167199723.jsonl5.08 kB
xet
2026-01-16T03-18-40-694Z_89afd3da-1fa3-45f9-87ad-a023f92372ee.jsonl8.05 kB
xet
2026-01-16T03-32-51-416Z_cf56c275-9716-42a7-b79e-c3225fe7f6d2.jsonl369 kB
xet
2026-01-16T03-48-59-923Z_49b070e9-d713-43aa-a3f0-7364a0ed6512.jsonl14.5 kB
xet
2026-01-16T10-50-40-927Z_13d61130-4ac2-41a7-8bf4-cdc1479cc5df.jsonl338 kB
xet
2026-01-16T11-12-35-796Z_105b45a9-be31-4997-95c3-c95ab814daf8.jsonl303 kB
xet
2026-01-16T11-27-41-663Z_7e5bb6ee-e3ca-4da8-86e5-079051bd15cf.jsonl109 kB
xet
2026-01-16T11-44-14-847Z_eedda2fa-e607-4779-b551-cfe751c7330f.jsonl273 kB
xet
2026-01-16T11-47-15-096Z_d93876c8-a8b4-4fe4-87fa-0a1a62a5555d.jsonl8.02 kB
xet
2026-01-16T19-15-10-809Z_6759de8e-a889-440b-b1e0-1a3f935fc278.jsonl146 kB
xet
2026-01-16T19-35-54-247Z_90c2c1e4-c1b7-43e0-9b47-3129d845adab.jsonl807 kB
xet
2026-01-16T20-16-36-972Z_a81208d7-8442-4d76-b258-10c068310f6b.jsonl90.8 kB
xet
2026-01-16T20-31-49-504Z_a86d0690-2d5b-4aa2-bad1-30ce4d39f641.jsonl244 kB
xet
2026-01-16T20-55-34-252Z_e18dee76-e4fc-4225-93aa-fef27bb337f8.jsonl195 kB
xet
2026-01-16T21-18-39-318Z_4801d454-021d-4fc1-9cc0-fc3cf8598ee2.jsonl306 kB
xet
2026-01-16T21-53-12-476Z_1375d9e7-7f20-48f5-8fc8-fae0ae4f14bc.jsonl273 kB
xet
2026-01-16T21-57-50-280Z_fcb17499-fc42-4271-8505-b67d25b41b98.jsonl547 kB
xet
2026-01-16T22-22-41-061Z_997112cd-f6dd-4793-971b-2f4874465759.jsonl180 kB
xet
2026-01-16T23-01-06-683Z_7ff3b620-79b9-409b-8f1a-b53d7ccc4244.jsonl175 kB
xet
2026-01-16T23-10-40-461Z_ce4a39ac-761e-4e80-9351-548256907ead.jsonl93.3 kB
xet
2026-01-16T23-16-39-577Z_5fa9f7c7-fdb5-47b1-8a0f-5807e8a5e294.jsonl206 kB
xet
2026-01-16T23-26-15-892Z_79258f4c-347d-446e-bd6b-d98a48c5697d.jsonl224 kB
xet
2026-01-16T23-40-31-481Z_8ff6046f-bf83-40e0-ada5-53bcf3c526ab.jsonl5.55 kB
xet
2026-01-17T09-35-40-998Z_9ca183c6-a1f8-4dd6-a093-4b5ecf19345b.jsonl2.14 kB
xet
2026-01-17T10-00-58-318Z_fab6b9b8-0c6a-4e12-b9b9-14a75041bc53.jsonl164 kB
xet
2026-01-17T10-24-33-379Z_00f02fa6-b9ff-4027-8877-30442cee47d1.jsonl931 kB
xet
2026-01-17T10-57-44-566Z_8ccfe986-6efc-423c-9482-8e3b0b546f90.jsonl2.41 kB
xet
2026-01-17T10-58-02-353Z_fe9c43ce-e8ae-4ad6-88e9-6d636d72612e.jsonl37.7 kB
xet
2026-01-17T12-23-49-439Z_0be79071-cc59-4ff2-96c6-51a1d7238c35.jsonl280 kB
xet
2026-01-17T19-14-18-000Z_0a8e6e29-9685-4e24-8f98-7999d28060b6.jsonl328 kB
xet
2026-01-17T19-51-25-958Z_db328990-395c-44bd-8317-d0aa41c34538.jsonl280 kB
xet
2026-01-17T20-01-24-791Z_90081a50-9706-4c54-96da-5afe2c3906f8.jsonl1.09 MB
xet
2026-01-17T20-45-04-077Z_99455d99-6135-4f32-9f03-bb15c079fbfa.jsonl172 kB
xet
2026-01-17T20-53-31-151Z_258012c1-7429-4d68-adea-dfd54105338e.jsonl7.54 kB
xet
2026-01-18T10-49-51-429Z_6673f4c8-6be0-47c5-9cf7-d3e74dbc39d2.jsonl2.55 MB
xet
2026-01-18T11-13-14-021Z_472cf2fa-ab5b-49be-9c1a-42c0c5978980.jsonl35.4 kB
xet
2026-01-18T16-16-07-069Z_b13d4ec5-7518-4486-9f00-e0533eeec6f2.jsonl471 kB
xet
2026-01-18T16-23-45-506Z_2cb902dc-e3d8-411c-807d-356b47c4b8c1.jsonl107 kB
xet
2026-01-18T16-38-18-109Z_9d8191fb-0bd5-451b-91e1-4ab60838e883.jsonl196 kB
xet
2026-01-18T18-13-16-670Z_1cc2db72-5054-4e19-9c5b-203da3e69261.jsonl205 kB
xet
2026-01-18T18-19-34-124Z_484c5a43-37f5-4a77-8ea5-d5ceb0df3a7a.jsonl989 kB
xet
2026-01-18T19-16-14-853Z_d3634f1b-1a6f-4493-a1f4-c1b83c98252f.jsonl207 kB
xet
2026-01-18T19-22-53-304Z_ff2bc754-b374-4c96-b62a-d8680b418832.jsonl6.22 kB
xet
2026-01-18T19-26-05-852Z_09517659-75bd-41e4-83dd-3cce63021752.jsonl24.8 kB
xet
2026-01-18T23-15-20-921Z_9c2cca56-9042-4f9a-bf32-6f60bfbcce53.jsonl400 kB
xet
2026-01-18T23-19-54-131Z_be960e8e-60c6-494e-8a46-d2806b5032bc.jsonl201 kB
xet
2026-01-19T12-38-59-280Z_2e618260-6230-4e7f-8190-e94d965f1bbe.jsonl2.06 kB
xet
2026-01-19T12-41-33-774Z_16648ee0-28fa-40ef-8fb3-d2e6a19d76ce.jsonl4.56 kB
xet
2026-01-19T12-43-04-788Z_0af1e524-99c7-4b33-aee8-de7e7aef313a.jsonl11.6 kB
xet
2026-01-19T12-43-52-780Z_65bcd2e4-8575-4852-b5f5-8304c0cc9227.jsonl12.3 kB
xet
2026-01-19T12-46-02-613Z_db30d656-dc22-487f-b1b6-4c74775893c2.jsonl268 kB
xet
2026-01-19T13-19-48-302Z_5ffb409e-2158-45a6-bf5c-59f9bbdc54f4.jsonl614 kB
xet
2026-01-19T13-22-10-573Z_87c036cb-5afc-478a-8e88-770f2afca842.jsonl523 kB
xet
2026-01-19T13-27-11-032Z_323a5dfc-366f-4f69-ab63-a46b6ca5c3a6.jsonl923 kB
xet
2026-01-19T15-01-52-822Z_732a795a-8170-42f1-b718-4a1e2e5800fb.jsonl237 kB
xet
2026-01-19T15-05-48-893Z_4058175a-2afa-49fb-9b1a-860abbc8debc.jsonl125 kB
xet
2026-01-19T15-10-53-158Z_e600a390-da4d-4c9f-868e-9c0394f5869c.jsonl177 kB
xet
2026-01-19T15-42-48-058Z_6668e388-42ed-4257-bc20-0525c1cdd09c.jsonl89 kB
xet
2026-01-19T21-26-05-775Z_c3f74ad9-f358-40e5-bd43-7ffeec60f1e3.jsonl207 kB
xet
2026-01-20T00-33-40-568Z_bf06cedc-545d-45fc-86cb-1d82e4639f0a.jsonl1.09 MB
xet
2026-01-20T13-48-52-215Z_f812c572-b123-49f0-b478-5ba486ef39f0.jsonl3.54 kB
xet
2026-01-20T14-11-35-307Z_249e9b18-b23b-4c27-a431-65a9972213b9.jsonl145 kB
xet
2026-01-20T14-23-30-958Z_7d90420c-e1d2-4d39-b111-6f6b30aaf318.jsonl11 kB
xet
2026-01-20T14-24-27-695Z_3b343cc1-c0c2-4eef-af2b-c6895736343d.jsonl18.6 kB
xet
2026-01-20T14-51-00-767Z_a55ec578-8049-4481-a999-05a32f01c8d5.jsonl301 kB
xet
2026-01-20T23-14-49-565Z_d12a6005-1a87-41f6-84a2-d9d3e4a16dd5.jsonl178 kB
xet
2026-01-20T23-25-37-924Z_bdc14b4d-6efb-49e4-81ec-214ada5baec3.jsonl159 kB
xet
2026-01-20T23-39-55-405Z_f14d9472-2470-460e-a3e3-f3b11810630a.jsonl126 kB
xet
2026-01-21T12-42-13-273Z_945a84e1-497c-4f80-ac9d-0b79362ad723.jsonl95.3 kB
xet
2026-01-21T22-25-59-585Z_ec7037ba-c439-4af4-abdd-b0c60b2416c2.jsonl309 kB
xet
2026-01-21T22-34-07-476Z_920e719e-b793-41d6-88aa-b5679580045f.jsonl27.2 kB
xet
2026-01-21T22-38-31-250Z_e86a388c-5413-4a0c-a1f8-baf7f823f5de.jsonl663 kB
xet
2026-01-21T23-18-35-115Z_c9acbd2c-9237-450d-9f2c-82d7fcf3dbec.jsonl10.6 kB
xet
2026-01-21T23-22-25-183Z_5d06e00c-75d1-4d94-83ad-b983e12ee0d8.jsonl7.08 kB
xet
2026-01-22T00-04-14-484Z_3698201b-d2bd-467d-a7cf-532dc14ec2c3.jsonl582 kB
xet
2026-01-22T00-36-13-233Z_c4df8d74-00e6-431f-aa21-27c351d30d65.jsonl66.3 kB
xet
2026-01-22T00-40-19-757Z_26f1d6b3-b95d-440c-90f1-0b587471e7dc.jsonl34.4 kB
xet
2026-01-22T00-48-46-900Z_2bd0d7a1-321b-438e-85f5-73aa84a5bdba.jsonl121 kB
xet
2026-01-22T00-49-52-613Z_d67f06cd-752a-4f83-9eb5-23766e405d13.jsonl405 kB
xet
2026-01-22T00-53-49-148Z_5b4cb96e-191a-4fe5-9a6b-d714504a78c8.jsonl266 kB
xet
2026-01-22T02-01-33-494Z_44c9d3c0-b9a5-452c-96c5-08e89f04c308.jsonl248 kB
xet
2026-01-22T12-18-53-795Z_d25c0139-dbca-4232-a742-7638923537ac.jsonl462 kB
xet
2026-01-22T20-27-58-850Z_5e6c9973-0cf8-45d1-8113-1cda04125c00.jsonl139 kB
xet
2026-01-22T20-42-26-950Z_6265ff58-8596-4115-baa8-cc7a77f2922a.jsonl26.9 kB
xet
2026-01-22T20-46-18-293Z_9b87d3c5-5a8f-48a0-83a6-74de91dc9ced.jsonl229 kB
xet
2026-01-22T20-50-45-596Z_c7b03879-c00a-4d67-bca4-edf38ffc8cfb.jsonl429 kB
xet
2026-01-22T21-30-43-133Z_af9d4c37-5c80-4db5-b8e5-87e666801542.jsonl343 kB
xet
2026-01-22T21-45-26-714Z_cad40af4-0d89-4fc7-a830-a799d7c0e080.jsonl313 kB
xet
2026-01-22T22-03-47-508Z_25655705-a26f-4fe5-9937-f9530282a7ad.jsonl193 kB
xet
2026-01-22T22-14-34-084Z_d9b4cc16-bdd4-406e-b651-4187a6deb45b.jsonl620 kB
xet
2026-01-23T00-26-06-942Z_ece0704b-a507-4a7c-a54a-20b9bed31fc8.jsonl18.7 kB
xet
2026-01-23T16-35-35-399Z_64de8e0d-1082-4dfa-b345-5f32809d608e.jsonl245 kB
xet
2026-01-23T18-51-26-313Z_6ed2cf56-b258-4684-8d76-190cb7ab5cab.jsonl1.86 kB
xet
README.md

This dataset was generated using teich by TeichAI

Prepare these datasets for supervised fine-tuning in just a few lines of code — see the Conversion section below.

DeepSeek v4 Pro Agent Traces

This directory contains raw agent trace files generated by teich.

All assistant responses were generated by deepseek/deepseek-v4-pro.

JSONL files: 4006

Training-ready tools

A complete configured tools schema snapshot is embedded in the collapsed section at the bottom of this README. Use it when rendering loaded examples through your training chat template. load_traces applies this snapshot to each loaded example as the tools field.

Format

Each file is newline-delimited JSON representing a single captured agent session. The trace schema is designed for upload-first preservation so you can keep the original session history and convert it later for training.

Common top-level event groups:

  • session_meta
  • turn_context
  • event_msg
  • response_item
  • session
  • message
  • session_info
  • model_change
  • thinking_level_change

Example

{"type":"session","version":3,"id":"019e03aa-8ec4-768e-9833-edc257e9203a","timestamp":"2026-05-07T18:19:29.863Z","cwd":"/workspace"}
{"type":"message","id":"system-bca663f1","parentId":null,"timestamp":"2026-05-07T18:19:31.559Z","message":{"role":"developer","content":[{"type":"text","text":"You are an expert coding assistant operating inside pi, a coding agent harness. You help users by reading files, executing commands, editing code, and writing new files.\n\nAvailable tools:\n- read: Read file contents\n- bash: Execute bash commands (ls, grep, find, etc.)\n- edit: Make precise file edits with exact text replacement, including multiple disjoint edits in one call\n- write: Create or overwrite files\n\nIn addition to the tools above, you may have access to other custom tools depending on the project.\n\nGuidelines:\n- Use bash for file operations like ls, rg, find\n- Use read to examine files instead of cat or sed.\n- Use edit for precise changes (edits[].oldText must match exactly)\n- When changing multiple separate locations in one file, use one edit call with multiple entries in edits[] instead of multiple edit calls\n- Each edits[].oldText is matched against the original file, not after earlier edits are applied. Do not emit overlapping or nested edits. Merge nearby changes into one edit.\n- Keep edits[].oldText as small as possible while still being unique in the file. Do not pad with large unchanged regions.\n- Use write only for new files or complete rewrites.\n- Be concise in your responses\n- Show file paths clearly when working with files\n\nPi documentation (read only when the user asks about pi itself, its SDK, extensions, themes, skills, or TUI):\n- Main documentation: /usr/local/lib/node_modules/@mariozechner/pi-coding-agent/README.md\n- Additional docs: /usr/local/lib/node_modules/@mariozechner/pi-coding-agent/docs\n- Examples: /usr/local/lib/node_modules/@mariozechner/pi-coding-agent/examples (extensions, custom tools, SDK)\n- When asked about: extensions (docs/extensions.md, examples/extensions/), themes (docs/themes.md), skills (docs/skills.md), prompt templates (docs/prompt-templates.md), TUI components (docs/tui.md), keybindings (docs/keybindings.md), SDK integrations (docs/sdk.md), custom providers (docs/custom-provider.md), adding models (docs/models.md), pi packages (docs/packages.md)\n- When working on pi topics, read the docs and examples, and follow .md cross-references before implementing\n- Always read pi .md files completely and follow links to related docs (e.g., tui.md for TUI API details)\nCurrent date: 2026-05-07\nCurrent working directory: /workspace"}]}}
{"type":"model_change","id":"5d586b96","parentId":null,"timestamp":"2026-05-07T18:19:31.376Z","modelId":"deepseek/deepseek-v4-pro"}

Conversion

Recommended: train with Unsloth and TRL SFTTrainer

Use the trainer-first path: prepare_data renders trainer-friendly text rows with Teich supervision metadata, SFTTrainer tokenizes them, then mask_data applies Teich's multi-turn/tool-aware response-only labels:

import os

from unsloth import FastLanguageModel
import torch
from trl import SFTConfig, SFTTrainer

from teich import mask_data, prepare_data

# Run configuration for the example fine-tuning script.
# MAX_SEQ_LEN is reused for model loading, prepare_data and SFTConfig below.
MAX_SEQ_LEN = 32768
MODEL_NAME = 'unsloth/Qwen3.5-0.8B'
# Forwarded to prepare_data as chat_template_kwargs.
CHAT_TEMPLATE_KWARGS = {'enable_thinking': True}
PUSH_TO_HUB_REPO_ID = 'username/teich-sft-model'
# Normalise a missing or empty HF_TOKEN to None instead of ''. An empty string
# would be handed to the upload call as an explicit (invalid) credential, and
# the failure would only surface after training has finished. With None the
# upload receives "no explicit token".
# NOTE(review): Hub clients normally fall back to cached login credentials
# when token is None - confirm for the installed Unsloth version.
HF_TOKEN = os.environ.get('HF_TOKEN') or None

# Load the base model together with its tokenizer through Unsloth.
# No 4-bit/8-bit loading is requested and full fine-tuning is disabled;
# adapters are attached in the get_peft_model call that follows.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    load_in_4bit=False,
    load_in_8bit=False,
    full_finetuning=False,
)

# Wrap the loaded model with PEFT (LoRA) adapters; `model` is rebound to the
# wrapped model. lora_alpha (64) is twice the rank r (32).
# NOTE(review): 'out_proj' is targeted in addition to the attention and MLP
# projections - presumably required by this model architecture; confirm.
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'out_proj'],
    lora_alpha=64,
    lora_dropout=0,
    bias='none',
    use_gradient_checkpointing='unsloth',
    random_state=3407,  # same value as the SFTConfig seed below
    use_rslora=False,
    loftq_config=None,
)

# Build the training dataset from the Hub dataset ID. Per this README,
# prepare_data renders trainer-friendly text rows with Teich supervision
# metadata that mask_data consumes later.
# NOTE(review): the README text says SFTTrainer tokenizes the rows, yet
# tokenize=True is passed here - confirm the intended setting in the teich docs.
train_dataset = prepare_data(
    'armand0e/DeepSeek-v4-Pro-Agent',
    tokenizer,
    split='train',
    max_examples=500,  # caps this example run at 500 examples
    chat_template_kwargs=CHAT_TEMPLATE_KWARGS,
    max_length=MAX_SEQ_LEN,
    # presumably drops examples longer than max_length rather than truncating - verify
    drop_oversized_examples=True,
    tokenize=True,
    strict=True,
)

# Configure the TRL SFTTrainer on the prepared dataset. No eval dataset is used.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=None,
    args=SFTConfig(
        dataset_text_field='text',
        dataset_num_proc=1,
        max_length=MAX_SEQ_LEN,
        packing=False,  # the README requires packing=False for the mask_data flow
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # 1 x 4 = 4 sequences per optimizer step per device
        warmup_steps=5,
        num_train_epochs=1,
        learning_rate=2e-4,
        logging_steps=1,
        # NOTE(review): confirm the installed transformers/Unsloth versions accept
        # optim='muon' together with optim_target_modules='all-linear'.
        optim='muon',
        optim_target_modules='all-linear',
        weight_decay=0.001,
        lr_scheduler_type='linear',
        output_dir='outputs',
        seed=3407,
        report_to='none',
    ),
)

# Apply Teich's response-only labels. Per this README, mask_data applies the
# assistant/tool-call labels after trainer tokenization; its return value
# replaces `trainer`. All three train_on_* switches are enabled here.
trainer = mask_data(
    trainer,
    tokenizer=tokenizer,
    train_on_reasoning=True,
    train_on_final_answers=True,
    train_on_tools=True,
)

# Start training from scratch (no checkpoint resume).
trainer_stats = trainer.train(resume_from_checkpoint=False)

# Upload the merged 16-bit model and tokenizer to the Hub repo. This runs only
# after training completes, so make sure a valid HF_TOKEN is available before
# starting a long run.
model.push_to_hub_merged(PUSH_TO_HUB_REPO_ID, tokenizer, save_method='merged_16bit', token=HF_TOKEN)

mask_data keeps the normal trainer configuration flow while applying Teich's assistant/tool-call labels after trainer tokenization. Keep packing=False for this flow. If you want standard next-token training without Teich response-only labels, call prepare_data(..., teich_masking=False) and skip mask_data().

You can combine this dataset with other Teich chat-only or tool-call datasets by passing a list of dataset IDs, local paths, or loaded datasets.Dataset objects:

# Combine several Teich datasets by passing a list instead of a single ID.
# This snippet reuses `tokenizer`, MAX_SEQ_LEN and CHAT_TEMPLATE_KWARGS from the
# training script above; split, max_examples and strict are not passed here.
train_dataset = prepare_data(
    ['armand0e/DeepSeek-v4-Pro-Agent', 'username/other-teich-dataset'],
    tokenizer,
    max_length=MAX_SEQ_LEN,
    drop_oversized_examples=True,
    tokenize=True,
    chat_template_kwargs=CHAT_TEMPLATE_KWARGS,
)

Fallback: render loaded examples with your tokenizer

Use load_traces directly only when you want to own the remaining training pipeline yourself: chat-template rendering, filtering, tokenization, label masking, packing policy, and auditing. load_traces returns rows with normalized messages ready for tokenizer.apply_chat_template(...):

from teich import load_traces

# Load the traces and render the first example with the chat template.
# NOTE: `tokenizer` is not defined in this snippet - load one first (for
# example the tokenizer returned by FastLanguageModel.from_pretrained in the
# training script above).
dataset = load_traces('armand0e/DeepSeek-v4-Pro-Agent')
example = dataset[0]
rendered = tokenizer.apply_chat_template(
    example['messages'],
    tools=example.get('tools') or [],  # falls back to an empty list when no tools are attached
    tokenize=False,  # return the rendered text rather than token IDs
    add_generation_prompt=False,
    enable_thinking=True,
)

Tool schema snapshot

Training-ready tool schema snapshot
[
  {
    "type": "function",
    "function": {
      "name": "bash",
      "description": "Run shell commands in the workspace.",
      "parameters": {
        "type": "object",
        "properties": {
          "cmd": {
            "type": "string"
          },
          "cwd": {
            "type": "string"
          }
        },
        "required": [
          "cmd"
        ],
        "additionalProperties": true
      }
    }
  },
  {
    "type": "function",
    "function": {
      "name": "read_file",
      "description": "Read file contents from the workspace.",
      "parameters": {
        "type": "object",
        "properties": {
          "path": {
            "type": "string"
          }
        },
        "required": [
          "path"
        ],
        "additionalProperties": true
      }
    }
  },
  {
    "type": "function",
    "function": {
      "name": "write_file",
      "description": "Write file contents in the workspace.",
      "parameters": {
        "type": "object",
        "properties": {
          "path": {
            "type": "string"
          },
          "content": {
            "type": "string"
          }
        },
        "required": [
          "path",
          "content"
        ],
        "additionalProperties": true
      }
    }
  }
]
Total size
3.2 TB
Files
5,727
Last updated
Jun 4
Pre-warmed CDN
US EU US EU

Contributors