{
  "model": {
    "vocab_size": 50257,
    "hidden_size": 2048,
    "num_hidden_layers": 24,
    "num_attention_heads": 32,
    "intermediate_size": 8192,
    "hidden_act": "silu",
    "max_position_embeddings": 2048,
    "initializer_range": 0.02,
    "layer_norm_eps": 1e-05,
    "use_cache": false,
    "rope_theta": 10000.0,
    "attention_dropout": 0.0,
    "hidden_dropout": 0.0
  },
  "training": {
    "batch_size_per_device": 4,
    "gradient_accumulation_steps": 8,
    "learning_rate": 0.0006,
    "min_learning_rate": 6e-05,
    "weight_decay": 0.1,
    "adam_beta1": 0.9,
    "adam_beta2": 0.95,
    "adam_epsilon": 1e-08,
    "max_grad_norm": 1.0,
    "warmup_steps": 2000,
    "num_training_steps": 100000,
    "logging_steps": 10,
    "save_steps": 1000,
    "eval_steps": 500,
    "save_total_limit": 5,
    "fp16": true,
    "gradient_checkpointing": true
  },
  "data": {
    "dataset_name": "tiiuae/falcon-refinedweb",
    "dataset_config": "default",
    "text_column": "content",
    "max_seq_length": 2048,
    "num_workers": 8,
    "preprocessing_num_workers": 16
  },
  "tokenizer": {
    "tokenizer_name": "gpt2",
    "add_special_tokens": true
  },
  "infrastructure": {
    "num_gpus": 4,
    "seed": 42,
    "output_dir": "./checkpoints",
    "logging_dir": "./logs",
    "resume_from_checkpoint": "./checkpoints/checkpoint-8000"
  },
  "distributed": {
    "backend": "nccl",
    "find_unused_parameters": false
  }
}