{
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 64,
  "steps_per_print": 0,
  "gradient_clipping": 1.0,

  "fp16": { "enabled": false },
  "bf16": { "enabled": true },

  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 50000000,
    "stage3_prefetch_bucket_size": 50000000,
    "stage3_param_persistence_threshold": 100000,

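    /* Important: gather the full 16-bit weights when saving the final model so
       the checkpoint can be loaded directly with from_pretrained.
       NOTE: comments are not valid in strict JSON; this file must be read by a
       comment-tolerant loader (e.g. Hjson). */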
    "stage3_gather_16bit_weights_on_model_save": true
  },

  "activation_checkpointing": {
    "partition_activations": true,
    "contiguous_memory_optimization": true,
    "cpu_checkpointing": false,
    "number_checkpoints": 36,
    "profile": false,
    "synchronize_checkpoint_boundary": true
  },

  "wall_clock_breakdown": false
}