This commit is contained in:
hailin 2025-09-09 18:31:15 +08:00
parent 7f0b897d47
commit 9f6e4c4ba5
2 changed files with 4 additions and 3 deletions

View File

@@ -1,4 +1,7 @@
{
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps": 1,
"bf16": { "enabled": true },
"fp16": { "enabled": false },
@@ -6,11 +9,9 @@
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"reduce_bucket_size": 500000000,
"stage3_prefetch_bucket_size": 200000000,
"stage3_param_persistence_threshold": 1000000,
"offload_optimizer": { "device": "none" },
"offload_param": { "device": "none" }
},

View File

@@ -6,7 +6,7 @@ FORCE_COLOR=1 deepspeed --hostfile hostfile \
--output_dir /home/test/checkpoints/q3-32b-lora \
--seq_len 4096 \
--bf16 \
--gradient_accumulation_steps 64 \
--gradient_accumulation_steps 1 \
--per_device_train_batch_size 1 \
--learning_rate 1e-4 \
--warmup_ratio 0.03 \