From 9f6e4c4ba52d20bd79f8d78e84138e2d873ef067 Mon Sep 17 00:00:00 2001
From: hailin
Date: Tue, 9 Sep 2025 18:31:15 +0800
Subject: [PATCH] Sync ZeRO-3 LoRA batch settings between config and launch script

---
 ds_config_zero3_lora.json | 5 +++--
 train_mm_zero3_lora.sh    | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ds_config_zero3_lora.json b/ds_config_zero3_lora.json
index d1ee123..2757e03 100644
--- a/ds_config_zero3_lora.json
+++ b/ds_config_zero3_lora.json
@@ -1,4 +1,7 @@
 {
+    "train_micro_batch_size_per_gpu": 1,
+    "gradient_accumulation_steps": 1,
+
     "bf16": { "enabled": true },
     "fp16": { "enabled": false },
 
@@ -6,11 +9,9 @@
         "stage": 3,
         "overlap_comm": true,
         "contiguous_gradients": true,
 
-        "reduce_bucket_size": 500000000,
         "stage3_prefetch_bucket_size": 200000000,
         "stage3_param_persistence_threshold": 1000000,
 
-        "offload_optimizer": { "device": "none" },
         "offload_param": { "device": "none" }
     },
diff --git a/train_mm_zero3_lora.sh b/train_mm_zero3_lora.sh
index b0643ee..aad88e6 100755
--- a/train_mm_zero3_lora.sh
+++ b/train_mm_zero3_lora.sh
@@ -6,7 +6,7 @@ FORCE_COLOR=1 deepspeed --hostfile hostfile \
     --output_dir /home/test/checkpoints/q3-32b-lora \
     --seq_len 4096 \
     --bf16 \
-    --gradient_accumulation_steps 64 \
+    --gradient_accumulation_steps 1 \
     --per_device_train_batch_size 1 \
     --learning_rate 1e-4 \
     --warmup_ratio 0.03 \
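
For reference, the region of ds_config_zero3_lora.json touched by these hunks should look roughly like this after the patch applies. This is a reconstruction from the hunks above, not the full file: the "zero_optimization" key itself sits in the unshown gap between the two hunks and is inferred (the stage3_* keys must live under it), and any keys outside these hunks are unchanged and omitted here, so the closing brace below stands in for the rest of the file.

    {
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 1,

        "bf16": { "enabled": true },
        "fp16": { "enabled": false },

        "zero_optimization": {
            "stage": 3,
            "overlap_comm": true,
            "contiguous_gradients": true,

            "stage3_prefetch_bucket_size": 200000000,
            "stage3_param_persistence_threshold": 1000000,

            "offload_param": { "device": "none" }
        }
    }

DeepSpeed validates that its batch settings are mutually consistent (train_batch_size = train_micro_batch_size_per_gpu x gradient_accumulation_steps x world size) and, when driven from a launcher, that they agree with the training arguments. With the script's --per_device_train_batch_size 1 and --gradient_accumulation_steps 1 now matching the config's two new keys, the effective global batch size is simply 1 x 1 x world size.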