From 9f6e4c4ba52d20bd79f8d78e84138e2d873ef067 Mon Sep 17 00:00:00 2001
From: hailin
Date: Tue, 9 Sep 2025 18:31:15 +0800
Subject: [PATCH] Sync ZeRO-3 LoRA batch settings between config and launch script

---
 ds_config_zero3_lora.json | 5 +++--
 train_mm_zero3_lora.sh    | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/ds_config_zero3_lora.json b/ds_config_zero3_lora.json
index d1ee123..2757e03 100644
--- a/ds_config_zero3_lora.json
+++ b/ds_config_zero3_lora.json
@@ -1,4 +1,7 @@
 {
+    "train_micro_batch_size_per_gpu": 1,
+    "gradient_accumulation_steps": 1,
+
     "bf16": { "enabled": true },
     "fp16": { "enabled": false },
 
@@ -6,11 +9,9 @@
         "stage": 3,
         "overlap_comm": true,
         "contiguous_gradients": true,
 
-        "reduce_bucket_size": 500000000,
         "stage3_prefetch_bucket_size": 200000000,
         "stage3_param_persistence_threshold": 1000000,
 
-        "offload_optimizer": { "device": "none" },
         "offload_param": { "device": "none" }
     },
diff --git a/train_mm_zero3_lora.sh b/train_mm_zero3_lora.sh
index b0643ee..aad88e6 100755
--- a/train_mm_zero3_lora.sh
+++ b/train_mm_zero3_lora.sh
@@ -6,7 +6,7 @@ FORCE_COLOR=1 deepspeed --hostfile hostfile \
     --output_dir /home/test/checkpoints/q3-32b-lora \
     --seq_len 4096 \
     --bf16 \
-    --gradient_accumulation_steps 64 \
+    --gradient_accumulation_steps 1 \
     --per_device_train_batch_size 1 \
     --learning_rate 1e-4 \
     --warmup_ratio 0.03 \
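
For reference, the region of ds_config_zero3_lora.json touched by these hunks should look roughly like this after the patch applies. This is a reconstruction from the hunks above, not the full file: the "zero_optimization" key itself sits in the unshown gap between the two hunks and is inferred (the stage3_* keys must live under it), and any keys outside these hunks are unchanged and omitted here, so the closing brace below stands in for the rest of the file.

    {
        "train_micro_batch_size_per_gpu": 1,
        "gradient_accumulation_steps": 1,

        "bf16": { "enabled": true },
        "fp16": { "enabled": false },

        "zero_optimization": {
            "stage": 3,
            "overlap_comm": true,
            "contiguous_gradients": true,

            "stage3_prefetch_bucket_size": 200000000,
            "stage3_param_persistence_threshold": 1000000,

            "offload_param": { "device": "none" }
        }
    }

DeepSpeed validates that its batch settings are mutually consistent (train_batch_size = train_micro_batch_size_per_gpu x gradient_accumulation_steps x world size) and, when driven from a launcher, that they agree with the training arguments. With the script's --per_device_train_batch_size 1 and --gradient_accumulation_steps 1 now matching the config's two new keys, the effective global batch size is simply 1 x 1 x world size.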