From 47c5cbe68c2fe0713882c60ea53bf5211748fd75 Mon Sep 17 00:00:00 2001 From: hailin Date: Tue, 9 Sep 2025 23:02:04 +0800 Subject: [PATCH] Shrink ZeRO-3 bucket sizes; use ds_config_zero3_lora.json in train script --- ds_config_zero3_lora.json | 13 +++++++------ train_mm_zero3_lora.sh | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/ds_config_zero3_lora.json b/ds_config_zero3_lora.json index 940531e..da93499 100644 --- a/ds_config_zero3_lora.json +++ b/ds_config_zero3_lora.json @@ -1,21 +1,22 @@ { "train_micro_batch_size_per_gpu": 1, "gradient_accumulation_steps": 4, - "bf16": { "enabled": true }, "fp16": { "enabled": false }, - "zero_optimization": { "stage": 3, "overlap_comm": true, "contiguous_gradients": true, - "reduce_bucket_size": 500000000, - "stage3_prefetch_bucket_size": 200000000, + "allgather_partitions": true, + "reduce_scatter": true, + "round_robin_gradients": true, + "reduce_bucket_size": 150000000, + "stage3_prefetch_bucket_size": 100000000, "stage3_param_persistence_threshold": 1000000, "offload_optimizer": { "device": "none" }, - "offload_param": { "device": "none" } + "offload_param": { "device": "none" } }, - + "stage3_gather_16bit_weights_on_model_save": false, "gradient_clipping": 1.0, "wall_clock_breakdown": false } diff --git a/train_mm_zero3_lora.sh b/train_mm_zero3_lora.sh index 4775743..6a14c3a 100755 --- a/train_mm_zero3_lora.sh +++ b/train_mm_zero3_lora.sh @@ -16,5 +16,5 @@ FORCE_COLOR=1 deepspeed --hostfile hostfile \ --log_interval 10 \ --eval_steps 50 \ --gradient_checkpointing \ - --deepspeed /home/test/jd_train/ds_config_zero3_lora_gpu.json \ + --deepspeed /home/test/jd_train/ds_config_zero3_lora.json \ --report_to wandb --wandb_project ds-qwen3-lora