This commit is contained in:
hailin 2025-08-31 13:25:44 +08:00
parent 77a228df5e
commit fcf21b7660
2 changed files with 5 additions and 3 deletions

View File

@@ -7,9 +7,9 @@
"overlap_comm": true,
"contiguous_gradients": true,
"reduce_bucket_size": 150000000,
"stage3_prefetch_bucket_size": 75000000,
"stage3_param_persistence_threshold": 1000000,
"reduce_bucket_size": 100000000,
"stage3_prefetch_bucket_size": 50000000,
"stage3_param_persistence_threshold": 0,
"offload_optimizer": { "device": "none" },
"offload_param": { "device": "none" },

View File

@@ -4,6 +4,8 @@ export TORCH_EXTENSIONS_DIR=/tmp/$USER/torch_ext
# Settings for PyTorch's CUDA caching allocator, exported so that the
# processes started by the launcher below inherit them.
#
# This variable used to be exported twice back-to-back; only the last
# assignment ever took effect, so the shadowed first export is removed and
# the effective value is kept unchanged.
# Superseded (never-effective) value, kept for reference:
#   max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.9
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:64"
deepspeed --hostfile hostfile \
--num_nodes 6 --num_gpus 4 \
/home/test/jd_train/train_sft_ds.py \