From 871b1df4c4243703c9f8e7d2d7ac6af10d401199 Mon Sep 17 00:00:00 2001
From: hailin
Date: Thu, 28 Aug 2025 10:52:17 +0000
Subject: [PATCH] Add SFT launch scripts and a checkpoint cleanup script

Add four helper scripts:

  mm-zero3.sh            multi-node launch through the deepspeed launcher
  ss-zero3.sh            single-node launch through torchrun, 4 processes
  ss.sh                  single-process smoke run through torchrun
  rm-all-checkpoints.sh  wipe ~/checkpoints on every node through pdsh

---
Notes (this section is not part of the commit message):
  Every script starts with an interpreter line and strict mode, because the
  files are created with mode 100755. The pdsh host list is quoted so the
  local shell cannot glob-expand its brackets. The optional `index` header
  lines are omitted because this patch was edited by hand; regenerate it
  with `git format-patch` if blob ids are needed (for example for `git am -3`).

 mm-zero3.sh           | 21 +++++++++++++++++++++
 rm-all-checkpoints.sh |  8 ++++++++
 ss-zero3.sh           | 21 +++++++++++++++++++++
 ss.sh                 | 22 ++++++++++++++++++++++
 4 files changed, 72 insertions(+)
 create mode 100755 mm-zero3.sh
 create mode 100755 rm-all-checkpoints.sh
 create mode 100755 ss-zero3.sh
 create mode 100755 ss.sh

diff --git a/mm-zero3.sh b/mm-zero3.sh
new file mode 100755
--- /dev/null
+++ b/mm-zero3.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Multi-node SFT launch: deepspeed launcher, 6 nodes x 4 GPUs per node.
+# Run from the directory that contains `hostfile` (the path is relative).
+set -euo pipefail
+
+deepspeed --hostfile hostfile \
+  --num_nodes 6 --num_gpus 4 \
+  /home/test/jd_train/train_sft_ds.py \
+  --model_name_or_path /home/test/Qwen3-1.7B \
+  --data_glob "/home/test/datasets/my_corpus/train.jsonl" \
+  --output_dir /home/test/checkpoints/q3-1_7b-ds4 \
+  --seq_len 512 \
+  --per_device_train_batch_size 1 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02 \
+  --max_steps 62 \
+  --log_interval 1 \
+  --bf16 \
+  --deepspeed /home/test/jd_train/ds_config_zero3.json \
+  --report_to none \
+  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
diff --git a/rm-all-checkpoints.sh b/rm-all-checkpoints.sh
new file mode 100755
--- /dev/null
+++ b/rm-all-checkpoints.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+# Remove everything under ~/checkpoints on nodes tn01-tn06 via pdsh over ssh.
+# DESTRUCTIVE: there is no confirmation prompt.
+set -euo pipefail
+
+# The host list is quoted so the local shell cannot glob-expand the brackets;
+# the remote command is single-quoted so ~ and * expand on each node.
+pdsh -R ssh -w 'tn[01-06]' 'rm -rf ~/checkpoints/*'
diff --git a/ss-zero3.sh b/ss-zero3.sh
new file mode 100755
--- /dev/null
+++ b/ss-zero3.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Single-node SFT launch: torchrun with 4 processes, DeepSpeed config ds_config_zero3.json.
+set -euo pipefail
+
+export TOKENIZERS_PARALLELISM=false
+export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"
+export NCCL_DEBUG=INFO
+
+torchrun --nproc_per_node 4 /home/test/jd_train/train_sft_ds.py \
+  --model_name_or_path /home/test/Qwen3-1.7B \
+  --data_glob "/home/test/datasets/my_corpus/train.jsonl" \
+  --output_dir /home/test/checkpoints/q3-1_7b-ds4 \
+  --seq_len 512 \
+  --per_device_train_batch_size 1 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02 \
+  --max_steps 375 --log_interval 1 \
+  --bf16 \
+  --deepspeed /home/test/jd_train/ds_config_zero3.json \
+  --report_to none \
+  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
diff --git a/ss.sh b/ss.sh
new file mode 100755
--- /dev/null
+++ b/ss.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Single-process smoke run: torchrun with 1 process, restricted to GPU 0.
+set -euo pipefail
+
+export TOKENIZERS_PARALLELISM=false
+export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"
+export NCCL_DEBUG=INFO
+export CUDA_VISIBLE_DEVICES=0
+
+torchrun --nproc_per_node 1 /home/test/jd_train/train_sft_ds.py \
+  --model_name_or_path /home/test/Qwen3-1.7B \
+  --data_glob "/home/test/datasets/my_corpus/train.jsonl" \
+  --output_dir /home/test/checkpoints/smoke-q3-1_7b-ds \
+  --seq_len 512 \
+  --per_device_train_batch_size 1 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02 \
+  --max_steps 1500 --log_interval 1 \
+  --bf16 \
+  --report_to none \
+  --deepspeed /home/test/jd_train/ds_config_zero3.json \
+  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"