This commit is contained in:
hailin 2025-08-28 10:52:17 +00:00
parent 8cbc4b948a
commit 871b1df4c4
4 changed files with 54 additions and 0 deletions

17
mm-zero3.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
#
# mm-zero3.sh - multi-node SFT run of /home/test/Qwen3-1.7B via the DeepSpeed
# launcher.
#
# Starts /home/test/jd_train/train_sft_ds.py with `--num_nodes 6 --num_gpus 4`
# on the hosts listed in `hostfile`, using the DeepSpeed config
# /home/test/jd_train/ds_config_zero3.json.
#
# NOTE: `hostfile` is a relative path, so this script must be started from the
# directory that contains it.
#
# Usage: ./mm-zero3.sh [extra train_sft_ds.py arguments...]
#   Extra arguments are appended after the built-in ones; with no arguments
#   the command line is exactly the fixed one below.
set -euo pipefail

# Arguments handed to the training script (everything after the script path).
train_args=(
  --model_name_or_path /home/test/Qwen3-1.7B
  --data_glob "/home/test/datasets/my_corpus/train.jsonl"
  --output_dir /home/test/checkpoints/q3-1_7b-ds4
  --seq_len 512
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 1
  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02
  --max_steps 62
  --log_interval 1
  --bf16
  --deepspeed /home/test/jd_train/ds_config_zero3.json
  --report_to none
  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
)

# Launcher options come before the script path, training options after it.
deepspeed --hostfile hostfile \
  --num_nodes 6 --num_gpus 4 \
  /home/test/jd_train/train_sft_ds.py \
  "${train_args[@]}" "$@"

1
rm-all-checkpoints.sh Executable file
View File

@@ -0,0 +1 @@
#!/usr/bin/env bash
#
# rm-all-checkpoints.sh - remove everything matched by ~/checkpoints/* on the
# hosts tn01..tn06, in parallel, using pdsh over ssh.
#
# DESTRUCTIVE: runs `rm -rf` with no confirmation prompt.
set -euo pipefail

# The host range is quoted so it always reaches pdsh verbatim. Unquoted, the
# local shell sees tn[01-06] as a glob pattern: it can be expanded against
# files in the current directory, and it aborts the command under
# failglob/nullglob-style settings.
#
# The remote command stays single-quoted so that `~` and `*` are expanded by
# the shell on each remote host, not locally.
pdsh -R ssh -w 'tn[01-06]' 'rm -rf ~/checkpoints/*'

17
ss-zero3.sh Executable file
View File

@@ -0,0 +1,17 @@
#!/usr/bin/env bash
#
# ss-zero3.sh - single-node SFT run of /home/test/Qwen3-1.7B via torchrun with
# 4 processes on this machine (`--nproc_per_node 4`).
#
# Runs /home/test/jd_train/train_sft_ds.py with the DeepSpeed config
# /home/test/jd_train/ds_config_zero3.json.
#
# NOTE: writes to the same --output_dir as mm-zero3.sh
# (/home/test/checkpoints/q3-1_7b-ds4).
#
# Usage: ./ss-zero3.sh [extra train_sft_ds.py arguments...]
#   Extra arguments are appended after the built-in ones; with no arguments
#   the command line is exactly the fixed one below.
set -euo pipefail

# Exported so that every process spawned by torchrun inherits them.
export TOKENIZERS_PARALLELISM=false
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"
export NCCL_DEBUG=INFO

# Arguments handed to the training script (everything after the script path).
train_args=(
  --model_name_or_path /home/test/Qwen3-1.7B
  --data_glob "/home/test/datasets/my_corpus/train.jsonl"
  --output_dir /home/test/checkpoints/q3-1_7b-ds4
  --seq_len 512
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 1
  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02
  --max_steps 375 --log_interval 1
  --bf16
  --deepspeed /home/test/jd_train/ds_config_zero3.json
  --report_to none
  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
)

torchrun --nproc_per_node 4 /home/test/jd_train/train_sft_ds.py \
  "${train_args[@]}" "$@"

19
ss.sh Executable file
View File

@@ -0,0 +1,19 @@
#!/usr/bin/env bash
#
# ss.sh - single-process SFT run of /home/test/Qwen3-1.7B via torchrun
# (`--nproc_per_node 1`), restricted to GPU 0 through CUDA_VISIBLE_DEVICES.
#
# Runs /home/test/jd_train/train_sft_ds.py with the DeepSpeed config
# /home/test/jd_train/ds_config_zero3.json and writes to
# /home/test/checkpoints/smoke-q3-1_7b-ds.
#
# Usage: ./ss.sh [extra train_sft_ds.py arguments...]
#   Extra arguments are appended after the built-in ones; with no arguments
#   the command line is exactly the fixed one below.
set -euo pipefail

# Exported so that the process spawned by torchrun inherits them.
export TOKENIZERS_PARALLELISM=false
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"
export NCCL_DEBUG=INFO
# Only device 0 is visible to the run, matching --nproc_per_node 1 below.
export CUDA_VISIBLE_DEVICES=0

# Arguments handed to the training script (everything after the script path).
train_args=(
  --model_name_or_path /home/test/Qwen3-1.7B
  --data_glob "/home/test/datasets/my_corpus/train.jsonl"
  --output_dir /home/test/checkpoints/smoke-q3-1_7b-ds
  --seq_len 512
  --per_device_train_batch_size 1
  --gradient_accumulation_steps 1
  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02
  --max_steps 1500 --log_interval 1
  --bf16
  --report_to none
  --deepspeed /home/test/jd_train/ds_config_zero3.json
  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
)

torchrun --nproc_per_node 1 /home/test/jd_train/train_sft_ds.py \
  "${train_args[@]}" "$@"