.

2025-08-28 10:52:17 +00:00 · 2025-08-28 10:52:17 +00:00 · 871b1df4c4
parent 8cbc4b948a
commit 871b1df4c4
4 changed files with 54 additions and 0 deletions
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@@ -0,0 +1,17 @@
+deepspeed --hostfile hostfile \
+  --num_nodes 6 --num_gpus 4 \
+  /home/test/jd_train/train_sft_ds.py \
+    --model_name_or_path /home/test/Qwen3-1.7B \
+    --data_glob "/home/test/datasets/my_corpus/train.jsonl" \
+    --output_dir /home/test/checkpoints/q3-1_7b-ds4 \
+    --seq_len 512 \
+    --per_device_train_batch_size 1 \
+    --gradient_accumulation_steps 1 \
+    --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02 \
+    --max_steps 62 \
+    --log_interval 1 \
+    --bf16 \
+    --deepspeed /home/test/jd_train/ds_config_zero3.json \
+    --report_to none \
+    --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
+
--- a/rm-all-checkpoints.sh
+++ b/rm-all-checkpoints.sh
@@ -0,0 +1 @@
+pdsh -R ssh -w tn[01-06] 'rm -rf ~/checkpoints/*'
--- a/ss-zero3.sh
+++ b/ss-zero3.sh
@@ -0,0 +1,17 @@
+export TOKENIZERS_PARALLELISM=false
+export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"
+export NCCL_DEBUG=INFO
+
+torchrun --nproc_per_node 4 /home/test/jd_train/train_sft_ds.py \
+  --model_name_or_path /home/test/Qwen3-1.7B \
+  --data_glob "/home/test/datasets/my_corpus/train.jsonl" \
+  --output_dir /home/test/checkpoints/q3-1_7b-ds4 \
+  --seq_len 512 \
+  --per_device_train_batch_size 1 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02 \
+  --max_steps 375 --log_interval 1 \
+  --bf16 \
+  --deepspeed /home/test/jd_train/ds_config_zero3.json \
+  --report_to none \
+  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
--- a/ss.sh
+++ b/ss.sh
@@ -0,0 +1,19 @@
+export TOKENIZERS_PARALLELISM=false
+export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"
+export NCCL_DEBUG=INFO
+export CUDA_VISIBLE_DEVICES=0
+
+torchrun --nproc_per_node 1 /home/test/jd_train/train_sft_ds.py \
+  --model_name_or_path /home/test/Qwen3-1.7B \
+  --data_glob "/home/test/datasets/my_corpus/train.jsonl" \
+  --output_dir /home/test/checkpoints/smoke-q3-1_7b-ds \
+  --seq_len 512 \
+  --per_device_train_batch_size 1 \
+  --gradient_accumulation_steps 1 \
+  --learning_rate 2e-5 --weight_decay 0.1 --warmup_ratio 0.02 \
+  --max_steps 1500 --log_interval 1 \
+  --bf16 \
+  --report_to none \
+  --deepspeed /home/test/jd_train/ds_config_zero3.json \
+  --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
+
				`@@ -0,0 +1 @@`
				`pdsh -R ssh -w tn[01-06] 'rm -rf ~/checkpoints/*'`