diff --git a/mm-zero3.sh b/mm-zero3.sh
index ca051f5..a68ebc5 100755
--- a/mm-zero3.sh
+++ b/mm-zero3.sh
@@ -41,5 +41,6 @@ deepspeed --hostfile hostfile \
     --bf16 \
     --deepspeed /home/test/jd_train/ds_config_zero3.json \
     --report_to wandb \
+    --eval_steps 10 \
     --eval_data_glob "/home/test/datasets/my_corpus/test.jsonl"
diff --git a/train_sft_ds.py b/train_sft_ds.py
index 34777d5..fd1ece4 100644
--- a/train_sft_ds.py
+++ b/train_sft_ds.py
@@ -887,6 +887,8 @@ def parse_args():
                     help="for deepspeed/torchrun launcher; ignored by user code")
     ap.add_argument("--per_device_eval_batch_size", type=int, default=1)
     ap.add_argument("--deepspeed", type=str, default=None)
+    ap.add_argument("--eval_steps", type=int, default=10,
+                    help="Evaluate every N optimizer steps when eval_dataset is provided")
     return ap.parse_args()
@@ -1240,7 +1242,9 @@ def main():
         logging_dir=logging_dir,
         do_train=True,
         do_eval=(eval_dataset is not None),
-        eval_steps=max(50, args.save_steps // 5) if eval_dataset is not None else None,
+        # eval_steps=max(50, args.save_steps // 5) if eval_dataset is not None else None,
+        # Use the user-specified eval_steps; None when there is no eval set
+        eval_steps=(args.eval_steps if eval_dataset is not None else None),
         per_device_train_batch_size=args.per_device_train_batch_size,
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         learning_rate=args.learning_rate,
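
For context, the keyword names in the second hunk (do_train, do_eval, eval_steps, per_device_train_batch_size, ...) match Hugging Face TrainingArguments, so the sketch below shows how the new --eval_steps flag would flow into that object. It is illustrative only, not part of the patch: build_training_args is a hypothetical helper, and fields such as output_dir are assumed to exist on the parsed args. One point worth noting is that eval_steps is only honored when the evaluation strategy is "steps"; if the script leaves the strategy at its default, the Trainer will not evaluate regardless of this flag.

    # Sketch only: how --eval_steps would feed a Hugging Face TrainingArguments
    # object, assuming the script constructs one as the hunk above suggests.
    from transformers import TrainingArguments

    def build_training_args(args, eval_dataset, logging_dir):  # hypothetical helper
        return TrainingArguments(
            output_dir=args.output_dir,          # assumed to be defined in parse_args()
            logging_dir=logging_dir,
            do_train=True,
            do_eval=(eval_dataset is not None),
            # eval_steps only takes effect with the "steps" strategy; the kwarg is
            # `evaluation_strategy` on older transformers releases and `eval_strategy`
            # on newer ones, so adjust to the installed version.
            evaluation_strategy="steps" if eval_dataset is not None else "no",
            eval_steps=(args.eval_steps if eval_dataset is not None else None),
            per_device_train_batch_size=args.per_device_train_batch_size,
            per_device_eval_batch_size=args.per_device_eval_batch_size,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            learning_rate=args.learning_rate,
            bf16=True,
            deepspeed=args.deepspeed,
            report_to="wandb",
        )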