hailin 2025-09-01 19:31:44 +08:00
parent 20ee10d2aa
commit b8fb783871
3 changed files with 0 additions and 50 deletions


@@ -1,25 +0,0 @@
{
  "train_micro_batch_size_per_gpu": 1,
  "gradient_accumulation_steps": 1,
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "reduce_bucket_size": 80000000,
    "stage3_prefetch_bucket_size": 40000000,
    "stage3_param_persistence_threshold": 0,
    "offload_optimizer": { "device": "none" },
    "offload_param": { "device": "none" },
    "stage3_gather_16bit_weights_on_model_save": false
  },
  "bf16": { "enabled": true },
  "fp16": { "enabled": false },
  "gradient_clipping": 1.0,
  "wall_clock_breakdown": false
}
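The file deleted above is a DeepSpeed ZeRO stage-3 JSON config (bf16, no CPU offload, and no "optimizer" section, so DeepSpeed wraps whatever optimizer the Trainer supplies). For reference, a minimal sketch of how such a file is typically wired into a Hugging Face Trainer run; the config path, output_dir, and other literal values here are illustrative assumptions, not taken from this repository:

    # Sketch only: hypothetical path and output_dir; the batch/precision fields
    # mirror the deleted JSON so Trainer and DeepSpeed settings stay consistent.
    import os
    from transformers import TrainingArguments

    ds_config_path = "ds_zero3.json"   # assumed filename for the deleted config
    use_ds = os.path.isfile(ds_config_path)

    training_args = TrainingArguments(
        output_dir="out",
        per_device_train_batch_size=1,   # matches "train_micro_batch_size_per_gpu": 1
        gradient_accumulation_steps=1,   # matches "gradient_accumulation_steps": 1
        bf16=True,                       # matches "bf16": {"enabled": true}
        max_grad_norm=1.0,               # matches "gradient_clipping": 1.0
        deepspeed=(ds_config_path if use_ds else None),  # same guard as in the script below
    )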


@@ -847,7 +847,6 @@ def main():
        logging_steps=args.log_interval,
        save_steps=args.save_steps,
        save_total_limit=2,
        optim="adamw_torch",
        # deepspeed=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
        deepspeed=(args.deepspeed if use_ds else None),
        dataloader_drop_last=False,
@@ -884,29 +883,6 @@ def main():
    trainer_kwargs["tokenizer"] = tokenizer
    decay_params, no_decay_params = [], []
    for n, p in model.named_parameters():
        if not p.requires_grad:
            continue
        if any(nd in n for nd in ["bias", "LayerNorm.weight", "layer_norm.weight", "norm.weight", "ln_f.weight"]):
            no_decay_params.append(p)
        else:
            decay_params.append(p)
    optimizer_grouped_parameters = [
        {"params": decay_params, "weight_decay": args.weight_decay},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]
    optimizer = TorchAdamW(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
        betas=(0.9, 0.999),
        eps=1e-8,
    )
    trainer = DebugTrainer(
        model=model,
        args=training_args,
@@ -915,7 +891,6 @@ def main():
        #tokenizer=tokenizer,
        #processing_class=tokenizer,
        data_collator=data_collator,
        optimizers=(optimizer, None),
        **trainer_kwargs,
    )
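With the hand-built parameter groups, the TorchAdamW instance, optim="adamw_torch", and optimizers=(optimizer, None) all removed, optimizer construction is left to the Trainer (and to DeepSpeed when a config is active). A sketch of the resulting call shape, assuming the surrounding script still defines DebugTrainer, model, training_args, data_collator, and trainer_kwargs; dataset and other arguments visible elsewhere in the script are omitted here:

    # Sketch: after this commit the Trainer builds its own AdamW (with its usual
    # decay/no-decay parameter split) via create_optimizer(), or defers optimizer
    # creation to the DeepSpeed engine when training_args.deepspeed points at a config.
    trainer = DebugTrainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        **trainer_kwargs,   # e.g. tokenizer / processing_class, as set above
    )
    trainer.train()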