This commit is contained in:
parent
20ee10d2aa
commit
b8fb783871
@@ -1,25 +0,0 @@
-{
-  "train_micro_batch_size_per_gpu": 1,
-  "gradient_accumulation_steps": 1,
-
-  "zero_optimization": {
-    "stage": 3,
-    "overlap_comm": true,
-    "contiguous_gradients": true,
-
-    "reduce_bucket_size": 80000000,
-    "stage3_prefetch_bucket_size": 40000000,
-    "stage3_param_persistence_threshold": 0,
-
-    "offload_optimizer": { "device": "none" },
-    "offload_param": { "device": "none" },
-
-    "stage3_gather_16bit_weights_on_model_save": false
-  },
-
-  "bf16": { "enabled": true },
-  "fp16": { "enabled": false },
-
-  "gradient_clipping": 1.0,
-  "wall_clock_breakdown": false
-}
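For context, the JSON deleted above is a DeepSpeed ZeRO stage-3 configuration: bf16 training, no CPU offload of parameters or optimizer state, and no gathering of 16-bit weights on save. A file like this is normally handed to the Hugging Face Trainer through the deepspeed field of TrainingArguments; a minimal sketch, assuming the config were saved as ds_config.json (the path and output_dir below are illustrative, not taken from this repository):

from transformers import TrainingArguments

# Sketch only: wires a ZeRO-3 JSON like the one deleted above into TrainingArguments.
# "ds_config.json" and "out" are placeholder names.
training_args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=1,   # should match "train_micro_batch_size_per_gpu": 1
    gradient_accumulation_steps=1,   # should match "gradient_accumulation_steps": 1
    bf16=True,                       # should match "bf16": {"enabled": true}
    deepspeed="ds_config.json",      # path to the JSON config
)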
@@ -847,7 +847,6 @@ def main():
         logging_steps=args.log_interval,
         save_steps=args.save_steps,
         save_total_limit=2,
-        optim="adamw_torch",
         # deepspeed=(args.deepspeed if args.deepspeed and os.path.isfile(args.deepspeed) else None),
         deepspeed=(args.deepspeed if use_ds else None),
         dataloader_drop_last=False,
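The commented-out line kept above records the earlier inline guard; the replacement defers to a use_ds flag whose definition is outside this hunk. A hypothetical reconstruction of such a guard, assuming it mirrors the old check (this is an assumption, not code shown in the diff):

import os

# Hypothetical: the real use_ds definition lives elsewhere in the script and is
# not part of this diff. It presumably enables DeepSpeed only when a config path
# was supplied and the file actually exists.
def deepspeed_enabled(config_path):
    return bool(config_path) and os.path.isfile(config_path)

use_ds = deepspeed_enabled("ds_config.json")  # illustrative path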
@@ -884,29 +883,6 @@ def main():
     trainer_kwargs["tokenizer"] = tokenizer
 
-
-
-    decay_params, no_decay_params = [], []
-    for n, p in model.named_parameters():
-        if not p.requires_grad:
-            continue
-        if any(nd in n for nd in ["bias", "LayerNorm.weight", "layer_norm.weight", "norm.weight", "ln_f.weight"]):
-            no_decay_params.append(p)
-        else:
-            decay_params.append(p)
-
-    optimizer_grouped_parameters = [
-        {"params": decay_params, "weight_decay": args.weight_decay},
-        {"params": no_decay_params, "weight_decay": 0.0},
-    ]
-
-    optimizer = TorchAdamW(
-        optimizer_grouped_parameters,
-        lr=args.learning_rate,
-        betas=(0.9, 0.999),
-        eps=1e-8,
-    )
-
 
     trainer = DebugTrainer(
         model=model,
         args=training_args,
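The block removed above implemented the standard AdamW grouping that exempts biases and normalization weights from weight decay, then handed the groups to TorchAdamW. For reference, a standalone sketch of that pattern (the function name, default lr, and default weight_decay here are placeholders):

import torch
from torch.optim import AdamW

def build_optimizer(model, lr=1e-4, weight_decay=0.01):
    # Mirror of the deleted logic: biases and norm weights get no weight decay.
    no_decay_keys = ["bias", "LayerNorm.weight", "layer_norm.weight", "norm.weight", "ln_f.weight"]
    decay, no_decay = [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        (no_decay if any(k in name for k in no_decay_keys) else decay).append(param)
    groups = [
        {"params": decay, "weight_decay": weight_decay},
        {"params": no_decay, "weight_decay": 0.0},
    ]
    return AdamW(groups, lr=lr, betas=(0.9, 0.999), eps=1e-8)

# Example with a toy module; the training script would pass its real model here.
optimizer = build_optimizer(torch.nn.Linear(4, 4))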
@@ -915,7 +891,6 @@ def main():
         #tokenizer=tokenizer,
         #processing_class=tokenizer,
         data_collator=data_collator,
-        optimizers=(optimizer, None),
         **trainer_kwargs,
     )
 
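With optimizers=(optimizer, None) dropped, the Trainer is left to build its own optimizer from the training arguments (or to let DeepSpeed manage it when a config is active). A minimal, self-contained illustration of the two styles, using a plain transformers Trainer rather than the repository's DebugTrainer subclass (the toy model and output_dir are placeholders):

import torch
from transformers import Trainer, TrainingArguments

model = torch.nn.Linear(4, 2)                   # toy stand-in for the real model
args = TrainingArguments(output_dir="tmp_out")  # placeholder output directory
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Old style (as in the removed line): pass a pre-built optimizer, no scheduler.
trainer_old = Trainer(model=model, args=args, optimizers=(optimizer, None))

# New style: omit `optimizers`; the Trainer (or DeepSpeed, when configured)
# constructs the optimizer from `args` at train time.
trainer_new = Trainer(model=model, args=args)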