diff --git a/hf_trainer.py b/hf_trainer.py index 96af38d..9c11a5c 100644 --- a/hf_trainer.py +++ b/hf_trainer.py @@ -18,7 +18,7 @@ import evaluate transformers.logging.set_verbosity_error() IWSLT_D_COLUMNS = ["text", "category", "rating", "summary", "title"] -RAI_D_COLUMNS = ["id", "lang", "provider", "date", "title", "text", "str_label", "label"] +RAI_D_COLUMNS = ["id", "lang", "provider", "date", "title", "text", "label"] def init_callbacks(patience=-1, nosave=False): @@ -31,6 +31,7 @@ def init_callbacks(patience=-1, nosave=False): def init_model(model_name, nlabels): if model_name == "mbert": hf_name = "bert-base-multilingual-cased" + # hf_name = "mbert-rai-multi-2000/checkpoint-1500" elif model_name == "xlm-roberta": hf_name = "xlm-roberta-base" else: @@ -43,27 +44,14 @@ def init_model(model_name, nlabels): def main(args): tokenizer, model = init_model(args.model, args.nlabels) - # data = load_dataset( - # "json", - # data_files={ - # "train": "local_datasets/webis-cls/all-domains/train.json", - # "test": "local_datasets/webis-cls/all-domains/test.json", - # }, - # ) - data = load_dataset( "csv", data_files = { - # "train": expanduser("~/datasets/rai/csv/rai-no-it-train.csv"), - # "test": expanduser("~/datasets/rai/csv/rai-no-it-test.csv") - # "train": expanduser("~/datasets/rai/csv/rai-train.csv"), - # "test": expanduser("~/datasets/rai/csv/rai-test-ita-labeled.csv") - "train": expanduser("~/datasets/rai/csv/train-split-rai.csv"), - "test": expanduser("~/datasets/rai/csv/test-split-rai-labeled.csv") + "train": expanduser("~/datasets/rai/csv/train-rai-multilingual-2000.csv"), + "test": expanduser("~/datasets/rai/csv/test-rai-multilingual-2000.csv") } ) - def process_sample_iwslt(sample): inputs = sample["text"] ratings = [r - 1 for r in sample["rating"]] @@ -93,7 +81,7 @@ def main(args): process_sample_rai, batched=True, num_proc=4, - load_from_cache_file=True, + load_from_cache_file=False, remove_columns=RAI_D_COLUMNS, ) train_val_splits = data["train"].train_test_split(test_size=0.2, seed=42) @@ -115,7 +103,7 @@ def main(args): recall_metric = evaluate.load("recall") training_args = TrainingArguments( - output_dir=f"{args.model}-rai-final", + output_dir=f"{args.model}-rai-multi-2000", do_train=True, evaluation_strategy="steps", per_device_train_batch_size=args.batch, @@ -127,8 +115,7 @@ def main(args): max_grad_norm=5.0, num_train_epochs=args.epochs, lr_scheduler_type=args.scheduler, - # warmup_ratio=0.1, - warmup_ratio=1500, + warmup_ratio=0.1, logging_strategy="steps", logging_first_step=True, logging_steps=args.steplog, @@ -189,8 +176,8 @@ def main(args): callbacks=callbacks, ) - print("- Training:") - trainer.train() + # print("- Training:") + # trainer.train() print("- Testing:") test_results = trainer.evaluate(eval_dataset=data["test"]) @@ -203,10 +190,10 @@ if __name__ == "__main__": from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument("--model", type=str, metavar="", default="mbert") - parser.add_argument("--nlabels", type=int, metavar="", default=3) + parser.add_argument("--nlabels", type=int, metavar="", default=28) parser.add_argument("--lr", type=float, metavar="", default=1e-5, help="Set learning rate",) parser.add_argument("--scheduler", type=str, metavar="", default="linear", help="Accepted: [\"cosine\", \"cosine-reset\", \"cosine-warmup\", \"cosine-warmup-reset\", \"constant\"]") - parser.add_argument("--batch", type=int, metavar="", default=16, help="Set batch size") + parser.add_argument("--batch", type=int, metavar="", default=8, help="Set batch size") parser.add_argument("--gradacc", type=int, metavar="", default=1, help="Gradient accumulation steps") parser.add_argument("--epochs", type=int, metavar="", default=100, help="Set epochs") parser.add_argument("--stepeval", type=int, metavar="", default=50, help="Run evaluation every n steps")