Removed unused columns from the RAI dataset

This commit is contained in:
Andrea Pedrotti 2023-07-03 19:02:37 +02:00
parent d36e185ffe
commit 55e12505c0
1 changed file with 11 additions and 24 deletions

@@ -18,7 +18,7 @@ import evaluate
 transformers.logging.set_verbosity_error()
 IWSLT_D_COLUMNS = ["text", "category", "rating", "summary", "title"]
-RAI_D_COLUMNS = ["id", "lang", "provider", "date", "title", "text", "str_label", "label"]
+RAI_D_COLUMNS = ["id", "lang", "provider", "date", "title", "text", "label"]
 def init_callbacks(patience=-1, nosave=False):
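
For reference, RAI_D_COLUMNS is the list handed to Dataset.map(remove_columns=...) further down in this file, so dropping "str_label" here keeps the constant in sync with the new CSVs. A minimal sketch of the mechanism, with invented row values (only the column names come from the diff):

```python
# Toy illustration of RAI_D_COLUMNS + map(remove_columns=...); row values are invented.
from datasets import Dataset

RAI_D_COLUMNS = ["id", "lang", "provider", "date", "title", "text", "label"]

toy = Dataset.from_dict({
    "id": [0], "lang": ["it"], "provider": ["rai"], "date": ["2023-07-03"],
    "title": ["t"], "text": ["some text"], "label": [1],
})

# map() merges the returned dict into the dataset, then drops the listed
# raw columns, leaving only the newly computed fields.
processed = toy.map(
    lambda batch: {"n_chars": [len(t) for t in batch["text"]]},
    batched=True,
    remove_columns=RAI_D_COLUMNS,
)
print(processed.column_names)  # ['n_chars']
```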
@@ -31,6 +31,7 @@ def init_callbacks(patience=-1, nosave=False):
 def init_model(model_name, nlabels):
     if model_name == "mbert":
         hf_name = "bert-base-multilingual-cased"
+        # hf_name = "mbert-rai-multi-2000/checkpoint-1500"
     elif model_name == "xlm-roberta":
         hf_name = "xlm-roberta-base"
     else:
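
The rest of init_model lies outside the hunks, so its exact body is unknown; a plausible completion, assuming the usual Auto* classes from transformers (an assumption, not the author's code):

```python
# Hypothetical completion of init_model; only the hf_name branches appear in the diff.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def init_model(model_name, nlabels):
    if model_name == "mbert":
        hf_name = "bert-base-multilingual-cased"
        # hf_name = "mbert-rai-multi-2000/checkpoint-1500"  # resume from a local checkpoint
    elif model_name == "xlm-roberta":
        hf_name = "xlm-roberta-base"
    else:
        raise ValueError(f"Unknown model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(hf_name)
    model = AutoModelForSequenceClassification.from_pretrained(hf_name, num_labels=nlabels)
    return tokenizer, model
```

The newly added commented-out line keeps a local fine-tuned checkpoint path handy, which matches the evaluation-only change further down.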
@@ -43,27 +44,14 @@ def init_model(model_name, nlabels):
 def main(args):
     tokenizer, model = init_model(args.model, args.nlabels)
-    # data = load_dataset(
-    #     "json",
-    #     data_files={
-    #         "train": "local_datasets/webis-cls/all-domains/train.json",
-    #         "test": "local_datasets/webis-cls/all-domains/test.json",
-    #     },
-    # )
     data = load_dataset(
         "csv",
         data_files = {
-            # "train": expanduser("~/datasets/rai/csv/rai-no-it-train.csv"),
-            # "test": expanduser("~/datasets/rai/csv/rai-no-it-test.csv")
-            # "train": expanduser("~/datasets/rai/csv/rai-train.csv"),
-            # "test": expanduser("~/datasets/rai/csv/rai-test-ita-labeled.csv")
-            "train": expanduser("~/datasets/rai/csv/train-split-rai.csv"),
-            "test": expanduser("~/datasets/rai/csv/test-split-rai-labeled.csv")
+            "train": expanduser("~/datasets/rai/csv/train-rai-multilingual-2000.csv"),
+            "test": expanduser("~/datasets/rai/csv/test-rai-multilingual-2000.csv")
         }
     )
 def process_sample_iwslt(sample):
     inputs = sample["text"]
     ratings = [r - 1 for r in sample["rating"]]
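
On the data-loading call this hunk cleans up: load_dataset("csv", data_files={...}) returns a DatasetDict keyed by the names given in data_files, with columns taken from the CSV header. A minimal standalone sketch (the paths are the ones introduced by this commit and assume the files exist locally):

```python
# CSV loading pattern from the new version of the hunk above.
from os.path import expanduser
from datasets import load_dataset

data = load_dataset(
    "csv",
    data_files={
        "train": expanduser("~/datasets/rai/csv/train-rai-multilingual-2000.csv"),
        "test": expanduser("~/datasets/rai/csv/test-rai-multilingual-2000.csv"),
    },
)
print(data)  # DatasetDict with "train" and "test" splits
```

Note also the r - 1 shift in process_sample_iwslt: it converts 1-based ratings (1-5) into the 0-based label ids that classification heads expect.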
@@ -93,7 +81,7 @@ def main(args):
         process_sample_rai,
         batched=True,
         num_proc=4,
-        load_from_cache_file=True,
+        load_from_cache_file=False,
         remove_columns=RAI_D_COLUMNS,
     )
     train_val_splits = data["train"].train_test_split(test_size=0.2, seed=42)
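
Flipping load_from_cache_file to False forces the map to be recomputed rather than served from a cached Arrow file, the safe choice right after swapping the underlying CSVs. The body of process_sample_rai is outside the diff; a hypothetical batched version, assuming the tokenizer returned by init_model:

```python
# Hypothetical process_sample_rai; the real body is not shown in the diff.
def process_sample_rai(batch):
    enc = tokenizer(batch["text"], truncation=True, max_length=512)
    enc["labels"] = batch["label"]
    return enc

data = data.map(
    process_sample_rai,
    batched=True,
    num_proc=4,
    load_from_cache_file=False,   # recompute instead of trusting the cache
    remove_columns=RAI_D_COLUMNS, # drop raw columns, keep only model inputs
)
# Carve a validation split out of train, reproducibly (seed=42).
train_val_splits = data["train"].train_test_split(test_size=0.2, seed=42)
```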
@@ -115,7 +103,7 @@ def main(args):
     recall_metric = evaluate.load("recall")
     training_args = TrainingArguments(
-        output_dir=f"{args.model}-rai-final",
+        output_dir=f"{args.model}-rai-multi-2000",
         do_train=True,
         evaluation_strategy="steps",
         per_device_train_batch_size=args.batch,
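
The metric objects loaded above feed a compute_metrics hook whose body is not in the diff; a plausible sketch (the macro averaging is an assumption):

```python
# Hypothetical compute_metrics matching the accuracy/f1/recall metrics loaded above.
import numpy as np
import evaluate

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        # "macro" is an assumption; the diff does not show the chosen average
        "f1": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"],
        "recall": recall_metric.compute(predictions=preds, references=labels, average="macro")["recall"],
    }
```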
@@ -127,8 +115,7 @@ def main(args):
         max_grad_norm=5.0,
         num_train_epochs=args.epochs,
         lr_scheduler_type=args.scheduler,
-        # warmup_ratio=0.1,
-        warmup_ratio=1500,
+        warmup_ratio=0.1,
         logging_strategy="steps",
         logging_first_step=True,
         logging_steps=args.steplog,
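
This hunk fixes a real bug: warmup_ratio is a fraction of total training steps in [0, 1], so the old warmup_ratio=1500 only makes sense if it was meant as warmup_steps=1500. A minimal illustration of the two valid spellings (output_dir is a throwaway value):

```python
# warmup_ratio vs. warmup_steps in TrainingArguments.
from transformers import TrainingArguments

args_ratio = TrainingArguments(output_dir="tmp", warmup_ratio=0.1)   # warm up over 10% of total steps
args_steps = TrainingArguments(output_dir="tmp", warmup_steps=1500)  # warm up over a fixed 1500 steps
```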
@@ -189,8 +176,8 @@ def main(args):
         callbacks=callbacks,
     )
-    print("- Training:")
-    trainer.train()
+    # print("- Training:")
+    # trainer.train()
     print("- Testing:")
     test_results = trainer.evaluate(eval_dataset=data["test"])
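
With trainer.train() commented out, the script becomes evaluation-only, which pairs with the commented-out local checkpoint path added in init_model. The pattern in isolation (names reuse the script's variables, so this assumes the surrounding setup):

```python
# Evaluation-only use of Trainer: no train() call, just evaluate() on the
# held-out split. Assumes model/training_args/data/compute_metrics from above.
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=data["test"],
    compute_metrics=compute_metrics,
)
test_results = trainer.evaluate(eval_dataset=data["test"])
print(test_results)  # eval_loss plus "eval_"-prefixed compute_metrics keys
```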
@@ -203,10 +190,10 @@ if __name__ == "__main__":
     from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
     parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
     parser.add_argument("--model", type=str, metavar="", default="mbert")
-    parser.add_argument("--nlabels", type=int, metavar="", default=3)
+    parser.add_argument("--nlabels", type=int, metavar="", default=28)
     parser.add_argument("--lr", type=float, metavar="", default=1e-5, help="Set learning rate",)
     parser.add_argument("--scheduler", type=str, metavar="", default="linear", help="Accepted: [\"cosine\", \"cosine-reset\", \"cosine-warmup\", \"cosine-warmup-reset\", \"constant\"]")
-    parser.add_argument("--batch", type=int, metavar="", default=16, help="Set batch size")
+    parser.add_argument("--batch", type=int, metavar="", default=8, help="Set batch size")
     parser.add_argument("--gradacc", type=int, metavar="", default=1, help="Gradient accumulation steps")
     parser.add_argument("--epochs", type=int, metavar="", default=100, help="Set epochs")
     parser.add_argument("--stepeval", type=int, metavar="", default=50, help="Run evaluation every n steps")