# QuaPy/Ordinal/finetune_bert.py
import sys
import csv

import numpy as np
import pandas as pd
import torch.cuda
import datasets
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

def tokenize_function(example):
    # tokenize the review text; use shorter sequences in debug mode to speed up runs
    tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else 256)
    return tokens
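
# Illustrative check (a hypothetical snippet, not part of the original script):
# a BERT-style tokenizer returns fixed-length encodings once padded, e.g.
#   enc = AutoTokenizer.from_pretrained('bert-base-uncased')(
#       'great book', padding='max_length', truncation=True, max_length=8)
#   sorted(enc.keys())      # -> ['attention_mask', 'input_ids', 'token_type_ids']
#   len(enc['input_ids'])   # -> 8
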
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return {
        'macro-f1': f1_score(labels, preds, average='macro'),
        'micro-f1': f1_score(labels, preds, average='micro'),
    }
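
# For intuition: macro-F1 averages per-class F1 scores with equal weight, so
# rare label classes matter as much as frequent ones, whereas micro-F1 pools
# all decisions and, for single-label classification, equals accuracy.
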
if __name__ == '__main__':
    debug = False

    assert torch.cuda.is_available(), 'cuda is not available'
    n_args = len(sys.argv)
    assert n_args == 3, 'wrong arguments, expected: <training-path> <transformer-name>'

    datapath = sys.argv[1]    # e.g., './data/Books/training_data.txt'
    checkpoint = sys.argv[2]  # e.g., 'bert-base-uncased', 'distilbert-base-uncased', or 'roberta-base'
    modelout = checkpoint + '-finetuned'

    # load the training set and extract a held-out validation split of 1000 documents (stratified)
    df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
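    # assumed file format, inferred from the read_csv call above: one example
    # per line, an integer label, a tab, then the raw review text, e.g.
    #   3<TAB>Decent read, but the ending felt rushed.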
    labels = df['labels'].to_frame()
    X_train, X_val = train_test_split(df, stratify=labels, test_size=1000, random_state=1)
    num_labels = len(pd.unique(labels['labels']))

    features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
    train = Dataset.from_pandas(df=X_train, split='train', features=features)
    validation = Dataset.from_pandas(df=X_val, split='validation', features=features)
    dataset = DatasetDict({
        'train': train.select(range(500)) if debug else train,
        'validation': validation.select(range(500)) if debug else validation
    })

    # tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    print(tokenized_datasets)
    print(tokenized_datasets['train'][0]['labels'])
    print(tokenized_datasets['train'][0]['review'])
    print(tokenized_datasets['train'][0]['input_ids'])
    print(len(tokenized_datasets['train'][0]['input_ids']))
    # print(tokenized_datasets['train'][0]['token_type_ids'])
    # print(tokenized_datasets['train'][0]['attention_mask'])

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

    # fine-tuning
    training_args = TrainingArguments(
        modelout,
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # eval_steps=10,
        save_total_limit=1,
        load_best_model_at_end=True
    )
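
    # note: with load_best_model_at_end=True and no metric_for_best_model given,
    # the Trainer defaults to the evaluation loss when selecting the best epoch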
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )
    trainer.train()
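
    # A possible follow-up (a sketch, not part of the original script): persist
    # the selected checkpoint and its tokenizer under the output directory.
    # trainer.save_model(modelout)
    # tokenizer.save_pretrained(modelout)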