forked from moreo/QuaPy
generating features from RoBERTa, testing them on Amazon data
This commit is contained in:
parent d949c77317
commit 464bd60c7c
@@ -1,6 +1,11 @@
import numpy as np


def smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))


def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim==1:
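
For reference, smoothness(p) is half the sum of squared second-order differences over the interior entries of a prevalence vector p, so lower values correspond to smoother prevalence profiles. A tiny worked example (toy numbers, assuming the smoothness function above is in scope):

p = [0.1, 0.2, 0.4, 0.2, 0.1]
# second differences at the three interior positions:
#   -0.1 + 2*0.2 - 0.4 = -0.1
#   -0.2 + 2*0.4 - 0.2 =  0.4
#   -0.4 + 2*0.2 - 0.1 = -0.1
# smoothness = 0.5 * (0.01 + 0.16 + 0.01) = 0.09
print(smoothness(p))  # ~0.09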
@@ -0,0 +1,100 @@
import sys
import numpy as np
import datasets
import torch.cuda
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer
from datasets import list_datasets, list_metrics, load_dataset, Dataset, DatasetDict, load_metric
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import pandas as pd
import csv


def tokenize_function(example):
    tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else 256)
    return tokens


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return {
        'macro-f1': f1_score(labels, preds, average='macro'),
        'micro-f1': f1_score(labels, preds, average='micro'),
    }


if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    n_args = len(sys.argv)
    assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'

    datapath = sys.argv[1]  # './data/Books/training_data.txt'
    checkpoint = sys.argv[2]  # e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
    modelout = checkpoint+'-finetuned'

    # load the training set, and extract a held-out validation split of 1000 documents (stratified)
    df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df['labels'].to_frame()
    X_train, X_val = train_test_split(df, stratify=labels, test_size=1000, random_state=1)
    num_labels = len(pd.unique(labels['labels']))

    features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
    train = Dataset.from_pandas(df=X_train, split='train', features=features)
    validation = Dataset.from_pandas(df=X_val, split='validation', features=features)

    dataset = DatasetDict({
        'train': train.select(range(500)) if debug else train,
        'validation': validation.select(range(500)) if debug else validation
    })

    # tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    print(tokenized_datasets)
    print(tokenized_datasets['train'][0]['labels'])
    print(tokenized_datasets['train'][0]['review'])
    print(tokenized_datasets['train'][0]['input_ids'])
    print(len(tokenized_datasets['train'][0]['input_ids']))
    # print(tokenized_datasets['train'][0]['token_type_ids'])
    # print(tokenized_datasets['train'][0]['attention_mask'])

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

    # fine-tuning
    training_args = TrainingArguments(
        modelout,
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # eval_steps=10,
        save_total_limit=1,
        load_best_model_at_end=True
    )
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
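
The Trainer above writes its checkpoints under <transformer-name>-finetuned/checkpoint-<step>/. A minimal sketch of reloading such a checkpoint for inference (the path is illustrative, not taken from this commit; num_labels=5 matches the value used later in generate_bert_vectors_npytxt.py):

from transformers import AutoTokenizer, AutoModelForSequenceClassification

ckpt = 'roberta-base-finetuned/checkpoint-1188'  # illustrative checkpoint folder
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForSequenceClassification.from_pretrained(ckpt, num_labels=5)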
@@ -0,0 +1,11 @@
#!/bin/bash
set -x

#conda activate torch

transformer=roberta-base

#python3 finetune_bert.py ./data/Books/training_data.txt $transformer
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned last
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned average
PYTHONPATH=.:.. python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned posteriors
@@ -8,25 +8,34 @@ from Ordinal.main import quantifiers
from Ordinal.tabular import Table

domain = 'Books-tfidf'
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'

resultpath = join('./results', domain, prot)
resultpath_bertlast = join('./results', domain_bert_last, prot)
resultpath_bertave = join('./results', domain_bert_ave, prot)

methods = [qname for qname, *_ in quantifiers()]
# methods += [m+'-r' for m in methods]
methods_Rlast = [m+'-RoBERTa-last' for m in methods]
methods_Rave = [m+'-RoBERTa-average' for m in methods]
methods = methods + methods_Rlast + methods_Rave
methods += [m+'-r' for m in methods]

table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)

resultfiles = list(glob(f'{resultpath}/*.csv')) + list(glob(f'{resultpath_bertlast}/*.csv')) + list(glob(f'{resultpath_bertave}/*.csv'))

for resultfile in glob(f'{resultpath}/*.csv'):
for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name
    method, drift, *other = resultname.replace('.csv', '').split('.')
    if other:
        continue
    method += '-r'
    if method not in methods:
        continue

    table.add(drift, method, nmd)
@@ -0,0 +1,145 @@
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from os.path import join
import os
import shutil
from tqdm import tqdm

from Ordinal.utils import load_samples_folder, load_single_sample_as_csv


def tokenize_function(example):
    tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt')
    return {
        'input_ids': tokens.input_ids.cuda(),
        'attention_mask': tokens.attention_mask.cuda()
    }


def save_samples_as_txt(tensors, labels, path):
    vectors = tensors
    labels = labels.values
    vec_lab = np.hstack([labels, vectors])
    n_cols = vectors.shape[1]
    np.savetxt(path, vec_lab, fmt=['%d']+['%f']*n_cols)


def transform_sample(instances, labels, outpath, batch_size=50):
    ndocs = len(labels)
    batches = ndocs // batch_size
    assert ndocs % batch_size == 0, 'fragmented last batch not supported'

    transformations = []
    for batch_id in range(0, ndocs, batch_size):

        batch_instances = instances[batch_id:batch_id + batch_size]

        tokenized_dataset = tokenize_function(batch_instances)
        out = model(**tokenized_dataset, output_hidden_states=True)

        if generation_mode == 'posteriors':
            logits = out.logits
            posteriors = torch.softmax(logits, dim=-1)
            transformed = posteriors
        elif generation_mode == 'last':
            hidden_states = out.hidden_states
            last_layer_cls = hidden_states[-1][:, 0, :]
            transformed = last_layer_cls
        elif generation_mode == 'average':
            hidden_states = out.hidden_states
            hidden_states = torch.stack(hidden_states)
            all_layer_cls = hidden_states[:, :, 0, :]
            average_cls = torch.mean(all_layer_cls, dim=0)
            transformed = average_cls
        else:
            raise NotImplementedError()

        transformations.append(transformed.cpu().numpy())

    transformations = np.vstack(transformations)
    save_samples_as_txt(transformations, labels, outpath)


def transform_folder_samples(protocol, splitname):
    in_folder = join(datapath, domain, protocol, splitname)
    out_folder = join(datapath, outname, protocol, splitname)
    total = 1000 if splitname.startswith('dev') else 5000

    for i, (instances, labels) in tqdm(enumerate(
            load_samples_folder(in_folder, load_fn=load_single_sample_as_csv)), desc=f'{protocol} {splitname}', total=total):
        transform_sample(instances, labels, outpath=join(out_folder, f'{i}.txt'))


def get_best_checkpoint(checkpointdir):
    from glob import glob
    steps = []
    for folder in glob(f'{checkpointdir}/checkpoint-*'):
        step = int(folder.split('checkpoint-')[1])
        steps.append(step)
    assert len(steps) <= 2, 'unexpected number of steps, only two were expected (the best one and the last one)'
    chosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    return chosen


if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    checkpoint = 'roberta-base-finetuned'
    generation_mode = 'posteriors'

    # n_args = len(sys.argv)
    # assert n_args==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
    #                   '\tgeneration-mode: last (last layer), ave (average pooling), or posteriors (posterior probabilities)'

    # checkpoint = sys.argv[1]  # e.g., 'bert-base-uncased'
    # generation_mode = sys.argv[2]  # e.g., 'last'

    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'

    checkpoint = get_best_checkpoint(checkpoint)

    num_labels = 5

    datapath = './data'
    domain = 'Books'
    protocols = ['app']  # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_mode'
    outname = domain + f'-{checkpoint}-{generation_mode}'

    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

        os.makedirs(join(datapath, outname), exist_ok=True)

        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')

        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))

            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')
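
Each transformed sample ends up as a plain-text matrix (via save_samples_as_txt) whose first column holds the label and whose remaining columns hold the generated features (posteriors, last-layer CLS, or layer-averaged CLS). A small sketch of reading one such file back, which is essentially what load_simple_sample_npytxt in the utils module does (the path is illustrative):

import numpy as np

yX = np.loadtxt('./data/Books-roberta-base-finetuned/checkpoint-1188-posteriors/app/dev_samples/0.txt')  # illustrative path
y = yX[:, 0].astype(np.int32)  # ordinal labels
X = yX[:, 1:]                  # RoBERTa-derived features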
@@ -9,7 +9,7 @@ from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
import os
from utils import load_samples, load_samples_pkl
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
from evaluation import nmd, mnmd
from time import time
import pickle
@@ -25,22 +25,6 @@ import mord
# add drift='all'


def load_test_samples():
    ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
    ids = set(ids)
    pklpath = join(datapath, domain, protocol, 'test_samples')
    for sample in tqdm(load_samples_pkl(pklpath, filter=ids), total=len(ids)):
        yield sample.instances, sample.prevalence()


def load_dev_samples():
    ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
    ids = set(ids)
    pklpath = join(datapath, domain, protocol, 'dev_samples')
    for sample in tqdm(load_samples_pkl(pklpath, filter=ids), total=len(ids)):
        yield sample.instances, sample.prevalence()


def quantifiers():
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
@@ -58,21 +42,20 @@ def quantifiers():

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    #yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    #yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    #yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    #yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
    yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
    #yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    # I am using my implementation, which caters for predict_proba (linear distance to the two closest classes, 0 in the rest)
    # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do
    # not implement predict_proba nor decision_score
    yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
    yield 'CC-bal(SVR)', CC(RegressorClassifier()), params_SVR
    # yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
    #yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
    #yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
    # yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR
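
The comment above on the regression-based quantifiers refers to a predict_proba that places the predicted (continuous) value's mass linearly on the two closest classes and 0 on the rest. A rough sketch of that idea, as a hypothetical standalone helper (not the repository's RegressorClassifier), assuming integer-coded classes 0..n-1:

import numpy as np

def regression_to_posteriors(y_pred, n_classes):
    # y_pred: continuous outputs of a regressor (e.g., SVR); returns one posterior row per prediction
    P = np.zeros((len(y_pred), n_classes))
    for i, v in enumerate(np.clip(y_pred, 0, n_classes - 1)):
        lo = int(np.floor(v))
        hi = min(lo + 1, n_classes - 1)
        frac = v - lo
        P[i, lo] += 1 - frac  # share for the closest class below
        P[i, hi] += frac      # share for the closest class above
    return P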
@@ -82,6 +65,7 @@ def quantifiers():

def run_experiment(params):
    qname, q, param_grid, drift = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.{drift}.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
@@ -89,6 +73,22 @@ def run_experiment(params):

    print(f'fitting {qname} for {drift}-drift')


    def load_test_samples():
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()


    def load_dev_samples():
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
@@ -125,22 +125,34 @@ def run_experiment(params):


if __name__ == '__main__':
    domain = 'Books-tfidf'
    #preprocessing = 'roberta.last'
    preprocessing = 'roberta.average'
    #preprocessing = 'tfidf'
    if preprocessing=='tfidf':
        domain = 'Books-tfidf'
        posfix = ''
    elif preprocessing=='roberta.last':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
        posfix = '-RoBERTa-last'
    elif preprocessing=='roberta.average':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
        posfix = '-RoBERTa-average'
    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
    train = load_sample_fn(join(datapath, domain), 'training_data')

    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        for drift in ['low', 'mid', 'high', 'all']:
            params = [(*qs, drift) for qs in quantifiers()]
            hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
            for h in hypers:
                if h is not None:
                    foo.write(h)
                    foo.write('\n')
        #for drift in [f'smooth{i}' for i in range(5)] + ['all']:
        params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
        hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')
@@ -1,7 +1,7 @@
import numpy as np
import quapy as qp
from Ordinal.evaluation import nmd
from Ordinal.utils import load_samples_pkl
from evaluation import nmd
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from quapy.data import LabelledCollection
import pickle
import os
@@ -13,7 +13,8 @@ def partition_by_drift(split, training_prevalence):
    assert split in ['dev', 'test'], 'invalid split name'
    total=1000 if split=='dev' else 5000
    drifts = []
    for sample in tqdm(load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples')), total=total):
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        drifts.append(nmd(training_prevalence, sample.prevalence()))
    drifts = np.asarray(drifts)
    order = np.argsort(drifts)
@@ -34,7 +35,7 @@ def partition_by_drift(split, training_prevalence):
    print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')


domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'

training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
@@ -0,0 +1,36 @@
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl

from os.path import join
from tqdm import tqdm


def partition_by_smoothness(split):
    assert split in ['dev', 'test'], 'invalid split name'
    total=1000 if split=='dev' else 5000
    smooths = []
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        smooths.append(smoothness(sample.prevalence()))
    smooths = np.asarray(smooths)
    order = np.argsort(smooths)
    nD = len(order)
    low2high_smooth = np.array_split(order, 5)
    all_drift = np.arange(nD)
    for i, smooth_idx in enumerate(low2high_smooth):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)
    np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_drift)


#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'

#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))

partition_by_smoothness('dev')
partition_by_smoothness('test')
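
The partitioning above simply orders the sample ids by their smoothness score and cuts that ordering into five equally sized blocks, saved as smooth{i}.{split}.id.npy. A toy illustration of the argsort + array_split step (made-up scores, three blocks for brevity):

import numpy as np

smooths = np.asarray([0.30, 0.05, 0.22, 0.11, 0.40, 0.01])  # one score per sample
order = np.argsort(smooths)        # sample ids by increasing smoothness: [5 1 3 2 0 4]
blocks = np.array_split(order, 3)  # -> [5 1], [3 2], [0 4]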
@@ -4,7 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples
from utils import load_samples_raw
from tqdm import tqdm
import shutil
@@ -40,7 +40,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic


def transform_folder_samples(protocol, splitname):
    for i, sample in tqdm(enumerate(load_samples(join(datapath, domain, protocol, splitname), classes=train.classes_))):
    for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
@@ -0,0 +1,47 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import *
from tqdm import tqdm
import shutil


vector_generation = 'average'

datapath = './data'
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
outname = domain.replace('-finetuned', '-finetuned-pkl')

protocol = 'app'

print('pickling npy txt files')
print('from:', join(datapath, domain))
print('to', join(datapath, outname))
print('for protocol:', protocol)

os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, protocol), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))


train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


def transform_folder_samples(protocol, splitname):
    folder_dir = join(datapath, domain, protocol, splitname)
    for i, sample in tqdm(enumerate(load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_))):
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


transform_folder_samples(protocol, 'dev_samples')
transform_folder_samples(protocol, 'test_samples')
@@ -0,0 +1,54 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples_raw
from tqdm import tqdm
import shutil


datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'

def save_preprocessing_info(transformer):
    with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
        foo.write(f'{str(transformer)}\n')


os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))


tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)

train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


def transform_folder_samples(protocol, splitname):
    for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('npp', 'dev_samples')
transform_folder_samples('npp', 'test_samples')
@@ -1,42 +1,64 @@
import quapy as qp
from quapy.data import LabelledCollection
import numpy as np
from glob import glob
from json import load
import os
from os.path import join
import pickle
import pandas as pd
import csv
import datasets
from datasets import Dataset
import quapy as qp
from quapy.data import LabelledCollection


def load_samples(path_dir, classes):
    nsamples = len(glob(join(path_dir, f'*.txt')))
    for id in range(nsamples):
        yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)

def load_simple_sample_npytxt(parentdir, filename, classes=None):
    samplepath = join(parentdir, filename+'.txt')
    yX = np.loadtxt(samplepath)
    X = yX[:,1:]
    y = yX[:,0].astype(np.int32)
    return LabelledCollection(instances=X, labels=y, classes_=classes)


def load_samples_as_csv(path_dir, debug=False):
    import pandas as pd
    import csv
    import datasets
    from datasets import Dataset

    nsamples = len(glob(join(path_dir, f'*.txt')))
    for id in range(nsamples):
        df = pd.read_csv(join(path_dir, f'{id}.txt'), sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
        labels = df.pop('labels').to_frame()
        X = df

        features = datasets.Features({'review': datasets.Value('string')})
        if debug:
            sample = Dataset.from_pandas(df=X, features=features).select(range(50))
            labels = labels[:50]
        else:
            sample = Dataset.from_pandas(df=X, features=features)

        yield sample, labels
def load_simple_sample_raw(parentdir, filename, classes=None):
    samplepath = join(parentdir, filename+'.txt')
    return LabelledCollection.load(samplepath, loader_func=qp.data.reader.from_text, classes=classes)


def load_samples_pkl(path_dir, filter=None):
    nsamples = len(glob(join(path_dir, f'*.pkl')))
def load_single_sample_as_csv(parentdir, filename):
    samplepath = join(parentdir, filename+'.txt')
    df = pd.read_csv(samplepath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df.pop('labels').to_frame()

    features = datasets.Features({'review': datasets.Value('string')})
    sample = Dataset.from_pandas(df=df, features=features)

    return sample, labels


def load_single_sample_pkl(parentdir, filename):
    return pickle.load(open(join(parentdir, filename+'.pkl'), 'rb'))


# def load_samples_npytxt(path_dir, filter=None, classes=None):
#     return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)


# def load_samples_raw(path_dir, filter=None, classes=None):
#     return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, load_fn_kwargs={'classes': classes})


# def load_samples_as_csv(path_dir, filter=None):
#     return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)


# def load_samples_pkl(path_dir, filter=None):
#     return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)


def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
    nsamples = len(glob(join(path_dir, f'*')))
    for id in range(nsamples):
        if (filter is None) or id in filter:
            yield pickle.load(open(join(path_dir, f'{id}.pkl'), 'rb'))

            yield load_fn(path_dir, f'{id}', **load_fn_kwargs)
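
As a usage note, load_samples_folder is the single generic iterator the other scripts now rely on; a small sketch (illustrative path) of iterating only a subset of pickled samples:

# yields the pickled LabelledCollection samples with ids 3, 7 and 9
for sample in load_samples_folder('./data/Books-tfidf/app/dev_samples',
                                  filter={3, 7, 9},
                                  load_fn=load_single_sample_pkl):
    print(sample.prevalence())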