
generating features from RoBERTa, testing them on Amazon data

This commit is contained in:
Alejandro Moreo Fernandez 2022-03-16 19:12:45 +01:00
parent d949c77317
commit 464bd60c7c
12 changed files with 515 additions and 73 deletions

View File

@@ -1,6 +1,11 @@
import numpy as np
def smoothness(p):
return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
def _check_arrays(prevs):
prevs = np.asarray(prevs)
if prevs.ndim==1:
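For reference, the smoothness function defined at the top of this file is half the sum of squared discrete second differences of the prevalence vector; a minimal worked example (the prevalence values are illustrative only):
p = [0.1, 0.2, 0.4, 0.2, 0.1]
# second differences -p_prev + 2*p_i - p_next: [-0.1, 0.4, -0.1]
# smoothness(p) = 0.5 * (0.01 + 0.16 + 0.01) = 0.09
print(smoothness(p))  # 0.09 (up to floating-point rounding)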

100
Ordinal/finetune_bert.py Normal file
View File

@@ -0,0 +1,100 @@
import sys
import numpy as np
import datasets
import torch.cuda
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer
from datasets import list_datasets, list_metrics, load_dataset, Dataset, DatasetDict, load_metric
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import pandas as pd
import csv
def tokenize_function(example):
tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else 256)
return tokens
def compute_metrics(eval_preds):
logits, labels = eval_preds
preds = np.argmax(logits, axis=-1)
return {
'macro-f1': f1_score(labels, preds, average='macro'),
'micro-f1': f1_score(labels, preds, average='micro'),
}
if __name__ == '__main__':
debug = False
assert torch.cuda.is_available(), 'cuda is not available'
n_args = len(sys.argv)
assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'
datapath = sys.argv[1] # './data/Books/training_data.txt'
checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
modelout = checkpoint+'-finetuned'
# load the training set, and extract a held-out validation split of 1000 documents (stratified)
df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
labels = df['labels'].to_frame()
X_train, X_val = train_test_split(df, stratify=labels, test_size=1000, random_state=1)
num_labels = len(pd.unique(labels['labels']))
features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
train = Dataset.from_pandas(df=X_train, split='train', features=features)
validation = Dataset.from_pandas(df=X_val, split='validation', features=features)
dataset = DatasetDict({
'train': train.select(range(500)) if debug else train,
'validation': validation.select(range(500)) if debug else validation
})
# tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)
print(tokenized_datasets['train'][0]['labels'])
print(tokenized_datasets['train'][0]['review'])
print(tokenized_datasets['train'][0]['input_ids'])
print(len(tokenized_datasets['train'][0]['input_ids']))
# print(tokenized_datasets['train'][0]['token_type_ids'])
# print(tokenized_datasets['train'][0]['attention_mask'])
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()
# fine-tuning
training_args = TrainingArguments(
modelout,
learning_rate=2e-5,
num_train_epochs=5,
weight_decay=0.01,
evaluation_strategy='epoch',
save_strategy='epoch',
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
# eval_steps=10,
save_total_limit=1,
load_best_model_at_end=True
)
trainer = Trainer(
model,
args=training_args,
train_dataset=tokenized_datasets['train'],
eval_dataset=tokenized_datasets['validation'],
data_collator=DataCollatorWithPadding(tokenizer),
tokenizer=tokenizer,
compute_metrics=compute_metrics
)
trainer.train()
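For reference, the script takes the training file and the transformer checkpoint name as positional arguments; a typical invocation (also present, commented out, in finetuning_batch.sh below) would be:
python3 finetune_bert.py ./data/Books/training_data.txt roberta-base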

11
Ordinal/finetuning_batch.sh Executable file
View File

@@ -0,0 +1,11 @@
#!/bin/bash
set -x
#conda activate torch
transformer=roberta-base
#python3 finetune_bert.py ./data/Books/training_data.txt $transformer
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned last
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned average
PYTHONPATH=.:.. python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned posteriors

View File

@@ -8,25 +8,34 @@ from Ordinal.main import quantifiers
from Ordinal.tabular import Table
domain = 'Books-tfidf'
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'
resultpath = join('./results', domain, prot)
resultpath_bertlast = join('./results', domain_bert_last, prot)
resultpath_bertave = join('./results', domain_bert_ave, prot)
methods = [qname for qname, *_ in quantifiers()]
# methods += [m+'-r' for m in methods]
methods_Rlast = [m+'-RoBERTa-last' for m in methods]
methods_Rave = [m+'-RoBERTa-average' for m in methods]
methods = methods + methods_Rlast + methods_Rave
methods += [m+'-r' for m in methods]
table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)
resultfiles = list(glob(f'{resultpath}/*.csv')) + list(glob(f'{resultpath_bertlast}/*.csv')) + list(glob(f'{resultpath_bertave}/*.csv'))
for resultfile in glob(f'{resultpath}/*.csv'):
for resultfile in resultfiles:
df = pd.read_csv(resultfile)
nmd = df['nmd'].values
resultname = Path(resultfile).name
method, drift, *other = resultname.replace('.csv', '').split('.')
if other:
continue
method += '-r'
if method not in methods:
continue
table.add(drift, method, nmd)

View File

@@ -0,0 +1,145 @@
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from os.path import join
import os
import shutil
from tqdm import tqdm
from Ordinal.utils import load_samples_folder, load_single_sample_as_csv
def tokenize_function(example):
tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt')
return {
'input_ids': tokens.input_ids.cuda(),
'attention_mask': tokens.attention_mask.cuda()
}
def save_samples_as_txt(tensors, labels, path):
vectors = tensors
labels = labels.values
vec_lab = np.hstack([labels, vectors])
n_cols = vectors.shape[1]
np.savetxt(path, vec_lab, fmt=['%d']+['%f']*n_cols)
def transform_sample(instances, labels, outpath, batch_size=50):
ndocs = len(labels)
batches = ndocs // batch_size
assert ndocs % batch_size == 0, 'fragmented last batch not supported'
transformations = []
for batch_id in range(0, ndocs, batch_size):
batch_instances = instances[batch_id:batch_id + batch_size]
tokenized_dataset = tokenize_function(batch_instances)
out = model(**tokenized_dataset, output_hidden_states=True)
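# depending on generation_mode, each document is mapped to one of:
#   'posteriors': the softmax of the classifier logits (class posterior probabilities)
#   'last': the embedding of the first token ([CLS]/<s>) in the last hidden layer
#   'average': the first-token embedding averaged across all hidden layers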
if generation_mode == 'posteriors':
logits = out.logits
posteriors = torch.softmax(logits, dim=-1)
transformed = posteriors
elif generation_mode == 'last':
hidden_states = out.hidden_states
last_layer_cls = hidden_states[-1][:, 0, :]
transformed = last_layer_cls
elif generation_mode == 'average':
hidden_states = out.hidden_states
hidden_states = torch.stack(hidden_states)
all_layer_cls = hidden_states[:, :, 0, :]
average_cls = torch.mean(all_layer_cls, dim=0)
transformed = average_cls
else:
raise NotImplementedError()
transformations.append(transformed.cpu().numpy())
transformations = np.vstack(transformations)
save_samples_as_txt(transformations, labels, outpath)
def transform_folder_samples(protocol, splitname):
in_folder = join(datapath, domain, protocol, splitname)
out_folder = join(datapath, outname, protocol, splitname)
total = 1000 if splitname.startswith('dev') else 5000
for i, (instances, labels) in tqdm(enumerate(
load_samples_folder(in_folder, load_fn=load_single_sample_as_csv)), desc=f'{protocol} {splitname}', total=total):
transform_sample(instances, labels, outpath=join(out_folder, f'{i}.txt'))
def get_best_checkpoint(checkpointdir):
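# with save_total_limit=1 and load_best_model_at_end=True (as in finetune_bert.py), the checkpoint folder
# is expected to contain at most two checkpoints: the best one and the last one; the best one has the
# smaller step number, hence min(steps) below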
from glob import glob
steps = []
for folder in glob(f'{checkpointdir}/checkpoint-*'):
step=int(folder.split('checkpoint-')[1])
steps.append(step)
assert len(steps) <= 2, 'unexpected number of steps, only two were expected (the best one and the last one)'
chosen = f'{checkpointdir}/checkpoint-{min(steps)}'
return chosen
if __name__ == '__main__':
debug = False
assert torch.cuda.is_available(), 'cuda is not available'
checkpoint='roberta-base-finetuned'
generation_mode = 'posteriors'
# n_args = len(sys.argv)
# assert n_args==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
# '\tgeneration-mode: last (last layer), ave (average pooling), or posteriors (posterior probabilities)'
# checkpoint = sys.argv[1] #e.g., 'bert-base-uncased'
# generation_mode = sys.argv[2] # e.g., 'last'
assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'
checkpoint = get_best_checkpoint(checkpoint)
num_labels = 5
datapath = './data'
domain = 'Books'
protocols = ['app'] # ['app', 'npp']
assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_mode'
outname = domain + f'-{checkpoint}-{generation_mode}'
with torch.no_grad():
print('loading', checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()
os.makedirs(join(datapath, outname), exist_ok=True)
print('transforming the training set')
instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
print('[done]')
for protocol in protocols:
in_path = join(datapath, domain, protocol)
out_path = join(datapath, outname, protocol)
os.makedirs(out_path, exist_ok=True)
os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))
print('processing', protocol)
transform_folder_samples(protocol, 'dev_samples')
transform_folder_samples(protocol, 'test_samples')

View File

@@ -9,7 +9,7 @@ from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
import os
from utils import load_samples, load_samples_pkl
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
from evaluation import nmd, mnmd
from time import time
import pickle
@@ -25,22 +25,6 @@ import mord
# add drift='all'
def load_test_samples():
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
ids = set(ids)
pklpath = join(datapath, domain, protocol, 'test_samples')
for sample in tqdm(load_samples_pkl(pklpath, filter=ids), total=len(ids)):
yield sample.instances, sample.prevalence()
def load_dev_samples():
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
ids = set(ids)
pklpath = join(datapath, domain, protocol, 'dev_samples')
for sample in tqdm(load_samples_pkl(pklpath, filter=ids), total=len(ids)):
yield sample.instances, sample.prevalence()
def quantifiers():
params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
# params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
@@ -58,21 +42,20 @@ def quantifiers():
# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
#yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
#yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
#yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
#yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
#yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
#yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
# other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
# I am using my own implementation, which caters for predict_proba (probability mass split by linear distance between the two closest classes, 0 in the rest)
# the other implementation offers OrdinalRidge(alpha=1.0) and LAD(C=1.0), wrapped in my classes so as to expose nclasses_; those
# implement neither predict_proba nor decision_score
yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
yield 'CC-bal(SVR)', CC(RegressorClassifier()), params_SVR
# yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
#yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
#yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
# yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
# yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
# yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR
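As a side note on the comment above: the predict_proba strategy it describes (probability mass split between the two classes closest to the regressor output, proportionally to linear distance, and 0 elsewhere) can be sketched as a standalone function; this is an illustrative reconstruction, not the project's actual RegressorClassifier:
import numpy as np
def ordinal_posteriors(scores, n_classes):
    # scores: continuous regressor outputs; classes assumed to be the integers 0..n_classes-1
    P = np.zeros((len(scores), n_classes))
    for i, s in enumerate(scores):
        s = min(max(s, 0), n_classes - 1)   # clip to the class range
        lo, hi = int(np.floor(s)), int(np.ceil(s))
        if lo == hi:
            P[i, lo] = 1.0                  # exactly on a class
        else:
            P[i, lo] = hi - s               # closer to lo -> more mass on lo
            P[i, hi] = s - lo               # closer to hi -> more mass on hi
    return P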
@@ -82,6 +65,7 @@ def quantifiers():
def run_experiment(params):
qname, q, param_grid, drift = params
qname += posfix
resultfile = join(resultpath, f'{qname}.{drift}.csv')
if os.path.exists(resultfile):
print(f'result file {resultfile} already exists: continue')
@@ -89,6 +73,22 @@ def run_experiment(params):
print(f'fitting {qname} for {drift}-drift')
def load_test_samples():
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
ids = set(ids)
folderpath = join(datapath, domain, protocol, 'test_samples')
for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
yield sample.instances, sample.prevalence()
def load_dev_samples():
ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
ids = set(ids)
folderpath = join(datapath, domain, protocol, 'dev_samples')
for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
yield sample.instances, sample.prevalence()
q = qp.model_selection.GridSearchQ(
q,
param_grid,
@@ -125,17 +125,29 @@ def run_experiment(params):
if __name__ == '__main__':
#preprocessing = 'roberta.last'
preprocessing = 'roberta.average'
#preprocessing = 'tfidf'
if preprocessing=='tfidf':
domain = 'Books-tfidf'
posfix = ''
elif preprocessing=='roberta.last':
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
posfix = '-RoBERTa-last'
elif preprocessing=='roberta.average':
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
posfix = '-RoBERTa-average'
load_sample_fn = load_single_sample_pkl
datapath = './data'
protocol = 'app'
resultpath = join('./results', domain, protocol)
os.makedirs(resultpath, exist_ok=True)
train = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))
train = load_sample_fn(join(datapath, domain), 'training_data')
with open(join(resultpath, 'hyper.txt'), 'at') as foo:
for drift in ['low', 'mid', 'high', 'all']:
params = [(*qs, drift) for qs in quantifiers()]
#for drift in [f'smooth{i}' for i in range(5)] + ['all']:
params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
for h in hypers:
if h is not None:

View File

@@ -1,7 +1,7 @@
import numpy as np
import quapy as qp
from Ordinal.evaluation import nmd
from Ordinal.utils import load_samples_pkl
from evaluation import nmd
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from quapy.data import LabelledCollection
import pickle
import os
@@ -13,7 +13,8 @@ def partition_by_drift(split, training_prevalence):
assert split in ['dev', 'test'], 'invalid split name'
total=1000 if split=='dev' else 5000
drifts = []
for sample in tqdm(load_samples_pkl(join(datapath, domain, 'app', f'{split}_samples')), total=total):
folderpath = join(datapath, domain, 'app', f'{split}_samples')
for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
drifts.append(nmd(training_prevalence, sample.prevalence()))
drifts = np.asarray(drifts)
order = np.argsort(drifts)
@@ -34,7 +35,7 @@ def partition_by_drift(split, training_prevalence):
print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')
domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'
training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))

View File

@@ -0,0 +1,36 @@
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from os.path import join
from tqdm import tqdm
def partition_by_smoothness(split):
assert split in ['dev', 'test'], 'invalid split name'
total=1000 if split=='dev' else 5000
smooths = []
folderpath = join(datapath, domain, 'app', f'{split}_samples')
for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
smooths.append(smoothness(sample.prevalence()))
smooths = np.asarray(smooths)
order = np.argsort(smooths)
nD = len(order)
low2high_smooth = np.array_split(order, 5)
all_drift = np.arange(nD)
for i, smooth_idx in enumerate(low2high_smooth):
block = smooths[smooth_idx]
print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)
np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_drift)
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'
#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))
partition_by_smoothness('dev')
partition_by_smoothness('test')

View File

@@ -4,7 +4,7 @@ from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples
from utils import load_samples_raw
from tqdm import tqdm
import shutil
@@ -40,7 +40,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic
def transform_folder_samples(protocol, splitname):
for i, sample in tqdm(enumerate(load_samples(join(datapath, domain, protocol, splitname), classes=train.classes_))):
for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
sample.instances = tfidf.transform(sample.instances)
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

View File

@@ -0,0 +1,47 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import *
from tqdm import tqdm
import shutil
vector_generation = 'average'
datapath = './data'
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
outname = domain.replace('-finetuned', '-finetuned-pkl')
protocol = 'app'
print('pickling npy txt files')
print('from:', join(datapath, domain))
print('to', join(datapath, outname))
print('for protocol:', protocol)
os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, protocol), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))
train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
def transform_folder_samples(protocol, splitname):
folder_dir=join(datapath, domain, protocol, splitname)
for i, sample in tqdm(enumerate(load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_))):
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
transform_folder_samples(protocol, 'dev_samples')
transform_folder_samples(protocol, 'test_samples')

View File

@@ -0,0 +1,54 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples_raw
from tqdm import tqdm
import shutil
datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'
def save_preprocessing_info(transformer):
with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
foo.write(f'{str(transformer)}\n')
os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)
train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
def transform_folder_samples(protocol, splitname):
for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
sample.instances = tfidf.transform(sample.instances)
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('npp', 'dev_samples')
transform_folder_samples('npp', 'test_samples')

View File

@@ -1,42 +1,64 @@
import quapy as qp
from quapy.data import LabelledCollection
import numpy as np
from glob import glob
from json import load
import os
from os.path import join
import pickle
import pandas as pd
import csv
import datasets
from datasets import Dataset
import quapy as qp
from quapy.data import LabelledCollection
def load_samples(path_dir, classes):
nsamples = len(glob(join(path_dir, f'*.txt')))
for id in range(nsamples):
yield LabelledCollection.load(join(path_dir, f'{id}.txt'), loader_func=qp.data.reader.from_text, classes=classes)
def load_simple_sample_npytxt(parentdir, filename, classes=None):
samplepath = join(parentdir, filename+'.txt')
yX = np.loadtxt(samplepath)
X = yX[:,1:]
y = yX[:,0].astype(np.int32)
return LabelledCollection(instances=X, labels=y, classes_=classes)
def load_samples_as_csv(path_dir, debug=False):
import pandas as pd
import csv
import datasets
from datasets import Dataset
def load_simple_sample_raw(parentdir, filename, classes=None):
samplepath = join(parentdir, filename+'.txt')
return LabelledCollection.load(samplepath, loader_func=qp.data.reader.from_text, classes=classes)
nsamples = len(glob(join(path_dir, f'*.txt')))
for id in range(nsamples):
df = pd.read_csv(join(path_dir, f'{id}.txt'), sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
def load_single_sample_as_csv(parentdir, filename):
samplepath = join(parentdir, filename+'.txt')
df = pd.read_csv(samplepath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
labels = df.pop('labels').to_frame()
X = df
features = datasets.Features({'review': datasets.Value('string')})
if debug:
sample = Dataset.from_pandas(df=X, features=features).select(range(50))
labels = labels[:50]
else:
sample = Dataset.from_pandas(df=X, features=features)
sample = Dataset.from_pandas(df=df, features=features)
yield sample, labels
return sample, labels
def load_samples_pkl(path_dir, filter=None):
nsamples = len(glob(join(path_dir, f'*.pkl')))
def load_single_sample_pkl(parentdir, filename):
return pickle.load(open(join(parentdir, filename+'.pkl'), 'rb'))
# def load_samples_npytxt(path_dir, filter=None, classes=None):
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)
# def load_samples_raw(path_dir, filter=None, classes=None):
# return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, load_fn_kwargs={'classes': classes})
# def load_samples_as_csv(path_dir, filter=None):
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)
# def load_samples_pkl(path_dir, filter=None):
# return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)
def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
nsamples = len(glob(join(path_dir, f'*')))
for id in range(nsamples):
if (filter is None) or id in filter:
yield pickle.load(open(join(path_dir, f'{id}.pkl'), 'rb'))
yield load_fn(path_dir, f'{id}', **load_fn_kwargs)
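A usage sketch of this generic loader, mirroring the call sites in the other scripts (the folder path is illustrative):
folder = './data/Books-tfidf/app/dev_samples'  # illustrative path
for sample in load_samples_folder(folder, load_fn=load_single_sample_pkl):
    print(sample.prevalence())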