forked from moreo/QuaPy

commit 85abaf2ba2 (parent b4c3e57343)
scripts using QuaPy
@ -18,13 +18,6 @@ te_size = 1000
nval = 1000
nte = 5000

# domain = 'Gift_Cards'
# tr_size = 200
# val_size = 100
# te_size = 100
# nval = 20
# nte = 40


def from_gz_text(path, encoding='utf-8', class2int=True):
    """
@ -0,0 +1,116 @@
import gzip
import quapy as qp
import numpy as np
import pandas as pd
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import pickle


datadir = '../OrdinalQuantification'
outdir = './data/'
domain = 'fact'
seed = 7

tr_size = 20000
val_size = 1000
te_size = 1000
nval = 1000
nte = 5000


def from_csv(path):
    df = pd.read_csv(path)

    # divide the continuous labels into ordered classes
    energy_boundaries = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1]
    y = np.digitize(np.array(df['log10_energy'], dtype=np.float32), energy_boundaries)

    # note: omitting the dtype will result in a single instance having a different class

    # obtain a matrix of shape (n_samples, n_features)
    X = df.iloc[:, 1:].to_numpy().astype(np.float32)
    return X, y

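A quick illustrative check (not part of the committed script) of how the binning above maps the continuous log10-energy onto ordered classes:

# illustrative only: same boundaries as from_csv(), applied to a few made-up energy values
import numpy as np
energy_boundaries = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1]
log10_energy = np.array([2.3, 2.6, 3.1, 4.1], dtype=np.float32)     # hypothetical values
print(np.digitize(log10_energy, energy_boundaries))                 # ordinal class indices (0 = lowest-energy bin)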

def write_pkl(sample: LabelledCollection, path):
    os.makedirs(Path(path).parent, exist_ok=True)
    pickle.dump(sample, open(path, 'wb'), pickle.HIGHEST_PROTOCOL)


def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)):
            sample = pool.sampling(sample_size, *prev)
            write_pkl(sample, join(outdir, f'{i}.pkl'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')


def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)):
            write_pkl(sample, join(outdir, f'{i}.pkl'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')

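The APP protocol above draws the target prevalence of each sample uniformly at random from the simplex; a minimal illustrative check (not in the commit, assuming the same QuaPy version):

# illustrative only: each row is a prevalence vector that sums to 1
import quapy.functional as F
prevs = F.uniform_simplex_sampling(n_classes=3, size=5)   # shape (5, 3)
print(prevs.round(3))
print(prevs.sum(axis=1))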

fullpath = join(datadir,domain, 'fact_wobble.csv')

data = LabelledCollection.load(fullpath, from_csv)

if np.isnan(data.instances).any():
    rows, cols = np.where(np.isnan(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted nan rows')

if np.isnan(data.instances).any():
    rows, cols = np.where(np.isnan(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted nan rows')

if np.isinf(data.instances).any():
    rows, cols = np.where(np.isinf(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted inf rows')


print(len(data))
print(data.classes_)
print(data.prevalence())

with qp.util.temp_seed(seed):
    train, rest = data.split_stratified(train_prop=tr_size)

    devel, test = rest.split_stratified(train_prop=0.5)
    print(len(train))
    print(len(devel))
    print(len(test))

    domaindir = join(outdir, domain)

    write_pkl(train, join(domaindir, 'training_data.pkl'))
    write_pkl(devel, join(domaindir, 'development_data.pkl'))
    write_pkl(test, join(domaindir, 'test_data.pkl'))

    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))

    gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
                    prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
                    prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
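For reference, the *_prevalences.txt files written above are plain CSV: a header row with the class labels, then one row of true prevalence values per generated sample, roughly of the form (values made up for illustration):

id,0,1,2,...
0,0.113,0.427,0.460,...
1,0.052,0.301,0.647,...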
@ -1,11 +1,11 @@
import numpy as np


# smoothing approximation
def smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

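smoothness() is half the sum of squared second differences of the prevalence vector, so a perfectly flat vector scores 0; an equivalent vectorized sketch (illustrative only, not part of the commit):

# illustrative only: np.diff(p, n=2) gives p[i] - 2*p[i+1] + p[i+2], whose square matches the term above
import numpy as np

def smoothness_np(p):
    return 0.5 * np.sum(np.diff(np.asarray(p), n=2) ** 2)

print(smoothness_np([0.2, 0.2, 0.2, 0.2, 0.2]))   # 0.0 (perfectly smooth)
print(smoothness_np([0.6, 0.0, 0.4, 0.0, 0.0]))   # 0.9 (jagged)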

def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim==1:
@ -13,6 +13,7 @@ def _check_arrays(prevs):
    return prevs


# mean normalized match distance
def mnmd(prevs, prevs_hat):
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
@ -22,6 +23,7 @@ def mnmd(prevs, prevs_hat):
    return np.mean(nmds)


# normalized match distance
def nmd(prev, prev_hat):
    n = len(prev)
    return (1./(n-1))*mdpa(prev, prev_hat)
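mdpa() is defined elsewhere in this module; for ordinal histograms the match distance it computes is commonly the L1 distance between the two cumulative distributions. Under that assumption, a self-contained sketch of nmd (illustrative only) is:

# illustrative only: assumes mdpa(p, q) == sum(|cumsum(p) - cumsum(q)|)
import numpy as np

def nmd_sketch(prev, prev_hat):
    n = len(prev)
    mdpa_val = np.abs(np.cumsum(prev) - np.cumsum(prev_hat)).sum()
    return mdpa_val / (n - 1)

print(nmd_sketch([0.1, 0.2, 0.7], [0.2, 0.2, 0.6]))   # 0.1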
@ -0,0 +1,150 @@
import numpy as np
import quapy as qp
import os
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from Ordinal.model import RegressionQuantification, LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
from os.path import join
from utils import load_samples_folder, load_single_sample_pkl
from evaluation import nmd, mnmd
from tqdm import tqdm


"""
This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with
standard logistic regression against quantifiers equipped with order-aware classifiers
"""

def quantifiers():
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize':[True,False]}

    # baselines
    yield 'CC(LR)', CC(LogisticRegression()), params_LR
    yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
    yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
    yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR

    yield 'CC(OLR-SE)', CC(LogisticSE()), params_OLR
    yield 'PCC(OLR-SE)', PCC(LogisticSE()), params_OLR
    yield 'ACC(OLR-SE)', ACC(LogisticSE()), params_OLR
    yield 'PACC(OLR-SE)', PACC(LogisticSE()), params_OLR
    yield 'SLD(OLR-SE)', EMQ(LogisticSE()), params_OLR

    yield 'CC(OLR-IT)', CC(LogisticIT()), params_OLR
    yield 'PCC(OLR-IT)', PCC(LogisticIT()), params_OLR
    yield 'ACC(OLR-IT)', ACC(LogisticIT()), params_OLR
    yield 'PACC(OLR-IT)', PACC(LogisticIT()), params_OLR
    yield 'SLD(OLR-IT)', EMQ(LogisticIT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(LAD)', CC(LAD()), params_SVR
    yield 'ACC(LAD)', ACC(LAD()), params_SVR
    yield 'CC(ORidge)', CC(OrdinalRidge()), params_Ridge
    yield 'ACC(ORidge)', ACC(OrdinalRidge()), params_Ridge

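quantifiers() yields (name, quantifier, hyperparameter-grid) triples that run_experiment below consumes; a minimal illustrative loop over the generator (not part of the committed script):

for qname, quantifier, grid in quantifiers():
    print(qname, type(quantifier).__name__, sorted(grid.keys()))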

def run_experiment(params):
    qname, q, param_grid = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.all.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for all-drift')


    def load_test_samples():
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=5000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()


    def load_dev_samples():
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=1000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()

    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        timeout=60*60*2,
        verbose=True).fit(train)

    hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'

    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    print('[learning regressor-based adjustment]')
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.all.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams


if __name__ == '__main__':
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    #domain = 'Books-tfidf'
    posfix = ''

    # domain = 'fact'
    # posfix = '-std' # set to '' to avoid standardization
    # posfix = ''

    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    if posfix=='-std':
        zscore = StandardScaler()
        train.instances = zscore.fit_transform(train.instances)

    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')
@ -1,16 +1,26 @@
import csv
import sys
import numpy as np
import datasets
import numpy as np
import pandas as pd
import torch.cuda
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer
from datasets import list_datasets, list_metrics, load_dataset, Dataset, DatasetDict, load_metric
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
import pandas as pd
import csv
from transformers import TrainingArguments


"""
This script fine-tunes a pre-trained language model on a given textual training set.
The training runs for a maximum of 5 epochs, but stores the model parameters of the best-performing epoch according
to the validation loss on a stratified hold-out val split of 1000 documents.

We used it with RoBERTa on the training set of the Amazon-OQ-BK domain, i.e.:
$> python3 finetune_bert.py ./data/Books/training_data.txt roberta-base
"""


def tokenize_function(example):
@ -31,13 +41,13 @@ if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    datapath = './data/Books/training_data.txt'
    checkpoint = 'roberta-base'
    # n_args = len(sys.argv)
    # assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'
    # datapath = './data/Books/training_data.txt'
    # checkpoint = 'roberta-base'
    n_args = len(sys.argv)
    assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'

    # datapath = sys.argv[1] # './data/Books/training_data.txt'
    # checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
    datapath = sys.argv[1] # './data/Books/training_data.txt'
    checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'

    modelout = checkpoint+'-val-finetuned'

@ -60,14 +70,6 @@ if __name__ == '__main__':
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    print(tokenized_datasets)
    print(tokenized_datasets['train'][0]['labels'])
    print(tokenized_datasets['train'][0]['review'])
    print(tokenized_datasets['train'][0]['input_ids'])
    print(len(tokenized_datasets['train'][0]['input_ids']))
    # print(tokenized_datasets['train'][0]['token_type_ids'])
    # print(tokenized_datasets['train'][0]['attention_mask'])

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

    # fine-tuning
@ -1,11 +0,0 @@
#!/bin/bash
set -x

#conda activate torch

transformer=roberta-base

#python3 finetune_bert.py ./data/Books/training_data.txt $transformer
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned last
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned average
PYTHONPATH=.:.. python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned posteriors
@ -7,6 +7,10 @@ from pathlib import Path
from Ordinal.main import quantifiers
from Ordinal.tabular import Table

"""
This script generates some tables for Amazon-OQ-BK (for internal use only)
"""

domain = 'Books-tfidf'
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
@ -34,7 +38,6 @@ resultfiles = list(glob(f'{resultpath}/*.csv')) \
              + list(glob(f'{resultpath_bertave}/*.csv')) \
              + list(glob(f'{resultpath_bertpost}/*.csv'))


for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
@ -0,0 +1,82 @@
import pandas as pd
from os.path import join
import os
from glob import glob
from pathlib import Path

from Ordinal.experiments_lr_vs_ordlr import quantifiers
from Ordinal.tabular import Table

"""
This script generates some tables for Fact-OQ (for internal use only)
"""

#domain = 'fact'
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'

resultpath = join('./results', domain, prot)

withstd=False

methods = [qname for qname, *_ in quantifiers()]
if withstd:
    methods = [m+'-std' for m in methods]
#methods = methods + methods_variant
# methods += [m+'-r' for m in methods]

quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
if withstd:
    method_variants = [m+'-std' for m in method_variants]

print('families:', quantifiers_families)
print('variants', method_variants)
table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
              color=False, show_rel_to=0, missing_str='\multicolumn{1}{c}{---}', clean_zero=True)

resultfiles = list(glob(f'{resultpath}/*).all.csv'))

for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name

    method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average','').split('.')
    if drift!='all':
        continue
    if other:
        method += '-r'
    if method not in methods:
        continue

    family, variant = method.split('(')
    variant = variant.replace(')', '')
    if variant not in method_variants:
        continue
    table.add(family, variant, nmd)

os.makedirs(Path(outpath).parent, exist_ok=True)

tabular = """
\\resizebox{\\textwidth}{!}{%

\\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """}
\\toprule
"""

tabular += table.latexTabularT(average=False)
tabular += """
\end{tabular}%
}"""

print('saving table in', outpath)
with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')

print('[done]')
@ -12,6 +12,11 @@ from tqdm import tqdm
from Ordinal.utils import load_samples_folder, load_single_sample_as_csv


"""
This script takes a pre-trained model (a fine-tuned one) and generates numerical representations for all
samples in the dataset. The representations are saved in npy-txt plain format.
"""

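The "npy-txt plain format" is not spelled out in this hunk; presumably it is a plain-text dump of the dense matrix, one whitespace-separated row per document, along the lines of (illustrative assumption only):

# illustrative only: hypothetical dump of an (n_docs, dim) float matrix as plain text
import numpy as np
X = np.random.rand(3, 768).astype(np.float32)
np.savetxt('0.txt', X, fmt='%.6f')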

def tokenize_function(example):
    tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt')
@ -1,16 +0,0 @@
import quapy as qp
from quapy.data import LabelledCollection
from quapy.data.reader import from_text
from quapy.functional import strprev

category = 'Books'
datadir = './data'

training_path = f'{datadir}/{category}/training_data.txt'

data = LabelledCollection.load(training_path, loader_func=from_text)

print(len(data))
print(strprev(data.prevalence()))
@ -3,8 +3,7 @@ from sklearn.linear_model import LogisticRegression
import quapy as qp
import numpy as np

from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, \
    LogisticAT
from Ordinal.model import OrderedLogisticRegression, LogisticAT
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
@ -17,13 +16,6 @@ from tqdm import tqdm
import mord


#TODO:
# Ordinal LR, LAD -> balance sample_weight
# use BERT to extract features
# other domains? Kitchen, Electronics...
# try with the inverse of the distance
# add drift='all'


def quantifiers():
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
@ -126,8 +118,8 @@ def run_experiment(params):

if __name__ == '__main__':
    #preprocessing = 'roberta.last'
    # preprocessing = 'roberta.average'
    preprocessing = 'roberta.posteriors'
    preprocessing = 'roberta.average'
    # preprocessing = 'roberta.posteriors'
    #preprocessing = 'tfidf'
    if preprocessing=='tfidf':
        domain = 'Books-tfidf'
117 Ordinal/model.py
@ -1,17 +1,12 @@
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel
import mord
import numpy as np
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.utils.class_weight import compute_class_weight
from statsmodels.miscmodels.ordinal_model import OrderedModel


class OrderedLogisticRegression:
@ -38,103 +33,6 @@ class OrderedLogisticRegression:
        return self.res_prob.model.predict(self.res_prob.params, exog=X)


class StackedClassifier:  # aka Funnelling Monolingual
    def __init__(self, base_estimator=LogisticRegression()):
        if not hasattr(base_estimator, 'predict_proba'):
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        # self.base = deepcopy(OneVsRestClassifier(base_estimator))
        # self.meta = deepcopy(OneVsRestClassifier(base_estimator))
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        self.norm = StandardScaler()

    def fit(self, X, y):
        self.base.fit(X, y)
        P = self.base.predict_proba(X)
        P = self.norm.fit_transform(P)
        self.meta.fit(P, y)
        return self

    def predict(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict(P)

    def predict_proba(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict_proba(P)

class RegressionQuantification:
    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):

        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm) <- good
        #self.reg = LinearRegression(normalize=norm) # <- good
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(SGDRegressor()) # slow, does not work
        self.regression = regression
        self.val_samples_generator = val_samples_generator
        # self.norm = StandardScaler()
        # self.covs = covs

    def generate_validation_samples(self):
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        return Xs, ys

    def fit(self, data):
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        # Xs = self.norm.fit_transform(Xs)
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs).flatten()
        # Xs = self.norm.inverse_transform(Xs)
        Xs = np.clip(Xs, 0, 1)
        adjusted = Xs / Xs.sum()
        # adjusted = np.clip(Xs, 0, 1)
        adjusted = adjusted
        return adjusted

    def get_params(self, deep=True):
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        self.base_quantifier.set_params(**params)


class LAD(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, class_weight=None):
        self.C = C
@ -238,6 +136,7 @@ class OrdinalRidge(BaseEstimator, ClassifierMixin):
        self.class_weight = params['class_weight']
        self.normalize = params['normalize']


# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):
@ -9,6 +9,12 @@ from os.path import join
from tqdm import tqdm


"""
This script generates a partition of a dataset in terms of "shift".
The partition is only carried out by generating index vectors.
"""


def partition_by_drift(split, training_prevalence):
    assert split in ['dev', 'test'], 'invalid split name'
    total=1000 if split=='dev' else 5000
@ -1,11 +1,16 @@
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl

from os.path import join
from tqdm import tqdm


"""
This script generates a partition of a dataset in terms of "smoothness".
The partition is only carried out by generating index vectors.
"""


def partition_by_smoothness(split):
    assert split in ['dev', 'test'], 'invalid split name'
    total=1000 if split=='dev' else 5000
@ -1,54 +0,0 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples_raw
from tqdm import tqdm
import shutil


datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'

def save_preprocessing_info(transformer):
    with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
        foo.write(f'{str(transformer)}\n')


os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))

tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)

train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


def transform_folder_samples(protocol, splitname):
    for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('npp', 'dev_samples')
transform_folder_samples('npp', 'test_samples')
@ -8,6 +8,11 @@ from utils import *
from tqdm import tqdm
import shutil

"""
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into dense vectors
extracted from a pretrained model (here we use RoBERTa fine-tuned on the training set).
Three vector generation modes are available: posteriors, last, average.
"""

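The hunk does not spell out how the three modes are computed; a rough sketch of what they typically denote for a HuggingFace sequence classifier (an assumption for illustration, not the committed code):

# illustrative only: typical meaning of 'posteriors', 'last' and 'average' document vectors
import torch

def document_vector(model, inputs, mode):
    out = model(**inputs, output_hidden_states=True)
    if mode == 'posteriors':                  # posterior probabilities over the classes
        return torch.softmax(out.logits, dim=-1)
    hidden = out.hidden_states[-1]            # (batch, tokens, dim) activations of the last layer
    if mode == 'last':                        # embedding of the first ([CLS]/<s>) token
        return hidden[:, 0, :]
    if mode == 'average':                     # mean over the token embeddings
        return hidden.mean(dim=1)
    raise ValueError(mode)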
vector_generation = 'posteriors'
@ -29,7 +34,6 @@ os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))

train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
@ -1,14 +1,20 @@
import quapy as qp
from Ordinal.utils import load_simple_sample_raw
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples_raw
from tqdm import tqdm
import shutil


"""
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors.
"""


datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'
@ -40,7 +46,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic


def transform_folder_samples(protocol, splitname):
    for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
    for i, sample in tqdm(enumerate(load_simple_sample_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)