
scripts using QuaPy

Alejandro Moreo Fernandez 2022-03-31 18:46:56 +02:00
parent b4c3e57343
commit 85abaf2ba2
17 changed files with 418 additions and 234 deletions

View File

@ -18,13 +18,6 @@ te_size = 1000
nval = 1000
nte = 5000
# domain = 'Gift_Cards'
# tr_size = 200
# val_size = 100
# te_size = 100
# nval = 20
# nte = 40
def from_gz_text(path, encoding='utf-8', class2int=True):
"""

View File

@ -0,0 +1,116 @@
import gzip
import quapy as qp
import numpy as np
import pandas as pd
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import pickle
datadir = '../OrdinalQuantification'
outdir = './data/'
domain = 'fact'
seed = 7
tr_size = 20000
val_size = 1000
te_size = 1000
nval = 1000
nte = 5000
def from_csv(path):
df = pd.read_csv(path)
# divide the continuous labels into ordered classes
energy_boundaries = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1]
y = np.digitize(np.array(df['log10_energy'], dtype=np.float32), energy_boundaries)
# note: omitting the dtype will result in a single instance having a different class
# obtain a matrix of shape (n_samples, n_features)
X = df.iloc[:, 1:].to_numpy().astype(np.float32)
return X, y
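# Illustrative sketch (not part of the original script): np.digitize maps each continuous
# log10-energy value to the index of the ordered class it falls into, as delimited by
# `energy_boundaries` (2.55, 2.70, ..., 3.90 above, i.e., 11 ordered classes), e.g.:
#   np.digitize([2.50, 2.60, 3.10, 4.00], energy_boundaries)  ->  array([0, 1, 4, 10])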
def write_pkl(sample: LabelledCollection, path):
os.makedirs(Path(path).parent, exist_ok=True)
pickle.dump(sample, open(path, 'wb'), pickle.HIGHEST_PROTOCOL)
def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
os.makedirs(outdir, exist_ok=True)
with open(prevpath, 'wt') as prevfile:
prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)):
sample = pool.sampling(sample_size, *prev)
write_pkl(sample, join(outdir, f'{i}.pkl'))
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
os.makedirs(outdir, exist_ok=True)
with open(prevpath, 'wt') as prevfile:
prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)):
write_pkl(sample, join(outdir, f'{i}.pkl'))
prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')
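# Note (illustrative, not part of the original script): the two generators above differ only in how
# the prevalence of each sample is chosen.
# APP (artificial prevalence protocol) draws prevalence vectors uniformly at random from the unit
# simplex and re-samples the pool to match them, e.g.:
#   prevs = F.uniform_simplex_sampling(n_classes=pool.n_classes, size=3)  # 3 random prevalence vectors
#   sample = pool.sampling(500, *prevs[0])                                # 500 instances at that prevalence
# NPP (natural prevalence protocol) instead re-samples at the pool's observed prevalence:
#   for sample in pool.natural_sampling_generator(500, repeats=3): ...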
fullpath = join(datadir, domain, 'fact_wobble.csv')
data = LabelledCollection.load(fullpath, from_csv)
if np.isnan(data.instances).any():
rows, cols = np.where(np.isnan(data.instances))
data.instances = np.delete(data.instances, rows, axis=0)
data.labels = np.delete(data.labels, rows, axis=0)
print('deleted nan rows')
if np.isnan(data.instances).any():
rows, cols = np.where(np.isnan(data.instances))
data.instances = np.delete(data.instances, rows, axis=0)
data.labels = np.delete(data.labels, rows, axis=0)
print('deleted nan rows')
if np.isinf(data.instances).any():
rows, cols = np.where(np.isinf(data.instances))
data.instances = np.delete(data.instances, rows, axis=0)
data.labels = np.delete(data.labels, rows, axis=0)
print('deleted inf rows')
print(len(data))
print(data.classes_)
print(data.prevalence())
with qp.util.temp_seed(seed):
train, rest = data.split_stratified(train_prop=tr_size)
devel, test = rest.split_stratified(train_prop=0.5)
print(len(train))
print(len(devel))
print(len(test))
domaindir = join(outdir, domain)
write_pkl(train, join(domaindir, 'training_data.pkl'))
write_pkl(devel, join(domaindir, 'development_data.pkl'))
write_pkl(test, join(domaindir, 'test_data.pkl'))
gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
prevpath=join(domaindir, 'app', 'test_prevalences.txt'))
gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
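# For reference (summary of the calls above, not part of the original script), the resulting layout
# under ./data/fact/ is:
#   training_data.pkl, development_data.pkl, test_data.pkl
#   app/dev_samples/{0..nval-1}.pkl   and  app/dev_prevalences.txt
#   app/test_samples/{0..nte-1}.pkl   and  app/test_prevalences.txt
#   npp/dev_samples/, npp/test_samples/ and their prevalence files, analogously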

View File

@ -1,11 +1,11 @@
import numpy as np
# smoothing approximation
def smoothness(p):
return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
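# Illustrative example (not part of the original code): smoothness sums the squared second-order
# differences of the prevalence vector, so a flat vector scores 0 and a peaked one scores higher, e.g.:
#   smoothness([0.2, 0.2, 0.2, 0.2, 0.2])  ->  0.0
#   smoothness([0.1, 0.2, 0.4, 0.2, 0.1])  ->  0.5 * ((-0.1)**2 + 0.4**2 + (-0.1)**2) = 0.09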
def _check_arrays(prevs):
prevs = np.asarray(prevs)
if prevs.ndim==1:
@ -13,6 +13,7 @@ def _check_arrays(prevs):
return prevs
# mean normalized match distance
def mnmd(prevs, prevs_hat):
prevs = _check_arrays(prevs)
prevs_hat = _check_arrays(prevs_hat)
@ -22,6 +23,7 @@ def mnmd(prevs, prevs_hat):
return np.mean(nmds)
# normalized match distance
def nmd(prev, prev_hat):
n = len(prev)
return (1./(n-1))*mdpa(prev, prev_hat)
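# Note (illustrative, not part of the original code): assuming mdpa computes the match distance
# between the two ordinal prevalence vectors, dividing by n-1 normalizes it to [0, 1], since n-1 is
# the largest attainable distance (all probability mass moved from one extreme class to the other), e.g.:
#   nmd([1, 0, 0], [0, 0, 1])  ->  mdpa = 2, hence nmd = 2 / (3-1) = 1.0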

View File

@ -0,0 +1,150 @@
import numpy as np
import quapy as qp
import os
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from Ordinal.model import RegressionQuantification, LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
from os.path import join
from utils import load_samples_folder, load_single_sample_pkl
from evaluation import nmd, mnmd
from tqdm import tqdm
"""
This script generates all the results reported in Table 1 of the paper, i.e., all results comparing quantifiers
equipped with standard logistic regression against quantifiers equipped with order-aware classifiers.
"""
def quantifiers():
params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize':[True,False]}
# baselines
yield 'CC(LR)', CC(LogisticRegression()), params_LR
yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR
# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
yield 'CC(OLR-SE)', CC(LogisticSE()), params_OLR
yield 'PCC(OLR-SE)', PCC(LogisticSE()), params_OLR
yield 'ACC(OLR-SE)', ACC(LogisticSE()), params_OLR
yield 'PACC(OLR-SE)', PACC(LogisticSE()), params_OLR
yield 'SLD(OLR-SE)', EMQ(LogisticSE()), params_OLR
yield 'CC(OLR-IT)', CC(LogisticIT()), params_OLR
yield 'PCC(OLR-IT)', PCC(LogisticIT()), params_OLR
yield 'ACC(OLR-IT)', ACC(LogisticIT()), params_OLR
yield 'PACC(OLR-IT)', PACC(LogisticIT()), params_OLR
yield 'SLD(OLR-IT)', EMQ(LogisticIT()), params_OLR
# other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)
# regression-based ordinal regression (see https://pythonhosted.org/mord/)
yield 'CC(LAD)', CC(LAD()), params_SVR
yield 'ACC(LAD)', ACC(LAD()), params_SVR
yield 'CC(ORidge)', CC(OrdinalRidge()), params_Ridge
yield 'ACC(ORidge)', ACC(OrdinalRidge()), params_Ridge
def run_experiment(params):
qname, q, param_grid = params
qname += posfix
resultfile = join(resultpath, f'{qname}.all.csv')
if os.path.exists(resultfile):
print(f'result file {resultfile} already exists: skipping')
return None
print(f'fitting {qname} for all-drift')
def load_test_samples():
folderpath = join(datapath, domain, protocol, 'test_samples')
for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=5000):
if posfix == '-std':
sample.instances = zscore.transform(sample.instances)
yield sample.instances, sample.prevalence()
def load_dev_samples():
folderpath = join(datapath, domain, protocol, 'dev_samples')
for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=1000):
if posfix == '-std':
sample.instances = zscore.transform(sample.instances)
yield sample.instances, sample.prevalence()
q = qp.model_selection.GridSearchQ(
q,
param_grid,
sample_size=1000,
protocol='gen',
error=mnmd,
val_split=load_dev_samples,
n_jobs=-1,
refit=False,
timeout=60*60*2,
verbose=True).fit(train)
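# Note (descriptive comment, not part of the original script): with protocol='gen', GridSearchQ
# consumes val_split as a callable yielding (instances, prevalence) pairs, which is exactly what
# load_dev_samples above produces; each hyperparameter configuration is scored with mnmd over
# those generated validation samples.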
hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'
print('[done]')
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
mean_nmd = report['nmd'].mean()
std_nmd = report['nmd'].std()
print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
report.to_csv(resultfile, index=False)
print('[learning regressor-based adjustment]')
q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
q.fit(None)
report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
mean_nmd = report['nmd'].mean()
std_nmd = report['nmd'].std()
print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
resultfile = join(resultpath, f'{qname}.all.reg.csv')
report.to_csv(resultfile, index=False)
return hyperparams
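# Summary (not part of the original script): for each quantifier qname, run_experiment writes two
# reports under resultpath, '{qname}.all.csv' (model-selected quantifier) and '{qname}.all.reg.csv'
# (same quantifier with the regressor-based correction), and returns a tab-separated line with the
# selected hyperparameters, which the main block below appends to 'hyper.txt'.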
if __name__ == '__main__':
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
#domain = 'Books-tfidf'
posfix = ''
# domain = 'fact'
# posfix = '-std' # set to '' to avoid standardization
# posfix = ''
load_sample_fn = load_single_sample_pkl
datapath = './data'
protocol = 'app'
resultpath = join('./results', domain, protocol)
os.makedirs(resultpath, exist_ok=True)
train = load_sample_fn(join(datapath, domain), 'training_data')
if posfix=='-std':
zscore = StandardScaler()
train.instances = zscore.fit_transform(train.instances)
with open(join(resultpath, 'hyper.txt'), 'at') as foo:
hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
for h in hypers:
if h is not None:
foo.write(h)
foo.write('\n')

View File

@ -1,16 +1,26 @@
import csv
import sys
import numpy as np
import datasets
import numpy as np
import pandas as pd
import torch.cuda
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer
from datasets import list_datasets, list_metrics, load_dataset, Dataset, DatasetDict, load_metric
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
import pandas as pd
import csv
from transformers import TrainingArguments
"""
This script fine-tunes a pre-trained language model on a given textual training set.
The training runs for a maximum of 5 epochs, storing the model parameters of the best-performing epoch according
to the validation loss on a stratified hold-out validation split of 1000 documents.
We used it with RoBERTa on the training set of the Amazon-OQ-BK domain, i.e.:
$> python3 finetune_bert.py ./data/Books/training_data.txt roberta-base
"""
def tokenize_function(example):
@ -31,13 +41,13 @@ if __name__ == '__main__':
debug = False
assert torch.cuda.is_available(), 'cuda is not available'
datapath = './data/Books/training_data.txt'
checkpoint = 'roberta-base'
# n_args = len(sys.argv)
# assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'
# datapath = './data/Books/training_data.txt'
# checkpoint = 'roberta-base'
n_args = len(sys.argv)
assert n_args==3, 'wrong arguments, expected: <training-path> <transformer-name>'
# datapath = sys.argv[1] # './data/Books/training_data.txt'
# checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
datapath = sys.argv[1] # './data/Books/training_data.txt'
checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'
modelout = checkpoint+'-val-finetuned'
@ -60,14 +70,6 @@ if __name__ == '__main__':
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)
print(tokenized_datasets['train'][0]['labels'])
print(tokenized_datasets['train'][0]['review'])
print(tokenized_datasets['train'][0]['input_ids'])
print(len(tokenized_datasets['train'][0]['input_ids']))
# print(tokenized_datasets['train'][0]['token_type_ids'])
# print(tokenized_datasets['train'][0]['attention_mask'])
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()
# fine-tuning

View File

@ -1,11 +0,0 @@
#!/bin/bash
set -x
#conda activate torch
transformer=roberta-base
#python3 finetune_bert.py ./data/Books/training_data.txt $transformer
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned last
#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned average
PYTHONPATH=.:.. python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned posteriors

View File

@ -7,6 +7,10 @@ from pathlib import Path
from Ordinal.main import quantifiers
from Ordinal.tabular import Table
"""
This script generates some tables for Amazon-OQ-BK (for internal use only)
"""
domain = 'Books-tfidf'
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
@ -34,7 +38,6 @@ resultfiles = list(glob(f'{resultpath}/*.csv')) \
+ list(glob(f'{resultpath_bertave}/*.csv')) \
+ list(glob(f'{resultpath_bertpost}/*.csv'))
for resultfile in resultfiles:
df = pd.read_csv(resultfile)
nmd = df['nmd'].values

View File

@ -0,0 +1,82 @@
import pandas as pd
from os.path import join
import os
from glob import glob
from pathlib import Path
from Ordinal.experiments_lr_vs_ordlr import quantifiers
from Ordinal.tabular import Table
"""
This script generates some tables for Fact-OQ (for internal use only)
"""
#domain = 'fact'
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'
resultpath = join('./results', domain, prot)
withstd=False
methods = [qname for qname, *_ in quantifiers()]
if withstd:
methods = [m+'-std' for m in methods]
#methods = methods + methods_variant
# methods += [m+'-r' for m in methods]
quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
if withstd:
method_variants = [m+'-std' for m in method_variants]
print('families:', quantifiers_families)
print('variants', method_variants)
table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
color=False, show_rel_to=0, missing_str='\\multicolumn{1}{c}{---}', clean_zero=True)
resultfiles = list(glob(f'{resultpath}/*).all.csv'))
for resultfile in resultfiles:
df = pd.read_csv(resultfile)
nmd = df['nmd'].values
resultname = Path(resultfile).name
method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average','').split('.')
if drift!='all':
continue
if other:
method += '-r'
if method not in methods:
continue
family, variant = method.split('(')
variant = variant.replace(')', '')
if variant not in method_variants:
continue
table.add(family, variant, nmd)
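# For reference (not part of the original script): the result files parsed above follow the naming
# convention of run_experiment in the experiments script of this commit, e.g.:
#   'PACC(OLR-AT).all.csv'      ->  family='PACC', variant='OLR-AT', drift='all'
#   'PACC(OLR-AT).all.reg.csv'  ->  the same method, flagged with the '-r' (regression-corrected) suffix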
os.makedirs(Path(outpath).parent, exist_ok=True)
tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """}
\\toprule
"""
tabular += table.latexTabularT(average=False)
tabular += """
\end{tabular}%
}"""
print('saving table in', outpath)
with open(outpath, 'wt') as foo:
foo.write(tabular)
foo.write('\n')
print('[done]')

View File

@ -12,6 +12,11 @@ from tqdm import tqdm
from Ordinal.utils import load_samples_folder, load_single_sample_as_csv
"""
This script takes a pre-trained (fine-tuned) model and generates numerical representations for all
samples in the dataset. The representations are saved in npy-txt plain format.
"""
def tokenize_function(example):
tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt')

View File

@ -1,16 +0,0 @@
import quapy as qp
from quapy.data import LabelledCollection
from quapy.data.reader import from_text
from quapy.functional import strprev
category = 'Books'
datadir = './data'
training_path = f'{datadir}/{category}/training_data.txt'
data = LabelledCollection.load(training_path, loader_func=from_text)
print(len(data))
print(strprev(data.prevalence()))

View File

@ -3,8 +3,7 @@ from sklearn.linear_model import LogisticRegression
import quapy as qp
import numpy as np
from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, \
LogisticAT
from Ordinal.model import OrderedLogisticRegression, LogisticAT
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
@ -17,13 +16,6 @@ from tqdm import tqdm
import mord
#TODO:
# Ordinal LR, LAD -> balance sample_weight
# use BERT to extract features
# other domains? Kitchen, Electronics...
# try with the inverse of the distance
# add drift='all'
def quantifiers():
params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
@ -126,8 +118,8 @@ def run_experiment(params):
if __name__ == '__main__':
#preprocessing = 'roberta.last'
# preprocessing = 'roberta.average'
preprocessing = 'roberta.posteriors'
preprocessing = 'roberta.average'
# preprocessing = 'roberta.posteriors'
#preprocessing = 'tfidf'
if preprocessing=='tfidf':
domain = 'Books-tfidf'

View File

@ -1,17 +1,12 @@
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel
import mord
import numpy as np
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.utils.class_weight import compute_class_weight
from statsmodels.miscmodels.ordinal_model import OrderedModel
class OrderedLogisticRegression:
@ -38,103 +33,6 @@ class OrderedLogisticRegression:
return self.res_prob.model.predict(self.res_prob.params, exog=X)
class StackedClassifier: # aka Funnelling Monolingual
def __init__(self, base_estimator=LogisticRegression()):
if not hasattr(base_estimator, 'predict_proba'):
print('the estimator does not seem to be probabilistic: calibrating')
base_estimator = CalibratedClassifierCV(base_estimator)
# self.base = deepcopy(OneVsRestClassifier(base_estimator))
# self.meta = deepcopy(OneVsRestClassifier(base_estimator))
self.base = deepcopy(base_estimator)
self.meta = deepcopy(base_estimator)
self.norm = StandardScaler()
def fit(self, X, y):
self.base.fit(X, y)
P = self.base.predict_proba(X)
P = self.norm.fit_transform(P)
self.meta.fit(P, y)
return self
def predict(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict(P)
def predict_proba(self, X):
P = self.base.predict_proba(X)
P = self.norm.transform(P)
return self.meta.predict_proba(P)
class RegressionQuantification:
def __init__(self,
base_quantifier,
regression='svr',
val_samples_generator=None,
norm=True):
self.base_quantifier = base_quantifier
if isinstance(regression, str):
assert regression in ['ridge', 'svr'], 'unknown regression model'
if regression == 'ridge':
self.reg = Ridge(normalize=norm)
elif regression == 'svr':
self.reg = MultiOutputRegressor(LinearSVR())
else:
self.reg = regression
# self.reg = MultiTaskLassoCV(normalize=norm)
# self.reg = KernelRidge(kernel='rbf')
# self.reg = LassoLarsCV(normalize=norm)
# self.reg = MultiTaskElasticNetCV(normalize=norm) <- bien
#self.reg = LinearRegression(normalize=norm) # <- bien
# self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- bastante bien, incluso sin norm
# self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- bastante bien, incluso sin norm
# self.reg = MultiOutputRegressor(SGDRegressor()) # lento, no va
self.regression = regression
self.val_samples_generator = val_samples_generator
# self.norm = StandardScaler()
# self.covs = covs
def generate_validation_samples(self):
Xs, ys = [], []
for instances, prevalence in self.val_samples_generator():
ys.append(prevalence)
Xs.append(self.base_quantifier.quantify(instances))
Xs = np.asarray(Xs)
ys = np.asarray(ys)
return Xs, ys
def fit(self, data):
print('fitting quantifier')
if data is not None:
self.base_quantifier.fit(data)
print('generating val samples')
Xs, ys = self.generate_validation_samples()
# Xs = self.norm.fit_transform(Xs)
print('fitting regressor')
self.reg.fit(Xs, ys)
print('[done]')
return self
def quantify(self, instances):
Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
# Xs = self.norm.transform(Xs)
Xs = self.reg.predict(Xs).flatten()
# Xs = self.norm.inverse_transform(Xs)
Xs = np.clip(Xs, 0, 1)
adjusted = Xs / Xs.sum()
# adjusted = np.clip(Xs, 0, 1)
adjusted = adjusted
return adjusted
def get_params(self, deep=True):
return self.base_quantifier.get_params()
def set_params(self, **params):
self.base_quantifier.set_params(**params)
class LAD(BaseEstimator, ClassifierMixin):
def __init__(self, C=1.0, class_weight=None):
self.C = C
@ -238,6 +136,7 @@ class OrdinalRidge(BaseEstimator, ClassifierMixin):
self.class_weight = params['class_weight']
self.normalize = params['normalize']
# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):

View File

@ -9,6 +9,12 @@ from os.path import join
from tqdm import tqdm
"""
This script generates a partition of a dataset in terms of "shift".
The partition is carried out only by generating index vectors.
"""
def partition_by_drift(split, training_prevalence):
assert split in ['dev', 'test'], 'invalid split name'
total=1000 if split=='dev' else 5000

View File

@ -1,11 +1,16 @@
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from os.path import join
from tqdm import tqdm
"""
This script generates a partition of a dataset in terms of "smoothness".
The partition is carried out only by generating index vectors.
"""
def partition_by_smoothness(split):
assert split in ['dev', 'test'], 'invalid split name'
total=1000 if split=='dev' else 5000

View File

@ -1,54 +0,0 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples_raw
from tqdm import tqdm
import shutil
datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'
def save_preprocessing_info(transformer):
with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
foo.write(f'{str(transformer)}\n')
os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'npp'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt'))
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)
train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
def transform_folder_samples(protocol, splitname):
for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
sample.instances = tfidf.transform(sample.instances)
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)
transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('npp', 'dev_samples')
transform_folder_samples('npp', 'test_samples')

View File

@ -8,6 +8,11 @@ from utils import *
from tqdm import tqdm
import shutil
"""
This script preprocesses the raw Amazon-OQ-BK dataset, converting it into dense vectors
extracted from a pre-trained model (here, RoBERTa fine-tuned on the training set).
Three vector generation modes are available: posteriors, last, average.
"""
vector_generation = 'posteriors'
@ -29,7 +34,6 @@ os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))
train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)

View File

@ -1,14 +1,20 @@
import quapy as qp
from Ordinal.utils import load_simple_sample_raw
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from utils import load_samples_raw
from tqdm import tqdm
import shutil
"""
This script preprocesses the raw Amazon-OQ-BK dataset, converting it into tfidf vectors.
"""
datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'
@ -40,7 +46,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic
def transform_folder_samples(protocol, splitname):
for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
for i, sample in tqdm(enumerate(load_simple_sample_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
sample.instances = tfidf.transform(sample.instances)
pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)