diff --git a/Ordinal/build_Amazon_datasets.py b/Ordinal/build_Amazon_datasets.py index 465c797..533f7e4 100644 --- a/Ordinal/build_Amazon_datasets.py +++ b/Ordinal/build_Amazon_datasets.py @@ -18,13 +18,6 @@ te_size = 1000 nval = 1000 nte = 5000 -# domain = 'Gift_Cards' -# tr_size = 200 -# val_size = 100 -# te_size = 100 -# nval = 20 -# nte = 40 - def from_gz_text(path, encoding='utf-8', class2int=True): """ diff --git a/Ordinal/build_Telescope_datasets.py b/Ordinal/build_Telescope_datasets.py new file mode 100644 index 0000000..8cf87e9 --- /dev/null +++ b/Ordinal/build_Telescope_datasets.py @@ -0,0 +1,116 @@ +import gzip +import quapy as qp +import numpy as np +import pandas as pd +from quapy.data import LabelledCollection +import quapy.functional as F +import os +from os.path import join +from pathlib import Path +import pickle + + +datadir = '../OrdinalQuantification' +outdir = './data/' +domain = 'fact' +seed = 7 + +tr_size = 20000 +val_size = 1000 +te_size = 1000 +nval = 1000 +nte = 5000 + + +def from_csv(path): + df = pd.read_csv(path) + + # divide the continuous labels into ordered classes + energy_boundaries = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1] + y = np.digitize(np.array(df['log10_energy'], dtype=np.float32), energy_boundaries) + + # note: omitting the dtype will result in a single instance having a different class + + # obtain a matrix of shape (n_samples, n_features) + X = df.iloc[:, 1:].to_numpy().astype(np.float32) + return X, y + + +def write_pkl(sample: LabelledCollection, path): + os.makedirs(Path(path).parent, exist_ok=True) + pickle.dump(sample, open(path, 'wb'), pickle.HIGHEST_PROTOCOL) + + +def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath): + os.makedirs(outdir, exist_ok=True) + with open(prevpath, 'wt') as prevfile: + prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n') + for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)): + sample = 
pool.sampling(sample_size, *prev) + write_pkl(sample, join(outdir, f'{i}.pkl')) + prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n') + + +def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath): + os.makedirs(outdir, exist_ok=True) + with open(prevpath, 'wt') as prevfile: + prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n') + for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)): + write_pkl(sample, join(outdir, f'{i}.pkl')) + prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n') + + + +fullpath = join(datadir,domain, 'fact_wobble.csv') + +data = LabelledCollection.load(fullpath, from_csv) + +if np.isnan(data.instances).any(): + rows, cols = np.where(np.isnan(data.instances)) + data.instances = np.delete(data.instances, rows, axis=0) + data.labels = np.delete(data.labels, rows, axis=0) + print('deleted nan rows') + +if np.isnan(data.instances).any(): + rows, cols = np.where(np.isnan(data.instances)) + data.instances = np.delete(data.instances, rows, axis=0) + data.labels = np.delete(data.labels, rows, axis=0) + print('deleted nan rows') + +if np.isinf(data.instances).any(): + rows, cols = np.where(np.isinf(data.instances)) + data.instances = np.delete(data.instances, rows, axis=0) + data.labels = np.delete(data.labels, rows, axis=0) + print('deleted inf rows') + + +print(len(data)) +print(data.classes_) +print(data.prevalence()) + +with qp.util.temp_seed(seed): + train, rest = data.split_stratified(train_prop=tr_size) + + devel, test = rest.split_stratified(train_prop=0.5) + print(len(train)) + print(len(devel)) + print(len(test)) + + domaindir = join(outdir, domain) + + write_pkl(train, join(domaindir, 'training_data.pkl')) + write_pkl(devel, join(domaindir, 'development_data.pkl')) + write_pkl(test, join(domaindir, 'test_data.pkl')) + + gen_samples_APP(devel, nsamples=nval, sample_size=val_size, 
outdir=join(domaindir, 'app', 'dev_samples'), + prevpath=join(domaindir, 'app', 'dev_prevalences.txt')) + gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'), + prevpath=join(domaindir, 'app', 'test_prevalences.txt')) + + gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'), + prevpath=join(domaindir, 'npp', 'dev_prevalences.txt')) + gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'), + prevpath=join(domaindir, 'npp', 'test_prevalences.txt')) + + + diff --git a/Ordinal/evaluation.py b/Ordinal/evaluation.py index 452b512..9e4ea9c 100644 --- a/Ordinal/evaluation.py +++ b/Ordinal/evaluation.py @@ -1,11 +1,11 @@ import numpy as np +# smoothing approximation def smoothness(p): return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:])) - def _check_arrays(prevs): prevs = np.asarray(prevs) if prevs.ndim==1: @@ -13,6 +13,7 @@ def _check_arrays(prevs): return prevs +# mean normalized match distance def mnmd(prevs, prevs_hat): prevs = _check_arrays(prevs) prevs_hat = _check_arrays(prevs_hat) @@ -22,6 +23,7 @@ def mnmd(prevs, prevs_hat): return np.mean(nmds) +# normalized match distance def nmd(prev, prev_hat): n = len(prev) return (1./(n-1))*mdpa(prev, prev_hat) diff --git a/Ordinal/experiments_lr_vs_ordlr.py b/Ordinal/experiments_lr_vs_ordlr.py new file mode 100644 index 0000000..ff4f56a --- /dev/null +++ b/Ordinal/experiments_lr_vs_ordlr.py @@ -0,0 +1,150 @@ +import numpy as np +import quapy as qp +import os +from sklearn.linear_model import LogisticRegression +from sklearn.preprocessing import StandardScaler +from Ordinal.model import RegressionQuantification, LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge +from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC +from os.path import join +from utils import load_samples_folder, load_single_sample_pkl +from 
evaluation import nmd, mnmd +from tqdm import tqdm + + +""" +This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with +standard logistic regression against quantifiers equipped with order-aware classifiers +""" + +def quantifiers(): + params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']} + params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']} + params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']} + params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize':[True,False]} + + # baselines + yield 'CC(LR)', CC(LogisticRegression()), params_LR + yield 'PCC(LR)', PCC(LogisticRegression()), params_LR + yield 'ACC(LR)', ACC(LogisticRegression()), params_LR + yield 'PACC(LR)', PACC(LogisticRegression()), params_LR + yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR + + # with order-aware classifiers + # threshold-based ordinal regression (see https://pythonhosted.org/mord/) + yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR + yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR + yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR + yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR + yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR + + yield 'CC(OLR-SE)', CC(LogisticSE()), params_OLR + yield 'PCC(OLR-SE)', PCC(LogisticSE()), params_OLR + yield 'ACC(OLR-SE)', ACC(LogisticSE()), params_OLR + yield 'PACC(OLR-SE)', PACC(LogisticSE()), params_OLR + yield 'SLD(OLR-SE)', EMQ(LogisticSE()), params_OLR + + yield 'CC(OLR-IT)', CC(LogisticIT()), params_OLR + yield 'PCC(OLR-IT)', PCC(LogisticIT()), params_OLR + yield 'ACC(OLR-IT)', ACC(LogisticIT()), params_OLR + yield 'PACC(OLR-IT)', PACC(LogisticIT()), params_OLR + yield 'SLD(OLR-IT)', EMQ(LogisticIT()), params_OLR + # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.) 
+ + # regression-based ordinal regression (see https://pythonhosted.org/mord/) + yield 'CC(LAD)', CC(LAD()), params_SVR + yield 'ACC(LAD)', ACC(LAD()), params_SVR + yield 'CC(ORidge)', CC(OrdinalRidge()), params_Ridge + yield 'ACC(ORidge)', ACC(OrdinalRidge()), params_Ridge + + +def run_experiment(params): + qname, q, param_grid = params + qname += posfix + resultfile = join(resultpath, f'{qname}.all.csv') + if os.path.exists(resultfile): + print(f'result file {resultfile} already exists: continue') + return None + + print(f'fitting {qname} for all-drift') + + + def load_test_samples(): + folderpath = join(datapath, domain, protocol, 'test_samples') + for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=5000): + if posfix == '-std': + sample.instances = zscore.transform(sample.instances) + yield sample.instances, sample.prevalence() + + + def load_dev_samples(): + folderpath = join(datapath, domain, protocol, 'dev_samples') + for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=1000): + if posfix == '-std': + sample.instances = zscore.transform(sample.instances) + yield sample.instances, sample.prevalence() + + q = qp.model_selection.GridSearchQ( + q, + param_grid, + sample_size=1000, + protocol='gen', + error=mnmd, + val_split=load_dev_samples, + n_jobs=-1, + refit=False, + timeout=60*60*2, + verbose=True).fit(train) + + hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}' + + print('[done]') + + report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd]) + mean_nmd = report['nmd'].mean() + std_nmd = report['nmd'].std() + print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}') + report.to_csv(resultfile, index=False) + + print('[learning regressor-based adjustment]') + q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples) + q.fit(None) + + report = qp.evaluation.gen_prevalence_report(q, 
gen_fn=load_test_samples, error_metrics=[nmd]) + mean_nmd = report['nmd'].mean() + std_nmd = report['nmd'].std() + print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}') + resultfile = join(resultpath, f'{qname}.all.reg.csv') + report.to_csv(resultfile, index=False) + + return hyperparams + + +if __name__ == '__main__': + domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average' + #domain = 'Books-tfidf' + posfix = '' + + # domain = 'fact' + # posfix = '-std' # set to '' to avoid standardization + # posfix = '' + + load_sample_fn = load_single_sample_pkl + datapath = './data' + protocol = 'app' + resultpath = join('./results', domain, protocol) + os.makedirs(resultpath, exist_ok=True) + + train = load_sample_fn(join(datapath, domain), 'training_data') + + if posfix=='-std': + zscore = StandardScaler() + train.instances = zscore.fit_transform(train.instances) + + with open(join(resultpath, 'hyper.txt'), 'at') as foo: + hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3) + for h in hypers: + if h is not None: + foo.write(h) + foo.write('\n') + + diff --git a/Ordinal/finetune_bert.py b/Ordinal/finetune_bert.py index b7e9b28..3ff5870 100644 --- a/Ordinal/finetune_bert.py +++ b/Ordinal/finetune_bert.py @@ -1,16 +1,26 @@ +import csv import sys -import numpy as np import datasets +import numpy as np +import pandas as pd import torch.cuda +from datasets import Dataset, DatasetDict from sklearn.metrics import f1_score from sklearn.model_selection import train_test_split -from transformers import AutoTokenizer, DataCollatorWithPadding, BertTokenizer -from datasets import list_datasets, list_metrics, load_dataset, Dataset, DatasetDict, load_metric from transformers import AutoModelForSequenceClassification -from transformers import TrainingArguments +from transformers import AutoTokenizer, DataCollatorWithPadding from transformers import Trainer -import pandas as pd -import csv +from transformers import TrainingArguments + + +""" 
+This script fine-tunes a pre-trained language model on a given textual training set. +The training goes for a maximum of 5 epochs, but stores the model parameters of the best performing epoch according +to the validation loss in a hold-out val split of 1000 documents (stratified). + +We used it with RoBERTa in the training set of the Amazon-OQ-BK domain, i.e.: +$> python3 finetune_bert.py ./data/Books/training_data.txt roberta-base +""" def tokenize_function(example): @@ -31,13 +41,13 @@ if __name__ == '__main__': debug = False assert torch.cuda.is_available(), 'cuda is not available' - datapath = './data/Books/training_data.txt' - checkpoint = 'roberta-base' - # n_args = len(sys.argv) - # assert n_args==3, 'wrong arguments, expected: ' + # datapath = './data/Books/training_data.txt' + # checkpoint = 'roberta-base' + n_args = len(sys.argv) + assert n_args==3, 'wrong arguments, expected: ' - # datapath = sys.argv[1] # './data/Books/training_data.txt' - # checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base' + datapath = sys.argv[1] # './data/Books/training_data.txt' + checkpoint = sys.argv[2] #e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base' modelout = checkpoint+'-val-finetuned' @@ -60,14 +70,6 @@ if __name__ == '__main__': tokenizer = AutoTokenizer.from_pretrained(checkpoint) tokenized_datasets = dataset.map(tokenize_function, batched=True) - print(tokenized_datasets) - print(tokenized_datasets['train'][0]['labels']) - print(tokenized_datasets['train'][0]['review']) - print(tokenized_datasets['train'][0]['input_ids']) - print(len(tokenized_datasets['train'][0]['input_ids'])) - # print(tokenized_datasets['train'][0]['token_type_ids']) - # print(tokenized_datasets['train'][0]['attention_mask']) - model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda() # fine-tuning diff --git a/Ordinal/finetuning_batch.sh b/Ordinal/finetuning_batch.sh deleted file mode 100755 index 
f11e2d8..0000000 --- a/Ordinal/finetuning_batch.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -set -x - -#conda activate torch - -transformer=roberta-base - -#python3 finetune_bert.py ./data/Books/training_data.txt $transformer -#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned last -#python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned average -PYTHONPATH=.:.. python3 generate_bert_vectors_npytxt.py "$transformer"-finetuned posteriors diff --git a/Ordinal/gen_tables.py b/Ordinal/gen_tables_amazon.py similarity index 96% rename from Ordinal/gen_tables.py rename to Ordinal/gen_tables_amazon.py index 966c895..3d07a82 100644 --- a/Ordinal/gen_tables.py +++ b/Ordinal/gen_tables_amazon.py @@ -7,6 +7,10 @@ from pathlib import Path from Ordinal.main import quantifiers from Ordinal.tabular import Table +""" +This script generates some tables for Amazon-OQ-BK (for internal use only) +""" + domain = 'Books-tfidf' domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last' domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average' @@ -34,7 +38,6 @@ resultfiles = list(glob(f'{resultpath}/*.csv')) \ + list(glob(f'{resultpath_bertave}/*.csv')) \ + list(glob(f'{resultpath_bertpost}/*.csv')) - for resultfile in resultfiles: df = pd.read_csv(resultfile) nmd = df['nmd'].values diff --git a/Ordinal/gen_tables_telescope.py b/Ordinal/gen_tables_telescope.py new file mode 100644 index 0000000..fb99fa9 --- /dev/null +++ b/Ordinal/gen_tables_telescope.py @@ -0,0 +1,82 @@ +import pandas as pd +from os.path import join +import os +from glob import glob +from pathlib import Path + +from Ordinal.experiments_lr_vs_ordlr import quantifiers +from Ordinal.tabular import Table + +""" +This script generates some tables for Fact-OQ (for internal use only) +""" + +#domain = 'fact' +#domain = 'Books-tfidf' +domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average' +prot = 'app' +outpath = f'./tables/{domain}/{prot}/results.tex' + 
+resultpath = join('./results', domain, prot) + +withstd=False + +methods = [qname for qname, *_ in quantifiers()] +if withstd: + methods = [m+'-std' for m in methods] +#methods = methods + methods_variant +# methods += [m+'-r' for m in methods] + +quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD'] +# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD'] +method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD'] +if withstd: + method_variants = [m+'-std' for m in method_variants] + +print('families:', quantifiers_families) +print('variants', method_variants) +table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4, + color=False, show_rel_to=0, missing_str='\multicolumn{1}{c}{---}', clean_zero=True) + +resultfiles = list(glob(f'{resultpath}/*).all.csv')) + +for resultfile in resultfiles: + df = pd.read_csv(resultfile) + nmd = df['nmd'].values + resultname = Path(resultfile).name + + method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average','').split('.') + if drift!='all': + continue + if other: + method += '-r' + if method not in methods: + continue + + family, variant = method.split('(') + variant = variant.replace(')', '') + if variant not in method_variants: + continue + table.add(family, variant, nmd) + +os.makedirs(Path(outpath).parent, exist_ok=True) + +tabular = """ + \\resizebox{\\textwidth}{!}{% + + \\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """} + \\toprule + """ + +tabular += table.latexTabularT(average=False) +tabular += """ + \end{tabular}% + }""" + +print('saving table in', outpath) +with open(outpath, 'wt') as foo: + foo.write(tabular) + foo.write('\n') + +print('[done]') + diff --git a/Ordinal/generate_bert_vectors_npytxt.py b/Ordinal/generate_bert_vectors_npytxt.py index ffdc005..f58d5ae 100644 --- a/Ordinal/generate_bert_vectors_npytxt.py +++ b/Ordinal/generate_bert_vectors_npytxt.py @@ -12,6 +12,11 @@ from tqdm import tqdm from 
Ordinal.utils import load_samples_folder, load_single_sample_as_csv +""" +This script takes a pre-trained model (a fine-tuned one) and generates numerical representations for all +samples in the dataset. The representations are saved in npy-txt plain format. +""" + def tokenize_function(example): tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt') diff --git a/Ordinal/inspect_dataset.py b/Ordinal/inspect_dataset.py deleted file mode 100644 index cd38eed..0000000 --- a/Ordinal/inspect_dataset.py +++ /dev/null @@ -1,16 +0,0 @@ -import quapy as qp -from quapy.data import LabelledCollection -from quapy.data.reader import from_text -from quapy.functional import strprev - -category = 'Books' -datadir = './data' - -training_path = f'{datadir}/{category}/training_data.txt' - -data = LabelledCollection.load(training_path, loader_func=from_text) - -print(len(data)) -print(strprev(data.prevalence())) - - diff --git a/Ordinal/main.py b/Ordinal/main.py index 2a2317f..21742a4 100644 --- a/Ordinal/main.py +++ b/Ordinal/main.py @@ -3,8 +3,7 @@ from sklearn.linear_model import LogisticRegression import quapy as qp import numpy as np -from Ordinal.model import OrderedLogisticRegression, StackedClassifier, RegressionQuantification, \ - LogisticAT +from Ordinal.model import OrderedLogisticRegression, LogisticAT from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy from quapy.data import LabelledCollection from os.path import join @@ -17,13 +16,6 @@ from tqdm import tqdm import mord -#TODO: -# Ordinal LR, LAD -> balance sample_weight -# use BERT to extract features -# other domains? Kitchen, Electronics... 
-# try with the inverse of the distance -# add drift='all' - def quantifiers(): params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']} @@ -126,8 +118,8 @@ def run_experiment(params): if __name__ == '__main__': #preprocessing = 'roberta.last' - # preprocessing = 'roberta.average' - preprocessing = 'roberta.posteriors' + preprocessing = 'roberta.average' + # preprocessing = 'roberta.posteriors' #preprocessing = 'tfidf' if preprocessing=='tfidf': domain = 'Books-tfidf' diff --git a/Ordinal/model.py b/Ordinal/model.py index d797328..cbd6cd7 100644 --- a/Ordinal/model.py +++ b/Ordinal/model.py @@ -1,17 +1,12 @@ -from copy import deepcopy -import numpy as np -from sklearn.base import BaseEstimator, ClassifierMixin -from sklearn.calibration import CalibratedClassifierCV -from sklearn.decomposition import TruncatedSVD -from sklearn.linear_model import LogisticRegression, Ridge -from scipy.sparse import issparse -from sklearn.multiclass import OneVsRestClassifier -from sklearn.multioutput import MultiOutputRegressor -from sklearn.preprocessing import StandardScaler -from sklearn.svm import LinearSVR, SVR -from statsmodels.miscmodels.ordinal_model import OrderedModel import mord +import numpy as np +from scipy.sparse import issparse +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.decomposition import TruncatedSVD +from sklearn.linear_model import Ridge +from sklearn.svm import LinearSVR from sklearn.utils.class_weight import compute_class_weight +from statsmodels.miscmodels.ordinal_model import OrderedModel class OrderedLogisticRegression: @@ -38,103 +33,6 @@ class OrderedLogisticRegression: return self.res_prob.model.predict(self.res_prob.params, exog=X) -class StackedClassifier: # aka Funnelling Monolingual - def __init__(self, base_estimator=LogisticRegression()): - if not hasattr(base_estimator, 'predict_proba'): - print('the estimator does not seem to be probabilistic: calibrating') - base_estimator = 
CalibratedClassifierCV(base_estimator) - # self.base = deepcopy(OneVsRestClassifier(base_estimator)) - # self.meta = deepcopy(OneVsRestClassifier(base_estimator)) - self.base = deepcopy(base_estimator) - self.meta = deepcopy(base_estimator) - self.norm = StandardScaler() - - def fit(self, X, y): - self.base.fit(X, y) - P = self.base.predict_proba(X) - P = self.norm.fit_transform(P) - self.meta.fit(P, y) - return self - - def predict(self, X): - P = self.base.predict_proba(X) - P = self.norm.transform(P) - return self.meta.predict(P) - - def predict_proba(self, X): - P = self.base.predict_proba(X) - P = self.norm.transform(P) - return self.meta.predict_proba(P) - - -class RegressionQuantification: - def __init__(self, - base_quantifier, - regression='svr', - val_samples_generator=None, - norm=True): - - self.base_quantifier = base_quantifier - if isinstance(regression, str): - assert regression in ['ridge', 'svr'], 'unknown regression model' - if regression == 'ridge': - self.reg = Ridge(normalize=norm) - elif regression == 'svr': - self.reg = MultiOutputRegressor(LinearSVR()) - else: - self.reg = regression - # self.reg = MultiTaskLassoCV(normalize=norm) - # self.reg = KernelRidge(kernel='rbf') - # self.reg = LassoLarsCV(normalize=norm) - # self.reg = MultiTaskElasticNetCV(normalize=norm) <- bien - #self.reg = LinearRegression(normalize=norm) # <- bien - # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- bastante bien, incluso sin norm - # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- bastante bien, incluso sin norm - # self.reg = MultiOutputRegressor(SGDRegressor()) # lento, no va - self.regression = regression - self.val_samples_generator = val_samples_generator - # self.norm = StandardScaler() - # self.covs = covs - - def generate_validation_samples(self): - Xs, ys = [], [] - for instances, prevalence in self.val_samples_generator(): - ys.append(prevalence) - Xs.append(self.base_quantifier.quantify(instances)) - Xs = 
np.asarray(Xs) - ys = np.asarray(ys) - return Xs, ys - - def fit(self, data): - print('fitting quantifier') - if data is not None: - self.base_quantifier.fit(data) - print('generating val samples') - Xs, ys = self.generate_validation_samples() - # Xs = self.norm.fit_transform(Xs) - print('fitting regressor') - self.reg.fit(Xs, ys) - print('[done]') - return self - - def quantify(self, instances): - Xs = self.base_quantifier.quantify(instances).reshape(1, -1) - # Xs = self.norm.transform(Xs) - Xs = self.reg.predict(Xs).flatten() - # Xs = self.norm.inverse_transform(Xs) - Xs = np.clip(Xs, 0, 1) - adjusted = Xs / Xs.sum() - # adjusted = np.clip(Xs, 0, 1) - adjusted = adjusted - return adjusted - - def get_params(self, deep=True): - return self.base_quantifier.get_params() - - def set_params(self, **params): - self.base_quantifier.set_params(**params) - - class LAD(BaseEstimator, ClassifierMixin): def __init__(self, C=1.0, class_weight=None): self.C = C @@ -238,6 +136,7 @@ class OrdinalRidge(BaseEstimator, ClassifierMixin): self.class_weight = params['class_weight'] self.normalize = params['normalize'] + # with order-aware classifiers # threshold-based ordinal regression (see https://pythonhosted.org/mord/) class LogisticAT(mord.LogisticAT): diff --git a/Ordinal/partition_dataset_by_shift.py b/Ordinal/partition_dataset_by_shift.py index ac7eb4f..41d6738 100644 --- a/Ordinal/partition_dataset_by_shift.py +++ b/Ordinal/partition_dataset_by_shift.py @@ -9,6 +9,12 @@ from os.path import join from tqdm import tqdm +""" +This script generates a partition of a dataset in terms of "shift". +The partition is only carried out by generating index vectors. 
+""" + + def partition_by_drift(split, training_prevalence): assert split in ['dev', 'test'], 'invalid split name' total=1000 if split=='dev' else 5000 diff --git a/Ordinal/partition_dataset_by_smoothness.py b/Ordinal/partition_dataset_by_smoothness.py index 616ba5b..d549992 100644 --- a/Ordinal/partition_dataset_by_smoothness.py +++ b/Ordinal/partition_dataset_by_smoothness.py @@ -1,11 +1,16 @@ import numpy as np from Ordinal.evaluation import smoothness from Ordinal.utils import load_samples_folder, load_single_sample_pkl - from os.path import join from tqdm import tqdm +""" +This script generates a partition of a dataset in terms of "smoothness". +The partition is only carried out by generating index vectors. +""" + + def partition_by_smoothness(split): assert split in ['dev', 'test'], 'invalid split name' total=1000 if split=='dev' else 5000 diff --git a/Ordinal/preprocess_dataset.py b/Ordinal/preprocess_dataset.py deleted file mode 100644 index 0e273c4..0000000 --- a/Ordinal/preprocess_dataset.py +++ /dev/null @@ -1,54 +0,0 @@ -import quapy as qp -from quapy.data import LabelledCollection -from sklearn.feature_extraction.text import TfidfVectorizer -from os.path import join -import os -import pickle -from utils import load_samples_raw -from tqdm import tqdm -import shutil - - -datapath = './data' -domain = 'Books' -outname = domain + '-tfidf' - -def save_preprocessing_info(transformer): - with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo: - foo.write(f'{str(transformer)}\n') - - -os.makedirs(join(datapath, outname), exist_ok=True) -os.makedirs(join(datapath, outname, 'app'), exist_ok=True) -os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True) -os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True) -shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt')) -shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), 
join(datapath, outname, 'app', 'test_prevalences.txt')) -os.makedirs(join(datapath, outname, 'npp'), exist_ok=True) -os.makedirs(join(datapath, outname, 'npp', 'dev_samples'), exist_ok=True) -os.makedirs(join(datapath, outname, 'npp', 'test_samples'), exist_ok=True) -shutil.copyfile(join(datapath, domain, 'npp', 'dev_prevalences.txt'), join(datapath, outname, 'npp', 'dev_prevalences.txt')) -shutil.copyfile(join(datapath, domain, 'npp', 'test_prevalences.txt'), join(datapath, outname, 'npp', 'test_prevalences.txt')) - - -tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5) - -train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text) -train.instances = tfidf.fit_transform(train.instances) -save_preprocessing_info(tfidf) -pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) - - -def transform_folder_samples(protocol, splitname): - for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))): - sample.instances = tfidf.transform(sample.instances) - pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) - - -transform_folder_samples('app', 'dev_samples') -transform_folder_samples('app', 'test_samples') -transform_folder_samples('npp', 'dev_samples') -transform_folder_samples('npp', 'test_samples') - - - diff --git a/Ordinal/preprocess_dataset_npytxt2pkl.py b/Ordinal/preprocess_dataset_npytxt2pkl.py index c8f1c8a..b25f974 100644 --- a/Ordinal/preprocess_dataset_npytxt2pkl.py +++ b/Ordinal/preprocess_dataset_npytxt2pkl.py @@ -8,6 +8,11 @@ from utils import * from tqdm import tqdm import shutil +""" +This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into dense vectors +extracted from a pretrained model (here we use the RoBERTa fine-tuned on the training set) +Three vector generation modes are 
available: posteriors, last, average +""" vector_generation = 'posteriors' @@ -29,7 +34,6 @@ os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True) shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt')) shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt')) - train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5)) pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL) diff --git a/Ordinal/preprocess_dataset_tfidf.py b/Ordinal/preprocess_dataset_raw2tfidf.py similarity index 88% rename from Ordinal/preprocess_dataset_tfidf.py rename to Ordinal/preprocess_dataset_raw2tfidf.py index 0e273c4..632546a 100644 --- a/Ordinal/preprocess_dataset_tfidf.py +++ b/Ordinal/preprocess_dataset_raw2tfidf.py @@ -1,14 +1,20 @@ import quapy as qp +from Ordinal.utils import load_simple_sample_raw from quapy.data import LabelledCollection from sklearn.feature_extraction.text import TfidfVectorizer from os.path import join import os import pickle -from utils import load_samples_raw from tqdm import tqdm import shutil + +""" +This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors. 
+""" + + datapath = './data' domain = 'Books' outname = domain + '-tfidf' @@ -40,7 +46,7 @@ pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pic def transform_folder_samples(protocol, splitname): - for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))): + for i, sample in tqdm(enumerate(load_simple_sample_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))): sample.instances = tfidf.transform(sample.instances) pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)