diff --git a/NewMethods/quantification_stumps.py b/NewMethods/QuantificationStumps.py similarity index 100% rename from NewMethods/quantification_stumps.py rename to NewMethods/QuantificationStumps.py diff --git a/NewMethods/class_weight_model.py b/NewMethods/class_weight_model.py index 3f8c55c..b8b7c29 100644 --- a/NewMethods/class_weight_model.py +++ b/NewMethods/class_weight_model.py @@ -1,3 +1,4 @@ +from sklearn import clone from sklearn.linear_model import LogisticRegression, LogisticRegressionCV import numpy as np from sklearn.model_selection import GridSearchCV @@ -27,12 +28,14 @@ Possible extensions: - better investigate the "iterative" nature of the method. - better investigate the implications with other learners. E.g., using EMQ as a prompt, or using EMQ in the second stage (test). + - test with SVM """ class ClassWeightPCC(BaseQuantifier): - def __init__(self, **pcc_param_grid): - self.learner = PACC(LogisticRegression()) + def __init__(self, estimator=LogisticRegression, **pcc_param_grid): + self.estimator = estimator + self.learner = PACC(self.estimator()) if 'class_weight' in pcc_param_grid: raise ValueError('parameter "class_weight" cannot be included in "pcc_param_grid"') self.pcc_param_grid = dict(pcc_param_grid) @@ -43,9 +46,6 @@ class ClassWeightPCC(BaseQuantifier): self.learner.fit(self.train) return self - def deploy(self, deployed=True): - self.deployed = deployed - def quantify(self, instances): guessed_prevalence = self.learner.quantify(instances) class_weight = self._get_class_weight(guessed_prevalence) @@ -55,11 +55,18 @@ class ClassWeightPCC(BaseQuantifier): for classification (and not for quantification)""" # pcc = PCC(GridSearchCV(LogisticRegression(class_weight=class_weight), param_grid=self.pcc_param_grid, n_jobs=-1)) pcc = PCC(LogisticRegressionCV(Cs=self.pcc_param_grid['C'], class_weight=class_weight, n_jobs=-1, cv=3)) + raise ValueError('this cannot work...') else: """If the param grid has not been specified, we take the best parameters found for the base quantifier""" base_parameters = dict(self.learner.get_params()) - base_parameters['class_weight'] = class_weight # override the class_weight parameter - pcc = PCC(LogisticRegression(**base_parameters)) + for p,v in self.learner.get_params().items(): + # this search is in order to allow for quantifiers that work with a CalibratedClassifierCV to work + if 'class_weight' in p: + base_parameters[p] = class_weight + break + base_estimator = clone(self.learner.learner) + base_estimator.set_params(**base_parameters) + pcc = PCC(base_estimator) return pcc.fit(self.train).quantify(instances) def _get_class_weight(self, prevalence): @@ -74,6 +81,8 @@ class ClassWeightPCC(BaseQuantifier): return {0:weights[0], 1:weights[1]} def set_params(self, **parameters): + # parameters = {p:v for p,v in parameters.items()} + # print(parameters) self.learner.set_params(**parameters) def get_params(self, deep=True): diff --git a/NewMethods/common.py b/NewMethods/common.py index 56aaffe..e03052d 100644 --- a/NewMethods/common.py +++ b/NewMethods/common.py @@ -39,6 +39,12 @@ def is_already_computed(result_dir, dataset_name, model_name, run, optim_loss): nice = { + 'pacc.opt': 'PACC(LR)', + 'pacc.opt.svm': 'PACC(SVM)', + 'pcc.opt': 'PCC(LR)', + 'pcc.opt.svm': 'PCC(SVM)', + 'wpacc.opt': 'R-PCC(LR)', + 'wpacc.opt.svm': 'R-PCC(SVM)', 'mae':'AE', 'ae':'AE', 'svmkld': 'SVM(KLD)', diff --git a/NewMethods/tc_experiments.py b/NewMethods/tc_experiments.py index 119f1e6..b38a6df 100644 --- a/NewMethods/tc_experiments.py +++ b/NewMethods/tc_experiments.py @@ -8,6 +8,7 @@ from class_weight_model import ClassWeightPCC # from method.experimental import ExpMax, VarExpMax from common import * from method.meta import QuaNet +from quantification_stumps_model import QuantificationStumpRegressor from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, SVMAE, HDy from quapy.method.meta import EHDy import numpy as np @@ -35,12 +36,14 @@ svmperf_params = {'C': __C_range} def quantification_models(): # yield 'cc', CC(newLR()), lr_params # yield 'acc', ACC(newLR()), lr_params - yield 'pcc', PCC(newLR()), None - yield 'pacc', PACC(newLR()), None - yield 'wpacc', ClassWeightPCC(), None - yield 'pcc.opt', PCC(newLR()), lr_params - yield 'pacc.opt', PACC(newLR()), lr_params - yield 'wpacc.opt', ClassWeightPCC(), lr_params + # yield 'pcc', PCC(newLR()), None + # yield 'pacc', PACC(newLR()), None + # yield 'wpacc', ClassWeightPCC(), None + # yield 'pcc.opt', PCC(newLR()), lr_params + # yield 'pacc.opt', PACC(newLR()), lr_params + # yield 'wpacc.opt', ClassWeightPCC(), lr_params + yield 'ds', QuantificationStumpRegressor(SAMPLE_SIZE), None + # yield 'ds.opt', QuantificationStumpRegressor(SAMPLE_SIZE), {'C': __C_range} # yield 'MAX', MAX(newLR()), lr_params # yield 'MS', MS(newLR()), lr_params # yield 'MS2', MS2(newLR()), lr_params @@ -92,7 +95,8 @@ def quantification_ensembles(): def run(experiment): optim_loss, dataset_name, (model_name, model, hyperparams) = experiment - + if dataset_name == 'imdb': + return data = qp.datasets.fetch_reviews(dataset_name, tfidf=True, min_df=5) run=0 @@ -127,7 +131,7 @@ def run(experiment): test=data.test, sample_size=SAMPLE_SIZE, n_prevpoints=21, - n_repetitions=100, + n_repetitions=1, n_jobs=-1 if isinstance(model, qp.method.meta.Ensemble) else 1, verbose=True ) diff --git a/NewMethods/uci_experiments.py b/NewMethods/uci_experiments.py index 098c88b..5910c5e 100644 --- a/NewMethods/uci_experiments.py +++ b/NewMethods/uci_experiments.py @@ -1,8 +1,11 @@ +from sklearn.svm import LinearSVC + from class_weight_model import ClassWeightPCC # from classification.methods import LowRankLogisticRegression # from method.experimental import ExpMax, VarExpMax from common import * from method.meta import QuaNet +from quantification_stumps_model import QuantificationStumpRegressor from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, SVMAE, HDy from quapy.method.meta import EHDy import numpy as np @@ -36,6 +39,10 @@ def quantification_models(): yield 'pcc.opt', PCC(newLR()), lr_params yield 'pacc.opt', PACC(newLR()), lr_params yield 'wpacc.opt', ClassWeightPCC(), lr_params + yield 'ds.opt', QuantificationStumpRegressor(SAMPLE_SIZE), {'C': __C_range} + # yield 'pcc.opt.svm', PCC(LinearSVC()), lr_params + # yield 'pacc.opt.svm', PACC(LinearSVC()), lr_params + # yield 'wpacc.opt.svm', ClassWeightPCC(LinearSVC), lr_params # yield 'wpacc.opt2', ClassWeightPCC(C=__C_range), lr_params # this cannot work in its current version (see notes in the class_weight_model.py file) # yield 'MAX', MAX(newLR()), lr_params # yield 'MS', MS(newLR()), lr_params @@ -60,29 +67,29 @@ def quantification_models(): # yield 'quanet', QuaNet(learner, SAMPLE_SIZE, checkpointdir=args.checkpointdir, device=device), lr_params -def quantification_ensembles(): - param_mod_sel = { - 'sample_size': SAMPLE_SIZE, - 'n_prevpoints': 21, - 'n_repetitions': 5, - 'refit': True, - 'verbose': False - } - common = { - 'size': 30, - 'red_size': 15, - 'max_sample_size': None, # same as training set - 'n_jobs': ENSEMBLE_N_JOBS, - 'param_grid': lr_params, - 'param_mod_sel': param_mod_sel, - 'val_split': 0.4, - 'min_pos': 5 - } - - # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection - # will be skipped (by setting hyperparameters to None) - hyper_none = None - yield 'ehdymaeds', EHDy(newLR(), optim='mae', policy='ds', **common), hyper_none +# def quantification_ensembles(): +# param_mod_sel = { +# 'sample_size': SAMPLE_SIZE, +# 'n_prevpoints': 21, +# 'n_repetitions': 5, +# 'refit': True, +# 'verbose': False +# } +# common = { +# 'size': 30, +# 'red_size': 15, +# 'max_sample_size': None, # same as training set +# 'n_jobs': ENSEMBLE_N_JOBS, +# 'param_grid': lr_params, +# 'param_mod_sel': param_mod_sel, +# 'val_split': 0.4, +# 'min_pos': 5 +# } +# +# hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection +# will be skipped (by setting hyperparameters to None) + # hyper_none = None + # yield 'ehdymaeds', EHDy(newLR(), optim='mae', policy='ds', **common), hyper_none def run(experiment): @@ -116,9 +123,6 @@ def run(experiment): model.fit(data.training) best_params = {} - if hasattr(model, "deploy"): - model.deploy() - # model evaluation true_prevalences, estim_prevalences = qp.evaluation.artificial_prevalence_prediction( model, @@ -130,9 +134,6 @@ def run(experiment): ) test_true_prevalence = data.test.prevalence() - if hasattr(model, "deploy"): - model.deploy(False) - evaluate_experiment(true_prevalences, estim_prevalences) save_results(args.results, dataset_name, model_name, run, optim_loss, true_prevalences, estim_prevalences, @@ -141,7 +142,7 @@ def run(experiment): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification') + parser = argparse.ArgumentParser(description='Run experiments for UCI ML Quantification') parser.add_argument('results', metavar='RESULT_PATH', type=str, help='path to the directory where to store the results') parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='./svm_perf_quantification', diff --git a/NewMethods/uci_plots.py b/NewMethods/uci_plots.py index 9705f27..fca5e23 100644 --- a/NewMethods/uci_plots.py +++ b/NewMethods/uci_plots.py @@ -4,7 +4,8 @@ import pathlib import pickle from glob import glob import sys -from uci_common import * +from uci_experiments import * +from uci_tables import METHODS from os.path import join @@ -42,7 +43,7 @@ def plot_error_by_drift(methods, error_name, logscale=False, path=None): tr_prevs, n_bins=20, error_name=error_name, - show_std=False, + show_std=True, logscale=logscale, title=f'Quantification error as a function of distribution shift', savepath=path @@ -54,7 +55,7 @@ def diagonal_plot(methods, error_name, path=None): if path is not None: path = join(path, f'diag_{error_name}') method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name) - qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Positive', legend=True, show_std=False, savepath=f'{path}_pos.{plotext}') + qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Positive', legend=True, show_std=True, savepath=f'{path}_pos.{plotext}') def binary_bias_global(methods, error_name, path=None): diff --git a/NewMethods/uci_tables.py b/NewMethods/uci_tables.py index dd076ce..ff4f555 100644 --- a/NewMethods/uci_tables.py +++ b/NewMethods/uci_tables.py @@ -18,16 +18,18 @@ makedirs(tables_path, exist_ok=True) qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE - - METHODS = [#'cc', 'acc', - 'pcc', - 'pacc', - 'wpacc', + # 'pcc', + # 'pacc', + # 'wpacc', 'pcc.opt', 'pacc.opt', 'wpacc.opt', - 'wpacc.opt2', + 'ds.opt', + # 'pcc.opt.svm', + # 'pacc.opt.svm', + # 'wpacc.opt.svm', + # 'wpacc.opt2', # 'MAX', 'MS', 'MS2', 'sldc', # 'svmmae',