
testing new models

Alejandro Moreo Fernandez 2021-11-17 17:07:13 +01:00
parent 27124d0d00
commit 06d36a132d
7 changed files with 77 additions and 54 deletions

View File

@@ -1,3 +1,4 @@
+from sklearn import clone
 from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 import numpy as np
 from sklearn.model_selection import GridSearchCV
@@ -27,12 +28,14 @@ Possible extensions:
 - better investigate the "iterative" nature of the method.
 - better investigate the implications with other learners. E.g., using EMQ as a prompt, or using EMQ in the second
   stage (test).
+- test with SVM
 """

 class ClassWeightPCC(BaseQuantifier):

-    def __init__(self, **pcc_param_grid):
-        self.learner = PACC(LogisticRegression())
+    def __init__(self, estimator=LogisticRegression, **pcc_param_grid):
+        self.estimator = estimator
+        self.learner = PACC(self.estimator())
         if 'class_weight' in pcc_param_grid:
             raise ValueError('parameter "class_weight" cannot be included in "pcc_param_grid"')
         self.pcc_param_grid = dict(pcc_param_grid)
@@ -43,9 +46,6 @@ class ClassWeightPCC(BaseQuantifier):
         self.learner.fit(self.train)
         return self

-    def deploy(self, deployed=True):
-        self.deployed = deployed
-
     def quantify(self, instances):
         guessed_prevalence = self.learner.quantify(instances)
         class_weight = self._get_class_weight(guessed_prevalence)
@@ -55,11 +55,18 @@ class ClassWeightPCC(BaseQuantifier):
             for classification (and not for quantification)"""
             # pcc = PCC(GridSearchCV(LogisticRegression(class_weight=class_weight), param_grid=self.pcc_param_grid, n_jobs=-1))
             pcc = PCC(LogisticRegressionCV(Cs=self.pcc_param_grid['C'], class_weight=class_weight, n_jobs=-1, cv=3))
+            raise ValueError('this cannot work...')
         else:
             """If the param grid has not been specified, we take the best parameters found for the base quantifier"""
             base_parameters = dict(self.learner.get_params())
-            base_parameters['class_weight'] = class_weight  # override the class_weight parameter
-            pcc = PCC(LogisticRegression(**base_parameters))
+            for p, v in self.learner.get_params().items():
+                # search by parameter name so that quantifiers wrapping the learner in a CalibratedClassifierCV also work
+                if 'class_weight' in p:
+                    base_parameters[p] = class_weight
+                    break
+            base_estimator = clone(self.learner.learner)
+            base_estimator.set_params(**base_parameters)
+            pcc = PCC(base_estimator)
         return pcc.fit(self.train).quantify(instances)

     def _get_class_weight(self, prevalence):
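Editor's note: _get_class_weight turns the guessed test prevalence into the class_weight dict consumed above; its body falls outside this hunk. Purely as a hypothetical illustration (not necessarily what the method actually computes), an inverse-prevalence weighting could look like:

    import numpy as np

    def inverse_prevalence_weights(prevalence, eps=1e-4):
        # hypothetical: weight each class inversely to its guessed prevalence;
        # scaled so the two weights sum to 2 (equal prevalences give both classes weight 1)
        p = np.asarray([1 - prevalence, prevalence])
        w = 1. / np.clip(p, eps, None)
        w = w * len(p) / w.sum()
        return {0: w[0], 1: w[1]}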
@@ -74,6 +81,8 @@ class ClassWeightPCC(BaseQuantifier):
         return {0:weights[0], 1:weights[1]}

     def set_params(self, **parameters):
+        # parameters = {p:v for p,v in parameters.items()}
+        # print(parameters)
         self.learner.set_params(**parameters)

     def get_params(self, deep=True):
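Editor's note: taken together, ClassWeightPCC guesses the test prevalence with a first-stage PACC, converts it into a class_weight, and requantifies with a PCC built on a reweighted clone of the learner. A minimal usage sketch follows (the dataset and the fit signature are assumptions; note that the estimator is passed as a class, not an instance, since __init__ instantiates it via self.estimator(), as in the ClassWeightPCC(LinearSVC) line further below):

    import quapy as qp
    from sklearn.svm import LinearSVC
    from class_weight_model import ClassWeightPCC

    data = qp.datasets.fetch_UCIDataset('yeast')  # any binary dataset would do

    model = ClassWeightPCC()             # defaults to LogisticRegression
    # model = ClassWeightPCC(LinearSVC)  # pass the estimator class, not an instance
    model.fit(data.training)
    estim_prevalence = model.quantify(data.test.instances)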

View File

@@ -39,6 +39,12 @@ def is_already_computed(result_dir, dataset_name, model_name, run, optim_loss):

 nice = {
+    'pacc.opt': 'PACC(LR)',
+    'pacc.opt.svm': 'PACC(SVM)',
+    'pcc.opt': 'PCC(LR)',
+    'pcc.opt.svm': 'PCC(SVM)',
+    'wpacc.opt': 'R-PCC(LR)',
+    'wpacc.opt.svm': 'R-PCC(SVM)',
     'mae':'AE',
     'ae':'AE',
     'svmkld': 'SVM(KLD)',
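Editor's note: the nice dictionary maps internal method identifiers (e.g., 'wpacc.opt') to display names for tables and plots; presumably it is consulted with a fallback, roughly as in this guess (not shown in the diff):

    def nicename(method):
        # fall back to the raw identifier if no pretty name is registered
        return nice.get(method, method)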

View File

@@ -8,6 +8,7 @@ from class_weight_model import ClassWeightPCC
 # from method.experimental import ExpMax, VarExpMax
 from common import *
 from method.meta import QuaNet
+from quantification_stumps_model import QuantificationStumpRegressor
 from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, SVMAE, HDy
 from quapy.method.meta import EHDy
 import numpy as np
@@ -35,12 +36,14 @@ svmperf_params = {'C': __C_range}
 def quantification_models():
     # yield 'cc', CC(newLR()), lr_params
     # yield 'acc', ACC(newLR()), lr_params
-    yield 'pcc', PCC(newLR()), None
-    yield 'pacc', PACC(newLR()), None
-    yield 'wpacc', ClassWeightPCC(), None
-    yield 'pcc.opt', PCC(newLR()), lr_params
-    yield 'pacc.opt', PACC(newLR()), lr_params
-    yield 'wpacc.opt', ClassWeightPCC(), lr_params
+    # yield 'pcc', PCC(newLR()), None
+    # yield 'pacc', PACC(newLR()), None
+    # yield 'wpacc', ClassWeightPCC(), None
+    # yield 'pcc.opt', PCC(newLR()), lr_params
+    # yield 'pacc.opt', PACC(newLR()), lr_params
+    # yield 'wpacc.opt', ClassWeightPCC(), lr_params
+    yield 'ds', QuantificationStumpRegressor(SAMPLE_SIZE), None
+    # yield 'ds.opt', QuantificationStumpRegressor(SAMPLE_SIZE), {'C': __C_range}
     # yield 'MAX', MAX(newLR()), lr_params
     # yield 'MS', MS(newLR()), lr_params
     # yield 'MS2', MS2(newLR()), lr_params
@@ -92,7 +95,8 @@ def quantification_ensembles():

 def run(experiment):
     optim_loss, dataset_name, (model_name, model, hyperparams) = experiment

+    if dataset_name == 'imdb':
+        return
+
     data = qp.datasets.fetch_reviews(dataset_name, tfidf=True, min_df=5)
     run=0
@@ -127,7 +131,7 @@ def run(experiment):
         test=data.test,
         sample_size=SAMPLE_SIZE,
         n_prevpoints=21,
-        n_repetitions=100,
+        n_repetitions=1,
         n_jobs=-1 if isinstance(model, qp.method.meta.Ensemble) else 1,
         verbose=True
     )
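Editor's note: for orientation, the (model_name, model, hyperparams) triples yielded by quantification_models() are presumably consumed by a driver roughly like the following sketch (not this repository's actual code; GridSearchQ is QuaPy's quantification-oriented model selection):

    from quapy.model_selection import GridSearchQ

    def fit_with_model_selection(model, hyperparams, training, optim_loss):
        # a None grid (as for 'ds' above) means the model is fit without model selection
        if hyperparams is not None:
            model = GridSearchQ(model, param_grid=hyperparams, sample_size=SAMPLE_SIZE,
                                error=optim_loss, refit=True)
        return model.fit(training)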

View File

@@ -1,8 +1,11 @@
+from sklearn.svm import LinearSVC
 from class_weight_model import ClassWeightPCC
 # from classification.methods import LowRankLogisticRegression
 # from method.experimental import ExpMax, VarExpMax
 from common import *
 from method.meta import QuaNet
+from quantification_stumps_model import QuantificationStumpRegressor
 from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, SVMAE, HDy
 from quapy.method.meta import EHDy
 import numpy as np
@@ -36,6 +39,10 @@ def quantification_models():
     yield 'pcc.opt', PCC(newLR()), lr_params
     yield 'pacc.opt', PACC(newLR()), lr_params
     yield 'wpacc.opt', ClassWeightPCC(), lr_params
+    yield 'ds.opt', QuantificationStumpRegressor(SAMPLE_SIZE), {'C': __C_range}
+    # yield 'pcc.opt.svm', PCC(LinearSVC()), lr_params
+    # yield 'pacc.opt.svm', PACC(LinearSVC()), lr_params
+    # yield 'wpacc.opt.svm', ClassWeightPCC(LinearSVC), lr_params
     # yield 'wpacc.opt2', ClassWeightPCC(C=__C_range), lr_params  # this cannot work in its current version (see notes in the class_weight_model.py file)
     # yield 'MAX', MAX(newLR()), lr_params
     # yield 'MS', MS(newLR()), lr_params
@@ -60,29 +67,29 @@ def quantification_models():
     # yield 'quanet', QuaNet(learner, SAMPLE_SIZE, checkpointdir=args.checkpointdir, device=device), lr_params


-def quantification_ensembles():
-    param_mod_sel = {
-        'sample_size': SAMPLE_SIZE,
-        'n_prevpoints': 21,
-        'n_repetitions': 5,
-        'refit': True,
-        'verbose': False
-    }
-    common = {
-        'size': 30,
-        'red_size': 15,
-        'max_sample_size': None,  # same as training set
-        'n_jobs': ENSEMBLE_N_JOBS,
-        'param_grid': lr_params,
-        'param_mod_sel': param_mod_sel,
-        'val_split': 0.4,
-        'min_pos': 5
-    }
-
-    # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
-    # will be skipped (by setting hyperparameters to None)
-    hyper_none = None
-    yield 'ehdymaeds', EHDy(newLR(), optim='mae', policy='ds', **common), hyper_none
+# def quantification_ensembles():
+#     param_mod_sel = {
+#         'sample_size': SAMPLE_SIZE,
+#         'n_prevpoints': 21,
+#         'n_repetitions': 5,
+#         'refit': True,
+#         'verbose': False
+#     }
+#     common = {
+#         'size': 30,
+#         'red_size': 15,
+#         'max_sample_size': None,  # same as training set
+#         'n_jobs': ENSEMBLE_N_JOBS,
+#         'param_grid': lr_params,
+#         'param_mod_sel': param_mod_sel,
+#         'val_split': 0.4,
+#         'min_pos': 5
+#     }
+#
+#     # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
+#     # will be skipped (by setting hyperparameters to None)
+#     hyper_none = None
+#     yield 'ehdymaeds', EHDy(newLR(), optim='mae', policy='ds', **common), hyper_none


 def run(experiment):
@@ -116,9 +123,6 @@ def run(experiment):
     model.fit(data.training)
     best_params = {}

-    if hasattr(model, "deploy"):
-        model.deploy()
-
     # model evaluation
     true_prevalences, estim_prevalences = qp.evaluation.artificial_prevalence_prediction(
         model,
@@ -130,9 +134,6 @@ def run(experiment):
     )
     test_true_prevalence = data.test.prevalence()

-    if hasattr(model, "deploy"):
-        model.deploy(False)
-
     evaluate_experiment(true_prevalences, estim_prevalences)
     save_results(args.results, dataset_name, model_name, run, optim_loss,
                  true_prevalences, estim_prevalences,
@@ -141,7 +142,7 @@ def run(experiment):

 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')
+    parser = argparse.ArgumentParser(description='Run experiments for UCI ML Quantification')
     parser.add_argument('results', metavar='RESULT_PATH', type=str,
                         help='path to the directory where to store the results')
     parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='./svm_perf_quantification',

View File

@@ -4,7 +4,8 @@ import pathlib
 import pickle
 from glob import glob
 import sys
-from uci_common import *
+from uci_experiments import *
+from uci_tables import METHODS
 from os.path import join
@@ -42,7 +43,7 @@ def plot_error_by_drift(methods, error_name, logscale=False, path=None):
         tr_prevs,
         n_bins=20,
         error_name=error_name,
-        show_std=False,
+        show_std=True,
         logscale=logscale,
         title=f'Quantification error as a function of distribution shift',
         savepath=path
@@ -54,7 +55,7 @@ def diagonal_plot(methods, error_name, path=None):
     if path is not None:
         path = join(path, f'diag_{error_name}')
     method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
-    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Positive', legend=True, show_std=False, savepath=f'{path}_pos.{plotext}')
+    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Positive', legend=True, show_std=True, savepath=f'{path}_pos.{plotext}')


 def binary_bias_global(methods, error_name, path=None):
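Editor's note: a hedged sketch of how these plotting helpers are presumably driven (the error list and output directory are assumptions, not shown in this diff):

    for error_name in ['mae']:  # hypothetical error list
        diagonal_plot(METHODS, error_name, path='./plots')
        plot_error_by_drift(METHODS, error_name, logscale=True, path='./plots')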

View File

@@ -18,16 +18,18 @@ makedirs(tables_path, exist_ok=True)
 qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE

 METHODS = [#'cc', 'acc',
-           'pcc',
-           'pacc',
-           'wpacc',
+           # 'pcc',
+           # 'pacc',
+           # 'wpacc',
            'pcc.opt',
            'pacc.opt',
            'wpacc.opt',
-           'wpacc.opt2',
+           'ds.opt',
+           # 'pcc.opt.svm',
+           # 'pacc.opt.svm',
+           # 'wpacc.opt.svm',
+           # 'wpacc.opt2',
            # 'MAX', 'MS', 'MS2',
            'sldc',
            # 'svmmae',