adding uci_experiments to examples folder

Alejandro Moreo Fernandez 2023-03-23 15:46:03 +01:00
parent 4904475d26
commit 67906f6f2d
7 changed files with 183 additions and 20 deletions

View File

@@ -1,3 +1,6 @@
+ensembles seem to be broken; they have an internal model selection which takes the parameters, but since quapy now
+works with protocols it would need to know the validation set in order to pass something like
+"protocol: APP(val, etc.)"
 sample_size should not be mandatory when qp.environ['SAMPLE_SIZE'] has been specified
 clean all the cumbersome methods that have to be implemented for new quantifiers (e.g., n_classes_ prop, etc.)
 make truly parallel the GridSearchQ
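
For context, the protocol-based model selection that the first note refers to (and that Ensemble cannot currently be handed) looks roughly as follows; a minimal sketch against quapy's public API at this commit, not part of the commit itself:

    import quapy as qp
    from quapy.protocol import APP
    from quapy.method.aggregative import PACC
    from sklearn.linear_model import LogisticRegression

    qp.environ['SAMPLE_SIZE'] = 100  # APP samples need a size (cf. the second note above)
    train, val = qp.datasets.fetch_UCIDataset('yeast').training.split_stratified()
    model = qp.model_selection.GridSearchQ(
        PACC(LogisticRegression()),
        param_grid={'classifier__C': [0.1, 1, 10]},
        protocol=APP(val),  # this is the "protocol: APP(val, etc.)" an Ensemble would need to receive
        error='mae',
    ).fit(train).best_model()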

examples/uci_experiments.py (new file, 152 lines)
View File

@@ -0,0 +1,152 @@
from copy import deepcopy
import quapy as qp
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from quapy.classification.methods import LowRankLogisticRegression
from quapy.method.meta import QuaNet
from quapy.protocol import APP
from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE
from quapy.method.meta import EHDy
import numpy as np
import os
import pickle
import itertools
import argparse
import torch
import shutil


N_JOBS = -1
CUDA_N_JOBS = 2
ENSEMBLE_N_JOBS = -1

qp.environ['SAMPLE_SIZE'] = 100

def newLR():
    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)


def calibratedLR():
    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))


__C_range = np.logspace(-3, 3, 7)
lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
svmperf_params = {'classifier__C': __C_range}

def quantification_models():
    yield 'cc', CC(newLR()), lr_params
    yield 'acc', ACC(newLR()), lr_params
    yield 'pcc', PCC(newLR()), lr_params
    yield 'pacc', PACC(newLR()), lr_params
    yield 'MAX', MAX(newLR()), lr_params
    yield 'MS', MS(newLR()), lr_params
    yield 'MS2', MS2(newLR()), lr_params
    yield 'sldc', EMQ(newLR(), recalib='platt'), lr_params
    yield 'svmmae', newSVMAE(), svmperf_params
    yield 'hdy', HDy(newLR()), lr_params

def quantification_cuda_models():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Running QuaNet in {device}')
    learner = LowRankLogisticRegression()
    # note: relies on the module-level `args`, which is set in __main__ before this generator is consumed
    yield 'quanet', QuaNet(learner, checkpointdir=args.checkpointdir, device=device), lr_params

def evaluate_experiment(true_prevalences, estim_prevalences):
    print('\nEvaluation Metrics:\n' + '=' * 22)
    for eval_measure in [qp.error.mae, qp.error.mrae]:
        err = eval_measure(true_prevalences, estim_prevalences)
        print(f'\t{eval_measure.__name__}={err:.4f}')
    print()


def result_path(path, dataset_name, model_name, run, optim_loss):
    return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')


def is_already_computed(dataset_name, model_name, run, optim_loss):
    return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))


def save_results(dataset_name, model_name, run, optim_loss, *results):
    rpath = result_path(args.results, dataset_name, model_name, run, optim_loss)
    qp.util.create_parent_dir(rpath)
    with open(rpath, 'wb') as foo:
        pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)

def run(experiment):
    optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
    # some small/degenerate datasets are excluded from this example
    if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return

    collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)
    for run, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):
        if is_already_computed(dataset_name, model_name, run=run, optim_loss=optim_loss):
            print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5 already computed.')
            continue

        print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')

        # model selection (hyperparameter optimization for a quantification-oriented loss)
        train, test = data.train_test
        train, val = train.split_stratified()
        if hyperparams is not None:
            model_selection = qp.model_selection.GridSearchQ(
                deepcopy(model),
                param_grid=hyperparams,
                protocol=APP(val, n_prevalences=21, repeats=25),
                error=optim_loss,
                refit=True,
                timeout=60*60,
                verbose=True
            )
            model_selection.fit(data.training)
            model = model_selection.best_model()
            best_params = model_selection.best_params_
        else:
            model.fit(data.training)
            best_params = {}

        # model evaluation
        true_prevalences, estim_prevalences = qp.evaluation.prediction(
            model,
            protocol=APP(test, n_prevalences=21, repeats=100)
        )
        test_true_prevalence = data.test.prevalence()

        evaluate_experiment(true_prevalences, estim_prevalences)
        save_results(dataset_name, model_name, run, optim_loss,
                     true_prevalences, estim_prevalences,
                     data.training.prevalence(), test_true_prevalence,
                     best_params)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run experiments for UCI ML Quantification')
    parser.add_argument('results', metavar='RESULT_PATH', type=str,
                        help='path to the directory where to store the results')
    parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
                        help='path to the directory with svmperf')
    parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',
                        help='path to the directory where to dump QuaNet checkpoints')
    args = parser.parse_args()

    print(f'Result folder: {args.results}')
    np.random.seed(0)

    qp.environ['SVMPERF_HOME'] = args.svmperfpath

    optim_losses = ['mae']
    datasets = qp.datasets.UCI_DATASETS[:4]

    models = quantification_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)

    models = quantification_cuda_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS)

    shutil.rmtree(args.checkpointdir, ignore_errors=True)
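
The script is invoked with a results directory, e.g. python uci_experiments.py ./results --svmperfpath ../svm_perf_quantification. A sketch of how a stored result can be read back (the tuple order follows save_results above; the path is illustrative):

    import pickle

    # result_path(...) format: {dataset}-{model}-run{run}-{loss}.pkl (hypothetical dataset/model below)
    with open('./results/yeast-cc-run0-mae.pkl', 'rb') as f:
        true_prevs, estim_prevs, train_prev, test_prev, best_params = pickle.load(f)
    print(best_params)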

View File

@@ -19,17 +19,16 @@ class LowRankLogisticRegression(BaseEstimator):
     def __init__(self, n_components=100, **kwargs):
         self.n_components = n_components
-        self.learner = LogisticRegression(**kwargs)
+        self.classifier = LogisticRegression(**kwargs)

-    def get_params(self, deep=True):
+    def get_params(self):
         """
         Get hyper-parameters for this estimator.

-        :param deep: compatibility with sklearn
         :return: a dictionary with parameter names mapped to their values
         """
         params = {'n_components': self.n_components}
-        params.update(self.learner.get_params(deep))
+        params.update(self.classifier.get_params())
         return params

     def set_params(self, **params):
@@ -44,7 +43,7 @@ class LowRankLogisticRegression(BaseEstimator):
         if 'n_components' in params_:
             self.n_components = params_['n_components']
             del params_['n_components']
-        self.learner.set_params(**params_)
+        self.classifier.set_params(**params_)

     def fit(self, X, y):
         """
@@ -60,8 +59,8 @@ class LowRankLogisticRegression(BaseEstimator):
         if nF > self.n_components:
             self.pca = TruncatedSVD(self.n_components).fit(X)
         X = self.transform(X)
-        self.learner.fit(X, y)
-        self.classes_ = self.learner.classes_
+        self.classifier.fit(X, y)
+        self.classes_ = self.classifier.classes_
         return self

     def predict(self, X):
@@ -73,7 +72,7 @@ class LowRankLogisticRegression(BaseEstimator):
         instances in `X`
         """
         X = self.transform(X)
-        return self.learner.predict(X)
+        return self.classifier.predict(X)

     def predict_proba(self, X):
         """
@@ -83,7 +82,7 @@ class LowRankLogisticRegression(BaseEstimator):
         :return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities
         """
         X = self.transform(X)
-        return self.learner.predict_proba(X)
+        return self.classifier.predict_proba(X)

     def transform(self, X):
         """

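In use, the renamed attribute keeps the sklearn-style parameter routing; a small sketch (not part of the commit):

    from quapy.classification.methods import LowRankLogisticRegression

    lrlr = LowRankLogisticRegression(n_components=100)
    lrlr.set_params(n_components=50, C=10)  # 'n_components' is kept; 'C' is routed to the inner LogisticRegression
    print(lrlr.get_params()['n_components'])  # -> 50; note get_params() no longer takes a `deep` argument
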
View File

@@ -207,7 +207,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False
     return Dataset(*data.split_stratified(1 - test_split, random_state=0))

-def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
+def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
     """
     Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
     `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
@@ -223,7 +223,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
     >>> import quapy as qp
     >>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
     >>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
     >>>     ...

     The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
@@ -233,7 +233,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
     ~/quay_data/ directory)
     :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
     :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
-    :return: a :class:`quapy.data.base.Dataset` instance
+    :return: a :class:`quapy.data.base.LabelledCollection` instance
     """
     assert dataset_name in UCI_DATASETS, \

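A quick check of the corrected signature (a sketch, not part of the commit):

    import quapy as qp

    collection = qp.datasets.fetch_UCILabelledCollection('yeast')
    print(type(collection).__name__)  # LabelledCollection, as the fixed annotation now states
    print(collection.prevalence())    # class prevalences of the full collection
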
View File

@@ -444,24 +444,28 @@ class EMQ(AggregativeProbabilisticQuantifier):
     def __init__(self, classifier: BaseEstimator, exact_train_prev=True, recalib=None):
         self.classifier = classifier
+        self.non_calibrated = classifier
         self.exact_train_prev = exact_train_prev
         self.recalib = recalib

     def fit(self, data: LabelledCollection, fit_classifier=True):
         if self.recalib is not None:
             if self.recalib == 'nbvs':
-                self.classifier = NBVSCalibration(self.classifier)
+                self.classifier = NBVSCalibration(self.non_calibrated)
             elif self.recalib == 'bcts':
-                self.classifier = BCTSCalibration(self.classifier)
+                self.classifier = BCTSCalibration(self.non_calibrated)
             elif self.recalib == 'ts':
-                self.classifier = TSCalibration(self.classifier)
+                self.classifier = TSCalibration(self.non_calibrated)
             elif self.recalib == 'vs':
-                self.classifier = VSCalibration(self.classifier)
+                self.classifier = VSCalibration(self.non_calibrated)
+            elif self.recalib == 'platt':
+                self.classifier = CalibratedClassifierCV(self.classifier, ensemble=False)
             else:
                 raise ValueError('invalid param argument for recalibration method; available ones are '
                                  '"nbvs", "bcts", "ts", and "vs".')
+            self.recalib = None
+        else:
+            self.classifier = self.non_calibrated
         self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
         if self.exact_train_prev:
             self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)

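The new option in use; a minimal sketch (training data assumed to be a quapy LabelledCollection):

    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import EMQ

    emq = EMQ(LogisticRegression(), recalib='platt')
    # fit() now wraps the raw classifier as CalibratedClassifierCV(..., ensemble=False);
    # the new non_calibrated attribute preserves the raw classifier so fit() never wraps an already-wrapped one
    # emq.fit(train)  # train: a LabelledCollection
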
View File

@@ -9,6 +9,7 @@ from torch.nn.functional import relu
 from quapy.protocol import UPP
 from quapy.method.aggregative import *
 from quapy.util import EarlyStop
+from tqdm import tqdm

 class QuaNetTrainer(BaseQuantifier):
@@ -28,7 +29,7 @@ class QuaNetTrainer(BaseQuantifier):
     >>>
     >>> # load the kindle dataset as text, and convert words to numerical indexes
     >>> dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
     >>> qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
     >>>
     >>> # the text classifier is a CNN trained by NeuralClassifierTrainer
     >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
@@ -263,15 +264,19 @@ class QuaNetTrainer(BaseQuantifier):
                        f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')

     def get_params(self, deep=True):
-        return {**self.classifier.get_params(), **self.quanet_params}
+        classifier_params = self.classifier.get_params()
+        classifier_params = {'classifier__'+k:v for k,v in classifier_params.items()}
+        return {**classifier_params, **self.quanet_params}

     def set_params(self, **parameters):
         learner_params = {}
         for key, val in parameters.items():
             if key in self.quanet_params:
                 self.quanet_params[key] = val
+            elif key.startswith('classifier__'):
+                learner_params[key.replace('classifier__', '')] = val
             else:
-                learner_params[key] = val
+                raise ValueError('unknown parameter ', key)
         self.classifier.set_params(**learner_params)

     def __check_params_colision(self, quanet_params, learner_params):

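With the new 'classifier__' prefix convention, classifier and QuaNet hyperparameters can no longer collide silently; a sketch with hypothetical parameter names:

    # quanet: a QuaNetTrainer instance
    quanet.set_params(lstm_hidden_size=128)  # a QuaNet parameter, consumed directly
    quanet.set_params(classifier__lr=1e-4)   # prefix stripped, then routed to the underlying classifier
    # quanet.set_params(foo=1)               # now raises ValueError('unknown parameter ', 'foo')
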
View File

@@ -56,7 +56,7 @@ class GridSearchQ(BaseQuantifier):
     def _sout(self, msg):
         if self.verbose:
-            print(f'[{self.__class__.__name__}]: {msg}')
+            print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')

     def __check_error(self, error):
         if error in qp.error.QUANTIFICATION_ERROR:
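
With this change, verbose traces identify the model under selection: a line that previously rendered as [GridSearchQ]: starting... now renders as, e.g., [GridSearchQ:PACC]: starting... (the model name shown here is illustrative).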