method MS2 (Median Sweep 2) fixed

2024-01-19 18:11:22 +01:00 · 2024-01-19 18:11:22 +01:00 · 8d22ba39f4
parent b68b58ad11
commit 8d22ba39f4
2 changed files with 6 additions and 150 deletions
--- a/examples/_uci_experiments_checking_optim_threshold_modifications.py
+++ b/examples/_uci_experiments_checking_optim_threshold_modifications.py
@@ -1,134 +0,0 @@
-from copy import deepcopy
-
-import quapy as qp
-from sklearn.calibration import CalibratedClassifierCV
-from sklearn.linear_model import LogisticRegression
-from quapy.classification.methods import LowRankLogisticRegression
-from quapy.method.meta import QuaNet
-from quapy.protocol import APP
-from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE, T50, X
-from quapy.method.meta import EHDy
-import numpy as np
-import os
-import pickle
-import itertools
-import argparse
-from glob import glob
-import pandas as pd
-from time import time
-
-N_JOBS = -1
-
-qp.environ['SAMPLE_SIZE'] = 100
-
-
-def newLR():
-    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
-
-
-def calibratedLR():
-    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))
-
-
-__C_range = np.logspace(-3, 3, 7)
-lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
-svmperf_params = {'classifier__C': __C_range}
-
-
-def quantification_models():
-    yield 'acc', ACC(newLR()), lr_params
-    yield 'T50', T50(newLR()), lr_params
-    yield 'X', X(newLR()), lr_params
-    yield 'MAX', MAX(newLR()), lr_params
-    yield 'MS', MS(newLR()), lr_params
-    yield 'MS+', MS(newLR()), lr_params
-    # yield 'MS2', MS2(newLR()), lr_params
-
-
-
-def result_path(path, dataset_name, model_name, optim_loss):
-    return os.path.join(path, f'{dataset_name}-{model_name}-{optim_loss}.pkl')
-
-
-def is_already_computed(dataset_name, model_name, optim_loss):
-    return os.path.exists(result_path(args.results, dataset_name, model_name, optim_loss))
-
-
-def save_results(dataset_name, model_name, optim_loss, *results):
-    rpath = result_path(args.results, dataset_name, model_name, optim_loss)
-    qp.util.create_parent_dir(rpath)
-    with open(rpath, 'wb') as foo:
-        pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)
-
-
-def run(experiment):
-    optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
-    if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return
-
-    if is_already_computed(dataset_name, model_name, optim_loss=optim_loss):
-        print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} already computed.')
-        return
-
-    dataset = qp.datasets.fetch_UCIDataset(dataset_name)
-
-    print(f'running dataset={dataset_name} model={model_name} loss={optim_loss}')
-    # model selection (hyperparameter optimization for a quantification-oriented loss)
-    train, test = dataset.train_test
-    train, val = train.split_stratified()
-    if hyperparams is not None:
-        model_selection = qp.model_selection.GridSearchQ(
-            deepcopy(model),
-            param_grid=hyperparams,
-            protocol=APP(val, n_prevalences=21, repeats=25),
-            error=optim_loss,
-            refit=True,
-            timeout=60*60,
-            verbose=True
-        )
-        model_selection.fit(train)
-        model = model_selection.best_model()
-    else:
-        model.fit(dataset.training)
-
-    # model evaluation
-    true_prevalences, estim_prevalences = qp.evaluation.prediction(
-        model,
-        protocol=APP(test, n_prevalences=21, repeats=100)
-    )
-
-    mae = qp.error.mae(true_prevalences, estim_prevalences)
-    save_results(dataset_name, model_name, optim_loss, mae)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')
-    parser.add_argument('--results', metavar='RESULT_PATH', type=str, default='results_tmp',
-                        help='path to the directory where to store the results')
-    parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
-                        help='path to the directory with svmperf')
-    args = parser.parse_args()
-
-    print(f'Result folder: {args.results}')
-    np.random.seed(0)
-
-    qp.environ['SVMPERF_HOME'] = args.svmperfpath
-
-    optim_losses = ['mae']
-    datasets = qp.datasets.UCI_DATASETS
-
-    tstart = time()
-    models = quantification_models()
-    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)
-    tend = time()
-
-    # open all results and show
-    df = pd.DataFrame(columns=('method', 'dataset', 'mae'))
-    for i, file in enumerate(glob(f'{args.results}/*.pkl')):
-        mae = float(pickle.load(open(file, 'rb'))[0])
-        *dataset, method, _ = file.split('/')[-1].split('-')
-        dataset = '-'.join(dataset)
-        df.loc[i] = [method, dataset, mae]
-
-    print(df.pivot_table(index='dataset', columns='method', values='mae', margins=True))
-
-    print(f'took {(tend-tstart)}s')
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@@ -1131,21 +1131,15 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
        if len(candidates) == 0:
            # if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard
            # classify & count; this is akin to assign tpr=1, fpr=0, threshold=0
-            tpr, fpr, threshold, score = 1, 0, 0, 0
-            candidates.append([tpr, fpr, threshold, score])
+            tpr, fpr, threshold = 1, 0, 0
+            candidates.append([tpr, fpr, threshold])
+            scores.append(0)

        candidates = np.asarray(candidates)
        candidates = candidates[np.argsort(scores)]  # sort candidates by candidate_score

        return candidates

-    # def aggregate_with_threshold(self, classif_predictions, tpr, fpr, threshold):
-    #     prevs_estim = np.mean(classif_predictions >= threshold)
-    #     if tpr - fpr != 0:
-    #         prevs_estim = (prevs_estim - fpr) / (tpr - fpr)
-    #     prevs_estim = F.as_binary_prevalence(prevs_estim, clip_if_necessary=True)
-    #     return prevs_estim
-
    def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds):
        prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0)
        prevs_estims = (prevs_estims - fprs) / (tprs - fprs)
@@ -1286,13 +1280,9 @@ class MS(ThresholdOptimization):

    def aggregate(self, classif_predictions: np.ndarray):
        prevalences = self.aggregate_with_threshold(classif_predictions, self.tprs, self.fprs, self.thresholds)
-        return np.median(prevalences, axis=0)
-        # prevalences = []
-        # for tpr, fpr, threshold in self.tprs_fprs_thresholds:
-        #     pos_prev = self.aggregate_with_threshold(classif_predictions, tpr, fpr, threshold)[1]
-        #     prevalences.append(pos_prev)
-        # median = np.median(prevalences)
-        # return F.as_binary_prevalence(median)
+        if prevalences.ndim==2:
+            prevalences = np.median(prevalences, axis=0)
+        return prevalences


 class MS2(MS):