diff --git a/examples/_uci_experiments_checking_optim_threshold_modifications.py b/examples/_uci_experiments_checking_optim_threshold_modifications.py deleted file mode 100644 index 79f7208..0000000 --- a/examples/_uci_experiments_checking_optim_threshold_modifications.py +++ /dev/null @@ -1,134 +0,0 @@ -from copy import deepcopy - -import quapy as qp -from sklearn.calibration import CalibratedClassifierCV -from sklearn.linear_model import LogisticRegression -from quapy.classification.methods import LowRankLogisticRegression -from quapy.method.meta import QuaNet -from quapy.protocol import APP -from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE, T50, X -from quapy.method.meta import EHDy -import numpy as np -import os -import pickle -import itertools -import argparse -from glob import glob -import pandas as pd -from time import time - -N_JOBS = -1 - -qp.environ['SAMPLE_SIZE'] = 100 - - -def newLR(): - return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1) - - -def calibratedLR(): - return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)) - - -__C_range = np.logspace(-3, 3, 7) -lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']} -svmperf_params = {'classifier__C': __C_range} - - -def quantification_models(): - yield 'acc', ACC(newLR()), lr_params - yield 'T50', T50(newLR()), lr_params - yield 'X', X(newLR()), lr_params - yield 'MAX', MAX(newLR()), lr_params - yield 'MS', MS(newLR()), lr_params - yield 'MS+', MS(newLR()), lr_params - # yield 'MS2', MS2(newLR()), lr_params - - - -def result_path(path, dataset_name, model_name, optim_loss): - return os.path.join(path, f'{dataset_name}-{model_name}-{optim_loss}.pkl') - - -def is_already_computed(dataset_name, model_name, optim_loss): - return os.path.exists(result_path(args.results, dataset_name, model_name, optim_loss)) - - -def save_results(dataset_name, model_name, optim_loss, *results): - rpath = result_path(args.results, dataset_name, model_name, optim_loss) - qp.util.create_parent_dir(rpath) - with open(rpath, 'wb') as foo: - pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL) - - -def run(experiment): - optim_loss, dataset_name, (model_name, model, hyperparams) = experiment - if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return - - if is_already_computed(dataset_name, model_name, optim_loss=optim_loss): - print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} already computed.') - return - - dataset = qp.datasets.fetch_UCIDataset(dataset_name) - - print(f'running dataset={dataset_name} model={model_name} loss={optim_loss}') - # model selection (hyperparameter optimization for a quantification-oriented loss) - train, test = dataset.train_test - train, val = train.split_stratified() - if hyperparams is not None: - model_selection = qp.model_selection.GridSearchQ( - deepcopy(model), - param_grid=hyperparams, - protocol=APP(val, n_prevalences=21, repeats=25), - error=optim_loss, - refit=True, - timeout=60*60, - verbose=True - ) - model_selection.fit(train) - model = model_selection.best_model() - else: - model.fit(dataset.training) - - # model evaluation - true_prevalences, estim_prevalences = qp.evaluation.prediction( - model, - protocol=APP(test, n_prevalences=21, repeats=100) - ) - - mae = qp.error.mae(true_prevalences, estim_prevalences) - save_results(dataset_name, model_name, optim_loss, mae) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification') - parser.add_argument('--results', metavar='RESULT_PATH', type=str, default='results_tmp', - help='path to the directory where to store the results') - parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification', - help='path to the directory with svmperf') - args = parser.parse_args() - - print(f'Result folder: {args.results}') - np.random.seed(0) - - qp.environ['SVMPERF_HOME'] = args.svmperfpath - - optim_losses = ['mae'] - datasets = qp.datasets.UCI_DATASETS - - tstart = time() - models = quantification_models() - qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS) - tend = time() - - # open all results and show - df = pd.DataFrame(columns=('method', 'dataset', 'mae')) - for i, file in enumerate(glob(f'{args.results}/*.pkl')): - mae = float(pickle.load(open(file, 'rb'))[0]) - *dataset, method, _ = file.split('/')[-1].split('-') - dataset = '-'.join(dataset) - df.loc[i] = [method, dataset, mae] - - print(df.pivot_table(index='dataset', columns='method', values='mae', margins=True)) - - print(f'took {(tend-tstart)}s') diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index c6293c9..c3e24db 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -1131,21 +1131,15 @@ class ThresholdOptimization(BinaryAggregativeQuantifier): if len(candidates) == 0: # if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard # classify & count; this is akin to assign tpr=1, fpr=0, threshold=0 - tpr, fpr, threshold, score = 1, 0, 0, 0 - candidates.append([tpr, fpr, threshold, score]) + tpr, fpr, threshold = 1, 0, 0 + candidates.append([tpr, fpr, threshold]) + scores.append(0) candidates = np.asarray(candidates) candidates = candidates[np.argsort(scores)] # sort candidates by candidate_score return candidates - # def aggregate_with_threshold(self, classif_predictions, tpr, fpr, threshold): - # prevs_estim = np.mean(classif_predictions >= threshold) - # if tpr - fpr != 0: - # prevs_estim = (prevs_estim - fpr) / (tpr - fpr) - # prevs_estim = F.as_binary_prevalence(prevs_estim, clip_if_necessary=True) - # return prevs_estim - def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds): prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0) prevs_estims = (prevs_estims - fprs) / (tprs - fprs) @@ -1286,13 +1280,9 @@ class MS(ThresholdOptimization): def aggregate(self, classif_predictions: np.ndarray): prevalences = self.aggregate_with_threshold(classif_predictions, self.tprs, self.fprs, self.thresholds) - return np.median(prevalences, axis=0) - # prevalences = [] - # for tpr, fpr, threshold in self.tprs_fprs_thresholds: - # pos_prev = self.aggregate_with_threshold(classif_predictions, tpr, fpr, threshold)[1] - # prevalences.append(pos_prev) - # median = np.median(prevalences) - # return F.as_binary_prevalence(median) + if prevalences.ndim==2: + prevalences = np.median(prevalences, axis=0) + return prevalences class MS2(MS):