forked from moreo/QuaPy
method MS2 (Median Sweep 2) fixed
This commit is contained in:
parent b68b58ad11
commit 8d22ba39f4
@ -1,134 +0,0 @@
(file removed by this commit; its full content is reproduced below)

from copy import deepcopy

import quapy as qp
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from quapy.classification.methods import LowRankLogisticRegression
from quapy.method.meta import QuaNet
from quapy.protocol import APP
from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE, T50, X
from quapy.method.meta import EHDy
import numpy as np
import os
import pickle
import itertools
import argparse
from glob import glob
import pandas as pd
from time import time

N_JOBS = -1

qp.environ['SAMPLE_SIZE'] = 100


def newLR():
    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)


def calibratedLR():
    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))


__C_range = np.logspace(-3, 3, 7)
lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
svmperf_params = {'classifier__C': __C_range}


def quantification_models():
    # (name, quantifier, hyperparameter grid) triplets to be evaluated
    yield 'acc', ACC(newLR()), lr_params
    yield 'T50', T50(newLR()), lr_params
    yield 'X', X(newLR()), lr_params
    yield 'MAX', MAX(newLR()), lr_params
    yield 'MS', MS(newLR()), lr_params
    yield 'MS+', MS(newLR()), lr_params
    # yield 'MS2', MS2(newLR()), lr_params


def result_path(path, dataset_name, model_name, optim_loss):
    return os.path.join(path, f'{dataset_name}-{model_name}-{optim_loss}.pkl')


def is_already_computed(dataset_name, model_name, optim_loss):
    return os.path.exists(result_path(args.results, dataset_name, model_name, optim_loss))


def save_results(dataset_name, model_name, optim_loss, *results):
    rpath = result_path(args.results, dataset_name, model_name, optim_loss)
    qp.util.create_parent_dir(rpath)
    with open(rpath, 'wb') as foo:
        pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)


def run(experiment):
    optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
    if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return

    if is_already_computed(dataset_name, model_name, optim_loss=optim_loss):
        print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} already computed.')
        return

    dataset = qp.datasets.fetch_UCIDataset(dataset_name)

    print(f'running dataset={dataset_name} model={model_name} loss={optim_loss}')
    # model selection (hyperparameter optimization for a quantification-oriented loss)
    train, test = dataset.train_test
    train, val = train.split_stratified()
    if hyperparams is not None:
        model_selection = qp.model_selection.GridSearchQ(
            deepcopy(model),
            param_grid=hyperparams,
            protocol=APP(val, n_prevalences=21, repeats=25),
            error=optim_loss,
            refit=True,
            timeout=60*60,
            verbose=True
        )
        model_selection.fit(train)
        model = model_selection.best_model()
    else:
        model.fit(dataset.training)

    # model evaluation
    true_prevalences, estim_prevalences = qp.evaluation.prediction(
        model,
        protocol=APP(test, n_prevalences=21, repeats=100)
    )

    mae = qp.error.mae(true_prevalences, estim_prevalences)
    save_results(dataset_name, model_name, optim_loss, mae)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run experiments for UCI dataset quantification')
    parser.add_argument('--results', metavar='RESULT_PATH', type=str, default='results_tmp',
                        help='path to the directory where to store the results')
    parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
                        help='path to the directory with svmperf')
    args = parser.parse_args()

    print(f'Result folder: {args.results}')
    np.random.seed(0)

    qp.environ['SVMPERF_HOME'] = args.svmperfpath

    optim_losses = ['mae']
    datasets = qp.datasets.UCI_DATASETS

    tstart = time()
    models = quantification_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)
    tend = time()

    # open all results and show
    df = pd.DataFrame(columns=('method', 'dataset', 'mae'))
    for i, file in enumerate(glob(f'{args.results}/*.pkl')):
        mae = float(pickle.load(open(file, 'rb'))[0])
        *dataset, method, _ = file.split('/')[-1].split('-')
        dataset = '-'.join(dataset)
        df.loc[i] = [method, dataset, mae]

    print(df.pivot_table(index='dataset', columns='method', values='mae', margins=True))

    print(f'took {(tend-tstart)}s')
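For orientation, the removed script essentially automates the following workflow for every (method, dataset) pair. This is a minimal sketch built only from calls that already appear in the script above; the dataset name 'yeast' is illustrative and the GridSearchQ model selection step is omitted:

# minimal sketch of one run() iteration without hyperparameter search
import quapy as qp
from quapy.protocol import APP
from quapy.method.aggregative import MS
from sklearn.linear_model import LogisticRegression

qp.environ['SAMPLE_SIZE'] = 100

dataset = qp.datasets.fetch_UCIDataset('yeast')   # illustrative dataset name
train, test = dataset.train_test

model = MS(LogisticRegression(max_iter=1000, solver='lbfgs'))
model.fit(train)

# evaluate under the artificial prevalence protocol (APP) and report mean absolute error
true_prevs, estim_prevs = qp.evaluation.prediction(model, protocol=APP(test, n_prevalences=21, repeats=100))
print('MAE =', qp.error.mae(true_prevs, estim_prevs))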
@ -1131,21 +1131,15 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
         if len(candidates) == 0:
             # if no candidate gives rise to a valid combination of tpr and fpr, this method defaults to the standard
             # classify & count; this is akin to assign tpr=1, fpr=0, threshold=0
-            tpr, fpr, threshold, score = 1, 0, 0, 0
-            candidates.append([tpr, fpr, threshold, score])
+            tpr, fpr, threshold = 1, 0, 0
+            candidates.append([tpr, fpr, threshold])
+            scores.append(0)

         candidates = np.asarray(candidates)
         candidates = candidates[np.argsort(scores)]  # sort candidates by candidate_score

         return candidates

-    # def aggregate_with_threshold(self, classif_predictions, tpr, fpr, threshold):
-    #     prevs_estim = np.mean(classif_predictions >= threshold)
-    #     if tpr - fpr != 0:
-    #         prevs_estim = (prevs_estim - fpr) / (tpr - fpr)
-    #     prevs_estim = F.as_binary_prevalence(prevs_estim, clip_if_necessary=True)
-    #     return prevs_estim
-
     def aggregate_with_threshold(self, classif_predictions, tprs, fprs, thresholds):
         prevs_estims = np.mean(classif_predictions[:, None] >= thresholds, axis=0)
         prevs_estims = (prevs_estims - fprs) / (tprs - fprs)
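The retained aggregate_with_threshold is vectorized over all candidate thresholds at once: broadcasting classif_predictions[:, None] >= thresholds yields one classify & count estimate per threshold, each of which is then corrected with its own (tpr, fpr) pair. The following plain-NumPy sketch uses illustrative values (it is not code from the repository); it also shows why the fallback candidate above, tpr=1, fpr=0, threshold=0, is equivalent to plain classify & count:

import numpy as np

# posterior scores for the positive class, one per test instance (illustrative values)
scores = np.array([0.1, 0.4, 0.6, 0.9])

# three candidate thresholds with their estimated tpr/fpr (illustrative values)
thresholds = np.array([0.25, 0.5, 0.75])
tprs       = np.array([0.90, 0.80, 0.60])
fprs       = np.array([0.30, 0.10, 0.05])

# one classify & count estimate per threshold (fraction of instances above it)
cc = np.mean(scores[:, None] >= thresholds, axis=0)

# threshold-wise adjusted count: (cc - fpr) / (tpr - fpr)
adjusted = (cc - fprs) / (tprs - fprs)
print(cc, adjusted)

# with tpr=1, fpr=0 the correction (cc - 0) / (1 - 0) is the identity, i.e. plain classify & count
assert np.isclose((cc[0] - 0.0) / (1.0 - 0.0), cc[0])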
@ -1286,13 +1280,9 @@ class MS(ThresholdOptimization):

     def aggregate(self, classif_predictions: np.ndarray):
         prevalences = self.aggregate_with_threshold(classif_predictions, self.tprs, self.fprs, self.thresholds)
-        return np.median(prevalences, axis=0)
-        # prevalences = []
-        # for tpr, fpr, threshold in self.tprs_fprs_thresholds:
-        #     pos_prev = self.aggregate_with_threshold(classif_predictions, tpr, fpr, threshold)[1]
-        #     prevalences.append(pos_prev)
-        # median = np.median(prevalences)
-        # return F.as_binary_prevalence(median)
+        if prevalences.ndim == 2:
+            prevalences = np.median(prevalences, axis=0)
+        return prevalences


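After this change, MS.aggregate receives one adjusted estimate per candidate threshold (a 2-d array when the estimates are full prevalence vectors, hence the ndim == 2 check) and reports the median over thresholds, which is the Median Sweep rule. A rough sketch of the idea, continuing the illustrative values from the previous sketch:

import numpy as np

# adjusted positive-prevalence estimates, one per candidate threshold (illustrative values)
pos_prevs = np.array([0.75, 0.571, 0.364])
pos_prevs = np.clip(pos_prevs, 0, 1)   # invalid corrections are clipped into [0, 1]

# Median Sweep: take the median across thresholds and report a (negative, positive) pair
pos = np.median(pos_prevs)
print(np.array([1 - pos, pos]))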
 class MS2(MS):
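For context on the class this commit fixes: MS2 (Median Sweep 2) differs from MS only in which candidate thresholds enter the median. Following Forman's formulation, it typically keeps only thresholds whose estimated tpr - fpr exceeds 0.25, so the denominator of the adjustment never gets close to zero. A hedged sketch of that selection step (illustrative values and names, not this repository's exact hook):

import numpy as np

# candidate statistics collected during the sweep (illustrative values)
tprs = np.array([0.90, 0.80, 0.60, 0.52])
fprs = np.array([0.30, 0.10, 0.05, 0.45])

# MS2-style selection: keep only candidates with a sufficiently wide tpr-fpr margin
keep = (tprs - fprs) > 0.25
print(keep)  # the last candidate (margin 0.07) would be excluded from the median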