testing optimization threshold variants, not working
parent 896fa042d6
commit 9b2470c992
@@ -13,13 +13,11 @@ import os
 import pickle
 import itertools
 import argparse
-import torch
-import shutil
+from glob import glob
+import pandas as pd
 
 
 N_JOBS = -1
-CUDA_N_JOBS = 2
-ENSEMBLE_N_JOBS = -1
 
 qp.environ['SAMPLE_SIZE'] = 100
 
@@ -40,30 +38,23 @@ svmperf_params = {'classifier__C': __C_range}
 def quantification_models():
     yield 'acc', ACC(newLR()), lr_params
     yield 'T50', T50(newLR()), lr_params
-    yield 'X', X(newLR()), lr_params
-    yield 'MAX', MAX(newLR()), lr_params
+    #yield 'X', X(newLR()), lr_params
+    #yield 'MAX', MAX(newLR()), lr_params
     yield 'MS', MS(newLR()), lr_params
     yield 'MS2', MS2(newLR()), lr_params
 
 
-def evaluate_experiment(true_prevalences, estim_prevalences):
-    print('\nEvaluation Metrics:\n' + '=' * 22)
-    for eval_measure in [qp.error.mae, qp.error.mrae]:
-        err = eval_measure(true_prevalences, estim_prevalences)
-        print(f'\t{eval_measure.__name__}={err:.4f}')
-    print()
-
-
-def result_path(path, dataset_name, model_name, run, optim_loss):
-    return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')
+def result_path(path, dataset_name, model_name, optim_loss):
+    return os.path.join(path, f'{dataset_name}-{model_name}-{optim_loss}.pkl')
 
 
-def is_already_computed(dataset_name, model_name, run, optim_loss):
-    return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))
+def is_already_computed(dataset_name, model_name, optim_loss):
+    return os.path.exists(result_path(args.results, dataset_name, model_name, optim_loss))
 
 
-def save_results(dataset_name, model_name, run, optim_loss, *results):
-    rpath = result_path(args.results, dataset_name, model_name, run, optim_loss)
+def save_results(dataset_name, model_name, optim_loss, *results):
+    rpath = result_path(args.results, dataset_name, model_name, optim_loss)
     qp.util.create_parent_dir(rpath)
     with open(rpath, 'wb') as foo:
         pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)
@@ -73,45 +64,39 @@ def run(experiment):
     optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
     if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return
 
-    collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)
-    for run, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):
-        if is_already_computed(dataset_name, model_name, run=run, optim_loss=optim_loss):
-            print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5 already computed.')
-            continue
+    if is_already_computed(dataset_name, model_name, optim_loss=optim_loss):
+        print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} already computed.')
+        return
 
-        print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')
-        # model selection (hyperparameter optimization for a quantification-oriented loss)
-        train, test = data.train_test
-        train, val = train.split_stratified()
-        if hyperparams is not None:
-            model_selection = qp.model_selection.GridSearchQ(
-                deepcopy(model),
-                param_grid=hyperparams,
-                protocol=APP(val, n_prevalences=21, repeats=25),
-                error=optim_loss,
-                refit=True,
-                timeout=60*60,
-                verbose=True
-            )
-            model_selection.fit(data.training)
-            model = model_selection.best_model()
-            best_params = model_selection.best_params_
-        else:
-            model.fit(data.training)
-            best_params = {}
+    dataset = qp.datasets.fetch_UCIDataset(dataset_name)
 
-        # model evaluation
-        true_prevalences, estim_prevalences = qp.evaluation.prediction(
-            model,
-            protocol=APP(test, n_prevalences=21, repeats=100)
-        )
-        test_true_prevalence = data.test.prevalence()
+    print(f'running dataset={dataset_name} model={model_name} loss={optim_loss}')
+    # model selection (hyperparameter optimization for a quantification-oriented loss)
+    train, test = dataset.train_test
+    train, val = train.split_stratified()
+    if hyperparams is not None:
+        model_selection = qp.model_selection.GridSearchQ(
+            deepcopy(model),
+            param_grid=hyperparams,
+            protocol=APP(val, n_prevalences=21, repeats=25),
+            error=optim_loss,
+            refit=True,
+            timeout=60*60,
+            verbose=True
+        )
+        model_selection.fit(train)
+        model = model_selection.best_model()
+    else:
+        model.fit(dataset.training)
 
-        evaluate_experiment(true_prevalences, estim_prevalences)
-        save_results(dataset_name, model_name, run, optim_loss,
-                     true_prevalences, estim_prevalences,
-                     data.training.prevalence(), test_true_prevalence,
-                     best_params)
+    # model evaluation
+    true_prevalences, estim_prevalences = qp.evaluation.prediction(
+        model,
+        protocol=APP(test, n_prevalences=21, repeats=100)
+    )
+
+    mae = qp.error.mae(true_prevalences, estim_prevalences)
+    save_results(dataset_name, model_name, optim_loss, mae)
 
 
 if __name__ == '__main__':
@@ -133,4 +118,14 @@ if __name__ == '__main__':
     models = quantification_models()
     qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)
 
-    shutil.rmtree(args.checkpointdir, ignore_errors=True)
+    # open all results and show
+    df = pd.DataFrame(columns=('method', 'dataset', 'mae'))
+    for i, file in enumerate(glob(f'{args.results}/*.pkl')):
+        mae = float(pickle.load(open(file, 'rb'))[0])
+        *dataset, method, _ = file.split('/')[-1].split('-')
+        dataset = '-'.join(dataset)
+        df.loc[i] = [method, dataset, mae]
+
+    print(df.pivot_table(index='dataset', columns='method', values='mae'))
+
+
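Note on the filename round trip above: the unpacking `*dataset, method, _ = file.split('/')[-1].split('-')` only works because `result_path` joins its fields with '-', so everything left of the last two hyphen-separated fields is rejoined into the dataset name. A minimal sketch (the filename below is illustrative):

    # a dataset name that itself contains hyphens
    file = 'results/wine-q-red-MS2-mae.pkl'   # result_path('results', 'wine-q-red', 'MS2', 'mae')
    *dataset, method, _ = file.split('/')[-1].split('-')
    dataset = '-'.join(dataset)               # rejoins to 'wine-q-red'
    print(dataset, method)                    # -> wine-q-red MS2

This would break if a method name ever contained a hyphen, since only the dataset part absorbs the extra fields.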
@@ -104,7 +104,7 @@ def run(experiment):
             timeout=60*60,
             verbose=True
         )
-        model_selection.fit(data.training)
+        model_selection.fit(train)
         model = model_selection.best_model()
         best_params = model_selection.best_params_
     else:
@@ -168,7 +168,7 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
         :param instances: array-like of shape `(n_instances, n_features,)`
         :return: np.ndarray of shape `(n_instances,)` with label predictions
         """
-        return getattr(self, self._classifier_method())(instances)
+        return getattr(self.classifier, self._classifier_method())(instances)
 
     def _classifier_method(self):
         """
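For context, the corrected line dispatches by name to whichever prediction method `_classifier_method()` selects (e.g. 'predict' or 'predict_proba', depending on the aggregator) on the wrapped classifier, rather than on the quantifier itself. A standalone sketch of the same getattr-dispatch pattern, with an sklearn classifier used purely for illustration:

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X, y = np.random.randn(50, 3), np.random.randint(0, 2, 50)
    clf = LogisticRegression().fit(X, y)

    method_name = 'predict_proba'            # stands in for _classifier_method()'s return value
    outputs = getattr(clf, method_name)(X)   # equivalent to clf.predict_proba(X)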
@@ -1142,8 +1142,8 @@ class ThresholdOptimization(BinaryAggregativeQuantifier):
     def aggregate_with_threshold(self, classif_predictions, tpr, fpr, threshold):
         prevs_estim = np.mean(classif_predictions > threshold)
         if tpr - fpr != 0:
-            prevs_estim = np.clip((prevs_estim - fpr) / (tpr - fpr), 0, 1)
-        prevs_estim = np.array((1 - prevs_estim, prevs_estim))
+            prevs_estim = (prevs_estim - fpr) / (tpr - fpr)
+        prevs_estim = F.as_binary_prevalence(prevs_estim, clip_if_necessary=True)
         return prevs_estim
 
     def _compute_table(self, y, y_):
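The corrected estimate here inverts the identity p_obs = p*tpr + (1-p)*fpr, which relates the observed positive rate at a threshold to the true prevalence p in Forman-style threshold methods; clipping is deferred to `F.as_binary_prevalence(..., clip_if_necessary=True)`. A self-contained sketch of the same computation (the function name and the clipping behaviour attributed to clip_if_necessary are assumptions for illustration, not the QuaPy API):

    import numpy as np

    def corrected_prevalence(scores, threshold, tpr, fpr):
        p_obs = np.mean(scores > threshold)    # observed positive rate at this threshold
        if tpr - fpr != 0:
            p = (p_obs - fpr) / (tpr - fpr)    # invert p_obs = p*tpr + (1-p)*fpr
        else:
            p = p_obs                          # degenerate contingency table: keep the raw rate
        p = np.clip(p, 0, 1)                   # what clip_if_necessary is assumed to do
        return np.array([1 - p, p])            # (negative, positive) prevalence vector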
@@ -112,7 +112,7 @@ class GridSearchQ(BaseQuantifier):
             return predictions
 
         predictions, status, took = self._error_handler(job, cls_params)
-        self._sout(f'[classifier fit] hyperparams={cls_params} status={status} [took {took:.3f}s]')
+        self._sout(f'[classifier fit] hyperparams={cls_params} [took {took:.3f}s]')
         return model, predictions, status, took
 
     def _prepare_aggregation(self, args):