forked from moreo/QuaPy
adding uci experiments to the examples folder
This commit is contained in:
parent
4f1ac49030
commit
4e016c7596
|
@ -0,0 +1,152 @@
|
||||||
|
from copy import deepcopy
|
||||||
|
|
||||||
|
import quapy as qp
|
||||||
|
from sklearn.calibration import CalibratedClassifierCV
|
||||||
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
from classification.methods import LowRankLogisticRegression
|
||||||
|
from quapy.method.meta import QuaNet
|
||||||
|
from quapy.protocol import APP
|
||||||
|
from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE
|
||||||
|
from quapy.method.meta import EHDy
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
import pickle
|
||||||
|
import itertools
|
||||||
|
import argparse
|
||||||
|
import torch
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
|
||||||
|
N_JOBS = -1
|
||||||
|
CUDA_N_JOBS = 2
|
||||||
|
ENSEMBLE_N_JOBS = -1
|
||||||
|
|
||||||
|
qp.environ['SAMPLE_SIZE'] = 100
|
||||||
|
|
||||||
|
|
||||||
|
def newLR():
|
||||||
|
return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)
|
||||||
|
|
||||||
|
|
||||||
|
def calibratedLR():
|
||||||
|
return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))
|
||||||
|
|
||||||
|
|
||||||
|
__C_range = np.logspace(-3, 3, 7)
|
||||||
|
lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
|
||||||
|
svmperf_params = {'classifier__C': __C_range}
|
||||||
|
|
||||||
|
|
||||||
|
def quantification_models():
|
||||||
|
yield 'cc', CC(newLR()), lr_params
|
||||||
|
yield 'acc', ACC(newLR()), lr_params
|
||||||
|
yield 'pcc', PCC(newLR()), lr_params
|
||||||
|
yield 'pacc', PACC(newLR()), lr_params
|
||||||
|
yield 'MAX', MAX(newLR()), lr_params
|
||||||
|
yield 'MS', MS(newLR()), lr_params
|
||||||
|
yield 'MS2', MS2(newLR()), lr_params
|
||||||
|
yield 'sldc', EMQ(newLR(), recalib='platt'), lr_params
|
||||||
|
yield 'svmmae', newSVMAE(), svmperf_params
|
||||||
|
yield 'hdy', HDy(newLR()), lr_params
|
||||||
|
|
||||||
|
|
||||||
|
def quantification_cuda_models():
|
||||||
|
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||||
|
print(f'Running QuaNet in {device}')
|
||||||
|
learner = LowRankLogisticRegression()
|
||||||
|
yield 'quanet', QuaNet(learner, checkpointdir=args.checkpointdir, device=device), lr_params
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate_experiment(true_prevalences, estim_prevalences):
|
||||||
|
print('\nEvaluation Metrics:\n' + '=' * 22)
|
||||||
|
for eval_measure in [qp.error.mae, qp.error.mrae]:
|
||||||
|
err = eval_measure(true_prevalences, estim_prevalences)
|
||||||
|
print(f'\t{eval_measure.__name__}={err:.4f}')
|
||||||
|
print()
|
||||||
|
|
||||||
|
|
||||||
|
def result_path(path, dataset_name, model_name, run, optim_loss):
|
||||||
|
return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')
|
||||||
|
|
||||||
|
|
||||||
|
def is_already_computed(dataset_name, model_name, run, optim_loss):
|
||||||
|
return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))
|
||||||
|
|
||||||
|
|
||||||
|
def save_results(dataset_name, model_name, run, optim_loss, *results):
|
||||||
|
rpath = result_path(args.results, dataset_name, model_name, run, optim_loss)
|
||||||
|
qp.util.create_parent_dir(rpath)
|
||||||
|
with open(rpath, 'wb') as foo:
|
||||||
|
pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)
|
||||||
|
|
||||||
|
|
||||||
|
def run(experiment):
|
||||||
|
optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
|
||||||
|
if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return
|
||||||
|
|
||||||
|
collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)
|
||||||
|
for run, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):
|
||||||
|
if is_already_computed(dataset_name, model_name, run=run, optim_loss=optim_loss):
|
||||||
|
print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5 already computed.')
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')
|
||||||
|
# model selection (hyperparameter optimization for a quantification-oriented loss)
|
||||||
|
train, test = data.train_test
|
||||||
|
train, val = train.split_stratified()
|
||||||
|
if hyperparams is not None:
|
||||||
|
model_selection = qp.model_selection.GridSearchQ(
|
||||||
|
deepcopy(model),
|
||||||
|
param_grid=hyperparams,
|
||||||
|
protocol=APP(val, n_prevalences=21, repeats=25),
|
||||||
|
error=optim_loss,
|
||||||
|
refit=True,
|
||||||
|
timeout=60*60,
|
||||||
|
verbose=True
|
||||||
|
)
|
||||||
|
model_selection.fit(data.training)
|
||||||
|
model = model_selection.best_model()
|
||||||
|
best_params = model_selection.best_params_
|
||||||
|
else:
|
||||||
|
model.fit(data.training)
|
||||||
|
best_params = {}
|
||||||
|
|
||||||
|
# model evaluation
|
||||||
|
true_prevalences, estim_prevalences = qp.evaluation.prediction(
|
||||||
|
model,
|
||||||
|
protocol=APP(test, n_prevalences=21, repeats=100)
|
||||||
|
)
|
||||||
|
test_true_prevalence = data.test.prevalence()
|
||||||
|
|
||||||
|
evaluate_experiment(true_prevalences, estim_prevalences)
|
||||||
|
save_results(dataset_name, model_name, run, optim_loss,
|
||||||
|
true_prevalences, estim_prevalences,
|
||||||
|
data.training.prevalence(), test_true_prevalence,
|
||||||
|
best_params)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')
|
||||||
|
parser.add_argument('results', metavar='RESULT_PATH', type=str,
|
||||||
|
help='path to the directory where to store the results')
|
||||||
|
parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
|
||||||
|
help='path to the directory with svmperf')
|
||||||
|
parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',
|
||||||
|
help='path to the directory where to dump QuaNet checkpoints')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
print(f'Result folder: {args.results}')
|
||||||
|
np.random.seed(0)
|
||||||
|
|
||||||
|
qp.environ['SVMPERF_HOME'] = args.svmperfpath
|
||||||
|
|
||||||
|
optim_losses = ['mae']
|
||||||
|
datasets = qp.datasets.UCI_DATASETS[:4]
|
||||||
|
|
||||||
|
models = quantification_models()
|
||||||
|
qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)
|
||||||
|
|
||||||
|
models = quantification_cuda_models()
|
||||||
|
qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS)
|
||||||
|
|
||||||
|
shutil.rmtree(args.checkpointdir, ignore_errors=True)
|
|
@ -19,7 +19,7 @@ class LowRankLogisticRegression(BaseEstimator):
|
||||||
|
|
||||||
def __init__(self, n_components=100, **kwargs):
|
def __init__(self, n_components=100, **kwargs):
|
||||||
self.n_components = n_components
|
self.n_components = n_components
|
||||||
self.learner = LogisticRegression(**kwargs)
|
self.classifier = LogisticRegression(**kwargs)
|
||||||
|
|
||||||
def get_params(self):
|
def get_params(self):
|
||||||
"""
|
"""
|
||||||
|
@ -28,7 +28,7 @@ class LowRankLogisticRegression(BaseEstimator):
|
||||||
:return: a dictionary with parameter names mapped to their values
|
:return: a dictionary with parameter names mapped to their values
|
||||||
"""
|
"""
|
||||||
params = {'n_components': self.n_components}
|
params = {'n_components': self.n_components}
|
||||||
params.update(self.learner.get_params())
|
params.update(self.classifier.get_params())
|
||||||
return params
|
return params
|
||||||
|
|
||||||
def set_params(self, **params):
|
def set_params(self, **params):
|
||||||
|
@ -43,7 +43,7 @@ class LowRankLogisticRegression(BaseEstimator):
|
||||||
if 'n_components' in params_:
|
if 'n_components' in params_:
|
||||||
self.n_components = params_['n_components']
|
self.n_components = params_['n_components']
|
||||||
del params_['n_components']
|
del params_['n_components']
|
||||||
self.learner.set_params(**params_)
|
self.classifier.set_params(**params_)
|
||||||
|
|
||||||
def fit(self, X, y):
|
def fit(self, X, y):
|
||||||
"""
|
"""
|
||||||
|
@ -59,8 +59,8 @@ class LowRankLogisticRegression(BaseEstimator):
|
||||||
if nF > self.n_components:
|
if nF > self.n_components:
|
||||||
self.pca = TruncatedSVD(self.n_components).fit(X)
|
self.pca = TruncatedSVD(self.n_components).fit(X)
|
||||||
X = self.transform(X)
|
X = self.transform(X)
|
||||||
self.learner.fit(X, y)
|
self.classifier.fit(X, y)
|
||||||
self.classes_ = self.learner.classes_
|
self.classes_ = self.classifier.classes_
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def predict(self, X):
|
def predict(self, X):
|
||||||
|
@ -72,7 +72,7 @@ class LowRankLogisticRegression(BaseEstimator):
|
||||||
instances in `X`
|
instances in `X`
|
||||||
"""
|
"""
|
||||||
X = self.transform(X)
|
X = self.transform(X)
|
||||||
return self.learner.predict(X)
|
return self.classifier.predict(X)
|
||||||
|
|
||||||
def predict_proba(self, X):
|
def predict_proba(self, X):
|
||||||
"""
|
"""
|
||||||
|
@ -82,7 +82,7 @@ class LowRankLogisticRegression(BaseEstimator):
|
||||||
:return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities
|
:return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities
|
||||||
"""
|
"""
|
||||||
X = self.transform(X)
|
X = self.transform(X)
|
||||||
return self.learner.predict_proba(X)
|
return self.classifier.predict_proba(X)
|
||||||
|
|
||||||
def transform(self, X):
|
def transform(self, X):
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -207,7 +207,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False
|
||||||
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
return Dataset(*data.split_stratified(1 - test_split, random_state=0))
|
||||||
|
|
||||||
|
|
||||||
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
|
def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
|
||||||
"""
|
"""
|
||||||
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
|
Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
|
||||||
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
`Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
|
||||||
|
@ -233,7 +233,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
|
||||||
~/quay_data/ directory)
|
~/quay_data/ directory)
|
||||||
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
|
||||||
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
|
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
|
||||||
:return: a :class:`quapy.data.base.Dataset` instance
|
:return: a :class:`quapy.data.base.LabelledCollection` instance
|
||||||
"""
|
"""
|
||||||
|
|
||||||
assert dataset_name in UCI_DATASETS, \
|
assert dataset_name in UCI_DATASETS, \
|
||||||
|
|
|
@ -444,24 +444,28 @@ class EMQ(AggregativeProbabilisticQuantifier):
|
||||||
|
|
||||||
def __init__(self, classifier: BaseEstimator, exact_train_prev=True, recalib=None):
|
def __init__(self, classifier: BaseEstimator, exact_train_prev=True, recalib=None):
|
||||||
self.classifier = classifier
|
self.classifier = classifier
|
||||||
|
self.non_calibrated = classifier
|
||||||
self.exact_train_prev = exact_train_prev
|
self.exact_train_prev = exact_train_prev
|
||||||
self.recalib = recalib
|
self.recalib = recalib
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_classifier=True):
|
def fit(self, data: LabelledCollection, fit_classifier=True):
|
||||||
if self.recalib is not None:
|
if self.recalib is not None:
|
||||||
if self.recalib == 'nbvs':
|
if self.recalib == 'nbvs':
|
||||||
self.classifier = NBVSCalibration(self.classifier)
|
self.classifier = NBVSCalibration(self.non_calibrated)
|
||||||
elif self.recalib == 'bcts':
|
elif self.recalib == 'bcts':
|
||||||
self.classifier = BCTSCalibration(self.classifier)
|
self.classifier = BCTSCalibration(self.non_calibrated)
|
||||||
elif self.recalib == 'ts':
|
elif self.recalib == 'ts':
|
||||||
self.classifier = TSCalibration(self.classifier)
|
self.classifier = TSCalibration(self.non_calibrated)
|
||||||
elif self.recalib == 'vs':
|
elif self.recalib == 'vs':
|
||||||
self.classifier = VSCalibration(self.classifier)
|
self.classifier = VSCalibration(self.non_calibrated)
|
||||||
elif self.recalib == 'platt':
|
elif self.recalib == 'platt':
|
||||||
self.classifier = CalibratedClassifierCV(self.classifier, ensemble=False)
|
self.classifier = CalibratedClassifierCV(self.classifier, ensemble=False)
|
||||||
else:
|
else:
|
||||||
raise ValueError('invalid param argument for recalibration method; available ones are '
|
raise ValueError('invalid param argument for recalibration method; available ones are '
|
||||||
'"nbvs", "bcts", "ts", and "vs".')
|
'"nbvs", "bcts", "ts", and "vs".')
|
||||||
|
self.recalib = None
|
||||||
|
else:
|
||||||
|
self.classifier = self.non_calibrated
|
||||||
self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
|
self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
|
||||||
if self.exact_train_prev:
|
if self.exact_train_prev:
|
||||||
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
|
self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
|
||||||
|
|
|
@ -9,6 +9,7 @@ from torch.nn.functional import relu
|
||||||
from quapy.protocol import UPP
|
from quapy.protocol import UPP
|
||||||
from quapy.method.aggregative import *
|
from quapy.method.aggregative import *
|
||||||
from quapy.util import EarlyStop
|
from quapy.util import EarlyStop
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
|
||||||
class QuaNetTrainer(BaseQuantifier):
|
class QuaNetTrainer(BaseQuantifier):
|
||||||
|
@ -263,15 +264,19 @@ class QuaNetTrainer(BaseQuantifier):
|
||||||
f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')
|
f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')
|
||||||
|
|
||||||
def get_params(self, deep=True):
|
def get_params(self, deep=True):
|
||||||
return {**self.classifier.get_params(), **self.quanet_params}
|
classifier_params = self.classifier.get_params()
|
||||||
|
classifier_params = {'classifier__'+k:v for k,v in classifier_params.items()}
|
||||||
|
return {**classifier_params, **self.quanet_params}
|
||||||
|
|
||||||
def set_params(self, **parameters):
|
def set_params(self, **parameters):
|
||||||
learner_params = {}
|
learner_params = {}
|
||||||
for key, val in parameters.items():
|
for key, val in parameters.items():
|
||||||
if key in self.quanet_params:
|
if key in self.quanet_params:
|
||||||
self.quanet_params[key] = val
|
self.quanet_params[key] = val
|
||||||
|
elif key.startswith('classifier__'):
|
||||||
|
learner_params[key.replace('classifier__', '')] = val
|
||||||
else:
|
else:
|
||||||
learner_params[key] = val
|
raise ValueError('unknown parameter ', key)
|
||||||
self.classifier.set_params(**learner_params)
|
self.classifier.set_params(**learner_params)
|
||||||
|
|
||||||
def __check_params_colision(self, quanet_params, learner_params):
|
def __check_params_colision(self, quanet_params, learner_params):
|
||||||
|
|
|
@ -56,7 +56,7 @@ class GridSearchQ(BaseQuantifier):
|
||||||
|
|
||||||
def _sout(self, msg):
|
def _sout(self, msg):
|
||||||
if self.verbose:
|
if self.verbose:
|
||||||
print(f'[{self.__class__.__name__}]: {msg}')
|
print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')
|
||||||
|
|
||||||
def __check_error(self, error):
|
def __check_error(self, error):
|
||||||
if error in qp.error.QUANTIFICATION_ERROR:
|
if error in qp.error.QUANTIFICATION_ERROR:
|
||||||
|
|
Loading…
Reference in New Issue