adding uci_experiments to examples folder

Alejandro Moreo Fernandez 2023-03-23 15:46:03 +01:00
parent 4904475d26
commit 67906f6f2d
7 changed files with 183 additions and 20 deletions

View File

@@ -1,3 +1,6 @@
+ensembles seem to be broken; they have an internal model selection which takes the parameters, but since quapy now
+works with protocols it would need to know the validation set in order to pass something like
+"protocol: APP(val, etc.)"
 sample_size should not be mandatory when qp.environ['SAMPLE_SIZE'] has been specified
 clean all the cumbersome methods that have to be implemented for new quantifiers (e.g., n_classes_ prop, etc.)
 make truly parallel the GridSearchQ
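
For context, the protocol-based model selection that the first note refers to (and that Ensemble cannot currently be handed) looks roughly as follows; a minimal sketch against quapy's public API at this commit, not part of the commit itself:

    import quapy as qp
    from quapy.protocol import APP
    from quapy.method.aggregative import PACC
    from sklearn.linear_model import LogisticRegression

    qp.environ['SAMPLE_SIZE'] = 100  # APP samples need a size (cf. the second note above)
    train, val = qp.datasets.fetch_UCIDataset('yeast').training.split_stratified()
    model = qp.model_selection.GridSearchQ(
        PACC(LogisticRegression()),
        param_grid={'classifier__C': [0.1, 1, 10]},
        protocol=APP(val),  # this is the "protocol: APP(val, etc.)" an Ensemble would need to receive
        error='mae',
    ).fit(train).best_model()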

examples/uci_experiments.py (new file, 152 lines)
View File

@@ -0,0 +1,152 @@
from copy import deepcopy
import quapy as qp
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from quapy.classification.methods import LowRankLogisticRegression
from quapy.method.meta import QuaNet
from quapy.protocol import APP
from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE
from quapy.method.meta import EHDy
import numpy as np
import os
import pickle
import itertools
import argparse
import torch
import shutil


N_JOBS = -1
CUDA_N_JOBS = 2
ENSEMBLE_N_JOBS = -1

qp.environ['SAMPLE_SIZE'] = 100

def newLR():
    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)


def calibratedLR():
    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))


__C_range = np.logspace(-3, 3, 7)
lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
svmperf_params = {'classifier__C': __C_range}

def quantification_models():
    yield 'cc', CC(newLR()), lr_params
    yield 'acc', ACC(newLR()), lr_params
    yield 'pcc', PCC(newLR()), lr_params
    yield 'pacc', PACC(newLR()), lr_params
    yield 'MAX', MAX(newLR()), lr_params
    yield 'MS', MS(newLR()), lr_params
    yield 'MS2', MS2(newLR()), lr_params
    yield 'sldc', EMQ(newLR(), recalib='platt'), lr_params
    yield 'svmmae', newSVMAE(), svmperf_params
    yield 'hdy', HDy(newLR()), lr_params

def quantification_cuda_models():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Running QuaNet in {device}')
    learner = LowRankLogisticRegression()
    # note: relies on the module-level `args`, which is set in __main__ before this generator is consumed
    yield 'quanet', QuaNet(learner, checkpointdir=args.checkpointdir, device=device), lr_params

def evaluate_experiment(true_prevalences, estim_prevalences):
    print('\nEvaluation Metrics:\n' + '=' * 22)
    for eval_measure in [qp.error.mae, qp.error.mrae]:
        err = eval_measure(true_prevalences, estim_prevalences)
        print(f'\t{eval_measure.__name__}={err:.4f}')
    print()


def result_path(path, dataset_name, model_name, run, optim_loss):
    return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')


def is_already_computed(dataset_name, model_name, run, optim_loss):
    return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))


def save_results(dataset_name, model_name, run, optim_loss, *results):
    rpath = result_path(args.results, dataset_name, model_name, run, optim_loss)
    qp.util.create_parent_dir(rpath)
    with open(rpath, 'wb') as foo:
        pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)

def run(experiment):
    optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
    # some small/degenerate datasets are excluded from this example
    if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return

    collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)
    for run, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):
        if is_already_computed(dataset_name, model_name, run=run, optim_loss=optim_loss):
            print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5 already computed.')
            continue

        print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')

        # model selection (hyperparameter optimization for a quantification-oriented loss)
        train, test = data.train_test
        train, val = train.split_stratified()
        if hyperparams is not None:
            model_selection = qp.model_selection.GridSearchQ(
                deepcopy(model),
                param_grid=hyperparams,
                protocol=APP(val, n_prevalences=21, repeats=25),
                error=optim_loss,
                refit=True,
                timeout=60*60,
                verbose=True
            )
            model_selection.fit(data.training)
            model = model_selection.best_model()
            best_params = model_selection.best_params_
        else:
            model.fit(data.training)
            best_params = {}

        # model evaluation
        true_prevalences, estim_prevalences = qp.evaluation.prediction(
            model,
            protocol=APP(test, n_prevalences=21, repeats=100)
        )
        test_true_prevalence = data.test.prevalence()

        evaluate_experiment(true_prevalences, estim_prevalences)
        save_results(dataset_name, model_name, run, optim_loss,
                     true_prevalences, estim_prevalences,
                     data.training.prevalence(), test_true_prevalence,
                     best_params)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run experiments for UCI ML Quantification')
    parser.add_argument('results', metavar='RESULT_PATH', type=str,
                        help='path to the directory where to store the results')
    parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
                        help='path to the directory with svmperf')
    parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',
                        help='path to the directory where to dump QuaNet checkpoints')
    args = parser.parse_args()

    print(f'Result folder: {args.results}')
    np.random.seed(0)

    qp.environ['SVMPERF_HOME'] = args.svmperfpath

    optim_losses = ['mae']
    datasets = qp.datasets.UCI_DATASETS[:4]

    models = quantification_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)

    models = quantification_cuda_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS)

    shutil.rmtree(args.checkpointdir, ignore_errors=True)
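
The script is invoked with a results directory, e.g. python uci_experiments.py ./results --svmperfpath ../svm_perf_quantification. A sketch of how a stored result can be read back (the tuple order follows save_results above; the path is illustrative):

    import pickle

    # result_path(...) format: {dataset}-{model}-run{run}-{loss}.pkl (hypothetical dataset/model below)
    with open('./results/yeast-cc-run0-mae.pkl', 'rb') as f:
        true_prevs, estim_prevs, train_prev, test_prev, best_params = pickle.load(f)
    print(best_params)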

View File

@@ -19,17 +19,16 @@ class LowRankLogisticRegression(BaseEstimator):
     def __init__(self, n_components=100, **kwargs):
         self.n_components = n_components
-        self.learner = LogisticRegression(**kwargs)
+        self.classifier = LogisticRegression(**kwargs)

-    def get_params(self, deep=True):
+    def get_params(self):
         """
         Get hyper-parameters for this estimator.

-        :param deep: compatibility with sklearn
         :return: a dictionary with parameter names mapped to their values
         """
         params = {'n_components': self.n_components}
-        params.update(self.learner.get_params(deep))
+        params.update(self.classifier.get_params())
         return params

     def set_params(self, **params):
@@ -44,7 +43,7 @@ class LowRankLogisticRegression(BaseEstimator):
         if 'n_components' in params_:
             self.n_components = params_['n_components']
             del params_['n_components']
-        self.learner.set_params(**params_)
+        self.classifier.set_params(**params_)

     def fit(self, X, y):
         """
@@ -60,8 +59,8 @@ class LowRankLogisticRegression(BaseEstimator):
         if nF > self.n_components:
             self.pca = TruncatedSVD(self.n_components).fit(X)
         X = self.transform(X)
-        self.learner.fit(X, y)
-        self.classes_ = self.learner.classes_
+        self.classifier.fit(X, y)
+        self.classes_ = self.classifier.classes_
         return self

     def predict(self, X):
@@ -73,7 +72,7 @@ class LowRankLogisticRegression(BaseEstimator):
         instances in `X`
         """
         X = self.transform(X)
-        return self.learner.predict(X)
+        return self.classifier.predict(X)

     def predict_proba(self, X):
         """
@@ -83,7 +82,7 @@ class LowRankLogisticRegression(BaseEstimator):
         :return: array-like of shape `(n_samples, n_classes)` with the posterior probabilities
         """
         X = self.transform(X)
-        return self.learner.predict_proba(X)
+        return self.classifier.predict_proba(X)

     def transform(self, X):
         """

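In use, the renamed attribute keeps the sklearn-style parameter routing; a small sketch (not part of the commit):

    from quapy.classification.methods import LowRankLogisticRegression

    lrlr = LowRankLogisticRegression(n_components=100)
    lrlr.set_params(n_components=50, C=10)  # 'n_components' is kept; 'C' is routed to the inner LogisticRegression
    print(lrlr.get_params()['n_components'])  # -> 50; note get_params() no longer takes a `deep` argument
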
View File

@@ -207,7 +207,7 @@ def fetch_UCIDataset(dataset_name, data_home=None, test_split=0.3, verbose=False
     return Dataset(*data.split_stratified(1 - test_split, random_state=0))

-def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> Dataset:
+def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
     """
     Loads a UCI collection as an instance of :class:`quapy.data.base.LabelledCollection`, as used in
     `Pérez-Gállego, P., Quevedo, J. R., & del Coz, J. J. (2017).
@@ -223,7 +223,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
     >>> import quapy as qp
     >>> collection = qp.datasets.fetch_UCILabelledCollection("yeast")
     >>> for data in qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=2):
     >>>     ...

     The list of valid dataset names can be accessed in `quapy.data.datasets.UCI_DATASETS`
@@ -233,7 +233,7 @@ def fetch_UCILabelledCollection(dataset_name, data_home=None, verbose=False) ->
     ~/quay_data/ directory)
     :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
     :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
-    :return: a :class:`quapy.data.base.Dataset` instance
+    :return: a :class:`quapy.data.base.LabelledCollection` instance
     """
     assert dataset_name in UCI_DATASETS, \

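A quick check of the corrected signature (a sketch, not part of the commit):

    import quapy as qp

    collection = qp.datasets.fetch_UCILabelledCollection('yeast')
    print(type(collection).__name__)  # LabelledCollection, as the fixed annotation now states
    print(collection.prevalence())    # class prevalences of the full collection
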
View File

@@ -444,24 +444,28 @@ class EMQ(AggregativeProbabilisticQuantifier):
     def __init__(self, classifier: BaseEstimator, exact_train_prev=True, recalib=None):
         self.classifier = classifier
+        self.non_calibrated = classifier
         self.exact_train_prev = exact_train_prev
         self.recalib = recalib

     def fit(self, data: LabelledCollection, fit_classifier=True):
         if self.recalib is not None:
             if self.recalib == 'nbvs':
-                self.classifier = NBVSCalibration(self.classifier)
+                self.classifier = NBVSCalibration(self.non_calibrated)
             elif self.recalib == 'bcts':
-                self.classifier = BCTSCalibration(self.classifier)
+                self.classifier = BCTSCalibration(self.non_calibrated)
             elif self.recalib == 'ts':
-                self.classifier = TSCalibration(self.classifier)
+                self.classifier = TSCalibration(self.non_calibrated)
             elif self.recalib == 'vs':
-                self.classifier = VSCalibration(self.classifier)
+                self.classifier = VSCalibration(self.non_calibrated)
+            elif self.recalib == 'platt':
+                self.classifier = CalibratedClassifierCV(self.classifier, ensemble=False)
             else:
                 raise ValueError('invalid param argument for recalibration method; available ones are '
                                  '"nbvs", "bcts", "ts", and "vs".')
+            self.recalib = None
+        else:
+            self.classifier = self.non_calibrated
         self.classifier, _ = _training_helper(self.classifier, data, fit_classifier, ensure_probabilistic=True)
         if self.exact_train_prev:
             self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)

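The new option in use; a minimal sketch (training data assumed to be a quapy LabelledCollection):

    from sklearn.linear_model import LogisticRegression
    from quapy.method.aggregative import EMQ

    emq = EMQ(LogisticRegression(), recalib='platt')
    # fit() now wraps the raw classifier as CalibratedClassifierCV(..., ensemble=False);
    # the new non_calibrated attribute preserves the raw classifier so fit() never wraps an already-wrapped one
    # emq.fit(train)  # train: a LabelledCollection
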
View File

@@ -9,6 +9,7 @@ from torch.nn.functional import relu
 from quapy.protocol import UPP
 from quapy.method.aggregative import *
 from quapy.util import EarlyStop
+from tqdm import tqdm

 class QuaNetTrainer(BaseQuantifier):
@@ -28,7 +29,7 @@ class QuaNetTrainer(BaseQuantifier):
     >>>
     >>> # load the kindle dataset as text, and convert words to numerical indexes
     >>> dataset = qp.datasets.fetch_reviews('kindle', pickle=True)
     >>> qp.data.preprocessing.index(dataset, min_df=5, inplace=True)
     >>>
     >>> # the text classifier is a CNN trained by NeuralClassifierTrainer
     >>> cnn = CNNnet(dataset.vocabulary_size, dataset.n_classes)
@@ -263,15 +264,19 @@ class QuaNetTrainer(BaseQuantifier):
                        f'patience={early_stop.patience}/{early_stop.PATIENCE_LIMIT}')

     def get_params(self, deep=True):
-        return {**self.classifier.get_params(), **self.quanet_params}
+        classifier_params = self.classifier.get_params()
+        classifier_params = {'classifier__'+k:v for k,v in classifier_params.items()}
+        return {**classifier_params, **self.quanet_params}

     def set_params(self, **parameters):
         learner_params = {}
         for key, val in parameters.items():
             if key in self.quanet_params:
                 self.quanet_params[key] = val
+            elif key.startswith('classifier__'):
+                learner_params[key.replace('classifier__', '')] = val
             else:
-                learner_params[key] = val
+                raise ValueError('unknown parameter ', key)
         self.classifier.set_params(**learner_params)

     def __check_params_colision(self, quanet_params, learner_params):

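With the new 'classifier__' prefix convention, classifier and QuaNet hyperparameters can no longer collide silently; a sketch with hypothetical parameter names:

    # quanet: a QuaNetTrainer instance
    quanet.set_params(lstm_hidden_size=128)  # a QuaNet parameter, consumed directly
    quanet.set_params(classifier__lr=1e-4)   # prefix stripped, then routed to the underlying classifier
    # quanet.set_params(foo=1)               # now raises ValueError('unknown parameter ', 'foo')
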
View File

@@ -56,7 +56,7 @@ class GridSearchQ(BaseQuantifier):
     def _sout(self, msg):
         if self.verbose:
-            print(f'[{self.__class__.__name__}]: {msg}')
+            print(f'[{self.__class__.__name__}:{self.model.__class__.__name__}]: {msg}')

     def __check_error(self, error):
         if error in qp.error.QUANTIFICATION_ERROR:
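
With this change, verbose traces identify the model under selection: a line that previously rendered as [GridSearchQ]: starting... now renders as, e.g., [GridSearchQ:PACC]: starting... (the model name shown here is illustrative).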