QuaPy/examples/uci_experiments.py

from copy import deepcopy

import quapy as qp
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression
from classification.methods import LowRankLogisticRegression
from quapy.method.meta import QuaNet
from quapy.protocol import APP
from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE
from quapy.method.meta import EHDy
import numpy as np
import os
import pickle
import itertools
import argparse
import torch
import shutil


N_JOBS = -1
CUDA_N_JOBS = 2
ENSEMBLE_N_JOBS = -1

qp.environ['SAMPLE_SIZE'] = 100


def newLR():
    return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)


def calibratedLR():
    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))


__C_range = np.logspace(-3, 3, 7)
lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
svmperf_params = {'classifier__C': __C_range}


def quantification_models():
    yield 'cc', CC(newLR()), lr_params
    yield 'acc', ACC(newLR()), lr_params
    yield 'pcc', PCC(newLR()), lr_params
    yield 'pacc', PACC(newLR()), lr_params
    yield 'MAX', MAX(newLR()), lr_params
    yield 'MS', MS(newLR()), lr_params
    yield 'MS2', MS2(newLR()), lr_params
    yield 'sldc', EMQ(newLR(), recalib='platt'), lr_params
    yield 'svmmae', newSVMAE(), svmperf_params
    yield 'hdy', HDy(newLR()), lr_params


def quantification_cuda_models():
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f'Running QuaNet in {device}')
    learner = LowRankLogisticRegression()
    yield 'quanet', QuaNet(learner, checkpointdir=args.checkpointdir, device=device), lr_params


def evaluate_experiment(true_prevalences, estim_prevalences):
    print('\nEvaluation Metrics:\n' + '=' * 22)
    for eval_measure in [qp.error.mae, qp.error.mrae]:
        err = eval_measure(true_prevalences, estim_prevalences)
        print(f'\t{eval_measure.__name__}={err:.4f}')
    print()


def result_path(path, dataset_name, model_name, run, optim_loss):
    return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')


def is_already_computed(dataset_name, model_name, run, optim_loss):
    return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))


def save_results(dataset_name, model_name, run, optim_loss, *results):
    rpath = result_path(args.results, dataset_name, model_name, run, optim_loss)
    qp.util.create_parent_dir(rpath)
    with open(rpath, 'wb') as foo:
        pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)


def run(experiment):
    optim_loss, dataset_name, (model_name, model, hyperparams) = experiment
    if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return

    collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)
    for run, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):
        if is_already_computed(dataset_name, model_name, run=run, optim_loss=optim_loss):
            print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5 already computed.')
            continue

        print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')
        # model selection (hyperparameter optimization for a quantification-oriented loss)
        train, test = data.train_test
        train, val = train.split_stratified()
        if hyperparams is not None:
            model_selection = qp.model_selection.GridSearchQ(
                deepcopy(model),
                param_grid=hyperparams,
                protocol=APP(val, n_prevalences=21, repeats=25),
                error=optim_loss,
                refit=True,
                timeout=60*60,
                verbose=True
            )
            model_selection.fit(data.training)
            model = model_selection.best_model()
            best_params = model_selection.best_params_
        else:
            model.fit(data.training)
            best_params = {}

        # model evaluation
        true_prevalences, estim_prevalences = qp.evaluation.prediction(
            model,
            protocol=APP(test, n_prevalences=21, repeats=100)
        )
        test_true_prevalence = data.test.prevalence()

        evaluate_experiment(true_prevalences, estim_prevalences)
        save_results(dataset_name, model_name, run, optim_loss,
                     true_prevalences, estim_prevalences,
                     data.training.prevalence(), test_true_prevalence,
                     best_params)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')
    parser.add_argument('results', metavar='RESULT_PATH', type=str,
                        help='path to the directory where to store the results')
    parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',
                        help='path to the directory with svmperf')
    parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',
                        help='path to the directory where to dump QuaNet checkpoints')
    args = parser.parse_args()

    print(f'Result folder: {args.results}')
    np.random.seed(0)

    qp.environ['SVMPERF_HOME'] = args.svmperfpath

    optim_losses = ['mae']
    datasets = qp.datasets.UCI_DATASETS[:4]

    models = quantification_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)

    models = quantification_cuda_models()
    qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS)

    shutil.rmtree(args.checkpointdir, ignore_errors=True)
adding uci experiments to the examples folder 2023-03-23 15:40:27 +01:00			`from copy import deepcopy`

			`import quapy as qp`
			`from sklearn.calibration import CalibratedClassifierCV`
			`from sklearn.linear_model import LogisticRegression`
			`from classification.methods import LowRankLogisticRegression`
			`from quapy.method.meta import QuaNet`
			`from quapy.protocol import APP`
			`from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, HDy, newSVMAE`
			`from quapy.method.meta import EHDy`
			`import numpy as np`
			`import os`
			`import pickle`
			`import itertools`
			`import argparse`
			`import torch`
			`import shutil`


			`N_JOBS = -1`
			`CUDA_N_JOBS = 2`
			`ENSEMBLE_N_JOBS = -1`

			`qp.environ['SAMPLE_SIZE'] = 100`


			`def newLR():`
			`return LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1)`


			`def calibratedLR():`
			`return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))`


			`__C_range = np.logspace(-3, 3, 7)`
			`lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}`
			`svmperf_params = {'classifier__C': __C_range}`


			`def quantification_models():`
			`yield 'cc', CC(newLR()), lr_params`
			`yield 'acc', ACC(newLR()), lr_params`
			`yield 'pcc', PCC(newLR()), lr_params`
			`yield 'pacc', PACC(newLR()), lr_params`
			`yield 'MAX', MAX(newLR()), lr_params`
			`yield 'MS', MS(newLR()), lr_params`
			`yield 'MS2', MS2(newLR()), lr_params`
			`yield 'sldc', EMQ(newLR(), recalib='platt'), lr_params`
			`yield 'svmmae', newSVMAE(), svmperf_params`
			`yield 'hdy', HDy(newLR()), lr_params`


			`def quantification_cuda_models():`
			`device = 'cuda' if torch.cuda.is_available() else 'cpu'`
			`print(f'Running QuaNet in {device}')`
			`learner = LowRankLogisticRegression()`
			`yield 'quanet', QuaNet(learner, checkpointdir=args.checkpointdir, device=device), lr_params`


			`def evaluate_experiment(true_prevalences, estim_prevalences):`
			`print('\nEvaluation Metrics:\n' + '=' * 22)`
			`for eval_measure in [qp.error.mae, qp.error.mrae]:`
			`err = eval_measure(true_prevalences, estim_prevalences)`
			`print(f'\t{eval_measure.__name__}={err:.4f}')`
			`print()`


			`def result_path(path, dataset_name, model_name, run, optim_loss):`
			`return os.path.join(path, f'{dataset_name}-{model_name}-run{run}-{optim_loss}.pkl')`


			`def is_already_computed(dataset_name, model_name, run, optim_loss):`
			`return os.path.exists(result_path(args.results, dataset_name, model_name, run, optim_loss))`


			`def save_results(dataset_name, model_name, run, optim_loss, *results):`
			`rpath = result_path(args.results, dataset_name, model_name, run, optim_loss)`
			`qp.util.create_parent_dir(rpath)`
			`with open(rpath, 'wb') as foo:`
			`pickle.dump(tuple(results), foo, pickle.HIGHEST_PROTOCOL)`


			`def run(experiment):`
			`optim_loss, dataset_name, (model_name, model, hyperparams) = experiment`
			`if dataset_name in ['acute.a', 'acute.b', 'iris.1']: return`

			`collection = qp.datasets.fetch_UCILabelledCollection(dataset_name)`
			`for run, data in enumerate(qp.data.Dataset.kFCV(collection, nfolds=5, nrepeats=1)):`
			`if is_already_computed(dataset_name, model_name, run=run, optim_loss=optim_loss):`
			`print(f'result for dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5 already computed.')`
			`continue`

			`print(f'running dataset={dataset_name} model={model_name} loss={optim_loss} run={run+1}/5')`
			`# model selection (hyperparameter optimization for a quantification-oriented loss)`
			`train, test = data.train_test`
			`train, val = train.split_stratified()`
			`if hyperparams is not None:`
			`model_selection = qp.model_selection.GridSearchQ(`
			`deepcopy(model),`
			`param_grid=hyperparams,`
			`protocol=APP(val, n_prevalences=21, repeats=25),`
			`error=optim_loss,`
			`refit=True,`
			`timeout=60*60,`
			`verbose=True`
			`)`
			`model_selection.fit(data.training)`
			`model = model_selection.best_model()`
			`best_params = model_selection.best_params_`
			`else:`
			`model.fit(data.training)`
			`best_params = {}`

			`# model evaluation`
			`true_prevalences, estim_prevalences = qp.evaluation.prediction(`
			`model,`
			`protocol=APP(test, n_prevalences=21, repeats=100)`
			`)`
			`test_true_prevalence = data.test.prevalence()`

			`evaluate_experiment(true_prevalences, estim_prevalences)`
			`save_results(dataset_name, model_name, run, optim_loss,`
			`true_prevalences, estim_prevalences,`
			`data.training.prevalence(), test_true_prevalence,`
			`best_params)`


			`if __name__ == '__main__':`
			`parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')`
			`parser.add_argument('results', metavar='RESULT_PATH', type=str,`
			`help='path to the directory where to store the results')`
			`parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='../svm_perf_quantification',`
			`help='path to the directory with svmperf')`
			`parser.add_argument('--checkpointdir', metavar='PATH', type=str, default='./checkpoint',`
			`help='path to the directory where to dump QuaNet checkpoints')`
			`args = parser.parse_args()`

			`print(f'Result folder: {args.results}')`
			`np.random.seed(0)`

			`qp.environ['SVMPERF_HOME'] = args.svmperfpath`

			`optim_losses = ['mae']`
			`datasets = qp.datasets.UCI_DATASETS[:4]`

			`models = quantification_models()`
			`qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=N_JOBS)`

			`models = quantification_cuda_models()`
			`qp.util.parallel(run, itertools.product(optim_losses, datasets, models), n_jobs=CUDA_N_JOBS)`

			`shutil.rmtree(args.checkpointdir, ignore_errors=True)`