
testing new models

Alejandro Moreo Fernandez 2021-11-17 17:07:13 +01:00
parent 27124d0d00
commit 06d36a132d
7 changed files with 77 additions and 54 deletions

View File

@@ -1,3 +1,4 @@
+from sklearn import clone
 from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 import numpy as np
 from sklearn.model_selection import GridSearchCV
@@ -27,12 +28,14 @@ Possible extensions:
 - better investigate the "iterative" nature of the method.
 - better investigate the implications with other learners. E.g., using EMQ as a prompt, or using EMQ in the second
   stage (test).
+- test with SVM
 """

 class ClassWeightPCC(BaseQuantifier):

-    def __init__(self, **pcc_param_grid):
-        self.learner = PACC(LogisticRegression())
+    def __init__(self, estimator=LogisticRegression, **pcc_param_grid):
+        self.estimator = estimator
+        self.learner = PACC(self.estimator())
         if 'class_weight' in pcc_param_grid:
             raise ValueError('parameter "class_weight" cannot be included in "pcc_param_grid"')
         self.pcc_param_grid = dict(pcc_param_grid)
@@ -43,9 +46,6 @@ class ClassWeightPCC(BaseQuantifier):
         self.learner.fit(self.train)
         return self

-    def deploy(self, deployed=True):
-        self.deployed = deployed
-
     def quantify(self, instances):
         guessed_prevalence = self.learner.quantify(instances)
         class_weight = self._get_class_weight(guessed_prevalence)
@@ -55,11 +55,18 @@ class ClassWeightPCC(BaseQuantifier):
             for classification (and not for quantification)"""
             # pcc = PCC(GridSearchCV(LogisticRegression(class_weight=class_weight), param_grid=self.pcc_param_grid, n_jobs=-1))
             pcc = PCC(LogisticRegressionCV(Cs=self.pcc_param_grid['C'], class_weight=class_weight, n_jobs=-1, cv=3))
+            raise ValueError('this cannot work...')
         else:
             """If the param grid has not been specified, we take the best parameters found for the base quantifier"""
             base_parameters = dict(self.learner.get_params())
-            base_parameters['class_weight'] = class_weight  # override the class_weight parameter
-            pcc = PCC(LogisticRegression(**base_parameters))
+            for p, v in self.learner.get_params().items():
+                # search by parameter name so that quantifiers wrapping the learner in a CalibratedClassifierCV also work
+                if 'class_weight' in p:
+                    base_parameters[p] = class_weight
+                    break
+            base_estimator = clone(self.learner.learner)
+            base_estimator.set_params(**base_parameters)
+            pcc = PCC(base_estimator)
         return pcc.fit(self.train).quantify(instances)

     def _get_class_weight(self, prevalence):
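Editor's note: _get_class_weight turns the guessed test prevalence into the class_weight dict consumed above; its body falls outside this hunk. Purely as a hypothetical illustration (not necessarily what the method actually computes), an inverse-prevalence weighting could look like:

    import numpy as np

    def inverse_prevalence_weights(prevalence, eps=1e-4):
        # hypothetical: weight each class inversely to its guessed prevalence;
        # scaled so the two weights sum to 2 (equal prevalences give both classes weight 1)
        p = np.asarray([1 - prevalence, prevalence])
        w = 1. / np.clip(p, eps, None)
        w = w * len(p) / w.sum()
        return {0: w[0], 1: w[1]}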
@@ -74,6 +81,8 @@ class ClassWeightPCC(BaseQuantifier):
         return {0:weights[0], 1:weights[1]}

     def set_params(self, **parameters):
+        # parameters = {p:v for p,v in parameters.items()}
+        # print(parameters)
         self.learner.set_params(**parameters)

     def get_params(self, deep=True):
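Editor's note: taken together, ClassWeightPCC guesses the test prevalence with a first-stage PACC, converts it into a class_weight, and requantifies with a PCC built on a reweighted clone of the learner. A minimal usage sketch follows (the dataset and the fit signature are assumptions; note that the estimator is passed as a class, not an instance, since __init__ instantiates it via self.estimator(), as in the ClassWeightPCC(LinearSVC) line further below):

    import quapy as qp
    from sklearn.svm import LinearSVC
    from class_weight_model import ClassWeightPCC

    data = qp.datasets.fetch_UCIDataset('yeast')  # any binary dataset would do

    model = ClassWeightPCC()             # defaults to LogisticRegression
    # model = ClassWeightPCC(LinearSVC)  # pass the estimator class, not an instance
    model.fit(data.training)
    estim_prevalence = model.quantify(data.test.instances)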

View File

@@ -39,6 +39,12 @@ def is_already_computed(result_dir, dataset_name, model_name, run, optim_loss):

 nice = {
+    'pacc.opt': 'PACC(LR)',
+    'pacc.opt.svm': 'PACC(SVM)',
+    'pcc.opt': 'PCC(LR)',
+    'pcc.opt.svm': 'PCC(SVM)',
+    'wpacc.opt': 'R-PCC(LR)',
+    'wpacc.opt.svm': 'R-PCC(SVM)',
     'mae':'AE',
     'ae':'AE',
     'svmkld': 'SVM(KLD)',
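Editor's note: the nice dictionary maps internal method identifiers (e.g., 'wpacc.opt') to display names for tables and plots; presumably it is consulted with a fallback, roughly as in this guess (not shown in the diff):

    def nicename(method):
        # fall back to the raw identifier if no pretty name is registered
        return nice.get(method, method)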

View File

@@ -8,6 +8,7 @@ from class_weight_model import ClassWeightPCC
 # from method.experimental import ExpMax, VarExpMax
 from common import *
 from method.meta import QuaNet
+from quantification_stumps_model import QuantificationStumpRegressor
 from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, SVMAE, HDy
 from quapy.method.meta import EHDy
 import numpy as np
@@ -35,12 +36,14 @@ svmperf_params = {'C': __C_range}
 def quantification_models():
     # yield 'cc', CC(newLR()), lr_params
     # yield 'acc', ACC(newLR()), lr_params
-    yield 'pcc', PCC(newLR()), None
-    yield 'pacc', PACC(newLR()), None
-    yield 'wpacc', ClassWeightPCC(), None
-    yield 'pcc.opt', PCC(newLR()), lr_params
-    yield 'pacc.opt', PACC(newLR()), lr_params
-    yield 'wpacc.opt', ClassWeightPCC(), lr_params
+    # yield 'pcc', PCC(newLR()), None
+    # yield 'pacc', PACC(newLR()), None
+    # yield 'wpacc', ClassWeightPCC(), None
+    # yield 'pcc.opt', PCC(newLR()), lr_params
+    # yield 'pacc.opt', PACC(newLR()), lr_params
+    # yield 'wpacc.opt', ClassWeightPCC(), lr_params
+    yield 'ds', QuantificationStumpRegressor(SAMPLE_SIZE), None
+    # yield 'ds.opt', QuantificationStumpRegressor(SAMPLE_SIZE), {'C': __C_range}
     # yield 'MAX', MAX(newLR()), lr_params
     # yield 'MS', MS(newLR()), lr_params
     # yield 'MS2', MS2(newLR()), lr_params
@@ -92,7 +95,8 @@ def quantification_ensembles():

 def run(experiment):
     optim_loss, dataset_name, (model_name, model, hyperparams) = experiment

+    if dataset_name == 'imdb':
+        return
+
     data = qp.datasets.fetch_reviews(dataset_name, tfidf=True, min_df=5)
     run=0
@@ -127,7 +131,7 @@ def run(experiment):
         test=data.test,
         sample_size=SAMPLE_SIZE,
         n_prevpoints=21,
-        n_repetitions=100,
+        n_repetitions=1,
         n_jobs=-1 if isinstance(model, qp.method.meta.Ensemble) else 1,
         verbose=True
     )
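Editor's note: for orientation, the (model_name, model, hyperparams) triples yielded by quantification_models() are presumably consumed by a driver roughly like the following sketch (not this repository's actual code; GridSearchQ is QuaPy's quantification-oriented model selection):

    from quapy.model_selection import GridSearchQ

    def fit_with_model_selection(model, hyperparams, training, optim_loss):
        # a None grid (as for 'ds' above) means the model is fit without model selection
        if hyperparams is not None:
            model = GridSearchQ(model, param_grid=hyperparams, sample_size=SAMPLE_SIZE,
                                error=optim_loss, refit=True)
        return model.fit(training)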

View File

@@ -1,8 +1,11 @@
+from sklearn.svm import LinearSVC
 from class_weight_model import ClassWeightPCC
 # from classification.methods import LowRankLogisticRegression
 # from method.experimental import ExpMax, VarExpMax
 from common import *
 from method.meta import QuaNet
+from quantification_stumps_model import QuantificationStumpRegressor
 from quapy.method.aggregative import CC, ACC, PCC, PACC, MAX, MS, MS2, EMQ, SVMAE, HDy
 from quapy.method.meta import EHDy
 import numpy as np
@@ -36,6 +39,10 @@ def quantification_models():
     yield 'pcc.opt', PCC(newLR()), lr_params
     yield 'pacc.opt', PACC(newLR()), lr_params
     yield 'wpacc.opt', ClassWeightPCC(), lr_params
+    yield 'ds.opt', QuantificationStumpRegressor(SAMPLE_SIZE), {'C': __C_range}
+    # yield 'pcc.opt.svm', PCC(LinearSVC()), lr_params
+    # yield 'pacc.opt.svm', PACC(LinearSVC()), lr_params
+    # yield 'wpacc.opt.svm', ClassWeightPCC(LinearSVC), lr_params
     # yield 'wpacc.opt2', ClassWeightPCC(C=__C_range), lr_params  # this cannot work in its current version (see notes in the class_weight_model.py file)
     # yield 'MAX', MAX(newLR()), lr_params
     # yield 'MS', MS(newLR()), lr_params
@@ -60,29 +67,29 @@ def quantification_models():
     # yield 'quanet', QuaNet(learner, SAMPLE_SIZE, checkpointdir=args.checkpointdir, device=device), lr_params


-def quantification_ensembles():
-    param_mod_sel = {
-        'sample_size': SAMPLE_SIZE,
-        'n_prevpoints': 21,
-        'n_repetitions': 5,
-        'refit': True,
-        'verbose': False
-    }
-    common = {
-        'size': 30,
-        'red_size': 15,
-        'max_sample_size': None,  # same as training set
-        'n_jobs': ENSEMBLE_N_JOBS,
-        'param_grid': lr_params,
-        'param_mod_sel': param_mod_sel,
-        'val_split': 0.4,
-        'min_pos': 5
-    }
-
-    # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
-    # will be skipped (by setting hyperparameters to None)
-    hyper_none = None
-    yield 'ehdymaeds', EHDy(newLR(), optim='mae', policy='ds', **common), hyper_none
+# def quantification_ensembles():
+#     param_mod_sel = {
+#         'sample_size': SAMPLE_SIZE,
+#         'n_prevpoints': 21,
+#         'n_repetitions': 5,
+#         'refit': True,
+#         'verbose': False
+#     }
+#     common = {
+#         'size': 30,
+#         'red_size': 15,
+#         'max_sample_size': None,  # same as training set
+#         'n_jobs': ENSEMBLE_N_JOBS,
+#         'param_grid': lr_params,
+#         'param_mod_sel': param_mod_sel,
+#         'val_split': 0.4,
+#         'min_pos': 5
+#     }
+#
+#     # hyperparameters will be evaluated within each quantifier of the ensemble, and so the typical model selection
+#     # will be skipped (by setting hyperparameters to None)
+#     hyper_none = None
+#     yield 'ehdymaeds', EHDy(newLR(), optim='mae', policy='ds', **common), hyper_none


 def run(experiment):
@@ -116,9 +123,6 @@ def run(experiment):
     model.fit(data.training)
     best_params = {}

-    if hasattr(model, "deploy"):
-        model.deploy()
-
     # model evaluation
     true_prevalences, estim_prevalences = qp.evaluation.artificial_prevalence_prediction(
         model,
@@ -130,9 +134,6 @@ def run(experiment):
     )
     test_true_prevalence = data.test.prevalence()

-    if hasattr(model, "deploy"):
-        model.deploy(False)
-
     evaluate_experiment(true_prevalences, estim_prevalences)
     save_results(args.results, dataset_name, model_name, run, optim_loss,
                  true_prevalences, estim_prevalences,
@@ -141,7 +142,7 @@ def run(experiment):

 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Run experiments for Tweeter Sentiment Quantification')
+    parser = argparse.ArgumentParser(description='Run experiments for UCI ML Quantification')
     parser.add_argument('results', metavar='RESULT_PATH', type=str,
                         help='path to the directory where to store the results')
     parser.add_argument('--svmperfpath', metavar='SVMPERF_PATH', type=str, default='./svm_perf_quantification',

View File

@@ -4,7 +4,8 @@ import pathlib
 import pickle
 from glob import glob
 import sys
-from uci_common import *
+from uci_experiments import *
+from uci_tables import METHODS
 from os.path import join
@@ -42,7 +43,7 @@ def plot_error_by_drift(methods, error_name, logscale=False, path=None):
         tr_prevs,
         n_bins=20,
         error_name=error_name,
-        show_std=False,
+        show_std=True,
         logscale=logscale,
         title=f'Quantification error as a function of distribution shift',
         savepath=path
@@ -54,7 +55,7 @@ def diagonal_plot(methods, error_name, path=None):
     if path is not None:
         path = join(path, f'diag_{error_name}')
     method_names, true_prevs, estim_prevs, tr_prevs = gather_results(methods, error_name)
-    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Positive', legend=True, show_std=False, savepath=f'{path}_pos.{plotext}')
+    qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, pos_class=1, title='Positive', legend=True, show_std=True, savepath=f'{path}_pos.{plotext}')


 def binary_bias_global(methods, error_name, path=None):
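Editor's note: a hedged sketch of how these plotting helpers are presumably driven (the error list and output directory are assumptions, not shown in this diff):

    for error_name in ['mae']:  # hypothetical error list
        diagonal_plot(METHODS, error_name, path='./plots')
        plot_error_by_drift(METHODS, error_name, logscale=True, path='./plots')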

View File

@@ -18,16 +18,18 @@ makedirs(tables_path, exist_ok=True)
 qp.environ['SAMPLE_SIZE'] = SAMPLE_SIZE

 METHODS = [#'cc', 'acc',
-           'pcc',
-           'pacc',
-           'wpacc',
+           # 'pcc',
+           # 'pacc',
+           # 'wpacc',
            'pcc.opt',
            'pacc.opt',
            'wpacc.opt',
-           'wpacc.opt2',
+           'ds.opt',
+           # 'pcc.opt.svm',
+           # 'pacc.opt.svm',
+           # 'wpacc.opt.svm',
+           # 'wpacc.opt2',
            # 'MAX', 'MS', 'MS2',
            'sldc',
            # 'svmmae',