
initial experiments and DIR method

Alejandro Moreo Fernandez 2023-07-21 11:41:16 +02:00
parent 26de9d92eb
commit d995990fba
7 changed files with 322 additions and 89 deletions

.gitignore

@@ -27,6 +27,40 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+.idea
+.vscode
+LeQua2022
+MultiLabel/results_generales
+MultiLabel/mlqtables
+NewMethods/plots*
+NewMethods/results*
+NewMethods/tables*
+NewMethods/latex*
+Ordinal/data*
+Ordinal/roberta*
+Ordinal/tables*
+Ordinal/results*
+eDiscovery/plots*
+eDiscovery/results*
+examples/results*
+poster-cikm*
+slides-cikm*
+slides-short-cikm*
+quick_experiment/figures*
+quick_experiment/figures*
+svm_perf_quantification/*
+TweetSentQuant/plots*
+TweetSentQuant/results*
+TweetSentQuant/tables*
+TweetSentQuant/Tweet Sentiment Quantification_NPP
+TweetSentQuant/checkpoint
+TweetSentQuant/*.tex
+checkpoint
+*.png
+*.zip
+*.pkl
+*.pickle
+*.pdf
 
 # PyInstaller
 # Usually these files are written by a python script from a template

(experiment script on LeQua T1B)

@@ -1,3 +1,4 @@
+import pickle
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import os
@@ -5,52 +6,91 @@ import sys
 import pandas as pd
 import quapy as qp
-from quapy.method.aggregative import DistributionMatching
+from quapy.method.aggregative import EMQ, DistributionMatching, PACC, HDy, OneVsAllAggregative
 from method_kdey import KDEy
+from method_dirichlety import DIRy
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 
 if __name__ == '__main__':
 
     qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
     qp.environ['N_JOBS'] = -1
-    method = 'KDE'
-    param = 0.1
-    div = 'topsoe'
-    method_identifier = f'{method}_modsel_{div}'
-
-    os.makedirs('results', exist_ok=True)
-    result_path = f'results_LequaT2B/{method_identifier}.csv'
-
-    #if os.path.exists(result_path):
-    #    print('Result already exists. Nothing to do')
-    #    sys.exit(0)
-
-    with open(result_path, 'wt') as csv:
-        csv.write(f'Method\tDataset\tMAE\tMRAE\n')
-
-        dataset = 'T1B'
-        train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
-
-        if method == 'KDE':
-            param_grid = {'bandwidth': np.linspace(0.001, 0.1, 11)}
-            model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn')
-        else:
-            raise NotImplementedError('unknown method')
-
-        modsel = GridSearchQ(model, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1)
-        modsel.fit(train)
-        print(f'best params {modsel.best_params_}')
-
-        quantifier = modsel.best_model()
-
-        report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae'], verbose=True)
-        means = report.mean()
-        csv.write(f'{method}\tLeQua-{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
-        csv.flush()
-
-    df = pd.read_csv(result_path, sep='\t')
+    result_dir = f'results_lequa'
+    optim = 'mae'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    hyper_LR = {
+        'classifier__C': np.logspace(-3,3,7),
+        'classifier__class_weight': ['balanced', None]
+    }
+
+    for method in ['PACC', 'SLD', 'DM', 'KDE', 'HDy', 'DIR']:
+
+        #if os.path.exists(result_path):
+        #    print('Result already exists. Nothing to do')
+        #    sys.exit(0)
+
+        dataset = 'T1B'
+        train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)
+
+        result_path = f'{result_dir}/{method}'
+        if os.path.exists(result_path+'.dataframe'):
+            print(f'result file {result_path} already exists; skipping')
+            continue
+
+        with open(result_path+'.csv', 'at') as csv:
+            csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
+
+            print('init', dataset)
+
+            if method == 'KDE':
+                param_grid = {
+                    'bandwidth': np.linspace(0.001, 0.2, 21),
+                    'classifier__C': np.logspace(-4,4,9),
+                    'classifier__class_weight': ['balanced', None]
+                }
+                quantifier = KDEy(LogisticRegression(), target='max_likelihood')
+            elif method == 'DIR':
+                param_grid = hyper_LR
+                quantifier = DIRy(LogisticRegression())
+            elif method == 'SLD':
+                param_grid = hyper_LR
+                quantifier = EMQ(LogisticRegression())
+            elif method == 'PACC':
+                param_grid = hyper_LR
+                quantifier = PACC(LogisticRegression())
+            elif method == 'HDy-OvA':
+                param_grid = {
+                    'binary_quantifier__classifier__C': np.logspace(-4,4,9),
+                    'binary_quantifier__classifier__class_weight': ['balanced', None]
+                }
+                quantifier = OneVsAllAggregative(HDy(LogisticRegression()))
+            elif method == 'DM':
+                param_grid = {
+                    'nbins': [5,10,15],
+                    'classifier__C': np.logspace(-4,4,9),
+                    'classifier__class_weight': ['balanced', None]
+                }
+                quantifier = DistributionMatching(LogisticRegression())
+            else:
+                raise NotImplementedError('unknown method', method)
+
+            modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim)
+            modsel.fit(train)
+            print(f'best params {modsel.best_params_}')
+            pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+
+            quantifier = modsel.best_model()
+            report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
+            means = report.mean()
+            report.to_csv(result_path+'.dataframe')
+            csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+            csv.flush()
+
+    df = pd.read_csv(result_path+'.csv', sep='\t')
 
     pd.set_option('display.max_columns', None)
     pd.set_option('display.max_rows', None)
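Side note on the artifacts this script writes: the per-method '.dataframe' files and '.hyper.pkl' pickles are exactly what the TODO file at the end of this commit asks for (statistical tests over the result dataframes). A minimal loading sketch, not part of the commit; it assumes the result_dir layout used above and that evaluation_report names its columns after the error metrics:

    import pickle
    import pandas as pd

    method, dataset = 'KDE', 'T1B'
    result_dir = 'results_lequa'

    # per-bag evaluation results, as written by report.to_csv(result_path + '.dataframe')
    report = pd.read_csv(f'{result_dir}/{method}.dataframe')
    print(report[['mae', 'mrae', 'kld']].mean())

    # hyperparameters chosen by GridSearchQ, as written by pickle.dump(...)
    with open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'rb') as f:
        print(pickle.load(f))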

(experiment script on the Twitter sentiment datasets)

@@ -1,3 +1,4 @@
+import pickle
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import os
@@ -5,8 +6,9 @@ import sys
 import pandas as pd
 import quapy as qp
-from quapy.method.aggregative import DistributionMatching
+from quapy.method.aggregative import EMQ, DistributionMatching, PACC, HDy, OneVsAllAggregative
 from method_kdey import KDEy
+from method_dirichlety import DIRy
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
@@ -15,50 +17,103 @@ if __name__ == '__main__':
 
     qp.environ['SAMPLE_SIZE'] = 100
     qp.environ['N_JOBS'] = -1
-    method = 'KDE'
-    param = 0.1
-    target = 'max_likelihood'
-    div = 'topsoe'
-    method_identifier = f'{method}_modsel_{div if target=="min_divergence" else target}'
-
-    os.makedirs('results', exist_ok=True)
-    result_path = f'results/{method_identifier}.csv'
-
-    #if os.path.exists(result_path):
-    #    print('Result already exists. Nothing to do')
-    #    sys.exit(0)
-
-    with open(result_path, 'wt') as csv:
-        csv.write(f'Method\tDataset\tMAE\tMRAE\n')
-
-        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
-            print('init', dataset)
-
-            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
-
-            if method == 'KDE':
-                param_grid = {'bandwidth': np.linspace(0.001, 0.2, 21)}
-                model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn', target=target)
-            else:
-                raise NotImplementedError('unknown method')
-
-            protocol = UPP(data.test, repeats=100)
-            modsel = GridSearchQ(model, param_grid, protocol, refit=False, n_jobs=-1, verbose=1)
-            modsel.fit(data.training)
-            print(f'best params {modsel.best_params_}')
-
-            quantifier = modsel.best_model()
-
-            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
-            quantifier.fit(data.training)
-            protocol = UPP(data.test, repeats=100)
-            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)
-            means = report.mean()
-            csv.write(f'{method_identifier}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
-            csv.flush()
-
-    df = pd.read_csv(result_path, sep='\t')
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = f'results_tweet_{n_bags_test}'
+    optim = 'mae'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    hyper_LR = {
+        'classifier__C': np.logspace(-4,4,9),
+        'classifier__class_weight': ['balanced', None]
+    }
+
+    for method in ['PACC', 'SLD', 'DM', 'KDE', 'HDy', 'DIR']:
+
+        #if os.path.exists(result_path):
+        #    print('Result already exists. Nothing to do')
+        #    sys.exit(0)
+
+        result_path = f'{result_dir}/{method}'
+        if os.path.exists(result_path+'.dataframe'):
+            print(f'result file {result_path} already exists; skipping')
+            continue
+
+        with open(result_path+'.csv', 'at') as csv:
+            csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n')
+
+            # the four semeval datasets share the same training set, so it is useless to optimize
+            # hyperparameters four times; this variable keeps track of whether model selection has
+            # already been done, so it can be skipped for the remaining semeval datasets
+            semeval_trained = False
+
+            for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
+                print('init', dataset)
+
+                is_semeval = dataset.startswith('semeval')
+
+                if not is_semeval or not semeval_trained:
+
+                    if method == 'KDE':
+                        param_grid = {
+                            'bandwidth': np.linspace(0.001, 0.2, 21),
+                            'classifier__C': np.logspace(-4,4,9),
+                            'classifier__class_weight': ['balanced', None]
+                        }
+                        quantifier = KDEy(LogisticRegression(), target='max_likelihood')
+                    elif method == 'DIR':
+                        param_grid = hyper_LR
+                        quantifier = DIRy(LogisticRegression())
+                    elif method == 'SLD':
+                        param_grid = hyper_LR
+                        quantifier = EMQ(LogisticRegression())
+                    elif method == 'PACC':
+                        param_grid = hyper_LR
+                        quantifier = PACC(LogisticRegression())
+                    elif method == 'HDy-OvA':
+                        param_grid = {
+                            'binary_quantifier__classifier__C': np.logspace(-4,4,9),
+                            'binary_quantifier__classifier__class_weight': ['balanced', None]
+                        }
+                        quantifier = OneVsAllAggregative(HDy(LogisticRegression()))
+                    elif method == 'DM':
+                        param_grid = {
+                            'nbins': [5,10,15],
+                            'classifier__C': np.logspace(-4,4,9),
+                            'classifier__class_weight': ['balanced', None]
+                        }
+                        quantifier = DistributionMatching(LogisticRegression())
+                    else:
+                        raise NotImplementedError('unknown method', method)
+
+                    # model selection
+                    data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)
+                    protocol = UPP(data.test, repeats=n_bags_val)
+                    modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim)
+                    modsel.fit(data.training)
+                    print(f'best params {modsel.best_params_}')
+                    pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL)
+
+                    quantifier = modsel.best_model()
+
+                    if is_semeval:
+                        semeval_trained = True
+
+                else:
+                    print(f'model selection for {dataset} already done; skipping')
+
+                data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
+                quantifier.fit(data.training)
+                protocol = UPP(data.test, repeats=n_bags_test)
+                report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True)
+                report.to_csv(result_path+'.dataframe')
+                means = report.mean()
+                csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
+                csv.flush()
+
+    df = pd.read_csv(result_path+'.csv', sep='\t')
 
     pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
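The semeval handling above exploits the fact that the semeval13/14/15 variants share one training set (only the test sets differ), so model selection is run once and reused. A condensed, runnable sketch of that control flow (dataset names hard-coded for illustration):

    semeval_trained = False
    for dataset in ['semeval13', 'semeval14', 'semeval15', 'gasp', 'hcr']:
        is_semeval = dataset.startswith('semeval')
        if not is_semeval or not semeval_trained:
            print(f'running model selection for {dataset}')
            if is_semeval:
                semeval_trained = True  # later semeval variants reuse the selected model
        else:
            print(f'model selection for {dataset} already done; skipping')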

method_dirichlety.py (new file)

@@ -0,0 +1,101 @@
+import os
+import sys
+from typing import Union, Callable
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.linear_model import LogisticRegression
+import pandas as pd
+from sklearn.model_selection import GridSearchCV
+from sklearn.neighbors import KernelDensity
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.protocol import APP, UPP
+from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
+    DistributionMatching, _get_divergence
+import scipy
+from scipy import optimize
+from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
+import dirichlet
+
+
+class DIRy(AggregativeProbabilisticQuantifier):
+
+    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'):
+        self.classifier = classifier
+        self.val_split = val_split
+        self.n_jobs = n_jobs
+        self.target = target
+
+    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
+        if val_split is None:
+            val_split = self.val_split
+
+        self.classifier, y, posteriors, _, _ = cross_generate_predictions(
+            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
+        )
+
+        self.val_parameters = [dirichlet.mle(posteriors[y == cat]) for cat in range(data.n_classes)]
+
+        return self
+
+    def val_pdf(self, prev):
+        """
+        Returns a function that computes the mixture model with the given prev as mixture factor
+        :param prev: a prevalence vector, ndarray
+        :return: a function implementing the validation distribution with fixed mixture factor
+        """
+        return lambda posteriors: sum(prev_i * dirichlet.pdf(parameters_i)(posteriors) for parameters_i, prev_i in zip(self.val_parameters, prev))
+
+    def aggregate(self, posteriors: np.ndarray):
+        if self.target == 'min_divergence':
+            raise NotImplementedError('not yet')
+            return self._target_divergence(posteriors)
+        elif self.target == 'max_likelihood':
+            return self._target_likelihood(posteriors)
+        else:
+            raise ValueError('unknown target')
+
+    def _target_divergence(self, posteriors):
+        test_density = self.get_kde(posteriors)
+        # val_test_posteriors = np.concatenate([self.val_posteriors, posteriors])
+        test_likelihood = self.pdf(test_density, posteriors)
+        divergence = _get_divergence(self.divergence)
+
+        n_classes = len(self.val_densities)
+
+        def match(prev):
+            val_pdf = self.val_pdf(prev)
+            val_likelihood = val_pdf(posteriors)
+            #for i,prev_i in enumerate(prev):
+            return divergence(val_likelihood, test_likelihood)
+
+        # the initial point is set as the uniform distribution
+        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+        # solutions are bounded to those contained in the unit-simplex
+        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+        return r.x
+
+    def _target_likelihood(self, posteriors, eps=0.000001):
+        n_classes = len(self.val_parameters)
+
+        def neg_loglikelihood(prev):
+            val_pdf = self.val_pdf(prev)
+            test_likelihood = val_pdf(posteriors)
+            test_loglikelihood = np.log(test_likelihood + eps)
+            return -np.sum(test_loglikelihood)
+
+        # the initial point is set as the uniform distribution
+        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+        # solutions are bounded to those contained in the unit-simplex
+        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+        r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+        return r.x
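A minimal usage sketch for the new DIRy quantifier (not part of the commit; it assumes the quapy dataset API used in the scripts above, and the PyPI 'dirichlet' package, which provides the dirichlet.mle and dirichlet.pdf functions imported here):

    import quapy as qp
    from sklearn.linear_model import LogisticRegression
    from method_dirichlety import DIRy

    data = qp.datasets.fetch_twitter('hcr', min_df=3, pickle=True)

    # fit() trains the classifier and fits, via MLE, one Dirichlet per class on held-out posteriors
    quantifier = DIRy(LogisticRegression(), val_split=0.4, target='max_likelihood')
    quantifier.fit(data.training)

    # quantify() returns the prevalence vector maximizing the (smoothed) test log-likelihood
    print(quantifier.quantify(data.test.instances))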

method_kdey.py

@@ -9,8 +9,8 @@ from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 import quapy as qp
-from data import LabelledCollection
-from protocol import APP, UPP
+from quapy.data import LabelledCollection
+from quapy.protocol import APP, UPP
 from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \
     DistributionMatching, _get_divergence
 import scipy
@@ -22,16 +22,6 @@ from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional
 # TODO: think of a MMD-y variant, i.e., a MMD variant that uses the points in the simplex and possibly any non-linear kernel
 
-class SklearnKDE:
-    def __init__(self):
-        pass
-
-    def fit(self):
-        pass
-
-    def likelihood(self):
-        pass
-
 
 class KDEy(AggregativeProbabilisticQuantifier):
@@ -163,8 +153,6 @@ class KDEy(AggregativeProbabilisticQuantifier):
             val_pdf = self.val_pdf(prev)
             val_likelihood = val_pdf(posteriors)
-            #for i,prev_i in enumerate(prev):
-
             return divergence(val_likelihood, test_likelihood)
 
         # the initial point is set as the uniform distribution
@@ -176,7 +164,7 @@ class KDEy(AggregativeProbabilisticQuantifier):
         r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
         return r.x
 
-    def _target_likelihood(self, posteriors):
+    def _target_likelihood(self, posteriors, eps=0.000001):
         """
         Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
         (the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
@@ -189,8 +177,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
         def neg_loglikelihood(prev):
             val_pdf = self.val_pdf(prev)
             test_likelihood = val_pdf(posteriors)
-            test_loglikelihood = np.log(test_likelihood)
-            return - np.sum(test_loglikelihood)
+            test_loglikelihood = np.log(test_likelihood + eps)
+            return -np.sum(test_loglikelihood)
+            #return -np.prod(test_likelihood)
 
         # the initial point is set as the uniform distribution
         uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
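The eps introduced in _target_likelihood guards against log(0): at the boundary of the simplex a candidate mixture can assign zero density to some test point, and the resulting -inf would break the SLSQP search. A tiny self-contained illustration with made-up likelihood values:

    import numpy as np

    eps = 0.000001
    test_likelihood = np.array([0.8, 0.0, 1.3])  # zeros can occur at the simplex boundary

    with np.errstate(divide='ignore'):
        print(np.log(test_likelihood))    # second entry is -inf
    print(np.log(test_likelihood + eps))  # all entries finite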

(results-gathering script)

@@ -2,7 +2,8 @@ import sys
 from pathlib import Path
 import pandas as pd
 
-result_dir = 'results'
+#result_dir = 'results_tweet_1000'
+result_dir = 'results_lequa'
 
 dfs = []
@@ -11,19 +12,27 @@ for path in pathlist:
     path_in_str = str(path)
     print(path_in_str)
-    df = pd.read_csv(path_in_str, sep='\t')
-    dfs.append(df)
+    try:
+        df = pd.read_csv(path_in_str, sep='\t')
+        if not df.empty:
+            dfs.append(df)
+    except Exception:
+        print('empty')
 
 df = pd.concat(dfs)
 
-piv = df.pivot_table(index='Dataset', columns='Method', values='MRAE')
-piv.loc['mean'] = piv.mean()
-
-pd.set_option('display.max_columns', None)
-pd.set_option('display.max_rows', None)
-pd.set_option('expand_frame_repr', False)
-print(piv)
+for err in ['MAE', 'MRAE']:
+    print('-'*100)
+    print(err)
+    print('-'*100)
+    piv = df.pivot_table(index='Dataset', columns='Method', values=err)
+    piv.loc['mean'] = piv.mean()
+
+    pd.set_option('display.max_columns', None)
+    pd.set_option('display.max_rows', None)
+    pd.set_option('expand_frame_repr', False)
+    print(piv)
+    print()
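The bare except Exception above presumably covers empty or half-written result files. A narrower variant (an assumption about the failure mode, not part of the commit) would catch pandas' dedicated error:

    import pandas as pd

    open('empty_example.csv', 'w').close()  # create an empty file for the demo

    dfs = []
    for path_in_str in ['empty_example.csv']:
        try:
            df = pd.read_csv(path_in_str, sep='\t')
            if not df.empty:
                dfs.append(df)
        except pd.errors.EmptyDataError:
            print(f'empty file: {path_in_str}')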

(TODO notes, translated from Spanish)

@@ -4,6 +4,11 @@ and the other one is a KDE on test), from which the divergence will then be computed (objective:
 generate a single distribution (the training mixture model) and take the likelihood of the test points as the objective
 to maximize.
 
 - keep the best hyperparameters, in order to inspect them
+- dump the result dataframes so that statistical tests can be run
+- make plots
+1) clarify: only test?
+2) implement the automatic mode
+- internal optimization for the likelihood [none of them seems to work well]
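For reference, the likelihood objective mentioned in the first note above (and implemented in _target_likelihood) can be written as follows, where $\alpha$ is the sought prevalence vector on the simplex, $p_i$ is the validation density fitted for class $i$, $s(x)$ is the posterior vector of test point $x$, and $\epsilon$ is the smoothing constant:

    \hat{\alpha} = \operatorname*{arg\,min}_{\alpha \in \Delta^{n-1}} \; -\sum_{x \in D_{\mathrm{test}}} \log\Big( \sum_{i=1}^{n} \alpha_i \, p_i(s(x)) + \epsilon \Big)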