distribution matching multiclass

Alejandro Moreo Fernandez 2023-07-20 09:03:22 +02:00
parent 4a7ffe5b50
commit 26de9d92eb
7 changed files with 335 additions and 67 deletions

laboratory/main_lequa.py Normal file

@@ -0,0 +1,58 @@
import numpy as np
from sklearn.linear_model import LogisticRegression
import os
import sys
import pandas as pd
import quapy as qp
from quapy.method.aggregative import DistributionMatching
from method_kdey import KDEy
from quapy.model_selection import GridSearchQ

if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
    qp.environ['N_JOBS'] = -1

    method = 'KDE'
    param = 0.1
    div = 'topsoe'
    method_identifier = f'{method}_modsel_{div}'

    os.makedirs('results_LequaT2B', exist_ok=True)
    result_path = f'results_LequaT2B/{method_identifier}.csv'

    #if os.path.exists(result_path):
    #    print('Result already exists. Nothing to do')
    #    sys.exit(0)

    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')

        dataset = 'T1B'
        train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)

        if method == 'KDE':
            param_grid = {'bandwidth': np.linspace(0.001, 0.1, 11)}
            model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn')
        else:
            raise NotImplementedError('unknown method')

        modsel = GridSearchQ(model, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1)
        modsel.fit(train)

        print(f'best params {modsel.best_params_}')

        quantifier = modsel.best_model()

        report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae'], verbose=True)
        means = report.mean()
        csv.write(f'{method}\tLeQua-{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
        csv.flush()

    df = pd.read_csv(result_path, sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)

laboratory/main_tweets.py Normal file

@@ -0,0 +1,66 @@
import numpy as np
from sklearn.linear_model import LogisticRegression
import os
import sys
import pandas as pd
import quapy as qp
from quapy.method.aggregative import DistributionMatching
from method_kdey import KDEy
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP

if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1

    method = 'KDE'
    param = 0.1
    target = 'max_likelihood'
    div = 'topsoe'
    method_identifier = f'{method}_modsel_{div if target=="min_divergence" else target}'

    os.makedirs('results', exist_ok=True)
    result_path = f'results/{method_identifier}.csv'

    #if os.path.exists(result_path):
    #    print('Result already exists. Nothing to do')
    #    sys.exit(0)

    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')

        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
            print('init', dataset)

            # model selection on the validation split
            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)

            if method == 'KDE':
                param_grid = {'bandwidth': np.linspace(0.001, 0.2, 21)}
                model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn', target=target)
            else:
                raise NotImplementedError('unknown method')

            protocol = UPP(data.test, repeats=100)
            modsel = GridSearchQ(model, param_grid, protocol, refit=False, n_jobs=-1, verbose=1)
            modsel.fit(data.training)

            print(f'best params {modsel.best_params_}')

            quantifier = modsel.best_model()

            # refit on the full training set and evaluate on the test protocol
            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
            quantifier.fit(data.training)
            protocol = UPP(data.test, repeats=100)
            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)

            means = report.mean()
            csv.write(f'{method_identifier}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
            csv.flush()

    df = pd.read_csv(result_path, sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)


@@ -0,0 +1,62 @@
from sklearn.linear_model import LogisticRegression
import os
import sys
import pandas as pd
import quapy as qp
from quapy.method.aggregative import DistributionMatching
from method_kdey import KDEy
from quapy.protocol import UPP

if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1

    method = 'KDE'
    param = 0.1
    div = 'topsoe'
    method_identifier = f'{method}_{param}_{div}'

    # generates tuples (dataset, method, method_name)
    # (the dataset is needed for methods that process the dataset differently)
    def gen_methods():
        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True)

            if method == 'KDE':
                kdey = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn')
                yield data, kdey, method_identifier
            elif method == 'DM':
                # note: for DM, param should be an integer number of bins
                dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=param)
                yield data, dm, method_identifier
            else:
                raise NotImplementedError('unknown method')

    os.makedirs('results', exist_ok=True)
    result_path = f'results/{method_identifier}.csv'
    if os.path.exists(result_path):
        print('Result already exists. Nothing to do')
        sys.exit(0)

    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')

        for data, quantifier, quant_name in gen_methods():
            quantifier.fit(data.training)
            protocol = UPP(data.test, repeats=100)
            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)
            means = report.mean()
            csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
            csv.flush()

    df = pd.read_csv(result_path, sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)


@@ -1,8 +1,11 @@
import os
import sys
from typing import Union, Callable
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.linear_model import LogisticRegression
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
import quapy as qp
@@ -12,56 +15,96 @@ from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _traini
    DistributionMatching, _get_divergence
import scipy
from scipy import optimize
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional


# TODO: optimize the bandwidth automatically
# TODO: replace the L2 metric in the kernel with the EMD; try to visualize the difference between both criteria in a 3-simplex
# TODO: think of an MMD-y variant, i.e., an MMD variant that uses the points in the simplex and possibly any non-linear kernel
class SklearnKDE:
    def __init__(self):
        pass

    def fit(self):
        pass

    def likelihood(self):
        pass
class KDEy(AggregativeProbabilisticQuantifier):

    BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
    ENGINE = ['scipy', 'sklearn', 'statsmodels']
    TARGET = ['min_divergence', 'max_likelihood']

    def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='HD',
                 bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None):
        assert bandwidth in KDEy.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
            f'unknown bandwidth, valid ones are {KDEy.BANDWIDTH_METHOD} or any float value'
        assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
        assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
        self.classifier = classifier
        self.val_split = val_split
        self.divergence = divergence
        self.bandwidth = bandwidth
        self.engine = engine
        self.target = target
        self.n_jobs = n_jobs

    def search_bandwidth_maxlikelihood(self, posteriors, labels):
        # searches for the bandwidth that maximizes the cross-validated log-likelihood of the posteriors;
        # note that KernelDensity ignores the labels during fit
        grid = {'bandwidth': np.linspace(0.001, 0.2, 100)}
        search = GridSearchCV(
            KernelDensity(), param_grid=grid, n_jobs=-1, cv=50, verbose=1, refit=True
        )
        search.fit(posteriors, labels)
        bandwidth = search.best_params_['bandwidth']
        print(f'auto: bandwidth={bandwidth:.5f}')
        return bandwidth
    def get_kde(self, posteriors):
        # if self.bandwidth == 'auto':
        #     print('adjusting bandwidth')
        #
        #     if self.engine == 'sklearn':
        #         grid = {'bandwidth': np.linspace(0.001, 0.2, 41)}
        #         search = GridSearchCV(
        #             KernelDensity(), param_grid=grid, n_jobs=-1, cv=10, verbose=1, refit=True
        #         )
        #         search.fit(posteriors)
        #         print(search.best_score_)
        #         print(search.best_params_)
        #
        #         import pandas as pd
        #         df = pd.DataFrame(search.cv_results_)
        #         pd.set_option('display.max_columns', None)
        #         pd.set_option('display.max_rows', None)
        #         pd.set_option('expand_frame_repr', False)
        #         print(df)
        #         sys.exit(0)
        #
        #     kde = search
        # else:
        if self.engine == 'scipy':
            # scipy treats columns as datapoints, and requires the datapoints not to lie in a lower-dimensional
            # subspace, hence the last dimension (which is constrained to sum up to 1) is removed
            posteriors = posteriors[:, :-1].T
            kde = scipy.stats.gaussian_kde(posteriors)
            kde.set_bandwidth(self.bandwidth)
        elif self.engine == 'sklearn':
            kde = KernelDensity(bandwidth=self.bandwidth).fit(posteriors)
        return kde
    def pdf(self, kde, posteriors):
        if self.engine == 'scipy':
            return kde(posteriors[:, :-1].T)
        elif self.engine == 'sklearn':
            return np.exp(kde.score_samples(posteriors))
    def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
        """
        Trains the classifier (if requested) and generates the validation distributions out of the training data.
        The validation distributions consist of one KDE per class, each fit to the posterior probabilities (points
        in the probability simplex) of the validation instances belonging to that class. In particular, let `V` be
        the list of validation densities; then `di = V[i]` is the density estimated from the validation instances
        labelled with class `i`, and `di(X)` returns the likelihood of a sample `X` of posterior probability
        vectors under that density.

        :param data: the training set
        :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
@@ -77,6 +120,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )

        if self.bandwidth == 'auto':
            self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)

        self.val_densities = [self.get_kde(posteriors[y == cat]) for cat in range(data.n_classes)]
        self.val_posteriors = posteriors
@@ -90,14 +136,18 @@
        """
        return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))
    def aggregate(self, posteriors: np.ndarray):
        if self.target == 'min_divergence':
            return self._target_divergence(posteriors)
        elif self.target == 'max_likelihood':
            return self._target_likelihood(posteriors)
        else:
            raise ValueError('unknown target')

    def _target_divergence(self, posteriors):
        """
        Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
        (the mixture of class-conditional KDEs) that best matches the test distribution, in terms of the divergence
        measure of choice. The matching is computed between the likelihoods that the mixture and the test KDE assign
        to the posterior probabilities of the test instances.

        :param posteriors: posterior probabilities of the instances in the sample
        :return: a vector of class prevalence estimates
        """
@@ -107,12 +157,14 @@
        test_likelihood = self.pdf(test_density, posteriors)
        divergence = _get_divergence(self.divergence)

        n_classes = len(self.val_densities)

        def match(prev):
            val_pdf = self.val_pdf(prev)
            val_likelihood = val_pdf(posteriors)
            return divergence(val_likelihood, test_likelihood)

        # the initial point is set as the uniform distribution
@@ -124,50 +176,27 @@
        r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x

    def _target_likelihood(self, posteriors):
        """
        Searches for the mixture model parameter (the sought prevalence values) under which the mixture of
        class-conditional validation KDEs assigns maximum likelihood to the observed posterior probabilities
        of the test instances.

        :param posteriors: posterior probabilities of the instances in the sample
        :return: a vector of class prevalence estimates
        """
        n_classes = len(self.val_densities)

        def neg_loglikelihood(prev):
            val_pdf = self.val_pdf(prev)
            test_likelihood = val_pdf(posteriors)
            test_loglikelihood = np.log(test_likelihood)
            return -np.sum(test_loglikelihood)

        # the initial point is set as the uniform distribution
        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))

        # solutions are bounded to those contained in the unit-simplex
        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
        r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
        return r.x
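
For reference, a minimal usage sketch of the new target parameter (illustrative parameter values; it assumes the fit/quantify API that KDEy inherits from AggregativeProbabilisticQuantifier, and the same Twitter fetch used by the laboratory scripts):

import quapy as qp
from sklearn.linear_model import LogisticRegression
from method_kdey import KDEy

data = qp.datasets.fetch_twitter('hcr', min_df=3, pickle=True)

# mixture of class-conditional KDEs fit on validation posteriors; prevalence sought by maximum likelihood
kdey = KDEy(LogisticRegression(), divergence='topsoe', bandwidth=0.1, engine='sklearn', target='max_likelihood')
kdey.fit(data.training)

# returns a vector of class prevalence estimates summing up to 1
estim_prev = kdey.quantify(data.test.instances)
print(estim_prev)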


@@ -0,0 +1,32 @@
import sys
from pathlib import Path
import pandas as pd

result_dir = 'results'

dfs = []

pathlist = Path(result_dir).rglob('*.csv')
for path in pathlist:
    path_in_str = str(path)
    print(path_in_str)

    df = pd.read_csv(path_in_str, sep='\t')
    dfs.append(df)

df = pd.concat(dfs)

piv = df.pivot_table(index='Dataset', columns='Method', values='MRAE')
piv.loc['mean'] = piv.mean()

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('expand_frame_repr', False)
print(piv)

laboratory/todo.txt Normal file

@@ -0,0 +1,21 @@
Fundamental idea:
KDE can be used to generate 2 distributions (one is a mixture model of KDEs fit on training data, conditioned on
each class, and the other is a KDE fit on the test data), between which a divergence (the objective to minimize)
is then computed. Another option is to generate only one distribution (the mixture model from training) and take
the likelihood of the test points as the objective to maximize (see the sketch after this file).

1) clarify: only test?
2) implement the auto mode
   - internal optimization for the likelihood [none seems to work well]
   - over everything (e.g., the whole training set)?
   - independently for each labelled set? (e.g., positives, negatives, neutrals, and test)
   - optimization as a GridSearchQ parameter
3) clarify: topsoe?
4) some other kind of model selection?
5) increase the number of bags
6) optimize the C parameter? optimize the kernel? optimize the distance?
7) sklearn's KDE or statsmodels' multivariate KDE? also look into what this is (it seems to give P(Y|X), meaning
   the classifier could perhaps be removed?):
   https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional
8) remove the last dimension in sklearn too?
9) optimize for RAE instead of AE?
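
To make the fundamental idea above concrete, a minimal self-contained sketch of the two objectives on toy one-dimensional scores (hypothetical data; a crude L2 gap stands in for the HD/topsoe divergence, and a grid search stands in for the SLSQP optimization used in the KDEy code):

import numpy as np
from sklearn.neighbors import KernelDensity

rng = np.random.RandomState(0)
# toy class-conditional "posteriors" for a binary problem (1D scores)
train_pos = rng.normal(0.8, 0.1, size=(100, 1))
train_neg = rng.normal(0.3, 0.1, size=(100, 1))
test = rng.normal(0.6, 0.2, size=(100, 1))

kde_pos = KernelDensity(bandwidth=0.1).fit(train_pos)
kde_neg = KernelDensity(bandwidth=0.1).fit(train_neg)
kde_test = KernelDensity(bandwidth=0.1).fit(test)

def mixture_likelihood(prev, points):
    # density of the mixture model of class-conditional KDEs at the given points
    return prev * np.exp(kde_pos.score_samples(points)) + (1 - prev) * np.exp(kde_neg.score_samples(points))

prevs = np.linspace(0.01, 0.99, 99)
# option 1: two distributions; minimize a divergence between the mixture and the test KDE
test_lik = np.exp(kde_test.score_samples(test))
div = [np.mean((mixture_likelihood(p, test) - test_lik) ** 2) for p in prevs]
# option 2: one distribution; maximize the likelihood of the test points under the mixture (no test KDE needed)
loglik = [np.sum(np.log(mixture_likelihood(p, test))) for p in prevs]

print('min-divergence prevalence:', prevs[np.argmin(div)])
print('max-likelihood prevalence:', prevs[np.argmax(loglik)])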


@@ -88,7 +88,7 @@ class GridSearchQ(BaseQuantifier):
        hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
        self._sout(f'starting model selection with {self.n_jobs =}')
        # pass a seed to parallel so it is set in child processes
        scores = qp.util.parallel(
            self._delayed_eval,
            ((params, training) for params in hyper),
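
The seeding comment above refers to a general pattern: reproducibility in worker processes requires shipping a seed along with each task. A minimal joblib illustration of that pattern (not quapy's actual implementation):

import numpy as np
from joblib import Parallel, delayed

def evaluate(params, seed):
    # the seed travels with the task, so each child process draws reproducible randomness
    rng = np.random.RandomState(seed)
    return params, rng.rand()

hyper = [{'bandwidth': b} for b in (0.05, 0.1, 0.15)]
scores = Parallel(n_jobs=-1)(delayed(evaluate)(params, seed) for seed, params in enumerate(hyper))
print(scores)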