From 26de9d92ebab66a427401a362da2c7ccf7783dd1 Mon Sep 17 00:00:00 2001 From: Alex Moreo Date: Thu, 20 Jul 2023 09:03:22 +0200 Subject: [PATCH] distribution matching multiclass --- laboratory/main_lequa.py | 58 ++++++++++++ laboratory/main_tweets.py | 66 ++++++++++++++ laboratory/main_tweets_auto.py | 62 +++++++++++++ laboratory/method_kdey.py | 161 +++++++++++++++++++-------------- laboratory/show_results.py | 32 +++++++ laboratory/todo.txt | 21 +++++ quapy/model_selection.py | 2 +- 7 files changed, 335 insertions(+), 67 deletions(-) create mode 100644 laboratory/main_lequa.py create mode 100644 laboratory/main_tweets.py create mode 100644 laboratory/main_tweets_auto.py create mode 100644 laboratory/show_results.py create mode 100644 laboratory/todo.txt diff --git a/laboratory/main_lequa.py b/laboratory/main_lequa.py new file mode 100644 index 0000000..e8ed41a --- /dev/null +++ b/laboratory/main_lequa.py @@ -0,0 +1,58 @@ +import numpy as np +from sklearn.linear_model import LogisticRegression +import os +import sys +import pandas as pd + +import quapy as qp +from quapy.method.aggregative import DistributionMatching +from method_kdey import KDEy +from quapy.model_selection import GridSearchQ + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B'] + qp.environ['N_JOBS'] = -1 + method = 'KDE' + param = 0.1 + div = 'topsoe' + method_identifier = f'{method}_modsel_{div}' + + os.makedirs('results', exist_ok=True) + result_path = f'results_LequaT2B/{method_identifier}.csv' + + #if os.path.exists(result_path): + # print('Result already exit. Nothing to do') + # sys.exit(0) + + with open(result_path, 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\n') + + dataset = 'T1B' + train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset) + + if method == 'KDE': + param_grid = {'bandwidth': np.linspace(0.001, 0.1, 11)} + model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn') + else: + raise NotImplementedError('unknown method') + + modsel = GridSearchQ(model, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1) + + modsel.fit(train) + print(f'best params {modsel.best_params_}') + + quantifier = modsel.best_model() + + report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae'], verbose=True) + means = report.mean() + csv.write(f'{method}\tLeQua-{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') + csv.flush() + + df = pd.read_csv(result_path, sep='\t') + + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) + print(pv) diff --git a/laboratory/main_tweets.py b/laboratory/main_tweets.py new file mode 100644 index 0000000..046dfc1 --- /dev/null +++ b/laboratory/main_tweets.py @@ -0,0 +1,66 @@ +import numpy as np +from sklearn.linear_model import LogisticRegression +import os +import sys +import pandas as pd + +import quapy as qp +from quapy.method.aggregative import DistributionMatching +from method_kdey import KDEy +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 100 + qp.environ['N_JOBS'] = -1 + method = 'KDE' + param = 0.1 + target = 'max_likelihood' + div = 'topsoe' + method_identifier = f'{method}_modsel_{div if target=="min_divergence" else target}' + + os.makedirs('results', exist_ok=True) + result_path = f'results/{method_identifier}.csv' + + #if os.path.exists(result_path): + # print('Result already exit. Nothing to do') + # sys.exit(0) + + with open(result_path, 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\n') + + for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST: + print('init', dataset) + + data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True) + + if method == 'KDE': + param_grid = {'bandwidth': np.linspace(0.001, 0.2, 21)} + model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn', target=target) + else: + raise NotImplementedError('unknown method') + + protocol = UPP(data.test, repeats=100) + modsel = GridSearchQ(model, param_grid, protocol, refit=False, n_jobs=-1, verbose=1) + + modsel.fit(data.training) + print(f'best params {modsel.best_params_}') + + quantifier = modsel.best_model() + + data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False) + quantifier.fit(data.training) + protocol = UPP(data.test, repeats=100) + report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True) + means = report.mean() + csv.write(f'{method_identifier}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') + csv.flush() + + df = pd.read_csv(result_path, sep='\t') + + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) + print(pv) diff --git a/laboratory/main_tweets_auto.py b/laboratory/main_tweets_auto.py new file mode 100644 index 0000000..9472db5 --- /dev/null +++ b/laboratory/main_tweets_auto.py @@ -0,0 +1,62 @@ +from sklearn.linear_model import LogisticRegression +import os +import sys +import pandas as pd + +import quapy as qp +from method.aggregative import DistributionMatching +from method_kdey import KDEy +from protocol import UPP + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 100 + qp.environ['N_JOBS'] = -1 + method = 'KDE' + param = 0.1 + div = 'topsoe' + method_identifier = f'{method}_{param}_{div}' + + # generates tuples (dataset, method, method_name) + # (the dataset is needed for methods that process the dataset differently) + def gen_methods(): + + for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST: + + data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True) + + if method == 'KDE': + kdey = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn') + yield data, kdey, method_identifier + + elif method == 'DM': + dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=param) + yield data, dm, method_identifier + + else: + raise NotImplementedError('unknown method') + + os.makedirs('results', exist_ok=True) + result_path = f'results/{method_identifier}.csv' + + if os.path.exists(result_path): + print('Result already exit. Nothing to do') + sys.exit(0) + + with open(result_path, 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\n') + for data, quantifier, quant_name in gen_methods(): + quantifier.fit(data.training) + protocol = UPP(data.test, repeats=100) + report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True) + means = report.mean() + csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') + csv.flush() + + df = pd.read_csv(result_path, sep='\t') + + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) + print(pv) diff --git a/laboratory/method_kdey.py b/laboratory/method_kdey.py index e999652..b5acd84 100644 --- a/laboratory/method_kdey.py +++ b/laboratory/method_kdey.py @@ -1,8 +1,11 @@ +import os +import sys from typing import Union, Callable import numpy as np from sklearn.base import BaseEstimator from sklearn.linear_model import LogisticRegression import pandas as pd +from sklearn.model_selection import GridSearchCV from sklearn.neighbors import KernelDensity import quapy as qp @@ -12,56 +15,96 @@ from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _traini DistributionMatching, _get_divergence import scipy from scipy import optimize +from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional # TODO: optimize the bandwidth automatically -# TODO: replace the l2 metric in the kernel with the EMD, try to visualize the difference between both criteria in a 3-simplex # TODO: think of a MMD-y variant, i.e., a MMD variant that uses the points in the simplex and possibly any non-linear kernel +class SklearnKDE: + def __init__(self): + pass + + def fit(self): + pass + + def likelihood(self): + pass + + class KDEy(AggregativeProbabilisticQuantifier): BANDWIDTH_METHOD = ['auto', 'scott', 'silverman'] - ENGINE = ['scipy', 'sklearn'] + ENGINE = ['scipy', 'sklearn', 'statsmodels'] + TARGET = ['min_divergence', 'max_likelihood'] def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='HD', - bandwidth_method='scott', engine='sklearn', n_jobs=None): + bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None): + assert bandwidth in KDEy.BANDWIDTH_METHOD or isinstance(bandwidth, float), \ + f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}' + assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}' + assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}' self.classifier = classifier self.val_split = val_split self.divergence = divergence - self.bandwidth_method = bandwidth_method + self.bandwidth = bandwidth self.engine = engine + self.target = target self.n_jobs = n_jobs - assert bandwidth_method in KDEy.BANDWIDTH_METHOD, f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}' - assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}' + + def search_bandwidth_maxlikelihood(self, posteriors, labels): + grid = {'bandwidth': np.linspace(0.001, 0.2, 100)} + search = GridSearchCV( + KernelDensity(), param_grid=grid, n_jobs=-1, cv=50, verbose=1, refit=True + ) + search.fit(posteriors, labels) + bandwidth = search.best_params_['bandwidth'] + print(f'auto: bandwidth={bandwidth:.5f}') + return bandwidth def get_kde(self, posteriors): + # if self.bandwidth == 'auto': + # print('adjusting bandwidth') + # + # if self.engine == 'sklearn': + # grid = {'bandwidth': np.linspace(0.001,0.2,41)} + # search = GridSearchCV( + # KernelDensity(), param_grid=grid, n_jobs=-1, cv=10, verbose=1, refit=True + # ) + # search.fit(posteriors) + # print(search.best_score_) + # print(search.best_params_) + # + # import pandas as pd + # df = pd.DataFrame(search.cv_results_) + # pd.set_option('display.max_columns', None) + # pd.set_option('display.max_rows', None) + # pd.set_option('expand_frame_repr', False) + # print(df) + # sys.exit(0) + # + # kde = search + + #else: if self.engine == 'scipy': # scipy treats columns as datapoints, and need the datapoints not to lie in a lower-dimensional subspace, which # requires removing the last dimension which is constrained posteriors = posteriors[:,:-1].T kde = scipy.stats.gaussian_kde(posteriors) - kde.set_bandwidth(self.bandwidth_method) + kde.set_bandwidth(self.bandwidth) elif self.engine == 'sklearn': - kde = KernelDensity(bandwidth=self.bandwidth_method).fit(posteriors) + kde = KernelDensity(bandwidth=self.bandwidth).fit(posteriors) return kde def pdf(self, kde, posteriors): if self.engine == 'scipy': - return kde(posteriors[:,:-1].T) + return kde(posteriors[:, :-1].T) elif self.engine == 'sklearn': return np.exp(kde.score_samples(posteriors)) def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None): """ - Trains the classifier (if requested) and generates the validation distributions out of the training data. - The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of - channels (a channel is a description, in form of a histogram, of a specific class -- there are as many channels - as classes, although in the binary case one can use only one channel, since the other one is constrained), - and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]` - are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete - distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]` - is the fraction of instances with a value in the `k`-th bin. :param data: the training set :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit) @@ -77,6 +120,9 @@ class KDEy(AggregativeProbabilisticQuantifier): data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs ) + if self.bandwidth == 'auto': + self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y) + self.val_densities = [self.get_kde(posteriors[y == cat]) for cat in range(data.n_classes)] self.val_posteriors = posteriors @@ -90,14 +136,18 @@ class KDEy(AggregativeProbabilisticQuantifier): """ return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev)) - def aggregate(self, posteriors: np.ndarray): + if self.target == 'min_divergence': + return self._target_divergence(posteriors) + elif self.target == 'max_likelihood': + return self._target_likelihood(posteriors) + else: + raise ValueError('unknown target') + + def _target_divergence(self, posteriors): """ Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution (the mixture) that best matches the test distribution, in terms of the divergence measure of choice. - In the multiclass case, with `n` the number of classes, the test and mixture distributions contain - `n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed - independently. The matching is computed as an average of the divergence across all channels. :param instances: instances in the sample :return: a vector of class prevalence estimates @@ -107,12 +157,14 @@ class KDEy(AggregativeProbabilisticQuantifier): test_likelihood = self.pdf(test_density, posteriors) divergence = _get_divergence(self.divergence) - n_classes = len(self.val_densities) def match(prev): val_pdf = self.val_pdf(prev) - val_likelihood = val_pdf(posteriors) + val_likelihood = val_pdf(posteriors) + + #for i,prev_i in enumerate(prev): + return divergence(val_likelihood, test_likelihood) # the initial point is set as the uniform distribution @@ -124,50 +176,27 @@ class KDEy(AggregativeProbabilisticQuantifier): r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) return r.x + def _target_likelihood(self, posteriors): + """ + Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution + (the mixture) that best matches the test distribution, in terms of the divergence measure of choice. + :param instances: instances in the sample + :return: a vector of class prevalence estimates + """ + n_classes = len(self.val_densities) -if __name__ == '__main__': + def neg_loglikelihood(prev): + val_pdf = self.val_pdf(prev) + test_likelihood = val_pdf(posteriors) + test_loglikelihood = np.log(test_likelihood) + return - np.sum(test_loglikelihood) - qp.environ['SAMPLE_SIZE'] = 100 - qp.environ['N_JOBS'] = -1 - div = 'HD' + # the initial point is set as the uniform distribution + uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) - # generates tuples (dataset, method, method_name) - # (the dataset is needed for methods that process the dataset differently) - def gen_methods(): - - for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST: - - data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True) - - # kdey = KDEy(LogisticRegression(), divergence=div, bandwidth_method='scott') - # yield data, kdey, f'KDEy-{div}-scott' - - kdey = KDEy(LogisticRegression(), divergence=div, bandwidth_method='silverman', engine='sklearn') - yield data, kdey, f'KDEy-{div}-silverman' - - dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5) - yield data, dm, f'DM-5b-{div}' - - # dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10) - # yield data, dm, f'DM-10b-{div}' - - - result_path = 'results_kdey.csv' - with open(result_path, 'wt') as csv: - csv.write(f'Method\tDataset\tMAE\tMRAE\n') - for data, quantifier, quant_name in gen_methods(): - quantifier.fit(data.training) - protocol = UPP(data.test, repeats=100) - report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae','mrae'], verbose=True) - means = report.mean() - csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') - csv.flush() - - df = pd.read_csv(result_path, sep='\t') - # print(df) - - pd.set_option('display.max_columns', None) - pd.set_option('display.max_rows', None) - pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"]) - print(pv) + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + return r.x \ No newline at end of file diff --git a/laboratory/show_results.py b/laboratory/show_results.py new file mode 100644 index 0000000..219914c --- /dev/null +++ b/laboratory/show_results.py @@ -0,0 +1,32 @@ +import sys +from pathlib import Path +import pandas as pd + +result_dir = 'results' + +dfs = [] + +pathlist = Path(result_dir).rglob('*.csv') +for path in pathlist: + path_in_str = str(path) + print(path_in_str) + + df = pd.read_csv(path_in_str, sep='\t') + + dfs.append(df) + +df = pd.concat(dfs) + +piv = df.pivot_table(index='Dataset', columns='Method', values='MRAE') +piv.loc['mean'] = piv.mean() + +pd.set_option('display.max_columns', None) +pd.set_option('display.max_rows', None) +pd.set_option('expand_frame_repr', False) +print(piv) + + + + + + diff --git a/laboratory/todo.txt b/laboratory/todo.txt new file mode 100644 index 0000000..20b69d8 --- /dev/null +++ b/laboratory/todo.txt @@ -0,0 +1,21 @@ +Cosa fundamental: +KDE se puede usar para generar 2 distribuciones (una, es un mixture model de KDEs en train condicionados a cada clase, +y el otro es un KDE en test), de las que luego se calculará la divergencia (objetivo a minimizar). Otra opción es +generar solo una distribución (mixture model de train) y tomar la likelihood de los puntos de test como objetivo +a maximizar. + +1) aclarar: only test? +2) implementar el auto + - optimización interna para likelihood [ninguno parece funcionar bien] + - de todo (e.g., todo el training)? + - independiente para cada conjunto etiquetado? (e.g., positivos, negativos, neutros, y test) + - optimización como un parámetro GridSearchQ +3) aclarar: topsoe? +4) otro tipo de model selection? +5) aumentar numero de bags +6) optimizar parametro C? optimizar kernel? optimizar distancia? +7) KDE de sklearn o multivariate KDE de statsmodel? ver también qué es esto (parece que da P(Y|X) o sea que podría + eliminar el clasificador?): + https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional +8) quitar la ultima dimension en sklearn también? +9) optimizar para RAE en vez de AE? diff --git a/quapy/model_selection.py b/quapy/model_selection.py index 4a53bb6..146d178 100644 --- a/quapy/model_selection.py +++ b/quapy/model_selection.py @@ -88,7 +88,7 @@ class GridSearchQ(BaseQuantifier): hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)] self._sout(f'starting model selection with {self.n_jobs =}') - #pass a seed to parallel so it is set in clild processes + # pass a seed to parallel so it is set in clild processes scores = qp.util.parallel( self._delayed_eval, ((params, training) for params in hyper),