From d995990fba05faa5135c39cc6870160258be2a23 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Fri, 21 Jul 2023 11:41:16 +0200 Subject: [PATCH] initial experiments and DIR method --- .gitignore | 34 +++++++++ laboratory/main_lequa.py | 96 +++++++++++++++++-------- laboratory/main_tweets.py | 123 +++++++++++++++++++++++--------- laboratory/method_dirichlety.py | 101 ++++++++++++++++++++++++++ laboratory/method_kdey.py | 23 ++---- laboratory/show_results.py | 29 +++++--- laboratory/todo.txt | 5 ++ 7 files changed, 322 insertions(+), 89 deletions(-) create mode 100644 laboratory/method_dirichlety.py diff --git a/.gitignore b/.gitignore index b9703a3..b5ecfbc 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,40 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +.idea +.vscode +LeQua2022 +MultiLabel/results_generales +MultiLabel/mlqtables +NewMethods/plots* +NewMethods/results* +NewMethods/tables* +NewMethods/latex* +Ordinal/data* +Ordinal/roberta* +Ordinal/tables* +Ordinal/results* +eDiscovery/plots* +eDiscovery/results* +examples/results* +poster-cikm* +slides-cikm* +slides-short-cikm* +quick_experiment/figures* +quick_experiment/figures* +svm_perf_quantification/* +TweetSentQuant/plots* +TweetSentQuant/results* +TweetSentQuant/tables* +TweetSentQuant/Tweet Sentiment Quantification_NPP +TweetSentQuant/checkpoint +TweetSentQuant/*.tex +checkpoint +*.png +*.zip +*.pkl +*.pickle +*.pdf # PyInstaller # Usually these files are written by a python script from a template diff --git a/laboratory/main_lequa.py b/laboratory/main_lequa.py index e8ed41a..2834d23 100644 --- a/laboratory/main_lequa.py +++ b/laboratory/main_lequa.py @@ -1,3 +1,4 @@ +import pickle import numpy as np from sklearn.linear_model import LogisticRegression import os @@ -5,52 +6,91 @@ import sys import pandas as pd import quapy as qp -from quapy.method.aggregative import DistributionMatching +from quapy.method.aggregative import EMQ, DistributionMatching, PACC, HDy, OneVsAllAggregative from 
method_kdey import KDEy +from method_dirichlety import DIRy from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP if __name__ == '__main__': qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B'] qp.environ['N_JOBS'] = -1 - method = 'KDE' - param = 0.1 - div = 'topsoe' - method_identifier = f'{method}_modsel_{div}' + result_dir = f'results_lequa' + optim = 'mae' - os.makedirs('results', exist_ok=True) - result_path = f'results_LequaT2B/{method_identifier}.csv' + os.makedirs(result_dir, exist_ok=True) - #if os.path.exists(result_path): - # print('Result already exit. Nothing to do') - # sys.exit(0) + hyper_LR = { + 'classifier__C': np.logspace(-3,3,7), + 'classifier__class_weight': ['balanced', None] + } - with open(result_path, 'wt') as csv: - csv.write(f'Method\tDataset\tMAE\tMRAE\n') + for method in ['PACC', 'SLD', 'DM', 'KDE', 'HDy', 'DIR']: + + #if os.path.exists(result_path): + # print('Result already exit. Nothing to do') + # sys.exit(0) - dataset = 'T1B' - train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset) + result_path = f'{result_dir}/{method}' + if os.path.exists(result_path+'.dataframe'): + print(f'result file {result_path} already exist; skipping') + continue - if method == 'KDE': - param_grid = {'bandwidth': np.linspace(0.001, 0.1, 11)} - model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn') - else: - raise NotImplementedError('unknown method') + with open(result_path+'.csv', 'at') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n') - modsel = GridSearchQ(model, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1) + dataset = 'T1B' + train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset) + print('init', dataset) + if method == 'KDE': + param_grid = { + 'bandwidth': np.linspace(0.001, 0.2, 21), + 'classifier__C': np.logspace(-4,4,9), + 'classifier__class_weight': ['balanced', None] + } + quantifier = KDEy(LogisticRegression(), 
target='max_likelihood') + elif method == 'DIR': + param_grid = hyper_LR + quantifier = DIRy(LogisticRegression()) + elif method == 'SLD': + param_grid = hyper_LR + quantifier = EMQ(LogisticRegression()) + elif method == 'PACC': + param_grid = hyper_LR + quantifier = PACC(LogisticRegression()) + elif method == 'HDy-OvA': + param_grid = { + 'binary_quantifier__classifier__C': np.logspace(-4,4,9), + 'binary_quantifier__classifier__class_weight': ['balanced', None] + } + quantifier = OneVsAllAggregative(HDy(LogisticRegression())) + elif method == 'DM': + param_grid = { + 'nbins': [5,10,15], + 'classifier__C': np.logspace(-4,4,9), + 'classifier__class_weight': ['balanced', None] + } + quantifier = DistributionMatching(LogisticRegression()) + else: + raise NotImplementedError('unknown method', method) - modsel.fit(train) - print(f'best params {modsel.best_params_}') + modsel = GridSearchQ(quantifier, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1, error=optim) - quantifier = modsel.best_model() + modsel.fit(train) + print(f'best params {modsel.best_params_}') + pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) - report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae'], verbose=True) - means = report.mean() - csv.write(f'{method}\tLeQua-{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') - csv.flush() + quantifier = modsel.best_model() - df = pd.read_csv(result_path, sep='\t') + report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae', 'kld'], verbose=True) + means = report.mean() + report.to_csv(result_path+'.dataframe') + csv.write(f'{method}\tLeQua-T1B\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n') + csv.flush() + + df = pd.read_csv(result_path+'.csv', sep='\t') pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) diff --git 
a/laboratory/main_tweets.py b/laboratory/main_tweets.py index 046dfc1..c07efbb 100644 --- a/laboratory/main_tweets.py +++ b/laboratory/main_tweets.py @@ -1,3 +1,4 @@ +import pickle import numpy as np from sklearn.linear_model import LogisticRegression import os @@ -5,8 +6,9 @@ import sys import pandas as pd import quapy as qp -from quapy.method.aggregative import DistributionMatching +from quapy.method.aggregative import EMQ, DistributionMatching, PACC, HDy, OneVsAllAggregative from method_kdey import KDEy +from method_dirichlety import DIRy from quapy.model_selection import GridSearchQ from quapy.protocol import UPP @@ -15,50 +17,103 @@ if __name__ == '__main__': qp.environ['SAMPLE_SIZE'] = 100 qp.environ['N_JOBS'] = -1 - method = 'KDE' - param = 0.1 - target = 'max_likelihood' - div = 'topsoe' - method_identifier = f'{method}_modsel_{div if target=="min_divergence" else target}' + n_bags_val = 250 + n_bags_test = 1000 + result_dir = f'results_tweet_{n_bags_test}' + optim = 'mae' - os.makedirs('results', exist_ok=True) - result_path = f'results/{method_identifier}.csv' + os.makedirs(result_dir, exist_ok=True) - #if os.path.exists(result_path): - # print('Result already exit. Nothing to do') - # sys.exit(0) + hyper_LR = { + 'classifier__C': np.logspace(-4,4,9), + 'classifier__class_weight': ['balanced', None] + } - with open(result_path, 'wt') as csv: - csv.write(f'Method\tDataset\tMAE\tMRAE\n') + for method in ['PACC', 'SLD', 'DM', 'KDE', 'HDy', 'DIR']: + + #if os.path.exists(result_path): + # print('Result already exit. 
Nothing to do') + # sys.exit(0) - for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST: - print('init', dataset) + result_path = f'{result_dir}/{method}' + if os.path.exists(result_path+'.dataframe'): + print(f'result file {result_path} already exist; skipping') + continue - data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True) + with open(result_path+'.csv', 'at') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\n') - if method == 'KDE': - param_grid = {'bandwidth': np.linspace(0.001, 0.2, 21)} - model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn', target=target) - else: - raise NotImplementedError('unknown method') + # four semeval dataset share the training, so it is useless to optimize hyperparameters four times; + # this variable controls that the mod sel has already been done, and skip this otherwise + semeval_trained = False - protocol = UPP(data.test, repeats=100) - modsel = GridSearchQ(model, param_grid, protocol, refit=False, n_jobs=-1, verbose=1) + for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST: + print('init', dataset) - modsel.fit(data.training) - print(f'best params {modsel.best_params_}') + is_semeval = dataset.startswith('semeval') - quantifier = modsel.best_model() + if not is_semeval or not semeval_trained: - data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False) - quantifier.fit(data.training) - protocol = UPP(data.test, repeats=100) - report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True) - means = report.mean() - csv.write(f'{method_identifier}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') - csv.flush() + if method == 'KDE': + param_grid = { + 'bandwidth': np.linspace(0.001, 0.2, 21), + 'classifier__C': np.logspace(-4,4,9), + 'classifier__class_weight': ['balanced', None] + } + quantifier = KDEy(LogisticRegression(), target='max_likelihood') + elif 
method == 'DIR': + param_grid = hyper_LR + quantifier = DIRy(LogisticRegression()) + elif method == 'SLD': + param_grid = hyper_LR + quantifier = EMQ(LogisticRegression()) + elif method == 'PACC': + param_grid = hyper_LR + quantifier = PACC(LogisticRegression()) + elif method == 'HDy-OvA': + param_grid = { + 'binary_quantifier__classifier__C': np.logspace(-4,4,9), + 'binary_quantifier__classifier__class_weight': ['balanced', None] + } + quantifier = OneVsAllAggregative(HDy(LogisticRegression())) + elif method == 'DM': + param_grid = { + 'nbins': [5,10,15], + 'classifier__C': np.logspace(-4,4,9), + 'classifier__class_weight': ['balanced', None] + } + quantifier = DistributionMatching(LogisticRegression()) + else: + raise NotImplementedError('unknown method', method) - df = pd.read_csv(result_path, sep='\t') + # model selection + data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True) + + protocol = UPP(data.test, repeats=n_bags_val) + modsel = GridSearchQ(quantifier, param_grid, protocol, refit=False, n_jobs=-1, verbose=1, error=optim) + + modsel.fit(data.training) + print(f'best params {modsel.best_params_}') + pickle.dump(modsel.best_params_, open(f'{result_dir}/{method}_{dataset}.hyper.pkl', 'wb'), pickle.HIGHEST_PROTOCOL) + + quantifier = modsel.best_model() + + if is_semeval: + semeval_trained = True + + else: + print(f'model selection for {dataset} already done; skipping') + + data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False) + quantifier.fit(data.training) + protocol = UPP(data.test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True) + report.to_csv(result_path+'.dataframe') + means = report.mean() + csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n') + csv.flush() + + df = pd.read_csv(result_path+'.csv', sep='\t') pd.set_option('display.max_columns', 
None) pd.set_option('display.max_rows', None) diff --git a/laboratory/method_dirichlety.py b/laboratory/method_dirichlety.py new file mode 100644 index 0000000..005bb63 --- /dev/null +++ b/laboratory/method_dirichlety.py @@ -0,0 +1,101 @@ +import os +import sys +from typing import Union, Callable +import numpy as np +from sklearn.base import BaseEstimator +from sklearn.linear_model import LogisticRegression +import pandas as pd +from sklearn.model_selection import GridSearchCV +from sklearn.neighbors import KernelDensity + +import quapy as qp +from quapy.data import LabelledCollection +from quapy.protocol import APP, UPP +from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \ + DistributionMatching, _get_divergence +import scipy +from scipy import optimize +from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional +import dirichlet + + +class DIRy(AggregativeProbabilisticQuantifier): + + def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'): + self.classifier = classifier + self.val_split = val_split + self.n_jobs = n_jobs + self.target = target + + def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None): + + if val_split is None: + val_split = self.val_split + + self.classifier, y, posteriors, _, _ = cross_generate_predictions( + data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs + ) + + self.val_parameters = [dirichlet.mle(posteriors[y == cat]) for cat in range(data.n_classes)] + + return self + + def val_pdf(self, prev): + """ + Returns a function that computes the mixture model with the given prev as mixture factor + :param prev: a prevalence vector, ndarray + :return: a function implementing the validation distribution with fixed mixture factor + """ + return lambda posteriors: sum(prev_i * 
dirichlet.pdf(parameters_i)(posteriors) for parameters_i, prev_i in zip(self.val_parameters, prev)) + + def aggregate(self, posteriors: np.ndarray): + if self.target == 'min_divergence': + raise NotImplementedError('not yet') + return self._target_divergence(posteriors) + elif self.target == 'max_likelihood': + return self._target_likelihood(posteriors) + else: + raise ValueError('unknown target') + + def _target_divergence(self, posteriors): + test_density = self.get_kde(posteriors) + # val_test_posteriors = np.concatenate([self.val_posteriors, posteriors]) + test_likelihood = self.pdf(test_density, posteriors) + divergence = _get_divergence(self.divergence) + + n_classes = len(self.val_densities) + + def match(prev): + val_pdf = self.val_pdf(prev) + val_likelihood = val_pdf(posteriors) + + #for i,prev_i in enumerate(prev): + + return divergence(val_likelihood, test_likelihood) + + # the initial point is set as the uniform distribution + uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 + r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + return r.x + + def _target_likelihood(self, posteriors, eps=0.000001): + n_classes = len(self.val_parameters) + + def neg_loglikelihood(prev): + val_pdf = self.val_pdf(prev) + test_likelihood = val_pdf(posteriors) + test_loglikelihood = np.log(test_likelihood + eps) + return -np.sum(test_loglikelihood) + + # the initial point is set as the uniform distribution + uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) + + # solutions are bounded to those contained in the unit-simplex + bounds = tuple((0, 1) for _ in range(n_classes)) # values in [0,1] + constraints = ({'type': 'eq', 'fun': lambda x: 1 - 
sum(x)}) # values summing up to 1 + r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + return r.x \ No newline at end of file diff --git a/laboratory/method_kdey.py b/laboratory/method_kdey.py index b5acd84..6aa89be 100644 --- a/laboratory/method_kdey.py +++ b/laboratory/method_kdey.py @@ -9,8 +9,8 @@ from sklearn.model_selection import GridSearchCV from sklearn.neighbors import KernelDensity import quapy as qp -from data import LabelledCollection -from protocol import APP, UPP +from quapy.data import LabelledCollection +from quapy.protocol import APP, UPP from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _training_helper, cross_generate_predictions, \ DistributionMatching, _get_divergence import scipy @@ -22,16 +22,6 @@ from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional # TODO: think of a MMD-y variant, i.e., a MMD variant that uses the points in the simplex and possibly any non-linear kernel -class SklearnKDE: - def __init__(self): - pass - - def fit(self): - pass - - def likelihood(self): - pass - class KDEy(AggregativeProbabilisticQuantifier): @@ -163,8 +153,6 @@ class KDEy(AggregativeProbabilisticQuantifier): val_pdf = self.val_pdf(prev) val_likelihood = val_pdf(posteriors) - #for i,prev_i in enumerate(prev): - return divergence(val_likelihood, test_likelihood) # the initial point is set as the uniform distribution @@ -176,7 +164,7 @@ class KDEy(AggregativeProbabilisticQuantifier): r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) return r.x - def _target_likelihood(self, posteriors): + def _target_likelihood(self, posteriors, eps=0.000001): """ Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution (the mixture) that best matches the test distribution, in terms of the divergence measure of choice. 
@@ -189,8 +177,9 @@ class KDEy(AggregativeProbabilisticQuantifier): def neg_loglikelihood(prev): val_pdf = self.val_pdf(prev) test_likelihood = val_pdf(posteriors) - test_loglikelihood = np.log(test_likelihood) - return - np.sum(test_loglikelihood) + test_loglikelihood = np.log(test_likelihood + eps) + return -np.sum(test_loglikelihood) + #return -np.prod(test_likelihood) # the initial point is set as the uniform distribution uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,)) diff --git a/laboratory/show_results.py b/laboratory/show_results.py index 219914c..2810234 100644 --- a/laboratory/show_results.py +++ b/laboratory/show_results.py @@ -2,7 +2,8 @@ import sys from pathlib import Path import pandas as pd -result_dir = 'results' +#result_dir = 'results_tweet_1000' +result_dir = 'results_lequa' dfs = [] @@ -11,19 +12,27 @@ for path in pathlist: path_in_str = str(path) print(path_in_str) - df = pd.read_csv(path_in_str, sep='\t') - - dfs.append(df) + try: + df = pd.read_csv(path_in_str, sep='\t') + if not df.empty: + dfs.append(df) + except Exception: + print('empty') df = pd.concat(dfs) -piv = df.pivot_table(index='Dataset', columns='Method', values='MRAE') -piv.loc['mean'] = piv.mean() +for err in ['MAE', 'MRAE']: + print('-'*100) + print(err) + print('-'*100) + piv = df.pivot_table(index='Dataset', columns='Method', values=err) + piv.loc['mean'] = piv.mean() -pd.set_option('display.max_columns', None) -pd.set_option('display.max_rows', None) -pd.set_option('expand_frame_repr', False) -print(piv) + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pd.set_option('expand_frame_repr', False) + print(piv) + print() diff --git a/laboratory/todo.txt b/laboratory/todo.txt index 20b69d8..4042ab2 100644 --- a/laboratory/todo.txt +++ b/laboratory/todo.txt @@ -4,6 +4,11 @@ y el otro es un KDE en test), de las que luego se calculará la divergencia (obj generar solo una distribución (mixture model de train) y 
tomar la likelihood de los puntos de test como objetivo a maximizar. +- quedarse con hyperparametros mejores por verlos +- sacar los dataframes en resultados para hacer test estadisticos +- hacer dibujitos + + 1) aclarar: only test? 2) implementar el auto - optimización interna para likelihood [ninguno parece funcionar bien]