diff --git a/NewMethods/fgsld/fglsd_test.py b/NewMethods/fgsld/fglsd_test.py
new file mode 100644
index 0000000..72fb90d
--- /dev/null
+++ b/NewMethods/fgsld/fglsd_test.py
@@ -0,0 +1,47 @@
+from sklearn.calibration import CalibratedClassifierCV
+from sklearn.svm import LinearSVC
+from fgsld.fgsld_quantifiers import FakeFGLSD
+from method.aggregative import EMQ, CC
+import quapy as qp
+
+
+qp.environ['SAMPLE_SIZE'] = 500
+
+dataset = qp.datasets.fetch_reviews('kindle')
+qp.data.preprocessing.text2tfidf(dataset, min_df=5, inplace=True)
+
+training = dataset.training
+test = dataset.test
+
+cls = CalibratedClassifierCV(LinearSVC())
+
+
+method_names, true_prevs, estim_prevs, tr_prevs = [], [], [], []
+
+for model, model_name in [
+    (CC(cls), 'CC'),
+    # (FakeFGLSD(cls, nbins=5, isomerous=False, recompute_bins=False), 'FGSLD-isometric-stat-5'),
+    (FakeFGLSD(cls, nbins=5, isomerous=True, recompute_bins=True), 'FGSLD-isomerous-dyn-5'),
+    # (FakeFGLSD(cls, nbins=5, isomerous=True, recompute_bins=False), 'FGSLD-isomerous-stat-5'),
+    # (FakeFGLSD(cls, nbins=10, isomerous=True, recompute_bins=True), 'FGSLD-isomerous-dyn-10'),
+    # (FakeFGLSD(cls, nbins=5, isomerous=False), 'FGSLD-5'),
+    # (FakeFGLSD(cls, nbins=10, isomerous=False), 'FGSLD-10'),
+    # (FakeFGLSD(cls, nbins=50, isomerous=False), 'FGSLD-50'),
+    # (FakeFGLSD(cls, nbins=100, isomerous=False), 'FGSLD-100'),
+    # (FakeFGLSD(cls, nbins=1, isomerous=False), 'FGSLD-1'),
+    # (FakeFGLSD(cls, nbins=10, isomerous=True), 'FGSLD-10-ISO'),
+    # (FakeFGLSD(cls, nbins=50, isomerous=False), 'FGSLD-50'),
+    (EMQ(cls), 'SLD'),
+]:
+    print('running', model_name)
+    model.fit(training)
+    true_prev, estim_prev = qp.evaluation.artificial_sampling_prediction(
+        model, test, qp.environ['SAMPLE_SIZE'], n_repetitions=5, n_prevpoints=11, n_jobs=-1
+    )
+    method_names.append(model_name)
+    true_prevs.append(true_prev)
+    estim_prevs.append(estim_prev)
+    tr_prevs.append(training.prevalence())
+
+
+qp.plot.binary_diagonal(method_names, true_prevs, estim_prevs, train_prev=tr_prevs[0], savepath='./plot_fglsd.png')
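Note: the model labels above distinguish isometric binning (bins of equal width over [0, 1]) from isomerous binning (bins of approximately equal mass), while the 'stat'/'dyn' suffixes map to the `recompute_bins` flag (bins computed once vs. recomputed at every iteration). Neither binning helper is defined in this patch; the following is only a minimal sketch of the distinction, assuming posterior scores in [0, 1]:

    import numpy as np

    posteriors = np.random.rand(1000)  # hypothetical posterior scores for one class
    n_bins = 5

    # isometric: equal-width bin edges over [0, 1]
    isometric_edges = np.linspace(0., 1., num=n_bins + 1)

    # isomerous: equal-mass bin edges, i.e. quantiles of the observed scores
    isomerous_edges = np.quantile(posteriors, np.linspace(0., 1., num=n_bins + 1))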
+ """ + smoothing_tr = 1 / (2 * self.tr_preds.shape[0]) + smoothing_te = 1 / (2 * self.te_preds.shape[0]) + s = 0 + tr_bin_priors = np.zeros((self.n_bins, self.tr_preds.shape[1]), dtype=np.float) + te_bin_priors = np.zeros((self.n_bins, self.te_preds.shape[1]), dtype=np.float) + tr_bins = self.__create_bins(training=True, isomerous_binning=isomerous_binning) + te_bins = self.__create_bins(training=False, isomerous_binning=isomerous_binning) + self.__compute_bins_priors(tr_bin_priors, self.tr_preds, tr_bins, smoothing_tr) + + val = 2 * epsilon + if return_posteriors_hist: + posteriors_hist = [self.te_preds.copy()] + while not val < epsilon and s < 1000: + assert np.all(np.around(self.te_preds.sum(axis=1), 4) == 1), f"Probabilities do not sum to 1:\ns={s}, " \ + f"probs={self.te_preds.sum(axis=1)}" + if compute_bins_at_every_iter: + te_bins = self.__create_bins(training=False, isomerous_binning=isomerous_binning) + + if s == 0: + te_bin_priors_prev = tr_bin_priors.copy() + else: + te_bin_priors_prev = te_bin_priors.copy() + self.__compute_bins_priors(te_bin_priors, self.te_preds, te_bins, smoothing_te) + + te_preds_cp = self.te_preds.copy() + for label_idx, bins in te_bins.items(): + for i, bin_ in enumerate(bins): + if bin_.shape[0] == 0: + continue + te = te_bin_priors[i][label_idx] + tr = tr_bin_priors[i][label_idx] + # local_min = (math.floor(tr * 10) / 10) + # local_max = local_min + .1 + # trans = lambda l: min(max((l - local_min) / 1, 0), 1) + trans = lambda l: l + self.te_preds[:, label_idx][bin_] = (te_preds_cp[:, label_idx][bin_]) * \ + (trans(te) / trans(tr)) + + # Normalization step + self.te_preds = (self.te_preds / self.te_preds.sum(axis=1, keepdims=True)) + + val = 0 + for label_idx in range(te_bin_priors.shape[1]): + temp = max(abs((te_bin_priors[:, label_idx] / te_bin_priors_prev[:, label_idx]) - 1)) + if temp > val: + val = temp + s += 1 + if return_posteriors_hist: + posteriors_hist.append(self.te_preds.copy()) + if return_posteriors_hist: + return self.te_preds.mean(axis=0), posteriors_hist + return self.te_preds.mean(axis=0), self.te_preds + + def __compute_bins_priors(self, bin_priors_placeholder, posteriors, bins, smoothing): + for label_idx, bins in bins.items(): + for i, bin_ in enumerate(bins): + if bin_.shape[0] == 0: + bin_priors_placeholder[i, label_idx] = smoothing + continue + numerator = posteriors[:, label_idx][bin_].mean() + bin_prior = (numerator + smoothing) / (1 + self.n_bins * smoothing) # normalize priors + bin_priors_placeholder[i, label_idx] = bin_prior + + def __find_bin_idx(self, label_bins: [np.array], idx: int or list): + if hasattr(idx, '__len__'): + idxs = np.zeros(len(idx), dtype=np.int) + for i, bin_ in enumerate(label_bins): + for j, id_ in enumerate(idx): + if id_ in bin_: + idxs[j] = i + return idxs + else: + for i, bin_ in enumerate(label_bins): + if idx in bin_: + return i + + def __create_bins(self, training: bool, isomerous_binning: bool): + bins = {} + preds = self.tr_preds if training else self.te_preds + if isomerous_binning: + for label_idx in range(preds.shape[1]): + bins[label_idx] = isomerous_bins(label_idx, preds, self.n_bins) + else: + intervals = np.linspace(0., 1., num=self.n_bins, endpoint=False) + for label_idx in range(preds.shape[1]): + bins_ = isometric_bins(label_idx, preds, intervals, 0.1) + bins[label_idx] = [bins_[i] for i in intervals] + return bins diff --git a/NewMethods/fgsld/plot_fglsd.png b/NewMethods/fgsld/plot_fglsd.png new file mode 100644 index 0000000..d1dd2d6 Binary files /dev/null and 
diff --git a/NewMethods/fgsld/plot_fglsd.png b/NewMethods/fgsld/plot_fglsd.png
new file mode 100644
index 0000000..d1dd2d6
Binary files /dev/null and b/NewMethods/fgsld/plot_fglsd.png differ
diff --git a/NewMethods/gen_tables.py b/NewMethods/gen_tables.py
new file mode 100644
index 0000000..2cf2aab
--- /dev/null
+++ b/NewMethods/gen_tables.py
@@ -0,0 +1,161 @@
+import quapy as qp
+import numpy as np
+from os import makedirs
+import os
+import pickle
+from experiments import result_path
+from tabular import Table
+import argparse
+
+tables_path = './tables'
+MAXTONE = 50  # sets the intensity of the maximum color reached by the worst (red) and best (green) results
+
+makedirs(tables_path, exist_ok=True)
+
+sample_size = 100
+qp.environ['SAMPLE_SIZE'] = sample_size
+
+
+nice = {
+    'mae': 'AE',
+    'mrae': 'RAE',
+    'ae': 'AE',
+    'rae': 'RAE',
+    'svmkld': 'SVM(KLD)',
+    'svmnkld': 'SVM(NKLD)',
+    'svmq': 'SVM(Q)',
+    'svmae': 'SVM(AE)',
+    'svmnae': 'SVM(NAE)',
+    'svmmae': 'SVM(AE)',
+    'svmmrae': 'SVM(RAE)',
+    'quanet': 'QuaNet',
+    'hdy': 'HDy',
+    'hdysld': 'HDy-SLD',
+    'dys': 'DyS',
+    'svmperf': '',
+    'sanders': 'Sanders',
+    'semeval13': 'SemEval13',
+    'semeval14': 'SemEval14',
+    'semeval15': 'SemEval15',
+    'semeval16': 'SemEval16',
+    'Average': 'Average'
+}
+
+
+def save_table(path, table):
+    print(f'saving results in {path}')
+    with open(path, 'wt') as foo:
+        foo.write(table)
+
+
+def experiment_errors(path, dataset, method, loss):
+    path = result_path(path, dataset, method, 'm' + loss if not loss.startswith('m') else loss)
+    if os.path.exists(path):
+        true_prevs, estim_prevs, _, _, _, _ = pickle.load(open(path, 'rb'))
+        err_fn = getattr(qp.error, loss)
+        errors = err_fn(true_prevs, estim_prevs)
+        return errors
+    return None
+
+
+def nicerm(key):
+    return '\\mathrm{' + nice[key] + '}'
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Generate tables for Twitter Sentiment Quantification')
+    parser.add_argument('results', metavar='RESULT_PATH', type=str,
+                        help='path to the directory containing the results of the methods tested in Gao & Sebastiani')
+    parser.add_argument('newresults', metavar='NEW_RESULT_PATH', type=str,
+                        help='path to the directory containing the results for the experimental methods')
+    args = parser.parse_args()
+
+    datasets = qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST
+    evaluation_measures = [qp.error.ae, qp.error.rae]
+    gao_seb_methods = ['cc', 'acc', 'pcc', 'pacc', 'sld', 'svmq', 'svmkld', 'svmnkld']
+    new_methods = ['hdy']  # methods added to the Gao & Sebastiani methods
+    experimental_methods = ['hdysld']  # experimental methods
+
+    for i, eval_func in enumerate(evaluation_measures):
+
+        # Tables with evaluation scores for AE and RAE (two tables)
+        # ---------------------------------------------------------
+
+        eval_name = eval_func.__name__
+
+        added_methods = ['svmm' + eval_name] + new_methods
+        methods = gao_seb_methods + added_methods + experimental_methods
+        nold_methods = len(gao_seb_methods)
+        nnew_methods = len(added_methods)
+        nexp_methods = len(experimental_methods)
+
+        # fill the data table
+        table = Table(benchmarks=datasets, methods=methods)
+        for dataset in datasets:
+            for method in methods:
+                if method in experimental_methods:
+                    path = args.newresults
+                else:
+                    path = args.results
+                table.add(dataset, method, experiment_errors(path, dataset, method, eval_name))
+
+        # write the latex table
+        tabular = """
+        \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|' * nold_methods) + '|' + ('Y|' * nnew_methods) + '|' + ('Y|' * nexp_methods) + """} \\hline
+          & \\multicolumn{""" + str(nold_methods) + """}{c||}{Methods tested in~\\cite{Gao:2016uq}} &
+            \\multicolumn{""" + str(nnew_methods) + """}{c|}{} &
+            \\multicolumn{""" + str(nexp_methods) + """}{c|}{}\\\\ \\hline
+        """
+        rowreplace = {dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
+        colreplace = {method: '\\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} ' for method in methods}
+
+        tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace)
+        tabular += "\n\\end{tabularx}"
+
+        save_table(f'./tables/tab_results_{eval_name}.new.tex', tabular)
+
+        # Tables with ranks for AE and RAE (two tables)
+        # ---------------------------------------------------------
+        # fill the data table
+        ranktable = Table(benchmarks=datasets, methods=methods, missing='--')
+        for dataset in datasets:
+            for method in methods:
+                ranktable.add(dataset, method, values=table.get(dataset, method, 'rank'))
+
+        # write the latex table
+        tabular = """
+        \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|' * nold_methods) + '|' + ('Y|' * nnew_methods) + '|' + ('Y|' * nexp_methods) + """} \\hline
+          & \\multicolumn{""" + str(nold_methods) + """}{c||}{Methods tested in~\\cite{Gao:2016uq}} &
+            \\multicolumn{""" + str(nnew_methods) + """}{c|}{} &
+            \\multicolumn{""" + str(nexp_methods) + """}{c|}{}\\\\ \\hline
+        """
+        for method in methods:
+            tabular += ' & \\side{' + nice.get(method, method.upper()) + '$^{' + nicerm(eval_name) + '}$} '
+        tabular += '\\\\\\hline\n'
+
+        for dataset in datasets:
+            tabular += nice.get(dataset, dataset.upper()) + ' '
+            for method in methods:
+                newrank = ranktable.get(dataset, method)
+                if newrank != '--':
+                    newrank = f'{int(newrank)}'
+                color = ranktable.get_color(dataset, method)
+                if color == '--':
+                    color = ''
+                tabular += ' & ' + f'{newrank}' + color
+            tabular += '\\\\\\hline\n'
+        tabular += '\\hline\n'
+
+        tabular += 'Average '
+        for method in methods:
+            newrank = ranktable.get_average(method)
+            if newrank != '--':
+                newrank = f'{newrank:.1f}'
+            color = ranktable.get_average(method, 'color')
+            if color == '--':
+                color = ''
+            tabular += ' & ' + f'{newrank}' + color
+        tabular += '\\\\\\hline\n'
+        tabular += "\\end{tabularx}"
+
+        save_table(f'./tables/tab_rank_{eval_name}.new.tex', tabular)
+
+    print("[Done]")
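The script is driven from the command line, e.g. `python gen_tables.py <results-dir> <newresults-dir>`, and renders one score table and one rank table per error measure through the project-local `Table` helper. A minimal sketch of that workflow with hypothetical data; the constructor and method calls are the ones used above, assuming `latexTabular` accepts its replacement maps as shown:

    import numpy as np
    from tabular import Table  # project-local helper, assumed on the import path

    t = Table(benchmarks=['semeval13'], methods=['cc', 'sld'])
    t.add('semeval13', 'cc', np.array([0.08, 0.11]))   # hypothetical per-sample AE scores
    t.add('semeval13', 'sld', np.array([0.05, 0.07]))
    print(t.latexTabular(benchmark_replace={}, method_replace={}))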
\multicolumn{"""+str(nexp_methods)+"""}{c|}{}\\\\ \hline + """ + rowreplace={dataset: nice.get(dataset, dataset.upper()) for dataset in datasets} + colreplace={method:'\side{' + nice.get(method, method.upper()) +'$^{' + nicerm(eval_name) + '}$} ' for method in methods} + + tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace) + tabular += "\n\end{tabularx}" + + save_table(f'./tables/tab_results_{eval_name}.new.tex', tabular) + + # Tables ranks for AE and RAE (two tables) + # ---------------------------------------------------- + # fill the data table + ranktable = Table(benchmarks=datasets, methods=methods, missing='--') + for dataset in datasets: + for method in methods: + ranktable.add(dataset, method, values=table.get(dataset, method, 'rank')) + + # write the latex table + tabular = """ + \\begin{tabularx}{\\textwidth}{|c||""" + ('Y|'*nold_methods) + '|' + ('Y|'*nnew_methods) + '|' + ('Y|'*nexp_methods) + """} \hline + & \multicolumn{"""+str(nold_methods)+"""}{c||}{Methods tested in~\cite{Gao:2016uq}} & + \multicolumn{"""+str(nnew_methods)+"""}{c|}{} & + \multicolumn{"""+str(nexp_methods)+"""}{c|}{}\\\\ \hline + """ + for method in methods: + tabular += ' & \side{' + nice.get(method, method.upper()) +'$^{' + nicerm(eval_name) + '}$} ' + tabular += '\\\\\hline\n' + + for dataset in datasets: + tabular += nice.get(dataset, dataset.upper()) + ' ' + for method in methods: + newrank = ranktable.get(dataset, method) + if newrank != '--': + newrank = f'{int(newrank)}' + color = ranktable.get_color(dataset, method) + if color == '--': + color = '' + tabular += ' & ' + f'{newrank}' + color + tabular += '\\\\\hline\n' + tabular += '\hline\n' + + tabular += 'Average ' + for method in methods: + newrank = ranktable.get_average(method) + if newrank != '--': + newrank = f'{newrank:.1f}' + color = ranktable.get_average(method, 'color') + if color == '--': + color = '' + tabular += ' & ' + f'{newrank}' + color + tabular += '\\\\\hline\n' + tabular += "\end{tabularx}" + + save_table(f'./tables/tab_rank_{eval_name}.new.tex', tabular) + + print("[Done]") diff --git a/NewMethods/settings.py b/NewMethods/settings.py new file mode 100644 index 0000000..431e173 --- /dev/null +++ b/NewMethods/settings.py @@ -0,0 +1,5 @@ +import multiprocessing + +N_JOBS = -2 #multiprocessing.cpu_count() +ENSEMBLE_N_JOBS=1 +SAMPLE_SIZE = 100