Compare commits

...

24 Commits

Author SHA1 Message Date
Alejandro Moreo Fernandez 517686eea1 improving the quality of the plots 2024-05-17 13:52:56 +02:00
Alejandro Moreo Fernandez 0df44c13a9 switching 2024-05-15 12:00:00 +02:00
Alejandro Moreo Fernandez 2ac48a9798 setting a rank threshold to 1000, and finalizing plots 2024-05-10 15:46:13 +02:00
Alejandro Moreo Fernandez 67ed6e4c6c adding methods of prior work to git 2024-05-09 16:24:20 +02:00
Alejandro Moreo Fernandez 5284e04c90 final plots 2024-05-09 16:22:59 +02:00
Alejandro Moreo Fernandez 366020d45c finalizing experiments and bugfix in kld error 2024-05-08 11:31:28 +02:00
Alejandro Moreo Fernandez 1007257280 adding Dkl 2024-05-02 16:36:23 +02:00
Alejandro Moreo Fernandez e1f6149f71 adding the prevalence of the judged relevant per each query 2024-05-02 10:59:16 +02:00
Alejandro Moreo Fernandez a1a716dc4a trying to select training documents based on test score distribution 2024-04-24 15:27:35 +02:00
Alejandro Moreo Fernandez 36c53639d7 model selection for kde in a past TREC dataset 2024-04-23 09:53:31 +02:00
Alejandro Moreo Fernandez bc656fe207 kde working 2024-04-19 18:16:14 +02:00
Alejandro Moreo Fernandez 985f430d52 refactoring everything 2024-04-18 09:32:30 +02:00
Alejandro Moreo Fernandez 8399552c8d testing gender and continent again 2024-04-12 12:03:38 +02:00
Alejandro Moreo Fernandez 8ad41b1d33 new experimental protocol applied to continent 2024-04-09 09:48:56 +02:00
Alejandro Moreo Fernandez 1b420afd6c fixing code to handle different categories 2024-04-05 18:09:52 +02:00
Alejandro Moreo Fernandez 8f9d19dd5f fixing code to handle different categories 2024-04-05 18:09:20 +02:00
Alejandro Moreo Fernandez 2a685cec1e seems to be working :D 2024-03-23 20:12:10 +01:00
Alejandro Moreo Fernandez 4150f4351f starting 5th approach 2024-03-15 16:57:45 +01:00
Alejandro Moreo Fernandez 1aa9891ff9 cleaning gitignore 2024-02-23 16:48:53 +01:00
Alejandro Moreo Fernandez 1c03dd651b first commit, some ideas already explored 2024-02-23 16:42:31 +01:00
Alejandro Moreo Fernandez b3ccf71edb Merge branch 'devel' of github.com:HLT-ISTI/QuaPy into devel 2024-02-23 16:30:11 +01:00
Alejandro Moreo Fernandez 320b3eac38 small fixes in kdey (now should work with string labels) and EMQ (in case some training prior prob was 0, it broke) 2024-02-23 16:29:53 +01:00
Alejandro Moreo Fernandez 9542eaee61 doing some benchmarking 2024-02-22 15:10:45 +01:00
Alejandro Moreo Fernandez d50a86daf4 sketching readme system by Lu and King, Hopkins and King 2024-02-16 17:34:10 +01:00
23 changed files with 1899 additions and 17 deletions

8
.gitignore vendored
View File

@ -143,8 +143,7 @@ LeQua2022
MultiLabel
NewMethods
Ordinal
Retrieval
eDiscovery
Archived/eDiscovery
poster-cikm
slides-cikm
slides-short-cikm
@ -153,9 +152,4 @@ svm_perf_quantification/svm_struct
svm_perf_quantification/svm_light
TweetSentQuant
*.png

View File

@ -1,3 +1,9 @@
Change Log 0.1.9
----------------
<...>
Change Log 0.1.8
----------------

View File

@ -0,0 +1,84 @@
import itertools
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
"""
data_home = 'data'
datasets = ['continent', 'gender', 'years_category', 'relative_pageviews_category', 'num_sitelinks_category']
param_grid = {'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]}
classifiers = [
('LR', LogisticRegression(max_iter=5000), param_grid),
('SVM', LinearSVC(), param_grid)
]
def benchmark_name(class_name):
return class_name.replace('_', '\_')
table = Table(name=f'accuracy', benchmarks=[benchmark_name(d) for d in datasets])
table.format.show_std = False
table.format.stat_test = None
table.format.lower_is_better = False
table.format.color = False
table.format.remove_zero = True
table.format.style = 'rules'
for class_name, (cls_name, cls, grid) in itertools.product(datasets, classifiers):
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <-------- fixed classifier
texts, labels = load_sample(train_data_path, class_name=class_name)
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
Xtr = tfidf.fit_transform(texts)
print(f'Xtr shape={Xtr.shape}')
print('training classifier...', end='')
classifier = GridSearchCV(
cls,
param_grid=grid,
n_jobs=-1,
cv=5,
verbose=10
)
classifier.fit(Xtr, labels)
classifier_acc = classifier.best_score_
classifier_acc_per_fold = classifier.cv_results_['mean_test_score'][classifier.best_index_]
print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score, per fold {classifier_acc_per_fold}')
table.add(benchmark=benchmark_name(class_name), method=cls_name, v=classifier_acc_per_fold)
Table.LatexPDF(f'./latex/classifier_Acc.pdf', tables=[table])

153
Retrieval/commons.py Normal file
View File

@ -0,0 +1,153 @@
import pandas as pd
import numpy as np
from glob import glob
from os.path import join
import quapy.functional as F
Ks = [50, 100, 500, 1000]
CLASS_NAMES = ['continent', 'gender', 'years_category'] # ['relative_pageviews_category', 'num_sitelinks_category']:
DATA_SIZES = ['10K', '50K', '100K', '500K', '1M', 'FULL']
protected_group = {
'gender': 'Female',
'continent': 'Africa',
'years_category': 'Pre-1900s',
}
def load_sample(path, class_name):
"""
Loads a sample json as a dataframe and returns text and labels for
the given class_name
:param path: path to a json file
:param class_name: string representing the target class
:return: texts, labels for class_name
"""
df = pd.read_json(path)
text = df.text.values
labels = df[class_name].values
return text, labels
def binarize_labels(labels, positive_class=None):
if positive_class is not None:
protected_labels = labels==positive_class
labels[protected_labels] = 1
labels[~protected_labels] = 0
labels = labels.astype(int)
return labels
class RetrievedSamples:
def __init__(self,
class_home: str,
test_rankings_path: str,
test_query_prevs_path: str,
vectorizer,
class_name,
positive_class=None,
classes=None,
):
self.class_home = class_home
self.test_rankings_df = pd.read_json(test_rankings_path)
self.test_query_prevs_df = pd.read_json(test_query_prevs_path)
self.vectorizer = vectorizer
self.class_name = class_name
self.positive_class = positive_class
self.classes = classes
def get_text_label_score(self, df, filter_rank=1000):
df = df[df['rank']<filter_rank]
class_name = self.class_name
vectorizer = self.vectorizer
filter_classes = self.classes
text = df.text.values
labels = df[class_name].values
rel_score = df.score.values
labels = binarize_labels(labels, self.positive_class)
if filter_classes is not None:
idx = np.isin(labels, filter_classes)
text = text[idx]
labels = labels[idx]
rel_score = rel_score[idx]
if vectorizer is not None:
text = vectorizer.transform(text)
order = np.argsort(-rel_score)
return text[order], labels[order], rel_score[order]
def __call__(self):
tests_df = self.test_rankings_df
class_name = self.class_name
for file in self._list_queries():
# loads the training sample
train_df = pd.read_json(file)
if len(train_df) == 0:
print('empty dataframe: ', file)
else:
Xtr, ytr, score_tr = self.get_text_label_score(train_df)
# loads the test sample
query_id = self._get_query_id_from_path(file)
sel_df = tests_df[tests_df.qid == query_id]
Xte, yte, score_te = self.get_text_label_score(sel_df)
# gets the prevalence of all judged relevant documents for the query
df = self.test_query_prevs_df
q_rel_prevs = df.loc[df.id == query_id][class_name+'_proportions'].values[0]
if self.positive_class is not None:
if self.positive_class not in q_rel_prevs:
print(f'positive class {self.positive_class} not found in the query; skipping')
continue
q_rel_prevs = F.as_binary_prevalence(q_rel_prevs[self.positive_class])
else:
q_rel_prevs = np.asarray([q_rel_prevs.get(class_i, 0.) for class_i in self.classes])
yield (Xtr, ytr, score_tr), (Xte, yte, score_te), q_rel_prevs
def _list_queries(self):
return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
# def _get_test_sample(self, query_id, max_lines=-1):
# df = self.test_rankings_df
# sel_df = df[df.qid==int(query_id)]
# return get_text_label_score(sel_df)
# texts = sel_df.text.values
# try:
# labels = sel_df[self.class_name].values
# except KeyError as e:
# print(f'error: key {self.class_name} not found in test rankings')
# raise e
# if max_lines > 0 and len(texts) > max_lines:
# ranks = sel_df.rank.values
# idx = np.argsort(ranks)[:max_lines]
# texts = np.asarray(texts)[idx]
# labels = np.asarray(labels)[idx]
# return texts, labels
def total(self):
return len(self._list_queries())
def _get_query_id_from_path(self, path):
prefix = 'training_Query-'
posfix = 'Sample-200SPLIT'
qid = path
qid = qid[:qid.index(posfix)]
qid = qid[qid.index(prefix) + len(prefix):]
qid = int(qid)
return qid
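
A minimal usage sketch of load_sample and binarize_labels above (the path is hypothetical, mirroring the layout used by experiments.py further down):

texts, labels = load_sample('data/gender/FULL/classifier_training.json', class_name='gender')
y = binarize_labels(labels, positive_class=protected_group['gender'])  # 'Female' -> 1, everything else -> 0
print(len(texts), y.mean())  # number of documents and prevalence of the protected group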

View File

@ -0,0 +1,182 @@
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample, load_json_sample
from Retrieval.tabular import Table
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fifth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the fourth experiment, and the fairness groups are defined upon geographic info as in the fourth case.
As in the fourth, the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size. Unlike the fourth experiment, here the training queries are
For now, 1000 documents in training and 100 in test.
It seems there is now very little shift.
"""
def cls(classifier_trained=None):
if classifier_trained is None:
# return LinearSVC()
return LogisticRegression()
else:
return classifier_trained
def methods(classifier_trained=None):
yield ('CC', ClassifyAndCount(cls(classifier_trained)))
yield ('PCC', PCC(cls(classifier_trained)))
yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
# yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
# yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
# yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
# yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
# yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
# yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
# yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
# yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
# yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
# yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_json_sample, class_name=CLASS_NAME)
if REDUCE_TR > 0 and len(training) > REDUCE_TR:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR, *training.prevalence())
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
print('training classifier')
classifier_trained = LogisticRegression()
classifier_trained = GridSearchCV(classifier_trained,
param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
n_jobs=-1, cv=5)
classifier_trained.fit(Xtr, ytr)
classifier_trained = classifier_trained.best_estimator_
trained = True
print('[Done!]')
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
return tfidf, classifier_trained
def reduceAtK(data: LabelledCollection, k):
X, y = data.Xy
X = X[:k]
y = y[:k]
return LabelledCollection(X, y, classes=data.classes_)
RANK_AT_K = -1
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
def scape_latex(string):
return string.replace('_', '\_')
Ks = [10, 50, 100, 250, 500, 1000, 2000]
# Ks = [500]
for CLASS_NAME in ['gender_category'] : #'years_category']: #['continent', 'first_letter_category']: #, 'gender', 'gender_category', 'occupations', 'source_countries', 'source_subcont_regions', 'years_category', 'relative_pageviews_category']:
data_path = './' + CLASS_NAME
if CLASS_NAME in ['years_category', 'continent', 'gender_category']:
train_path = join(data_path, 'train500PerGroup.json')
else:
train_path = join(data_path, 'train3000samples.json')
tfidf, classifier_trained = qp.util.pickled_resource(f'classifier_{CLASS_NAME}.pkl', train_classifier)
trained=True
experiment_prot = RetrievedSamples(data_path,
load_fn=load_json_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K, classes=classifier_trained.classes_, class_name=CLASS_NAME)
method_names = [name for name, *other in methods()]
benchmarks = [f'{scape_latex(CLASS_NAME)}@{k}' for k in Ks]
table_mae = Table(benchmarks, method_names, color_mode='global')
table_mrae = Table(benchmarks, method_names, color_mode='global')
for method_name, quantifier in methods(classifier_trained):
# print('Starting with method=', method_name)
mae_errors = {k:[] for k in Ks}
mrae_errors = {k:[] for k in Ks}
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
if trained and method_name!='MLPE':
quantifier.fit(train, val_split=train, fit_classifier=False)
else:
quantifier.fit(train)
for k in Ks:
test_k = reduceAtK(test, k)
estim_prev = quantifier.quantify(test_k.instances)
mae_errors[k].append(qp.error.mae(test_k.prevalence(), estim_prev))
mrae_errors[k].append(qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1./(2*k))))
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
# pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
pbar.set_description(f'{method_name}')
for k in Ks:
table_mae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mae_errors[k])
table_mrae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mrae_errors[k])
table_mae.latexPDF('./latex', f'table_{CLASS_NAME}_mae.tex')
table_mrae.latexPDF('./latex', f'table_{CLASS_NAME}_mrae.tex')

View File

@ -0,0 +1,161 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fourth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the third experiment, and the fairness groups are defined upon geographic info as in the third case.
The difference here is that the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size.
For now, 1000 documents in training and 100 in test.
It seems there is now very little shift.
"""
def cls(classifier_trained=None):
if classifier_trained is None:
# return LinearSVC()
return LogisticRegression()
else:
return classifier_trained
def methods(classifier_trained=None):
yield ('CC', ClassifyAndCount(cls(classifier_trained)))
yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
yield ('PCC', PCC(cls(classifier_trained)))
yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR > 0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR, *training.prevalence())
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
print('training classifier')
classifier_trained = LogisticRegression()
classifier_trained = GridSearchCV(classifier_trained,
param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
n_jobs=-1, cv=5)
classifier_trained.fit(Xtr, ytr)
classifier_trained = classifier_trained.best_estimator_
trained = True
print('[Done!]')
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
return tfidf, classifier_trained
RANK_AT_K = 1000
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')
tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
trained=True
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K, classes=classifier_trained.classes_)
result_mae_dict = {}
result_mrae_dict = {}
for method_name, quantifier in methods(classifier_trained):
# print('Starting with method=', method_name)
mae_errors = []
mrae_errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
# print(train.prevalence())
# print(test.prevalence())
if trained and method_name!='MLPE':
quantifier.fit(train, val_split=train, fit_classifier=False)
else:
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
mae_errors.append(mae)
mrae = qp.error.mrae(test.prevalence(), estim_prev)
mrae_errors.append(mrae)
# print()
# print('Training prevalence:', F.strprev(train.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
# print('Estim prevalence:', F.strprev(estim_prev))
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
print()
result_mae_dict[method_name] = np.mean(mae_errors)
result_mrae_dict[method_name] = np.mean(mrae_errors)
print('Results\n'+('-'*100))
for method_name in result_mae_dict.keys():
MAE = result_mae_dict[method_name]
MRAE = result_mrae_dict[method_name]
print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')

View File

@ -0,0 +1,98 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
"""
This was the very first experiment: one big training set and many test rankings produced according to some queries.
The quantification methods did not seem to work; the more sophisticated the method, the worse it performed.
This is a clear indication that the PPS assumptions do not hold.
Actually, while the training set could be some iid sample from a distribution L and every test set
is an iid sample from a distribution U, it is pretty clear that P(X|Y) is different, since the test sets
are biased towards a query term whereas the training set is not.
"""
def methods():
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
yield ('CC', ClassifyAndCount(LogisticRegression(n_jobs=-1)))
yield ('ACC', ACC(LogisticRegression(n_jobs=-1)))
yield ('PCC', PCC(LogisticRegression(n_jobs=-1)))
yield ('PACC', PACC(LogisticRegression(n_jobs=-1)))
yield ('EMQ', EMQ(LogisticRegression(n_jobs=-1)))
def load_txt_sample(path, verbose=False):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text']
y = df['first_letter_category']
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, classes):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.classes = classes
def __call__(self):
for file in glob(join(self.path_dir, 'test_data_*.txt')):
X, y = self.load_fn(file)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
sample = LabelledCollection(X, y, classes=self.classes)
yield sample.Xp
qp.environ['SAMPLE_SIZE']=100
data_path = './data'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)
# training = training.sampling(1000)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('Xtr shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes)
print('Training prevalence:', F.strprev(training.prevalence()))
for X, p in test_prot():
print('Test prevalence:', F.strprev(p))
for method_name, quantifier in methods():
print('training ', method_name)
quantifier.fit(training)
print('[done]')
report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True)
print(report.mean())
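
A side note on the reasoning in the docstring above: the prior probability shift (PPS) assumption that these quantifiers rely on can be written, in its standard formulation (not taken from the repository), as

P_L(X \mid Y) = P_U(X \mid Y) \quad\text{while}\quad P_L(Y) \neq P_U(Y),

i.e., the class-conditional distributions are preserved between the training distribution L and the test distribution U and only the class priors change. Retrieving each test sample with a query conditions its documents on relevance to that query, so P_U(X|Y) no longer matches P_L(X|Y), which is consistent with the degradation observed for the more sophisticated methods.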

View File

@ -0,0 +1,131 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set.
Both elements in the pair are *retrieved according to the same query*. This is a way to impose
on the training set the same type of bias that was present in the test set. Let's see...
"""
def methods():
yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('CC', ClassifyAndCount(LogisticRegression()))
yield ('EMQ', EMQ(LogisticRegression()))
yield ('PCC', PCC(LogisticRegression()))
yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['first_letter_category'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.classes = classes
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
def __call__(self):
for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y, classes=self.classes)
X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
test_sample = LabelledCollection(X, y, classes=self.classes)
yield train_sample, test_sample
RANK_AT_K = 500
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollection'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
classes=classes,
max_train_lines=RANK_AT_K,
max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
print('Starting with method=', method_name)
errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
# print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
errors.append(mae)
pbar.set_description(f'mae={np.mean(errors):.4f}')
print()

View File

@ -0,0 +1,155 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this third experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the second experiment, but in this case the fairness groups are defined upon geographic info.
"""
def methods():
yield ('CC', ClassifyAndCount(LogisticRegression()))
yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(LogisticRegression()))
yield ('PCC', PCC(LogisticRegression()))
yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
# print('reading', path)
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['continent'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
rank = rank[y != 'Antarctica']
scores = scores[y != 'Antarctica']
X = X[y!='Antarctica']
y = y[y!='Antarctica']
if parse_columns:
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
def __call__(self):
for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y)
X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
try:
test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
except ValueError as e:
print(f'file {file} caused error {e}')
yield None, None
continue  # skip this query; otherwise a stale test_sample would be yielded below
# print('train #classes:', train_sample.n_classes, train_sample.prevalence())
# print('test #classes:', test_sample.n_classes, test_sample.prevalence())
yield train_sample, test_sample
RANK_AT_K = 100
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollectionGeo'
train_path = join(data_path, 'train_data_continent.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
print('Starting with method=', method_name)
errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
# print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
# print(train.prevalence())
# print(test.prevalence())
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
errors.append(mae)
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
pbar.set_description(f'mae={np.mean(errors):.4f}')
print()

299
Retrieval/experiments.py Normal file
View File

@ -0,0 +1,299 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.base import clone
import quapy as qp
from Retrieval.commons import *
from Retrieval.methods import *
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
"""
In this sixth experiment, we have a collection C of >6M documents.
We split C in two equally-sized pools TrPool, TePool.
I have randomly split the collection into 50% train and 50% test. In each split we have approx. 3.25 million documents.
We have 5 categories we can evaluate over: Continent, Years_Category, Num_Site_Links, Relative Pageviews and Gender.
From the training set I have created smaller subsets for each category:
100K, 500K, 1M and FULL (3.25M)
For each category and subset, I have created a training set called: "classifier_training.json". This is the "base" training set for the classifier. In this set we have 500 documents per group in a category. (For example: Male 500, Female 500, Unknown 500). Let me know if you think we need more.
To "bias" the quantifier towards a query, I have executed the queries (97) on the different training sets and retrieved the 200 most relevant documents per group.
For example: (Male 200, Female 200, Unknown 200)
Sometimes this is infeasible, we should probably discuss this at some point.
You can find the results for every query in a file named:
"training_Query-[QID]Sample-200SPLIT.json"
Test:
To evaluate our approach, I have executed the queries on the test split. You can find the results for all 97 queries up till k=1000 in this file.
testRanking_Results.json
"""
def methods(classifier, class_name=None, binarize=False):
kde_param = {
'continent': 0.01,
'gender': 0.03,
'years_category':0.03
}
yield ('NaiveQuery', Naive())
yield ('CC', ClassifyAndCount(classifier))
yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param.get(class_name, 0.01)))
if binarize:
yield ('M3b', M3rND_ModelB(classifier))
yield ('M3b+', M3rND_ModelB(classifier))
yield ('M3d', M3rND_ModelD(classifier))
yield ('M3d+', M3rND_ModelD(classifier))
def train_classifier_fn(train_path):
"""
Trains a classifier. To do so, it loads the training set and transforms it into a tfidf representation.
The classifier is Logistic Regression, with hyperparameters C (range [0.001, 0.01, ..., 1000]) and
class_weight (range {'balanced', None}) optimized via 5FCV.
:return: the tfidf-vectorizer and the classifier trained
"""
texts, labels = load_sample(train_path, class_name=class_name)
if BINARIZE:
labels = binarize_labels(labels, positive_class=protected_group[class_name])
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
Xtr = tfidf.fit_transform(texts)
print(f'Xtr shape={Xtr.shape}')
print('training classifier...', end='')
classifier = LogisticRegression(max_iter=5000)
modsel = GridSearchCV(
classifier,
param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]},
n_jobs=-1,
cv=5
)
modsel.fit(Xtr, labels)
classifier = modsel.best_estimator_
classifier_acc = modsel.best_score_
best_params = modsel.best_params_
print(f'[done] best-params={best_params} got {classifier_acc:.4f} score')
print('generating cross-val predictions for M3')
predictions = cross_val_predict(clone(classifier), Xtr, labels, cv=10, n_jobs=-1, verbose=10)
conf_matrix = confusion_matrix(labels, predictions, labels=classifier.classes_)
training = LabelledCollection(Xtr, labels)
print('training classes:', training.classes_)
print('training prevalence:', training.prevalence())
return tfidf, classifier, conf_matrix
def reduceAtK(data: LabelledCollection, k):
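# keeps the k top-ranked documents: the collections produced by RetrievedSamples are
# sorted by descending relevance score (see get_text_label_score in commons.py)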
# if k > len(data):
# print(f'[warning] {k=}>{len(data)=}')
X, y = data.Xy
X = X[:k]
y = y[:k]
return LabelledCollection(X, y, classes=data.classes_)
def benchmark_name(class_name, k=None):
scape_class_name = class_name.replace('_', '\_')
if k is None:
return scape_class_name
else:
return f'{scape_class_name}@{k}'
def run_experiment():
results = {
'mae': {k: [] for k in Ks},
'mrae': {k: [] for k in Ks},
'rKL_error': [],
'rND_error': []
}
pbar = tqdm(experiment_prot(), total=experiment_prot.total())
for train, test, q_rel_prevs in pbar:
Xtr, ytr, score_tr = train
Xte, yte, score_te = test
train_col = LabelledCollection(Xtr, ytr, classes=classifier.classes_)
if not method_name.startswith('Naive') and not method_name.startswith('M3'):
method.fit(train_col, val_split=train_col, fit_classifier=False)
elif method_name == 'Naive':
method.fit(train_col)
test_col = LabelledCollection(Xte, yte, classes=classifier.classes_)
rKL_estim, rKL_true = [], []
rND_estim, rND_true = [], []
for k in Ks:
test_k = reduceAtK(test_col, k)
if method_name == 'NaiveQuery':
train_k = reduceAtK(train_col, k)
method.fit(train_k)
estim_prev = method.quantify(test_k.instances)
# epsilon value for prevalence smoothing
eps=(1. / (2. * k))
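# (1/(2k) is the customary additive-smoothing constant for a sample of size k; here the sample is the top-k ranking)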
# error metrics
test_k_prev = test_k.prevalence()
mae = qp.error.mae(test_k_prev, estim_prev)
mrae = qp.error.mrae(test_k_prev, estim_prev, eps=eps)
rKL_at_k_estim = qp.error.kld(estim_prev, q_rel_prevs, eps=eps)
rKL_at_k_true = qp.error.kld(test_k_prev, q_rel_prevs, eps=eps)
if BINARIZE:
# [1] is the index of the minority or historically disadvantaged group
rND_at_k_estim = np.abs(estim_prev[1] - q_rel_prevs[1])
rND_at_k_true = np.abs(test_k_prev[1] - q_rel_prevs[1])
# collect results
results['mae'][k].append(mae)
results['mrae'][k].append(mrae)
rKL_estim.append(rKL_at_k_estim)
rKL_true.append(rKL_at_k_true)
if BINARIZE:
rND_estim.append(rND_at_k_estim)
rND_true.append(rND_at_k_true)
# aggregate fairness metrics
def aggregate(rMs, Ks, Z=1):
return (1 / Z) * sum((1. / np.log2(k)) * v for v, k in zip(rMs, Ks))
Z = sum((1. / np.log2(k)) for k in Ks)
rKL_estim = aggregate(rKL_estim, Ks, Z)
rKL_true = aggregate(rKL_true, Ks, Z)
rKL_error = np.abs(rKL_true-rKL_estim)
results['rKL_error'].append(rKL_error)
if BINARIZE:
rND_estim = aggregate(rND_estim, Ks, Z)
rND_true = aggregate(rND_true, Ks, Z)
if isinstance(method, AbstractM3rND):
if method_name.endswith('+'):
# learns the correction parameters from the query-specific training data
conf_matrix_ = method.get_confusion_matrix(*train_col.Xy)
else:
# learns the correction parameters from the training data used to train the classifier
conf_matrix_ = conf_matrix.copy()
rND_estim = method.fair_measure_correction(rND_estim, conf_matrix_)
rND_error = np.abs(rND_true - rND_estim)
results['rND_error'].append(rND_error)
pbar.set_description(f'{method_name}')
return results
data_home = 'data'
if __name__ == '__main__':
# final tables only contain the information for the data size 10K, each row is a class name and each column
# the corresponding rND (for binary) or rKL (for multiclass) score
tables_RND, tables_DKL = [], []
tables_final = []
for class_mode in ['multiclass', 'binary']:
BINARIZE = (class_mode=='binary')
method_names = [name for name, *other in methods(None, binarize=BINARIZE)]
table_final = Table(name=f'rND' if BINARIZE else f'rKL', benchmarks=[benchmark_name(c) for c in CLASS_NAMES], methods=method_names)
table_final.format.mean_macro = False
tables_final.append(table_final)
for class_name in CLASS_NAMES:
tables_mae, tables_mrae = [], []
benchmarks_size =[benchmark_name(class_name, s) for s in DATA_SIZES]
table_DKL = Table(name=f'rKL-{class_name}', benchmarks=benchmarks_size, methods=method_names)
table_RND = Table(name=f'rND-{class_name}', benchmarks=benchmarks_size, methods=method_names)
for data_size in DATA_SIZES:
print(class_name, class_mode, data_size)
benchmarks_k = [benchmark_name(class_name, k) for k in Ks]
# table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks_k, methods=method_names)
table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks_k, methods=method_names)
# tables_mae.append(table_mae)
tables_mrae.append(table_mrae)
# sets all paths
class_home = join(data_home, class_name, data_size)
train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json') # <----- fixed classifier
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
test_rankings_path = join(data_home, 'testRanking_Results.json')
test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
results_home = join('results', class_name, class_mode, data_size)
positive_class = protected_group[class_name] if BINARIZE else None
# instantiates the classifier (trains it the first time, loads it in the subsequent executions)
tfidf, classifier, conf_matrix \
= qp.util.pickled_resource(classifier_path, train_classifier_fn, train_data_path)
experiment_prot = RetrievedSamples(
class_home,
test_rankings_path,
test_query_prevs_path,
vectorizer=tfidf,
class_name=class_name,
positive_class=positive_class,
classes=classifier.classes_
)
for method_name, method in methods(classifier, class_name, BINARIZE):
results_path = join(results_home, method_name + '.pkl')
results = qp.util.pickled_resource(results_path, run_experiment)
# compose the tables
for k in Ks:
# table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
table_DKL.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rKL_error'])
if BINARIZE:
table_RND.add(benchmark=benchmark_name(class_name, data_size), method=method_name, v=results['rND_error'])
if data_size=='10K':
value = results['rND_error'] if BINARIZE else results['rKL_error']
table_final.add(benchmark=benchmark_name(class_name), method=method_name, v=value)
tables = ([table_RND] + tables_mrae) if BINARIZE else ([table_DKL] + tables_mrae)
Table.LatexPDF(f'./latex/{class_mode}/{class_name}.pdf', tables=tables)
if BINARIZE:
tables_RND.append(table_RND)
else:
tables_DKL.append(table_DKL)
Table.LatexPDF(f'./latex/global/main.pdf', tables=tables_RND+tables_DKL, dedicated_pages=False)
Table.LatexPDF(f'./latex/final/main.pdf', tables=tables_final, dedicated_pages=False)
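
For readability, the rank-discounted aggregation computed inside run_experiment above can be restated in equation form, with p^{(k)} the true or estimated prevalence among the top-k retrieved documents and q the prevalence among the judged relevant documents of the query:

Z = \sum_{k \in K} \frac{1}{\log_2 k}, \qquad
rKL = \frac{1}{Z} \sum_{k \in K} \frac{1}{\log_2 k}\, \mathrm{KLD}_{\epsilon}\big(p^{(k)} \,\big\|\, q\big), \qquad
rND = \frac{1}{Z} \sum_{k \in K} \frac{1}{\log_2 k}\, \big|\, p^{(k)}_{1} - q_{1} \big|

with \epsilon = 1/(2k) used for smoothing, index 1 denoting the protected group in the binarized setting, and the reported errors being |rKL_true - rKL_estim| and |rND_true - rND_estim|.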

View File

@ -0,0 +1,88 @@
import os.path
import pickle
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
from Retrieval.commons import RetrievedSamples, load_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.data.base import LabelledCollection
from experiments import benchmark_name, reduceAtK, run_experiment
from os.path import join
from tqdm import tqdm
from result_table.src.table import Table
def methods(classifier):
for i, bandwidth in enumerate(np.linspace(0.01, 0.1, 10)):
yield (f'KDE{str(i).zfill(2)}', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=bandwidth))
if __name__ == '__main__':
data_home = 'data-modsel'
Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
method_names = [m for m, *_ in methods(None)]
class_mode = 'multiclass'
dir_names={
'gender': '100K_GENDER_TREC21_QUERIES/100K-NEW-QUERIES',
'continent': '100K_CONT_TREC21_QUERIES/100K-NEW-QUERIES',
'years_category': '100K_YEARS_TREC21_QUERIES/100K-NEW-QUERIES'
}
for class_name in ['gender', 'continent', 'years_category']:
tables_mrae = []
benchmarks = [benchmark_name(class_name, k) for k in Ks]
for data_size in ['100K']:
table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names)
tables_mrae.append(table_mrae)
class_home = join(data_home, dir_names[class_name])
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}_{class_mode}.pkl')
test_rankings_path = join(data_home, 'testRanking-TREC21-Queries_Results.json')
test_query_prevs_path = join('data', 'prevelance_vectors_judged_docs.json')
results_home = join('results', 'modsel', class_name, data_size)
tfidf, classifier, conf_matrix = pickle.load(open(classifier_path, 'rb'))
experiment_prot = RetrievedSamples(
class_home,
test_rankings_path,
test_query_prevs_path,
vectorizer=tfidf,
class_name=class_name,
classes=classifier.classes_
)
for method_name, quantifier in methods(classifier):
results_path = join(results_home, method_name + '.pkl')
results = qp.util.pickled_resource(results_path, run_experiment)
for k in Ks:
table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
Table.LatexPDF(f'./latex/modsel/{class_name}.pdf', tables=tables_mrae)

88
Retrieval/methods.py Normal file
View File

@ -0,0 +1,88 @@
"""
This file implements some of the methods presented in the FAccT'22 paper by
Ghazimatin, Kleindessner, Russell, Abedjan, and Golebiowski,
Measuring Fairness of Rankings under Noisy Sensitive Information.
In particular, it implements two variants of a method relying on M3=rND:
one in which the assumed graphical model is P(Â,A,S) = P(Â|A)*P(S|A) (called "b")
and another in which the assumed graphical model is P(Â,A,S) = P(Â|A)*P(S|Â) (called "d")
"""
import numpy as np
from abc import ABC, abstractmethod
from sklearn.metrics import confusion_matrix
from quapy.method.aggregative import CC
class AbstractM3rND(ABC):
def __init__(self, classifier):
self.quantifier = CC(classifier)
def proxy_labels(self, instances):
return self.quantifier.classify(instances)
def quantify(self, instances):
return self.quantifier.quantify(instances)
@abstractmethod
def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
...
def get_confusion_matrix(self, X, y, additive_smoothing=0.5):
"""
Some confusion matrices may contain 0 values for certain classes, and this causes
instabilities in the correction. If requested, applies additive smoothing. Default
is adding half a count.
:param X: array-like with the covariates
:param y: array-like with the true labels
:param additive_smoothing: float, default 0.5
:return: the confusion matrix C with entries Cij=P(Y=i,Ŷ=j)
"""
proxy_labels = self.proxy_labels(X)
true_labels = y
labels = self.quantifier.classes_
conf_matrix = confusion_matrix(true_labels, proxy_labels, labels=labels)
if additive_smoothing > 0:
conf_matrix = conf_matrix.astype(float) + additive_smoothing
return conf_matrix
class M3rND_ModelB(AbstractM3rND):
def __init__(self, classifier):
super().__init__(classifier)
def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
# conf_matrix contains values Cij=P(Y=i,Ŷ=j)
# truecond_matrix contains values Cij=P(Ŷ=j|Y=i) (truecond stands for "conditioned on true labels")
truecond_matrix = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)
p = truecond_matrix[0, 1] # P(hat{A}=1|A=0)
q = truecond_matrix[1, 0] # P(hat{A}=0|A=1)
den = (1 - p - q)
if den != 0:
corr = 1./den
rND_estim = rND_estim * corr
return rND_estim
class M3rND_ModelD(AbstractM3rND):
def __init__(self, classifier):
super().__init__(classifier)
def fair_measure_correction(self, rND_estim: float, conf_matrix: np.ndarray):
# conf_matrix contains values Cij=P(Y=i,Ŷ=j)
# truecond_matrix contains values Cij=P(Ŷ=j|Y=i) (truecond stands for "conditioned on true labels")
truecond_matrix = conf_matrix / conf_matrix.sum(axis=1, keepdims=True)
prev_A = conf_matrix.sum(axis=1)
prev_A = prev_A / prev_A.sum()  # normalize the (smoothed) counts to class-prior probabilities
beta = prev_A[1]  # P(A=1)
p = truecond_matrix[0, 1] # P(hat{A}=1|A=0)
q = truecond_matrix[1, 0] # P(hat{A}=0|A=1)
x = (1 - q) * beta + p * (1 - beta)
y = q * beta + (1 - p) * (1 - beta)
if x != 0 and y != 0:
corr = ((((1 - q) * beta) / x) - (q * beta / y))
rND_estim = rND_estim * corr
return rND_estim
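
Restating the two corrections above in equation form, with p = P(\hat{A}=1 \mid A=0), q = P(\hat{A}=0 \mid A=1) and \beta = P(A=1) as in the code comments:

\text{model b:}\quad \widehat{rND}_{corr} = \frac{\widehat{rND}}{1 - p - q} \qquad (\text{applied only when } 1 - p - q \neq 0)

\text{model d:}\quad \widehat{rND}_{corr} = \widehat{rND} \cdot \left( \frac{(1-q)\beta}{(1-q)\beta + p(1-\beta)} - \frac{q\beta}{q\beta + (1-p)(1-\beta)} \right)

the bracketed factor in model d being P(A=1 \mid \hat{A}=1) - P(A=1 \mid \hat{A}=0).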

View File

@ -0,0 +1,123 @@
import itertools
import os.path
import pickle
import numpy as np
from Retrieval.experiments import methods
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
from os.path import join
import matplotlib.pyplot as plt
data_home = 'data'
class_mode = 'multiclass'
method_names = [name for name, *other in methods(None, 'continent')]
all_results = {}
class_name_label = {
'continent': 'Geographic Location',
'gender': 'Gender',
'years_category': 'Age of Topic'
}
# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
# class_name -> data_size -> method_name -> k -> stat -> float
# where stat is "mean", "std", "max"
def load_all_results():
for class_name in CLASS_NAMES:
all_results[class_name] = {}
for data_size in DATA_SIZES:
all_results[class_name][data_size] = {}
results_home = join('results', class_name, class_mode, data_size)
all_results[class_name][data_size] = {}
for method_name in method_names:
results_path = join(results_home, method_name + '.pkl')
try:
results = pickle.load(open(results_path, 'rb'))
except Exception as e:
print(f'missing result {results_path}', e)
all_results[class_name][data_size][method_name] = {}
for k in Ks:
all_results[class_name][data_size][method_name][k] = {}
values = results['mrae']
all_results[class_name][data_size][method_name][k]['mean'] = np.mean(values[k])
all_results[class_name][data_size][method_name][k]['std'] = np.std(values[k])
all_results[class_name][data_size][method_name][k]['max'] = np.max(values[k])
return all_results
results = load_all_results()
# generates, for each class name (and for the smallest data size only), the MRAE plots in which:
# - the x-axis displays the Ks
for class_name in CLASS_NAMES:
for data_size in DATA_SIZES[:1]:
log = class_name=='gender'
fig, ax = plt.subplots()
max_means = []
markers = itertools.cycle(['o', 's', '^', 'D', 'v', '*', '+'])
for method_name in method_names:
# class_name -> data_size -> method_name -> k -> stat -> float
means = [
results[class_name][data_size][method_name][k]['mean'] for k in Ks
]
stds = [
results[class_name][data_size][method_name][k]['std'] for k in Ks
]
# max_mean = np.max([
# results[class_name][data_size][method_name][k]['max'] for k in Ks
# ])
max_means.append(max(means))
means = np.asarray(means)
stds = np.asarray(stds)
method_name = method_name.replace('NaiveQuery', 'Naive@$k$')
marker = next(markers)
line = ax.plot(Ks, means, 'o-', label=method_name, color=None, linewidth=3, markersize=10, marker=marker)
color = line[-1].get_color()
if log:
ax.set_yscale('log')
# ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
ax.grid(True, which='both', axis='y', color='gray', linestyle='--', linewidth=0.3)
ax.set_xlabel('k')
ax.set_ylabel('RAE' + (' (log scale)' if log else ''))
data_size_label = '$\mathcal{L}_{10\mathrm{K}}$'
ax.set_title(f'{class_name_label[class_name]} from {data_size_label}')
ax.set_ylim([0, max(max_means)*1.05])
if class_name == 'years_category':
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
os.makedirs(f'plots/var_k/{class_name}', exist_ok=True)
plotpath = f'plots/var_k/{class_name}/{data_size}_mrae.pdf'
print(f'saving plot in {plotpath}')
plt.savefig(plotpath, bbox_inches='tight')

View File

@ -0,0 +1,87 @@
import itertools
import os.path
from Retrieval.experiments import methods
from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES
import matplotlib.pyplot as plt
from Retrieval.plot_mrae_xaxis_k import load_all_results
data_home = 'data'
class_mode = 'multiclass'
method_names = [name for name, *other in methods(None)]
all_results = {}
class_name_label = {
'continent': 'Geographic Location',
'gender': 'Gender',
'years_category': 'Age of Topic'
}
# loads all MRAE results, and returns a dictionary containing the values, which is indexed by:
# class_name -> data_size -> method_name -> k -> stat -> float
results = load_all_results()
# generates, for each class name (at a fixed k), the MRAE plots in which:
# - the x-axis displays the training pool sizes
# X_DATA_SIZES = [int(x.replace('K', '000').replace('M', '000000').replace('FULL', '3250000')) for x in DATA_SIZES]
X_DATA_SIZES = [x.replace('FULL', '3.25M') for x in DATA_SIZES]
for class_name in CLASS_NAMES:
for k in [100]: #Ks:
log = class_name=='gender'
fig, ax = plt.subplots()
max_means = []
markers = itertools.cycle(['o', 's', '^', 'D', 'v', '*', '+'])
for method_name in method_names:
# class_name -> data_size -> method_name -> k -> stat -> float
means = [
results[class_name][data_size][method_name][k]['mean'] for data_size in DATA_SIZES
]
stds = [
results[class_name][data_size][method_name][k]['std'] for data_size in DATA_SIZES
]
# max_mean = np.max([
# results[class_name][data_size][method_name][k]['max'] for data_size in DATA_SIZE
# ])
max_means.append(max(means))
style = 'o-' if method_name != 'CC' else '--'
method_name = method_name.replace('NaiveQuery', 'Naive@$k$')
marker=next(markers)
line = ax.plot(X_DATA_SIZES, means, style, label=method_name, color=None, linewidth=3, markersize=10, marker=marker)
color = line[-1].get_color()
if log:
ax.set_yscale('log')
# ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color)
ax.grid(True, which='both', axis='y', color='gray', linestyle='--', linewidth=0.3)
ax.set_xlabel('training pool size')
ax.set_ylabel('RAE' + (' (log scale)' if log else ''))
ax.set_title(f'{class_name_label[class_name]} at exposure {k=}')
ax.set_ylim([0, max(max_means)*1.05])
if class_name == 'years_category':
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
os.makedirs(f'plots/var_size/{class_name}', exist_ok=True)
plotpath = f'plots/var_size/{class_name}/{k}_mrae.pdf'
print(f'saving plot in {plotpath}')
plt.savefig(plotpath, bbox_inches='tight')

View File

@ -0,0 +1,93 @@
import os.path
import pickle
from itertools import zip_longest
from Retrieval.commons import RetrievedSamples, load_sample, DATA_SIZES
from os.path import join
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
"""
Plots the distribution of (predicted) relevance score for the test samples and for the training samples wrt:
- training pool size (10K, 50K, 100K, 500K, 1M, FULL)
- rank
"""
data_home = 'data'
up_to = 250
for class_name in ['continent']: # 'num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
test_added = False
Mtrs, Mtes, source = [], [], []
for data_size in DATA_SIZES:
class_home = join(data_home, class_name, data_size)
classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
test_rankings_path = join(data_home, 'testRanking_Results.json')
test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json')
_, classifier = pickle.load(open(classifier_path, 'rb'))
experiment_prot = RetrievedSamples(
class_home,
test_rankings_path,
test_query_prevs_path,
vectorizer=None,
class_name=class_name,
classes=classifier.classes_
)
Mtr = []
Mte = []
pbar = tqdm(experiment_prot(), total=experiment_prot.total())
for train, test, *_ in pbar:
Xtr, ytr, score_tr = train
Xte, yte, score_te = test
if len(score_tr) >= up_to:
Mtr.append(score_tr)
Mte.append(score_te)
Mtrs.append(Mtr)
if not test_added:
Mtes.append(Mte)
test_added = True
source.append(data_size)
fig, ax = plt.subplots()
# train_source = ['train-'+s for s in source]
train_source = ['$\mathcal{L}_{'+s.replace('FULL', '3.25M').replace('K','\mathrm{K}').replace('M','\mathrm{M}')+'}$' for s in source]
# Ms = list(zip(Mtrs, train_source))+list(zip(Mtes, ['test']))
Ms = list(zip(Mtrs, train_source)) + list(zip(Mtes, ['$\mathcal{U}_{(3.25\mathrm{M})}$']))
for M, source in Ms:
M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T
num_rep, num_docs = M.shape
mean_values = np.nanmean(M, axis=0)
n_filled = np.count_nonzero(~np.isnan(M), axis=0)
std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)
line = ax.plot(range(num_docs), mean_values, '-', label=source, color=None)
color = line[-1].get_color()
ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color)
ax.set_xlabel('rank ($k$)')
ax.set_ylabel('predicted relevance score')
ax.set_title(class_name.replace('continent', 'Geographic Location'))
ax.set_xlim((0,up_to))
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# plt.show()
os.makedirs('plots', exist_ok=True)
plotpath = f'plots/{class_name}_rel_distrbution.pdf'
print(f'saving plot in {plotpath}')
plt.savefig(plotpath, bbox_inches='tight')
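The zip_longest step above pads rankings of unequal length with NaNs so that per-rank means and standard errors can be computed; a self-contained sketch of the same idea on toy data:
from itertools import zip_longest
import numpy as np

# toy score lists of unequal length (three retrieved rankings)
rankings = [[0.9, 0.7, 0.5], [0.8, 0.6], [0.95, 0.75, 0.55, 0.4]]
M = np.asarray(list(zip_longest(*rankings, fillvalue=np.nan))).T  # shape (num_rankings, max_rank)
mean_values = np.nanmean(M, axis=0)                    # per-rank mean, ignoring the NaN padding
n_filled = np.count_nonzero(~np.isnan(M), axis=0)      # how many rankings reach each rank
std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)  # per-rank standard error
print(mean_values, std_errors)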

16 Retrieval/tmp.py Normal file
View File

@@ -0,0 +1,16 @@
import pandas as pd
from os.path import join
from quapy.data import LabelledCollection
data_home = 'data'
CLASS_NAME = 'continent'
datasize = '100K'
file_path = join(data_home, 'prevelance_vectors_judged_docs.json')
df = pd.read_json(file_path)
pd.set_option('display.max_columns', None)
print(df)

View File

@@ -11,7 +11,7 @@ from . import util
from . import model_selection
from . import classification
__version__ = '0.1.8'
__version__ = '0.1.9'
environ = {
'SAMPLE_SIZE': None,

View File

@@ -158,8 +158,8 @@ def kld(prevs, prevs_hat, eps=None):
:return: Kullback-Leibler divergence between the two distributions
"""
eps = __check_eps(eps)
smooth_prevs = prevs + eps
smooth_prevs_hat = prevs_hat + eps
smooth_prevs = smooth(prevs, eps)
smooth_prevs_hat = smooth(prevs_hat, eps)
return (smooth_prevs*np.log(smooth_prevs/smooth_prevs_hat)).sum(axis=-1)
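The fix replaces plain additive smoothing (which leaves the vectors summing to more than 1) with the smooth helper. A hedged sketch of a smoothing function consistent with this call (the exact definition in quapy.error may differ):
import numpy as np

def smooth(prevs, eps):
    # additive smoothing followed by renormalization, so the result still sums to 1
    n_classes = prevs.shape[-1]
    return (prevs + eps) / (eps * n_classes + 1)

prevs = np.asarray([0.0, 0.5, 0.5])      # true prevalence with a zero entry
prevs_hat = np.asarray([0.1, 0.4, 0.5])  # estimated prevalence
p, q = smooth(prevs, 1e-8), smooth(prevs_hat, 1e-8)
print((p * np.log(p / q)).sum())         # finite KLD despite the zero entry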

View File

@@ -141,6 +141,19 @@ def uniform_prevalence_sampling(n_classes, size=1):
return u
def uniform_prevalence(n_classes):
"""
Returns a vector representing the uniform distribution over `n_classes` classes
:param n_classes: number of classes
:return: np.ndarray with all values 1/n_classes
"""
assert isinstance(n_classes, int) and n_classes>0, \
(f'param {n_classes} not understood; must be a positive integer representing the '
f'number of classes ')
return np.full(shape=n_classes, fill_value=1./n_classes)
uniform_simplex_sampling = uniform_prevalence_sampling
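Usage of the new helper is straightforward (assuming the development version that includes this addition):
import quapy.functional as F

print(F.uniform_prevalence(4))              # [0.25 0.25 0.25 0.25]
print(F.uniform_prevalence_sampling(3, 2))  # two prevalence vectors drawn uniformly at random from the simplex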

View File

@@ -52,7 +52,7 @@ class KDEBase:
"""
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, n_classes, bandwidth):
def get_mixture_components(self, X, y, classes, bandwidth):
"""
Returns an array containing the mixture components, i.e., the KDE functions for each class.
@@ -62,7 +62,13 @@
:param bandwidth: float, the bandwidth of the kernel
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
class_cond_X = []
for cat in classes:
selX = X[y==cat]
if selX.size==0:
selX = [F.uniform_prevalence(len(classes))]
class_cond_X.append(np.asarray(selX))
return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
@@ -114,7 +120,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
self.random_state=random_state
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
return self
def aggregate(self, posteriors: np.ndarray):
@@ -196,7 +202,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
self.montecarlo_trials = montecarlo_trials
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
N = self.montecarlo_trials
rs = self.random_state
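The change indexes the mixture components by the actual class labels rather than by position, and guards against classes with no classifier predictions. A minimal, self-contained sketch of class-conditional KDEs in the same spirit, using scikit-learn's KernelDensity (as get_kde_function presumably does):
import numpy as np
from sklearn.neighbors import KernelDensity

def class_conditional_kdes(X, y, classes, bandwidth=0.1):
    # one KDE per class, fitted on the posteriors of that class only
    kdes = {}
    for cat in classes:
        selX = X[y == cat]
        # the patch above falls back to a single uniform dummy point for empty classes;
        # this sketch simply requires every class to be represented
        assert selX.shape[0] > 0, f'class {cat} has no training posteriors'
        kdes[cat] = KernelDensity(bandwidth=bandwidth).fit(selX)
    return kdes

rng = np.random.default_rng(0)
X = rng.random((100, 3))
X /= X.sum(axis=1, keepdims=True)  # toy posterior probabilities
y = rng.choice(['A', 'B', 'C'], size=100)
kdes = class_conditional_kdes(X, y, classes=['A', 'B', 'C'])
print(np.exp(kdes['A'].score_samples(X[:5])))  # densities of 5 points under class A's KDE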

View File

@@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
from copy import deepcopy
from typing import Callable, Union
import numpy as np
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling, PlattScaling
from scipy import optimize
from sklearn.base import BaseEstimator
from sklearn.calibration import CalibratedClassifierCV
@@ -636,18 +636,35 @@ class EMQ(AggregativeSoftQuantifier):
calibrator = TempScaling()
elif self.recalib == 'vs':
calibrator = VectorScaling()
elif self.recalib == 'platt':
calibrator = CalibratedClassifierCV(estimator=self.classifier, cv='prefit')
else:
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", "vs", and "platt".')
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
if not np.issubdtype(y.dtype, np.number):
y = np.searchsorted(data.classes_, y)
if self.recalib == 'platt':
self.classifier = calibrator.fit(*data.Xy)
else:
try:
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
except RuntimeError as e:
print(e)
print('calibration failed; defaulting to the identity calibration function')
self.calibration_function = lambda P: P
if self.exact_train_prev:
self.train_prevalence = data.prevalence()
else:
train_posteriors = classif_predictions.X
if self.recalib is not None:
train_posteriors = self.calibration_function(train_posteriors)
if self.recalib == 'platt':
train_posteriors = self.classifier.predict_proba(train_posteriors)
else:
train_posteriors = self.calibration_function(train_posteriors)
self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)
def aggregate(self, classif_posteriors, epsilon=EPSILON):
@@ -681,6 +698,11 @@
"""
Px = posterior_probabilities
Ptr = np.copy(tr_prev)
if np.prod(Ptr) == 0:  # some entry is 0; smooth the values to avoid division by zero
Ptr += epsilon
Ptr /= Ptr.sum()
qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence
s, converged = 0, False
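The smoothing of Ptr matters because, in the EM recurrence, posteriors are reweighted by the ratio of the running estimate to the training prevalence; a class that starts at exactly zero can never recover any mass. A simplified sketch of the iteration (an illustration of the idea, not the library code):
import numpy as np

def em_iteration(posteriors, tr_prev, epsilon=1e-4, max_iter=1000, tol=1e-6):
    Ptr = np.copy(tr_prev)
    if np.prod(Ptr) == 0:   # some class has zero training prevalence: smooth and renormalize
        Ptr += epsilon
        Ptr /= Ptr.sum()
    qs = np.copy(Ptr)       # running estimate, initialized at the training prevalence
    for _ in range(max_iter):
        ps = posteriors * (qs / Ptr)       # E-step: reweight the posteriors
        ps /= ps.sum(axis=1, keepdims=True)
        qs_new = ps.mean(axis=0)           # M-step: new prevalence estimate
        converged = np.abs(qs_new - qs).max() < tol
        qs = qs_new
        if converged:
            break
    return qs

posteriors = np.random.dirichlet([2, 5, 3], size=500)  # toy classifier posteriors
print(em_iteration(posteriors, tr_prev=np.asarray([0.0, 0.6, 0.4])))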

View File

@@ -1,5 +1,6 @@
from typing import Union, Callable
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from quapy.functional import get_divergence
from quapy.data import LabelledCollection
@@ -146,6 +147,53 @@ class DMx(BaseQuantifier):
return F.argmin_prevalence(loss, n_classes, method=self.search)
class ReadMe(BaseQuantifier):
def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
self.bootstrap_trials = bootstrap_trials
self.bootstrap_range = bootstrap_range
self.bagging_trials = bagging_trials
self.bagging_range = bagging_range
self.vectorizer_kwargs = vectorizer_kwargs
def fit(self, data: LabelledCollection):
X, y = data.Xy
self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
X = self.vectorizer.fit_transform(X)
self.class_conditional_X = {c: X[y == c] for c in data.classes_}
return self
def quantify(self, instances):
X = self.vectorizer.transform(instances)
# number of documents and features
num_docs, num_feats = X.shape
# bootstrap
p_boots = []
for _ in range(self.bootstrap_trials):
docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False)
class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
Xboot = X[docs_idx]
# bagging
p_bags = []
for _ in range(self.bagging_trials):
feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
Xbag = Xboot[:,feat_idx]
p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
p_bags.append(p)
p_boots.append(np.mean(p_bags, axis=0))
p_mean = np.mean(p_boots, axis=0)
p_std = np.std(p_boots, axis=0)
return p_mean
def std_constrained_linear_ls(self, X, class_cond_X: dict):
pass
def _get_features_range(X):
feat_ranges = []
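The std_constrained_linear_ls step is left unimplemented in this commit. For reference, a hedged sketch of what a simplex-constrained least-squares step in the spirit of Hopkins & King's ReadMe could look like (an illustration only, not the method this class will eventually implement):
import numpy as np
from scipy.optimize import minimize

def simplex_constrained_ls(Xbag, class_cond_Xbag: dict):
    # mean feature profile of the unlabelled bag and of each class-conditional training bag
    x_mean = np.asarray(Xbag.mean(axis=0)).ravel()
    A = np.vstack([np.asarray(Xc.mean(axis=0)).ravel() for _, Xc in sorted(class_cond_Xbag.items())])
    n_classes = A.shape[0]

    def loss(p):
        return np.linalg.norm(x_mean - p @ A) ** 2

    p0 = np.full(n_classes, 1. / n_classes)
    sol = minimize(loss, p0, method='SLSQP',
                   bounds=[(0., 1.)] * n_classes,
                   constraints={'type': 'eq', 'fun': lambda p: p.sum() - 1})
    return sol.x

rng = np.random.default_rng(0)
# toy binary features: class 0 activates features with prob 0.2, class 1 with prob 0.8
class_cond = {0: rng.binomial(1, 0.2, (50, 10)), 1: rng.binomial(1, 0.8, (50, 10))}
Xbag = np.vstack([rng.binomial(1, 0.2, (10, 10)), rng.binomial(1, 0.8, (40, 10))])
print(simplex_constrained_ls(Xbag, class_cond))  # expected to be close to [0.2, 0.8]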

View File

@@ -56,6 +56,7 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel work
:param open_args: if True, then the delayed function is called on *args_i, instead of on args_i
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
@@ -74,6 +75,40 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
return out
def parallel_unpack(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
"""
A wrapper of multiprocessing:
>>> Parallel(n_jobs=n_jobs)(
>>> delayed(func)(*args_i) for args_i in args
>>> )
that takes the `quapy.environ` variable as input silently.
Seeds the child processes to ensure reproducibility when n_jobs>1.
:param func: callable
:param args: a list of argument tuples; each tuple is unpacked as the positional arguments of func
:param n_jobs: the number of parallel workers
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel work
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1
# set a context with a temporary seed to ensure results are reproducible in parallel
with ExitStack() as stack:
if seed is not None:
stack.enter_context(qp.util.temp_seed(seed))
return func(*args)
out = Parallel(n_jobs=n_jobs, backend=backend)(
delayed(func_dec)(qp.environ, None if seed is None else seed + i, *args_i) for i, args_i in enumerate(args)
)
if asarray:
out = np.asarray(out)
return out
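A small usage sketch of the new helper, assuming the development version that includes this addition (contrast with parallel, which passes each args_i as a single positional argument):
import quapy as qp

def power(base, exponent):
    return base ** exponent

# each tuple in args is unpacked, i.e., the calls are power(2, 3), power(3, 2), power(5, 1)
out = qp.util.parallel_unpack(power, args=[(2, 3), (3, 2), (5, 1)], n_jobs=2, seed=0)
print(out)  # -> [8 9 5]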
@contextlib.contextmanager
def temp_seed(random_state):
"""