
Compare commits


11 Commits

15 changed files with 1443 additions and 12 deletions

8
.gitignore vendored

@@ -143,8 +143,7 @@ LeQua2022
MultiLabel
NewMethods
Ordinal
Retrieval
eDiscovery
Archived/eDiscovery
poster-cikm
slides-cikm
slides-short-cikm
@@ -153,9 +152,4 @@ svm_perf_quantification/svm_struct
svm_perf_quantification/svm_light
TweetSentQuant
*.png


@@ -1,3 +1,9 @@
Change Log 0.1.9
----------------
<...>
Change Log 0.1.8
----------------

121
Retrieval/commons.py Normal file

@@ -0,0 +1,121 @@
import pandas as pd
import numpy as np
from glob import glob
from os.path import join
from quapy.data import LabelledCollection
from quapy.protocol import AbstractProtocol
import json
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
# print('reading', path)
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['continent'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
rank = rank[y != 'Antarctica']
scores = scores[y != 'Antarctica']
X = X[y!='Antarctica']
y = y[y!='Antarctica']
if parse_columns:
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
def load_json_sample(path, class_name, max_lines=-1):
obj = json.load(open(path, 'rt'))
keys = [f'{id}' for id in range(len(obj['text'].keys()))]
text = [obj['text'][id] for id in keys]
classes = [obj[class_name][id] for id in keys]
if max_lines is not None and max_lines>0:
text = text[:max_lines]
classes = classes[:max_lines]
return text, classes
class TextRankings:
def __init__(self, path, class_name):
self.obj = json.load(open(path, 'rt'))
self.class_name = class_name
def get_sample_Xy(self, sample_id, max_lines=-1):
sample_id = str(sample_id)
O = self.obj
docs_ids = [doc_id for doc_id, query_id in O['qid'].items() if query_id == sample_id]
texts = [O['text'][doc_id] for doc_id in docs_ids]
labels = [O[self.class_name][doc_id] for doc_id in docs_ids]
if max_lines > 0 and len(texts) > max_lines:
ranks = [int(O['rank'][doc_id]) for doc_id in docs_ids]
sel = np.argsort(ranks)[:max_lines]
texts = np.asarray(texts)[sel]
labels = np.asarray(labels)[sel]
return texts, labels
def get_query_id_from_path(path, prefix='training', posfix='200SPLIT'):
qid = path
qid = qid[:qid.index(posfix)]
qid = qid[qid.index(prefix)+len(prefix):]
return qid
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None, classes=None, class_name=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
self.classes=classes
assert class_name is not None, 'class name should be specified'
self.class_name = class_name
self.text_samples = TextRankings(join(self.path_dir, 'testRankingsRetrieval.json'), class_name=class_name)
def __call__(self):
for file in glob(join(self.path_dir, 'training*SPLIT.json')):
X, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y, classes=self.classes)
query_id = get_query_id_from_path(file)
X, y = self.text_samples.get_sample_Xy(query_id, max_lines=self.max_test_lines)
# if len(X)!=qp.environ['SAMPLE_SIZE']:
# print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
            try:
                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
            except ValueError as e:
                print(f'file {file} caused error {e}')
                yield None, None
                continue

            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
            # print('test #classes:', test_sample.n_classes, test_sample.prevalence())
            yield train_sample, test_sample
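
A minimal sketch of how this protocol is meant to be consumed; the directory layout, file names and the 'continent' class are assumptions borrowed from fifth.py below, not a documented API:

from os.path import join
from sklearn.feature_extraction.text import TfidfVectorizer
from Retrieval.commons import RetrievedSamples, load_json_sample

# hypothetical data directory, mirroring the layout fifth.py expects
data_dir = './continent'
Xtr, ytr = load_json_sample(join(data_dir, 'train500PerGroup.json'), class_name='continent')

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
tfidf.fit(Xtr)

# one (training sample, test sample) pair per query; test rankings come from testRankingsRetrieval.json
prot = RetrievedSamples(data_dir, load_fn=load_json_sample, vectorizer=tfidf,
                        max_train_lines=None, max_test_lines=1000, class_name='continent')
for train, test in prot():
    if train is None:   # a pair whose test sample could not be built was skipped
        continue
    print(train.prevalence(), test.prevalence())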

182
Retrieval/fifth.py Normal file

@@ -0,0 +1,182 @@
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample, load_json_sample
from Retrieval.tabular import Table
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fifth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the fourth experiment, and the fairness groups are defined upon geographic info as in the fourth case.
As in the fourth, the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size. Unlike the fourth experiment, here the training queries are
For now, 1000 in training and 100 in test.
It seems that there is now very little shift.
"""
def cls(classifier_trained=None):
if classifier_trained is None:
# return LinearSVC()
return LogisticRegression()
else:
return classifier_trained
def methods(classifier_trained=None):
yield ('CC', ClassifyAndCount(cls(classifier_trained)))
yield ('PCC', PCC(cls(classifier_trained)))
yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
# yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
# yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
# yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
# yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
# yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
# yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
# yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
# yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
# yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
# yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_json_sample, class_name=CLASS_NAME)
if REDUCE_TR > 0 and len(training) > REDUCE_TR:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR, *training.prevalence())
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
print('training classifier')
classifier_trained = LogisticRegression()
classifier_trained = GridSearchCV(classifier_trained,
param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
n_jobs=-1, cv=5)
classifier_trained.fit(Xtr, ytr)
classifier_trained = classifier_trained.best_estimator_
trained = True
print('[Done!]')
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
return tfidf, classifier_trained
def reduceAtK(data: LabelledCollection, k):
X, y = data.Xy
X = X[:k]
y = y[:k]
return LabelledCollection(X, y, classes=data.classes_)
RANK_AT_K = -1
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
def scape_latex(string):
return string.replace('_', '\_')
Ks = [10, 50, 100, 250, 500, 1000, 2000]
# Ks = [500]
for CLASS_NAME in ['continent'] : #'years_category']: #['continent', 'first_letter_category']: #, 'gender', 'gender_category', 'occupations', 'source_countries', 'source_subcont_regions', 'years_category', 'relative_pageviews_category']:
data_path = './' + CLASS_NAME
if CLASS_NAME in ['years_category', 'continent']:
train_path = join(data_path, 'train500PerGroup.json')
else:
train_path = join(data_path, 'train3000samples.json')
tfidf, classifier_trained = qp.util.pickled_resource(f'classifier_{CLASS_NAME}.pkl', train_classifier)
trained=True
experiment_prot = RetrievedSamples(data_path,
load_fn=load_json_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K, classes=classifier_trained.classes_, class_name=CLASS_NAME)
method_names = [name for name, *other in methods()]
benchmarks = [f'{scape_latex(CLASS_NAME)}@{k}' for k in Ks]
table_mae = Table(benchmarks, method_names, color_mode='global')
table_mrae = Table(benchmarks, method_names, color_mode='global')
for method_name, quantifier in methods(classifier_trained):
# print('Starting with method=', method_name)
mae_errors = {k:[] for k in Ks}
mrae_errors = {k:[] for k in Ks}
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
if trained and method_name!='MLPE':
quantifier.fit(train, val_split=train, fit_classifier=False)
else:
quantifier.fit(train)
for k in Ks:
test_k = reduceAtK(test, k)
estim_prev = quantifier.quantify(test_k.instances)
mae_errors[k].append(qp.error.mae(test_k.prevalence(), estim_prev))
mrae_errors[k].append(qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1./(2*k))))
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
# pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
pbar.set_description(f'{method_name}')
for k in Ks:
table_mae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mae_errors[k])
table_mrae.add(benchmark=f'{scape_latex(CLASS_NAME)}@{k}', method=method_name, values=mrae_errors[k])
table_mae.latexPDF('./latex', f'table_{CLASS_NAME}_mae.tex')
table_mrae.latexPDF('./latex', f'table_{CLASS_NAME}_mrae.tex')
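
Because each retrieved sample comes back in ranking order, the @k evaluation above simply truncates the collection. A self-contained sketch of what reduceAtK computes, with made-up labels:

import numpy as np
from quapy.data.base import LabelledCollection

def reduce_at_k(data, k):   # mirrors reduceAtK above
    X, y = data.Xy
    return LabelledCollection(X[:k], y[:k], classes=data.classes_)

y = np.array(['A', 'A', 'B', 'A', 'B', 'B'])   # labels already in ranking order
X = np.arange(len(y)).reshape(-1, 1)           # dummy covariates
ranked = LabelledCollection(X, y)

print(ranked.prevalence())                  # prevalence over the full ranking: [0.5 0.5]
print(reduce_at_k(ranked, 3).prevalence())  # prevalence among the top-3 documents only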

161
Retrieval/fourth.py Normal file

@@ -0,0 +1,161 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import quapy as qp
import quapy.functional as F
from Retrieval.commons import RetrievedSamples, load_txt_sample
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this fourth experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the third experiment, and the fairness groups are defined upon geographic info as in the third case.
The difference here is that the data Li and Ui have been drawn by retrieving query-related documents from
a pool of the same size.
For now, 1000 in training and 100 in test.
It seems that there is now very little shift.
"""
def cls(classifier_trained=None):
if classifier_trained is None:
# return LinearSVC()
return LogisticRegression()
else:
return classifier_trained
def methods(classifier_trained=None):
yield ('CC', ClassifyAndCount(cls(classifier_trained)))
yield ('PACC', PACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(cls(classifier_trained), exact_train_prev=True))
yield ('EMQh', EMQ(cls(classifier_trained), exact_train_prev=False))
yield ('EMQ-BCTS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='bcts'))
yield ('EMQ-TS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='ts'))
yield ('EMQ-NBVS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='nbvs'))
# yield ('EMQ-VS', EMQ(cls(classifier_trained), exact_train_prev=False, recalib='vs'))
yield ('PCC', PCC(cls(classifier_trained)))
yield ('ACC', ACC(cls(classifier_trained), val_split=5, n_jobs=-1))
yield ('KDE001', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.001))
yield ('KDE005', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.005)) # <-- wow!
yield ('KDE01', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.01))
yield ('KDE02', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.02))
yield ('KDE03', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.03))
yield ('KDE05', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.05))
yield ('KDE07', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.07))
yield ('KDE10', KDEyML(cls(classifier_trained), val_split=5, n_jobs=-1, bandwidth=0.10))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def train_classifier():
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR > 0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR, *training.prevalence())
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
print('training classifier')
classifier_trained = LogisticRegression()
classifier_trained = GridSearchCV(classifier_trained,
param_grid={'C': np.logspace(-3, 3, 7), 'class_weight': ['balanced', None]},
n_jobs=-1, cv=5)
classifier_trained.fit(Xtr, ytr)
classifier_trained = classifier_trained.best_estimator_
trained = True
print('[Done!]')
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
return tfidf, classifier_trained
RANK_AT_K = 1000
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')
tfidf, classifier_trained = qp.util.pickled_resource('classifier.pkl', train_classifier)
trained=True
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K, classes=classifier_trained.classes_)
result_mae_dict = {}
result_mrae_dict = {}
for method_name, quantifier in methods(classifier_trained):
# print('Starting with method=', method_name)
mae_errors = []
mrae_errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
# print(train.prevalence())
# print(test.prevalence())
if trained and method_name!='MLPE':
quantifier.fit(train, val_split=train, fit_classifier=False)
else:
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
mae_errors.append(mae)
mrae = qp.error.mrae(test.prevalence(), estim_prev)
mrae_errors.append(mrae)
# print()
# print('Training prevalence:', F.strprev(train.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
# print('Estim prevalence:', F.strprev(estim_prev))
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
pbar.set_description(f'{method_name}\tmae={np.mean(mae_errors):.4f}\tmrae={np.mean(mrae_errors):.4f}')
print()
result_mae_dict[method_name] = np.mean(mae_errors)
result_mrae_dict[method_name] = np.mean(mrae_errors)
print('Results\n'+('-'*100))
for method_name in result_mae_dict.keys():
MAE = result_mae_dict[method_name]
MRAE = result_mrae_dict[method_name]
print(f'{method_name}\t{MAE=:.5f}\t{MRAE=:.5f}')
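
The loop above reuses a single classifier, trained once on the pool, for every (Li, Ui) pair by passing fit_classifier=False and learning only the aggregation on Li. A minimal sketch of that pattern (the data and classifier below are placeholders, not the experiment's actual collections):

import numpy as np
from sklearn.linear_model import LogisticRegression
from quapy.data.base import LabelledCollection
from quapy.method.aggregative import PACC

# placeholder pool: 2 classes, random features
X = np.random.rand(1000, 5)
y = np.random.randint(0, 2, size=1000)
pool = LabelledCollection(X, y)

classifier = LogisticRegression().fit(*pool.Xy)   # trained once, on the whole pool

quantifier = PACC(classifier)
Li = pool.sampling(200, 0.3)                      # stands in for one retrieved training sample
# fit_classifier=False: only PACC's correction is estimated, on Li, with the frozen classifier
quantifier.fit(Li, val_split=Li, fit_classifier=False)
print(quantifier.quantify(np.random.rand(100, 5)))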


@@ -0,0 +1,98 @@
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
"""
This was the very first experiment: one big training set and many test rankings produced according to some queries.
The quantification methods did not seem to work; the more sophisticated the method, the worse it performed.
This is a clear indication that the PPS assumptions do not hold.
Actually, while the training set could be some i.i.d. sample from a distribution L and every test set
is an i.i.d. sample from a distribution U, it is pretty clear that P(X|Y) is different, since the test sets
are biased towards a query term whereas the training set is not.
"""
def methods():
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
yield ('CC', ClassifyAndCount(LogisticRegression(n_jobs=-1)))
yield ('ACC', ACC(LogisticRegression(n_jobs=-1)))
yield ('PCC', PCC(LogisticRegression(n_jobs=-1)))
yield ('PACC', PACC(LogisticRegression(n_jobs=-1)))
yield ('EMQ', EMQ(LogisticRegression(n_jobs=-1)))
def load_txt_sample(path, verbose=False):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text']
y = df['first_letter_category']
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, classes):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.classes = classes
def __call__(self):
for file in glob(join(self.path_dir, 'test_data_*.txt')):
X, y = self.load_fn(file)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
sample = LabelledCollection(X, y, classes=self.classes)
yield sample.Xp
qp.environ['SAMPLE_SIZE']=100
data_path = './data'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True)
# training = training.sampling(1000)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('Xtr shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
test_prot = RetrievedSamples(data_path, load_fn=load_txt_sample, vectorizer=tfidf, classes=classes)
print('Training prevalence:', F.strprev(training.prevalence()))
for X, p in test_prot():
print('Test prevalence:', F.strprev(p))
for method_name, quantifier in methods():
print('training ', method_name)
quantifier.fit(training)
print('[done]')
report = qp.evaluation.evaluation_report(quantifier, test_prot, error_metrics=['mae', 'mrae'], verbose=True)
print(report.mean())


@@ -0,0 +1,131 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this second experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set.
Both elements in the pair are *retrieved according to the same query*. This is a way to impose on the
training set the same type of bias that was present in the test set. Let's see...
"""
def methods():
yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('CC', ClassifyAndCount(LogisticRegression()))
yield ('EMQ', EMQ(LogisticRegression()))
yield ('PCC', PCC(LogisticRegression()))
yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['first_letter_category'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, classes, max_train_lines=None, max_test_lines=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.classes = classes
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
def __call__(self):
for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y, classes=self.classes)
X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
test_sample = LabelledCollection(X, y, classes=self.classes)
yield train_sample, test_sample
RANK_AT_K = 500
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollection'
train_path = join(data_path, 'train_data.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
classes=classes,
max_train_lines=RANK_AT_K,
max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
print('Starting with method=', method_name)
errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
# print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
errors.append(mae)
pbar.set_description(f'mae={np.mean(errors):.4f}')
print()

155
Retrieval/previous/third.py Normal file

@@ -0,0 +1,155 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import quapy.functional as F
from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation
from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC
from quapy.protocol import AbstractProtocol
from quapy.data.base import LabelledCollection
from glob import glob
from os.path import join
from tqdm import tqdm
"""
In this third experiment, we have pairs of (Li,Ui) with Li a training set and Ui a test set as
in the second experiment, but in this case the fairness groups are defined upon geographic info.
"""
def methods():
yield ('CC', ClassifyAndCount(LogisticRegression()))
yield ('PACC', PACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('EMQ', EMQ(LogisticRegression()))
yield ('PCC', PCC(LogisticRegression()))
yield ('ACC', ACC(LogisticRegression(), val_split=5, n_jobs=-1))
yield ('MLPE', MaximumLikelihoodPrevalenceEstimation())
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
# print('reading', path)
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['continent'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
rank = rank[y != 'Antarctica']
scores = scores[y != 'Antarctica']
X = X[y!='Antarctica']
y = y[y!='Antarctica']
if parse_columns:
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
class RetrievedSamples(AbstractProtocol):
def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None):
self.path_dir = path_dir
self.load_fn = load_fn
self.vectorizer = vectorizer
self.max_train_lines = max_train_lines
self.max_test_lines = max_test_lines
def __call__(self):
for file in glob(join(self.path_dir, 'test_rankings_*.txt')):
X, y = self.load_fn(file.replace('test_', 'training_'), parse_columns=True, max_lines=self.max_train_lines)
X = self.vectorizer.transform(X)
train_sample = LabelledCollection(X, y)
X, y = self.load_fn(file, parse_columns=True, max_lines=self.max_test_lines)
if len(X)!=qp.environ['SAMPLE_SIZE']:
print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
# assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
X = self.vectorizer.transform(X)
            try:
                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
            except ValueError as e:
                print(f'file {file} caused error {e}')
                yield None, None
                continue

            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
            # print('test #classes:', test_sample.n_classes, test_sample.prevalence())
            yield train_sample, test_sample
RANK_AT_K = 100
REDUCE_TR = 50000
qp.environ['SAMPLE_SIZE'] = RANK_AT_K
data_path = './newCollectionGeo'
train_path = join(data_path, 'train_data_continent.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
training = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
if REDUCE_TR>0:
print('Reducing the number of documents in the training to', REDUCE_TR)
training = training.sampling(REDUCE_TR)
Xtr, ytr = training.Xy
Xtr = tfidf.fit_transform(Xtr)
print('L orig shape = ', Xtr.shape)
training = LabelledCollection(Xtr, ytr)
classes = training.classes_
print('training classes:', classes)
print('training prevalence:', training.prevalence())
experiment_prot = RetrievedSamples(data_path,
load_fn=load_txt_sample,
vectorizer=tfidf,
max_train_lines=None,
max_test_lines=RANK_AT_K)
for method_name, quantifier in methods():
print('Starting with method=', method_name)
errors = []
pbar = tqdm(experiment_prot(), total=49)
for train, test in pbar:
if train is not None:
try:
# print('Training prevalence:', F.strprev(training.prevalence()), 'shape', train.X.shape)
# print('Test prevalence:', F.strprev(test.prevalence()), 'shape', test.X.shape)
# print(train.prevalence())
# print(test.prevalence())
quantifier.fit(train)
estim_prev = quantifier.quantify(test.instances)
mae = qp.error.mae(test.prevalence(), estim_prev)
errors.append(mae)
except Exception as e:
print(f'wow, something happened here! skipping; {e}')
else:
print('skipping one!')
pbar.set_description(f'mae={np.mean(errors):.4f}')
print()

427
Retrieval/tabular.py Normal file

@@ -0,0 +1,427 @@
import os.path
import numpy as np
import itertools
from scipy.stats import ttest_ind_from_stats, wilcoxon
from pathlib import Path
from os.path import join
class Table:
VALID_TESTS = [None, "wilcoxon", "ttest"]
def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
color=True, color_mode='local', maxtone=50):
assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
self.benchmarks = np.asarray(benchmarks)
self.benchmark_index = {row:i for i, row in enumerate(benchmarks)}
self.methods = np.asarray(methods)
self.method_index = {col:j for j, col in enumerate(methods)}
self.map = {}
# keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
self._addmap('values', dtype=object)
self.lower_is_better = lower_is_better
self.ttest = ttest
self.prec_mean = prec_mean
self.clean_zero = clean_zero
self.show_std = show_std
self.prec_std = prec_std
self.add_average = average
self.missing = missing
self.missing_str = missing_str
self.color = color
self.color_mode = color_mode
self.maxtone = maxtone
self.touch()
@property
def nbenchmarks(self):
return len(self.benchmarks)
@property
def nmethods(self):
return len(self.methods)
def touch(self):
self._modif = True
def update(self):
if self._modif:
self.compute()
def _getfilled(self):
return np.argwhere(self.map['fill'])
@property
def values(self):
return self.map['values']
def _indexes(self):
return itertools.product(range(self.nbenchmarks), range(self.nmethods))
def _addmap(self, map, dtype, func=None):
self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
if func is None:
return
m = self.map[map]
f = func
indexes = self._indexes() if map == 'fill' else self._getfilled()
for i, j in indexes:
m[i, j] = f(self.values[i, j])
def _addrank(self):
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
if not self.lower_is_better:
ranked_cols_idx = ranked_cols_idx[::-1]
self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx)+1)
def _addcolor(self):
minval = {}
maxval = {}
if self.color_mode == 'global':
filled_cols_idx = np.argwhere(self.map['fill'])
col_means = [self.map['mean'][i, j] for i, j in filled_cols_idx]
if len(filled_cols_idx) > 0:
global_minval = min(col_means)
global_maxval = max(col_means)
for i in range(self.nbenchmarks):
minval[i] = global_minval
maxval[i] = global_maxval
elif self.color_mode == 'local':
for i in range(self.nbenchmarks):
                filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if len(filled_cols_idx)>0:
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
minval[i] = min(col_means)
maxval[i] = max(col_means)
else:
print(f'color mode {self.color_mode} not understood, valid ones are "local" and "global"; skip')
return
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
for col_idx in filled_cols_idx:
val = self.map['mean'][i,col_idx]
if i not in maxval or i not in minval:
continue
norm = (maxval[i] - minval[i])
if norm > 0:
normval = (val - minval[i]) / norm
else:
normval = 0.5
if self.lower_is_better:
normval = 1 - normval
normval = np.clip(normval, 0,1)
self.map['color'][i, col_idx] = color_red2green_01(normval, self.maxtone)
def _run_ttest(self, row, col1, col2):
mean1 = self.map['mean'][row, col1]
std1 = self.map['std'][row, col1]
nobs1 = self.map['nobs'][row, col1]
mean2 = self.map['mean'][row, col2]
std2 = self.map['std'][row, col2]
nobs2 = self.map['nobs'][row, col2]
_, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
return p_val
def _run_wilcoxon(self, row, col1, col2):
values1 = self.map['values'][row, col1]
values2 = self.map['values'][row, col2]
try:
_, p_val = wilcoxon(values1, values2)
except ValueError:
p_val = 0
return p_val
def _add_statistical_test(self):
if self.ttest is None:
return
self.some_similar = [False]*self.nmethods
for i in range(self.nbenchmarks):
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
if len(filled_cols_idx) <= 1:
continue
col_means = [self.map['mean'][i,j] for j in filled_cols_idx]
best_pos = filled_cols_idx[np.argmin(col_means)]
for j in filled_cols_idx:
if j==best_pos:
continue
if self.ttest == 'ttest':
p_val = self._run_ttest(i, best_pos, j)
else:
p_val = self._run_wilcoxon(i, best_pos, j)
pval_outcome = pval_interpretation(p_val)
self.map['ttest'][i, j] = pval_outcome
if pval_outcome != 'Diff':
self.some_similar[j] = True
def compute(self):
self._addmap('fill', dtype=bool, func=lambda x: x is not None)
self._addmap('mean', dtype=float, func=np.mean)
self._addmap('std', dtype=float, func=np.std)
self._addmap('nobs', dtype=float, func=len)
self._addmap('rank', dtype=int, func=None)
self._addmap('color', dtype=object, func=None)
self._addmap('ttest', dtype=object, func=None)
self._addmap('latex', dtype=object, func=None)
self._addrank()
self._addcolor()
self._add_statistical_test()
if self.add_average:
self._addave()
self._modif = False
def _is_column_full(self, col):
return all(self.map['fill'][:, self.method_index[col]])
def _addave(self):
ave = Table(['ave'], self.methods,
lower_is_better=self.lower_is_better,
ttest=self.ttest,
average=False,
missing=self.missing,
missing_str=self.missing_str,
prec_mean=self.prec_mean,
prec_std=self.prec_std,
clean_zero=self.clean_zero,
show_std=self.show_std,
color=self.color,
maxtone=self.maxtone)
for col in self.methods:
values = None
if self._is_column_full(col):
if self.ttest == 'ttest':
# values = np.asarray(self.map['mean'][:, self.method_index[col]])
values = np.concatenate(self.values[:, self.method_index[col]])
else: # wilcoxon
# values = np.asarray(self.map['mean'][:, self.method_index[col]])
values = np.concatenate(self.values[:, self.method_index[col]])
ave.add('ave', col, values)
self.average = ave
def add(self, benchmark, method, values):
if values is not None:
values = np.asarray(values)
if values.ndim==0:
values = values.flatten()
rid, cid = self._coordinates(benchmark, method)
self.map['values'][rid, cid] = values
self.touch()
def get(self, benchmark, method, attr='mean'):
self.update()
        assert attr in self.map, f'unknown attribute {attr}'
rid, cid = self._coordinates(benchmark, method)
if self.map['fill'][rid, cid]:
v = self.map[attr][rid, cid]
if v is None or (isinstance(v,float) and np.isnan(v)):
return self.missing
return v
else:
return self.missing
def _coordinates(self, benchmark, method):
assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
assert method in self.method_index, f'method {method} out of range'
rid = self.benchmark_index[benchmark]
cid = self.method_index[method]
return rid, cid
def get_average(self, method, attr='mean'):
self.update()
if self.add_average:
return self.average.get('ave', method, attr=attr)
return None
def get_color(self, benchmark, method):
color = self.get(benchmark, method, attr='color')
if color is None:
return ''
return color
def latex(self, benchmark, method):
self.update()
i,j = self._coordinates(benchmark, method)
if self.map['fill'][i,j] == False:
return self.missing_str
mean = self.map['mean'][i,j]
l = f" {mean:.{self.prec_mean}f}"
if self.clean_zero:
l = l.replace(' 0.', '.')
isbest = self.map['rank'][i,j] == 1
if isbest:
l = "\\textbf{"+l.strip()+"}"
stat = '' if self.ttest is None else '^{\phantom{\ddag}}'
if self.ttest is not None and self.some_similar[j]:
test_label = self.map['ttest'][i,j]
if test_label == 'Sim':
stat = '^{\dag}'
elif test_label == 'Same':
stat = '^{\ddag}'
elif isbest or test_label == 'Diff':
stat = '^{\phantom{\ddag}}'
std = ''
if self.show_std:
std = self.map['std'][i,j]
std = f" {std:.{self.prec_std}f}"
if self.clean_zero:
std = std.replace(' 0.', '.')
std = f"\pm {std:{self.prec_std}}"
if stat!='' or std!='':
l = f'{l}${stat}{std}$'
if self.color:
l += ' ' + self.map['color'][i,j]
return l
def latexPDF(self, path, name:str, *args, **kwargs):
if not name.endswith('.tex'):
name += '.tex'
self.latexSaveDocument(join(path, name), *args, **kwargs)
print("[Tables Done] runing latex")
os.chdir(path)
os.system('pdflatex '+name)
basename = name.replace('.tex', '')
os.system(f'rm {basename}.aux {basename}.bbl {basename}.blg {basename}.log {basename}.out {basename}.dvi')
os.chdir('..')
def latexSaveDocument(self, path, *args, **kwargs):
document = self.latexDocument(*args, **kwargs)
parent = Path(path).parent
os.makedirs(parent, exist_ok=True)
with open(path, 'wt') as foo:
foo.write(document)
        print('tex file saved at', path)
def latexDocument(self, *args, **kwargs):
document = """
\\documentclass[10pt,a4paper]{article}
\\usepackage[utf8]{inputenc}
\\usepackage{amsmath}
\\usepackage{amsfonts}
\\usepackage{amssymb}
\\usepackage{graphicx}
\\usepackage{xcolor}
\\usepackage{colortbl}
\\begin{document}
"""
document += self.latexTable(*args, **kwargs)
document += "\n\end{document}\n"
return document
def latexTable(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline', resizebox=True):
table = """
\\begin{table}
\center
%%%\\resizebox{\\textwidth}{!}{% \n
"""
table += "\n\\begin{tabular}{|c"+"|c" * self.nmethods + "|}\n"
table += self.latexTabular(benchmark_replace, method_replace, aslines, endl)
table += "\n\\end{tabular}\n"
table += """
%%%}%
\end{table}
"""
if resizebox:
table = table.replace("%%%", "")
return table
def latexTabular(self, benchmark_replace={}, method_replace={}, aslines=False, endl='\\\\\hline'):
lines = []
l = '\multicolumn{1}{c|}{} & '
l += ' & '.join([method_replace.get(col, col) for col in self.methods])
l += ' \\\\\hline'
lines.append(l)
for row in self.benchmarks:
rowname = benchmark_replace.get(row, row)
l = rowname + ' & '
l += self.latexRow(row, endl=endl)
lines.append(l)
if self.add_average:
# l += '\hline\n'
l = '\hline \n \\textit{Average} & '
l += self.latexAverage(endl=endl)
lines.append(l)
if not aslines:
lines='\n'.join(lines)
return lines
def latexRow(self, benchmark, endl='\\\\\hline\n'):
s = [self.latex(benchmark, col) for col in self.methods]
s = ' & '.join(s)
s += ' ' + endl
return s
def latexAverage(self, endl='\\\\\hline\n'):
if self.add_average:
return self.average.latexRow('ave', endl=endl)
def getRankTable(self, prec_mean=0):
t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=prec_mean, average=True, maxtone=self.maxtone, ttest=None)
for rid, cid in self._getfilled():
row = self.benchmarks[rid]
col = self.methods[cid]
t.add(row, col, self.get(row, col, 'rank'))
t.compute()
return t
def dropMethods(self, methods):
drop_index = [self.method_index[m] for m in methods]
new_methods = np.delete(self.methods, drop_index)
new_index = {col:j for j, col in enumerate(new_methods)}
self.map['values'] = self.values[:,np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
self.methods = new_methods
self.method_index = new_index
self.touch()
def pval_interpretation(p_val):
if 0.005 >= p_val:
return 'Diff'
elif 0.05 >= p_val > 0.005:
return 'Sim'
elif p_val > 0.05:
return 'Same'
def color_red2green_01(val, maxtone=50):
if np.isnan(val): return None
assert 0 <= val <= 1, f'val {val} out of range [0,1]'
# rescale to [-1,1]
val = val * 2 - 1
if val < 0:
color = 'red'
tone = maxtone * (-val)
else:
color = 'green'
tone = maxtone * val
return '\cellcolor{' + color + f'!{int(tone)}' + '}'
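
A minimal usage sketch of the Table class above; the benchmark names, method names and error values are made up for illustration:

import numpy as np
from Retrieval.tabular import Table

np.random.seed(0)
benchmarks = ['continent@10', 'continent@100']   # made-up benchmark names
methods = ['CC', 'PACC', 'EMQ']                  # made-up method names

table = Table(benchmarks, methods, color_mode='global')
for b in benchmarks:
    for m in methods:
        table.add(benchmark=b, method=m, values=np.random.rand(50))  # one list of per-sample errors per cell

print(table.get('continent@10', 'PACC'))   # mean of that cell
print(table.latexTabular())                # LaTeX tabular body, ready to embed
# table.latexPDF('./latex', 'example.tex') # would additionally compile the table with pdflatex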


@@ -0,0 +1,66 @@
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.svm import LinearSVC
from quapy.data.base import LabelledCollection
from sklearn.model_selection import cross_val_score, GridSearchCV
from os.path import join
"""
In this experiment, I simply try to understand whether the learning task can be learned or not.
The problem is that we are quantifying the categories based on the alphabetical order (of what?).
"""
def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
if verbose:
print(f'loading {path}...', end='')
df = pd.read_csv(path, sep='\t')
if verbose:
print('[done]')
X = df['text'].values
y = df['continent'].values
if parse_columns:
rank = df['rank'].values
scores = df['score'].values
order = np.argsort(rank)
X = X[order]
y = y[order]
rank = rank[order]
scores = scores[order]
if max_lines is not None:
X = X[:max_lines]
y = y[:max_lines]
return X, y
data_path = './50_50_split_trec'
train_path = join(data_path, 'train_50_50_continent.txt')
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
data = data.sampling(20000)
train, test = data.split_stratified()
train.instances = tfidf.fit_transform(train.instances)
test.instances = tfidf.transform(test.instances)
# svm = LinearSVC()
# cls = GridSearchCV(svm, param_grid={'C':np.logspace(-3,3,7), 'class_weight':['balanced', None]})
cls = LogisticRegression()
cls.fit(*train.Xy)
# score = cross_val_score(LogisticRegressionCV(), *data.Xy, scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5)
# print(score)
# print(np.mean(score))
y_pred = cls.predict(test.instances)
macrof1 = f1_score(y_true=test.labels, y_pred=y_pred, average='macro')
microf1 = f1_score(y_true=test.labels, y_pred=y_pred, average='micro')
print('macro', macrof1)
print('micro', microf1)


@@ -11,7 +11,7 @@ from . import util
from . import model_selection
from . import classification
__version__ = '0.1.8'
__version__ = '0.1.9'
environ = {
'SAMPLE_SIZE': None,


@@ -52,7 +52,7 @@ class KDEBase:
"""
return np.exp(kde.score_samples(X))
def get_mixture_components(self, X, y, n_classes, bandwidth):
def get_mixture_components(self, X, y, classes, bandwidth):
"""
Returns an array containing the mixture components, i.e., the KDE functions for each class.
@@ -62,7 +62,7 @@
:param bandwidth: float, the bandwidth of the kernel
:return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
"""
return [self.get_kde_function(X[y == cat], bandwidth) for cat in range(n_classes)]
return [self.get_kde_function(X[y == cat], bandwidth) for cat in classes]
@@ -114,7 +114,7 @@ class KDEyML(AggregativeSoftQuantifier, KDEBase):
self.random_state=random_state
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
return self
def aggregate(self, posteriors: np.ndarray):
@@ -196,7 +196,7 @@ class KDEyHD(AggregativeSoftQuantifier, KDEBase):
self.montecarlo_trials = montecarlo_trials
def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.n_classes, self.bandwidth)
self.mix_densities = self.get_mixture_components(*classif_predictions.Xy, data.classes_, self.bandwidth)
N = self.montecarlo_trials
rs = self.random_state
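
The change from n_classes to classes matters whenever the labels are not the integers 0..n-1 (for instance, the string-labelled continents used in the Retrieval experiments). A small sketch of the difference, with made-up labels:

import numpy as np

y = np.array(['Africa', 'Asia', 'Africa', 'Europe'])
X = np.random.rand(len(y), 3)     # stands in for the classifier's posterior probabilities
classes = np.unique(y)            # ['Africa' 'Asia' 'Europe']

# old behaviour: range(n_classes) compares string labels against 0, 1, 2 and selects nothing
old_groups = [X[y == cat] for cat in range(len(classes))]
print([len(g) for g in old_groups])   # -> [0, 0, 0]

# new behaviour: iterate over the actual class labels
new_groups = [X[y == cat] for cat in classes]
print([len(g) for g in new_groups])   # -> [2, 1, 1]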


@@ -640,6 +640,8 @@ class EMQ(AggregativeSoftQuantifier):
raise ValueError('invalid param argument for recalibration method; available ones are '
'"nbvs", "bcts", "ts", and "vs".')
if not np.issubdtype(y.dtype, np.number):
y = np.searchsorted(data.classes_, y)
self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
if self.exact_train_prev:
@@ -681,6 +683,11 @@
"""
Px = posterior_probabilities
Ptr = np.copy(tr_prev)
if np.product(Ptr) == 0: # some entry is 0; we should smooth the values to avoid 0 division
Ptr += epsilon
Ptr /= Ptr.sum()
qs = np.copy(Ptr) # qs (the running estimate) is initialized as the training prevalence
s, converged = 0, False
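
The added lines smooth the training prevalence before the EM loop so that a zero entry cannot cause a division by zero. A standalone sketch of the same operation (the epsilon value is illustrative):

import numpy as np

def smooth_prevalence(prev, epsilon=1e-4):
    """Add epsilon to every entry if any of them is exactly zero, then renormalize."""
    prev = np.asarray(prev, dtype=float).copy()
    if np.prod(prev) == 0:
        prev += epsilon
        prev /= prev.sum()
    return prev

print(smooth_prevalence([0.0, 0.3, 0.7]))   # no entry is exactly zero anymore; still sums to 1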


@@ -1,5 +1,6 @@
from typing import Union, Callable
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from quapy.functional import get_divergence
from quapy.data import LabelledCollection
@@ -146,6 +147,53 @@ class DMx(BaseQuantifier):
return F.argmin_prevalence(loss, n_classes, method=self.search)
class ReadMe(BaseQuantifier):
def __init__(self, bootstrap_trials=100, bootstrap_range=100, bagging_trials=100, bagging_range=25, **vectorizer_kwargs):
self.bootstrap_trials = bootstrap_trials
self.bootstrap_range = bootstrap_range
self.bagging_trials = bagging_trials
self.bagging_range = bagging_range
self.vectorizer_kwargs = vectorizer_kwargs
    def fit(self, data: LabelledCollection):
        X, y = data.Xy
        self.vectorizer = CountVectorizer(binary=True, **self.vectorizer_kwargs)
        X = self.vectorizer.fit_transform(X)
        # iterate over the actual class labels, not range(data.classes_)
        self.class_conditional_X = {i: X[y == i] for i in data.classes_}
        return self
def quantify(self, instances):
X = self.vectorizer.transform(instances)
# number of features
num_docs, num_feats = X.shape
# bootstrap
p_boots = []
for _ in range(self.bootstrap_trials):
            docs_idx = np.random.choice(num_docs, size=self.bootstrap_range, replace=False)
class_conditional_X = {i: X[docs_idx] for i, X in self.class_conditional_X.items()}
Xboot = X[docs_idx]
# bagging
p_bags = []
for _ in range(self.bagging_trials):
feat_idx = np.random.choice(num_feats, size=self.bagging_range, replace=False)
class_conditional_Xbag = {i: X[:, feat_idx] for i, X in class_conditional_X.items()}
Xbag = Xboot[:,feat_idx]
p = self.std_constrained_linear_ls(Xbag, class_conditional_Xbag)
p_bags.append(p)
p_boots.append(np.mean(p_bags, axis=0))
p_mean = np.mean(p_boots, axis=0)
        p_std = np.std(p_boots, axis=0)   # std across bootstrap estimates (currently unused)
return p_mean
def std_constrained_linear_ls(self, X, class_cond_X: dict):
pass
def _get_features_range(X):
feat_ranges = []
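
std_constrained_linear_ls is left unimplemented above. For reference, ReadMe (Hopkins & King) estimates the prevalence vector by regressing the test feature profile on the class-conditional feature profiles under simplex constraints; the following is only a sketch of that step under this interpretation, not the author's implementation:

import numpy as np
from scipy.optimize import minimize

def constrained_linear_ls(Xbag, class_cond_Xbag):
    # argmin_p || mean(Xbag) - A p ||^2   s.t.  p >= 0, sum(p) = 1,
    # where column j of A is the mean (binary) feature profile of class j
    classes = sorted(class_cond_Xbag.keys())
    b = np.asarray(Xbag.mean(axis=0)).flatten()
    A = np.stack([np.asarray(class_cond_Xbag[c].mean(axis=0)).flatten() for c in classes], axis=1)
    n = len(classes)
    loss = lambda p: float(np.linalg.norm(A @ p - b) ** 2)
    constraints = [{'type': 'eq', 'fun': lambda p: p.sum() - 1}]
    sol = minimize(loss, x0=np.full(n, 1 / n), bounds=[(0, 1)] * n,
                   constraints=constraints, method='SLSQP')
    return sol.x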


@@ -56,6 +56,7 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel works
:param open_args: if True, then the delayed function is called on *args_i, instead of on args_i
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
@@ -74,6 +75,40 @@ def parallel(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
return out
def parallel_unpack(func, args, n_jobs, seed=None, asarray=True, backend='loky'):
"""
A wrapper of multiprocessing:
>>> Parallel(n_jobs=n_jobs)(
>>> delayed(func)(*args_i) for args_i in args
>>> )
that takes the `quapy.environ` variable as input silently.
Seeds the child processes to ensure reproducibility when n_jobs>1.
:param func: callable
:param args: args of func
:param seed: the numeric seed
:param asarray: set to True to return a np.ndarray instead of a list
:param backend: indicates the backend used for handling parallel works
"""
def func_dec(environ, seed, *args):
qp.environ = environ.copy()
qp.environ['N_JOBS'] = 1
        # set a context with a temporary seed to ensure results are reproducible in parallel
with ExitStack() as stack:
if seed is not None:
stack.enter_context(qp.util.temp_seed(seed))
return func(*args)
out = Parallel(n_jobs=n_jobs, backend=backend)(
delayed(func_dec)(qp.environ, None if seed is None else seed + i, *args_i) for i, args_i in enumerate(args)
)
if asarray:
out = np.asarray(out)
return out
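
A minimal usage sketch of the new helper, assuming it is exposed alongside parallel as qp.util.parallel_unpack; unlike parallel, each args_i tuple is unpacked into the worker:

import quapy as qp

def work(a, b):
    return a + b

# each (a, b) tuple is unpacked into work(a, b)
results = qp.util.parallel_unpack(work, args=[(1, 2), (3, 4), (5, 6)], n_jobs=2)
print(results)   # -> [ 3  7 11] (an ndarray, since asarray=True by default)
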
@contextlib.contextmanager
def temp_seed(random_state):
"""