16 changed files with 593 additions and 656 deletions
--- a/Retrieval/classifier_kfcv_accuracy.py
+++ b/Retrieval/classifier_kfcv_accuracy.py
@ -1,82 +0,0 @@
 import itertools
 import os.path
 import pickle
 from collections import defaultdict
 from pathlib import Path
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import LinearSVC
 import quapy as qp
 from Retrieval.commons import RetrievedSamples, load_sample
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
 from os.path import join
 from tqdm import tqdm
 from result_table.src.table import Table
 """
 """
 # Root directory of the data; the training file of each class is read from <data_home>/<class_name>/FULL/.
 data_home = 'data'
 # Target attributes (one classification task each) for which cross-validated accuracy is reported.
 datasets = ['continent', 'gender', 'years_category', 'relative_pageviews_category', 'num_sitelinks_category']
 # Hyperparameter grid shared by both classifiers: 9 log-spaced values of C in [1e-4, 1e4], with/without class balancing.
 param_grid = {'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]}
 # param_grid = {'C': np.logspace(-1, 1, 2)}
 # (display name, unfitted estimator, hyperparameter grid) triples explored by the grid search below.
 classifiers = [
    ('LR', LogisticRegression(max_iter=5000), param_grid),
    ('SVM', LinearSVC(), param_grid)
 ]
 def benchmark_name(class_name):
    """
    Returns the benchmark (row) label used in the results table for a given class name.
    Every underscore is prefixed with a backslash (presumably so that the name is safe in the
    LaTeX output generated via Table.LatexPDF -- confirm against the table implementation).
    :param class_name: string, name of the target class (e.g., 'years_category')
    :return: the class name with each underscore escaped by a backslash
    """
    # a raw string is required here: backslash-underscore is not a valid escape sequence in a normal literal
    return class_name.replace('_', r'\_')
 # Results table with one row (benchmark) per dataset; methods (columns) are added on the fly via table.add.
 table = Table(name=f'accuracy', benchmarks=[benchmark_name(d) for d in datasets])
 table.format.show_std = False
 table.format.stat_test = None
 # the reported values are accuracies, so higher is better
 table.format.lower_is_better = False
 # One grid search per (dataset, classifier) combination.
 for class_name, (cls_name, cls, grid) in itertools.product(datasets, classifiers):
    # the classifier is always trained on the training file of the 'FULL' pool
    train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <-------- fixed classifier
    texts, labels = load_sample(train_data_path, class_name=class_name)
    # the tf-idf vectorizer is re-fitted from scratch for every combination
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
    Xtr = tfidf.fit_transform(texts)
    print(f'Xtr shape={Xtr.shape}')
    print('training classifier...', end='')
    # 5-fold cross-validated grid search, run in parallel on all available cores
    classifier = GridSearchCV(
        cls,
        param_grid=grid,
        n_jobs=-1,
        cv=5,
        verbose=10
    )
    classifier.fit(Xtr, labels)
    classifier_acc = classifier.best_score_
    # NOTE(review): 'mean_test_score' at best_index_ is the score averaged across folds for the best
    # configuration (a single number), not one score per fold -- confirm this is the intended value
    classifier_acc_per_fold = classifier.cv_results_['mean_test_score'][classifier.best_index_]
    print(f'[done] best-params={classifier.best_params_} got {classifier_acc:.4f} score, per fold {classifier_acc_per_fold}')
    table.add(benchmark=benchmark_name(class_name), method=cls_name, v=classifier_acc_per_fold)
    # the PDF is regenerated after every combination, so partial results are available while the loop runs
    Table.LatexPDF(f'./latex/classifier_Acc.pdf', tables=[table])
--- a/Retrieval/commons.py
+++ b/Retrieval/commons.py
@ -8,102 +8,116 @@ from quapy.protocol import AbstractProtocol
 import json
-def load_sample(path, class_name):
+def load_txt_sample(path, parse_columns, verbose=False, max_lines=None):
-    """
+    # print('reading', path)
-    Loads a sample json as a dataframe and returns text and labels for
+    if verbose:
-    the given class_name
+        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')
    X = df['text'].values
    y = df['continent'].values
-    :param path: path to a json file
+    if parse_columns:
-    :param class_name: string representing the target class
+        rank = df['rank'].values
-    :return: texts, labels for class_name
+        scores = df['score'].values
-    """
+        rank = rank[y != 'Antarctica']
-    df = pd.read_json(path)
+        scores = scores[y != 'Antarctica']
-    text = df.text.values
+
-    labels = df[class_name].values
+    X = X[y!='Antarctica']
-    return text, labels
+    y = y[y!='Antarctica']
    if parse_columns:
        order = np.argsort(rank)
        X = X[order]
        y = y[order]
        rank = rank[order]
        scores = scores[order]
    if max_lines is not None:
        X = X[:max_lines]
        y = y[:max_lines]
    return X, y
-def get_text_label_score(df, class_name, vectorizer=None, filter_classes=None):
+def load_json_sample(path, class_name, max_lines=-1):
-    text = df.text.values
+    obj = json.load(open(path, 'rt'))
-    labels = df[class_name].values
+    keys = [f'{id}' for id in range(len(obj['text'].keys()))]
-    rel_score = df.score.values
+    text = [obj['text'][id] for id in keys]
-
+    #print(list(obj.keys()))
-    if filter_classes is not None:
+    #import sys; sys.exit(0)
-        idx = np.isin(labels, filter_classes)
+    classes = [obj[class_name][id] for id in keys]
-        text = text[idx]
+    if max_lines is not None and max_lines>0:
-        labels = labels[idx]
+        text = text[:max_lines]
-        rel_score = rel_score[idx]
+        classes = classes[:max_lines]
-
+    return text, classes
    if vectorizer is not None:
        text = vectorizer.transform(text)
    order = np.argsort(-rel_score)
    return text[order], labels[order], rel_score[order]
-class RetrievedSamples:
+class TextRankings:
-    def __init__(self,
+    def __init__(self, path, class_name):
-                 class_home: str,
+        self.obj = json.load(open(path, 'rt'))
                 test_rankings_path: str,
                 vectorizer,
                 class_name,
                 classes=None
                 ):
        self.class_home = class_home
        self.test_rankings_df = pd.read_json(test_rankings_path)
        self.vectorizer = vectorizer
        self.class_name = class_name
    def get_sample_Xy(self, sample_id, max_lines=-1):
        sample_id = str(sample_id)
        O = self.obj
        docs_ids = [doc_id for doc_id, query_id in O['qid'].items() if query_id == sample_id]
        texts = [O['text'][doc_id] for doc_id in docs_ids]
        labels = [O[self.class_name][doc_id] for doc_id in docs_ids]
        if max_lines > 0 and len(texts) > max_lines:
            ranks = [int(O['rank'][doc_id]) for doc_id in docs_ids]
            sel = np.argsort(ranks)[:max_lines]
            texts = np.asarray(texts)[sel]
            labels = np.asarray(labels)[sel]
        return texts, labels
 def get_query_id_from_path(path, prefix='training', posfix='200SPLIT'):
    """
    Extracts the query id embedded in a file path, i.e., the substring found between
    `prefix` and `posfix`.
    :param path: string, the path to parse
    :param prefix: string preceding the query id; it is searched only in the part of the path
        that comes before `posfix`
    :param posfix: string following the query id (first occurrence in the path)
    :return: the substring between (the first occurrences of) `prefix` and `posfix`
    :raises ValueError: if `posfix` is not in the path, or `prefix` is not found before it
    """
    head = path[:path.index(posfix)]
    start = head.index(prefix) + len(prefix)
    return head[start:]
 class RetrievedSamples(AbstractProtocol):
    def __init__(self, path_dir: str, load_fn, vectorizer, max_train_lines=None, max_test_lines=None, classes=None, class_name=None):
        self.path_dir = path_dir
        self.load_fn = load_fn
        self.vectorizer = vectorizer
        self.max_train_lines = max_train_lines
        self.max_test_lines = max_test_lines
        self.classes=classes
        assert class_name is not None, 'class name should be specified'
        self.class_name = class_name
        self.text_samples = TextRankings(join(self.path_dir, 'testRankingsRetrieval.json'), class_name=class_name)
    def __call__(self):
        tests_df = self.test_rankings_df
        class_name = self.class_name
        vectorizer = self.vectorizer
-        for file in self._list_queries():
+        for file in glob(join(self.path_dir, 'training*SPLIT.json')):
-            # loads the training sample
+            X, y = self.load_fn(file, class_name=self.class_name, max_lines=self.max_train_lines)
-            train_df = pd.read_json(file)
+            X = self.vectorizer.transform(X)
-            Xtr, ytr, score_tr = get_text_label_score(train_df, class_name, vectorizer, filter_classes=self.classes)
+            train_sample = LabelledCollection(X, y, classes=self.classes)
-            # loads the test sample
+            query_id = get_query_id_from_path(file)
-            query_id = self._get_query_id_from_path(file)
+            X, y = self.text_samples.get_sample_Xy(query_id, max_lines=self.max_test_lines)
            sel_df = tests_df[tests_df.qid == int(query_id)]
            Xte, yte, score_te = get_text_label_score(sel_df, class_name, vectorizer, filter_classes=self.classes)
-            yield (Xtr, ytr, score_tr), (Xte, yte, score_te)
+            # if len(X)!=qp.environ['SAMPLE_SIZE']:
-
+            #     print(f'[warning]: file {file} contains {len(X)} instances (expected: {qp.environ["SAMPLE_SIZE"]})')
-    def _list_queries(self):
+            # assert len(X) == qp.environ['SAMPLE_SIZE'], f'unexpected sample size for file {file}, found {len(X)}'
-        return sorted(glob(join(self.class_home, 'training_Query*200SPLIT.json')))
+            X = self.vectorizer.transform(X)
-
+            try:
-    # def _get_test_sample(self, query_id, max_lines=-1):
+                test_sample = LabelledCollection(X, y, classes=train_sample.classes_)
-    #     df = self.test_rankings_df
+            except ValueError as e:
-    #     sel_df = df[df.qid==int(query_id)]
+                print(f'file {file} caused error {e}')
-    #     return get_text_label_score(sel_df)
+                yield None, None
        # texts = sel_df.text.values
        # try:
        #     labels = sel_df[self.class_name].values
        # except KeyError as e:
        #     print(f'error: key {self.class_name} not found in test rankings')
        #     raise e
        # if max_lines > 0 and len(texts) > max_lines:
        #     ranks = sel_df.rank.values
        #     idx = np.argsort(ranks)[:max_lines]
        #     texts = np.asarray(texts)[idx]
        #     labels = np.asarray(labels)[idx]
        # return texts, labels
    def total(self):
        return len(self._list_queries())
    def _get_query_id_from_path(self, path):
        prefix = 'training_Query-'
        posfix = 'Sample-200SPLIT'
        qid = path
        qid = qid[:qid.index(posfix)]
        qid = qid[qid.index(prefix) + len(prefix):]
        return qid
            # print('train #classes:', train_sample.n_classes, train_sample.prevalence())
            # print('test  #classes:', test_sample.n_classes, test_sample.prevalence())
            yield train_sample, test_sample
--- a/Retrieval/experiments.py
+++ b/Retrieval/experiments.py
@ -1,245 +0,0 @@
 import os.path
 import pickle
 from collections import defaultdict
 from pathlib import Path
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import LinearSVC
 import quapy as qp
 from Retrieval.commons import RetrievedSamples, load_sample
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
 from os.path import join
 from tqdm import tqdm
 from result_table.src.table import Table
 """
 In this sixth experiment, we have a collection C of >6M documents.
 We split C in two equally-sized pools TrPool, TePool
 I have randomly split the collection into 50% train and 50% test. In each split we have approx. 3.25 million documents. 
 We have 5 categories we can evaluate over: Continent, Years_Category, Num_Site_Links, Relative Pageviews and Gender. 
 From the training set I have created smaller subsets for each category:
 100K, 500K, 1M and FULL (3.25M) 
 For each category and subset, I have created a training set called: "classifier_training.json". This is the "base" training set for the classifier. In this set we have 500 documents per group in a category. (For example: Male 500, Female 500, Unknown 500).  Let me know if you think we need more. 
 To "bias" the quantifier towards a query, I have executed the queries (97) on the different training sets and retrieved the 200 most relevant documents per group. 
 For example: (Male 200, Female 200, Unknown 200) 
 Sometimes this is infeasible, we should probably discuss this at some point. 
 You can find the results for every query in a file named: 
 "training_Query-[QID]Sample-200SPLIT.json" 
 Test: 
 To evaluate our approach, I have executed the queries on the test split. You can find the results for all 97 queries up till k=1000 in this file. 
 testRanking_Results.json 
 """
 def methods(classifier, class_name):
    """
    Generates, lazily and in a fixed order, the (name, quantifier) pairs compared in the experiments.
    :param classifier: the classifier handed to every aggregative quantifier
    :param class_name: name of the target class; it selects the bandwidth of 'KDE-opt' and
        must be one of 'continent', 'gender', 'years_category' (a KeyError is raised otherwise,
        once the generator reaches 'KDE-opt')
    :return: yields tuples (method_name, quantifier)
    """
    # bandwidth used by 'KDE-opt' for each target class
    tuned_bandwidth = {'continent': 0.18, 'gender': 0.12, 'years_category': 0.09}
    def kde(bandwidth):
        # all KDEy variants share the same configuration but for the bandwidth
        return KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=bandwidth)
    yield ('Naive', Naive())
    yield ('NaiveQuery', Naive())
    yield ('CC', ClassifyAndCount(classifier))
    yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1))
    # currently disabled alternatives: PCC, ACC, the EMQ variants (plain, Platt, BCTS, TS, NBVS, VS, EMQh),
    # and KDEy with bandwidths 0.001, 0.005, 0.10, 'silverman' and 'scott'
    yield ('KDE-opt', kde(tuned_bandwidth[class_name]))
    fixed_bandwidths = [
        ('KDE01', 0.01),
        ('KDE02', 0.02),
        ('KDE03', 0.03),
        ('KDE04', 0.04),
        ('KDE05', 0.05),
        ('KDE07', 0.07),
    ]
    for method_label, bandwidth in fixed_bandwidths:
        yield (method_label, kde(bandwidth))
 def train_classifier(train_path):
    """
    Trains a classifier. To do so, it loads the training set and transforms it into a tfidf representation.
    The classifier is Logistic Regression, with hyperparameters C (range [0.0001, 0.001, ..., 10000]) and
    class_weight (range {'balanced', None}) optimized via 5FCV.
    Note that the target class is not a parameter: it is read from the module-level variable
    `class_name`, which must be set before this function is called.
    :param train_path: path to the json file containing the training documents
    :return: the tfidf-vectorizer and the classifier trained (the best estimator found by the
        grid search, refitted on the whole training set)
    """
    texts, labels = load_sample(train_path, class_name=class_name)
    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=3)
    Xtr = tfidf.fit_transform(texts)
    print(f'Xtr shape={Xtr.shape}')
    print('training classifier...', end='')
    model_selection = GridSearchCV(
        LogisticRegression(max_iter=5000),
        param_grid={'C': np.logspace(-4, 4, 9), 'class_weight': ['balanced', None]},
        n_jobs=-1,
        cv=5
    )
    model_selection.fit(Xtr, labels)
    # best_score_ and best_params_ are attributes of the search object, not of the refitted
    # estimator, so they have to be read from the GridSearchCV instance
    classifier = model_selection.best_estimator_
    classifier_acc = model_selection.best_score_
    print(f'[done] best-params={model_selection.best_params_} got {classifier_acc:.4f} score')
    training = LabelledCollection(Xtr, labels)
    print('training classes:', training.classes_)
    print('training prevalence:', training.prevalence())
    return tfidf, classifier
 def reduceAtK(data: LabelledCollection, k):
    """
    Builds a new collection containing only the first k elements of `data`
    (all of them if the collection has fewer than k elements).
    :param data: the LabelledCollection to truncate
    :param k: number of leading elements to keep
    :return: a LabelledCollection with the first k instances and labels, and the same classes as `data`
    """
    instances, labels = data.Xy
    return LabelledCollection(instances[:k], labels[:k], classes=data.classes_)
 def benchmark_name(class_name, k):
    """
    Returns the benchmark (row) label for a class name evaluated at rank cut-off k, in the form
    '<class_name>@<k>', where every underscore of the class name is prefixed with a backslash
    (presumably so that the label is safe in the LaTeX output generated via Table.LatexPDF --
    confirm against the table implementation).
    :param class_name: string, name of the target class (e.g., 'years_category')
    :param k: the rank cut-off
    :return: the benchmark label
    """
    # a raw string is required here: backslash-underscore is not a valid escape sequence in a normal literal
    scape_class_name = class_name.replace('_', r'\_')
    return f'{scape_class_name}@{k}'
 def run_experiment():
    """
    Evaluates one quantification method on all the (train, test) samples generated by the protocol.
    The function takes no arguments: it reads the module-level variables `Ks`, `experiment_prot`,
    `HALF`, `classifier_trained`, `method_name`, and `quantifier`, which must be set by the caller.
    :return: a dictionary with keys 'mae' and 'mrae'; each one maps every k in `Ks` to the list of
        errors (one value per sample generated by the protocol) obtained at rank cut-off k
    """
    results = {
        'mae': {k: [] for k in Ks},
        'mrae': {k: [] for k in Ks}
    }
    pbar = tqdm(experiment_prot(), total=experiment_prot.total())
    for train, test in pbar:
        # the relevance scores are unpacked but not used in this function
        Xtr, ytr, score_tr = train
        Xte, yte, score_te = test
        if HALF:
            # keeps only the first half of the training sample
            n = len(ytr) // 2
            train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier_trained.classes_)
        else:
            train_col = LabelledCollection(Xtr, ytr, classes=classifier_trained.classes_)
        if method_name not in ['Naive', 'NaiveQuery']:
            # the classifier is not re-trained (fit_classifier=False); the query-specific training
            # sample is passed both as the training set and as the validation split
            quantifier.fit(train_col, val_split=train_col, fit_classifier=False)
        elif method_name == 'Naive':
            quantifier.fit(train_col)
        # ('NaiveQuery' is not fitted here: it is re-fitted for every k in the loop below)
        test_col = LabelledCollection(Xte, yte, classes=classifier_trained.classes_)
        for k in Ks:
            test_k = reduceAtK(test_col, k)
            if method_name == 'NaiveQuery':
                # fitted on the first k training elements only
                train_k = reduceAtK(train_col, k)
                quantifier.fit(train_k)
            estim_prev = quantifier.quantify(test_k.instances)
            mae = qp.error.mae(test_k.prevalence(), estim_prev)
            # the smoothing factor eps is set to 1/(2k), i.e., it depends on the cut-off
            mrae = qp.error.mrae(test_k.prevalence(), estim_prev, eps=(1. / (2 * k)))
            results['mae'][k].append(mae)
            results['mrae'][k].append(mrae)
        pbar.set_description(f'{method_name}')
    return results
 # Root directory of the data.
 data_home = 'data'
 # If True, run_experiment uses only the first half of each query-specific training sample.
 HALF = True
 # Suffix appended to the results and latex directories (and to the PDF name) of this configuration.
 exp_posfix = '_half'
 # The names of the methods are obtained by exhausting the generator once (with no classifier).
 method_names = [name for name, *other in methods(None, 'continent')]
 # Rank cut-offs at which the test rankings are evaluated.
 Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
 for class_name in ['gender', 'continent', 'years_category']: # 'relative_pageviews_category', 'num_sitelinks_category']:
    tables_mae, tables_mrae = [], []
    benchmarks = [benchmark_name(class_name, k) for k in Ks]
    for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
        table_mae = Table(name=f'{class_name}-{data_size}-mae', benchmarks=benchmarks, methods=method_names)
        table_mrae = Table(name=f'{class_name}-{data_size}-mrae', benchmarks=benchmarks, methods=method_names)
        table_mae.format.mean_prec = 5
        table_mae.format.remove_zero = True
        table_mae.format.color_mode = 'global'
        tables_mae.append(table_mae)
        tables_mrae.append(table_mrae)
        class_home = join(data_home, class_name, data_size)
        # train_data_path = join(class_home, 'classifier_training.json')
        # classifier_path = join('classifiers', data_size, f'classifier_{class_name}.pkl')
        # the classifier is always the one trained on the 'FULL' pool, whatever the value of data_size
        train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <-------- fixed classifier
        classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')  # <------------ fixed classifier
        test_rankings_path = join(data_home, 'testRanking_Results.json')
        results_home = join('results'+exp_posfix, class_name, data_size)
        tfidf, classifier_trained = qp.util.pickled_resource(classifier_path, train_classifier, train_data_path)
        experiment_prot = RetrievedSamples(
            class_home,
            test_rankings_path,
            vectorizer=tfidf,
            class_name=class_name,
            classes=classifier_trained.classes_
        )
        # method_name and quantifier are read as globals by run_experiment
        for method_name, quantifier in methods(classifier_trained, class_name):
            results_path = join(results_home, method_name + '.pkl')
            if os.path.exists(results_path):
                print(f'Method {method_name=} already computed')
                # NOTE: unpickling executes arbitrary code; only load result files produced by this script
                with open(results_path, 'rb') as fin:
                    results = pickle.load(fin)
            else:
                results = run_experiment()
                os.makedirs(Path(results_path).parent, exist_ok=True)
                # the file is closed (and thus flushed) as soon as the results are dumped
                with open(results_path, 'wb') as fout:
                    pickle.dump(results, fout, pickle.HIGHEST_PROTOCOL)
            for k in Ks:
                table_mae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mae'][k])
                table_mrae.add(benchmark=benchmark_name(class_name, k), method=method_name, v=results['mrae'][k])
        # the PDF is regenerated after every data size with the tables accumulated so far (only MRAE is reported)
        # Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mae+tables_mrae)
        Table.LatexPDF(f'./latex{exp_posfix}/{class_name}{exp_posfix}.pdf', tables=tables_mrae)
--- a/Retrieval/deprecated_code/fifth.py
+++ b/Retrieval/deprecated_code/fifth.py
--- a/Retrieval/deprecated_code/fourth.py
+++ b/Retrieval/deprecated_code/fourth.py
--- a/Retrieval/kdey_bandwith_selection.py
+++ b/Retrieval/kdey_bandwith_selection.py
@ -1,77 +0,0 @@
 import itertools
 import os.path
 import pickle
 from collections import defaultdict
 from pathlib import Path
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import LinearSVC
 import quapy as qp
 from Retrieval.commons import RetrievedSamples, load_sample
 from quapy.protocol import UPP
 from quapy.method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.model_selection import GridSearchQ
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.data.base import LabelledCollection
 from os.path import join
 from tqdm import tqdm
 from result_table.src.table import Table
 """
 """
 # Root directory of the data.
 data_home = 'data'
 # Target classes for which the bandwidth of KDEy is optimized.
 datasets = ['continent', 'gender', 'years_category'] #, 'relative_pageviews_category', 'num_sitelinks_category']
 for class_name in datasets:
    train_data_path = join(data_home, class_name, 'FULL', 'classifier_training.json')  # <-------- fixed classifier
    texts, labels = load_sample(train_data_path, class_name=class_name)
    classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
    # the (vectorizer, classifier) pair must have been generated beforehand
    # NOTE: unpickling executes arbitrary code; only load classifier files produced by this project
    with open(classifier_path, 'rb') as fin:
        tfidf, classifier_trained = pickle.load(fin)
    classifier_hyper = classifier_trained.get_params()
    print(f'{classifier_hyper=}')
    X = tfidf.transform(texts)
    print(f'Xtr shape={X.shape}')
    pool = LabelledCollection(X, labels)
    # half of the pool is used for training, and the other half for generating validation samples
    train, val = pool.split_stratified(train_prop=0.5, random_state=0)
    q = KDEyML(LogisticRegression())
    # the classifier grid contains the C value of the pickled classifier plus the value 1e-8,
    # and keeps the class_weight of the pickled classifier
    classifier_hyper = {'classifier__C':[classifier_hyper['C'], 0.00000001], 'classifier__class_weight':[classifier_hyper['class_weight']]}
    # 20 equally-spaced bandwidths in [0.01, 0.2]
    quantifier_hyper = {'bandwidth': np.linspace(0.01, 0.2, 20)}
    hyper = {**classifier_hyper, **quantifier_hyper}
    qp.environ['SAMPLE_SIZE'] = 100
    modsel = GridSearchQ(
        model=q,
        param_grid=hyper,
        protocol=UPP(val, sample_size=100),
        n_jobs=-1,
        error='mrae',
        verbose=True
    )
    modsel.fit(train)
    print(class_name)
    print(f'{modsel.best_params_}')
    print(f'{modsel.best_score_}')
--- a/Retrieval/deprecated_code/preliminary_.py
+++ b/Retrieval/deprecated_code/preliminary_.py
--- a/Retrieval/deprecated_code/second.py
+++ b/Retrieval/deprecated_code/second.py
--- a/Retrieval/deprecated_code/third.py
+++ b/Retrieval/deprecated_code/third.py
--- a/Retrieval/relscore_distribution.py
+++ b/Retrieval/relscore_distribution.py
@ -1,105 +0,0 @@
 import os.path
 import pickle
 from collections import defaultdict
 from itertools import zip_longest
 from pathlib import Path
 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.svm import LinearSVC
 import quapy as qp
 import quapy.functional as F
 from Retrieval.commons import RetrievedSamples, load_sample
 from method.non_aggregative import MaximumLikelihoodPrevalenceEstimation as Naive
 from quapy.method.aggregative import ClassifyAndCount, EMQ, ACC, PCC, PACC, KDEyML
 from quapy.protocol import AbstractProtocol
 from quapy.data.base import LabelledCollection
 from glob import glob
 from os.path import join
 from tqdm import tqdm
 from result_table.src.table import Table
 import numpy as np
 import matplotlib.pyplot as plt
 """
 Plots the distribution of (predicted) relevance score for the test samples and for the training samples wrt:
 - training pool size (100K, 500K, 1M, FULL)
 - rank  
 """
 # Root directory of the data.
 data_home = 'data'
 Ks = [5, 10, 25, 50, 75, 100, 250, 500, 750, 1000]
 # One plot per target class: mean relevance score (with standard error) as a function of the rank.
 for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']:
    test_added = False
    # Mtrs / Mtes: one list of score arrays per curve; source: the training pool size of each training curve
    Mtrs, Mtes, source = [], [], []
    for data_size in ['10K', '50K', '100K', '500K', '1M', 'FULL']:
        class_home = join(data_home, class_name, data_size)
        classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl')
        test_rankings_path = join(data_home, 'testRanking_Results.json')
        # only the classes_ of the pickled classifier are needed here
        # NOTE: unpickling executes arbitrary code; only load classifier files produced by this project
        with open(classifier_path, 'rb') as fin:
            _, classifier = pickle.load(fin)
        # no vectorizer is passed, since only the relevance scores are used
        experiment_prot = RetrievedSamples(
            class_home,
            test_rankings_path,
            vectorizer=None,
            class_name=class_name,
            classes=classifier.classes_
        )
        Mtr = []
        Mte = []
        pbar = tqdm(experiment_prot(), total=experiment_prot.total())
        for train, test in pbar:
            Xtr, ytr, score_tr = train
            Xte, yte, score_te = test
            Mtr.append(score_tr)
            Mte.append(score_te)
        Mtrs.append(Mtr)
        # the test scores are added only once (those collected for the first data size)
        if not test_added:
            Mtes.append(Mte)
            test_added = True
        source.append(data_size)
    fig, ax = plt.subplots()
    train_source = ['train-'+s for s in source]
    Ms = list(zip(Mtrs, train_source))+list(zip(Mtes, ['test']))
    # the loop variable is called curve_label so that it does not shadow the list `source`
    for M, curve_label in Ms:
        # score arrays of different lengths are padded with NaN: M has shape (#samples, max #docs)
        M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T
        num_rep, num_docs = M.shape
        mean_values = np.nanmean(M, axis=0)
        # the standard error at each rank only counts the samples that actually reach that rank
        n_filled = np.count_nonzero(~np.isnan(M), axis=0)
        std_errors = np.nanstd(M, axis=0) / np.sqrt(n_filled)
        line = ax.plot(range(num_docs), mean_values, '-', label=curve_label, color=None)
        color = line[-1].get_color()
        ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color)
    ax.set_xlabel('Doc. Rank')
    ax.set_ylabel('Rel. Score')
    ax.set_title(class_name)
    ax.legend()
    # plt.show()
    os.makedirs('plots', exist_ok=True)
    plotpath = f'plots/{class_name}.pdf'
    print(f'saving plot in {plotpath}')
    plt.savefig(plotpath)
--- a/Retrieval/tabular.py
+++ b/Retrieval/tabular.py
@ -0,0 +1,427 @@
 import os.path
 import numpy as np
 import itertools
 from scipy.stats import ttest_ind_from_stats, wilcoxon
 from pathlib import Path
 from os.path import join
 class Table:
    """
    Grid of experimental results (rows are benchmarks, columns are methods) that can be rendered in LaTeX.
    Each cell stores the raw sample of values registered through `add`. Derived per-cell quantities
    (fill mask, mean, std, number of observations, rank, cell colour and outcome of the statistical test)
    are kept in `self.map` and are recomputed lazily, the first time they are read after a modification.
    """
    VALID_TESTS = [None, "wilcoxon", "ttest"]
    def __init__(self, benchmarks, methods, lower_is_better=True, ttest='ttest', prec_mean=3,
                 clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
                 color=True, color_mode='local', maxtone=50):
        """
        :param benchmarks: sequence of row names
        :param methods: sequence of column names
        :param lower_is_better: whether smaller means rank first (and are coloured green)
        :param ttest: statistical test against the best method of each row; one of `VALID_TESTS`
        :param prec_mean: number of decimals used when printing the mean
        :param clean_zero: if True, the leading ' 0.' of printed numbers is shortened to '.'
        :param show_std: whether the standard deviation is printed next to the mean
        :param prec_std: number of decimals used when printing the standard deviation
        :param average: whether an additional row with the across-benchmarks average is maintained
        :param missing: value returned by `get` for empty cells
        :param missing_str: string printed by `latex` for empty cells
        :param color: whether the cell colour command is appended to the LaTeX cell
        :param color_mode: 'local' scales colours within each row, 'global' across the whole table
        :param maxtone: maximum colour intensity, forwarded to `color_red2green_01`
        """
        assert ttest in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
        self.benchmarks = np.asarray(benchmarks)
        self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
        self.methods = np.asarray(methods)
        self.method_index = {col: j for j, col in enumerate(methods)}
        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
        self.map = {}
        self._addmap('values', dtype=object)
        self.lower_is_better = lower_is_better
        self.ttest = ttest
        self.prec_mean = prec_mean
        self.clean_zero = clean_zero
        self.show_std = show_std
        self.prec_std = prec_std
        self.add_average = average
        self.missing = missing
        self.missing_str = missing_str
        self.color = color
        self.color_mode = color_mode
        self.maxtone = maxtone
        self.touch()
    @property
    def nbenchmarks(self):
        """Number of rows."""
        return len(self.benchmarks)
    @property
    def nmethods(self):
        """Number of columns."""
        return len(self.methods)
    def touch(self):
        """Flags the derived maps as stale, so that the next read triggers `compute`."""
        self._modif = True
    def update(self):
        """Recomputes the derived maps only if something changed since the last computation."""
        if self._modif:
            self.compute()
    def _getfilled(self):
        """Returns the (row, col) index pairs of the cells holding values (requires the 'fill' map)."""
        return np.argwhere(self.map['fill'])
    @property
    def values(self):
        """The (#rows,#cols) object array with the raw samples (None in empty cells)."""
        return self.map['values']
    def _indexes(self):
        """Iterates over every (row, col) index pair of the table."""
        return itertools.product(range(self.nbenchmarks), range(self.nmethods))
    def _addmap(self, map, dtype, func=None):
        """
        Allocates `self.map[map]` as an uninitialised (#rows,#cols) array and, if `func` is given,
        sets each cell to `func(values)`. The 'fill' map is evaluated on every cell, whereas any
        other map is evaluated only on the cells marked as filled.
        """
        self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
        if func is None:
            return
        m = self.map[map]
        indexes = self._indexes() if map == 'fill' else self._getfilled()
        for i, j in indexes:
            m[i, j] = func(self.values[i, j])
    def _addrank(self):
        """Ranks, within each row, the filled cells by their mean (rank 1 is the best one)."""
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
            ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
            if not self.lower_is_better:
                ranked_cols_idx = ranked_cols_idx[::-1]
            self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)
    def _addcolor(self):
        """
        Assigns a colour command to every filled cell, by min-max scaling its mean either within
        its row (`color_mode='local'`) or across the whole table (`color_mode='global'`).
        """
        minval = {}
        maxval = {}
        if self.color_mode == 'global':
            filled_idx = np.argwhere(self.map['fill'])
            if len(filled_idx) > 0:
                all_means = [self.map['mean'][i, j] for i, j in filled_idx]
                global_minval = min(all_means)
                global_maxval = max(all_means)
                for i in range(self.nbenchmarks):
                    minval[i] = global_minval
                    maxval[i] = global_maxval
        elif self.color_mode == 'local':
            for i in range(self.nbenchmarks):
                # the whole row i has to be inspected (the former [i, i + 1] index selected one single cell)
                filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
                if len(filled_cols_idx) > 0:
                    col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
                    minval[i] = min(col_means)
                    maxval[i] = max(col_means)
        else:
            print(f'color mode {self.color_mode} not understood, valid ones are "local" and "global"; skip')
            return
        for i in range(self.nbenchmarks):
            if i not in maxval or i not in minval:
                continue  # empty row: nothing to colour
            norm = maxval[i] - minval[i]
            for col_idx in np.argwhere(self.map['fill'][i]).flatten():
                val = self.map['mean'][i, col_idx]
                if norm > 0:
                    normval = (val - minval[i]) / norm
                else:
                    normval = 0.5  # all means coincide: use the neutral tone
                if self.lower_is_better:
                    normval = 1 - normval
                normval = np.clip(normval, 0, 1)
                self.map['color'][i, col_idx] = color_red2green_01(normval, self.maxtone)
    def _run_ttest(self, row, col1, col2):
        """Returns the p-value of an independent two-sample t-test computed from the cached statistics."""
        mean1 = self.map['mean'][row, col1]
        std1 = self.map['std'][row, col1]
        nobs1 = self.map['nobs'][row, col1]
        mean2 = self.map['mean'][row, col2]
        std2 = self.map['std'][row, col2]
        nobs2 = self.map['nobs'][row, col2]
        _, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
        return p_val
    def _run_wilcoxon(self, row, col1, col2):
        """Returns the p-value of a Wilcoxon signed-rank test on the raw samples of both cells."""
        values1 = self.map['values'][row, col1]
        values2 = self.map['values'][row, col2]
        try:
            _, p_val = wilcoxon(values1, values2)
        except ValueError:
            # NOTE(review): a test that cannot be run is reported as p=0 (i.e., 'Diff'); confirm this is intended
            p_val = 0
        return p_val
    def _add_statistical_test(self):
        """
        Compares, within each row, every filled cell against the best one of the row and stores the
        interpretation of the p-value in the 'ttest' map. The best cell is the one ranked first, so
        `lower_is_better` is honoured and the reference coincides with the cell printed in bold.
        Requires the 'rank' map (see `_addrank`).
        """
        if self.ttest is None:
            return
        self.some_similar = [False] * self.nmethods
        for i in range(self.nbenchmarks):
            filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
            if len(filled_cols_idx) <= 1:
                continue
            # ranks of the filled cells are 1..k, hence the argmin points to the best method
            best_pos = filled_cols_idx[np.argmin(self.map['rank'][i, filled_cols_idx])]
            for j in filled_cols_idx:
                if j == best_pos:
                    continue
                if self.ttest == 'ttest':
                    p_val = self._run_ttest(i, best_pos, j)
                else:
                    p_val = self._run_wilcoxon(i, best_pos, j)
                pval_outcome = pval_interpretation(p_val)
                self.map['ttest'][i, j] = pval_outcome
                if pval_outcome != 'Diff':
                    self.some_similar[j] = True
    def compute(self):
        """Rebuilds every derived map from the raw values and clears the stale flag."""
        self._addmap('fill', dtype=bool, func=lambda x: x is not None)
        self._addmap('mean', dtype=float, func=np.mean)
        self._addmap('std', dtype=float, func=np.std)
        self._addmap('nobs', dtype=float, func=len)
        self._addmap('rank', dtype=int, func=None)
        self._addmap('color', dtype=object, func=None)
        self._addmap('ttest', dtype=object, func=None)
        self._addmap('latex', dtype=object, func=None)
        self._addrank()  # must precede the statistical test, which relies on the ranks
        self._addcolor()
        self._add_statistical_test()
        if self.add_average:
            self._addave()
        self._modif = False
    def _is_column_full(self, col):
        """Tells whether the method `col` has values for every benchmark."""
        return all(self.map['fill'][:, self.method_index[col]])
    def _addave(self):
        """
        Builds the one-row table `self.average`, which inherits the formatting options of this table.
        A method gets an average only if its column is full; in that case the average cell receives
        the concatenation of all the values of the column.
        """
        ave = Table(['ave'], self.methods,
                    lower_is_better=self.lower_is_better,
                    ttest=self.ttest,
                    average=False,
                    missing=self.missing,
                    missing_str=self.missing_str,
                    prec_mean=self.prec_mean,
                    prec_std=self.prec_std,
                    clean_zero=self.clean_zero,
                    show_std=self.show_std,
                    color=self.color,
                    color_mode=self.color_mode,
                    maxtone=self.maxtone)
        for col in self.methods:
            values = None
            if self._is_column_full(col):
                values = np.concatenate(self.values[:, self.method_index[col]])
            ave.add('ave', col, values)
        self.average = ave
    def add(self, benchmark, method, values):
        """
        Sets the sample of values of a cell, replacing any previous content.
        :param benchmark: row name
        :param method: column name
        :param values: a scalar, an array-like of scalars, or None to leave the cell empty
        """
        if values is not None:
            values = np.asarray(values)
            if values.ndim == 0:
                values = values.flatten()  # a scalar is stored as a 1-element array
        rid, cid = self._coordinates(benchmark, method)
        self.map['values'][rid, cid] = values
        self.touch()
    def get(self, benchmark, method, attr='mean'):
        """
        Returns the attribute `attr` (a key of `self.map`) of a cell, or `self.missing` if the cell
        is empty or if the attribute is None or NaN.
        """
        self.update()
        assert attr in self.map, f'unknwon attribute {attr}'
        rid, cid = self._coordinates(benchmark, method)
        if not self.map['fill'][rid, cid]:
            return self.missing
        v = self.map[attr][rid, cid]
        if v is None or (isinstance(v, float) and np.isnan(v)):
            return self.missing
        return v
    def _coordinates(self, benchmark, method):
        """Translates a (benchmark, method) pair of names into the (row, col) indexes of the cell."""
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
        assert method in self.method_index, f'method {method} out of range'
        rid = self.benchmark_index[benchmark]
        cid = self.method_index[method]
        return rid, cid
    def get_average(self, method, attr='mean'):
        """Returns the attribute `attr` of the average of `method`, or None if averages are disabled."""
        self.update()
        if self.add_average:
            return self.average.get('ave', method, attr=attr)
        return None
    def get_color(self, benchmark, method):
        """Returns the colour command of a cell, or the empty string if there is none."""
        color = self.get(benchmark, method, attr='color')
        if color is None:
            return ''
        return color
    def latex(self, benchmark, method):
        """
        Renders one cell in LaTeX: the mean (in bold if ranked first), optionally followed by the
        mark of the statistical test, the standard deviation and the cell colour command.
        Empty cells are rendered as `self.missing_str`.
        """
        self.update()
        i, j = self._coordinates(benchmark, method)
        if not self.map['fill'][i, j]:
            return self.missing_str
        mean = self.map['mean'][i, j]
        l = f" {mean:.{self.prec_mean}f}"
        if self.clean_zero:
            l = l.replace(' 0.', '.')
        isbest = self.map['rank'][i, j] == 1
        if isbest:
            l = "\\textbf{" + l.strip() + "}"
        stat = ''
        if self.ttest is not None:
            # the phantom mark keeps columns aligned when no mark is printed
            stat = r'^{\phantom{\ddag}}'
            if self.some_similar[j]:
                test_label = self.map['ttest'][i, j]
                if test_label == 'Sim':
                    stat = r'^{\dag}'
                elif test_label == 'Same':
                    stat = r'^{\ddag}'
        std = ''
        if self.show_std:
            std = self.map['std'][i, j]
            std = f" {std:.{self.prec_std}f}"
            if self.clean_zero:
                std = std.replace(' 0.', '.')
            std = f"\\pm {std:{self.prec_std}}"
        if stat != '' or std != '':
            l = f'{l}${stat}{std}$'
        if self.color:
            cellcolor = self.map['color'][i, j]
            if cellcolor is not None:  # no colour is available, e.g., for NaN means or unknown colour modes
                l += ' ' + cellcolor
        return l
    def latexPDF(self, path, name:str, *args, **kwargs):
        """
        Saves the LaTeX document as `path/name` (the '.tex' extension is added if missing), compiles
        it with pdflatex inside `path`, and removes the auxiliary files generated by the compilation.
        The working directory of the calling process is left untouched.
        """
        import subprocess  # local import: only needed by this method
        if not name.endswith('.tex'):
            name += '.tex'
        self.latexSaveDocument(join(path, name), *args, **kwargs)
        print("[Tables Done] runing latex")
        try:
            # an argument list (no shell) run in `path` replaces the former chdir + os.system pair
            subprocess.run(['pdflatex', name], cwd=path or None, check=False)
        except OSError as e:
            print(f'could not run pdflatex: {e}')
            return
        basename = name[:-len('.tex')]
        for ext in ('aux', 'bbl', 'blg', 'log', 'out', 'dvi'):
            try:
                os.remove(join(path, f'{basename}.{ext}'))
            except FileNotFoundError:
                pass  # not every auxiliary file is generated in every run
    def latexSaveDocument(self, path, *args, **kwargs):
        """Writes the full LaTeX document to the file `path`, creating the parent directories if needed."""
        document = self.latexDocument(*args, **kwargs)
        parent = Path(path).parent
        os.makedirs(parent, exist_ok=True)
        with open(path, 'wt') as foo:
            foo.write(document)
        print('text file save at ', path)
    def latexDocument(self, *args, **kwargs):
        """Returns a self-contained LaTeX document wrapping the table generated by `latexTable`."""
        document = """
 \\documentclass[10pt,a4paper]{article}
 \\usepackage[utf8]{inputenc}
 \\usepackage{amsmath}
 \\usepackage{amsfonts}
 \\usepackage{amssymb}
 \\usepackage{graphicx}
 \\usepackage{xcolor}
 \\usepackage{colortbl}
 \\begin{document}
        """
        document += self.latexTable(*args, **kwargs)
        document += "\n\\end{document}\n"
        return document
    def latexTable(self, benchmark_replace=None, method_replace=None, aslines=False, endl='\\\\\\hline', resizebox=True):
        """
        Returns the table environment enclosing the tabular generated by `latexTabular`.
        :param benchmark_replace: dict mapping row names to the names to be printed (optional)
        :param method_replace: dict mapping column names to the names to be printed (optional)
        :param aslines: forwarded to `latexTabular`
        :param endl: string closing every row
        :param resizebox: if True, the resizebox commands are activated by removing their '%%%' prefix
        """
        table = """
        \\begin{table}
        \\center
        %%%\\resizebox{\\textwidth}{!}{% \n
        """
        table += "\n\\begin{tabular}{|c" + "|c" * self.nmethods + "|}\n"
        table += self.latexTabular(benchmark_replace, method_replace, aslines, endl)
        table += "\n\\end{tabular}\n"
        table += """
        %%%}%
        \\end{table}
        """
        if resizebox:
            table = table.replace("%%%", "")
        return table
    def latexTabular(self, benchmark_replace=None, method_replace=None, aslines=False, endl='\\\\\\hline'):
        """
        Returns the rows of the tabular: a header with the method names, one row per benchmark and,
        if averages are enabled, a final row with the averages.
        :param benchmark_replace: dict mapping row names to the names to be printed (optional)
        :param method_replace: dict mapping column names to the names to be printed (optional)
        :param aslines: if True returns the list of rows, otherwise a single newline-joined string
        :param endl: string closing every row (but the header)
        """
        benchmark_replace = {} if benchmark_replace is None else benchmark_replace
        method_replace = {} if method_replace is None else method_replace
        lines = []
        l = '\\multicolumn{1}{c|}{} & '
        l += ' & '.join([method_replace.get(col, col) for col in self.methods])
        l += ' \\\\\\hline'
        lines.append(l)
        for row in self.benchmarks:
            rowname = benchmark_replace.get(row, row)
            l = rowname + ' & '
            l += self.latexRow(row, endl=endl)
            lines.append(l)
        if self.add_average:
            l = '\\hline \n \\textit{Average} & '
            l += self.latexAverage(endl=endl)
            lines.append(l)
        if not aslines:
            lines = '\n'.join(lines)
        return lines
    def latexRow(self, benchmark, endl='\\\\\\hline\n'):
        """Returns the LaTeX cells of the row `benchmark`, separated by ' & ' and closed by `endl`."""
        s = [self.latex(benchmark, col) for col in self.methods]
        s = ' & '.join(s)
        s += ' ' + endl
        return s
    def latexAverage(self, endl='\\\\\\hline\n'):
        """Returns the LaTeX cells of the average row, or None if averages are disabled."""
        if self.add_average:
            return self.average.latexRow('ave', endl=endl)
    def getRankTable(self, prec_mean=0):
        """Returns a new table, with the same rows and columns, whose values are the ranks of this table."""
        self.update()  # the 'fill' and 'rank' maps must exist before being read
        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=prec_mean, average=True, maxtone=self.maxtone, ttest=None)
        for rid, cid in self._getfilled():
            row = self.benchmarks[rid]
            col = self.methods[cid]
            t.add(row, col, self.get(row, col, 'rank'))
        t.compute()
        return t
    def dropMethods(self, methods):
        """Removes the columns of the given methods (in place), keeping the order of the remaining ones."""
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
        new_index = {col: j for j, col in enumerate(new_methods)}
        self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
        self.methods = new_methods
        self.method_index = new_index
        self.touch()
 def pval_interpretation(p_val):
    """
    Maps a p-value onto a label: 'Diff' if p<=0.005, 'Sim' if 0.005<p<=0.05, and 'Same' if p>0.05.
    Values that satisfy none of the comparisons (e.g., NaN) yield None.
    """
    if p_val <= 0.005:
        return 'Diff'
    if p_val <= 0.05:
        return 'Sim'
    if p_val > 0.05:
        return 'Same'
    return None
 def color_red2green_01(val, maxtone=50):
    """
    Returns the LaTeX cellcolor command for a score in [0,1]: scores below 0.5 are red, the rest
    are green, and the tone grows linearly with the distance to 0.5 up to `maxtone`.
    :param val: score in [0,1]; NaN yields None
    :param maxtone: tone assigned to the extreme scores 0 and 1
    :return: the command as a str, or None if `val` is NaN
    """
    if np.isnan(val):
        return None
    assert 0 <= val <= 1, f'val {val} out of range [0,1]'
    # rescale to [-1,1]: the sign selects the colour, the magnitude selects the tone
    signed = val * 2 - 1
    color = 'red' if signed < 0 else 'green'
    tone = int(maxtone * abs(signed))
    # the backslash is written explicitly: '\c' is an invalid escape sequence
    return f'\\cellcolor{{{color}!{tone}}}'
--- a/Retrieval/tmp.py
+++ b/Retrieval/tmp.py
@ -1,27 +0,0 @@
 import pandas as pd
 from os.path import join
 from Retrieval.commons import load_json_sample
 from quapy.data import LabelledCollection
 # Scratch script: loads one training sample and the test rankings file, printing basic information.
 # Location of the sample: <data_home>/<CLASS_NAME>/<datasize>/<file name>
 data_home = 'data'
 CLASS_NAME = 'continent'
 datasize = '100K'
 file_path = join(data_home, CLASS_NAME, datasize, 'training_Query-84Sample-200SPLIT.json')
 # load_json_sample is a project helper; the two returned values are used as instances and labels below
 text, classes = load_json_sample(file_path, CLASS_NAME)
 data = LabelledCollection(text, classes)
 # show the set of classes and the class prevalence of the loaded sample
 print(data.classes_)
 print(data.prevalence())
 print('done')
 # the test rankings file sits directly under data_home; it is only read to check that it can be parsed
 test_ranking_path = join(data_home, 'testRanking_Results.json')
 # obj = json.load(open(test_ranking_path))
 df = pd.read_json(test_ranking_path)
 print('done')
--- a/Retrieval/understand_classif_scheme.py
+++ b/Retrieval/understand_classif_scheme.py
@ -0,0 +1,66 @@
 import numpy as np
 import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
 from sklearn.metrics import make_scorer, f1_score
 from sklearn.svm import LinearSVC
 from quapy.data.base import LabelledCollection
 from sklearn.model_selection import cross_val_score, GridSearchCV
 from os.path import join
 """
 In this experiment, I simply try to understand whether the learning task can be learned or not.
 The problem is that we are quantifying the categories based on the alphabetical order (of what?).  
 """
 def load_txt_sample(path, parse_columns, verbose=False, max_lines=None, class_name='continent'):
    """
    Loads the texts and the labels from a tab-separated file with (at least) the columns 'text'
    and `class_name`.
    :param path: path to the tab-separated file
    :param parse_columns: if True, the documents are sorted by increasing value of the column 'rank'
    :param verbose: whether to print progress messages
    :param max_lines: if not None, only the first `max_lines` documents (after sorting) are returned
    :param class_name: name of the column containing the labels (defaults to 'continent')
    :return: a tuple (X, y) with the texts and the labels
    """
    if verbose:
        print(f'loading {path}...', end='')
    df = pd.read_csv(path, sep='\t')
    if verbose:
        print('[done]')
    X = df['text'].values
    y = df[class_name].values
    if parse_columns:
        # only the ordering induced by the rank is needed; ranks and scores themselves are not returned
        order = np.argsort(df['rank'].values)
        X = X[order]
        y = y[order]
    if max_lines is not None:
        X = X[:max_lines]
        y = y[:max_lines]
    return X, y
 # Sanity check: trains a classifier on a subsample of the training file and reports F1 on held-out data.
 data_path = './50_50_split_trec'
 train_path = join(data_path, 'train_50_50_continent.txt')
 # TF-IDF with sublinear tf scaling; min_df=10 discards terms occurring in fewer than 10 documents
 tfidf = TfidfVectorizer(sublinear_tf=True, min_df=10)
 # the extra keyword arguments (verbose, parse_columns) are meant for load_txt_sample
 data = LabelledCollection.load(train_path, loader_func=load_txt_sample, verbose=True, parse_columns=False)
 # work on a sample of size 20000, later split into a training part and a test part
 data = data.sampling(20000)
 train, test = data.split_stratified()
 # the vectorizer is fit on the training texts only, and then applied to the test texts
 train.instances = tfidf.fit_transform(train.instances)
 test.instances  = tfidf.transform(test.instances)
 # svm = LinearSVC()
 # cls = GridSearchCV(svm, param_grid={'C':np.logspace(-3,3,7), 'class_weight':['balanced', None]})
 cls = LogisticRegression()
 cls.fit(*train.Xy)
 # score = cross_val_score(LogisticRegressionCV(), *data.Xy, scoring=make_scorer(f1_score, average='macro'), n_jobs=-1, cv=5)
 # print(score)
 # print(np.mean(score))
 # evaluate on the held-out part in terms of macro- and micro-averaged F1
 y_pred = cls.predict(test.instances)
 macrof1 = f1_score(y_true=test.labels, y_pred=y_pred, average='macro')
 microf1 = f1_score(y_true=test.labels, y_pred=y_pred, average='micro')
 print('macro', macrof1)
 print('micro', microf1)
--- a/quapy/functional.py
+++ b/quapy/functional.py
@ -141,19 +141,6 @@ def uniform_prevalence_sampling(n_classes, size=1):
    return u
 def uniform_prevalence(n_classes):
    """
    Builds the prevalence vector of the uniform distribution over `n_classes` classes

    :param n_classes: number of classes (a positive integer)
    :return: np.ndarray of length `n_classes` in which every entry equals 1/n_classes
    """
    is_positive_int = isinstance(n_classes, int) and n_classes>0
    assert is_positive_int, \
        (f'param {n_classes} not understood; must be a positive integer representing the '
         f'number of classes ')
    equal_share = 1./n_classes
    return np.full(n_classes, equal_share)
 uniform_simplex_sampling = uniform_prevalence_sampling
--- a/quapy/method/_kdey.py
+++ b/quapy/method/_kdey.py
@ -62,13 +62,7 @@ class KDEBase:
        :param bandwidth: float, the bandwidth of the kernel
        :return: a list of KernelDensity objects, each fitted with the corresponding class-specific covariates
        """
-        class_cond_X = []
+        return [self.get_kde_function(X[y == cat], bandwidth) for cat in classes]
        for cat in classes:
            selX = X[y==cat]
            if selX.size==0:
                selX = [F.uniform_prevalence(len(classes))]
            class_cond_X.append(selX)
        return [self.get_kde_function(X_cond_yi, bandwidth) for X_cond_yi in class_cond_X]
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -2,7 +2,7 @@ from abc import ABC, abstractmethod
 from copy import deepcopy
 from typing import Callable, Union
 import numpy as np
-from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling, PlattScaling
+from abstention.calibration import NoBiasVectorScaling, TempScaling, VectorScaling
 from scipy import optimize
 from sklearn.base import BaseEstimator
 from sklearn.calibration import CalibratedClassifierCV
@ -636,35 +636,20 @@ class EMQ(AggregativeSoftQuantifier):
                calibrator = TempScaling()
            elif self.recalib == 'vs':
                calibrator = VectorScaling()
            elif self.recalib == 'platt':
                calibrator = CalibratedClassifierCV(estimator=self.classifier, cv='prefit')
            else:
                raise ValueError('invalid param argument for recalibration method; available ones are '
                                 '"nbvs", "bcts", "ts", and "vs".')
            if not np.issubdtype(y.dtype, np.number):
                y = np.searchsorted(data.classes_, y)
-
+            self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
            if self.recalib == 'platt':
                self.classifier = calibrator.fit(*data.Xy)
            else:
                print(classif_predictions.prevalence())
                try:
                    self.calibration_function = calibrator(P, np.eye(data.n_classes)[y], posterior_supplied=True)
                except RuntimeError as e:
                    print(e)
                    print('defaults to I')
                    self.calibration_function = lambda P:P
        if self.exact_train_prev:
            self.train_prevalence = data.prevalence()
        else:
            train_posteriors = classif_predictions.X
            if self.recalib is not None:
-                if self.recalib == 'platt':
+                train_posteriors = self.calibration_function(train_posteriors)
                    train_posteriors = self.classifier.predict_proba(train_posteriors)
                else:
                    train_posteriors = self.calibration_function(train_posteriors)
            self.train_prevalence = F.prevalence_from_probabilities(train_posteriors)
    def aggregate(self, classif_posteriors, epsilon=EPSILON):