From 9a08125e7ea86e3de1bdc32985a6969263fa63fa Mon Sep 17 00:00:00 2001
From: Alex Moreo
Date: Mon, 25 Oct 2021 13:37:22 +0200
Subject: [PATCH] evaluation script and format checker added

---
 LeQua2022/TODO.txt              |   3 +-
 LeQua2022/constants.py          |   6 ++
 LeQua2022/data.py               | 112 ++++++++++++++++++++------------
 LeQua2022/evaluation.py         |  41 ++++++++++++
 LeQua2022/format_checker.py     |  27 ++++++++
 LeQua2022/main_binary_vector.py |  60 ++++++-----------
 quapy/data/preprocessing.py     |   2 +-
 7 files changed, 166 insertions(+), 85 deletions(-)
 create mode 100644 LeQua2022/constants.py
 create mode 100644 LeQua2022/evaluation.py
 create mode 100644 LeQua2022/format_checker.py

diff --git a/LeQua2022/TODO.txt b/LeQua2022/TODO.txt
index e51cf0d..2bc944e 100644
--- a/LeQua2022/TODO.txt
+++ b/LeQua2022/TODO.txt
@@ -5,4 +5,5 @@
 5. plots
 6. I am reading the samples in order, and there is no need to. It would be better to have a generic
 function that reads all the examples and in any case generates an output with the same file name
-7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly
\ No newline at end of file
+7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly
+8. I am not convinced that the samples (in the case where there is no ground truth) are read in random order
diff --git a/LeQua2022/constants.py b/LeQua2022/constants.py
new file mode 100644
index 0000000..1162e12
--- /dev/null
+++ b/LeQua2022/constants.py
@@ -0,0 +1,6 @@
+DEV_SAMPLES = 1000
+TEST_SAMPLES = 5000
+
+T1A_SAMPLE_SIZE = 250
+
+ERROR_TOL = 1E-3
diff --git a/LeQua2022/data.py b/LeQua2022/data.py
index 2d99120..815fc30 100644
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -7,6 +7,9 @@ import quapy as qp
 import numpy as np
 import sklearn
 import re
+from glob import glob
+
+import constants
 
 
 # def load_binary_raw_document(path):
@@ -20,14 +23,38 @@ import re
 # def load_multiclass_raw_document(path):
 #     return qp.data.from_text(path, verbose=0, class2int=False)
 
+def load_category_map(path):
+    cat2code = {}
+    with open(path, 'rt') as fin:
+        for category, code in (line.split() for line in fin):
+            cat2code[category] = int(code)
+    return cat2code
+
 
 def load_binary_vectors(path, nF=None):
     return sklearn.datasets.load_svmlight_file(path, n_features=nF)
 
 
-def gen_load_samples_T1A(path_dir:str, ground_truth_path:str = None):
-    # for ... 
: yield - pass +def __gen_load_samples_with_groudtruth(path_dir:str, ground_truth_path:str, load_fn, **load_kwargs): + true_prevs = ResultSubmission.load(ground_truth_path) + for filename, prevalence in true_prevs.iterrows(): + sample, _ = load_fn(os.path.join(path_dir, filename), **load_kwargs) + yield filename, sample, prevalence + + +def __gen_load_samples_without_groudtruth(path_dir:str, load_fn, **load_kwargs): + for filepath in glob(os.path.join(path_dir, '*_sample_*.txt')): + sample, _ = load_fn(filepath, **load_kwargs) + yield os.path.basename(filepath), sample + + +def gen_load_samples_T1A(path_dir:str, nF:int, ground_truth_path:str = None): + if ground_truth_path is None: + for filename, sample in __gen_load_samples_without_groudtruth(path_dir, load_binary_vectors, nF=nF): + yield filename, sample + else: + for filename, sample, prevalence in __gen_load_samples_with_groudtruth(path_dir, ground_truth_path, load_binary_vectors, nF=nF): + yield filename, sample, prevalence def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None): @@ -46,9 +73,6 @@ def gen_load_samples_T2B(path_dir:str, ground_truth_path:str = None): class ResultSubmission: - DEV_LEN = 1000 - TEST_LEN = 5000 - ERROR_TOL = 1E-3 def __init__(self, categories: List[str]): if not isinstance(categories, list) or len(categories) < 2: @@ -80,9 +104,9 @@ class ResultSubmission: raise ValueError(f'error: wrong shape found for prevalence vector {prevalence_values}') if (prevalence_values<0).any() or (prevalence_values>1).any(): raise ValueError(f'error: prevalence values out of range [0,1] for "{sample_name}"') - if np.abs(prevalence_values.sum()-1) > ResultSubmission.ERROR_TOL: + if np.abs(prevalence_values.sum()-1) > constants.ERROR_TOL: raise ValueError(f'error: prevalence values do not sum up to one for "{sample_name}"' - f'(error tolerance {ResultSubmission.ERROR_TOL})') + f'(error tolerance {constants.ERROR_TOL})') new_entry = dict([('filename',sample_name)]+[(col_i,prev_i) for col_i, prev_i in zip(self.categories, prevalence_values)]) self.df = self.df.append(new_entry, ignore_index=True) @@ -93,7 +117,7 @@ class ResultSubmission: @classmethod def load(cls, path: str) -> 'ResultSubmission': df, inferred_type = ResultSubmission.check_file_format(path, return_inferred_type=True) - r = ResultSubmission(categories=df.columns.values.tolist()) + r = ResultSubmission(categories=df.columns.values[1:].tolist()) r.inferred_type = inferred_type r.df = df return r @@ -102,13 +126,19 @@ class ResultSubmission: ResultSubmission.check_dataframe_format(self.df) self.df.to_csv(path) - def get(self, sample_name:str): + def prevalence(self, sample_name:str): sel = self.df.loc[self.df['filename'] == sample_name] if sel.empty: return None else: return sel.loc[:,self.df.columns[1]:].values.flatten() + def iterrows(self): + for index, row in self.df.iterrows(): + filename = row.filename + prevalence = row[self.df.columns[1]:].values.flatten() + yield filename, prevalence + @classmethod def check_file_format(cls, path, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: df = pd.read_csv(path, index_col=0) @@ -116,7 +146,7 @@ class ResultSubmission: @classmethod def check_dataframe_format(cls, df, path=None, return_inferred_type=False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, str]]: - hint_path = '' # if given, show the data path in the error messages + hint_path = '' # if given, show the data path in the error message if path is not None: hint_path = f' in {path}' @@ -125,33 +155,33 @@ class 
ResultSubmission: if df.empty: raise ValueError(f'error{hint_path}: results file is empty') - elif len(df) == ResultSubmission.DEV_LEN: + elif len(df) == constants.DEV_SAMPLES: inferred_type = 'dev' - expected_len = ResultSubmission.DEV_LEN - elif len(df) == ResultSubmission.TEST_LEN: + expected_len = constants.DEV_SAMPLES + elif len(df) == constants.TEST_SAMPLES: inferred_type = 'test' - expected_len = ResultSubmission.TEST_LEN + expected_len = constants.TEST_SAMPLES else: raise ValueError(f'wrong number of prevalence values found{hint_path}; ' - f'expected {ResultSubmission.DEV_LEN} for development sets and ' - f'{ResultSubmission.TEST_LEN} for test sets; found {len(df)}') + f'expected {constants.DEV_SAMPLES} for development sets and ' + f'{constants.TEST_SAMPLES} for test sets; found {len(df)}') set_names = frozenset(df.filename) for i in range(expected_len): if f'{inferred_type}_sample_{i}.txt' not in set_names: - raise ValueError(f'{hint_path} a file with {len(df)} entries is assumed to be of type ' + raise ValueError(f'error{hint_path} a file with {len(df)} entries is assumed to be of type ' f'"{inferred_type}" but entry {inferred_type}_sample_{i}.txt is missing ' f'(among perhaps many others)') for category_name in df.columns[1:]: if (df[category_name] < 0).any() or (df[category_name] > 1).any(): - raise ValueError(f'{hint_path} column "{category_name}" contains values out of range [0,1]') + raise ValueError(f'error{hint_path} column "{category_name}" contains values out of range [0,1]') prevs = df.loc[:, df.columns[1]:].values - round_errors = np.abs(prevs.sum(axis=-1) - 1.) > ResultSubmission.ERROR_TOL + round_errors = np.abs(prevs.sum(axis=-1) - 1.) > constants.ERROR_TOL if round_errors.any(): raise ValueError(f'warning: prevalence values in rows with id {np.where(round_errors)[0].tolist()} ' - f'do not sum up to 1 (error tolerance {ResultSubmission.ERROR_TOL}), ' + f'do not sum up to 1 (error tolerance {constants.ERROR_TOL}), ' f'probably due to some rounding errors.') if return_inferred_type: @@ -163,20 +193,31 @@ class ResultSubmission: self.df = self.df.reindex([self.df.columns[0]] + sorted(self.df.columns[1:]), axis=1) self.categories = sorted(self.categories) + def filenames(self): + return self.df.filename.values -def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=1000, average=True): +def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSubmission, sample_size=None, average=True): + if sample_size is None: + if qp.environ['SAMPLE_SIZE'] is None: + raise ValueError('Relative Absolute Error cannot be computed: ' + 'neither sample_size nor qp.environ["SAMPLE_SIZE"] have been specified') + else: + sample_size = qp.environ['SAMPLE_SIZE'] + if len(true_prevs) != len(predicted_prevs): - raise ValueError(f'size mismatch, groun truth has {len(true_prevs)} entries ' - f'while predictions contain {len(predicted_prevs)} entries') + raise ValueError(f'size mismatch, ground truth file has {len(true_prevs)} entries ' + f'while the file of predictions contain {len(predicted_prevs)} entries') true_prevs.sort_categories() predicted_prevs.sort_categories() if true_prevs.categories != predicted_prevs.categories: - raise ValueError(f'these result files are not comparable since the categories are different') + raise ValueError(f'these result files are not comparable since the categories are different: ' + f'true={true_prevs.categories} vs. 
predictions={predicted_prevs.categories}') ae, rae = [], [] - for sample_name in true_prevs.df.filename.values: - ae.append(qp.error.mae(true_prevs.get(sample_name), predicted_prevs.get(sample_name))) - rae.append(qp.error.mrae(true_prevs.get(sample_name), predicted_prevs.get(sample_name), eps=sample_size)) + for sample_name, true_prevalence in true_prevs.iterrows(): + pred_prevalence = predicted_prevs.prevalence(sample_name) + ae.append(qp.error.ae(true_prevalence, pred_prevalence)) + rae.append(qp.error.rae(true_prevalence, pred_prevalence, eps=1./(2*sample_size))) ae = np.asarray(ae) rae = np.asarray(rae) if average: @@ -187,21 +228,6 @@ def evaluate_submission(true_prevs: ResultSubmission, predicted_prevs: ResultSub -# r = ResultSubmission(['negative', 'positive']) -# from tqdm import tqdm -# for i in tqdm(range(1000), total=1000): -# r.add(f'dev_sample_{i}.txt', np.asarray([0.5, 0.5])) -# r.dump('./path.csv') - -r = ResultSubmission.load('./data/T1A/public/dummy_submission.csv') -t = ResultSubmission.load('./data/T1A/public/dummy_submission (copy).csv') -# print(r.df) -# print(r.get('dev_sample_10.txt')) -print(evaluate_submission(r, t)) - -# s = ResultSubmission.load('./data/T1A/public/dummy_submission.csv') -# -# print(s) diff --git a/LeQua2022/evaluation.py b/LeQua2022/evaluation.py new file mode 100644 index 0000000..e56d6d5 --- /dev/null +++ b/LeQua2022/evaluation.py @@ -0,0 +1,41 @@ +import argparse +import quapy as qp +from data import ResultSubmission, evaluate_submission +import constants +import os + +""" +LeQua2022 Official evaluation script +""" + +def main(args): + if args.task in {'T1A'}: + qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE + true_prev = ResultSubmission.load(args.true_prevalences) + pred_prev = ResultSubmission.load(args.pred_prevalences) + mae, mrae = evaluate_submission(true_prev, pred_prev) + print(f'MAE: {mae:.4f}') + print(f'MRAE: {mrae:.4f}') + + if args.output is not None: + outdir = os.path.dirname(args.output) + if outdir: + os.makedirs(outdir, exist_ok=True) + with open(args.output, 'wt') as foo: + foo.write(f'MAE: {mae:.4f}\n') + foo.write(f'MRAE: {mrae:.4f}\n') + + +if __name__=='__main__': + parser = argparse.ArgumentParser(description='LeQua2022 official evaluation script') + parser.add_argument('task', metavar='TASK', type=str, choices=['T1A', 'T1B', 'T2A', 'T2B'], + help='Task name (T1A, T1B, T2A, T2B)') + parser.add_argument('true_prevalences', metavar='TRUE-PREV-PATH', type=str, + help='Path of ground truth prevalence values file (.csv)') + parser.add_argument('pred_prevalences', metavar='PRED-PREV-PATH', type=str, + help='Path of predicted prevalence values file (.csv)') + parser.add_argument('--output', metavar='SCORES-PATH', type=str, default=None, + help='Path where to store the evaluation scores') + args = parser.parse_args() + + main(args) diff --git a/LeQua2022/format_checker.py b/LeQua2022/format_checker.py new file mode 100644 index 0000000..25f3e45 --- /dev/null +++ b/LeQua2022/format_checker.py @@ -0,0 +1,27 @@ +import argparse +import quapy as qp +from data import ResultSubmission, evaluate_submission +import constants +import os + +""" +LeQua2022 Official format-checker script +""" + +def main(args): + try: + ResultSubmission.check_file_format(args.prevalence_file) + except Exception as e: + print(e) + print('Format check: not passed') + else: + print('Format check: passed') + + +if __name__=='__main__': + parser = argparse.ArgumentParser(description='LeQua2022 official format-checker script') + 
parser.add_argument('prevalence_file', metavar='PREV-PATH', type=str, + help='Path of the file containing prevalence values to check') + args = parser.parse_args() + + main(args) diff --git a/LeQua2022/main_binary_vector.py b/LeQua2022/main_binary_vector.py index 2930091..c9e87a0 100644 --- a/LeQua2022/main_binary_vector.py +++ b/LeQua2022/main_binary_vector.py @@ -9,64 +9,44 @@ import quapy as qp from quapy.data import LabelledCollection from quapy.method.aggregative import * import quapy.functional as F -from data import load_binary_vectors +from data import * import os +import constants -path_binary_vector = './data/T1A' -result_path = os.path.join('results', 'T1A') # binary - vector -os.makedirs(result_path, exist_ok=True) +predictions_path = os.path.join('predictions', 'T1A') # binary - vector +os.makedirs(predictions_path, exist_ok=True) -train_file = os.path.join(path_binary_vector, 'public', 'training_vectors.txt') - -train = LabelledCollection.load(train_file, load_binary_vectors) +pathT1A = './data/T1A/public' +T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors') +T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv') +T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt') +train = LabelledCollection.load(T1A_trainpath, load_binary_vectors) nF = train.instances.shape[1] +qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE + print(f'number of classes: {len(train.classes_)}') print(f'number of training documents: {len(train)}') print(f'training prevalence: {F.strprev(train.prevalence())}') print(f'training matrix shape: {train.instances.shape}') -dev_prev = pd.read_csv(os.path.join(path_binary_vector, 'public', 'dev_prevalences.csv'), index_col=0) -print(dev_prev) +true_prevalence = ResultSubmission.load(T1A_devprevalence_path) - -scores = {} -for quantifier in [CC]: #, ACC, PCC, PACC, EMQ, HDy]: +for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]: classifier = CalibratedClassifierCV(LogisticRegression()) model = quantifier(classifier).fit(train) quantifier_name = model.__class__.__name__ - scores[quantifier_name]={} - for sample_set, sample_size in [('dev', 1000)]: - ae_errors, rae_errors = [], [] - for i, row in tqdm(dev_prev.iterrows(), total=len(dev_prev), desc=f'testing {quantifier_name} in {sample_set}'): - filename = row['filename'] - prev_true = row[1:].values - sample_path = os.path.join(path_binary_vector, 'public', f'{sample_set}_vectors', filename) - sample, _ = load_binary_vectors(sample_path, nF) - qp.environ['SAMPLE_SIZE'] = sample.shape[0] - prev_estim = model.quantify(sample) - # prev_true = sample.prevalence() - ae_errors.append(qp.error.mae(prev_true, prev_estim)) - rae_errors.append(qp.error.mrae(prev_true, prev_estim)) - - ae_errors = np.asarray(ae_errors) - rae_errors = np.asarray(rae_errors) - - mae = ae_errors.mean() - mrae = rae_errors.mean() - scores[quantifier_name][sample_set] = {'mae': mae, 'mrae': mrae} - pickle.dump(ae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.ae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) - pickle.dump(rae_errors, open(os.path.join(result_path, f'{quantifier_name}.{sample_set}.rae.pickle'), 'wb'), pickle.HIGHEST_PROTOCOL) - print(f'{quantifier_name} {sample_set} MAE={mae:.4f}') - print(f'{quantifier_name} {sample_set} MRAE={mrae:.4f}') - -for model in scores: - for sample_set in ['validation']:#, 'test']: - print(f'{model}\t{scores[model][sample_set]["mae"]:.4f}\t{scores[model][sample_set]["mrae"]:.4f}') + predictions = ResultSubmission(categories=['negative', 'positive']) + for 
samplename, sample in tqdm(gen_load_samples_T1A(T1A_devvectors_path, nF), + desc=quantifier_name, total=len(true_prevalence)): + predictions.add(samplename, model.quantify(sample)) + predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv')) + mae, mrae = evaluate_submission(true_prevalence, predictions) + print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}') """ test: diff --git a/quapy/data/preprocessing.py b/quapy/data/preprocessing.py index ee1627e..6e58718 100644 --- a/quapy/data/preprocessing.py +++ b/quapy/data/preprocessing.py @@ -149,7 +149,7 @@ class IndexTransformer: def index(self, documents): vocab = self.vocabulary_.copy() - return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')] + return [[vocab.prevalence(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')] def fit_transform(self, X, n_jobs=-1): return self.fit(X).transform(X, n_jobs=n_jobs)
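
Usage sketch (not part of the patch itself): how the new format checker, evaluation script, and the
ResultSubmission/evaluate_submission API added above might be exercised, assuming the commands are run
from inside the LeQua2022/ directory. The dev_prevalences.csv path and the CC.csv predictions file are
illustrative assumptions, not artifacts shipped with this patch.

    # Command line, following the argparse definitions in format_checker.py and evaluation.py:
    #   python3 format_checker.py predictions/T1A/CC.csv
    #   python3 evaluation.py T1A data/T1A/public/dev_prevalences.csv predictions/T1A/CC.csv --output scores/T1A/CC.txt

    # Programmatic equivalent (the same calls evaluation.py performs internally):
    import quapy as qp
    import constants
    from data import ResultSubmission, evaluate_submission

    qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE                     # 250 for task T1A
    true_prev = ResultSubmission.load('data/T1A/public/dev_prevalences.csv')  # ground-truth prevalence values (assumed path)
    pred_prev = ResultSubmission.load('predictions/T1A/CC.csv')               # predicted prevalence values (assumed path)
    mae, mrae = evaluate_submission(true_prev, pred_prev)                     # mean AE / mean RAE over all samples
    print(f'MAE={mae:.4f} MRAE={mrae:.4f}')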