From a7e87e41f8172d644d850dcc548644d964b3d0f9 Mon Sep 17 00:00:00 2001
From: Alex Moreo <alejandro.moreo@isti.cnr.it>
Date: Tue, 26 Oct 2021 18:41:10 +0200
Subject: [PATCH] GridSearchQ adapted to work with generator functions and
 integrated for the baselines of LeQua2022; some tests with SVD

---
 LeQua2022/TODO.txt                            | 12 ++-
 LeQua2022/baselinesSVD_T1A.py                 | 84 +++++++++++++++++
 ...main_binary_vector.py => baselines_T1A.py} | 16 +++-
 LeQua2022/baselines_T1Amodsel.py              | 91 +++++++++++++++++++
 LeQua2022/baselines_T1B.py                    | 55 +++++++++++
 LeQua2022/constants.py                        |  3 +-
 LeQua2022/data.py                             | 39 ++++----
 quapy/evaluation.py                           | 22 +++++
 quapy/model_selection.py                      | 27 ++++--
 9 files changed, 315 insertions(+), 34 deletions(-)
 create mode 100644 LeQua2022/baselinesSVD_T1A.py
 rename LeQua2022/{main_binary_vector.py => baselines_T1A.py} (76%)
 create mode 100644 LeQua2022/baselines_T1Amodsel.py
 create mode 100644 LeQua2022/baselines_T1B.py

diff --git a/LeQua2022/TODO.txt b/LeQua2022/TODO.txt
index 2bc944e..1e16136 100644
--- a/LeQua2022/TODO.txt
+++ b/LeQua2022/TODO.txt
@@ -1,9 +1,13 @@
-1. los test hay que hacerlos suponiendo que las etiquetas no existen, es decir, viendo los resultados en los ficheros "prevalences" (renominar)
 2. tablas?
 3. fetch dataset (download, unzip, etc.)
 4. model selection
 5. plots
-6. estoy leyendo los samples en orden, y no hace falta. Sería mejor una función genérica que lee todos los ejemplos y
-    que de todos modos genera un output con el mismo nombre del file
-7. Make ResultSubmission class abstract, and create 4 instances thus forcing the field task_name to be set correctly
 8. No me convence que la lectura de los samples (caso en que no hay ground truth) viene en orden aleatorio
+9. Experimentar con vectores densos (PCA sobre tfidf por ejemplo)
+10. Si cambiamos el formato de los samples (por ejemplo, en lugar de svmlight con .txt a PCA con .dat) hay que cambiar
+    cosas en el código. Está escrito varias veces un glob(*.txt)
+11. Quitar las categorías como columnas de los ficheros de prevalences
+12. sample_size cannot be set to a non-integer in GridSearchQ with protocol="gen" (it could, but this is not indicated in the doc)
+13. repair doc of GridSearchQ
+14. reparar la calibración en LR (lo tuve que quitar para que funcionara GridSearchQ, y lo quité en todos los ficheros)
+15. podría poner que el eval_budget se usase en GridSearchQ con generator function para el progress bar de tqdm
\ No newline at end of file
diff --git a/LeQua2022/baselinesSVD_T1A.py b/LeQua2022/baselinesSVD_T1A.py
new file mode 100644
index 0000000..c0fdc15
--- /dev/null
+++ b/LeQua2022/baselinesSVD_T1A.py
@@ -0,0 +1,84 @@
+import pickle
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+import pandas as pd
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+import quapy.functional as F
+from data import *
+import os
+import constants
+
+from sklearn.decomposition import TruncatedSVD
+
+
+# LeQua official baselines for task T1A (Binary/Vector)
+# =====================================================
+
+predictions_path = os.path.join('predictions', 'T1A')
+os.makedirs(predictions_path, exist_ok=True)
+
+models_path = os.path.join('models', 'T1A')
+os.makedirs(models_path, exist_ok=True)
+
+pathT1A = './data/T1A/public'
+T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
+T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
+T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
+
+train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
+nF = train.instances.shape[1]
+svd = TruncatedSVD(n_components=300)
+train.instances = svd.fit_transform(train.instances)
+
+qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
+
+print(f'number of classes: {len(train.classes_)}')
+print(f'number of training documents: {len(train)}')
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'training matrix shape: {train.instances.shape}')
+
+true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
+
+for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
+
+    # classifier = CalibratedClassifierCV(LogisticRegression())
+    classifier = LogisticRegression()
+    model = quantifier(classifier).fit(train)
+    quantifier_name = model.__class__.__name__
+
+    predictions = ResultSubmission(categories=['negative', 'positive'])
+    for samplename, sample in tqdm(gen_load_samples_T1(T1A_devvectors_path, nF),
+                                   desc=quantifier_name, total=len(true_prevalence)):
+        sample = svd.transform(sample)
+        predictions.add(samplename, model.quantify(sample))
+
+    predictions.dump(os.path.join(predictions_path, quantifier_name + '.svd.csv'))
+    pickle.dump(model, open(os.path.join(models_path, quantifier_name+'.svd.pkl'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
+
+    mae, mrae = evaluate_submission(true_prevalence, predictions)
+    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
+
+"""
+test:
+CC	0.1859	1.5406
+ACC	0.0453	0.2840
+PCC	0.1793	1.7187
+PACC	0.0287	0.1494
+EMQ	0.0225	0.1020
+HDy	0.0631	0.2307
+
+validation
+CC	0.1862	1.9587
+ACC	0.0394	0.2669
+PCC	0.1789	2.1383
+PACC	0.0354	0.1587
+EMQ	0.0224	0.0960
+HDy	0.0467	0.2121
+"""
+
+
diff --git a/LeQua2022/main_binary_vector.py b/LeQua2022/baselines_T1A.py
similarity index 76%
rename from LeQua2022/main_binary_vector.py
rename to LeQua2022/baselines_T1A.py
index c9e87a0..179995c 100644
--- a/LeQua2022/main_binary_vector.py
+++ b/LeQua2022/baselines_T1A.py
@@ -13,9 +13,16 @@ from data import *
 import os
 import constants
 
-predictions_path = os.path.join('predictions', 'T1A')  # binary - vector
+
+# LeQua official baselines for task T1A (Binary/Vector)
+# =====================================================
+
+predictions_path = os.path.join('predictions', 'T1A')
 os.makedirs(predictions_path, exist_ok=True)
 
+models_path = os.path.join('models', 'T1A')
+os.makedirs(models_path, exist_ok=True)
+
 pathT1A = './data/T1A/public'
 T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
 T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
@@ -35,16 +42,19 @@ true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
 
 for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
 
-    classifier = CalibratedClassifierCV(LogisticRegression())
+    # classifier = CalibratedClassifierCV(LogisticRegression(C=1))
+    classifier = LogisticRegression(C=1)
     model = quantifier(classifier).fit(train)
     quantifier_name = model.__class__.__name__
 
     predictions = ResultSubmission(categories=['negative', 'positive'])
-    for samplename, sample in tqdm(gen_load_samples_T1A(T1A_devvectors_path, nF),
+    for samplename, sample in tqdm(gen_load_samples_T1(T1A_devvectors_path, nF),
                                    desc=quantifier_name, total=len(true_prevalence)):
         predictions.add(samplename, model.quantify(sample))
 
     predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))
+    pickle.dump(model, open(os.path.join(models_path, quantifier_name+'.pkl'), 'wb'), protocol=pickle.HIGHEST_PROTOCOL)
+
     mae, mrae = evaluate_submission(true_prevalence, predictions)
     print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
 
diff --git a/LeQua2022/baselines_T1Amodsel.py b/LeQua2022/baselines_T1Amodsel.py
new file mode 100644
index 0000000..c312135
--- /dev/null
+++ b/LeQua2022/baselines_T1Amodsel.py
@@ -0,0 +1,91 @@
+import pickle
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+import pandas as pd
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+import quapy.functional as F
+from data import *
+import os
+import constants
+
+
+# LeQua official baselines for task T1A (Binary/Vector)
+# =====================================================
+
+predictions_path = os.path.join('predictions', 'T1A')
+os.makedirs(predictions_path, exist_ok=True)
+
+models_path = os.path.join('models', 'T1A')
+os.makedirs(models_path, exist_ok=True)
+
+pathT1A = './data/T1A/public'
+T1A_devvectors_path = os.path.join(pathT1A, 'dev_vectors')
+T1A_devprevalence_path = os.path.join(pathT1A, 'dev_prevalences.csv')
+T1A_trainpath = os.path.join(pathT1A, 'training_vectors.txt')
+
+train = LabelledCollection.load(T1A_trainpath, load_binary_vectors)
+nF = train.instances.shape[1]
+
+qp.environ['SAMPLE_SIZE'] = constants.T1A_SAMPLE_SIZE
+
+print(f'number of classes: {len(train.classes_)}')
+print(f'number of training documents: {len(train)}')
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'training matrix shape: {train.instances.shape}')
+
+true_prevalence = ResultSubmission.load(T1A_devprevalence_path)
+
+param_grid = {'C': np.logspace(-3,3,7), 'class_weight': ['balanced', None]}
+
+
+def gen_samples():
+    return gen_load_samples_T1(T1A_devvectors_path, nF, ground_truth_path=T1A_devprevalence_path, return_filename=False)
+
+
+for quantifier in [CC, ACC, PCC, PACC, EMQ, HDy]:
+    #classifier = CalibratedClassifierCV(LogisticRegression(), n_jobs=-1)
+    classifier = LogisticRegression()
+    model = quantifier(classifier)
+    print(f'{model.__class__.__name__}: Model selection')
+    model = qp.model_selection.GridSearchQ(
+        model,
+        param_grid,
+        sample_size=None,
+        protocol='gen',
+        error=qp.error.mae,
+        refit=False,
+        verbose=True
+    ).fit(train, gen_samples)
+
+    quantifier_name = model.best_model().__class__.__name__
+    print(f'{quantifier_name} mae={model.best_score_:.3f} (params: {model.best_params_})')
+
+    pickle.dump(model.best_model(),
+                open(os.path.join(models_path, quantifier_name+'.modsel.pkl'), 'wb'),
+                protocol=pickle.HIGHEST_PROTOCOL)
+
+
+"""
+test:
+CC	0.1859	1.5406
+ACC	0.0453	0.2840
+PCC	0.1793	1.7187
+PACC	0.0287	0.1494
+EMQ	0.0225	0.1020
+HDy	0.0631	0.2307
+
+validation
+CC	0.1862	1.9587
+ACC	0.0394	0.2669
+PCC	0.1789	2.1383
+PACC	0.0354	0.1587
+EMQ	0.0224	0.0960
+HDy	0.0467	0.2121
+"""
+
+
diff --git a/LeQua2022/baselines_T1B.py b/LeQua2022/baselines_T1B.py
new file mode 100644
index 0000000..1344bbc
--- /dev/null
+++ b/LeQua2022/baselines_T1B.py
@@ -0,0 +1,55 @@
+import pickle
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+import pandas as pd
+
+import quapy as qp
+from quapy.data import LabelledCollection
+from quapy.method.aggregative import *
+import quapy.functional as F
+from data import *
+import os
+import constants
+
+predictions_path = os.path.join('predictions', 'T1B')  # multiclass - vector
+os.makedirs(predictions_path, exist_ok=True)
+
+pathT1B = './data/T1B/public'
+T1B_devvectors_path = os.path.join(pathT1B, 'dev_vectors')
+T1B_devprevalence_path = os.path.join(pathT1B, 'dev_prevalences.csv')
+T1B_trainpath = os.path.join(pathT1B, 'training_vectors.txt')
+T1B_catmap = os.path.join(pathT1B, 'training_vectors_label_map.txt')
+
+train = LabelledCollection.load(T1B_trainpath, load_binary_vectors)
+nF = train.instances.shape[1]
+
+qp.environ['SAMPLE_SIZE'] = constants.T1B_SAMPLE_SIZE
+
+print(f'number of classes: {len(train.classes_)}')
+print(f'number of training documents: {len(train)}')
+print(f'training prevalence: {F.strprev(train.prevalence())}')
+print(f'training matrix shape: {train.instances.shape}')
+
+true_prevalence = ResultSubmission.load(T1B_devprevalence_path)
+
+cat2code, categories = load_category_map(T1B_catmap)
+
+for quantifier in [PACC]:  # [CC, ACC, PCC, PACC, EMQ]:
+
+    classifier = CalibratedClassifierCV(LogisticRegression())
+    model = quantifier(classifier).fit(train)
+    quantifier_name = model.__class__.__name__
+
+    predictions = ResultSubmission(categories=categories)
+    for samplename, sample in tqdm(gen_load_samples_T1(T1B_devvectors_path, nF),
+                                   desc=quantifier_name, total=len(true_prevalence)):
+        predictions.add(samplename, model.quantify(sample))
+
+    predictions.dump(os.path.join(predictions_path, quantifier_name + '.csv'))
+    mae, mrae = evaluate_submission(true_prevalence, predictions)
+    print(f'{quantifier_name} mae={mae:.3f} mrae={mrae:.3f}')
+
+
+
diff --git a/LeQua2022/constants.py b/LeQua2022/constants.py
index 1162e12..dee7f8c 100644
--- a/LeQua2022/constants.py
+++ b/LeQua2022/constants.py
@@ -2,5 +2,6 @@ DEV_SAMPLES = 1000
 TEST_SAMPLES = 5000
 
 T1A_SAMPLE_SIZE = 250
+T1B_SAMPLE_SIZE = 1000
 
-ERROR_TOL=1E-3
+ERROR_TOL = 1E-3
diff --git a/LeQua2022/data.py b/LeQua2022/data.py
index 815fc30..5068f47 100644
--- a/LeQua2022/data.py
+++ b/LeQua2022/data.py
@@ -26,40 +26,45 @@ import constants
 def load_category_map(path):
     cat2code = {}
     with open(path, 'rt') as fin:
-        category, code = fin.readline().split()
-        cat2code[category] = int(code)
-    return cat2code
+        for line in fin:
+            category, code = line.split()
+            cat2code[category] = int(code)
+    code2cat = [cat for cat, code in sorted(cat2code.items(), key=lambda x:x[1])]
+    return cat2code, code2cat
 
 
 def load_binary_vectors(path, nF=None):
     return sklearn.datasets.load_svmlight_file(path, n_features=nF)
 
 
-def __gen_load_samples_with_groudtruth(path_dir:str, ground_truth_path:str, load_fn, **load_kwargs):
+def __gen_load_samples_with_groudtruth(path_dir:str, return_filename:bool, ground_truth_path:str, load_fn, **load_kwargs):
     true_prevs = ResultSubmission.load(ground_truth_path)
     for filename, prevalence in true_prevs.iterrows():
         sample, _ = load_fn(os.path.join(path_dir, filename), **load_kwargs)
-        yield filename, sample, prevalence
+        if return_filename:
+            yield filename, sample, prevalence
+        else:
+            yield sample, prevalence
 
 
-def __gen_load_samples_without_groudtruth(path_dir:str, load_fn, **load_kwargs):
+def __gen_load_samples_without_groudtruth(path_dir:str, return_filename:bool, load_fn, **load_kwargs):
     for filepath in glob(os.path.join(path_dir, '*_sample_*.txt')):
         sample, _ = load_fn(filepath, **load_kwargs)
-        yield os.path.basename(filepath), sample
+        if return_filename:
+            yield os.path.basename(filepath), sample
+        else:
+            yield sample
 
 
-def gen_load_samples_T1A(path_dir:str, nF:int, ground_truth_path:str = None):
+def gen_load_samples_T1(path_dir:str, nF:int, ground_truth_path:str = None, return_filename=True):
     if ground_truth_path is None:
-        for filename, sample in __gen_load_samples_without_groudtruth(path_dir, load_binary_vectors, nF=nF):
-            yield filename, sample
+        # yields tuples (filename:str, sample:csr_matrix), or only the sample if return_filename is False
+        gen_fn = __gen_load_samples_without_groudtruth(path_dir, return_filename, load_binary_vectors, nF=nF)
     else:
-        for filename, sample, prevalence in __gen_load_samples_with_groudtruth(path_dir, ground_truth_path, load_binary_vectors, nF=nF):
-            yield filename, sample, prevalence
-
-
-def gen_load_samples_T1B(path_dir:str, ground_truth_path:str = None):
-    # for ... : yield
-    pass
+        # yields tuples (filename:str, sample:csr_matrix, prevalence:ndarray); filename is omitted if return_filename is False
+        gen_fn = __gen_load_samples_with_groudtruth(path_dir, return_filename, ground_truth_path, load_binary_vectors, nF=nF)
+    for r in gen_fn:
+        yield r
 
 
 def gen_load_samples_T2A(path_dir:str, ground_truth_path:str = None):
diff --git a/quapy/evaluation.py b/quapy/evaluation.py
index ebdb537..42ecf01 100644
--- a/quapy/evaluation.py
+++ b/quapy/evaluation.py
@@ -9,6 +9,7 @@ from quapy.method.base import BaseQuantifier
 from quapy.util import temp_seed
 import quapy.functional as F
 import pandas as pd
+import inspect
 
 
 def artificial_prevalence_prediction(
@@ -78,6 +79,27 @@ def natural_prevalence_prediction(
     return _predict_from_indexes(indexes, model, test, n_jobs, verbose)
 
 
+def gen_prevalence_prediction(model: BaseQuantifier, gen_fn: Callable, eval_budget=None):
+    if not inspect.isgenerator(gen_fn()):
+        raise ValueError('param "gen_fun" is not a generator')
+
+    if not isinstance(eval_budget, int):
+        eval_budget = -1
+
+    true_prevalences, estim_prevalences = [], []
+    for sample_instances, true_prev in gen_fn():
+        true_prevalences.append(true_prev)
+        estim_prevalences.append(model.quantify(sample_instances))
+        eval_budget -= 1
+        if eval_budget == 0:
+            break
+
+    true_prevalences = np.asarray(true_prevalences)
+    estim_prevalences = np.asarray(estim_prevalences)
+
+    return true_prevalences, estim_prevalences
+
+
 def _predict_from_indexes(
         indexes,
         model: BaseQuantifier,
diff --git a/quapy/model_selection.py b/quapy/model_selection.py
index 1080db0..95c6ff8 100644
--- a/quapy/model_selection.py
+++ b/quapy/model_selection.py
@@ -5,8 +5,9 @@ from typing import Union, Callable
 
 import quapy as qp
 from quapy.data.base import LabelledCollection
-from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction
+from quapy.evaluation import artificial_prevalence_prediction, natural_prevalence_prediction, gen_prevalence_prediction
 from quapy.method.aggregative import BaseQuantifier
+import inspect
 
 
 class GridSearchQ(BaseQuantifier):
@@ -74,8 +75,10 @@ class GridSearchQ(BaseQuantifier):
         self.timeout = timeout
         self.verbose = verbose
         self.__check_error(error)
-        assert self.protocol in {'app', 'npp'}, \
-            'unknown protocol; valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence protocols'
+        assert self.protocol in {'app', 'npp', 'gen'}, \
+            'unknown protocol: valid ones are "app" or "npp" for the "artificial" or the "natural" prevalence ' \
+            'protocols. Use protocol="gen" when passing a generator function thorough val_split that yields a ' \
+            'sample (instances) and their prevalence (ndarray) at each iteration.'
         if self.protocol == 'npp':
             if self.n_repetitions is None or self.n_repetitions == 1:
                 if self.eval_budget is not None:
@@ -99,9 +102,14 @@ class GridSearchQ(BaseQuantifier):
             assert 0. < validation < 1., 'validation proportion should be in (0,1)'
             training, validation = training.split_stratified(train_prop=1 - validation)
             return training, validation
+        elif self.protocol=='gen' and inspect.isgenerator(validation()):
+            return training, validation
         else:
             raise ValueError(f'"validation" must either be a LabelledCollection or a float in (0,1) indicating the'
-                             f'proportion of training documents to extract (type found: {type(validation)})')
+                             f'proportion of training documents to extract (type found: {type(validation)}). '
+                             f'Optionally, "validation" can be a callable function returning a generator that yields '
+                             f'the sample instances along with their true prevalence at each iteration by '
+                             f'setting protocol="gen".')
 
     def __check_error(self, error):
         if error in qp.error.QUANTIFICATION_ERROR:
@@ -132,6 +140,8 @@ class GridSearchQ(BaseQuantifier):
             return natural_prevalence_prediction(
                 model, val_split, self.sample_size,
                 **commons)
+        elif self.protocol == 'gen':
+            return gen_prevalence_prediction(model, gen_fn=val_split, eval_budget=self.eval_budget)
         else:
             raise ValueError('unknown protocol')
 
@@ -144,7 +154,8 @@ class GridSearchQ(BaseQuantifier):
         if val_split is None:
             val_split = self.val_split
         training, val_split = self.__check_training_validation(training, val_split)
-        assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
+        if self.protocol != 'gen':
+            assert isinstance(self.sample_size, int) and self.sample_size > 0, 'sample_size must be a positive integer'
 
         params_keys = list(self.param_grid.keys())
         params_values = list(self.param_grid.values())
@@ -192,8 +203,6 @@ class GridSearchQ(BaseQuantifier):
             raise TimeoutError('all jobs took more than the timeout time to end')
 
         self.sout(f'optimization finished: best params {self.best_params_} (score={self.best_score_:.5f})')
-        # model.set_params(**self.best_params_)
-        # self.best_model_ = deepcopy(model)
 
         if self.refit:
             self.sout(f'refitting on the whole development set')
@@ -203,11 +212,11 @@ class GridSearchQ(BaseQuantifier):
 
     def quantify(self, instances):
         assert hasattr(self, 'best_model_'), 'quantify called before fit'
-        return self.best_model_.quantify(instances)
+        return self.best_model().quantify(instances)
 
     @property
     def classes_(self):
-        return self.best_model_.classes_
+        return self.best_model().classes_
 
     def set_params(self, **parameters):
         self.param_grid = parameters