Merge branch 'localstack' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into localstack

2024-09-26 16:06:01 +02:00 · 2024-09-26 16:06:01 +02:00 · 641228bf62
parent 4fa4540aab 04c1f286ce
commit 641228bf62
9 changed files with 317 additions and 75 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -20,15 +20,16 @@ jobs:
    env:
      QUAPY_TESTS_OMIT_LARGE_DATASETS: True
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip setuptools wheel
-        python -m pip install -e .[bayes,composable,tests]
+        python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
+        python -m pip install -e .[bayes,tests]
    - name: Test with unittest
      run: python -m unittest

@ -38,15 +39,18 @@ jobs:
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/master'
    steps:
-    - uses: actions/checkout@v1
-    - name: Build documentation
-      uses: ammaraskar/sphinx-action@master
+    - uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v5
      with:
-        pre-build-command: |
-          apt-get --allow-releaseinfo-change update -y && apt-get install -y git && git --version
-          python -m pip install --upgrade pip setuptools wheel "jax[cpu]"
-          python -m pip install -e .[composable,neural,docs]
-        docs-folder: "docs/"
+        python-version: "3.11"  # quoted: a bare version such as 3.10 would be read as the float 3.1
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip setuptools wheel "jax[cpu]"
+        python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
+        python -m pip install -e .[neural,docs]
+    - name: Build documentation
+      run: sphinx-build -M html docs/source docs/build
    - name: Publish documentation
      run: |
        git clone ${{ github.server_url }}/${{ github.repository }}.git --branch gh-pages --single-branch __gh-pages/
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@ -1,23 +0,0 @@
-name: Pylint
-
-on: [push]
-
-jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.8", "3.9", "3.10"]
-    steps:
-    - uses: actions/checkout@v3
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v3
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install pylint
-    - name: Analysing the code with pylint
-      run: |
-        pylint $(git ls-files '*.py')
--- a/LocalStack/experiments.py
+++ b/LocalStack/experiments.py
@ -0,0 +1,126 @@
+import os
+from time import time
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+import quapy as qp
+from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred
+from LocalStack.method import LocalStackingQuantification, LocalStackingQuantification2
+from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from pathlib import Path
+
+SEED = 1
+
+
+
+METHODS = [
+    ('PACC', PACC(), {}),
+    ('EMQ', EMQ(), {}),
+    ('KDEy-ML',  KDEyML(), {}),
+]
+
+TRANSDUCTIVE_METHODS = [
+    ('LSQ',  LocalStackingQuantification(EMQ()), {}),
+    ('LSQ2',  LocalStackingQuantification2(EMQ()), {})
+]
+
+def show_results(result_path):
+    """Print one Dataset-by-Method pivot table per reported column.
+
+    Reads the tab-separated file ``result_path + '.csv'`` and prints, in this
+    order, the pivot tables of MAE, MRAE, KLD, TR-TIME and TE-TIME, each one
+    computed with ``margins=True``.
+
+    :param result_path: path of the results file, without the ``.csv`` suffix
+    """
+    import pandas as pd
+
+    results = pd.read_csv(result_path + '.csv', sep='\t')
+
+    # remove the row/column display limits and set the maximum output width
+    display_options = (
+        ('display.max_columns', None),
+        ('display.max_rows', None),
+        ('display.width', 1000),
+    )
+    for option_name, option_value in display_options:
+        pd.set_option(option_name, option_value)
+
+    for column in ("MAE", "MRAE", "KLD", "TR-TIME", "TE-TIME"):
+        table = results.pivot_table(index='Dataset', columns="Method", values=[column], margins=True)
+        print(table)
+
+
+if __name__ == '__main__':
+    # Experiment driver: evaluates every method in METHODS + TRANSDUCTIVE_METHODS on each
+    # dataset in qp.datasets.UCI_MULTICLASS_DATASETS. The report of each (method, dataset)
+    # pair is stored as '<method>_<dataset>.dataframe' and reloaded if the file already
+    # exists; the mean of each report is appended to a tab-separated summary file, which
+    # show_results prints at the end.
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 25
+    n_bags_test = 100
+    result_dir = f'results_quantification/localstack'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    # the summary file is rewritten from scratch (mode 'wt') on every run
+    global_result_path = f'{result_dir}/allmethods'
+    with open(global_result_path + '.csv', 'wt') as csv:
+        csv.write(f'Method\tDataset\tMAE\tMRAE\tKLD\tTR-TIME\tTE-TIME\n')
+
+    # loop-invariant: names of the methods that take the transductive branch below
+    transductive_names = [name for (name, *_) in TRANSDUCTIVE_METHODS]
+
+    for method_name, quantifier, param_grid in METHODS + TRANSDUCTIVE_METHODS:
+
+        print('Init method', method_name)
+
+        with open(global_result_path + '.csv', 'at') as csv:
+            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+                print('init', dataset)
+
+                local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe')
+
+                if os.path.exists(local_result_path):
+                    print(f'result file {local_result_path} already exist; skipping')
+                    report = qp.util.load_report(local_result_path)
+
+                else:
+                    with qp.util.temp_seed(SEED):
+
+                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
+                        train, test = data.train_test
+
+                        if method_name not in transductive_names:
+                            if len(param_grid) == 0:
+                                # no hyperparameters to explore: fit the quantifier as given
+                                t_init = time()
+                                quantifier.fit(train)
+                                train_time = time() - t_init
+                            else:
+                                # model selection (train)
+                                train, val = train.split_stratified(random_state=SEED)
+                                protocol = UPP(val, repeats=n_bags_val)
+                                modsel = GridSearchQ(
+                                    quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
+                                )
+                                t_init = time()
+                                try:
+                                    modsel.fit(train)
+                                    print(f'best params {modsel.best_params_}')
+                                    print(f'best score {modsel.best_score_}')
+                                    quantifier = modsel.best_model()
+                                except Exception as e:
+                                    # Only ordinary errors trigger the fallback; a bare except would
+                                    # also swallow KeyboardInterrupt and SystemExit. The cause is
+                                    # printed so that the failure is not silent.
+                                    print(f'model selection failed with {e!r}')
+                                    print('something went wrong... trying to fit the default model')
+                                    quantifier.fit(train)
+                                train_time = time() - t_init
+                        else:
+                            # transductive
+                            t_init = time()
+                            quantifier.fit(train)  # <-- nothing actually (projects the X into posteriors only)
+                            train_time = time() - t_init
+
+                        # test
+                        t_init = time()
+                        protocol = UPP(test, repeats=n_bags_test)
+                        report = qp.evaluation.evaluation_report(
+                            quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
+                        )
+                        test_time = time() - t_init
+                        report['tr_time'] = train_time
+                        report['te_time'] = test_time
+                        report.to_csv(local_result_path)
+
+                # one summary row per (method, dataset); flushed so partial results survive a crash
+                means = report.mean(numeric_only=True)
+                csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\t{means["tr_time"]:.3f}\t{means["te_time"]:.3f}\n')
+                csv.flush()
+
+    show_results(global_result_path)
--- a/LocalStack/method.py
+++ b/LocalStack/method.py
@ -0,0 +1,112 @@
+import numpy as np
+import quapy as qp
+from sklearn.multioutput import MultiOutputRegressor
+from sklearn.svm import SVR
+
+from data import LabelledCollection
+from quapy.method.base import BaseQuantifier
+from quapy.method.aggregative import AggregativeSoftQuantifier
+
+
+class LocalStackingQuantification(BaseQuantifier):
+    """Quantifier that corrects the estimate of a surrogate quantifier at prediction time.
+
+    ``fit`` trains the surrogate on one part of a stratified split of the data and keeps
+    the other part as validation data. ``quantify`` draws ``n_samples_gen`` validation
+    samples at the prevalence the surrogate predicts for the test instances, keeps the
+    ``n_samples_sel`` samples whose predicted prevalence is closest (according to
+    ``comparison_measure``) to the test prediction, fits a regressor from predicted to
+    true prevalence on them, and applies that regressor to the test prediction.
+
+    :param surrogate_quantifier: an instance of AggregativeSoftQuantifier
+    :param n_samples_gen: number of validation samples to generate
+    :param n_samples_sel: number of generated samples kept for fitting the regressor
+    :param comparison_measure: name of an error function, resolved with qp.error.from_name
+    :param random_state: seed from which one distinct seed per generated sample is
+        derived; None (default) passes no seed to the sampling
+    """
+
+    def __init__(self, surrogate_quantifier, n_samples_gen=200, n_samples_sel=50, comparison_measure='ae', random_state=None):
+        # use __name__: __class__.__name__ would print the name of the metaclass instead
+        assert isinstance(surrogate_quantifier, AggregativeSoftQuantifier), \
+            f'the surrogate quantifier must be of type {AggregativeSoftQuantifier.__name__}'
+        self.surrogate_quantifier = surrogate_quantifier
+        self.n_samples_gen = n_samples_gen
+        self.n_samples_sel = n_samples_sel
+        self.comparison_measure = qp.error.from_name(comparison_measure)
+        self.random_state = random_state
+
+    def fit(self, data: LabelledCollection):
+        """Fits the surrogate on one part of a stratified split and stores the other part.
+
+        :param data: the labelled collection to split
+        :return: self
+        """
+        train, val = data.split_stratified()
+        self.surrogate_quantifier.fit(train)
+        self.val_data = val
+        return self
+
+    def normalize(self, out_simplex:np.ndarray):
+        """Rescales a vector so that its components sum up to 1.
+
+        :param out_simplex: array to rescale
+        :return: the array divided by its sum
+        """
+        # NOTE(review): only the sum is rescaled; negative components (which the regressor
+        # is not prevented from producing) and a zero sum are not handled -- confirm
+        in_simplex = out_simplex/out_simplex.sum()
+        return in_simplex
+
+    def _sample_seeds(self):
+        """Returns one seed per sample to generate (all None if random_state is None)."""
+        if self.random_state is None:
+            return [None] * self.n_samples_gen
+        # Reusing the same seed in every draw would presumably reproduce the very same
+        # sample n_samples_gen times (TODO confirm against LabelledCollection.sampling);
+        # distinct seeds derived from random_state keep the procedure replicable.
+        seed_generator = np.random.RandomState(self.random_state)
+        return [int(seed) for seed in seed_generator.randint(0, 2**31 - 1, size=self.n_samples_gen)]
+
+    def quantify(self, instances: np.ndarray):
+        """Estimates the class prevalence values of the given instances.
+
+        :param instances: array whose first dimension indexes the test instances
+        :return: the corrected prevalence estimate, rescaled to sum up to 1
+        """
+        assert hasattr(self, 'val_data'), 'quantify called before fit'
+        pred_prevs = self.surrogate_quantifier.quantify(instances)
+        test_size = instances.shape[0]
+
+        samples = []
+        samples_pred_prevs = []
+        samples_distance = []
+        for seed_i in self._sample_seeds():
+            sample_i = self.val_data.sampling(test_size, *pred_prevs, random_state=seed_i)
+            pred_prev_sample_i = self.surrogate_quantifier.quantify(sample_i.X)
+            err_dist = self.comparison_measure(pred_prevs, pred_prev_sample_i)
+
+            samples.append(sample_i)
+            samples_pred_prevs.append(pred_prev_sample_i)
+            samples_distance.append(err_dist)
+
+        # keep the samples whose predicted prevalence is closest to the test prediction;
+        # the samples stay in a plain list instead of being packed into a numpy array
+        closest = np.argsort(samples_distance)[:self.n_samples_sel]
+        samples_sel = [samples[i] for i in closest]
+        samples_pred_prevs_sel = np.asarray(samples_pred_prevs)[closest]
+
+        # learn the mapping predicted prevalence -> true prevalence on the selected samples
+        reg = MultiOutputRegressor(SVR())
+        reg_X = samples_pred_prevs_sel
+        reg_y = [s.prevalence() for s in samples_sel]
+        reg.fit(reg_X, reg_y)
+
+        corrected_prev = reg.predict([pred_prevs])[0]
+
+        corrected_prev = self.normalize(corrected_prev)
+        return corrected_prev
+
+
+
+class LocalStackingQuantification2(BaseQuantifier):
+
+    """
+    Instead of selecting the samples whose predicted prevalence resembles the prevalence
+    predicted for the test set, this variant directly draws samples at the prevalence
+    predicted for the test set and fits the regressor on all of them.
+
+    :param surrogate_quantifier: an instance of AggregativeSoftQuantifier
+    :param n_samples_gen: number of validation samples to generate
+    :param n_samples_sel: stored, but not used by ``quantify`` in this variant
+    :param comparison_measure: name of an error function, resolved with qp.error.from_name;
+        stored, but not used by ``quantify`` in this variant
+    :param random_state: seed from which one distinct seed per generated sample is
+        derived; None (default) passes no seed to the sampling
+    """
+
+    def __init__(self, surrogate_quantifier, n_samples_gen=200, n_samples_sel=50, comparison_measure='ae', random_state=None):
+        # use __name__: __class__.__name__ would print the name of the metaclass instead
+        assert isinstance(surrogate_quantifier, AggregativeSoftQuantifier), \
+            f'the surrogate quantifier must be of type {AggregativeSoftQuantifier.__name__}'
+        self.surrogate_quantifier = surrogate_quantifier
+        self.n_samples_gen = n_samples_gen
+        self.n_samples_sel = n_samples_sel
+        self.comparison_measure = qp.error.from_name(comparison_measure)
+        self.random_state = random_state
+
+    def fit(self, data: LabelledCollection):
+        """Fits the surrogate on one part of a stratified split and stores the other part.
+
+        :param data: the labelled collection to split
+        :return: self
+        """
+        train, val = data.split_stratified()
+        self.surrogate_quantifier.fit(train)
+        self.val_data = val
+        return self
+
+    def normalize(self, out_simplex:np.ndarray):
+        """Rescales a vector so that its components sum up to 1.
+
+        :param out_simplex: array to rescale
+        :return: the array divided by its sum
+        """
+        # NOTE(review): only the sum is rescaled; negative components (which the regressor
+        # is not prevented from producing) and a zero sum are not handled -- confirm
+        in_simplex = out_simplex/out_simplex.sum()
+        return in_simplex
+
+    def _sample_seeds(self):
+        """Returns one seed per sample to generate (all None if random_state is None)."""
+        if self.random_state is None:
+            return [None] * self.n_samples_gen
+        # Reusing the same seed in every draw would presumably reproduce the very same
+        # sample n_samples_gen times (TODO confirm against LabelledCollection.sampling),
+        # leaving the regressor with a single distinct training point; distinct seeds
+        # derived from random_state keep the procedure replicable.
+        seed_generator = np.random.RandomState(self.random_state)
+        return [int(seed) for seed in seed_generator.randint(0, 2**31 - 1, size=self.n_samples_gen)]
+
+    def quantify(self, instances: np.ndarray):
+        """Estimates the class prevalence values of the given instances.
+
+        :param instances: array whose first dimension indexes the test instances
+        :return: the corrected prevalence estimate, rescaled to sum up to 1
+        """
+        assert hasattr(self, 'val_data'), 'quantify called before fit'
+        pred_prevs = self.surrogate_quantifier.quantify(instances)
+        test_size = instances.shape[0]
+
+        samples = []
+        samples_pred_prevs = []
+        for seed_i in self._sample_seeds():
+            sample_i = self.val_data.sampling(test_size, *pred_prevs, random_state=seed_i)
+            pred_prev_sample_i = self.surrogate_quantifier.quantify(sample_i.X)
+            samples.append(sample_i)
+            samples_pred_prevs.append(pred_prev_sample_i)
+
+        # learn the mapping predicted prevalence -> true prevalence on all generated samples
+        reg = MultiOutputRegressor(SVR())
+        reg_X = samples_pred_prevs
+        reg_y = [s.prevalence() for s in samples]
+        reg.fit(reg_X, reg_y)
+
+        corrected_prev = reg.predict([pred_prevs])[0]
+
+        corrected_prev = self.normalize(corrected_prev)
+        return corrected_prev
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -11,9 +11,14 @@ import sys
 from os.path import join
 quapy_path = join(pathlib.Path(__file__).parents[2].resolve().as_posix(), 'quapy')
 wiki_path = join(pathlib.Path(__file__).parents[0].resolve().as_posix(), 'wiki')
+source_path = pathlib.Path(__file__).parents[2].resolve().as_posix()
 print(f'quapy path={quapy_path}')
+print(f'quapy source path={source_path}')
 sys.path.insert(0, quapy_path)
 sys.path.insert(0, wiki_path)
+sys.path.insert(0, source_path)
+
+print(sys.path)


 project = 'QuaPy: A Python-based open-source framework for quantification'
--- a/docs/source/manuals/methods.md
+++ b/docs/source/manuals/methods.md
@ -447,7 +447,7 @@ The [](quapy.method.composable) module allows the composition of quantification
 ```sh
 pip install --upgrade pip setuptools wheel
 pip install "jax[cpu]"
-pip install quapy[composable]
+pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
 ```

 ### Basics
--- a/examples/14.composable_methods.py
+++ b/examples/14.composable_methods.py
@ -2,6 +2,13 @@
 This example illustrates the composition of quantification methods from
 arbitrary loss functions and feature transformations. It will extend the basic
 example on the usage of quapy with this composition.
+
+This example requires the installation of qunfold, the back-end of QuaPy's
+composition module:
+
+    pip install --upgrade pip setuptools wheel
+    pip install "jax[cpu]"
+    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
 """

 import numpy as np
--- a/quapy/method/composable.py
+++ b/quapy/method/composable.py
@ -1,45 +1,57 @@
 """This module allows the composition of quantification methods from loss functions and feature transformations. This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold."""

-import qunfold
-from qunfold.quapy import QuaPyWrapper
-from qunfold.sklearn import CVClassifier
-from qunfold import (
-    LeastSquaresLoss, # losses
-    BlobelLoss,
-    EnergyLoss,
-    HellingerSurrogateLoss,
-    CombinedLoss,
-    TikhonovRegularization,
-    TikhonovRegularized,
-    ClassTransformer, # transformers
-    HistogramTransformer,
-    DistanceTransformer,
-    KernelTransformer,
-    EnergyKernelTransformer,
-    LaplacianKernelTransformer,
-    GaussianKernelTransformer,
-    GaussianRFFKernelTransformer,
-)
+_import_error_message = """qunfold, the back-end of quapy.method.composable, is not properly installed.

-__all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper
-    "ComposableQuantifier",
-    "CVClassifier",
-    "LeastSquaresLoss",
-    "BlobelLoss",
-    "EnergyLoss",
-    "HellingerSurrogateLoss",
-    "CombinedLoss",
-    "TikhonovRegularization",
-    "TikhonovRegularized",
-    "ClassTransformer",
-    "HistogramTransformer",
-    "DistanceTransformer",
-    "KernelTransformer",
-    "EnergyKernelTransformer",
-    "LaplacianKernelTransformer",
-    "GaussianKernelTransformer",
-    "GaussianRFFKernelTransformer",
-]
+To fix this error, call:
+
+    pip install --upgrade pip setuptools wheel
+    pip install "jax[cpu]"
+    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
+"""
+
+# Import the qunfold back-end and declare the public names of this module. If any of
+# the imports fails, the ImportError is re-raised with the installation instructions
+# of _import_error_message, chained to the original error.
+try:
+    import qunfold
+    from qunfold.quapy import QuaPyWrapper
+    from qunfold.sklearn import CVClassifier
+    from qunfold import (
+        LeastSquaresLoss, # losses
+        BlobelLoss,
+        EnergyLoss,
+        HellingerSurrogateLoss,
+        CombinedLoss,
+        TikhonovRegularization,
+        TikhonovRegularized,
+        ClassTransformer, # transformers
+        HistogramTransformer,
+        DistanceTransformer,
+        KernelTransformer,
+        EnergyKernelTransformer,
+        LaplacianKernelTransformer,
+        GaussianKernelTransformer,
+        GaussianRFFKernelTransformer,
+    )
+
+    __all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper
+        "ComposableQuantifier",
+        "CVClassifier",
+        "LeastSquaresLoss",
+        "BlobelLoss",
+        "EnergyLoss",
+        "HellingerSurrogateLoss",
+        "CombinedLoss",
+        "TikhonovRegularization",
+        "TikhonovRegularized",
+        "ClassTransformer",
+        "HistogramTransformer",
+        "DistanceTransformer",
+        "KernelTransformer",
+        "EnergyKernelTransformer",
+        "LaplacianKernelTransformer",
+        "GaussianKernelTransformer",
+        "GaussianRFFKernelTransformer",
+    ]
+except ImportError as e:
+    raise ImportError(_import_error_message) from e

 def ComposableQuantifier(loss, transformer, **kwargs):
    """A generic quantification / unfolding method that solves a linear system of equations.
--- a/setup.py
+++ b/setup.py
@ -125,7 +125,6 @@ setup(
    # projects.
    extras_require={  # Optional
       'bayes': ['jax', 'jaxlib', 'numpyro'],
-       'composable': ['qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4'],
       'neural': ['torch'],
       'tests': ['certifi'],
       'docs' : ['sphinx-rtd-theme', 'myst-parser'],