Merge branch 'localstack' of gitea-s2i2s.isti.cnr.it:moreo/QuaPy into localstack

2024-09-26 16:06:01 +02:00 · 2024-09-26 16:06:01 +02:00 · 641228bf62
parent 4fa4540aab 04c1f286ce
commit 641228bf62
9 changed files with 317 additions and 75 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -20,15 +20,16 @@ jobs:
    env:
      QUAPY_TESTS_OMIT_LARGE_DATASETS: True
    steps:
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip setuptools wheel
-        python -m pip install -e .[bayes,composable,tests]
+        python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
        python -m pip install -e .[bayes,tests]
    - name: Test with unittest
      run: python -m unittest
@ -38,15 +39,18 @@ jobs:
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/master'
    steps:
-    - uses: actions/checkout@v1
+    - uses: actions/checkout@v4
-    - name: Build documentation
+    - name: Set up Python
-      uses: ammaraskar/sphinx-action@master
+      uses: actions/setup-python@v5
      with:
-        pre-build-command: |
+        python-version: 3.11
-          apt-get --allow-releaseinfo-change update -y && apt-get install -y git && git --version
+    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip setuptools wheel "jax[cpu]"
-          python -m pip install -e .[composable,neural,docs]
+        python -m pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
-        docs-folder: "docs/"
+        python -m pip install -e .[neural,docs]
    - name: Build documentation
      run: sphinx-build -M html docs/source docs/build
    - name: Publish documentation
      run: |
        git clone ${{ github.server_url }}/${{ github.repository }}.git --branch gh-pages --single-branch __gh-pages/
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@ -1,23 +0,0 @@
 name: Pylint
 on: [push]
 jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install pylint
    - name: Analysing the code with pylint
      run: |
        pylint $(git ls-files '*.py')
--- a/LocalStack/experiments.py
+++ b/LocalStack/experiments.py
@ -0,0 +1,126 @@
 import os
 from time import time
 import numpy as np
 from sklearn.linear_model import LogisticRegression
 import quapy as qp
 from KDEy.kdey_devel import KDEyMLauto, KDEyMLauto2, KDEyMLred
 from LocalStack.method import LocalStackingQuantification, LocalStackingQuantification2
 from quapy.method.aggregative import PACC, EMQ, KDEyML
 from quapy.model_selection import GridSearchQ
 from quapy.protocol import UPP
 from pathlib import Path
 # Seed used below both for the seeded context that wraps each experiment and for the
 # stratified train/validation split performed when model selection is required.
 SEED = 1
 # Each entry is a (method name, quantifier instance, hyper-parameter grid) triple.
 # An empty grid means that the quantifier is fitted directly, without model selection.
 METHODS = [
    ('PACC', PACC(), {}),
    ('EMQ', EMQ(), {}),
    ('KDEy-ML',  KDEyML(), {}),
 ]
 # Same triple layout as METHODS. The methods listed here never go through model selection
 # in the main script; both wrap an EMQ instance as their surrogate quantifier.
 TRANSDUCTIVE_METHODS = [
    ('LSQ',  LocalStackingQuantification(EMQ()), {}),
    ('LSQ2',  LocalStackingQuantification2(EMQ()), {})
 ]
 def show_results(result_path):
    """Print one pivot table (datasets as rows, methods as columns) per reported measure.

    The tab-separated file ``result_path + '.csv'`` is loaded and, for each of the columns
    MAE, MRAE, KLD, TR-TIME and TE-TIME (in this order), a pivot table that includes the
    margins is printed to the standard output.

    :param result_path: path to the results file, without the '.csv' extension
    """
    import pandas as pd

    results = pd.read_csv(result_path + '.csv', sep='\t')

    # lift the display limits of pandas so that the tables are printed in full
    display_options = (
        ('display.max_columns', None),
        ('display.max_rows', None),
        ('display.width', 1000),  # adjust the maximum width
    )
    for option, value in display_options:
        pd.set_option(option, value)

    for measure in ("MAE", "MRAE", "KLD", "TR-TIME", "TE-TIME"):
        table = results.pivot_table(index='Dataset', columns="Method", values=[measure], margins=True)
        print(table)
 if __name__ == '__main__':
    # Runs every method on every UCI multiclass dataset, stores one report per
    # (method, dataset) pair, and appends the mean of every measure to a global summary file.

    # global QuaPy settings (sample size and number of parallel jobs)
    qp.environ['SAMPLE_SIZE'] = 500
    qp.environ['N_JOBS'] = -1
    # number of repetitions of the protocol used for model selection and for testing
    n_bags_val = 25
    n_bags_test = 100
    result_dir = 'results_quantification/localstack'
    os.makedirs(result_dir, exist_ok=True)
    global_result_path = f'{result_dir}/allmethods'
    # names of the methods that never go through model selection; computed once, outside the loops
    transductive_names = {name for (name, *_) in TRANSDUCTIVE_METHODS}
    # (re)create the summary file with its header; the rows are appended below
    with open(global_result_path + '.csv', 'wt') as summary:
        summary.write('Method\tDataset\tMAE\tMRAE\tKLD\tTR-TIME\tTE-TIME\n')
    for method_name, quantifier, param_grid in METHODS + TRANSDUCTIVE_METHODS:
        print('Init method', method_name)
        with open(global_result_path + '.csv', 'at') as summary:
            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
                print('init', dataset)
                local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe')
                if os.path.exists(local_result_path):
                    # reuse the report of a previous run instead of recomputing it
                    print(f'result file {local_result_path} already exist; skipping')
                    report = qp.util.load_report(local_result_path)
                else:
                    with qp.util.temp_seed(SEED):
                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
                        train, test = data.train_test
                        # the model evaluated for this dataset; kept in a separate name so that the
                        # outcome of model selection does not leak into the next dataset
                        model = quantifier
                        if method_name in transductive_names or len(param_grid) == 0:
                            # no model selection: the method is transductive or has nothing to tune
                            t_init = time()
                            model.fit(train)
                            train_time = time() - t_init
                        else:
                            # model selection (train)
                            train, val = train.split_stratified(random_state=SEED)
                            protocol = UPP(val, repeats=n_bags_val)
                            modsel = GridSearchQ(
                                quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
                            )
                            t_init = time()
                            try:
                                modsel.fit(train)
                                print(f'best params {modsel.best_params_}')
                                print(f'best score {modsel.best_score_}')
                                model = modsel.best_model()
                            except Exception as e:
                                # best-effort fallback; unlike a bare except, this lets
                                # KeyboardInterrupt and SystemExit abort the experiments
                                print(f'model selection failed with {e!r}')
                                print('something went wrong... trying to fit the default model')
                                model.fit(train)
                            train_time = time() - t_init
                        # test
                        t_init = time()
                        protocol = UPP(test, repeats=n_bags_test)
                        report = qp.evaluation.evaluation_report(
                            model, protocol, error_metrics=['mae', 'mrae', 'kld'], verbose=True
                        )
                        test_time = time() - t_init
                        report['tr_time'] = train_time
                        report['te_time'] = test_time
                        report.to_csv(local_result_path)
                means = report.mean(numeric_only=True)
                summary.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\t{means["tr_time"]:.3f}\t{means["te_time"]:.3f}\n')
                # flush after every row so that partial results survive an interrupted run
                summary.flush()
    show_results(global_result_path)
--- a/LocalStack/method.py
+++ b/LocalStack/method.py
@ -0,0 +1,112 @@
 import numpy as np
 import quapy as qp
 from sklearn.multioutput import MultiOutputRegressor
 from sklearn.svm import SVR
 from data import LabelledCollection
 from quapy.method.base import BaseQuantifier
 from quapy.method.aggregative import AggregativeSoftQuantifier
 class LocalStackingQuantification(BaseQuantifier):
    """
    Quantifier that corrects the estimate of a surrogate quantifier with a regressor fitted at prediction time.

    When quantifying, `n_samples_gen` samples are drawn from the held-out part of the training data, the
    surrogate is applied to each of them, and the `n_samples_sel` samples whose predicted prevalence is
    closest (according to `comparison_measure`) to the one predicted for the test instances are retained.
    A multi-output SVR is then fitted to map the predicted prevalences of the retained samples onto the
    values returned by their `prevalence()` method, and is used to correct the prediction for the test
    instances.

    :param surrogate_quantifier: an instance of AggregativeSoftQuantifier
    :param n_samples_gen: number of samples to generate at quantification time
    :param n_samples_sel: number of generated samples retained for fitting the regressor
    :param comparison_measure: name of the error measure, resolved via `qp.error.from_name`
    :param random_state: seed for the train/held-out split and for the generation of samples;
        None (default) leaves both unseeded
    """
    def __init__(self, surrogate_quantifier, n_samples_gen=200, n_samples_sel=50, comparison_measure='ae', random_state=None):
        # `__name__` (and not `__class__.__name__`, which is the name of the metaclass) is the class name
        assert isinstance(surrogate_quantifier, AggregativeSoftQuantifier), \
            f'the surrogate quantifier must be of type {AggregativeSoftQuantifier.__name__}'
        self.surrogate_quantifier = surrogate_quantifier
        self.n_samples_gen = n_samples_gen
        self.n_samples_sel = n_samples_sel
        self.comparison_measure = qp.error.from_name(comparison_measure)
        self.random_state = random_state
    def fit(self, data: LabelledCollection):
        """
        Trains the surrogate quantifier on one part of `data` and keeps the other part for sampling.

        :param data: the training collection
        :return: self
        """
        train, val = data.split_stratified(random_state=self.random_state)
        self.surrogate_quantifier.fit(train)
        self.val_data = val
        return self
    def normalize(self, out_simplex:np.ndarray):
        """
        Maps a vector of regressor outputs onto the probability simplex.

        Negative values are clipped to zero before rescaling; if nothing positive remains, the
        uniform distribution is returned (instead of dividing by zero).

        :param out_simplex: array of (possibly negative or unnormalized) values
        :return: array of non-negative values that sum up to 1
        """
        clipped = np.clip(np.asarray(out_simplex, dtype=float), 0, None)
        if clipped.size == 0:
            return clipped
        total = clipped.sum()
        if total <= 0:
            return np.full(clipped.shape, 1 / clipped.size)
        in_simplex = clipped / total
        return in_simplex
    def _sampling_seeds(self):
        """Returns one seed per sample to generate (all None if no random state was given)."""
        if self.random_state is None:
            return [None] * self.n_samples_gen
        # NOTE(review): assumes `sampling` is deterministic for a given non-None random_state, in
        # which case reusing the same seed would yield identical samples -- confirm against
        # LabelledCollection.sampling
        rng = np.random.RandomState(self.random_state)
        return [int(seed) for seed in rng.randint(np.iinfo(np.int32).max, size=self.n_samples_gen)]
    def quantify(self, instances: np.ndarray):
        """
        Estimates the class prevalence values of `instances`.

        :param instances: array-like with the test instances
        :return: array with the corrected (and normalized) prevalence values
        """
        assert hasattr(self, 'val_data'), 'quantify called before fit'
        pred_prevs = self.surrogate_quantifier.quantify(instances)
        test_size = instances.shape[0]
        samples = []
        samples_pred_prevs = []
        samples_distance = []
        for seed in self._sampling_seeds():
            sample_i = self.val_data.sampling(test_size, *pred_prevs, random_state=seed)
            pred_prev_sample_i = self.surrogate_quantifier.quantify(sample_i.X)
            err_dist = self.comparison_measure(pred_prevs, pred_prev_sample_i)
            samples.append(sample_i)
            samples_pred_prevs.append(pred_prev_sample_i)
            samples_distance.append(err_dist)
        # indexes of the generated samples whose predicted prevalence is closest to the test one
        closest = np.argsort(samples_distance)[:self.n_samples_sel]
        # plain list indexing: avoids packing the sample objects into a numpy array
        samples_sel = [samples[i] for i in closest]
        samples_pred_prevs_sel = np.asarray(samples_pred_prevs)[closest]
        reg = MultiOutputRegressor(SVR())
        reg_X = samples_pred_prevs_sel
        reg_y = [s.prevalence() for s in samples_sel]
        reg.fit(reg_X, reg_y)
        corrected_prev = reg.predict([pred_prevs])[0]
        corrected_prev = self.normalize(corrected_prev)
        return corrected_prev
 class LocalStackingQuantification2(BaseQuantifier):
    """
    Variant that, instead of selecting the training samples whose predicted prevalence resembles the
    prevalence predicted for the test set, directly draws training samples with the prevalence predicted
    for the test set and uses all of them to fit the correcting regressor.

    :param surrogate_quantifier: an instance of AggregativeSoftQuantifier
    :param n_samples_gen: number of samples to generate at quantification time
    :param n_samples_sel: stored but not used by this variant
    :param comparison_measure: name of an error measure, resolved via `qp.error.from_name`; stored
        but not used by this variant
    :param random_state: seed for the train/held-out split and for the generation of samples;
        None (default) leaves both unseeded
    """
    def __init__(self, surrogate_quantifier, n_samples_gen=200, n_samples_sel=50, comparison_measure='ae', random_state=None):
        # `__name__` (and not `__class__.__name__`, which is the name of the metaclass) is the class name
        assert isinstance(surrogate_quantifier, AggregativeSoftQuantifier), \
            f'the surrogate quantifier must be of type {AggregativeSoftQuantifier.__name__}'
        self.surrogate_quantifier = surrogate_quantifier
        self.n_samples_gen = n_samples_gen
        self.n_samples_sel = n_samples_sel
        self.comparison_measure = qp.error.from_name(comparison_measure)
        self.random_state = random_state
    def fit(self, data: LabelledCollection):
        """
        Trains the surrogate quantifier on one part of `data` and keeps the other part for sampling.

        :param data: the training collection
        :return: self
        """
        train, val = data.split_stratified(random_state=self.random_state)
        self.surrogate_quantifier.fit(train)
        self.val_data = val
        return self
    def normalize(self, out_simplex:np.ndarray):
        """
        Maps a vector of regressor outputs onto the probability simplex.

        Negative values are clipped to zero before rescaling; if nothing positive remains, the
        uniform distribution is returned (instead of dividing by zero).

        :param out_simplex: array of (possibly negative or unnormalized) values
        :return: array of non-negative values that sum up to 1
        """
        clipped = np.clip(np.asarray(out_simplex, dtype=float), 0, None)
        if clipped.size == 0:
            return clipped
        total = clipped.sum()
        if total <= 0:
            return np.full(clipped.shape, 1 / clipped.size)
        in_simplex = clipped / total
        return in_simplex
    def _sampling_seeds(self):
        """Returns one seed per sample to generate (all None if no random state was given)."""
        if self.random_state is None:
            return [None] * self.n_samples_gen
        # NOTE(review): assumes `sampling` is deterministic for a given non-None random_state, in
        # which case reusing the same seed would yield identical samples -- confirm against
        # LabelledCollection.sampling
        rng = np.random.RandomState(self.random_state)
        return [int(seed) for seed in rng.randint(np.iinfo(np.int32).max, size=self.n_samples_gen)]
    def quantify(self, instances: np.ndarray):
        """
        Estimates the class prevalence values of `instances`.

        :param instances: array-like with the test instances
        :return: array with the corrected (and normalized) prevalence values
        """
        assert hasattr(self, 'val_data'), 'quantify called before fit'
        pred_prevs = self.surrogate_quantifier.quantify(instances)
        test_size = instances.shape[0]
        samples = []
        samples_pred_prevs = []
        for seed in self._sampling_seeds():
            sample_i = self.val_data.sampling(test_size, *pred_prevs, random_state=seed)
            pred_prev_sample_i = self.surrogate_quantifier.quantify(sample_i.X)
            samples.append(sample_i)
            samples_pred_prevs.append(pred_prev_sample_i)
        # every generated sample is used to fit the regressor (no selection step in this variant)
        reg = MultiOutputRegressor(SVR())
        reg_X = samples_pred_prevs
        reg_y = [s.prevalence() for s in samples]
        reg.fit(reg_X, reg_y)
        corrected_prev = reg.predict([pred_prevs])[0]
        corrected_prev = self.normalize(corrected_prev)
        return corrected_prev
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@ -11,9 +11,14 @@ import sys
 from os.path import join
 # Locations made importable for the documentation build, all derived from the location of
 # this file: the 'quapy' folder two levels above this file's directory, the 'wiki' folder
 # next to this file, and the directory two levels above this file's directory itself.
 quapy_path = join(pathlib.Path(__file__).parents[2].resolve().as_posix(), 'quapy')
 wiki_path = join(pathlib.Path(__file__).parents[0].resolve().as_posix(), 'wiki')
 source_path = pathlib.Path(__file__).parents[2].resolve().as_posix()
 print(f'quapy path={quapy_path}')
 print(f'quapy source path={source_path}')
 # every path is inserted at position 0, hence the last one inserted (source_path) is searched first
 sys.path.insert(0, quapy_path)
 sys.path.insert(0, wiki_path)
 sys.path.insert(0, source_path)
 # printed to help debugging the import path in the build logs
 print(sys.path)
 project = 'QuaPy: A Python-based open-source framework for quantification'
--- a/docs/source/manuals/methods.md
+++ b/docs/source/manuals/methods.md
@ -447,7 +447,7 @@ The [](quapy.method.composable) module allows the composition of quantification
 ```sh
 pip install --upgrade pip setuptools wheel
 pip install "jax[cpu]"
-pip install quapy[composable]
+pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
 ```
 ### Basics
--- a/examples/14.composable_methods.py
+++ b/examples/14.composable_methods.py
@ -2,6 +2,13 @@
 This example illustrates the composition of quantification methods from
 arbitrary loss functions and feature transformations. It will extend the basic
 example on the usage of quapy with this composition.
 This example requires the installation of qunfold, the back-end of QuaPy's
 composition module:
    pip install --upgrade pip setuptools wheel
    pip install "jax[cpu]"
    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
 """
 import numpy as np
--- a/quapy/method/composable.py
+++ b/quapy/method/composable.py
@ -1,9 +1,19 @@
 """This module allows the composition of quantification methods from loss functions and feature transformations. This functionality is realized through an integration of the qunfold package: https://github.com/mirkobunse/qunfold."""
-import qunfold
+_import_error_message = """qunfold, the back-end of quapy.method.composable, is not properly installed.
-from qunfold.quapy import QuaPyWrapper
+
-from qunfold.sklearn import CVClassifier
+To fix this error, call:
-from qunfold import (
+
    pip install --upgrade pip setuptools wheel
    pip install "jax[cpu]"
    pip install "qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4"
 """
 try:
    import qunfold
    from qunfold.quapy import QuaPyWrapper
    from qunfold.sklearn import CVClassifier
    from qunfold import (
        LeastSquaresLoss, # losses
        BlobelLoss,
        EnergyLoss,
@ -19,9 +29,9 @@ from qunfold import (
        LaplacianKernelTransformer,
        GaussianKernelTransformer,
        GaussianRFFKernelTransformer,
-)
+    )
-__all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper
+    __all__ = [ # control public members, e.g., for auto-documentation in sphinx; omit QuaPyWrapper
        "ComposableQuantifier",
        "CVClassifier",
        "LeastSquaresLoss",
@ -39,7 +49,9 @@ __all__ = [ # control public members, e.g., for auto-documentation in sphinx; om
        "LaplacianKernelTransformer",
        "GaussianKernelTransformer",
        "GaussianRFFKernelTransformer",
-]
+    ]
 except ImportError as e:
    raise ImportError(_import_error_message) from e
 def ComposableQuantifier(loss, transformer, **kwargs):
    """A generic quantification / unfolding method that solves a linear system of equations.
--- a/setup.py
+++ b/setup.py
@ -125,7 +125,6 @@ setup(
    # projects.
    extras_require={  # Optional
       'bayes': ['jax', 'jaxlib', 'numpyro'],
       'composable': ['qunfold @ git+https://github.com/mirkobunse/qunfold@v0.1.4'],
       'neural': ['torch'],
       'tests': ['certifi'],
       'docs' : ['sphinx-rtd-theme', 'myst-parser'],