Merge branch 'devel' of github.com:HLT-ISTI/QuaPy into devel

2024-04-30 09:55:50 +02:00 · 2024-04-30 09:55:50 +02:00 · 817aab1d99
parent 7f39f4df66 b3860b3b83
commit 817aab1d99
8 changed files with 346 additions and 80 deletions
--- a/.gitignore
+++ b/.gitignore
@ -69,6 +69,9 @@ instance/
 # Scrapy stuff:
 .scrapy

+# vscode config:
+.vscode/
+
 # Sphinx documentation
 docs/_build/

@ -85,6 +88,11 @@ ipython_config.py
 # pyenv
 .python-version

+# poetry
+poetry.toml
+pyproject.toml 
+poetry.lock
+
 # pipenv
 #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 #   However, in case of collaboration, if having platform-specific dependencies or dependencies
--- a/examples/uci_experiments.py
+++ b/examples/uci_experiments.py
@ -29,12 +29,17 @@ def newLR():


 def calibratedLR():
-    return CalibratedClassifierCV(LogisticRegression(max_iter=1000, solver='lbfgs', n_jobs=-1))
+    return CalibratedClassifierCV(newLR())


 __C_range = np.logspace(-3, 3, 7)
-lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']}
-svmperf_params = {'classifier__C': __C_range}
+lr_params = {
+    'classifier__C': __C_range,
+    'classifier__class_weight': [None, 'balanced']
+}
+svmperf_params = {
+    'classifier__C': __C_range
+}


 def quantification_models():
--- a/examples/ucimulti_experiments.py
+++ b/examples/ucimulti_experiments.py
@ -0,0 +1,124 @@
+import pickle
+import os
+from time import time
+from collections import defaultdict
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+import quapy as qp
+from quapy.method.aggregative import PACC, EMQ, KDEyML
+from quapy.model_selection import GridSearchQ
+from quapy.protocol import UPP
+from pathlib import Path
+
+
+SEED = 1
+
+
+def newLR():
+    return LogisticRegression(max_iter=3000)
+
+# typical hyperparameters explored for Logistic Regression
+logreg_grid = {
+    'C': np.logspace(-3, 3, 7),
+    'class_weight': ['balanced', None]
+}
+
+def wrap_hyper(classifier_hyper_grid:dict):
+    return {'classifier__'+k:v for k, v in classifier_hyper_grid.items()}
+
+METHODS = [
+    ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)),
+    ('EMQ',  EMQ(newLR()), wrap_hyper(logreg_grid)),
+    # ('KDEy-ML',  KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}),
+]
+
+
+def show_results(result_path):
+    import pandas as pd
+    df = pd.read_csv(result_path+'.csv', sep='\t')
+    pd.set_option('display.max_columns', None)
+    pd.set_option('display.max_rows', None)
+    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True)
+    print(pv)
+
+def load_timings(result_path):
+    import pandas as pd
+    timings = defaultdict(lambda: {})
+    if not Path(result_path + '.csv').exists():
+        return timings
+
+    df = pd.read_csv(result_path+'.csv', sep='\t')
+    return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict()
+
+if __name__ == '__main__':
+
+    qp.environ['SAMPLE_SIZE'] = 500
+    qp.environ['N_JOBS'] = -1
+    n_bags_val = 250
+    n_bags_test = 1000
+    result_dir = f'results/ucimulti'
+
+    os.makedirs(result_dir, exist_ok=True)
+
+    global_result_path = f'{result_dir}/allmethods'
+    timings = load_timings(global_result_path)
+    with open(global_result_path + '.csv', 'wt') as csv:
+        csv.write(f'Method\tDataset\tMAE\tMRAE\tt_train\n')
+
+    for method_name, quantifier, param_grid in METHODS:
+
+        print('Init method', method_name)
+
+        with open(global_result_path + '.csv', 'at') as csv:
+
+            for dataset in qp.datasets.UCI_MULTICLASS_DATASETS:
+
+                print('init', dataset)
+
+                local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe')
+
+                if os.path.exists(local_result_path):
+                    print(f'result file {local_result_path} already exist; skipping')
+                    report = qp.util.load_report(local_result_path)
+
+                else:
+                    with qp.util.temp_seed(SEED):
+
+                        data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True)
+
+                        # model selection
+                        train, test = data.train_test
+                        train, val = train.split_stratified(random_state=SEED)
+
+                        protocol = UPP(val, repeats=n_bags_val)
+                        modsel = GridSearchQ(
+                            quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae'
+                        )
+                        
+                        t_init = time()
+                        try:
+                            modsel.fit(train)
+
+                            print(f'best params {modsel.best_params_}')
+                            print(f'best score {modsel.best_score_}')
+
+                            quantifier = modsel.best_model()
+                        except:
+                            print('something went wrong... trying to fit the default model')
+                            quantifier.fit(train)
+                        timings[method_name][dataset] = time() - t_init
+                        
+
+                        protocol = UPP(test, repeats=n_bags_test)
+                        report = qp.evaluation.evaluation_report(
+                            quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True
+                        )
+                        report.to_csv(local_result_path)
+
+                means = report.mean(numeric_only=True)
+                csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{timings[method_name][dataset]:.3f}\n')
+                csv.flush()
+
+    show_results(global_result_path)
--- a/quapy/data/_ifcb.py
+++ b/quapy/data/_ifcb.py
@ -1,20 +1,17 @@
 import os
 import pandas as pd
 import math
-
 from quapy.data import LabelledCollection
 from quapy.protocol import AbstractProtocol
 from pathlib import Path


 def get_sample_list(path_dir):
-    """Gets a sample list finding the csv files in a directory
+    """
+    Gets a sample list finding the csv files in a directory

-    Args:
-        path_dir (_type_): directory to look for samples
-
-    Returns:
-        _type_: list of samples
+    :param path_dir: directory to look for samples
+    :return: list of samples
    """
    samples = []
    for filename in sorted(os.listdir(path_dir)):
@ -23,18 +20,15 @@ def get_sample_list(path_dir):
    return samples


-def generate_modelselection_split(samples, split=0.3):
-    """This function generates a train/test split for model selection
+def generate_modelselection_split(samples, test_prop=0.3):
+    """This function generates a train/test partition for model selection
    without the use of random numbers so the split is always the same

-    Args:
-        samples (_type_): list of samples
-        split (float, optional): percentage saved for test. Defaults to 0.3.
-
-    Returns:
-        _type_: list of samples to use as train and list of samples to use as test
+    :param samples: list of samples
+    :param test_prop: float, proportion of samples held out for test. Defaults to 0.3.
+    :return: list of samples to use as train and list of samples to use as test
    """
-    num_items_to_pick = math.ceil(len(samples) * split)
+    num_items_to_pick = math.ceil(len(samples) * test_prop)
    step_size = math.floor(len(samples) / num_items_to_pick)
    test_indices = [i * step_size for i in range(num_items_to_pick)]
    test = [samples[i] for i in test_indices]
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -14,12 +14,17 @@ from quapy.util import download_file_if_not_exists, download_file, get_quapy_hom


 REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
-TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
+
+TWITTER_SENTIMENT_DATASETS_TEST = [
+    'gasp', 'hcr', 'omd', 'sanders',
    'semeval13', 'semeval14', 'semeval15', 'semeval16',
-                              'sst', 'wa', 'wb']
-TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
+    'sst', 'wa', 'wb',
+]
+TWITTER_SENTIMENT_DATASETS_TRAIN = [
+    'gasp', 'hcr', 'omd', 'sanders',
    'semeval', 'semeval16',
-                                 'sst', 'wa', 'wb']
+    'sst', 'wa', 'wb',
+]
 UCI_BINARY_DATASETS = [
    #'acute.a', 'acute.b',
    'balance.1', 
@ -44,14 +49,37 @@ UCI_BINARY_DATASETS = [
    'transfusion',
    'wdbc',
    'wine.1', 'wine.2', 'wine.3',
-                       'wine-q-red', 'wine-q-white',
-                       'yeast']
+    'wine-q-red',
+    'wine-q-white',
+    'yeast',
+]

-UCI_MULTICLASS_DATASETS = ['dry-bean',
+UCI_MULTICLASS_DATASETS = [
+    'dry-bean',
    'wine-quality',
    'academic-success',
    'digits',
-                           'letter']
+    'letter',
+    'abalone',
+    'obesity',
+    'nursery',
+    'yeast',
+    'hand_digits',
+    'satellite',
+    'shuttle',
+    'cmc',
+    'isolet',
+    'waveform-v1',
+    'molecular',
+    'poker_hand',
+    'connect-4',
+    'mhr',
+    'chess',
+    'page_block',
+    'phishing',
+    'image_seg',
+    'hcv',
+]

 LEQUA2022_VECTOR_TASKS = ['T1A', 'T1B']
 LEQUA2022_TEXT_TASKS = ['T2A', 'T2B']
@ -561,7 +589,13 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals
    return data


-def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset:
+def fetch_UCIMulticlassDataset(
+        dataset_name,
+        data_home=None,
+        min_test_split=0.3,
+        max_train_instances=25000,
+        min_class_support=100,
+        verbose=False) -> Dataset:
    """
    Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. 

@ -583,15 +617,28 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver
    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
        ~/quapy_data/ directory)
-    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param min_test_split: minimum proportion of instances to be included in the test set. This value is interpreted
+        as a minimum proportion, meaning that the real proportion could be higher in case the number of training
+        instances (a fraction 1-`min_test_split` of the total) surpasses `max_train_instances`. In such case, only
+        `max_train_instances` are taken for training, and the rest (irrespective of `min_test_split`) is taken for test.
+    :param max_train_instances: maximum number of instances to keep for training (defaults to 25000)
+    :param min_class_support: minimum number of instances per class. Classes with fewer instances
+        are discarded (default is 100)
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.Dataset` instance
    """
-    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose)
-    return Dataset(*data.split_stratified(1 - test_split, random_state=0), name=dataset_name)
+
+    data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, min_class_support, verbose=verbose)
+    n = len(data)
+    train_prop = (1.-min_test_split)
+    n_train = int(n*train_prop)
+    if n_train > max_train_instances:
+        train_prop = (max_train_instances / n)
+
+    return Dataset(*data.split_stratified(train_prop, random_state=0))


-def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection:
+def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, verbose=False) -> LabelledCollection:
    """
    Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`.

@ -613,7 +660,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
    :param dataset_name: a dataset name
    :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default
        ~/quapy_data/ directory)
-    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
+    :param test_split: proportion of instances to be included in the test set. The rest conforms the training set
+    :param min_class_support: minimum number of instances per class. Classes with fewer instances
+        are discarded (default is 100)
    :param verbose: set to True (default is False) to get information (stats) about the dataset
    :return: a :class:`quapy.data.base.LabelledCollection` instance
    """
@ -626,19 +675,57 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=
        data_home = get_quapy_home()
    
    identifiers = {
-        "dry-bean": 602,
-        "wine-quality": 186,
-        "academic-success": 697,
-        "digits": 80,
-        "letter": 59
+        'dry-bean': 602,
+        'wine-quality': 186,
+        'academic-success': 697,
+        'digits': 80,
+        'letter': 59,
+        'abalone': 1,
+        'obesity': 544,
+        'nursery': 76,
+        'yeast': 110,
+        'hand_digits': 81,
+        'satellite': 146,
+        'shuttle': 148,
+        'cmc': 30,
+        'isolet': 54,
+        'waveform-v1': 107,
+        'molecular': 69,
+        'poker_hand': 158,
+        'connect-4': 26,
+        'mhr': 863,
+        'chess': 23,
+        'page_block': 78,
+        'phishing': 379,
+        'image_seg': 147,
+        'hcv': 503,
    }
    
    full_names = {
-        "dry-bean": "Dry Bean Dataset",
-        "wine-quality": "Wine Quality",
-        "academic-success": "Predict students' dropout and academic success",
-        "digits": "Optical Recognition of Handwritten Digits",
-        "letter": "Letter Recognition"
+        'dry-bean': 'Dry Bean Dataset',
+        'wine-quality': 'Wine Quality',
+        'academic-success': 'Predict students\' dropout and academic success',
+        'digits': 'Optical Recognition of Handwritten Digits',
+        'letter': 'Letter Recognition',
+        'abalone': 'Abalone',
+        'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition',
+        'nursery': 'Nursery',
+        'yeast': 'Yeast',
+        'hand_digits': 'Pen-Based Recognition of Handwritten Digits',
+        'satellite': 'Statlog Landsat Satellite',
+        'shuttle': 'Statlog Shuttle',
+        'cmc': 'Contraceptive Method Choice',
+        'isolet': 'ISOLET',
+        'waveform-v1': 'Waveform Database Generator (Version 1)',
+        'molecular': 'Molecular Biology (Splice-junction Gene Sequences)',
+        'poker_hand': 'Poker Hand',
+        'connect-4': 'Connect-4',
+        'mhr': 'Maternal Health Risk',
+        'chess': 'Chess (King-Rook vs. King)',
+        'page_block': 'Page Blocks Classification',
+        'phishing': 'Website Phishing',
+        'image_seg': 'Statlog (Image Segmentation)',
+        'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients',
    }
    
    identifier = identifiers[dataset_name]
@ -649,14 +736,36 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=

    file = join(data_home, 'uci_multiclass', dataset_name+'.pkl')
    
-    def download(id):
-        data = fetch_ucirepo(id=id)
-        X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze()
+    def download(id, name):
+        df = fetch_ucirepo(id=id)
+
+        df.data.features = pd.get_dummies(df.data.features, drop_first=True)
+        X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
+
+        assert y.ndim == 1, 'more than one y'
+
        classes = np.sort(np.unique(y))
        y = np.searchsorted(classes, y)
        return LabelledCollection(X, y)

-    data = pickled_resource(file, download, identifier)
+    def filter_classes(data: LabelledCollection, min_ipc):
+        classes = data.classes_
+        # restrict classes to only those with at least min_ipc instances
+        classes = classes[data.counts() >= min_ipc]
+        # filter X and y keeping only datapoints belonging to valid classes
+        filter_idx = np.in1d(data.y, classes)
+        X, y = data.X[filter_idx], data.y[filter_idx]
+        # map classes to range(len(classes))
+        y = np.searchsorted(classes, y)
+        return LabelledCollection(X, y)
+
+    data = pickled_resource(file, download, identifier, dataset_name)
+    data = filter_classes(data, min_class_support)
+    if data.n_classes <= 2:
+        raise ValueError(
+            f'After filtering out classes with less than {min_class_support=} instances, the dataset {dataset_name} '
+            f'is no longer multiclass. Try a reducing this value.'
+        )

    if verbose:
        data.stats()
@ -746,8 +855,8 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
    information on this dataset, please follow the zenodo link).
    This dataset is based on the data available publicly at
    `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
-    The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
-    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
+    The dataset already comes with processed features.
+    The scripts used for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.

    The datasets are downloaded only once, and stored for fast reuse.

@ -803,7 +912,7 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No
    if for_model_selection:
        # In this case, return 70% of training data as the training set and 30% as the test set
        samples = get_sample_list(train_samples_path)
-        train, test = generate_modelselection_split(samples, split=0.3)
+        train, test = generate_modelselection_split(samples, test_prop=0.3)
        train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train)

        # Test prevalence is computed from class labels
--- a/quapy/method/_neural.py
+++ b/quapy/method/_neural.py
@ -21,7 +21,7 @@ class QuaNetTrainer(BaseQuantifier):
    Example:

    >>> import quapy as qp
-    >>> from quapy.method.meta import QuaNet
+    >>> from quapy.method.meta import QuaNet
    >>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet
    >>>
    >>> # use samples of 100 elements
--- a/quapy/method/aggregative.py
+++ b/quapy/method/aggregative.py
@ -593,6 +593,7 @@ class PACC(AggregativeSoftQuantifier):
        if self.norm not in ACC.NORMALIZATIONS:
            raise ValueError(f"unknown normalization; valid ones are {ACC.NORMALIZATIONS}")

+
    def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection):
        """
        Estimates the misclassification rates
--- a/quapy/util.py
+++ b/quapy/util.py
@ -6,6 +6,9 @@ import pickle
 import urllib
 from pathlib import Path
 from contextlib import ExitStack
+
+import pandas as pd
+
 import quapy as qp

 import numpy as np
@ -248,6 +251,28 @@ def _check_sample_size(sample_size):
    return sample_size


+def load_report(path, as_dict=False):
+    def str2prev_arr(strprev):
+        within = strprev.strip('[]').split()
+        float_list = [float(p) for p in within]
+        float_list[-1] = 1. - sum(float_list[:-1])
+        return np.asarray(float_list)
+
+    df = pd.read_csv(path, index_col=0)
+    df['true-prev'] = df['true-prev'].apply(str2prev_arr)
+    df['estim-prev'] = df['estim-prev'].apply(str2prev_arr)
+    if as_dict:
+        d = {}
+        for col in df.columns.values:
+            vals = df[col].values
+            if col in ['true-prev', 'estim-prev']:
+                vals = np.vstack(vals)
+            d[col] = vals
+        return d
+    else:
+        return df
+
+
 class EarlyStop:
    """
    A class implementing the early-stopping condition typically used for training neural networks.