From f5603135a7e9d8f7c1f0fa4b599401a21a57e03d Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Thu, 11 Apr 2024 20:07:59 +0200 Subject: [PATCH 01/11] Excluded vscode config files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 8eaff3e..5a3d613 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,9 @@ instance/ # Scrapy stuff: .scrapy +# vscode config: +.vscode/ + # Sphinx documentation docs/_build/ From f69fca32b4e92638373103ae0f858a50a66971cb Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Thu, 11 Apr 2024 20:08:52 +0200 Subject: [PATCH 02/11] Added UCI multiclass datasets; added filter for min instances per class to UCI multiclass datasets --- quapy/data/datasets.py | 211 +++++++++++++++++++++++++++++++---------- 1 file changed, 162 insertions(+), 49 deletions(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 5b9806f..72ee924 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -14,41 +14,76 @@ from quapy.util import download_file_if_not_exists, download_file, get_quapy_hom REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb'] -TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders', - 'semeval13', 'semeval14', 'semeval15', 'semeval16', - 'sst', 'wa', 'wb'] -TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders', - 'semeval', 'semeval16', - 'sst', 'wa', 'wb'] -UCI_BINARY_DATASETS = ['acute.a', 'acute.b', - 'balance.1', 'balance.2', 'balance.3', - 'breast-cancer', - 'cmc.1', 'cmc.2', 'cmc.3', - 'ctg.1', 'ctg.2', 'ctg.3', - #'diabetes', # <-- I haven't found this one... - 'german', - 'haberman', - 'ionosphere', - 'iris.1', 'iris.2', 'iris.3', - 'mammographic', - 'pageblocks.5', - #'phoneme', # <-- I haven't found this one... 
- 'semeion', - 'sonar', - 'spambase', - 'spectf', - 'tictactoe', - 'transfusion', - 'wdbc', - 'wine.1', 'wine.2', 'wine.3', - 'wine-q-red', 'wine-q-white', - 'yeast'] +TWITTER_SENTIMENT_DATASETS_TEST = [ + 'gasp', 'hcr', 'omd', 'sanders', + 'semeval13', 'semeval14', 'semeval15', 'semeval16', + 'sst', 'wa', 'wb', +] +TWITTER_SENTIMENT_DATASETS_TRAIN = [ + 'gasp', 'hcr', 'omd', 'sanders', + 'semeval', 'semeval16', + 'sst', 'wa', 'wb', +] +UCI_BINARY_DATASETS = [ + 'acute.a', 'acute.b', + 'balance.1', 'balance.2', 'balance.3', + 'breast-cancer', + 'cmc.1', 'cmc.2', 'cmc.3', + 'ctg.1', 'ctg.2', 'ctg.3', + #'diabetes', # <-- I haven't found this one... + 'german', + 'haberman', + 'ionosphere', + 'iris.1', 'iris.2', 'iris.3', + 'mammographic', + 'pageblocks.5', + #'phoneme', # <-- I haven't found this one... + 'semeion', + 'sonar', + 'spambase', + 'spectf', + 'tictactoe', + 'transfusion', + 'wdbc', + 'wine.1', 'wine.2', 'wine.3', + 'wine-q-red', + 'wine-q-white', + 'yeast', +] -UCI_MULTICLASS_DATASETS = ['dry-bean', - 'wine-quality', - 'academic-success', - 'digits', - 'letter'] +UCI_MULTICLASS_DATASETS = [ + 'dry-bean', + 'wine-quality', + 'academic-success', + 'digits', + 'letter', + 'abalone', + 'obesity', + 'covertype', + 'nursery', + 'diabetes', + 'yeast', + 'hand_digits', + 'satellite', + 'shuttle', + 'cmc', + 'isolet', + 'waveform.v1', + 'molecular', + 'poker_hand', + 'connect-4', + 'cardiotocography', + 'mhr', + 'chess2', + 'page_block', + 'room', + 'phishing2', + 'rt-iot22', + 'support2', + 'image_seg', + 'steel_plates', + 'hcv', +] LEQUA2022_TASKS = ['T1A', 'T1B', 'T2A', 'T2B'] @@ -556,7 +591,7 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals return data -def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False) -> Dataset: +def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False, min_ipc=100) -> Dataset: """ Loads a UCI multiclass dataset as an instance of 
:class:`quapy.data.base.Dataset`. @@ -580,13 +615,15 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver ~/quay_data/ directory) :param test_split: proportion of documents to be included in the test set. The rest conforms the training set :param verbose: set to True (default is False) to get information (stats) about the dataset + :param min_ipc: minimum number of istances per class. Classes with less instances than min_ipc are discarded + (deafult is 100) :return: a :class:`quapy.data.base.Dataset` instance """ - data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose) + data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose, min_ipc) return Dataset(*data.split_stratified(1 - test_split, random_state=0)) -def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False) -> LabelledCollection: +def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False, min_ipc=100) -> LabelledCollection: """ Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. @@ -610,6 +647,8 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= ~/quay_data/ directory) :param test_split: proportion of documents to be included in the test set. The rest conforms the training set :param verbose: set to True (default is False) to get information (stats) about the dataset + :param min_ipc: minimum number of istances per class. 
Classes with less instances than min_ipc are discarded + (deafult is 100) :return: a :class:`quapy.data.base.LabelledCollection` instance """ assert dataset_name in UCI_MULTICLASS_DATASETS, \ @@ -621,19 +660,71 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= data_home = get_quapy_home() identifiers = { - "dry-bean": 602, - "wine-quality": 186, - "academic-success": 697, - "digits": 80, - "letter": 59 + 'dry-bean': 602, + 'wine-quality': 186, + 'academic-success': 697, + 'digits': 80, + 'letter': 59, + 'abalone': 1, + 'obesity': 544, + 'covertype': 31, + 'nursery': 76, + 'diabetes': 296, + 'yeast': 110, + 'hand_digits': 81, + 'satellite': 146, + 'shuttle': 148, + 'cmc': 30, + 'isolet': 54, + 'waveform.v1': 107, + 'molecular': 69, + 'poker_hand': 158, + 'connect-4': 26, + 'cardiotocography': 193, + 'mhr': 863, + 'chess2': 23, + 'page_block': 78, + 'room': 864, + 'phishing2': 379, + 'rt-iot22': 942, + 'support2': 880, + 'image_seg': 147, + 'steel_plates': 198, + 'hcv': 503, } full_names = { - "dry-bean": "Dry Bean Dataset", - "wine-quality": "Wine Quality", - "academic-success": "Predict students' dropout and academic success", - "digits": "Optical Recognition of Handwritten Digits", - "letter": "Letter Recognition" + 'dry-bean': 'Dry Bean Dataset', + 'wine-quality': 'Wine Quality', + 'academic-success': 'Predict students\' dropout and academic success', + 'digits': 'Optical Recognition of Handwritten Digits', + 'letter': 'Letter Recognition', + 'abalone': 'Abalone', + 'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition', + 'covertype': 'Covertype', + 'nursery': 'Nursery', + 'diabetes': 'Diabetes 130-US Hospitals for Years 1999-2008', + 'yeast': 'Yeast', + 'hand_digits': 'Pen-Based Recognition of Handwritten Digits', + 'satellite': 'Statlog Landsat Satellite', + 'shuttle': 'Statlog Shuttle', + 'cmc': 'Contraceptive Method Choice', + 'isolet': 'ISOLET', + 'waveform.v1': 'Waveform Database Generator 
(Version 1)', + 'molecular': 'Molecular Biology (Splice-junction Gene Sequences)', + 'poker_hand': 'Poker Hand', + 'connect-4': 'Connect-4', + 'cardiotocography': 'Cardiotocography', + 'mhr': 'Maternal Health Risk', + 'chess2': 'Chess (King-Rook vs. King)', + 'page_block': 'Page Blocks Classification', + 'room': 'Room Occupancy Estimation', + 'phishing2': 'Website Phishing', + 'rt-iot22': 'RT-IoT2022', + 'support2': 'SUPPORT2', + 'image_seg': 'Statlog (Image Segmentation)', + 'steel_plates': 'Steel Plates Faults', + 'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients', } identifier = identifiers[dataset_name] @@ -644,14 +735,36 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= file = join(data_home, 'uci_multiclass', dataset_name+'.pkl') - def download(id): + def download(id, name): data = fetch_ucirepo(id=id) X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() + # classes represented as arrays are transformed to tuples to treat them as signle objects + if name == 'support2': + y[:, 2] = np.fromiter((str(elm) for elm in y[:, 2]), dtype='object') + if y.ndim > 1: + y = np.fromiter((tuple(elm) for elm in y), dtype='object') classes = np.sort(np.unique(y)) y = np.searchsorted(classes, y) return LabelledCollection(X, y) - data = pickled_resource(file, download, identifier) + def filter_classes(data: LabelledCollection, min_ipc): + classes = data.classes_ + # restrict classes to only those with at least min_ipc instances + classes = classes[data.counts() >= min_ipc] + # filter X and y keeping only datapoints belonging to valid classes + filter_idx = np.in1d(data.y, classes) + X, y = data.X[filter_idx], data.y[filter_idx] + # map classes to range(len(classes)) + y = np.searchsorted(classes, y) + return LabelledCollection(X, y) + + data = pickled_resource(file, download, identifier, dataset_name) + data = filter_classes(data, min_ipc) + if data.n_classes <= 2: + raise ValueError( + f'Dataset 
{dataset_name} has too few valid classes to be multiclass with {min_ipc=}. ' + 'Try a lower value for min_ipc.' + ) if verbose: data.stats() From b53d41724091f8b3cc2f5ee6fbb7abd54405a5d0 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Fri, 12 Apr 2024 13:35:13 +0200 Subject: [PATCH 03/11] merged --- quapy/data/_ifcb.py | 26 ++++++++++---------------- quapy/data/datasets.py | 7 ++++--- quapy/method/aggregative.py | 2 +- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/quapy/data/_ifcb.py b/quapy/data/_ifcb.py index f862ed0..d5c1bdf 100644 --- a/quapy/data/_ifcb.py +++ b/quapy/data/_ifcb.py @@ -1,20 +1,17 @@ import os import pandas as pd import math - from quapy.data import LabelledCollection from quapy.protocol import AbstractProtocol from pathlib import Path def get_sample_list(path_dir): - """Gets a sample list finding the csv files in a directory + """ + Gets a sample list finding the csv files in a directory - Args: - path_dir (_type_): directory to look for samples - - Returns: - _type_: list of samples + :param path_dir: directory to look for samples + :return: list of samples """ samples = [] for filename in sorted(os.listdir(path_dir)): @@ -23,18 +20,15 @@ def get_sample_list(path_dir): return samples -def generate_modelselection_split(samples, split=0.3): - """This function generates a train/test split for model selection +def generate_modelselection_split(samples, test_prop=0.3): + """This function generates a train/test partition for model selection without the use of random numbers so the split is always the same - Args: - samples (_type_): list of samples - split (float, optional): percentage saved for test. Defaults to 0.3. - - Returns: - _type_: list of samples to use as train and list of samples to use as test + :param samples: list of samples + :param test_prop: float, percentage saved for test. Defaults to 0.3. 
+ :return: list of samples to use as train and list of samples to use as test """ - num_items_to_pick = math.ceil(len(samples) * split) + num_items_to_pick = math.ceil(len(samples) * test_prop) step_size = math.floor(len(samples) / num_items_to_pick) test_indices = [i * step_size for i in range(num_items_to_pick)] test = [samples[i] for i in test_indices] diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 5b9806f..bcbdb0e 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -735,14 +735,15 @@ def fetch_lequa2022(task, data_home=None): return train, val_gen, test_gen + def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): """ Loads the IFCB dataset for quantification from `Zenodo `_ (for more information on this dataset, please follow the zenodo link). This dataset is based on the data available publicly at `WHOI-Plankton repo `_. - The scripts for the processing are available at `P. González's repo `_. - Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms. + The dataset already comes with processed features. + The scripts used for the processing are available at `P. González's repo `_. The datasets are downloaded only once, and stored for fast reuse. 
@@ -798,7 +799,7 @@ def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=No if for_model_selection: # In this case, return 70% of training data as the training set and 30% as the test set samples = get_sample_list(train_samples_path) - train, test = generate_modelselection_split(samples, split=0.3) + train, test = generate_modelselection_split(samples, test_prop=0.3) train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes, samples=train) # Test prevalence is computed from class labels diff --git a/quapy/method/aggregative.py b/quapy/method/aggregative.py index 46e56d7..2f3fab5 100644 --- a/quapy/method/aggregative.py +++ b/quapy/method/aggregative.py @@ -577,7 +577,7 @@ class PACC(AggregativeSoftQuantifier): raise ValueError(f"unknown solver; valid ones are {ACC.SOLVERS}") if self.method not in ACC.METHODS: raise ValueError(f"unknown method; valid ones are {ACC.METHODS}") - if self.clipping not in ACC.NORMALIZATIONS: + if self.norm not in ACC.NORMALIZATIONS: raise ValueError(f"unknown clipping; valid ones are {ACC.NORMALIZATIONS}") def aggregation_fit(self, classif_predictions: LabelledCollection, data: LabelledCollection): From 4abec6629b3aa5438fe12c585a996c23beb630ed Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Fri, 12 Apr 2024 18:08:00 +0200 Subject: [PATCH 04/11] integrating more uci-multiclass datasets --- examples/uci_experiments.py | 11 ++- examples/ucimulti_experiments.py | 113 +++++++++++++++++++++++++++++++ quapy/data/datasets.py | 34 ++++++---- quapy/method/_neural.py | 2 +- quapy/util.py | 25 +++++++ 5 files changed, 167 insertions(+), 18 deletions(-) create mode 100644 examples/ucimulti_experiments.py diff --git a/examples/uci_experiments.py b/examples/uci_experiments.py index 07db7cd..b452feb 100644 --- a/examples/uci_experiments.py +++ b/examples/uci_experiments.py @@ -29,12 +29,17 @@ def newLR(): def calibratedLR(): - return CalibratedClassifierCV(LogisticRegression(max_iter=1000, 
solver='lbfgs', n_jobs=-1)) + return CalibratedClassifierCV(newLR()) __C_range = np.logspace(-3, 3, 7) -lr_params = {'classifier__C': __C_range, 'classifier__class_weight': [None, 'balanced']} -svmperf_params = {'classifier__C': __C_range} +lr_params = { + 'classifier__C': __C_range, + 'classifier__class_weight': [None, 'balanced'] +} +svmperf_params = { + 'classifier__C': __C_range +} def quantification_models(): diff --git a/examples/ucimulti_experiments.py b/examples/ucimulti_experiments.py new file mode 100644 index 0000000..1b48834 --- /dev/null +++ b/examples/ucimulti_experiments.py @@ -0,0 +1,113 @@ +import pickle +import os + +import numpy as np +from sklearn.linear_model import LogisticRegression + +import quapy as qp +from quapy.method.aggregative import PACC, EMQ, KDEyML +from quapy.model_selection import GridSearchQ +from quapy.protocol import UPP +from pathlib import Path + + +SEED = 1 + + +def newLR(): + return LogisticRegression(max_iter=3000) + +# typical hyperparameters explored for Logistic Regression +logreg_grid = { + 'C': np.logspace(-3, 3, 7), + 'class_weight': ['balanced', None] +} + +def wrap_hyper(classifier_hyper_grid:dict): + return {'classifier__'+k:v for k, v in classifier_hyper_grid.items()} + +METHODS = [ + ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), + ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)), + ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), +] + + +def show_results(result_path): + import pandas as pd + df = pd.read_csv(result_path+'.csv', sep='\t') + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"], margins=True) + print(pv) + + +if __name__ == '__main__': + + qp.environ['SAMPLE_SIZE'] = 500 + qp.environ['N_JOBS'] = -1 + n_bags_val = 250 + n_bags_test = 1000 + result_dir = f'results/ucimulti' + + os.makedirs(result_dir, exist_ok=True) + + 
global_result_path = f'{result_dir}/allmethods' + with open(global_result_path + '.csv', 'wt') as csv: + csv.write(f'Method\tDataset\tMAE\tMRAE\n') + + for method_name, quantifier, param_grid in METHODS: + + print('Init method', method_name) + + with open(global_result_path + '.csv', 'at') as csv: + + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:5]: + + if dataset in ['covertype', 'diabetes']: + continue + + print('init', dataset) + + local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe') + + if os.path.exists(local_result_path): + print(f'result file {local_result_path} already exist; skipping') + report = qp.util.load_report(local_result_path) + + else: + with qp.util.temp_seed(SEED): + + data = qp.datasets.fetch_UCIMulticlassDataset(dataset, verbose=True) + + # model selection + train, test = data.train_test + train, val = train.split_stratified(random_state=SEED) + + protocol = UPP(val, repeats=n_bags_val) + modsel = GridSearchQ( + quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae' + ) + + try: + modsel.fit(train) + + print(f'best params {modsel.best_params_}') + print(f'best score {modsel.best_score_}') + + quantifier = modsel.best_model() + except: + print('something went wrong... 
trying to fit the default model') + quantifier.fit(train) + + protocol = UPP(test, repeats=n_bags_test) + report = qp.evaluation.evaluation_report( + quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True + ) + report.to_csv(local_result_path) + + means = report.mean() + csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') + csv.flush() + + show_results(global_result_path) \ No newline at end of file diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index a5a5677..1e0750e 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -591,7 +591,7 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals return data -def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, verbose=False, min_ipc=100) -> Dataset: +def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, min_class_support=100, verbose=False) -> Dataset: """ Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. @@ -614,16 +614,16 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, ver :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quay_data/ directory) :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param min_class_support: minimum number of istances per class. Classes with fewer instances + are discarded (deafult is 100) :param verbose: set to True (default is False) to get information (stats) about the dataset - :param min_ipc: minimum number of istances per class. 
Classes with less instances than min_ipc are discarded - (deafult is 100) :return: a :class:`quapy.data.base.Dataset` instance """ - data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, verbose, min_ipc) + data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, min_class_support, verbose=verbose) return Dataset(*data.split_stratified(1 - test_split, random_state=0)) -def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose=False, min_ipc=100) -> LabelledCollection: +def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, verbose=False) -> LabelledCollection: """ Loads a UCI multiclass collection as an instance of :class:`quapy.data.base.LabelledCollection`. @@ -646,9 +646,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default ~/quay_data/ directory) :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param min_class_support: minimum number of istances per class. Classes with fewer instances + are discarded (deafult is 100) :param verbose: set to True (default is False) to get information (stats) about the dataset - :param min_ipc: minimum number of istances per class. 
Classes with less instances than min_ipc are discarded - (deafult is 100) :return: a :class:`quapy.data.base.LabelledCollection` instance """ assert dataset_name in UCI_MULTICLASS_DATASETS, \ @@ -736,13 +736,20 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= file = join(data_home, 'uci_multiclass', dataset_name+'.pkl') def download(id, name): - data = fetch_ucirepo(id=id) - X, y = data['data']['features'].to_numpy(), data['data']['targets'].to_numpy().squeeze() - # classes represented as arrays are transformed to tuples to treat them as signle objects + df = fetch_ucirepo(id=id) + + df.data.features = pd.get_dummies(df.data.features, drop_first=True) + + X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze() + # classes represented as arrays are transformed to tuples to treat them as single objects if name == 'support2': y[:, 2] = np.fromiter((str(elm) for elm in y[:, 2]), dtype='object') + raise ValueError('this is support 2') + if y.ndim > 1: y = np.fromiter((tuple(elm) for elm in y), dtype='object') + raise ValueError('more than one y') + classes = np.sort(np.unique(y)) y = np.searchsorted(classes, y) return LabelledCollection(X, y) @@ -759,11 +766,11 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, verbose= return LabelledCollection(X, y) data = pickled_resource(file, download, identifier, dataset_name) - data = filter_classes(data, min_ipc) + data = filter_classes(data, min_class_support) if data.n_classes <= 2: raise ValueError( - f'Dataset {dataset_name} has too few valid classes to be multiclass with {min_ipc=}. ' - 'Try a lower value for min_ipc.' + f'After filtering out classes with less than {min_class_support=} instances, the dataset {dataset_name} ' + f'is no longer multiclass. Try a reducing this value.' 
) if verbose: @@ -848,7 +855,6 @@ def fetch_lequa2022(task, data_home=None): return train, val_gen, test_gen - def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): """ Loads the IFCB dataset for quantification from `Zenodo `_ (for more diff --git a/quapy/method/_neural.py b/quapy/method/_neural.py index 11c2dc4..28d848a 100644 --- a/quapy/method/_neural.py +++ b/quapy/method/_neural.py @@ -21,7 +21,7 @@ class QuaNetTrainer(BaseQuantifier): Example: >>> import quapy as qp - >>> from quapy.method.meta import QuaNet + >>> from quapy.method_name.meta import QuaNet >>> from quapy.classification.neural import NeuralClassifierTrainer, CNNnet >>> >>> # use samples of 100 elements diff --git a/quapy/util.py b/quapy/util.py index 7f0abc4..9165499 100644 --- a/quapy/util.py +++ b/quapy/util.py @@ -6,6 +6,9 @@ import pickle import urllib from pathlib import Path from contextlib import ExitStack + +import pandas as pd + import quapy as qp import numpy as np @@ -246,6 +249,28 @@ def _check_sample_size(sample_size): return sample_size +def load_report(path, as_dict=False): + def str2prev_arr(strprev): + within = strprev.strip('[]').split() + float_list = [float(p) for p in within] + float_list[-1] = 1. - sum(float_list[:-1]) + return np.asarray(float_list) + + df = pd.read_csv(path, index_col=0) + df['true-prev'] = df['true-prev'].apply(str2prev_arr) + df['estim-prev'] = df['estim-prev'].apply(str2prev_arr) + if as_dict: + d = {} + for col in df.columns.values: + vals = df[col].values + if col in ['true-prev', 'estim-prev']: + vals = np.vstack(vals) + d[col] = vals + return d + else: + return df + + class EarlyStop: """ A class implementing the early-stopping condition typically used for training neural networks. 
From e0b80167b972d92080ecfd67b368fcb7f82f6583 Mon Sep 17 00:00:00 2001 From: Alejandro Moreo Date: Fri, 12 Apr 2024 18:24:12 +0200 Subject: [PATCH 05/11] added max_train_instances to fetch_UCIMulticlassLabelledCollection --- examples/ucimulti_experiments.py | 2 +- quapy/data/datasets.py | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/examples/ucimulti_experiments.py b/examples/ucimulti_experiments.py index 1b48834..b01163a 100644 --- a/examples/ucimulti_experiments.py +++ b/examples/ucimulti_experiments.py @@ -29,7 +29,7 @@ def wrap_hyper(classifier_hyper_grid:dict): METHODS = [ ('PACC', PACC(newLR()), wrap_hyper(logreg_grid)), ('EMQ', EMQ(newLR()), wrap_hyper(logreg_grid)), - ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), + # ('KDEy-ML', KDEyML(newLR()), {**wrap_hyper(logreg_grid), **{'bandwidth': np.linspace(0.01, 0.2, 20)}}), ] diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 1e0750e..ad0ef6a 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -591,7 +591,13 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals return data -def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, min_class_support=100, verbose=False) -> Dataset: +def fetch_UCIMulticlassDataset( + dataset_name, + data_home=None, + min_test_split=0.3, + max_train_instances=25000, + min_class_support=100, + verbose=False) -> Dataset: """ Loads a UCI multiclass dataset as an instance of :class:`quapy.data.base.Dataset`. @@ -613,14 +619,24 @@ def fetch_UCIMulticlassDataset(dataset_name, data_home=None, test_split=0.3, min :param dataset_name: a dataset name :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default ~/quay_data/ directory) - :param test_split: proportion of documents to be included in the test set. 
The rest conforms the training set + :param min_test_split: minimum proportion of instances to be included in the test set. This value is interpreted + as a minimum proportion, meaning that the real proportion could be higher in case the training proportion + (1-`min_test_split`% of the instances) surpasses `max_train_instances`. In such case, only `max_train_instances` + are taken for training, and the rest (irrespective of `min_test_split`) is taken for test. + :param max_train_instances: maximum number of instances to keep for training (defaults to 25000) :param min_class_support: minimum number of istances per class. Classes with fewer instances are discarded (deafult is 100) :param verbose: set to True (default is False) to get information (stats) about the dataset :return: a :class:`quapy.data.base.Dataset` instance """ data = fetch_UCIMulticlassLabelledCollection(dataset_name, data_home, min_class_support, verbose=verbose) - return Dataset(*data.split_stratified(1 - test_split, random_state=0)) + n = len(data) + train_prop = (1.-min_test_split) + n_train = int(n*train_prop) + if n_train > max_train_instances: + train_prop = (max_train_instances / n) + + return Dataset(*data.split_stratified(train_prop, random_state=0)) def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_class_support=100, verbose=False) -> LabelledCollection: @@ -645,7 +661,7 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas :param dataset_name: a dataset name :param data_home: specify the quapy home directory where the dataset will be dumped (leave empty to use the default ~/quay_data/ directory) - :param test_split: proportion of documents to be included in the test set. The rest conforms the training set + :param test_split: proportion of instances to be included in the test set. The rest conforms the training set :param min_class_support: minimum number of istances per class. 
Classes with fewer instances are discarded (deafult is 100) :param verbose: set to True (default is False) to get information (stats) about the dataset From 522d0740875cb3fdbbccc7f0b037f9c21cb5d659 Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Tue, 23 Apr 2024 16:29:19 +0200 Subject: [PATCH 06/11] report mean fixed, datasets included --- examples/ucimulti_experiments.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ucimulti_experiments.py b/examples/ucimulti_experiments.py index b01163a..16144cd 100644 --- a/examples/ucimulti_experiments.py +++ b/examples/ucimulti_experiments.py @@ -62,9 +62,9 @@ if __name__ == '__main__': with open(global_result_path + '.csv', 'at') as csv: - for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:5]: + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:12]: - if dataset in ['covertype', 'diabetes']: + if dataset in []: continue print('init', dataset) @@ -106,7 +106,7 @@ if __name__ == '__main__': ) report.to_csv(local_result_path) - means = report.mean() + means = report.mean(numeric_only=True) csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') csv.flush() From ecfc175622ba71bd02a78275b99de8fb65f35c4f Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Tue, 23 Apr 2024 16:30:17 +0200 Subject: [PATCH 07/11] datasets removed, debug output added --- quapy/data/datasets.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index ad0ef6a..66be54a 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -79,7 +79,6 @@ UCI_MULTICLASS_DATASETS = [ 'room', 'phishing2', 'rt-iot22', - 'support2', 'image_seg', 'steel_plates', 'hcv', @@ -703,7 +702,6 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas 'room': 864, 'phishing2': 379, 'rt-iot22': 942, - 'support2': 880, 'image_seg': 147, 'steel_plates': 198, 'hcv': 503, @@ -737,7 +735,6 @@ def 
fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas 'room': 'Room Occupancy Estimation', 'phishing2': 'Website Phishing', 'rt-iot22': 'RT-IoT2022', - 'support2': 'SUPPORT2', 'image_seg': 'Statlog (Image Segmentation)', 'steel_plates': 'Steel Plates Faults', 'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients', @@ -753,17 +750,25 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas def download(id, name): df = fetch_ucirepo(id=id) + df.data.features = pd.get_dummies(df.data.features, drop_first=True) X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze() - # classes represented as arrays are transformed to tuples to treat them as single objects - if name == 'support2': - y[:, 2] = np.fromiter((str(elm) for elm in y[:, 2]), dtype='object') - raise ValueError('this is support 2') + + with open(f"var/{name}_Xy.txt", "w") as f: + for row in X: + f.write(str(row) + "\n") + f.write("\n\n") + if y.ndim > 1: + unique_y = np.unique(np.fromiter((tuple(elm) for elm in y), dtype='object')) + else: + unique_y = np.unique(y) + f.write(str(unique_y) + "\n\n") + for row in y: + f.write(str(row) + "\n") if y.ndim > 1: - y = np.fromiter((tuple(elm) for elm in y), dtype='object') raise ValueError('more than one y') classes = np.sort(np.unique(y)) From f74b048e2d0a4bee150712d367b89e18eb301e01 Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Wed, 24 Apr 2024 15:20:14 +0200 Subject: [PATCH 08/11] uci_multi dataset removed --- .gitignore | 5 +++++ quapy/data/datasets.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 5a3d613..418b54f 100644 --- a/.gitignore +++ b/.gitignore @@ -88,6 +88,11 @@ ipython_config.py # pyenv .python-version +# poetry +poetry.toml +pyproject.toml +poetry.lock + # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
# However, in case of collaboration, if having platform-specific dependencies or dependencies diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 66be54a..2e56b48 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -61,7 +61,7 @@ UCI_MULTICLASS_DATASETS = [ 'obesity', 'covertype', 'nursery', - 'diabetes', + # 'diabetes', --> very slow, skipped 'yeast', 'hand_digits', 'satellite', From 498fd8b05062e33b297286300d8652eb9fe305cb Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Wed, 24 Apr 2024 17:23:01 +0200 Subject: [PATCH 09/11] datasets removed from ucimulti --- examples/ucimulti_experiments.py | 2 +- quapy/data/datasets.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/examples/ucimulti_experiments.py b/examples/ucimulti_experiments.py index 16144cd..aae8c88 100644 --- a/examples/ucimulti_experiments.py +++ b/examples/ucimulti_experiments.py @@ -62,7 +62,7 @@ if __name__ == '__main__': with open(global_result_path + '.csv', 'at') as csv: - for dataset in qp.datasets.UCI_MULTICLASS_DATASETS[:12]: + for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: if dataset in []: continue diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index 2e56b48..d197717 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -59,7 +59,7 @@ UCI_MULTICLASS_DATASETS = [ 'letter', 'abalone', 'obesity', - 'covertype', + # 'covertype', --> very slow, skipped 'nursery', # 'diabetes', --> very slow, skipped 'yeast', @@ -70,17 +70,17 @@ UCI_MULTICLASS_DATASETS = [ 'isolet', 'waveform.v1', 'molecular', - 'poker_hand', + # 'poker_hand', --> very slow, skipped 'connect-4', - 'cardiotocography', + # 'cardiotocography', --> multiple labels, skipped 'mhr', - 'chess2', + 'chess', 'page_block', - 'room', + # 'room', --> very slow, skipped 'phishing2', - 'rt-iot22', + # 'rt-iot22', --> very slow, skipped 'image_seg', - 'steel_plates', + # 'steel_plates', --> multiple labels, skipped 'hcv', ] @@ -697,7 +697,7 @@ 
def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas 'connect-4': 26, 'cardiotocography': 193, 'mhr': 863, - 'chess2': 23, + 'chess': 23, 'page_block': 78, 'room': 864, 'phishing2': 379, @@ -730,7 +730,7 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas 'connect-4': 'Connect-4', 'cardiotocography': 'Cardiotocography', 'mhr': 'Maternal Health Risk', - 'chess2': 'Chess (King-Rook vs. King)', + 'chess': 'Chess (King-Rook vs. King)', 'page_block': 'Page Blocks Classification', 'room': 'Room Occupancy Estimation', 'phishing2': 'Website Phishing', From 93dd6cb1c15eda8ef3ee6701364c11fc5bfcb5ec Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Mon, 29 Apr 2024 17:35:43 +0200 Subject: [PATCH 10/11] training times added to global report --- examples/ucimulti_experiments.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/examples/ucimulti_experiments.py b/examples/ucimulti_experiments.py index aae8c88..5193376 100644 --- a/examples/ucimulti_experiments.py +++ b/examples/ucimulti_experiments.py @@ -1,5 +1,7 @@ import pickle import os +from time import time +from collections import defaultdict import numpy as np from sklearn.linear_model import LogisticRegression @@ -38,9 +40,17 @@ def show_results(result_path): df = pd.read_csv(result_path+'.csv', sep='\t') pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) - pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"], margins=True) + pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE", "t_train"], margins=True) print(pv) +def load_timings(result_path): + import pandas as pd + timings = defaultdict(lambda: {}) + if not Path(result_path + '.csv').exists(): + return timings + + df = pd.read_csv(result_path+'.csv', sep='\t') + return timings | df.pivot_table(index='Dataset', columns='Method', values='t_train').to_dict() if __name__ == '__main__': @@ 
-53,8 +63,9 @@ if __name__ == '__main__': os.makedirs(result_dir, exist_ok=True) global_result_path = f'{result_dir}/allmethods' + timings = load_timings(global_result_path) with open(global_result_path + '.csv', 'wt') as csv: - csv.write(f'Method\tDataset\tMAE\tMRAE\n') + csv.write(f'Method\tDataset\tMAE\tMRAE\tt_train\n') for method_name, quantifier, param_grid in METHODS: @@ -64,9 +75,6 @@ if __name__ == '__main__': for dataset in qp.datasets.UCI_MULTICLASS_DATASETS: - if dataset in []: - continue - print('init', dataset) local_result_path = os.path.join(Path(global_result_path).parent, method_name + '_' + dataset + '.dataframe') @@ -88,7 +96,8 @@ if __name__ == '__main__': modsel = GridSearchQ( quantifier, param_grid, protocol, refit=True, n_jobs=-1, verbose=1, error='mae' ) - + + t_init = time() try: modsel.fit(train) @@ -99,6 +108,8 @@ if __name__ == '__main__': except: print('something went wrong... trying to fit the default model') quantifier.fit(train) + timings[method_name][dataset] = time() - t_init + protocol = UPP(test, repeats=n_bags_test) report = qp.evaluation.evaluation_report( @@ -107,7 +118,7 @@ if __name__ == '__main__': report.to_csv(local_result_path) means = report.mean(numeric_only=True) - csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n') + csv.write(f'{method_name}\t{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{timings[method_name][dataset]:.3f}\n') csv.flush() show_results(global_result_path) \ No newline at end of file From 19524f9aa82a8cb70601009e19d53433cbf8b0e3 Mon Sep 17 00:00:00 2001 From: Lorenzo Volpi Date: Mon, 29 Apr 2024 17:36:13 +0200 Subject: [PATCH 11/11] ucimulti datasets removed, cleaning --- quapy/data/datasets.py | 49 +++++++----------------------------------- 1 file changed, 8 insertions(+), 41 deletions(-) diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py index d197717..bfd709d 100644 --- a/quapy/data/datasets.py +++ b/quapy/data/datasets.py @@ -59,28 +59,22 @@ 
UCI_MULTICLASS_DATASETS = [ 'letter', 'abalone', 'obesity', - # 'covertype', --> very slow, skipped 'nursery', - # 'diabetes', --> very slow, skipped 'yeast', 'hand_digits', 'satellite', 'shuttle', 'cmc', 'isolet', - 'waveform.v1', + 'waveform-v1', 'molecular', - # 'poker_hand', --> very slow, skipped + 'poker_hand', 'connect-4', - # 'cardiotocography', --> multiple labels, skipped 'mhr', 'chess', 'page_block', - # 'room', --> very slow, skipped - 'phishing2', - # 'rt-iot22', --> very slow, skipped + 'phishing', 'image_seg', - # 'steel_plates', --> multiple labels, skipped 'hcv', ] @@ -682,28 +676,22 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas 'letter': 59, 'abalone': 1, 'obesity': 544, - 'covertype': 31, 'nursery': 76, - 'diabetes': 296, 'yeast': 110, 'hand_digits': 81, 'satellite': 146, 'shuttle': 148, 'cmc': 30, 'isolet': 54, - 'waveform.v1': 107, + 'waveform-v1': 107, 'molecular': 69, 'poker_hand': 158, 'connect-4': 26, - 'cardiotocography': 193, 'mhr': 863, 'chess': 23, 'page_block': 78, - 'room': 864, - 'phishing2': 379, - 'rt-iot22': 942, + 'phishing': 379, 'image_seg': 147, - 'steel_plates': 198, 'hcv': 503, } @@ -715,28 +703,22 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas 'letter': 'Letter Recognition', 'abalone': 'Abalone', 'obesity': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition', - 'covertype': 'Covertype', 'nursery': 'Nursery', - 'diabetes': 'Diabetes 130-US Hospitals for Years 1999-2008', 'yeast': 'Yeast', 'hand_digits': 'Pen-Based Recognition of Handwritten Digits', 'satellite': 'Statlog Landsat Satellite', 'shuttle': 'Statlog Shuttle', 'cmc': 'Contraceptive Method Choice', 'isolet': 'ISOLET', - 'waveform.v1': 'Waveform Database Generator (Version 1)', + 'waveform-v1': 'Waveform Database Generator (Version 1)', 'molecular': 'Molecular Biology (Splice-junction Gene Sequences)', 'poker_hand': 'Poker Hand', 'connect-4': 'Connect-4', - 
'cardiotocography': 'Cardiotocography', 'mhr': 'Maternal Health Risk', 'chess': 'Chess (King-Rook vs. King)', 'page_block': 'Page Blocks Classification', - 'room': 'Room Occupancy Estimation', - 'phishing2': 'Website Phishing', - 'rt-iot22': 'RT-IoT2022', + 'phishing': 'Website Phishing', 'image_seg': 'Statlog (Image Segmentation)', - 'steel_plates': 'Steel Plates Faults', 'hcv': 'Hepatitis C Virus (HCV) for Egyptian patients', } @@ -750,26 +732,11 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas def download(id, name): df = fetch_ucirepo(id=id) - df.data.features = pd.get_dummies(df.data.features, drop_first=True) - X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze() - with open(f"var/{name}_Xy.txt", "w") as f: - for row in X: - f.write(str(row) + "\n") - f.write("\n\n") - if y.ndim > 1: - unique_y = np.unique(np.fromiter((tuple(elm) for elm in y), dtype='object')) - else: - unique_y = np.unique(y) - f.write(str(unique_y) + "\n\n") - for row in y: - f.write(str(row) + "\n") - - if y.ndim > 1: - raise ValueError('more than one y') + assert y.ndim == 1, 'more than one y' classes = np.sort(np.unique(y)) y = np.searchsorted(classes, y)