From f18bce5f80d6ba69aafa056b2ec7b835be0196e4 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Wed, 8 Nov 2023 11:07:47 +0100
Subject: [PATCH] added dataset IFCB plankton

---
 examples/ifcb_experiments.py | 31 +++++++++++++
 quapy/data/_ifcb.py          | 57 +++++++++++++++++++++++
 quapy/data/datasets.py       | 88 ++++++++++++++++++++++++++++++++++++
 3 files changed, 176 insertions(+)
 create mode 100644 examples/ifcb_experiments.py
 create mode 100644 quapy/data/_ifcb.py

diff --git a/examples/ifcb_experiments.py b/examples/ifcb_experiments.py
new file mode 100644
index 0000000..fff13ef
--- /dev/null
+++ b/examples/ifcb_experiments.py
@@ -0,0 +1,31 @@
+"""Compare several aggregative quantifiers on the IFCB plankton dataset."""
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+from quapy.evaluation import evaluation_report
+
+
+def newLR():
+    """Return a new, unfitted classifier for each quantifier."""
+    return LogisticRegression(n_jobs=-1)
+
+
+quantifiers = {
+    'CC': qp.method.aggregative.CC(newLR()),
+    'ACC': qp.method.aggregative.ACC(newLR()),
+    'PCC': qp.method.aggregative.PCC(newLR()),
+    'PACC': qp.method.aggregative.PACC(newLR()),
+    'HDy': qp.method.aggregative.DistributionMatching(newLR()),
+    'EMQ': qp.method.aggregative.EMQ(newLR()),
+}
+
+# Load the data once: fetch_IFCB reads and stacks every training CSV, so calling
+# it inside the loop would repeat that work for each quantifier.
+train, test_gen = qp.datasets.fetch_IFCB()
+
+for quant_name, quantifier in quantifiers.items():
+    print("Experiment with "+quant_name)
+
+    quantifier.fit(train)
+
+    report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
+    print(report.mean())
diff --git a/quapy/data/_ifcb.py b/quapy/data/_ifcb.py
new file mode 100644
index 0000000..87bb030
--- /dev/null
+++ b/quapy/data/_ifcb.py
@@ -0,0 +1,57 @@
+import os
+import pandas as pd
+from quapy.protocol import AbstractProtocol
+
+
+class IFCBTrainSamplesFromDir(AbstractProtocol):
+    """
+    Protocol that yields the training samples stored as CSV files in a directory.
+
+    Each CSV file is read as one sample: its first column is taken as the class
+    labels and the remaining columns as the features.
+
+    :param path_dir: directory containing the training CSV files
+    :param classes: class names of the dataset (only stored, not used for loading)
+    """
+
+    def __init__(self, path_dir:str, classes: list):
+        self.path_dir = path_dir
+        self.classes = classes
+        # os.listdir returns entries in arbitrary order; sort them so that the
+        # samples are always yielded in the same order
+        self.samples = sorted(
+            filename for filename in os.listdir(path_dir) if filename.endswith('.csv')
+        )
+
+    def __call__(self):
+        """Yield one `(X, y)` pair of numpy arrays per training sample."""
+        for sample in self.samples:
+            s = pd.read_csv(os.path.join(self.path_dir,sample))
+            # all columns but the first where we get the class
+            X = s.iloc[:, 1:].to_numpy()
+            y = s.iloc[:, 0].to_numpy()
+            yield X, y
+
+
+class IFCBTestSamples(AbstractProtocol):
+    """
+    Protocol that yields the test samples together with their prevalence values.
+
+    :param path_dir: directory containing one `<sample>.csv` file per test sample
+    :param test_prevalences_path: CSV file with the prevalence values. It is joined
+        to `path_dir` with os.path.join, so an absolute path is used as given while
+        a relative one is resolved inside `path_dir`
+    """
+
+    def __init__(self, path_dir:str, test_prevalences_path: str):
+        self.path_dir = path_dir
+        self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path))
+
+    def __call__(self):
+        """Yield one `(X, prevalences)` pair per row of the prevalence file."""
+        for _, test_sample in self.test_prevalences.iterrows():
+            #Load the sample from disk
+            X = pd.read_csv(os.path.join(self.path_dir,test_sample['sample']+'.csv')).to_numpy()
+            # the first column is skipped; the rest are read as prevalence values
+            prevalences = test_sample.iloc[1:].to_numpy().astype(float)
+            yield X, prevalences
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index b01dcec..2c82b0e 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -732,3 +732,91 @@ def fetch_lequa2022(task, data_home=None):
 
     return train, val_gen, test_gen
 
+
+def fetch_IFCB(single_sample_train=True, data_home=None):
+    """
+    Loads the IFCB dataset for quantification from Zenodo (https://zenodo.org/records/10036244). For more
+    information on this dataset check the zenodo site.
+    This dataset is based on data that is publicly available, and the scripts for the processing are
+    available as well (TODO: restore the links to both, which are missing from this docstring).
+
+    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
+
+    The datasets are downloaded only once, and stored for fast reuse.
+
+    :param single_sample_train: boolean. If True (default), it returns the train dataset as an instance of
+        :class:`quapy.data.base.LabelledCollection` (all examples together).
+        If False, a generator of training samples will be returned.
+        Each example in the training set has an individual class label.
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :return: a tuple `(train, test_gen)` where `train` is an instance of
+        :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is True or
+        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise, i.e. a sampling protocol that
+        returns a series of samples labelled example by example.
+        test_gen is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`,
+        i.e., a sampling protocol that returns a series of samples labelled by prevalence.
+    """
+
+    from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples
+
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL_TRAIN='https://zenodo.org/records/10036244/files/IFCB.train.zip'
+    URL_TEST='https://zenodo.org/records/10036244/files/IFCB.test.zip'
+    URL_TEST_PREV='https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
+
+    # IFCBTestSamples joins the prevalence file path to the test directory, and os.path.join
+    # only leaves that path untouched when it is absolute; hence the absolute dataset directory
+    ifcb_dir = os.path.abspath(join(data_home, 'ifcb'))
+    os.makedirs(ifcb_dir, exist_ok=True)
+
+    def download_unzip_and_remove(unzipped_path, url):
+        """Download the zip file at `url`, extract it into `unzipped_path` and delete the zip file."""
+        tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
+        try:
+            download_file_if_not_exists(url, tmp_path)
+            with zipfile.ZipFile(tmp_path) as file:
+                file.extractall(unzipped_path)
+        finally:
+            # the temporary name is shared by the three archives, so a file left behind by a failed
+            # run could be reused in place of the next download (download_file_if_not_exists
+            # presumably skips the download when the file exists)
+            if os.path.exists(tmp_path):
+                os.remove(tmp_path)
+
+    if not os.path.exists(os.path.join(ifcb_dir,'train')):
+        download_unzip_and_remove(ifcb_dir, URL_TRAIN)
+    if not os.path.exists(os.path.join(ifcb_dir,'test')):
+        download_unzip_and_remove(ifcb_dir, URL_TEST)
+    if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')):
+        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
+
+    # Load test prevalences and classes
+    test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
+    test_true_prev = pd.read_csv(test_true_prev_path)
+    classes = test_true_prev.columns[1:]
+
+    #Load train samples
+    train_samples_path = join(ifcb_dir,'train')
+    train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
+
+    #Load test samples
+    test_samples_path = join(ifcb_dir,'test')
+    test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)
+
+    # In the case the user wants it, join all the train samples in one LabelledCollection
+    if single_sample_train:
+        X = []
+        y = []
+        for X_, y_ in train_gen():
+            X.append(X_)
+            y.append(y_)
+
+        X = np.vstack(X)
+        y = np.concatenate(y)
+        train = LabelledCollection(X,y, classes=classes)
+        return train, test_gen
+    else:
+        return train_gen, test_gen