added dataset IFCB plankton

2023-11-08 11:07:47 +01:00 · 2023-11-08 11:07:47 +01:00 · f18bce5f80
parent cc5ab8ad70
commit f18bce5f80
3 changed files with 137 additions and 0 deletions
--- a/examples/ifcb_experiments.py
+++ b/examples/ifcb_experiments.py
@ -0,0 +1,24 @@
+import quapy as qp
+from sklearn.linear_model import LogisticRegression
+from quapy.evaluation import evaluation_report
+
+
+def newLR():
+    """Return a new, unfitted LogisticRegression created with ``n_jobs=-1``."""
+    classifier = LogisticRegression(n_jobs=-1)
+    return classifier
+
+# Quantification methods to compare, keyed by the name printed in the experiment log.
+# Every method receives its own classifier instance from newLR().
+# Note that the 'HDy' entry is built with DistributionMatching.
+quantifiers = {
+    method_name: method_class(newLR())
+    for method_name, method_class in (
+        ('CC', qp.method.aggregative.CC),
+        ('ACC', qp.method.aggregative.ACC),
+        ('PCC', qp.method.aggregative.PCC),
+        ('PACC', qp.method.aggregative.PACC),
+        ('HDy', qp.method.aggregative.DistributionMatching),
+        ('EMQ', qp.method.aggregative.EMQ),
+    )
+}
+
+# Load the dataset once, before the loop: the data does not depend on the quantifier,
+# and fetch_IFCB() with its default arguments reads and stacks every training sample
+# file, so calling it on each iteration repeats that work for nothing.
+train, test_gen = qp.datasets.fetch_IFCB()
+
+for quant_name, quantifier in quantifiers.items():
+    print("Experiment with "+quant_name)
+
+    # train the quantifier on the whole training collection
+    quantifier.fit(train)
+
+    # evaluate on the test samples generated by the protocol and print the averaged report
+    report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
+    print(report.mean())
--- a/quapy/data/_ifcb.py
+++ b/quapy/data/_ifcb.py
@ -0,0 +1,34 @@
+import os
+import pandas as pd
+from quapy.protocol import AbstractProtocol
+
+class IFCBTrainSamplesFromDir(AbstractProtocol):
+    """
+    Protocol that generates the training samples stored as CSV files in a directory.
+
+    Every file in `path_dir` whose name ends with ``.csv`` is treated as one sample.
+    Samples are generated in lexicographic order of their file names.
+
+    :param path_dir: directory containing the sample files
+    :param classes: the class names; they are stored in `self.classes` but are not used
+        when generating the samples
+    """
+
+    def __init__(self, path_dir:str, classes: list):
+        self.path_dir = path_dir
+        self.classes = classes
+        # os.listdir returns the entries in arbitrary order; sorting them makes the order
+        # in which the samples are generated reproducible across runs and platforms
+        self.samples = sorted(filename for filename in os.listdir(path_dir) if filename.endswith('.csv'))
+
+    def __call__(self):
+        """
+        Reads the sample files from disk one at a time.
+
+        :return: yields one tuple `(X, y)` per sample file, where `y` is taken from the first
+            column of the file and `X` from all the remaining columns
+        """
+        for sample in self.samples:
+            s = pd.read_csv(os.path.join(self.path_dir,sample))
+            # the first column holds the class label; all the other columns are the covariates
+            X = s.iloc[:, 1:].to_numpy()
+            y = s.iloc[:, 0].to_numpy()
+            yield X, y
+
+class IFCBTestSamples(AbstractProtocol):
+    """
+    Protocol that generates the test samples together with their prevalence values.
+
+    The prevalence file is a CSV with one row per test sample. The code reads the sample
+    name from the column called ``sample`` and takes every column from the second one
+    onwards as the prevalence values (i.e., ``sample`` is assumed to be the first column).
+
+    :param path_dir: directory containing the test sample files (``<sample>.csv``)
+    :param test_prevalences_path: path to the CSV file with the prevalence values. It is first
+        resolved relative to `path_dir`; if no such file exists, the path is used as given
+    """
+
+    def __init__(self, path_dir:str, test_prevalences_path: str):
+        self.path_dir = path_dir
+        prevalences_file = os.path.join(path_dir, test_prevalences_path)
+        # Joining only leaves `test_prevalences_path` untouched when it is absolute. A caller
+        # passing a relative path that already points to the file would otherwise end up with
+        # a non-existent location inside `path_dir`, so fall back to the path as given.
+        if not os.path.exists(prevalences_file):
+            prevalences_file = test_prevalences_path
+        self.test_prevalences = pd.read_csv(prevalences_file)
+
+    def __call__(self):
+        """
+        Reads the test samples from disk one at a time, in the order of the prevalence file.
+
+        :return: yields one tuple `(X, prevalences)` per row of the prevalence file, where `X` is
+            the full content of the sample file and `prevalences` is a float array
+        """
+        for _, test_sample in self.test_prevalences.iterrows():
+            # load the sample named in this row from disk
+            X = pd.read_csv(os.path.join(self.path_dir,test_sample['sample']+'.csv')).to_numpy()
+            # everything after the first column of the row is a prevalence value
+            prevalences = test_sample.iloc[1:].to_numpy().astype(float)
+            yield X, prevalences
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@ -732,3 +732,82 @@ def fetch_lequa2022(task, data_home=None):

    return train, val_gen, test_gen

+
+def fetch_IFCB(single_sample_train=True, data_home=None):
+    """
+    Loads the IFCB dataset for quantification <https://zenodo.org/records/10036244>. For more
+    information on this dataset check the zenodo site.
+    This dataset is based on the data available publicly at <https://github.com/hsosik/WHOI-Plankton>.
+    The scripts for the processing are available at <https://github.com/pglez82/IFCB_Zenodo>
+
+    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
+
+    The datasets are downloaded only once, and stored for fast reuse.
+
+    :param single_sample_train: boolean. If True (default), it returns the train dataset as an instance of
+        :class:`quapy.data.base.LabelledCollection` (all examples together).
+        If False, a generator of training samples will be returned.
+        Each example in the training set has an individual class label.
+    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
+        ~/quapy_data/ directory)
+    :return: a tuple `(train, test_gen)` where `train` is an instance of
+        :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is True or
+        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise, i.e. a sampling protocol that
+        returns a series of samples labelled example by example.
+        test_gen is an instance of  :class:`quapy.data._ifcb.IFCBTestSamples`,
+        i.e., a sampling protocol that returns a series of samples labelled by prevalence.
+    """
+
+    from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples
+
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    URL_TRAIN = 'https://zenodo.org/records/10036244/files/IFCB.train.zip'
+    URL_TEST = 'https://zenodo.org/records/10036244/files/IFCB.test.zip'
+    URL_TEST_PREV = 'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
+
+    ifcb_dir = join(data_home, 'ifcb')
+    os.makedirs(ifcb_dir, exist_ok=True)
+
+    def download_unzip_and_remove(unzipped_path, url):
+        # downloads the archive to a temporary file, extracts it into `unzipped_path`,
+        # and deletes the temporary file afterwards
+        tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
+        download_file_if_not_exists(url, tmp_path)
+        try:
+            with zipfile.ZipFile(tmp_path) as file:
+                file.extractall(unzipped_path)
+        finally:
+            # remove the archive even if the extraction fails; otherwise a corrupt or
+            # truncated file would stay on disk (and presumably be reused instead of
+            # downloaded again by download_file_if_not_exists on the next call)
+            os.remove(tmp_path)
+
+    # each archive is fetched only if the file/directory it is expected to produce is missing
+    expected_contents = [
+        ('train', URL_TRAIN),
+        ('test', URL_TEST),
+        ('test_prevalences.csv', URL_TEST_PREV),
+    ]
+    for unzipped_name, url in expected_contents:
+        if not os.path.exists(join(ifcb_dir, unzipped_name)):
+            download_unzip_and_remove(ifcb_dir, url)
+
+    # Load test prevalences and classes.
+    # The path is made absolute because IFCBTestSamples joins it to its own `path_dir`:
+    # an absolute path survives that join unchanged, whereas a relative one (relative
+    # `data_home`) would be looked up inside the test directory and not be found.
+    test_true_prev_path = os.path.abspath(join(ifcb_dir, 'test_prevalences.csv'))
+    test_true_prev = pd.read_csv(test_true_prev_path)
+    # the first column is skipped; the remaining column names are used as the class names
+    classes = test_true_prev.columns[1:]
+
+    # Load train samples
+    train_samples_path = join(ifcb_dir, 'train')
+    train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
+
+    # Load test samples
+    test_samples_path = join(ifcb_dir, 'test')
+    test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)
+
+    if not single_sample_train:
+        return train_gen, test_gen
+
+    # The user wants a single training set: join all the train samples in one LabelledCollection
+    X = []
+    y = []
+    for X_, y_ in train_gen():
+        X.append(X_)
+        y.append(y_)
+
+    X = np.vstack(X)
+    y = np.concatenate(y)
+    train = LabelledCollection(X, y, classes=classes)
+    return train, test_gen