added dataset IFCB plankton

This commit is contained in:
Alejandro Moreo Fernandez 2023-11-08 11:07:47 +01:00
parent cc5ab8ad70
commit f18bce5f80
3 changed files with 137 additions and 0 deletions

View File

@ -0,0 +1,24 @@
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.evaluation import evaluation_report
def newLR():
    """Return a new, unfitted logistic-regression classifier.

    A separate instance is created on every call, so each quantifier built
    from it owns its own classifier. ``n_jobs=-1`` is passed through to
    scikit-learn unchanged.
    """
    classifier = LogisticRegression(n_jobs=-1)
    return classifier
# Quantification methods under comparison, keyed by the label used in the
# printed output. Every method wraps its own classifier instance.
quantifiers = {
    'CC': qp.method.aggregative.CC(newLR()),
    'ACC': qp.method.aggregative.ACC(newLR()),
    'PCC': qp.method.aggregative.PCC(newLR()),
    'PACC': qp.method.aggregative.PACC(newLR()),
    # NOTE(review): the 'HDy' label is attached to DistributionMatching rather
    # than to a class called HDy -- confirm the label is the intended one.
    'HDy': qp.method.aggregative.DistributionMatching(newLR()),
    'EMQ': qp.method.aggregative.EMQ(newLR()),
}

# The dataset does not depend on the quantifier, so it is fetched once instead
# of once per method (building the training collection reads and stacks every
# training CSV file).
# NOTE(review): assumes fit() does not modify `train` in place and that the
# test protocol can be iterated once per quantifier -- confirm.
train, test_gen = qp.datasets.fetch_IFCB()

for quant_name, quantifier in quantifiers.items():
    print("Experiment with "+quant_name)

    quantifier.fit(train)

    # Evaluate on the test samples and print the column-wise mean of the report.
    report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
    print(report.mean())

34
quapy/data/_ifcb.py Normal file
View File

@ -0,0 +1,34 @@
import os
import pandas as pd
from quapy.protocol import AbstractProtocol
class IFCBTrainSamplesFromDir(AbstractProtocol):
    """Protocol that yields the training samples stored as CSV files in a directory.

    Every file in `path_dir` whose name ends with ``.csv`` is treated as one
    sample. Samples are generated in lexicographic order of their file names,
    so the sequence is the same on every machine and filesystem.

    :param path_dir: directory containing one CSV file per training sample
    :param classes: the class names; stored as an attribute, not used by the
        protocol itself
    """

    def __init__(self, path_dir: str, classes: list):
        self.path_dir = path_dir
        self.classes = classes
        # os.listdir returns entries in arbitrary order; sorting makes the
        # order of the generated samples reproducible.
        self.samples = sorted(
            filename for filename in os.listdir(path_dir) if filename.endswith('.csv')
        )

    def __call__(self):
        """Read the samples from disk one at a time.

        :return: yields one tuple `(X, y)` per CSV file, where `y` is the
            first column of the file and `X` holds all remaining columns
        """
        for sample in self.samples:
            frame = pd.read_csv(os.path.join(self.path_dir, sample))
            # the first column carries the class of each example; every other
            # column is part of the example's representation
            X = frame.iloc[:, 1:].to_numpy()
            y = frame.iloc[:, 0].to_numpy()
            yield X, y
class IFCBTestSamples(AbstractProtocol):
    """Protocol that yields the test samples together with their prevalence values.

    The prevalence file is a CSV with one row per test sample. Its ``sample``
    column names the sample, and the sample itself is read from
    ``<path_dir>/<sample>.csv``.

    :param path_dir: directory containing one CSV file per test sample
    :param test_prevalences_path: path of the prevalence CSV; it is joined to
        `path_dir` with ``os.path.join``, so an absolute path is used as is
    """

    def __init__(self, path_dir: str, test_prevalences_path: str):
        self.path_dir = path_dir
        prevalences_file = os.path.join(path_dir, test_prevalences_path)
        self.test_prevalences = pd.read_csv(prevalences_file)

    def __call__(self):
        """Read the test samples from disk one at a time.

        :return: yields one tuple `(X, prevalences)` per row of the prevalence
            file, where `X` is the full content of the sample file and
            `prevalences` is the row without its first entry, cast to float
        """
        for _, row in self.test_prevalences.iterrows():
            # Load the sample from disk
            sample_file = os.path.join(self.path_dir, row['sample'] + '.csv')
            X = pd.read_csv(sample_file).to_numpy()
            # presumably the first entry of the row is the sample name -- verify
            # against the prevalence file
            prevalences = row.iloc[1:].to_numpy().astype(float)
            yield X, prevalences

View File

@ -732,3 +732,82 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen
def fetch_IFCB(single_sample_train=True, data_home=None):
    """
    Loads the IFCB dataset for quantification `<https://zenodo.org/records/10036244>`_. For more
    information on this dataset, check the Zenodo site.
    This dataset is based on the data publicly available at <https://github.com/hsosik/WHOI-Plankton>.
    The scripts for the processing are available at <https://github.com/pglez82/IFCB_Zenodo>.

    Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.

    The datasets are downloaded only once, and stored for fast reuse.

    :param single_sample_train: boolean. If True (default), it returns the train dataset as an instance of
        :class:`quapy.data.base.LabelledCollection` (all examples together).
        If False, a generator of training samples will be returned.
        Each example in the training set has an individual class label.
    :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
        ~/quapy_data/ directory)
    :return: a tuple `(train, test_gen)` where `train` is an instance of
        :class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is True or
        :class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise, i.e. a sampling protocol that
        returns a series of samples labelled example by example.
        test_gen is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`,
        i.e., a sampling protocol that returns a series of samples labelled by prevalence.
    """

    from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples

    if data_home is None:
        data_home = get_quapy_home()

    URL_TRAIN = 'https://zenodo.org/records/10036244/files/IFCB.train.zip'
    URL_TEST = 'https://zenodo.org/records/10036244/files/IFCB.test.zip'
    URL_TEST_PREV = 'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'

    ifcb_dir = join(data_home, 'ifcb')
    os.makedirs(ifcb_dir, exist_ok=True)

    def download_unzip_and_remove(unzipped_path, url):
        # Download the archive at `url`, extract it into `unzipped_path`, and
        # delete the temporary zip file.
        tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
        try:
            download_file_if_not_exists(url, tmp_path)
            with zipfile.ZipFile(tmp_path) as file:
                file.extractall(unzipped_path)
        finally:
            # The three archives share this temporary name. A file left behind
            # by a failed run could be taken for an already downloaded archive
            # (judging by the helper's name, the download is skipped when the
            # target exists), so the file is removed on failure as well.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    # Each archive is fetched only if the content it is expected to produce is missing
    if not os.path.exists(join(ifcb_dir, 'train')):
        download_unzip_and_remove(ifcb_dir, URL_TRAIN)
    if not os.path.exists(join(ifcb_dir, 'test')):
        download_unzip_and_remove(ifcb_dir, URL_TEST)
    if not os.path.exists(join(ifcb_dir, 'test_prevalences.csv')):
        download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)

    # Load test prevalences and classes (the class names are the column names
    # of the prevalence file, without the first column)
    test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
    test_true_prev = pd.read_csv(test_true_prev_path)
    classes = test_true_prev.columns[1:]

    # Load train samples
    train_samples_path = join(ifcb_dir, 'train')
    train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)

    # Load test samples
    test_samples_path = join(ifcb_dir, 'test')
    test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)

    if not single_sample_train:
        return train_gen, test_gen

    # The user wants a single training set: join all the train samples in one LabelledCollection
    X = []
    y = []
    for X_, y_ in train_gen():
        X.append(X_)
        y.append(y_)

    train = LabelledCollection(np.vstack(X), np.concatenate(y), classes=classes)
    return train, test_gen