forked from moreo/QuaPy
added dataset IFCB plankton
This commit is contained in:
parent
cc5ab8ad70
commit
f18bce5f80
|
@ -0,0 +1,24 @@
|
|||
import quapy as qp
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from quapy.evaluation import evaluation_report
|
||||
|
||||
|
||||
def newLR():
|
||||
return LogisticRegression(n_jobs=-1)
|
||||
|
||||
quantifiers = {'CC':qp.method.aggregative.CC(newLR()),
|
||||
'ACC':qp.method.aggregative.ACC(newLR()),
|
||||
'PCC':qp.method.aggregative.PCC(newLR()),
|
||||
'PACC':qp.method.aggregative.PACC(newLR()),
|
||||
'HDy':qp.method.aggregative.DistributionMatching(newLR()),
|
||||
'EMQ':qp.method.aggregative.EMQ(newLR())}
|
||||
|
||||
for quant_name, quantifier in quantifiers.items():
|
||||
print("Experiment with "+quant_name)
|
||||
|
||||
train, test_gen = qp.datasets.fetch_IFCB()
|
||||
|
||||
quantifier.fit(train)
|
||||
|
||||
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
|
||||
print(report.mean())
|
|
@ -0,0 +1,34 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
from quapy.protocol import AbstractProtocol
|
||||
|
||||
class IFCBTrainSamplesFromDir(AbstractProtocol):
|
||||
|
||||
def __init__(self, path_dir:str, classes: list):
|
||||
self.path_dir = path_dir
|
||||
self.classes = classes
|
||||
self.samples = []
|
||||
for filename in os.listdir(path_dir):
|
||||
if filename.endswith('.csv'):
|
||||
self.samples.append(filename)
|
||||
|
||||
def __call__(self):
|
||||
for sample in self.samples:
|
||||
s = pd.read_csv(os.path.join(self.path_dir,sample))
|
||||
# all columns but the first where we get the class
|
||||
X = s.iloc[:, 1:].to_numpy()
|
||||
y = s.iloc[:, 0].to_numpy()
|
||||
yield X, y
|
||||
|
||||
class IFCBTestSamples(AbstractProtocol):
|
||||
|
||||
def __init__(self, path_dir:str, test_prevalences_path: str):
|
||||
self.path_dir = path_dir
|
||||
self.test_prevalences = pd.read_csv(os.path.join(path_dir, test_prevalences_path))
|
||||
|
||||
def __call__(self):
|
||||
for _, test_sample in self.test_prevalences.iterrows():
|
||||
#Load the sample from disk
|
||||
X = pd.read_csv(os.path.join(self.path_dir,test_sample['sample']+'.csv')).to_numpy()
|
||||
prevalences = test_sample.iloc[1:].to_numpy().astype(float)
|
||||
yield X, prevalences
|
|
@ -732,3 +732,82 @@ def fetch_lequa2022(task, data_home=None):
|
|||
|
||||
return train, val_gen, test_gen
|
||||
|
||||
|
||||
def fetch_IFCB(single_sample_train=True, data_home=None):
|
||||
"""
|
||||
Loads the IFCB dataset for quantification <https://zenodo.org/records/10036244>`. For more
|
||||
information on this dataset check the zenodo site.
|
||||
This dataset is based on the data available publicly at <https://github.com/hsosik/WHOI-Plankton>.
|
||||
The scripts for the processing are available at <https://github.com/pglez82/IFCB_Zenodo>
|
||||
|
||||
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
|
||||
|
||||
The datasets are downloaded only once, and stored for fast reuse.
|
||||
|
||||
:param single_sample_train: boolean. If True (default), it returns the train dataset as an instance of
|
||||
:class:`quapy.data.base.LabelledCollection` (all examples together).
|
||||
If False, a generator of training samples will be returned.
|
||||
Each example in the training set has an individual class label.
|
||||
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||
~/quay_data/ directory)
|
||||
:return: a tuple `(train, test_gen)` where `train` is an instance of
|
||||
:class:`quapy.data.base.LabelledCollection`, if `single_sample_train` is True or
|
||||
:class:`quapy.data._ifcb.IFCBTrainSamplesFromDir` otherwise, i.e. a sampling protocol that
|
||||
returns a series of samples labelled example by example.
|
||||
test_gen is an instance of :class:`quapy.data._ifcb.IFCBTestSamples`,
|
||||
i.e., a sampling protocol that returns a series of samples labelled by prevalence.
|
||||
"""
|
||||
|
||||
from quapy.data._ifcb import IFCBTrainSamplesFromDir, IFCBTestSamples
|
||||
|
||||
if data_home is None:
|
||||
data_home = get_quapy_home()
|
||||
|
||||
URL_TRAIN=f'https://zenodo.org/records/10036244/files/IFCB.train.zip'
|
||||
URL_TEST=f'https://zenodo.org/records/10036244/files/IFCB.test.zip'
|
||||
URL_TEST_PREV=f'https://zenodo.org/records/10036244/files/IFCB.test_prevalences.zip'
|
||||
|
||||
ifcb_dir = join(data_home, 'ifcb')
|
||||
os.makedirs(ifcb_dir, exist_ok=True)
|
||||
|
||||
def download_unzip_and_remove(unzipped_path, url):
|
||||
tmp_path = join(ifcb_dir, 'ifcb_tmp.zip')
|
||||
download_file_if_not_exists(url, tmp_path)
|
||||
with zipfile.ZipFile(tmp_path) as file:
|
||||
file.extractall(unzipped_path)
|
||||
os.remove(tmp_path)
|
||||
|
||||
if not os.path.exists(os.path.join(ifcb_dir,'train')):
|
||||
download_unzip_and_remove(ifcb_dir, URL_TRAIN)
|
||||
if not os.path.exists(os.path.join(ifcb_dir,'test')):
|
||||
download_unzip_and_remove(ifcb_dir, URL_TEST)
|
||||
if not os.path.exists(os.path.join(ifcb_dir,'test_prevalences.csv')):
|
||||
download_unzip_and_remove(ifcb_dir, URL_TEST_PREV)
|
||||
|
||||
# Load test prevalences and classes
|
||||
test_true_prev_path = join(ifcb_dir, 'test_prevalences.csv')
|
||||
test_true_prev = pd.read_csv(test_true_prev_path)
|
||||
classes = test_true_prev.columns[1:]
|
||||
|
||||
#Load train samples
|
||||
train_samples_path = join(ifcb_dir,'train')
|
||||
train_gen = IFCBTrainSamplesFromDir(path_dir=train_samples_path, classes=classes)
|
||||
|
||||
#Load test samples
|
||||
test_samples_path = join(ifcb_dir,'test')
|
||||
test_gen = IFCBTestSamples(path_dir=test_samples_path, test_prevalences_path=test_true_prev_path)
|
||||
|
||||
# In the case the user wants it, join all the train samples in one LabelledCollection
|
||||
if single_sample_train:
|
||||
X = []
|
||||
y = []
|
||||
for X_, y_ in train_gen():
|
||||
X.append(X_)
|
||||
y.append(y_)
|
||||
|
||||
X = np.vstack(X)
|
||||
y = np.concatenate(y)
|
||||
train = LabelledCollection(X,y, classes=classes)
|
||||
return train, test_gen
|
||||
else:
|
||||
return train_gen, test_gen
|
||||
|
|
Loading…
Reference in New Issue