testing IFCB dataset

This commit is contained in:
Alejandro Moreo Fernandez 2024-02-08 14:33:22 +01:00
parent 3c28a75b8c
commit a8230827e2
7 changed files with 78 additions and 41 deletions

View File

@ -51,7 +51,6 @@
<li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li> <li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li>
<li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.data package</a></li> <li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.data package</a></li>
<li class="toctree-l4"><a class="reference internal" href="quapy.method.html">quapy.method package</a></li> <li class="toctree-l4"><a class="reference internal" href="quapy.method.html">quapy.method package</a></li>
<li class="toctree-l4"><a class="reference internal" href="quapy.tests.html">quapy.tests package</a></li>
</ul> </ul>
</li> </li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li> <li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li>
@ -627,30 +626,31 @@ otherwise.</p>
<span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Link to this heading"></a></h2> <span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Link to this heading"></a></h2>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_IFCB"> <dt class="sig sig-object py" id="quapy.data.datasets.fetch_IFCB">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_IFCB</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">single_sample_train</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_IFCB"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_IFCB" title="Link to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_IFCB</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">single_sample_train</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_IFCB"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_IFCB" title="Link to this definition"></a></dt>
<dd><p>Loads the IFCB dataset for quantification &lt;<a class="reference external" href="https://zenodo.org/records/10036244">https://zenodo.org/records/10036244</a>&gt;`. For more <dd><p>Loads the IFCB dataset for quantification from <a class="reference external" href="https://zenodo.org/records/10036244">Zenodo</a> (for more
information on this dataset check the zenodo site. information on this dataset, please follow the zenodo link).
This dataset is based on the data available publicly at &lt;<a class="reference external" href="https://github.com/hsosik/WHOI-Plankton">https://github.com/hsosik/WHOI-Plankton</a>&gt;. This dataset is based on the data available publicly at
The scripts for the processing are available at &lt;<a class="reference external" href="https://github.com/pglez82/IFCB_Zenodo">https://github.com/pglez82/IFCB_Zenodo</a>&gt;</p> <a class="reference external" href="https://github.com/hsosik/WHOI-Plankton">WHOI-Plankton repo</a>.
<p>Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.</p> The scripts for the processing are available at <a class="reference external" href="https://github.com/pglez82/IFCB_Zenodo">P. González's repo</a>.
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.</p>
<p>The datasets are downloaded only once, and stored for fast reuse.</p> <p>The datasets are downloaded only once, and stored for fast reuse.</p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt> <dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
<li><p><strong>single_sample_train</strong> boolean. If True (default), it returns the train dataset as an instance of <li><p><strong>single_sample_train</strong> a boolean. If true, it will return the train dataset as a
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a> (all examples together). <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a> (all examples together).
If False, a generator of training samples will be returned. If false, a generator of training samples will be returned. Each example in the training set has an individual label.</p></li>
Each example in the training set has an individual class label.</p></li> <li><p><strong>for_model_selection</strong> if True, then returns a split 30% of the training set (86 out of 286 samples) to be used for model selection;
if False, then returns the full training set as training set and the test set as the test set</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default <li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory)</p></li> ~/quapy_data/ directory)</p></li>
</ul> </ul>
</dd> </dd>
<dt class="field-even">Returns<span class="colon">:</span></dt> <dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a tuple <cite>(train, test_gen)</cite> where <cite>train</cite> is an instance of <dd class="field-even"><p>a tuple <cite>(train, test_gen)</cite> where <cite>train</cite> is an instance of
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, if <cite>single_sample_train</cite> is True or <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, if <cite>single_sample_train</cite> is true or
<code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTrainSamplesFromDir</span></code> otherwise, i.e. a sampling protocol that <code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTrainSamplesFromDir</span></code>, i.e. a sampling protocol that returns a series of samples
returns a series of samples labelled example by example. labelled example by example. test_gen will be a <code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTestSamples</span></code>,
test_gen is an instance of <code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTestSamples</span></code>,
i.e., a sampling protocol that returns a series of samples labelled by prevalence.</p> i.e., a sampling protocol that returns a series of samples labelled by prevalence.</p>
</dd> </dd>
</dl> </dl>

View File

@ -22,7 +22,6 @@
<script src="_static/js/theme.js"></script> <script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" /> <link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" /> <link rel="search" title="Search" href="search.html" />
<link rel="next" title="quapy.tests package" href="quapy.tests.html" />
<link rel="prev" title="quapy.data package" href="quapy.data.html" /> <link rel="prev" title="quapy.data package" href="quapy.data.html" />
</head> </head>
@ -52,7 +51,6 @@
<li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li> <li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li>
<li class="toctree-l4"><a class="reference internal" href="quapy.data.html">quapy.data package</a></li> <li class="toctree-l4"><a class="reference internal" href="quapy.data.html">quapy.data package</a></li>
<li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.method package</a></li> <li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.method package</a></li>
<li class="toctree-l4"><a class="reference internal" href="quapy.tests.html">quapy.tests package</a></li>
</ul> </ul>
</li> </li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li> <li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li>
@ -2820,7 +2818,6 @@ any quantification method should beat.</p>
</div> </div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer"> <footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="quapy.data.html" class="btn btn-neutral float-left" title="quapy.data package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a> <a href="quapy.data.html" class="btn btn-neutral float-left" title="quapy.data package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="quapy.tests.html" class="btn btn-neutral float-right" title="quapy.tests package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div> </div>
<hr/> <hr/>

File diff suppressed because one or more lines are too long

View File

@ -1,29 +1,49 @@
import numpy as np
import quapy as qp import quapy as qp
from sklearn.linear_model import LogisticRegression from sklearn.linear_model import LogisticRegression
from quapy.model_selection import GridSearchQ
from quapy.evaluation import evaluation_report from quapy.evaluation import evaluation_report
def newLR(): print('Quantifying the IFCB dataset with PACC\n')
return LogisticRegression(n_jobs=-1)
# model selection
print('loading dataset for model selection...', end='')
train, val_gen = qp.datasets.fetch_IFCB(for_model_selection=True, single_sample_train=True)
print('[done]')
print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}')
print(f'\tvalidation samples={val_gen.total()}')
quantifiers = [ print('model selection starts')
('CC', qp.method.aggregative.CC(newLR())), quantifier = qp.method.aggregative.PACC(LogisticRegression())
('ACC', qp.method.aggregative.ACC(newLR())),
('PCC', qp.method.aggregative.PCC(newLR())),
('PACC', qp.method.aggregative.PACC(newLR())),
('HDy', qp.method.aggregative.DMy(newLR())),
('EMQ', qp.method.aggregative.EMQ(newLR()))
]
mod_sel = GridSearchQ(
quantifier,
param_grid={
'classifier__C': np.logspace(-3,3,7),
'classifier__class_weight': [None, 'balanced']
},
protocol=val_gen,
refit=False,
n_jobs=-1,
verbose=True,
raise_errors=True
).fit(train)
for quant_name, quantifier in quantifiers: print(f'model selection chose hyperparameters: {mod_sel.best_params_}')
quantifier = mod_sel.best_model_
print("Experiment with "+quant_name) print('loading dataset for test...', end='')
train, test_gen = qp.datasets.fetch_IFCB(for_model_selection=False, single_sample_train=True)
print('[done]')
print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}')
print(f'\ttest samples={test_gen.total()}')
train, test_gen = qp.datasets.fetch_IFCB() print('training on the whole dataset before test')
quantifier.fit(train)
quantifier.fit(train) print('testing...')
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True) print(report.mean())
print(report.mean())

View File

@ -4,6 +4,7 @@ import math
from quapy.protocol import AbstractProtocol from quapy.protocol import AbstractProtocol
from pathlib import Path from pathlib import Path
def get_sample_list(path_dir): def get_sample_list(path_dir):
"""Gets a sample list finding the csv files in a directory """Gets a sample list finding the csv files in a directory
@ -19,6 +20,7 @@ def get_sample_list(path_dir):
samples.append(filename) samples.append(filename)
return samples return samples
def generate_modelselection_split(samples, split=0.3): def generate_modelselection_split(samples, split=0.3):
"""This function generates a train/test split for model selection """This function generates a train/test split for model selection
without the use of random numbers so the split is always the same without the use of random numbers so the split is always the same
@ -37,6 +39,7 @@ def generate_modelselection_split(samples, split=0.3):
train = [item for i, item in enumerate(samples) if i not in test_indices] train = [item for i, item in enumerate(samples) if i not in test_indices]
return train, test return train, test
class IFCBTrainSamplesFromDir(AbstractProtocol): class IFCBTrainSamplesFromDir(AbstractProtocol):
def __init__(self, path_dir:str, classes: list, samples: list = None): def __init__(self, path_dir:str, classes: list, samples: list = None):
@ -64,6 +67,7 @@ class IFCBTrainSamplesFromDir(AbstractProtocol):
""" """
return len(self.samples) return len(self.samples)
class IFCBTestSamples(AbstractProtocol): class IFCBTestSamples(AbstractProtocol):
def __init__(self, path_dir:str, test_prevalences: pd.DataFrame, samples: list = None, classes: list=None): def __init__(self, path_dir:str, test_prevalences: pd.DataFrame, samples: list = None, classes: list=None):

View File

@ -734,13 +734,14 @@ def fetch_lequa2022(task, data_home=None):
return train, val_gen, test_gen return train, val_gen, test_gen
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None): def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
""" """
Loads the IFCB dataset for quantification <https://zenodo.org/records/10036244>`. For more Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
information on this dataset check the zenodo site. information on this dataset, please follow the zenodo link).
This dataset is based on the data available publicly at <https://github.com/hsosik/WHOI-Plankton>. This dataset is based on the data available publicly at
The scripts for the processing are available at <https://github.com/pglez82/IFCB_Zenodo> `WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms. Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
The datasets are downloaded only once, and stored for fast reuse. The datasets are downloaded only once, and stored for fast reuse.

View File

@ -60,6 +60,19 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
""" """
pass pass
def _check_non_empty_classes(self, data: "LabelledCollection"):
    """
    Checks that every class in `data` has at least one example.

    :param data: LabelledCollection; only its `prevalence()` method and its `classes_`
        attribute are used here
    :return: Nothing.
    :raises ValueError: if one or more classes have a prevalence of exactly 0 in `data`
    """
    sample_prevs = np.asarray(data.prevalence())
    # positions whose prevalence is exactly zero; these positions are used below
    # to look up the corresponding entries of data.classes_
    empty_classes = np.flatnonzero(sample_prevs == 0)
    if empty_classes.size > 0:
        # np.asarray makes the positional lookup work even if classes_ is a plain
        # list or tuple rather than a numpy array
        empty_class_names = np.asarray(data.classes_)[empty_classes]
        raise ValueError(f'classes {empty_class_names} have no training examples')
def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None): def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
""" """
Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function. Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
@ -93,6 +106,9 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba')) self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))
if fit_classifier:
self._check_non_empty_classes(data)
if predict_on is None: if predict_on is None:
predict_on = self.val_split predict_on = self.val_split
@ -100,7 +116,6 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
if fit_classifier: if fit_classifier:
self.classifier.fit(*data.Xy) self.classifier.fit(*data.Xy)
predictions = None predictions = None
elif isinstance(predict_on, float): elif isinstance(predict_on, float):
if fit_classifier: if fit_classifier:
if not (0. < predict_on < 1.): if not (0. < predict_on < 1.):