testing IFCB dataset
This commit is contained in:
parent
3c28a75b8c
commit
a8230827e2
|
@ -51,7 +51,6 @@
|
||||||
<li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li>
|
<li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li>
|
||||||
<li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.data package</a></li>
|
<li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.data package</a></li>
|
||||||
<li class="toctree-l4"><a class="reference internal" href="quapy.method.html">quapy.method package</a></li>
|
<li class="toctree-l4"><a class="reference internal" href="quapy.method.html">quapy.method package</a></li>
|
||||||
<li class="toctree-l4"><a class="reference internal" href="quapy.tests.html">quapy.tests package</a></li>
|
|
||||||
</ul>
|
</ul>
|
||||||
</li>
|
</li>
|
||||||
<li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li>
|
<li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li>
|
||||||
|
@ -627,30 +626,31 @@ otherwise.</p>
|
||||||
<span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Link to this heading"></a></h2>
|
<span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Link to this heading"></a></h2>
|
||||||
<dl class="py function">
|
<dl class="py function">
|
||||||
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_IFCB">
|
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_IFCB">
|
||||||
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_IFCB</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">single_sample_train</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_IFCB"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_IFCB" title="Link to this definition"></a></dt>
|
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_IFCB</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">single_sample_train</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_IFCB"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_IFCB" title="Link to this definition"></a></dt>
|
||||||
<dd><p>Loads the IFCB dataset for quantification <<a class="reference external" href="https://zenodo.org/records/10036244">https://zenodo.org/records/10036244</a>>`. For more
|
<dd><p>Loads the IFCB dataset for quantification from <a class="reference external" href="https://zenodo.org/records/10036244">Zenodo</a> (for more
|
||||||
information on this dataset check the zenodo site.
|
information on this dataset, please follow the zenodo link).
|
||||||
This dataset is based on the data available publicly at <<a class="reference external" href="https://github.com/hsosik/WHOI-Plankton">https://github.com/hsosik/WHOI-Plankton</a>>.
|
This dataset is based on the data available publicly at
|
||||||
The scripts for the processing are available at <<a class="reference external" href="https://github.com/pglez82/IFCB_Zenodo">https://github.com/pglez82/IFCB_Zenodo</a>></p>
|
<a class="reference external" href="https://github.com/hsosik/WHOI-Plankton">WHOI-Plankton repo</a>.
|
||||||
<p>Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.</p>
|
The scripts for the processing are available at <a class="reference external" href="https://github.com/pglez82/IFCB_Zenodo">P. González’s repo</a>.
|
||||||
|
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.</p>
|
||||||
<p>The datasets are downloaded only once, and stored for fast reuse.</p>
|
<p>The datasets are downloaded only once, and stored for fast reuse.</p>
|
||||||
<dl class="field-list simple">
|
<dl class="field-list simple">
|
||||||
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
|
||||||
<dd class="field-odd"><ul class="simple">
|
<dd class="field-odd"><ul class="simple">
|
||||||
<li><p><strong>single_sample_train</strong> – boolean. If True (default), it returns the train dataset as an instance of
|
<li><p><strong>single_sample_train</strong> – a boolean. If true, it will return the train dataset as a
|
||||||
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a> (all examples together).
|
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a> (all examples together).
|
||||||
If False, a generator of training samples will be returned.
|
If false, a generator of training samples will be returned. Each example in the training set has an individual label.</p></li>
|
||||||
Each example in the training set has an individual class label.</p></li>
|
<li><p><strong>for_model_selection</strong> – if True, then 30% of the training set (86 out of 286 samples) is split off to be used for model selection;
|
||||||
|
if False, then returns the full training set as training set and the test set as the test set</p></li>
|
||||||
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
<li><p><strong>data_home</strong> – specify the quapy home directory where collections will be dumped (leave empty to use the default
|
||||||
~/quapy_data/ directory)</p></li>
|
~/quapy_data/ directory)</p></li>
|
||||||
</ul>
|
</ul>
|
||||||
</dd>
|
</dd>
|
||||||
<dt class="field-even">Returns<span class="colon">:</span></dt>
|
<dt class="field-even">Returns<span class="colon">:</span></dt>
|
||||||
<dd class="field-even"><p>a tuple <cite>(train, test_gen)</cite> where <cite>train</cite> is an instance of
|
<dd class="field-even"><p>a tuple <cite>(train, test_gen)</cite> where <cite>train</cite> is an instance of
|
||||||
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, if <cite>single_sample_train</cite> is True or
|
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, if <cite>single_sample_train</cite> is true or
|
||||||
<code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTrainSamplesFromDir</span></code> otherwise, i.e. a sampling protocol that
|
<code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTrainSamplesFromDir</span></code> otherwise, i.e. a sampling protocol that returns a series of samples
|
||||||
returns a series of samples labelled example by example.
|
labelled example by example. test_gen will be a <code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTestSamples</span></code>,
|
||||||
test_gen is an instance of <code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTestSamples</span></code>,
|
|
||||||
i.e., a sampling protocol that returns a series of samples labelled by prevalence.</p>
|
i.e., a sampling protocol that returns a series of samples labelled by prevalence.</p>
|
||||||
</dd>
|
</dd>
|
||||||
</dl>
|
</dl>
|
||||||
|
|
|
@ -22,7 +22,6 @@
|
||||||
<script src="_static/js/theme.js"></script>
|
<script src="_static/js/theme.js"></script>
|
||||||
<link rel="index" title="Index" href="genindex.html" />
|
<link rel="index" title="Index" href="genindex.html" />
|
||||||
<link rel="search" title="Search" href="search.html" />
|
<link rel="search" title="Search" href="search.html" />
|
||||||
<link rel="next" title="quapy.tests package" href="quapy.tests.html" />
|
|
||||||
<link rel="prev" title="quapy.data package" href="quapy.data.html" />
|
<link rel="prev" title="quapy.data package" href="quapy.data.html" />
|
||||||
</head>
|
</head>
|
||||||
|
|
||||||
|
@ -52,7 +51,6 @@
|
||||||
<li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li>
|
<li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li>
|
||||||
<li class="toctree-l4"><a class="reference internal" href="quapy.data.html">quapy.data package</a></li>
|
<li class="toctree-l4"><a class="reference internal" href="quapy.data.html">quapy.data package</a></li>
|
||||||
<li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.method package</a></li>
|
<li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.method package</a></li>
|
||||||
<li class="toctree-l4"><a class="reference internal" href="quapy.tests.html">quapy.tests package</a></li>
|
|
||||||
</ul>
|
</ul>
|
||||||
</li>
|
</li>
|
||||||
<li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li>
|
<li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li>
|
||||||
|
@ -2820,7 +2818,6 @@ any quantification method should beat.</p>
|
||||||
</div>
|
</div>
|
||||||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||||||
<a href="quapy.data.html" class="btn btn-neutral float-left" title="quapy.data package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
<a href="quapy.data.html" class="btn btn-neutral float-left" title="quapy.data package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||||||
<a href="quapy.tests.html" class="btn btn-neutral float-right" title="quapy.tests package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<hr/>
|
<hr/>
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,29 +1,49 @@
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
import quapy as qp
|
import quapy as qp
|
||||||
from sklearn.linear_model import LogisticRegression
|
from sklearn.linear_model import LogisticRegression
|
||||||
|
|
||||||
|
from quapy.model_selection import GridSearchQ
|
||||||
from quapy.evaluation import evaluation_report
|
from quapy.evaluation import evaluation_report
|
||||||
|
|
||||||
|
|
||||||
def newLR():
|
print('Quantifying the IFCB dataset with PACC\n')
|
||||||
return LogisticRegression(n_jobs=-1)
|
|
||||||
|
|
||||||
|
# model selection
|
||||||
|
print('loading dataset for model selection...', end='')
|
||||||
|
train, val_gen = qp.datasets.fetch_IFCB(for_model_selection=True, single_sample_train=True)
|
||||||
|
print('[done]')
|
||||||
|
print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}')
|
||||||
|
print(f'\tvalidation samples={val_gen.total()}')
|
||||||
|
|
||||||
quantifiers = [
|
print('model selection starts')
|
||||||
('CC', qp.method.aggregative.CC(newLR())),
|
quantifier = qp.method.aggregative.PACC(LogisticRegression())
|
||||||
('ACC', qp.method.aggregative.ACC(newLR())),
|
|
||||||
('PCC', qp.method.aggregative.PCC(newLR())),
|
|
||||||
('PACC', qp.method.aggregative.PACC(newLR())),
|
|
||||||
('HDy', qp.method.aggregative.DMy(newLR())),
|
|
||||||
('EMQ', qp.method.aggregative.EMQ(newLR()))
|
|
||||||
]
|
|
||||||
|
|
||||||
|
mod_sel = GridSearchQ(
|
||||||
|
quantifier,
|
||||||
|
param_grid={
|
||||||
|
'classifier__C': np.logspace(-3,3,7),
|
||||||
|
'classifier__class_weight': [None, 'balanced']
|
||||||
|
},
|
||||||
|
protocol=val_gen,
|
||||||
|
refit=False,
|
||||||
|
n_jobs=-1,
|
||||||
|
verbose=True,
|
||||||
|
raise_errors=True
|
||||||
|
).fit(train)
|
||||||
|
|
||||||
for quant_name, quantifier in quantifiers:
|
print(f'model selection chose hyperparameters: {mod_sel.best_params_}')
|
||||||
|
quantifier = mod_sel.best_model_
|
||||||
|
|
||||||
print("Experiment with "+quant_name)
|
print('loading dataset for test...', end='')
|
||||||
|
train, test_gen = qp.datasets.fetch_IFCB(for_model_selection=False, single_sample_train=True)
|
||||||
train, test_gen = qp.datasets.fetch_IFCB()
|
print('[done]')
|
||||||
|
print(f'\ttraining size={len(train)}, features={train.X.shape[1]}, classes={train.n_classes}')
|
||||||
|
print(f'\ttest samples={test_gen.total()}')
|
||||||
|
|
||||||
|
print('training on the whole dataset before test')
|
||||||
quantifier.fit(train)
|
quantifier.fit(train)
|
||||||
|
|
||||||
|
print('testing...')
|
||||||
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
|
report = evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae'], verbose=True)
|
||||||
print(report.mean())
|
print(report.mean())
|
||||||
|
|
|
@ -4,6 +4,7 @@ import math
|
||||||
from quapy.protocol import AbstractProtocol
|
from quapy.protocol import AbstractProtocol
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
def get_sample_list(path_dir):
|
def get_sample_list(path_dir):
|
||||||
"""Gets a sample list finding the csv files in a directory
|
"""Gets a sample list finding the csv files in a directory
|
||||||
|
|
||||||
|
@ -19,6 +20,7 @@ def get_sample_list(path_dir):
|
||||||
samples.append(filename)
|
samples.append(filename)
|
||||||
return samples
|
return samples
|
||||||
|
|
||||||
|
|
||||||
def generate_modelselection_split(samples, split=0.3):
|
def generate_modelselection_split(samples, split=0.3):
|
||||||
"""This function generates a train/test split for model selection
|
"""This function generates a train/test split for model selection
|
||||||
without the use of random numbers so the split is always the same
|
without the use of random numbers so the split is always the same
|
||||||
|
@ -37,6 +39,7 @@ def generate_modelselection_split(samples, split=0.3):
|
||||||
train = [item for i, item in enumerate(samples) if i not in test_indices]
|
train = [item for i, item in enumerate(samples) if i not in test_indices]
|
||||||
return train, test
|
return train, test
|
||||||
|
|
||||||
|
|
||||||
class IFCBTrainSamplesFromDir(AbstractProtocol):
|
class IFCBTrainSamplesFromDir(AbstractProtocol):
|
||||||
|
|
||||||
def __init__(self, path_dir:str, classes: list, samples: list = None):
|
def __init__(self, path_dir:str, classes: list, samples: list = None):
|
||||||
|
@ -64,6 +67,7 @@ class IFCBTrainSamplesFromDir(AbstractProtocol):
|
||||||
"""
|
"""
|
||||||
return len(self.samples)
|
return len(self.samples)
|
||||||
|
|
||||||
|
|
||||||
class IFCBTestSamples(AbstractProtocol):
|
class IFCBTestSamples(AbstractProtocol):
|
||||||
|
|
||||||
def __init__(self, path_dir:str, test_prevalences: pd.DataFrame, samples: list = None, classes: list=None):
|
def __init__(self, path_dir:str, test_prevalences: pd.DataFrame, samples: list = None, classes: list=None):
|
||||||
|
|
|
@ -734,13 +734,14 @@ def fetch_lequa2022(task, data_home=None):
|
||||||
|
|
||||||
return train, val_gen, test_gen
|
return train, val_gen, test_gen
|
||||||
|
|
||||||
|
|
||||||
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
|
def fetch_IFCB(single_sample_train=True, for_model_selection=False, data_home=None):
|
||||||
"""
|
"""
|
||||||
Loads the IFCB dataset for quantification <https://zenodo.org/records/10036244>`. For more
|
Loads the IFCB dataset for quantification from `Zenodo <https://zenodo.org/records/10036244>`_ (for more
|
||||||
information on this dataset check the zenodo site.
|
information on this dataset, please follow the zenodo link).
|
||||||
This dataset is based on the data available publicly at <https://github.com/hsosik/WHOI-Plankton>.
|
This dataset is based on the data available publicly at
|
||||||
The scripts for the processing are available at <https://github.com/pglez82/IFCB_Zenodo>
|
`WHOI-Plankton repo <https://github.com/hsosik/WHOI-Plankton>`_.
|
||||||
|
The scripts for the processing are available at `P. González's repo <https://github.com/pglez82/IFCB_Zenodo>`_.
|
||||||
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
|
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.
|
||||||
|
|
||||||
The datasets are downloaded only once, and stored for fast reuse.
|
The datasets are downloaded only once, and stored for fast reuse.
|
||||||
|
|
|
@ -60,6 +60,19 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
def _check_non_empty_classes(self, data: LabelledCollection):
|
||||||
|
"""
|
||||||
|
Asserts all classes have positive instances.
|
||||||
|
|
||||||
|
:param data: LabelledCollection
|
||||||
|
:return: Nothing. May raise an exception.
|
||||||
|
"""
|
||||||
|
sample_prevs = data.prevalence()
|
||||||
|
empty_classes = np.argwhere(sample_prevs==0).flatten()
|
||||||
|
if len(empty_classes)>0:
|
||||||
|
empty_class_names = data.classes_[empty_classes]
|
||||||
|
raise ValueError(f'classes {empty_class_names} have no training examples')
|
||||||
|
|
||||||
def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
|
def fit(self, data: LabelledCollection, fit_classifier=True, val_split=None):
|
||||||
"""
|
"""
|
||||||
Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
|
Trains the aggregative quantifier. This comes down to training a classifier and an aggregation function.
|
||||||
|
@ -93,6 +106,9 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
|
||||||
|
|
||||||
self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))
|
self._check_classifier(adapt_if_necessary=(self._classifier_method() == 'predict_proba'))
|
||||||
|
|
||||||
|
if fit_classifier:
|
||||||
|
self._check_non_empty_classes(data)
|
||||||
|
|
||||||
if predict_on is None:
|
if predict_on is None:
|
||||||
predict_on = self.val_split
|
predict_on = self.val_split
|
||||||
|
|
||||||
|
@ -100,7 +116,6 @@ class AggregativeQuantifier(BaseQuantifier, ABC):
|
||||||
if fit_classifier:
|
if fit_classifier:
|
||||||
self.classifier.fit(*data.Xy)
|
self.classifier.fit(*data.Xy)
|
||||||
predictions = None
|
predictions = None
|
||||||
|
|
||||||
elif isinstance(predict_on, float):
|
elif isinstance(predict_on, float):
|
||||||
if fit_classifier:
|
if fit_classifier:
|
||||||
if not (0. < predict_on < 1.):
|
if not (0. < predict_on < 1.):
|
||||||
|
|
Loading…
Reference in New Issue