QuaPy/docs/build/html/quapy.data.html

1208 lines
122 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="./">
<head>
<meta charset="utf-8" /><meta name="generator" content="Docutils 0.19: https://docutils.sourceforge.io/" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>quapy.data package &mdash; QuaPy: A Python-based open-source framework for quantification 0.1.8 documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=92fd9be5" />
<link rel="stylesheet" type="text/css" href="_static/css/theme.css?v=19f00094" />
<!--[if lt IE 9]>
<script src="_static/js/html5shiv.min.js"></script>
<![endif]-->
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=22607128"></script>
<script src="_static/doctools.js?v=9a2dae69"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="quapy.method package" href="quapy.method.html" />
<link rel="prev" title="quapy.classification package" href="quapy.classification.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
QuaPy: A Python-based open-source framework for quantification
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<ul class="current">
<li class="toctree-l1 current"><a class="reference internal" href="modules.html">quapy</a><ul class="current">
<li class="toctree-l2 current"><a class="reference internal" href="quapy.html">quapy package</a><ul class="current">
<li class="toctree-l3 current"><a class="reference internal" href="quapy.html#subpackages">Subpackages</a><ul class="current">
<li class="toctree-l4"><a class="reference internal" href="quapy.classification.html">quapy.classification package</a></li>
<li class="toctree-l4 current"><a class="current reference internal" href="#">quapy.data package</a></li>
<li class="toctree-l4"><a class="reference internal" href="quapy.method.html">quapy.method package</a></li>
</ul>
</li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#submodules">Submodules</a></li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#module-quapy.error">quapy.error module</a></li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#module-quapy.evaluation">quapy.evaluation module</a></li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#module-quapy.functional">quapy.functional module</a></li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#module-quapy.model_selection">quapy.model_selection module</a></li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#module-quapy.plot">quapy.plot module</a></li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#module-quapy.protocol">quapy.protocol module</a></li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#module-quapy.util">quapy.util module</a></li>
<li class="toctree-l3"><a class="reference internal" href="quapy.html#module-quapy">Module contents</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">QuaPy: A Python-based open-source framework for quantification</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item"><a href="modules.html">quapy</a></li>
<li class="breadcrumb-item"><a href="quapy.html">quapy package</a></li>
<li class="breadcrumb-item active">quapy.data package</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/quapy.data.rst.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="quapy-data-package">
<h1>quapy.data package<a class="headerlink" href="#quapy-data-package" title="Link to this heading"></a></h1>
<section id="submodules">
<h2>Submodules<a class="headerlink" href="#submodules" title="Link to this heading"></a></h2>
</section>
<section id="module-quapy.data.base">
<span id="quapy-data-base-module"></span><h2>quapy.data.base module<a class="headerlink" href="#module-quapy.data.base" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="quapy.data.base.Dataset">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">Dataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">training</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">test</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">vocabulary</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">dict</span><span class="w"> </span><span class="p"><span class="pre">|</span></span><span class="w"> </span><span class="pre">None</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">name</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#Dataset"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.Dataset" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>Abstraction of training and test <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> objects.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>training</strong> a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
<li><p><strong>test</strong> a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance</p></li>
<li><p><strong>vocabulary</strong> if indicated, is a dictionary of the terms used in this textual dataset</p></li>
<li><p><strong>name</strong> a string representing the name of the dataset</p></li>
</ul>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.SplitStratified">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">SplitStratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">collection</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">train_size</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#Dataset.SplitStratified"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.Dataset.SplitStratified" title="Link to this definition"></a></dt>
<dd><p>Generates a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> from a stratified split of a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance.
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.split_stratified" title="quapy.data.base.LabelledCollection.split_stratified"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.split_stratified()</span></code></a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>collection</strong> <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p></li>
<li><p><strong>train_size</strong> the proportion of training documents (the rest conforms the test split)</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.binary">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.Dataset.binary" title="Link to this definition"></a></dt>
<dd><p>Returns True if the training collection is labelled according to two classes</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>boolean</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.classes_">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">classes_</span></span><a class="headerlink" href="#quapy.data.base.Dataset.classes_" title="Link to this definition"></a></dt>
<dd><p>The classes according to which the training collection is labelled</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>The classes according to which the training collection is labelled</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.kFCV">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">data</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#Dataset.kFCV"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.Dataset.kFCV" title="Link to this definition"></a></dt>
<dd><p>Generator of stratified folds to be used in k-fold cross validation. This function is only a wrapper around
<a class="reference internal" href="#quapy.data.base.LabelledCollection.kFCV" title="quapy.data.base.LabelledCollection.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.kFCV()</span></code></a> that returns <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instances made of training and test folds.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>nfolds</strong> integer (default 5), the number of folds to generate</p></li>
<li><p><strong>nrepeats</strong> integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
<li><p><strong>random_state</strong> integer (default 0), guarantees that the folds generated are reproducible</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation as instances of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.load">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#Dataset.load"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.Dataset.load" title="Link to this definition"></a></dt>
<dd><p>Loads a training and a test labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> instance.
The function in charge of reading the instances must be specified. This function can be a custom one, or any of
the reading functions defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>train_path</strong> string, the path to the file containing the training instances</p></li>
<li><p><strong>test_path</strong> string, the path to the file containing the test instances</p></li>
<li><p><strong>loader_func</strong> a custom function that implements the data loader and returns a tuple with instances and
labels</p></li>
<li><p><strong>classes</strong> array-like, the classes according to which the instances are labelled</p></li>
<li><p><strong>loader_kwargs</strong> any argument that the <cite>loader_func</cite> function needs in order to read the instances.
See <a class="reference internal" href="#quapy.data.base.LabelledCollection.load" title="quapy.data.base.LabelledCollection.load"><code class="xref py py-meth docutils literal notranslate"><span class="pre">LabelledCollection.load()</span></code></a> for further details.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">Dataset</span></code></a> object</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.n_classes">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.Dataset.n_classes" title="Link to this definition"></a></dt>
<dd><p>The number of classes according to which the training collection is labelled</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.reduce">
<span class="sig-name descname"><span class="pre">reduce</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">n_train</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_test</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">100</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#Dataset.reduce"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.Dataset.reduce" title="Link to this definition"></a></dt>
<dd><p>Reduce the number of instances in place for quick experiments. Preserves the prevalence of each set.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>n_train</strong> number of training documents to keep (default 100)</p></li>
<li><p><strong>n_test</strong> number of test documents to keep (default 100)</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>self</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.stats">
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#Dataset.stats"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.Dataset.stats" title="Link to this definition"></a></dt>
<dd><p>Returns (and eventually prints) a dictionary with some stats of this dataset. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">&#39;kindle&#39;</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">Dataset</span><span class="o">=</span><span class="n">kindle</span> <span class="c1">#tr-instances=3821, #te-instances=21591, type=&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;, #features=4403, #classes=[0 1], tr-prevs=[0.081, 0.919], te-prevs=[0.063, 0.937]</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>show</strong> if set to True (default), prints the stats in standard output</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a dictionary containing some stats of this collection for the training and test collections. The keys
are <cite>train</cite> and <cite>test</cite>, and point to dedicated dictionaries of stats, for each collection, with keys
<cite>#instances</cite> (the number of instances), <cite>type</cite> (the type representing the instances),
<cite>#features</cite> (the number of features, if the instances are in array-like format), <cite>#classes</cite> (the classes of
the collection), <cite>prevs</cite> (the prevalence values for each class)</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.train_test">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">train_test</span></span><a class="headerlink" href="#quapy.data.base.Dataset.train_test" title="Link to this definition"></a></dt>
<dd><p>Alias to <cite>self.training</cite> and <cite>self.test</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>the training and test collections</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>the training and test collections</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.Dataset.vocabulary_size">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">vocabulary_size</span></span><a class="headerlink" href="#quapy.data.base.Dataset.vocabulary_size" title="Link to this definition"></a></dt>
<dd><p>If the dataset is textual, and the vocabulary was indicated, returns the size of the vocabulary</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">quapy.data.base.</span></span><span class="sig-name descname"><span class="pre">LabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">instances</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">labels</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>A LabelledCollection is a set of objects each with a label attached to each of them.
This class implements several sampling routines and other utilities.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>instances</strong> array-like (np.ndarray, list, or csr_matrix are supported)</p></li>
<li><p><strong>labels</strong> array-like with the same length of instances</p></li>
<li><p><strong>classes</strong> optional, list of classes from which labels are taken. If not specified, the classes are inferred
from the labels. The classes must be indicated in cases in which some of the labels might have no examples
(i.e., a prevalence of 0)</p></li>
</ul>
</dd>
</dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.X">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">X</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.X" title="Link to this definition"></a></dt>
<dd><p>An alias to self.instances</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>self.instances</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.Xp">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">Xp</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.Xp" title="Link to this definition"></a></dt>
<dd><p>Gets the instances and the true prevalence. This is useful when implementing evaluation protocols from
a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>a tuple <cite>(instances, prevalence)</cite> from this collection</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.Xy">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">Xy</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.Xy" title="Link to this definition"></a></dt>
<dd><p>Gets the instances and labels. This is useful when working with <cite>sklearn</cite> estimators, e.g.:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">svm</span> <span class="o">=</span> <span class="n">LinearSVC</span><span class="p">()</span><span class="o">.</span><span class="n">fit</span><span class="p">(</span><span class="o">*</span><span class="n">my_collection</span><span class="o">.</span><span class="n">Xy</span><span class="p">)</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>a tuple <cite>(instances, labels)</cite> from this collection</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.binary">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">binary</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.binary" title="Link to this definition"></a></dt>
<dd><p>Returns True if the number of classes is 2</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>boolean</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.counts">
<span class="sig-name descname"><span class="pre">counts</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.counts"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.counts" title="Link to this definition"></a></dt>
<dd><p>Returns the number of instances for each of the classes in the codeframe.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the number of instances of each class, in the same order
as listed by <cite>self.classes_</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.join">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">join</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">Iterable</span><span class="p"><span class="pre">[</span></span><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a><span class="p"><span class="pre">]</span></span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.join"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.join" title="Link to this definition"></a></dt>
<dd><p>Returns a new <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> as the union of the collections given in input.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>args</strong> instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> representing the union of both collections</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.kFCV">
<span class="sig-name descname"><span class="pre">kFCV</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">nfolds</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nrepeats</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.kFCV"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.kFCV" title="Link to this definition"></a></dt>
<dd><p>Generator of stratified folds to be used in k-fold cross validation.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>nfolds</strong> integer (default 5), the number of folds to generate</p></li>
<li><p><strong>nrepeats</strong> integer (default 1), the number of rounds of k-fold cross validation to run</p></li>
<li><p><strong>random_state</strong> integer (default 0), guarantees that the folds generated are reproducible</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>yields <cite>nfolds * nrepeats</cite> folds for k-fold cross validation</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.load">
<em class="property"><span class="pre">classmethod</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">loader_func</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">callable</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">classes</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">loader_kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.load"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.load" title="Link to this definition"></a></dt>
<dd><p>Loads a labelled set of data and convert it into a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> instance. The function in charge
of reading the instances must be specified. This function can be a custom one, or any of the reading functions
defined in <a class="reference internal" href="#module-quapy.data.reader" title="quapy.data.reader"><code class="xref py py-mod docutils literal notranslate"><span class="pre">quapy.data.reader</span></code></a> module.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> string, the path to the file containing the labelled instances</p></li>
<li><p><strong>loader_func</strong> a custom function that implements the data loader and returns a tuple with instances and
labels</p></li>
<li><p><strong>classes</strong> array-like, the classes according to which the instances are labelled</p></li>
<li><p><strong>loader_kwargs</strong> any argument that the <cite>loader_func</cite> function needs in order to read the instances, i.e.,
these arguments are used to call <cite>loader_func(path, **loader_kwargs)</cite></p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> object</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.n_classes">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">n_classes</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.n_classes" title="Link to this definition"></a></dt>
<dd><p>The number of classes</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.p">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">p</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.p" title="Link to this definition"></a></dt>
<dd><p>An alias to self.prevalence()</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>self.prevalence()</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.prevalence">
<span class="sig-name descname"><span class="pre">prevalence</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.prevalence"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.prevalence" title="Link to this definition"></a></dt>
<dd><p>Returns the prevalence, or relative frequency, of the classes in the codeframe.</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>a np.ndarray of shape <cite>(n_classes)</cite> with the relative frequencies of each class, in the same order
as listed by <cite>self.classes_</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling">
<span class="sig-name descname"><span class="pre">sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.sampling"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling" title="Link to this definition"></a></dt>
<dd><p>Return a random sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size and desired prevalence
values. For each class, the sampling is drawn without replacement if the requested prevalence is larger than
the actual prevalence of the class, or with replacement otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>size</strong> integer, the requested size</p></li>
<li><p><strong>prevs</strong> the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
<cite>self.classes_</cite> can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
<li><p><strong>shuffle</strong> if set to True (default), shuffles the index before returning it</p></li>
<li><p><strong>random_state</strong> seed for reproducing sampling</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite> and prevalence close to <cite>prevs</cite> (or
prevalence == <cite>prevs</cite> if the exact prevalence values can be met as proportions of instances)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_from_index">
<span class="sig-name descname"><span class="pre">sampling_from_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">index</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.sampling_from_index"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_from_index" title="Link to this definition"></a></dt>
<dd><p>Returns an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> whose elements are sampled from this collection using the
index.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>index</strong> np.ndarray</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.sampling_index">
<span class="sig-name descname"><span class="pre">sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">prevs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shuffle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.sampling_index"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.sampling_index" title="Link to this definition"></a></dt>
<dd><p>Returns an index to be used to extract a random sample of desired size and desired prevalence values. If the
prevalence values are not specified, then returns the index of a uniform sampling.
For each class, the sampling is drawn with replacement if the requested prevalence is larger than
the actual prevalence of the class, or without replacement otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>size</strong> integer, the requested size</p></li>
<li><p><strong>prevs</strong> the prevalence for each class; the prevalence value for the last class can be lead empty since
it is constrained. E.g., for binary collections, only the prevalence <cite>p</cite> for the first class (as listed in
<cite>self.classes_</cite> can be specified, while the other class takes prevalence value <cite>1-p</cite></p></li>
<li><p><strong>shuffle</strong> if set to True (default), shuffles the index before returning it</p></li>
<li><p><strong>random_state</strong> seed for reproducing sampling</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.split_random">
<span class="sig-name descname"><span class="pre">split_random</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_prop</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.split_random"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.split_random" title="Link to this definition"></a></dt>
<dd><p>Returns two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> split randomly from this collection, at desired
proportion.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>train_prop</strong> the proportion of elements to include in the left-most returned collection (typically used
as the training collection). The rest of elements are included in the right-most returned collection
(typically used as a test collection).</p></li>
<li><p><strong>random_state</strong> if specified, guarantees reproducibility of the split.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>, the first one with <cite>train_prop</cite> elements, and the
second one with <cite>1-train_prop</cite> elements</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.split_stratified">
<span class="sig-name descname"><span class="pre">split_stratified</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">train_prop</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.6</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.split_stratified"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.split_stratified" title="Link to this definition"></a></dt>
<dd><p>Returns two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> split with stratification from this collection, at desired
proportion.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>train_prop</strong> the proportion of elements to include in the left-most returned collection (typically used
as the training collection). The rest of elements are included in the right-most returned collection
(typically used as a test collection).</p></li>
<li><p><strong>random_state</strong> if specified, guarantees reproducibility of the split.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>two instances of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>, the first one with <cite>train_prop</cite> elements, and the
second one with <cite>1-train_prop</cite> elements</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.stats">
<span class="sig-name descname"><span class="pre">stats</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">show</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.stats"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.stats" title="Link to this definition"></a></dt>
<dd><p>Returns (and eventually prints) a dictionary with some stats of this collection. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">data</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_reviews</span><span class="p">(</span><span class="s1">&#39;kindle&#39;</span><span class="p">,</span> <span class="n">tfidf</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> <span class="n">min_df</span><span class="o">=</span><span class="mi">5</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">data</span><span class="o">.</span><span class="n">training</span><span class="o">.</span><span class="n">stats</span><span class="p">()</span>
<span class="gp">&gt;&gt;&gt; </span><span class="c1">#instances=3821, type=&lt;class &#39;scipy.sparse.csr.csr_matrix&#39;&gt;, #features=4403, #classes=[0 1], prevs=[0.081, 0.919]</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>show</strong> if set to True (default), prints the stats in standard output</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a dictionary containing some stats of this collection. Keys include <cite>#instances</cite> (the number of
instances), <cite>type</cite> (the type representing the instances), <cite>#features</cite> (the number of features, if the
instances are in array-like format), <cite>#classes</cite> (the classes of the collection), <cite>prevs</cite> (the prevalence
values for each class)</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling">
<span class="sig-name descname"><span class="pre">uniform_sampling</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.uniform_sampling"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling" title="Link to this definition"></a></dt>
<dd><p>Returns a uniform sample (an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a>) of desired size. The sampling is drawn
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>size</strong> integer, the requested size</p></li>
<li><p><strong>random_state</strong> if specified, guarantees reproducibility of the split.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">LabelledCollection</span></code></a> with length == <cite>size</cite></p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.uniform_sampling_index">
<span class="sig-name descname"><span class="pre">uniform_sampling_index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">size</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">random_state</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/base.html#LabelledCollection.uniform_sampling_index"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.base.LabelledCollection.uniform_sampling_index" title="Link to this definition"></a></dt>
<dd><p>Returns an index to be used to extract a uniform sample of desired size. The sampling is drawn
with replacement if the requested size is greater than the number of instances, or without replacement
otherwise.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>size</strong> integer, the size of the uniform sample</p></li>
<li><p><strong>random_state</strong> if specified, guarantees reproducibility of the split.</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a np.ndarray of shape <cite>(size)</cite> with the indexes</p>
</dd>
</dl>
</dd></dl>
<dl class="py property">
<dt class="sig sig-object py" id="quapy.data.base.LabelledCollection.y">
<em class="property"><span class="pre">property</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">y</span></span><a class="headerlink" href="#quapy.data.base.LabelledCollection.y" title="Link to this definition"></a></dt>
<dd><p>An alias to self.labels</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>self.labels</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
</section>
<section id="module-quapy.data.datasets">
<span id="quapy-data-datasets-module"></span><h2>quapy.data.datasets module<a class="headerlink" href="#module-quapy.data.datasets" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_IFCB">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_IFCB</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">single_sample_train</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_IFCB"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_IFCB" title="Link to this definition"></a></dt>
<dd><p>Loads the IFCB dataset for quantification from <a class="reference external" href="https://zenodo.org/records/10036244">Zenodo</a> (for more
information on this dataset, please follow the zenodo link).
This dataset is based on the data available publicly at
<a class="reference external" href="https://github.com/hsosik/WHOI-Plankton">WHOI-Plankton repo</a>.
The scripts for the processing are available at <a class="reference external" href="https://github.com/pglez82/IFCB_Zenodo">P. Gonzálezs repo</a>.
Basically, this is the IFCB dataset with precomputed features for testing quantification algorithms.</p>
<p>The datasets are downloaded only once, and stored for fast reuse.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>single_sample_train</strong> a boolean. If true, it will return the train dataset as a
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a> (all examples together).
If false, a generator of training samples will be returned. Each example in the training set has an individual label.</p></li>
<li><p><strong>for_model_selection</strong> if True, then returns a split 30% of the training set (86 out of 286 samples) to be used for model selection;
if False, then returns the full training set as training set and the test set as the test set</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a tuple <cite>(train, test_gen)</cite> where <cite>train</cite> is an instance of
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, if <cite>single_sample_train</cite> is true or
<code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTrainSamplesFromDir</span></code>, i.e. a sampling protocol that returns a series of samples
labelled example by example. test_gen will be a <code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._ifcb.IFCBTestSamples</span></code>,
i.e., a sampling protocol that returns a series of samples labelled by prevalence.</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIBinaryDataset">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIBinaryDataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_split</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">Dataset</span></a></span></span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_UCIBinaryDataset"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_UCIBinaryDataset" title="Link to this definition"></a></dt>
<dd><p>Loads a UCI dataset as an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>, as used in
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., &amp; del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100.</a>
and
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., &amp; del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15.</a>.
The datasets do not come with a predefined train-test split (see <code class="xref py py-meth docutils literal notranslate"><span class="pre">fetch_UCILabelledCollection()</span></code> for further
information on how to use these collections), and so a train-test split is generated at desired proportion.
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> a dataset name</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>test_split</strong> proportion of documents to be included in the test set. The rest conforms the training set</p></li>
<li><p><strong>verbose</strong> set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIBinaryLabelledCollection">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIBinaryLabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a></span></span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_UCIBinaryLabelledCollection"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_UCIBinaryLabelledCollection" title="Link to this definition"></a></dt>
<dd><p>Loads a UCI collection as an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, as used in
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253516300628">Pérez-Gállego, P., Quevedo, J. R., &amp; del Coz, J. J. (2017).
Using ensembles for problems with characterizable changes in data distribution: A case study on quantification.
Information Fusion, 34, 87-100.</a>
and
<a class="reference external" href="https://www.sciencedirect.com/science/article/pii/S1566253517303652">Pérez-Gállego, P., Castano, A., Quevedo, J. R., &amp; del Coz, J. J. (2019).
Dynamic ensemble selection for quantification tasks.
Information Fusion, 45, 1-15.</a>.
The datasets do not come with a predefined train-test split, and so Pérez-Gállego et al. adopted a 5FCVx2 evaluation
protocol, meaning that each collection was used to generate two rounds (hence the x2) of 5 fold cross validation.
This can be reproduced by using <a class="reference internal" href="#quapy.data.base.Dataset.kFCV" title="quapy.data.base.Dataset.kFCV"><code class="xref py py-meth docutils literal notranslate"><span class="pre">quapy.data.base.Dataset.kFCV()</span></code></a>, e.g.:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">collection</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCIBinaryLabelledCollection</span><span class="p">(</span><span class="s2">&quot;yeast&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="k">for</span> <span class="n">data</span> <span class="ow">in</span> <span class="n">qp</span><span class="o">.</span><span class="n">train</span><span class="o">.</span><span class="n">Dataset</span><span class="o">.</span><span class="n">kFCV</span><span class="p">(</span><span class="n">collection</span><span class="p">,</span> <span class="n">nfolds</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">nrepeats</span><span class="o">=</span><span class="mi">2</span><span class="p">):</span>
<span class="gp">&gt;&gt;&gt; </span> <span class="o">...</span>
</pre></div>
</div>
<p>The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_DATASETS</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> a dataset name</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>test_split</strong> proportion of documents to be included in the test set. The rest conforms the training set</p></li>
<li><p><strong>verbose</strong> set to True (default is False) to get information (from the UCI ML repository) about the datasets</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIMulticlassDataset">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIMulticlassDataset</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">test_split</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">0.3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">Dataset</span></a></span></span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_UCIMulticlassDataset"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_UCIMulticlassDataset" title="Link to this definition"></a></dt>
<dd><p>Loads a UCI multiclass dataset as an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>.</p>
<p>The list of available datasets is taken from <a class="reference external" href="https://archive.ics.uci.edu/">https://archive.ics.uci.edu/</a>, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">dataset</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCIMulticlassDataset</span><span class="p">(</span><span class="s2">&quot;dry-bean&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">train</span><span class="p">,</span> <span class="n">test</span> <span class="o">=</span> <span class="n">dataset</span><span class="o">.</span><span class="n">train_test</span>
<span class="gp">&gt;&gt;&gt; </span> <span class="o">...</span>
</pre></div>
</div>
<p>The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_MULTICLASS_DATASETS</cite></p>
<p>The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> a dataset name</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>test_split</strong> proportion of documents to be included in the test set. The rest conforms the training set</p></li>
<li><p><strong>verbose</strong> set to True (default is False) to get information (stats) about the dataset</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_UCIMulticlassLabelledCollection">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_UCIMulticlassLabelledCollection</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><span class="pre">LabelledCollection</span></a></span></span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_UCIMulticlassLabelledCollection"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_UCIMulticlassLabelledCollection" title="Link to this definition"></a></dt>
<dd><p>Loads a UCI multiclass collection as an instance of <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>.</p>
<p>The list of available datasets is taken from <a class="reference external" href="https://archive.ics.uci.edu/">https://archive.ics.uci.edu/</a>, following these criteria:
- It has more than 1000 instances
- It is suited for classification
- It has more than two classes
- It is available for Python import (requires ucimlrepo package)</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="kn">import</span> <span class="nn">quapy</span> <span class="k">as</span> <span class="nn">qp</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">collection</span> <span class="o">=</span> <span class="n">qp</span><span class="o">.</span><span class="n">datasets</span><span class="o">.</span><span class="n">fetch_UCIMulticlassLabelledCollection</span><span class="p">(</span><span class="s2">&quot;dry-bean&quot;</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">X</span><span class="p">,</span> <span class="n">y</span> <span class="o">=</span> <span class="n">collection</span><span class="o">.</span><span class="n">Xy</span>
<span class="gp">&gt;&gt;&gt; </span> <span class="o">...</span>
</pre></div>
</div>
<p>The list of valid dataset names can be accessed in <cite>quapy.data.datasets.UCI_MULTICLASS_DATASETS</cite></p>
<p>The datasets are downloaded only once and pickled into disk, saving time for consecutive calls.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> a dataset name</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where the dataset will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>test_split</strong> proportion of documents to be included in the test set. The rest conforms the training set</p></li>
<li><p><strong>verbose</strong> set to True (default is False) to get information (stats) about the dataset</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_lequa2022">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_lequa2022</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">task</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_lequa2022"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_lequa2022" title="Link to this definition"></a></dt>
<dd><p>Loads the official datasets provided for the <a class="reference external" href="https://lequa2022.github.io/index">LeQua</a> competition.
In brief, there are 4 tasks (T1A, T1B, T2A, T2B) having to do with text quantification
problems. Tasks T1A and T1B provide documents in vector form, while T2A and T2B provide raw documents instead.
Tasks T1A and T2A are binary sentiment quantification problems, while T2A and T2B are multiclass quantification
problems consisting of estimating the class prevalence values of 28 different merchandise products.
We refer to the <a class="reference external" href="https://ceur-ws.org/Vol-3180/paper-146.pdf">Esuli, A., Moreo, A., Sebastiani, F., &amp; Sperduti, G. (2022).
A Detailed Overview of LeQua&#64; CLEF 2022: Learning to Quantify.</a> for a detailed description
on the tasks and datasets.</p>
<p>The datasets are downloaded only once, and stored for fast reuse.</p>
<p>See <cite>lequa2022_experiments.py</cite> provided in the example folder, that can serve as a guide on how to use these
datasets.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>task</strong> a string representing the task name; valid ones are T1A, T1B, T2A, and T2B</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a tuple <cite>(train, val_gen, test_gen)</cite> where <cite>train</cite> is an instance of
<a class="reference internal" href="#quapy.data.base.LabelledCollection" title="quapy.data.base.LabelledCollection"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.LabelledCollection</span></code></a>, <cite>val_gen</cite> and <cite>test_gen</cite> are instances of
<code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data._lequa2022.SamplesFromDir</span></code>, a subclass of <a class="reference internal" href="quapy.html#quapy.protocol.AbstractProtocol" title="quapy.protocol.AbstractProtocol"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.protocol.AbstractProtocol</span></code></a>,
that return a series of samples stored in a directory which are labelled by prevalence.</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_reviews">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_reviews</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">tfidf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">Dataset</span></a></span></span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_reviews"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_reviews" title="Link to this definition"></a></dt>
<dd><p>Loads a Reviews dataset as a Dataset instance, as used in
<a class="reference external" href="https://dl.acm.org/doi/abs/10.1145/3269206.3269287">Esuli, A., Moreo, A., and Sebastiani, F. “A recurrent neural network for sentiment quantification.”
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.</a>.
The list of valid dataset names can be accessed in <cite>quapy.data.datasets.REVIEWS_SENTIMENT_DATASETS</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> the name of the dataset: valid ones are hp, kindle, imdb</p></li>
<li><p><strong>tfidf</strong> set to True to transform the raw documents into tfidf weighted matrices</p></li>
<li><p><strong>min_df</strong> minimun number of documents that should contain a term in order for the term to be
kept (ignored if tfidf==False)</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>pickle</strong> set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.fetch_twitter">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">fetch_twitter</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset_name</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">for_model_selection</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">data_home</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pickle</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span> <span class="sig-return"><span class="sig-return-icon">&#x2192;</span> <span class="sig-return-typehint"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">Dataset</span></a></span></span><a class="reference internal" href="_modules/quapy/data/datasets.html#fetch_twitter"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.fetch_twitter" title="Link to this definition"></a></dt>
<dd><p>Loads a Twitter dataset as a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance, as used in:
<a class="reference external" href="https://link.springer.com/content/pdf/10.1007/s13278-016-0327-z.pdf">Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016)</a>
Note that the datasets semeval13, semeval14, semeval15 share the same training set.
The list of valid dataset names corresponding to training sets can be accessed in
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TRAIN</cite>, while the test sets can be accessed in
<cite>quapy.data.datasets.TWITTER_SENTIMENT_DATASETS_TEST</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset_name</strong> the name of the dataset: valid ones are gasp, hcr, omd, sanders, semeval13,
semeval14, semeval15, semeval16, sst, wa, wb</p></li>
<li><p><strong>for_model_selection</strong> if True, then returns the train split as the training set and the devel split
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set</p></li>
<li><p><strong>min_df</strong> minimun number of documents that should contain a term in order for the term to be kept</p></li>
<li><p><strong>data_home</strong> specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)</p></li>
<li><p><strong>pickle</strong> set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> instance</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.datasets.warn">
<span class="sig-prename descclassname"><span class="pre">quapy.data.datasets.</span></span><span class="sig-name descname"><span class="pre">warn</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">*</span></span><span class="n"><span class="pre">args</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/datasets.html#warn"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.datasets.warn" title="Link to this definition"></a></dt>
<dd></dd></dl>
</section>
<section id="module-quapy.data.preprocessing">
<span id="quapy-data-preprocessing-module"></span><h2>quapy.data.preprocessing module<a class="headerlink" href="#module-quapy.data.preprocessing" title="Link to this heading"></a></h2>
<dl class="py class">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">IndexTransformer</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#IndexTransformer"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer" title="Link to this definition"></a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal notranslate"><span class="pre">object</span></code></p>
<p>This class implements a sklearns-style transformer that indexes text as numerical ids for the tokens it
contains, and that would be generated by sklearns
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>kwargs</strong> <p>keyworded arguments from
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">CountVectorizer</a></p>
</p>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.add_word">
<span class="sig-name descname"><span class="pre">add_word</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">word</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">id</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">nogaps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#IndexTransformer.add_word"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.add_word" title="Link to this definition"></a></dt>
<dd><p>Adds a new token (regardless of whether it has been found in the text or not), with dedicated id.
Useful to define special tokens for codifying unknown words, or padding tokens.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>word</strong> string, surface form of the token</p></li>
<li><p><strong>id</strong> integer, numerical value to assign to the token (leave as None for indicating the next valid id,
default)</p></li>
<li><p><strong>nogaps</strong> if set to True (default) asserts that the id indicated leads to no numerical gaps with
precedent ids stored so far</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>integer, the numerical id for the new token</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit">
<span class="sig-name descname"><span class="pre">fit</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#IndexTransformer.fit"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit" title="Link to this definition"></a></dt>
<dd><p>Fits the transformer, i.e., decides on the vocabulary, given a list of strings.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>X</strong> a list of strings</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>self</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.fit_transform">
<span class="sig-name descname"><span class="pre">fit_transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#IndexTransformer.fit_transform"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.fit_transform" title="Link to this definition"></a></dt>
<dd><p>Fits the transform on <cite>X</cite> and transforms it.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>X</strong> a list of strings</p></li>
<li><p><strong>n_jobs</strong> the number of parallel workers to carry out this task</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.transform">
<span class="sig-name descname"><span class="pre">transform</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">X</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_jobs</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#IndexTransformer.transform"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.transform" title="Link to this definition"></a></dt>
<dd><p>Transforms the strings in <cite>X</cite> as lists of numerical ids</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>X</strong> a list of strings</p></li>
<li><p><strong>n_jobs</strong> the number of parallel workers to carry out this task</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <cite>np.ndarray</cite> of numerical ids</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="quapy.data.preprocessing.IndexTransformer.vocabulary_size">
<span class="sig-name descname"><span class="pre">vocabulary_size</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#IndexTransformer.vocabulary_size"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.IndexTransformer.vocabulary_size" title="Link to this definition"></a></dt>
<dd><p>Gets the length of the vocabulary according to which the document tokens have been indexed</p>
<dl class="field-list simple">
<dt class="field-odd">Returns<span class="colon">:</span></dt>
<dd class="field-odd"><p>integer</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.index">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">index</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#index"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.index" title="Link to this definition"></a></dt>
<dd><p>Indexes the tokens of a textual <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of string documents.
To index a document means to replace each different token by a unique numerical index.
Rare words (i.e., words occurring less than <cite>min_df</cite> times) are replaced by a special token <cite>UNK</cite></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object where the instances of training and test documents
are lists of str</p></li>
<li><p><strong>min_df</strong> minimum number of occurrences below which the term is replaced by a <cite>UNK</cite> index</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
<li><p><strong>kwargs</strong> the rest of parameters of the transformation (as for sklearns
<cite>CountVectorizer &lt;https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html&gt;_</cite>)</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) consisting of lists of integer values representing indices.</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.reduce_columns">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">reduce_columns</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">5</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#reduce_columns"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.reduce_columns" title="Link to this definition"></a></dt>
<dd><p>Reduces the dimensionality of the instances, represented as a <cite>csr_matrix</cite> (or any subtype of
<cite>scipy.sparse.spmatrix</cite>), of training and test documents by removing the columns of words which are not present
in at least <cite>min_df</cite> instances in the training set</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in which instances are represented in sparse format (any
subtype of scipy.sparse.spmatrix)</p></li>
<li><p><strong>min_df</strong> integer, minimum number of instances below which the columns are removed</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (if inplace=False) or a reference to the current
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> (inplace=True) where the dimensions corresponding to infrequent terms
in the training set have been removed</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.standardize">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">standardize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#standardize"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.standardize" title="Link to this definition"></a></dt>
<dd><p>Standardizes the real-valued columns of a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a>.
Standardization, aka z-scoring, of a variable <cite>X</cite> comes down to subtracting the average and normalizing by the
standard deviation.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> object</p></li>
<li><p><strong>inplace</strong> set to True if the transformation is to be applied inplace, or to False (default) if a new
<a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> is to be returned</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>an instance of <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a></p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.preprocessing.text2tfidf">
<span class="sig-prename descclassname"><span class="pre">quapy.data.preprocessing.</span></span><span class="sig-name descname"><span class="pre">text2tfidf</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">dataset</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><span class="pre">Dataset</span></a></span></em>, <em class="sig-param"><span class="n"><span class="pre">min_df</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">3</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sublinear_tf</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">inplace</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="o"><span class="pre">**</span></span><span class="n"><span class="pre">kwargs</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/preprocessing.html#text2tfidf"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.preprocessing.text2tfidf" title="Link to this definition"></a></dt>
<dd><p>Transforms a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of textual instances into a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> of
tfidf weighted sparse vectors</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>dataset</strong> a <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> where the instances of training and test collections are
lists of str</p></li>
<li><p><strong>min_df</strong> minimum number of occurrences for a word to be considered as part of the vocabulary (default 3)</p></li>
<li><p><strong>sublinear_tf</strong> whether or not to apply the log scalling to the tf counters (default True)</p></li>
<li><p><strong>inplace</strong> whether or not to apply the transformation inplace (True), or to a new copy (False, default)</p></li>
<li><p><strong>kwargs</strong> the rest of parameters of the transformation (as for sklearns
<a class="reference external" href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html">TfidfVectorizer</a>)</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a new <a class="reference internal" href="#quapy.data.base.Dataset" title="quapy.data.base.Dataset"><code class="xref py py-class docutils literal notranslate"><span class="pre">quapy.data.base.Dataset</span></code></a> in <cite>csr_matrix</cite> format (if inplace=False) or a reference to the
current Dataset (if inplace=True) where the instances are stored in a <cite>csr_matrix</cite> of real-valued tfidf scores</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-quapy.data.reader">
<span id="quapy-data-reader-module"></span><h2>quapy.data.reader module<a class="headerlink" href="#module-quapy.data.reader" title="Link to this heading"></a></h2>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.binarize">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">binarize</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pos_class</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/reader.html#binarize"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.reader.binarize" title="Link to this definition"></a></dt>
<dd><p>Binarizes a categorical array-like collection of labels towards the positive class <cite>pos_class</cite>. E.g.,:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">binarize</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">],</span> <span class="n">pos_class</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="gp">&gt;&gt;&gt; </span><span class="n">array</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">0</span><span class="p">])</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>y</strong> array-like of labels</p></li>
<li><p><strong>pos_class</strong> integer, the positive class</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a binary np.ndarray, in which values 1 corresponds to positions in whcih <cite>y</cite> had <cite>pos_class</cite> labels, and
0 otherwise</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.from_csv">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">from_csv</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">encoding</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'utf-8'</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/reader.html#from_csv"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.reader.from_csv" title="Link to this definition"></a></dt>
<dd><p>Reads a csv file in which columns are separated by ,.
File format &lt;label&gt;,&lt;feat1&gt;,&lt;feat2&gt;,…,&lt;featn&gt;</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> path to the csv file</p></li>
<li><p><strong>encoding</strong> the text encoding used to open the file</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a np.ndarray for the labels and a ndarray (float) for the covariates</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.from_sparse">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">from_sparse</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/reader.html#from_sparse"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.reader.from_sparse" title="Link to this definition"></a></dt>
<dd><p>Reads a labelled collection of real-valued instances expressed in sparse format
File format &lt;-1 or 0 or 1&gt;[s col(int):val(float)]</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>path</strong> path to the labelled collection</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a <cite>csr_matrix</cite> containing the instances (rows), and a ndarray containing the labels</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.from_text">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">from_text</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">path</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">encoding</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'utf-8'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">verbose</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">1</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">class2int</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/reader.html#from_text"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.reader.from_text" title="Link to this definition"></a></dt>
<dd><p>Reads a labelled colletion of documents.
File fomart &lt;0 or 1&gt; &lt;document&gt;</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>path</strong> path to the labelled collection</p></li>
<li><p><strong>encoding</strong> the text encoding used to open the file</p></li>
<li><p><strong>verbose</strong> if &gt;0 (default) shows some progress information in standard output</p></li>
</ul>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a list of sentences, and a list of labels</p>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="quapy.data.reader.reindex_labels">
<span class="sig-prename descclassname"><span class="pre">quapy.data.reader.</span></span><span class="sig-name descname"><span class="pre">reindex_labels</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="reference internal" href="_modules/quapy/data/reader.html#reindex_labels"><span class="viewcode-link"><span class="pre">[source]</span></span></a><a class="headerlink" href="#quapy.data.reader.reindex_labels" title="Link to this definition"></a></dt>
<dd><p>Re-indexes a list of labels as a list of indexes, and returns the classnames corresponding to the indexes.
E.g.:</p>
<div class="doctest highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">reindex_labels</span><span class="p">([</span><span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;C&#39;</span><span class="p">])</span>
<span class="gp">&gt;&gt;&gt; </span><span class="p">(</span><span class="n">array</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">2</span><span class="p">]),</span> <span class="n">array</span><span class="p">([</span><span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;C&#39;</span><span class="p">],</span> <span class="n">dtype</span><span class="o">=</span><span class="s1">&#39;&lt;U1&#39;</span><span class="p">))</span>
</pre></div>
</div>
<dl class="field-list simple">
<dt class="field-odd">Parameters<span class="colon">:</span></dt>
<dd class="field-odd"><p><strong>y</strong> the list or array of original labels</p>
</dd>
<dt class="field-even">Returns<span class="colon">:</span></dt>
<dd class="field-even"><p>a ndarray (int) of class indexes, and a ndarray of classnames corresponding to the indexes.</p>
</dd>
</dl>
</dd></dl>
</section>
<section id="module-quapy.data">
<span id="module-contents"></span><h2>Module contents<a class="headerlink" href="#module-quapy.data" title="Link to this heading"></a></h2>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="quapy.classification.html" class="btn btn-neutral float-left" title="quapy.classification package" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="quapy.method.html" class="btn btn-neutral float-right" title="quapy.method package" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>&#169; Copyright 2024, Alejandro Moreo.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>